├── .coveragerc
├── .gitignore
├── .readthedocs.yaml
├── CONTRIBUTING.md
├── COPYING
├── LICENSE
├── MANIFEST.in
├── README.md
├── build_instructions.txt
├── docs
    ├── .gitignore
    ├── api
    │   ├── api.rst
    │   ├── index.rst
    │   └── internal_api.rst
    ├── conf.py
    ├── config.rst
    ├── index.rst
    ├── initialize.rst
    ├── install.rst
    ├── links.rst
    ├── projects
    │   └── index.rst
    ├── quick-start
    │   ├── index.rst
    │   └── run_spacemake.rst
    ├── requirements.txt
    ├── run.rst
    ├── shared
    │   ├── shared_sample_variables.rst
    │   └── spacemake_init.rst
    ├── smk_logo.png
    ├── troubleshooting.rst
    └── tutorials
    │   ├── .gitignore
    │   ├── he_integration.ipynb
    │   ├── img
    │       ├── .gitignore
    │       ├── manual_alignment_1.png
    │       ├── manual_alignment_2.png
    │       ├── manual_alignment_3.png
    │       ├── manual_alignment_4.png
    │       ├── manual_alignment_5.png
    │       ├── manual_alignment_6.png
    │       ├── test_longread.donuts.png
    │       ├── test_longread.hists.png
    │       └── test_longread.oligo_edits.png
    │   ├── index.rst
    │   ├── longreads.rst
    │   ├── manual_he_integration.rst
    │   ├── novosparc_integration.ipynb
    │   └── process_single_cell_data.rst
├── environment.yaml
├── pyproject.toml
├── sequences
    ├── .gitignore
    └── primers.fa
├── setup.cfg
├── setup.py
├── spacemake
    ├── .gitignore
    ├── __init__.py
    ├── alnstats.py
    ├── annotator.py
    ├── bin
    │   ├── BamTagHistogram.py
    │   └── fastq_to_uBAM.py
    ├── cmdline.py
    ├── config.py
    ├── contrib.py
    ├── cutadapt_bam.py
    ├── data
    │   ├── .gitignore
    │   ├── config
    │   │   ├── config.yaml
    │   │   ├── longread.yaml
    │   │   └── species_data_url.yaml
    │   ├── puck_collection
    │   │   ├── create_novaseq_S4_coordinate_system.py
    │   │   └── openst_coordinate_system.csv
    │   ├── test
    │   │   ├── test_bc1.csv
    │   │   ├── test_bc2.csv
    │   │   ├── visium_public_lane_joined_1m_R1.fastq.gz
    │   │   └── visium_public_lane_joined_1m_R2.fastq.gz
    │   └── visium_barcode_positions.csv
    ├── errors.py
    ├── longread
    │   ├── __main__.py
    │   ├── annotation.py
    │   ├── cache.py
    │   ├── cmdline.py
    │   ├── overview.py
    │   ├── report.py
    │   └── signature.py
    ├── map_strategy.py
    ├── parallel.py
    ├── preprocess
    │   ├── __init__.py
    │   ├── cmdline.py
    │   ├── dge.py
    │   └── fastq.py
    ├── project_df.py
    ├── quant.py
    ├── reporting.py
    ├── smk.py
    ├── snakemake
    │   ├── __init__.py
    │   ├── downsample.smk
    │   ├── dropseq.smk
    │   ├── longread.smk
    │   ├── main.smk
    │   ├── mapping.smk
    │   ├── merge_samples.smk
    │   ├── scripts
    │   │   ├── .gitignore
    │   │   ├── automated_analysis.py
    │   │   ├── automated_analysis_create_processed_data_files.py
    │   │   ├── automated_analysis_create_report.Rmd
    │   │   ├── clean_top_barcodes.py
    │   │   ├── create_sample_db.R
    │   │   ├── create_sample_overview.Rmd
    │   │   ├── create_spatial_barcode_file.py
    │   │   ├── create_spatial_dge.py
    │   │   ├── filter_mm_reads.py
    │   │   ├── fix_bam_header.py
    │   │   ├── kmer_stats_from_fastq.py
    │   │   ├── n_intersect_sequences.py
    │   │   ├── parse_ribo_log.py
    │   │   ├── qc_sequencing_create_sheet.Rmd
    │   │   ├── saturation_analysis.Rmd
    │   │   ├── shared_functions.R
    │   │   ├── snakemake_helper_functions.py
    │   │   ├── splice_bam_header.py
    │   │   └── split_reads_by_strand_info.py
    │   ├── species_init.smk
    │   ├── variables.py
    │   └── visium.smk
    ├── spatial
    │   ├── __init__.py
    │   ├── cmdline.py
    │   ├── he_integration.py
    │   ├── novosparc_integration.py
    │   ├── puck_collection.py
    │   └── util.py
    ├── tag_alignments.py
    ├── unittests.py
    └── util.py
├── test.sh
├── test_data
    ├── README.md
    ├── make_chr22_test_data.py
    ├── mirgenedb.hsa.mature.fa.gz
    ├── mirgenedb.hsa.mature.gtf.gz
    ├── rRNA_hsa.fa.gz
    ├── reads_chr22_R1.fastq.gz
    ├── reads_chr22_R2.fastq.gz
    ├── test_annotation.gtf.gz
    ├── test_bam_md5.txt
    ├── test_config.yaml
    ├── test_genome.fa.gz
    ├── test_genome.gtf.gz
    ├── test_project_df.csv
    ├── test_reads.R1.fastq.gz
    ├── test_reads.R2.fastq.gz
    ├── tile_1.txt
    ├── tile_2.txt
    └── tile_3.txt
└── tests
    ├── fixtures.py
    ├── test_cmdline.py
    ├── test_fastq_to_ubam.py
    └── test_map_strategy.py


/.coveragerc:
--------------------------------------------------------------------------------
1 | [run]
2 | concurrency = multiprocessing
3 | parallel = true
4 | sigterm = true
5 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | dist
 2 | projects
 3 | config.yaml
 4 | samples.yaml
 5 | *.ipynb
 6 | *.tif
 7 | *.jpg
 8 | *.jpeg
 9 | *.png
10 | *.rds
11 | *.html
12 | *.csv
13 | *.pdf
14 | *.txt.gz
15 | *.fastq.gz
16 | *.fa
17 | *.gtf
18 | *.DS_Store
19 | *.log
20 | *.bam
21 | .coverage*
22 | .snakemake
23 | build
24 | __pycache__
25 | spacemake.egg-info
26 | scratch
27 | 
28 | *.icloud
29 | *.h5ad
30 | 


--------------------------------------------------------------------------------
/.readthedocs.yaml:
--------------------------------------------------------------------------------
 1 | version: 2
 2 | 
 3 | build:
 4 |     os: "ubuntu-20.04"
 5 |     tools:
 6 |       python: "3.9"
 7 | 
 8 | sphinx:
 9 |     configuration: docs/conf.py
10 | 
11 | python:
12 |     install:
13 |         - requirements: docs/requirements.txt


--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
 1 | # Contributing to spacemake
 2 | We want to make contributing to this project as easy and transparent as possible, whether it's:
 3 | 
 4 | - Reporting a bug
 5 | - Discussing the current state of the code
 6 | - Submitting a fix
 7 | - Proposing new features
 8 | - Becoming a maintainer
 9 | 
10 | ## We Develop with Github
11 | We use github to host code, to track issues and feature requests, as well as accept pull requests.
12 | 
13 | ## We Use [Github Flow](https://guides.github.com/introduction/flow/index.html), So All Code Changes Happen Through Pull Requests
14 | Pull requests are the best way to propose changes to the codebase (we use [Github Flow](https://guides.github.com/introduction/flow/index.html)). We actively welcome your pull requests:
15 | 
16 | 1. Fork the repo and create your branch from `master`.
17 | 2. If you've added code that should be tested, add tests.
18 | 3. If you've changed APIs, update the documentation.
19 | 4. Ensure the test suite passes.
20 | 5. Make sure your code lints.
21 | 6. Issue that pull request!
22 | 
23 | ## Any contributions you make will be under the GNU Software License
24 | When you submit code changes, your submissions are understood to be under the same [GNU License](http://choosealicense.com/licenses/gnu/) that covers the project. Feel free to contact the maintainers if that's a concern.
25 | 
26 | ## Report bugs using Github's [issues](https://github.com/rajewsky-lab/spacemake/issues)
27 | We use GitHub issues to track public bugs. Report a bug by [opening a new issue](https://github.com/rajewsky-lab/spacemake/issues/new?assignees=&labels=&template=bug-report.md&title=).; it's that easy!
28 | 
29 | ## Write bug reports with detail, background, and sample code
30 | **Great Bug Reports** tend to have:
31 | 
32 | - A quick summary and/or background
33 | - Steps to reproduce
34 |   - Be specific!
35 |   - Give sample code if you can.
36 | - What you expected would happen
37 | - What actually happens
38 | - Notes (possibly including why you think this might be happening, or stuff you tried that didn't work)
39 | 
40 | ## Issue Triage
41 | Here are some tags that we're using to better organize issues in this repo:
42 | 
43 | * `good first issue` - Good candidates for someone new to the project to contribute.
44 | * `help wanted` - Issues that should be addressed and which we would welcome a
45 | PR for but may need significant investigation or work
46 | * `support` - Request for help with a concept or piece of code but this isn't an
47 | issue with the project.
48 | * `needs more info` - Missing repro steps or context for both project issues \&
49 | support questions.
50 | * `discussion` - Issues where folks are discussing various approaches \& ideas.
51 | * `question` - Something that is a question specifically for the maintainers such
52 | as [this issue about the license](https://github.com/facebook/draft-js/issues/1819).
53 | * `documentation` - Relating to improving documentation for the project.
54 | - Browser \& OS-specific tags for anything that is specific to a particular
55 | environment (e.g. `chrome`, `firefox`, `macos`, `android` and so forth).
56 | 
57 | ## References
58 | This document was adapted from the open-source contribution guidelines for [Facebook's Draft](https://github.com/facebook/draft-js/blob/a9316a723f9e918afde44dea68b5f9f39b7d9b00/CONTRIBUTING.md)
59 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | GNU General Public License, version 2 (GPL-2.0)
 2 | 
 3 | spacemake: pipeline for processing and analysing sequencing based spatial-transcriptomics data.
 4 | 
 5 | Copyright (C) 2021 Tamas Ryszard Sztanka-Toth, Marvin Jens, Nikolaos Karaiskos and Nikolaus Rajewsky.
 6 | All rights reserved.
 7 | 
 8 | This file is part of spacemake.
 9 | 
10 | Spacemake is free software; you can redistribute it and/or modify
11 | it under the terms of the GNU General Public License as published by
12 | the Free Software Foundation; either version 2 of the License, or
13 | (at your option) any later version.
14 | 
15 | Spacemake is distributed in the hope that it will be useful,
16 | but WITHOUT ANY WARRANTY; without even the implied warranty of
17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18 | GNU General Public License for more details.
19 | 
20 | You should have received a copy of the GNU General Public License
21 | along with this program; if not, write to the Free Software
22 | Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 | 


--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | global-include *.smk *.csv *.py *.R *.Rmd *.yaml
2 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | [![docs](https://readthedocs.org/projects/spacemake/badge/?version=latest)](https://spacemake.readthedocs.io/)
 2 | [![Downloads](https://pepy.tech/badge/spacemake)](https://pepy.tech/project/spacemake)
 3 | [![PyPI Version](https://img.shields.io/pypi/v/spacemake.svg)](https://pypi.org/project/spacemake)
 4 | [![PyPI License](https://img.shields.io/pypi/l/spacemake.svg)](https://pypi.org/project/spacemake)
 5 | 
 6 | 
 7 | # Spacemake: processing and analysis of large-scale spatial transcriptomics data
 8 | ### [🌐 docs](https://spacemake.readthedocs.io/en/latest/) | [📜 paper](https://doi.org/10.1093/gigascience/giac064) | [💬 discussions](https://github.com/rajewsky-lab/spacemake/discussions)
 9 | <img src="https://raw.githubusercontent.com/rajewsky-lab/spacemake/master/docs/smk_logo.png" width="200" />
10 | 
11 | Spacemake is a modular, robust, and scalable spatial transcriptomics pipeline built in `Snakemake` and `Python`. Spacemake is designed to handle all major spatial transcriptomics datasets and can be readily configured for other technologies. It can process and analyze several samples in parallel, even if they stem from different experimental methods. Spacemake's unified framework enables reproducible data processing from raw sequencing data to automatically generated downstream analysis reports. Spacemake is built with a modular design and offers additional functionality such as sample merging, saturation analysis, and analysis of long reads as separate modules.
12 | 
13 | If you find Spacemake useful in your work, consider citing it: 
14 | 
15 | ```
16 | Spacemake: processing and analysis of large-scale spatial transcriptomics data
17 | Tamas Ryszard Sztanka-Toth, Marvin Jens, Nikos Karaiskos, Nikolaus Rajewsky
18 | GigaScience, Volume 11, 2022, giac064
19 | ```
20 | 
21 | Documentation can be found [here](https://spacemake.readthedocs.io/en/latest/).
22 | 
23 | ## Unit testing
24 | 
25 | We are committed to achieving a high code coverage with unit tests. The master branch utilizes the `unittest` module to run spacemake with small test data sets. On the current development branches, we have switched to `pytest` and cover a much broader range of the code. This work is ongoing.
26 | 
27 | To run the currently implemented tests on master, run `python spacemake/unittests.py`. This will create a directory `spacemake/_tests/` inside which a minimal spacemake directory structure will be created using `spacemake init` and subsequently some of the core functionality (adding genomes/species, samples, changing configuration, etc.) will be executed. All output will be logged to `spacemake/_tests/run_spacemake.out.log`. If you encounter any weird behavior, please make sure to include the content of this file in your ticket on the issue tracker. Thank you!
28 | ...
29 | 
30 | ## Contributing
31 | `Spacemake` is an open-source project mostly maintained by the [Rajewsky lab @ MDC Berlin](https://www.mdc-berlin.de/n-rajewsky) - so, your involvement is warmly welcome! 
32 | If you're excited to join us, we recommend the following steps:
33 | 
34 | - Found a bug? Contact an admin in the form of an [issue](https://github.com/rajewsky-lab/openst/issues/new?assignees=&labels=&template=bug-report.md&title=).
35 | - Implement your idea following guidelines set by the [official contributing guide](CONTRIBUTING.md)
36 | - Wait for admin approval; approval is iterative, but if accepted will belong to the main repository.
37 | 
38 | In general, you can always refer to the [contribution guidelines](CONTRIBUTING.md) for more details!
39 | Currently, only [admins](https://github.com/orgs/rajewsky-lab/people) will be merging all accepted changes.
40 | 
41 | ## Code of Conduct
42 | Everyone interacting in `spacemake`'s codebases, issue trackers, and discussion forums is expected to follow the [PSF Code of Conduct](https://www.python.org/psf/conduct/).
43 | 


--------------------------------------------------------------------------------
/build_instructions.txt:
--------------------------------------------------------------------------------
 1 | # How this package was built for PyPI
 2 | 
 3 | - installing 'build' and 'twine' via pip
 4 | 
 5 |     `python3 -m pip install --upgrade build`
 6 |     `python3 -m pip install --upgrade twine`
 7 | 
 8 | - getting an API token from PyPI and placing in ~/.pypirc
 9 | 
10 | - in top-level (where pyptoject.toml resides) `python -m build`
11 |  
12 |   This creates package files in the dists/ subdirectory
13 | 
14 | - upload
15 |  testpypi: `python3 -m twine upload --repository testpypi dist/* `
16 |  live pypi: `python3 -m twine upload --repository pypi dist/* `
17 | 
18 |  
19 | 
20 | 


--------------------------------------------------------------------------------
/docs/.gitignore:
--------------------------------------------------------------------------------
1 | !*.ipynb
2 | !projects
3 | 


--------------------------------------------------------------------------------
/docs/api/api.rst:
--------------------------------------------------------------------------------
 1 | API
 2 | ===
 3 | 
 4 | Spacemake class
 5 | ---------------
 6 | 
 7 | Accessing spacemake objects from python
 8 | 
 9 | .. autoclass:: spacemake.Spacemake
10 |     :members:
11 | 
12 | H&E integration module
13 | ----------------------
14 | 
15 | .. autofunction:: spacemake.spatial.he_integration.align_he_spot_img
16 | 
17 | .. autofunction:: spacemake.spatial.he_integration.align_he_aggregated_img
18 | 
19 | .. autofunction:: spacemake.spatial.he_integration.attach_he_adata
20 | 
21 | novosparc integration module
22 | ----------------------------
23 | 
24 | .. autofunction:: spacemake.spatial.novosparc_integration.novosparc_denovo
25 | 
26 | .. autofunction:: spacemake.spatial.novosparc_integration.save_novosparc_res
27 | 
28 | .. autofunction:: spacemake.spatial.novosparc_integration.novosparc_mapping
29 | 
30 | .. autofunction:: spacemake.spatial.novosparc_integration.quantify_clusters_spatially
31 | 


--------------------------------------------------------------------------------
/docs/api/index.rst:
--------------------------------------------------------------------------------
1 | API and Internal API
2 | ====================
3 | 
4 | .. toctree::
5 |     
6 |     api
7 |     internal_api    
8 | 


--------------------------------------------------------------------------------
/docs/api/internal_api.rst:
--------------------------------------------------------------------------------
 1 | Internal API
 2 | ============
 3 | 
 4 | ProjectDF
 5 | ---------
 6 | 
 7 | The ProjectDF class is the core back-end class of spacemake. 
 8 | 
 9 | .. automodule:: spacemake.project_df
10 |    :members:
11 | 
12 | ConfigFile
13 | ----------
14 | 
15 | This class is responsible for updating spacemake's configuration.
16 | 
17 | .. automodule:: spacemake.config
18 |    :members:
19 | 


--------------------------------------------------------------------------------
/docs/conf.py:
--------------------------------------------------------------------------------
 1 | # Configuration file for the Sphinx documentation builder.
 2 | 
 3 | # -- Project information
 4 | 
 5 | project = 'spacemake'
 6 | copyright = '2021-2024, Rajewsky Lab'
 7 | author = 'Tamas Ryszard Sztanka-Toth, Marvin Jens, Nikos Karaiskos, Nikolaus Rajewsky'
 8 | 
 9 | version = '0.8.0'
10 | release = version
11 | 
12 | # -- General configuration
13 | 
14 | extensions = [
15 |     "sphinx_rtd_theme",
16 |     'sphinx.ext.duration',
17 |     'sphinx.ext.doctest',
18 |     'sphinx.ext.autodoc',
19 |     'sphinx.ext.autosummary',
20 |     'sphinx.ext.intersphinx',
21 |     'sphinx.ext.autosectionlabel',
22 |     'nbsphinx'
23 | ]
24 | 
25 | intersphinx_mapping = {
26 |     'python': ('https://docs.python.org/3/', None),
27 |     'sphinx': ('https://www.sphinx-doc.org/en/master/', None),
28 | }
29 | 
30 | intersphinx_disabled_domains = ['std']
31 | 
32 | templates_path = ['_templates']
33 | 
34 | # -- Options for HTML output
35 | html_theme = "sphinx_rtd_theme"
36 | html_theme_options = {
37 |     'navigation_depth': 3
38 | }
39 | 
40 | # -- Options for EPUB output
41 | epub_show_urls = 'footnote'
42 | 
43 | import os
44 | import sys
45 | sys.path.insert(0, os.path.abspath('../'))
46 | 


--------------------------------------------------------------------------------
/docs/index.rst:
--------------------------------------------------------------------------------
 1 | Spacemake: processing and analyzing sequenced-based spatial transcriptomics data
 2 | ================================================================================
 3 | 
 4 | Spacemake is a modular, robust, and scalable spatial transcriptomics pipeline built 
 5 | in Snakemake and Python. Spacemake is designed to handle all major spatial transcriptomics 
 6 | datasets and can be readily configured for other technologies. It can process and analyze 
 7 | several samples in parallel, even if they stem from different experimental methods. 
 8 | Spacemake's unified framework enables reproducible data processing from raw sequencing 
 9 | data to automatically generated downstream analysis reports. Spacemake is built with 
10 | a modular design and offers additional functionality such as sample merging, saturation 
11 | analysis, and analysis of long reads as separate modules.
12 | 
13 | .. toctree::
14 |     :maxdepth: 3
15 |     :hidden:
16 | 
17 |     install
18 |     quick-start/index.rst
19 |     initialize
20 |     config
21 |     projects/index
22 |     run
23 |     tutorials/index
24 |     troubleshooting
25 |     api/index
26 | 
27 | 


--------------------------------------------------------------------------------
/docs/initialize.rst:
--------------------------------------------------------------------------------
 1 | Initialization
 2 | ==============
 3 | 
 4 | Initializing using required arguments
 5 | -------------------------------------
 6 | 
 7 | .. include:: shared/spacemake_init.rst
 8 | 
 9 | Optional arguments
10 | ------------------
11 | 
12 | The `spacemake init` command takes the following optional arguments:
13 | 
14 | ``root-dir``
15 |     The ``root-dir`` for the spacemake instance. Defaults to ``.``, the directory in which `spacemake init` is ran.
16 | 
17 | ``temp-dir``
18 |     Path to the temporary directory, defaults to ``/tmp``.
19 | 
20 | ``download-species``
21 |     If set, spacemake will download the genome (.fa) and annotation (.gtf) files for mouse and 
22 |     human from gencode, as specified `here <https://github.com/rajewsky-lab/spacemake/blob/master/spacemake/data/config/species_data_url.yaml>`_.
23 | 
24 | Hence, the complete `spacemake init` command looks like this::
25 |     
26 |     spacemake init \
27 |       --root-dir ROOT-DIR \             # optional
28 |       --temp-dir TEMP-DIR \             # optional
29 |       --download-species \              # optional
30 |       --dropseq-tools DROPSEQ-TOOLS     # required
31 | 


--------------------------------------------------------------------------------
/docs/install.rst:
--------------------------------------------------------------------------------
 1 | Installation
 2 | ============
 3 | 
 4 | Step 1: create conda environment
 5 | --------------------------------
 6 | 
 7 | The most straightforward way to install spacemake, is to first create conda environment with the above packages.
 8 | We highly recommend using `mamba <https://github.com/mamba-org/mamba>`_, a much faster conda package manager than conda itself.
 9 | After mamba is installed, download the `environment.yaml <https://raw.githubusercontent.com/rajewsky-lab/spacemake/master/environment.yaml>`_.
10 | This file contains all dependencies required by spacemake.
11 | 
12 | Once downloaded, to install all spacemake dependencies type::
13 | 
14 |     mamba env create -f environment.yaml
15 | 
16 | This will create a conda environment called ``spacemake``. To activate the newly created environment type::
17 | 
18 |    conda activate spacemake
19 | 
20 | Step 2: download Dropseq-tools
21 | ------------------------------
22 | 
23 | Spacemake currently requires to download `Dropseq-tools <https://github.com/broadinstitute/Drop-seq>`_.
24 | This package is a collection of processing tools originally written for `Drop-seq <https://www.cell.com/cell/fulltext/S0092-8674(15)00549-8>`_. Spacemake uses several functions from this package during pre-processing and processing, and without it it is impossible to run spacemake.
25 | 
26 | Simply download one of the releases (we recommend using `2.5.1 <https://github.com/broadinstitute/Drop-seq/releases/download/v2.5.1/Drop-seq_tools-2.5.1.zip>`_) and place it somewhere in your filesystem.
27 | 
28 | 
29 | Step 3: install spacemake
30 | -------------------------
31 | 
32 | **After creating the conda environment and downloading Dropseq-tools** (as described above),
33 | spacemake can be installed via ``pip``::
34 | 
35 |    pip install spacemake
36 | 
37 | This will install spacemake, you should be good to go :)
38 | 
39 | .. warning::
40 |     Make sure to first create the conda environment as described above.
41 | 
42 |     Although it is also possible to install the required packages independently, and then
43 |     to install spacemake, this option has not been tested, and one can quickly run into
44 |     dependency issues and errors.
45 | 
46 | To make sure spacemake has been properly installed, run::
47 | 
48 |    spacemake --version
49 | 
50 | This should output the latest spacemake version available in ``pip``.


--------------------------------------------------------------------------------
/docs/links.rst:
--------------------------------------------------------------------------------
1 | 
2 | .. _Seq-scope: https://www.sciencedirect.com/science/article/pii/S0092867421006279
3 | .. _Visium: https://www.10xgenomics.com/products/spatial-gene-expression
4 | .. _Slide-seq: https://www.nature.com/articles/s41587-020-0739-1
5 | .. _Drop-seq: https://mccarrolllab.org/dropseq/
6 | .. _10X Chromium: https://www.10xgenomics.com/products/single-cell-gene-expression
7 | 


--------------------------------------------------------------------------------
/docs/quick-start/run_spacemake.rst:
--------------------------------------------------------------------------------
1 | After a sample is added spacemake can be run with::
2 | 
3 |    spacemake run --cores <n_cores> --keep-going
4 | 
5 | The ``--keep-going`` flag is optional, however it will ensure that spacemake runs all
6 | the jobs it can, even if one job fails (this logic is directly taken from snakemake).
7 | 
8 | For a complete explanation on the `spacemake run` command :ref:`check out the documentation here <Running spacemake general>`.
9 | 


--------------------------------------------------------------------------------
/docs/requirements.txt:
--------------------------------------------------------------------------------
1 | sphinx==5.0.2
2 | sphinxcontrib-napoleon
3 | docutils>=0.14
4 | sphinx_rtd_theme
5 | sphinx-argparse
6 | nbsphinx>=0.4
7 | importlib-metadata
8 | Jinja2<3.1
9 | 


--------------------------------------------------------------------------------
/docs/run.rst:
--------------------------------------------------------------------------------
 1 | .. _Running spacemake general:
 2 | 
 3 | Running spacemake
 4 | =================
 5 | 
 6 | Main modules
 7 | ------------
 8 | 
 9 | After spacemake in configured with the ``spacemake config`` command, and projects/samples
10 | are added with the ``spacemake projects`` command, spacemake can be run with the 
11 | ``spacemake run`` command. It takes the following parameters::
12 | 
13 |     spacemake run \ 
14 |         --cores CORES \     # number of cores to be used in total
15 |         --dryrun, -n  \     # invokes a dry snakemake run, printing only commands
16 |         --rerun-incomplete, --ri \
17 |                             # forces snakemake to rerun incompletely generated files
18 |         --keep-going  \     # if a job fails, keep executing independent jobs.
19 |                             # we recommend to always set this when running spacemake
20 |                             # overnight
21 |         --printshellcmds, -p \
22 |                             # print shell commands for each rule, if exist
23 |         --touch, -t   \     # rather than running the rules, just touch each file
24 |         --with_fastqc, -wfqc
25 |                             # Run also fastqc as part of the spacemake run
26 | 
27 | Downsampling
28 | ------------
29 | 
30 | To run a downsampling (or saturation) analysis, one can use the following command::
31 | 
32 |     spacemake run downsample \
33 |         --project_id_list [PROJECT_ID_LIST ...] \
34 |         --sample_id_list [SAMPLE_ID_LIST ...]
35 | 
36 | In the ``project_id_list`` and ``sample_id_list`` arguments one can specify which a 
37 | list of ``project_id``-s and ``sample_id``-s respectively, for which the downsampling
38 | should be run. It is possible to set only one, or both of these arguments. If both are
39 | set the downsampling will be run on samples for which the ``project_id`` and the ``sample_id`` are in both lists (intersection).
40 | 
41 | .. note::
42 | 
43 |     In addition to the list arguments specified above, the downsample command also
44 |     takes the same arguments as the simple ``spacemake run`` command.
45 | 


--------------------------------------------------------------------------------
/docs/shared/shared_sample_variables.rst:
--------------------------------------------------------------------------------
 1 | One of the most important parts of spacemake are the so-called 'shared sample-variables'.
 2 | These are reusable, user-definable variables, which we can assign to several samples.
 3 | They can be shortly defined as follows:
 4 | 
 5 | ``species``
 6 |    a collection of genome, annotation and rRNA\_genome. There is no default species, and each sample can have exactly one species.
 7 | 
 8 | ``barcode_flavor``
 9 |    the variable which specifies the structure of Read1 and Read2, namely how the cell\_barcode and UMI should be extracted. If no value provided for a sample, the default will be used.
10 | 
11 | ``run_mode``
12 |    each sample can have several ``run_mode``-s, all of which are user definable. If no ``run_mode``-s are specified, a sample will be processed using ``default`` ``run_mode`` settings.
13 | 
14 | ``puck`` (spatial only)
15 |    if a sample is spatial, it has to have a puck variable. If no puck is specified, a default puck will be used.  
16 | 
17 | 
18 | To add, update, delete or list a shared sample-variable, you can use the following commands::
19 | 
20 |    spacemake config add_<shared-sample-variable>
21 |    spacemake config update_<shared-sample-variable>
22 |    spacemake config delete_<shared-sample-variable>
23 |    spacemake config list_<shared-sample-variable>
24 | 
25 | where ``<shared-sample-variable>`` is one of ``species, barcode_flavor, run_mode or puck``
26 | 


--------------------------------------------------------------------------------
/docs/shared/spacemake_init.rst:
--------------------------------------------------------------------------------
 1 | After you have installed spacemake as specified :ref:`here <installation>`, you are ready to process and analyze spatial samples.
 2 | 
 3 | To initialize spacemake ``cd`` into the directory in which you want to start spacemake. This directory will be your ``project_root``.
 4 | Then simply type::
 5 |    
 6 |    spacemake init \
 7 |       --dropseq_tools <path_to_dropseq_tools_dir>
 8 | 
 9 | Here the `path_to_dropseq_tools_dir` should point to the directory of the downloaded Dropseq-tools package, downloaded :ref:`in Step 2 of the installation <step 2: download dropseq-tools>`.
10 | 


--------------------------------------------------------------------------------
/docs/smk_logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rajewsky-lab/spacemake/274cd58e7ba9de244aca25c6bbd7cdfe49683753/docs/smk_logo.png


--------------------------------------------------------------------------------
/docs/troubleshooting.rst:
--------------------------------------------------------------------------------
 1 | Troubleshooting
 2 | ===============
 3 | Below is a list of known issues you may encounter when running spacemake.
 4 | As spacemake depends on several external libraries and tools, it is not
 5 | always possible to resolve some of these issues.
 6 | 
 7 | Bumped on another problem which is not documented here? Feel free to `open
 8 | an issue on Github. <https://github.com/rajewsky-lab/spacemake/issues>`_
 9 | 
10 | 
11 | GLIBCXX_xxx not found
12 | ^^^^^^^^^^^^^^^^^^^^^
13 | In certain environments you might run into the following error:
14 | 
15 | .. code-block:: console
16 | 
17 |     ImportError: /lib/x86_64-linux-gnu/libstdc++.so.6: version `GLIBCXX_3.4.29' not found (required by /.../envs/spacemake/lib/python3.10/site-packages/matplotlib/_path.cpython-310-x86_64-linux-gnu.so)
18 | 
19 | Certain dependencies (e.g. scipy) might affect this. To overcome it, try the following:
20 | 
21 | .. code-block:: console
22 | 
23 |    export LD_LIBRARY_PATH=/conda_installation_folder/envs/spacemake/lib:$LD_LIBRARY_PATH
24 | 
25 | For more details on this and further troubleshooting visit `stackoverflow <https://stackoverflow.com/questions/72540359/glibcxx-3-4-30-not-found-for-librosa-in-conda-virtual-environment-after-tryin>`_.
26 | 
27 | 
28 | Issues with memory usage
29 | ^^^^^^^^^^^^^^^^^^^^^^^^
30 | Certain steps of the spacemake workflow might currently result in excessive memory
31 | usage. These occur in samples where a large number of barcodes exist in the data,
32 | such as for ``stero-seq`` or ``open-ST``. The excessive memory usage is due to using
33 | the ``Drop-seq`` tools, where a specific memory size is allocated for ``java``.
34 | 
35 | We are working on removing the depndency to the ``Drop-seq`` tools altogether,
36 | which will also speed up several steps. If you run into memory errors, however, you
37 | can solve them by modifying the ``main.smk`` file inside your spacemake installation,
38 | which should be somewhere in
39 | 
40 | .. code-block:: console
41 | 
42 |     /path_to_conda/envs/spacemake/lib/python3.10/site-packages/spacemake/snakemake
43 | 
44 | inside your conda installation folder. Simply modify the following lines
45 | 
46 | .. code-block:: console
47 | 
48 |     {dropseq_tools}/BamTagHistogram -m 32g
49 |     {dropseq_tools}/DigitalExpression -m 16g
50 | 
51 | by increasing the value of ``-m`` accordingly.
52 | 
53 | 
54 | Issues with STAR
55 | ^^^^^^^^^^^^^^^^
56 | To reduce memory usage when running several samples at the same time, 
57 | spacemake uses STAR's shared memory capability. This currently has
58 | the following limitations:
59 | 
60 | 1. It is not possible for one user to run two distinct spacemake instances with the same genome index. Multiple spacemake instances (each processing several samples) can run at the same time if different species indexes are used.
61 | 2. Similarly, it is not possible that two users can run spacemake with the same genome index loaded at the same time.
62 | 
63 | In addition to the above, you might run into STAR-related errors if the spacemake
64 | instance was killed before finishing. This occurs when the genome index is still loaded
65 | into memory, and STAR will either throw an error and exit, or just stall. In that case,
66 | try to run:
67 | 
68 | .. code-block:: console
69 | 
70 |     STAR --genomeLoad Remove --genomeDir <genome_dir>
71 | 
72 | In case the shared memory cannot be released (Linux), try the following:
73 | 
74 | .. code-block:: console
75 | 
76 |     ipcs -m | grep `whoami` | awk '{ print $2 }' | xargs -n1 ipcrm -m
77 | 


--------------------------------------------------------------------------------
/docs/tutorials/.gitignore:
--------------------------------------------------------------------------------
1 | !*.png
2 | 


--------------------------------------------------------------------------------
/docs/tutorials/img/.gitignore:
--------------------------------------------------------------------------------
1 | !*.png
2 | !*.jpg
3 | 


--------------------------------------------------------------------------------
/docs/tutorials/img/manual_alignment_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rajewsky-lab/spacemake/274cd58e7ba9de244aca25c6bbd7cdfe49683753/docs/tutorials/img/manual_alignment_1.png


--------------------------------------------------------------------------------
/docs/tutorials/img/manual_alignment_2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rajewsky-lab/spacemake/274cd58e7ba9de244aca25c6bbd7cdfe49683753/docs/tutorials/img/manual_alignment_2.png


--------------------------------------------------------------------------------
/docs/tutorials/img/manual_alignment_3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rajewsky-lab/spacemake/274cd58e7ba9de244aca25c6bbd7cdfe49683753/docs/tutorials/img/manual_alignment_3.png


--------------------------------------------------------------------------------
/docs/tutorials/img/manual_alignment_4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rajewsky-lab/spacemake/274cd58e7ba9de244aca25c6bbd7cdfe49683753/docs/tutorials/img/manual_alignment_4.png


--------------------------------------------------------------------------------
/docs/tutorials/img/manual_alignment_5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rajewsky-lab/spacemake/274cd58e7ba9de244aca25c6bbd7cdfe49683753/docs/tutorials/img/manual_alignment_5.png


--------------------------------------------------------------------------------
/docs/tutorials/img/manual_alignment_6.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rajewsky-lab/spacemake/274cd58e7ba9de244aca25c6bbd7cdfe49683753/docs/tutorials/img/manual_alignment_6.png


--------------------------------------------------------------------------------
/docs/tutorials/img/test_longread.donuts.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rajewsky-lab/spacemake/274cd58e7ba9de244aca25c6bbd7cdfe49683753/docs/tutorials/img/test_longread.donuts.png


--------------------------------------------------------------------------------
/docs/tutorials/img/test_longread.hists.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rajewsky-lab/spacemake/274cd58e7ba9de244aca25c6bbd7cdfe49683753/docs/tutorials/img/test_longread.hists.png


--------------------------------------------------------------------------------
/docs/tutorials/img/test_longread.oligo_edits.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rajewsky-lab/spacemake/274cd58e7ba9de244aca25c6bbd7cdfe49683753/docs/tutorials/img/test_longread.oligo_edits.png


--------------------------------------------------------------------------------
/docs/tutorials/index.rst:
--------------------------------------------------------------------------------
 1 | Tutorials
 2 | =========
 3 | 
 4 | .. toctree::
 5 |     he_integration
 6 |     manual_he_integration
 7 |     process_single_cell_data
 8 |     longreads
 9 |     novosparc_integration
10 | 


--------------------------------------------------------------------------------
/docs/tutorials/manual_he_integration.rst:
--------------------------------------------------------------------------------
  1 | Manual H&E alignment
  2 | ====================
  3 | 
  4 | Before you begin
  5 | ----------------
  6 | 
  7 | Before start, make sure that you have installed spacemake as specified :ref:`here <installation>`
  8 | 
  9 | For the manual allignment we will use Fiji, an open-source image processing tool. Download it from `here <https://imagej.net/software/fiji/downloads>`_.
 10 | 
 11 | We will be using tile nr 2105 from `Seq-scope <https://www.sciencedirect.com/science/article/abs/pii/S0092867421006279>`_ for this tutorial. The corresponding H&E image is
 12 | `wt_4X_2.jpg <https://deepblue.lib.umich.edu/data/downloads/qv33rw833>`_.
 13 | 
 14 | Step 1 - generate an expression image
 15 | -------------------------------------
 16 | 
 17 | First using the command line we generate an aggregated expression image. In the directory of your spacemake project, type:
 18 | 
 19 | .. code-block:: console
 20 | 
 21 |     spacemake spatial create_aggregated_expression_img \
 22 |         --project_id seq_scope \
 23 |         --sample_id seq_scope_liver_2105 \
 24 |         --run_mode seq_scope \
 25 |         --processed_data False \
 26 |         --binary True \
 27 |         --out_img aggregated_seq_scope_2105_img_bw.png
 28 | 
 29 | This will generate a black and white image based on expression data.
 30 | 
 31 | Step 2 - load images into Fiji
 32 | ------------------------------
 33 | 
 34 | In the next step we load both images into Fiji like below:
 35 | 
 36 | .. image:: img/manual_alignment_1.png
 37 |     :width: 100%
 38 |     :alt: Manual alignment first step
 39 | 
 40 | Step 3 - select corresponding points
 41 | ------------------------------------
 42 | 
 43 | Next, using the *Multi-point Tool* we manually select corresponding points between our expression image and the H&E image. 
 44 | Select a point on one of the images, and then select a corresponding point on the other image. Do this for at least 4-5 corresponding points for a better match.
 45 | 
 46 | .. image:: img/manual_alignment_2.png
 47 |     :width: 100%
 48 |     :alt: Manual alignment second step
 49 | 
 50 | Step 4 - align the images
 51 | -------------------------
 52 | 
 53 | We then use the `Landmark Correspondences <https://imagej.net/plugins/landmark-correspondences>`_ plugin to align the two images based on the correspondencing points we
 54 | selected in the previous step. We go to *Plugins -> Transform -> Landmark Correspondences*:
 55 | 
 56 | .. image:: img/manual_alignment_3.png
 57 |     :width: 100%
 58 | 
 59 | In the pop-up window we select H&E image as the *source image* and expression image as the *template image*.
 60 | For the *transformation method* select *Moving Least Squares (non-linear)*. Set the *alpha* to *1.00* and the *mesh resolution* to *32*.
 61 | Set the *transformation class* to *Affine*.
 62 | 
 63 | .. image:: img/manual_alignment_4.png
 64 |     :width: 100%
 65 | 
 66 | After the transformation we have the two images aligned. We can now save our transformed H&E image (which is aligned with our spatial data).
 67 | 
 68 | .. image:: img/manual_alignment_5.png
 69 |     :width: 100%
 70 | 
 71 | 
 72 | Step 5 - attach the aligned image
 73 | ---------------------------------
 74 | 
 75 | First we load the spacemake processed Seq-scope tile nr 2105 data:
 76 | 
 77 | .. code-block:: ipython3
 78 | 
 79 |     from spacemake import Spacemake
 80 | 
 81 |     spmk = Spacemake('/path/to/your/spacemake/project')
 82 | 
 83 |     adata_2105 = spmk.load_processed_adata(
 84 |         project_id = 'seq_scope',
 85 |         sample_id = 'seq_scope_liver_2105',
 86 |         run_mode_name = 'seq_scope',
 87 |         umi_cutoff = 300
 88 |     )
 89 | 
 90 | Then we load the previously manually aligned image and attach it to our data:
 91 | 
 92 | .. code-block:: ipython3
 93 | 
 94 |     from spacemake.spatial.he_integration import attach_he_adata
 95 |     import cv2
 96 | 
 97 |     matched_he = cv2.imread('./Transformedwt_4X_2.tif')
 98 | 
 99 |     adata = attach_he_adata(adata_2105.copy(),
100 |                             matched_he,
101 |                             push_by_spot_diameter=False,
102 |                             raw_aligned=True)
103 | 
104 | After attachment, we can plot our expression data on top of the aligned H&E with `scanpy <https://github.com/theislab/scanpy>`_:
105 | 
106 | .. code-block:: ipython3
107 | 
108 |     import scanpy as sc
109 | 
110 |     sc.set_figure_params(dpi=300)
111 | 
112 |     sc.pl.spatial(adata, color='total_counts')
113 | 
114 | .. image:: img/manual_alignment_6.png
115 |     :width: 100%
116 | 
117 | 
118 | .. note::
119 |     
120 |     As axes in scanpy are flipped with respect to the axes in Fiji, because Fiji reads the image axes in different order.
121 | 


--------------------------------------------------------------------------------
/docs/tutorials/process_single_cell_data.rst:
--------------------------------------------------------------------------------
  1 | .. include:: ../links.rst
  2 | 
  3 | Processing a custom single-cell sample
  4 | ======================================
  5 | 
  6 | In this tutorial we will process a custom single cell sample. 
  7 | 
  8 | As an example we will be using 1 million reads from `this Visium dataset <https://www.10xgenomics.com/resources/datasets/mouse-brain-section-coronal-1-standard-1-0-0>`_.
  9 | 
 10 | .. note::
 11 |     
 12 |     Firstly, the example data used here is a 10X `Visium`_ dataset, hence it is spatial.
 13 |     However, for the sake of this tutorial, we will be treating it as a single-cell sample.
 14 |     
 15 |     Secondly, for many methods (such as `Visium`_, `10X Chromium`_ `Slide-seq`_ or `Seq-scope`_)
 16 |     spacemake provides pre-defined variables. If you are using
 17 |     one of these methods follow our :ref:`Quick start guide <quick start guide>` instead.
 18 | 
 19 | Step 1: install and initialize spacemake
 20 | -----------------------------------------
 21 | 
 22 | To install spacemake follow the :ref:`installation guide here <installation>`.
 23 | 
 24 | To initialize spacemake follow the :ref:`initialization guide here <initialization>`.
 25 | 
 26 | Step 2: download test data
 27 | --------------------------
 28 | 
 29 | For the sake of this tutorial we will work with a test dataset: 1 million Read1 and 1 million Read2 reads from a `Visium`_ adult mouse brain.
 30 | 
 31 | To download the test data:
 32 | 
 33 | .. code-block::
 34 | 
 35 |     wget -nv http://bimsbstatic.mdc-berlin.de/rajewsky/spacemake-test-data/visium/test_fastq/visium_public_lane_joined_1m_R1.fastq.gz
 36 |     wget -nv http://bimsbstatic.mdc-berlin.de/rajewsky/spacemake-test-data/visium/test_fastq/visium_public_lane_joined_1m_R2.fastq.gz
 37 | 
 38 | .. note:: 
 39 | 
 40 |     If there is already data available, to be processed and analyzed, this step can be omitted.
 41 | 
 42 | Step 3: add a new species
 43 | -------------------------
 44 | 
 45 | .. note::
 46 | 
 47 |     If you initialized spacemake with the ``--download-species`` flag, you can
 48 |     omit this step, as spacemake will automatically download and configure
 49 |     mm10 mouse genome.fa and annotation.gtf files for you.
 50 | 
 51 | The sample we are working with here is a mouse brain sample, so we have to add a new species:
 52 | 
 53 | .. code-block:: console
 54 | 
 55 |    spacemake config add_species --name mouse \
 56 |    --annotation /path/to/mouse/annotation.gtf \
 57 |    --genome /path/to/mouse/genome.fa
 58 | 
 59 | 
 60 | Step 4: add a new barcode\_flavor
 61 | ---------------------------------
 62 | 
 63 | The ``barcode_flavor`` will decide which nucletodies of Read1/Read2 extract the UMIs and cell-barcodes from.
 64 | 
 65 | In this perticular test sample, the first 16 nucleotides of Read1 are the cell-barcode, and the following 12 nucleotides are the UMIs.
 66 | 
 67 | Consequently, we create a new ``barcode_flavor`` like this:
 68 | 
 69 | .. code-block:: console
 70 | 
 71 |     spacemake config add_barcode_flavor --name test_barcode_flavor \
 72 |     --cell_barcode r1[0:16] \
 73 |     --umi r1[16:28]
 74 | 
 75 | .. note:: 
 76 | 
 77 |     There are several ``barcode_flavors`` provided by spacemake out of the box,
 78 |     such as ``visium`` for 10X `Visium`_ or ``sc_10x_v2`` for `10X Chromium`_ v2 
 79 |     kits. The ``default`` flavor is identical to a `Drop-seq`_ library, with 12
 80 |     nucleotide cell-barcode and 8 nucleotide UMI. 
 81 | 
 82 |     :ref:`More info about provided flavors here <provided barcode\\_flavors>`.
 83 | 
 84 |     If you want to use one of these, there is no need to add your own flavor.
 85 | 
 86 | Step 5: add a new run\_mode
 87 | ---------------------------
 88 | 
 89 | A ``run_mode`` in spacemake defines how a sample should processed downstream. 
 90 | In this tutorial, we will trim the PolyA stretches from the 3' end of Read2,
 91 | count both exonic and intronic reads, expect 5000 cells, and analyze the data,
 92 | turn off multi-mapper counting (so only unique reads are counted),
 93 | using 50, 100 and 300 UMI cutoffs. To set these parameters, we define a 
 94 | ``test_run_mode`` like this:
 95 | 
 96 | .. code-block:: console
 97 | 
 98 |     spacemake config add_run_mode --name test_run_mode \
 99 |     --polyA_adapter_trimming True \
100 |     --count_mm_reads False \
101 |     --n_beads 5000 \
102 |     --count_intronic_reads True \
103 |     --umi_cutoff 50 100 300
104 | 
105 | .. note:: 
106 | 
107 |     As with ``barcode_flavors``, spacemake provides several ``run_modes`` out
108 |     of the box. For more info :ref:`check out a more detailed guide here <configure run\\_modes>`.
109 | 
110 | Step 6: add the sample
111 | ----------------------
112 | 
113 | After configuring all the steps above, we are ready to add our (test) sample:
114 | 
115 | .. code-block:: console
116 | 
117 |     spacemake projects add_sample --project_id test_project \
118 |     --sample_id test_sample \
119 |     --R1 visium_public_lane_joined_1m_R1.fastq.gz \
120 |     --R2 visium_public_lane_joined_1m_R1.fastq.gz \
121 |     --species mouse \
122 |     --barcode_flavor test_barcode_flavor \
123 |     --run_mode test_run_mode
124 | 
125 | .. note::
126 | 
127 |     If there is already data available, here the Read1 and Read2 ``.fastq.gz`` files should be added,
128 |     instead of the test files.
129 | 
130 | Step 7: runn spacemake
131 | ----------------------
132 | 
133 | Now we can process our samples with spacemake. Since we added only one sample, only one sample will be processed
134 | and analyzed. To start spacemake, simply write:
135 | 
136 | .. code-block:: console
137 |     
138 |     spacemake run --cores 16
139 | 
140 | .. note::
141 |     
142 |     The number of cores used should be suited for the machine on which spacemake is ran.
143 |     When processing more than one samle, we recommend using spacemake with at least 8 cores.
144 |     In order to achieve maximum parallelism.
145 | 
146 | Step 8: results 
147 | ---------------
148 | 
149 | The results of the analysis for this sample will be under ``projects/test_project/processed_data/test_sample/illumina/complete_data/``
150 | 
151 | Under this directory, there are several files and directories which are important:
152 | 
153 | * ``final.polyA_adapter_trimmed.bam``: final, mapped, tagged ``.bam`` file. ``CB`` tag contains the cell barcode, and the ``MI`` contains the UMI-s. 
154 | 
155 | * ``qc_sheet_test_sample_no_spatial_data.html``: the QC-sheet for this sample, as a self-contained ``.html`` file.
156 | 
157 | * ``dge/``: a directory containing the Digital Expression Matrices (DGEs)
158 | 
159 |     * ``dge.all.polyA_adapter_trimmed.5000_beads.txt.gz``: a compressed, text based DGE
160 | 
161 |     * ``dge.all.polyA_adapter_trimmed.5000_beads.h5ad``: the same DGE but stored in ``.h5ad`` format (`used by the anndata python package <https://github.com/theislab/anndata/issues/180>`_). This matrix is stored as a Compressed Sparse Column matrix (using `scipy.sparse.csc_matrix <https://docs.scipy.org/doc/scipy/reference/generated/scipy.sparse.csc_matrix.html>`_).
162 | 
163 |     * ``dge.all.polyA_adapter_trimmed.5000_beads.summary.txt``: the summary of the DGE, one line per cell.
164 | 
165 |     * ``dge.all.polyA_adapter_trimmed.5000_beads.obs.csv``: the observation table of the matrix. Similar to the previous file, more detailed.
166 | 
167 | * ``automated_analysis/test_run_mode/umi_cutoff_50/``: In this directory the results of the automated analysis can be found. As it can be seen under the ``automated_analysis`` directory there are two further levels, one for ``run_mode`` and one for ``umi_cutoff``. This is because one sample can have several ``run_modes`` and in the same way one ``run_mode`` can have several UMI cutoffs.
168 | 
169 |     * ``results.h5ad``: the result of the automated analysis, stored in an anndata object. Same as the DGE before, but containing processed data.
170 | 
171 |     * ``test_sample_no_spatial_data_illumina_automated_report.html``: automated analysis self-contained ``.html`` report.
172 | 
173 | .. note::
174 | 
175 |     If the ``test_project`` had more samples, than those would be automatically placed under ``projects/test_project``. Similarily, under one spacemake
176 |     directory there can be several projects in parallel, and each will have their own directory structure under the ``projects/`` folder.
177 | 
178 | 


--------------------------------------------------------------------------------
/environment.yaml:
--------------------------------------------------------------------------------
 1 | name: spacemake
 2 | channels:
 3 |     - bih-cubi
 4 |     - conda-forge
 5 |     - bioconda
 6 |     - nodefaults
 7 | dependencies:
 8 |     - python>=3.6,<3.12
 9 |     - snakemake>=5.32.0,<6.4.0
10 |     - star>=2.7.1a
11 |     - samtools>=1.13
12 |     - sambamba>=0.6.8
13 |     - bowtie2>=2.3.4
14 |     - bcl2fastq2>=2.19
15 |     - fastqc>=0.11.9
16 |     - pip>=21.1
17 |     - r-base>=4.0.3
18 |     - r-rmarkdown>=2.7
19 |     - r-tidyverse>=1.3.1
20 |     - r-kableextra>=1.3.4
21 |     - r-cowplot>=1.1.1
22 |     - r-pals>=1.7
23 |     - r-hexbin
24 |     - r-scales
25 |     - pysam>=0.16.0.1
26 |     - pot
27 |     - openjdk==11.0.15
28 |     - pigz
29 |     - pip:
30 |         - setproctitle
31 |         - isal
32 |         - pytest
33 |         - pytest-cov
34 |         - mrfifo>=0.3.0
35 |         - pandas>2
36 |         - scanpy>=1.8.1
37 |         - leidenalg>=0.8.1
38 |         - numpy>=1.18.1
39 |         - more-itertools>=8.7.0
40 |         - biopython>=1.78
41 |         - scipy>=1.5.0
42 |         - scikit-misc>=0.1.3
43 |         - scikit-learn>=0.23.1
44 |         - squidpy>=1.0.0
45 |         - novosparc
46 |         - opencv-python
47 |         - jinja2>=3.1.3
48 |         - matplotlib==3.8.4
49 | #        - pytest-optional-tests
50 | 


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = [
3 |     "setuptools>=42",
4 |     "wheel"
5 | ]
6 | build-backend = "setuptools.build_meta"


--------------------------------------------------------------------------------
/sequences/.gitignore:
--------------------------------------------------------------------------------
1 | primers.fa.nhr
2 | primers.fa.nin
3 | primers.fa.nog
4 | primers.fa.nsd
5 | primers.fa.nsi
6 | primers.fa.nsq
7 | 


--------------------------------------------------------------------------------
/sequences/primers.fa:
--------------------------------------------------------------------------------
 1 | >dropseq_template_switch_oligo_tso
 2 | AAGCAGTGGTATCAACGCAGAGTGAATG
 3 | >second_strand_synthesis_oligo_dn_smrt
 4 | AAGCAGTGGTATCAACGCAGAGTGANNNGGNNNB
 5 | >smart_pcr_primer
 6 | AAGCAGTGGTATCAACGCAGAGT
 7 | >new_p5_smart_pcr_hybrid_oligo
 8 | AATGATACGGCGACCACCGAGATCTACACGCCTGTCCGCGGAAGCAGTGGTATCAACGCAGAGT
 9 | >nextera_n701_oligo
10 | CAAGCAGAAGACGGCATACGAGATTCGCCTTAGTCTCGTGGGCTCGG
11 | >next_tn5_rev_primer
12 | GTCTCGTGGGCTCGGAGAT
13 | >imaging_primer
14 | GAATCACGATACGTACACCA
15 | 


--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
 1 | [metadata]
 2 | name = spacemake
 3 | version = attr: spacemake.contrib.__version__
 4 | author = Tamas Ryszard Sztanka-Toth, Marvin Jens, Nikos Karaiskos, Nikolaus Rajewsky 
 5 | author_email = TamasRyszard.Sztanka-Toth@mdc-berlin.de
 6 | description = A bioinformatic pipeline for the analysis of spatial transcriptomic data
 7 | long_description = file: README.md
 8 | long_description_content_type = text/markdown
 9 | url = https://github.com/rajewsky-lab/spacemake
10 | project_urls =
11 |     Bug Tracker = https://github.com/rajewsky-lab/spacemake/issues
12 | classifiers =
13 |     Programming Language :: Python :: 3
14 |     License :: OSI Approved :: GNU General Public License v2 or later (GPLv2+)
15 |     Operating System :: OS Independent
16 | license = GPL
17 | 
18 | [options]
19 | zip_safe = False
20 | python_requires = >=3.8
21 | include_package_data = True
22 | package_dir =
23 |     spacemake = spacemake
24 | packages = spacemake
25 | 
26 | [options.package_data]
27 | spacemake = 
28 |     snakemake/*.smk
29 |     snakemake/scripts/*.R
30 |     snakemake/scripts/*.Rmd
31 |     snakemake/scripts/*.py
32 |     data/*.csv
33 |     data/*.fa
34 |     config/*.yaml
35 |     longread/*.py
36 | 
37 | [options.entry_points]
38 | console_scripts = 
39 |     alnstats = spacemake.alnstats:cmdline
40 |     preprocess = spacemake.preprocess:cmdline
41 |     spacemake = spacemake.cmdline:cmdline
42 |     pb_annotate = spacemake.longread.cmdline:cmdline
43 | 
44 | [tool:pytest]
45 | testpaths = tests
46 | markers = 
47 |     big_download: needs to download large-ish files
48 | addopts = --cov=spacemake --cov-report html
49 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | if __name__ == "__main__":
2 |     from setuptools import setup
3 | 
4 |     setup()
5 | 


--------------------------------------------------------------------------------
/spacemake/.gitignore:
--------------------------------------------------------------------------------
1 | __pycache__
2 | 


--------------------------------------------------------------------------------
/spacemake/__init__.py:
--------------------------------------------------------------------------------
1 | # __version__ = 1.0
2 | # import matplotlib._path
3 | # from . import preprocess as pp
4 | # from . import spatial as sp
5 | 
6 | # from .smk import Spacemake
7 | 


--------------------------------------------------------------------------------
/spacemake/bin/BamTagHistogram.py:
--------------------------------------------------------------------------------
  1 | import mrfifo as mf
  2 | import logging
  3 | 
  4 | 
  5 | def parse_args():
  6 |     from spacemake.util import make_minimal_parser
  7 | 
  8 |     parser = make_minimal_parser("BamTagHistogram")
  9 | 
 10 |     parser.add_argument("--parallel", type=int, default=8)
 11 |     parser.add_argument("--input", default="/dev/stdin")
 12 |     parser.add_argument("--output", default="/dev/stdout")
 13 |     parser.add_argument(
 14 |         "--prefix-size",
 15 |         default=4,
 16 |         type=int,
 17 |         help=(
 18 |             "how many letters of the tag value are used to split the stream. "
 19 |             "default=4 allows for up to (alphabet_size)^4 distinct parallel workers. "
 20 |             "will be spread across workers by mod <args.parallel>"
 21 |         ),
 22 |     )
 23 |     parser.add_argument("--prefix-alphabet", default="ACGTN")
 24 |     parser.add_argument("--min-count", default=10, type=int)
 25 |     parser.add_argument(
 26 |         "--sort-mem",
 27 |         default=8,
 28 |         type=int,
 29 |         help="how many GB are allowed to be used for sorting (default=8)",
 30 |     )
 31 |     parser.add_argument(
 32 |         "--tag", default="CB", help="which BAM tag to count (default='CB')"
 33 |     )
 34 | 
 35 |     return parser.parse_args()
 36 | 
 37 | 
 38 | def CB_distributor(
 39 |     input, outputs, tag="CB", prefix_size=3, prefix_alphabet="ACGTN", n=8, **kw
 40 | ):
 41 |     "ensure that the FIFOs are not managed"
 42 |     assert type(input) is str
 43 |     logger = logging.getLogger("mrfifo.parts.CB_distributor")
 44 |     logger.debug(
 45 |         f"reading from {input}, writing to {outputs} "
 46 |         f"tag={tag} prefix_size={prefix_size} prefix_alphabet={prefix_alphabet} "
 47 |         f"kw={kw}"
 48 |     )
 49 | 
 50 |     lkup = {}
 51 |     from itertools import product
 52 | 
 53 |     i = 0
 54 |     for letters in product(*([prefix_alphabet] * prefix_size)):
 55 |         prefix = "".join(letters).encode("ascii")
 56 |         lkup[prefix] = i % n
 57 |         i += 1
 58 | 
 59 |     # for k, v in sorted(lkup.items()):
 60 |     #     print(f"{k}\t{v}")
 61 | 
 62 |     from mrfifo.fast_loops import distribute_by_substr
 63 | 
 64 |     tag_lead = b"\t" + tag.encode("ascii") + b":Z:"
 65 |     logger.debug(
 66 |         f"scanning for tag-lead {tag_lead} and using next {prefix_size} bytes as prefix"
 67 |     )
 68 |     res = distribute_by_substr(
 69 |         fin_name=input,
 70 |         fifo_names=outputs,
 71 |         sub_lookup=lkup,
 72 |         sub_size=prefix_size,
 73 |         sub_lead=tag_lead,
 74 |         # **kw,
 75 |     )
 76 |     logger.debug("distribution complete")
 77 |     return res
 78 | 
 79 | 
 80 | def tag_counter(input, output, tag="CB", min_count=10):
 81 |     from collections import defaultdict
 82 | 
 83 |     counter = defaultdict(int)
 84 |     stats = defaultdict(int)
 85 |     import re
 86 | 
 87 |     pattern = re.compile(f"{tag}:Z:(\S+)")
 88 |     for sam_line in input:
 89 |         stats["n_records"] += 1
 90 |         flags = int(sam_line.split("\t")[1])
 91 |         if flags & 256:
 92 |             # 'not primary alignment' bit is set
 93 |             stats["n_secondary"] += 1
 94 |             continue
 95 | 
 96 |         if m := re.search(pattern, sam_line):
 97 |             stats["n_tagged"] += 1
 98 |             tag_val = m.groups(0)[0]
 99 |             counter[tag_val] += 1
100 | 
101 |     stats["n_values"] = len(counter)
102 |     for value, count in counter.items():
103 |         if count >= min_count:
104 |             stats["n_above_cut"] += 1
105 |             output.write(f"{count}\t{value}\n")
106 | 
107 |     return stats
108 | 
109 | 
110 | def sort_function(input, output, n=8, sort_mem_gigs=8, header=None):
111 |     import os
112 | 
113 |     if header is None:
114 |         header = rf"# INPUT={args.input} TAG={args.tag} FILTER_PCR_DUPLICATES=false READ_QUALITY=0\n"
115 | 
116 |     if output.endswith(".gz"):
117 |         cmd = (
118 |             f'{{ printf "{header}"; sort -rnk 1 -S {sort_mem_gigs}G --parallel={n} {input}; }}'
119 |             f"| python -m isal.igzip -c > {output}"
120 |         )
121 |     else:
122 |         cmd = f'{{ printf "{header}"; sort -rnk 1 -S {sort_mem_gigs}G --parallel={n} {input}; }} > {output}'
123 | 
124 |     import subprocess
125 | 
126 |     subprocess.call(cmd, shell=True)
127 | 
128 | 
129 | def main(args):
130 |     w = (
131 |         mf.Workflow("BamTagHistogram", total_pipe_buffer_MB=4)
132 |         .BAM_reader(
133 |             input=args.input,
134 |             mode="S",
135 |             threads=4,
136 |         )
137 |         .distribute(
138 |             input=mf.FIFO("input_sam", "rt"),
139 |             outputs=mf.FIFO("dist_{n}", "wt", n=args.parallel),
140 |             func=CB_distributor,
141 |             tag=args.tag,
142 |             prefix_size=args.prefix_size,
143 |             prefix_alphabet=args.prefix_alphabet,
144 |             n=args.parallel,
145 |         )
146 |         .workers(
147 |             func=tag_counter,
148 |             tag=args.tag,
149 |             input=mf.FIFO("dist_{n}", "rt"),
150 |             output=mf.FIFO("counts_{n}", "wt"),
151 |             n=args.parallel,
152 |             min_count=args.min_count,
153 |         )
154 |         .collect(
155 |             inputs=mf.FIFO("counts_{n}", "rt", n=args.parallel),
156 |             output=mf.FIFO("unsorted", "wt"),
157 |             chunk_size=1,
158 |         )
159 |         .funnel(
160 |             input=mf.FIFO("unsorted", "rt"),
161 |             output=args.output,
162 |             func=sort_function,
163 |             _manage_fifos=False,
164 |         )
165 |         .run()
166 |     )
167 |     stats = mf.util.CountDict()
168 |     for jobname, d in w.result_dict.items():
169 |         if "worker" in jobname:
170 |             stats.add_other_stats(d)
171 | 
172 |     df = stats.get_stats_df()
173 |     df["input"] = args.input
174 |     print(df.set_index("input"))
175 |     return w
176 | 
177 | 
178 | if __name__ == "__main__":
179 |     args = parse_args()
180 |     import spacemake.util as util
181 | 
182 |     util.setup_logging(args)
183 |     main(args)
184 | 


--------------------------------------------------------------------------------
/spacemake/contrib.py:
--------------------------------------------------------------------------------
 1 | import importlib.metadata
 2 | __version__ = "0.8.1"
 3 | __author__ = ["Nikos Karaiskos", "Tamas Ryszard Sztanka-Toth",
 4 |               "Marvin Jens", "Daniel Leon-Perinan"]
 5 | __license__ = "GPL"
 6 | __email__ = [
 7 |     "nikolaos.karaiskos@mdc-berlin.de",
 8 |     "tamasryszard.sztanka-toth@mdc-berlin.de",
 9 |     "marvin.jens@charite.de",
10 |     "daniel.leonperinan@mdc-berlin.de"
11 | ]
12 | 
13 | author_contributions = """
14 | Spacemake is built on snakemake scripts originally developed by Nikos Karaiskos
15 | for the analysis of dropseq data. These gradually evolved into a robust workflow for
16 | spatial transcriptomics data analysis that was improved and generalized to work
17 | with different ST technologies by Tamas Ryszard Sztanka-Toth. Marvin Jens contributed
18 | longread analysis code and support for converting fastq to BAM as a first step.
19 | Many features of the automated analysis and integration with Novosparc were added by
20 | Tamas, in close collaboration with Nikos, culminating in the first spacemake
21 | publication:
22 | 
23 |     https://doi.org/10.1093/gigascience/giac064
24 | 
25 | Marvin then added new building blocks to successively replace the java-based 
26 | dropseq tools with python/pysam based code: cutadapt_bam.py, annotator.py, as well
27 | as the ability to align raw reads to multiple indices, in close collaboration
28 | with Nikos & Tamas.
29 | 
30 | Spacemake is actively maintained by Dani, Marvin and Nikos.
31 | """
32 | 
33 | roadmap = [
34 |     ("0.5.5", "universal ST support and utility, novosparc integration. Sztanka-Toth et al. 2022"),
35 |     ("0.7", "support multiple mapping indices, bulk samples, custom user-defined snakemake rules"),
36 |     ("1.x", "replace dropseq tools. Own annotator and towards entirely scanpy workflow"),
37 |     ("1.x", "efficient handling of 1E8+ spatial barcodes (seq-scope etc.)"),
38 |     ("1.x", "add interactive data exploration support (shiny?)"),
39 |     ("2.x", "cmdline interface cleanup and remote API support"),
40 |     ("2.x", "cython magic to speed up parallel BAM processing via shared memory"),
41 | ]
42 | 


--------------------------------------------------------------------------------
/spacemake/data/.gitignore:
--------------------------------------------------------------------------------
1 | !*
2 | 


--------------------------------------------------------------------------------
/spacemake/data/config/config.yaml:
--------------------------------------------------------------------------------
  1 | puck_data:
  2 |     barcode_file: 'predictions_ml.csv'
  3 |     root: 'puck_data'
  4 | 
  5 | pucks:
  6 |     default:
  7 |         width_um: 3000
  8 |         spot_diameter_um: 10
  9 |         coordinate_system: ''
 10 |     visium:
 11 |         barcodes: 'puck_data/visium_barcode_positions.csv'
 12 |         width_um: 6500
 13 |         spot_diameter_um: 55
 14 |     seq_scope:
 15 |         width_um: 1000
 16 |         spot_diameter_um: 1
 17 |     slide_seq:
 18 |         width_um: 3000
 19 |         spot_diameter_um: 10
 20 |     openst:
 21 |         width_um: 1200
 22 |         spot_diameter_um: 0.6
 23 |         coordinate_system: 'puck_data/openst_coordinate_system.csv'
 24 | 
 25 | run_modes:
 26 |     default:
 27 |         n_beads: 100000
 28 |         umi_cutoff: [100, 300, 500]
 29 |         clean_dge: False
 30 |         detect_tissue: False
 31 |         polyA_adapter_trimming: True
 32 |         count_intronic_reads: True
 33 |         count_mm_reads: False
 34 |         mesh_data: False
 35 |         mesh_type: 'circle'
 36 |         mesh_spot_diameter_um: 55
 37 |         mesh_spot_distance_um: 100
 38 |         spatial_barcode_min_matches: 0
 39 |     visium:
 40 |         n_beads: 10000
 41 |         umi_cutoff: [1000]
 42 |         clean_dge: False
 43 |         detect_tissue: True
 44 |         count_intronic_reads: False
 45 |         count_mm_reads: True
 46 |     slide_seq:
 47 |         n_beads: 100000
 48 |         umi_cutoff: [50]
 49 |         clean_dge: False
 50 |         detect_tissue: False
 51 |     scRNA_seq:
 52 |         n_beads: 10000
 53 |         umi_cutoff: [500]
 54 |         detect_tissue: False
 55 |         count_intronic_reads: True
 56 |         count_mm_reads: False
 57 |     seq_scope:
 58 |         clean_dge: false
 59 |         count_intronic_reads: false
 60 |         count_mm_reads: false
 61 |         detect_tissue: false
 62 |         mesh_data: true
 63 |         mesh_spot_diameter_um: 10
 64 |         mesh_spot_distance_um: 15
 65 |         mesh_type: hexagon
 66 |         n_beads: 1000
 67 |         umi_cutoff:
 68 |         - 100
 69 |         - 300
 70 |     openst:
 71 |         clean_dge: false
 72 |         count_intronic_reads: true
 73 |         count_mm_reads: true
 74 |         detect_tissue: false
 75 |         mesh_data: true
 76 |         mesh_spot_diameter_um: 7
 77 |         mesh_spot_distance_um: 7
 78 |         mesh_type: hexagon
 79 |         n_beads: 100000
 80 |         polyA_adapter_trimming: true
 81 |         spatial_barcode_min_matches: 0.1
 82 |         umi_cutoff:
 83 |         - 100
 84 |         - 250
 85 |         - 500
 86 |     
 87 | 
 88 | barcode_flavors:
 89 |     default:
 90 |         cell: "r1[0:12]"
 91 |         UMI: "r1[12:20]"
 92 |         #bam_tags: "CR:{cell},CB:{cell},MI:{UMI},RG:{assigned}"
 93 |     dropseq:
 94 |         cell: "r1[0:12]"
 95 |         UMI: "r1[12:20]"
 96 |         #bam_tags: "CR:{cell},CB:{cell},MI:{UMI},RG:{assigned}"
 97 |     slide_seq_14bc:
 98 |         cell: "r1[0:14]"
 99 |         UMI: "r1[14:23]"
100 |         #bam_tags: "CR:{cell},CB:{cell},MI:{UMI},RG:{assigned}"
101 |     slide_seq_15bc:
102 |         cell: "r1[0:14]"
103 |         UMI: "r1[15:23]"
104 |         #bam_tags: "CR:{cell},CB:{cell},MI:{UMI},RG:{assigned}"
105 |     visium:
106 |         cell: "r1[0:16]"
107 |         UMI: "r1[16:28]"
108 |         #bam_tags: "CR:{cell},CB:{cell},MI:{UMI},RG:{assigned}"
109 |     sc_10x_v2:
110 |         cell: "r1[0:16]"
111 |         UMI: "r1[16:26]"
112 |         #bam_tags: "CR:{cell},CB:{cell},MI:{UMI},RG:{assigned}"
113 |     seq_scope:
114 |         UMI: "r2[0:9]"
115 |         #bam_tags: "CR:{cell},CB:{cell},MI:{UMI},RG:{assigned}"
116 |         cell: "r1[0:20]"
117 |     openst:
118 |         UMI: "r2[0:9]"
119 |         #bam_tags: "CR:{cell},CB:{cell},MI:{UMI},RG:{assigned}"
120 |         cell: "r1[2:27]"
121 | 
122 | adapter_flavors:
123 |   SMARTER:
124 |     cut_right:
125 |     - Q:
126 |         min_base_quality: 30
127 |     - polyA:
128 |         max_error: 0.25
129 |         min_overlap: 3
130 |     paired_end: replace_N
131 |   chromium:
132 |     cut_right:
133 |     - Q:
134 |         min_base_quality: 32
135 |     - polyA:
136 |         max_error: 0.25
137 |         min_overlap: 3
138 |     - polyG:
139 |         max_error: 0.1
140 |         min_overlap: 3
141 |   default:
142 |     cut_left:
143 |     - TSO_SMART:
144 |         max_error: 0.1
145 |         min_overlap: 10
146 |     cut_right:
147 |     - Q:
148 |         min_base_quality: 30
149 |     - polyA:
150 |         max_error: 0.25
151 |         min_overlap: 3
152 |     - polyG:
153 |         max_error: 0.1
154 |         min_overlap: 3
155 |     paired_end: single-end
156 |   dropseq:
157 |     cut_left:
158 |     - TSO_SMART:
159 |         max_errors: 0.1
160 |         min_overlap: 10
161 |     cut_right:
162 |     - Q:
163 |         min_base_quality: 30
164 |     - polyA:
165 |         max_errors: 0.25
166 |         min_overlap: 3
167 |     - polyG:
168 |         max_errors: 0.1
169 |         min_overlap: 3
170 |     paired_end: single-end
171 |   fc_SMART_UMI_RPE:
172 |     cut_left:
173 |     - TSO_SMART:
174 |         max_errors: 0.1
175 |         min_overlap: 10
176 |     cut_right:
177 |     - Q:
178 |         min_base_quality: 32
179 |     - polyG:
180 |         max_errors: 0.25
181 |         min_overlap: 3
182 |     - Q:
183 |         min_base_quality: 32
184 |     - polyA:
185 |         max_errors: 0.25
186 |         min_overlap: 3
187 | 
188 | adapters:
189 |     smart: 'AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTC'
190 |     optical_primer: 'GAATCACGATACGTACACCA'
191 |     TSO_SMART: AAGCAGTGGTATCAACGCAGAGTGAATGGG
192 |     SMART: AAGCAGTGGTATCAACGCAGAGTG
193 |     TSO_10x: AAGCAGTGGTATCAACGCAGAGTACATGGG
194 |     chromium_bead: CTACACGACGCTCTTCCGATCT
195 |     dropseq_bead: AAGCAGTGGTATCAACGCAGAGTAC
196 |     polyA: AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
197 |     polyG: GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG
198 |     nextflex_RA3: TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACTGACCAATCTCGTATGCCGTCTTCTGCTTGAA
199 |     truseq_RA3: TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACCCGTCCA
200 | 


--------------------------------------------------------------------------------
/spacemake/data/config/longread.yaml:
--------------------------------------------------------------------------------
  1 | blocks:
  2 |     P5: AATGATACGGCGACCACCGAGATCTACACGCCTGTCCGCGG
  3 |     N70X: CTGTCTCTTATACACATCTCCGAGCCCACGAGACNNNNNNNNATCTCGTATGCCGTCTTCTGCTTG
  4 |     SMART_primer: AAGCAGTGGTATCAACGCAGAGT
  5 |     SMART_bead: AAGCAGTGGTATCAACGCAGAGTAC
  6 |     dN-SMRT: AAGCAGTGGTATCAACGCAGAGTGA
  7 |     TSO: AAGCAGTGGTATCAACGCAGAGTGAATGGG
  8 |     sc_primer: CTCGGAGATGTGTATAAGAGACAGTATGGG
  9 |     # random_primer: GTCTCGTGGGCTCGGAGATGTGTATAAGAGACAG
 10 |     OP1: GAATCACGATACGTACACCA
 11 |     OP2_2s: GCGTTGCGTTCCTAGCCGCTAC
 12 |     # OP3: CGCAGTCTCCGTCGATAAGGTC
 13 |     OP2: GCGTGTGGTCGGACGCACCCAC
 14 |     OP3: GCAAAGCTGCTGCCTCCGCTAGC
 15 |     polyT: TTTTTTTTTTTTTTTTTTTTTTTTTTTTTT
 16 |     #dN-NEXT_Tn5: GTCTCGTGGGCTCGGAGATGTGTATAAGAGACAG
 17 |     #NEXT_Tn5_Rev_Primer: GTCTCGTGGGCTCGGAGAT
 18 |     #Nextera_TN5: CCGAGCCCACGAGACTAAGGCGAATCTCGTATGCCGTCTTCTGCTTG
 19 |     10X_start: CTACACGACGCTCTTCCGATCT
 20 |     10X_TSO: AAGCAGTGGTATCAACGCAGAGTACATGGG
 21 |     10X_C3_RT_PRIMER: AAGCAGTGGTATCAACGCAGAG
 22 |     10X_C2_RT_PRIMER: AAGCAGTGGTATCAACGCAGAGTACAT
 23 | 
 24 | signatures:
 25 |     visium:
 26 |         label: visium
 27 |         prio: 0
 28 |         color: pink
 29 |         intact: 10X_start,polyT,10X_TSO_RC
 30 |         other: 10X_C3_RT_PRIMER,10X_C2_RT_PRIMER
 31 |         prefixes: P5
 32 |         suffixes: N70X
 33 |         CB: r1[0:16]
 34 |         UMI: r1[16:28]
 35 |         cDNA_after: polyT
 36 |         read1_primer: 10X_start
 37 |         read2_primer: 10X_TSO
 38 | 
 39 |     chromium:
 40 |         label: chromium
 41 |         prio: 1
 42 |         color: gray
 43 |         CB: r1[0:16]
 44 |         UMI: r1[16:26]
 45 |         intact: 10X_start,polyT,10X_TSO_RC
 46 |         other: 10X_C3_RT_PRIMER,10X_C2_RT_PRIMER
 47 |         cDNA_after: polyT
 48 |         prefixes: P5
 49 |         suffixes: N70X
 50 |         read1_primer: 10X_start
 51 |         read2_primer: 10X_TSO
 52 | 
 53 |     dropseq:
 54 |         label: dropseq
 55 |         prio: 2
 56 |         color: gray
 57 |         CB: r1[8:20]
 58 |         UMI: r1[0:8]
 59 |         intact: SMART_bead,polyT
 60 |         cDNA_after: polyT
 61 |         other: SMART_primer,dN-SMRT,TSO,sc_primer
 62 |         prefixes: P5
 63 |         suffixes: N70X
 64 |         read1_primer: SMART_bead
 65 |         read2_primer: N70X
 66 | 
 67 |     # in-house experimental
 68 |     withUMI:
 69 |         label: withUMI
 70 |         prio: 3
 71 |         color: blue
 72 |         CB: r1[0:12]
 73 |         UMI: r1[12:20]
 74 |         intact: SMART_bead,OP1,polyT
 75 |         other: SMART_primer,dN-SMRT,TSO,sc_primer
 76 |         prefixes: P5
 77 |         suffixes: N70X
 78 |         cDNA_after: polyT
 79 |         read1_primer: SMART_bead
 80 |         read2_primer: N70X
 81 | 
 82 |     noUMI:
 83 |         label: noUMI
 84 |         prio: 10
 85 |         color: lightblue
 86 |         CB: r1[0:12]
 87 |         UMI: r2[0:8]
 88 |         intact: SMART_bead,OP1,polyT
 89 |         other: SMART_primer,dN-SMRT,TSO,sc_primer
 90 |         prefixes: P5
 91 |         suffixes: N70X
 92 |         cDNA_after: polyT
 93 |         read1_primer: SMART_bead
 94 |         read2_primer: N70X
 95 | 
 96 |     combv1:
 97 |         label: comb_2seg_pilot
 98 |         prio: 20
 99 |         color: blue
100 |         CB: r1[0:12]
101 |         UMI: r2[0:8]
102 |         intact_bead: SMART_bead,OP1,OP2_2s,polyT
103 |         other: SMART_primer,dN-SMRT,TSO,sc_primer,OP3
104 |         cDNA_after: polyT
105 |         read1_primer: SMART_bead
106 |         read2_primer: N70X
107 | 
108 |     hybridv1:
109 |         label: comb_hybrid
110 |         prio: 30
111 |         color: lightblue
112 |         CB: r1[0:8]+r1[31:39]
113 |         UMI: r2[0:8]
114 |         intact: SMART_bead,OP2,OP3,polyT
115 |         other: SMART_primer,dN-SMRT,TSO,sc_primer,OP3
116 |         prefixes: P5
117 |         suffixes: N70X
118 |         cDNA_after: polyT
119 |         read1_primer: SMART_bead
120 |         read2_primer: N70X
121 | 
122 |     scsmrna:
123 |         label: smallRNA
124 |         prio: 40
125 |         color: red
126 |         CB: r1[0:12]
127 |         UMI: r1[12:20]
128 |         intact: SMART_bead,polyT,sc_primer_RC
129 |         other: SMART_primer,dN-SMRT,TSO,sc_primer,OP1
130 |         prefixes: P5
131 |         suffixes: N70X
132 |         cDNA_after: polyT
133 |         read1_primer: SMART_bead
134 |         read2_primer: TSO
135 | 
136 | default: withUMI


--------------------------------------------------------------------------------
/spacemake/data/config/species_data_url.yaml:
--------------------------------------------------------------------------------
1 | mouse:
2 |     annotation: 'http://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_mouse/release_M27/gencode.vM27.primary_assembly.annotation.gtf.gz'
3 |     genome: 'http://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_mouse/release_M27/GRCm39.primary_assembly.genome.fa.gz'
4 | human:
5 |     annotation: 'http://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_38/gencode.v38.primary_assembly.annotation.gtf.gz'
6 |     genome: 'http://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_38/GRCh38.primary_assembly.genome.fa.gz'
7 | 


--------------------------------------------------------------------------------
/spacemake/data/puck_collection/create_novaseq_S4_coordinate_system.py:
--------------------------------------------------------------------------------
  1 | import argparse
  2 | import pandas as pd
  3 | 
  4 | """
  5 | Global Coordinate System Generator for NovaSeq S4 Flow Cell
  6 | 
  7 | This Python script is designed to create a global coordinate system 
  8 | for a NovaSeq S4 flow cell. 
  9 | 
 10 | It generates a DataFrame with puck names and their corresponding global 
 11 | (x, y, z) coordinates and saves it to a CSV file.
 12 | 
 13 | Usage:
 14 |     python create_openst_coordinate_system.py --output <output_file> [options]
 15 | 
 16 | Example:
 17 |     python create_openst_coordinate_system.py \
 18 |         --output output.csv \
 19 |         --format-string fc_1_L{lane}{side_letter}_tile_{side_number}{column}{row:02d} \
 20 |         --x-offset 33739 \
 21 |         --y-offset 36282 \
 22 |         --swath-offset-odd 0 \
 23 |         --swath-offset-even 6201 \
 24 |         --rows 78 \
 25 |         --columns 6 \
 26 |         --n_lanes 4 \
 27 |         --zero-coded
 28 | 
 29 | Author:
 30 |     Daniel León-Periñán
 31 | """
 32 | 
 33 | def setup_parser(parser):
 34 |     parser.add_argument(
 35 |         "--output",
 36 |         type=str,
 37 |         help="where to store the output file with puck names and global (x,y,z) coordinates",
 38 |         required=True,
 39 |     )
 40 | 
 41 |     parser.add_argument(
 42 |         "--format-string",
 43 |         type=str,
 44 |         help="this the format for puck names. There are 4 attributes that can be chosen:"
 45 |         + "\{lane\} (int), \{column\} (int), \{row\} (int), \{side_letter\} (str), \{side_number\} (int).\n"
 46 |         + "For instance, a valid string format would be: \n"
 47 |         + "fc_1_L{lane}{side_letter}_tile_{side_number}{column}{row:02d}\n"
 48 |         + "This name must be used, as is, when creating a new sample in spacemake.",
 49 |         default="L{lane}{side_letter}_tile_{side_number}{column}{row:02d}",
 50 |     )
 51 | 
 52 |     parser.add_argument(
 53 |         "--x-offset",
 54 |         type=int,
 55 |         help="the offset in the x axis. Units are important during puck collection generation.",
 56 |         default=33739,
 57 |     )
 58 | 
 59 |     parser.add_argument(
 60 |         "--y-offset",
 61 |         type=int,
 62 |         help="the offset of the y axis. Units are important during puck collection generation.",
 63 |         default=36282,
 64 |     )
 65 | 
 66 |     parser.add_argument(
 67 |         "--swath-offset-odd",
 68 |         type=int,
 69 |         help="the swath offset for odd columns",
 70 |         default=0,
 71 |     )
 72 | 
 73 |     parser.add_argument(
 74 |         "--swath-offset-even",
 75 |         type=int,
 76 |         help="the swath offset for even columns",
 77 |         default=6201,
 78 |     )
 79 | 
 80 |     parser.add_argument(
 81 |         "--rows",
 82 |         type=int,
 83 |         help="number of rows",
 84 |         default=78,
 85 |     )
 86 | 
 87 |     parser.add_argument(
 88 |         "--columns",
 89 |         type=int,
 90 |         help="number of columns",
 91 |         default=6,
 92 |     )
 93 | 
 94 |     parser.add_argument(
 95 |         "--n_lanes",
 96 |         type=int,
 97 |         help="number of lanes",
 98 |         default=4,
 99 |     )
100 | 
101 |     parser.add_argument(
102 |         "--zero-coded",
103 |         default=False,
104 |         action="store_true",
105 |         help="whether row and column indices should start at 0, instead of 1",
106 |     )
107 | 
108 |     return parser
109 | 
110 | 
111 | def create_coordinate_system(
112 |     n_lanes: int,
113 |     n_cols: int,
114 |     n_rows: int,
115 |     x_offset: int,
116 |     y_offset: int,
117 |     swath_offsets_odd: int,
118 |     swath_offsets_even: int,
119 |     zero_coded: bool,
120 |     format_string: str,
121 | ) -> pd.DataFrame:
122 |     """
123 |     Create a global coordinate system for a NovaSeq S4 flow cell.
124 | 
125 |     :param n_lanes: Number of lanes in the flow cell.
126 |     :type n_lanes: int
127 |     :param n_cols: Number of columns in the flow cell.
128 |     :type n_cols: int
129 |     :param n_rows: Number of rows in the flow cell.
130 |     :type n_rows: int
131 |     :param x_offset: Offset in the x-axis for coordinate calculations.
132 |     :type x_offset: int
133 |     :param y_offset: Offset in the y-axis for coordinate calculations.
134 |     :type y_offset: int
135 |     :param swath_offsets_odd: Swath offset for odd columns.
136 |     :type swath_offsets_odd: int
137 |     :param swath_offsets_even: Swath offset for even columns.
138 |     :type swath_offsets_even: int
139 |     :param zero_coded: Whether row and column indices should start at 0, instead of 1.
140 |     :type zero_coded: bool
141 |     :param format_string:The format for puck names.
142 |     :type format_string: str
143 |     :returns: DataFrame with puck names and their corresponding global coordinates.
144 |     :rtype: pd.DataFrame
145 |     """
146 | 
147 |     one_coded_offset = 0 if zero_coded else 1
148 |     swath_offsets = [swath_offsets_even, swath_offsets_odd]
149 |     sides_letter = {1: "a", 2: "b"}
150 |     l = []
151 |     for lane in range(one_coded_offset, n_lanes + one_coded_offset):
152 |         for side in [1, 2]:
153 |             for col in range(n_cols + one_coded_offset):
154 |                 for row in range(one_coded_offset, n_rows + one_coded_offset):
155 |                     puck_id = format_string.format(
156 |                         lane=lane,
157 |                         side_letter=sides_letter[side],
158 |                         side_number=side,
159 |                         column=col,
160 |                         row=row,
161 |                     )
162 | 
163 |                     x_ofs = int(col) * x_offset
164 | 
165 |                     swath_offset = swath_offsets[int(col) % 2]
166 |                     swath_offset = -swath_offset if side == 1 else swath_offset
167 | 
168 |                     y_ofs = int(row) * y_offset + swath_offset
169 | 
170 |                     z_ofs = 0
171 | 
172 |                     l.append(
173 |                         pd.DataFrame(
174 |                             {
175 |                                 "puck_id": [puck_id],
176 |                                 "x_offset": [x_ofs],
177 |                                 "y_offset": [y_ofs],
178 |                                 "z_offset": [z_ofs],
179 |                             }
180 |                         )
181 |                     )
182 | 
183 |     puck_names_coords = pd.concat(l)
184 | 
185 |     return puck_names_coords
186 | 
187 | 
188 | def cmdline():
189 |     """cmdline."""
190 |     parser = argparse.ArgumentParser(
191 |         allow_abbrev=False,
192 |         description="Global Coordinate System Generator for NovaSeq S4 Flow Cell",
193 |     )
194 |     parser = setup_parser(parser)
195 |     args = parser.parse_args()
196 | 
197 |     puck_names_coords = create_coordinate_system(
198 |         n_lanes=args.n_lanes,
199 |         n_cols=args.columns,
200 |         n_rows=args.rows,
201 |         x_offset=args.x_offset,
202 |         y_offset=args.y_offset,
203 |         swath_offsets_odd=args.swath_offset_odd,
204 |         swath_offsets_even=args.swath_offset_even,
205 |         zero_coded=args.zero_coded,
206 |         format_string=args.format_string,
207 |     )
208 | 
209 |     puck_names_coords.to_csv(args.output, index=False)
210 | 
211 | 
212 | if __name__ == "__main__":
213 |     cmdline()
214 | 


--------------------------------------------------------------------------------
/spacemake/data/test/visium_public_lane_joined_1m_R1.fastq.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rajewsky-lab/spacemake/274cd58e7ba9de244aca25c6bbd7cdfe49683753/spacemake/data/test/visium_public_lane_joined_1m_R1.fastq.gz


--------------------------------------------------------------------------------
/spacemake/data/test/visium_public_lane_joined_1m_R2.fastq.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rajewsky-lab/spacemake/274cd58e7ba9de244aca25c6bbd7cdfe49683753/spacemake/data/test/visium_public_lane_joined_1m_R2.fastq.gz


--------------------------------------------------------------------------------
/spacemake/errors.py:
--------------------------------------------------------------------------------
  1 | class SpacemakeError(Exception):
  2 |     def __init__(self, msg=None):
  3 |         self.msg = msg
  4 | 
  5 |     def __str__(self):
  6 |         msg = 'ERROR: ' + str(self.__class__.__name__) + '\n'
  7 | 
  8 |         if hasattr(self, 'msg') and self.msg is not None:
  9 |             msg += self.msg
 10 | 
 11 |         return msg
 12 | 
 13 | class FileWrongExtensionError(SpacemakeError):
 14 |     def __init__(self, filename, expected_extension):
 15 |         self.filename = filename
 16 |         self.expected_extension = expected_extension
 17 | 
 18 |     def __str__(self):
 19 |         msg = super().__str__()
 20 |         msg += f'File {self.filename} has wrong extension.\n'
 21 |         msg += f'The extension should be {self.expected_extension}.\n'
 22 | 
 23 |         return msg
 24 | 
 25 | class ConfigVariableError(SpacemakeError):
 26 |     def __init__(self, variable_name, variable_value):
 27 |         self.variable_name = variable_name
 28 |         self.variable_value = variable_value
 29 | 
 30 | class UnrecognisedConfigVariable(SpacemakeError):
 31 |     def __init__(self, variable_name, variable_options):
 32 |         self.variable_name = variable_name
 33 |         self.variable_options = variable_options
 34 | 
 35 |     def __str__(self):
 36 |         msg = super().__str__()
 37 |         msg += f'unrecognised variable {self.variable_name}\n'
 38 |         msg += f'it has to be one of {self.variable_options}.'
 39 | 
 40 |         return msg
 41 | 
 42 | class EmptyConfigVariableError(SpacemakeError):
 43 |     def __init__(self, variable_name):
 44 |         self.variable_name = variable_name
 45 | 
 46 |     def __str__(self):
 47 |         msg = super().__str__()
 48 |         msg += f'cannot remove, or set {self.variable_name} to emtpy list, or None\n'
 49 |         msg += 'this ERROR could happen in two cases: \n'
 50 |         msg += f'1) you tried to remove a {self.variable_name}, '
 51 |         msg += f'and as a result the sample would not have'
 52 |         msg += f' any {self.variable_name} available.\n'
 53 |         msg += f'2) you tried to remove the `default` value of'
 54 |         msg += f' {self.variable_name} from the configuration.\n'
 55 | 
 56 |         return msg
 57 | 
 58 | class ConfigVariableNotFoundError(ConfigVariableError):
 59 |     def __str__(self):
 60 |         msg = super().__str__()
 61 |         msg += f'{self.variable_name}: {self.variable_value} not found.\n'
 62 |         msg += f'you can add a new {self.variable_name} using the '
 63 |         msg += f'`spacemake config add_{self.variable_name}` command.\n'
 64 | 
 65 |         return msg
 66 | 
 67 | class ConfigVariableIncompleteError(ConfigVariableError):
 68 |     def __init__(self, missing_key, **kwargs):
 69 |         super().__init__(**kwargs)
 70 |         self.missing_key = missing_key
 71 | 
 72 |     def __str__(self):
 73 |         msg = super().__str__()
 74 |         msg += f'{self.variable_name}: {self.variable_value} '
 75 |         msg += f'is missing required key {self.required_key}.\n'
 76 |         msg += f'You can update this key of {self.variable_value} using the '
 77 |         msg += f'`spacemake config update_{self.variable_name}` command.\n'
 78 | 
 79 |         return msg
 80 | 
 81 | class InvalidBarcodeStructureError(SpacemakeError):
 82 |     def __init__(self, tag_name, to_match):
 83 |         self.tag_name = tag_name
 84 |         self.to_match = to_match
 85 | 
 86 |     def __str__(self):
 87 |         msg = super().__str__()
 88 |         msg += f'{self.tag_name} does not match {self.to_match}.\n'
 89 |         msg += f'Example matching would be: r1[0:12] for the first 12n of Read1 '
 90 |         msg += f'for {self.tag_name}\n'
 91 | 
 92 | 
 93 | class DuplicateConfigVariableError(ConfigVariableError):
 94 |     def __str__(self):
 95 |         msg = super().__str__()
 96 |         msg += f'{self.variable_name}: {self.variable_value} already exists.\n'
 97 |         msg += f'To update it use `spacemake config update_{self.variable_name}`,\n'
 98 |         msg += f'To delete it use `spacemake config delete_{self.variable_name}.\n'
 99 | 
100 |         return msg
101 | 
102 | class NoProjectSampleProvidedError(SpacemakeError):
103 |     def __init__(self):
104 |         pass
105 | 
106 |     def __str__(self):
107 |         msg = super().__str__()
108 |         msg += f'no projects or samples were provided.\n'
109 | 
110 |         return msg
111 | 
112 | class ProjectSampleNotFoundError(SpacemakeError):
113 |     def __init__(self, var_name, var_value):
114 |         self.var_name = var_name
115 |         self.var_value = var_value
116 | 
117 |     def __str__(self):
118 |         msg = super().__str__()
119 |         msg += f'sample with {self.var_name}={self.var_value} not found.\n'
120 |         msg += 'you can add a new sample with `spacemake projects add_sample` command.\n'
121 | 
122 |         return msg
123 | 
124 | class SampleAlreadyExistsError(SpacemakeError):
125 |     def __init__(self, ix):
126 |         self.ix = ix
127 | 
128 |     def __str__(self):
129 |         msg = super().__str__()
130 |         msg += f'sample with (project_id, sample_id)={self.ix} already exists.\n'
131 |         msg += 'in order to update this sample use `spacemake projects update_sample`,\n'
132 |         msg += 'to delete it use `spacemake projects delete_sample`.\n'
133 | 
134 |         return msg
135 | 
136 | class InconsistentVariablesDuringMerge(ConfigVariableError):
137 |     def __init__(self, ix, **kwargs):
138 |         super().__init__(**kwargs)
139 |         self.ix = ix
140 | 
141 |     def __str__(self):
142 |         msg = super().__str__()
143 |         msg += f'\nthe samples that you trying to merge have different '
144 |         msg += f'{self.variable_name} values.\n\ninconsistent values:'
145 |         msg += f' {self.variable_value}\n'
146 |         msg += f'samples: {self.ix}.\n\n'
147 |         msg += 'You can only merge samples which have the same '
148 |         msg += f'{self.variable_name}, or if there is an overlap.\n'
149 | 
150 |         return msg
151 | 


--------------------------------------------------------------------------------
/spacemake/longread/__main__.py:
--------------------------------------------------------------------------------
1 | if __name__ == "__main__":
2 |     from spacemake.longread.cmdline import cmdline
3 | 
4 |     cmdline()
5 | 


--------------------------------------------------------------------------------
/spacemake/longread/overview.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import pandas as pd
  3 | import numpy as np
  4 | from glob import glob
  5 | from matplotlib.backends.backend_pdf import PdfPages
  6 | import matplotlib.pyplot as plt
  7 | 
  8 | 
  9 | def main(args):
 10 |     dfs = []
 11 |     for fname in list(args.fnames) + list(glob(args.glob_pattern)):
 12 |         print(f"loading {fname}")
 13 |         df = pd.read_csv(fname, sep='\t')
 14 |         df['stats_file'] = fname
 15 |         dfs.append(df)
 16 | 
 17 |     df = pd.concat(dfs)
 18 | 
 19 |     repriming = ['TSO,TSO_RC', 'dN-SMRT,dN-SMRT_RC', ]
 20 |     concatenation = [c for c in df.columns if c.endswith('+') and ',' not in c]
 21 |     bead = ["bead_complete", "bead_only_handle", "bead_no_dT", "bead_no_opseq"][::-1]
 22 | 
 23 |     # avoid crash if columns are missing
 24 |     for r in repriming + concatenation + bead:
 25 |         if r not in df.columns:
 26 |             df[r] = 0
 27 | 
 28 |     # print(df)
 29 |     # print(f"concat columns {concatenation}")
 30 |     # print(f"bead columns {bead}")
 31 |     df['reprimed'] = df[repriming].sum(axis=1)
 32 |     df['bead_complete'] = np.nan_to_num(df['bead_complete'], nan=0.0)
 33 |     df['concat'] = df[concatenation].sum(axis=1)
 34 |     df['bead_related'] = np.nan_to_num(df[bead].sum(axis=1), nan=0.0)
 35 |     df['bead_dropseq'] = np.nan_to_num(df['bead_no_opseq'], nan=0.0)
 36 |     df['bead_incomplete'] = df['bead_related'] - df['bead_complete'] - df['bead_dropseq']
 37 |     df['non_bead'] = 100 - df['bead_related']
 38 |     df['bead_fidelity'] = 100 * df['bead_complete'] / df['bead_related']
 39 |     df = df.fillna(0)
 40 |     # print(df)
 41 |     if args.csv_out:
 42 |         df.to_csv(args.csv_out, float_format='%.2f', sep='\t', index=False)
 43 | 
 44 |     def clean(txt):
 45 |         txt = os.path.basename(txt)
 46 |         t = txt\
 47 |             .replace('source/','') \
 48 |             .replace('sts_', '') \
 49 |             .replace('pb_', '') \
 50 |             .replace('ds_', '') \
 51 |             .replace('.fq', '') \
 52 |             .replace('.bam', '') \
 53 |             .replace('lima.', '')
 54 |         
 55 |         if t.count('_') > 1:
 56 |             t = "_".join(t.split('_')[:2])
 57 |         
 58 |         return t
 59 | 
 60 |     df['name'] = df['qfa'].apply(clean)
 61 |     # df = df.sort_values('bead_related')
 62 |     df = df.sort_values('name')
 63 | 
 64 |     def guess_rRNA_file(path):
 65 |         # print("guessrRNA raw path", path)
 66 |         name = os.path.basename(path).replace('.summary', '.rRNA')
 67 |         
 68 |         if args.rRNA_same_place:
 69 |             place = os.path.dirname(path)
 70 |         else:
 71 |             place = args.rRNA
 72 | 
 73 |         return [
 74 |             os.path.join(place, name.replace(".fq", ".txt")),
 75 |             os.path.join(place, name.replace(".fq", ".txt")).replace('.rRNA.tsv', '.txt'),
 76 |             os.path.join(place, name.replace(".fq", ".txt")).replace('.rRNA.tsv', '.rRNA.txt'),
 77 |             os.path.join(place, name.replace(".bam", ".txt").replace("lima.", "")),
 78 |             os.path.join(place, name.replace(".bam", ".txt").replace("lima.", "")).replace('.rRNA.tsv', '.txt'),
 79 |             os.path.join(place, name.replace(".bam", ".txt").replace("lima.", "")).replace('.rRNA.tsv', '.rRNA.txt'),
 80 |         ]
 81 | 
 82 |     rRNA_fracs = []
 83 |     for row in df[['stats_file', 'N_reads']].itertuples():
 84 |         rcount = np.nan
 85 |         for fname in guess_rRNA_file(row.stats_file):
 86 |             print(fname)
 87 |             try:
 88 |                 rcount = int(open(fname).read())
 89 |             except (FileNotFoundError, ValueError):
 90 |                 pass
 91 |             else:
 92 |                 break
 93 |         if rcount == np.nan:
 94 |             raise ValueError
 95 | 
 96 |         rRNA_fracs.append(100. * rcount / row.N_reads)
 97 | 
 98 |     df['rRNA'] = rRNA_fracs
 99 |     # print(df[['qfa', 'rRNA']])
100 | 
101 |     def make_bars(ax, df, kinds, labels, cmap=plt.get_cmap('tab10'), w=0.9, colors=None):
102 |         n = len(kinds)
103 |         if colors is None:
104 |             colors = cmap(np.linspace(0, 1, n))
105 | 
106 |         x = np.arange(len(df)) - w/2.0
107 |         y0 = np.zeros(len(x), dtype=float)
108 |         for kind, label, color in zip(kinds, labels, colors):
109 |             y = np.nan_to_num(df[kind], nan=0.0)
110 |             # print(kind)
111 |             # print(y)
112 |             ax.bar(x, y, bottom=y0, label=label, width=w, color=color)
113 |             y0 += y
114 | 
115 |         ax.set_ylabel('fraction of library')
116 |         ax.set_xticks(x)
117 |         labels = df['name']  # [clean(fq) for fq in df['qfa']]
118 |         ax.set_xticklabels(labels, rotation=90)
119 |         ax.set_ylim(0, 100)
120 | 
121 |     marie = ["non_bead", "bead_incomplete", "bead_dropseq", "bead_complete", ]
122 |     marie_colors = ["gray", "royalblue", "green", "gold"]
123 |     
124 |     w = max(8 / 25. * len(df), 3)
125 |     if args.multi_page:
126 |         pdf = PdfPages(args.breakdown)
127 |         fig, ax1 = plt.subplots(1, figsize=(w, 4))
128 |     else:
129 |         fig, (ax1, ax2) = plt.subplots(2, figsize=(w, 6), sharex=True)
130 | 
131 |     make_bars(ax1, df, marie, labels=[b.replace('bead_', '') for b in marie], colors=marie_colors)
132 |     ax1.legend(title='Marie-stats', ncol=len(marie))
133 |     if args.multi_page:
134 |         fig.tight_layout()
135 |         pdf.savefig()
136 |         plt.close()
137 |         fig, ax2 = plt.subplots(1, figsize=(w, 4))
138 | 
139 |     make_bars(ax2, df, ["bead_fidelity"], labels=["bead fidelity"])
140 |     ax2.set_ylabel("bead fidelity")
141 |     if args.multi_page:
142 |         fig.tight_layout()
143 |         pdf.savefig()
144 |         pdf.close()
145 |     else:
146 |         fig.tight_layout()
147 |         plt.savefig(args.breakdown)
148 | 
149 |     plt.close()
150 | 
151 |     if args.multi_page:
152 |         pdf = PdfPages(args.output)
153 |         fig, ax1 = plt.subplots(1, figsize=(w, 4))
154 |     else:
155 |         fig, (ax1, ax2, ax3, ax4) = plt.subplots(4, figsize=(w, 12), sharex=True)
156 | 
157 |     # print("bead related", bead)
158 |     make_bars(ax1, df, bead, labels=[b.replace('bead_', '') for b in bead])
159 |     ax1.legend(title='bead-related', ncol=len(bead))
160 |     if args.multi_page:
161 |         fig.tight_layout()
162 |         pdf.savefig()
163 |         plt.close()
164 |         fig, ax2 = plt.subplots(1, figsize=(w, 4))
165 | 
166 |     # print("repriming events", repriming)
167 |     make_bars(ax2, df, repriming, labels=[r.split(',')[0] for r in repriming], cmap=plt.get_cmap('tab20c'))
168 |     ax2.legend(title='repriming', ncol=len(repriming))
169 |     if args.multi_page:
170 |         fig.tight_layout()
171 |         pdf.savefig()
172 |         plt.close()
173 |         fig, ax3 = plt.subplots(1, figsize=(w, 4))
174 | 
175 |     # print("concat events", concatenation)
176 |     make_bars(ax3, df, concatenation, labels=concatenation, cmap=plt.get_cmap('tab20b'))
177 |     ax3.legend(title='concatamers', ncol=len(concatenation))
178 |     if args.multi_page:
179 |         fig.tight_layout()
180 |         pdf.savefig()
181 |         plt.close()
182 |         fig, ax4 = plt.subplots(1, figsize=(w, 4))
183 | 
184 |     make_bars(ax4, df, ["rRNA",], labels = ["rRNA"], cmap=plt.get_cmap('tab20c'))
185 |     ax4.legend(title='human rRNA', ncol=1)
186 |     if args.multi_page:
187 |         fig.tight_layout()
188 |         pdf.savefig()
189 |         pdf.close()
190 |     else:
191 |         fig.tight_layout()
192 |         plt.savefig(args.output)
193 | 
194 |     plt.close()
195 | 
196 | 
197 | def setup_parser(parser):
198 |     parser.add_argument("fnames", nargs='*')
199 |     parser.add_argument("--output", default="pb_overview.pdf",
200 |                         help="path/name of detailed report PDF")
201 |     parser.add_argument("--csv-out", default="all_pb_stats.csv",
202 |                         help="path/name of detailed report PDF")
203 |     parser.add_argument("--breakdown", default="bead_overview.pdf",
204 |                         help="path/name of bead report (Marie style) PDF")
205 |     parser.add_argument("--glob-pattern", default="stats/*summary.tsv",
206 |                         help="search pattern to gather summary files generated by the scan command")
207 |     parser.add_argument("--rRNA", default="rRNA/", 
208 |                         help="path to search for rRNA counts corresponding to samples")
209 |     parser.add_argument("--rRNA-same-place", default=False, action='store_true',
210 |                         help="If set, look for rRNA txt file with same sample name in same directory")
211 |     parser.add_argument("--multi-page", default=False, action="store_true",
212 |                         help="If set, generate multiple PDF pages instead of subplots")
213 | 
214 | 
215 | if __name__ == "__main__":
216 |     # setup own parser
217 |     import argparse
218 |     parser = argparse.ArgumentParser(prog='pb_overview')
219 |     setup_parser(parser)
220 |     main(parser.parse_args())
221 | 


--------------------------------------------------------------------------------
/spacemake/longread/signature.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import logging
  3 | from collections import OrderedDict, defaultdict
  4 | from spacemake.util import rev_comp
  5 | 
  6 | """
  7 | Small helper class to load longread signature definitions (see docs/tutorials/longreads)
  8 | and make them accessible to the various cmdline tools.
  9 | """
 10 | 
 11 | logger = logging.getLogger("spacemake.longread.signature")
 12 | 
 13 | 
 14 | class SignatureDB:
 15 |     def __init__(self, blocks=OrderedDict(), **kw):
 16 |         self.blocks = blocks
 17 |         self.lkup = {}
 18 |         self.fields = sorted(kw.keys())
 19 |         for f in self.fields:
 20 |             self.lkup[f] = kw[f]
 21 | 
 22 |     @classmethod
 23 |     def from_YAML(cls, fname="samples.yaml"):
 24 |         import yaml
 25 | 
 26 |         logger = logging.getLogger("spacemake.longread.SignatureDB.from_YAML")
 27 |         logger.info(f"reading longread signature definitions from '{fname}'")
 28 | 
 29 |         groups = yaml.load(open(fname), Loader=yaml.SafeLoader)
 30 |         signatures = groups["signatures"]
 31 |         default = signatures[groups["default"]]
 32 | 
 33 |         # load all building block oligo sequences and their reverse complements
 34 |         blocks = OrderedDict()
 35 |         for fa_id, seq in groups["blocks"].items():
 36 |             blocks[fa_id] = seq
 37 |             blocks[fa_id + "_RC"] = rev_comp(seq)
 38 | 
 39 |         logger.info(f"load_oligos(): loaded {len(blocks)} sequences from '{fname}'")
 40 | 
 41 |         # load the signature definitions and split into separate dictionaries
 42 |         field_lkups = {}
 43 |         for name, d in signatures.items():
 44 |             # print(f"name={name} d={d}")
 45 |             for f in d.keys():
 46 |                 if f not in field_lkups:
 47 |                     field_lkups[f] = defaultdict(lambda: default[f])
 48 | 
 49 |                 field_lkups[f][name] = d[f]
 50 | 
 51 |         logger.info(
 52 |             f"found {len(signatures)} signature definitions:"
 53 |             f"{sorted(signatures.keys())}."
 54 |         )
 55 |         return cls(blocks, **field_lkups)
 56 | 
 57 |     def __getattr__(self, attr):
 58 |         return self.lkup[attr]
 59 | 
 60 |     def sort_samples(self, samples, signatures):
 61 |         """
 62 |         Sort samples by the priority assigned in the signature definitions first,
 63 |         then lexicographically. Used for overview plots combining multiple longread
 64 |         sample results to group samples sharing a signature.
 65 |         """
 66 |         return sorted(
 67 |             zip(samples, signatures), key=lambda x: (self.prio.get(x[1], np.inf), x[0])
 68 |         )
 69 | 
 70 | 
 71 | def get_signature_db(try_path):
 72 |     """
 73 |     try to load a YAML file with longread signature definitions from <try_path>.
 74 |     If that fails, default to spacemake/data/config/longread.yaml
 75 |     """
 76 |     if os.access(try_path, os.R_OK):
 77 |         cfg = try_path
 78 |     else:
 79 |         cfg = os.path.join(os.path.dirname(__file__), "../data/config/longread.yaml")
 80 | 
 81 |     return SignatureDB.from_YAML(cfg)
 82 | 
 83 | 
 84 | def process_intact_signature(complete_signature, prefixes=["P5"], suffixes=["N70X"]):
 85 |     complete = complete_signature.split(",")
 86 |     while complete and complete[0] in prefixes:
 87 |         complete.pop(0)
 88 | 
 89 |     while complete and complete[-1] in suffixes:
 90 |         complete.pop()
 91 | 
 92 |     complete_order = dict(x[::-1] for x in enumerate(complete))
 93 |     # print(f"complete={complete}")
 94 | 
 95 |     return tuple(complete), complete_order
 96 | 
 97 | 
 98 | def digest_signatures(
 99 |     sig_counts,
100 |     bead_related="bead_start",
101 |     complete_signature="P5,bead_start,OP1,polyT,N70X",
102 |     prefixes=[
103 |         "P5",
104 |     ],
105 |     suffixes=[
106 |         "N70X",
107 |     ],
108 | ):
109 |     bead_counts = defaultdict(int)
110 |     ov_counts = defaultdict(int)
111 |     n_bead_related = 0
112 | 
113 |     complete, complete_order = process_intact_signature(
114 |         complete_signature, prefixes, suffixes
115 |     )
116 |     complete_set = set(complete)
117 |     found_part_counts = defaultdict(int)
118 | 
119 |     def describe(found_set):
120 |         missing = complete_set - found_set
121 |         if not missing:
122 |             descr = "complete"
123 |         elif len(missing) < len(found_set):
124 |             descr = f"missing_{','.join(sorted(missing))}"
125 |         else:
126 |             descr = f"only_{','.join(sorted(found_set))}"
127 | 
128 |         return descr
129 | 
130 |     def bead_relation(parts):
131 |         search = list(complete)
132 |         at = 0
133 | 
134 |         try:
135 |             i = parts.index(search[0])  # look for first part, e.g. bead_start
136 |         except ValueError:
137 |             i = 0
138 | 
139 |         found = []
140 |         for part in parts[i:]:
141 |             # find co-linear matches,
142 |             # ignore extra inserted segments
143 |             # (for now)
144 |             if part in search[at:]:
145 |                 found.append(part)
146 |                 at = search.index(part)
147 | 
148 |         found_set = set(found)
149 |         found_tup = tuple(sorted(found_set, key=lambda x: complete_order[x]))
150 | 
151 |         return describe(found_set), found_tup
152 | 
153 |     for sig, count in sig_counts.items():
154 |         parts = sig.split(",")
155 |         if bead_related in parts:
156 |             br, found_tup = bead_relation(parts)
157 |             bead_counts[br] += count
158 |             n_bead_related += count
159 | 
160 |             for i in range(1, len(found_tup) + 1):
161 |                 found_part_counts[found_tup[:i]] += count
162 |         else:
163 |             ov_counts[sig] = count
164 | 
165 |     ov_counts["bead-related"] = n_bead_related
166 |     return ov_counts, bead_counts, found_part_counts, complete
167 | 


--------------------------------------------------------------------------------
/spacemake/parallel.py:
--------------------------------------------------------------------------------
  1 | __version__ = "0.9"
  2 | __author__ = ["Marvin Jens"]
  3 | __license__ = "GPL"
  4 | __email__ = ["marvin.jens@mdc-berlin.de"]
  5 | 
  6 | import logging
  7 | import time
  8 | 
  9 | 
 10 | def put_or_abort(Q, item, abort_flag, timeout=1):
 11 |     """
 12 |     Small wrapper around queue.put() to prevent
 13 |     dead-locks in the event of (detectable) errors
 14 |     that might cause put() to block forever.
 15 |     Expects a shared mp.Value instance as abort_flag
 16 | 
 17 |     Returns: False if put() was succesful, True if execution
 18 |     should be aborted.
 19 |     """
 20 |     import queue
 21 | 
 22 |     sent = False
 23 |     # logging.warning(f"sent={sent} abort_flag={abort_flag}")
 24 |     while not (sent or abort_flag.value):
 25 |         try:
 26 |             Q.put(item, timeout=timeout)
 27 |         except queue.Full:
 28 |             pass
 29 |         else:
 30 |             sent = True
 31 | 
 32 |     return abort_flag.value
 33 | 
 34 | 
 35 | def queue_iter(Q, abort_flag, stop_item=None, timeout=1):
 36 |     """
 37 |     Small generator/wrapper around multiprocessing.Queue allowing simple
 38 |     for-loop semantics:
 39 | 
 40 |         for item in queue_iter(queue, abort_flag):
 41 |             ...
 42 |     The abort_flag is handled analogous to put_or_abort, only
 43 |     that it ends the iteration instead
 44 |     """
 45 |     import queue
 46 | 
 47 |     # logging.debug(f"queue_iter({queue})")
 48 |     while True:
 49 |         if abort_flag.value:
 50 |             break
 51 |         try:
 52 |             item = Q.get(timeout=timeout)
 53 |         except queue.Empty:
 54 |             pass
 55 |         else:
 56 |             if item == stop_item:
 57 |                 # signals end->exit
 58 |                 break
 59 |             else:
 60 |                 # logging.debug(f"queue_iter->item {item}")
 61 |                 yield item
 62 | 
 63 | 
 64 | def join_with_empty_queues(proc, Qs, abort_flag, timeout=1):
 65 |     """
 66 |     joins() a process that writes data to queues Qs w/o deadlock.
 67 |     In case of an abort, the subprocess normally would not join
 68 |     until the Qs are emptied. join_with_empty_queues() monitors a global
 69 |     abort flag and empties the queues if needed, allowing the sub-process
 70 |     to terminate properly.
 71 |     """
 72 | 
 73 |     def drain(Q):
 74 |         content = []
 75 |         while not Q.empty():
 76 |             try:
 77 |                 item = Q.get(timeout=timeout)
 78 |             except queue.Empty:
 79 |                 pass
 80 |             else:
 81 |                 content.append(item)
 82 | 
 83 |         return content
 84 | 
 85 |     contents = [list() for i in range(len(Qs))]
 86 |     while proc.exitcode is None:
 87 |         proc.join(timeout)
 88 |         if abort_flag.value:
 89 |             for Q, content in zip(Qs, contents):
 90 |                 content.extend(drain(Q))
 91 | 
 92 |     return contents
 93 | 
 94 | 
 95 | def chunkify(src, n_chunk=1000):
 96 |     """
 97 |     Iterator which collects up to n_chunk items from iterable <src> and yields them
 98 |     as a list.
 99 |     """
100 |     chunk = []
101 |     n = 0
102 |     for x in src:
103 |         chunk.append(x)
104 |         if len(chunk) >= n_chunk:
105 |             yield n, chunk
106 |             n += 1
107 |             chunk = []
108 | 
109 |     if chunk:
110 |         yield n, chunk
111 | 
112 | 
113 | def log_qerr(qerr):
114 |     "helper function for reporting errors in sub processes"
115 |     for name, lines in qerr:
116 |         for line in lines:
117 |             logging.error(f"subprocess {name} exception {line}")
118 | 
119 | 
120 | class ExceptionLogging:
121 |     """
122 |     A context manager that handles otherwise uncaught exceptions by logging
123 |     the event and traceback info, optinally raises a flag.
124 |     Very handy for wrapping the main function in a sub-process!
125 |     """
126 | 
127 |     def __init__(self, name, Qerr=None, exc_flag=None):
128 |         # print('__init__ called')
129 |         self.Qerr = Qerr
130 |         self.exc_flag = exc_flag
131 |         self.name = name
132 |         self.logger = logging.getLogger(name)
133 |         self.exception = None
134 | 
135 |     def __enter__(self):
136 |         self.t0 = time.time()
137 |         # print('__enter__ called')
138 |         return self
139 | 
140 |     def __exit__(self, exc_type, exc_value, exc_traceback):
141 |         # print('__exit__ called')
142 |         self.t1 = time.time()
143 |         self.logger.info(f"CPU time: {self.t1 - self.t0:.3f} seconds.")
144 |         if exc_type and (exc_type != SystemExit):
145 |             import traceback
146 | 
147 |             lines = "\n".join(
148 |                 traceback.format_exception(exc_type, exc_value, exc_traceback)
149 |             ).split("\n")
150 |             self.exception = lines
151 |             self.logger.error(f"an unhandled exception occurred")
152 |             for l in lines:
153 |                 self.logger.error(l)
154 | 
155 |             if self.Qerr is not None:
156 |                 self.Qerr.put((self.name, lines))
157 | 
158 |             if self.exc_flag:
159 |                 self.logger.error(f"raising exception flag {self.exc_flag}")
160 |                 self.exc_flag.value = True
161 | 


--------------------------------------------------------------------------------
/spacemake/preprocess/__init__.py:
--------------------------------------------------------------------------------
1 | #from .cmdline import cmdline
2 | #from .dge import calculate_adata_metrics,\
3 |     #calculate_shannon_entropy_scompression, dge_to_sparse_adata,\
4 |     #attach_barcode_file, parse_barcode_file, load_external_dge,\
5 |     #attach_puck_variables, attach_puck
6 | 
7 | 


--------------------------------------------------------------------------------
/spacemake/preprocess/cmdline.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | __version__ = "0.9"
 3 | __author__ = ["Marvin Jens"]
 4 | __license__ = "GPL"
 5 | __email__ = ["marvin.jens@mdc-berlin.de"]
 6 | 
 7 | from spacemake.preprocess.fastq import (
 8 |     parse_args,
 9 |     setup_logging,
10 |     main_combinatorial,
11 |     main_dropseq,
12 | )
13 | 
14 | from spacemake.parallel import ExceptionLogging
15 | 
16 | 
17 | def cmdline():
18 |     with ExceptionLogging("main"):
19 |         args = parse_args()
20 |         NO_CALL = args.na
21 |         setup_logging(args)
22 | 
23 |         if args.out_format == "bam" and not args.read2:
24 |             raise ValueError("bam output format requires --read2 parameter")
25 | 
26 |         if ("bc1" in args.cell and not args.bc1_ref) or (
27 |             "bc2" in args.cell and not args.bc2_ref
28 |         ):
29 |             raise ValueError(
30 |                 "bc1/2 are referenced in --cell or --cell-raw, but no reference barcodes are specified via --bc{{1,2}}-ref"
31 |             )
32 | 
33 |         if args.bc1_ref or args.bc2_ref:
34 |             main_combinatorial(args)
35 |         else:
36 |             main_dropseq(args)
37 | 
38 | 
39 | if __name__ == "__main__":
40 |     cmdline()
41 | 


--------------------------------------------------------------------------------
/spacemake/preprocess/dge.py:
--------------------------------------------------------------------------------
  1 | import logging
  2 | 
  3 | logger_name = "spacemake.preprocess.dge"
  4 | logger = logging.getLogger(logger_name)
  5 | 
  6 | def calculate_adata_metrics(adata, dge_summary_path=None, n_reads=None):
  7 |     import scanpy as sc
  8 |     import pandas as pd
  9 | 
 10 |     # calculate mitochondrial gene percentage
 11 |     adata.var["mt"] = (
 12 |         adata.var_names.str.startswith("Mt-")
 13 |         | adata.var_names.str.startswith("mt-")
 14 |         | adata.var_names.str.startswith("MT-")
 15 |     )
 16 | 
 17 |     sc.pp.calculate_qc_metrics(
 18 |         adata, qc_vars=["mt"], percent_top=None, log1p=False, inplace=True
 19 |     )
 20 | 
 21 |     add_reads = False
 22 |     if dge_summary_path is not None:
 23 |         dge_summary = pd.read_csv(
 24 |             dge_summary_path,
 25 |             skiprows=7,
 26 |             sep="\t",
 27 |             index_col="cell_bc",
 28 |             names=["cell_bc", "n_reads", "n_umi", "n_genes"],
 29 |         )
 30 | 
 31 |         adata.obs = pd.merge(
 32 |             adata.obs, dge_summary[["n_reads"]], left_index=True, right_index=True
 33 |         )
 34 | 
 35 |         add_reads = True
 36 | 
 37 |     if n_reads is not None:
 38 |         adata.obs["n_reads"] = n_reads
 39 |         add_reads = True
 40 | 
 41 |     if add_reads:
 42 |         adata.obs["reads_per_counts"] = adata.obs.n_reads / adata.obs.total_counts
 43 | 
 44 | 
 45 | def calculate_shannon_entropy_scompression(adata):
 46 |     import math
 47 |     import itertools
 48 |     import numpy as np
 49 |     from collections import Counter
 50 | 
 51 |     def compute_shannon_entropy(barcode):
 52 |         prob, length = Counter(barcode), float(len(barcode))
 53 |         return -sum(
 54 |             count / length * math.log(count / length, 2) for count in prob.values()
 55 |         )
 56 | 
 57 |     def compute_string_compression(barcode):
 58 |         compressed_barcode = "".join(
 59 |             letter + str(len(list(group)))
 60 |             for letter, group in itertools.groupby(barcode)
 61 |         )
 62 | 
 63 |         return len(compressed_barcode)
 64 | 
 65 |     bc = adata.obs.index.to_numpy()
 66 |     bc_len = len(bc[0])
 67 |     theoretical_barcodes = np.random.choice(
 68 |         ["A", "C", "T", "G"], size=(bc.shape[0], bc_len)
 69 |     )
 70 | 
 71 |     adata.obs["exact_entropy"] = np.round(
 72 |         np.array([compute_shannon_entropy(cell_bc) for cell_bc in bc]), 2
 73 |     )
 74 |     adata.obs["theoretical_entropy"] = np.round(
 75 |         np.array(
 76 |             [compute_shannon_entropy(cell_bc) for cell_bc in theoretical_barcodes]
 77 |         ),
 78 |         2,
 79 |     )
 80 |     adata.obs["exact_compression"] = np.round(
 81 |         np.array([compute_string_compression(cell_bc) for cell_bc in bc]), 2
 82 |     )
 83 |     adata.obs["theoretical_compression"] = np.round(
 84 |         np.array(
 85 |             [compute_string_compression(cell_bc) for cell_bc in theoretical_barcodes]
 86 |         ),
 87 |         2,
 88 |     )
 89 | 
 90 | 
 91 | def dge_to_sparse_adata(dge_path, dge_summary_path):
 92 |     import anndata
 93 |     import numpy as np
 94 |     import gzip
 95 |     import pandas as pd
 96 |     from scipy.sparse import coo_matrix, hstack
 97 | 
 98 |     gene_names = []
 99 | 
100 |     with gzip.open(dge_path, "rt") as dge:
101 |         first_line = dge.readline().strip().split("\t")
102 |         has_mt = False
103 |         barcodes = first_line[1:]
104 |         N_bc = len(barcodes)
105 |         X = None
106 | 
107 |         # read DGE line by line
108 |         # first row: contains CELL BARCODEs
109 |         # each next row contains one gene name, and the counts of that gene
110 |         for line in dge:
111 |             vals = line.strip()
112 |             _idx_tab = vals.index("\t")
113 |             _gene_name = vals[:_idx_tab]
114 |             gene_names.append(_gene_name)
115 | 
116 |             if _gene_name.lower().startswith("mt-"):
117 |                 has_mt = True
118 |                 
119 |             # store counts as np.array
120 |             _vals = np.fromstring(vals[_idx_tab:], dtype=np.int32, count=N_bc, sep='\t').flatten()
121 |             _idx_nonzero = np.argwhere(_vals != 0).flatten()
122 | 
123 |             if len(_idx_nonzero) > 0:
124 |                 gene_sp = coo_matrix((_vals[_idx_nonzero].astype(np.int32), (_idx_nonzero, np.zeros(len(_idx_nonzero)))), shape=(N_bc, 1), dtype=np.int32)
125 |             else:
126 |                 gene_sp = coo_matrix((N_bc, 1), dtype=np.int32)
127 | 
128 |             if X is None:
129 |                  X = gene_sp
130 |             else:
131 |                  X = hstack([X, gene_sp])
132 | 
133 |         if X is None:
134 |             X = coo_matrix((len(barcodes), 0), dtype=np.int32)
135 |     
136 |         if not has_mt:
137 |             # ensure we have an entry for mitochondrial transcripts even if it's just all zeros
138 |             print(
139 |                 "need to add mt-missing because no mitochondrial stuff was among the genes for annotation"
140 |             )
141 |             gene_names.append("mt-missing")
142 |             X = hstack([X, np.zeros(X.shape[0])[:, None]])
143 | 
144 |         X = X.tocsr()
145 |         X = X.astype(np.float32)
146 |         adata = anndata.AnnData(
147 |             X, obs=pd.DataFrame(index=barcodes), var=pd.DataFrame(index=gene_names)
148 |         )
149 | 
150 |         # name the index
151 |         adata.obs.index.name = "cell_bc"
152 | 
153 |         # attach metrics such as: total_counts, pct_mt_counts, etc
154 |         # also attach n_genes, and calculate pcr
155 |         calculate_adata_metrics(adata, dge_summary_path)
156 | 
157 |         # calculate per shannon_entropy and string_compression per bead
158 |         calculate_shannon_entropy_scompression(adata)
159 | 
160 |         if adata.X.sum() == 0:
161 |             logger.warn(f"The DGE from {dge_path} is empty")
162 | 
163 |         return adata
164 | 
165 | 
166 | def load_external_dge(dge_path):
167 |     import scanpy as sc
168 | 
169 |     from scanpy._utils import check_nonnegative_integers
170 |     from scipy.sparse import issparse, csc_matrix
171 |     from spacemake.errors import SpacemakeError
172 | 
173 |     adata = sc.read(dge_path)
174 | 
175 |     if not check_nonnegative_integers(adata.X):
176 |         raise SpacemakeError(
177 |             f"External dge seems to contain values "
178 |             + "which are already normalised. Raw-count matrix expected."
179 |         )
180 | 
181 |     if not issparse(adata.X):
182 |         adata.X = csc_matrix(adata.X)
183 | 
184 |     # name the index
185 |     adata.obs.index.name = "cell_bc"
186 | 
187 |     # attach metrics such as: total_counts, pct_mt_counts, etc
188 |     # also attach n_genes, and calculate pcr
189 |     calculate_adata_metrics(adata)
190 | 
191 |     return adata
192 | 
193 | 
194 | def parse_barcode_file(barcode_file):
195 |     import pandas as pd
196 | 
197 |     bc = pd.read_csv(barcode_file, sep="[,|\t]", engine='python')
198 | 
199 |     # rename columns
200 |     bc = (
201 |         bc.rename(
202 |             columns={
203 |                 "xcoord": "x_pos",
204 |                 "ycoord": "y_pos",
205 |                 "barcodes": "cell_bc",
206 |                 "barcode": "cell_bc",
207 |             }
208 |         )
209 |         .set_index("cell_bc")
210 |         .loc[:, ["x_pos", "y_pos"]]
211 |     )
212 | 
213 |     bc = bc.loc[~bc.index.duplicated(keep="first")]
214 | 
215 |     bc = bc.loc[~bc.index.duplicated(keep="first")]
216 | 
217 |     return bc
218 | 
219 | 
220 | def attach_barcode_file(adata, barcode_file):
221 |     bc = parse_barcode_file(barcode_file)
222 | 
223 |     # new obs has only the indices of the exact barcode matches
224 |     new_obs = adata.obs.merge(bc, left_index=True, right_index=True, how="inner")
225 |     adata = adata[new_obs.index, :]
226 |     adata.obs = new_obs
227 |     adata.obsm["spatial"] = adata.obs[["x_pos", "y_pos"]].to_numpy()
228 | 
229 |     return adata
230 | 
231 | 
232 | def attach_puck_variables(adata, puck_variables):
233 |     if "spatial" not in adata.obsm.keys():
234 |         raise SpacemakeError(
235 |             f"this dataset has no spatial information "
236 |             + "available. Please attach the spatial information using the "
237 |             + "spacemake.preprocess.attach_barcode_file() function first"
238 |         )
239 | 
240 |     adata.uns["puck_variables"] = puck_variables
241 | 
242 |     x_pos_max, y_pos_max = tuple(adata.obsm["spatial"].max(axis=0))
243 |     x_pos_min, y_pos_min = tuple(adata.obsm["spatial"].min(axis=0))
244 |     #print(f"PUCK VARS {puck_variables} X MIN {x_pos_min} X MAX {x_pos_max} Y MIN {y_pos_min} Y MAX {y_pos_max}")
245 | 
246 |     width_um = adata.uns["puck_variables"]["width_um"]
247 |     coord_by_um = (x_pos_max - x_pos_min) / width_um
248 | 
249 |     # this can be NaN if only one coordinate (only one cell, will fail)
250 |     if coord_by_um > 0:
251 |         height_um = int((y_pos_max - y_pos_min) / coord_by_um)
252 |     else:
253 |         height_um = 1 # avoid division by zero and error in reports
254 |         coord_by_um = 1
255 | 
256 |     adata.uns["puck_variables"]["height_um"] = height_um
257 |     adata.uns["puck_variables"]["coord_by_um"] = coord_by_um
258 | 
259 |     return adata
260 | 
261 | 
262 | def attach_puck(adata, puck):
263 |     attach_puck_variables(adata, puck.variables)
264 |     adata.uns["puck_name"] = puck.name
265 | 
266 |     return adata


--------------------------------------------------------------------------------
/spacemake/reporting.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import sys
  3 | import logging
  4 | import pandas as pd
  5 | import numpy as np
  6 | 
  7 | 
  8 | def count_dict_collapse_misc(
  9 |     counts, misc_thresh=0.01, total=1, add_up=None, sig_intact=None
 10 | ):
 11 |     out_counts = {}
 12 |     out_frac = {}
 13 | 
 14 |     misc = 0
 15 |     sum = 0
 16 |     if sig_intact is not None:
 17 |         complete = ",".join(sig_intact)
 18 |         everything = set(sig_intact)
 19 |     else:
 20 |         complete = None
 21 |         everything = set()
 22 | 
 23 |     def relkey(key):
 24 |         if sig_intact is None:
 25 |             return key
 26 | 
 27 |         if key == complete:
 28 |             return "complete"
 29 | 
 30 |         obs = set(key.split(","))
 31 |         there = obs & everything
 32 |         extra = obs - everything
 33 |         missing = everything - obs
 34 | 
 35 |         if len(missing) <= len(there):
 36 |             res = "missing_" + ",".join(sorted(missing))
 37 |         else:
 38 |             res = "only_" + ",".join(sorted(there))
 39 |         if extra:
 40 |             res += "_extra_" + ",".join(sorted(extra))
 41 | 
 42 |         return res
 43 | 
 44 |     for key, n in sorted(counts.items()):
 45 |         key = relkey(key)
 46 |         sum += n
 47 |         f = n / float(total)
 48 |         if f < misc_thresh:
 49 |             misc += n
 50 |         else:
 51 |             out_counts[key] = n
 52 |             out_frac[key] = f
 53 | 
 54 |         if misc > 0:
 55 |             out_counts["misc"] = misc
 56 |             out_frac["misc"] = misc / float(total)
 57 | 
 58 |     if add_up is None:
 59 |         other = total - sum
 60 |     else:
 61 |         other = total - counts[add_up]
 62 | 
 63 |     if other > 0:
 64 |         out_counts["NA"] = other
 65 |         out_frac["NA"] = other / float(total)
 66 |     return out_counts, out_frac
 67 | 
 68 | 
 69 | def count_dict_out(counts, title, misc_thresh=0.01, total=1, **kw):
 70 |     print(f"### {title}")
 71 |     out_counts, out_frac = count_dict_collapse_misc(counts, misc_thresh, total, **kw)
 72 |     for key in sorted(out_counts.keys()):
 73 |         print(f"{key}\t{out_counts[key]}\t{out_frac[key]:.3f}")
 74 | 
 75 | 
 76 | def to_hist(d, normed=True):
 77 |     x = np.array(list(d.keys()))
 78 |     x0 = x.min()
 79 |     x1 = x.max() + 1
 80 |     counts = np.zeros(x1, dtype=np.float32)
 81 | 
 82 |     for i in x:
 83 |         counts[i] = d[i]
 84 | 
 85 |     n = counts.sum()
 86 |     if normed:
 87 |         counts /= n
 88 | 
 89 |     return counts, n
 90 | 
 91 | 
 92 | def donut_plot(
 93 |     ax, data, sa=10, explode=None, colors=None, labels=None, title="", cmap="tab20"
 94 | ):
 95 |     import matplotlib.pyplot as plt
 96 | 
 97 |     if labels is None:
 98 |         labels = sorted(data.keys())
 99 | 
100 |     counts = [data.get(k, 0) for k in labels]
101 | 
102 |     if colors is None:
103 |         colors = list(plt.cm.get_cmap(cmap)(np.linspace(0, 1, len(labels))))
104 | 
105 |     wedges, texts = ax.pie(
106 |         counts,
107 |         wedgeprops=dict(width=0.5),
108 |         startangle=sa,
109 |         explode=explode,
110 |         colors=colors,
111 |     )
112 | 
113 |     bbox_props = dict(boxstyle="square,pad=0.3", fc="w", ec="k", lw=0.5)
114 |     kw = dict(arrowprops=dict(arrowstyle="-"), bbox=bbox_props, zorder=0, va="center")
115 |     c = np.array(counts)
116 |     pcts = 100.0 * c / float(c.sum())
117 |     for i, p in enumerate(wedges):
118 |         ang = (p.theta2 - p.theta1) / 2.0 + p.theta1
119 |         y = np.sin(np.deg2rad(ang))
120 |         x = np.cos(np.deg2rad(ang))
121 |         horizontalalignment = {-1: "right", 1: "left"}[int(np.sign(x))]
122 |         connectionstyle = "angle,angleA=0,angleB={}".format(ang)
123 |         kw["arrowprops"].update({"connectionstyle": connectionstyle})
124 |         if pcts[i] > 0:
125 |             ax.text(x * 0.75, y * 0.75, f"{pcts[i]:.1f}", horizontalalignment="center")
126 |             ax.annotate(
127 |                 labels[i],
128 |                 xy=(x, y),
129 |                 xytext=(1.4 * np.sign(x), 1.4 * y),
130 |                 horizontalalignment=horizontalalignment,
131 |                 **kw,
132 |             )
133 | 
134 |     if title:
135 |         ax.set_title(title)
136 | 
137 |     return labels, colors
138 | 
139 | 
140 | def approximate(intvalue):
141 |     suffixes = {9: "G", 6: "M", 3: "k", 0: ""}
142 |     dec = int(np.floor(np.log10(intvalue) / 3)) * 3
143 |     x = np.round(intvalue / 10 ** dec, decimals=2)
144 |     return f"{x:.2f} {suffixes.get(dec, '?')}"
145 | 
146 | 
147 | def len_plot(
148 |     ax,
149 |     data,
150 |     labels=None,
151 |     colors=None,
152 |     xlabel="aligned bases",
153 |     ylabel="fraction",
154 |     title="type",
155 |     cmap="tab20",
156 |     min_count=10,
157 |     cumulative=False,
158 |     legend=True,
159 | ):
160 |     import matplotlib.pyplot as plt
161 | 
162 |     if labels is None:
163 |         labels = sorted(data.keys())
164 | 
165 |     if colors is None:
166 |         colors = plt.cm.get_cmap(cmap)(np.linspace(0, 1, len(labels)))
167 | 
168 |     color_dict = {}
169 |     for cig_type, color in zip(labels, colors):
170 |         color_dict[cig_type] = color
171 | 
172 |         if not cig_type in data:
173 |             continue
174 |         ml, n = to_hist(data[cig_type], normed=True)
175 |         if n < min_count:
176 |             continue
177 | 
178 |         x = np.arange(len(ml))
179 |         y = ml.cumsum() if cumulative else ml
180 | 
181 |         ax.step(
182 |             x,
183 |             y,
184 |             where="mid",
185 |             label=f"{cig_type} ({approximate(n)})",
186 |             color=color,
187 |             lw=2,
188 |             solid_capstyle="round",
189 |         )
190 | 
191 |     if cumulative:
192 |         ax.axhline(0.5, lw=0.5, ls="dashed", color="k")
193 | 
194 |     if legend:
195 |         ax.legend(title=title, bbox_to_anchor=(0.5, 1.05), loc="lower center", ncol=2)
196 | 
197 |     ax.set_xlabel(xlabel)
198 |     ax.set_ylabel(ylabel)
199 | 
200 |     return color_dict
201 | 
202 | 
203 | # def make_colors_explode(labels, cmap="Blues", hilight="bead-related", hicolor="red"):
204 | #     import matplotlib.pyplot as plt
205 | #     ex = np.zeros(len(labels))
206 | #     colors = list(plt.get_cmap(cmap)(np.linspace(0.2, 0.8, len(labels))))
207 | #     try:
208 | #         i = labels.index(hilight)
209 | #     except ValueError:
210 | #         pass
211 | #     else:
212 | #         ex[i] = 0.1
213 | #         colors[i] = hicolor
214 | #     return ex, colors
215 | 


--------------------------------------------------------------------------------
/spacemake/smk.py:
--------------------------------------------------------------------------------
  1 | import logging
  2 | 
  3 | logger_name = "spacemake.main"
  4 | logger = logging.getLogger(logger_name)
  5 | 
  6 | 
  7 | class Spacemake:
  8 |     """Spacemake.
  9 | 
 10 |     Class to access spacemake processed data from python.
 11 | 
 12 |     """
 13 | 
 14 |     def __init__(self, root):
 15 |         """__init__ constructor function of the Spacemake class
 16 | 
 17 |         :param root: Path to the spacemake root directory.
 18 |         :type root: str
 19 |         """
 20 |         from spacemake.config import get_global_config
 21 |         from spacemake.project_df import get_global_ProjectDF
 22 | 
 23 |         self.root = root
 24 |         self.config = get_global_config(root)
 25 |         self.project_df = get_global_ProjectDF(root)
 26 | 
 27 |     def load_processed_adata(
 28 |         self, project_id, sample_id, run_mode_name, umi_cutoff
 29 |     ): #-> anndata.AnnData:
 30 |         """Load spacemake processed data.
 31 | 
 32 |         :param project_id: project_id of the data to be loaded.
 33 |         :type project_id: str
 34 |         :param sample_id: sample_id of the data to be loaded.
 35 |         :type sample_id: str
 36 |         :param run_mode_name: name of the run mode of the data to be loaded.
 37 |             Each sample can have several run_modes during sample addition,
 38 |             here only one option needs to be provided.
 39 |         :type run_mode_name: str
 40 |         :param umi_cutoff: the umi_cutoff of the data to be loaded. Each
 41 |             run_mode can have several umi_cutoffs provided during configuration
 42 |             here only one option needs to be provided.
 43 |         :type umi_cutoff: int
 44 |         :returns: A spacemake processed and analyzed AnnData object, containing
 45 |             the results of the analysis.
 46 |         :rtype: anndata.AnnData
 47 |         """
 48 |         import scanpy as sc
 49 |         # import anndata
 50 | 
 51 | 
 52 |         self.project_df.assert_run_mode(project_id, sample_id, run_mode_name)
 53 |         run_mode = self.config.get_run_mode(run_mode_name)
 54 | 
 55 |         if not int(umi_cutoff) in [int(uc) for uc in run_mode.variables["umi_cutoff"]]:
 56 |             raise SpacemakeError(
 57 |                 f"run_mode={run_mode} has no " + f"umi_cutoff={umi_cutoff}"
 58 |             )
 59 | 
 60 |         adata_raw = self.load_raw_spatial_adata(
 61 |             project_id=project_id, sample_id=sample_id, run_mode_name=run_mode_name
 62 |         )
 63 | 
 64 |         adata = sc.read(
 65 |             f"{self.root}/projects/{project_id}/processed_data/{sample_id}/"
 66 |             + f"illumina/complete_data/automated_analysis/{run_mode_name}/"
 67 |             + f"umi_cutoff_{umi_cutoff}/results.h5ad"
 68 |         )
 69 | 
 70 |         if "run_mode_variables" not in adata.uns.keys():
 71 |             adata.uns["run_mode_variables"] = run_mode.variables
 72 |         if "puck_variables" not in adata.uns.keys():
 73 |             adata.uns["puck_variables"] = adata_raw.uns["puck_variables"]
 74 | 
 75 |         return adata
 76 | 
 77 |     def load_raw_spatial_adata(
 78 |         self, project_id, sample_id, run_mode_name
 79 |     ): #-> anndata.AnnData:
 80 |         """Load raw, spacemake processed data.
 81 | 
 82 |         This function will load the raw countr matrix, created by spacemake.
 83 | 
 84 |         :param project_id: project_id of the raw data to be loaded.
 85 |         :type project_id: str
 86 |         :param sample_id: sample_id of the raw data to be loaded.
 87 |         :type sample_id: str
 88 |         :param run_mode_name: name of the run mode of the raw data to be loaded.
 89 |             Each sample can have several run_modes during sample addition,
 90 |             here only one option needs to be provided.
 91 |         :type run_mode_name: str
 92 |         :returns: A spacemake processed AnnData object, containing unfiltered
 93 |             raw expression data, and all cells or spatial units in the dataset.
 94 |         :rtype: anndata.AnnData
 95 |         """
 96 |         import scanpy as sc
 97 | 
 98 |         self.project_df.assert_run_mode(project_id, sample_id, run_mode_name)
 99 |         run_mode = self.config.get_run_mode(run_mode_name)
100 | 
101 |         dge_type = ""
102 |         dge_cleaned = ""
103 |         polyA_adapter_trimmed = ""
104 |         mm_included = ""
105 | 
106 |         if run_mode.variables["polyA_adapter_trimming"]:
107 |             polyA_adapter_trimmed = ".polyA_adapter_trimmed"
108 | 
109 |         if run_mode.variables["count_intronic_reads"]:
110 |             dge_type = ".all"
111 |         else:
112 |             dge_type = ".exon"
113 | 
114 |         if run_mode.variables["count_mm_reads"]:
115 |             mm_included = ".mm_included"
116 | 
117 |         if run_mode.variables["clean_dge"]:
118 |             dge_cleaned = ".cleaned"
119 | 
120 |         adata = sc.read(
121 |             f"{self.root}/projects/{project_id}/processed_data/{sample_id}/"
122 |             + f"illumina/complete_data/dge/dge{dge_type}{dge_cleaned}"
123 |             + f"{polyA_adapter_trimmed}{mm_included}.spatial_beads.h5ad"
124 |         )
125 | 
126 |         if "puck_variables" not in adata.uns.keys():
127 |             from spacemake.preprocess import attach_puck_variables
128 | 
129 |             adata = attach_puck_variables(
130 |                 adata,
131 |                 puck_variables=self.project_df.get_puck_variables(
132 |                     project_id=project_id, sample_id=sample_id
133 |                 ),
134 |             )
135 | 
136 |         if "run_mode_variables" not in adata.uns.keys():
137 |             adata.uns["run_mode_variables"] = run_mode.variables
138 | 
139 |         return adata
140 | 
141 | 
142 | def get_novosparc_variables(pdf, args):
143 |     """get_novosparc_variables.
144 | 
145 |     :param pdf:
146 |     :param args:
147 |     """
148 |     # assert that sample exists
149 |     pdf.assert_sample(args["project_id"], args["sample_id"])
150 | 
151 |     def populate_variables_from_args(pdf, args, arg_prefix=""):
152 |         """populate_variables_from_args.
153 | 
154 |         :param pdf:
155 |         :param args:
156 |         :param arg_prefix:
157 |         """
158 |         # get sample info
159 |         sample_info = pdf.get_sample_info(
160 |             project_id=args[f"{arg_prefix}project_id"],
161 |             sample_id=args[f"{arg_prefix}sample_id"],
162 |         )
163 | 
164 |         # populate return dictionary
165 |         ret = {
166 |             f"{arg_prefix}project_id": args[f"{arg_prefix}project_id"],
167 |             f"{arg_prefix}sample_id": args[f"{arg_prefix}sample_id"],
168 |         }
169 | 
170 |         # get run mode
171 |         if f"{arg_prefix}run_mode" in args:
172 |             ret[f"{arg_prefix}run_mode"] = args[f"{arg_prefix}run_mode"]
173 |         else:
174 |             run_mode_name = sample_info["run_mode"][0]
175 |             ret[f"{arg_prefix}run_mode"] = run_mode_name
176 |             logger.info(f"No run_mode provided, using {run_mode_name}")
177 | 
178 |         run_mode = pdf.config.get_run_mode(ret[f"{arg_prefix}run_mode"])
179 | 
180 |         if f"{arg_prefix}umi_cutoff" not in args:
181 |             umi_cutoff = run_mode.variables["umi_cutoff"][0]
182 |             ret[f"{arg_prefix}umi_cutoff"] = umi_cutoff
183 |             logger.info(f"No umi_cutoff provided, using {umi_cutoff}")
184 |         else:
185 |             ret[f"{arg_prefix}umi_cutoff"] = args[f"{arg_prefix}umi_cutoff"]
186 | 
187 |         return ret
188 | 
189 |     ret = populate_variables_from_args(pdf, args)
190 | 
191 |     if "reference_project_id" not in args or "reference_sample_id" not in args:
192 |         logger.info(
193 |             "No reference_project_id or reference_sample_id provided,"
194 |             + " running novosparc de-novo..."
195 |         )
196 |         ret["reference_project_id"] = ""
197 |         ret["reference_sample_id"] = ""
198 |         ret["reference_umi_cutoff"] = ""
199 |         ret["reference_run_mode"] = ""
200 |     else:
201 |         pdf.assert_sample(args["reference_project_id"], args["reference_sample_id"])
202 | 
203 |         logger.info(
204 |             "Using (project_id, sample_id)="
205 |             + f"({args['reference_project_id']}, {args['reference_sample_id']})"
206 |             + " reference, running novosparc with reference..."
207 |         )
208 | 
209 |         novosparc_ret = populate_variables_from_args(pdf, args, arg_prefix="reference_")
210 | 
211 |         ret = {**ret, **novosparc_ret}
212 | 
213 |     return ret
214 | 
215 | 
216 | _spacemake_instance = None
217 | 
218 | 
219 | def get_spacemake_object():
220 |     global _spacemake_instance
221 |     if _spacemake_instance is None:
222 |         _spacemake_instance = Spacemake(".")
223 | 
224 |     return _spacemake_instance
225 | 
226 | 
227 | # def get_ConfigFile():
228 | #     spmk = get_spacemake_object()
229 | #     return spmk.config
230 | 
231 | 
232 | # def get_ProjectDF():
233 | #     spmk = get_spacemake_object()
234 | #     return spmk.project_df
235 | 


--------------------------------------------------------------------------------
/spacemake/snakemake/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rajewsky-lab/spacemake/274cd58e7ba9de244aca25c6bbd7cdfe49683753/spacemake/snakemake/__init__.py


--------------------------------------------------------------------------------
/spacemake/snakemake/downsample.smk:
--------------------------------------------------------------------------------
 1 | #########
 2 | # about #
 3 | #########
 4 | __version__ = '0.1.0'
 5 | __author__ = ['Nikos Karaiskos', 'Tamas Ryszard Sztanka-Toth']
 6 | __licence__ = 'GPL'
 7 | __email__ = ['nikolaos.karaiskos@mdc-berlin.de', 'tamasryszard.sztanka-toth@mdc-berlin.de']
 8 | 
 9 | # first create downsampling for 10, 20 .. 90
10 | downsampled_ratios = range(10,100,10)
11 | 
12 | rule downsample_bam:
13 |     input:
14 |         unpack(get_final_bam)
15 |     output:
16 |         downsampled_bam
17 |     params:
18 |         downsample_dir = downsampled_data_prefix,
19 |         ratio = lambda wildcards: wildcards.downsampling_percentage[1:]
20 |     threads: 4
21 |     shell:
22 |         """
23 |         mkdir -p {params.downsample_dir}
24 | 
25 |         sambamba view -o {output} -f bam -t {threads} \
26 |             -s 0.{params.ratio} {input}
27 |         """
28 | 
29 | rule downsampled_filter_mm_reads:
30 |     input:
31 |         downsampled_bam
32 |     output:
33 |         temp(downsampled_bam_mm_included_pipe)
34 |     shell:
35 |         """
36 |         python {repo_dir}/scripts/filter_mm_reads.py \
37 |             --in-bam {input} \
38 |             --out-bam {output}
39 |         """
40 | 
41 | def get_saturation_analysis_input(wildcards):
42 |     # create dictionary with the right downsampling files where  the key
43 |     files = {}
44 | 
45 |     run_modes = get_run_modes_from_sample(wildcards.project_id, wildcards.sample_id)
46 | 
47 |     if project_df.is_spatial(project_id=wildcards.project_id,
48 |                              sample_id=wildcards.sample_id,
49 |                              puck_barcode_file_id=wildcards.puck_barcode_file_id):
50 |         puck_barcode_file_ids = [wildcards.puck_barcode_file_id, 'no_spatial_data']
51 |     else:
52 |         puck_barcode_file_ids = ['no_spatial_data']
53 | 
54 |     for run_mode in run_modes:
55 |         for ratio in downsampled_ratios:
56 |             for puck_barcode_file_id in puck_barcode_file_ids:
57 |                 # dge_files contains dge/summary file paths per run_mode
58 |                 files[f'downsampled_dge_summary.{run_mode}.{ratio}.{puck_barcode_file_id}'] = get_dge_from_run_mode(
59 |                     project_id = wildcards.project_id,
60 |                     sample_id = wildcards.sample_id,
61 |                     run_mode = run_mode,
62 |                     data_root_type = 'downsampled_data',
63 |                     puck_barcode_file_id = puck_barcode_file_id,
64 |                     downsampling_percentage = '/' + str(ratio))['dge_summary']
65 | 
66 |         for puck_barcode_file_id in puck_barcode_file_ids:
67 |             files[f'downsampled_dge_summary.{run_mode}.100.{puck_barcode_file_id}'] = get_dge_from_run_mode(
68 |                 project_id = wildcards.project_id,
69 |                 sample_id = wildcards.sample_id,
70 |                 run_mode = run_mode,
71 |                 data_root_type = 'complete_data',
72 |                 puck_barcode_file_id = puck_barcode_file_id,
73 |                 downsampling_percentage = '')['dge_summary']
74 | 
75 |     return files
76 | 
77 | rule create_saturation_analysis:
78 |     input:
79 |         unpack(get_saturation_analysis_input)
80 |     output:
81 |         downsample_saturation_analysis
82 |     params:
83 |         sample_info = lambda wildcards: project_df.get_sample_info(
84 |             wildcards.project_id, wildcards.sample_id),
85 |         run_modes = lambda wildcards: get_run_modes_from_sample(
86 |             wildcards.project_id, wildcards.sample_id)
87 |     script:
88 |         "scripts/saturation_analysis.Rmd"
89 | 


--------------------------------------------------------------------------------
/spacemake/snakemake/dropseq.smk:
--------------------------------------------------------------------------------
 1 | #########
 2 | # about #
 3 | #########
 4 | __version__ = '0.1.0'
 5 | __author__ = ['Nikos Karaiskos', 'Tamas Ryszard Sztanka-Toth']
 6 | __licence__ = 'GPL'
 7 | __email__ = ['nikolaos.karaiskos@mdc-berlin.de', 'tamasryszard.sztanka-toth@mdc-berlin.de']
 8 | 
 9 | ###################################################
10 | # Snakefile containing the dropseq pipeline rules #
11 | ###################################################
12 | rule remove_smart_adapter:
13 |     input:
14 |         tagged_bam
15 |     output:
16 |         pipe(tagged_trimmed_bam)
17 |     params:
18 |         reports_dir = reports_dir
19 |     shell:
20 |         """
21 |         mkdir -p {params.reports_dir}
22 | 
23 |         {dropseq_tools}/TrimStartingSequence OUTPUT_SUMMARY={params.reports_dir}/remove_smart_adapter.report.txt \
24 |             INPUT={input} \
25 |             OUTPUT={output} \
26 |             SEQUENCE={smart_adapter} \
27 |             MISMATCHES=0 \
28 |             NUM_BASES=5 \
29 |             COMPRESSION_LEVEL=0
30 |         """
31 | 
32 | rule remove_polyA:
33 |     input:
34 |         tagged_trimmed_bam
35 |     output:
36 |         temp(tagged_polyA_adapter_trimmed_bam)
37 |     params:
38 |         reports_dir = reports_dir
39 |     shell:
40 |         """
41 |         {dropseq_tools}/PolyATrimmer OUTPUT_SUMMARY={params.reports_dir}/remove_polyA.report.txt \
42 |             MISMATCHES=0 \
43 |             INPUT={input} \
44 |             OUTPUT={output} \
45 |             NUM_BASES=6
46 |         """
47 | 
48 | rule filter_mm_reads:
49 |     input:
50 |         unpack(get_final_bam)
51 |     output:
52 |         pipe(final_bam_mm_included_pipe)
53 |     shell:
54 |         """
55 |         python {repo_dir}/scripts/filter_mm_reads.py \
56 |             --in-bam {input} \
57 |             --out-bam {output}
58 |         """
59 | 


--------------------------------------------------------------------------------
/spacemake/snakemake/longread.smk:
--------------------------------------------------------------------------------
  1 | #########
  2 | # about #
  3 | #########
  4 | __version__ = '0.2'
  5 | __author__ = ['Marvin Jens', 'Tamas Ryszard Sztanka-Toth']
  6 | __email__ = ['marvin.jens@mdc-berlin.de', 'tamasryszard.sztanka-toth@mdc-berlin.de']
  7 | 
  8 | lr_root = project_dir + "/processed_data/{sample_id}/longread"
  9 | lr_cache_dir = lr_root + "/cache/"
 10 | lr_ann_dir = lr_root + "/annotation/"
 11 | lr_stats_dir = lr_root + "/stats/"
 12 | lr_report_dir = lr_root + "/reports/"
 13 | lr_examples_dir = lr_root + "/examples/"
 14 | lr_cDNA_dir = lr_root + "/cDNA/"
 15 | 
 16 | # targets
 17 | lr_ann = lr_ann_dir + "{sample_id}.annotation.tsv"
 18 | lr_stats = lr_stats_dir + "{sample_id}.stats.tsv"
 19 | lr_report = lr_report_dir + "{sample_id}.donuts.pdf"
 20 | lr_report_stats = lr_stats_dir + "{sample_id}.report.tsv"
 21 | lr_edits = lr_report_dir + "{sample_id}.oligo_edits.pdf"
 22 | lr_cDNA = lr_cDNA_dir + "{sample_id}.fa"
 23 | lr_cDNA_log = lr_cDNA_dir + "{sample_id}.log"
 24 | lr_cDNA_oligo_analysis = lr_cDNA_dir + "{sample_id}.oligo_analysis.csv"
 25 | lr_cDNA_bam = lr_cDNA_dir + "{sample_id}.bam"
 26 | lr_examples = lr_examples_dir + "{sample_id}.txt"
 27 | 
 28 | lr_overview_dir = os.path.join(config['root_dir'], 'longread_overview/')
 29 | lr_overview_pdf = lr_overview_dir + 'fidelity.pdf'
 30 | lr_overview_csv = lr_overview_dir + 'overview.csv'
 31 | 
 32 | LR_RAW_FILES = {}
 33 | LR_SIGNATURE = {}
 34 | LR_REPORT_STATS = []
 35 | def get_longread_output(project_df=None, config=None, **kw):
 36 |     """
 37 |     This function is called from main.smk at least once 
 38 |     to determine which output files need to be generated
 39 |     from longread longread analysis.
 40 |     We use this opportunity to populate LR_RAW_FILES
 41 |     """
 42 |     out_files = []
 43 |     for index, row in project_df.df.iterrows():
 44 |         # for run_mode in row["run_mode"]:
 45 |         #     run_mode_variables = project_df.config.get_run_mode(run_mode).variables
 46 |             if row.longreads:
 47 |                 LR_REPORT_STATS.extend(
 48 |                     expand(lr_report_stats, project_id=index[0], sample_id=index[1])
 49 |                 )
 50 |                 out_files += \
 51 |                 expand(
 52 |                     lr_report,
 53 |                     project_id=index[0],
 54 |                     sample_id=index[1],
 55 |                 ) + \
 56 |                 expand(
 57 |                     lr_edits,
 58 |                     project_id=index[0],
 59 |                     sample_id=index[1],
 60 |                 ) + \
 61 |                 expand(
 62 |                     lr_cDNA_bam,
 63 |                     project_id=index[0],
 64 |                     sample_id=index[1],
 65 |                 ) + \
 66 |                 expand(
 67 |                     lr_cDNA_oligo_analysis,
 68 |                     project_id=index[0],
 69 |                     sample_id=index[1],
 70 |                 )
 71 | 
 72 |                 LR_RAW_FILES[index[1]] = row.longreads
 73 |                 LR_SIGNATURE[index[1]] = row.longread_signature
 74 | 
 75 |     # if we have any longread analysis, generate an overview plot
 76 |     if out_files:
 77 |         out_files.append(lr_overview_pdf)
 78 | 
 79 |     return out_files
 80 | 
 81 | register_module_output_hook(get_longread_output, "longread.smk")
 82 | 
 83 | def get_args(wc):
 84 |     args = f""" \
 85 |     --cache={lr_cache_dir} \
 86 |     --annotation-out={lr_ann_dir} \
 87 |     --stats-out={lr_stats_dir} \
 88 |     --report-out={lr_report_dir} \
 89 |     --examples-out={lr_examples_dir} \
 90 |     --sample={wc.sample_id} \
 91 |     --signature={LR_SIGNATURE[wc.sample_id]} \
 92 |     """.format(sample_id=wc.sample_id, project_id=wc.project_id)
 93 |     return args
 94 | 
 95 | # Use {root_dir}/longread.yaml to set intact_bead layout and other settings that only make sense for
 96 | # long reads
 97 | longread_cmd = """
 98 | python -m spacemake.longread \
 99 |     --parallel={threads} \
100 |     --config=longread.yaml \
101 |     {params.args} \
102 | """
103 | 
104 | rule map_cDNA:
105 |     input: lr_cDNA
106 |     output:
107 |         bam=lr_cDNA_bam,
108 |         tmp=temp(directory(lr_cDNA_dir + 'tmp/'))
109 |     params:
110 |         index = lambda wc : get_star_index(wc)['index'],
111 |         annotation = lambda wc: get_species_genome_annotation(wc)['annotation'],
112 |         star_prefix = lr_cDNA_dir + 'tmp/',
113 |     threads: 64
114 |     shell:
115 |         """
116 |         mkdir -p {params.star_prefix}
117 |         STARlong \
118 |             --runThreadN {threads} \
119 |             --genomeDir  {params.index} \
120 |             --genomeLoad NoSharedMemory \
121 |             --readFilesIn {input} \
122 |             --readFilesType Fastx \
123 |             --outSAMtype BAM Unsorted \
124 |             --outSAMunmapped Within \
125 |             --outSAMattributes All \
126 |             --outSAMprimaryFlag AllBestScore \
127 |             --outStd BAM_Unsorted \
128 |             --outFilterMultimapScoreRange 2 \
129 |             --outFilterScoreMin 0 \
130 |             --outFilterScoreMinOverLread 0 \
131 |             --outFilterMatchNminOverLread 0 \
132 |             --outFilterMatchNmin 30 \
133 |             --outFilterMismatchNmax 1000 \
134 |             --winAnchorMultimapNmax 200 \
135 |             --seedSearchStartLmax 12 \
136 |             --seedPerReadNmax 100000 \
137 |             --seedPerWindowNmax 100 \
138 |             --alignTranscriptsPerReadNmax 100000 \
139 |             --alignTranscriptsPerWindowNmax 10000 \
140 |             --outFileNamePrefix {output.tmp} | \
141 |             {dropseq_tools}/TagReadWithGeneFunction \
142 |             I=/dev/stdin \
143 |             O={output.bam} \
144 |             ANNOTATIONS_FILE={params.annotation}
145 |         """       
146 | 
147 | rule cmd_alnstats:
148 |     input:
149 |         rules.map_cDNA.output.bam
150 |     output:
151 |         oligo_csv=lr_cDNA_oligo_analysis,
152 |     params:
153 |         out = lambda wc: lr_cDNA_dir.format(**wc),
154 |     shell:
155 |         "alnstats --parse-oligos --out-csv={params.out} --out-pdf={params.out} --out-png={params.out} {input}"
156 | 
157 | rule cmd_overview:
158 |     input:
159 |         reports=lambda wc: LR_REPORT_STATS
160 |     output:
161 |         pdf=lr_overview_pdf,
162 |         csv=lr_overview_csv,
163 |     params:
164 |         out_path=lambda wc: lr_overview_dir.format(**wc),
165 |         args=""
166 |     shell: longread_cmd + " overview --output {params.out_path} {input.reports} "
167 | 
168 | rule cmd_report:
169 |     input:
170 |         stats=lr_stats
171 |     output:
172 |         donuts=lr_report,
173 |         repstats=lr_report_stats
174 |     params:
175 |         args=get_args
176 |     threads: 1
177 |     shell: longread_cmd + " report"
178 | 
179 | rule cmd_extract:
180 |     input: 
181 |         fname = lambda wc: LR_RAW_FILES[wc.sample_id],
182 |         ann = lr_ann
183 |     output: lr_cDNA
184 |     params:
185 |         args=get_args
186 |     log: lr_cDNA_log
187 |     # params:
188 |     #     known_barcodes = lambda wc: known_barcodes.get(wc.name,"")
189 |     shell: longread_cmd + " extract {input.fname} 2> {log} > {output}"
190 | 
191 | rule cmd_edits:
192 |     input: 
193 |         fname = lambda wc: LR_RAW_FILES[wc.sample_id],
194 |         stats = lr_stats
195 |     output: lr_edits
196 |     params:
197 |         args=get_args
198 |     threads: 1
199 |     shell: longread_cmd + " edits {input.fname}"
200 | 
201 | rule cmd_annotate:
202 |     input:
203 |         fname = lambda wc: LR_RAW_FILES[wc.sample_id],
204 |         ann = lr_ann
205 |     output: lr_stats
206 |     params:
207 |         args=get_args
208 |     threads: 1
209 |     shell: longread_cmd + " annotate  {input.fname}"
210 | 
211 | rule cmd_align:
212 |     input: 
213 |         fname = lambda wc: LR_RAW_FILES[wc.sample_id]
214 |     output: lr_ann
215 |     params:
216 |         args=get_args
217 |     threads: 64
218 |     shell: longread_cmd + " align {input.fname}"
219 | 


--------------------------------------------------------------------------------
/spacemake/snakemake/merge_samples.smk:
--------------------------------------------------------------------------------
 1 | final_merged_bam = complete_data_root + final_bam_suffix + '.merged.bam'
 2 | merged_ribo_depletion_log = complete_data_root + '/ribo_depletion_log.merged.txt'
 3 | merged_star_log_file = complete_data_root + '/star.merged.Log.final.out'
 4 | 
 5 | rule create_final_merged_bam:
 6 |     input:
 7 |         unpack(get_files_to_merge_snakemake(final_bam))
 8 |     output:
 9 |         final_merged_bam
10 |     threads: 4
11 |     shell:
12 |         "samtools merge -n -@ {threads} -o {output} {input}"
13 | 
14 | rule create_merged_ribo_log:
15 |     input:
16 |         unpack(get_files_to_merge_snakemake(ribo_depletion_log))
17 |     output:
18 |         merged_ribo_depletion_log
19 |     shell:
20 |         "cat {input} > {output}"
21 | 
22 | rule create_merged_star_log:
23 |     input:
24 |         unpack(get_files_to_merge_snakemake(star_log_file))
25 |     output:
26 |         merged_star_log_file
27 |     run:
28 |         logs = []
29 |         for f in input:
30 |             with open(f, 'r') as fi:
31 |                 logs = logs + [fi.read().splitlines()]
32 |         
33 |         indices_to_save = [5, 8, 23, 10, 30]
34 |         value_dict = {ix: 0 for ix in indices_to_save}
35 |         indices_to_normalise = [10]
36 | 
37 |         # extract info from all logfiles, and add them up
38 |         # we are only interested in lines 5, 8, 23, 10, 30
39 |         # so: inp_reads, uniq_mapped_reads, avg_mapped_length, 
40 |         # multi_mapped_reads, unmapped_too_short
41 |         for l in logs:
42 |             for ix in value_dict.keys():
43 |                 value_dict[ix] = value_dict[ix] + float(l[ix].split('\t')[1])
44 | 
45 |         for ix in indices_to_normalise:
46 |             value_dict[ix] = value_dict[ix] / len(logs)
47 | 
48 |         # print to output
49 |         with open(output[0], 'w') as fo:
50 |             ix = 0
51 |             for line in logs[0]:
52 |                 entry = line.split('\t') 
53 |                 if ix in value_dict.keys():
54 |                     fo.write('%s\t%s\n' % (entry[0], value_dict[ix]))
55 |                 else:
56 |                     fo.write('%s\t%s\n' % (entry[0], 'NA'))
57 |                 ix = ix + 1
58 | 


--------------------------------------------------------------------------------
/spacemake/snakemake/scripts/.gitignore:
--------------------------------------------------------------------------------
1 | qc_sequencing_create_sheet_cache
2 | qc_sequencing_create_sheet_files
3 | automated_analysis_create_report_files
4 | automated_analysis_create_report_cache
5 | *.html
6 | .ipynb_checkpoints
7 | *.ipynb
8 | 


--------------------------------------------------------------------------------
/spacemake/snakemake/scripts/automated_analysis.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import pandas as pd
 3 | import scanpy as sc
 4 | import squidpy as sq
 5 | 
 6 | from spacemake.spatial.util import detect_tissue
 7 | 
 8 | # expect fitlered .h5ad dge, with spatial coords attached, tissue detected
 9 | adata = sc.read_h5ad(snakemake.input[0])
10 | umi_cutoff = int(snakemake.wildcards['umi_cutoff'])
11 | 
12 | # filter_umi or detect tissue
13 | # if data spatial and detect_tissue=True
14 | if 'spatial' in adata.obsm.keys() and snakemake.params['run_mode_variables']['detect_tissue']:
15 |     adata = detect_tissue(adata, umi_cutoff)
16 |     print('tissue detection')
17 | else:
18 |     print(f'filtering by umi cutoff: {umi_cutoff}')
19 |     adata = adata[adata.obs.total_counts > umi_cutoff, :]
20 | 
21 | # make the var indices (gene names) and obs indices (cell barcode) unique
22 | adata.obs_names_make_unique()
23 | adata.var_names_make_unique()
24 | 
25 | # save the raw counts
26 | adata.raw = adata
27 | 
28 | # identify highly variable genes if we have any observations
29 | nrow, ncol = adata.shape
30 | 
31 | # require at least 1000 genes expressed in the sample and at least 100 cells
32 | if nrow > 100 and ncol >= 1000:
33 |     print('starting analysis')
34 |     try:
35 |         sc.pp.highly_variable_genes(adata, flavor='seurat_v3', n_top_genes=2000)
36 |     except ValueError:
37 |         sc.pp.highly_variable_genes(adata, flavor='seurat_v3', n_top_genes=1000, span = 1)
38 |     
39 |     # calculate log(cpm)
40 |     print('normalising and log-scaling')
41 |     sc.pp.normalize_total(adata, target_sum=1e4)
42 |     sc.pp.log1p(adata, base=2)
43 |     
44 |     # PCA ANALYSIS
45 |     print('calculating pca components')
46 |     sc.tl.pca(adata, svd_solver='arpack')
47 |     
48 |     # get number of pcs-s identified. Sometimes can be smaller than 50, if
49 |     # less than 50 cells pass the threshold
50 |     n_pcs = adata.uns['pca']['variance'].size
51 |     # limit the number of pcs to 50
52 |     n_pcs = n_pcs if n_pcs < 40 else 40
53 |     
54 |     # Compute the neighborhood graph
55 |     print('computing neighborhood graph')
56 |     sc.pp.neighbors(adata, n_pcs=n_pcs)
57 |     
58 |     # compute UMAP
59 |     # for a very low number of cells, scanpy will throw an error here
60 |     try:
61 |         print('dimensionality reduction')
62 |         sc.tl.umap(adata)   
63 |     except TypeError:
64 |         pass
65 | 
66 |     # find out the clusters
67 |     # restrict to max 20 clusters
68 |     resolution = [0.4, 0.6, 0.8, 1.0, 1.2]
69 | 
70 |     print('clustering')
71 | 
72 | 
73 |     if snakemake.params['is_spatial']:
74 |         sq.gr.spatial_neighbors(adata, coord_type="generic")
75 | 
76 |     for res in resolution:
77 |         try:
78 |             res_key = 'leiden_' + str(res)
79 |             
80 |             sc.tl.leiden(adata, resolution = res, key_added = res_key)
81 |             
82 |             # finding marker genes
83 |             print(f'ranking genes for resolution {res}')
84 |             sc.tl.rank_genes_groups(adata, res_key, method='t-test', key_added = 'rank_genes_groups_' + res_key, pts=True,
85 |                 use_raw = False)
86 |             if snakemake.params['is_spatial']:
87 |                 # calculate nhood enrichment from squidpy
88 |                 try:
89 |                     sq.gr.nhood_enrichment(adata, cluster_key=res_key)
90 |                 except ValueError:
91 |                     print('Only one cluster found in the data - skipping neighborhood analysis')
92 |                     pass
93 |         except ZeroDivisionError as e:
94 |             pass
95 | 
96 | adata.write(snakemake.output[0])
97 | 


--------------------------------------------------------------------------------
/spacemake/snakemake/scripts/automated_analysis_create_processed_data_files.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | import pandas as pd
  3 | import scanpy as sc
  4 | 
  5 | # save expression values as a long_df
  6 | def create_long_df(expr_matrix, id_vars = ['cell_bc']):
  7 |     long_df = expr_matrix.melt(id_vars = id_vars, var_name = 'gene', value_name = 'expr')
  8 |     long_df = long_df[long_df.expr > 0]
  9 |     return long_df
 10 | 
 11 | ##############
 12 | # LOAD ADATA #
 13 | ##############
 14 | 
 15 | adata = sc.read(snakemake.input[0])
 16 | 
 17 | uns_keys = ['hvg', 'leiden', 'log1p', 'neighbors', 'pca', 'umap']
 18 | 
 19 | # all the keys have to be in adata.uns
 20 | adata_complete = any([key in adata.uns.keys() for key in uns_keys])
 21 | 
 22 | #################
 23 | # TOP20 markers #
 24 | #################
 25 | if not adata_complete:
 26 |     pd.DataFrame().to_csv(snakemake.output['cluster_markers'])
 27 |     pd.DataFrame().to_csv(snakemake.output['nhood_enrichment'])
 28 | else:
 29 |     res_keys = adata.obs.columns[adata.obs.columns.str.startswith('leiden_')]
 30 |     
 31 |     top_20_marker_dfs = []
 32 |     nhood_enrichment_dfs = []
 33 |     
 34 |     # Iterate over different resolution values
 35 |     for res_key in res_keys:
 36 |         rank_key = 'rank_genes_groups_' + res_key
 37 |         
 38 |         if not 'names' in adata.uns[rank_key]:
 39 |             continue
 40 | 
 41 |         df = pd.DataFrame(adata.uns[rank_key]['names'])\
 42 |                 .melt(var_name = 'cluster', value_name = 'gene')
 43 |         
 44 |         for key in ['logfoldchanges', 'pvals', 'pvals_adj']:
 45 |             df_key = pd.DataFrame(adata.uns[rank_key][key])\
 46 |                 .melt(var_name = 'cluster', value_name = key)
 47 |             df[key] = df_key[key]
 48 |             # set the index to gene-cluster pair
 49 |             
 50 |         df.set_index(['gene', 'cluster'], inplace=True)
 51 |             
 52 |         for key in ['pts', 'pts_rest']:
 53 |             # get the percentage expressed in cluster and rest
 54 |             df2 = adata.uns[rank_key][key]
 55 |             df2['gene'] = df2.index
 56 |             df2 = df2.melt(var_name='cluster', id_vars='gene')\
 57 |                 .set_index(['gene', 'cluster'])
 58 |             
 59 |             df[key] = df2.loc[df.index].value
 60 |              
 61 |         df['resolution'] = res_key.split('_')[1]
 62 |         df.reset_index(inplace=True)
 63 | 
 64 |         # Restrict to top X markers
 65 |         df = df.groupby("cluster").head(20)
 66 |          
 67 |         top_20_marker_dfs.append(df)
 68 | 
 69 |         if snakemake.params['is_spatial']:
 70 |             try:
 71 |             # get nhood data
 72 |                 df = pd.DataFrame(adata.uns[f'{res_key}_nhood_enrichment']['zscore'])
 73 |                 df = pd.melt(df.reset_index(), id_vars='index')\
 74 |                     .rename(columns={'index': 'cluster_a',
 75 |                                     'variable': 'cluster_b',
 76 |                                     'value': 'zscore'})
 77 |                 df['resolution'] = res_key.split('_')[1]
 78 | 
 79 |                 nhood_enrichment_dfs.append(df)
 80 |             except KeyError:
 81 |                 pass
 82 |         
 83 |     pd.concat(top_20_marker_dfs).to_csv(snakemake.output['cluster_markers'], index=False)
 84 | 
 85 |     if snakemake.params['is_spatial']:
 86 |         pd.concat(nhood_enrichment_dfs).to_csv(snakemake.output['nhood_enrichment'], index=False)
 87 |     else:
 88 |         # output empty csv file
 89 |         pd.DataFrame().to_csv(snakemake.output['nhood_enrichment'])
 90 | 
 91 | 
 92 | ###############
 93 | # SAVE OBS DF #
 94 | ###############
 95 | obs_df = adata.obs
 96 | 
 97 | if adata_complete:
 98 |     obs_df = sc.get.obs_df(adata, obsm_keys=[('X_umap', 0), ('X_umap', 1)])\
 99 |         .join(obs_df)\
100 |         .rename(columns={'X_umap-0':'umap_0', 'X_umap-1':'umap_1'})
101 | 
102 | obs_df.index.set_names('cell_bc', inplace=True)
103 | 
104 | obs_df.to_csv(snakemake.output['obs_df'])
105 | 
106 | ###############
107 | # SAVE VAR DF #
108 | ###############
109 | adata.var.index.set_names('gene_name', inplace=True)
110 | 
111 | adata.var.to_csv(snakemake.output['var_df'])
112 | 


--------------------------------------------------------------------------------
/spacemake/snakemake/scripts/clean_top_barcodes.py:
--------------------------------------------------------------------------------
 1 | import pysam
 2 | import difflib
 3 | 
 4 | # we need to reverse it
 5 | optical_primer = 'GAATCACGATACGTACACCA'[::-1]
 6 | optical_primer_len = len(optical_primer)
 7 | 
 8 | nucl_stretches = ['TTTTTT', 'AAAAAAAA', 'CCCCCCCC', 'GGGGGGGG']
 9 | 
10 | with open(snakemake.input[0], 'r') as fi, open(snakemake.output[0], 'w') as fo:
11 |     for barcode in fi:
12 |         barcode = barcode.strip()
13 |         barcode_len = len(barcode)
14 |         
15 |         # clean up TAG=XC artifact
16 |         if barcode == 'TAG=XC':
17 |             continue
18 |         
19 |         matcher = difflib.SequenceMatcher(None, optical_primer, barcode)
20 |         
21 |         pos_optical_primer, pos_barcode, kmer_len = matcher.find_longest_match(0, optical_primer_len, 0, barcode_len)
22 |         
23 |         # if overlap with barcode is bigger than 4, and the overlap is at the end, skip
24 |         if kmer_len > 3 and pos_barcode + kmer_len == barcode_len:
25 |             continue
26 |         
27 |         # if overlap at least 7, anywhere, skip
28 |         if kmer_len > 6:
29 |             continue
30 |         
31 |         # if any of the nucl stretches is in the barcode, skip
32 |         if any([stretch in barcode for stretch in nucl_stretches]):
33 |             continue
34 |         
35 |         # write line to file
36 |         _ = fo.write(barcode + '\n')
37 | 


--------------------------------------------------------------------------------
/spacemake/snakemake/scripts/create_sample_db.R:
--------------------------------------------------------------------------------
 1 | library(tidyverse)
 2 | library(magrittr)
 3 | 
 4 | metadata <- read_csv(snakemake@input[[1]])
 5 | 
 6 | 
 7 | readStarLog <- function(log_file){
 8 | 
 9 |     out = list()
10 |     lines = readLines(log_file)
11 | 
12 |     out$input_reads = (lines[6] %>% strsplit('\t') %>% unlist)[2] %>% as.integer
13 | 
14 |     out$uniq_mapped_reads = (lines[9] %>% strsplit('\t') %>% unlist)[2] %>% as.integer
15 | 
16 |     #out$avg_length = (lines[11] %>% strsplit('\t') %>% unlist)[2] %>% as.numeric
17 |     
18 |     tibble(observation=names(out), value=unlist(unname(out)))
19 | }
20 | 
21 | read_metrics <- metadata %>%
22 |     select(project_id, sample_id, puck_id, species, sequencing_date) %>%
23 |     mutate(star_log = paste0('/data/rajewsky/projects/slide_seq/projects/', project_id, '/processed_data/', sample_id, '/illumina/complete_data/star_Log.final.out'),
24 |            read_types =paste0('/data/rajewsky/projects/slide_seq/projects/', project_id, '/processed_data/', sample_id, '/illumina/complete_data/split_reads/read_type_num.txt')) %>%
25 | 
26 |     filter(file.exists(star_log), file.exists(read_types)) %>%
27 |     mutate(star_log = map(star_log,
28 |                           ~ readStarLog(.))) %>%
29 |     unnest(star_log) %>%
30 |     mutate(read_types = map(read_types,
31 |                             ~ read_table2(., col_names=c('rt_obs', 'rt_value')))) %>%
32 |     unnest(read_types) %>%
33 |     mutate(rt_obs = tolower(rt_obs)) %>%
34 |     spread(rt_obs, rt_value) %>%
35 |     spread(observation, value)
36 | 
37 | read_metrics %>%
38 |     write_delim(snakemake@output[[1]], '\t')
39 | 


--------------------------------------------------------------------------------
/spacemake/snakemake/scripts/create_sample_overview.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | header-includes:
  3 |     - \usepackage{float}
  4 |     - \usepackage[table]{xcolor}
  5 | output:
  6 |     html_document:
  7 |         toc: true
  8 |         toc_depth: 6
  9 | classoption: landscape
 10 | geometry: margin=0.5cm
 11 | version: 0.1.1
 12 | author: Tamas Ryszard Sztanka-Toth, Nikolaos Karaiskos
 13 | email: tamasryszard.sztanka-toth@mdc-berlin.de, nikolaos.karaiskos@mdc.berlin.de
 14 | license: GPL
 15 | title: Sample overview
 16 | pagetitle: Sample overview
 17 | date: "`r format(Sys.time(),'%d/%m/%y')`"
 18 | ---
 19 | 
 20 | ```{r knitr_options, include=FALSE, cache=FALSE}
 21 | knitr::opts_chunk$set(
 22 |   cache = F,
 23 |   autodep = TRUE,
 24 |   message = FALSE,
 25 |   warning = FALSE,
 26 |   comment = NA
 27 | )
 28 | 
 29 | options(knitr.table.format ='markdown')
 30 | ```
 31 | 
 32 | ```{r functions, echo = F}
 33 | readStarLog <- function(log_file){
 34 | 
 35 | 		out = list()
 36 | 		lines = readLines(log_file)
 37 | 	
 38 | 		out$input_reads = (lines[6] %>% strsplit('\t') %>% unlist)[2] %>% as.integer
 39 | 
 40 | 		out$uniq_mapped_reads = (lines[9] %>% strsplit('\t') %>% unlist)[2] %>% as.integer
 41 | 
 42 | 		#out$avg_length = (lines[11] %>% strsplit('\t') %>% unlist)[2] %>% as.numeric
 43 | 		
 44 |         tibble(observation=names(out), value=unlist(unname(out)))
 45 | 	}
 46 | ```
 47 | 
 48 | ```{r load_projects_puck_info, echo=F}
 49 | library(tidyverse)
 50 | library(magrittr)
 51 | metadata <- read_csv(snakemake@input[[1]])
 52 | ```
 53 | 
 54 | ```{r collect_data, echo = F}
 55 | root_dir <- snakemake@config$root_dir
 56 | read_metrics <- metadata %>%
 57 |     select(project_id, sample_id, puck_id, species, sequencing_date) %>%
 58 |     mutate(star_log = paste0(root_dir, '/projects/', project_id, '/processed_data/', sample_id, '/illumina/complete_data/star_Log.final.out'),
 59 |            read_types =paste0(root_dir,'/projects/', project_id, '/processed_data/', sample_id, '/illumina/complete_data/split_reads/read_type_num.txt')) %>%
 60 |     mutate(star_log = map(star_log,
 61 |                           ~ readStarLog(.))) %>%
 62 |     unnest(star_log) %>%
 63 |     mutate(read_types = map(read_types,
 64 |                             ~ read_table2(., col_names=c('rt_obs', 'rt_value')))) %>%
 65 |     unnest(read_types) %>%
 66 |     mutate(rt_obs = tolower(rt_obs)) %>%
 67 |     spread(rt_obs, rt_value) %>%
 68 |     spread(observation, value)
 69 | ```
 70 | 
 71 | ```{r show_sample_table, echo = F}
 72 | library(kableExtra)
 73 | to_table <- read_metrics %>%
 74 |     mutate(um_r = uniq_mapped_reads) %>%
 75 |     gather('obs', 'val', intergenic, amb, coding, intronic, utr) %>%
 76 |     mutate(val_p = round(val / um_r, 2),
 77 |            val = round(val / 1e6, 2),
 78 |            # add ratio in paranthesis if obs is not cds
 79 |            val = paste0(val, ' (', val_p, ')'),
 80 |            uniq_mapped_reads = round(uniq_mapped_reads / 1e6, 2),
 81 |            input_reads = round(input_reads / 1e6, 2),
 82 |            uniq_mapped_reads = paste0(uniq_mapped_reads, ' (', round(uniq_mapped_reads / input_reads, 2), ')')) %>%
 83 |     select(-um_r, -val_p) %>%
 84 |     spread(obs, val) %>%
 85 |     arrange(species) %>%
 86 |     select(sample_id, puck_id, species, sequencing_date, input_reads, uniq_mapped_reads, coding, utr, intergenic, intronic, amb) %>%
 87 |     rename(uniq_m = uniq_mapped_reads,
 88 |            input_r = input_reads,
 89 |            cds = coding)
 90 | ```
 91 | 
 92 | ```{r load_strand_info, echo = F}
 93 | strand_info <- metadata %>%
 94 |     select(project_id, sample_id, puck_id, species, sequencing_date) %>%
 95 |     mutate(filename = paste0(root_dir, '/projects/', project_id, '/processed_data/', sample_id, '/illumina/complete_data/split_reads/strand_type_num.txt'),
 96 |            content = map(filename, ~read_table2(., col_names = c('obs', 'num')))) %>%
 97 |     unnest(content) %>%
 98 |     select(-filename, project_id) %>%
 99 |     group_by(sample_id) %>%
100 |     mutate(num_sum = sum(num),
101 |            num_ratio = round(num / num_sum, 2),
102 |            num = round(num / 1e6, 2),
103 |            num = paste0(num, ' (', num_ratio, ')')) %>%
104 |     select(-num_ratio, -num_sum) %>%
105 |     spread(obs, num)
106 | ```
107 | 
108 | ```{r load_barcode_metadata, echo = F}
109 | umi_cutoffs <- c(1, 10, 50, 100)
110 | 
111 | load_filter_dge <- function(x, y){
112 |     read_table2(x, skip=6) %>%
113 |         filter(NUM_TRANSCRIPTS > y)
114 | }
115 | 
116 | read_dge_summary <- function(filename){
117 |     tibble(umi_cutoff = umi_cutoffs, filename=filename) %>%
118 |         mutate(dat = map2(filename, umi_cutoff, load_filter_dge)) %>%
119 |         select(-filename) %>%
120 |         unnest(dat) %>%
121 |         group_by(umi_cutoff) %>%
122 |         summarise(
123 |                 median_umi = median(NUM_TRANSCRIPTS),
124 |                 median_reads = median(NUM_GENIC_READS),
125 |                 median_genes = median(NUM_GENES),
126 |                 median_pcr = median(round(NUM_GENIC_READS / NUM_TRANSCRIPTS, 1)),
127 |                 mean_umi = as.integer(mean(NUM_TRANSCRIPTS)),
128 |                 mean_reads = as.integer(mean(NUM_GENIC_READS)),
129 |                 mean_genes = as.integer(mean(NUM_GENES)),
130 |                 num_beads = n())
131 |         
132 | }
133 | 
134 | barcode_metadata <- metadata %>%
135 |     select(project_id, sample_id, puck_id, species, sequencing_date) %>%
136 |     mutate(filename = paste0(root_dir, '/projects/', project_id, '/processed_data/', sample_id, '/illumina/complete_data/dge/')) %>%
137 |     mutate(filename = ifelse(file.exists(paste0(filename, 'dge_all_summary.txt')),
138 |                              paste0(filename, 'dge_all_summary.txt'),
139 |                              paste0(filename, 'dge_all_cleaned_summary.txt')),
140 |            content = map(filename, ~read_dge_summary(.))) %>%
141 |     select(-filename, -project_id) %>%
142 |     unnest(content)
143 | ```
144 | 
145 | ## Overview
146 | 
147 | We show here downstream metadata for each experiment performed in the sts project. There are three types of tables:
148 | 
149 | * Read information table: containing the parsed output of mapping, such as input read number, uniquely mapped read number etc.
150 | * Expression summary table: containing median number of umis, genes, reads (and mean) per bead for each sample. This is done after applying a UMI filter of 1, 10, 50, 100.
151 | * Strand information table: containing the numbers for reads mapping to the correct strand
152 | 
153 | Each table has the following 4 columns: sample\_id, puck\_id, species, sequencing\_date
154 | 
155 | ### Table column description
156 | 
157 | __Read information table__
158 | 
159 | * input\_r: number of input reads (millions) from the flowcell
160 | * uniq\_m: number of uniquely mapped reads (millions). In parantheses ratio to input\_r
161 | * cds, utr, intergenic, intronic, amb: coding, utr, intergenic, intronic and ambient (overlapping genes on both strands, or cannot be assigned to a single gene part). In millions, in parantheses ratio to uniq\_m.
162 | 
163 | __Expression summary tables__
164 | 
165 | All columns here are in raw counts. We have mean and median for UMIs, genes, reads (all per bead). Median pcr is the median of reads/umi (per bead).
166 | 
167 | __Strand information table__
168 | 
169 | Here there are 6 columns: minus\_AMB,  minus\_minus, minus\_plus, plus\_AMB, plus\_minus, plus\_plus. The first part is the position of the read (plus or minus strand) the second is the position of the mapped gene. AMB means that the mapped gene is ambient (overlapping genes on different strand) or that the read is intergenic.
170 | 
171 | ## Tables by species containing sequencing metadata
172 | 
173 | 
174 | ```{r print_by_species, echo = F, results = 'asis'}
175 | for(s in unique(to_table$species)){
176 |     cat(paste0('### ', s, ' samples'))
177 |     cat('\n')
178 | 
179 |     cat('#### Read information table\n') 
180 |     to_table %>%
181 |         filter(species == s) %>%
182 |         kable("html") %>%
183 |         kable_styling('striped', font_size=12)  %>%
184 |         row_spec(row=0, bold=T) %>%
185 |         print
186 | 
187 |     cat('\n')
188 |     cat('[Back to top](#)\n\n')
189 | 
190 |     cat('#### Expression summary tables\n')
191 | 
192 |     for(cutoff in umi_cutoffs){
193 |         cat(paste0('##### UMI cutoff: ', cutoff))
194 |         cat('\n')
195 | 
196 |         barcode_metadata %>%
197 |             filter(species == s, umi_cutoff == cutoff) %>%
198 |             kable("html") %>%
199 |             kable_styling('striped', font_size=12)  %>%
200 |             row_spec(row=0, bold=T) %>%
201 |             print
202 | 
203 |         cat('\n')
204 |         cat('[Back to top](#)\n\n')
205 | 
206 |     }
207 | 
208 |     cat('#### Strand information table\n')
209 | 
210 |     strand_info %>%
211 |         filter(species == s) %>%
212 |         kable("html") %>%
213 |         kable_styling('striped', font_size=12)  %>%
214 |         row_spec(row=0, bold=T) %>%
215 |         print
216 | 
217 |     cat('\n')
218 |     cat('[Back to top](#)\n\n')
219 | }
220 | ```
221 | 
222 | 
223 | 


--------------------------------------------------------------------------------
/spacemake/snakemake/scripts/create_spatial_dge.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import pandas as pd
 3 | import scanpy as sc
 4 | from spacemake.util import detect_tissue, attach_barcode_file
 5 | 
 6 | dge_path = snakemake.input['dge']
 7 | 
 8 | # umi cutoff 
 9 | umi_cutoff = int(snakemake.wildcards['umi_cutoff'])
10 | 
11 | adata = sc.read_h5ad(dge_path)
12 | 
13 | print('data read')
14 | 
15 | has_barcode_file = 'barcode_file' in snakemake.input.keys()
16 | 
17 | # ATTACH BARCODE FILE #
18 | if has_barcode_file:
19 |     adata = attach_barcode_file(adata, snakemake.input['barcode_file'])
20 | 
21 | # filter out cells based on umi, and genes based on number of cells
22 | sc.pp.filter_cells(adata, min_genes=1)
23 | sc.pp.filter_genes(adata, min_cells=3)
24 | 
25 | print('data filtered')
26 | 
27 | # DETECT TISSUE #
28 | # if there is no barcode file, filter adata based on UMI, otherwise detect tissue with UMI cutoff
29 | if has_barcode_file and snakemake.params['downstream_variables']['detect_tissue']:
30 |     tissue_indices = detect_tissue(adata, umi_cutoff)
31 |     adata = adata[tissue_indices, :]
32 | else:
33 |     adata = adata[adata.obs.total_counts > umi_cutoff, :]
34 | 
35 | adata.write(snakemake.output[0])
36 | 


--------------------------------------------------------------------------------
/spacemake/snakemake/scripts/filter_mm_reads.py:
--------------------------------------------------------------------------------
 1 | import pysam
 2 | import datetime
 3 | import argparse
 4 | import numpy as np
 5 | 
 6 | counted_regions = ['UTR', 'CODING']
 7 | 
 8 | def select_alignment(alignments):
 9 |     read_names = [aln.query_name for aln in alignments]
10 |     if read_names.count(read_names[0]) != len(read_names):
11 |         print(read_names)
12 |         raise Exception(f'input alignments do not come from the same read')
13 | 
14 |     def is_exonic(aln):
15 |         if not aln.has_tag('XF'):
16 |             return False
17 | 
18 |         return aln.get_tag('XF') in counted_regions
19 | 
20 |     alignments_are_exonic = np.array([is_exonic(aln) for aln in alignments])
21 | 
22 |     exonic_ix = np.where(alignments_are_exonic  == True)[0]
23 | 
24 |     num_exonic = exonic_ix.shape[0]
25 | 
26 |     if num_exonic == 1:
27 |         # if only one exonic reads from the group
28 |         # return the exonic indices
29 |         return alignments[exonic_ix[0]]
30 |     else:
31 |         return None
32 | 
33 | if __name__ == '__main__':
34 |     parser = argparse.ArgumentParser(description='Filter out ambiguous multi-mapper reads')
35 | 
36 |     parser.add_argument('--in-bam', help='input bam')
37 |     parser.add_argument('--out-bam', help='output bam')
38 | 
39 |     args = parser.parse_args()
40 |     print(args)
41 | 
42 |     bam_in = pysam.AlignmentFile(args.in_bam, "rb")
43 | 
44 |     bam_out = pysam.AlignmentFile(args.out_bam, 'wb', header= bam_in.header)
45 |     counter = 0
46 |     start_time = datetime.datetime.now()
47 |     finish_time = start_time
48 |     total_start_time = datetime.datetime.now()
49 |     time_interval = 30
50 | 
51 |     multi_mappers = []
52 |     
53 |     for aln in bam_in.fetch(until_eof=True):
54 |         counter += 1
55 | 
56 |         finish_time = datetime.datetime.now()
57 |         delta_seconds = (finish_time - start_time).seconds
58 |         total_elapsed_seconds = (finish_time - total_start_time).total_seconds()
59 | 
60 |         if delta_seconds >= time_interval:
61 |             formatted_time = finish_time.strftime('%Y-%m-%d %H:%M:%S')
62 |             records_per_second = counter / delta_seconds
63 | 
64 |             print(f'Processed {counter:,} records in {total_elapsed_seconds:,.0f} seconds. Average processing rate: {records_per_second:,.0f} records/second. Current time: {formatted_time}')
65 |             
66 |             start_time = finish_time
67 |         
68 |         mapped_number = aln.get_tag('NH')
69 | 
70 |         if mapped_number == 1:
71 |             bam_out.write(aln)
72 |         else:
73 |             if len(multi_mappers) < (mapped_number - 1):
74 |                 # still some multimappers missing. we need to add the alignments 
75 |                 # until the last one to the list
76 |                 multi_mappers.append(aln)
77 |             else:
78 |                 # add the last alignment
79 |                 multi_mappers.append(aln)
80 |                 # decide which, if any, to keep
81 |                 aln_to_keep = select_alignment(multi_mappers)
82 | 
83 |                 if aln_to_keep is not None:
84 |                     # set aln secondary flag to 0, so that it is flagged as primary
85 |                     # secondary flag is at 0x100, so 8th bit (starting from 0)
86 |                     aln_to_keep.flag = aln_to_keep.flag & ~(1<<8)
87 |                     bam_out.write(aln_to_keep)
88 |                     
89 |                 # reset multimapper list
90 |                 multi_mappers = []
91 | 
92 |     formatted_time = finish_time.strftime("%Y-%m-%d %H:%M:%S")
93 |     print(f'Finished processing {counter:,} records in {total_elapsed_seconds:,.0f} seconds. Current time: {formatted_time}')
94 | 


--------------------------------------------------------------------------------
/spacemake/snakemake/scripts/fix_bam_header.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | __version__ = "0.9"
 3 | __author__ = [
 4 |     "Marvin Jens",
 5 | ]
 6 | __license__ = "GPL"
 7 | __email__ = [
 8 |     "marvin.jens@mdc-berlin.de",
 9 | ]
10 | 
11 | import pysam
12 | import argparse
13 | import os
14 | import sys
15 | import logging
16 | 
17 | 
18 | def print_header(header):
19 |     for k, v in sorted(header.items()):
20 |         if type(v) == dict:
21 |             vstr = " ".join([f"{x}:{y}" for x, y in sorted(v.items())])
22 |             print(f"@{k}:\t{vstr}")
23 |         elif type(v) == list:
24 |             for row in v:
25 |                 if type(row) == dict:
26 |                     vstr = " ".join([f"{x}:{y}" for x, y in sorted(row.items())])
27 |                 else:
28 |                     vstr = str(row)
29 |                 print(f"@{k}:\t{vstr}")
30 |         else:
31 |             print(f"@{k}:\t{v}")
32 | 
33 | 
34 | def merge_headers(orig, star):
35 |     merged = dict(orig)
36 |     # most recent program should be on top
37 |     merged["PG"] = star["PG"] + merged["PG"]
38 |     merged["SQ"] = star["SQ"]
39 |     merged["HD"]["SO"] = star["HD"]["SO"]  # sorted by
40 | 
41 |     return merged
42 | 
43 | 
44 | if __name__ == "__main__":
45 |     parser = argparse.ArgumentParser(
46 |         description="Fix .bam header of the STAR mapped output .bam"
47 |     )
48 | 
49 |     parser.add_argument("--in-bam-star", help="mapped star bam input")
50 |     parser.add_argument("--in-bam-tagged", help="unmapped dropseq tagged bam")
51 |     parser.add_argument("--out-bam", help="output bam")
52 | 
53 |     args = parser.parse_args()
54 | 
55 |     bam_star = pysam.AlignmentFile(args.in_bam_star, "rb")
56 |     bam_tagged = pysam.AlignmentFile(args.in_bam_tagged, "rb", check_sq=False)
57 | 
58 |     star_header = bam_star.header.to_dict()
59 |     tagged_header = bam_tagged.header.to_dict()
60 |     merged_header = merge_headers(tagged_header, star_header)
61 |     # print(f"STAR header")
62 |     # print_header(star_header)
63 | 
64 |     # print(f"original header")
65 |     # print_header(tagged_header)
66 | 
67 |     # print("merged header")
68 |     # print_header(merged_header)
69 | 
70 |     # copy input to output, just with the new header
71 |     bam_out = pysam.AlignmentFile(args.out_bam, "wb", header=merged_header)
72 |     for aln in bam_star.fetch(until_eof=True):
73 |         bam_out.write(aln)
74 | 


--------------------------------------------------------------------------------
/spacemake/snakemake/scripts/kmer_stats_from_fastq.py:
--------------------------------------------------------------------------------
 1 | import pandas as pd
 2 | import gzip
 3 | import itertools
 4 | import os
 5 | from collections import Counter
 6 | 
 7 | bases = ["A", "C", "T", "G", "N"]
 8 | kmer_len = int(snakemake.params["kmer_len"])
 9 | 
10 | # in this case we look for 4 mers
11 | kmers = ["".join(kmer) for kmer in itertools.product(bases, repeat=kmer_len)]
12 | 
13 | position_counts = None
14 | read_len = 0
15 | position_list = []
16 | 
17 | read_kmer_hashes = []
18 | 
19 | with gzip.open(snakemake.input[0], "rt") as fastq_in:
20 |     line = 0
21 |     for read in fastq_in:
22 |         if line == 1:
23 |             read = read.strip("\n")
24 |             read_len = len(read)
25 |             position_list = list(range(read_len - kmer_len + 1))
26 |             kmer_hashes = [
27 |                 "_".join(prod)
28 |                 for prod in itertools.product(kmers, [str(x) for x in position_list])
29 |             ]
30 |             position_counts = pd.DataFrame(0, index=kmer_hashes, columns=["count"])
31 | 
32 |         # if line is a read
33 |         if line % 4 == 1:
34 |             kmer_hashes = kmer_hashes + [
35 |                 str(read[i : i + kmer_len]) + "_" + str(i) for i in position_list
36 |             ]
37 | 
38 |         line = line + 1
39 |         if line % 4000 == 0:
40 |             kmer_hash_counts = Counter(kmer_hashes)
41 |             # print(kmer_hash_counts.values())
42 | 
43 |             # update df
44 |             position_counts.loc[kmer_hash_counts.keys(), "count"] = position_counts.loc[
45 |                 kmer_hash_counts.keys(), "count"
46 |             ] + list(kmer_hash_counts.values())
47 |             # print(position_counts)
48 | 
49 |             kmer_hashes = []
50 | 
51 |         if line % 4000000 == 0:
52 |             print("%s reads processed" % (line / 4))
53 | 
54 | position_counts.index.rename("kmer_hash", inplace=True)
55 | 
56 | file_path = snakemake.output[0]
57 | os.makedirs(os.path.dirname(file_path), exist_ok=True)
58 | 
59 | position_counts.to_csv(file_path)
60 | 


--------------------------------------------------------------------------------
/spacemake/snakemake/scripts/parse_ribo_log.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | 
 3 | def parse_ribo_log(ribo_log_file):
 4 |     # before the log, there can be some perl warnings prepended. so we need to find the
 5 |     # first line outputed by bwa mem
 6 |     input_reads = 0
 7 |     aligned_reads = 0
 8 | 
 9 |     # ribo log summary line: first line of the summary
10 |     first_line_regex = r'^\d+ reads; of these:$'
11 |     first_line_found = False
12 | 
13 |     line_n = 0
14 | 
15 |     with open(ribo_log_file) as f:
16 |         for line in f:
17 |             stripped_line = line.strip()
18 |             
19 |             if stripped_line == 'no_rRNA_index':
20 |                 input_reads = -1
21 |                 aligned_reads = -1
22 |                 break
23 | 
24 |             if not first_line_found:
25 |                 if re.match(first_line_regex, stripped_line) is not None:
26 |                     first_line_found = True
27 |                     line_n = 0
28 |                 else:
29 |                     # keep looking for first line
30 |                     continue
31 | 
32 |             if line_n == 0:
33 |                 input_reads = input_reads + int(stripped_line.split(' ')[0])
34 |             elif line_n == 3 or line_n == 4:
35 |                 aligned_reads = aligned_reads + int(stripped_line.split(' ')[0])
36 |             # reset after the fifth line, this is needed if there are several ribolog files
37 |             # appended one after the other. this is the case for merged samples
38 |             elif line_n == 5:
39 |                 first_line_found = False
40 | 
41 |             line_n = line_n + 1
42 |             
43 |     
44 |     if input_reads <= 0:
45 |         return (None, None)
46 |     else:
47 |         return (aligned_reads, input_reads)
48 | 
49 | 
50 | if snakemake.params.ribo_log == "no_rRNA_index":
51 |     input_reads = -1
52 |     aligned_reads = -1
53 | 
54 | else:
55 |     aligned_reads, input_reads = parse_ribo_log(snakemake.params.ribo_log)
56 | 
57 | with open(snakemake.output[0], 'w') as fo:
58 |     fo.write(f'aligned_reads\t{aligned_reads}\n')
59 |     fo.write(f'input_reads\t{input_reads}\n')
60 | 


--------------------------------------------------------------------------------
/spacemake/snakemake/scripts/shared_functions.R:
--------------------------------------------------------------------------------
 1 | readStarLog <- function(log_file){
 2 | 
 3 | 		out = list()
 4 | 		lines = readLines(log_file)
 5 | 	
 6 | 		out$input_reads = (lines[6] %>% strsplit('\t') %>% unlist)[2] %>% as.integer
 7 | 
 8 | 		out$uniq_mapped_reads = (lines[9] %>% strsplit('\t') %>% unlist)[2] %>% as.integer
 9 | 
10 | 		#out$avg_length = (lines[11] %>% strsplit('\t') %>% unlist)[2] %>% as.numeric
11 | 		
12 |         tibble(observation=names(out), value=unlist(unname(out)))
13 | 	}
14 | 


--------------------------------------------------------------------------------
/spacemake/snakemake/scripts/splice_bam_header.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | __version__ = "0.9"
  3 | __author__ = [
  4 |     "Marvin Jens",
  5 | ]
  6 | __license__ = "GPL"
  7 | __email__ = [
  8 |     "marvin.jens@mdc-berlin.de",
  9 | ]
 10 | 
 11 | import pysam
 12 | import argparse
 13 | import os
 14 | import sys
 15 | import logging
 16 | 
 17 | 
 18 | def print_header(header):
 19 |     for k, v in sorted(header.items()):
 20 |         if type(v) == dict:
 21 |             vstr = " ".join([f"{x}:{y}" for x, y in sorted(v.items())[::-1]])
 22 |             print(f"@{k}:\t{vstr}")
 23 |         elif type(v) == list:
 24 |             for row in v:
 25 |                 if type(row) == dict:
 26 |                     vstr = " ".join([f"{x}:{y}" for x, y in sorted(row.items())[::-1]])
 27 |                 else:
 28 |                     vstr = str(row)
 29 |                 print(f"@{k}:\t{vstr}")
 30 |         else:
 31 |             print(f"@{k}:\t{v}")
 32 | 
 33 | 
 34 | def unique_IDs(pg_list):
 35 |     from collections import defaultdict
 36 | 
 37 |     id_counts = defaultdict(int)
 38 | 
 39 |     # first, iterate over entire list and count how often each program ID is there.
 40 |     pp_list = [None]
 41 |     if len(pg_list) > 1:
 42 |         pp_list += pg_list[:-1]
 43 | 
 44 |     pg_new = []
 45 |     for pg, pp in zip(pg_list, pp_list):
 46 |         name = pg["ID"].split(".")[0]
 47 |         # edit in-place
 48 |         id_counts[name] += 1
 49 |         pg["ID"] = f"{name}.{id_counts[name]}"
 50 |         # id_counts[name] -= 1
 51 | 
 52 |         if pp:
 53 |             pname = pp["ID"].split(".")[0]
 54 |             pg["PP"] = f"{pname}.{id_counts[pname]}"
 55 | 
 56 |         pg_new.append(pg)
 57 | 
 58 |     return pg_new
 59 | 
 60 | 
 61 | def merge_headers(orig, other, enforce_RG=True):
 62 |     merged = dict(orig)
 63 |     # start from the original, including VN and RG entries...
 64 |     # connect the processing chains:
 65 |     # most recent program should be on top
 66 |     #  Previous Program (PP) of the first new output was the last Program Name (PN) in the original uBAM
 67 |     other["PG"][-1]["PP"] = orig["PG"][0]["ID"]
 68 |     merged["PG"] = unique_IDs(merged["PG"] + other["PG"])
 69 | 
 70 |     if "SO" in other["HD"]:
 71 |         merged["HD"]["SO"] = other["HD"]["SO"]  # keep sort-order
 72 | 
 73 |     # sequence identifiers should be absent from uBAM and at any rate are overwritten here
 74 |     merged["SQ"] = other["SQ"]
 75 |     if enforce_RG and not "RG" in merged or len(merged["RG"]) == 0:
 76 |         merged["RG"] = {"ID": "A", "SM": "NA"}
 77 |     # merged['HD']['SO'] = star['HD']['SO']  # sorted by
 78 | 
 79 |     return merged
 80 | 
 81 | 
 82 | if __name__ == "__main__":
 83 |     parser = argparse.ArgumentParser(
 84 |         description=(
 85 |             "STAR and bowtie2 create a new header from scratch and ignore everything upstream. "
 86 |             "This script fixes the .bam headers of such mapped output by splicing it together with "
 87 |             "the original uBAM header."
 88 |         )
 89 |     )
 90 | 
 91 |     parser.add_argument(
 92 |         "--in-bam",
 93 |         help="mapped star/bowtie2 bam input (default=/dev/stdin)",
 94 |         default="/dev/stdin",
 95 |     )
 96 |     parser.add_argument("--in-ubam", help="unmapped dropseq tagged bam", required=True)
 97 |     parser.add_argument(
 98 |         "--out-bam",
 99 |         help="fixed output bam (default=/dev/stdout)",
100 |         default="/dev/stdout",
101 |     )
102 |     parser.add_argument("--out-mode", help="mode for output (default=b0)", default="b0")
103 | 
104 |     args = parser.parse_args()
105 | 
106 |     mbam = pysam.AlignmentFile(args.in_bam, "rb")
107 |     ubam = pysam.AlignmentFile(args.in_ubam, "rb", check_sq=False)
108 | 
109 |     mapped_header = mbam.header.to_dict()
110 |     ubam_header = ubam.header.to_dict()
111 |     merged_header = merge_headers(ubam_header, mapped_header)
112 |     # print(f"mapped BAM header")
113 |     # print_header(mapped_header)
114 | 
115 |     # print(f"original uBAM header")
116 |     # print_header(ubam_header)
117 | 
118 |     # print("merged header")
119 |     # print_header(merged_header)
120 | 
121 |     # copy input to output, just with the new header
122 |     bam_out = pysam.AlignmentFile(
123 |         args.out_bam, f"w{args.out_mode}", header=merged_header
124 |     )
125 |     for aln in mbam.fetch(until_eof=True):
126 |         bam_out.write(aln)
127 | 


--------------------------------------------------------------------------------
/spacemake/snakemake/scripts/split_reads_by_strand_info.py:
--------------------------------------------------------------------------------
  1 | import pandas as pd
  2 | import argparse
  3 | 
  4 | 
  5 | parser = argparse.ArgumentParser(description='Split .bam file to .sam files by mapped reads strand orientation')
  6 | parser.add_argument('file_in', metavar = 'in', type=str)
  7 | parser.add_argument('--prefix')
  8 | 
  9 | args = parser.parse_args()
 10 | 
 11 | prefix = args.prefix
 12 | 
 13 | read_type_num = {'INTERGENIC':0,    'INTRONIC':0,    'CODING':0,    'UTR':0,    'AMB':0}
 14 | 
 15 | strand_type_num = {
 16 |     'minus_minus': 0,
 17 |     'minus_plus': 0,
 18 |     'plus_plus': 0,
 19 |     'plus_minus': 0,
 20 |     'plus_AMB': 0,
 21 |     'minus_AMB': 0
 22 | }
 23 | 
 24 | out_file_names = {x: prefix + x + '.sam' for x in strand_type_num.keys()}
 25 | 
 26 | out_files = {x: open(out_file_names[x], 'w') for x in out_file_names.keys()}
 27 | 
 28 | def return_collapsed(it):
 29 |     # set has exactly 1 element, meaning that all elements are the same in the list 
 30 |     if len(set(it)) == 1:
 31 |         return it[0]
 32 |     else:
 33 |         return 'AMB'
 34 | 
 35 | with open(args.file_in, 'r') as fi:
 36 |     for line in fi:
 37 |         # if line is header line
 38 |         if line.startswith('@'):
 39 |             for f in out_files.values():
 40 |                 f.write(line)
 41 | 
 42 |             # go to next iteration
 43 |             continue
 44 | 
 45 |         line_stripped = line.strip()
 46 | 
 47 |         elements = line_stripped.split()
 48 | 
 49 |         last = elements[-1]
 50 | 
 51 |         read_overlaps_gene = False
 52 | 
 53 |         # set gene strand
 54 |         if last.startswith('gs'):
 55 |             # if last element begins with gs, this means that read overlaps a gene (fw or rv strands)
 56 |             read_overlaps_gene = True
 57 |             gene_strand = return_collapsed(last.split(':')[-1].split(','))
 58 |             
 59 |             if gene_strand == '-':
 60 |                 gene_strand = 'minus'
 61 |             elif gene_strand == '+':
 62 |                 gene_strand = 'plus'
 63 | 
 64 |         else:
 65 |             gene_strand = 'AMB'
 66 | 
 67 |         # set read strand
 68 |         if elements[1] == '0':
 69 |             read_strand = 'plus'
 70 |         else:
 71 |             read_strand = 'minus'
 72 | 
 73 |         # get read type
 74 |         if read_overlaps_gene:
 75 |             read_type = return_collapsed(elements[-3].split(':')[-1].split(','))
 76 |         else:
 77 |             # if read do not overlap a gene, it is clearly intergenic
 78 |             read_type = 'INTERGENIC'
 79 | 
 80 | 
 81 |         read_type_num[read_type] = read_type_num[read_type] + 1
 82 |         
 83 |         strand_type = read_strand + '_' + gene_strand
 84 | 
 85 |         strand_type_num[strand_type] = strand_type_num[strand_type] + 1
 86 | 
 87 |         # print the read to the correct split file, depending on strand orientation
 88 |         out_files[strand_type].write(line)
 89 |         
 90 | with open(prefix + 'read_type_num.txt', 'w') as fo:
 91 |     for key, value in read_type_num.items():
 92 |         fo.write('%s %s\n' % (key, value))
 93 | 
 94 | with open(prefix + 'strand_type_num.txt', 'w') as fo:
 95 |     for key, value in strand_type_num.items():
 96 |         fo.write('%s %s\n' % (key, value))
 97 | 
 98 | for f in out_files.values():
 99 |     f.close()
100 | 


--------------------------------------------------------------------------------
/spacemake/snakemake/species_init.smk:
--------------------------------------------------------------------------------
 1 | annotation_file = os.path.join(config['root_dir'],
 2 |     config['annotation_file_pattern'])
 3 | genome_file = os.path.join(config['root_dir'],
 4 |     config['genome_file_pattern'])
 5 | 
 6 | rule all:
 7 |     input:
 8 |         expand(annotation_file, species = config['species'],
 9 |                data_type = 'annotation'),
10 |         expand(genome_file, species = config['species'],
11 |                data_type = 'genome')
12 | 
13 | rule unzip:
14 |     input:
15 |         '{filename}.gz'
16 |     output:
17 |         '{filename}'
18 |     shell: "unpigz {input}"
19 | 
20 | def get_url(wildcards):
21 |     return config[wildcards.species + '_' + wildcards.data_type + '_url']
22 | 
23 | rule download_species_annotation:
24 |     output:
25 |         annotation_file.replace('.gtf', '.gtf.gz')
26 |     params:
27 |         url = lambda wildcards: get_url(wildcards)
28 |     shell:
29 |         "wget -O {output} {params.url}"
30 | 
31 | rule download_species_genome:
32 |     output:
33 |         genome_file.replace('.fa', '.fa.gz')
34 |     params:
35 |         url = lambda wildcards: get_url(wildcards)
36 |     shell:
37 |         "wget -O {output} {params.url}"
38 | 


--------------------------------------------------------------------------------
/spacemake/snakemake/visium.smk:
--------------------------------------------------------------------------------
 1 | configfile: 'config.yaml'
 2 | 
 3 | spaceranger_out_id = 'sr_out-{sample}-{run_type}'
 4 | 
 5 | spaceranger_outs = [
 6 |     spaceranger_out_id + '/outs/web_summary.html'
 7 | ]
 8 | 
 9 | raw_reads = 'data/reads/raw/{sample_id}_S{S}_L002_R{R}_001.fastq.gz'
10 | linked_reads = 'data/reads/linked/{sample}_S{S}_L002_R{R}_001.fastq.gz'
11 | 
12 | spaceranger_script = 'spaceranger-1.2.0/spaceranger'
13 | 
14 | linked_reads_root = 'data/reads/linked/'
15 | raw_reads_root = 'data/reads/raw/'
16 | 
17 | run_types = ['exon', 'exon_intron']
18 | 
19 | rule all:
20 |     input:
21 |         expand(spaceranger_outs, sample = config['samples'].keys(), run_type = run_types)
22 | 
23 | def get_raw_reads(wildcards):
24 |     sample_id = config['samples'][wildcards.sample]['id']
25 | 
26 |     return expand(raw_reads, sample_id = sample_id, S= wildcards.S, R = wildcards.R)
27 | 
28 | rule link_raw_reads:
29 |     input:
30 |         unpack(get_raw_reads)
31 |     output:
32 |         linked_reads
33 |     shell:
34 |         "ln -sr {input} {output}"
35 | 
36 | def get_spaceranger_inputs(wildcards):
37 |     S = config['samples'][wildcards.sample]['S']
38 |     img = config['samples'][wildcards.sample]['img']
39 |     sample_id = config['samples'][wildcards.sample]['id'] 
40 | 
41 |     return {
42 |         'reads': expand(raw_reads, sample_id = sample_id, S=S, R=[1,2]),
43 |         'img': img }
44 | 
45 | def get_refdata(wildcards):
46 |     if wildcards.run_type == 'exon':
47 |         return 'refdata-mm10-M23'
48 |     elif wildcards.run_type == 'exon_intron':
49 |         return 'refdata-pre-mm10-M23'
50 | 
51 | rule run_spaceranger_counts:
52 |     input:
53 |         unpack(get_spaceranger_inputs)
54 |     output:
55 |         spaceranger_outs
56 |     params:
57 |         area = lambda wildcards: config['samples'][wildcards.sample]['area'],
58 |         sample_id = lambda wildcards: config['samples'][wildcards.sample]['id'],
59 |         refdata = lambda wildcards: get_refdata(wildcards),
60 |         run_id = spaceranger_out_id
61 |     wildcard_constraints:
62 |        run_type='|'.join(run_types) 
63 |     threads: 8
64 |     shell:
65 |         # first we remove the directory, otherwise space ranger is gonna fail
66 |         # the directory is created by snakemake, by default. but after creation
67 |         # spaceranger thinks that it has already run...
68 |         """
69 |         rm -rf {params.run_id}
70 |         {spaceranger_script} count --id={params.run_id} \
71 |             --transcriptome={params.refdata} \
72 |             --fastqs={raw_reads_root} \
73 |             --sample={params.sample_id} \
74 |             --image={input.img} \
75 |             --localcores={threads} \
76 |             --localmem=64 \
77 |             --unknown-slide \
78 |             --reorient-images
79 |         """
80 | 


--------------------------------------------------------------------------------
/spacemake/spatial/__init__.py:
--------------------------------------------------------------------------------
1 | # # include in top level for backward compatibility
2 | # from .util import compute_neighbors, compute_islands, detect_tissue, \
3 | #     create_mesh, create_meshed_adata
4 | # # added novosparc_reconstruction for backward compatibility
5 | # from . import novosparc_integration as novosparc_reconstruction
6 | # from . import puck_collection as puck_collection
7 | 


--------------------------------------------------------------------------------
/spacemake/spatial/cmdline.py:
--------------------------------------------------------------------------------
  1 | #from ..config import ConfigFile
  2 | #from ..project_df import ProjectDF
  3 | from ..util import message_aggregation, bool_in_str, str2bool
  4 | from ..errors import SpacemakeError
  5 | 
  6 | import argparse
  7 | import logging
  8 | 
  9 | logger_name = "spacemake.spatial"
 10 | logger = logging.getLogger(logger_name)
 11 | 
 12 | def get_expression_img_parser(with_umi_cutoff = False):
 13 |     parser = argparse.ArgumentParser(allow_abbrev=False, add_help=False)
 14 | 
 15 |     parser.add_argument('--project_id', type=str,
 16 |         required=True)
 17 | 
 18 |     parser.add_argument('--sample_id', type=str,
 19 |         required=True)
 20 | 
 21 |     parser.add_argument('--run_mode', type=str,
 22 |         required=True)
 23 | 
 24 |     parser.add_argument('--umi_cutoff', type=int,
 25 |         required=False)
 26 | 
 27 |     parser.add_argument('--binary_top_qth_percentile',
 28 |         type=int, required=False, default=30)
 29 | 
 30 |     parser.add_argument('--binary', type=str,
 31 |         required=False, default='False')
 32 | 
 33 |     parser.add_argument('--processed_data', type=str,
 34 |         required=False, default='False')
 35 | 
 36 |     parser.add_argument('--out_img',
 37 |         type=str,
 38 |         required=True)
 39 | 
 40 |     return parser
 41 | 
 42 | def setup_spatial_parser(parent_parser_subparsers):
 43 |     parser = parent_parser_subparsers.add_parser('spatial',
 44 |         help = 'spacemake spatial commands')
 45 | 
 46 |     subparsers = parser.add_subparsers()
 47 | 
 48 |     aggregated_img_parser = subparsers.add_parser(
 49 |         'create_aggregated_expression_img',
 50 |         parents=[get_expression_img_parser()])
 51 | 
 52 |     aggregated_img_parser.set_defaults(
 53 |         func=lambda args: create_expression_img_cmdline(args,
 54 |             'aggregated'))
 55 | 
 56 |     spot_img_parser = subparsers.add_parser(
 57 |         'create_spot_expression_img',
 58 |         parents=[get_expression_img_parser()])
 59 | 
 60 |     spot_img_parser.set_defaults(
 61 |         func=lambda args: create_expression_img_cmdline(args,
 62 |             'spot'))
 63 | 
 64 | @message_aggregation(logger_name)
 65 | def create_expression_img_cmdline(args, img_type):
 66 |     import cv2 
 67 |     logger.info('Loading dge file...')
 68 |     from spacemake.smk import Spacemake
 69 |     spmk = Spacemake()
 70 | 
 71 |     if str2bool(args['processed_data']):
 72 |         if not 'umi_cutoff' in args:
 73 |             raise SpacemakeError('When creating image from processed data,'
 74 |                 ' a --umi_cutoff value must be provided') 
 75 | 
 76 |         adata = spmk.load_processed_adata(
 77 |             project_id = args['project_id'],
 78 |             sample_id = args['sample_id'],
 79 |             run_mode_name = args['run_mode'],
 80 |             umi_cutoff = args['umi_cutoff'])
 81 | 
 82 |     else:
 83 |         adata = spmk.load_raw_spatial_adata(
 84 |             project_id = args['project_id'],
 85 |             sample_id = args['sample_id'],
 86 |             run_mode_name = args['run_mode'])
 87 | 
 88 |     logger.info(f'Generating {img_type} expression image...')
 89 |     if img_type == 'spot':
 90 |         from .he_integration import create_spot_expression_img
 91 |         img, img_bw = create_spot_expression_img(adata,
 92 |             binary=str2bool(args['binary']))
 93 |     elif img_type == 'aggregated':
 94 |         from .he_integration import create_aggregated_expression_img
 95 |         img, img_bw = create_aggregated_expression_img(
 96 |             adata,
 97 |             binary_top_qth_percentile=int(args['binary_top_qth_percentile']))
 98 | 
 99 |     if str2bool(args['binary']):
100 |         img = img_bw
101 | 
102 |     cv2.imwrite(args['out_img'], img)


--------------------------------------------------------------------------------
/test.sh:
--------------------------------------------------------------------------------
 1 | set -e
 2 | 
 3 | #rm project_df.csv > /dev/null
 4 | 
 5 | spacemake projects add_sample --project_id test \
 6 |     --sample_id sc_rnaseq_sample \
 7 |     --R1 spacemake/data/test/visium_public_lane_joined_1m_R1.fastq.gz \
 8 |     --R2 spacemake/data/test/visium_public_lane_joined_1m_R1.fastq.gz \
 9 |     --species mouse   
10 | 
11 | spacemake projects add_sample --project_id test \
12 |     --sample_id sc_rnaseq_sample_2 \
13 |     --R1 spacemake/data/test/visium_public_lane_joined_1m_R1.fastq.gz \
14 |     --R2 spacemake/data/test/visium_public_lane_joined_1m_R1.fastq.gz \
15 |     --species mouse  \
16 |     --barcode_flavor visium
17 | 
18 | # with one bc file
19 | spacemake projects add_sample --project_id test \
20 |     --sample_id one_bc_file \
21 |     --R1 spacemake/data/test/visium_public_lane_joined_1m_R1.fastq.gz \
22 |     --R2 spacemake/data/test/visium_public_lane_joined_1m_R1.fastq.gz \
23 |     --species mouse  \
24 |     --barcode_flavor visium \
25 |     --puck visium
26 | 
27 | # with two bc files
28 | spacemake projects add_sample --project_id test \
29 |     --sample_id two_bc_files \
30 |     --R1 spacemake/data/test/visium_public_lane_joined_1m_R1.fastq.gz \
31 |     --R2 spacemake/data/test/visium_public_lane_joined_1m_R1.fastq.gz \
32 |     --species mouse  \
33 |     --barcode_flavor visium \
34 |     --puck visium \
35 |     --puck_barcode_file spacemake/data/test/test_bc1.csv spacemake/data/test/test_bc2.csv
36 | 
37 | # update sample
38 | spacemake projects update_sample --project_id test \
39 |     --sample_id two_bc_files \
40 |     --investigator Test
41 | 
42 | spacemake projects merge_samples --merged_project_id test \
43 |     --merged_sample_id test_merged \
44 |     --project_id_list test \
45 |     --sample_id_list one_bc_file two_bc_files
46 | 
47 | # this is expected to fail as has different barcode_flavor
48 | spacemake projects merge_samples --merged_project_id test \
49 |     --merged_sample_id test_merged_2 \
50 |     --project_id_list test \
51 |     --sample_id_list sc_rnaseq_sample two_bc_files
52 | 
53 | spacemake projects merge_samples --merged_project_id test \
54 |     --merged_sample_id test_merged_2 \
55 |     --project_id_list test \
56 |     --sample_id_list sc_rnaseq_sample_2 two_bc_files
57 | 


--------------------------------------------------------------------------------
/test_data/README.md:
--------------------------------------------------------------------------------
1 | # SPACEMAKE test data 
2 | 
3 | There is a mix of old and new stuff in here that needs to be cleaned up. In general, I think file size should not exceed 1 MB. If you need a larger file, let's place it on bimsbstatic and download it from there like so:
4 | 
5 | `wget https://bimsbstatic.mdc-berlin.de/rajewsky/spacemake-test-data/spacemake_tile_test_data.tar.gz`
6 | 
7 | 
8 | Thanks!
9 | 


--------------------------------------------------------------------------------
/test_data/make_chr22_test_data.py:
--------------------------------------------------------------------------------
  1 | import pysam
  2 | import pandas as pd
  3 | from spacemake.util import FASTQ_src, read_fq
  4 | from byo.track import load_track
  5 | 
  6 | genome = load_track("/data/rajewsky/genomes/hg38/hg38.fa")
  7 | 
  8 | # load output of gene_loci_to_gtf.py: a table with gene-name, start and end coordinates
  9 | df_genes = pd.read_csv(
 10 |     "chr22_gene_bounds.csv", sep="\t", names=["gene", "start", "end", "L"]
 11 | ).set_index("gene")
 12 | 
 13 | # keep track of the few dozen reads that Nikos has selected
 14 | selected_reads = {}
 15 | for fa_id, seq, qual in read_fq("reads_chr22_R2.fastq.gz"):
 16 |     if "IGLC3" in fa_id:
 17 |         print(f"selecting IGLC3 read {fa_id} -> {fa_id.split('_')[0]}")
 18 |     selected_reads[fa_id.split("_")[0]] = fa_id
 19 | 
 20 | # then go through the SAM file (no header) to get the mapping position of these reads (not ideal)
 21 | df = pd.read_csv(
 22 |     "/data/rajewsky/home/nkarais/murphy/fc_sts/collect_reads_chr22/final.polyA_adapter_trimmed_chr22.sam",
 23 |     sep="\t",
 24 |     header=None,
 25 | )
 26 | 
 27 | starts = []
 28 | for row in df.itertuples():
 29 |     # print(row)
 30 |     qname = row[1]
 31 |     if "A00643:496:HFJ5MDRX2:1:2101:12888:1172" in qname:
 32 |         print(f"YAY! detected IGLC3 read {qname}")
 33 | 
 34 |     if qname in selected_reads:
 35 |         print(f"selecting read {qname}")
 36 |         starts.append(row[4])
 37 | 
 38 | # find genes that overlap the selected reads mapping position
 39 | # this intersection code is very crude, but effective
 40 | intervals = set()
 41 | starts = set(starts)
 42 | for row in df_genes.itertuples():
 43 |     next_starts = set(starts)
 44 |     for x in starts:
 45 |         if row.start < x and row.end > x:
 46 |             intervals.add((row.start, row.end))
 47 |             print(f"selecting gene entry '{row}'")
 48 |             next_starts.discard(x)
 49 | 
 50 |     starts = next_starts
 51 | 
 52 | print(
 53 |     f"we have the following start coordinates left. selecting buffer regions around these"
 54 | )
 55 | print(starts)
 56 | 
 57 | 
 58 | def do_merge(s, e, intervals):
 59 |     keep = []
 60 |     for j, (s2, e2) in enumerate(intervals):
 61 |         new = (s2, e2)
 62 |         if s2 <= e and e2 >= e:
 63 |             print(f"overlap on the right, s={s} e={e} s2={s2} e2={e2}")
 64 |             new = (min(s2, s), max(e, e2))
 65 |         elif e2 >= s and s2 <= s:
 66 |             print(f"overlap on the left, s={s} e={e} s2={s2} e2={e2}")
 67 |             new = (min(s2, s), max(e, e2))
 68 |         elif s2 >= s and e2 <= e:
 69 |             print(f"contained in other interval. discard, s={s} e={e} s2={s2} e2={e2}")
 70 |             continue
 71 | 
 72 |         keep.append(new)
 73 | 
 74 |     return keep
 75 | 
 76 | 
 77 | # merge intervals that have some overlap
 78 | intervals = sorted(list(intervals), key=lambda x: x[1] - x[0], reverse=True)
 79 | print(intervals)
 80 | 
 81 | while True:
 82 |     changed = False
 83 |     for i, (s, e) in enumerate(intervals):
 84 |         others = intervals[i + 1 :]
 85 |         keep = do_merge(s, e, others)
 86 |         if keep != others:
 87 |             print("we had a change!")
 88 |             print("before")
 89 |             for s, e in intervals:
 90 |                 print(f"{s} - {e}")
 91 | 
 92 |             intervals = intervals[: i + 1] + keep
 93 |             intervals = sorted(list(intervals), key=lambda x: x[1] - x[0], reverse=True)
 94 |             print("after")
 95 |             for s, e in intervals:
 96 |                 print(f"{s} - {e}")
 97 | 
 98 |             changed = True
 99 |             break  # the for loop
100 | 
101 |     if not changed:
102 |         break
103 | 
104 | intervals = sorted(list(set(intervals)), key=lambda x: x[1] - x[0], reverse=True)
105 | print("remaining intervals")
106 | for s, e in intervals:
107 |     print(f"{s} - {e}")
108 | 
109 | 
110 | # Okay, now we know the gene loci which are needed to map the test reads!
111 | intervals = list(sorted(intervals))
112 | print(f"relevant intervals found: {len(intervals)}")
113 | 
114 | # extract the genomic sequence for the loci we need and save as a mini-"genome"
115 | with open("test_genome.fa", "wt") as f:
116 |     for start, end in intervals:
117 |         seq = genome.get_oriented("chr22", start, end, "+")
118 |         f.write(f">test_chr22.{start}-{end}\n{seq}\n")
119 | 
120 | # cut down the GTF annotation to only those parts that pertain to the genic regions we care about
121 | with open("test_annotation.gtf", "wt") as f:
122 |     for line in open("gencode.v38.chr22.gtf", "rt"):
123 |         if line.startswith("#"):
124 |             continue
125 | 
126 |         parts = line.rstrip().split("\t")
127 |         chrom, source, rec, start, end = parts[:5]
128 |         if chrom != "chr22":
129 |             continue
130 | 
131 |         start = int(start)
132 |         end = int(end)
133 | 
134 |         for s, e in intervals:
135 |             if (
136 |                 (start <= s and end > s)  # overlap the start of interval
137 |                 or (start > s and end < e)  # internal to interval
138 |                 or (start < e and end > e)  # overlap the end of interval
139 |                 or (start < s and end > e)  # overlap the entire interval
140 |             ):
141 |                 # the name of the pseudo-chromosome this is on (see excision of genomic sequence above)
142 |                 chrom = f"test_{chrom}.{s}-{e}"
143 | 
144 |                 start = max(
145 |                     0, start - s
146 |                 )  # translate start and end coordinates from whole chr22 to the gene region
147 |                 end = min(e - s, end - s)
148 | 
149 |                 parts[0:5] = (chrom, "test_data", rec, str(start), str(end))
150 |                 f.write("\t".join(parts) + "\n")
151 | 
152 | # Done! We now have:
153 | # * test_genome.fa.gz with the gene sequences
154 | # * test_annotation.gtf.gz with the gene models (exon/intron, CDS/UTR etc.)
155 | # * reads_chr22_R1.fastq.gz with test read barcodes mapping to a few CBs and UMIs
156 | # * reads_chr22_R2.fastq.gz with test read cDNAs mapping to the genic regions we care about
157 | 


--------------------------------------------------------------------------------
/test_data/mirgenedb.hsa.mature.fa.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rajewsky-lab/spacemake/274cd58e7ba9de244aca25c6bbd7cdfe49683753/test_data/mirgenedb.hsa.mature.fa.gz


--------------------------------------------------------------------------------
/test_data/mirgenedb.hsa.mature.gtf.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rajewsky-lab/spacemake/274cd58e7ba9de244aca25c6bbd7cdfe49683753/test_data/mirgenedb.hsa.mature.gtf.gz


--------------------------------------------------------------------------------
/test_data/rRNA_hsa.fa.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rajewsky-lab/spacemake/274cd58e7ba9de244aca25c6bbd7cdfe49683753/test_data/rRNA_hsa.fa.gz


--------------------------------------------------------------------------------
/test_data/reads_chr22_R1.fastq.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rajewsky-lab/spacemake/274cd58e7ba9de244aca25c6bbd7cdfe49683753/test_data/reads_chr22_R1.fastq.gz


--------------------------------------------------------------------------------
/test_data/reads_chr22_R2.fastq.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rajewsky-lab/spacemake/274cd58e7ba9de244aca25c6bbd7cdfe49683753/test_data/reads_chr22_R2.fastq.gz


--------------------------------------------------------------------------------
/test_data/test_annotation.gtf.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rajewsky-lab/spacemake/274cd58e7ba9de244aca25c6bbd7cdfe49683753/test_data/test_annotation.gtf.gz


--------------------------------------------------------------------------------
/test_data/test_bam_md5.txt:
--------------------------------------------------------------------------------
 1 | ./projects/test/processed_data/test_01/illumina/complete_data/final.polyA_adapter_trimmed.bam	85b00c5c1c699e4e3afda2b52b9d6442
 2 | ./projects/test/processed_data/test_01/illumina/complete_data/genome.STAR.bam	85b00c5c1c699e4e3afda2b52b9d6442
 3 | ./projects/test/processed_data/test_01/illumina/complete_data/unaligned_bc_tagged.bam	75d9ee7a618f8f766938192c84c9ac5a
 4 | ./projects/test/processed_data/test_01/illumina/complete_data/unaligned_bc_unassigned.bam	d41d8cd98f00b204e9800998ecf8427e
 5 | ./projects/test/processed_data/test_02/illumina/complete_data/final.polyA_adapter_trimmed.bam	c6fa15dcf2a36cea7479349ccf004523
 6 | ./projects/test/processed_data/test_02/illumina/complete_data/genome.STAR.bam	c6fa15dcf2a36cea7479349ccf004523
 7 | ./projects/test/processed_data/test_02/illumina/complete_data/miRNA.bowtie2.bam	98b57c64f1814c61e320b4fc96d75deb
 8 | ./projects/test/processed_data/test_02/illumina/complete_data/rRNA.bowtie2.bam	ec87ba1ac2e64f78db4fb9ea84162dc5
 9 | ./projects/test/processed_data/test_02/illumina/complete_data/unaligned_bc_tagged.bam	75d9ee7a618f8f766938192c84c9ac5a
10 | ./projects/test/processed_data/test_02/illumina/complete_data/unaligned_bc_unassigned.bam	d41d8cd98f00b204e9800998ecf8427e
11 | 


--------------------------------------------------------------------------------
/test_data/test_config.yaml:
--------------------------------------------------------------------------------
  1 | root_dir: '.'
  2 | temp_dir: '/tmp'
  3 | external_bin: 
  4 |         dropseq_tools: '/data/rajewsky/shared_bins/Drop-seq_tools-2.5.1/'
  5 | logging:
  6 |   level: INFO
  7 |   debug: "spacemake.util.read_fq"
  8 | 
  9 | puck_data:
 10 |     barcode_file: 'predictions_ml.csv'
 11 |     root: 'puck_data'
 12 | 
 13 | pucks:
 14 |     default:
 15 |         width_um: 3000
 16 |         spot_diameter_um: 10
 17 |     visium:
 18 |         barcodes: 'puck_data/visium_barcode_positions.csv'
 19 |         width_um: 6500
 20 |         spot_diameter_um: 55
 21 |     seq_scope:
 22 |         width_um: 1000
 23 |         spot_diameter_um: 1
 24 |     slide_seq:
 25 |         width_um: 3000
 26 |         spot_diameter_um: 10
 27 |     test_puck:
 28 |         width_um: 4000
 29 |         spot_diameter_um: 1 
 30 |     openst:
 31 |         width_um: 1200
 32 |         spot_diameter_um: 0.6
 33 |         coordinate_system: 'puck_data/openst_coordinate_system.csv'
 34 | 
 35 | run_modes:
 36 |     default:
 37 |         n_beads: 100000
 38 |         umi_cutoff: [100, 300, 500]
 39 |         clean_dge: False
 40 |         detect_tissue: False
 41 |         count_intronic_reads: True
 42 |         count_mm_reads: False
 43 |         mesh_data: False
 44 |         mesh_type: 'circle'
 45 |         mesh_spot_diameter_um: 55
 46 |         mesh_spot_distance_um: 100
 47 |     visium:
 48 |         n_beads: 10000
 49 |         umi_cutoff: [1000]
 50 |         clean_dge: False
 51 |         detect_tissue: True
 52 |         count_intronic_reads: False
 53 |         count_mm_reads: True
 54 |     slide_seq:
 55 |         n_beads: 100000
 56 |         umi_cutoff: [50]
 57 |         clean_dge: False
 58 |         detect_tissue: False
 59 |     scRNA_seq:
 60 |         n_beads: 10000
 61 |         umi_cutoff: [500]
 62 |         detect_tissue: False
 63 |         count_intronic_reads: True
 64 |         count_mm_reads: False
 65 |     seq_scope:
 66 |         clean_dge: false
 67 |         count_intronic_reads: false
 68 |         count_mm_reads: false
 69 |         detect_tissue: false
 70 |         mesh_data: true
 71 |         mesh_spot_diameter_um: 10
 72 |         mesh_spot_distance_um: 15
 73 |         mesh_type: hexagon
 74 |         n_beads: 1000
 75 |         umi_cutoff:
 76 |         - 100
 77 |         - 300
 78 |     spatial_rm:
 79 |         clean_dge: false
 80 |         count_intronic_reads: false
 81 |         count_mm_reads: false
 82 |         detect_tissue: false
 83 |         mesh_data: true
 84 |         mesh_spot_diameter_um: 10
 85 |         mesh_spot_distance_um: 15
 86 |         mesh_type: hexagon
 87 |         n_beads: 1000
 88 |         umi_cutoff:
 89 |         - 500
 90 |         - 1000
 91 |     openst:
 92 |         clean_dge: false
 93 |         count_intronic_reads: true
 94 |         count_mm_reads: true
 95 |         detect_tissue: false
 96 |         mesh_data: true
 97 |         mesh_spot_diameter_um: 7
 98 |         mesh_spot_distance_um: 7
 99 |         mesh_type: hexagon
100 |         n_beads: 100000
101 |         polyA_adapter_trimming: true
102 |         spatial_barcode_min_matches: 0.1
103 |         umi_cutoff:
104 |         - 100
105 |         - 250
106 |         - 500
107 |   
108 | 
109 | barcode_flavors:
110 |     default:
111 |         cell: "r1[0:12]"
112 |         UMI: "r1[12:20]"
113 |     dropseq:
114 |         cell: "r1[0:12]"
115 |         UMI: "r1[12:20]"
116 |     slide_seq_14bc:
117 |         cell: "r1[0:14]"
118 |         UMI: "r1[14:23]"
119 |     slide_seq_15bc:
120 |         cell: "r1[0:14]"
121 |         UMI: "r1[15:23]"
122 |     visium:
123 |         cell: "r1[0:16]"
124 |         UMI: "r1[16:28]"
125 |     sc_10x_v2:
126 |         cell: "r1[0:16]"
127 |         UMI: "r1[16:26]"
128 |     seq_scope:
129 |         UMI: "r2[0:9]"
130 |         cell: "r1[0:20]"
131 |     nextflex:
132 |         min_qual_trim: 20
133 |         cell: "'A'"
134 |         read1: "None"
135 |         UMI: "r2[:4] + r2[-4:]"
136 |         seq: "r2[4:-4]"
137 |         qual: "r2_qual[4:-4]"
138 |     openst:
139 |         UMI: "r2[0:9]"
140 |         bam_tags: "CR:{cell},CB:{cell},MI:{UMI},RG:{assigned}"
141 |         cell: "r1[2:27]"
142 | 
143 | adapters:
144 |     optical_primer: GAATCACGATACGTACACCA
145 |     TSO_SMART: AAGCAGTGGTATCAACGCAGAGTGAATGGG
146 |     SMART: AAGCAGTGGTATCAACGCAGAGTG
147 |     smart: AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTC
148 |     TSO_10x: AAGCAGTGGTATCAACGCAGAGTACATGGG
149 |     chromium_bead: CTACACGACGCTCTTCCGATCT
150 |     dropseq_bead: AAGCAGTGGTATCAACGCAGAGTAC
151 |     polyA: AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
152 |     polyG: GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG
153 |     nextflex_RA3: TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACTGACCAATCTCGTATGCCGTCTTCTGCTTGAA
154 |     truseq_RA3: TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACCCGTCCA
155 | 
156 | adapter_flavors:
157 |   SMARTER:
158 |     cut_right:
159 |     - Q:
160 |         min_base_quality: 30
161 |     - polyA:
162 |         max_error: 0.25
163 |         min_overlap: 3
164 |     paired_end: replace_N
165 |   chromium:
166 |     cut_right:
167 |     - Q:
168 |         min_base_quality: 32
169 |     - polyA:
170 |         max_error: 0.25
171 |         min_overlap: 3
172 |     - polyG:
173 |         max_error: 0.1
174 |         min_overlap: 3
175 |   dropseq:
176 |     cut_left:
177 |     - TSO_SMART:
178 |         max_error: 0.1
179 |         min_overlap: 10
180 |     cut_right:
181 |     - Q:
182 |         min_base_quality: 30
183 |     - polyA:
184 |         max_error: 0.25
185 |         min_overlap: 3
186 |     - polyG:
187 |         max_error: 0.1
188 |         min_overlap: 3
189 |     paired_end: single-end
190 |   default:
191 |     cut_left:
192 |     - TSO_SMART:
193 |           max_error: 0.1
194 |           min_overlap: 10
195 |     cut_right:
196 |     - Q:
197 |         min_base_quality: 30
198 |     - polyA:
199 |         max_error: 0.25
200 |         min_overlap: 3
201 |     - polyG:
202 |         max_error: 0.1
203 |         min_overlap: 3
204 |     paired_end: single-end
205 |         
206 | quant:
207 |   default:
208 |     counter_class: "spacemake.quant.DefaultCounter"
209 |     channels: 
210 |       - "counts"
211 |       - "exonic_counts"
212 |       - "exonic_reads"
213 |       - "intronic_counts"
214 |       - "intronic_reads"
215 |     X_counts: ["exonic_counts", "intronic_counts"]
216 |     alignment_priorities: {
217 |         'C': 101, # coding exon
218 |         'c': 100, # coding exon (lower case == antisense)
219 |         'U': 51,  # UTR exon
220 |         'u': 50,
221 |         'CU': 51, # overlaps both, CDS+UTR (should in fact never occur as 'CU')
222 |         'cu': 50,
223 |         'N': 21, # exon of non-coding transcript
224 |         'n': 20,
225 |         'I': 11, # intronic region
226 |         'i': 10,
227 |         '-': 0,
228 |     }
229 |     gene_priorities: {
230 |         'C': 101, # coding exon
231 |         'c': 100, # coding exon (lower case == antisense)
232 |         'U': 51,  # UTR exon
233 |         'u': 50,
234 |         'CU': 51, # overlaps both, CDS+UTR (should in fact never occur as 'CU')
235 |         'cu': 50,
236 |         'N': 21, # exon of non-coding transcript
237 |         'n': 20,
238 |         'I': 11, # intronic region
239 |         'i': 10,
240 |         '-': 0,
241 |     }
242 |     exonic_tags: ["C", "U", "CU", "N", "c", "u", "cu", "n"]
243 |     intronic_tags: ["I", "i"]
244 |     alignment_selection: priority
245 |     exon_intron_disambiguation: "exon_wins"
246 |   miRNA:
247 |     alignment_selection: take_first
248 |   chrom:
249 |     alignment_selection: take_first    
250 |     gene_selection: chrom
251 |   custom_index:
252 |     alignment_selection: take_first_plus
253 |     gene_selection: chrom
254 | species:
255 |     test_hsa:
256 |         genome:
257 |             annotation: "{spacemake_dir}/test_data/test_genome.gtf.gz"
258 |             sequence: "{spacemake_dir}/test_data/test_genome.fa.gz"
259 |         miRNA:
260 |             annotation: "{spacemake_dir}/test_data/mirgenedb.hsa.mature.gtf.gz"
261 |             sequence: "{spacemake_dir}/test_data/mirgenedb.hsa.mature.fa.gz"
262 |         rRNA:
263 |             annotation: ''
264 |             sequence: "{spacemake_dir}/test_data/rRNA_hsa.fa.gz"
265 | 
266 | 


--------------------------------------------------------------------------------
/test_data/test_genome.fa.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rajewsky-lab/spacemake/274cd58e7ba9de244aca25c6bbd7cdfe49683753/test_data/test_genome.fa.gz


--------------------------------------------------------------------------------
/test_data/test_genome.gtf.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rajewsky-lab/spacemake/274cd58e7ba9de244aca25c6bbd7cdfe49683753/test_data/test_genome.gtf.gz


--------------------------------------------------------------------------------
/test_data/test_project_df.csv:
--------------------------------------------------------------------------------
1 | project_id,sample_id,puck_barcode_file_id,sample_sheet,species,demux_barcode_mismatch,demux_dir,basecalls_dir,R1,R2,reads,longreads,longread_signature,investigator,sequencing_date,experiment,puck_barcode_file,run_mode,barcode_flavor,is_merged,merged_from,puck,dge,map_strategy,adapter_flavor
2 | test,test_01,['no_spatial_data'],,test_hsa,1,,,['{spacemake_dir}/test_data/reads_chr22_R1.fastq.gz'],['{spacemake_dir}/test_data/reads_chr22_R2.fastq.gz'],,,,unknown,unknown,unknown,,['default'],dropseq,False,[],default,,STAR:genome,dropseq
3 | test,test_01b,['no_spatial_data'],,test_hsa,1,,,['{spacemake_dir}/test_data/reads_chr22_R1.fastq.gz'],['{spacemake_dir}/test_data/reads_chr22_R2.fastq.gz'],,,,unknown,unknown,unknown,,['default'],dropseq,False,[],default,,STAR:genome,dropseq
4 | test,test_02,['no_spatial_data'],,test_hsa,1,,,['{spacemake_dir}/test_data/reads_chr22_R1.fastq.gz'],['{spacemake_dir}/test_data/reads_chr22_R2.fastq.gz'],,,,unknown,unknown,unknown,,['default'],dropseq,False,[],default,,rRNA:bowtie2->miRNA:bowtie2->genome:STAR:final,dropseq
5 | test,test_03_nofinal,['no_spatial_data'],,test_hsa,1,,,['{spacemake_dir}/test_data/reads_chr22_R1.fastq.gz'],['{spacemake_dir}/test_data/reads_chr22_R2.fastq.gz'],,,,unknown,unknown,unknown,,['default'],dropseq,False,[],default,,rRNA:bowtie2->miRNA:bowtie2->genome:STAR,dropseq
6 | tile,tile_1,['tile_1'],,test_hsa,1,,,['{spacemake_dir}/test_data/reads_chr22_R1.fastq.gz'],['{spacemake_dir}/test_data/reads_chr22_R2.fastq.gz'],,,,unknown,unknown,unknown,['{spacemake_dir}/test_data/tile_1.txt'],['spatial_rm'],dropseq,False,[],test_puck,,rRNA:bowtie2->miRNA:bowtie2->genome:STAR:final,dropseq
7 | tile,tile_2,['tile_2'],,test_hsa,1,,,['{spacemake_dir}/test_data/reads_chr22_R1.fastq.gz'],['{spacemake_dir}/test_data/reads_chr22_R2.fastq.gz'],,,,unknown,unknown,unknown,['{spacemake_dir}/test_data/tile_2.txt'],['spatial_rm'],dropseq,False,[],test_puck,,rRNA:bowtie2->miRNA:bowtie2->genome:STAR:final,dropseq
8 | tile,tile_both,"['tile_1', 'tile_2']",,test_hsa,1,,,['{spacemake_dir}/test_data/reads_chr22_R1.fastq.gz'],['{spacemake_dir}/test_data/reads_chr22_R2.fastq.gz'],,,,unknown,unknown,unknown,"['{spacemake_dir}/test_data/tile_1.txt', '{spacemake_dir}/test_data/tile_2.txt']",['spatial_rm'],dropseq,False,[],test_puck,,rRNA:bowtie2->miRNA:bowtie2->genome:STAR:final,dropseq
9 | 


--------------------------------------------------------------------------------
/test_data/test_reads.R1.fastq.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rajewsky-lab/spacemake/274cd58e7ba9de244aca25c6bbd7cdfe49683753/test_data/test_reads.R1.fastq.gz


--------------------------------------------------------------------------------
/test_data/test_reads.R2.fastq.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rajewsky-lab/spacemake/274cd58e7ba9de244aca25c6bbd7cdfe49683753/test_data/test_reads.R2.fastq.gz


--------------------------------------------------------------------------------
/test_data/tile_1.txt:
--------------------------------------------------------------------------------
1 | cell_bc	x_pos	y_pos
2 | ACGTACGTACGT	0	0
3 | GAAGGACTTCAA	0	1
4 | TATTTGGCACTC	1	0
5 | CTCTGATTAGGT	1	1
6 | 


--------------------------------------------------------------------------------
/test_data/tile_2.txt:
--------------------------------------------------------------------------------
1 | cell_bc	x_pos	y_pos
2 | ATTGTACGCATC	0	0
3 | GACGTGACGGCA	0	1
4 | TTATTGCGAGAC	1	0
5 | GTTGCAACTGTA	1	1
6 | 


--------------------------------------------------------------------------------
/test_data/tile_3.txt:
--------------------------------------------------------------------------------
1 | cell_bc	x_pos	y_pos
2 | AGTAGGGGTGTC	1	1
3 | AGCAAACTCGGC	1	2
4 | ATTTTATAGAGT	2	1
5 | CGGACGATGTGG	2	2


--------------------------------------------------------------------------------
/tests/fixtures.py:
--------------------------------------------------------------------------------
  1 | import pytest
  2 | import os
  3 | 
  4 | spacemake_dir = os.path.abspath(os.path.dirname(__file__) + "/../")
  5 | print("SPACEMAKE_DIR", spacemake_dir)
  6 | 
  7 | 
  8 | def sm(*argc, expect_fail=False):
  9 |     # construct the desired cmdline
 10 |     import sys
 11 | 
 12 |     sys.argv = [
 13 |         "spacemake",
 14 |     ] + list(argc)
 15 | 
 16 |     # ensure that no ConfigFile and ProjectDF instances
 17 |     # are retained from previous tests
 18 |     import spacemake.config
 19 |     import spacemake.project_df
 20 | 
 21 |     spacemake.config.__global_config = None
 22 |     spacemake.project_df.__global_ProjectDF = None
 23 | 
 24 |     # execute spacemake cmdline code
 25 |     from spacemake.cmdline import cmdline
 26 | 
 27 |     res = cmdline()
 28 |     # print("res", res)
 29 |     if expect_fail:
 30 |         assert isinstance(res, Exception) == True
 31 |     else:
 32 |         assert isinstance(res, Exception) == False
 33 | 
 34 |     return res
 35 | 
 36 | 
 37 | def _init():
 38 |     # just get the version
 39 |     sm("--version")
 40 | 
 41 |     # test the init parser
 42 |     sm("init", "--dropseq-tools", "/data/rajewsky/shared_bins/Drop-seq_tools-2.5.1/")
 43 | 
 44 | 
 45 | def _add_species():
 46 |     sm(
 47 |         "config",
 48 |         "add-species",
 49 |         "--name=test_hsa",
 50 |         "--reference=genome",
 51 |         f"--sequence={spacemake_dir}/test_data/test_genome.fa.gz",
 52 |         f"--annotation={spacemake_dir}/test_data/test_genome.gtf.gz",
 53 |     )
 54 |     # add a second reference
 55 |     sm(
 56 |         "config",
 57 |         "add-species",
 58 |         "--name=test_hsa",
 59 |         "--reference=rRNA",
 60 |         f"--sequence={spacemake_dir}/test_data/rRNA_hsa.fa.gz",
 61 |     )
 62 |     # add a third reference
 63 |     sm(
 64 |         "config",
 65 |         "add-species",
 66 |         "--name=test_hsa",
 67 |         "--reference=miRNA",
 68 |         f"--sequence={spacemake_dir}/test_data/mirgenedb.hsa.mature.fa.gz",
 69 |     )
 70 |     # pretend we have mouse as well
 71 |     # TODO: place some actual mouse genome and/or phiX genomes in test-data repository
 72 |     sm(
 73 |         "config",
 74 |         "add-species",
 75 |         "--name=mouse",
 76 |         "--reference=genome",
 77 |         f"--sequence={spacemake_dir}/test_data/test_genome.fa.gz",
 78 |         f"--annotation={spacemake_dir}/test_data/test_genome.gtf.gz",
 79 |     )
 80 |     sm(
 81 |         "config",
 82 |         "add-species",
 83 |         "--name=mouse",
 84 |         "--reference=phiX",
 85 |         f"--sequence={spacemake_dir}/test_data/test_genome.fa.gz",
 86 |         f"--annotation={spacemake_dir}/test_data/test_genome.gtf.gz",
 87 |     )
 88 |     sm(
 89 |         "config",
 90 |         "add-species",
 91 |         "--name=mouse",
 92 |         "--reference=rRNA",
 93 |         f"--sequence={spacemake_dir}/test_data/rRNA_hsa.fa.gz",
 94 |     )
 95 | 
 96 | 
 97 | @pytest.fixture
 98 | def tmp_root(tmp_path_factory):
 99 |     tmp = tmp_path_factory.mktemp("root_blank")
100 | 
101 |     return tmp
102 | 
103 | 
104 | @pytest.fixture
105 | def initialized_root(tmp_path_factory):
106 |     tmp = tmp_path_factory.mktemp("root_initialized")
107 |     os.chdir(tmp.as_posix())
108 | 
109 |     _init()
110 |     return tmp
111 | 
112 | 
113 | @pytest.fixture
114 | def with_species(initialized_root):
115 |     os.chdir(initialized_root.as_posix())
116 |     # # test old way
117 |     # sm(
118 |     #     "config", "add_species",
119 |     #     "--name=hsa_test",
120 |     #     f"--genome={spacemake_dir}/test_data/test_genome.fa.gz",
121 |     #     f"--annotation={spacemake_dir}/test_data/test_genome.gtf.gz",
122 |     # )
123 |     # test new way
124 |     _add_species()
125 |     return initialized_root
126 | 
127 | 
128 | @pytest.fixture
129 | def configured_root(tmp_path_factory):
130 |     tmp_root = tmp_path_factory.mktemp("root_preconfigured")
131 | 
132 |     # make a tmp-copy of the test_config.yaml
133 |     def_config = os.path.join(spacemake_dir, "test_data/test_config.yaml")
134 |     os.system(f"cp {def_config} {tmp_root / 'config.yaml'}")
135 | 
136 |     test_pdf = os.path.join(spacemake_dir, "test_data/test_project_df.csv")
137 |     open(f"{tmp_root / 'project_df.csv'}", "w").write(
138 |         open(test_pdf, "r").read().format(spacemake_dir=spacemake_dir)
139 |     )
140 |     # os.system(f"cp {test_pdf} {tmp_root / 'project_df.csv'}")
141 | 
142 |     return tmp_root
143 | 
144 | 
145 | @pytest.fixture(scope="session")
146 | def with_tile_test_data(tmp_path_factory):
147 |     tmp = tmp_path_factory.mktemp("root_tile_test")
148 |     os.chdir(tmp.as_posix())
149 |     _init()
150 |     _add_species()
151 |     print(
152 |         "return code",
153 |         os.system(
154 |             "wget https://bimsbstatic.mdc-berlin.de/rajewsky/spacemake-test-data/spacemake_tile_test_data.tar.gz -O /dev/stdout | tar -xz"
155 |         ),
156 |     )
157 |     print(os.listdir("."))
158 | 
159 |     return tmp
160 | 


--------------------------------------------------------------------------------
/tests/test_fastq_to_ubam.py:
--------------------------------------------------------------------------------
  1 | import multiprocessing as mp
  2 | 
  3 | import pytest
  4 | import sys
  5 | import os
  6 | from spacemake.bin.fastq_to_uBAM import *
  7 | 
  8 | 
  9 | spacemake_dir = os.path.dirname(__file__) + "/../"
 10 | 
 11 | 
 12 | @pytest.fixture(scope="session")
 13 | def test_root(tmp_path_factory):
 14 |     tmp = tmp_path_factory.mktemp("root")
 15 |     sm_path = os.path.dirname(__file__)
 16 |     # make a tmp-copy of the test_config.yaml
 17 |     # def_config = os.path.join(sm_path, "../test_data/test_config.yaml")
 18 |     # os.system(f"cp {def_config} {tmp / 'config.yaml'}")
 19 | 
 20 |     # test_pdf =  os.path.join(sm_path, "../test_data/test_project_df.csv")
 21 |     # os.system(f"cp {test_pdf} {tmp / 'project_df.csv'}")
 22 | 
 23 |     return tmp
 24 | 
 25 | 
 26 | def sm(*argc, expect_fail=False):
 27 |     sys.argv = [
 28 |         "fastq_to_uBAM.py",
 29 |     ] + list(argc)
 30 |     res = cmdline()
 31 |     print("got result", res)
 32 |     from spacemake.errors import SpacemakeError
 33 | 
 34 |     if expect_fail:
 35 |         assert isinstance(res, SpacemakeError) == True
 36 |     else:
 37 |         assert isinstance(res, Exception) == False
 38 | 
 39 | 
 40 | def test_help():
 41 |     try:
 42 |         sm("--help")
 43 |     except SystemExit:
 44 |         pass
 45 | 
 46 | 
 47 | def test_dropseq():
 48 |     sm(
 49 |         "--read1",
 50 |         spacemake_dir + "test_data/reads_chr22_R1.fastq.gz",
 51 |         "--read2",
 52 |         spacemake_dir + "test_data/reads_chr22_R2.fastq.gz",
 53 |         "--out-bam",
 54 |         "/dev/null",
 55 |     )
 56 | 
 57 | 
 58 | def test_single():
 59 |     sm(
 60 |         "--read2",
 61 |         spacemake_dir + "test_data/reads_chr22_R2.fastq.gz",
 62 |         "--out-bam",
 63 |         "/dev/null",
 64 |         """--cell='"A"'""",
 65 |     )
 66 | 
 67 | 
 68 | def test_minqual():
 69 |     sm(
 70 |         "--read2",
 71 |         spacemake_dir + "test_data/reads_chr22_R2.fastq.gz",
 72 |         "--out-bam",
 73 |         "/dev/null",
 74 |         "--min-qual",
 75 |         "30",
 76 |         """--cell='"A"'""",
 77 |     )
 78 | 
 79 | 
 80 | def test_issue135():
 81 |     import spacemake.bin.fastq_to_uBAM as ubam
 82 |     from argparse import Namespace
 83 | 
 84 |     args = Namespace(
 85 |         bam_tags="CR:{cell},CB:{cell},MI:{UMI},RG:A",
 86 |         min_len=18,
 87 |         min_qual_trim=0,
 88 |         cell="r1[8:20][::-1]",
 89 |         UMI="r1[0:8]",
 90 |         seq="r2",
 91 |         qual="r2_qual",
 92 |         disable_safety=False,
 93 |     )
 94 | 
 95 |     fmt = ubam.make_formatter_from_args(args)
 96 |     attrs = fmt(
 97 |         r2_qname="QNAME MUST NOT HAVE WHITESPACE",
 98 |         r1="ACGTACGT",
 99 |         r1_qual="########",
100 |         r2="TGCATGCATGCATGCA",
101 |         r2_qual="################",
102 |     )
103 |     sam = ubam.make_sam_record(flag=4, **attrs)
104 |     cols = sam.split()
105 |     assert cols[0] == "QNAME"
106 |     assert cols[1] == "4"
107 |     # print(sam)
108 | 
109 | 
110 | # if __name__ == "__main__":
111 | #     test_issue135()
112 | 


--------------------------------------------------------------------------------
/tests/test_map_strategy.py:
--------------------------------------------------------------------------------
  1 | import pytest
  2 | 
  3 | from spacemake.map_strategy import *
  4 | from spacemake.config import ConfigFile
  5 | from spacemake.project_df import ProjectDF
  6 | from spacemake.errors import *
  7 | import os
  8 | 
  9 | from fixtures import configured_root, tmp_root, sm, spacemake_dir
 10 | 
 11 | 
 12 | def test_validation(configured_root):
 13 |     config = ConfigFile.from_yaml((configured_root / "config.yaml").as_posix())
 14 |     data = [
 15 |         (
 16 |             "flipped",
 17 |             "rRNA:bowtie2->genome:STAR",
 18 |             "test_hsa",
 19 |             "bowtie2:rRNA->STAR:genome",
 20 |         ),
 21 |         (
 22 |             "species_missing",
 23 |             "bowtie2:rRNA->STAR:genome",
 24 |             "test_hs",
 25 |             "<class 'spacemake.errors.ConfigVariableNotFoundError'>",
 26 |         ),
 27 |         (
 28 |             "with_cflavor",
 29 |             "bowtie2@custom_index:rRNA->STAR@default:genome",
 30 |             "test_hsa",
 31 |             "bowtie2@custom_index:rRNA->STAR@default:genome",
 32 |         ),
 33 |         (
 34 |             "unknown_cflavor",
 35 |             "rRNA:bowtie2@custom->genome:STAR@default",
 36 |             "test_hsa",
 37 |             "<class 'spacemake.errors.ConfigVariableNotFoundError'>",
 38 |         ),
 39 |         # ("flipped", "bowtie2:rRNA->STAR:genome", "rRNA:bowtie2->genome:STAR"),
 40 |     ]
 41 |     for name, mapstr, species, expect in data:
 42 |         # print(f"running test {name}")
 43 |         try:
 44 |             res = validate_mapstr(mapstr, config=config, species=species)
 45 |         except (ValueError, ConfigVariableNotFoundError) as e:
 46 |             res = str(type(e))
 47 | 
 48 |         print(f"test '{name}': {mapstr}-> {res} expect={expect} {expect == res}")
 49 |         assert res == expect
 50 | 
 51 | 
 52 | def test_mapstr(configured_root):
 53 |     config = ConfigFile.from_yaml((configured_root / "config.yaml").as_posix())
 54 |     data = [
 55 |         ("with_cflavor", "bowtie2@custom_index:rRNA->STAR@default:genome", None),
 56 |     ]
 57 |     for name, mapstr, expect in data:
 58 |         mr, lr = mapstr_to_targets(mapstr)
 59 |         assert mr[0].input_name == "uBAM"
 60 |         assert mr[-1].input_name == "rRNA.bowtie2"
 61 |         assert lr[0].link_src == "genome.STAR"
 62 |         assert lr[0].link_name == "final"
 63 | 
 64 | 
 65 | def test_get_mapped_BAM_output(configured_root):
 66 |     config = ConfigFile.from_yaml((configured_root / "config.yaml").as_posix())
 67 |     project_df = ProjectDF(
 68 |         (configured_root / "project_df.csv").as_posix(), config=config
 69 |     )
 70 | 
 71 |     out_files = get_mapped_BAM_output(project_df=project_df, config=config)
 72 |     print(out_files)
 73 | 
 74 | 
 75 | def test_validation_cmdline_issue_54(configured_root):
 76 |     os.chdir(configured_root.as_posix())
 77 |     data = [
 78 |         ("flipped", "rRNA:bowtie2->genome:STAR", "test_hsa", True),
 79 |         ("species_missing", "bowtie2:rRNA->STAR:genome", "test_hs", False),
 80 |         (
 81 |             "with_cflavor",
 82 |             "rRNA:bowtie2@custom_index->genome:STAR@default",
 83 |             "test_hsa",
 84 |             True,
 85 |         ),
 86 |         (
 87 |             "unknown_cflavor",
 88 |             "rRNA:bowtie2@customX->genome:STAR@defaultBLA",
 89 |             "test_hsa",
 90 |             False,
 91 |         ),
 92 |         # ("flipped", "bowtie2:rRNA->STAR:genome", "rRNA:bowtie2->genome:STAR"),
 93 |     ]
 94 |     for name, mapstr, species, expect_pass in data:
 95 |         print(f"running test {name}")
 96 |         # add
 97 |         sm(
 98 |             "projects",
 99 |             "add-sample",
100 |             "--project-id=test",
101 |             f"--sample-id={name}",
102 |             f"--map-strategy={mapstr}",
103 |             f"--R1={spacemake_dir}/test_data/reads_chr22_R1.fastq.gz",
104 |             f"--R2={spacemake_dir}/test_data/reads_chr22_R2.fastq.gz",
105 |             f"--species={species}",
106 |             expect_fail=not expect_pass,
107 |         )
108 | 


--------------------------------------------------------------------------------