├── .coveragerc ├── .gitignore ├── .readthedocs.yaml ├── CONTRIBUTING.md ├── COPYING ├── LICENSE ├── MANIFEST.in ├── README.md ├── build_instructions.txt ├── docs ├── .gitignore ├── api │ ├── api.rst │ ├── index.rst │ └── internal_api.rst ├── conf.py ├── config.rst ├── index.rst ├── initialize.rst ├── install.rst ├── links.rst ├── projects │ └── index.rst ├── quick-start │ ├── index.rst │ └── run_spacemake.rst ├── requirements.txt ├── run.rst ├── shared │ ├── shared_sample_variables.rst │ └── spacemake_init.rst ├── smk_logo.png ├── troubleshooting.rst └── tutorials │ ├── .gitignore │ ├── he_integration.ipynb │ ├── img │ ├── .gitignore │ ├── manual_alignment_1.png │ ├── manual_alignment_2.png │ ├── manual_alignment_3.png │ ├── manual_alignment_4.png │ ├── manual_alignment_5.png │ ├── manual_alignment_6.png │ ├── test_longread.donuts.png │ ├── test_longread.hists.png │ └── test_longread.oligo_edits.png │ ├── index.rst │ ├── longreads.rst │ ├── manual_he_integration.rst │ ├── novosparc_integration.ipynb │ └── process_single_cell_data.rst ├── environment.yaml ├── pyproject.toml ├── sequences ├── .gitignore └── primers.fa ├── setup.cfg ├── setup.py ├── spacemake ├── .gitignore ├── __init__.py ├── alnstats.py ├── annotator.py ├── bin │ ├── BamTagHistogram.py │ └── fastq_to_uBAM.py ├── cmdline.py ├── config.py ├── contrib.py ├── cutadapt_bam.py ├── data │ ├── .gitignore │ ├── config │ │ ├── config.yaml │ │ ├── longread.yaml │ │ └── species_data_url.yaml │ ├── puck_collection │ │ ├── create_novaseq_S4_coordinate_system.py │ │ └── openst_coordinate_system.csv │ ├── test │ │ ├── test_bc1.csv │ │ ├── test_bc2.csv │ │ ├── visium_public_lane_joined_1m_R1.fastq.gz │ │ └── visium_public_lane_joined_1m_R2.fastq.gz │ └── visium_barcode_positions.csv ├── errors.py ├── longread │ ├── __main__.py │ ├── annotation.py │ ├── cache.py │ ├── cmdline.py │ ├── overview.py │ ├── report.py │ └── signature.py ├── map_strategy.py ├── parallel.py ├── preprocess │ ├── __init__.py │ ├── cmdline.py │ ├── dge.py │ └── fastq.py ├── project_df.py ├── quant.py ├── reporting.py ├── smk.py ├── snakemake │ ├── __init__.py │ ├── downsample.smk │ ├── dropseq.smk │ ├── longread.smk │ ├── main.smk │ ├── mapping.smk │ ├── merge_samples.smk │ ├── scripts │ │ ├── .gitignore │ │ ├── automated_analysis.py │ │ ├── automated_analysis_create_processed_data_files.py │ │ ├── automated_analysis_create_report.Rmd │ │ ├── clean_top_barcodes.py │ │ ├── create_sample_db.R │ │ ├── create_sample_overview.Rmd │ │ ├── create_spatial_barcode_file.py │ │ ├── create_spatial_dge.py │ │ ├── filter_mm_reads.py │ │ ├── fix_bam_header.py │ │ ├── kmer_stats_from_fastq.py │ │ ├── n_intersect_sequences.py │ │ ├── parse_ribo_log.py │ │ ├── qc_sequencing_create_sheet.Rmd │ │ ├── saturation_analysis.Rmd │ │ ├── shared_functions.R │ │ ├── snakemake_helper_functions.py │ │ ├── splice_bam_header.py │ │ └── split_reads_by_strand_info.py │ ├── species_init.smk │ ├── variables.py │ └── visium.smk ├── spatial │ ├── __init__.py │ ├── cmdline.py │ ├── he_integration.py │ ├── novosparc_integration.py │ ├── puck_collection.py │ └── util.py ├── tag_alignments.py ├── unittests.py └── util.py ├── test.sh ├── test_data ├── README.md ├── make_chr22_test_data.py ├── mirgenedb.hsa.mature.fa.gz ├── mirgenedb.hsa.mature.gtf.gz ├── rRNA_hsa.fa.gz ├── reads_chr22_R1.fastq.gz ├── reads_chr22_R2.fastq.gz ├── test_annotation.gtf.gz ├── test_bam_md5.txt ├── test_config.yaml ├── test_genome.fa.gz ├── test_genome.gtf.gz ├── test_project_df.csv ├── test_reads.R1.fastq.gz ├── 
test_reads.R2.fastq.gz ├── tile_1.txt ├── tile_2.txt └── tile_3.txt └── tests ├── fixtures.py ├── test_cmdline.py ├── test_fastq_to_ubam.py └── test_map_strategy.py /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | concurrency = multiprocessing 3 | parallel = true 4 | sigterm = true 5 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | dist 2 | projects 3 | config.yaml 4 | samples.yaml 5 | *.ipynb 6 | *.tif 7 | *.jpg 8 | *.jpeg 9 | *.png 10 | *.rds 11 | *.html 12 | *.csv 13 | *.pdf 14 | *.txt.gz 15 | *.fastq.gz 16 | *.fa 17 | *.gtf 18 | *.DS_Store 19 | *.log 20 | *.bam 21 | .coverage* 22 | .snakemake 23 | build 24 | __pycache__ 25 | spacemake.egg-info 26 | scratch 27 | 28 | *.icloud 29 | *.h5ad 30 | -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | build: 4 | os: "ubuntu-20.04" 5 | tools: 6 | python: "3.9" 7 | 8 | sphinx: 9 | configuration: docs/conf.py 10 | 11 | python: 12 | install: 13 | - requirements: docs/requirements.txt -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to spacemake 2 | We want to make contributing to this project as easy and transparent as possible, whether it's: 3 | 4 | - Reporting a bug 5 | - Discussing the current state of the code 6 | - Submitting a fix 7 | - Proposing new features 8 | - Becoming a maintainer 9 | 10 | ## We Develop with Github 11 | We use GitHub to host code, to track issues and feature requests, and to accept pull requests. 12 | 13 | ## We Use [Github Flow](https://guides.github.com/introduction/flow/index.html), So All Code Changes Happen Through Pull Requests 14 | Pull requests are the best way to propose changes to the codebase (we use [Github Flow](https://guides.github.com/introduction/flow/index.html)). We actively welcome your pull requests: 15 | 16 | 1. Fork the repo and create your branch from `master`. 17 | 2. If you've added code that should be tested, add tests. 18 | 3. If you've changed APIs, update the documentation. 19 | 4. Ensure the test suite passes. 20 | 5. Make sure your code lints. 21 | 6. Issue that pull request! 22 | 23 | ## Any contributions you make will be under the GNU General Public License 24 | When you submit code changes, your submissions are understood to be under the same [GNU GPL-2.0 license](http://choosealicense.com/licenses/gpl-2.0/) that covers the project. Feel free to contact the maintainers if that's a concern. 25 | 26 | ## Report bugs using Github's [issues](https://github.com/rajewsky-lab/spacemake/issues) 27 | We use GitHub issues to track public bugs. Report a bug by [opening a new issue](https://github.com/rajewsky-lab/spacemake/issues/new?assignees=&labels=&template=bug-report.md&title=); it's that easy! 28 | 29 | ## Write bug reports with detail, background, and sample code 30 | **Great Bug Reports** tend to have: 31 | 32 | - A quick summary and/or background 33 | - Steps to reproduce 34 | - Be specific! 35 | - Give sample code if you can. 
36 | - What you expected would happen 37 | - What actually happens 38 | - Notes (possibly including why you think this might be happening, or stuff you tried that didn't work) 39 | 40 | ## Issue Triage 41 | Here are some tags that we're using to better organize issues in this repo: 42 | 43 | * `good first issue` - Good candidates for someone new to the project to contribute. 44 | * `help wanted` - Issues that should be addressed and which we would welcome a 45 | PR for, but may need significant investigation or work. 46 | * `support` - Request for help with a concept or piece of code, but this isn't an 47 | issue with the project. 48 | * `needs more info` - Missing repro steps or context for both project issues \& 49 | support questions. 50 | * `discussion` - Issues where folks are discussing various approaches \& ideas. 51 | * `question` - Something that is a question specifically for the maintainers such 52 | as [this issue about the license](https://github.com/facebook/draft-js/issues/1819). 53 | * `documentation` - Relating to improving documentation for the project. 54 | - Browser \& OS-specific tags for anything that is specific to a particular 55 | environment (e.g. `chrome`, `firefox`, `macos`, `android` and so forth). 56 | 57 | ## References 58 | This document was adapted from the open-source contribution guidelines for [Facebook's Draft](https://github.com/facebook/draft-js/blob/a9316a723f9e918afde44dea68b5f9f39b7d9b00/CONTRIBUTING.md). 59 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | GNU General Public License, version 2 (GPL-2.0) 2 | 3 | spacemake: pipeline for processing and analysing sequencing-based spatial-transcriptomics data. 4 | 5 | Copyright (C) 2021 Tamas Ryszard Sztanka-Toth, Marvin Jens, Nikolaos Karaiskos and Nikolaus Rajewsky. 6 | All rights reserved. 7 | 8 | This file is part of spacemake. 9 | 10 | Spacemake is free software; you can redistribute it and/or modify 11 | it under the terms of the GNU General Public License as published by 12 | the Free Software Foundation; either version 2 of the License, or 13 | (at your option) any later version. 14 | 15 | Spacemake is distributed in the hope that it will be useful, 16 | but WITHOUT ANY WARRANTY; without even the implied warranty of 17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 18 | GNU General Public License for more details. 19 | 20 | You should have received a copy of the GNU General Public License 21 | along with this program; if not, write to the Free Software 22 | Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
23 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | global-include *.smk *.csv *.py *.R *.Rmd *.yaml 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![docs](https://readthedocs.org/projects/spacemake/badge/?version=latest)](https://spacemake.readthedocs.io/) 2 | [![Downloads](https://pepy.tech/badge/spacemake)](https://pepy.tech/project/spacemake) 3 | [![PyPI Version](https://img.shields.io/pypi/v/spacemake.svg)](https://pypi.org/project/spacemake) 4 | [![PyPI License](https://img.shields.io/pypi/l/spacemake.svg)](https://pypi.org/project/spacemake) 5 | 6 | 7 | # Spacemake: processing and analysis of large-scale spatial transcriptomics data 8 | ### [🌐 docs](https://spacemake.readthedocs.io/en/latest/) | [📜 paper](https://doi.org/10.1093/gigascience/giac064) | [💬 discussions](https://github.com/rajewsky-lab/spacemake/discussions) 9 | 10 | 11 | Spacemake is a modular, robust, and scalable spatial transcriptomics pipeline built in `Snakemake` and `Python`. Spacemake is designed to handle all major spatial transcriptomics datasets and can be readily configured for other technologies. It can process and analyze several samples in parallel, even if they stem from different experimental methods. Spacemake's unified framework enables reproducible data processing from raw sequencing data to automatically generated downstream analysis reports. Spacemake is built with a modular design and offers additional functionality such as sample merging, saturation analysis, and analysis of long reads as separate modules. 12 | 13 | If you find Spacemake useful in your work, consider citing it: 14 | 15 | ``` 16 | Spacemake: processing and analysis of large-scale spatial transcriptomics data 17 | Tamas Ryszard Sztanka-Toth, Marvin Jens, Nikos Karaiskos, Nikolaus Rajewsky 18 | GigaScience, Volume 11, 2022, giac064 19 | ``` 20 | 21 | Documentation can be found [here](https://spacemake.readthedocs.io/en/latest/). 22 | 23 | ## Unit testing 24 | 25 | We are committed to achieving a high code coverage with unit tests. The master branch utilizes the `unittest` module to run spacemake with small test data sets. On the current development branches, we have switched to `pytest` and cover a much broader range of the code. This work is ongoing. 26 | 27 | To run the currently implemented tests on master, run `python spacemake/unittests.py`. This will create a directory `spacemake/_tests/` inside which a minimal spacemake directory structure will be created using `spacemake init` and subsequently some of the core functionality (adding genomes/species, samples, changing configuration, etc.) will be executed. All output will be logged to `spacemake/_tests/run_spacemake.out.log`. If you encounter any weird behavior, please make sure to include the content of this file in your ticket on the issue tracker. Thank you! 28 | 29 | 30 | ## Contributing 31 | `Spacemake` is an open-source project mostly maintained by the [Rajewsky lab @ MDC Berlin](https://www.mdc-berlin.de/n-rajewsky) - so, your involvement is warmly welcome! 32 | If you're excited to join us, we recommend the following steps: 33 | 34 | - Found a bug? Contact an admin in the form of an [issue](https://github.com/rajewsky-lab/spacemake/issues/new?assignees=&labels=&template=bug-report.md&title=). 
35 | - Implement your idea following the guidelines set by the [official contributing guide](CONTRIBUTING.md) 36 | - Wait for admin approval; review is iterative, and accepted changes are merged into the main repository. 37 | 38 | In general, you can always refer to the [contribution guidelines](CONTRIBUTING.md) for more details! 39 | Currently, only [admins](https://github.com/orgs/rajewsky-lab/people) will be merging all accepted changes. 40 | 41 | ## Code of Conduct 42 | Everyone interacting in `spacemake`'s codebases, issue trackers, and discussion forums is expected to follow the [PSF Code of Conduct](https://www.python.org/psf/conduct/). 43 | -------------------------------------------------------------------------------- /build_instructions.txt: -------------------------------------------------------------------------------- 1 | # How this package was built for PyPI 2 | 3 | - installing 'build' and 'twine' via pip 4 | 5 | `python3 -m pip install --upgrade build` 6 | `python3 -m pip install --upgrade twine` 7 | 8 | - getting an API token from PyPI and placing it in ~/.pypirc 9 | 10 | - in top-level (where pyproject.toml resides) `python -m build` 11 | 12 | This creates package files in the dist/ subdirectory 13 | 14 | - upload 15 | testpypi: `python3 -m twine upload --repository testpypi dist/* ` 16 | live pypi: `python3 -m twine upload --repository pypi dist/* ` 17 | 18 | 19 | 20 | -------------------------------------------------------------------------------- /docs/.gitignore: -------------------------------------------------------------------------------- 1 | !*.ipynb 2 | !projects 3 | -------------------------------------------------------------------------------- /docs/api/api.rst: -------------------------------------------------------------------------------- 1 | API 2 | === 3 | 4 | Spacemake class 5 | --------------- 6 | 7 | Accessing spacemake objects from python 8 | 9 | .. autoclass:: spacemake.Spacemake 10 | :members: 11 | 12 | H&E integration module 13 | ---------------------- 14 | 15 | .. autofunction:: spacemake.spatial.he_integration.align_he_spot_img 16 | 17 | .. autofunction:: spacemake.spatial.he_integration.align_he_aggregated_img 18 | 19 | .. autofunction:: spacemake.spatial.he_integration.attach_he_adata 20 | 21 | novosparc integration module 22 | ---------------------------- 23 | 24 | .. autofunction:: spacemake.spatial.novosparc_integration.novosparc_denovo 25 | 26 | .. autofunction:: spacemake.spatial.novosparc_integration.save_novosparc_res 27 | 28 | .. autofunction:: spacemake.spatial.novosparc_integration.novosparc_mapping 29 | 30 | .. autofunction:: spacemake.spatial.novosparc_integration.quantify_clusters_spatially 31 | -------------------------------------------------------------------------------- /docs/api/index.rst: -------------------------------------------------------------------------------- 1 | API and Internal API 2 | ==================== 3 | 4 | .. toctree:: 5 | 6 | api 7 | internal_api 8 | -------------------------------------------------------------------------------- /docs/api/internal_api.rst: -------------------------------------------------------------------------------- 1 | Internal API 2 | ============ 3 | 4 | ProjectDF 5 | --------- 6 | 7 | The ProjectDF class is the core back-end class of spacemake. 8 | 9 | .. automodule:: spacemake.project_df 10 | :members: 11 | 12 | ConfigFile 13 | ---------- 14 | 15 | This class is responsible for updating spacemake's configuration. 16 | 17 | .. 
automodule:: spacemake.config 18 | :members: 19 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | 3 | # -- Project information 4 | 5 | project = 'spacemake' 6 | copyright = '2021-2024, Rajewsky Lab' 7 | author = 'Tamas Ryszard Sztanka-Toth, Marvin Jens, Nikos Karaiskos, Nikolaus Rajewsky' 8 | 9 | version = '0.8.0' 10 | release = version 11 | 12 | # -- General configuration 13 | 14 | extensions = [ 15 | "sphinx_rtd_theme", 16 | 'sphinx.ext.duration', 17 | 'sphinx.ext.doctest', 18 | 'sphinx.ext.autodoc', 19 | 'sphinx.ext.autosummary', 20 | 'sphinx.ext.intersphinx', 21 | 'sphinx.ext.autosectionlabel', 22 | 'nbsphinx' 23 | ] 24 | 25 | intersphinx_mapping = { 26 | 'python': ('https://docs.python.org/3/', None), 27 | 'sphinx': ('https://www.sphinx-doc.org/en/master/', None), 28 | } 29 | 30 | intersphinx_disabled_domains = ['std'] 31 | 32 | templates_path = ['_templates'] 33 | 34 | # -- Options for HTML output 35 | html_theme = "sphinx_rtd_theme" 36 | html_theme_options = { 37 | 'navigation_depth': 3 38 | } 39 | 40 | # -- Options for EPUB output 41 | epub_show_urls = 'footnote' 42 | 43 | import os 44 | import sys 45 | sys.path.insert(0, os.path.abspath('../')) 46 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | Spacemake: processing and analyzing sequencing-based spatial transcriptomics data 2 | ================================================================================= 3 | 4 | Spacemake is a modular, robust, and scalable spatial transcriptomics pipeline built 5 | in Snakemake and Python. Spacemake is designed to handle all major spatial transcriptomics 6 | datasets and can be readily configured for other technologies. It can process and analyze 7 | several samples in parallel, even if they stem from different experimental methods. 8 | Spacemake's unified framework enables reproducible data processing from raw sequencing 9 | data to automatically generated downstream analysis reports. Spacemake is built with 10 | a modular design and offers additional functionality such as sample merging, saturation 11 | analysis, and analysis of long reads as separate modules. 12 | 13 | .. toctree:: 14 | :maxdepth: 3 15 | :hidden: 16 | 17 | install 18 | quick-start/index.rst 19 | initialize 20 | config 21 | projects/index 22 | run 23 | tutorials/index 24 | troubleshooting 25 | api/index 26 | 27 | -------------------------------------------------------------------------------- /docs/initialize.rst: -------------------------------------------------------------------------------- 1 | Initialization 2 | ============== 3 | 4 | Initializing using required arguments 5 | ------------------------------------- 6 | 7 | .. include:: shared/spacemake_init.rst 8 | 9 | Optional arguments 10 | ------------------ 11 | 12 | The `spacemake init` command takes the following optional arguments: 13 | 14 | ``root-dir`` 15 | The ``root-dir`` for the spacemake instance. Defaults to ``.``, the directory in which `spacemake init` is run. 16 | 17 | ``temp-dir`` 18 | Path to the temporary directory, defaults to ``/tmp``. 19 | 20 | ``download-species`` 21 | If set, spacemake will download the genome (.fa) and annotation (.gtf) files for mouse and 22 | human from gencode, as specified `here `_. 
23 | 24 | Hence, the complete `spacemake init` command looks like this:: 25 | 26 | spacemake init \ 27 | --root-dir ROOT-DIR \ # optional 28 | --temp-dir TEMP-DIR \ # optional 29 | --download-species \ # optional 30 | --dropseq-tools DROPSEQ-TOOLS # required 31 | -------------------------------------------------------------------------------- /docs/install.rst: -------------------------------------------------------------------------------- 1 | Installation 2 | ============ 3 | 4 | Step 1: create conda environment 5 | -------------------------------- 6 | 7 | The most straightforward way to install spacemake is to first create a conda environment with all required packages. 8 | We highly recommend using `mamba `_, a much faster conda package manager than conda itself. 9 | After mamba is installed, download the `environment.yaml `_. 10 | This file contains all dependencies required by spacemake. 11 | 12 | Once downloaded, to install all spacemake dependencies type:: 13 | 14 | mamba env create -f environment.yaml 15 | 16 | This will create a conda environment called ``spacemake``. To activate the newly created environment type:: 17 | 18 | conda activate spacemake 19 | 20 | Step 2: download Dropseq-tools 21 | ------------------------------ 22 | 23 | Spacemake currently requires downloading `Dropseq-tools `_. 24 | This package is a collection of processing tools originally written for `Drop-seq `_. Spacemake uses several functions from this package during pre-processing and processing, and without it spacemake cannot run. 25 | 26 | Simply download one of the releases (we recommend using `2.5.1 `_) and place it somewhere in your filesystem. 27 | 28 | 29 | Step 3: install spacemake 30 | ------------------------- 31 | 32 | **After creating the conda environment and downloading Dropseq-tools** (as described above), 33 | spacemake can be installed via ``pip``:: 34 | 35 | pip install spacemake 36 | 37 | This will install spacemake; you should be good to go :) 38 | 39 | .. warning:: 40 | Make sure to first create the conda environment as described above. 41 | 42 | Although it is also possible to install the required packages independently, and then 43 | to install spacemake, this option has not been tested, and one can quickly run into 44 | dependency issues and errors. 45 | 46 | To make sure spacemake has been properly installed, run:: 47 | 48 | spacemake --version 49 | 50 | This should output the version of spacemake that was installed via ``pip``. -------------------------------------------------------------------------------- /docs/links.rst: -------------------------------------------------------------------------------- 1 | 2 | .. _Seq-scope: https://www.sciencedirect.com/science/article/pii/S0092867421006279 3 | .. _Visium: https://www.10xgenomics.com/products/spatial-gene-expression 4 | .. _Slide-seq: https://www.nature.com/articles/s41587-020-0739-1 5 | .. _Drop-seq: https://mccarrolllab.org/dropseq/ 6 | .. _10X Chromium: https://www.10xgenomics.com/products/single-cell-gene-expression 7 | -------------------------------------------------------------------------------- /docs/quick-start/run_spacemake.rst: -------------------------------------------------------------------------------- 1 | After a sample is added spacemake can be run with:: 2 | 3 | spacemake run --cores --keep-going 4 | 5 | The ``--keep-going`` flag is optional; however, it ensures that spacemake runs all 6 | the jobs it can, even if one job fails (this logic is directly taken from snakemake). 
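For example (a minimal sketch; the core count of 8 is an arbitrary choice and should be matched to your machine), one can first preview the scheduled jobs with a dry run and then launch the full workflow::

    # preview the jobs that would be executed, without running anything
    spacemake run --cores 8 --keep-going --dryrun

    # launch the full workflow
    spacemake run --cores 8 --keep-going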
7 | 8 | For a complete explanation on the `spacemake run` command :ref:`check out the documentation here `. 9 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | sphinx==5.0.2 2 | sphinxcontrib-napoleon 3 | docutils>=0.14 4 | sphinx_rtd_theme 5 | sphinx-argparse 6 | nbsphinx>=0.4 7 | importlib-metadata 8 | Jinja2<3.1 9 | -------------------------------------------------------------------------------- /docs/run.rst: -------------------------------------------------------------------------------- 1 | .. _Running spacemake general: 2 | 3 | Running spacemake 4 | ================= 5 | 6 | Main modules 7 | ------------ 8 | 9 | After spacemake is configured with the ``spacemake config`` command, and projects/samples 10 | are added with the ``spacemake projects`` command, spacemake can be run with the 11 | ``spacemake run`` command. It takes the following parameters:: 12 | 13 | spacemake run \ 14 | --cores CORES \ # number of cores to be used in total 15 | --dryrun, -n \ # invokes a dry snakemake run, printing only commands 16 | --rerun-incomplete, --ri \ 17 | # forces snakemake to rerun incompletely generated files 18 | --keep-going \ # if a job fails, keep executing independent jobs. 19 | # we recommend to always set this when running spacemake 20 | # overnight 21 | --printshellcmds, -p \ 22 | # print shell commands for each rule, if they exist 23 | --touch, -t \ # rather than running the rules, just touch each file 24 | --with_fastqc, -wfqc 25 | # also run fastqc as part of the spacemake run 26 | 27 | Downsampling 28 | ------------ 29 | 30 | To run a downsampling (or saturation) analysis, one can use the following command:: 31 | 32 | spacemake run downsample \ 33 | --project_id_list [PROJECT_ID_LIST ...] \ 34 | --sample_id_list [SAMPLE_ID_LIST ...] 35 | 36 | In the ``project_id_list`` and ``sample_id_list`` arguments one can specify a 37 | list of ``project_id``-s and ``sample_id``-s, respectively, for which the downsampling 38 | should be run. It is possible to set only one, or both of these arguments. If both are 39 | set, the downsampling will be run on samples for which the ``project_id`` and the ``sample_id`` are in both lists (intersection). 40 | 41 | .. note:: 42 | 43 | In addition to the list arguments specified above, the downsample command also 44 | takes the same arguments as the simple ``spacemake run`` command. 45 | -------------------------------------------------------------------------------- /docs/shared/shared_sample_variables.rst: -------------------------------------------------------------------------------- 1 | Among the most important parts of spacemake are the so-called 'shared sample-variables'. 2 | These are reusable, user-definable variables, which we can assign to several samples. 3 | They can be briefly defined as follows: 4 | 5 | ``species`` 6 | a collection of genome, annotation and rRNA\_genome. There is no default species, and each sample can have exactly one species. 7 | 8 | ``barcode_flavor`` 9 | the variable which specifies the structure of Read1 and Read2, namely how the cell\_barcode and UMI should be extracted. If no value is provided for a sample, the default will be used. 10 | 11 | ``run_mode`` 12 | each sample can have several ``run_mode``-s, all of which are user definable. If no ``run_mode``-s are specified, a sample will be processed using ``default`` ``run_mode`` settings. 
13 | 14 | ``puck`` (spatial only) 15 | if a sample is spatial, it has to have a puck variable. If no puck is specified, a default puck will be used. 16 | 17 | 18 | To add, update, delete or list a shared sample-variable, you can use the following commands:: 19 | 20 | spacemake config add_ 21 | spacemake config update_ 22 | spacemake config delete_ 23 | spacemake config list_ 24 | 25 | where ```` is one of ``species, barcode_flavor, run_mode or puck`` 26 | -------------------------------------------------------------------------------- /docs/shared/spacemake_init.rst: -------------------------------------------------------------------------------- 1 | After you have installed spacemake as specified :ref:`here `, you are ready to process and analyze spatial samples. 2 | 3 | To initialize spacemake ``cd`` into the directory in which you want to start spacemake. This directory will be your ``project_root``. 4 | Then simply type:: 5 | 6 | spacemake init \ 7 | --dropseq_tools 8 | 9 | Here the `path_to_dropseq_tools_dir` should point to the directory of the Dropseq-tools package downloaded :ref:`in Step 2 of the installation `. 10 | -------------------------------------------------------------------------------- /docs/smk_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rajewsky-lab/spacemake/274cd58e7ba9de244aca25c6bbd7cdfe49683753/docs/smk_logo.png -------------------------------------------------------------------------------- /docs/troubleshooting.rst: -------------------------------------------------------------------------------- 1 | Troubleshooting 2 | =============== 3 | Below is a list of known issues you may encounter when running spacemake. 4 | As spacemake depends on several external libraries and tools, it is not 5 | always possible to resolve some of these issues. 6 | 7 | Bumped into another problem that is not documented here? Feel free to `open 8 | an issue on Github. `_ 9 | 10 | 11 | GLIBCXX_xxx not found 12 | ^^^^^^^^^^^^^^^^^^^^^ 13 | In certain environments you might run into the following error: 14 | 15 | .. code-block:: console 16 | 17 | ImportError: /lib/x86_64-linux-gnu/libstdc++.so.6: version `GLIBCXX_3.4.29' not found (required by /.../envs/spacemake/lib/python3.10/site-packages/matplotlib/_path.cpython-310-x86_64-linux-gnu.so) 18 | 19 | Certain dependencies (e.g. scipy) might affect this. To overcome it, try the following: 20 | 21 | .. code-block:: console 22 | 23 | export LD_LIBRARY_PATH=/conda_installation_folder/envs/spacemake/lib:$LD_LIBRARY_PATH 24 | 25 | For more details on this and further troubleshooting visit `stackoverflow `_. 26 | 27 | 28 | Issues with memory usage 29 | ^^^^^^^^^^^^^^^^^^^^^^^^ 30 | Certain steps of the spacemake workflow might currently result in excessive memory 31 | usage. These occur in samples where a large number of barcodes exist in the data, 32 | such as for ``stereo-seq`` or ``open-ST``. The excessive memory usage is due to using 33 | the ``Drop-seq`` tools, where a specific memory size is allocated for ``java``. 34 | 35 | We are working on removing the dependency on the ``Drop-seq`` tools altogether, 36 | which will also speed up several steps. If you run into memory errors, however, you 37 | can solve them by modifying the ``main.smk`` file inside your spacemake installation, 38 | which should be somewhere in 39 | 40 | .. 
code-block:: console 41 | 42 | /path_to_conda/envs/spacemake/lib/python3.10/site-packages/spacemake/snakemake 43 | 44 | inside your conda installation folder. Simply modify the following lines 45 | 46 | .. code-block:: console 47 | 48 | {dropseq_tools}/BamTagHistogram -m 32g 49 | {dropseq_tools}/DigitalExpression -m 16g 50 | 51 | by increasing the value of ``-m`` accordingly. 52 | 53 | 54 | Issues with STAR 55 | ^^^^^^^^^^^^^^^^ 56 | To reduce memory usage when running several samples at the same time, 57 | spacemake uses STAR's shared memory capability. This currently has 58 | the following limitations: 59 | 60 | 1. It is not possible for one user to run two distinct spacemake instances with the same genome index. Multiple spacemake instances (each processing several samples) can run at the same time if different species indexes are used. 61 | 2. Similarly, it is not possible for two users to run spacemake with the same genome index loaded at the same time. 62 | 63 | In addition to the above, you might run into STAR-related errors if the spacemake 64 | instance was killed before finishing. This occurs when the genome index is still loaded 65 | into memory, and STAR will either throw an error and exit, or just stall. In that case, 66 | try to run: 67 | 68 | .. code-block:: console 69 | 70 | STAR --genomeLoad Remove --genomeDir 71 | 72 | In case the shared memory cannot be released (Linux), try the following: 73 | 74 | .. code-block:: console 75 | 76 | ipcs -m | grep `whoami` | awk '{ print $2 }' | xargs -n1 ipcrm -m 77 | -------------------------------------------------------------------------------- /docs/tutorials/.gitignore: -------------------------------------------------------------------------------- 1 | !*.png 2 | -------------------------------------------------------------------------------- /docs/tutorials/img/.gitignore: -------------------------------------------------------------------------------- 1 | !*.png 2 | !*.jpg 3 | -------------------------------------------------------------------------------- /docs/tutorials/img/manual_alignment_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rajewsky-lab/spacemake/274cd58e7ba9de244aca25c6bbd7cdfe49683753/docs/tutorials/img/manual_alignment_1.png -------------------------------------------------------------------------------- /docs/tutorials/img/manual_alignment_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rajewsky-lab/spacemake/274cd58e7ba9de244aca25c6bbd7cdfe49683753/docs/tutorials/img/manual_alignment_2.png -------------------------------------------------------------------------------- /docs/tutorials/img/manual_alignment_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rajewsky-lab/spacemake/274cd58e7ba9de244aca25c6bbd7cdfe49683753/docs/tutorials/img/manual_alignment_3.png -------------------------------------------------------------------------------- /docs/tutorials/img/manual_alignment_4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rajewsky-lab/spacemake/274cd58e7ba9de244aca25c6bbd7cdfe49683753/docs/tutorials/img/manual_alignment_4.png -------------------------------------------------------------------------------- /docs/tutorials/img/manual_alignment_5.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/rajewsky-lab/spacemake/274cd58e7ba9de244aca25c6bbd7cdfe49683753/docs/tutorials/img/manual_alignment_5.png -------------------------------------------------------------------------------- /docs/tutorials/img/manual_alignment_6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rajewsky-lab/spacemake/274cd58e7ba9de244aca25c6bbd7cdfe49683753/docs/tutorials/img/manual_alignment_6.png -------------------------------------------------------------------------------- /docs/tutorials/img/test_longread.donuts.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rajewsky-lab/spacemake/274cd58e7ba9de244aca25c6bbd7cdfe49683753/docs/tutorials/img/test_longread.donuts.png -------------------------------------------------------------------------------- /docs/tutorials/img/test_longread.hists.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rajewsky-lab/spacemake/274cd58e7ba9de244aca25c6bbd7cdfe49683753/docs/tutorials/img/test_longread.hists.png -------------------------------------------------------------------------------- /docs/tutorials/img/test_longread.oligo_edits.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rajewsky-lab/spacemake/274cd58e7ba9de244aca25c6bbd7cdfe49683753/docs/tutorials/img/test_longread.oligo_edits.png -------------------------------------------------------------------------------- /docs/tutorials/index.rst: -------------------------------------------------------------------------------- 1 | Tutorials 2 | ========= 3 | 4 | .. toctree:: 5 | he_integration 6 | manual_he_integration 7 | process_single_cell_data 8 | longreads 9 | novosparc_integration 10 | -------------------------------------------------------------------------------- /docs/tutorials/manual_he_integration.rst: -------------------------------------------------------------------------------- 1 | Manual H&E alignment 2 | ==================== 3 | 4 | Before you begin 5 | ---------------- 6 | 7 | Before you start, make sure that you have installed spacemake as specified :ref:`here `. 8 | 9 | For the manual alignment we will use Fiji, an open-source image processing tool. Download it from `here `_. 10 | 11 | We will be using tile nr 2105 from `Seq-scope `_ for this tutorial. The corresponding H&E image is 12 | `wt_4X_2.jpg `_. 13 | 14 | Step 1 - generate an expression image 15 | ------------------------------------- 16 | 17 | First, using the command line, we generate an aggregated expression image. In the directory of your spacemake project, type: 18 | 19 | .. code-block:: console 20 | 21 | spacemake spatial create_aggregated_expression_img \ 22 | --project_id seq_scope \ 23 | --sample_id seq_scope_liver_2105 \ 24 | --run_mode seq_scope \ 25 | --processed_data False \ 26 | --binary True \ 27 | --out_img aggregated_seq_scope_2105_img_bw.png 28 | 29 | This will generate a black and white image based on expression data. 30 | 31 | Step 2 - load images into Fiji 32 | ------------------------------ 33 | 34 | In the next step we load both images into Fiji like below: 35 | 36 | .. 
image:: img/manual_alignment_1.png 37 | :width: 100% 38 | :alt: Manual alignment first step 39 | 40 | Step 3 - select corresponding points 41 | ------------------------------------ 42 | 43 | Next, using the *Multi-point Tool* we manually select corresponding points between our expression image and the H&E image. 44 | Select a point on one of the images, and then select a corresponding point on the other image. Do this for at least 4-5 corresponding points for a better match. 45 | 46 | .. image:: img/manual_alignment_2.png 47 | :width: 100% 48 | :alt: Manual alignment second step 49 | 50 | Step 4 - align the images 51 | ------------------------- 52 | 53 | We then use the `Landmark Correspondences `_ plugin to align the two images based on the corresponding points we 54 | selected in the previous step. We go to *Plugins -> Transform -> Landmark Correspondences*: 55 | 56 | .. image:: img/manual_alignment_3.png 57 | :width: 100% 58 | 59 | In the pop-up window we select the H&E image as the *source image* and the expression image as the *template image*. 60 | For the *transformation method* select *Moving Least Squares (non-linear)*. Set the *alpha* to *1.00* and the *mesh resolution* to *32*. 61 | Set the *transformation class* to *Affine*. 62 | 63 | .. image:: img/manual_alignment_4.png 64 | :width: 100% 65 | 66 | After the transformation we have the two images aligned. We can now save our transformed H&E image (which is aligned with our spatial data). 67 | 68 | .. image:: img/manual_alignment_5.png 69 | :width: 100% 70 | 71 | 72 | Step 5 - attach the aligned image 73 | --------------------------------- 74 | 75 | First we load the spacemake processed Seq-scope tile nr 2105 data: 76 | 77 | .. code-block:: ipython3 78 | 79 | from spacemake import Spacemake 80 | 81 | spmk = Spacemake('/path/to/your/spacemake/project') 82 | 83 | adata_2105 = spmk.load_processed_adata( 84 | project_id = 'seq_scope', 85 | sample_id = 'seq_scope_liver_2105', 86 | run_mode_name = 'seq_scope', 87 | umi_cutoff = 300 88 | ) 89 | 90 | Then we load the previously manually aligned image and attach it to our data: 91 | 92 | .. code-block:: ipython3 93 | 94 | from spacemake.spatial.he_integration import attach_he_adata 95 | import cv2 96 | 97 | matched_he = cv2.imread('./Transformedwt_4X_2.tif') 98 | 99 | adata = attach_he_adata(adata_2105.copy(), 100 | matched_he, 101 | push_by_spot_diameter=False, 102 | raw_aligned=True) 103 | 104 | After attachment, we can plot our expression data on top of the aligned H&E with `scanpy `_: 105 | 106 | .. code-block:: ipython3 107 | 108 | import scanpy as sc 109 | 110 | sc.set_figure_params(dpi=300) 111 | 112 | sc.pl.spatial(adata, color='total_counts') 113 | 114 | .. image:: img/manual_alignment_6.png 115 | :width: 100% 116 | 117 | 118 | .. note:: 119 | 120 | Axes in scanpy are flipped with respect to the axes in Fiji, because Fiji reads the image axes in a different order. 121 | -------------------------------------------------------------------------------- /docs/tutorials/process_single_cell_data.rst: -------------------------------------------------------------------------------- 1 | .. include:: ../links.rst 2 | 3 | Processing a custom single-cell sample 4 | ====================================== 5 | 6 | In this tutorial we will process a custom single-cell sample. 7 | 8 | As an example we will be using 1 million reads from `this Visium dataset `_. 9 | 10 | .. note:: 11 | 12 | Firstly, the example data used here is a 10X `Visium`_ dataset, hence it is spatial. 
13 | However, for the sake of this tutorial, we will be treating it as a single-cell sample. 14 | 15 | Secondly, for many methods (such as `Visium`_, `10X Chromium`_, `Slide-seq`_ or `Seq-scope`_) 16 | spacemake provides pre-defined variables. If you are using 17 | one of these methods follow our :ref:`Quick start guide ` instead. 18 | 19 | Step 1: install and initialize spacemake 20 | ----------------------------------------- 21 | 22 | To install spacemake follow the :ref:`installation guide here `. 23 | 24 | To initialize spacemake follow the :ref:`initialization guide here `. 25 | 26 | Step 2: download test data 27 | -------------------------- 28 | 29 | For the sake of this tutorial we will work with a test dataset: 1 million Read1 and 1 million Read2 reads from a `Visium`_ adult mouse brain. 30 | 31 | To download the test data: 32 | 33 | .. code-block:: 34 | 35 | wget -nv http://bimsbstatic.mdc-berlin.de/rajewsky/spacemake-test-data/visium/test_fastq/visium_public_lane_joined_1m_R1.fastq.gz 36 | wget -nv http://bimsbstatic.mdc-berlin.de/rajewsky/spacemake-test-data/visium/test_fastq/visium_public_lane_joined_1m_R2.fastq.gz 37 | 38 | .. note:: 39 | 40 | If there is already data available to be processed and analyzed, this step can be omitted. 41 | 42 | Step 3: add a new species 43 | ------------------------- 44 | 45 | .. note:: 46 | 47 | If you initialized spacemake with the ``--download-species`` flag, you can 48 | omit this step, as spacemake will automatically download and configure 49 | the mm10 mouse genome.fa and annotation.gtf files for you. 50 | 51 | The sample we are working with here is a mouse brain sample, so we have to add a new species: 52 | 53 | .. code-block:: console 54 | 55 | spacemake config add_species --name mouse \ 56 | --annotation /path/to/mouse/annotation.gtf \ 57 | --genome /path/to/mouse/genome.fa 58 | 59 | 60 | Step 4: add a new barcode\_flavor 61 | --------------------------------- 62 | 63 | The ``barcode_flavor`` decides from which nucleotides of Read1/Read2 the UMIs and cell-barcodes are extracted. 64 | 65 | In this particular test sample, the first 16 nucleotides of Read1 are the cell-barcode, and the following 12 nucleotides are the UMIs. 66 | 67 | Consequently, we create a new ``barcode_flavor`` like this: 68 | 69 | .. code-block:: console 70 | 71 | spacemake config add_barcode_flavor --name test_barcode_flavor \ 72 | --cell_barcode r1[0:16] \ 73 | --umi r1[16:28] 74 | 75 | .. note:: 76 | 77 | There are several ``barcode_flavors`` provided by spacemake out of the box, 78 | such as ``visium`` for 10X `Visium`_ or ``sc_10x_v2`` for `10X Chromium`_ v2 79 | kits. The ``default`` flavor is identical to a `Drop-seq`_ library, with a 12 80 | nucleotide cell-barcode and an 8 nucleotide UMI. 81 | 82 | :ref:`More info about provided flavors here `. 83 | 84 | If you want to use one of these, there is no need to add your own flavor. 85 | 86 | Step 5: add a new run\_mode 87 | --------------------------- 88 | 89 | A ``run_mode`` in spacemake defines how a sample should be processed downstream. 90 | In this tutorial, we will trim the PolyA stretches from the 3' end of Read2, 91 | count both exonic and intronic reads, expect 5000 cells, 92 | turn off multi-mapper counting (so only unique reads are counted), and analyze the data 93 | using 50, 100 and 300 UMI cutoffs. To set these parameters, we define a 94 | ``test_run_mode`` like this: 95 | 96 | .. 
code-block:: console 97 | 98 | spacemake config add_run_mode --name test_run_mode \ 99 | --polyA_adapter_trimming True \ 100 | --count_mm_reads False \ 101 | --n_beads 5000 \ 102 | --count_intronic_reads True \ 103 | --umi_cutoff 50 100 300 104 | 105 | .. note:: 106 | 107 | As with ``barcode_flavors``, spacemake provides several ``run_modes`` out 108 | of the box. For more info :ref:`check out a more detailed guide here `. 109 | 110 | Step 6: add the sample 111 | ---------------------- 112 | 113 | After configuring all the steps above, we are ready to add our (test) sample: 114 | 115 | .. code-block:: console 116 | 117 | spacemake projects add_sample --project_id test_project \ 118 | --sample_id test_sample \ 119 | --R1 visium_public_lane_joined_1m_R1.fastq.gz \ 120 | --R2 visium_public_lane_joined_1m_R2.fastq.gz \ 121 | --species mouse \ 122 | --barcode_flavor test_barcode_flavor \ 123 | --run_mode test_run_mode 124 | 125 | .. note:: 126 | 127 | If there is already data available, here the Read1 and Read2 ``.fastq.gz`` files should be added, 128 | instead of the test files. 129 | 130 | Step 7: run spacemake 131 | ---------------------- 132 | 133 | Now we can process our samples with spacemake. Since we added only one sample, only one sample will be processed 134 | and analyzed. To start spacemake, simply write: 135 | 136 | .. code-block:: console 137 | 138 | spacemake run --cores 16 139 | 140 | .. note:: 141 | 142 | The number of cores used should be suited to the machine on which spacemake is run. 143 | When processing more than one sample, we recommend using spacemake with at least 8 cores 144 | in order to achieve maximum parallelism. 145 | 146 | Step 8: results 147 | --------------- 148 | 149 | The results of the analysis for this sample will be under ``projects/test_project/processed_data/test_sample/illumina/complete_data/`` 150 | 151 | Under this directory, there are several files and directories which are important: 152 | 153 | * ``final.polyA_adapter_trimmed.bam``: final, mapped, tagged ``.bam`` file. The ``CB`` tag contains the cell barcode, and the ``MI`` contains the UMI-s. 154 | 155 | * ``qc_sheet_test_sample_no_spatial_data.html``: the QC-sheet for this sample, as a self-contained ``.html`` file. 156 | 157 | * ``dge/``: a directory containing the Digital Expression Matrices (DGEs) 158 | 159 | * ``dge.all.polyA_adapter_trimmed.5000_beads.txt.gz``: a compressed, text based DGE 160 | 161 | * ``dge.all.polyA_adapter_trimmed.5000_beads.h5ad``: the same DGE but stored in ``.h5ad`` format (`used by the anndata python package `_). This matrix is stored as a Compressed Sparse Column matrix (using `scipy.sparse.csc_matrix `_). 162 | 163 | * ``dge.all.polyA_adapter_trimmed.5000_beads.summary.txt``: the summary of the DGE, one line per cell. 164 | 165 | * ``dge.all.polyA_adapter_trimmed.5000_beads.obs.csv``: the observation table of the matrix. Similar to the previous file, but more detailed. 166 | 167 | * ``automated_analysis/test_run_mode/umi_cutoff_50/``: In this directory the results of the automated analysis can be found. As can be seen, under the ``automated_analysis`` directory there are two further levels, one for ``run_mode`` and one for ``umi_cutoff``. This is because one sample can have several ``run_modes`` and in the same way one ``run_mode`` can have several UMI cutoffs. 168 | 169 | * ``results.h5ad``: the result of the automated analysis, stored in an anndata object. Same as the DGE before, but containing processed data. 
170 | 171 | * ``test_sample_no_spatial_data_illumina_automated_report.html``: automated analysis self-contained ``.html`` report. 172 | 173 | .. note:: 174 | 175 | If the ``test_project`` had more samples, then those would be automatically placed under ``projects/test_project``. Similarly, under one spacemake 176 | directory there can be several projects in parallel, and each will have its own directory structure under the ``projects/`` folder. 177 | 178 | -------------------------------------------------------------------------------- /environment.yaml: -------------------------------------------------------------------------------- 1 | name: spacemake 2 | channels: 3 | - bih-cubi 4 | - conda-forge 5 | - bioconda 6 | - nodefaults 7 | dependencies: 8 | - python>=3.6,<3.12 9 | - snakemake>=5.32.0,<6.4.0 10 | - star>=2.7.1a 11 | - samtools>=1.13 12 | - sambamba>=0.6.8 13 | - bowtie2>=2.3.4 14 | - bcl2fastq2>=2.19 15 | - fastqc>=0.11.9 16 | - pip>=21.1 17 | - r-base>=4.0.3 18 | - r-rmarkdown>=2.7 19 | - r-tidyverse>=1.3.1 20 | - r-kableextra>=1.3.4 21 | - r-cowplot>=1.1.1 22 | - r-pals>=1.7 23 | - r-hexbin 24 | - r-scales 25 | - pysam>=0.16.0.1 26 | - pot 27 | - openjdk==11.0.15 28 | - pigz 29 | - pip: 30 | - setproctitle 31 | - isal 32 | - pytest 33 | - pytest-cov 34 | - mrfifo>=0.3.0 35 | - pandas>2 36 | - scanpy>=1.8.1 37 | - leidenalg>=0.8.1 38 | - numpy>=1.18.1 39 | - more-itertools>=8.7.0 40 | - biopython>=1.78 41 | - scipy>=1.5.0 42 | - scikit-misc>=0.1.3 43 | - scikit-learn>=0.23.1 44 | - squidpy>=1.0.0 45 | - novosparc 46 | - opencv-python 47 | - jinja2>=3.1.3 48 | - matplotlib==3.8.4 49 | # - pytest-optional-tests 50 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = [ 3 | "setuptools>=42", 4 | "wheel" 5 | ] 6 | build-backend = "setuptools.build_meta" -------------------------------------------------------------------------------- /sequences/.gitignore: -------------------------------------------------------------------------------- 1 | primers.fa.nhr 2 | primers.fa.nin 3 | primers.fa.nog 4 | primers.fa.nsd 5 | primers.fa.nsi 6 | primers.fa.nsq 7 | -------------------------------------------------------------------------------- /sequences/primers.fa: -------------------------------------------------------------------------------- 1 | >dropseq_template_switch_oligo_tso 2 | AAGCAGTGGTATCAACGCAGAGTGAATG 3 | >second_strand_synthesis_oligo_dn_smrt 4 | AAGCAGTGGTATCAACGCAGAGTGANNNGGNNNB 5 | >smart_pcr_primer 6 | AAGCAGTGGTATCAACGCAGAGT 7 | >new_p5_smart_pcr_hybrid_oligo 8 | AATGATACGGCGACCACCGAGATCTACACGCCTGTCCGCGGAAGCAGTGGTATCAACGCAGAGT 9 | >nextera_n701_oligo 10 | CAAGCAGAAGACGGCATACGAGATTCGCCTTAGTCTCGTGGGCTCGG 11 | >next_tn5_rev_primer 12 | GTCTCGTGGGCTCGGAGAT 13 | >imaging_primer 14 | GAATCACGATACGTACACCA 15 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | name = spacemake 3 | version = attr: spacemake.contrib.__version__ 4 | author = Tamas Ryszard Sztanka-Toth, Marvin Jens, Nikos Karaiskos, Nikolaus Rajewsky 5 | author_email = TamasRyszard.Sztanka-Toth@mdc-berlin.de 6 | description = A bioinformatic pipeline for the analysis of spatial transcriptomic data 7 | long_description = file: README.md 8 | long_description_content_type = text/markdown 9 | url = 
https://github.com/rajewsky-lab/spacemake 10 | project_urls = 11 | Bug Tracker = https://github.com/rajewsky-lab/spacemake/issues 12 | classifiers = 13 | Programming Language :: Python :: 3 14 | License :: OSI Approved :: GNU General Public License v2 or later (GPLv2+) 15 | Operating System :: OS Independent 16 | license = GPL 17 | 18 | [options] 19 | zip_safe = False 20 | python_requires = >=3.8 21 | include_package_data = True 22 | package_dir = 23 | spacemake = spacemake 24 | packages = spacemake 25 | 26 | [options.package_data] 27 | spacemake = 28 | snakemake/*.smk 29 | snakemake/scripts/*.R 30 | snakemake/scripts/*.Rmd 31 | snakemake/scripts/*.py 32 | data/*.csv 33 | data/*.fa 34 | config/*.yaml 35 | longread/*.py 36 | 37 | [options.entry_points] 38 | console_scripts = 39 | alnstats = spacemake.alnstats:cmdline 40 | preprocess = spacemake.preprocess:cmdline 41 | spacemake = spacemake.cmdline:cmdline 42 | pb_annotate = spacemake.longread.cmdline:cmdline 43 | 44 | [tool:pytest] 45 | testpaths = tests 46 | markers = 47 | big_download: needs to download large-ish files 48 | addopts = --cov=spacemake --cov-report html 49 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | if __name__ == "__main__": 2 | from setuptools import setup 3 | 4 | setup() 5 | -------------------------------------------------------------------------------- /spacemake/.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | -------------------------------------------------------------------------------- /spacemake/__init__.py: -------------------------------------------------------------------------------- 1 | # __version__ = 1.0 2 | # import matplotlib._path 3 | # from . import preprocess as pp 4 | # from . import spatial as sp 5 | 6 | # from .smk import Spacemake 7 | -------------------------------------------------------------------------------- /spacemake/bin/BamTagHistogram.py: -------------------------------------------------------------------------------- 1 | import mrfifo as mf 2 | import logging 3 | 4 | 5 | def parse_args(): 6 | from spacemake.util import make_minimal_parser 7 | 8 | parser = make_minimal_parser("BamTagHistogram") 9 | 10 | parser.add_argument("--parallel", type=int, default=8) 11 | parser.add_argument("--input", default="/dev/stdin") 12 | parser.add_argument("--output", default="/dev/stdout") 13 | parser.add_argument( 14 | "--prefix-size", 15 | default=4, 16 | type=int, 17 | help=( 18 | "how many letters of the tag value are used to split the stream. " 19 | "default=4 allows for up to (alphabet_size)^4 distinct parallel workers. 
" 20 | "will be spread across workers by mod " 21 | ), 22 | ) 23 | parser.add_argument("--prefix-alphabet", default="ACGTN") 24 | parser.add_argument("--min-count", default=10, type=int) 25 | parser.add_argument( 26 | "--sort-mem", 27 | default=8, 28 | type=int, 29 | help="how many GB are allowed to be used for sorting (default=8)", 30 | ) 31 | parser.add_argument( 32 | "--tag", default="CB", help="which BAM tag to count (default='CB')" 33 | ) 34 | 35 | return parser.parse_args() 36 | 37 | 38 | def CB_distributor( 39 | input, outputs, tag="CB", prefix_size=3, prefix_alphabet="ACGTN", n=8, **kw 40 | ): 41 | "ensure that the FIFOs are not managed" 42 | assert type(input) is str 43 | logger = logging.getLogger("mrfifo.parts.CB_distributor") 44 | logger.debug( 45 | f"reading from {input}, writing to {outputs} " 46 | f"tag={tag} prefix_size={prefix_size} prefix_alphabet={prefix_alphabet} " 47 | f"kw={kw}" 48 | ) 49 | 50 | lkup = {} 51 | from itertools import product 52 | 53 | i = 0 54 | for letters in product(*([prefix_alphabet] * prefix_size)): 55 | prefix = "".join(letters).encode("ascii") 56 | lkup[prefix] = i % n 57 | i += 1 58 | 59 | # for k, v in sorted(lkup.items()): 60 | # print(f"{k}\t{v}") 61 | 62 | from mrfifo.fast_loops import distribute_by_substr 63 | 64 | tag_lead = b"\t" + tag.encode("ascii") + b":Z:" 65 | logger.debug( 66 | f"scanning for tag-lead {tag_lead} and using next {prefix_size} bytes as prefix" 67 | ) 68 | res = distribute_by_substr( 69 | fin_name=input, 70 | fifo_names=outputs, 71 | sub_lookup=lkup, 72 | sub_size=prefix_size, 73 | sub_lead=tag_lead, 74 | # **kw, 75 | ) 76 | logger.debug("distribution complete") 77 | return res 78 | 79 | 80 | def tag_counter(input, output, tag="CB", min_count=10): 81 | from collections import defaultdict 82 | 83 | counter = defaultdict(int) 84 | stats = defaultdict(int) 85 | import re 86 | 87 | pattern = re.compile(f"{tag}:Z:(\S+)") 88 | for sam_line in input: 89 | stats["n_records"] += 1 90 | flags = int(sam_line.split("\t")[1]) 91 | if flags & 256: 92 | # 'not primary alignment' bit is set 93 | stats["n_secondary"] += 1 94 | continue 95 | 96 | if m := re.search(pattern, sam_line): 97 | stats["n_tagged"] += 1 98 | tag_val = m.groups(0)[0] 99 | counter[tag_val] += 1 100 | 101 | stats["n_values"] = len(counter) 102 | for value, count in counter.items(): 103 | if count >= min_count: 104 | stats["n_above_cut"] += 1 105 | output.write(f"{count}\t{value}\n") 106 | 107 | return stats 108 | 109 | 110 | def sort_function(input, output, n=8, sort_mem_gigs=8, header=None): 111 | import os 112 | 113 | if header is None: 114 | header = rf"# INPUT={args.input} TAG={args.tag} FILTER_PCR_DUPLICATES=false READ_QUALITY=0\n" 115 | 116 | if output.endswith(".gz"): 117 | cmd = ( 118 | f'{{ printf "{header}"; sort -rnk 1 -S {sort_mem_gigs}G --parallel={n} {input}; }}' 119 | f"| python -m isal.igzip -c > {output}" 120 | ) 121 | else: 122 | cmd = f'{{ printf "{header}"; sort -rnk 1 -S {sort_mem_gigs}G --parallel={n} {input}; }} > {output}' 123 | 124 | import subprocess 125 | 126 | subprocess.call(cmd, shell=True) 127 | 128 | 129 | def main(args): 130 | w = ( 131 | mf.Workflow("BamTagHistogram", total_pipe_buffer_MB=4) 132 | .BAM_reader( 133 | input=args.input, 134 | mode="S", 135 | threads=4, 136 | ) 137 | .distribute( 138 | input=mf.FIFO("input_sam", "rt"), 139 | outputs=mf.FIFO("dist_{n}", "wt", n=args.parallel), 140 | func=CB_distributor, 141 | tag=args.tag, 142 | prefix_size=args.prefix_size, 143 | prefix_alphabet=args.prefix_alphabet, 144 | 
n=args.parallel, 145 | ) 146 | .workers( 147 | func=tag_counter, 148 | tag=args.tag, 149 | input=mf.FIFO("dist_{n}", "rt"), 150 | output=mf.FIFO("counts_{n}", "wt"), 151 | n=args.parallel, 152 | min_count=args.min_count, 153 | ) 154 | .collect( 155 | inputs=mf.FIFO("counts_{n}", "rt", n=args.parallel), 156 | output=mf.FIFO("unsorted", "wt"), 157 | chunk_size=1, 158 | ) 159 | .funnel( 160 | input=mf.FIFO("unsorted", "rt"), 161 | output=args.output, 162 | func=sort_function, 163 | _manage_fifos=False, 164 | ) 165 | .run() 166 | ) 167 | stats = mf.util.CountDict() 168 | for jobname, d in w.result_dict.items(): 169 | if "worker" in jobname: 170 | stats.add_other_stats(d) 171 | 172 | df = stats.get_stats_df() 173 | df["input"] = args.input 174 | print(df.set_index("input")) 175 | return w 176 | 177 | 178 | if __name__ == "__main__": 179 | args = parse_args() 180 | import spacemake.util as util 181 | 182 | util.setup_logging(args) 183 | main(args) 184 | -------------------------------------------------------------------------------- /spacemake/contrib.py: -------------------------------------------------------------------------------- 1 | import importlib.metadata 2 | __version__ = "0.8.1" 3 | __author__ = ["Nikos Karaiskos", "Tamas Ryszard Sztanka-Toth", 4 | "Marvin Jens", "Daniel Leon-Perinan"] 5 | __license__ = "GPL" 6 | __email__ = [ 7 | "nikolaos.karaiskos@mdc-berlin.de", 8 | "tamasryszard.sztanka-toth@mdc-berlin.de", 9 | "marvin.jens@charite.de", 10 | "daniel.leonperinan@mdc-berlin.de" 11 | ] 12 | 13 | author_contributions = """ 14 | Spacemake is built on snakemake scripts originally developed by Nikos Karaiskos 15 | for the analysis of dropseq data. These gradually evolved into a robust workflow for 16 | spatial transcriptomics data analysis that was improved and generalized to work 17 | with different ST technologies by Tamas Ryszard Sztanka-Toth. Marvin Jens contributed 18 | longread analysis code and support for converting fastq to BAM as a first step. 19 | Many features of the automated analysis and integration with Novosparc were added by 20 | Tamas, in close collaboration with Nikos, culminating in the first spacemake 21 | publication: 22 | 23 | https://doi.org/10.1093/gigascience/giac064 24 | 25 | Marvin then added new building blocks to successively replace the java-based 26 | dropseq tools with python/pysam based code: cutadapt_bam.py, annotator.py, as well 27 | as the ability to align raw reads to multiple indices, in close collaboration 28 | with Nikos & Tamas. 29 | 30 | Spacemake is actively maintained by Dani, Marvin and Nikos. 31 | """ 32 | 33 | roadmap = [ 34 | ("0.5.5", "universal ST support and utility, novosparc integration. Sztanka-Toth et al. 2022"), 35 | ("0.7", "support multiple mapping indices, bulk samples, custom user-defined snakemake rules"), 36 | ("1.x", "replace dropseq tools. 
Own annotator and towards entirely scanpy workflow"), 37 | ("1.x", "efficient handling of 1E8+ spatial barcodes (seq-scope etc.)"), 38 | ("1.x", "add interactive data exploration support (shiny?)"), 39 | ("2.x", "cmdline interface cleanup and remote API support"), 40 | ("2.x", "cython magic to speed up parallel BAM processing via shared memory"), 41 | ] 42 | -------------------------------------------------------------------------------- /spacemake/data/.gitignore: -------------------------------------------------------------------------------- 1 | !* 2 | -------------------------------------------------------------------------------- /spacemake/data/config/config.yaml: -------------------------------------------------------------------------------- 1 | puck_data: 2 | barcode_file: 'predictions_ml.csv' 3 | root: 'puck_data' 4 | 5 | pucks: 6 | default: 7 | width_um: 3000 8 | spot_diameter_um: 10 9 | coordinate_system: '' 10 | visium: 11 | barcodes: 'puck_data/visium_barcode_positions.csv' 12 | width_um: 6500 13 | spot_diameter_um: 55 14 | seq_scope: 15 | width_um: 1000 16 | spot_diameter_um: 1 17 | slide_seq: 18 | width_um: 3000 19 | spot_diameter_um: 10 20 | openst: 21 | width_um: 1200 22 | spot_diameter_um: 0.6 23 | coordinate_system: 'puck_data/openst_coordinate_system.csv' 24 | 25 | run_modes: 26 | default: 27 | n_beads: 100000 28 | umi_cutoff: [100, 300, 500] 29 | clean_dge: False 30 | detect_tissue: False 31 | polyA_adapter_trimming: True 32 | count_intronic_reads: True 33 | count_mm_reads: False 34 | mesh_data: False 35 | mesh_type: 'circle' 36 | mesh_spot_diameter_um: 55 37 | mesh_spot_distance_um: 100 38 | spatial_barcode_min_matches: 0 39 | visium: 40 | n_beads: 10000 41 | umi_cutoff: [1000] 42 | clean_dge: False 43 | detect_tissue: True 44 | count_intronic_reads: False 45 | count_mm_reads: True 46 | slide_seq: 47 | n_beads: 100000 48 | umi_cutoff: [50] 49 | clean_dge: False 50 | detect_tissue: False 51 | scRNA_seq: 52 | n_beads: 10000 53 | umi_cutoff: [500] 54 | detect_tissue: False 55 | count_intronic_reads: True 56 | count_mm_reads: False 57 | seq_scope: 58 | clean_dge: false 59 | count_intronic_reads: false 60 | count_mm_reads: false 61 | detect_tissue: false 62 | mesh_data: true 63 | mesh_spot_diameter_um: 10 64 | mesh_spot_distance_um: 15 65 | mesh_type: hexagon 66 | n_beads: 1000 67 | umi_cutoff: 68 | - 100 69 | - 300 70 | openst: 71 | clean_dge: false 72 | count_intronic_reads: true 73 | count_mm_reads: true 74 | detect_tissue: false 75 | mesh_data: true 76 | mesh_spot_diameter_um: 7 77 | mesh_spot_distance_um: 7 78 | mesh_type: hexagon 79 | n_beads: 100000 80 | polyA_adapter_trimming: true 81 | spatial_barcode_min_matches: 0.1 82 | umi_cutoff: 83 | - 100 84 | - 250 85 | - 500 86 | 87 | 88 | barcode_flavors: 89 | default: 90 | cell: "r1[0:12]" 91 | UMI: "r1[12:20]" 92 | #bam_tags: "CR:{cell},CB:{cell},MI:{UMI},RG:{assigned}" 93 | dropseq: 94 | cell: "r1[0:12]" 95 | UMI: "r1[12:20]" 96 | #bam_tags: "CR:{cell},CB:{cell},MI:{UMI},RG:{assigned}" 97 | slide_seq_14bc: 98 | cell: "r1[0:14]" 99 | UMI: "r1[14:23]" 100 | #bam_tags: "CR:{cell},CB:{cell},MI:{UMI},RG:{assigned}" 101 | slide_seq_15bc: 102 | cell: "r1[0:14]" 103 | UMI: "r1[15:23]" 104 | #bam_tags: "CR:{cell},CB:{cell},MI:{UMI},RG:{assigned}" 105 | visium: 106 | cell: "r1[0:16]" 107 | UMI: "r1[16:28]" 108 | #bam_tags: "CR:{cell},CB:{cell},MI:{UMI},RG:{assigned}" 109 | sc_10x_v2: 110 | cell: "r1[0:16]" 111 | UMI: "r1[16:26]" 112 | #bam_tags: "CR:{cell},CB:{cell},MI:{UMI},RG:{assigned}" 113 | seq_scope: 114 | UMI: "r2[0:9]" 
115 | #bam_tags: "CR:{cell},CB:{cell},MI:{UMI},RG:{assigned}" 116 | cell: "r1[0:20]" 117 | openst: 118 | UMI: "r2[0:9]" 119 | #bam_tags: "CR:{cell},CB:{cell},MI:{UMI},RG:{assigned}" 120 | cell: "r1[2:27]" 121 | 122 | adapter_flavors: 123 | SMARTER: 124 | cut_right: 125 | - Q: 126 | min_base_quality: 30 127 | - polyA: 128 | max_error: 0.25 129 | min_overlap: 3 130 | paired_end: replace_N 131 | chromium: 132 | cut_right: 133 | - Q: 134 | min_base_quality: 32 135 | - polyA: 136 | max_error: 0.25 137 | min_overlap: 3 138 | - polyG: 139 | max_error: 0.1 140 | min_overlap: 3 141 | default: 142 | cut_left: 143 | - TSO_SMART: 144 | max_error: 0.1 145 | min_overlap: 10 146 | cut_right: 147 | - Q: 148 | min_base_quality: 30 149 | - polyA: 150 | max_error: 0.25 151 | min_overlap: 3 152 | - polyG: 153 | max_error: 0.1 154 | min_overlap: 3 155 | paired_end: single-end 156 | dropseq: 157 | cut_left: 158 | - TSO_SMART: 159 | max_errors: 0.1 160 | min_overlap: 10 161 | cut_right: 162 | - Q: 163 | min_base_quality: 30 164 | - polyA: 165 | max_errors: 0.25 166 | min_overlap: 3 167 | - polyG: 168 | max_errors: 0.1 169 | min_overlap: 3 170 | paired_end: single-end 171 | fc_SMART_UMI_RPE: 172 | cut_left: 173 | - TSO_SMART: 174 | max_errors: 0.1 175 | min_overlap: 10 176 | cut_right: 177 | - Q: 178 | min_base_quality: 32 179 | - polyG: 180 | max_errors: 0.25 181 | min_overlap: 3 182 | - Q: 183 | min_base_quality: 32 184 | - polyA: 185 | max_errors: 0.25 186 | min_overlap: 3 187 | 188 | adapters: 189 | smart: 'AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTC' 190 | optical_primer: 'GAATCACGATACGTACACCA' 191 | TSO_SMART: AAGCAGTGGTATCAACGCAGAGTGAATGGG 192 | SMART: AAGCAGTGGTATCAACGCAGAGTG 193 | TSO_10x: AAGCAGTGGTATCAACGCAGAGTACATGGG 194 | chromium_bead: CTACACGACGCTCTTCCGATCT 195 | dropseq_bead: AAGCAGTGGTATCAACGCAGAGTAC 196 | polyA: AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA 197 | polyG: GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG 198 | nextflex_RA3: TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACTGACCAATCTCGTATGCCGTCTTCTGCTTGAA 199 | truseq_RA3: TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACCCGTCCA 200 | -------------------------------------------------------------------------------- /spacemake/data/config/longread.yaml: -------------------------------------------------------------------------------- 1 | blocks: 2 | P5: AATGATACGGCGACCACCGAGATCTACACGCCTGTCCGCGG 3 | N70X: CTGTCTCTTATACACATCTCCGAGCCCACGAGACNNNNNNNNATCTCGTATGCCGTCTTCTGCTTG 4 | SMART_primer: AAGCAGTGGTATCAACGCAGAGT 5 | SMART_bead: AAGCAGTGGTATCAACGCAGAGTAC 6 | dN-SMRT: AAGCAGTGGTATCAACGCAGAGTGA 7 | TSO: AAGCAGTGGTATCAACGCAGAGTGAATGGG 8 | sc_primer: CTCGGAGATGTGTATAAGAGACAGTATGGG 9 | # random_primer: GTCTCGTGGGCTCGGAGATGTGTATAAGAGACAG 10 | OP1: GAATCACGATACGTACACCA 11 | OP2_2s: GCGTTGCGTTCCTAGCCGCTAC 12 | # OP3: CGCAGTCTCCGTCGATAAGGTC 13 | OP2: GCGTGTGGTCGGACGCACCCAC 14 | OP3: GCAAAGCTGCTGCCTCCGCTAGC 15 | polyT: TTTTTTTTTTTTTTTTTTTTTTTTTTTTTT 16 | #dN-NEXT_Tn5: GTCTCGTGGGCTCGGAGATGTGTATAAGAGACAG 17 | #NEXT_Tn5_Rev_Primer: GTCTCGTGGGCTCGGAGAT 18 | #Nextera_TN5: CCGAGCCCACGAGACTAAGGCGAATCTCGTATGCCGTCTTCTGCTTG 19 | 10X_start: CTACACGACGCTCTTCCGATCT 20 | 10X_TSO: AAGCAGTGGTATCAACGCAGAGTACATGGG 21 | 10X_C3_RT_PRIMER: AAGCAGTGGTATCAACGCAGAG 22 | 10X_C2_RT_PRIMER: AAGCAGTGGTATCAACGCAGAGTACAT 23 | 24 | signatures: 25 | visium: 26 | label: visium 27 | prio: 0 28 | color: pink 29 | intact: 10X_start,polyT,10X_TSO_RC 30 | other: 10X_C3_RT_PRIMER,10X_C2_RT_PRIMER 31 | prefixes: P5 32 | suffixes: N70X 33 
| CB: r1[0:16] 34 | UMI: r1[16:28] 35 | cDNA_after: polyT 36 | read1_primer: 10X_start 37 | read2_primer: 10X_TSO 38 | 39 | chromium: 40 | label: chromium 41 | prio: 1 42 | color: gray 43 | CB: r1[0:16] 44 | UMI: r1[16:26] 45 | intact: 10X_start,polyT,10X_TSO_RC 46 | other: 10X_C3_RT_PRIMER,10X_C2_RT_PRIMER 47 | cDNA_after: polyT 48 | prefixes: P5 49 | suffixes: N70X 50 | read1_primer: 10X_start 51 | read2_primer: 10X_TSO 52 | 53 | dropseq: 54 | label: dropseq 55 | prio: 2 56 | color: gray 57 | CB: r1[8:20] 58 | UMI: r1[0:8] 59 | intact: SMART_bead,polyT 60 | cDNA_after: polyT 61 | other: SMART_primer,dN-SMRT,TSO,sc_primer 62 | prefixes: P5 63 | suffixes: N70X 64 | read1_primer: SMART_bead 65 | read2_primer: N70X 66 | 67 | # in-house experimental 68 | withUMI: 69 | label: withUMI 70 | prio: 3 71 | color: blue 72 | CB: r1[0:12] 73 | UMI: r1[12:20] 74 | intact: SMART_bead,OP1,polyT 75 | other: SMART_primer,dN-SMRT,TSO,sc_primer 76 | prefixes: P5 77 | suffixes: N70X 78 | cDNA_after: polyT 79 | read1_primer: SMART_bead 80 | read2_primer: N70X 81 | 82 | noUMI: 83 | label: noUMI 84 | prio: 10 85 | color: lightblue 86 | CB: r1[0:12] 87 | UMI: r2[0:8] 88 | intact: SMART_bead,OP1,polyT 89 | other: SMART_primer,dN-SMRT,TSO,sc_primer 90 | prefixes: P5 91 | suffixes: N70X 92 | cDNA_after: polyT 93 | read1_primer: SMART_bead 94 | read2_primer: N70X 95 | 96 | combv1: 97 | label: comb_2seg_pilot 98 | prio: 20 99 | color: blue 100 | CB: r1[0:12] 101 | UMI: r2[0:8] 102 | intact_bead: SMART_bead,OP1,OP2_2s,polyT 103 | other: SMART_primer,dN-SMRT,TSO,sc_primer,OP3 104 | cDNA_after: polyT 105 | read1_primer: SMART_bead 106 | read2_primer: N70X 107 | 108 | hybridv1: 109 | label: comb_hybrid 110 | prio: 30 111 | color: lightblue 112 | CB: r1[0:8]+r1[31:39] 113 | UMI: r2[0:8] 114 | intact: SMART_bead,OP2,OP3,polyT 115 | other: SMART_primer,dN-SMRT,TSO,sc_primer,OP3 116 | prefixes: P5 117 | suffixes: N70X 118 | cDNA_after: polyT 119 | read1_primer: SMART_bead 120 | read2_primer: N70X 121 | 122 | scsmrna: 123 | label: smallRNA 124 | prio: 40 125 | color: red 126 | CB: r1[0:12] 127 | UMI: r1[12:20] 128 | intact: SMART_bead,polyT,sc_primer_RC 129 | other: SMART_primer,dN-SMRT,TSO,sc_primer,OP1 130 | prefixes: P5 131 | suffixes: N70X 132 | cDNA_after: polyT 133 | read1_primer: SMART_bead 134 | read2_primer: TSO 135 | 136 | default: withUMI -------------------------------------------------------------------------------- /spacemake/data/config/species_data_url.yaml: -------------------------------------------------------------------------------- 1 | mouse: 2 | annotation: 'http://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_mouse/release_M27/gencode.vM27.primary_assembly.annotation.gtf.gz' 3 | genome: 'http://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_mouse/release_M27/GRCm39.primary_assembly.genome.fa.gz' 4 | human: 5 | annotation: 'http://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_38/gencode.v38.primary_assembly.annotation.gtf.gz' 6 | genome: 'http://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_38/GRCh38.primary_assembly.genome.fa.gz' 7 | -------------------------------------------------------------------------------- /spacemake/data/puck_collection/create_novaseq_S4_coordinate_system.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import pandas as pd 3 | 4 | """ 5 | Global Coordinate System Generator for NovaSeq S4 Flow Cell 6 | 7 | This Python script is designed to create a global coordinate system 8 | for a NovaSeq S4 flow 
cell. 9 | 10 | It generates a DataFrame with puck names and their corresponding global 11 | (x, y, z) coordinates and saves it to a CSV file. 12 | 13 | Usage: 14 | python create_novaseq_S4_coordinate_system.py --output OUTPUT [options] 15 | 16 | Example: 17 | python create_novaseq_S4_coordinate_system.py \ 18 | --output output.csv \ 19 | --format-string fc_1_L{lane}{side_letter}_tile_{side_number}{column}{row:02d} \ 20 | --x-offset 33739 \ 21 | --y-offset 36282 \ 22 | --swath-offset-odd 0 \ 23 | --swath-offset-even 6201 \ 24 | --rows 78 \ 25 | --columns 6 \ 26 | --n_lanes 4 \ 27 | --zero-coded 28 | 29 | Author: 30 | Daniel León-Periñán 31 | """ 32 | 33 | def setup_parser(parser): 34 | parser.add_argument( 35 | "--output", 36 | type=str, 37 | help="where to store the output file with puck names and global (x,y,z) coordinates", 38 | required=True, 39 | ) 40 | 41 | parser.add_argument( 42 | "--format-string", 43 | type=str, 44 | help="this is the format for puck names. There are 5 attributes that can be chosen:" 45 | + "{lane} (int), {column} (int), {row} (int), {side_letter} (str), {side_number} (int).\n" 46 | + "For instance, a valid string format would be: \n" 47 | + "fc_1_L{lane}{side_letter}_tile_{side_number}{column}{row:02d}\n" 48 | + "This name must be used, as is, when creating a new sample in spacemake.", 49 | default="L{lane}{side_letter}_tile_{side_number}{column}{row:02d}", 50 | ) 51 | 52 | parser.add_argument( 53 | "--x-offset", 54 | type=int, 55 | help="the offset in the x axis. Units are important during puck collection generation.", 56 | default=33739, 57 | ) 58 | 59 | parser.add_argument( 60 | "--y-offset", 61 | type=int, 62 | help="the offset in the y axis. Units are important during puck collection generation.", 63 | default=36282, 64 | ) 65 | 66 | parser.add_argument( 67 | "--swath-offset-odd", 68 | type=int, 69 | help="the swath offset for odd columns", 70 | default=0, 71 | ) 72 | 73 | parser.add_argument( 74 | "--swath-offset-even", 75 | type=int, 76 | help="the swath offset for even columns", 77 | default=6201, 78 | ) 79 | 80 | parser.add_argument( 81 | "--rows", 82 | type=int, 83 | help="number of rows", 84 | default=78, 85 | ) 86 | 87 | parser.add_argument( 88 | "--columns", 89 | type=int, 90 | help="number of columns", 91 | default=6, 92 | ) 93 | 94 | parser.add_argument( 95 | "--n_lanes", 96 | type=int, 97 | help="number of lanes", 98 | default=4, 99 | ) 100 | 101 | parser.add_argument( 102 | "--zero-coded", 103 | default=False, 104 | action="store_true", 105 | help="whether row and column indices should start at 0, instead of 1", 106 | ) 107 | 108 | return parser 109 | 110 | 111 | def create_coordinate_system( 112 | n_lanes: int, 113 | n_cols: int, 114 | n_rows: int, 115 | x_offset: int, 116 | y_offset: int, 117 | swath_offsets_odd: int, 118 | swath_offsets_even: int, 119 | zero_coded: bool, 120 | format_string: str, 121 | ) -> pd.DataFrame: 122 | """ 123 | Create a global coordinate system for a NovaSeq S4 flow cell. 124 | 125 | :param n_lanes: Number of lanes in the flow cell. 126 | :type n_lanes: int 127 | :param n_cols: Number of columns in the flow cell. 128 | :type n_cols: int 129 | :param n_rows: Number of rows in the flow cell. 130 | :type n_rows: int 131 | :param x_offset: Offset in the x-axis for coordinate calculations. 132 | :type x_offset: int 133 | :param y_offset: Offset in the y-axis for coordinate calculations. 134 | :type y_offset: int 135 | :param swath_offsets_odd: Swath offset for odd columns. 
136 | :type swath_offsets_odd: int 137 | :param swath_offsets_even: Swath offset for even columns. 138 | :type swath_offsets_even: int 139 | :param zero_coded: Whether row and column indices should start at 0, instead of 1. 140 | :type zero_coded: bool 141 | :param format_string:The format for puck names. 142 | :type format_string: str 143 | :returns: DataFrame with puck names and their corresponding global coordinates. 144 | :rtype: pd.DataFrame 145 | """ 146 | 147 | one_coded_offset = 0 if zero_coded else 1 148 | swath_offsets = [swath_offsets_even, swath_offsets_odd] 149 | sides_letter = {1: "a", 2: "b"} 150 | l = [] 151 | for lane in range(one_coded_offset, n_lanes + one_coded_offset): 152 | for side in [1, 2]: 153 | for col in range(n_cols + one_coded_offset): 154 | for row in range(one_coded_offset, n_rows + one_coded_offset): 155 | puck_id = format_string.format( 156 | lane=lane, 157 | side_letter=sides_letter[side], 158 | side_number=side, 159 | column=col, 160 | row=row, 161 | ) 162 | 163 | x_ofs = int(col) * x_offset 164 | 165 | swath_offset = swath_offsets[int(col) % 2] 166 | swath_offset = -swath_offset if side == 1 else swath_offset 167 | 168 | y_ofs = int(row) * y_offset + swath_offset 169 | 170 | z_ofs = 0 171 | 172 | l.append( 173 | pd.DataFrame( 174 | { 175 | "puck_id": [puck_id], 176 | "x_offset": [x_ofs], 177 | "y_offset": [y_ofs], 178 | "z_offset": [z_ofs], 179 | } 180 | ) 181 | ) 182 | 183 | puck_names_coords = pd.concat(l) 184 | 185 | return puck_names_coords 186 | 187 | 188 | def cmdline(): 189 | """cmdline.""" 190 | parser = argparse.ArgumentParser( 191 | allow_abbrev=False, 192 | description="Global Coordinate System Generator for NovaSeq S4 Flow Cell", 193 | ) 194 | parser = setup_parser(parser) 195 | args = parser.parse_args() 196 | 197 | puck_names_coords = create_coordinate_system( 198 | n_lanes=args.n_lanes, 199 | n_cols=args.columns, 200 | n_rows=args.rows, 201 | x_offset=args.x_offset, 202 | y_offset=args.y_offset, 203 | swath_offsets_odd=args.swath_offset_odd, 204 | swath_offsets_even=args.swath_offset_even, 205 | zero_coded=args.zero_coded, 206 | format_string=args.format_string, 207 | ) 208 | 209 | puck_names_coords.to_csv(args.output, index=False) 210 | 211 | 212 | if __name__ == "__main__": 213 | cmdline() 214 | -------------------------------------------------------------------------------- /spacemake/data/test/visium_public_lane_joined_1m_R1.fastq.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rajewsky-lab/spacemake/274cd58e7ba9de244aca25c6bbd7cdfe49683753/spacemake/data/test/visium_public_lane_joined_1m_R1.fastq.gz -------------------------------------------------------------------------------- /spacemake/data/test/visium_public_lane_joined_1m_R2.fastq.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rajewsky-lab/spacemake/274cd58e7ba9de244aca25c6bbd7cdfe49683753/spacemake/data/test/visium_public_lane_joined_1m_R2.fastq.gz -------------------------------------------------------------------------------- /spacemake/errors.py: -------------------------------------------------------------------------------- 1 | class SpacemakeError(Exception): 2 | def __init__(self, msg=None): 3 | self.msg = msg 4 | 5 | def __str__(self): 6 | msg = 'ERROR: ' + str(self.__class__.__name__) + '\n' 7 | 8 | if hasattr(self, 'msg') and self.msg is not None: 9 | msg += self.msg 10 | 11 | return msg 12 | 13 | class 
FileWrongExtensionError(SpacemakeError): 14 | def __init__(self, filename, expected_extension): 15 | self.filename = filename 16 | self.expected_extension = expected_extension 17 | 18 | def __str__(self): 19 | msg = super().__str__() 20 | msg += f'File {self.filename} has wrong extension.\n' 21 | msg += f'The extension should be {self.expected_extension}.\n' 22 | 23 | return msg 24 | 25 | class ConfigVariableError(SpacemakeError): 26 | def __init__(self, variable_name, variable_value): 27 | self.variable_name = variable_name 28 | self.variable_value = variable_value 29 | 30 | class UnrecognisedConfigVariable(SpacemakeError): 31 | def __init__(self, variable_name, variable_options): 32 | self.variable_name = variable_name 33 | self.variable_options = variable_options 34 | 35 | def __str__(self): 36 | msg = super().__str__() 37 | msg += f'unrecognised variable {self.variable_name}\n' 38 | msg += f'it has to be one of {self.variable_options}.' 39 | 40 | return msg 41 | 42 | class EmptyConfigVariableError(SpacemakeError): 43 | def __init__(self, variable_name): 44 | self.variable_name = variable_name 45 | 46 | def __str__(self): 47 | msg = super().__str__() 48 | msg += f'cannot remove {self.variable_name}, or set it to an empty list or None\n' 49 | msg += 'this ERROR could happen in two cases: \n' 50 | msg += f'1) you tried to remove a {self.variable_name}, ' 51 | msg += f'and as a result the sample would not have' 52 | msg += f' any {self.variable_name} available.\n' 53 | msg += f'2) you tried to remove the `default` value of' 54 | msg += f' {self.variable_name} from the configuration.\n' 55 | 56 | return msg 57 | 58 | class ConfigVariableNotFoundError(ConfigVariableError): 59 | def __str__(self): 60 | msg = super().__str__() 61 | msg += f'{self.variable_name}: {self.variable_value} not found.\n' 62 | msg += f'you can add a new {self.variable_name} using the ' 63 | msg += f'`spacemake config add_{self.variable_name}` command.\n' 64 | 65 | return msg 66 | 67 | class ConfigVariableIncompleteError(ConfigVariableError): 68 | def __init__(self, missing_key, **kwargs): 69 | super().__init__(**kwargs) 70 | self.missing_key = missing_key 71 | 72 | def __str__(self): 73 | msg = super().__str__() 74 | msg += f'{self.variable_name}: {self.variable_value} ' 75 | msg += f'is missing required key {self.missing_key}.\n' 76 | msg += f'You can update this key of {self.variable_value} using the ' 77 | msg += f'`spacemake config update_{self.variable_name}` command.\n' 78 | 79 | return msg 80 | 81 | class InvalidBarcodeStructureError(SpacemakeError): 82 | def __init__(self, tag_name, to_match): 83 | self.tag_name = tag_name 84 | self.to_match = to_match 85 | 86 | def __str__(self): 87 | msg = super().__str__() 88 | msg += f'{self.tag_name} does not match {self.to_match}.\n' 89 | msg += f'Example matching would be: r1[0:12] for the first 12nt of Read1 ' 90 | msg += f'for {self.tag_name}\n' 91 | return msg 92 | 93 | class DuplicateConfigVariableError(ConfigVariableError): 94 | def __str__(self): 95 | msg = super().__str__() 96 | msg += f'{self.variable_name}: {self.variable_value} already exists.\n' 97 | msg += f'To update it use `spacemake config update_{self.variable_name}`,\n' 98 | msg += f'To delete it use `spacemake config delete_{self.variable_name}`.\n' 99 | 100 | return msg 101 | 102 | class NoProjectSampleProvidedError(SpacemakeError): 103 | def __init__(self): 104 | pass 105 | 106 | def __str__(self): 107 | msg = super().__str__() 108 | msg += f'no projects or samples were provided.\n' 109 | 110 | return msg 111 | 112 | 
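# Usage sketch (an added, illustrative comment; not part of the original module): every class in this file derives from SpacemakeError, so CLI entry points can catch the base class and rely on __str__ for the user-facing message. The try/except placement is hypothetical: # try: # raise ConfigVariableNotFoundError('run_mode', 'my_mode') # except SpacemakeError as e: # print(e) # prints 'ERROR: ConfigVariableNotFoundError' followed by the add_run_mode hint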
class ProjectSampleNotFoundError(SpacemakeError): 113 | def __init__(self, var_name, var_value): 114 | self.var_name = var_name 115 | self.var_value = var_value 116 | 117 | def __str__(self): 118 | msg = super().__str__() 119 | msg += f'sample with {self.var_name}={self.var_value} not found.\n' 120 | msg += 'you can add a new sample with the `spacemake projects add_sample` command.\n' 121 | 122 | return msg 123 | 124 | class SampleAlreadyExistsError(SpacemakeError): 125 | def __init__(self, ix): 126 | self.ix = ix 127 | 128 | def __str__(self): 129 | msg = super().__str__() 130 | msg += f'sample with (project_id, sample_id)={self.ix} already exists.\n' 131 | msg += 'in order to update this sample use `spacemake projects update_sample`,\n' 132 | msg += 'to delete it use `spacemake projects delete_sample`.\n' 133 | 134 | return msg 135 | 136 | class InconsistentVariablesDuringMerge(ConfigVariableError): 137 | def __init__(self, ix, **kwargs): 138 | super().__init__(**kwargs) 139 | self.ix = ix 140 | 141 | def __str__(self): 142 | msg = super().__str__() 143 | msg += f'\nthe samples that you are trying to merge have different ' 144 | msg += f'{self.variable_name} values.\n\ninconsistent values:' 145 | msg += f' {self.variable_value}\n' 146 | msg += f'samples: {self.ix}.\n\n' 147 | msg += 'You can only merge samples which have the same ' 148 | msg += f'{self.variable_name}, or if there is an overlap.\n' 149 | 150 | return msg 151 | -------------------------------------------------------------------------------- /spacemake/longread/__main__.py: -------------------------------------------------------------------------------- 1 | if __name__ == "__main__": 2 | from spacemake.longread.cmdline import cmdline 3 | 4 | cmdline() 5 | -------------------------------------------------------------------------------- /spacemake/longread/overview.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pandas as pd 3 | import numpy as np 4 | from glob import glob 5 | from matplotlib.backends.backend_pdf import PdfPages 6 | import matplotlib.pyplot as plt 7 | 8 | 9 | def main(args): 10 | dfs = [] 11 | for fname in list(args.fnames) + list(glob(args.glob_pattern)): 12 | print(f"loading {fname}") 13 | df = pd.read_csv(fname, sep='\t') 14 | df['stats_file'] = fname 15 | dfs.append(df) 16 | 17 | df = pd.concat(dfs) 18 | 19 | repriming = ['TSO,TSO_RC', 'dN-SMRT,dN-SMRT_RC', ] 20 | concatenation = [c for c in df.columns if c.endswith('+') and ',' not in c] 21 | bead = ["bead_complete", "bead_only_handle", "bead_no_dT", "bead_no_opseq"][::-1] 22 | 23 | # avoid crash if columns are missing 24 | for r in repriming + concatenation + bead: 25 | if r not in df.columns: 26 | df[r] = 0 27 | 28 | # print(df) 29 | # print(f"concat columns {concatenation}") 30 | # print(f"bead columns {bead}") 31 | df['reprimed'] = df[repriming].sum(axis=1) 32 | df['bead_complete'] = np.nan_to_num(df['bead_complete'], nan=0.0) 33 | df['concat'] = df[concatenation].sum(axis=1) 34 | df['bead_related'] = np.nan_to_num(df[bead].sum(axis=1), nan=0.0) 35 | df['bead_dropseq'] = np.nan_to_num(df['bead_no_opseq'], nan=0.0) 36 | df['bead_incomplete'] = df['bead_related'] - df['bead_complete'] - df['bead_dropseq'] 37 | df['non_bead'] = 100 - df['bead_related'] 38 | df['bead_fidelity'] = 100 * df['bead_complete'] / df['bead_related'] 39 | df = df.fillna(0) 40 | # print(df) 41 | if args.csv_out: 42 | df.to_csv(args.csv_out, float_format='%.2f', sep='\t', index=False) 43 | 44 | def clean(txt): 45 | txt = 
os.path.basename(txt) 46 | t = txt\ 47 | .replace('source/','') \ 48 | .replace('sts_', '') \ 49 | .replace('pb_', '') \ 50 | .replace('ds_', '') \ 51 | .replace('.fq', '') \ 52 | .replace('.bam', '') \ 53 | .replace('lima.', '') 54 | 55 | if t.count('_') > 1: 56 | t = "_".join(t.split('_')[:2]) 57 | 58 | return t 59 | 60 | df['name'] = df['qfa'].apply(clean) 61 | # df = df.sort_values('bead_related') 62 | df = df.sort_values('name') 63 | 64 | def guess_rRNA_file(path): 65 | # print("guessrRNA raw path", path) 66 | name = os.path.basename(path).replace('.summary', '.rRNA') 67 | 68 | if args.rRNA_same_place: 69 | place = os.path.dirname(path) 70 | else: 71 | place = args.rRNA 72 | 73 | return [ 74 | os.path.join(place, name.replace(".fq", ".txt")), 75 | os.path.join(place, name.replace(".fq", ".txt")).replace('.rRNA.tsv', '.txt'), 76 | os.path.join(place, name.replace(".fq", ".txt")).replace('.rRNA.tsv', '.rRNA.txt'), 77 | os.path.join(place, name.replace(".bam", ".txt").replace("lima.", "")), 78 | os.path.join(place, name.replace(".bam", ".txt").replace("lima.", "")).replace('.rRNA.tsv', '.txt'), 79 | os.path.join(place, name.replace(".bam", ".txt").replace("lima.", "")).replace('.rRNA.tsv', '.rRNA.txt'), 80 | ] 81 | 82 | rRNA_fracs = [] 83 | for row in df[['stats_file', 'N_reads']].itertuples(): 84 | rcount = np.nan 85 | for fname in guess_rRNA_file(row.stats_file): 86 | print(fname) 87 | try: 88 | rcount = int(open(fname).read()) 89 | except (FileNotFoundError, ValueError): 90 | pass 91 | else: 92 | break 93 | if np.isnan(rcount): 94 | raise ValueError(f"no rRNA count file found for {row.stats_file}") 95 | 96 | rRNA_fracs.append(100. * rcount / row.N_reads) 97 | 98 | df['rRNA'] = rRNA_fracs 99 | # print(df[['qfa', 'rRNA']]) 100 | 101 | def make_bars(ax, df, kinds, labels, cmap=plt.get_cmap('tab10'), w=0.9, colors=None): 102 | n = len(kinds) 103 | if colors is None: 104 | colors = cmap(np.linspace(0, 1, n)) 105 | 106 | x = np.arange(len(df)) - w/2.0 107 | y0 = np.zeros(len(x), dtype=float) 108 | for kind, label, color in zip(kinds, labels, colors): 109 | y = np.nan_to_num(df[kind], nan=0.0) 110 | # print(kind) 111 | # print(y) 112 | ax.bar(x, y, bottom=y0, label=label, width=w, color=color) 113 | y0 += y 114 | 115 | ax.set_ylabel('fraction of library') 116 | ax.set_xticks(x) 117 | labels = df['name'] # [clean(fq) for fq in df['qfa']] 118 | ax.set_xticklabels(labels, rotation=90) 119 | ax.set_ylim(0, 100) 120 | 121 | marie = ["non_bead", "bead_incomplete", "bead_dropseq", "bead_complete", ] 122 | marie_colors = ["gray", "royalblue", "green", "gold"] 123 | 124 | w = max(8 / 25. 
* len(df), 3) 125 | if args.multi_page: 126 | pdf = PdfPages(args.breakdown) 127 | fig, ax1 = plt.subplots(1, figsize=(w, 4)) 128 | else: 129 | fig, (ax1, ax2) = plt.subplots(2, figsize=(w, 6), sharex=True) 130 | 131 | make_bars(ax1, df, marie, labels=[b.replace('bead_', '') for b in marie], colors=marie_colors) 132 | ax1.legend(title='Marie-stats', ncol=len(marie)) 133 | if args.multi_page: 134 | fig.tight_layout() 135 | pdf.savefig() 136 | plt.close() 137 | fig, ax2 = plt.subplots(1, figsize=(w, 4)) 138 | 139 | make_bars(ax2, df, ["bead_fidelity"], labels=["bead fidelity"]) 140 | ax2.set_ylabel("bead fidelity") 141 | if args.multi_page: 142 | fig.tight_layout() 143 | pdf.savefig() 144 | pdf.close() 145 | else: 146 | fig.tight_layout() 147 | plt.savefig(args.breakdown) 148 | 149 | plt.close() 150 | 151 | if args.multi_page: 152 | pdf = PdfPages(args.output) 153 | fig, ax1 = plt.subplots(1, figsize=(w, 4)) 154 | else: 155 | fig, (ax1, ax2, ax3, ax4) = plt.subplots(4, figsize=(w, 12), sharex=True) 156 | 157 | # print("bead related", bead) 158 | make_bars(ax1, df, bead, labels=[b.replace('bead_', '') for b in bead]) 159 | ax1.legend(title='bead-related', ncol=len(bead)) 160 | if args.multi_page: 161 | fig.tight_layout() 162 | pdf.savefig() 163 | plt.close() 164 | fig, ax2 = plt.subplots(1, figsize=(w, 4)) 165 | 166 | # print("repriming events", repriming) 167 | make_bars(ax2, df, repriming, labels=[r.split(',')[0] for r in repriming], cmap=plt.get_cmap('tab20c')) 168 | ax2.legend(title='repriming', ncol=len(repriming)) 169 | if args.multi_page: 170 | fig.tight_layout() 171 | pdf.savefig() 172 | plt.close() 173 | fig, ax3 = plt.subplots(1, figsize=(w, 4)) 174 | 175 | # print("concat events", concatenation) 176 | make_bars(ax3, df, concatenation, labels=concatenation, cmap=plt.get_cmap('tab20b')) 177 | ax3.legend(title='concatamers', ncol=len(concatenation)) 178 | if args.multi_page: 179 | fig.tight_layout() 180 | pdf.savefig() 181 | plt.close() 182 | fig, ax4 = plt.subplots(1, figsize=(w, 4)) 183 | 184 | make_bars(ax4, df, ["rRNA",], labels = ["rRNA"], cmap=plt.get_cmap('tab20c')) 185 | ax4.legend(title='human rRNA', ncol=1) 186 | if args.multi_page: 187 | fig.tight_layout() 188 | pdf.savefig() 189 | pdf.close() 190 | else: 191 | fig.tight_layout() 192 | plt.savefig(args.output) 193 | 194 | plt.close() 195 | 196 | 197 | def setup_parser(parser): 198 | parser.add_argument("fnames", nargs='*') 199 | parser.add_argument("--output", default="pb_overview.pdf", 200 | help="path/name of detailed report PDF") 201 | parser.add_argument("--csv-out", default="all_pb_stats.csv", 202 | help="path/name of detailed report PDF") 203 | parser.add_argument("--breakdown", default="bead_overview.pdf", 204 | help="path/name of bead report (Marie style) PDF") 205 | parser.add_argument("--glob-pattern", default="stats/*summary.tsv", 206 | help="search pattern to gather summary files generated by the scan command") 207 | parser.add_argument("--rRNA", default="rRNA/", 208 | help="path to search for rRNA counts corresponding to samples") 209 | parser.add_argument("--rRNA-same-place", default=False, action='store_true', 210 | help="If set, look for rRNA txt file with same sample name in same directory") 211 | parser.add_argument("--multi-page", default=False, action="store_true", 212 | help="If set, generate multiple PDF pages instead of subplots") 213 | 214 | 215 | if __name__ == "__main__": 216 | # setup own parser 217 | import argparse 218 | parser = argparse.ArgumentParser(prog='pb_overview') 219 | 
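# Example invocation (an illustrative addition; the file names are hypothetical): # python overview.py stats/sample1.summary.tsv --rRNA rRNA/ --output pb_overview.pdf --breakdown bead_overview.pdf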
setup_parser(parser) 220 | main(parser.parse_args()) 221 | -------------------------------------------------------------------------------- /spacemake/longread/signature.py: -------------------------------------------------------------------------------- 1 | import os 2 | import logging 3 | from collections import OrderedDict, defaultdict 4 | from spacemake.util import rev_comp 5 | import numpy as np 6 | """ 7 | Small helper class to load longread signature definitions (see docs/tutorials/longreads) 8 | and make them accessible to the various cmdline tools. 9 | """ 10 | 11 | logger = logging.getLogger("spacemake.longread.signature") 12 | 13 | 14 | class SignatureDB: 15 | def __init__(self, blocks=OrderedDict(), **kw): 16 | self.blocks = blocks 17 | self.lkup = {} 18 | self.fields = sorted(kw.keys()) 19 | for f in self.fields: 20 | self.lkup[f] = kw[f] 21 | 22 | @classmethod 23 | def from_YAML(cls, fname="samples.yaml"): 24 | import yaml 25 | 26 | logger = logging.getLogger("spacemake.longread.SignatureDB.from_YAML") 27 | logger.info(f"reading longread signature definitions from '{fname}'") 28 | 29 | groups = yaml.load(open(fname), Loader=yaml.SafeLoader) 30 | signatures = groups["signatures"] 31 | default = signatures[groups["default"]] 32 | 33 | # load all building block oligo sequences and their reverse complements 34 | blocks = OrderedDict() 35 | for fa_id, seq in groups["blocks"].items(): 36 | blocks[fa_id] = seq 37 | blocks[fa_id + "_RC"] = rev_comp(seq) 38 | 39 | logger.info(f"load_oligos(): loaded {len(blocks)} sequences from '{fname}'") 40 | 41 | # load the signature definitions and split into separate dictionaries 42 | field_lkups = {} 43 | for name, d in signatures.items(): 44 | # print(f"name={name} d={d}") 45 | for f in d.keys(): 46 | if f not in field_lkups: 47 | field_lkups[f] = defaultdict(lambda f=f: default[f]) # bind f now to avoid the late-binding closure bug 48 | 49 | field_lkups[f][name] = d[f] 50 | 51 | logger.info( 52 | f"found {len(signatures)} signature definitions: " 53 | f"{sorted(signatures.keys())}." 54 | ) 55 | return cls(blocks, **field_lkups) 56 | 57 | def __getattr__(self, attr): 58 | return self.lkup[attr] 59 | 60 | def sort_samples(self, samples, signatures): 61 | """ 62 | Sort samples by the priority assigned in the signature definitions first, 63 | then lexicographically. Used for overview plots combining multiple longread 64 | sample results to group samples sharing a signature. 65 | """ 66 | return sorted( 67 | zip(samples, signatures), key=lambda x: (self.prio.get(x[1], np.inf), x[0]) 68 | ) 69 | 70 | 71 | def get_signature_db(try_path): 72 | """ 73 | try to load a YAML file with longread signature definitions from `try_path`. 
74 | If that fails, default to spacemake/data/config/longread.yaml 75 | """ 76 | if os.access(try_path, os.R_OK): 77 | cfg = try_path 78 | else: 79 | cfg = os.path.join(os.path.dirname(__file__), "../data/config/longread.yaml") 80 | 81 | return SignatureDB.from_YAML(cfg) 82 | 83 | 84 | def process_intact_signature(complete_signature, prefixes=["P5"], suffixes=["N70X"]): 85 | complete = complete_signature.split(",") 86 | while complete and complete[0] in prefixes: 87 | complete.pop(0) 88 | 89 | while complete and complete[-1] in suffixes: 90 | complete.pop() 91 | 92 | complete_order = dict(x[::-1] for x in enumerate(complete)) 93 | # print(f"complete={complete}") 94 | 95 | return tuple(complete), complete_order 96 | 97 | 98 | def digest_signatures( 99 | sig_counts, 100 | bead_related="bead_start", 101 | complete_signature="P5,bead_start,OP1,polyT,N70X", 102 | prefixes=[ 103 | "P5", 104 | ], 105 | suffixes=[ 106 | "N70X", 107 | ], 108 | ): 109 | bead_counts = defaultdict(int) 110 | ov_counts = defaultdict(int) 111 | n_bead_related = 0 112 | 113 | complete, complete_order = process_intact_signature( 114 | complete_signature, prefixes, suffixes 115 | ) 116 | complete_set = set(complete) 117 | found_part_counts = defaultdict(int) 118 | 119 | def describe(found_set): 120 | missing = complete_set - found_set 121 | if not missing: 122 | descr = "complete" 123 | elif len(missing) < len(found_set): 124 | descr = f"missing_{','.join(sorted(missing))}" 125 | else: 126 | descr = f"only_{','.join(sorted(found_set))}" 127 | 128 | return descr 129 | 130 | def bead_relation(parts): 131 | search = list(complete) 132 | at = 0 133 | 134 | try: 135 | i = parts.index(search[0]) # look for first part, e.g. bead_start 136 | except ValueError: 137 | i = 0 138 | 139 | found = [] 140 | for part in parts[i:]: 141 | # find co-linear matches, 142 | # ignore extra inserted segments 143 | # (for now) 144 | if part in search[at:]: 145 | found.append(part) 146 | at = search.index(part) 147 | 148 | found_set = set(found) 149 | found_tup = tuple(sorted(found_set, key=lambda x: complete_order[x])) 150 | 151 | return describe(found_set), found_tup 152 | 153 | for sig, count in sig_counts.items(): 154 | parts = sig.split(",") 155 | if bead_related in parts: 156 | br, found_tup = bead_relation(parts) 157 | bead_counts[br] += count 158 | n_bead_related += count 159 | 160 | for i in range(1, len(found_tup) + 1): 161 | found_part_counts[found_tup[:i]] += count 162 | else: 163 | ov_counts[sig] = count 164 | 165 | ov_counts["bead-related"] = n_bead_related 166 | return ov_counts, bead_counts, found_part_counts, complete 167 | -------------------------------------------------------------------------------- /spacemake/parallel.py: -------------------------------------------------------------------------------- 1 | __version__ = "0.9" 2 | __author__ = ["Marvin Jens"] 3 | __license__ = "GPL" 4 | __email__ = ["marvin.jens@mdc-berlin.de"] 5 | 6 | import logging 7 | import time 8 | 9 | 10 | def put_or_abort(Q, item, abort_flag, timeout=1): 11 | """ 12 | Small wrapper around queue.put() to prevent 13 | deadlocks in the event of (detectable) errors 14 | that might cause put() to block forever. 15 | Expects a shared mp.Value instance as abort_flag 16 | 17 | Returns: False if put() was successful, True if execution 18 | should be aborted. 
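Usage sketch (an illustrative addition, not in the original docstring; the producer loop and items are hypothetical): import multiprocessing as mp; abort_flag = mp.Value('b', False); Q = mp.Queue(); then, in the producer: for item in items: if put_or_abort(Q, item, abort_flag): break # abort was signaled, stop producing 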
19 | """ 20 | import queue 21 | 22 | sent = False 23 | # logging.warning(f"sent={sent} abort_flag={abort_flag}") 24 | while not (sent or abort_flag.value): 25 | try: 26 | Q.put(item, timeout=timeout) 27 | except queue.Full: 28 | pass 29 | else: 30 | sent = True 31 | 32 | return abort_flag.value 33 | 34 | 35 | def queue_iter(Q, abort_flag, stop_item=None, timeout=1): 36 | """ 37 | Small generator/wrapper around multiprocessing.Queue allowing simple 38 | for-loop semantics: 39 | 40 | for item in queue_iter(queue, abort_flag): 41 | ... 42 | The abort_flag is handled analogous to put_or_abort, only 43 | that it ends the iteration instead 44 | """ 45 | import queue 46 | 47 | # logging.debug(f"queue_iter({queue})") 48 | while True: 49 | if abort_flag.value: 50 | break 51 | try: 52 | item = Q.get(timeout=timeout) 53 | except queue.Empty: 54 | pass 55 | else: 56 | if item == stop_item: 57 | # signals end->exit 58 | break 59 | else: 60 | # logging.debug(f"queue_iter->item {item}") 61 | yield item 62 | 63 | 64 | def join_with_empty_queues(proc, Qs, abort_flag, timeout=1): 65 | """ 66 | joins() a process that writes data to queues Qs w/o deadlock. 67 | In case of an abort, the subprocess normally would not join 68 | until the Qs are emptied. join_with_empty_queues() monitors a global 69 | abort flag and empties the queues if needed, allowing the sub-process 70 | to terminate properly. 71 | """ 72 | 73 | def drain(Q): 74 | content = [] 75 | while not Q.empty(): 76 | try: 77 | item = Q.get(timeout=timeout) 78 | except queue.Empty: 79 | pass 80 | else: 81 | content.append(item) 82 | 83 | return content 84 | 85 | contents = [list() for i in range(len(Qs))] 86 | while proc.exitcode is None: 87 | proc.join(timeout) 88 | if abort_flag.value: 89 | for Q, content in zip(Qs, contents): 90 | content.extend(drain(Q)) 91 | 92 | return contents 93 | 94 | 95 | def chunkify(src, n_chunk=1000): 96 | """ 97 | Iterator which collects up to n_chunk items from iterable and yields them 98 | as a list. 99 | """ 100 | chunk = [] 101 | n = 0 102 | for x in src: 103 | chunk.append(x) 104 | if len(chunk) >= n_chunk: 105 | yield n, chunk 106 | n += 1 107 | chunk = [] 108 | 109 | if chunk: 110 | yield n, chunk 111 | 112 | 113 | def log_qerr(qerr): 114 | "helper function for reporting errors in sub processes" 115 | for name, lines in qerr: 116 | for line in lines: 117 | logging.error(f"subprocess {name} exception {line}") 118 | 119 | 120 | class ExceptionLogging: 121 | """ 122 | A context manager that handles otherwise uncaught exceptions by logging 123 | the event and traceback info, optinally raises a flag. 124 | Very handy for wrapping the main function in a sub-process! 
125 | """ 126 | 127 | def __init__(self, name, Qerr=None, exc_flag=None): 128 | # print('__init__ called') 129 | self.Qerr = Qerr 130 | self.exc_flag = exc_flag 131 | self.name = name 132 | self.logger = logging.getLogger(name) 133 | self.exception = None 134 | 135 | def __enter__(self): 136 | self.t0 = time.time() 137 | # print('__enter__ called') 138 | return self 139 | 140 | def __exit__(self, exc_type, exc_value, exc_traceback): 141 | # print('__exit__ called') 142 | self.t1 = time.time() 143 | self.logger.info(f"CPU time: {self.t1 - self.t0:.3f} seconds.") 144 | if exc_type and (exc_type != SystemExit): 145 | import traceback 146 | 147 | lines = "\n".join( 148 | traceback.format_exception(exc_type, exc_value, exc_traceback) 149 | ).split("\n") 150 | self.exception = lines 151 | self.logger.error(f"an unhandled exception occurred") 152 | for l in lines: 153 | self.logger.error(l) 154 | 155 | if self.Qerr is not None: 156 | self.Qerr.put((self.name, lines)) 157 | 158 | if self.exc_flag: 159 | self.logger.error(f"raising exception flag {self.exc_flag}") 160 | self.exc_flag.value = True 161 | -------------------------------------------------------------------------------- /spacemake/preprocess/__init__.py: -------------------------------------------------------------------------------- 1 | #from .cmdline import cmdline 2 | #from .dge import calculate_adata_metrics,\ 3 | #calculate_shannon_entropy_scompression, dge_to_sparse_adata,\ 4 | #attach_barcode_file, parse_barcode_file, load_external_dge,\ 5 | #attach_puck_variables, attach_puck 6 | 7 | -------------------------------------------------------------------------------- /spacemake/preprocess/cmdline.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | __version__ = "0.9" 3 | __author__ = ["Marvin Jens"] 4 | __license__ = "GPL" 5 | __email__ = ["marvin.jens@mdc-berlin.de"] 6 | 7 | from spacemake.preprocess.fastq import ( 8 | parse_args, 9 | setup_logging, 10 | main_combinatorial, 11 | main_dropseq, 12 | ) 13 | 14 | from spacemake.parallel import ExceptionLogging 15 | 16 | 17 | def cmdline(): 18 | with ExceptionLogging("main"): 19 | args = parse_args() 20 | NO_CALL = args.na 21 | setup_logging(args) 22 | 23 | if args.out_format == "bam" and not args.read2: 24 | raise ValueError("bam output format requires --read2 parameter") 25 | 26 | if ("bc1" in args.cell and not args.bc1_ref) or ( 27 | "bc2" in args.cell and not args.bc2_ref 28 | ): 29 | raise ValueError( 30 | "bc1/2 are referenced in --cell or --cell-raw, but no reference barcodes are specified via --bc{{1,2}}-ref" 31 | ) 32 | 33 | if args.bc1_ref or args.bc2_ref: 34 | main_combinatorial(args) 35 | else: 36 | main_dropseq(args) 37 | 38 | 39 | if __name__ == "__main__": 40 | cmdline() 41 | -------------------------------------------------------------------------------- /spacemake/preprocess/dge.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | logger_name = "spacemake.preprocess.dge" 4 | logger = logging.getLogger(logger_name) 5 | 6 | def calculate_adata_metrics(adata, dge_summary_path=None, n_reads=None): 7 | import scanpy as sc 8 | import pandas as pd 9 | 10 | # calculate mitochondrial gene percentage 11 | adata.var["mt"] = ( 12 | adata.var_names.str.startswith("Mt-") 13 | | adata.var_names.str.startswith("mt-") 14 | | adata.var_names.str.startswith("MT-") 15 | ) 16 | 17 | sc.pp.calculate_qc_metrics( 18 | adata, qc_vars=["mt"], percent_top=None, log1p=False, 
inplace=True 19 | ) 20 | 21 | add_reads = False 22 | if dge_summary_path is not None: 23 | dge_summary = pd.read_csv( 24 | dge_summary_path, 25 | skiprows=7, 26 | sep="\t", 27 | index_col="cell_bc", 28 | names=["cell_bc", "n_reads", "n_umi", "n_genes"], 29 | ) 30 | 31 | adata.obs = pd.merge( 32 | adata.obs, dge_summary[["n_reads"]], left_index=True, right_index=True 33 | ) 34 | 35 | add_reads = True 36 | 37 | if n_reads is not None: 38 | adata.obs["n_reads"] = n_reads 39 | add_reads = True 40 | 41 | if add_reads: 42 | adata.obs["reads_per_counts"] = adata.obs.n_reads / adata.obs.total_counts 43 | 44 | 45 | def calculate_shannon_entropy_scompression(adata): 46 | import math 47 | import itertools 48 | import numpy as np 49 | from collections import Counter 50 | 51 | def compute_shannon_entropy(barcode): 52 | prob, length = Counter(barcode), float(len(barcode)) 53 | return -sum( 54 | count / length * math.log(count / length, 2) for count in prob.values() 55 | ) 56 | 57 | def compute_string_compression(barcode): 58 | compressed_barcode = "".join( 59 | letter + str(len(list(group))) 60 | for letter, group in itertools.groupby(barcode) 61 | ) 62 | 63 | return len(compressed_barcode) 64 | 65 | bc = adata.obs.index.to_numpy() 66 | bc_len = len(bc[0]) 67 | theoretical_barcodes = np.random.choice( 68 | ["A", "C", "T", "G"], size=(bc.shape[0], bc_len) 69 | ) 70 | 71 | adata.obs["exact_entropy"] = np.round( 72 | np.array([compute_shannon_entropy(cell_bc) for cell_bc in bc]), 2 73 | ) 74 | adata.obs["theoretical_entropy"] = np.round( 75 | np.array( 76 | [compute_shannon_entropy(cell_bc) for cell_bc in theoretical_barcodes] 77 | ), 78 | 2, 79 | ) 80 | adata.obs["exact_compression"] = np.round( 81 | np.array([compute_string_compression(cell_bc) for cell_bc in bc]), 2 82 | ) 83 | adata.obs["theoretical_compression"] = np.round( 84 | np.array( 85 | [compute_string_compression(cell_bc) for cell_bc in theoretical_barcodes] 86 | ), 87 | 2, 88 | ) 89 | 90 | 91 | def dge_to_sparse_adata(dge_path, dge_summary_path): 92 | import anndata 93 | import numpy as np 94 | import gzip 95 | import pandas as pd 96 | from scipy.sparse import coo_matrix, hstack 97 | 98 | gene_names = [] 99 | 100 | with gzip.open(dge_path, "rt") as dge: 101 | first_line = dge.readline().strip().split("\t") 102 | has_mt = False 103 | barcodes = first_line[1:] 104 | N_bc = len(barcodes) 105 | X = None 106 | 107 | # read DGE line by line 108 | # first row: contains CELL BARCODEs 109 | # each next row contains one gene name, and the counts of that gene 110 | for line in dge: 111 | vals = line.strip() 112 | _idx_tab = vals.index("\t") 113 | _gene_name = vals[:_idx_tab] 114 | gene_names.append(_gene_name) 115 | 116 | if _gene_name.lower().startswith("mt-"): 117 | has_mt = True 118 | 119 | # store counts as np.array 120 | _vals = np.fromstring(vals[_idx_tab:], dtype=np.int32, count=N_bc, sep='\t').flatten() 121 | _idx_nonzero = np.argwhere(_vals != 0).flatten() 122 | 123 | if len(_idx_nonzero) > 0: 124 | gene_sp = coo_matrix((_vals[_idx_nonzero].astype(np.int32), (_idx_nonzero, np.zeros(len(_idx_nonzero)))), shape=(N_bc, 1), dtype=np.int32) 125 | else: 126 | gene_sp = coo_matrix((N_bc, 1), dtype=np.int32) 127 | 128 | if X is None: 129 | X = gene_sp 130 | else: 131 | X = hstack([X, gene_sp]) 132 | 133 | if X is None: 134 | X = coo_matrix((len(barcodes), 0), dtype=np.int32) 135 | 136 | if not has_mt: 137 | # ensure we have an entry for mitochondrial transcripts even if it's just all zeros 138 | print( 139 | "need to add mt-missing because no 
mitochondrial stuff was among the genes for annotation" 140 | ) 141 | gene_names.append("mt-missing") 142 | X = hstack([X, np.zeros(X.shape[0])[:, None]]) 143 | 144 | X = X.tocsr() 145 | X = X.astype(np.float32) 146 | adata = anndata.AnnData( 147 | X, obs=pd.DataFrame(index=barcodes), var=pd.DataFrame(index=gene_names) 148 | ) 149 | 150 | # name the index 151 | adata.obs.index.name = "cell_bc" 152 | 153 | # attach metrics such as: total_counts, pct_mt_counts, etc 154 | # also attach n_genes, and calculate pcr 155 | calculate_adata_metrics(adata, dge_summary_path) 156 | 157 | # calculate shannon_entropy and string_compression per bead 158 | calculate_shannon_entropy_scompression(adata) 159 | 160 | if adata.X.sum() == 0: 161 | logger.warning(f"The DGE from {dge_path} is empty") 162 | 163 | return adata 164 | 165 | 166 | def load_external_dge(dge_path): 167 | import scanpy as sc 168 | 169 | from scanpy._utils import check_nonnegative_integers 170 | from scipy.sparse import issparse, csc_matrix 171 | from spacemake.errors import SpacemakeError 172 | 173 | adata = sc.read(dge_path) 174 | 175 | if not check_nonnegative_integers(adata.X): 176 | raise SpacemakeError( 177 | f"External dge seems to contain values " 178 | + "which are already normalised. Raw-count matrix expected." 179 | ) 180 | 181 | if not issparse(adata.X): 182 | adata.X = csc_matrix(adata.X) 183 | 184 | # name the index 185 | adata.obs.index.name = "cell_bc" 186 | 187 | # attach metrics such as: total_counts, pct_mt_counts, etc 188 | # also attach n_genes, and calculate pcr 189 | calculate_adata_metrics(adata) 190 | 191 | return adata 192 | 193 | 194 | def parse_barcode_file(barcode_file): 195 | import pandas as pd 196 | 197 | bc = pd.read_csv(barcode_file, sep="[,|\t]", engine='python') 198 | 199 | # rename columns 200 | bc = ( 201 | bc.rename( 202 | columns={ 203 | "xcoord": "x_pos", 204 | "ycoord": "y_pos", 205 | "barcodes": "cell_bc", 206 | "barcode": "cell_bc", 207 | } 208 | ) 209 | .set_index("cell_bc") 210 | .loc[:, ["x_pos", "y_pos"]] 211 | ) 212 | 213 | bc = bc.loc[~bc.index.duplicated(keep="first")] 214 | 215 | 216 | 217 | return bc 218 | 219 | 220 | def attach_barcode_file(adata, barcode_file): 221 | bc = parse_barcode_file(barcode_file) 222 | 223 | # new obs has only the indices of the exact barcode matches 224 | new_obs = adata.obs.merge(bc, left_index=True, right_index=True, how="inner") 225 | adata = adata[new_obs.index, :] 226 | adata.obs = new_obs 227 | adata.obsm["spatial"] = adata.obs[["x_pos", "y_pos"]].to_numpy() 228 | 229 | return adata 230 | 231 | from spacemake.errors import SpacemakeError  # used by attach_puck_variables below; was missing in the original 232 | def attach_puck_variables(adata, puck_variables): 233 | if "spatial" not in adata.obsm.keys(): 234 | raise SpacemakeError( 235 | f"this dataset has no spatial information " 236 | + "available. 
Please attach the spatial information using the " 237 | + "spacemake.preprocess.attach_barcode_file() function first" 238 | ) 239 | 240 | adata.uns["puck_variables"] = puck_variables 241 | 242 | x_pos_max, y_pos_max = tuple(adata.obsm["spatial"].max(axis=0)) 243 | x_pos_min, y_pos_min = tuple(adata.obsm["spatial"].min(axis=0)) 244 | #print(f"PUCK VARS {puck_variables} X MIN {x_pos_min} X MAX {x_pos_max} Y MIN {y_pos_min} Y MAX {y_pos_max}") 245 | 246 | width_um = adata.uns["puck_variables"]["width_um"] 247 | coord_by_um = (x_pos_max - x_pos_min) / width_um 248 | 249 | # this can be NaN if only one coordinate (only one cell, will fail) 250 | if coord_by_um > 0: 251 | height_um = int((y_pos_max - y_pos_min) / coord_by_um) 252 | else: 253 | height_um = 1 # avoid division by zero and error in reports 254 | coord_by_um = 1 255 | 256 | adata.uns["puck_variables"]["height_um"] = height_um 257 | adata.uns["puck_variables"]["coord_by_um"] = coord_by_um 258 | 259 | return adata 260 | 261 | 262 | def attach_puck(adata, puck): 263 | attach_puck_variables(adata, puck.variables) 264 | adata.uns["puck_name"] = puck.name 265 | 266 | return adata -------------------------------------------------------------------------------- /spacemake/reporting.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import logging 4 | import pandas as pd 5 | import numpy as np 6 | 7 | 8 | def count_dict_collapse_misc( 9 | counts, misc_thresh=0.01, total=1, add_up=None, sig_intact=None 10 | ): 11 | out_counts = {} 12 | out_frac = {} 13 | 14 | misc = 0 15 | sum = 0 16 | if sig_intact is not None: 17 | complete = ",".join(sig_intact) 18 | everything = set(sig_intact) 19 | else: 20 | complete = None 21 | everything = set() 22 | 23 | def relkey(key): 24 | if sig_intact is None: 25 | return key 26 | 27 | if key == complete: 28 | return "complete" 29 | 30 | obs = set(key.split(",")) 31 | there = obs & everything 32 | extra = obs - everything 33 | missing = everything - obs 34 | 35 | if len(missing) <= len(there): 36 | res = "missing_" + ",".join(sorted(missing)) 37 | else: 38 | res = "only_" + ",".join(sorted(there)) 39 | if extra: 40 | res += "_extra_" + ",".join(sorted(extra)) 41 | 42 | return res 43 | 44 | for key, n in sorted(counts.items()): 45 | key = relkey(key) 46 | sum += n 47 | f = n / float(total) 48 | if f < misc_thresh: 49 | misc += n 50 | else: 51 | out_counts[key] = n 52 | out_frac[key] = f 53 | 54 | if misc > 0: 55 | out_counts["misc"] = misc 56 | out_frac["misc"] = misc / float(total) 57 | 58 | if add_up is None: 59 | other = total - sum 60 | else: 61 | other = total - counts[add_up] 62 | 63 | if other > 0: 64 | out_counts["NA"] = other 65 | out_frac["NA"] = other / float(total) 66 | return out_counts, out_frac 67 | 68 | 69 | def count_dict_out(counts, title, misc_thresh=0.01, total=1, **kw): 70 | print(f"### {title}") 71 | out_counts, out_frac = count_dict_collapse_misc(counts, misc_thresh, total, **kw) 72 | for key in sorted(out_counts.keys()): 73 | print(f"{key}\t{out_counts[key]}\t{out_frac[key]:.3f}") 74 | 75 | 76 | def to_hist(d, normed=True): 77 | x = np.array(list(d.keys())) 78 | x0 = x.min() 79 | x1 = x.max() + 1 80 | counts = np.zeros(x1, dtype=np.float32) 81 | 82 | for i in x: 83 | counts[i] = d[i] 84 | 85 | n = counts.sum() 86 | if normed: 87 | counts /= n 88 | 89 | return counts, n 90 | 91 | 92 | def donut_plot( 93 | ax, data, sa=10, explode=None, colors=None, labels=None, title="", cmap="tab20" 94 | ): 95 | import matplotlib.pyplot as 
plt 96 | 97 | if labels is None: 98 | labels = sorted(data.keys()) 99 | 100 | counts = [data.get(k, 0) for k in labels] 101 | 102 | if colors is None: 103 | colors = list(plt.cm.get_cmap(cmap)(np.linspace(0, 1, len(labels)))) 104 | 105 | wedges, texts = ax.pie( 106 | counts, 107 | wedgeprops=dict(width=0.5), 108 | startangle=sa, 109 | explode=explode, 110 | colors=colors, 111 | ) 112 | 113 | bbox_props = dict(boxstyle="square,pad=0.3", fc="w", ec="k", lw=0.5) 114 | kw = dict(arrowprops=dict(arrowstyle="-"), bbox=bbox_props, zorder=0, va="center") 115 | c = np.array(counts) 116 | pcts = 100.0 * c / float(c.sum()) 117 | for i, p in enumerate(wedges): 118 | ang = (p.theta2 - p.theta1) / 2.0 + p.theta1 119 | y = np.sin(np.deg2rad(ang)) 120 | x = np.cos(np.deg2rad(ang)) 121 | horizontalalignment = {-1: "right", 1: "left"}[int(np.sign(x))] 122 | connectionstyle = "angle,angleA=0,angleB={}".format(ang) 123 | kw["arrowprops"].update({"connectionstyle": connectionstyle}) 124 | if pcts[i] > 0: 125 | ax.text(x * 0.75, y * 0.75, f"{pcts[i]:.1f}", horizontalalignment="center") 126 | ax.annotate( 127 | labels[i], 128 | xy=(x, y), 129 | xytext=(1.4 * np.sign(x), 1.4 * y), 130 | horizontalalignment=horizontalalignment, 131 | **kw, 132 | ) 133 | 134 | if title: 135 | ax.set_title(title) 136 | 137 | return labels, colors 138 | 139 | 140 | def approximate(intvalue): 141 | suffixes = {9: "G", 6: "M", 3: "k", 0: ""} 142 | dec = int(np.floor(np.log10(intvalue) / 3)) * 3 143 | x = np.round(intvalue / 10 ** dec, decimals=2) 144 | return f"{x:.2f} {suffixes.get(dec, '?')}" 145 | 146 | 147 | def len_plot( 148 | ax, 149 | data, 150 | labels=None, 151 | colors=None, 152 | xlabel="aligned bases", 153 | ylabel="fraction", 154 | title="type", 155 | cmap="tab20", 156 | min_count=10, 157 | cumulative=False, 158 | legend=True, 159 | ): 160 | import matplotlib.pyplot as plt 161 | 162 | if labels is None: 163 | labels = sorted(data.keys()) 164 | 165 | if colors is None: 166 | colors = plt.cm.get_cmap(cmap)(np.linspace(0, 1, len(labels))) 167 | 168 | color_dict = {} 169 | for cig_type, color in zip(labels, colors): 170 | color_dict[cig_type] = color 171 | 172 | if not cig_type in data: 173 | continue 174 | ml, n = to_hist(data[cig_type], normed=True) 175 | if n < min_count: 176 | continue 177 | 178 | x = np.arange(len(ml)) 179 | y = ml.cumsum() if cumulative else ml 180 | 181 | ax.step( 182 | x, 183 | y, 184 | where="mid", 185 | label=f"{cig_type} ({approximate(n)})", 186 | color=color, 187 | lw=2, 188 | solid_capstyle="round", 189 | ) 190 | 191 | if cumulative: 192 | ax.axhline(0.5, lw=0.5, ls="dashed", color="k") 193 | 194 | if legend: 195 | ax.legend(title=title, bbox_to_anchor=(0.5, 1.05), loc="lower center", ncol=2) 196 | 197 | ax.set_xlabel(xlabel) 198 | ax.set_ylabel(ylabel) 199 | 200 | return color_dict 201 | 202 | 203 | # def make_colors_explode(labels, cmap="Blues", hilight="bead-related", hicolor="red"): 204 | # import matplotlib.pyplot as plt 205 | # ex = np.zeros(len(labels)) 206 | # colors = list(plt.get_cmap(cmap)(np.linspace(0.2, 0.8, len(labels)))) 207 | # try: 208 | # i = labels.index(hilight) 209 | # except ValueError: 210 | # pass 211 | # else: 212 | # ex[i] = 0.1 213 | # colors[i] = hicolor 214 | # return ex, colors 215 | -------------------------------------------------------------------------------- /spacemake/smk.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | logger_name = "spacemake.main" 4 | logger = logging.getLogger(logger_name) 5 | 6 
| 7 | class Spacemake:
8 |     """Spacemake.
9 |
10 |     Class to access spacemake processed data from python.
11 |
12 |     """
13 |
14 |     def __init__(self, root):
15 |         """Constructor of the Spacemake class.
16 |
17 |         :param root: Path to the spacemake root directory.
18 |         :type root: str
19 |         """
20 |         from spacemake.config import get_global_config
21 |         from spacemake.project_df import get_global_ProjectDF
22 |
23 |         self.root = root
24 |         self.config = get_global_config(root)
25 |         self.project_df = get_global_ProjectDF(root)
26 |
27 |     def load_processed_adata(
28 |         self, project_id, sample_id, run_mode_name, umi_cutoff
29 |     ): #-> anndata.AnnData:
30 |         """Load spacemake processed data.
31 |
32 |         :param project_id: project_id of the data to be loaded.
33 |         :type project_id: str
34 |         :param sample_id: sample_id of the data to be loaded.
35 |         :type sample_id: str
36 |         :param run_mode_name: name of the run mode of the data to be loaded.
37 |             A sample can have several run_modes attached when it is added;
38 |             provide exactly one of them here.
39 |         :type run_mode_name: str
40 |         :param umi_cutoff: the umi_cutoff of the data to be loaded. Each
41 |             run_mode can have several umi_cutoffs set during configuration;
42 |             provide exactly one of them here.
43 |         :type umi_cutoff: int
44 |         :returns: A spacemake processed and analyzed AnnData object, containing
45 |             the results of the analysis.
46 |         :rtype: anndata.AnnData
47 |         """
48 |         import scanpy as sc
49 |         # import anndata
50 |         # SpacemakeError is raised below; assumed to be defined in spacemake/errors.py
51 |         from spacemake.errors import SpacemakeError
52 |         self.project_df.assert_run_mode(project_id, sample_id, run_mode_name)
53 |         run_mode = self.config.get_run_mode(run_mode_name)
54 |
55 |         if int(umi_cutoff) not in [int(uc) for uc in run_mode.variables["umi_cutoff"]]:
56 |             raise SpacemakeError(
57 |                 f"run_mode={run_mode_name} has no umi_cutoff={umi_cutoff}"
58 |             )
59 |
60 |         adata_raw = self.load_raw_spatial_adata(
61 |             project_id=project_id, sample_id=sample_id, run_mode_name=run_mode_name
62 |         )
63 |
64 |         adata = sc.read(
65 |             f"{self.root}/projects/{project_id}/processed_data/{sample_id}/"
66 |             + f"illumina/complete_data/automated_analysis/{run_mode_name}/"
67 |             + f"umi_cutoff_{umi_cutoff}/results.h5ad"
68 |         )
69 |
70 |         if "run_mode_variables" not in adata.uns.keys():
71 |             adata.uns["run_mode_variables"] = run_mode.variables
72 |         if "puck_variables" not in adata.uns.keys():
73 |             adata.uns["puck_variables"] = adata_raw.uns["puck_variables"]
74 |
75 |         return adata
76 |
77 |     def load_raw_spatial_adata(
78 |         self, project_id, sample_id, run_mode_name
79 |     ): #-> anndata.AnnData:
80 |         """Load raw, spacemake processed data.
81 |
82 |         This function will load the raw count matrix created by spacemake.
83 |
84 |         :param project_id: project_id of the raw data to be loaded.
85 |         :type project_id: str
86 |         :param sample_id: sample_id of the raw data to be loaded.
87 |         :type sample_id: str
88 |         :param run_mode_name: name of the run mode of the raw data to be loaded.
89 |             A sample can have several run_modes attached when it is added;
90 |             provide exactly one of them here.
91 |         :type run_mode_name: str
92 |         :returns: A spacemake processed AnnData object, containing unfiltered
93 |             raw expression data, and all cells or spatial units in the dataset.
94 | :rtype: anndata.AnnData 95 | """ 96 | import scanpy as sc 97 | 98 | self.project_df.assert_run_mode(project_id, sample_id, run_mode_name) 99 | run_mode = self.config.get_run_mode(run_mode_name) 100 | 101 | dge_type = "" 102 | dge_cleaned = "" 103 | polyA_adapter_trimmed = "" 104 | mm_included = "" 105 | 106 | if run_mode.variables["polyA_adapter_trimming"]: 107 | polyA_adapter_trimmed = ".polyA_adapter_trimmed" 108 | 109 | if run_mode.variables["count_intronic_reads"]: 110 | dge_type = ".all" 111 | else: 112 | dge_type = ".exon" 113 | 114 | if run_mode.variables["count_mm_reads"]: 115 | mm_included = ".mm_included" 116 | 117 | if run_mode.variables["clean_dge"]: 118 | dge_cleaned = ".cleaned" 119 | 120 | adata = sc.read( 121 | f"{self.root}/projects/{project_id}/processed_data/{sample_id}/" 122 | + f"illumina/complete_data/dge/dge{dge_type}{dge_cleaned}" 123 | + f"{polyA_adapter_trimmed}{mm_included}.spatial_beads.h5ad" 124 | ) 125 | 126 | if "puck_variables" not in adata.uns.keys(): 127 | from spacemake.preprocess import attach_puck_variables 128 | 129 | adata = attach_puck_variables( 130 | adata, 131 | puck_variables=self.project_df.get_puck_variables( 132 | project_id=project_id, sample_id=sample_id 133 | ), 134 | ) 135 | 136 | if "run_mode_variables" not in adata.uns.keys(): 137 | adata.uns["run_mode_variables"] = run_mode.variables 138 | 139 | return adata 140 | 141 | 142 | def get_novosparc_variables(pdf, args): 143 | """get_novosparc_variables. 144 | 145 | :param pdf: 146 | :param args: 147 | """ 148 | # assert that sample exists 149 | pdf.assert_sample(args["project_id"], args["sample_id"]) 150 | 151 | def populate_variables_from_args(pdf, args, arg_prefix=""): 152 | """populate_variables_from_args. 153 | 154 | :param pdf: 155 | :param args: 156 | :param arg_prefix: 157 | """ 158 | # get sample info 159 | sample_info = pdf.get_sample_info( 160 | project_id=args[f"{arg_prefix}project_id"], 161 | sample_id=args[f"{arg_prefix}sample_id"], 162 | ) 163 | 164 | # populate return dictionary 165 | ret = { 166 | f"{arg_prefix}project_id": args[f"{arg_prefix}project_id"], 167 | f"{arg_prefix}sample_id": args[f"{arg_prefix}sample_id"], 168 | } 169 | 170 | # get run mode 171 | if f"{arg_prefix}run_mode" in args: 172 | ret[f"{arg_prefix}run_mode"] = args[f"{arg_prefix}run_mode"] 173 | else: 174 | run_mode_name = sample_info["run_mode"][0] 175 | ret[f"{arg_prefix}run_mode"] = run_mode_name 176 | logger.info(f"No run_mode provided, using {run_mode_name}") 177 | 178 | run_mode = pdf.config.get_run_mode(ret[f"{arg_prefix}run_mode"]) 179 | 180 | if f"{arg_prefix}umi_cutoff" not in args: 181 | umi_cutoff = run_mode.variables["umi_cutoff"][0] 182 | ret[f"{arg_prefix}umi_cutoff"] = umi_cutoff 183 | logger.info(f"No umi_cutoff provided, using {umi_cutoff}") 184 | else: 185 | ret[f"{arg_prefix}umi_cutoff"] = args[f"{arg_prefix}umi_cutoff"] 186 | 187 | return ret 188 | 189 | ret = populate_variables_from_args(pdf, args) 190 | 191 | if "reference_project_id" not in args or "reference_sample_id" not in args: 192 | logger.info( 193 | "No reference_project_id or reference_sample_id provided," 194 | + " running novosparc de-novo..." 
195 | ) 196 | ret["reference_project_id"] = "" 197 | ret["reference_sample_id"] = "" 198 | ret["reference_umi_cutoff"] = "" 199 | ret["reference_run_mode"] = "" 200 | else: 201 | pdf.assert_sample(args["reference_project_id"], args["reference_sample_id"]) 202 | 203 | logger.info( 204 | "Using (project_id, sample_id)=" 205 | + f"({args['reference_project_id']}, {args['reference_sample_id']})" 206 | + " reference, running novosparc with reference..." 207 | ) 208 | 209 | novosparc_ret = populate_variables_from_args(pdf, args, arg_prefix="reference_") 210 | 211 | ret = {**ret, **novosparc_ret} 212 | 213 | return ret 214 | 215 | 216 | _spacemake_instance = None 217 | 218 | 219 | def get_spacemake_object(): 220 | global _spacemake_instance 221 | if _spacemake_instance is None: 222 | _spacemake_instance = Spacemake(".") 223 | 224 | return _spacemake_instance 225 | 226 | 227 | # def get_ConfigFile(): 228 | # spmk = get_spacemake_object() 229 | # return spmk.config 230 | 231 | 232 | # def get_ProjectDF(): 233 | # spmk = get_spacemake_object() 234 | # return spmk.project_df 235 | -------------------------------------------------------------------------------- /spacemake/snakemake/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rajewsky-lab/spacemake/274cd58e7ba9de244aca25c6bbd7cdfe49683753/spacemake/snakemake/__init__.py -------------------------------------------------------------------------------- /spacemake/snakemake/downsample.smk: -------------------------------------------------------------------------------- 1 | ######### 2 | # about # 3 | ######### 4 | __version__ = '0.1.0' 5 | __author__ = ['Nikos Karaiskos', 'Tamas Ryszard Sztanka-Toth'] 6 | __licence__ = 'GPL' 7 | __email__ = ['nikolaos.karaiskos@mdc-berlin.de', 'tamasryszard.sztanka-toth@mdc-berlin.de'] 8 | 9 | # first create downsampling for 10, 20 .. 
90 10 | downsampled_ratios = range(10,100,10) 11 | 12 | rule downsample_bam: 13 | input: 14 | unpack(get_final_bam) 15 | output: 16 | downsampled_bam 17 | params: 18 | downsample_dir = downsampled_data_prefix, 19 | ratio = lambda wildcards: wildcards.downsampling_percentage[1:] 20 | threads: 4 21 | shell: 22 | """ 23 | mkdir -p {params.downsample_dir} 24 | 25 | sambamba view -o {output} -f bam -t {threads} \ 26 | -s 0.{params.ratio} {input} 27 | """ 28 | 29 | rule downsampled_filter_mm_reads: 30 | input: 31 | downsampled_bam 32 | output: 33 | temp(downsampled_bam_mm_included_pipe) 34 | shell: 35 | """ 36 | python {repo_dir}/scripts/filter_mm_reads.py \ 37 | --in-bam {input} \ 38 | --out-bam {output} 39 | """ 40 | 41 | def get_saturation_analysis_input(wildcards): 42 | # create dictionary with the right downsampling files where the key 43 | files = {} 44 | 45 | run_modes = get_run_modes_from_sample(wildcards.project_id, wildcards.sample_id) 46 | 47 | if project_df.is_spatial(project_id=wildcards.project_id, 48 | sample_id=wildcards.sample_id, 49 | puck_barcode_file_id=wildcards.puck_barcode_file_id): 50 | puck_barcode_file_ids = [wildcards.puck_barcode_file_id, 'no_spatial_data'] 51 | else: 52 | puck_barcode_file_ids = ['no_spatial_data'] 53 | 54 | for run_mode in run_modes: 55 | for ratio in downsampled_ratios: 56 | for puck_barcode_file_id in puck_barcode_file_ids: 57 | # dge_files contains dge/summary file paths per run_mode 58 | files[f'downsampled_dge_summary.{run_mode}.{ratio}.{puck_barcode_file_id}'] = get_dge_from_run_mode( 59 | project_id = wildcards.project_id, 60 | sample_id = wildcards.sample_id, 61 | run_mode = run_mode, 62 | data_root_type = 'downsampled_data', 63 | puck_barcode_file_id = puck_barcode_file_id, 64 | downsampling_percentage = '/' + str(ratio))['dge_summary'] 65 | 66 | for puck_barcode_file_id in puck_barcode_file_ids: 67 | files[f'downsampled_dge_summary.{run_mode}.100.{puck_barcode_file_id}'] = get_dge_from_run_mode( 68 | project_id = wildcards.project_id, 69 | sample_id = wildcards.sample_id, 70 | run_mode = run_mode, 71 | data_root_type = 'complete_data', 72 | puck_barcode_file_id = puck_barcode_file_id, 73 | downsampling_percentage = '')['dge_summary'] 74 | 75 | return files 76 | 77 | rule create_saturation_analysis: 78 | input: 79 | unpack(get_saturation_analysis_input) 80 | output: 81 | downsample_saturation_analysis 82 | params: 83 | sample_info = lambda wildcards: project_df.get_sample_info( 84 | wildcards.project_id, wildcards.sample_id), 85 | run_modes = lambda wildcards: get_run_modes_from_sample( 86 | wildcards.project_id, wildcards.sample_id) 87 | script: 88 | "scripts/saturation_analysis.Rmd" 89 | -------------------------------------------------------------------------------- /spacemake/snakemake/dropseq.smk: -------------------------------------------------------------------------------- 1 | ######### 2 | # about # 3 | ######### 4 | __version__ = '0.1.0' 5 | __author__ = ['Nikos Karaiskos', 'Tamas Ryszard Sztanka-Toth'] 6 | __licence__ = 'GPL' 7 | __email__ = ['nikolaos.karaiskos@mdc-berlin.de', 'tamasryszard.sztanka-toth@mdc-berlin.de'] 8 | 9 | ################################################### 10 | # Snakefile containing the dropseq pipeline rules # 11 | ################################################### 12 | rule remove_smart_adapter: 13 | input: 14 | tagged_bam 15 | output: 16 | pipe(tagged_trimmed_bam) 17 | params: 18 | reports_dir = reports_dir 19 | shell: 20 | """ 21 | mkdir -p {params.reports_dir} 22 | 23 | 
{dropseq_tools}/TrimStartingSequence OUTPUT_SUMMARY={params.reports_dir}/remove_smart_adapter.report.txt \ 24 | INPUT={input} \ 25 | OUTPUT={output} \ 26 | SEQUENCE={smart_adapter} \ 27 | MISMATCHES=0 \ 28 | NUM_BASES=5 \ 29 | COMPRESSION_LEVEL=0 30 | """ 31 | 32 | rule remove_polyA: 33 | input: 34 | tagged_trimmed_bam 35 | output: 36 | temp(tagged_polyA_adapter_trimmed_bam) 37 | params: 38 | reports_dir = reports_dir 39 | shell: 40 | """ 41 | {dropseq_tools}/PolyATrimmer OUTPUT_SUMMARY={params.reports_dir}/remove_polyA.report.txt \ 42 | MISMATCHES=0 \ 43 | INPUT={input} \ 44 | OUTPUT={output} \ 45 | NUM_BASES=6 46 | """ 47 | 48 | rule filter_mm_reads: 49 | input: 50 | unpack(get_final_bam) 51 | output: 52 | pipe(final_bam_mm_included_pipe) 53 | shell: 54 | """ 55 | python {repo_dir}/scripts/filter_mm_reads.py \ 56 | --in-bam {input} \ 57 | --out-bam {output} 58 | """ 59 | -------------------------------------------------------------------------------- /spacemake/snakemake/longread.smk: -------------------------------------------------------------------------------- 1 | ######### 2 | # about # 3 | ######### 4 | __version__ = '0.2' 5 | __author__ = ['Marvin Jens', 'Tamas Ryszard Sztanka-Toth'] 6 | __email__ = ['marvin.jens@mdc-berlin.de', 'tamasryszard.sztanka-toth@mdc-berlin.de'] 7 | 8 | lr_root = project_dir + "/processed_data/{sample_id}/longread" 9 | lr_cache_dir = lr_root + "/cache/" 10 | lr_ann_dir = lr_root + "/annotation/" 11 | lr_stats_dir = lr_root + "/stats/" 12 | lr_report_dir = lr_root + "/reports/" 13 | lr_examples_dir = lr_root + "/examples/" 14 | lr_cDNA_dir = lr_root + "/cDNA/" 15 | 16 | # targets 17 | lr_ann = lr_ann_dir + "{sample_id}.annotation.tsv" 18 | lr_stats = lr_stats_dir + "{sample_id}.stats.tsv" 19 | lr_report = lr_report_dir + "{sample_id}.donuts.pdf" 20 | lr_report_stats = lr_stats_dir + "{sample_id}.report.tsv" 21 | lr_edits = lr_report_dir + "{sample_id}.oligo_edits.pdf" 22 | lr_cDNA = lr_cDNA_dir + "{sample_id}.fa" 23 | lr_cDNA_log = lr_cDNA_dir + "{sample_id}.log" 24 | lr_cDNA_oligo_analysis = lr_cDNA_dir + "{sample_id}.oligo_analysis.csv" 25 | lr_cDNA_bam = lr_cDNA_dir + "{sample_id}.bam" 26 | lr_examples = lr_examples_dir + "{sample_id}.txt" 27 | 28 | lr_overview_dir = os.path.join(config['root_dir'], 'longread_overview/') 29 | lr_overview_pdf = lr_overview_dir + 'fidelity.pdf' 30 | lr_overview_csv = lr_overview_dir + 'overview.csv' 31 | 32 | LR_RAW_FILES = {} 33 | LR_SIGNATURE = {} 34 | LR_REPORT_STATS = [] 35 | def get_longread_output(project_df=None, config=None, **kw): 36 | """ 37 | This function is called from main.smk at least once 38 | to determine which output files need to be generated 39 | from longread longread analysis. 
40 | We use this opportunity to populate LR_RAW_FILES 41 | """ 42 | out_files = [] 43 | for index, row in project_df.df.iterrows(): 44 | # for run_mode in row["run_mode"]: 45 | # run_mode_variables = project_df.config.get_run_mode(run_mode).variables 46 | if row.longreads: 47 | LR_REPORT_STATS.extend( 48 | expand(lr_report_stats, project_id=index[0], sample_id=index[1]) 49 | ) 50 | out_files += \ 51 | expand( 52 | lr_report, 53 | project_id=index[0], 54 | sample_id=index[1], 55 | ) + \ 56 | expand( 57 | lr_edits, 58 | project_id=index[0], 59 | sample_id=index[1], 60 | ) + \ 61 | expand( 62 | lr_cDNA_bam, 63 | project_id=index[0], 64 | sample_id=index[1], 65 | ) + \ 66 | expand( 67 | lr_cDNA_oligo_analysis, 68 | project_id=index[0], 69 | sample_id=index[1], 70 | ) 71 | 72 | LR_RAW_FILES[index[1]] = row.longreads 73 | LR_SIGNATURE[index[1]] = row.longread_signature 74 | 75 | # if we have any longread analysis, generate an overview plot 76 | if out_files: 77 | out_files.append(lr_overview_pdf) 78 | 79 | return out_files 80 | 81 | register_module_output_hook(get_longread_output, "longread.smk") 82 | 83 | def get_args(wc): 84 | args = f""" \ 85 | --cache={lr_cache_dir} \ 86 | --annotation-out={lr_ann_dir} \ 87 | --stats-out={lr_stats_dir} \ 88 | --report-out={lr_report_dir} \ 89 | --examples-out={lr_examples_dir} \ 90 | --sample={wc.sample_id} \ 91 | --signature={LR_SIGNATURE[wc.sample_id]} \ 92 | """.format(sample_id=wc.sample_id, project_id=wc.project_id) 93 | return args 94 | 95 | # Use {root_dir}/longread.yaml to set intact_bead layout and other settings that only make sense for 96 | # long reads 97 | longread_cmd = """ 98 | python -m spacemake.longread \ 99 | --parallel={threads} \ 100 | --config=longread.yaml \ 101 | {params.args} \ 102 | """ 103 | 104 | rule map_cDNA: 105 | input: lr_cDNA 106 | output: 107 | bam=lr_cDNA_bam, 108 | tmp=temp(directory(lr_cDNA_dir + 'tmp/')) 109 | params: 110 | index = lambda wc : get_star_index(wc)['index'], 111 | annotation = lambda wc: get_species_genome_annotation(wc)['annotation'], 112 | star_prefix = lr_cDNA_dir + 'tmp/', 113 | threads: 64 114 | shell: 115 | """ 116 | mkdir -p {params.star_prefix} 117 | STARlong \ 118 | --runThreadN {threads} \ 119 | --genomeDir {params.index} \ 120 | --genomeLoad NoSharedMemory \ 121 | --readFilesIn {input} \ 122 | --readFilesType Fastx \ 123 | --outSAMtype BAM Unsorted \ 124 | --outSAMunmapped Within \ 125 | --outSAMattributes All \ 126 | --outSAMprimaryFlag AllBestScore \ 127 | --outStd BAM_Unsorted \ 128 | --outFilterMultimapScoreRange 2 \ 129 | --outFilterScoreMin 0 \ 130 | --outFilterScoreMinOverLread 0 \ 131 | --outFilterMatchNminOverLread 0 \ 132 | --outFilterMatchNmin 30 \ 133 | --outFilterMismatchNmax 1000 \ 134 | --winAnchorMultimapNmax 200 \ 135 | --seedSearchStartLmax 12 \ 136 | --seedPerReadNmax 100000 \ 137 | --seedPerWindowNmax 100 \ 138 | --alignTranscriptsPerReadNmax 100000 \ 139 | --alignTranscriptsPerWindowNmax 10000 \ 140 | --outFileNamePrefix {output.tmp} | \ 141 | {dropseq_tools}/TagReadWithGeneFunction \ 142 | I=/dev/stdin \ 143 | O={output.bam} \ 144 | ANNOTATIONS_FILE={params.annotation} 145 | """ 146 | 147 | rule cmd_alnstats: 148 | input: 149 | rules.map_cDNA.output.bam 150 | output: 151 | oligo_csv=lr_cDNA_oligo_analysis, 152 | params: 153 | out = lambda wc: lr_cDNA_dir.format(**wc), 154 | shell: 155 | "alnstats --parse-oligos --out-csv={params.out} --out-pdf={params.out} --out-png={params.out} {input}" 156 | 157 | rule cmd_overview: 158 | input: 159 | reports=lambda wc: LR_REPORT_STATS 
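    # LR_REPORT_STATS is populated at parse time by get_longread_output()
    # above (registered via register_module_output_hook), so this rule only
    # sees report.tsv paths for samples that actually have long reads.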
160 | output: 161 | pdf=lr_overview_pdf, 162 | csv=lr_overview_csv, 163 | params: 164 | out_path=lambda wc: lr_overview_dir.format(**wc), 165 | args="" 166 | shell: longread_cmd + " overview --output {params.out_path} {input.reports} " 167 | 168 | rule cmd_report: 169 | input: 170 | stats=lr_stats 171 | output: 172 | donuts=lr_report, 173 | repstats=lr_report_stats 174 | params: 175 | args=get_args 176 | threads: 1 177 | shell: longread_cmd + " report" 178 | 179 | rule cmd_extract: 180 | input: 181 | fname = lambda wc: LR_RAW_FILES[wc.sample_id], 182 | ann = lr_ann 183 | output: lr_cDNA 184 | params: 185 | args=get_args 186 | log: lr_cDNA_log 187 | # params: 188 | # known_barcodes = lambda wc: known_barcodes.get(wc.name,"") 189 | shell: longread_cmd + " extract {input.fname} 2> {log} > {output}" 190 | 191 | rule cmd_edits: 192 | input: 193 | fname = lambda wc: LR_RAW_FILES[wc.sample_id], 194 | stats = lr_stats 195 | output: lr_edits 196 | params: 197 | args=get_args 198 | threads: 1 199 | shell: longread_cmd + " edits {input.fname}" 200 | 201 | rule cmd_annotate: 202 | input: 203 | fname = lambda wc: LR_RAW_FILES[wc.sample_id], 204 | ann = lr_ann 205 | output: lr_stats 206 | params: 207 | args=get_args 208 | threads: 1 209 | shell: longread_cmd + " annotate {input.fname}" 210 | 211 | rule cmd_align: 212 | input: 213 | fname = lambda wc: LR_RAW_FILES[wc.sample_id] 214 | output: lr_ann 215 | params: 216 | args=get_args 217 | threads: 64 218 | shell: longread_cmd + " align {input.fname}" 219 | -------------------------------------------------------------------------------- /spacemake/snakemake/merge_samples.smk: -------------------------------------------------------------------------------- 1 | final_merged_bam = complete_data_root + final_bam_suffix + '.merged.bam' 2 | merged_ribo_depletion_log = complete_data_root + '/ribo_depletion_log.merged.txt' 3 | merged_star_log_file = complete_data_root + '/star.merged.Log.final.out' 4 | 5 | rule create_final_merged_bam: 6 | input: 7 | unpack(get_files_to_merge_snakemake(final_bam)) 8 | output: 9 | final_merged_bam 10 | threads: 4 11 | shell: 12 | "samtools merge -n -@ {threads} -o {output} {input}" 13 | 14 | rule create_merged_ribo_log: 15 | input: 16 | unpack(get_files_to_merge_snakemake(ribo_depletion_log)) 17 | output: 18 | merged_ribo_depletion_log 19 | shell: 20 | "cat {input} > {output}" 21 | 22 | rule create_merged_star_log: 23 | input: 24 | unpack(get_files_to_merge_snakemake(star_log_file)) 25 | output: 26 | merged_star_log_file 27 | run: 28 | logs = [] 29 | for f in input: 30 | with open(f, 'r') as fi: 31 | logs = logs + [fi.read().splitlines()] 32 | 33 | indices_to_save = [5, 8, 23, 10, 30] 34 | value_dict = {ix: 0 for ix in indices_to_save} 35 | indices_to_normalise = [10] 36 | 37 | # extract info from all logfiles, and add them up 38 | # we are only interested in lines 5, 8, 23, 10, 30 39 | # so: inp_reads, uniq_mapped_reads, avg_mapped_length, 40 | # multi_mapped_reads, unmapped_too_short 41 | for l in logs: 42 | for ix in value_dict.keys(): 43 | value_dict[ix] = value_dict[ix] + float(l[ix].split('\t')[1]) 44 | 45 | for ix in indices_to_normalise: 46 | value_dict[ix] = value_dict[ix] / len(logs) 47 | 48 | # print to output 49 | with open(output[0], 'w') as fo: 50 | ix = 0 51 | for line in logs[0]: 52 | entry = line.split('\t') 53 | if ix in value_dict.keys(): 54 | fo.write('%s\t%s\n' % (entry[0], value_dict[ix])) 55 | else: 56 | fo.write('%s\t%s\n' % (entry[0], 'NA')) 57 | ix = ix + 1 58 | 
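# A minimal sketch (not part of the pipeline) of the aggregation done in the
# run: block above, on two hypothetical parsed STAR logs: additive metrics are
# summed across logs, while averaged metrics (index 10, the average mapped
# length) are divided by the number of logs.
#
#   logs = [{5: 1000.0, 10: 98.0}, {5: 3000.0, 10: 96.0}]
#   merged = {ix: sum(log[ix] for log in logs) for ix in (5, 10)}
#   merged[10] = merged[10] / len(logs)
#   assert merged == {5: 4000.0, 10: 97.0}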
-------------------------------------------------------------------------------- /spacemake/snakemake/scripts/.gitignore: --------------------------------------------------------------------------------
1 | qc_sequencing_create_sheet_cache
2 | qc_sequencing_create_sheet_files
3 | automated_analysis_create_report_files
4 | automated_analysis_create_report_cache
5 | *.html
6 | .ipynb_checkpoints
7 | *.ipynb
8 | -------------------------------------------------------------------------------- /spacemake/snakemake/scripts/automated_analysis.py: --------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 | import scanpy as sc
4 | import squidpy as sq
5 |
6 | from spacemake.spatial.util import detect_tissue
7 |
8 | # expect a filtered .h5ad dge, with spatial coords attached and tissue detected
9 | adata = sc.read_h5ad(snakemake.input[0])
10 | umi_cutoff = int(snakemake.wildcards['umi_cutoff'])
11 |
12 | # filter by umi or detect tissue
13 | # if the data is spatial and detect_tissue=True
14 | if 'spatial' in adata.obsm.keys() and snakemake.params['run_mode_variables']['detect_tissue']:
15 |     adata = detect_tissue(adata, umi_cutoff)
16 |     print('tissue detection')
17 | else:
18 |     print(f'filtering by umi cutoff: {umi_cutoff}')
19 |     adata = adata[adata.obs.total_counts > umi_cutoff, :]
20 |
21 | # make the var indices (gene names) and obs indices (cell barcodes) unique
22 | adata.obs_names_make_unique()
23 | adata.var_names_make_unique()
24 |
25 | # save the raw counts
26 | adata.raw = adata
27 |
28 | # identify highly variable genes if we have any observations
29 | nrow, ncol = adata.shape
30 |
31 | # require more than 100 cells and at least 1000 detected genes in the sample
32 | if nrow > 100 and ncol >= 1000:
33 |     print('starting analysis')
34 |     try:
35 |         sc.pp.highly_variable_genes(adata, flavor='seurat_v3', n_top_genes=2000)
36 |     except ValueError:
37 |         sc.pp.highly_variable_genes(adata, flavor='seurat_v3', n_top_genes=1000, span = 1)
38 |
39 |     # calculate log(cpm)
40 |     print('normalising and log-scaling')
41 |     sc.pp.normalize_total(adata, target_sum=1e4)
42 |     sc.pp.log1p(adata, base=2)
43 |
44 |     # PCA ANALYSIS
45 |     print('calculating pca components')
46 |     sc.tl.pca(adata, svd_solver='arpack')
47 |
48 |     # get the number of PCs identified. This can be smaller than 50
49 |     # if fewer than 50 cells pass the threshold
50 |     n_pcs = adata.uns['pca']['variance'].size
51 |     # limit the number of PCs to 40
52 |     n_pcs = n_pcs if n_pcs < 40 else 40
53 |
54 |     # Compute the neighborhood graph
55 |     print('computing neighborhood graph')
56 |     sc.pp.neighbors(adata, n_pcs=n_pcs)
57 |
58 |     # compute UMAP
59 |     # for a very low number of cells, scanpy will throw an error here
60 |     try:
61 |         print('dimensionality reduction')
62 |         sc.tl.umap(adata)
63 |     except TypeError:
64 |         pass
65 |
66 |     # find the clusters,
67 |     # at several leiden resolutions
68 |     resolution = [0.4, 0.6, 0.8, 1.0, 1.2]
69 |
70 |     print('clustering')
71 |
72 |
73 |     if snakemake.params['is_spatial']:
74 |         sq.gr.spatial_neighbors(adata, coord_type="generic")
75 |
76 |     for res in resolution:
77 |         try:
78 |             res_key = 'leiden_' + str(res)
79 |
80 |             sc.tl.leiden(adata, resolution = res, key_added = res_key)
81 |
82 |             # finding marker genes
83 |             print(f'ranking genes for resolution {res}')
84 |             sc.tl.rank_genes_groups(adata, res_key, method='t-test', key_added = 'rank_genes_groups_' + res_key, pts=True,
85 |                 use_raw = False)
86 |             if snakemake.params['is_spatial']:
87 |                 # calculate nhood enrichment from squidpy
88 |                 try:
89 |                     sq.gr.nhood_enrichment(adata, cluster_key=res_key)
90 |                 except ValueError:
91 |                     print('Only one cluster found in the data - skipping neighborhood analysis')
92 |                     pass
93 |         except ZeroDivisionError:
94 |             pass
95 |
96 | adata.write(snakemake.output[0])
97 | -------------------------------------------------------------------------------- /spacemake/snakemake/scripts/automated_analysis_create_processed_data_files.py: --------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 | import scanpy as sc
4 |
5 | # save expression values as a long_df
6 | def create_long_df(expr_matrix, id_vars = ['cell_bc']):
7 |     long_df = expr_matrix.melt(id_vars = id_vars, var_name = 'gene', value_name = 'expr')
8 |     long_df = long_df[long_df.expr > 0]
9 |     return long_df
10 |
11 | ##############
12 | # LOAD ADATA #
13 | ##############
14 |
15 | adata = sc.read(snakemake.input[0])
16 |
17 | uns_keys = ['hvg', 'leiden', 'log1p', 'neighbors', 'pca', 'umap']
18 |
19 | # all of the keys have to be in adata.uns
20 | adata_complete = all([key in adata.uns.keys() for key in uns_keys])
21 |
22 | #################
23 | # TOP20 markers #
24 | #################
25 | if not adata_complete:
26 |     pd.DataFrame().to_csv(snakemake.output['cluster_markers'])
27 |     pd.DataFrame().to_csv(snakemake.output['nhood_enrichment'])
28 | else:
29 |     res_keys = adata.obs.columns[adata.obs.columns.str.startswith('leiden_')]
30 |
31 |     top_20_marker_dfs = []
32 |     nhood_enrichment_dfs = []
33 |
34 |     # Iterate over different resolution values
35 |     for res_key in res_keys:
36 |         rank_key = 'rank_genes_groups_' + res_key
37 |
38 |         if 'names' not in adata.uns[rank_key]:
39 |             continue
40 |
41 |         df = pd.DataFrame(adata.uns[rank_key]['names'])\
42 |             .melt(var_name = 'cluster', value_name = 'gene')
43 |
44 |         for key in ['logfoldchanges', 'pvals', 'pvals_adj']:
45 |             df_key = pd.DataFrame(adata.uns[rank_key][key])\
46 |                 .melt(var_name = 'cluster', value_name = key)
47 |             df[key] = df_key[key]
48 |         # set the index to the gene-cluster pair
49 |
50 |         df.set_index(['gene', 'cluster'], inplace=True)
51 |
52 |         for key in ['pts', 'pts_rest']:
53 |             # get the percentage expressed in cluster and rest
54 |             df2 = adata.uns[rank_key][key]
55 |             df2['gene'] = df2.index
56 |             df2 =
df2.melt(var_name='cluster', id_vars='gene')\ 57 | .set_index(['gene', 'cluster']) 58 | 59 | df[key] = df2.loc[df.index].value 60 | 61 | df['resolution'] = res_key.split('_')[1] 62 | df.reset_index(inplace=True) 63 | 64 | # Restrict to top X markers 65 | df = df.groupby("cluster").head(20) 66 | 67 | top_20_marker_dfs.append(df) 68 | 69 | if snakemake.params['is_spatial']: 70 | try: 71 | # get nhood data 72 | df = pd.DataFrame(adata.uns[f'{res_key}_nhood_enrichment']['zscore']) 73 | df = pd.melt(df.reset_index(), id_vars='index')\ 74 | .rename(columns={'index': 'cluster_a', 75 | 'variable': 'cluster_b', 76 | 'value': 'zscore'}) 77 | df['resolution'] = res_key.split('_')[1] 78 | 79 | nhood_enrichment_dfs.append(df) 80 | except KeyError: 81 | pass 82 | 83 | pd.concat(top_20_marker_dfs).to_csv(snakemake.output['cluster_markers'], index=False) 84 | 85 | if snakemake.params['is_spatial']: 86 | pd.concat(nhood_enrichment_dfs).to_csv(snakemake.output['nhood_enrichment'], index=False) 87 | else: 88 | # output empty csv file 89 | pd.DataFrame().to_csv(snakemake.output['nhood_enrichment']) 90 | 91 | 92 | ############### 93 | # SAVE OBS DF # 94 | ############### 95 | obs_df = adata.obs 96 | 97 | if adata_complete: 98 | obs_df = sc.get.obs_df(adata, obsm_keys=[('X_umap', 0), ('X_umap', 1)])\ 99 | .join(obs_df)\ 100 | .rename(columns={'X_umap-0':'umap_0', 'X_umap-1':'umap_1'}) 101 | 102 | obs_df.index.set_names('cell_bc', inplace=True) 103 | 104 | obs_df.to_csv(snakemake.output['obs_df']) 105 | 106 | ############### 107 | # SAVE VAR DF # 108 | ############### 109 | adata.var.index.set_names('gene_name', inplace=True) 110 | 111 | adata.var.to_csv(snakemake.output['var_df']) 112 | -------------------------------------------------------------------------------- /spacemake/snakemake/scripts/clean_top_barcodes.py: -------------------------------------------------------------------------------- 1 | import pysam 2 | import difflib 3 | 4 | # we need to reverse it 5 | optical_primer = 'GAATCACGATACGTACACCA'[::-1] 6 | optical_primer_len = len(optical_primer) 7 | 8 | nucl_stretches = ['TTTTTT', 'AAAAAAAA', 'CCCCCCCC', 'GGGGGGGG'] 9 | 10 | with open(snakemake.input[0], 'r') as fi, open(snakemake.output[0], 'w') as fo: 11 | for barcode in fi: 12 | barcode = barcode.strip() 13 | barcode_len = len(barcode) 14 | 15 | # clean up TAG=XC artifact 16 | if barcode == 'TAG=XC': 17 | continue 18 | 19 | matcher = difflib.SequenceMatcher(None, optical_primer, barcode) 20 | 21 | pos_optical_primer, pos_barcode, kmer_len = matcher.find_longest_match(0, optical_primer_len, 0, barcode_len) 22 | 23 | # if overlap with barcode is bigger than 4, and the overlap is at the end, skip 24 | if kmer_len > 3 and pos_barcode + kmer_len == barcode_len: 25 | continue 26 | 27 | # if overlap at least 7, anywhere, skip 28 | if kmer_len > 6: 29 | continue 30 | 31 | # if any of the nucl stretches is in the barcode, skip 32 | if any([stretch in barcode for stretch in nucl_stretches]): 33 | continue 34 | 35 | # write line to file 36 | _ = fo.write(barcode + '\n') 37 | -------------------------------------------------------------------------------- /spacemake/snakemake/scripts/create_sample_db.R: -------------------------------------------------------------------------------- 1 | library(tidyverse) 2 | library(magrittr) 3 | 4 | metadata <- read_csv(snakemake@input[[1]]) 5 | 6 | 7 | readStarLog <- function(log_file){ 8 | 9 | out = list() 10 | lines = readLines(log_file) 11 | 12 | out$input_reads = (lines[6] %>% strsplit('\t') %>% unlist)[2] %>% 
as.integer 13 | 14 | out$uniq_mapped_reads = (lines[9] %>% strsplit('\t') %>% unlist)[2] %>% as.integer 15 | 16 | #out$avg_length = (lines[11] %>% strsplit('\t') %>% unlist)[2] %>% as.numeric 17 | 18 | tibble(observation=names(out), value=unlist(unname(out))) 19 | } 20 | 21 | read_metrics <- metadata %>% 22 | select(project_id, sample_id, puck_id, species, sequencing_date) %>% 23 | mutate(star_log = paste0('/data/rajewsky/projects/slide_seq/projects/', project_id, '/processed_data/', sample_id, '/illumina/complete_data/star_Log.final.out'), 24 | read_types =paste0('/data/rajewsky/projects/slide_seq/projects/', project_id, '/processed_data/', sample_id, '/illumina/complete_data/split_reads/read_type_num.txt')) %>% 25 | 26 | filter(file.exists(star_log), file.exists(read_types)) %>% 27 | mutate(star_log = map(star_log, 28 | ~ readStarLog(.))) %>% 29 | unnest(star_log) %>% 30 | mutate(read_types = map(read_types, 31 | ~ read_table2(., col_names=c('rt_obs', 'rt_value')))) %>% 32 | unnest(read_types) %>% 33 | mutate(rt_obs = tolower(rt_obs)) %>% 34 | spread(rt_obs, rt_value) %>% 35 | spread(observation, value) 36 | 37 | read_metrics %>% 38 | write_delim(snakemake@output[[1]], '\t') 39 | -------------------------------------------------------------------------------- /spacemake/snakemake/scripts/create_sample_overview.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | header-includes: 3 | - \usepackage{float} 4 | - \usepackage[table]{xcolor} 5 | output: 6 | html_document: 7 | toc: true 8 | toc_depth: 6 9 | classoption: landscape 10 | geometry: margin=0.5cm 11 | version: 0.1.1 12 | author: Tamas Ryszard Sztanka-Toth, Nikolaos Karaiskos 13 | email: tamasryszard.sztanka-toth@mdc-berlin.de, nikolaos.karaiskos@mdc.berlin.de 14 | license: GPL 15 | title: Sample overview 16 | pagetitle: Sample overview 17 | date: "`r format(Sys.time(),'%d/%m/%y')`" 18 | --- 19 | 20 | ```{r knitr_options, include=FALSE, cache=FALSE} 21 | knitr::opts_chunk$set( 22 | cache = F, 23 | autodep = TRUE, 24 | message = FALSE, 25 | warning = FALSE, 26 | comment = NA 27 | ) 28 | 29 | options(knitr.table.format ='markdown') 30 | ``` 31 | 32 | ```{r functions, echo = F} 33 | readStarLog <- function(log_file){ 34 | 35 | out = list() 36 | lines = readLines(log_file) 37 | 38 | out$input_reads = (lines[6] %>% strsplit('\t') %>% unlist)[2] %>% as.integer 39 | 40 | out$uniq_mapped_reads = (lines[9] %>% strsplit('\t') %>% unlist)[2] %>% as.integer 41 | 42 | #out$avg_length = (lines[11] %>% strsplit('\t') %>% unlist)[2] %>% as.numeric 43 | 44 | tibble(observation=names(out), value=unlist(unname(out))) 45 | } 46 | ``` 47 | 48 | ```{r load_projects_puck_info, echo=F} 49 | library(tidyverse) 50 | library(magrittr) 51 | metadata <- read_csv(snakemake@input[[1]]) 52 | ``` 53 | 54 | ```{r collect_data, echo = F} 55 | root_dir <- snakemake@config$root_dir 56 | read_metrics <- metadata %>% 57 | select(project_id, sample_id, puck_id, species, sequencing_date) %>% 58 | mutate(star_log = paste0(root_dir, '/projects/', project_id, '/processed_data/', sample_id, '/illumina/complete_data/star_Log.final.out'), 59 | read_types =paste0(root_dir,'/projects/', project_id, '/processed_data/', sample_id, '/illumina/complete_data/split_reads/read_type_num.txt')) %>% 60 | mutate(star_log = map(star_log, 61 | ~ readStarLog(.))) %>% 62 | unnest(star_log) %>% 63 | mutate(read_types = map(read_types, 64 | ~ read_table2(., col_names=c('rt_obs', 'rt_value')))) %>% 65 | unnest(read_types) %>% 66 | mutate(rt_obs = 
tolower(rt_obs)) %>% 67 | spread(rt_obs, rt_value) %>% 68 | spread(observation, value) 69 | ``` 70 | 71 | ```{r show_sample_table, echo = F} 72 | library(kableExtra) 73 | to_table <- read_metrics %>% 74 | mutate(um_r = uniq_mapped_reads) %>% 75 | gather('obs', 'val', intergenic, amb, coding, intronic, utr) %>% 76 | mutate(val_p = round(val / um_r, 2), 77 | val = round(val / 1e6, 2), 78 | # add ratio in paranthesis if obs is not cds 79 | val = paste0(val, ' (', val_p, ')'), 80 | uniq_mapped_reads = round(uniq_mapped_reads / 1e6, 2), 81 | input_reads = round(input_reads / 1e6, 2), 82 | uniq_mapped_reads = paste0(uniq_mapped_reads, ' (', round(uniq_mapped_reads / input_reads, 2), ')')) %>% 83 | select(-um_r, -val_p) %>% 84 | spread(obs, val) %>% 85 | arrange(species) %>% 86 | select(sample_id, puck_id, species, sequencing_date, input_reads, uniq_mapped_reads, coding, utr, intergenic, intronic, amb) %>% 87 | rename(uniq_m = uniq_mapped_reads, 88 | input_r = input_reads, 89 | cds = coding) 90 | ``` 91 | 92 | ```{r load_strand_info, echo = F} 93 | strand_info <- metadata %>% 94 | select(project_id, sample_id, puck_id, species, sequencing_date) %>% 95 | mutate(filename = paste0(root_dir, '/projects/', project_id, '/processed_data/', sample_id, '/illumina/complete_data/split_reads/strand_type_num.txt'), 96 | content = map(filename, ~read_table2(., col_names = c('obs', 'num')))) %>% 97 | unnest(content) %>% 98 | select(-filename, project_id) %>% 99 | group_by(sample_id) %>% 100 | mutate(num_sum = sum(num), 101 | num_ratio = round(num / num_sum, 2), 102 | num = round(num / 1e6, 2), 103 | num = paste0(num, ' (', num_ratio, ')')) %>% 104 | select(-num_ratio, -num_sum) %>% 105 | spread(obs, num) 106 | ``` 107 | 108 | ```{r load_barcode_metadata, echo = F} 109 | umi_cutoffs <- c(1, 10, 50, 100) 110 | 111 | load_filter_dge <- function(x, y){ 112 | read_table2(x, skip=6) %>% 113 | filter(NUM_TRANSCRIPTS > y) 114 | } 115 | 116 | read_dge_summary <- function(filename){ 117 | tibble(umi_cutoff = umi_cutoffs, filename=filename) %>% 118 | mutate(dat = map2(filename, umi_cutoff, load_filter_dge)) %>% 119 | select(-filename) %>% 120 | unnest(dat) %>% 121 | group_by(umi_cutoff) %>% 122 | summarise( 123 | median_umi = median(NUM_TRANSCRIPTS), 124 | median_reads = median(NUM_GENIC_READS), 125 | median_genes = median(NUM_GENES), 126 | median_pcr = median(round(NUM_GENIC_READS / NUM_TRANSCRIPTS, 1)), 127 | mean_umi = as.integer(mean(NUM_TRANSCRIPTS)), 128 | mean_reads = as.integer(mean(NUM_GENIC_READS)), 129 | mean_genes = as.integer(mean(NUM_GENES)), 130 | num_beads = n()) 131 | 132 | } 133 | 134 | barcode_metadata <- metadata %>% 135 | select(project_id, sample_id, puck_id, species, sequencing_date) %>% 136 | mutate(filename = paste0(root_dir, '/projects/', project_id, '/processed_data/', sample_id, '/illumina/complete_data/dge/')) %>% 137 | mutate(filename = ifelse(file.exists(paste0(filename, 'dge_all_summary.txt')), 138 | paste0(filename, 'dge_all_summary.txt'), 139 | paste0(filename, 'dge_all_cleaned_summary.txt')), 140 | content = map(filename, ~read_dge_summary(.))) %>% 141 | select(-filename, -project_id) %>% 142 | unnest(content) 143 | ``` 144 | 145 | ## Overview 146 | 147 | We show here downstream metadata for each experiment performed in the sts project. There are three types of tables: 148 | 149 | * Read information table: containing the parsed output of mapping, such as input read number, uniquely mapped read number etc. 
150 | * Expression summary table: containing median number of umis, genes, reads (and mean) per bead for each sample. This is done after applying a UMI filter of 1, 10, 50, 100. 151 | * Strand information table: containing the numbers for reads mapping to the correct strand 152 | 153 | Each table has the following 4 columns: sample\_id, puck\_id, species, sequencing\_date 154 | 155 | ### Table column description 156 | 157 | __Read information table__ 158 | 159 | * input\_r: number of input reads (millions) from the flowcell 160 | * uniq\_m: number of uniquely mapped reads (millions). In parantheses ratio to input\_r 161 | * cds, utr, intergenic, intronic, amb: coding, utr, intergenic, intronic and ambient (overlapping genes on both strands, or cannot be assigned to a single gene part). In millions, in parantheses ratio to uniq\_m. 162 | 163 | __Expression summary tables__ 164 | 165 | All columns here are in raw counts. We have mean and median for UMIs, genes, reads (all per bead). Median pcr is the median of reads/umi (per bead). 166 | 167 | __Strand information table__ 168 | 169 | Here there are 6 columns: minus\_AMB, minus\_minus, minus\_plus, plus\_AMB, plus\_minus, plus\_plus. The first part is the position of the read (plus or minus strand) the second is the position of the mapped gene. AMB means that the mapped gene is ambient (overlapping genes on different strand) or that the read is intergenic. 170 | 171 | ## Tables by species containing sequencing metadata 172 | 173 | 174 | ```{r print_by_species, echo = F, results = 'asis'} 175 | for(s in unique(to_table$species)){ 176 | cat(paste0('### ', s, ' samples')) 177 | cat('\n') 178 | 179 | cat('#### Read information table\n') 180 | to_table %>% 181 | filter(species == s) %>% 182 | kable("html") %>% 183 | kable_styling('striped', font_size=12) %>% 184 | row_spec(row=0, bold=T) %>% 185 | print 186 | 187 | cat('\n') 188 | cat('[Back to top](#)\n\n') 189 | 190 | cat('#### Expression summary tables\n') 191 | 192 | for(cutoff in umi_cutoffs){ 193 | cat(paste0('##### UMI cutoff: ', cutoff)) 194 | cat('\n') 195 | 196 | barcode_metadata %>% 197 | filter(species == s, umi_cutoff == cutoff) %>% 198 | kable("html") %>% 199 | kable_styling('striped', font_size=12) %>% 200 | row_spec(row=0, bold=T) %>% 201 | print 202 | 203 | cat('\n') 204 | cat('[Back to top](#)\n\n') 205 | 206 | } 207 | 208 | cat('#### Strand information table\n') 209 | 210 | strand_info %>% 211 | filter(species == s) %>% 212 | kable("html") %>% 213 | kable_styling('striped', font_size=12) %>% 214 | row_spec(row=0, bold=T) %>% 215 | print 216 | 217 | cat('\n') 218 | cat('[Back to top](#)\n\n') 219 | } 220 | ``` 221 | 222 | 223 | -------------------------------------------------------------------------------- /spacemake/snakemake/scripts/create_spatial_dge.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import scanpy as sc 4 | from spacemake.util import detect_tissue, attach_barcode_file 5 | 6 | dge_path = snakemake.input['dge'] 7 | 8 | # umi cutoff 9 | umi_cutoff = int(snakemake.wildcards['umi_cutoff']) 10 | 11 | adata = sc.read_h5ad(dge_path) 12 | 13 | print('data read') 14 | 15 | has_barcode_file = 'barcode_file' in snakemake.input.keys() 16 | 17 | # ATTACH BARCODE FILE # 18 | if has_barcode_file: 19 | adata = attach_barcode_file(adata, snakemake.input['barcode_file']) 20 | 21 | # filter out cells based on umi, and genes based on number of cells 22 | sc.pp.filter_cells(adata, min_genes=1) 23 
| sc.pp.filter_genes(adata, min_cells=3) 24 | 25 | print('data filtered') 26 | 27 | # DETECT TISSUE # 28 | # if there is no barcode file, filter adata based on UMI, otherwise detect tissue with UMI cutoff 29 | if has_barcode_file and snakemake.params['downstream_variables']['detect_tissue']: 30 | tissue_indices = detect_tissue(adata, umi_cutoff) 31 | adata = adata[tissue_indices, :] 32 | else: 33 | adata = adata[adata.obs.total_counts > umi_cutoff, :] 34 | 35 | adata.write(snakemake.output[0]) 36 | -------------------------------------------------------------------------------- /spacemake/snakemake/scripts/filter_mm_reads.py: -------------------------------------------------------------------------------- 1 | import pysam 2 | import datetime 3 | import argparse 4 | import numpy as np 5 | 6 | counted_regions = ['UTR', 'CODING'] 7 | 8 | def select_alignment(alignments): 9 | read_names = [aln.query_name for aln in alignments] 10 | if read_names.count(read_names[0]) != len(read_names): 11 | print(read_names) 12 | raise Exception(f'input alignments do not come from the same read') 13 | 14 | def is_exonic(aln): 15 | if not aln.has_tag('XF'): 16 | return False 17 | 18 | return aln.get_tag('XF') in counted_regions 19 | 20 | alignments_are_exonic = np.array([is_exonic(aln) for aln in alignments]) 21 | 22 | exonic_ix = np.where(alignments_are_exonic == True)[0] 23 | 24 | num_exonic = exonic_ix.shape[0] 25 | 26 | if num_exonic == 1: 27 | # if only one exonic reads from the group 28 | # return the exonic indices 29 | return alignments[exonic_ix[0]] 30 | else: 31 | return None 32 | 33 | if __name__ == '__main__': 34 | parser = argparse.ArgumentParser(description='Filter out ambiguous multi-mapper reads') 35 | 36 | parser.add_argument('--in-bam', help='input bam') 37 | parser.add_argument('--out-bam', help='output bam') 38 | 39 | args = parser.parse_args() 40 | print(args) 41 | 42 | bam_in = pysam.AlignmentFile(args.in_bam, "rb") 43 | 44 | bam_out = pysam.AlignmentFile(args.out_bam, 'wb', header= bam_in.header) 45 | counter = 0 46 | start_time = datetime.datetime.now() 47 | finish_time = start_time 48 | total_start_time = datetime.datetime.now() 49 | time_interval = 30 50 | 51 | multi_mappers = [] 52 | 53 | for aln in bam_in.fetch(until_eof=True): 54 | counter += 1 55 | 56 | finish_time = datetime.datetime.now() 57 | delta_seconds = (finish_time - start_time).seconds 58 | total_elapsed_seconds = (finish_time - total_start_time).total_seconds() 59 | 60 | if delta_seconds >= time_interval: 61 | formatted_time = finish_time.strftime('%Y-%m-%d %H:%M:%S') 62 | records_per_second = counter / delta_seconds 63 | 64 | print(f'Processed {counter:,} records in {total_elapsed_seconds:,.0f} seconds. Average processing rate: {records_per_second:,.0f} records/second. Current time: {formatted_time}') 65 | 66 | start_time = finish_time 67 | 68 | mapped_number = aln.get_tag('NH') 69 | 70 | if mapped_number == 1: 71 | bam_out.write(aln) 72 | else: 73 | if len(multi_mappers) < (mapped_number - 1): 74 | # still some multimappers missing. 
we need to add the alignments 75 | # until the last one to the list 76 | multi_mappers.append(aln) 77 | else: 78 | # add the last alignment 79 | multi_mappers.append(aln) 80 | # decide which, if any, to keep 81 | aln_to_keep = select_alignment(multi_mappers) 82 | 83 | if aln_to_keep is not None: 84 | # set aln secondary flag to 0, so that it is flagged as primary 85 | # secondary flag is at 0x100, so 8th bit (starting from 0) 86 | aln_to_keep.flag = aln_to_keep.flag & ~(1<<8) 87 | bam_out.write(aln_to_keep) 88 | 89 | # reset multimapper list 90 | multi_mappers = [] 91 | 92 | formatted_time = finish_time.strftime("%Y-%m-%d %H:%M:%S") 93 | print(f'Finished processing {counter:,} records in {total_elapsed_seconds:,.0f} seconds. Current time: {formatted_time}') 94 | -------------------------------------------------------------------------------- /spacemake/snakemake/scripts/fix_bam_header.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | __version__ = "0.9" 3 | __author__ = [ 4 | "Marvin Jens", 5 | ] 6 | __license__ = "GPL" 7 | __email__ = [ 8 | "marvin.jens@mdc-berlin.de", 9 | ] 10 | 11 | import pysam 12 | import argparse 13 | import os 14 | import sys 15 | import logging 16 | 17 | 18 | def print_header(header): 19 | for k, v in sorted(header.items()): 20 | if type(v) == dict: 21 | vstr = " ".join([f"{x}:{y}" for x, y in sorted(v.items())]) 22 | print(f"@{k}:\t{vstr}") 23 | elif type(v) == list: 24 | for row in v: 25 | if type(row) == dict: 26 | vstr = " ".join([f"{x}:{y}" for x, y in sorted(row.items())]) 27 | else: 28 | vstr = str(row) 29 | print(f"@{k}:\t{vstr}") 30 | else: 31 | print(f"@{k}:\t{v}") 32 | 33 | 34 | def merge_headers(orig, star): 35 | merged = dict(orig) 36 | # most recent program should be on top 37 | merged["PG"] = star["PG"] + merged["PG"] 38 | merged["SQ"] = star["SQ"] 39 | merged["HD"]["SO"] = star["HD"]["SO"] # sorted by 40 | 41 | return merged 42 | 43 | 44 | if __name__ == "__main__": 45 | parser = argparse.ArgumentParser( 46 | description="Fix .bam header of the STAR mapped output .bam" 47 | ) 48 | 49 | parser.add_argument("--in-bam-star", help="mapped star bam input") 50 | parser.add_argument("--in-bam-tagged", help="unmapped dropseq tagged bam") 51 | parser.add_argument("--out-bam", help="output bam") 52 | 53 | args = parser.parse_args() 54 | 55 | bam_star = pysam.AlignmentFile(args.in_bam_star, "rb") 56 | bam_tagged = pysam.AlignmentFile(args.in_bam_tagged, "rb", check_sq=False) 57 | 58 | star_header = bam_star.header.to_dict() 59 | tagged_header = bam_tagged.header.to_dict() 60 | merged_header = merge_headers(tagged_header, star_header) 61 | # print(f"STAR header") 62 | # print_header(star_header) 63 | 64 | # print(f"original header") 65 | # print_header(tagged_header) 66 | 67 | # print("merged header") 68 | # print_header(merged_header) 69 | 70 | # copy input to output, just with the new header 71 | bam_out = pysam.AlignmentFile(args.out_bam, "wb", header=merged_header) 72 | for aln in bam_star.fetch(until_eof=True): 73 | bam_out.write(aln) 74 | -------------------------------------------------------------------------------- /spacemake/snakemake/scripts/kmer_stats_from_fastq.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import gzip 3 | import itertools 4 | import os 5 | from collections import Counter 6 | 7 | bases = ["A", "C", "T", "G", "N"] 8 | kmer_len = int(snakemake.params["kmer_len"]) 9 | 10 | # in this case we look for 4 
mers 11 | kmers = ["".join(kmer) for kmer in itertools.product(bases, repeat=kmer_len)] 12 | 13 | position_counts = None 14 | read_len = 0 15 | position_list = [] 16 | 17 | read_kmer_hashes = [] 18 | 19 | with gzip.open(snakemake.input[0], "rt") as fastq_in: 20 | line = 0 21 | for read in fastq_in: 22 | if line == 1: 23 | read = read.strip("\n") 24 | read_len = len(read) 25 | position_list = list(range(read_len - kmer_len + 1)) 26 | kmer_hashes = [ 27 | "_".join(prod) 28 | for prod in itertools.product(kmers, [str(x) for x in position_list]) 29 | ] 30 | position_counts = pd.DataFrame(0, index=kmer_hashes, columns=["count"]) 31 | 32 | # if line is a read 33 | if line % 4 == 1: 34 | kmer_hashes = kmer_hashes + [ 35 | str(read[i : i + kmer_len]) + "_" + str(i) for i in position_list 36 | ] 37 | 38 | line = line + 1 39 | if line % 4000 == 0: 40 | kmer_hash_counts = Counter(kmer_hashes) 41 | # print(kmer_hash_counts.values()) 42 | 43 | # update df 44 | position_counts.loc[kmer_hash_counts.keys(), "count"] = position_counts.loc[ 45 | kmer_hash_counts.keys(), "count" 46 | ] + list(kmer_hash_counts.values()) 47 | # print(position_counts) 48 | 49 | kmer_hashes = [] 50 | 51 | if line % 4000000 == 0: 52 | print("%s reads processed" % (line / 4)) 53 | 54 | position_counts.index.rename("kmer_hash", inplace=True) 55 | 56 | file_path = snakemake.output[0] 57 | os.makedirs(os.path.dirname(file_path), exist_ok=True) 58 | 59 | position_counts.to_csv(file_path) 60 | -------------------------------------------------------------------------------- /spacemake/snakemake/scripts/parse_ribo_log.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | def parse_ribo_log(ribo_log_file): 4 | # before the log, there can be some perl warnings prepended. so we need to find the 5 | # first line outputed by bwa mem 6 | input_reads = 0 7 | aligned_reads = 0 8 | 9 | # ribo log summary line: first line of the summary 10 | first_line_regex = r'^\d+ reads; of these:$' 11 | first_line_found = False 12 | 13 | line_n = 0 14 | 15 | with open(ribo_log_file) as f: 16 | for line in f: 17 | stripped_line = line.strip() 18 | 19 | if stripped_line == 'no_rRNA_index': 20 | input_reads = -1 21 | aligned_reads = -1 22 | break 23 | 24 | if not first_line_found: 25 | if re.match(first_line_regex, stripped_line) is not None: 26 | first_line_found = True 27 | line_n = 0 28 | else: 29 | # keep looking for first line 30 | continue 31 | 32 | if line_n == 0: 33 | input_reads = input_reads + int(stripped_line.split(' ')[0]) 34 | elif line_n == 3 or line_n == 4: 35 | aligned_reads = aligned_reads + int(stripped_line.split(' ')[0]) 36 | # reset after the fifth line, this is needed if there are several ribolog files 37 | # appended one after the other. 
this is the case for merged samples 38 | elif line_n == 5: 39 | first_line_found = False 40 | 41 | line_n = line_n + 1 42 | 43 | 44 | if input_reads <= 0: 45 | return (None, None) 46 | else: 47 | return (aligned_reads, input_reads) 48 | 49 | 50 | if snakemake.params.ribo_log == "no_rRNA_index": 51 | input_reads = -1 52 | aligned_reads = -1 53 | 54 | else: 55 | aligned_reads, input_reads = parse_ribo_log(snakemake.params.ribo_log) 56 | 57 | with open(snakemake.output[0], 'w') as fo: 58 | fo.write(f'aligned_reads\t{aligned_reads}\n') 59 | fo.write(f'input_reads\t{input_reads}\n') 60 | -------------------------------------------------------------------------------- /spacemake/snakemake/scripts/shared_functions.R: -------------------------------------------------------------------------------- 1 | readStarLog <- function(log_file){ 2 | 3 | out = list() 4 | lines = readLines(log_file) 5 | 6 | out$input_reads = (lines[6] %>% strsplit('\t') %>% unlist)[2] %>% as.integer 7 | 8 | out$uniq_mapped_reads = (lines[9] %>% strsplit('\t') %>% unlist)[2] %>% as.integer 9 | 10 | #out$avg_length = (lines[11] %>% strsplit('\t') %>% unlist)[2] %>% as.numeric 11 | 12 | tibble(observation=names(out), value=unlist(unname(out))) 13 | } 14 | -------------------------------------------------------------------------------- /spacemake/snakemake/scripts/splice_bam_header.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | __version__ = "0.9" 3 | __author__ = [ 4 | "Marvin Jens", 5 | ] 6 | __license__ = "GPL" 7 | __email__ = [ 8 | "marvin.jens@mdc-berlin.de", 9 | ] 10 | 11 | import pysam 12 | import argparse 13 | import os 14 | import sys 15 | import logging 16 | 17 | 18 | def print_header(header): 19 | for k, v in sorted(header.items()): 20 | if type(v) == dict: 21 | vstr = " ".join([f"{x}:{y}" for x, y in sorted(v.items())[::-1]]) 22 | print(f"@{k}:\t{vstr}") 23 | elif type(v) == list: 24 | for row in v: 25 | if type(row) == dict: 26 | vstr = " ".join([f"{x}:{y}" for x, y in sorted(row.items())[::-1]]) 27 | else: 28 | vstr = str(row) 29 | print(f"@{k}:\t{vstr}") 30 | else: 31 | print(f"@{k}:\t{v}") 32 | 33 | 34 | def unique_IDs(pg_list): 35 | from collections import defaultdict 36 | 37 | id_counts = defaultdict(int) 38 | 39 | # first, iterate over entire list and count how often each program ID is there. 40 | pp_list = [None] 41 | if len(pg_list) > 1: 42 | pp_list += pg_list[:-1] 43 | 44 | pg_new = [] 45 | for pg, pp in zip(pg_list, pp_list): 46 | name = pg["ID"].split(".")[0] 47 | # edit in-place 48 | id_counts[name] += 1 49 | pg["ID"] = f"{name}.{id_counts[name]}" 50 | # id_counts[name] -= 1 51 | 52 | if pp: 53 | pname = pp["ID"].split(".")[0] 54 | pg["PP"] = f"{pname}.{id_counts[pname]}" 55 | 56 | pg_new.append(pg) 57 | 58 | return pg_new 59 | 60 | 61 | def merge_headers(orig, other, enforce_RG=True): 62 | merged = dict(orig) 63 | # start from the original, including VN and RG entries... 
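    # For illustration (hypothetical entries): unique_IDs() rewrites a chain like
    #   [{'ID': 'STAR'}, {'ID': 'samtools'}]
    # into
    #   [{'ID': 'STAR.1'}, {'ID': 'samtools.1', 'PP': 'STAR.1'}]
    # i.e. program IDs get numeric suffixes and PP links are rewired to the new IDs.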
64 |     # connect the processing chains:
65 |     # most recent program should be on top
66 |     # Previous Program (PP) of the first new output was the last Program Name (PN) in the original uBAM
67 |     other["PG"][-1]["PP"] = orig["PG"][0]["ID"]
68 |     merged["PG"] = unique_IDs(merged["PG"] + other["PG"])
69 |
70 |     if "SO" in other["HD"]:
71 |         merged["HD"]["SO"] = other["HD"]["SO"]  # keep sort-order
72 |
73 |     # sequence identifiers should be absent from uBAM and at any rate are overwritten here
74 |     merged["SQ"] = other["SQ"]
75 |     if enforce_RG and ("RG" not in merged or len(merged["RG"]) == 0):
76 |         merged["RG"] = [{"ID": "A", "SM": "NA"}]  # RG entries form a list of dicts
77 |     # merged['HD']['SO'] = star['HD']['SO'] # sorted by
78 |
79 |     return merged
80 |
81 |
82 | if __name__ == "__main__":
83 |     parser = argparse.ArgumentParser(
84 |         description=(
85 |             "STAR and bowtie2 create a new header from scratch and ignore everything upstream. "
86 |             "This script fixes the .bam headers of such mapped output by splicing it together with "
87 |             "the original uBAM header."
88 |         )
89 |     )
90 |
91 |     parser.add_argument(
92 |         "--in-bam",
93 |         help="mapped star/bowtie2 bam input (default=/dev/stdin)",
94 |         default="/dev/stdin",
95 |     )
96 |     parser.add_argument("--in-ubam", help="unmapped dropseq tagged bam", required=True)
97 |     parser.add_argument(
98 |         "--out-bam",
99 |         help="fixed output bam (default=/dev/stdout)",
100 |         default="/dev/stdout",
101 |     )
102 |     parser.add_argument("--out-mode", help="mode for output (default=b0)", default="b0")
103 |
104 |     args = parser.parse_args()
105 |
106 |     mbam = pysam.AlignmentFile(args.in_bam, "rb")
107 |     ubam = pysam.AlignmentFile(args.in_ubam, "rb", check_sq=False)
108 |
109 |     mapped_header = mbam.header.to_dict()
110 |     ubam_header = ubam.header.to_dict()
111 |     merged_header = merge_headers(ubam_header, mapped_header)
112 |     # print(f"mapped BAM header")
113 |     # print_header(mapped_header)
114 |
115 |     # print(f"original uBAM header")
116 |     # print_header(ubam_header)
117 |
118 |     # print("merged header")
119 |     # print_header(merged_header)
120 |
121 |     # copy input to output, just with the new header
122 |     bam_out = pysam.AlignmentFile(
123 |         args.out_bam, f"w{args.out_mode}", header=merged_header
124 |     )
125 |     for aln in mbam.fetch(until_eof=True):
126 |         bam_out.write(aln)
127 | -------------------------------------------------------------------------------- /spacemake/snakemake/scripts/split_reads_by_strand_info.py: --------------------------------------------------------------------------------
1 | import pandas as pd
2 | import argparse
3 |
4 |
5 | parser = argparse.ArgumentParser(description='Split reads (SAM input) into .sam files by mapped read strand orientation')
6 | parser.add_argument('file_in', metavar = 'in', type=str)
7 | parser.add_argument('--prefix')
8 |
9 | args = parser.parse_args()
10 |
11 | prefix = args.prefix
12 |
13 | read_type_num = {'INTERGENIC':0, 'INTRONIC':0, 'CODING':0, 'UTR':0, 'AMB':0}
14 |
15 | strand_type_num = {
16 |     'minus_minus': 0,
17 |     'minus_plus': 0,
18 |     'plus_plus': 0,
19 |     'plus_minus': 0,
20 |     'plus_AMB': 0,
21 |     'minus_AMB': 0
22 | }
23 |
24 | out_file_names = {x: prefix + x + '.sam' for x in strand_type_num.keys()}
25 |
26 | out_files = {x: open(out_file_names[x], 'w') for x in out_file_names.keys()}
27 |
28 | def return_collapsed(it):
29 |     # the set has exactly 1 element, meaning that all elements of the list are the same
30 |     if len(set(it)) == 1:
31 |         return it[0]
32 |     else:
33 |         return 'AMB'
34 |
35 | with open(args.file_in, 'r') as fi:
36 |     for line in fi:
37 |         # if line is
a header line, copy it to all output files 38 | if line.startswith('@'): 39 | for f in out_files.values(): 40 | f.write(line) 41 | 42 | # go to next iteration 43 | continue 44 | 45 | line_stripped = line.strip() 46 | 47 | elements = line_stripped.split() 48 | 49 | last = elements[-1] 50 | 51 | read_overlaps_gene = False 52 | 53 | # set gene strand 54 | if last.startswith('gs'): 55 | # a last element beginning with gs means that the read overlaps a gene (on the forward or reverse strand) 56 | read_overlaps_gene = True 57 | gene_strand = return_collapsed(last.split(':')[-1].split(',')) 58 | 59 | if gene_strand == '-': 60 | gene_strand = 'minus' 61 | elif gene_strand == '+': 62 | gene_strand = 'plus' 63 | 64 | else: 65 | gene_strand = 'AMB' 66 | 67 | # set read strand from the SAM flag field 68 | if elements[1] == '0': 69 | read_strand = 'plus' 70 | else: 71 | read_strand = 'minus' 72 | 73 | # get read type 74 | if read_overlaps_gene: 75 | read_type = return_collapsed(elements[-3].split(':')[-1].split(',')) 76 | else: 77 | # if the read does not overlap a gene, it is intergenic 78 | read_type = 'INTERGENIC' 79 | 80 | 81 | read_type_num[read_type] += 1 82 | 83 | strand_type = read_strand + '_' + gene_strand 84 | 85 | strand_type_num[strand_type] += 1 86 | 87 | # write the read to the correct split file, depending on strand orientation 88 | out_files[strand_type].write(line) 89 | 90 | with open(prefix + 'read_type_num.txt', 'w') as fo: 91 | for key, value in read_type_num.items(): 92 | fo.write('%s %s\n' % (key, value)) 93 | 94 | with open(prefix + 'strand_type_num.txt', 'w') as fo: 95 | for key, value in strand_type_num.items(): 96 | fo.write('%s %s\n' % (key, value)) 97 | 98 | for f in out_files.values(): 99 | f.close() 100 | -------------------------------------------------------------------------------- /spacemake/snakemake/species_init.smk: -------------------------------------------------------------------------------- 1 | annotation_file = os.path.join(config['root_dir'], 2 | config['annotation_file_pattern']) 3 | genome_file = os.path.join(config['root_dir'], 4 | config['genome_file_pattern']) 5 | 6 | rule all: 7 | input: 8 | expand(annotation_file, species = config['species'], 9 | data_type = 'annotation'), 10 | expand(genome_file, species = config['species'], 11 | data_type = 'genome') 12 | 13 | rule unzip: 14 | input: 15 | '{filename}.gz' 16 | output: 17 | '{filename}' 18 | shell: "unpigz {input}" 19 | 20 | def get_url(wildcards): 21 | return config[wildcards.species + '_' + wildcards.data_type + '_url'] 22 | 23 | rule download_species_annotation: 24 | output: 25 | annotation_file.replace('.gtf', '.gtf.gz') 26 | params: 27 | url = lambda wildcards: get_url(wildcards) 28 | shell: 29 | "wget -O {output} {params.url}" 30 | 31 | rule download_species_genome: 32 | output: 33 | genome_file.replace('.fa', '.fa.gz') 34 | params: 35 | url = lambda wildcards: get_url(wildcards) 36 | shell: 37 | "wget -O {output} {params.url}" 38 | -------------------------------------------------------------------------------- /spacemake/snakemake/visium.smk: -------------------------------------------------------------------------------- 1 | configfile: 'config.yaml' 2 | 3 | spaceranger_out_id = 'sr_out-{sample}-{run_type}' 4 | 5 | spaceranger_outs = [ 6 | spaceranger_out_id + '/outs/web_summary.html' 7 | ] 8 | 9 | raw_reads = 'data/reads/raw/{sample_id}_S{S}_L002_R{R}_001.fastq.gz' 10 | linked_reads = 'data/reads/linked/{sample}_S{S}_L002_R{R}_001.fastq.gz' 11 | 12 | spaceranger_script = 'spaceranger-1.2.0/spaceranger'
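# NOTE: relative path to a pinned spaceranger build; this particular layout
# is an assumption of this workflow and should be adjusted to the locally
# installed version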
13 | 14 | linked_reads_root = 'data/reads/linked/' 15 | raw_reads_root = 'data/reads/raw/' 16 | 17 | run_types = ['exon', 'exon_intron'] 18 | 19 | rule all: 20 | input: 21 | expand(spaceranger_outs, sample = config['samples'].keys(), run_type = run_types) 22 | 23 | def get_raw_reads(wildcards): 24 | sample_id = config['samples'][wildcards.sample]['id'] 25 | 26 | return expand(raw_reads, sample_id = sample_id, S = wildcards.S, R = wildcards.R) 27 | 28 | rule link_raw_reads: 29 | input: 30 | unpack(get_raw_reads) 31 | output: 32 | linked_reads 33 | shell: 34 | "ln -sr {input} {output}" 35 | 36 | def get_spaceranger_inputs(wildcards): 37 | S = config['samples'][wildcards.sample]['S'] 38 | img = config['samples'][wildcards.sample]['img'] 39 | sample_id = config['samples'][wildcards.sample]['id'] 40 | 41 | return { 42 | 'reads': expand(raw_reads, sample_id = sample_id, S=S, R=[1,2]), 43 | 'img': img } 44 | 45 | def get_refdata(wildcards): 46 | if wildcards.run_type == 'exon': 47 | return 'refdata-mm10-M23' 48 | elif wildcards.run_type == 'exon_intron': 49 | return 'refdata-pre-mm10-M23' 50 | 51 | rule run_spaceranger_counts: 52 | input: 53 | unpack(get_spaceranger_inputs) 54 | output: 55 | spaceranger_outs 56 | params: 57 | area = lambda wildcards: config['samples'][wildcards.sample]['area'], 58 | sample_id = lambda wildcards: config['samples'][wildcards.sample]['id'], 59 | refdata = lambda wildcards: get_refdata(wildcards), 60 | run_id = spaceranger_out_id 61 | wildcard_constraints: 62 | run_type='|'.join(run_types) 63 | threads: 8 64 | shell: 65 | # remove the output directory first, otherwise spaceranger fails: 66 | # snakemake creates the directory by default, and spaceranger then 67 | # assumes that a previous run has already happened in there 68 | """ 69 | rm -rf {params.run_id} 70 | {spaceranger_script} count --id={params.run_id} \ 71 | --transcriptome={params.refdata} \ 72 | --fastqs={raw_reads_root} \ 73 | --sample={params.sample_id} \ 74 | --image={input.img} \ 75 | --localcores={threads} \ 76 | --localmem=64 \ 77 | --unknown-slide \ 78 | --reorient-images 79 | """ 80 | -------------------------------------------------------------------------------- /spacemake/spatial/__init__.py: -------------------------------------------------------------------------------- 1 | # # include in top level for backward compatibility 2 | # from .util import compute_neighbors, compute_islands, detect_tissue, \ 3 | # create_mesh, create_meshed_adata 4 | # # added novosparc_reconstruction for backward compatibility 5 | # from . import novosparc_integration as novosparc_reconstruction 6 | # from . 
import puck_collection as puck_collection 7 | -------------------------------------------------------------------------------- /spacemake/spatial/cmdline.py: -------------------------------------------------------------------------------- 1 | #from ..config import ConfigFile 2 | #from ..project_df import ProjectDF 3 | from ..util import message_aggregation, bool_in_str, str2bool 4 | from ..errors import SpacemakeError 5 | 6 | import argparse 7 | import logging 8 | 9 | logger_name = "spacemake.spatial" 10 | logger = logging.getLogger(logger_name) 11 | 12 | def get_expression_img_parser(with_umi_cutoff = False): 13 | parser = argparse.ArgumentParser(allow_abbrev=False, add_help=False) 14 | 15 | parser.add_argument('--project_id', type=str, 16 | required=True) 17 | 18 | parser.add_argument('--sample_id', type=str, 19 | required=True) 20 | 21 | parser.add_argument('--run_mode', type=str, 22 | required=True) 23 | 24 | parser.add_argument('--umi_cutoff', type=int, 25 | required=False) 26 | 27 | parser.add_argument('--binary_top_qth_percentile', 28 | type=int, required=False, default=30) 29 | 30 | parser.add_argument('--binary', type=str, 31 | required=False, default='False') 32 | 33 | parser.add_argument('--processed_data', type=str, 34 | required=False, default='False') 35 | 36 | parser.add_argument('--out_img', 37 | type=str, 38 | required=True) 39 | 40 | return parser 41 | 42 | def setup_spatial_parser(parent_parser_subparsers): 43 | parser = parent_parser_subparsers.add_parser('spatial', 44 | help = 'spacemake spatial commands') 45 | 46 | subparsers = parser.add_subparsers() 47 | 48 | aggregated_img_parser = subparsers.add_parser( 49 | 'create_aggregated_expression_img', 50 | parents=[get_expression_img_parser()]) 51 | 52 | aggregated_img_parser.set_defaults( 53 | func=lambda args: create_expression_img_cmdline(args, 54 | 'aggregated')) 55 | 56 | spot_img_parser = subparsers.add_parser( 57 | 'create_spot_expression_img', 58 | parents=[get_expression_img_parser()]) 59 | 60 | spot_img_parser.set_defaults( 61 | func=lambda args: create_expression_img_cmdline(args, 62 | 'spot')) 63 | 64 | @message_aggregation(logger_name) 65 | def create_expression_img_cmdline(args, img_type): 66 | import cv2 67 | logger.info('Loading dge file...') 68 | from spacemake.smk import Spacemake 69 | spmk = Spacemake() 70 | 71 | if str2bool(args['processed_data']): 72 | if args.get('umi_cutoff') is None: 73 | raise SpacemakeError('When creating an image from processed data,' 74 | ' a --umi_cutoff value must be provided') 75 | 76 | adata = spmk.load_processed_adata( 77 | project_id = args['project_id'], 78 | sample_id = args['sample_id'], 79 | run_mode_name = args['run_mode'], 80 | umi_cutoff = args['umi_cutoff']) 81 | 82 | else: 83 | adata = spmk.load_raw_spatial_adata( 84 | project_id = args['project_id'], 85 | sample_id = args['sample_id'], 86 | run_mode_name = args['run_mode']) 87 | 88 | logger.info(f'Generating {img_type} expression image...') 89 | if img_type == 'spot': 90 | from .he_integration import create_spot_expression_img 91 | img, img_bw = create_spot_expression_img(adata, 92 | binary=str2bool(args['binary'])) 93 | elif img_type == 'aggregated': 94 | from .he_integration import create_aggregated_expression_img 95 | img, img_bw = create_aggregated_expression_img( 96 | adata, 97 | binary_top_qth_percentile=int(args['binary_top_qth_percentile'])) 98 | 99 | if str2bool(args['binary']): 100 | img = img_bw 101 | 102 | cv2.imwrite(args['out_img'], img)
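# Usage sketch (the project/sample IDs and output filename below are
# hypothetical):
#
#   spacemake spatial create_aggregated_expression_img \
#       --project_id my_project --sample_id my_sample \
#       --run_mode default --out_img expression_img.png
#
# For processed data, additionally pass --processed_data True together with
# a --umi_cutoff value, as enforced above.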
-------------------------------------------------------------------------------- /test.sh: -------------------------------------------------------------------------------- 1 | set -e 2 | 3 | #rm project_df.csv > /dev/null 4 | 5 | spacemake projects add_sample --project_id test \ 6 | --sample_id sc_rnaseq_sample \ 7 | --R1 spacemake/data/test/visium_public_lane_joined_1m_R1.fastq.gz \ 8 | --R2 spacemake/data/test/visium_public_lane_joined_1m_R2.fastq.gz \ 9 | --species mouse 10 | 11 | spacemake projects add_sample --project_id test \ 12 | --sample_id sc_rnaseq_sample_2 \ 13 | --R1 spacemake/data/test/visium_public_lane_joined_1m_R1.fastq.gz \ 14 | --R2 spacemake/data/test/visium_public_lane_joined_1m_R2.fastq.gz \ 15 | --species mouse \ 16 | --barcode_flavor visium 17 | 18 | # with one bc file 19 | spacemake projects add_sample --project_id test \ 20 | --sample_id one_bc_file \ 21 | --R1 spacemake/data/test/visium_public_lane_joined_1m_R1.fastq.gz \ 22 | --R2 spacemake/data/test/visium_public_lane_joined_1m_R2.fastq.gz \ 23 | --species mouse \ 24 | --barcode_flavor visium \ 25 | --puck visium 26 | 27 | # with two bc files 28 | spacemake projects add_sample --project_id test \ 29 | --sample_id two_bc_files \ 30 | --R1 spacemake/data/test/visium_public_lane_joined_1m_R1.fastq.gz \ 31 | --R2 spacemake/data/test/visium_public_lane_joined_1m_R2.fastq.gz \ 32 | --species mouse \ 33 | --barcode_flavor visium \ 34 | --puck visium \ 35 | --puck_barcode_file spacemake/data/test/test_bc1.csv spacemake/data/test/test_bc2.csv 36 | 37 | # update sample 38 | spacemake projects update_sample --project_id test \ 39 | --sample_id two_bc_files \ 40 | --investigator Test 41 | 42 | spacemake projects merge_samples --merged_project_id test \ 43 | --merged_sample_id test_merged \ 44 | --project_id_list test \ 45 | --sample_id_list one_bc_file two_bc_files 46 | 47 | # this is expected to fail, as the samples have different barcode_flavors 48 | spacemake projects merge_samples --merged_project_id test \ 49 | --merged_sample_id test_merged_2 \ 50 | --project_id_list test \ 51 | --sample_id_list sc_rnaseq_sample two_bc_files 52 | 53 | spacemake projects merge_samples --merged_project_id test \ 54 | --merged_sample_id test_merged_2 \ 55 | --project_id_list test \ 56 | --sample_id_list sc_rnaseq_sample_2 two_bc_files 57 | -------------------------------------------------------------------------------- /test_data/README.md: -------------------------------------------------------------------------------- 1 | # SPACEMAKE test data 2 | 3 | This directory is a mix of old and new test files that still needs to be cleaned up. In general, files in here should not exceed 1 MB. If you need a larger file, place it on bimsbstatic and download it from there, like so: 4 | 5 | `wget https://bimsbstatic.mdc-berlin.de/rajewsky/spacemake-test-data/spacemake_tile_test_data.tar.gz` 6 | 7 | 8 | Thanks!
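P.S.: after downloading, unpack the archive in place (a minimal sketch, assuming the tarball from the example above):

```
tar -xzf spacemake_tile_test_data.tar.gz
```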
9 | -------------------------------------------------------------------------------- /test_data/make_chr22_test_data.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from spacemake.util import read_fq 3 | from byo.track import load_track 4 | 5 | 6 | genome = load_track("/data/rajewsky/genomes/hg38/hg38.fa") 7 | 8 | # load output of gene_loci_to_gtf.py: a table with gene name, start and end coordinates 9 | df_genes = pd.read_csv( 10 | "chr22_gene_bounds.csv", sep="\t", names=["gene", "start", "end", "L"] 11 | ).set_index("gene") 12 | 13 | # keep track of the few dozen reads that Nikos has selected 14 | selected_reads = {} 15 | for fa_id, seq, qual in read_fq("reads_chr22_R2.fastq.gz"): 16 | if "IGLC3" in fa_id: 17 | print(f"selecting IGLC3 read {fa_id} -> {fa_id.split('_')[0]}") 18 | selected_reads[fa_id.split("_")[0]] = fa_id 19 | 20 | # then go through the SAM file (no header) to get the mapping positions of these reads (not ideal) 21 | df = pd.read_csv( 22 | "/data/rajewsky/home/nkarais/murphy/fc_sts/collect_reads_chr22/final.polyA_adapter_trimmed_chr22.sam", 23 | sep="\t", 24 | header=None, 25 | ) 26 | 27 | starts = [] 28 | for row in df.itertuples(): 29 | # print(row) 30 | qname = row[1] 31 | if "A00643:496:HFJ5MDRX2:1:2101:12888:1172" in qname: 32 | print(f"YAY! detected IGLC3 read {qname}") 33 | 34 | if qname in selected_reads: 35 | print(f"selecting read {qname}") 36 | starts.append(row[4]) 37 | 38 | # find genes that overlap the selected reads' mapping positions 39 | # this intersection code is very crude, but effective 40 | intervals = set() 41 | starts = set(starts) 42 | for row in df_genes.itertuples(): 43 | next_starts = set(starts) 44 | for x in starts: 45 | if row.start < x and row.end > x: 46 | intervals.add((row.start, row.end)) 47 | print(f"selecting gene entry '{row}'") 48 | next_starts.discard(x) 49 | 50 | starts = next_starts 51 | 52 | print( 53 | "the following start coordinates are left; selecting buffer regions around these" 54 | ) 55 | print(starts) 56 | 57 | 58 | def do_merge(s, e, intervals): 59 | keep = [] 60 | for j, (s2, e2) in enumerate(intervals): 61 | new = (s2, e2) 62 | if s2 <= e and e2 >= e: 63 | print(f"overlap on the right, s={s} e={e} s2={s2} e2={e2}") 64 | new = (min(s2, s), max(e, e2)) 65 | elif e2 >= s and s2 <= s: 66 | print(f"overlap on the left, s={s} e={e} s2={s2} e2={e2}") 67 | new = (min(s2, s), max(e, e2)) 68 | elif s2 >= s and e2 <= e: 69 | print(f"contained in other interval.
discard, s={s} e={e} s2={s2} e2={e2}") 70 | continue 71 | 72 | keep.append(new) 73 | 74 | return keep 75 | 76 | 77 | # merge intervals that have some overlap 78 | intervals = sorted(list(intervals), key=lambda x: x[1] - x[0], reverse=True) 79 | print(intervals) 80 | 81 | while True: 82 | changed = False 83 | for i, (s, e) in enumerate(intervals): 84 | others = intervals[i + 1 :] 85 | keep = do_merge(s, e, others) 86 | if keep != others: 87 | print("we had a change!") 88 | print("before") 89 | for s, e in intervals: 90 | print(f"{s} - {e}") 91 | 92 | intervals = intervals[: i + 1] + keep 93 | intervals = sorted(list(intervals), key=lambda x: x[1] - x[0], reverse=True) 94 | print("after") 95 | for s, e in intervals: 96 | print(f"{s} - {e}") 97 | 98 | changed = True 99 | break # the for loop 100 | 101 | if not changed: 102 | break 103 | 104 | intervals = sorted(list(set(intervals)), key=lambda x: x[1] - x[0], reverse=True) 105 | print("remaining intervals") 106 | for s, e in intervals: 107 | print(f"{s} - {e}") 108 | 109 | 110 | # Okay, now we know the gene loci which are needed to map the test reads! 111 | intervals = list(sorted(intervals)) 112 | print(f"relevant intervals found: {len(intervals)}") 113 | 114 | # extract the genomic sequence for the loci we need and save as a mini-"genome" 115 | with open("test_genome.fa", "wt") as f: 116 | for start, end in intervals: 117 | seq = genome.get_oriented("chr22", start, end, "+") 118 | f.write(f">test_chr22.{start}-{end}\n{seq}\n") 119 | 120 | # cut down the GTF annotation to only those parts that pertain to the genic regions we care about 121 | with open("test_annotation.gtf", "wt") as f: 122 | for line in open("gencode.v38.chr22.gtf", "rt"): 123 | if line.startswith("#"): 124 | continue 125 | 126 | parts = line.rstrip().split("\t") 127 | chrom, source, rec, start, end = parts[:5] 128 | if chrom != "chr22": 129 | continue 130 | 131 | start = int(start) 132 | end = int(end) 133 | 134 | for s, e in intervals: 135 | if ( 136 | (start <= s and end > s) # overlap the start of interval 137 | or (start > s and end < e) # internal to interval 138 | or (start < e and end > e) # overlap the end of interval 139 | or (start < s and end > e) # overlap the entire interval 140 | ): 141 | # the name of the pseudo-chromosome this is on (see excision of genomic sequence above) 142 | chrom = f"test_{chrom}.{s}-{e}" 143 | 144 | start = max( 145 | 0, start - s 146 | ) # translate start and end coordinates from whole chr22 to the gene region 147 | end = min(e - s, end - s) 148 | 149 | parts[0:5] = (chrom, "test_data", rec, str(start), str(end)) 150 | f.write("\t".join(parts) + "\n") 151 | 152 | # Done! We now have: 153 | # * test_genome.fa.gz with the gene sequences 154 | # * test_annotation.gtf.gz with the gene models (exon/intron, CDS/UTR etc.) 
155 | # * reads_chr22_R1.fastq.gz with test read barcodes mapping to a few CBs and UMIs 156 | # * reads_chr22_R2.fastq.gz with test read cDNAs mapping to the genic regions we care about 157 | -------------------------------------------------------------------------------- /test_data/mirgenedb.hsa.mature.fa.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rajewsky-lab/spacemake/274cd58e7ba9de244aca25c6bbd7cdfe49683753/test_data/mirgenedb.hsa.mature.fa.gz -------------------------------------------------------------------------------- /test_data/mirgenedb.hsa.mature.gtf.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rajewsky-lab/spacemake/274cd58e7ba9de244aca25c6bbd7cdfe49683753/test_data/mirgenedb.hsa.mature.gtf.gz -------------------------------------------------------------------------------- /test_data/rRNA_hsa.fa.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rajewsky-lab/spacemake/274cd58e7ba9de244aca25c6bbd7cdfe49683753/test_data/rRNA_hsa.fa.gz -------------------------------------------------------------------------------- /test_data/reads_chr22_R1.fastq.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rajewsky-lab/spacemake/274cd58e7ba9de244aca25c6bbd7cdfe49683753/test_data/reads_chr22_R1.fastq.gz -------------------------------------------------------------------------------- /test_data/reads_chr22_R2.fastq.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rajewsky-lab/spacemake/274cd58e7ba9de244aca25c6bbd7cdfe49683753/test_data/reads_chr22_R2.fastq.gz -------------------------------------------------------------------------------- /test_data/test_annotation.gtf.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rajewsky-lab/spacemake/274cd58e7ba9de244aca25c6bbd7cdfe49683753/test_data/test_annotation.gtf.gz -------------------------------------------------------------------------------- /test_data/test_bam_md5.txt: -------------------------------------------------------------------------------- 1 | ./projects/test/processed_data/test_01/illumina/complete_data/final.polyA_adapter_trimmed.bam 85b00c5c1c699e4e3afda2b52b9d6442 2 | ./projects/test/processed_data/test_01/illumina/complete_data/genome.STAR.bam 85b00c5c1c699e4e3afda2b52b9d6442 3 | ./projects/test/processed_data/test_01/illumina/complete_data/unaligned_bc_tagged.bam 75d9ee7a618f8f766938192c84c9ac5a 4 | ./projects/test/processed_data/test_01/illumina/complete_data/unaligned_bc_unassigned.bam d41d8cd98f00b204e9800998ecf8427e 5 | ./projects/test/processed_data/test_02/illumina/complete_data/final.polyA_adapter_trimmed.bam c6fa15dcf2a36cea7479349ccf004523 6 | ./projects/test/processed_data/test_02/illumina/complete_data/genome.STAR.bam c6fa15dcf2a36cea7479349ccf004523 7 | ./projects/test/processed_data/test_02/illumina/complete_data/miRNA.bowtie2.bam 98b57c64f1814c61e320b4fc96d75deb 8 | ./projects/test/processed_data/test_02/illumina/complete_data/rRNA.bowtie2.bam ec87ba1ac2e64f78db4fb9ea84162dc5 9 | ./projects/test/processed_data/test_02/illumina/complete_data/unaligned_bc_tagged.bam 75d9ee7a618f8f766938192c84c9ac5a 10 | ./projects/test/processed_data/test_02/illumina/complete_data/unaligned_bc_unassigned.bam 
d41d8cd98f00b204e9800998ecf8427e 11 | -------------------------------------------------------------------------------- /test_data/test_config.yaml: -------------------------------------------------------------------------------- 1 | root_dir: '.' 2 | temp_dir: '/tmp' 3 | external_bin: 4 | dropseq_tools: '/data/rajewsky/shared_bins/Drop-seq_tools-2.5.1/' 5 | logging: 6 | level: INFO 7 | debug: "spacemake.util.read_fq" 8 | 9 | puck_data: 10 | barcode_file: 'predictions_ml.csv' 11 | root: 'puck_data' 12 | 13 | pucks: 14 | default: 15 | width_um: 3000 16 | spot_diameter_um: 10 17 | visium: 18 | barcodes: 'puck_data/visium_barcode_positions.csv' 19 | width_um: 6500 20 | spot_diameter_um: 55 21 | seq_scope: 22 | width_um: 1000 23 | spot_diameter_um: 1 24 | slide_seq: 25 | width_um: 3000 26 | spot_diameter_um: 10 27 | test_puck: 28 | width_um: 4000 29 | spot_diameter_um: 1 30 | openst: 31 | width_um: 1200 32 | spot_diameter_um: 0.6 33 | coordinate_system: 'puck_data/openst_coordinate_system.csv' 34 | 35 | run_modes: 36 | default: 37 | n_beads: 100000 38 | umi_cutoff: [100, 300, 500] 39 | clean_dge: False 40 | detect_tissue: False 41 | count_intronic_reads: True 42 | count_mm_reads: False 43 | mesh_data: False 44 | mesh_type: 'circle' 45 | mesh_spot_diameter_um: 55 46 | mesh_spot_distance_um: 100 47 | visium: 48 | n_beads: 10000 49 | umi_cutoff: [1000] 50 | clean_dge: False 51 | detect_tissue: True 52 | count_intronic_reads: False 53 | count_mm_reads: True 54 | slide_seq: 55 | n_beads: 100000 56 | umi_cutoff: [50] 57 | clean_dge: False 58 | detect_tissue: False 59 | scRNA_seq: 60 | n_beads: 10000 61 | umi_cutoff: [500] 62 | detect_tissue: False 63 | count_intronic_reads: True 64 | count_mm_reads: False 65 | seq_scope: 66 | clean_dge: false 67 | count_intronic_reads: false 68 | count_mm_reads: false 69 | detect_tissue: false 70 | mesh_data: true 71 | mesh_spot_diameter_um: 10 72 | mesh_spot_distance_um: 15 73 | mesh_type: hexagon 74 | n_beads: 1000 75 | umi_cutoff: 76 | - 100 77 | - 300 78 | spatial_rm: 79 | clean_dge: false 80 | count_intronic_reads: false 81 | count_mm_reads: false 82 | detect_tissue: false 83 | mesh_data: true 84 | mesh_spot_diameter_um: 10 85 | mesh_spot_distance_um: 15 86 | mesh_type: hexagon 87 | n_beads: 1000 88 | umi_cutoff: 89 | - 500 90 | - 1000 91 | openst: 92 | clean_dge: false 93 | count_intronic_reads: true 94 | count_mm_reads: true 95 | detect_tissue: false 96 | mesh_data: true 97 | mesh_spot_diameter_um: 7 98 | mesh_spot_distance_um: 7 99 | mesh_type: hexagon 100 | n_beads: 100000 101 | polyA_adapter_trimming: true 102 | spatial_barcode_min_matches: 0.1 103 | umi_cutoff: 104 | - 100 105 | - 250 106 | - 500 107 | 108 | 109 | barcode_flavors: 110 | default: 111 | cell: "r1[0:12]" 112 | UMI: "r1[12:20]" 113 | dropseq: 114 | cell: "r1[0:12]" 115 | UMI: "r1[12:20]" 116 | slide_seq_14bc: 117 | cell: "r1[0:14]" 118 | UMI: "r1[14:23]" 119 | slide_seq_15bc: 120 | cell: "r1[0:14]" 121 | UMI: "r1[15:23]" 122 | visium: 123 | cell: "r1[0:16]" 124 | UMI: "r1[16:28]" 125 | sc_10x_v2: 126 | cell: "r1[0:16]" 127 | UMI: "r1[16:26]" 128 | seq_scope: 129 | UMI: "r2[0:9]" 130 | cell: "r1[0:20]" 131 | nextflex: 132 | min_qual_trim: 20 133 | cell: "'A'" 134 | read1: "None" 135 | UMI: "r2[:4] + r2[-4:]" 136 | seq: "r2[4:-4]" 137 | qual: "r2_qual[4:-4]" 138 | openst: 139 | UMI: "r2[0:9]" 140 | bam_tags: "CR:{cell},CB:{cell},MI:{UMI},RG:{assigned}" 141 | cell: "r1[2:27]" 142 | 143 | adapters: 144 | optical_primer: GAATCACGATACGTACACCA 145 | TSO_SMART: AAGCAGTGGTATCAACGCAGAGTGAATGGG 146 | 
SMART: AAGCAGTGGTATCAACGCAGAGTG 147 | smart: AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTC 148 | TSO_10x: AAGCAGTGGTATCAACGCAGAGTACATGGG 149 | chromium_bead: CTACACGACGCTCTTCCGATCT 150 | dropseq_bead: AAGCAGTGGTATCAACGCAGAGTAC 151 | polyA: AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA 152 | polyG: GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG 153 | nextflex_RA3: TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACTGACCAATCTCGTATGCCGTCTTCTGCTTGAA 154 | truseq_RA3: TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACCCGTCCA 155 | 156 | adapter_flavors: 157 | SMARTER: 158 | cut_right: 159 | - Q: 160 | min_base_quality: 30 161 | - polyA: 162 | max_error: 0.25 163 | min_overlap: 3 164 | paired_end: replace_N 165 | chromium: 166 | cut_right: 167 | - Q: 168 | min_base_quality: 32 169 | - polyA: 170 | max_error: 0.25 171 | min_overlap: 3 172 | - polyG: 173 | max_error: 0.1 174 | min_overlap: 3 175 | dropseq: 176 | cut_left: 177 | - TSO_SMART: 178 | max_error: 0.1 179 | min_overlap: 10 180 | cut_right: 181 | - Q: 182 | min_base_quality: 30 183 | - polyA: 184 | max_error: 0.25 185 | min_overlap: 3 186 | - polyG: 187 | max_error: 0.1 188 | min_overlap: 3 189 | paired_end: single-end 190 | default: 191 | cut_left: 192 | - TSO_SMART: 193 | max_error: 0.1 194 | min_overlap: 10 195 | cut_right: 196 | - Q: 197 | min_base_quality: 30 198 | - polyA: 199 | max_error: 0.25 200 | min_overlap: 3 201 | - polyG: 202 | max_error: 0.1 203 | min_overlap: 3 204 | paired_end: single-end 205 | 206 | quant: 207 | default: 208 | counter_class: "spacemake.quant.DefaultCounter" 209 | channels: 210 | - "counts" 211 | - "exonic_counts" 212 | - "exonic_reads" 213 | - "intronic_counts" 214 | - "intronic_reads" 215 | X_counts: ["exonic_counts", "intronic_counts"] 216 | alignment_priorities: { 217 | 'C': 101, # coding exon 218 | 'c': 100, # coding exon (lower case == antisense) 219 | 'U': 51, # UTR exon 220 | 'u': 50, 221 | 'CU': 51, # overlaps both, CDS+UTR (should in fact never occur as 'CU') 222 | 'cu': 50, 223 | 'N': 21, # exon of non-coding transcript 224 | 'n': 20, 225 | 'I': 11, # intronic region 226 | 'i': 10, 227 | '-': 0, 228 | } 229 | gene_priorities: { 230 | 'C': 101, # coding exon 231 | 'c': 100, # coding exon (lower case == antisense) 232 | 'U': 51, # UTR exon 233 | 'u': 50, 234 | 'CU': 51, # overlaps both, CDS+UTR (should in fact never occur as 'CU') 235 | 'cu': 50, 236 | 'N': 21, # exon of non-coding transcript 237 | 'n': 20, 238 | 'I': 11, # intronic region 239 | 'i': 10, 240 | '-': 0, 241 | } 242 | exonic_tags: ["C", "U", "CU", "N", "c", "u", "cu", "n"] 243 | intronic_tags: ["I", "i"] 244 | alignment_selection: priority 245 | exon_intron_disambiguation: "exon_wins" 246 | miRNA: 247 | alignment_selection: take_first 248 | chrom: 249 | alignment_selection: take_first 250 | gene_selection: chrom 251 | custom_index: 252 | alignment_selection: take_first_plus 253 | gene_selection: chrom 254 | species: 255 | test_hsa: 256 | genome: 257 | annotation: "{spacemake_dir}/test_data/test_genome.gtf.gz" 258 | sequence: "{spacemake_dir}/test_data/test_genome.fa.gz" 259 | miRNA: 260 | annotation: "{spacemake_dir}/test_data/mirgenedb.hsa.mature.gtf.gz" 261 | sequence: "{spacemake_dir}/test_data/mirgenedb.hsa.mature.fa.gz" 262 | rRNA: 263 | annotation: '' 264 | sequence: "{spacemake_dir}/test_data/rRNA_hsa.fa.gz" 265 | 266 | -------------------------------------------------------------------------------- /test_data/test_genome.fa.gz: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/rajewsky-lab/spacemake/274cd58e7ba9de244aca25c6bbd7cdfe49683753/test_data/test_genome.fa.gz -------------------------------------------------------------------------------- /test_data/test_genome.gtf.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rajewsky-lab/spacemake/274cd58e7ba9de244aca25c6bbd7cdfe49683753/test_data/test_genome.gtf.gz -------------------------------------------------------------------------------- /test_data/test_project_df.csv: -------------------------------------------------------------------------------- 1 | project_id,sample_id,puck_barcode_file_id,sample_sheet,species,demux_barcode_mismatch,demux_dir,basecalls_dir,R1,R2,reads,longreads,longread_signature,investigator,sequencing_date,experiment,puck_barcode_file,run_mode,barcode_flavor,is_merged,merged_from,puck,dge,map_strategy,adapter_flavor 2 | test,test_01,['no_spatial_data'],,test_hsa,1,,,['{spacemake_dir}/test_data/reads_chr22_R1.fastq.gz'],['{spacemake_dir}/test_data/reads_chr22_R2.fastq.gz'],,,,unknown,unknown,unknown,,['default'],dropseq,False,[],default,,STAR:genome,dropseq 3 | test,test_01b,['no_spatial_data'],,test_hsa,1,,,['{spacemake_dir}/test_data/reads_chr22_R1.fastq.gz'],['{spacemake_dir}/test_data/reads_chr22_R2.fastq.gz'],,,,unknown,unknown,unknown,,['default'],dropseq,False,[],default,,STAR:genome,dropseq 4 | test,test_02,['no_spatial_data'],,test_hsa,1,,,['{spacemake_dir}/test_data/reads_chr22_R1.fastq.gz'],['{spacemake_dir}/test_data/reads_chr22_R2.fastq.gz'],,,,unknown,unknown,unknown,,['default'],dropseq,False,[],default,,rRNA:bowtie2->miRNA:bowtie2->genome:STAR:final,dropseq 5 | test,test_03_nofinal,['no_spatial_data'],,test_hsa,1,,,['{spacemake_dir}/test_data/reads_chr22_R1.fastq.gz'],['{spacemake_dir}/test_data/reads_chr22_R2.fastq.gz'],,,,unknown,unknown,unknown,,['default'],dropseq,False,[],default,,rRNA:bowtie2->miRNA:bowtie2->genome:STAR,dropseq 6 | tile,tile_1,['tile_1'],,test_hsa,1,,,['{spacemake_dir}/test_data/reads_chr22_R1.fastq.gz'],['{spacemake_dir}/test_data/reads_chr22_R2.fastq.gz'],,,,unknown,unknown,unknown,['{spacemake_dir}/test_data/tile_1.txt'],['spatial_rm'],dropseq,False,[],test_puck,,rRNA:bowtie2->miRNA:bowtie2->genome:STAR:final,dropseq 7 | tile,tile_2,['tile_2'],,test_hsa,1,,,['{spacemake_dir}/test_data/reads_chr22_R1.fastq.gz'],['{spacemake_dir}/test_data/reads_chr22_R2.fastq.gz'],,,,unknown,unknown,unknown,['{spacemake_dir}/test_data/tile_2.txt'],['spatial_rm'],dropseq,False,[],test_puck,,rRNA:bowtie2->miRNA:bowtie2->genome:STAR:final,dropseq 8 | tile,tile_both,"['tile_1', 'tile_2']",,test_hsa,1,,,['{spacemake_dir}/test_data/reads_chr22_R1.fastq.gz'],['{spacemake_dir}/test_data/reads_chr22_R2.fastq.gz'],,,,unknown,unknown,unknown,"['{spacemake_dir}/test_data/tile_1.txt', '{spacemake_dir}/test_data/tile_2.txt']",['spatial_rm'],dropseq,False,[],test_puck,,rRNA:bowtie2->miRNA:bowtie2->genome:STAR:final,dropseq 9 | -------------------------------------------------------------------------------- /test_data/test_reads.R1.fastq.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rajewsky-lab/spacemake/274cd58e7ba9de244aca25c6bbd7cdfe49683753/test_data/test_reads.R1.fastq.gz -------------------------------------------------------------------------------- /test_data/test_reads.R2.fastq.gz: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/rajewsky-lab/spacemake/274cd58e7ba9de244aca25c6bbd7cdfe49683753/test_data/test_reads.R2.fastq.gz -------------------------------------------------------------------------------- /test_data/tile_1.txt: -------------------------------------------------------------------------------- 1 | cell_bc x_pos y_pos 2 | ACGTACGTACGT 0 0 3 | GAAGGACTTCAA 0 1 4 | TATTTGGCACTC 1 0 5 | CTCTGATTAGGT 1 1 6 | -------------------------------------------------------------------------------- /test_data/tile_2.txt: -------------------------------------------------------------------------------- 1 | cell_bc x_pos y_pos 2 | ATTGTACGCATC 0 0 3 | GACGTGACGGCA 0 1 4 | TTATTGCGAGAC 1 0 5 | GTTGCAACTGTA 1 1 6 | -------------------------------------------------------------------------------- /test_data/tile_3.txt: -------------------------------------------------------------------------------- 1 | cell_bc x_pos y_pos 2 | AGTAGGGGTGTC 1 1 3 | AGCAAACTCGGC 1 2 4 | ATTTTATAGAGT 2 1 5 | CGGACGATGTGG 2 2 -------------------------------------------------------------------------------- /tests/fixtures.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import os 3 | 4 | spacemake_dir = os.path.abspath(os.path.dirname(__file__) + "/../") 5 | print("SPACEMAKE_DIR", spacemake_dir) 6 | 7 | 8 | def sm(*argc, expect_fail=False): 9 | # construct the desired cmdline 10 | import sys 11 | 12 | sys.argv = [ 13 | "spacemake", 14 | ] + list(argc) 15 | 16 | # ensure that no ConfigFile and ProjectDF instances 17 | # are retained from previous tests 18 | import spacemake.config 19 | import spacemake.project_df 20 | 21 | spacemake.config.__global_config = None 22 | spacemake.project_df.__global_ProjectDF = None 23 | 24 | # execute spacemake cmdline code 25 | from spacemake.cmdline import cmdline 26 | 27 | res = cmdline() 28 | # print("res", res) 29 | if expect_fail: 30 | assert isinstance(res, Exception) == True 31 | else: 32 | assert isinstance(res, Exception) == False 33 | 34 | return res 35 | 36 | 37 | def _init(): 38 | # just get the version 39 | sm("--version") 40 | 41 | # test the init parser 42 | sm("init", "--dropseq-tools", "/data/rajewsky/shared_bins/Drop-seq_tools-2.5.1/") 43 | 44 | 45 | def _add_species(): 46 | sm( 47 | "config", 48 | "add-species", 49 | "--name=test_hsa", 50 | "--reference=genome", 51 | f"--sequence={spacemake_dir}/test_data/test_genome.fa.gz", 52 | f"--annotation={spacemake_dir}/test_data/test_genome.gtf.gz", 53 | ) 54 | # add a second reference 55 | sm( 56 | "config", 57 | "add-species", 58 | "--name=test_hsa", 59 | "--reference=rRNA", 60 | f"--sequence={spacemake_dir}/test_data/rRNA_hsa.fa.gz", 61 | ) 62 | # add a third reference 63 | sm( 64 | "config", 65 | "add-species", 66 | "--name=test_hsa", 67 | "--reference=miRNA", 68 | f"--sequence={spacemake_dir}/test_data/mirgenedb.hsa.mature.fa.gz", 69 | ) 70 | # pretend we have mouse as well 71 | # TODO: place some actual mouse genome and/or phiX genomes in test-data repository 72 | sm( 73 | "config", 74 | "add-species", 75 | "--name=mouse", 76 | "--reference=genome", 77 | f"--sequence={spacemake_dir}/test_data/test_genome.fa.gz", 78 | f"--annotation={spacemake_dir}/test_data/test_genome.gtf.gz", 79 | ) 80 | sm( 81 | "config", 82 | "add-species", 83 | "--name=mouse", 84 | "--reference=phiX", 85 | f"--sequence={spacemake_dir}/test_data/test_genome.fa.gz", 86 | 
f"--annotation={spacemake_dir}/test_data/test_genome.gtf.gz", 87 | ) 88 | sm( 89 | "config", 90 | "add-species", 91 | "--name=mouse", 92 | "--reference=rRNA", 93 | f"--sequence={spacemake_dir}/test_data/rRNA_hsa.fa.gz", 94 | ) 95 | 96 | 97 | @pytest.fixture 98 | def tmp_root(tmp_path_factory): 99 | tmp = tmp_path_factory.mktemp("root_blank") 100 | 101 | return tmp 102 | 103 | 104 | @pytest.fixture 105 | def initialized_root(tmp_path_factory): 106 | tmp = tmp_path_factory.mktemp("root_initialized") 107 | os.chdir(tmp.as_posix()) 108 | 109 | _init() 110 | return tmp 111 | 112 | 113 | @pytest.fixture 114 | def with_species(initialized_root): 115 | os.chdir(initialized_root.as_posix()) 116 | # # test old way 117 | # sm( 118 | # "config", "add_species", 119 | # "--name=hsa_test", 120 | # f"--genome={spacemake_dir}/test_data/test_genome.fa.gz", 121 | # f"--annotation={spacemake_dir}/test_data/test_genome.gtf.gz", 122 | # ) 123 | # test new way 124 | _add_species() 125 | return initialized_root 126 | 127 | 128 | @pytest.fixture 129 | def configured_root(tmp_path_factory): 130 | tmp_root = tmp_path_factory.mktemp("root_preconfigured") 131 | 132 | # make a tmp-copy of the test_config.yaml 133 | def_config = os.path.join(spacemake_dir, "test_data/test_config.yaml") 134 | os.system(f"cp {def_config} {tmp_root / 'config.yaml'}") 135 | 136 | test_pdf = os.path.join(spacemake_dir, "test_data/test_project_df.csv") 137 | open(f"{tmp_root / 'project_df.csv'}", "w").write( 138 | open(test_pdf, "r").read().format(spacemake_dir=spacemake_dir) 139 | ) 140 | # os.system(f"cp {test_pdf} {tmp_root / 'project_df.csv'}") 141 | 142 | return tmp_root 143 | 144 | 145 | @pytest.fixture(scope="session") 146 | def with_tile_test_data(tmp_path_factory): 147 | tmp = tmp_path_factory.mktemp("root_tile_test") 148 | os.chdir(tmp.as_posix()) 149 | _init() 150 | _add_species() 151 | print( 152 | "return code", 153 | os.system( 154 | "wget https://bimsbstatic.mdc-berlin.de/rajewsky/spacemake-test-data/spacemake_tile_test_data.tar.gz -O /dev/stdout | tar -xz" 155 | ), 156 | ) 157 | print(os.listdir(".")) 158 | 159 | return tmp 160 | -------------------------------------------------------------------------------- /tests/test_fastq_to_ubam.py: -------------------------------------------------------------------------------- 1 | import multiprocessing as mp 2 | 3 | import pytest 4 | import sys 5 | import os 6 | from spacemake.bin.fastq_to_uBAM import * 7 | 8 | 9 | spacemake_dir = os.path.dirname(__file__) + "/../" 10 | 11 | 12 | @pytest.fixture(scope="session") 13 | def test_root(tmp_path_factory): 14 | tmp = tmp_path_factory.mktemp("root") 15 | sm_path = os.path.dirname(__file__) 16 | # make a tmp-copy of the test_config.yaml 17 | # def_config = os.path.join(sm_path, "../test_data/test_config.yaml") 18 | # os.system(f"cp {def_config} {tmp / 'config.yaml'}") 19 | 20 | # test_pdf = os.path.join(sm_path, "../test_data/test_project_df.csv") 21 | # os.system(f"cp {test_pdf} {tmp / 'project_df.csv'}") 22 | 23 | return tmp 24 | 25 | 26 | def sm(*argc, expect_fail=False): 27 | sys.argv = [ 28 | "fastq_to_uBAM.py", 29 | ] + list(argc) 30 | res = cmdline() 31 | print("got result", res) 32 | from spacemake.errors import SpacemakeError 33 | 34 | if expect_fail: 35 | assert isinstance(res, SpacemakeError) == True 36 | else: 37 | assert isinstance(res, Exception) == False 38 | 39 | 40 | def test_help(): 41 | try: 42 | sm("--help") 43 | except SystemExit: 44 | pass 45 | 46 | 47 | def test_dropseq(): 48 | sm( 49 | "--read1", 50 | spacemake_dir + 
"test_data/reads_chr22_R1.fastq.gz", 51 | "--read2", 52 | spacemake_dir + "test_data/reads_chr22_R2.fastq.gz", 53 | "--out-bam", 54 | "/dev/null", 55 | ) 56 | 57 | 58 | def test_single(): 59 | sm( 60 | "--read2", 61 | spacemake_dir + "test_data/reads_chr22_R2.fastq.gz", 62 | "--out-bam", 63 | "/dev/null", 64 | """--cell='"A"'""", 65 | ) 66 | 67 | 68 | def test_minqual(): 69 | sm( 70 | "--read2", 71 | spacemake_dir + "test_data/reads_chr22_R2.fastq.gz", 72 | "--out-bam", 73 | "/dev/null", 74 | "--min-qual", 75 | "30", 76 | """--cell='"A"'""", 77 | ) 78 | 79 | 80 | def test_issue135(): 81 | import spacemake.bin.fastq_to_uBAM as ubam 82 | from argparse import Namespace 83 | 84 | args = Namespace( 85 | bam_tags="CR:{cell},CB:{cell},MI:{UMI},RG:A", 86 | min_len=18, 87 | min_qual_trim=0, 88 | cell="r1[8:20][::-1]", 89 | UMI="r1[0:8]", 90 | seq="r2", 91 | qual="r2_qual", 92 | disable_safety=False, 93 | ) 94 | 95 | fmt = ubam.make_formatter_from_args(args) 96 | attrs = fmt( 97 | r2_qname="QNAME MUST NOT HAVE WHITESPACE", 98 | r1="ACGTACGT", 99 | r1_qual="########", 100 | r2="TGCATGCATGCATGCA", 101 | r2_qual="################", 102 | ) 103 | sam = ubam.make_sam_record(flag=4, **attrs) 104 | cols = sam.split() 105 | assert cols[0] == "QNAME" 106 | assert cols[1] == "4" 107 | # print(sam) 108 | 109 | 110 | # if __name__ == "__main__": 111 | # test_issue135() 112 | -------------------------------------------------------------------------------- /tests/test_map_strategy.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from spacemake.map_strategy import * 4 | from spacemake.config import ConfigFile 5 | from spacemake.project_df import ProjectDF 6 | from spacemake.errors import * 7 | import os 8 | 9 | from fixtures import configured_root, tmp_root, sm, spacemake_dir 10 | 11 | 12 | def test_validation(configured_root): 13 | config = ConfigFile.from_yaml((configured_root / "config.yaml").as_posix()) 14 | data = [ 15 | ( 16 | "flipped", 17 | "rRNA:bowtie2->genome:STAR", 18 | "test_hsa", 19 | "bowtie2:rRNA->STAR:genome", 20 | ), 21 | ( 22 | "species_missing", 23 | "bowtie2:rRNA->STAR:genome", 24 | "test_hs", 25 | "", 26 | ), 27 | ( 28 | "with_cflavor", 29 | "bowtie2@custom_index:rRNA->STAR@default:genome", 30 | "test_hsa", 31 | "bowtie2@custom_index:rRNA->STAR@default:genome", 32 | ), 33 | ( 34 | "unknown_cflavor", 35 | "rRNA:bowtie2@custom->genome:STAR@default", 36 | "test_hsa", 37 | "", 38 | ), 39 | # ("flipped", "bowtie2:rRNA->STAR:genome", "rRNA:bowtie2->genome:STAR"), 40 | ] 41 | for name, mapstr, species, expect in data: 42 | # print(f"running test {name}") 43 | try: 44 | res = validate_mapstr(mapstr, config=config, species=species) 45 | except (ValueError, ConfigVariableNotFoundError) as e: 46 | res = str(type(e)) 47 | 48 | print(f"test '{name}': {mapstr}-> {res} expect={expect} {expect == res}") 49 | assert res == expect 50 | 51 | 52 | def test_mapstr(configured_root): 53 | config = ConfigFile.from_yaml((configured_root / "config.yaml").as_posix()) 54 | data = [ 55 | ("with_cflavor", "bowtie2@custom_index:rRNA->STAR@default:genome", None), 56 | ] 57 | for name, mapstr, expect in data: 58 | mr, lr = mapstr_to_targets(mapstr) 59 | assert mr[0].input_name == "uBAM" 60 | assert mr[-1].input_name == "rRNA.bowtie2" 61 | assert lr[0].link_src == "genome.STAR" 62 | assert lr[0].link_name == "final" 63 | 64 | 65 | def test_get_mapped_BAM_output(configured_root): 66 | config = ConfigFile.from_yaml((configured_root / "config.yaml").as_posix()) 67 | 
project_df = ProjectDF( 68 | (configured_root / "project_df.csv").as_posix(), config=config 69 | ) 70 | 71 | out_files = get_mapped_BAM_output(project_df=project_df, config=config) 72 | print(out_files) 73 | 74 | 75 | def test_validation_cmdline_issue_54(configured_root): 76 | os.chdir(configured_root.as_posix()) 77 | data = [ 78 | ("flipped", "rRNA:bowtie2->genome:STAR", "test_hsa", True), 79 | ("species_missing", "bowtie2:rRNA->STAR:genome", "test_hs", False), 80 | ( 81 | "with_cflavor", 82 | "rRNA:bowtie2@custom_index->genome:STAR@default", 83 | "test_hsa", 84 | True, 85 | ), 86 | ( 87 | "unknown_cflavor", 88 | "rRNA:bowtie2@customX->genome:STAR@defaultBLA", 89 | "test_hsa", 90 | False, 91 | ), 92 | # ("flipped", "bowtie2:rRNA->STAR:genome", "rRNA:bowtie2->genome:STAR"), 93 | ] 94 | for name, mapstr, species, expect_pass in data: 95 | print(f"running test {name}") 96 | # add 97 | sm( 98 | "projects", 99 | "add-sample", 100 | "--project-id=test", 101 | f"--sample-id={name}", 102 | f"--map-strategy={mapstr}", 103 | f"--R1={spacemake_dir}/test_data/reads_chr22_R1.fastq.gz", 104 | f"--R2={spacemake_dir}/test_data/reads_chr22_R2.fastq.gz", 105 | f"--species={species}", 106 | expect_fail=not expect_pass, 107 | ) 108 | --------------------------------------------------------------------------------