├── .github └── workflows │ └── test_simpleaf.yml ├── .gitignore ├── LICENSE ├── R ├── README.md ├── build_splici_ref.R ├── cellRangerLikeEmptyDrops.R └── load_fry.R ├── README.md ├── bash ├── geneid_to_name.sh ├── get_10x_permit_lists.sh ├── simpleaf └── test_simpleaf.sh ├── docker ├── Dockerfile ├── Singularity.def └── build_docker.sh ├── python └── load_fry.py ├── simpleaf ├── Cargo.toml └── src │ ├── main.rs │ └── utils │ ├── af_utils.rs │ └── prog_utils.rs └── simpleaf_conda_env.yml /.github/workflows/test_simpleaf.yml: -------------------------------------------------------------------------------- 1 | name: simpleaf-check 2 | 3 | on: 4 | push: 5 | pull_request: 6 | 7 | jobs: 8 | test-simpleaf1: 9 | runs-on: ubuntu-latest 10 | steps: 11 | - uses: actions/checkout@v2 12 | - name: Checkout roe repo 13 | uses: actions/checkout@v3 14 | with: 15 | repository: COMBINE-lab/roe 16 | path: roe 17 | - uses: conda-incubator/setup-miniconda@v2 18 | with: 19 | python-version: 3.7 20 | mamba-version: "*" 21 | channels: conda-forge,bioconda 22 | channel-priority: true 23 | activate-environment: anaconda-client-env 24 | environment-file: simpleaf_conda_env.yml 25 | - name: install roe 26 | shell: bash -l {0} 27 | run: | 28 | Rscript -e "devtools::install(pkg=\"roe\", dependencies=FALSE)" 29 | # Rscript -e "devtools::install()" 30 | - name: Test simpleaf 31 | shell: bash -l {0} 32 | run: | 33 | cd bash 34 | chmod +x test_simpleaf.sh 35 | ./test_simpleaf.sh 36 | - name: Check job status 37 | run: echo "This job's status is ${{ job.status }}." 
38 | 39 | # jobs: 40 | # test-simpleaf: 41 | # runs-on: ubuntu-latest 42 | # defaults: 43 | # run: 44 | # shell: bash -l {0} 45 | # steps: 46 | # - name: Checkout 47 | # uses: actions/checkout@v3 48 | 49 | # - name: Checkout roe repo 50 | # uses: actions/checkout@v3 51 | # with: 52 | # repository: COMBINE-lab/roe 53 | # path: roe 54 | # - name: traverse roe folder 55 | # run: | 56 | # cd roe 57 | # ls -lh 58 | # - name: Add conda to system path 59 | # run: | 60 | # # $CONDA is an environment variable pointing to the root of the miniconda directory 61 | # echo $CONDA/bin >> $GITHUB_PATH 62 | # conda env update --file simpleaf_conda_env.yml --name base 63 | # conda activate base 64 | # # Rscript -e "devtools::install(pkg=\"./roe\", dependencies=FALSE)" 65 | # Rscript -e "devtools::install()" 66 | # - name: Install dependencies 67 | # run: | 68 | # conda env update --file simpleaf_conda_env.yml --name base 69 | # - name: install roe 70 | # run: | 71 | # Rscript -e "devtools::install(pkg=\"./roe\", dependencies=FALSE)" 72 | # - name: Test simpleaf 73 | # run: | 74 | # cd bash 75 | # chmod +x test_simpleaf.sh 76 | # ./test_simpleaf.sh 77 | # - name: Check job status 78 | # run: echo "This job's status is ${{ job.status }}." 
79 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 
92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2021, COMBINE lab 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | 1. Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | 2. Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | 3. Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | -------------------------------------------------------------------------------- /R/README.md: -------------------------------------------------------------------------------- 1 | ## Useful R functions for preparing and processing data for alevin-fry. 2 | 3 | * `build_splici_ref.R` — A script to build a spliced + intron (splici) ref for indexing and quantification with `alevin-fry`. This function is now included in the [`roe`](https://github.com/COMBINE-lab/roe) package. Please follow [these instructions](https://github.com/COMBINE-lab/roe#installlation) to install `roe`. 4 | 5 | * `cellRangerLikeEmptyDrops.R` — An implementation of the hybrid UMI count filtering and [`emptyDrops`](https://github.com/MarioniLab/DropletUtils) used by CellRanger (and subsequently by [STARsolo](https://github.com/alexdobin/STAR)). This R implementation is a translation of the implementation in STARsolo, which itself was reverse-engineered from CellRanger. *This script should only be used with older versions of R (< R v4.1.0 and BioC 3.14).* If you have a newer version of R, please use the `emptyDropsCellRanger()` function in the [`DropletUtils`](https://bioconductor.org/packages/release/bioc/html/DropletUtils.html) BioConductor package. 
6 | * `load_fry.R` — Contains a function to load `alevin-fry` output (including from USA mode quantification) into a [`SingleCellExperiment`](https://bioconductor.org/packages/release/bioc/html/SingleCellExperiment.html) object. *This script should only be used with older versions of R (< R v4.1.0 and BioC 3.14).* If you have a newer version of R, please use the `loadFry()` function in the [`fishpond`](https://bioconductor.org/packages/release/bioc/html/fishpond.html) BioConductor package. 7 | -------------------------------------------------------------------------------- /R/build_splici_ref.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | # usage : 4 | # $ ./build_splici_ref.R 5 | 6 | # install BioC dependencies if necessary 7 | if ( (!requireNamespace("eisaR", quietly = TRUE)) || 8 | (!requireNamespace("BSgenome", quietly = TRUE)) || 9 | (!requireNamespace("fishpond", quietly = TRUE)) ) { 10 | 11 | # install BioC itself, if we don't have it 12 | if (!requireNamespace("BiocManager", quietly = TRUE)) { 13 | install.packages("BiocManager") 14 | } 15 | 16 | BiocManager::install(c("eisaR","BSgenome","fishpond")) 17 | } 18 | 19 | # install argparser 20 | if (!requireNamespace("argparser", quietly = TRUE)) 21 | install.packages("argparser") 22 | 23 | # install devtools 24 | if (!requireNamespace("devtools", quietly = TRUE)) 25 | install.packages("devtools") 26 | 27 | # install roe from github 28 | if (!requireNamespace("roe", quietly = TRUE)) 29 | devtools::install_github("COMBINE-lab/roe") 30 | 31 | # load packages 32 | suppressPackageStartupMessages({ 33 | library(argparser) 34 | library(roe) 35 | }) 36 | 37 | # Create a parser 38 | p <- arg_parser("Build a splici reference from a genome and GTF file.") 39 | 40 | # Add command line arguments 41 | # required arguments 42 | p <- add_argument(p, "genome", help="The path to a genome FASTA file.") 43 | p <- add_argument(p, "gtf", help="The path to a gtf 
file.") 44 | p <- add_argument(p, "read-length", help="The read length of the single-cell experiment being processed (determines flank size).", 45 | type="numeric") 46 | p <- add_argument(p, "output-dir", 47 | help="The output directory where splici reference files will be written.") 48 | 49 | # optional arguments 50 | p <- add_argument(p, "--flank-trim-length", 51 | help="Determines the amount subtracted from the read length to get the flank length.", 52 | type="numeric", 53 | default=5) 54 | p <- add_argument(p, "--filename-prefix", 55 | help="The file name prefix of the generated output files.", 56 | default="splici") 57 | p <- add_argument(p, "--extra-spliced", 58 | help="The path to an extra spliced sequence fasta file.") 59 | p <- add_argument(p, "--extra-unspliced", 60 | help="The path to an extra unspliced sequence fasta file.") 61 | p <- add_argument(p, "--dedup-seqs", 62 | help="A flag indicates whether identical sequences will be deduplicated.", 63 | flag=TRUE) 64 | p <- add_argument(p, "--no-flanking-merge", 65 | help="A flag indicates whether flank lengths will be considered when merging introns.", 66 | flag=TRUE) 67 | 68 | # Parse the command line arguments 69 | argv <- parse_args(p) 70 | 71 | # Set NAs to NULLs and call the function 72 | if (is.na(argv$extra_spliced)) { 73 | argv$extra_spliced <- NULL 74 | } 75 | if (is.na(argv$extra_unspliced)) { 76 | argv$extra_unspliced <- NULL 77 | } 78 | 79 | make_splici_txome(gtf_path = argv$gtf, 80 | genome_path = argv$genome, 81 | read_length = argv$read_length, 82 | output_dir = argv$output_dir, 83 | flank_trim_length = argv$flank_trim_length, 84 | filename_prefix = argv$filename_prefix, 85 | extra_spliced = argv$extra_spliced, 86 | extra_unspliced = argv$extra_unspliced, 87 | dedup_seqs = argv$dedup_seqs, 88 | no_flanking_merge = argv$no_flanking_merge) 89 | -------------------------------------------------------------------------------- /R/cellRangerLikeEmptyDrops.R: 
-------------------------------------------------------------------------------- 1 | # To use this function, source it to your environment by `source("cellRangerLikeEmptyDrops.R")` 2 | #' An approximate implementation of the `--soloCellFilter EmptyDrops_CR` filtering approach to identify empty droplets. 3 | #' 4 | #' An approximate implementation of the `--soloCellFilter EmptyDrops_CR` filtering approach, 5 | #' which, itself, was reverse-engineered from the behavior of CellRanger 3+. 6 | #' 7 | #' @param m A numeric matrix-like object containing counts, where columns represent barcoded droplets and rows represent features. 8 | #' The matrix should only contain barcodes for an individual sample, prior to any filtering for cells. 9 | #' 10 | #' @param umiMin A numeric scalar specifying the minimum UMI count above which a sample will be included in ambient profiles, 11 | #' as specified in the call to CellRanger. 12 | #' 13 | #' @param umiMinFracMedian A numeric scalar between 0 and 1 specifying that only the samples whose UMI counts are above \code{umiMinFracMedian} 14 | #' fraction of the median UMI count of the top \code{nExpectedCells} samples will be included in the ambient profile, 15 | #' as specified in the call to CellRanger. 16 | #' 17 | #' @param candMaxN An integer specifying the maximum number of ambient samples that are possible to be regarded as real cells, 18 | #' as specified in the call to CellRanger. 19 | #' 20 | #' @param indMax An integer specifying the highest UMI count ranking of the ambient pool, cells with UMI count ranking above 21 | #' this number will not be included in the ambient pool, as specified in the call to CellRanger. 22 | #' 23 | #' @param indMin An integer specifying the lowest UMI count ranking of the ambient pool, cells with UMI count ranking below 24 | #' this number will not be included in the ambient pool, as specified in the call to CellRanger. 
25 | #' 26 | #' @param fdr_thresh A numeric scalar specifying the FDR threshold to filter samples. Samples whose FDR returned by emptyDrops 27 | #' is above this threshold will not be regarded as real cells, as specified in the call to CellRanger. 28 | #' 29 | #' @param maxPercentile A numeric scalar specifying the percentile used in simple filtering, samples selected by simple filtering 30 | #' will be regarded as real cells regardless of the \code{emptyDrops} result, as specified in the call to CellRanger. 31 | #' 32 | #' @param nExpectedCells A numeric scalar specifying the expected number of cells in this sample, as specified in the call to CellRanger. 33 | #' 34 | #' @param maxMinRatio A numeric scalar specifying the maximum ratio of maximum UMI count and minimum UMI count used in simple filtering, 35 | #' maximum UMI count used in simple filtering is determined first by \code{nExpectedCells*(1-maxPercentile)}, minimum UMI count used in 36 | #' simple filtering is then determined by this ratio, as specified in the call to CellRanger. 37 | #' 38 | #' @param seed Integer specifying the seed that will be used to run \code{emptyDrops}. 39 | #' @param ... For the generic, further arguments to pass to \code{emptyDrops}. 40 | #' 41 | #' @details 42 | #' This function is an approximate implementation of the `--soloCellFilter EmptyDrops_CR` filtering approach of STARsolo 43 | #' (\url{https://www.biorxiv.org/content/10.1101/2021.05.05.442755v1}), which, itself, was reverse engineered from the behavior of CellRanger 3+. 44 | #' The original C++ code on which this function is based can be found at 45 | #' (\url{https://github.com/alexdobin/STAR/blob/master/source/SoloFeature_cellFiltering.cpp}). 46 | #' All parameters default to the values used in STARsolo and CellRanger. 47 | #' In most cases, users just need to specify the raw and unfiltered count matrix, \code{m}. 48 | #' See \code{?\link{emptyDrops}} for an alternative approach for cell calling. 
49 | #' 50 | #' @return 51 | #' A DataFrame like \code{\link{emptyDrops}}, with an additional binary \code{is.cell} field demonstrating whether 52 | #' samples are estimated as real cells. 53 | #' 54 | #' @author 55 | #' Dongze He, Rob Patro 56 | #' 57 | #' @examples 58 | #' # Mocking up some data: 59 | #' set.seed(0) 60 | #' my.counts <- DropletUtils:::simCounts() 61 | #' 62 | #' # Identify likely cell-containing droplets. 63 | #' e.out <- cellRangerLikeEmptyDrops(my.counts) 64 | #' e.out 65 | #' 66 | #' # Get matrix of estimated cells. 67 | #' cell.counts <- my.counts[, e.out$is.cell] 68 | #' 69 | #' @references 70 | #' Kaminow et al. (2021). 71 | #' STARsolo: accurate, fast and versatile mapping/quantification of single-cell and single-nucleus RNA-seq data 72 | #' \url{https://www.biorxiv.org/content/10.1101/2021.05.05.442755v1} 73 | #' 74 | #' @seealso 75 | #' \code{\link{emptyDrops}}, for another method for calling cells. 76 | #' 77 | #' @name cellRangerLikeEmptyDrops 78 | NULL 79 | 80 | # Authors: Dongze He, Rob Patro 81 | # Center of Bioinformatics and Computational Biology, University of Maryland, College Park, Maryland, 20740 82 | 83 | #' @importFrom DropletUtils emptyDrops 84 | .cellRangerLikeEmptyDrops <- function(m, 85 | umiMin=500, 86 | umiMinFracMedian=0.01, 87 | candMaxN=20000, 88 | indMax=90000, 89 | indMin=45000, 90 | fdr_thresh=0.01, 91 | maxPercentile=0.99, 92 | nExpectedCells=3000, 93 | maxMinRatio=10, 94 | seed=2718, 95 | ... 96 | ) { 97 | # This function is an approximate implementation of the 98 | # `--soloCellFilter EmptyDrops_CR` filtering approach 99 | # of STARsolo (https://www.biorxiv.org/content/10.1101/2021.05.05.442755v1), 100 | # which, itself, was reverse engineered from the behavior of 101 | # CellRanger 3+. 
The original C++ code on which this 102 | # function is based can be found at (https://github.com/alexdobin/STAR/blob/master/source/SoloFeature_cellFiltering.cpp) 103 | 104 | ################################################################################################################### 105 | # get the sorted nUMI vector of cells 106 | csums <- colSums2(m) 107 | indCount <- as.data.frame(cbind(1:length(csums), csums)) 108 | colnames(indCount) <- c("index", "count") 109 | indCount <- indCount[order(indCount$count,decreasing = TRUE),] 110 | 111 | # Simple Filtering 112 | maxind <- round(nExpectedCells * (1 - maxPercentile)) 113 | nUMImax <- indCount$count[min(ncol(m), maxind)] 114 | nUMImin <- round(nUMImax/maxMinRatio) 115 | ncellsSimple <- sum(indCount$count>=nUMImin) 116 | 117 | # set lower bound 118 | minUMI <- max(umiMin, round(umiMinFracMedian * indCount$count[ncellsSimple/2])) 119 | 120 | ## we at most assign candMaxN samples in the ambient pool as real cells 121 | minUMI <- max(minUMI, indCount$count[min(ncellsSimple+candMaxN,nrow(indCount))]) 122 | 123 | # emptyDrops 124 | ## ignore: the lower bound of UMI count, samples with UMI count less than ignore 125 | ## will not be considered as ambient cells. 
126 | ignore_index <- min(ncol(m), indMax) 127 | ignore <- indCount$count[ignore_index] 128 | 129 | ## by.rank: cells with UMI count ranking lower than by.rank will be considered as 130 | ## ambient cells 131 | by.rank <- indMin 132 | 133 | ## retain: samples with UMI count higher than retain will be regarded as cells 134 | retain <- indCount$count[ncellsSimple] 135 | 136 | ## the cells with total UMI count between ignore and lower will be considered as ambient 137 | set.seed(seed) 138 | e.out <- DropletUtils::emptyDrops(m, by.rank=by.rank, ignore=ignore, retain=retain, alpha=Inf) 139 | e.out$is.cell <- e.out$FDR < fdr_thresh 140 | e.out$is.cell[is.na(e.out$is.cell)] <- FALSE 141 | 142 | # further filter cells by minUMI 143 | e.out$is.cell[indCount[indCount$count ${output} 62 | 63 | trap cleanup EXIT 64 | -------------------------------------------------------------------------------- /bash/get_10x_permit_lists.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | display_usage() { 4 | echo "This script downloads a 10x chromium v2 or v3 permit list to the specified output directory" 5 | echo -e "\nUsage: $0 [options]" 6 | echo -e "\toptions:" 7 | echo -e "\t -o, --output REQUIRED path to output directory (will be created if it doesn't exist)" 8 | echo -e "\t -l, --list REQUIRED permit list to download, one of {v2, v3}" 9 | echo -e "\t -h, --help display this help message" 10 | } 11 | 12 | v3_plist="https://umd.box.com/shared/static/eo0qlkfqf2v24ws6dfnxty6gqk1otf2h" 13 | v2_plist="https://umd.box.com/shared/static/jbs2wszgbj7k4ic2hass9ts6nhqkwq1p" 14 | 15 | while [[ "$#" -gt 0 ]]; do 16 | case $1 in 17 | -o|--output) output="$2"; shift ;; 18 | -l|--list) list="$2"; shift;; 19 | -h|--help) help=1; shift;; 20 | *) echo "Unknown parameter passed: $1"; exit 1 ;; 21 | esac 22 | shift 23 | done 24 | 25 | if [[ -n "$help" ]]; then 26 | display_usage 27 | exit 0 28 | fi 29 | 30 | if [[ -z "$output" || -z "$list" ]]; 
then 31 | display_usage 32 | exit 1 33 | fi 34 | 35 | if [ "$list" == "v2" ]; then 36 | mkdir -p $output 37 | wget -v -O $output/10x_v2_permit.txt -L $v2_plist 38 | fi 39 | 40 | if [ "$list" == "v3" ]; then 41 | mkdir -p $output 42 | wget -v -O $output/10x_v3_permit.txt -L $v3_plist 43 | fi 44 | 45 | echo -e "\n\noutput written to ${output}/10x_${list}_permit.txt\n\n" 46 | -------------------------------------------------------------------------------- /bash/simpleaf: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | function display_quant_usage() { 4 | echo "This script runs alevin-fry to quantify a single-cell RNA-seq experiment" 5 | echo -e "\nUsage: $0 quant [options]" 6 | echo -e "\toptions:" 7 | echo -e "\t -1, --r1 REQUIRED comma separated list of left reads" 8 | echo -e "\t -2, --r2 REQUIRED comma separated list of right reads" 9 | echo -e "\t -i, --index REQUIRED path to a (sparse or dense) salmon splici index" 10 | echo -e "\t -o, --output REQUIRED path to output directory (will be created if it doesn't exist)" 11 | echo -e "\t -f, --fmode REQUIRED permit list filter mode, one of {knee, k, unfilt, u}" 12 | echo -e "\t -c, --chem REQUIRED chemistry of experiment, one of {v2, v3}" 13 | echo -e "\t -r, --res REQUIRED resolution strategy for alevin-fry, one of {cr-like, cr-like-em}" 14 | echo -e "\t -m, --t2g REQUIRED three-column txp-to-gene file to pass to alevin-fry quant command" 15 | echo -e "\t -t, --threads OPTIONAL number of threads to use when running [default: min(16, num cores)]" 16 | echo -e "\t -h, --help display this help message" 17 | } 18 | 19 | function display_index_usage() { 20 | echo "This script generates a splici reference and indexes it" 21 | echo -e "\nUsage: $0 index [options]" 22 | echo -e "\toptions:" 23 | echo -e "\t -f, --fasta REQUIRED genome reference FASTA file" 24 | echo -e "\t -g, --gtf REQUIRED GTF file with gene annotations" 25 | echo -e "\t -l, --rlen REQUIRED the 
target read length the index will be built for" 26 | echo -e "\t -o, --output REQUIRED path to output directory (will be created if it doesn't exist)" 27 | echo -e "\t -s, --spliced OPTIONAL path to FASTA file with extra spliced sequence to add to the index" 28 | echo -e "\t -u, --unspliced OPTIONAL path to FASTA file with extra unspliced sequence to add to the index" 29 | echo -e "\t -d, --dedup FLAG OPTIONAL deduplicate identical sequences inside the R script when building the splici reference" 30 | echo -e "\t -e, --dense FLAG OPTIONAL if this flag is passed, build the dense rather than sparse index for mapping" 31 | echo -e "\t -t, --threads OPTIONAL number of threads to use when running [default: min(16, num cores)]" 32 | echo -e "\t -h, --help display this help message" 33 | } 34 | 35 | function display_usage() { 36 | echo "This script wraps alevin-fry to index a reference or quantify a single-cell RNA-seq experiment" 37 | echo -e "\nUsage: $0 index [index_options]" 38 | echo -e "Usage: $0 quant [quant_options]" 39 | echo -e "Usage: $0 -h | --help" 40 | echo -e "use $0 index -h or $0 quant -h to get more help for those sub-commands" 41 | } 42 | 43 | # compare versions. 44 | # from: https://stackoverflow.com/questions/4023830/how-to-compare-two-strings-in-dot-separated-version-format-in-bash 45 | function vercomp () { 46 | if [[ $1 == $2 ]] 47 | then 48 | return 0 49 | fi 50 | local IFS=. 
51 | local i ver1=($1) ver2=($2) 52 | # fill empty fields in ver1 with zeros 53 | for ((i=${#ver1[@]}; i<${#ver2[@]}; i++)) 54 | do 55 | ver1[i]=0 56 | done 57 | for ((i=0; i<${#ver1[@]}; i++)) 58 | do 59 | if [[ -z ${ver2[i]} ]] 60 | then 61 | # fill empty fields in ver2 with zeros 62 | ver2[i]=0 63 | fi 64 | if ((10#${ver1[i]} > 10#${ver2[i]})) 65 | then 66 | return 1 67 | fi 68 | if ((10#${ver1[i]} < 10#${ver2[i]})) 69 | then 70 | return 2 71 | fi 72 | done 73 | return 0 74 | } 75 | 76 | function check_min_version() { 77 | prog_name=$1 78 | min_ver=$2 79 | prog_vstr=`$prog_name --version`; ec=$? 80 | case $ec in 81 | 0) echo "$prog_vstr";; 82 | *) echo "ERROR: $prog_name -v exited with non-zero exit code"; exit 1;; 83 | esac 84 | 85 | # check that the version is new enough 86 | prog_ver=`echo $prog_vstr | cut -d' ' -f2` 87 | 88 | vercomp $prog_ver $min_ver; ec1=$? 89 | 90 | if [[ $ec1 -eq 0 || $ec1 -eq 1 ]]; then 91 | echo "$prog_name version $prog_ver is sufficiently new." 92 | return 0 93 | else 94 | echo "ERROR: $prog_name version was $prog_ver, require at least $min_ver." 95 | exit 1 96 | fi 97 | } 98 | 99 | ### 100 | # end of helper functions, entry point of 101 | # script. 102 | ### 103 | 104 | # make sure that an alevin-fry home is set 105 | if [ -z "${ALEVIN_FRY_HOME}" ]; then 106 | echo "To use $0, you must set a valid ALEVIN_FRY_HOME environment variable" 107 | exit 1 108 | else 109 | ALEVIN_FRY_HOME=$(realpath ${ALEVIN_FRY_HOME}) 110 | fi 111 | 112 | if [ -d "$ALEVIN_FRY_HOME" ]; then 113 | echo "ALEVIN_FRY_HOME=$ALEVIN_FRY_HOME" 114 | else 115 | echo "ALEVIN_FRY_HOME did not exist; creating it." 
116 | mkdir -p "$ALEVIN_FRY_HOME" 117 | fi 118 | 119 | # check that the salmon executable runs and is of at least the required version 120 | salmon="${SALMON_BIN:-salmon}" 121 | check_min_version $salmon "1.5.1" 122 | 123 | # check that the fry executable runs and is of at least the required version 124 | fry="${FRY_BIN:-alevin-fry}" 125 | check_min_version $fry "0.4.0" 126 | 127 | time=${TIME_BIN:-"/usr/bin/time"} 128 | # time writes version to stderr ... sigh 129 | time_vstr=$($time -V 2>&1); ec=$? 130 | if [[ "$ec" -ne 0 ]]; then 131 | echo "$time -V returned non-zero exit code, please set TIME_BIN to point to a GNU time executable" 132 | exit 1 133 | else 134 | if (echo $time_vstr | grep -iq "GNU Time"); then 135 | echo "$time command appears to execute a valid GNU time" 136 | else 137 | echo "$time does not appear to be GNU time ($time -V did not return a string starting with GNU time)" 138 | exit 1; 139 | fi 140 | fi 141 | 142 | # now that we know we have a proper GNU time executable 143 | # redefine it to have the desired options attached 144 | time="$time -v -o" 145 | 146 | threads=16 147 | 148 | if [ -x "$(command -v nproc)" ]; then 149 | np=`nproc` 150 | elif [ -x "$(command -v sysctl)" ]; then # the space before ] is required, otherwise the test itself errors out 151 | np=`sysctl -n hw.ncpu` 152 | else 153 | np=16 154 | fi 155 | 156 | if [ "$threads" -gt "$np" ]; then 157 | threads="$np" 158 | fi 159 | 160 | function simpleaf_index() { 161 | while [[ "$#" -gt 0 ]]; do 162 | case $1 in 163 | -f|--fasta) genome="$2"; shift ;; 164 | -g|--gtf) gtf="$2"; shift ;; 165 | -l|--rlen) rlen="$2"; shift ;; 166 | -o|--output) output="$2"; shift ;; 167 | -t|--threads) threads="$2"; shift ;; 168 | -s|--spliced) extra_spliced="$2"; shift ;; 169 | -u|--unspliced) extra_unspliced="$2"; shift ;; 170 | -e|--dense) dense_index=1 ;; # boolean flag: consumes no value, so only the shared shift below applies 171 | -d|--dedup) dedup=1 ;; # boolean flag (see -e) 172 | -h|--help) help=1 ;; # boolean flag (see -e) 173 | *) echo "Unknown parameter passed: $1"; exit 1 ;; 174 | esac 175 | shift 176 | done 177 | set -o errexit -o pipefail 178 | 179 | if [[ 
-n "$help" ]]; then 180 | display_index_usage 181 | exit 0 182 | fi 183 | 184 | 185 | if [[ -z "$genome" || -z "$gtf" || -z "$rlen" || -z "$output" ]]; then 186 | display_index_usage 187 | exit 1 188 | fi 189 | 190 | # make the directory where we will put the reference 191 | output=$(realpath ${output}) 192 | outref="$output/ref/" 193 | mkdir -p $outref 194 | 195 | cwd=$(pwd) 196 | cd "$(dirname "${BASH_SOURCE[0]}")/../R" # locate the R helpers relative to this script, not the caller's working directory 197 | 198 | if [[ -z "$extra_spliced" ]]; then 199 | extra_spliced_flag="" 200 | else 201 | extra_spliced_flag="--extra-spliced $extra_spliced" 202 | fi 203 | 204 | if [[ -z "$extra_unspliced" ]]; then 205 | extra_unspliced_flag="" 206 | else 207 | extra_unspliced_flag="--extra-unspliced $extra_unspliced" 208 | fi 209 | 210 | if [[ -z "$dedup" ]]; then 211 | dedup_flag="" 212 | else 213 | dedup_flag="--dedup-seqs" 214 | fi 215 | # $ ./build_splici_ref.R 216 | cmd="Rscript build_splici_ref.R $genome $gtf $rlen $outref $extra_spliced_flag $extra_unspliced_flag $dedup_flag --filename-prefix splici" 217 | echo -e "\nExtracting the splici reference using command \n\n $cmd \n" 218 | eval $cmd 219 | 220 | echo -e "\nDone. Building index." 221 | cd "$cwd" 222 | 223 | outidx="$output/index" 224 | fl=$(( $rlen - 5 )) 225 | 226 | # the default is sparse, but if the user 227 | # passed the -e/--dense flag, then don't 228 | # pass the sparse flag, and build a dense 229 | # index instead. 230 | sparse_flag="--sparse" 231 | if [[ -n "$dense_index" ]]; then 232 | sparse_flag="" 233 | fi 234 | 235 | cmd="$salmon index -t $outref/splici_fl$fl.fa -i $outidx -p $threads $sparse_flag" 236 | echo -e "\nbuilding index:" 237 | echo "command: $cmd" 238 | echo "=============" 239 | eval $cmd 240 | 241 | cp $outref/splici_fl${fl}_t2g_3col.tsv $outidx/t2g_3col.tsv 242 | echo -e "\nDone. 
Wrote index to $outidx" 243 | } 244 | 245 | function simpleaf_quant() { 246 | while [[ "$#" -gt 0 ]]; do 247 | case $1 in 248 | -1|--r1) read1="$2"; shift ;; 249 | -2|--r2) read2="$2"; shift ;; 250 | -i|--index) index="$2"; shift ;; 251 | -t|--threads) threads="$2"; shift ;; 252 | -o|--output) output="$2"; shift ;; 253 | -f|--fmode) fmode="$2"; shift ;; 254 | -r|--res) res="$2"; shift ;; 255 | -c|--chem) chem="$2"; shift ;; 256 | -m|--t2g) t2g="$2"; shift ;; 257 | -h|--help) help=1; shift ;; 258 | *) echo "Unknown parameter passed: $1"; exit 1 ;; 259 | esac 260 | shift 261 | done 262 | 263 | set -o errexit -o pipefail 264 | 265 | if [[ -n "$help" ]]; then 266 | display_quant_usage 267 | exit 0 268 | fi 269 | 270 | 271 | if [[ -z "$read1" || -z "$read2" || -z "$index" || -z "$threads" || -z "$output" || -z "$fmode" || -z "$res" || -z "$chem" || -z "$t2g" ]]; then 272 | display_quant_usage 273 | exit 1 274 | fi 275 | 276 | ## Check that the fitler mode is one of knee,k,unfilt,u 277 | if [[ "$fmode" == "k" || "$fmode" == "knee" || "$fmode" == "unfilt" || "$fmode" == "u" ]]; then 278 | echo "filter mode is : $fmode" 279 | else 280 | echo "filter mode must be one of {knee, k, unfilt, u}" 281 | exit 1 282 | fi 283 | 284 | ## Check that the chemistry is either v2 or v3 285 | if [[ "$chem" == "v2" || "$chem" == "v3" ]]; then 286 | echo "chemistry is : 10x chromimum $chem" 287 | else 288 | echo "chemistry mode must be one of {v2, v3}" 289 | exit 1 290 | fi 291 | 292 | ## If the chemistry is v2, set the chemflag and the fitler flags 293 | if [[ $chem == "v2" ]]; then 294 | if [[ "$fmode" == "unfilt" || "$fmode" == "u" ]]; then 295 | v2file=$ALEVIN_FRY_HOME/plist/10x_v2_permit.txt 296 | if [ ! 
-f "$v2file" ]; then 297 | echo "10x v2 permit list does not exist, downloading now" 298 | bash "$(dirname "${BASH_SOURCE[0]}")/get_10x_permit_lists.sh" -o $ALEVIN_FRY_HOME/plist -l v2 # helper lives next to this script; do not depend on the caller's working directory 299 | fi 300 | permitmode="-u $ALEVIN_FRY_HOME/plist/10x_v2_permit.txt" 301 | else 302 | permitmode="-k" 303 | fi 304 | chemflag="--chromium" 305 | fi 306 | 307 | ## If the chemistry is v3, set the chemflag and the filter flags 308 | if [[ $chem == "v3" ]]; then 309 | if [[ "$fmode" == "unfilt" || "$fmode" == "u" ]]; then 310 | v3file=$ALEVIN_FRY_HOME/plist/10x_v3_permit.txt 311 | if [ ! -f "$v3file" ]; then 312 | echo "10x v3 permit list does not exist, downloading now" 313 | bash "$(dirname "${BASH_SOURCE[0]}")/get_10x_permit_lists.sh" -o $ALEVIN_FRY_HOME/plist -l v3 # helper lives next to this script; do not depend on the caller's working directory 314 | fi 315 | permitmode="-u $ALEVIN_FRY_HOME/plist/10x_v3_permit.txt" 316 | else 317 | permitmode="-k" 318 | fi 319 | chemflag="--chromiumV3" 320 | fi 321 | 322 | mkdir -p $output/logdir 323 | logdir="$output/logdir" 324 | 325 | ## turn comma separated list into space separated list 326 | read1=`echo $read1 | tr ',' ' '` 327 | read2=`echo $read2 | tr ',' ' '` 328 | 329 | ## map 330 | cmd="$time $logdir/map.time $salmon alevin -l ISR -i $index -1 $read1 -2 $read2 -p $threads $chemflag -o $output/alevin_map --sketch" 331 | echo "mapping:" 332 | echo "command: $cmd" 333 | echo "=============" 334 | eval $cmd 335 | 336 | ### generate permit list 337 | cmd="$time $logdir/gpl.time $fry generate-permit-list $permitmode -d fw -i $output/alevin_map -o $output/gpl/ |& stdbuf -oL tr '\r' '\n' > $logdir/gpl.log" 338 | echo "gpl:" 339 | echo "command: "$cmd 340 | echo "=============" 341 | eval $cmd 342 | 343 | ### collate 344 | cmd="$time $logdir/collate.time $fry collate -i $output/gpl/ -r $output/alevin_map -t $threads |& stdbuf -oL tr '\r' '\n' > $logdir/collate.log" 345 | echo "collate:" 346 | echo "command: "$cmd 347 | echo "=============" 348 | eval $cmd 349 | 350 | ### quant 351 | cmd="$time $logdir/quant.time $fry quant -r $res --use-mtx -m $t2g -i $output/gpl/ -o $output/quant -t 
$threads |& stdbuf -oL tr '\r' '\n' > $logdir/quant.log" 352 | echo "quant:" 353 | echo "command: "$cmd 354 | echo "=============" 355 | eval $cmd 356 | 357 | echo "Finished! Quantification results are at $output/quant" 358 | } 359 | 360 | if [[ $# -eq 0 || "$1" == "-h" || "$1" == "--help" ]]; then 361 | display_usage 362 | exit 0 363 | fi 364 | 365 | if [[ "$1" == "quant" ]]; then 366 | shift; 367 | simpleaf_quant "$@" 368 | elif [[ "$1" == "index" ]]; then 369 | shift; 370 | simpleaf_index "$@" 371 | else 372 | echo -e "\nERROR must pass a valid sub-command, index or quant" 373 | display_usage 374 | exit 1 375 | fi 376 | -------------------------------------------------------------------------------- /bash/test_simpleaf.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # In this script we test simpleaf using a toy read-reference set 3 | # template took from here https://stackoverflow.com/a/34676160/18156398 4 | # the directory of the script 5 | DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" 6 | echo "Testing simpleaf using a toy read-reference set" 7 | 8 | # the temp directory used, within $DIR 9 | # omit the -p parameter to create a temporal directory in the default location 10 | # WORK_DIR=`mktemp -d -p "$DIR"` 11 | WORK_DIR=`mktemp -d` 12 | LOG_DIR="${WORK_DIR}/simpleaf_logs" 13 | ALEVIN_FRY_HOME="${WORK_DIR}/alevin_fry_home" 14 | mkdir -p $LOG_DIR 15 | 16 | # check if tmp dir was created 17 | if [[ ! "$WORK_DIR" || ! 
-d "$WORK_DIR" ]]; then 18 | echo "Could not create temp dir" 19 | exit 1 20 | fi 21 | 22 | # deletes the temp directory 23 | function cleanup { 24 | rm -rf "$WORK_DIR" 25 | echo " - Deleted temp working directory $WORK_DIR" 26 | } 27 | 28 | # implementation of script starts here 29 | echo " - Downloading the toy read-reference set" 30 | wget https://umd.box.com/shared/static/lx2xownlrhz3us8496tyu9c4dgade814.gz -O ${WORK_DIR}/toy_read_ref_set.tar.gz -q 31 | tar -xf ${WORK_DIR}/toy_read_ref_set.tar.gz -C ${WORK_DIR} 32 | 33 | echo " - Testing simpleaf index" 34 | REF_DIR="${WORK_DIR}/toy_read_ref_set/toy_human_ref" 35 | index_cmd="ALEVIN_FRY_HOME=$ALEVIN_FRY_HOME \ 36 | ${DIR}/simpleaf index -f ${REF_DIR}/fasta/genome.fa \ 37 | -g ${REF_DIR}/genes/genes.gtf \ 38 | -l 91 -o ${WORK_DIR}/test_index_outdir" 39 | eval $index_cmd 40 | status=$? 41 | 42 | if [ $status -ne 0 ]; then 43 | echo "ERROR when running simpleaf index" 44 | exit 1 45 | else 46 | echo "simpleaf index ran successfully" 47 | fi 48 | 49 | echo " - Testing simpleaf quant" 50 | FASTQ_DIR="${WORK_DIR}/toy_read_ref_set/toy_read_fastq" 51 | quant_cmd="ALEVIN_FRY_HOME=$ALEVIN_FRY_HOME \ 52 | ${DIR}/simpleaf quant \ 53 | -1 ${FASTQ_DIR}/selected_R1_reads.fastq \ 54 | -2 ${FASTQ_DIR}/selected_R2_reads.fastq \ 55 | -i ${WORK_DIR}/test_index_outdir/index \ 56 | -o ${WORK_DIR}/test_quant_outdir \ 57 | -f u -c v3 -r cr-like \ 58 | -m ${WORK_DIR}/test_index_outdir/index/t2g_3col.tsv \ 59 | -t 16" 60 | eval $quant_cmd 61 | status=$? 62 | 63 | if [ $status -ne 0 ]; then 64 | echo "ERROR when running simpleaf quant" 65 | exit 1 66 | else 67 | echo " - simpleaf quant ran successfully" 68 | fi 69 | # register the cleanup function to be called on the EXIT signal 70 | status=$? 71 | [ "$status" -eq 0 ] && rm -rf $WORK_DIR 72 | 73 | echo "simpleaf works!" 
74 | -------------------------------------------------------------------------------- /docker/Dockerfile: -------------------------------------------------------------------------------- 1 | # image: COMBINE-lab/dockeraf 2 | FROM condaforge/mambaforge:4.11.0-4 3 | MAINTAINER salmon.maintainer@gmail.com 4 | 5 | RUN mamba install -c anaconda git 6 | RUN mamba install -c bioconda salmon gffread 7 | RUN mamba install -c conda-forge cxx-compiler 8 | RUN mamba install -c conda-forge r-base r-essentials 9 | RUN mamba install -c bioconda bioconductor-eisar bioconductor-biostrings bioconductor-bsgenome bioconductor-genomicfeatures bioconductor-singlecellexperiment 10 | RUN mamba install -c conda-forge r-stringr r-argparser r-rjson r-matrix 11 | RUN mamba install -c conda-forge time 12 | RUN mamba install -c conda-forge r-devtools 13 | RUN R -e "devtools::install_github('COMBINE-lab/roe')" 14 | RUN mamba install -c bioconda alevin-fry 15 | 16 | RUN touch /root/.bashrc \ 17 | && echo "export ALEVIN_FRY_HOME=/workdir/.afhome" >> /root/.bashrc \ 18 | && echo "export TIME_BIN=/opt/conda/bin/time" >> /root/.bashrc 19 | 20 | RUN touch /root/.Rprofile 21 | 22 | RUN echo 'local({r <- getOption("repos") \n\ 23 | r["CRAN"] <- "http://cran.r-project.org" \n\ 24 | options(repos=r) \n\ 25 | })' >> /root/.Rprofile 26 | 27 | RUN git clone https://github.com/COMBINE-lab/usefulaf.git 28 | 29 | ENV ALEVIN_FRY_HOME=/workdir/.afhome 30 | ENV TIME_BIN=/opt/conda/bin/time 31 | -------------------------------------------------------------------------------- /docker/Singularity.def: -------------------------------------------------------------------------------- 1 | Bootstrap: docker 2 | From: condaforge/mambaforge:4.10.3-1 3 | Stage: spython-base 4 | 5 | %labels 6 | MAINTAINER salmon.maintainer@gmail.com 7 | %post 8 | # image: COMBINE-lab/dockeraf 9 | 10 | mamba install -c anaconda git 11 | mamba install -c bioconda salmon alevin-fry gffread 12 | mamba install -c conda-forge cxx-compiler 13 | 
mamba install -c conda-forge r-base r-essentials 14 | mamba install -c bioconda bioconductor-eisar bioconductor-biostrings bioconductor-bsgenome bioconductor-genomicfeatures bioconductor-singlecellexperiment 15 | mamba install -c conda-forge r-stringr r-argparser r-rjson r-matrix 16 | mamba install -c conda-forge time 17 | 18 | git clone https://github.com/COMBINE-lab/usefulaf.git 19 | %environment 20 | export ALEVIN_FRY_HOME="/workdir/.afhome" 21 | export TIME_BIN="/opt/conda/bin/time" 22 | %runscript 23 | exec /bin/bash "$@" 24 | %startscript 25 | exec /bin/bash "$@" 26 | -------------------------------------------------------------------------------- /docker/build_docker.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | USEFULAF_VERSION=0.5.5 3 | docker build --no-cache -t combinelab/usefulaf:${USEFULAF_VERSION} -t combinelab/usefulaf:latest . 4 | -------------------------------------------------------------------------------- /python/load_fry.py: -------------------------------------------------------------------------------- 1 | import scanpy 2 | 3 | def load_fry(frydir, which_counts={'X' : ['S','A']}, verbose=False): 4 | """ 5 | 6 | Parameters: 7 | frydir - The directory containing the alevin-fry quantification (i.e. the the quant.json file & alevin subdirectory). 8 | verbose - True if messages (including error messages) should be printed out, False if function should be quiet. 9 | which_count - Dictionary specifying how a USA mode matrix should be returned or combined into the resulting 10 | output matrix. If the input is not a USA mode quantification directory, this parameter is ignored 11 | and the count matrix is returned in the `X` field of the returned `AnnData` object. 
If the input 12 | quantification directory contains a USA mode quantification, then there are 3 sub-matrices that can 13 | be referenced in the dictionary; 'U', 'S', 'A' containing, respectively, unspliced, spliced and 14 | ambiguous counts. The dictionary should have entries of the form `key` (str) : `value` (list[str]). 15 | The following constraints apply : there should be one key-value pair with the key `X`, the resulting 16 | counts will be returned in the `X` field of the AnnData object. There can be an arbitrary number 17 | of other key-value pairs, but each will be returned as a layer of the resulting AnnData object. 18 | Within the key-value pairs, the key refers to the layer name that will be given to the combined 19 | count matrix upon output, and the value should be a subset of `['U', 'S', 'A']` that defines 20 | which sub-matrices should be summed. For example: 21 | {'X' : ['S', 'A'], 'unspliced' : ['U']} 22 | 23 | will result in a return AnnData object where the X field has a matrix in which each entry 24 | corresponds to the summed spliced and ambiguous counts for each gene in each cell, and there 25 | is an additional 'unspliced' layer, whose counts are taken directly from the unspliced sub-matrix. 26 | 27 | Returns: 28 | An AnnData object with X and layers corresponding to the requested `which_counts`, or None if an 29 | error is encountered. 30 | """ 31 | import json 32 | import os 33 | import pandas as pd 34 | 35 | # since alevin-fry 0.4.1 the generic "meta_info.json" 36 | # has been replaced by a more informative name for each 37 | # sub-command. For quantification, it is "quant.json". 38 | # we check for both files here, in order. 39 | meta_info_files = ["quant.json", "meta_info.json"] 40 | 41 | fpath = os.path.sep.join([frydir, meta_info_files[0]]) 42 | # first, check for the new file, if we don't find it, check 43 | # for the old one. 
44 | if not os.path.exists(fpath): 45 | if verbose: 46 | print(f"Did not find a {meta_info_files[0]} file, checking for older {meta_info_files[1]}.") 47 | fpath = os.path.sep.join([frydir, meta_info_files[1]]) 48 | # if we don't find the old one either, then return None 49 | if not os.path.exists(fpath): 50 | if verbose: 51 | print(f"Found no {meta_info_files[1]} file either; cannot proceed.") 52 | return None 53 | 54 | # if we got here then we had a valid json file, so 55 | # use it to get the number of genes, and if we are 56 | # in USA mode or not. 57 | meta_info = json.load(open(fpath)) 58 | ng = meta_info['num_genes'] 59 | usa_mode = meta_info['usa_mode'] 60 | 61 | # if we are in USA mode 62 | if usa_mode: 63 | # make sure that num_genes is a multiple of 3 64 | if ng %3 != 0: 65 | if verbose: 66 | print("Found USA mode, but num genes = {ng} is not a multiple of 3; cannot proceed.") 67 | return None 68 | # each gene has 3 splicing statuses, so the actual number of distinct 69 | # genes is ng/3. 
70 | ng = int(ng/3) 71 | if verbose: 72 | print("processing input in USA mode, will return {}".format("+".join(which_counts))) 73 | 74 | # make sure which_counts isn't empty 75 | assert(len(which_counts) > 0) 76 | 77 | # make sure the specification in which_counts is OK 78 | if 'X' not in which_counts: 79 | if verbose: 80 | print('In USA mode some sub-matrices must be assigned to the \"X\" (default) output.') 81 | return None 82 | if verbose: 83 | print(f"will populate output field X with sum of counts frorm {which_counts['X']}.") 84 | 85 | for k,v in which_counts.items(): 86 | valid_elem = len(set(v) - set(['U', 'S', 'A'])) == 0 87 | if not valid_elem: 88 | if verbose: 89 | print(f'Found non-USA element in which_count element list \"{v}\" for key \"{k}\"; cannot proceed.') 90 | return None 91 | if verbose and (k != 'X'): 92 | print(f'will combine {v} into output layer {k}.') 93 | 94 | elif verbose: 95 | print("Processing input in standard mode, will return processed count (which_count will be ignored).") 96 | 97 | # read the actual input matrix 98 | af_raw = scanpy.read_mtx(os.path.sep.join([frydir, "alevin", "quants_mat.mtx"])) 99 | afg = [ l.rstrip() for l in open(os.path.sep.join([frydir, "alevin", "quants_mat_cols.txt"])).readlines()][:ng] 100 | # read the gene ids 101 | afg_df = pd.DataFrame(afg, columns=["gene_ids"]) 102 | afg_df = afg_df.set_index("gene_ids") 103 | # and the barcodes 104 | abc = [ l.rstrip() for l in open(os.path.sep.join([frydir, "alevin", "quants_mat_rows.txt"])).readlines() ] 105 | abc_df = pd.DataFrame(abc, columns=["barcodes"]) 106 | abc_df.index = abc_df["barcodes"] 107 | 108 | x = af_raw.X 109 | # if we're not in USA mode, just combine this info into 110 | # an AnnData object 111 | if not usa_mode: 112 | af = scanpy.AnnData(x.T, var=abc_df, obs=afg_df) 113 | af = af.T 114 | else: # USA mode 115 | # otherwise, combine the sub-matrices into the output object as 116 | # specified by `which_counts` 117 | rd = {'S' : range(0,ng), 'U' : 
range(ng, 2*ng), 'A' : range(2*ng,3*ng)} 118 | xcounts = which_counts['X'] 119 | o = x[:, rd[xcounts[0]]] 120 | for wc in xcounts[1:]: 121 | o += x[:, rd[wc]] 122 | af = scanpy.AnnData(o.T, var=abc_df, obs=afg_df) 123 | af = af.T 124 | 125 | # now, if there are other layers requested, populate those 126 | for other_layer in which_counts.keys() - 'X': 127 | xcounts = which_counts[other_layer] 128 | o = x[:, rd[xcounts[0]]] 129 | for wc in xcounts[1:]: 130 | o += x[:, rd[wc]] 131 | af.layers[other_layer] = o 132 | return af 133 | -------------------------------------------------------------------------------- /simpleaf/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "simpleaf" 3 | version = "0.1.0" 4 | edition = "2021" 5 | 6 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 7 | 8 | [dependencies] 9 | anyhow = "^1.0" 10 | clap = { version = ">=3.2.12", features = ["derive", "wrap_help", "cargo", "deprecated", "wrap_help"]} 11 | cmd_lib = "^1.3.0" 12 | env_logger = "^0.9.0" 13 | log = "^0.4.17" 14 | semver = "^1.0.12" 15 | serde = {version = "1.0.139", features = ["derive"]} 16 | serde_json = "1.0.82" 17 | time = {version = "^0.3.11", features = ["macros", "formatting", "parsing", "serde", "serde-human-readable"]} 18 | which = "^4.2.5" 19 | 20 | 21 | [profile.release] 22 | lto = "thin" 23 | opt-level = 3 24 | -------------------------------------------------------------------------------- /simpleaf/src/main.rs: -------------------------------------------------------------------------------- 1 | extern crate env_logger; 2 | #[macro_use] 3 | extern crate log; 4 | 5 | use anyhow::{anyhow, bail, Context, Result}; 6 | use clap::{ArgGroup, Parser, Subcommand}; 7 | use cmd_lib::run_fun; 8 | use env_logger::Env; 9 | use serde_json::json; 10 | use time::Instant; 11 | 12 | use std::env; 13 | use std::io::BufReader; 14 | use std::path::PathBuf; 15 | 16 | mod 
utils; 17 | use utils::af_utils::*; 18 | use utils::prog_utils::*; 19 | 20 | #[derive(Debug, Subcommand)] 21 | enum Commands { 22 | /// build the splici index 23 | #[clap(arg_required_else_help = true)] 24 | Index { 25 | /// reference genome 26 | #[clap(short, long, value_parser)] 27 | fasta: PathBuf, 28 | 29 | /// reference GTF file 30 | #[clap(short, long, value_parser)] 31 | gtf: PathBuf, 32 | 33 | /// the target read length the index will be built for 34 | #[clap(short, long, value_parser)] 35 | rlen: u32, 36 | 37 | /// path to output directory (will be created if it doesn't exist) 38 | #[clap(short, long, value_parser)] 39 | output: PathBuf, 40 | 41 | /// path to FASTA file with extra spliced sequence to add to the index 42 | #[clap(short, long, value_parser)] 43 | spliced: Option, 44 | 45 | /// path to FASTA file with extra unspliced sequence to add to the index 46 | #[clap(short, long, value_parser)] 47 | unspliced: Option, 48 | 49 | /// deduplicate identical sequences inside the R script when building the splici reference 50 | #[clap(short = 'd', long = "dedup", action)] 51 | dedup: bool, 52 | 53 | /// if this flag is passed, build the sparse rather than dense index for mapping 54 | #[clap(short = 'p', long = "sparse", action)] 55 | sparse: bool, 56 | 57 | /// number of threads to use when running [default: min(16, num cores)]" 58 | #[clap(short, long, default_value_t = 16, value_parser)] 59 | threads: u32, 60 | }, 61 | /// quantify a sample 62 | #[clap(arg_required_else_help = true)] 63 | #[clap(group( 64 | ArgGroup::new("filter") 65 | .required(true) 66 | .args(&["knee", "unfiltered-pl", "forced-cells", "expect-cells"]) 67 | ))] 68 | Quant { 69 | /// path to index 70 | #[clap(short, long, value_parser)] 71 | index: PathBuf, 72 | 73 | /// path to read 1 files 74 | #[clap(short = '1', long = "reads1", value_parser)] 75 | reads1: Vec, 76 | 77 | /// path to read 2 files 78 | #[clap(short = '2', long = "reads2", value_parser)] 79 | reads2: Vec, 80 | 81 | /// 
number of threads to use when running [default: min(16, num cores)]" 82 | #[clap(short, long, default_value_t = 16, value_parser)] 83 | threads: u32, 84 | 85 | /// use knee filtering mode 86 | #[clap(short, long, action)] 87 | knee: bool, 88 | 89 | /// use unfiltered permit list 90 | #[clap(short, long, action)] 91 | unfiltered_pl: bool, 92 | 93 | /// use a filtered, explicit permit list 94 | #[clap(short, long, value_parser)] 95 | explicit_pl: Option, 96 | 97 | /// use forced number of cells 98 | #[clap(short, long, value_parser)] 99 | forced_cells: Option, 100 | 101 | /// use expected number of cells 102 | #[clap(short, long, value_parser)] 103 | expect_cells: Option, 104 | 105 | /// resolution mode 106 | #[clap(short, long, value_parser = clap::builder::PossibleValuesParser::new(["cr-like", "cr-like-em", "parsimony", "parsimony-em", "parsimony-gene", "parsimony-gene-em"]))] 107 | resolution: String, 108 | 109 | /// chemistry 110 | #[clap(short, long, value_parser)] 111 | chemistry: String, 112 | 113 | /// transcript to gene map 114 | #[clap(short = 'm', long, value_parser)] 115 | t2g_map: PathBuf, 116 | 117 | /// output directory 118 | #[clap(short, long, value_parser)] 119 | output: PathBuf, 120 | }, 121 | /// set paths to the programs that simpleaf will use 122 | SetPaths { 123 | /// path to salmon to use 124 | #[clap(short, long, value_parser)] 125 | salmon: Option, 126 | /// path to alein-fry to use 127 | #[clap(short, long, value_parser)] 128 | alevin_fry: Option, 129 | /// path to pyroe to use 130 | #[clap(short, long, value_parser)] 131 | pyroe: Option, 132 | }, 133 | } 134 | 135 | /// simplifying alevin-fry workflows 136 | #[derive(Debug, Parser)] 137 | struct Cli { 138 | #[clap(subcommand)] 139 | command: Commands, 140 | } 141 | 142 | enum Chemistry { 143 | TenxV2, 144 | TenxV3, 145 | Other(String), 146 | } 147 | 148 | enum PermitListResult { 149 | DownloadSuccessful(PathBuf), 150 | AlreadyPresent(PathBuf), 151 | UnregisteredChemistry, 152 | } 153 | 154 
| fn get_permit_if_absent(chem: Chemistry) -> Result { 155 | let chem_file; 156 | let dl_url; 157 | match chem { 158 | Chemistry::TenxV2 => { 159 | chem_file = "10x_v2_permit.txt"; 160 | dl_url = "https://umd.box.com/shared/static/jbs2wszgbj7k4ic2hass9ts6nhqkwq1p"; 161 | } 162 | Chemistry::TenxV3 => { 163 | chem_file = "10x_v3_permit.txt"; 164 | dl_url = "https://umd.box.com/shared/static/eo0qlkfqf2v24ws6dfnxty6gqk1otf2h"; 165 | } 166 | _ => { 167 | return Ok(PermitListResult::UnregisteredChemistry); 168 | } 169 | } 170 | match env::var("ALEVIN_FRY_HOME") { 171 | Ok(p) => { 172 | let odir = PathBuf::from(p).join("plist"); 173 | if odir.join(chem_file).exists() { 174 | Ok(PermitListResult::AlreadyPresent(odir.join(chem_file))) 175 | } else { 176 | run_fun!(mkdir -p $odir)?; 177 | let mut dl_cmd = std::process::Command::new("wget"); 178 | dl_cmd 179 | .arg("-v") 180 | .arg("-O") 181 | .arg(odir.join(chem_file).to_string_lossy().to_string()) 182 | .arg("-L") 183 | .arg(dl_url); 184 | let r = dl_cmd.output()?; 185 | if !r.status.success() { 186 | return Err(anyhow!("failed to download permit list {:?}", r.status)); 187 | } 188 | Ok(PermitListResult::DownloadSuccessful(odir.join(chem_file))) 189 | } 190 | } 191 | Err(e) => { 192 | return Err(anyhow!( 193 | "could not resolve $ALEVIN_FRY_HOME environment variable : {}", 194 | e 195 | )); 196 | } 197 | } 198 | } 199 | 200 | fn main() -> anyhow::Result<()> { 201 | env_logger::Builder::from_env(Env::default().default_filter_or("info")).init(); 202 | const AF_HOME: &str = "ALEVIN_FRY_HOME"; 203 | let af_home_path = match env::var(AF_HOME) { 204 | Ok(p) => PathBuf::from(p), 205 | Err(e) => { 206 | bail!( 207 | "${} is unset {}, please set this environment variable to continue.", 208 | AF_HOME, 209 | e 210 | ); 211 | } 212 | }; 213 | 214 | let cli_args = Cli::parse(); 215 | 216 | match cli_args.command { 217 | Commands::SetPaths { 218 | salmon, 219 | alevin_fry, 220 | pyroe, 221 | } => { 222 | let rp = 
get_required_progs_from_paths(salmon, alevin_fry, pyroe)?; 223 | 224 | if rp.salmon.is_none() { 225 | bail!("Suitable salmon executable not found"); 226 | } 227 | if rp.alevin_fry.is_none() { 228 | bail!("Suitable alevin_fry executable not found"); 229 | } 230 | if rp.pyroe.is_none() { 231 | bail!("Suitable pyroe executable not found"); 232 | } 233 | 234 | let simpleaf_info_file = af_home_path.join("simpleaf_info.json"); 235 | let simpleaf_info = json!({ "prog_info": rp }); 236 | 237 | std::fs::write( 238 | &simpleaf_info_file, 239 | serde_json::to_string_pretty(&simpleaf_info).unwrap(), 240 | ) 241 | .with_context(|| format!("could not write {}", simpleaf_info_file.display()))?; 242 | } 243 | Commands::Index { 244 | fasta, 245 | gtf, 246 | rlen, 247 | output, 248 | spliced, 249 | unspliced, 250 | dedup, 251 | sparse, 252 | mut threads, 253 | } => { 254 | // Open the file in read-only mode with buffer. 255 | let af_info_p = af_home_path.join("simpleaf_info.json"); 256 | let simpleaf_info_file = std::fs::File::open(&af_info_p).with_context({ 257 | || 258 | format!("Could not open file {}; please run the set-paths command before using `index` or `quant`", af_info_p.display()) 259 | })?; 260 | 261 | let simpleaf_info_reader = BufReader::new(simpleaf_info_file); 262 | 263 | // Read the JSON contents of the file as an instance of `User`. 
264 | let v: serde_json::Value = serde_json::from_reader(simpleaf_info_reader)?; 265 | let rp: ReqProgs = serde_json::from_value(v["prog_info"].clone())?; 266 | 267 | run_fun!(mkdir -p $output)?; 268 | let ref_file = format!("splici_fl{}.fa", rlen - 5); 269 | 270 | let outref = output.join("ref"); 271 | run_fun!(mkdir -p $outref)?; 272 | 273 | let t2g_file = outref.join(format!("splici_fl{}_t2g_3col.tsv", rlen - 5)); 274 | let info_file = output.join("index_info.json"); 275 | let index_info = json!({ 276 | "command" : "index", 277 | "version_info" : rp, 278 | "t2g_file" : t2g_file, 279 | "args" : { 280 | "fasta" : fasta, 281 | "gtf" : gtf, 282 | "rlen" : rlen, 283 | "output" : output, 284 | "spliced" : spliced, 285 | "unspliced" : unspliced, 286 | "dedup" : dedup, 287 | "sparse" : sparse, 288 | "threads" : threads 289 | } 290 | }); 291 | 292 | std::fs::write( 293 | &info_file, 294 | serde_json::to_string_pretty(&index_info).unwrap(), 295 | ) 296 | .with_context(|| format!("could not write {}", info_file.display()))?; 297 | 298 | let mut cmd = 299 | std::process::Command::new(format!("{}", rp.pyroe.unwrap().exe_path.display())); 300 | // we will run the make-splici command 301 | cmd.arg("make-splici"); 302 | 303 | // if the user wants to dedup output sequences 304 | if dedup { 305 | cmd.arg(String::from("--dedup-seqs")); 306 | } 307 | 308 | // extra spliced sequence 309 | match spliced { 310 | Some(es) => { 311 | cmd.arg(String::from("--extra-spliced")); 312 | cmd.arg(format!("{}", es.display())); 313 | } 314 | None => {} 315 | } 316 | 317 | // extra unspliced sequence 318 | match unspliced { 319 | Some(eu) => { 320 | cmd.arg(String::from("--extra-unspliced")); 321 | cmd.arg(format!("{}", eu.display())); 322 | } 323 | None => {} 324 | } 325 | 326 | cmd.arg(fasta) 327 | .arg(gtf) 328 | .arg(format!("{}", rlen)) 329 | .arg(&outref); 330 | 331 | let pyroe_start = Instant::now(); 332 | let cres = cmd.output()?; 333 | let pyroe_duration = pyroe_start.elapsed(); 334 | 335 
| if !cres.status.success() { 336 | bail!("pyroe failed to return succesfully {:?}", cres.status); 337 | } 338 | 339 | let mut salmon_index_cmd = 340 | std::process::Command::new(format!("{}", rp.salmon.unwrap().exe_path.display())); 341 | let ref_seq = outref.join(ref_file); 342 | 343 | let output_index_dir = output.join("index"); 344 | salmon_index_cmd 345 | .arg("index") 346 | .arg("-i") 347 | .arg(&output_index_dir) 348 | .arg("-t") 349 | .arg(ref_seq); 350 | 351 | // if the user requested a sparse index. 352 | if sparse { 353 | salmon_index_cmd.arg("--sparse"); 354 | } 355 | 356 | // if the user requested more threads than can be used 357 | if let Ok(max_threads_usize) = std::thread::available_parallelism() { 358 | let max_threads = max_threads_usize.get() as u32; 359 | if threads > max_threads { 360 | warn!( 361 | "The maximum available parallelism is {}, but {} threads were requested.", 362 | max_threads, threads 363 | ); 364 | warn!("setting number of threads to {}", max_threads); 365 | threads = max_threads; 366 | } 367 | } 368 | 369 | salmon_index_cmd 370 | .arg("--threads") 371 | .arg(format!("{}", threads)); 372 | 373 | let index_start = Instant::now(); 374 | salmon_index_cmd 375 | .output() 376 | .expect("failed to run salmon index"); 377 | let index_duration = index_start.elapsed(); 378 | 379 | // copy over the t2g file to the index 380 | let index_t2g_path = output_index_dir.join("t2g_3col.tsv"); 381 | std::fs::copy(t2g_file, index_t2g_path)?; 382 | 383 | let index_log_file = output.join("simpleaf_index_log.json"); 384 | let index_log_info = json!({ 385 | "time_info" : { 386 | "pyroe_time" : pyroe_duration, 387 | "index_time" : index_duration 388 | } 389 | }); 390 | 391 | std::fs::write( 392 | &index_log_file, 393 | serde_json::to_string_pretty(&index_log_info).unwrap(), 394 | ) 395 | .with_context(|| format!("could not write {}", index_log_file.display()))?; 396 | } 397 | Commands::Quant { 398 | index, 399 | reads1, 400 | reads2, 401 | threads, 402 
| knee, 403 | unfiltered_pl, 404 | explicit_pl, 405 | forced_cells, 406 | expect_cells, 407 | resolution, 408 | t2g_map, 409 | chemistry, 410 | output, 411 | } => { 412 | // Open the file in read-only mode with buffer. 413 | let af_info_p = af_home_path.join("simpleaf_info.json"); 414 | let simpleaf_info_file = std::fs::File::open(&af_info_p).with_context({ 415 | || 416 | format!("Could not open file {}; please run the set-paths command before using `index` or `quant`", af_info_p.display()) 417 | })?; 418 | 419 | let simpleaf_info_reader = BufReader::new(&simpleaf_info_file); 420 | 421 | // Read the JSON contents of the file as an instance of `User`. 422 | info!("deserializing from {:?}", simpleaf_info_file); 423 | let v: serde_json::Value = serde_json::from_reader(simpleaf_info_reader)?; 424 | let rp: ReqProgs = serde_json::from_value(v["prog_info"].clone())?; 425 | 426 | info!("prog info = {:?}", rp); 427 | 428 | let mut filter_meth_opt = None; 429 | let chem = match chemistry.as_str() { 430 | "10xv2" => Chemistry::TenxV2, 431 | "10xv3" => Chemistry::TenxV3, 432 | s => Chemistry::Other(s.to_string()), 433 | }; 434 | 435 | // based on the filtering method 436 | if unfiltered_pl { 437 | // check the chemistry 438 | let pl_res = get_permit_if_absent(chem)?; 439 | let min_cells = 10usize; 440 | match pl_res { 441 | PermitListResult::DownloadSuccessful(p) 442 | | PermitListResult::AlreadyPresent(p) => { 443 | filter_meth_opt = Some(CellFilterMethod::UnfilteredExternalList( 444 | p.to_string_lossy().into_owned(), 445 | min_cells, 446 | )); 447 | } 448 | PermitListResult::UnregisteredChemistry => { 449 | bail!( 450 | "Cannot use unrecognized chemistry {} with unfiltered permit list.", 451 | chemistry.as_str() 452 | ); 453 | } 454 | } 455 | } else { 456 | match explicit_pl { 457 | Some(filtered_path) => { 458 | filter_meth_opt = Some(CellFilterMethod::ExplicitList( 459 | filtered_path.to_string_lossy().into_owned(), 460 | )); 461 | } 462 | None => {} 463 | }; 464 | match 
forced_cells { 465 | Some(num_forced) => { 466 | filter_meth_opt = Some(CellFilterMethod::ForceCells(num_forced)); 467 | } 468 | None => {} 469 | }; 470 | match expect_cells { 471 | Some(num_expected) => { 472 | filter_meth_opt = Some(CellFilterMethod::ExpectCells(num_expected)); 473 | } 474 | None => {} 475 | }; 476 | } 477 | // otherwise it must have been knee; 478 | if knee { 479 | filter_meth_opt = Some(CellFilterMethod::KneeFinding); 480 | } 481 | 482 | if filter_meth_opt.is_none() { 483 | bail!("It seems no valid filtering strategy was provided!"); 484 | } 485 | 486 | // here we must be safe to unwrap 487 | let filter_meth = filter_meth_opt.unwrap(); 488 | 489 | let mut salmon_quant_cmd = 490 | std::process::Command::new(format!("{}", rp.salmon.unwrap().exe_path.display())); 491 | 492 | // set the input index and library type 493 | let index_path = format!("{}", index.display()); 494 | salmon_quant_cmd 495 | .arg("alevin") 496 | .arg("--index") 497 | .arg(index_path) 498 | .arg("-l") 499 | .arg("A"); 500 | 501 | // location of the reads 502 | let r1_str = reads1 503 | .iter() 504 | .map(|x| format!("{}", x.display())) 505 | .collect::>() 506 | .join(","); 507 | let r2_str = reads2 508 | .iter() 509 | .map(|x| format!("{}", x.display())) 510 | .collect::>() 511 | .join(","); 512 | salmon_quant_cmd.arg("-1").arg(r1_str).arg("-2").arg(r2_str); 513 | 514 | // location of outptu directory, number of threads 515 | let map_output = output.join("af_map"); 516 | salmon_quant_cmd 517 | .arg("--threads") 518 | .arg(format!("{}", threads)) 519 | .arg("-o") 520 | .arg(&map_output); 521 | salmon_quant_cmd.arg("--sketch"); 522 | 523 | // setting the technology / chemistry 524 | match chemistry.as_str() { 525 | "10xv2" => { 526 | salmon_quant_cmd.arg("--chromium"); 527 | } 528 | "10xv3" => { 529 | salmon_quant_cmd.arg("--chromiumV3"); 530 | } 531 | s => { 532 | salmon_quant_cmd.arg(format!("--{}", s)); 533 | } 534 | }; 535 | 536 | info!("cmd : {:?}", salmon_quant_cmd); 537 | 
let map_start = Instant::now(); 538 | let map_proc_out = salmon_quant_cmd 539 | .output() 540 | .expect("failed to execute salmon alevin [mapping phase]"); 541 | let map_duration = map_start.elapsed(); 542 | 543 | if !map_proc_out.status.success() { 544 | bail!("mapping failed with exit status {:?}", map_proc_out.status); 545 | } 546 | 547 | let alevin_fry = rp.alevin_fry.unwrap().exe_path; 548 | // alevin-fry generate permit list 549 | let mut alevin_gpl_cmd = 550 | std::process::Command::new(format!("{}", &alevin_fry.display())); 551 | 552 | alevin_gpl_cmd.arg("generate-permit-list"); 553 | alevin_gpl_cmd.arg("-i").arg(&map_output); 554 | alevin_gpl_cmd.arg("-d").arg("fw"); 555 | 556 | // add the filter mode 557 | add_to_args(&filter_meth, &mut alevin_gpl_cmd); 558 | 559 | let gpl_output = output.join("af_quant"); 560 | alevin_gpl_cmd.arg("-o").arg(&gpl_output); 561 | 562 | info!("cmd : {:?}", alevin_gpl_cmd); 563 | 564 | let gpl_start = Instant::now(); 565 | let gpl_proc_out = alevin_gpl_cmd 566 | .output() 567 | .expect("could not execute [generate permit list]"); 568 | let gpl_duration = gpl_start.elapsed(); 569 | 570 | if !gpl_proc_out.status.success() { 571 | bail!( 572 | "generate-permit-list failed with exit status {:?}", 573 | gpl_proc_out.status 574 | ); 575 | } 576 | 577 | // 578 | // collate 579 | // 580 | let mut alevin_collate_cmd = 581 | std::process::Command::new(format!("{}", &alevin_fry.display())); 582 | 583 | alevin_collate_cmd.arg("collate"); 584 | alevin_collate_cmd.arg("-i").arg(&gpl_output); 585 | alevin_collate_cmd.arg("-r").arg(&map_output); 586 | alevin_collate_cmd.arg("-t").arg(format!("{}", threads)); 587 | 588 | info!("cmd : {:?}", alevin_collate_cmd); 589 | let collate_start = Instant::now(); 590 | let collate_proc_out = alevin_collate_cmd 591 | .output() 592 | .expect("could not execute [collate]"); 593 | let collate_duration = collate_start.elapsed(); 594 | 595 | if !collate_proc_out.status.success() { 596 | bail!( 597 | "collate 
/// Strategy used to decide which cell barcodes are kept when the
/// permit list is generated.
#[derive(Debug, Clone)]
pub enum CellFilterMethod {
    /// Cut off at this cell in the frequency-sorted list.
    ForceCells(usize),
    /// Use this cell as a hint in the frequency-sorted list.
    ExpectCells(usize),
    /// Correct all cells within an edit distance of 1 of these barcodes.
    /// The payload is forwarded verbatim as the value of `--valid-bc`.
    ExplicitList(String),
    /// Barcodes will be provided in the form of an *unfiltered* external
    /// permit list. The first field is forwarded as the value of
    /// `--unfiltered-pl`, the second as the value of `--min-reads`.
    UnfilteredExternalList(String, usize),
    /// Use the distance method to automatically find the knee in the curve.
    KneeFinding,
}

/// Appends to `cmd` the command-line arguments that select the filtering
/// strategy `fm`.
///
/// Every variant maps to its own flag. `ExpectCells` previously emitted
/// `--force` (the `ForceCells` flag), which turned an expected-cell hint
/// into a hard cut-off; it now emits `--expect-cells`.
pub fn add_to_args(fm: &CellFilterMethod, cmd: &mut std::process::Command) {
    match fm {
        CellFilterMethod::ForceCells(nc) => {
            // NOTE(review): `--force` looks like an abbreviation of the
            // documented `--force-cells`; confirm the targeted alevin-fry
            // versions accept the shortened spelling.
            cmd.arg("--force").arg(nc.to_string());
        }
        CellFilterMethod::ExpectCells(nc) => {
            cmd.arg("--expect-cells").arg(nc.to_string());
        }
        CellFilterMethod::ExplicitList(l) => {
            cmd.arg("--valid-bc").arg(l);
        }
        CellFilterMethod::UnfilteredExternalList(l, m) => {
            cmd.arg("--unfiltered-pl")
                .arg(l)
                .arg("--min-reads")
                .arg(m.to_string());
        }
        CellFilterMethod::KneeFinding => {
            // NOTE(review): likewise, `--knee` is presumably short for
            // `--knee-distance`; verify against the alevin-fry CLI.
            cmd.arg("--knee");
        }
    }
}
27 | #[derive(Debug, Serialize, Deserialize)] 28 | pub struct ReqProgs { 29 | pub salmon: Option, 30 | pub alevin_fry: Option, 31 | pub pyroe: Option, 32 | } 33 | 34 | pub fn check_version_constraints>( 35 | req_string: S1, 36 | prog_output: std::result::Result, 37 | ) -> Result { 38 | match prog_output { 39 | Ok(vs) => { 40 | let x = vs.split_whitespace(); 41 | if let Some(version) = x.last() { 42 | let parsed_version = Version::parse(version).unwrap(); 43 | let req = VersionReq::parse(req_string.as_ref()).unwrap(); 44 | if req.matches(&parsed_version) { 45 | return Ok(parsed_version); 46 | } else { 47 | return Err(anyhow!( 48 | "parsed version {:?} does not satisfy constraints {:?}", 49 | version, 50 | req 51 | )); 52 | } 53 | } 54 | } 55 | Err(e) => { 56 | eprintln!("Error running salmon {}", e); 57 | return Err(anyhow!("could not parse program output")); 58 | } 59 | } 60 | Err(anyhow!("invalid version string")) 61 | } 62 | 63 | pub fn get_which_executable(prog_name: &str) -> Result { 64 | match which(prog_name) { 65 | Ok(p) => { 66 | println!("found `{}` in the PATH at {}", prog_name, p.display()); 67 | Ok(p) 68 | } 69 | Err(e) => { 70 | return Err(anyhow!( 71 | "could not find `{}` in your path: {}", 72 | prog_name, 73 | e 74 | )); 75 | } 76 | } 77 | } 78 | 79 | #[allow(dead_code)] 80 | pub fn search_for_executable(env_key: &str, prog_name: &str) -> Result { 81 | match env::var(env_key) { 82 | Ok(p) => Ok(PathBuf::from(p)), 83 | Err(e) => { 84 | eprintln!("${} is unset {}, trying default path.", env_key, e); 85 | eprintln!( 86 | "If a satisfactory version is not found, consider setting the ${} variable.", 87 | env_key 88 | ); 89 | get_which_executable(prog_name) 90 | } 91 | } 92 | } 93 | 94 | pub fn get_required_progs_from_paths( 95 | salmon_exe: Option, 96 | alevin_fry_exe: Option, 97 | pyroe_exe: Option, 98 | ) -> Result { 99 | let mut rp = ReqProgs { 100 | salmon: None, 101 | alevin_fry: None, 102 | pyroe: None, 103 | }; 104 | 105 | // use the given path if 
we have it 106 | // otherwise, check `which` 107 | let salmon = match salmon_exe { 108 | Some(p) => p, 109 | None => match get_which_executable("salmon") { 110 | Ok(p) => p, 111 | Err(e) => { 112 | return Err(e); 113 | } 114 | }, 115 | }; 116 | let alevin_fry = match alevin_fry_exe { 117 | Some(p) => p, 118 | None => match get_which_executable("alevin-fry") { 119 | Ok(p) => p, 120 | Err(e) => { 121 | return Err(e); 122 | } 123 | }, 124 | }; 125 | let pyroe = match pyroe_exe { 126 | Some(p) => p, 127 | None => match get_which_executable("pyroe") { 128 | Ok(p) => p, 129 | Err(e) => { 130 | return Err(e); 131 | } 132 | }, 133 | }; 134 | 135 | let st = salmon.display().to_string(); 136 | let sr = run_fun!($st --version); 137 | let v = check_version_constraints(">=1.5.1, <2.0.0", sr)?; 138 | rp.salmon = Some(ProgInfo { 139 | exe_path: salmon, 140 | version: format!("{}", v), 141 | }); 142 | 143 | let st = alevin_fry.display().to_string(); 144 | let sr = run_fun!($st --version); 145 | let v = check_version_constraints(">=0.4.1, <1.0.0", sr)?; 146 | rp.alevin_fry = Some(ProgInfo { 147 | exe_path: alevin_fry, 148 | version: format!("{}", v), 149 | }); 150 | 151 | let st = pyroe.display().to_string(); 152 | let sr = run_fun!($st --version); 153 | let v = check_version_constraints(">=0.6.2, <1.0.0", sr)?; 154 | rp.pyroe = Some(ProgInfo { 155 | exe_path: pyroe, 156 | version: format!("{}", v), 157 | }); 158 | 159 | Ok(rp) 160 | } 161 | 162 | #[allow(dead_code)] 163 | pub fn get_required_progs() -> Result { 164 | // First look for any environment variables 165 | // then check the path. 
166 | let salmon_exe = Some(search_for_executable("SALMON", "salmon")?); 167 | let alevin_fry_exe = Some(search_for_executable("ALEVIN_FRY", "alevin-fry")?); 168 | let pyroe_exe = Some(search_for_executable("PYROE", "pyroe")?); 169 | 170 | get_required_progs_from_paths(salmon_exe, alevin_fry_exe, pyroe_exe) 171 | } 172 | -------------------------------------------------------------------------------- /simpleaf_conda_env.yml: -------------------------------------------------------------------------------- 1 | name: simpleaf 2 | 3 | channels: 4 | - conda-forge 5 | - bioconda 6 | - defaults 7 | 8 | dependencies: 9 | # - python>=3.9 10 | - bioconda::salmon>=1.8.0 11 | - bioconda::alevin-fry>=0.6.0 12 | - bioconda::pyroe>=0.6.2 13 | - bioconda::bedtools>=2.30.0 14 | - bioconda::gffread>=0.12.7 15 | - conda-forge::r-essentials>=4.1 16 | - conda-forge::r-devtools 17 | - conda-forge::r-argparser 18 | - conda-forge::r-biocmanager 19 | - bioconductor-genomicfeatures 20 | - bioconductor-eisaR 21 | - bioconductor-bsgenome 22 | - bioconductor-fishpond 23 | --------------------------------------------------------------------------------