├── .github
    └── workflows
    │   └── test_simpleaf.yml
├── .gitignore
├── LICENSE
├── R
    ├── README.md
    ├── build_splici_ref.R
    ├── cellRangerLikeEmptyDrops.R
    └── load_fry.R
├── README.md
├── bash
    ├── geneid_to_name.sh
    ├── get_10x_permit_lists.sh
    ├── simpleaf
    └── test_simpleaf.sh
├── docker
    ├── Dockerfile
    ├── Singularity.def
    └── build_docker.sh
├── python
    └── load_fry.py
├── simpleaf
    ├── Cargo.toml
    └── src
    │   ├── main.rs
    │   └── utils
    │       ├── af_utils.rs
    │       └── prog_utils.rs
└── simpleaf_conda_env.yml


/.github/workflows/test_simpleaf.yml:
--------------------------------------------------------------------------------
 1 | name: simpleaf-check
 2 | 
 3 | on:
 4 |   push:
 5 |   pull_request:
 6 | 
 7 | jobs:
 8 |   test-simpleaf1:
 9 |     runs-on: ubuntu-latest
10 |     steps:
11 |     - uses: actions/checkout@v2
12 |     - name: Checkout roe repo
13 |       uses: actions/checkout@v3
14 |       with:
15 |         repository: COMBINE-lab/roe
16 |         path: roe
17 |     - uses: conda-incubator/setup-miniconda@v2
18 |       with:
19 |         python-version: 3.7
20 |         mamba-version: "*"
21 |         channels: conda-forge,bioconda
22 |         channel-priority: true
23 |         activate-environment: anaconda-client-env
24 |         environment-file: simpleaf_conda_env.yml
25 |     - name: install roe
26 |       shell: bash -l {0}
27 |       run: |
28 |         Rscript -e "devtools::install(pkg=\"roe\", dependencies=FALSE)"
29 |         # Rscript -e "devtools::install()"
30 |     - name: Test simpleaf
31 |       shell: bash -l {0}
32 |       run: |
33 |         cd bash
34 |         chmod +x test_simpleaf.sh
35 |         ./test_simpleaf.sh
36 |     - name: Check job status
37 |       run: echo "This job's status is ${{ job.status }}."
38 | 
39 | # jobs:
40 | #   test-simpleaf:
41 | #     runs-on: ubuntu-latest
42 | #     defaults:
43 | #       run:
44 | #         shell: bash -l {0}
45 | #     steps:
46 | #     - name: Checkout
47 | #       uses: actions/checkout@v3
48 | 
49 | #     - name: Checkout roe repo
50 | #       uses: actions/checkout@v3
51 | #       with:
52 | #         repository: COMBINE-lab/roe
53 | #         path: roe
54 | #     - name: traverse roe folder
55 | #       run: |
56 | #         cd roe
57 | #         ls -lh
58 | #     - name: Add conda to system path
59 | #       run: |
60 | #         # $CONDA is an environment variable pointing to the root of the miniconda directory
61 | #         echo $CONDA/bin >> $GITHUB_PATH
62 | #         conda env update --file simpleaf_conda_env.yml --name base
63 | #         conda activate base
64 | #         # Rscript -e "devtools::install(pkg=\"./roe\", dependencies=FALSE)"
65 | #         Rscript -e "devtools::install()"
66 | #     - name: Install dependencies
67 | #       run: |
68 | #         conda env update --file simpleaf_conda_env.yml --name base
69 | #     - name: install roe
70 | #       run: |
71 | #         Rscript -e "devtools::install(pkg=\"./roe\", dependencies=FALSE)"
72 | #     - name: Test simpleaf
73 | #       run: |
74 | #         cd bash
75 | #         chmod +x test_simpleaf.sh
76 | #         ./test_simpleaf.sh
77 | #     - name: Check job status
78 | #       run: echo "This job's status is ${{ job.status }}."
79 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | build/
 12 | develop-eggs/
 13 | dist/
 14 | downloads/
 15 | eggs/
 16 | .eggs/
 17 | lib/
 18 | lib64/
 19 | parts/
 20 | sdist/
 21 | var/
 22 | wheels/
 23 | pip-wheel-metadata/
 24 | share/python-wheels/
 25 | *.egg-info/
 26 | .installed.cfg
 27 | *.egg
 28 | MANIFEST
 29 | 
 30 | # PyInstaller
 31 | #  Usually these files are written by a python script from a template
 32 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 33 | *.manifest
 34 | *.spec
 35 | 
 36 | # Installer logs
 37 | pip-log.txt
 38 | pip-delete-this-directory.txt
 39 | 
 40 | # Unit test / coverage reports
 41 | htmlcov/
 42 | .tox/
 43 | .nox/
 44 | .coverage
 45 | .coverage.*
 46 | .cache
 47 | nosetests.xml
 48 | coverage.xml
 49 | *.cover
 50 | *.py,cover
 51 | .hypothesis/
 52 | .pytest_cache/
 53 | 
 54 | # Translations
 55 | *.mo
 56 | *.pot
 57 | 
 58 | # Django stuff:
 59 | *.log
 60 | local_settings.py
 61 | db.sqlite3
 62 | db.sqlite3-journal
 63 | 
 64 | # Flask stuff:
 65 | instance/
 66 | .webassets-cache
 67 | 
 68 | # Scrapy stuff:
 69 | .scrapy
 70 | 
 71 | # Sphinx documentation
 72 | docs/_build/
 73 | 
 74 | # PyBuilder
 75 | target/
 76 | 
 77 | # Jupyter Notebook
 78 | .ipynb_checkpoints
 79 | 
 80 | # IPython
 81 | profile_default/
 82 | ipython_config.py
 83 | 
 84 | # pyenv
 85 | .python-version
 86 | 
 87 | # pipenv
 88 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 89 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 90 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 91 | #   install all needed dependencies.
 92 | #Pipfile.lock
 93 | 
 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
 95 | __pypackages__/
 96 | 
 97 | # Celery stuff
 98 | celerybeat-schedule
 99 | celerybeat.pid
100 | 
101 | # SageMath parsed files
102 | *.sage.py
103 | 
104 | # Environments
105 | .env
106 | .venv
107 | env/
108 | venv/
109 | ENV/
110 | env.bak/
111 | venv.bak/
112 | 
113 | # Spyder project settings
114 | .spyderproject
115 | .spyproject
116 | 
117 | # Rope project settings
118 | .ropeproject
119 | 
120 | # mkdocs documentation
121 | /site
122 | 
123 | # mypy
124 | .mypy_cache/
125 | .dmypy.json
126 | dmypy.json
127 | 
128 | # Pyre type checker
129 | .pyre/
130 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | BSD 3-Clause License
 2 | 
 3 | Copyright (c) 2021, COMBINE lab
 4 | All rights reserved.
 5 | 
 6 | Redistribution and use in source and binary forms, with or without
 7 | modification, are permitted provided that the following conditions are met:
 8 | 
 9 | 1. Redistributions of source code must retain the above copyright notice, this
10 |    list of conditions and the following disclaimer.
11 | 
12 | 2. Redistributions in binary form must reproduce the above copyright notice,
13 |    this list of conditions and the following disclaimer in the documentation
14 |    and/or other materials provided with the distribution.
15 | 
16 | 3. Neither the name of the copyright holder nor the names of its
17 |    contributors may be used to endorse or promote products derived from
18 |    this software without specific prior written permission.
19 | 
20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 | 


--------------------------------------------------------------------------------
/R/README.md:
--------------------------------------------------------------------------------
1 | ## Useful R functions for preparing and processing data for alevin-fry.
2 | 
3 | * `build_splici_ref.R` — A script to build a spliced + intron (splici) ref for indexing and quantification with `alevin-fry`. This functionality is now included in the [`roe`](https://github.com/COMBINE-lab/roe) package. Please follow [these instructions](https://github.com/COMBINE-lab/roe#installlation) to install `roe`.
4 | 
5 | * `cellRangerLikeEmptyDrops.R` — An implementation of the hybrid UMI count filtering and [`emptyDrops`](https://github.com/MarioniLab/DropletUtils) used by CellRanger (and subsequently by [STARsolo](https://github.com/alexdobin/STAR)). This R implementation is a translation of the implementation in STARsolo, which itself was reverse-engineered from CellRanger. *This script should only be used with older R versions ($<$ R v4.1.0 and BioC 3.14).* If you have a newer version of R, please use the `emptyDropsCellRanger()` function in the [`DropletUtils`](https://bioconductor.org/packages/release/bioc/html/DropletUtils.html) Bioconductor package.
6 | * `load_fry.R` — Contains a function to load `alevin-fry` output (including from USA mode quantification) into a [`SingleCellExperiment`](https://bioconductor.org/packages/release/bioc/html/SingleCellExperiment.html) object. *This script should only be used with older R versions ($<$ R v4.1.0 and BioC 3.14).* If you have a newer version of R, please use the `loadFry()` function in the [`fishpond`](https://bioconductor.org/packages/release/bioc/html/fishpond.html) Bioconductor package.
7 | 


--------------------------------------------------------------------------------
/R/build_splici_ref.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | 
 3 | # Command-line front end for make_splici_txome() (presumably exported by the
 4 | # `roe` package that is loaded below).
 5 | #
 6 | # usage :
 7 | # $ ./build_splici_ref.R <path_to_genome_fasta> <path_to_gtf> <target_read_length> <output_dir>
 8 | 
 9 | # Bioconductor dependencies: if any one of them is missing, install all three,
10 | # bootstrapping BiocManager first when it is not available either.
11 | bioc_missing <- (!requireNamespace("eisaR", quietly = TRUE)) ||
12 |     (!requireNamespace("BSgenome", quietly = TRUE)) ||
13 |     (!requireNamespace("fishpond", quietly = TRUE))
14 | 
15 | if (bioc_missing) {
16 |     if (!requireNamespace("BiocManager", quietly = TRUE)) {
17 |         install.packages("BiocManager")
18 |     }
19 |     BiocManager::install(c("eisaR","BSgenome","fishpond"))
20 | }
21 | 
22 | # CRAN dependencies: the argument parser, and devtools (needed for the GitHub install)
23 | if (!requireNamespace("argparser", quietly = TRUE)) {
24 |     install.packages("argparser")
25 | }
26 | if (!requireNamespace("devtools", quietly = TRUE)) {
27 |     install.packages("devtools")
28 | }
29 | 
30 | # roe itself is installed from GitHub
31 | if (!requireNamespace("roe", quietly = TRUE)) {
32 |     devtools::install_github("COMBINE-lab/roe")
33 | }
34 | 
35 | # attach the two packages used directly below, without their startup banners
36 | suppressPackageStartupMessages({
37 |     library(argparser)
38 |     library(roe)
39 | })
40 | 
41 | # ---- command-line interface ----
42 | parser <- arg_parser("Build a splici reference from a genome and GTF file.")
43 | 
44 | # positional (required) arguments
45 | parser <- add_argument(parser, "genome", help="The path to a genome FASTA file.")
46 | parser <- add_argument(parser, "gtf", help="The path to a gtf file.")
47 | parser <- add_argument(parser, "read-length",
48 |     help="The read length of the single-cell experiment being processed (determines flank size).",
49 |     type="numeric")
50 | parser <- add_argument(parser, "output-dir",
51 |     help="The output directory where splici reference files will be written.")
52 | 
53 | # optional arguments
54 | parser <- add_argument(parser, "--flank-trim-length",
55 |     help="Determines the amount subtracted from the read length to get the flank length.",
56 |     type="numeric",
57 |     default=5)
58 | parser <- add_argument(parser, "--filename-prefix",
59 |     help="The file name prefix of the generated output files.",
60 |     default="splici")
61 | parser <- add_argument(parser, "--extra-spliced",
62 |     help="The path to an extra spliced sequence fasta file.")
63 | parser <- add_argument(parser, "--extra-unspliced",
64 |     help="The path to an extra unspliced sequence fasta file.")
65 | parser <- add_argument(parser, "--dedup-seqs",
66 |     help="A flag indicates whether identical sequences will be deduplicated.",
67 |     flag=TRUE)
68 | parser <- add_argument(parser, "--no-flanking-merge",
69 |     help="A flag indicates whether flank lengths will be considered when merging introns.",
70 |     flag=TRUE)
71 | 
72 | opts <- parse_args(parser)
73 | 
74 | # An optional path that is NA is turned into NULL before the call
75 | # (assigning NULL drops the entry, so `opts$<name>` then yields NULL).
76 | for (optional_path in c("extra_spliced", "extra_unspliced")) {
77 |     if (is.na(opts[[optional_path]])) {
78 |         opts[[optional_path]] <- NULL
79 |     }
80 | }
81 | 
82 | make_splici_txome(
83 |     gtf_path = opts$gtf,
84 |     genome_path = opts$genome,
85 |     read_length = opts$read_length,
86 |     output_dir = opts$output_dir,
87 |     flank_trim_length = opts$flank_trim_length,
88 |     filename_prefix = opts$filename_prefix,
89 |     extra_spliced = opts$extra_spliced,
90 |     extra_unspliced = opts$extra_unspliced,
91 |     dedup_seqs = opts$dedup_seqs,
92 |     no_flanking_merge = opts$no_flanking_merge
93 | )
94 | 


--------------------------------------------------------------------------------
/R/cellRangerLikeEmptyDrops.R:
--------------------------------------------------------------------------------
  1 | # To use this function, source it to your environment by `source("cellRangerLikeEmptyDrops.R")`
  2 | #' An approximate implementation of the `--soloCellFilter  EmptyDrops_CR` filtering approach to identify empty droplets.
  3 | #'
  4 | #' An approximate implementation of the `--soloCellFilter  EmptyDrops_CR` filtering approach, 
  5 | #' which, itself, was reverse-engineered from the behavior of CellRanger 3+.
  6 | #' 
  7 | #' @param m A numeric matrix-like object containing counts, where columns represent barcoded droplets and rows represent features.
  8 | #' The matrix should only contain barcodes for an individual sample, prior to any filtering for cells.
  9 | #' 
 10 | #' @param umiMin A numeric scalar specifying the  minimum UMI count above which a sample will be included in ambient profiles, 
 11 | #' as specified in the call to CellRanger.
 12 | #' 
 13 | #' @param umiMinFracMedian A numeric scalar between 0 and 1 specifying that only the samples whose UMI count are above \code{umiMinFracMedian} 
 14 | #' fraction of the median UMI count of the top \code{nExpectedCells} samples will be included in the ambient profile,
 15 | #' as specified in the call to CellRanger.
 16 | #' 
 17 | #' @param candMaxN An integer specifying the maximum number of ambient samples that are possible to be regarded as real cells, 
 18 | #' as specified in the call to CellRanger.
 19 | #' 
 20 | #' @param indMax An integer specifying the highest UMI count ranking of the ambient pool, cells with UMI count ranking above
 21 | #' this number will not be included in the ambient pool, as specified in the call to CellRanger.
 22 | #' 
 23 | #' @param indMin An integer specifying the lowest UMI count ranking of the ambient pool, cells with UMI count ranking below
 24 | #' this number will not be included in the ambient pool, as specified in the call to CellRanger.
 25 | #' 
 26 | #' @param fdr_thresh A numeric scalar specifying the FDR threshold to filter samples. Samples whose FDR returned by emptyDrops
 27 | #' is above this threshold will not be regarded as real cells, as specified in the call to CellRanger.
 28 | #' 
 29 | #' @param maxPercentile A numeric scalar specifying the percentile used in simple filtering, samples selected by simple filtering 
 30 | #' will be regarded as real cells regardless of the \code{emptyDrops} result, as specified in the call to CellRanger.
 31 | #' 
 32 | #' @param nExpectedCells A numeric scalar specifying the expected number of cells in this sample, as specified in the call to CellRanger.
 33 | #' 
 34 | #' @param maxMinRatio A numeric scalar specifying the maximum ratio of maximum UMI count and minimum UMI count used in simple filtering, 
 35 | #' maximum UMI count used in simple filtering is determined first by \code{nExpectedCells*(1-maxPercentile)}, minimum UMI count used in
 36 | #'  simple filtering is then determined by this ratio, as specified in the call to CellRanger.
 37 | #'  
 38 | #' @param seed Integer specifying the seed that will be used to run \code{emptyDrops}
 39 | #' @param ... For the generic, further arguments to pass to \code{emptyDrops}.
 40 | #'
 41 | #' @details
 42 | #' This function is an approximate implementation of the  `--soloCellFilter  EmptyDrops_CR` filtering approach of STARsolo 
 43 | #' (\url{https://www.biorxiv.org/content/10.1101/2021.05.05.442755v1}), which, itself, was reverse engineered from the behavior of  CellRanger 3+. 
 44 | #' The original C++ code on which this function is based can be found at 
 45 | #' (\url{https://github.com/alexdobin/STAR/blob/master/source/SoloFeature_cellFiltering.cpp}) 
 46 | #' All parameters default to the values used in STARsolo and CellRanger.
 47 | #' In most cases, users only need to specify the raw, unfiltered count matrix, \code{m}.
 48 | #' See \code{?\link{emptyDrops}} for an alternative approach for cell calling.
 49 | #' 
 50 | #' @return
 51 | #' A DataFrame like \code{\link{emptyDrops}}, with an additional binary \code{is.cell} field demonstrating whether
 52 | #' samples are estimated as real cells.
 53 | #' 
 54 | #' @author
 55 | #' Dongze He, Rob Patro
 56 | #' 
 57 | #' @examples
 58 | #' # Mocking up some data:
 59 | #' set.seed(0)
 60 | #' my.counts <- DropletUtils:::simCounts()
 61 | #' 
 62 | #' # Identify likely cell-containing droplets.
 63 | #' e.out <- cellRangerLikeEmptyDrops(my.counts)
 64 | #' e.out
 65 | #' 
 66 | #' # Get matrix of estimated cells.
 67 | #' cell.counts <- my.counts[, e.out$is.cell]
 68 | #' 
 69 | #' @references
 70 | #' Kaminow et al. (2021).
 71 | #' STARsolo: accurate, fast and versatile mapping/quantification of single-cell and single-nucleus RNA-seq data
 72 | #' \url{https://www.biorxiv.org/content/10.1101/2021.05.05.442755v1}
 73 | #' 
 74 | #' @seealso
 75 | #' \code{\link{emptyDrops}}, for another method for calling cells.
 76 | #'
 77 | #' @name cellRangerLikeEmptyDrops
 78 | NULL
 79 | 
 80 | # Authors: Dongze He, Rob Patro
 81 | # Center of Bioinformatics and Computational Biology, University of Maryland, College Park, Maryland, 20740
 82 | 
 83 | #' @importFrom DropletUtils emptyDrops
 84 | .cellRangerLikeEmptyDrops  <- function(m, 
 85 |                                       umiMin=500,
 86 |                                       umiMinFracMedian=0.01, 
 87 |                                       candMaxN=20000, 
 88 |                                       indMax=90000, 
 89 |                                       indMin=45000, 
 90 |                                       fdr_thresh=0.01, 
 91 |                                       maxPercentile=0.99, 
 92 |                                       nExpectedCells=3000, 
 93 |                                       maxMinRatio=10,
 94 |                                       seed=2718,
 95 |                                       ...
 96 | ) {
 97 |     # Approximate re-implementation of the `--soloCellFilter  EmptyDrops_CR`
 98 |     # filter of STARsolo (https://www.biorxiv.org/content/10.1101/2021.05.05.442755v1),
 99 |     # which was itself reverse engineered from CellRanger 3+. The C++ original is at
100 |     # https://github.com/alexdobin/STAR/blob/master/source/SoloFeature_cellFiltering.cpp
101 |     # Returns the emptyDrops() result plus a logical `is.cell` column; arguments in
102 |     # `...` are forwarded to DropletUtils::emptyDrops().
103 | 
104 |     ###################################################################################################################    
105 |     # per-barcode UMI totals, paired with their column index and sorted in decreasing order
106 |     csums <- colSums2(m)
107 |     indCount <- as.data.frame(cbind(1:length(csums), csums))
108 |     colnames(indCount) <- c("index", "count")
109 |     indCount <- indCount[order(indCount$count,decreasing = TRUE),]
110 | 
111 |     # Simple Filtering: barcodes with at least nUMImax/maxMinRatio UMIs pass
112 |     maxind <- round(nExpectedCells * (1 - maxPercentile))
113 |     nUMImax <- indCount$count[min(ncol(m), maxind)]
114 |     nUMImin <- round(nUMImax/maxMinRatio)
115 |     ncellsSimple <- sum(indCount$count>=nUMImin)
116 | 
117 |     # set lower bound (a fractional index is truncated by R's subsetting)
118 |     minUMI    <- max(umiMin, round(umiMinFracMedian * indCount$count[ncellsSimple/2]))
119 | 
120 |     ## we at most assign candMaxN samples in the ambient pool as real cells
121 |     minUMI <- max(minUMI, indCount$count[min(ncellsSimple+candMaxN,nrow(indCount))])
122 | 
123 |     # emptyDrops
124 |     ## ignore: the lower bound of UMI count, samples with UMI count less than ignore
125 |     ## will not be considered as ambient cells.
126 |     ignore_index <- min(ncol(m), indMax)
127 |     ignore <- indCount$count[ignore_index]
128 | 
129 |     ## by.rank: cells with UMI count ranking lower than by.rank will be considered as 
130 |     ## ambient cells
131 |     by.rank <- indMin
132 | 
133 |     ## retain: samples with UMI count higher than retain will be regarded as cells 
134 |     retain <- indCount$count[ncellsSimple]
135 | 
136 |     ## the seed is fixed first; caller-supplied `...` is passed through to emptyDrops
137 |     set.seed(seed)
138 |     e.out <- DropletUtils::emptyDrops(m, by.rank=by.rank, ignore=ignore, retain=retain, alpha=Inf, ...)
139 |     e.out$is.cell <- e.out$FDR < fdr_thresh
140 |     e.out$is.cell[is.na(e.out$is.cell)] <- FALSE
141 | 
142 |     # further filter cells by minUMI (indexing by the original column position)
143 |     e.out$is.cell[indCount[indCount$count<minUMI, "index"]] <- FALSE
144 |     e.out
145 | }
146 | 
147 | # Generic entry point; the method is selected from the class of `m`.
148 | #' @export
149 | #' @rdname cellRangerLikeEmptyDrops
150 | setGeneric(
151 |   "cellRangerLikeEmptyDrops",
152 |   def = function(m, ...) standardGeneric("cellRangerLikeEmptyDrops")
153 | )
154 | 
155 | # Fallback method: any object is handed to the worker function as-is.
156 | #' @export
157 | #' @rdname cellRangerLikeEmptyDrops
158 | setMethod(
159 |   "cellRangerLikeEmptyDrops",
160 |   signature = "ANY",
161 |   definition = .cellRangerLikeEmptyDrops
162 | )
163 | 
164 | # SummarizedExperiment method: run the worker on the assay named by `assay.type`.
165 | #' @export
166 | #' @rdname cellRangerLikeEmptyDrops
167 | #' @importFrom SummarizedExperiment assay
168 | setMethod(
169 |   "cellRangerLikeEmptyDrops",
170 |   signature = "SummarizedExperiment",
171 |   definition = function(m, ..., assay.type="counts") {
172 |     count_matrix <- assay(m, assay.type)
173 |     .cellRangerLikeEmptyDrops(count_matrix, ...)
174 |   }
175 | )
176 | 


--------------------------------------------------------------------------------
/R/load_fry.R:
--------------------------------------------------------------------------------
 1 | #' Read alevin-fry quantifications into a SingleCellExperiment object
 2 | #'
 3 | #' @param frydir Path to the alevin-fry output directory. It is expected to hold
 4 | #'   `quant.json` (or, failing that, `meta_info.json`) and an `alevin`
 5 | #'   sub-directory with `quants_mat.mtx`, `quants_mat_cols.txt` and
 6 | #'   `quants_mat_rows.txt`.
 7 | #' @param which_counts Character vector drawn from 'U', 'S' and 'A'. In USA mode
 8 | #'   the selected count layers are summed; it is not used otherwise.
 9 | #' @param verbose If TRUE, the processing mode is reported via message().
10 | #' @return A SingleCellExperiment with one `counts` assay whose rows are the
11 | #'   entries of `quants_mat_cols.txt` and whose columns are the barcodes.
12 | load_fry <- function(frydir, which_counts = c('S', 'A'), verbose = FALSE) {
13 |   suppressPackageStartupMessages({
14 |     library(rjson)
15 |     library(Matrix)
16 |     library(SingleCellExperiment)
17 |   })
18 |   
19 |   # read in metadata; fall back to meta_info.json when quant.json is absent
20 |   qfile <- file.path(frydir, "quant.json")
21 |   if (!file.exists(qfile)) {
22 |     qfile <- file.path(frydir, "meta_info.json")
23 |   }
24 | 
25 |   meta_info <- fromJSON(file = qfile)
26 |   ng <- meta_info$num_genes
27 |   usa_mode <- meta_info$usa_mode
28 |   
29 |   if (usa_mode) {
30 |     if (length(which_counts) == 0) {
31 |       stop("Please at least provide one status in 'U' 'S' 'A' ")
32 |     }
33 |     # reject unknown statuses up front: rd[[wc]] below would be NULL for them
34 |     unknown <- setdiff(which_counts, c("U", "S", "A"))
35 |     if (length(unknown) > 0) {
36 |       stop("Unknown status in 'which_counts': ", paste(unknown, collapse = ", "),
37 |            "; valid values are 'U' 'S' 'A'")
38 |     }
39 |     if (verbose) {
40 |       message("processing input in USA mode, will return ", paste(which_counts, collapse = '+'))
41 |     }
42 |   } else if (verbose) {
43 |     message("processing input in standard mode, will return spliced count")
44 |   }
45 | 
46 |   # read in count matrix
47 |   af_raw <- readMM(file = file.path(frydir, "alevin", "quants_mat.mtx"))
48 |   # if usa mode, each gene gets 3 rows, so the actual number of genes is ng/3
49 |   if (usa_mode) {
50 |     if (ng %% 3 != 0) {
51 |       stop("The number of quantified targets is not a multiple of 3")
52 |     }
53 |     ng <- as.integer(ng/3)
54 |   }
55 |   
56 |   # read in gene name file (first ng entries only) and cell barcode file
57 |   afg <- read.csv(file.path(frydir, "alevin", "quants_mat_cols.txt"), 
58 |                   strip.white = TRUE, header = FALSE, nrows = ng, 
59 |                   col.names = c("gene_ids"), row.names = 1)
60 |   afc <- read.csv(file.path(frydir, "alevin", "quants_mat_rows.txt"), 
61 |                   strip.white = TRUE, header = FALSE,
62 |                   col.names = c("barcodes"), row.names = 1)
63 | 
64 |   # if in usa_mode, sum up counts in different status according to which_counts;
65 |   # the columns of af_raw are laid out as three consecutive blocks of ng: S, U, A
66 |   if (usa_mode) {
67 |     rd <- list("S" = seq(1, ng), "U" =  seq(ng + 1, 2 * ng),
68 |                "A" =  seq(2 * ng + 1, 3 * ng))
69 |     o <- af_raw[, rd[[which_counts[1]]], drop = FALSE]
70 |     for (wc in which_counts[-1]) {
71 |       o <- o + af_raw[, rd[[wc]], drop = FALSE]
72 |     }
73 |   } else {
74 |     o <- af_raw
75 |   }
76 |   
77 |   # create SingleCellExperiment object (transpose so that barcodes become columns)
78 |   sce <- SingleCellExperiment(list(counts = t(o)),
79 |                               colData = afc,
80 |                               rowData = afg
81 |   )
82 |   sce
83 | }
84 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Usefulaf: An all-in-one Docker/Singularity image for single-cell processing with alevin-fry
 2 | 
 3 | [`Usefulaf`](https://hub.docker.com/r/combinelab/usefulaf/tags) is an all-in-one Docker/Singularity image for single-cell processing with [Alevin-fry](https://github.com/COMBINE-lab/alevin-fry) ([paper](https://www.nature.com/articles/s41592-022-01408-3)). It includes all the tools you need to turn your FASTQ files into a count matrix and then load it into your favorite analysis environment. Specifically, this image includes:
 4 | 
 5 | - [`simpleaf`](https://github.com/COMBINE-lab/simpleaf): A simplified interface to indexing and quantifying with `alevin-fry`.
 6 | - [`pyroe`](https://github.com/COMBINE-lab/pyroe): An alevin-fry utility python package for building splici references, converting alevin-fry output formats, loading count matrix in Python, adding gene names (instead of just gene IDs) to output matrices, etc.
 7 | - [`fishpond::loadFry()`](https://rdrr.io/github/mikelove/fishpond/man/loadFry.html): An R function for loading the count matrix as a [SingleCellExperiment](https://bioconductor.org/packages/release/bioc/html/SingleCellExperiment.html) object.
 8 | 
 9 | For processing data simply using the `usefulaf` image, check our latest tutorial [here](https://combine-lab.github.io/alevin-fry-tutorials/2021/quickstart-usefulaf-singularity/).
10 | 
11 | To pull the Docker or Singularity image, run the matching command below in bash. Note that the image is $\sim$ 1.65 GB.
12 | 
13 | ```bash
14 | # if you use Docker
15 | $ docker pull combinelab/usefulaf:latest
16 | 
17 | # if you use Singularity
18 | $ singularity pull docker://combinelab/usefulaf:latest
19 | 
20 | ```
21 | 
22 | ## Usefulaf history
23 | 
24 | [Alevin-fry](https://github.com/COMBINE-lab/alevin-fry) is a fast, accurate, and memory-frugal tool for preprocessing single-cell and single-nucleus RNA-seq data. You can read more about alevin-fry in the alevin-fry [pre-print](https://www.biorxiv.org/content/10.1101/2021.06.29.450377v2) and [paper](https://www.nature.com/articles/s41592-022-01408-3).
25 | 
26 | This repository was created initially with scripts, functions, and utilities that are useful for preparing data for processing with alevin-fry, as well as for reading alevin-fry data into other packages for downstream analysis. It also accompanies a Docker/Singularity container containing all of this relevant software in one place. However, as `alevin-fry` has continued to grow, all of that relevant functionality found its way into other, more stable and permanent homes (e.g. [`pyroe`](https://github.com/COMBINE-lab/pyroe) for splici reference construction and loading data in Python, [`roe`](https://github.com/COMBINE-lab/roe) for splici reference construction in R and [`fishpond`](https://bioconductor.org/packages/release/bioc/html/fishpond.html) for loading data in `R`). Finally, this repository also contained a bash script called `simpleaf` to simplify common workflows with `alevin-fry`. That, too, has evolved into its own (much more feature-rich and comprehensive) tool, living in its own repository ([`simpleaf`](https://github.com/COMBINE-lab/simpleaf)).
27 | 
28 | As such, all the scripts and functions in this repository have been retired. However, as usefulaf is still the only place that provides all these functionalities, we decided to turn `usefulaf` into an all-in-one [Docker/Singularity image](https://hub.docker.com/r/combinelab/usefulaf/tags) that makes use of all the new tools listed above. This image has replaced the older `usefulaf` image that made use of the varied assortment of scripts and tools hosted in this repository.
29 | 


--------------------------------------------------------------------------------
/bash/geneid_to_name.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # Extract a gene-id / gene-name mapping from a GTF file by way of gffread.
 4 | # The GTF is converted to GFF inside a temporary directory, and the attribute
 5 | # column of every record mentioning "gene_name" is reduced to two fields.
 6 | 
 7 | # Print the command-line help text.
 8 | display_usage() { 
 9 |   echo "This script returns the gene id to gene name mapping as a TSV file using gffread"
10 |   echo -e "\nUsage: $0 [options]" 
11 |   echo -e "\toptions:" 
12 |   echo -e "\t -g, --gtf REQUIRED path to a GTF file"
13 |   echo -e "\t -o, --output REQUIRED path to the output TSV file (will be created if it doesn't exist)"
14 |   echo -e "\t --gffread, path to the gffread binary"
15 |   echo -e "\t -h, --help display this help message"
16 | } 
17 | 
18 | # Parse options. A branch that reads a value from $2 shifts once for that value;
19 | # the shift after `esac` then drops the option name itself. -h takes no value,
20 | # so it must not shift on its own.
21 | while [[ "$#" -gt 0 ]]; do 
22 |   case $1 in 
23 |     -g|--gtf) gtf="$2"; shift ;;
24 |     -o|--output) output="$2"; shift ;;
25 |     --gffread) gffread="$2"; shift ;;
26 |     -h|--help) help=1 ;;
27 |     *) echo "Unknown parameter passed: $1"; exit 1 ;;
28 |   esac 
29 |   shift 
30 | done
31 | 
32 | if [[ -n "$help" ]]; then
33 |   display_usage
34 |   exit 0
35 | fi
36 | 
37 | # Both the input GTF and the output path are mandatory.
38 | if [[ -z "$output" || -z "$gtf" ]]; then
39 |   display_usage
40 |   exit 1
41 | fi
42 | 
43 | # Without --gffread, use whatever `gffread` resolves to on PATH.
44 | if [[ -z "$gffread" ]]; then 
45 |   gffread="gffread"
46 | fi
47 | 
48 | # make temp dir
49 | WORK_DIR=$(mktemp -d)
50 | 
51 | # check if tmp dir was created
52 | if [[ -z "$WORK_DIR" || ! -d "$WORK_DIR" ]]; then
53 |     echo "Could not create temp dir"
54 |     exit 1
55 | fi
56 | 
57 | # deletes the temp directory
58 | cleanup() {
59 |     rm -rf "$WORK_DIR"
60 | }
61 | 
62 | # Register the cleanup as soon as the directory exists, so that every later
63 | # exit path (including the failure exit below) removes it.
64 | trap cleanup EXIT
65 | 
66 | # generate gff from gtf; the command is run directly (no eval) with quoted
67 | # arguments so that paths containing spaces or shell metacharacters are safe
68 | if ! "$gffread" "$gtf" -o "$WORK_DIR/genes.gff"; then
69 |     echo "gffread failed on ${gtf}" >&2
70 |     exit 1
71 | fi
72 | 
73 | # make the file: keep attribute fields 2 and 3 of each gene_name record, strip the
74 | # `key=` parts and write the unique value pairs.
75 | # NOTE(review): the pairs are separated by a single space, not a tab, although the
76 | # help text calls the output a TSV file -- confirm what downstream readers expect.
77 | grep "gene_name" "$WORK_DIR/genes.gff" | cut -f9 | cut -d';' -f2,3 | sed 's/=/ /g' | sed 's/;/ /g' | cut -d' ' -f2,4 | sort | uniq > "${output}"
78 | 


--------------------------------------------------------------------------------
/bash/get_10x_permit_lists.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # Download a 10x chromium v2 or v3 permit list into an output directory.
 4 | 
 5 | # Print the command-line help text.
 6 | display_usage() { 
 7 |   echo "This script downloads a 10x chromium v2 or v3 permit list to the specified output directory"
 8 |   echo -e "\nUsage: $0 [options]" 
 9 |   echo -e "\toptions:" 
10 |   echo -e "\t -o, --output REQUIRED path to output directory (will be created if it doesn't exist)"
11 |   echo -e "\t -l, --list REQUIRED permit list to download, one of {v2, v3}"
12 |   echo -e "\t -h, --help display this help message"
13 | } 
14 | 
15 | # Download locations of the two supported permit lists.
16 | v3_plist="https://umd.box.com/shared/static/eo0qlkfqf2v24ws6dfnxty6gqk1otf2h"
17 | v2_plist="https://umd.box.com/shared/static/jbs2wszgbj7k4ic2hass9ts6nhqkwq1p"
18 | 
19 | # Parse options. A branch that reads a value from $2 shifts once for that value;
20 | # the shift after `esac` then drops the option name itself. -h takes no value,
21 | # so it must not shift on its own.
22 | while [[ "$#" -gt 0 ]]; do 
23 |   case $1 in 
24 |     -o|--output) output="$2"; shift ;;
25 |     -l|--list) list="$2"; shift ;;
26 |     -h|--help) help=1 ;;
27 |     *) echo "Unknown parameter passed: $1"; exit 1 ;;
28 |   esac 
29 |   shift 
30 | done
31 | 
32 | if [[ -n "$help" ]]; then
33 |   display_usage
34 |   exit 0
35 | fi
36 | 
37 | # Both the output directory and the list name are mandatory.
38 | if [[ -z "$output" || -z "$list" ]]; then
39 |   display_usage
40 |   exit 1
41 | fi
42 | 
43 | # Map the requested list to its URL; anything other than v2/v3 is an error
44 | # instead of a silent no-op that is then reported as a successful download.
45 | case "$list" in
46 |   v2) url="$v2_plist" ;;
47 |   v3) url="$v3_plist" ;;
48 |   *)
49 |     echo "Unknown permit list: ${list} (expected one of {v2, v3})" >&2
50 |     display_usage
51 |     exit 1
52 |     ;;
53 | esac
54 | 
55 | mkdir -p "$output"
56 | 
57 | # NOTE(review): for wget, -L means --relative (it is curl's -L that follows
58 | # redirects); the flag is kept exactly as it was.
59 | if ! wget -v -O "${output}/10x_${list}_permit.txt" -L "$url"; then
60 |   echo "Failed to download the ${list} permit list" >&2
61 |   exit 1
62 | fi
63 | 
64 | echo -e "\n\noutput written to ${output}/10x_${list}_permit.txt\n\n"
65 | 


--------------------------------------------------------------------------------
/bash/simpleaf:
--------------------------------------------------------------------------------
  1 | #!/bin/bash
  2 | 
  3 | function display_quant_usage() { # print the help text of the 'quant' sub-command
  4 |   echo "This script runs alevin-fry to quantify a single-cell RNA-seq experiment"
  5 |   echo -e "\nUsage: $0 quant [options]" 
  6 |   echo -e "\toptions:" 
  7 |   echo -e "\t -1, --r1 REQUIRED comma separated list of left reads"
  8 |   echo -e "\t -2, --r2 REQUIRED comma separated list of right reads"
  9 |   echo -e "\t -i, --index REQUIRED path to a (sparse or dense) salmon splici index"
 10 |   echo -e "\t -o, --output REQUIRED path to output directory (will be created if it doesn't exist)"
 11 |   echo -e "\t -f, --fmode REQUIRED permit list filter mode, one of {knee, k, unfilt, u}"
 12 |   echo -e "\t -c, --chem REQUIRED chemistry of experiment, one of {v2, v3}"
 13 |   echo -e "\t -r, --res REQUIRED resolution strategy for alevin-fry, one of {cr-like, cr-like-em}"
 14 |   echo -e "\t -m, --t2g REQUIRED three-column txp-to-gene file to pass to alevin-fry quant command"
 15 |   echo -e "\t -t, --threads OPTIONAL number of threads to use when running [default: min(16, num cores)]"
 16 |   echo -e "\t -h, --help display this help message"
 17 | } 
 18 | 
 19 | function display_index_usage() { # print the help text of the 'index' sub-command
 20 |   echo "This script generates a splici reference and indexes it"
 21 |   echo -e "\nUsage: $0 index [options]" 
 22 |   echo -e "\toptions:" 
 23 |   echo -e "\t -f, --fasta REQUIRED genome reference FASTA file"
 24 |   echo -e "\t -g, --gtf REQUIRED GTF file with gene annotations"
 25 |   echo -e "\t -l, --rlen REQUIRED the target read length the index will be built for"
 26 |   echo -e "\t -o, --output REQUIRED path to output directory (will be created if it doesn't exist)"
 27 |   echo -e "\t -s, --spliced OPTIONAL path to FASTA file with extra spliced sequence to add to the index"
 28 |   echo -e "\t -u, --unspliced OPTIONAL path to FASTA file with extra unspliced sequence to add to the index"
 29 |   echo -e "\t -d, --dedup FLAG OPTIONAL deduplicate identical sequences inside the R script when building the splici reference"
 30 |   echo -e "\t -e, --dense FLAG OPTIONAL if this flag is passed, build the dense rather than sparse index for mapping"
 31 |   echo -e "\t -t, --threads OPTIONAL number of threads to use when running [default: min(16, num cores)]"
 32 |   echo -e "\t -h, --help display this help message"
 33 | } 
 34 | 
 35 | function display_usage() { # print the top-level help listing the available sub-commands
 36 |   echo "This script wraps alevin-fry to index a reference or quantify a single-cell RNA-seq experiment"
 37 |   echo -e "\nUsage: $0 index [index_options]" 
 38 |   echo -e "Usage: $0 quant [quant_options]" 
 39 |   echo -e "Usage: $0 -h | --help" 
 40 |   echo -e "use $0 index -h or $0 quant -h to get more help for those sub-commands"
 41 | } 
 42 | 
 43 | # compare two dot-separated version strings; returns 0 if $1 == $2, 1 if $1 > $2 and 2 if $1 < $2.
 44 | # from: https://stackoverflow.com/questions/4023830/how-to-compare-two-strings-in-dot-separated-version-format-in-bash
 45 | function vercomp () {
 46 |     if [[ $1 == $2 ]]
 47 |     then
 48 |         return 0
 49 |     fi
 50 |     local IFS=.  # split the unquoted $1 / $2 on '.' when building the arrays below
 51 |     local i ver1=($1) ver2=($2)
 52 |     # fill empty fields in ver1 with zeros
 53 |     for ((i=${#ver1[@]}; i<${#ver2[@]}; i++))
 54 |     do
 55 |         ver1[i]=0
 56 |     done
 57 |     for ((i=0; i<${#ver1[@]}; i++))
 58 |     do
 59 |         if [[ -z ${ver2[i]} ]]
 60 |         then
 61 |             # fill empty fields in ver2 with zeros
 62 |             ver2[i]=0
 63 |         fi
 64 |         if ((10#${ver1[i]} > 10#${ver2[i]}))  # 10# forces base 10, so a leading zero is not read as octal
 65 |         then
 66 |             return 1
 67 |         fi
 68 |         if ((10#${ver1[i]} < 10#${ver2[i]}))
 69 |         then
 70 |             return 2
 71 |         fi
 72 |     done
 73 |     return 0
 74 | }
 75 | 
 76 | function check_min_version() {
 77 |   # usage: check_min_version <program> <minimum version>; exits the script on failure
 78 |   prog_name=$1; min_ver=$2
 79 |   prog_vstr=$("$prog_name" --version); ec=$?
 80 |   case $ec in
 81 |     0) echo "$prog_vstr";;
 82 |     *) echo "ERROR: $prog_name --version exited with non-zero exit code"; exit 1;;
 83 |   esac
 84 |   # the version is taken to be the second space-separated word of the output
 85 |   # check that the version is new enough
 86 |   prog_ver=$(echo $prog_vstr | cut -d' ' -f2)
 87 |   # quoted so that an empty $prog_ver stays in first position and is rejected
 88 |   vercomp "$prog_ver" "$min_ver"; ec1=$?
 89 |   # vercomp returns 0 for equal and 1 for newer; 2 means older than required
 90 |   if [[ $ec1 -eq 0 || $ec1 -eq 1 ]]; then
 91 |     echo "$prog_name version $prog_ver is sufficiently new."
 92 |     return 0
 93 |   else
 94 |     echo "ERROR: $prog_name version was $prog_ver, require at least $min_ver."
 95 |     exit 1
 96 |   fi
 97 | }
 98 | 
 99 | ###
100 | # end of helper functions, entry point of 
101 | # script.
102 | ###
103 | 
104 | # make sure that an alevin-fry home is set, and work with its resolved path
105 | if [ -z "${ALEVIN_FRY_HOME}" ]; then
106 |   echo "To use $0, you must set a valid ALEVIN_FRY_HOME environment variable"
107 |   exit 1
108 | else
109 |   ALEVIN_FRY_HOME=$(realpath "${ALEVIN_FRY_HOME}")
110 | fi
111 | 
112 | if [ -d "$ALEVIN_FRY_HOME" ]; then
113 |     echo "ALEVIN_FRY_HOME=$ALEVIN_FRY_HOME"
114 | else
115 |     echo "ALEVIN_FRY_HOME did not exist; creating it."
116 |     mkdir -p "$ALEVIN_FRY_HOME"
117 | fi
118 | 
119 | # check that the salmon executable runs and is of at least the required version
120 | salmon="${SALMON_BIN:-salmon}"
121 | check_min_version "$salmon" "1.5.1"
122 | 
123 | # check that the fry executable runs and is of at least the required version
124 | fry="${FRY_BIN:-alevin-fry}"
125 | check_min_version "$fry" "0.4.0"
126 | 
127 | time=${TIME_BIN:-"/usr/bin/time"}
128 | # time writes version to stderr ... sigh
129 | time_vstr=$($time -V 2>&1); ec=$?
130 | if [[ "$ec" -ne 0 ]]; then
131 |   echo "$time -V returned non-zero exit code, please set TIME_BIN to point to a GNU time executable"
132 |   exit 1
133 | else
134 |   if (echo $time_vstr | grep -iq "GNU Time"); then
135 |     echo "$time command appears to execute a valid GNU time"
136 |   else
137 |     echo "$time does not appear to be GNU time ($time -V did not return a string starting with GNU time)"
138 |     exit 1;
139 |   fi
140 | fi
141 | 
142 | # now that we know we have a proper GNU time executable
143 | # redefine it to have the desired options attached
144 | time="$time -v -o"
145 | 
146 | threads=16
147 | # default thread count is min(16, number of cores); np holds the core count
148 | if [ -x "$(command -v nproc)" ]; then
149 |   np=$(nproc)
150 | elif [ -x "$(command -v sysctl)" ]; then
151 |   np=$(sysctl -n hw.ncpu)
152 | else
153 |   np=16
154 | fi
155 | 
156 | if [ "$threads" -gt "$np" ]; then
157 |   threads="$np"
158 | fi
159 | 
160 | function simpleaf_index() { # build the splici reference with R, then index it with salmon
161 |   while [[ "$#" -gt 0 ]]; do
162 |     case $1 in
163 |       -f|--fasta) genome="$2"; shift ;;
164 |       -g|--gtf) gtf="$2"; shift ;;
165 |       -l|--rlen) rlen="$2"; shift ;;
166 |       -o|--output) output="$2"; shift ;;
167 |       -t|--threads) threads="$2"; shift ;;
168 |       -s|--spliced) extra_spliced="$2"; shift ;;
169 |       -u|--unspliced) extra_unspliced="$2"; shift ;;
170 |       -e|--dense) dense_index=1 ;; # flags take no value: no extra shift,
171 |       -d|--dedup) dedup=1 ;;       # otherwise the option that follows
172 |       -h|--help) help=1 ;;         # the flag would be swallowed
173 |       *) echo "Unknown parameter passed: $1"; exit 1 ;;
174 |     esac
175 |     shift
176 |   done
177 |   set -o errexit -o pipefail
178 | 
179 |   if [[ -n "$help" ]]; then
180 |     display_index_usage
181 |     exit 0
182 |   fi
183 | 
184 |   # genome, gtf, read length and output directory are all required
185 |   if [[ -z "$genome" || -z "$gtf" || -z "$rlen" || -z "$output" ]]; then
186 |     display_index_usage
187 |     exit 1
188 |   fi
189 | 
190 |   # make the directory where we will put the reference
191 |   output=$(realpath "${output}")
192 |   outref="$output/ref/"
193 |   mkdir -p "$outref"
194 | 
195 |   cwd=$(pwd)
196 |   cd "../R"  # NOTE: resolved against the caller's working directory, not this script's location
197 | 
198 |   if [[ -z "$extra_spliced" ]]; then
199 |     extra_spliced_flag=""
200 |   else
201 |     extra_spliced_flag="--extra-spliced $extra_spliced"
202 |   fi
203 | 
204 |   if [[ -z "$extra_unspliced" ]]; then
205 |     extra_unspliced_flag=""
206 |   else
207 |     extra_unspliced_flag="--extra-unspliced $extra_unspliced"
208 |   fi
209 | 
210 |   if [[ -z "$dedup" ]]; then
211 |     dedup_flag=""
212 |   else
213 |     dedup_flag="--dedup-seqs"
214 |   fi
215 |   # $ ./build_splici_ref.R <path_to_genome_fasta> <path_to_gtf> <target_read_length> <output_dir>
216 |   cmd="Rscript build_splici_ref.R $genome $gtf $rlen $outref $extra_spliced_flag $extra_unspliced_flag $dedup_flag --filename-prefix splici"
217 |   echo -e "\nExtracting the splici reference using command \n\n $cmd \n"
218 |   eval $cmd
219 | 
220 |   echo -e "\nDone. Building index."
221 |   cd "$cwd"
222 | 
223 |   outidx="$output/index"
224 |   fl=$(( $rlen - 5 ))
225 | 
226 |   # the default is sparse, but if the user
227 |   # passed the -e/--dense flag, then don't
228 |   # pass the sparse flag, and build a dense
229 |   # index instead.
230 |   sparse_flag="--sparse"
231 |   if [[ -n "$dense_index" ]]; then
232 |     sparse_flag=""
233 |   fi
234 | 
235 |   cmd="$salmon index -t $outref/splici_fl$fl.fa -i $outidx -p $threads $sparse_flag"
236 |   echo -e "\nbuilding index:"
237 |   echo "command: $cmd"
238 |   echo "============="
239 |   eval $cmd
240 | 
241 |   cp "$outref/splici_fl${fl}_t2g_3col.tsv" "$outidx/t2g_3col.tsv"
242 |   echo -e "\nDone. Wrote index to $outidx"
243 | }
244 | 
245 | function simpleaf_quant() { # map reads with salmon alevin, then run the alevin-fry gpl/collate/quant steps
246 |   while [[ "$#" -gt 0 ]]; do
247 |     case $1 in
248 |       -1|--r1) read1="$2"; shift ;;
249 |       -2|--r2) read2="$2"; shift ;;
250 |       -i|--index) index="$2"; shift ;;
251 |       -t|--threads) threads="$2"; shift ;;
252 |       -o|--output) output="$2"; shift ;;
253 |       -f|--fmode) fmode="$2"; shift ;;
254 |       -r|--res) res="$2"; shift ;;
255 |       -c|--chem) chem="$2"; shift ;;
256 |       -m|--t2g) t2g="$2"; shift ;;
257 |       -h|--help) help=1 ;; # flag takes no value: no extra shift, or the next option is swallowed
258 |       *) echo "Unknown parameter passed: $1"; exit 1 ;;
259 |     esac
260 |     shift
261 |   done
262 | 
263 |   set -o errexit -o pipefail
264 | 
265 |   if [[ -n "$help" ]]; then
266 |     display_quant_usage
267 |     exit 0
268 |   fi
269 | 
270 |   # every option except -h is required (threads carries a default)
271 |   if [[ -z "$read1" || -z "$read2" || -z "$index" || -z "$threads" || -z "$output" || -z "$fmode" || -z "$res" || -z "$chem" || -z "$t2g" ]]; then
272 |     display_quant_usage
273 |     exit 1
274 |   fi
275 | 
276 |   ## Check that the filter mode is one of knee,k,unfilt,u
277 |   if [[ "$fmode" == "k" || "$fmode" == "knee" || "$fmode" == "unfilt" || "$fmode" == "u" ]]; then
278 |     echo "filter mode is : $fmode"
279 |   else
280 |     echo "filter mode must be one of {knee, k, unfilt, u}"
281 |     exit 1
282 |   fi
283 | 
284 |   ## Check that the chemistry is either v2 or v3
285 |   if [[ "$chem" == "v2" || "$chem" == "v3" ]]; then
286 |     echo "chemistry is : 10x chromimum $chem"
287 |   else
288 |     echo "chemistry mode must be one of {v2, v3}"
289 |     exit 1
290 |   fi
291 | 
292 |   ## If the chemistry is v2, set the chemflag and the filter flags
293 |   if [[ $chem == "v2" ]]; then
294 |     if [[ "$fmode" == "unfilt" || "$fmode" == "u" ]]; then
295 |       v2file=$ALEVIN_FRY_HOME/plist/10x_v2_permit.txt
296 |       if [ ! -f "$v2file" ]; then
297 |             echo "10x v2 permit list does not exist, downloading now"
298 |             bash get_10x_permit_lists.sh -o "$ALEVIN_FRY_HOME/plist" -l v2
299 |       fi
300 |       permitmode="-u $ALEVIN_FRY_HOME/plist/10x_v2_permit.txt"
301 |     else
302 |       permitmode="-k"
303 |     fi
304 |     chemflag="--chromium"
305 |   fi
306 | 
307 |   ## If the chemistry is v3, set the chemflag and the filter flags
308 |   if [[ $chem == "v3" ]]; then
309 |     if [[ "$fmode" == "unfilt" || "$fmode" == "u" ]]; then
310 |       v3file=$ALEVIN_FRY_HOME/plist/10x_v3_permit.txt
311 |       if [ ! -f "$v3file" ]; then
312 |             echo "10x v3 permit list does not exist, downloading now"
313 |             bash get_10x_permit_lists.sh -o "$ALEVIN_FRY_HOME/plist" -l v3
314 |       fi
315 |       permitmode="-u $ALEVIN_FRY_HOME/plist/10x_v3_permit.txt"
316 |     else
317 |       permitmode="-k"
318 |     fi
319 |     chemflag="--chromiumV3"
320 |   fi
321 | 
322 |   mkdir -p "$output/logdir"
323 |   logdir="$output/logdir"
324 | 
325 |   ## turn comma separated list into space separated list
326 |   read1=`echo $read1 | tr ',' ' '`
327 |   read2=`echo $read2 | tr ',' ' '`
328 | 
329 |   ## map
330 |   cmd="$time $logdir/map.time $salmon alevin -l ISR -i $index -1 $read1 -2 $read2 -p $threads $chemflag -o $output/alevin_map --sketch"
331 |   echo "mapping:"
332 |   echo "command: $cmd"
333 |   echo "============="
334 |   eval $cmd
335 | 
336 |   ### generate permit list
337 |   cmd="$time $logdir/gpl.time $fry generate-permit-list $permitmode -d fw -i $output/alevin_map -o $output/gpl/ |& stdbuf -oL tr '\r' '\n' > $logdir/gpl.log"
338 |   echo "gpl:"
339 |   echo "command: "$cmd
340 |   echo "============="
341 |   eval $cmd
342 | 
343 |   ### collate
344 |   cmd="$time $logdir/collate.time $fry collate -i $output/gpl/ -r $output/alevin_map -t $threads |& stdbuf -oL tr '\r' '\n' > $logdir/collate.log"
345 |   echo "collate:"
346 |   echo "command: "$cmd
347 |   echo "============="
348 |   eval $cmd
349 | 
350 |   ### quant
351 |   cmd="$time $logdir/quant.time $fry quant -r $res --use-mtx -m $t2g -i $output/gpl/ -o $output/quant -t $threads |& stdbuf -oL tr '\r' '\n' > $logdir/quant.log"
352 |   echo "quant:"
353 |   echo "command: "$cmd
354 |   echo "============="
355 |   eval $cmd
356 | 
357 |   echo "Finished! Quantification results are at $output/quant"
358 | }
359 | 
360 | # Entry point: with no arguments, or when help is requested, print the
361 | # top-level usage; otherwise hand the remaining arguments to the sub-command.
362 | if [[ $# -eq 0 ]]; then
363 |   display_usage
364 |   exit 0
365 | fi
366 | 
367 | case "$1" in
368 |   -h|--help)
369 |     display_usage
370 |     exit 0
371 |     ;;
372 |   quant)
373 |     shift
374 |     simpleaf_quant "$@"
375 |     ;;
376 |   index)
377 |     shift
378 |     simpleaf_index "$@"
379 |     ;;
380 |   *)
381 |     echo -e "\nERROR must pass a valid sub-command, index or quant"
382 |     display_usage
383 |     exit 1
384 |     ;;
385 | esac
386 | 


--------------------------------------------------------------------------------
/bash/test_simpleaf.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # In this script we test simpleaf using a toy read-reference set
 3 | # template took from here https://stackoverflow.com/a/34676160/18156398
 4 | # the directory of the script
 5 | DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
 6 | echo "Testing simpleaf using a toy read-reference set"
 7 | 
 8 | # simpleaf resolves ../R and get_10x_permit_lists.sh against the current
 9 | # working directory, so run from the directory that holds this script
10 | cd "$DIR" || exit 1
11 | 
12 | # the temp directory used, created in the default location
13 | # (pass -p "$DIR" to mktemp to create it next to this script instead)
14 | WORK_DIR=$(mktemp -d)
15 | 
16 | # check if tmp dir was created before deriving any other path from it
17 | if [[ ! "$WORK_DIR" || ! -d "$WORK_DIR" ]]; then
18 |         echo "Could not create temp dir"
19 |         exit 1
20 | fi
21 | 
22 | LOG_DIR="${WORK_DIR}/simpleaf_logs"
23 | ALEVIN_FRY_HOME="${WORK_DIR}/alevin_fry_home"
24 | mkdir -p "$LOG_DIR"
25 | 
26 | # deletes the temp directory
27 | function cleanup {
28 |         rm -rf "$WORK_DIR"
29 |         echo "  - Deleted temp working directory $WORK_DIR"
30 | }
31 | 
32 | # implementation of script starts here
33 | echo "  - Downloading the toy read-reference set"
34 | if ! wget https://umd.box.com/shared/static/lx2xownlrhz3us8496tyu9c4dgade814.gz -O "${WORK_DIR}/toy_read_ref_set.tar.gz" -q; then
35 |         echo "ERROR when downloading the toy read-reference set"
36 |         exit 1
37 | fi
38 | if ! tar -xf "${WORK_DIR}/toy_read_ref_set.tar.gz" -C "${WORK_DIR}"; then
39 |         echo "ERROR when extracting the toy read-reference set"
40 |         exit 1
41 | fi
42 | 
43 | echo "  - Testing simpleaf index"
44 | REF_DIR="${WORK_DIR}/toy_read_ref_set/toy_human_ref"
45 | ALEVIN_FRY_HOME="$ALEVIN_FRY_HOME" "${DIR}/simpleaf" index \
46 |         -f "${REF_DIR}/fasta/genome.fa" \
47 |         -g "${REF_DIR}/genes/genes.gtf" \
48 |         -l 91 -o "${WORK_DIR}/test_index_outdir"
49 | status=$?
50 | 
51 | if [ $status -ne 0 ]; then
52 |         echo "ERROR when running simpleaf index"
53 |         exit 1
54 | else
55 |         echo "simpleaf index ran successfully"
56 | fi
57 | 
58 | echo "  - Testing simpleaf quant"
59 | FASTQ_DIR="${WORK_DIR}/toy_read_ref_set/toy_read_fastq"
60 | ALEVIN_FRY_HOME="$ALEVIN_FRY_HOME" "${DIR}/simpleaf" quant \
61 |         -1 "${FASTQ_DIR}/selected_R1_reads.fastq" \
62 |         -2 "${FASTQ_DIR}/selected_R2_reads.fastq" \
63 |         -i "${WORK_DIR}/test_index_outdir/index" \
64 |         -o "${WORK_DIR}/test_quant_outdir" \
65 |         -f u -c v3 -r cr-like \
66 |         -m "${WORK_DIR}/test_index_outdir/index/t2g_3col.tsv" \
67 |         -t 16
68 | status=$?
69 | 
70 | if [ $status -ne 0 ]; then
71 |         echo "ERROR when running simpleaf quant"
72 |         exit 1
73 | else
74 |         echo "  - simpleaf quant ran successfully"
75 | fi
76 | 
77 | # the working directory is removed only after a fully successful run; every
78 | # failure path above exits first and leaves it in place
79 | cleanup
80 | 
81 | echo "simpleaf works!"
82 | 


--------------------------------------------------------------------------------
/docker/Dockerfile:
--------------------------------------------------------------------------------
 1 | # image: COMBINE-lab/dockeraf
 2 | FROM condaforge/mambaforge:4.11.0-4
 3 | LABEL maintainer="salmon.maintainer@gmail.com"
 4 | 
 5 | RUN mamba install -c anaconda git
 6 | RUN mamba install -c bioconda salmon gffread
 7 | RUN mamba install -c conda-forge cxx-compiler
 8 | RUN mamba install -c conda-forge r-base r-essentials
 9 | RUN mamba install -c bioconda bioconductor-eisar bioconductor-biostrings bioconductor-bsgenome bioconductor-genomicfeatures bioconductor-singlecellexperiment
10 | RUN mamba install -c conda-forge r-stringr r-argparser r-rjson r-matrix
11 | RUN mamba install -c conda-forge time
12 | RUN mamba install -c conda-forge r-devtools
13 | RUN R -e "devtools::install_github('COMBINE-lab/roe')"
14 | RUN mamba install -c bioconda alevin-fry
15 | 
16 | RUN touch /root/.bashrc \
17 |        && echo "export ALEVIN_FRY_HOME=/workdir/.afhome" >> /root/.bashrc \
18 |        && echo "export TIME_BIN=/opt/conda/bin/time" >> /root/.bashrc
19 | 
20 | RUN touch /root/.Rprofile 
21 | 
22 | RUN echo 'local({r <- getOption("repos") \n\
23 |       r["CRAN"] <- "http://cran.r-project.org" \n\
24 |       options(repos=r) \n\
25 |       })' >> /root/.Rprofile
26 | 
27 | RUN git clone https://github.com/COMBINE-lab/usefulaf.git
28 | 
29 | ENV ALEVIN_FRY_HOME=/workdir/.afhome
30 | ENV TIME_BIN=/opt/conda/bin/time
31 | 


--------------------------------------------------------------------------------
/docker/Singularity.def:
--------------------------------------------------------------------------------
 1 | Bootstrap: docker
 2 | From: condaforge/mambaforge:4.10.3-1
 3 | Stage: spython-base
 4 | 
 5 | %labels
 6 | MAINTAINER salmon.maintainer@gmail.com
 7 | %post
 8 | # image: COMBINE-lab/dockeraf
 9 | 
10 | mamba install -c anaconda git
11 | mamba install -c bioconda salmon alevin-fry gffread
12 | mamba install -c conda-forge cxx-compiler
13 | mamba install -c conda-forge r-base r-essentials
14 | mamba install -c bioconda bioconductor-eisar bioconductor-biostrings bioconductor-bsgenome bioconductor-genomicfeatures bioconductor-singlecellexperiment
15 | mamba install -c conda-forge r-stringr r-argparser r-rjson r-matrix
16 | mamba install -c conda-forge time
17 | 
18 | git clone https://github.com/COMBINE-lab/usefulaf.git
19 | %environment
20 |   export ALEVIN_FRY_HOME="/workdir/.afhome"
21 |   export TIME_BIN="/opt/conda/bin/time"
22 | %runscript
23 |   exec /bin/bash "$@"
24 | %startscript
25 |   exec /bin/bash "$@"
26 | 


--------------------------------------------------------------------------------
/docker/build_docker.sh:
--------------------------------------------------------------------------------
1 | #! /bin/bash
2 | USEFULAF_VERSION=0.5.5
3 | docker build --no-cache -t combinelab/usefulaf:${USEFULAF_VERSION} -t combinelab/usefulaf:latest .
4 | 


--------------------------------------------------------------------------------
/python/load_fry.py:
--------------------------------------------------------------------------------
  1 | import scanpy
  2 | 
  3 | def load_fry(frydir, which_counts={'X' : ['S','A']}, verbose=False):
  4 |     """
  5 |     
  6 |     Parameters:
  7 |         frydir - The directory containing the alevin-fry quantification (i.e. the the quant.json file & alevin subdirectory).
  8 |         verbose - True if messages (including error messages) should be printed out, False if function should be quiet.
  9 |         which_count - Dictionary specifying how a USA mode matrix should be returned or combined into the resulting 
 10 |                       output matrix.  If the input is not a USA mode quantification directory, this parameter is ignored
 11 |                       and the count matrix is returned in the `X` field of the returned `AnnData` object.  If the input
 12 |                       quantification directory contains a USA mode quantification, then there are 3 sub-matrices that can 
 13 |                       be referenced in the dictionary; 'U', 'S', 'A' containing, respectively, unspliced, spliced and 
 14 |                       ambiguous counts.  The dictionary should have entries of the form `key` (str) : `value` (list[str]).
 15 |                       The following constraints apply : there should be one key-value pair with the key `X`, the resulting
 16 |                       counts will be returned in the `X` field of the AnnData object. There can be an arbitrary number
 17 |                       of other key-value pairs, but each will be returned as a layer of the resulting AnnData object.
 18 |                       Within the key-value pairs, the key refers to the layer name that will be given to the combined 
 19 |                       count matrix upon output, and the value should be a subset of `['U', 'S', 'A']` that defines 
 20 |                       which sub-matrices should be summed.  For example:
 21 |                       {'X' : ['S', 'A'], 'unspliced' : ['U']}
 22 | 
 23 |                       will result in a return AnnData object where the X field has a matrix in which each entry 
 24 |                       corresponds to the summed spliced and ambiguous counts for each gene in each cell, and there
 25 |                       is an additional 'unspliced' layer, whose counts are taken directly from the unspliced sub-matrix.
 26 | 
 27 |     Returns:
 28 |         An AnnData object with X and layers corresponding to the requested `which_counts`, or None if an 
 29 |         error is encountered.
 30 |     """
 31 |     import json
 32 |     import os
 33 |     import pandas as pd
 34 | 
 35 |     # since alevin-fry 0.4.1 the generic "meta_info.json"
 36 |     # has been replaced by a more informative name for each
 37 |     # sub-command. For quantification, it is "quant.json".
 38 |     # we check for both files here, in order.
 39 |     meta_info_files = ["quant.json", "meta_info.json"]
 40 | 
 41 |     fpath = os.path.sep.join([frydir, meta_info_files[0]])
 42 |     # first, check for the new file, if we don't find it, check
 43 |     # for the old one.
 44 |     if not os.path.exists(fpath):
 45 |         if verbose:
 46 |             print(f"Did not find a {meta_info_files[0]} file, checking for older {meta_info_files[1]}.")
 47 |         fpath = os.path.sep.join([frydir, meta_info_files[1]])
 48 |         # if we don't find the old one either, then return None
 49 |         if not os.path.exists(fpath):
 50 |             if verbose:
 51 |                 print(f"Found no {meta_info_files[1]} file either; cannot proceed.")
 52 |             return None
 53 | 
 54 |     # if we got here then we had a valid json file, so 
 55 |     # use it to get the number of genes, and if we are 
 56 |     # in USA mode or not.
 57 |     meta_info = json.load(open(fpath))
 58 |     ng = meta_info['num_genes']
 59 |     usa_mode = meta_info['usa_mode']
 60 | 
 61 |     # if we are in USA mode
 62 |     if usa_mode:
 63 |         # make sure that num_genes is a multiple of 3
 64 |         if ng %3 != 0:
 65 |             if verbose:
 66 |                 print("Found USA mode, but num genes = {ng} is not a multiple of 3; cannot proceed.")
 67 |             return None
 68 |         # each gene has 3 splicing statuses, so the actual number of distinct 
 69 |         # genes is ng/3.
 70 |         ng = int(ng/3)
 71 |         if verbose:
 72 |             print("processing input in USA mode, will return {}".format("+".join(which_counts)))
 73 |               
 74 |         # make sure which_counts isn't empty
 75 |         assert(len(which_counts) > 0)  
 76 | 
 77 |         # make sure the specification in which_counts is OK
 78 |         if 'X' not in which_counts:
 79 |             if verbose:
 80 |                 print('In USA mode some sub-matrices must be assigned to the \"X\" (default) output.')
 81 |             return None
 82 |         if verbose:
 83 |             print(f"will populate output field X with sum of counts frorm {which_counts['X']}.")
 84 | 
 85 |         for k,v in which_counts.items():
 86 |             valid_elem = len(set(v) - set(['U', 'S', 'A'])) == 0
 87 |             if not valid_elem:
 88 |                 if verbose:
 89 |                     print(f'Found non-USA element in which_count element list \"{v}\" for key \"{k}\"; cannot proceed.')
 90 |                 return None
 91 |             if verbose and (k != 'X'):
 92 |                 print(f'will combine {v} into output layer {k}.') 
 93 | 
 94 |     elif verbose:
 95 |         print("Processing input in standard mode, will return processed count (which_count will be ignored).")
 96 | 
 97 |     # read the actual input matrix
 98 |     af_raw = scanpy.read_mtx(os.path.sep.join([frydir, "alevin", "quants_mat.mtx"]))
 99 |     afg = [ l.rstrip() for l in open(os.path.sep.join([frydir, "alevin", "quants_mat_cols.txt"])).readlines()][:ng]
100 |     # read the gene ids
101 |     afg_df =  pd.DataFrame(afg, columns=["gene_ids"])
102 |     afg_df = afg_df.set_index("gene_ids")
103 |     # and the barcodes
104 |     abc = [ l.rstrip() for l in open(os.path.sep.join([frydir, "alevin", "quants_mat_rows.txt"])).readlines() ]
105 |     abc_df = pd.DataFrame(abc, columns=["barcodes"])
106 |     abc_df.index = abc_df["barcodes"]
107 |     
108 |     x = af_raw.X
109 |     # if we're not in USA mode, just combine this info into 
110 |     # an AnnData object
111 |     if not usa_mode:
112 |         af = scanpy.AnnData(x.T, var=abc_df, obs=afg_df)
113 |         af = af.T 
114 |     else: # USA mode
115 |         # otherwise, combine the sub-matrices into the output object as 
116 |         # specified by `which_counts`
117 |         rd = {'S' : range(0,ng), 'U' : range(ng, 2*ng), 'A' : range(2*ng,3*ng)}
118 |         xcounts = which_counts['X']
119 |         o = x[:, rd[xcounts[0]]]
120 |         for wc in xcounts[1:]:
121 |             o += x[:, rd[wc]]
122 |         af = scanpy.AnnData(o.T, var=abc_df, obs=afg_df)
123 |         af = af.T
124 | 
125 |         # now, if there are other layers requested, populate those
126 |         for other_layer in which_counts.keys() - 'X':
127 |             xcounts = which_counts[other_layer]
128 |             o = x[:, rd[xcounts[0]]]
129 |             for wc in xcounts[1:]:
130 |                 o += x[:, rd[wc]] 
131 |             af.layers[other_layer] = o
132 |     return af
133 | 


--------------------------------------------------------------------------------
/simpleaf/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [package]
 2 | name = "simpleaf"
 3 | version = "0.1.0"
 4 | edition = "2021"
 5 | 
 6 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
 7 | 
 8 | [dependencies]
 9 | anyhow = "^1.0"
10 | clap = { version = ">=3.2.12", features = ["derive", "wrap_help", "cargo", "deprecated", "wrap_help"]} 
11 | cmd_lib = "^1.3.0"
12 | env_logger = "^0.9.0"
13 | log = "^0.4.17"
14 | semver = "^1.0.12"
15 | serde = {version = "1.0.139", features = ["derive"]}
16 | serde_json = "1.0.82"
17 | time = {version = "^0.3.11", features = ["macros", "formatting", "parsing", "serde", "serde-human-readable"]}
18 | which = "^4.2.5"
19 | 
20 | 
21 | [profile.release]
22 | lto = "thin"
23 | opt-level = 3
24 | 


--------------------------------------------------------------------------------
/simpleaf/src/main.rs:
--------------------------------------------------------------------------------
  1 | extern crate env_logger;
  2 | #[macro_use]
  3 | extern crate log;
  4 | 
  5 | use anyhow::{anyhow, bail, Context, Result};
  6 | use clap::{ArgGroup, Parser, Subcommand};
  7 | use cmd_lib::run_fun;
  8 | use env_logger::Env;
  9 | use serde_json::json;
 10 | use time::Instant;
 11 | 
 12 | use std::env;
 13 | use std::io::BufReader;
 14 | use std::path::PathBuf;
 15 | 
 16 | mod utils;
 17 | use utils::af_utils::*;
 18 | use utils::prog_utils::*;
 19 | 
 20 | #[derive(Debug, Subcommand)]
 21 | enum Commands {
 22 |     /// build the splici index
 23 |     #[clap(arg_required_else_help = true)]
 24 |     Index {
 25 |         /// reference genome
 26 |         #[clap(short, long, value_parser)]
 27 |         fasta: PathBuf,
 28 | 
 29 |         /// reference GTF file
 30 |         #[clap(short, long, value_parser)]
 31 |         gtf: PathBuf,
 32 | 
 33 |         /// the target read length the index will be built for
 34 |         #[clap(short, long, value_parser)]
 35 |         rlen: u32,
 36 | 
 37 |         /// path to output directory (will be created if it doesn't exist)
 38 |         #[clap(short, long, value_parser)]
 39 |         output: PathBuf,
 40 | 
 41 |         /// path to FASTA file with extra spliced sequence to add to the index
 42 |         #[clap(short, long, value_parser)]
 43 |         spliced: Option<PathBuf>,
 44 | 
 45 |         /// path to FASTA file with extra unspliced sequence to add to the index
 46 |         #[clap(short, long, value_parser)]
 47 |         unspliced: Option<PathBuf>,
 48 | 
 49 |         /// deduplicate identical sequences inside the R script when building the splici reference
 50 |         #[clap(short = 'd', long = "dedup", action)]
 51 |         dedup: bool,
 52 | 
 53 |         /// if this flag is passed, build the sparse rather than dense index for mapping
 54 |         #[clap(short = 'p', long = "sparse", action)]
 55 |         sparse: bool,
 56 | 
 57 |         /// number of threads to use when running [default: min(16, num cores)]"
 58 |         #[clap(short, long, default_value_t = 16, value_parser)]
 59 |         threads: u32,
 60 |     },
 61 |     /// quantify a sample
 62 |     #[clap(arg_required_else_help = true)]
 63 |     #[clap(group(
 64 |             ArgGroup::new("filter")
 65 |             .required(true)
 66 |             .args(&["knee", "unfiltered-pl", "forced-cells", "expect-cells"])
 67 |             ))]
 68 |     Quant {
 69 |         /// path to index
 70 |         #[clap(short, long, value_parser)]
 71 |         index: PathBuf,
 72 | 
 73 |         /// path to read 1 files
 74 |         #[clap(short = '1', long = "reads1", value_parser)]
 75 |         reads1: Vec<PathBuf>,
 76 | 
 77 |         /// path to read 2 files
 78 |         #[clap(short = '2', long = "reads2", value_parser)]
 79 |         reads2: Vec<PathBuf>,
 80 | 
 81 |         /// number of threads to use when running [default: min(16, num cores)]
 82 |         #[clap(short, long, default_value_t = 16, value_parser)]
 83 |         threads: u32,
 84 | 
 85 |         /// use knee filtering mode
 86 |         #[clap(short, long, action)]
 87 |         knee: bool,
 88 | 
 89 |         /// use unfiltered permit list
 90 |         #[clap(short, long, action)]
 91 |         unfiltered_pl: bool,
 92 | 
 93 |         /// use a filtered, explicit permit list
 94 |         #[clap(short, long, value_parser)]
 95 |         explicit_pl: Option<PathBuf>,
 96 | 
 97 |         /// use forced number of cells
 98 |         #[clap(short, long, value_parser)]
 99 |         forced_cells: Option<usize>,
100 | 
101 |         /// use expected number of cells
102 |         #[clap(short, long, value_parser)]
103 |         expect_cells: Option<usize>,
104 | 
105 |         /// resolution mode
106 |         #[clap(short, long, value_parser = clap::builder::PossibleValuesParser::new(["cr-like", "cr-like-em", "parsimony", "parsimony-em", "parsimony-gene", "parsimony-gene-em"]))]
107 |         resolution: String,
108 | 
109 |         /// chemistry
110 |         #[clap(short, long, value_parser)]
111 |         chemistry: String,
112 | 
113 |         /// transcript to gene map
114 |         #[clap(short = 'm', long, value_parser)]
115 |         t2g_map: PathBuf,
116 | 
117 |         /// output directory
118 |         #[clap(short, long, value_parser)]
119 |         output: PathBuf,
120 |     },
121 |     /// set paths to the programs that simpleaf will use
122 |     SetPaths {
123 |         /// path to salmon to use
124 |         #[clap(short, long, value_parser)]
125 |         salmon: Option<PathBuf>,
126 |         /// path to alevin-fry to use
127 |         #[clap(short, long, value_parser)]
128 |         alevin_fry: Option<PathBuf>,
129 |         /// path to pyroe to use
130 |         #[clap(short, long, value_parser)]
131 |         pyroe: Option<PathBuf>,
132 |     },
133 | }
134 | 
135 | /// simplifying alevin-fry workflows
136 | #[derive(Debug, Parser)]
137 | struct Cli {
138 |     #[clap(subcommand)]
139 |     command: Commands, // the `Commands` variant the user invoked, with its parsed arguments
140 | }
141 | 
142 | enum Chemistry { // chemistry parsed from the `chemistry` string given to `quant`
143 |     TenxV2, // selected by the string "10xv2"
144 |     TenxV3, // selected by the string "10xv3"
145 |     Other(String), // any other string, kept verbatim
146 | }
147 | 
148 | enum PermitListResult { // outcome of `get_permit_if_absent`
149 |     DownloadSuccessful(PathBuf), // the list was just downloaded to this path
150 |     AlreadyPresent(PathBuf), // the list already existed at this path; nothing was downloaded
151 |     UnregisteredChemistry, // the chemistry is neither 10x v2 nor 10x v3, so no list is known
152 | }
153 | 
154 | /// Returns the path to the 10x permit list for `chem`, downloading it on first use.
155 | ///
156 | /// Lists are cached in `$ALEVIN_FRY_HOME/plist`; non-10x chemistries yield
157 | /// `UnregisteredChemistry`. The download goes to a temporary name and is renamed only
158 | /// after `wget` succeeds, so a failed download is never later reported as present.
159 | /// Errors if `ALEVIN_FRY_HOME` is unset or if creating, downloading or renaming fails.
160 | fn get_permit_if_absent(chem: Chemistry) -> Result<PermitListResult> {
161 |     let (chem_file, dl_url) = match chem {
162 |         Chemistry::TenxV2 => (
163 |             "10x_v2_permit.txt",
164 |             "https://umd.box.com/shared/static/jbs2wszgbj7k4ic2hass9ts6nhqkwq1p",
165 |         ),
166 |         Chemistry::TenxV3 => (
167 |             "10x_v3_permit.txt",
168 |             "https://umd.box.com/shared/static/eo0qlkfqf2v24ws6dfnxty6gqk1otf2h",
169 |         ),
170 |         Chemistry::Other(_) => return Ok(PermitListResult::UnregisteredChemistry),
171 |     };
172 |     let af_home = env::var("ALEVIN_FRY_HOME").map_err(|e| {
173 |         anyhow!("could not resolve $ALEVIN_FRY_HOME environment variable : {}", e)
174 |     })?;
175 |     let odir = PathBuf::from(af_home).join("plist");
176 |     let permit_path = odir.join(chem_file);
177 |     if permit_path.exists() {
178 |         return Ok(PermitListResult::AlreadyPresent(permit_path));
179 |     }
180 | 
181 |     std::fs::create_dir_all(&odir)?;
182 |     // `wget -O` creates its output file even when the transfer fails, so use a scratch name
183 |     let partial_path = odir.join(format!("{}.part", chem_file));
184 |     let r = std::process::Command::new("wget")
185 |         .arg("-v")
186 |         .arg("-O")
187 |         .arg(&partial_path)
188 |         .arg("-L")
189 |         .arg(dl_url)
190 |         .output()?;
191 |     if !r.status.success() {
192 |         // best effort: the scratch file may never have been created
193 |         let _ = std::fs::remove_file(&partial_path);
194 |         return Err(anyhow!("failed to download permit list {:?}", r.status));
195 |     }
196 |     std::fs::rename(&partial_path, &permit_path)?;
197 |     Ok(PermitListResult::DownloadSuccessful(permit_path))
198 | }
199 | 
200 | /// Clamps a requested thread count to the parallelism available on this machine,
201 | /// warning when it is reduced; if that cannot be determined, returns it unchanged.
202 | fn cap_threads(requested: u32) -> u32 {
203 |     match std::thread::available_parallelism() {
204 |         Ok(n) if requested > n.get() as u32 => {
205 |             let max_threads = n.get() as u32;
206 |             warn!(
207 |                 "The maximum available parallelism is {}, but {} threads were requested.",
208 |                 max_threads, requested
209 |             );
210 |             warn!("setting number of threads to {}", max_threads);
211 |             max_threads
212 |         }
213 |         _ => requested,
214 |     }
215 | }
216 | 
217 | /// Entry point: parses the command line and runs the chosen subcommand.
218 | ///
219 | /// `set-paths` records validated program paths in `$ALEVIN_FRY_HOME/simpleaf_info.json`;
220 | /// `index` and `quant` read them back and stop at the first external step that fails.
221 | fn main() -> anyhow::Result<()> {
222 |     env_logger::Builder::from_env(Env::default().default_filter_or("info")).init();
223 | 
224 |     // parse first, so `--help` and usage errors do not require $ALEVIN_FRY_HOME
225 |     let cli_args = Cli::parse();
226 | 
227 |     const AF_HOME: &str = "ALEVIN_FRY_HOME";
228 |     let af_home_path = match env::var(AF_HOME) {
229 |         Ok(p) => PathBuf::from(p),
230 |         Err(e) => {
231 |             bail!(
232 |                 "${} is unset {}, please set this environment variable to continue.",
233 |                 AF_HOME,
234 |                 e
235 |             );
236 |         }
237 |     };
238 | 
239 |     match cli_args.command {
240 |         Commands::SetPaths {
241 |             salmon,
242 |             alevin_fry,
243 |             pyroe,
244 |         } => {
245 |             let rp = get_required_progs_from_paths(salmon, alevin_fry, pyroe)?;
246 | 
247 |             if rp.salmon.is_none() {
248 |                 bail!("Suitable salmon executable not found");
249 |             }
250 |             if rp.alevin_fry.is_none() {
251 |                 bail!("Suitable alevin_fry executable not found");
252 |             }
253 |             if rp.pyroe.is_none() {
254 |                 bail!("Suitable pyroe executable not found");
255 |             }
256 | 
257 |             let simpleaf_info_file = af_home_path.join("simpleaf_info.json");
258 |             let simpleaf_info = json!({ "prog_info": rp });
259 | 
260 |             std::fs::write(
261 |                 &simpleaf_info_file,
262 |                 serde_json::to_string_pretty(&simpleaf_info).unwrap(),
263 |             )
264 |             .with_context(|| format!("could not write {}", simpleaf_info_file.display()))?;
265 |         }
266 |         Commands::Index {
267 |             fasta,
268 |             gtf,
269 |             rlen,
270 |             output,
271 |             spliced,
272 |             unspliced,
273 |             dedup,
274 |             sparse,
275 |             threads,
276 |         } => {
277 |             // Open the file in read-only mode with buffer.
278 |             let af_info_p = af_home_path.join("simpleaf_info.json");
279 |             let simpleaf_info_file = std::fs::File::open(&af_info_p).with_context({
280 |                 ||
281 |                 format!("Could not open file {}; please run the set-paths command before using `index` or `quant`", af_info_p.display())
282 |             })?;
283 | 
284 |             let simpleaf_info_reader = BufReader::new(simpleaf_info_file);
285 | 
286 |             // Read back the program information recorded by `set-paths`.
287 |             let v: serde_json::Value = serde_json::from_reader(simpleaf_info_reader)?;
288 |             let rp: ReqProgs = serde_json::from_value(v["prog_info"].clone())?;
289 | 
290 |             // the splici file names embed `rlen - 5`; reject lengths that would underflow
291 |             let splici_fl = match rlen.checked_sub(5) {
292 |                 Some(fl) => fl,
293 |                 None => bail!("the read length must be at least 5, but {} was given", rlen),
294 |             };
295 | 
296 |             run_fun!(mkdir -p $output)?;
297 |             let ref_file = format!("splici_fl{}.fa", splici_fl);
298 | 
299 |             let outref = output.join("ref");
300 |             run_fun!(mkdir -p $outref)?;
301 | 
302 |             let t2g_file = outref.join(format!("splici_fl{}_t2g_3col.tsv", splici_fl));
303 |             let info_file = output.join("index_info.json");
304 |             let index_info = json!({
305 |                 "command" : "index",
306 |                 "version_info" : rp,
307 |                 "t2g_file" : t2g_file,
308 |                 "args" : {
309 |                     "fasta" : fasta,
310 |                     "gtf" : gtf,
311 |                     "rlen" : rlen,
312 |                     "output" : output,
313 |                     "spliced" : spliced,
314 |                     "unspliced" : unspliced,
315 |                     "dedup" : dedup,
316 |                     "sparse" : sparse,
317 |                     "threads" : threads
318 |                 }
319 |             });
320 | 
321 |             std::fs::write(
322 |                 &info_file,
323 |                 serde_json::to_string_pretty(&index_info).unwrap(),
324 |             )
325 |             .with_context(|| format!("could not write {}", info_file.display()))?;
326 | 
327 |             // `set-paths` only writes complete entries; a missing one means the file was altered
328 |             let pyroe_prog = rp.pyroe.context("pyroe is missing from simpleaf_info.json")?;
329 |             let salmon_prog = rp.salmon.context("salmon is missing from simpleaf_info.json")?;
330 | 
331 |             let mut cmd = std::process::Command::new(&pyroe_prog.exe_path);
332 |             // we will run the make-splici command
333 |             cmd.arg("make-splici");
334 | 
335 |             // if the user wants to dedup output sequences
336 |             if dedup {
337 |                 cmd.arg("--dedup-seqs");
338 |             }
339 | 
340 |             // extra spliced sequence
341 |             if let Some(es) = spliced {
342 |                 cmd.arg("--extra-spliced").arg(es);
343 |             }
344 | 
345 |             // extra unspliced sequence
346 |             if let Some(eu) = unspliced {
347 |                 cmd.arg("--extra-unspliced").arg(eu);
348 |             }
349 | 
350 |             cmd.arg(fasta).arg(gtf).arg(rlen.to_string()).arg(&outref);
351 | 
352 |             let pyroe_start = Instant::now();
353 |             let cres = cmd.output()?;
354 |             let pyroe_duration = pyroe_start.elapsed();
355 | 
356 |             if !cres.status.success() {
357 |                 bail!("pyroe failed to return succesfully {:?}", cres.status);
358 |             }
359 | 
360 |             let mut salmon_index_cmd = std::process::Command::new(&salmon_prog.exe_path);
361 |             let ref_seq = outref.join(ref_file);
362 | 
363 |             let output_index_dir = output.join("index");
364 |             salmon_index_cmd
365 |                 .arg("index")
366 |                 .arg("-i")
367 |                 .arg(&output_index_dir)
368 |                 .arg("-t")
369 |                 .arg(ref_seq);
370 | 
371 |             // if the user requested a sparse index.
372 |             if sparse {
373 |                 salmon_index_cmd.arg("--sparse");
374 |             }
375 | 
376 |             // if the user requested more threads than can be used
377 |             let threads = cap_threads(threads);
378 |             salmon_index_cmd.arg("--threads").arg(threads.to_string());
379 | 
380 |             let index_start = Instant::now();
381 |             let index_out = salmon_index_cmd
382 |                 .output()
383 |                 .context("failed to run salmon index")?;
384 |             let index_duration = index_start.elapsed();
385 | 
386 |             // a failed build must stop here rather than surface later as a missing file
387 |             if !index_out.status.success() {
388 |                 bail!("salmon index failed with exit status {:?}", index_out.status);
389 |             }
390 | 
391 |             // copy over the t2g file to the index
392 |             let index_t2g_path = output_index_dir.join("t2g_3col.tsv");
393 |             std::fs::copy(t2g_file, index_t2g_path)?;
394 | 
395 |             let index_log_file = output.join("simpleaf_index_log.json");
396 |             let index_log_info = json!({
397 |                 "time_info" : {
398 |                     "pyroe_time" : pyroe_duration,
399 |                     "index_time" : index_duration
400 |                 }
401 |             });
402 | 
403 |             std::fs::write(
404 |                 &index_log_file,
405 |                 serde_json::to_string_pretty(&index_log_info).unwrap(),
406 |             )
407 |             .with_context(|| format!("could not write {}", index_log_file.display()))?;
408 |         }
409 |         Commands::Quant {
410 |             index,
411 |             reads1,
412 |             reads2,
413 |             threads,
414 |             knee,
415 |             unfiltered_pl,
416 |             explicit_pl,
417 |             forced_cells,
418 |             expect_cells,
419 |             resolution,
420 |             t2g_map,
421 |             chemistry,
422 |             output,
423 |         } => {
424 |             // Open the file in read-only mode with buffer.
425 |             let af_info_p = af_home_path.join("simpleaf_info.json");
426 |             let simpleaf_info_file = std::fs::File::open(&af_info_p).with_context({
427 |                 ||
428 |                 format!("Could not open file {}; please run the set-paths command before using `index` or `quant`", af_info_p.display())
429 |             })?;
430 | 
431 |             let simpleaf_info_reader = BufReader::new(&simpleaf_info_file);
432 | 
433 |             // Read back the program information recorded by `set-paths`.
434 |             info!("deserializing from {:?}", simpleaf_info_file);
435 |             let v: serde_json::Value = serde_json::from_reader(simpleaf_info_reader)?;
436 |             let rp: ReqProgs = serde_json::from_value(v["prog_info"].clone())?;
437 | 
438 |             info!("prog info = {:?}", rp);
439 | 
440 |             // resolve both programs up front so a broken info file fails before any work
441 |             let salmon_prog = rp.salmon.context("salmon is missing from simpleaf_info.json")?;
442 |             let alevin_fry_prog = rp.alevin_fry.context("alevin-fry is missing from simpleaf_info.json")?;
443 |             // never ask the tools for more threads than the machine offers
444 |             let threads = cap_threads(threads);
445 | 
446 |             let mut filter_meth_opt = None;
447 |             let chem = match chemistry.as_str() {
448 |                 "10xv2" => Chemistry::TenxV2,
449 |                 "10xv3" => Chemistry::TenxV3,
450 |                 s => Chemistry::Other(s.to_string()),
451 |             };
452 | 
453 |             // based on the filtering method
454 |             if unfiltered_pl {
455 |                 // check the chemistry
456 |                 let pl_res = get_permit_if_absent(chem)?;
457 |                 let min_cells = 10usize;
458 |                 match pl_res {
459 |                     PermitListResult::DownloadSuccessful(p)
460 |                     | PermitListResult::AlreadyPresent(p) => {
461 |                         filter_meth_opt = Some(CellFilterMethod::UnfilteredExternalList(
462 |                             p.to_string_lossy().into_owned(),
463 |                             min_cells,
464 |                         ));
465 |                     }
466 |                     PermitListResult::UnregisteredChemistry => {
467 |                         bail!(
468 |                             "Cannot use unrecognized chemistry {} with unfiltered permit list.",
469 |                             chemistry.as_str()
470 |                         );
471 |                     }
472 |                 }
473 |             } else {
474 |                 // later options take precedence over earlier ones
475 |                 if let Some(filtered_path) = explicit_pl {
476 |                     filter_meth_opt = Some(CellFilterMethod::ExplicitList(
477 |                         filtered_path.to_string_lossy().into_owned(),
478 |                     ));
479 |                 }
480 |                 if let Some(num_forced) = forced_cells {
481 |                     filter_meth_opt = Some(CellFilterMethod::ForceCells(num_forced));
482 |                 }
483 |                 if let Some(num_expected) = expect_cells {
484 |                     filter_meth_opt = Some(CellFilterMethod::ExpectCells(num_expected));
485 |                 }
486 |             }
487 |             // knee filtering, when requested, overrides any other choice
488 |             if knee {
489 |                 filter_meth_opt = Some(CellFilterMethod::KneeFinding);
490 |             }
491 | 
492 |             let filter_meth =
493 |                 filter_meth_opt.context("It seems no valid filtering strategy was provided!")?;
494 | 
495 |             let mut salmon_quant_cmd = std::process::Command::new(&salmon_prog.exe_path);
496 | 
497 |             // set the input index and library type
498 |             salmon_quant_cmd
499 |                 .arg("alevin")
500 |                 .arg("--index")
501 |                 .arg(&index)
502 |                 .arg("-l")
503 |                 .arg("A");
504 | 
505 |             // location of the reads
506 |             let r1_str = reads1
507 |                 .iter()
508 |                 .map(|x| format!("{}", x.display()))
509 |                 .collect::<Vec<String>>()
510 |                 .join(",");
511 |             let r2_str = reads2
512 |                 .iter()
513 |                 .map(|x| format!("{}", x.display()))
514 |                 .collect::<Vec<String>>()
515 |                 .join(",");
516 |             salmon_quant_cmd.arg("-1").arg(r1_str).arg("-2").arg(r2_str);
517 | 
518 |             // location of output directory, number of threads
519 |             let map_output = output.join("af_map");
520 |             salmon_quant_cmd
521 |                 .arg("--threads")
522 |                 .arg(threads.to_string())
523 |                 .arg("-o")
524 |                 .arg(&map_output);
525 |             salmon_quant_cmd.arg("--sketch");
526 | 
527 |             // setting the technology / chemistry
528 |             match chemistry.as_str() {
529 |                 "10xv2" => {
530 |                     salmon_quant_cmd.arg("--chromium");
531 |                 }
532 |                 "10xv3" => {
533 |                     salmon_quant_cmd.arg("--chromiumV3");
534 |                 }
535 |                 s => {
536 |                     salmon_quant_cmd.arg(format!("--{}", s));
537 |                 }
538 |             };
539 | 
540 |             info!("cmd : {:?}", salmon_quant_cmd);
541 |             let map_start = Instant::now();
542 |             let map_proc_out = salmon_quant_cmd
543 |                 .output()
544 |                 .context("failed to execute salmon alevin [mapping phase]")?;
545 |             let map_duration = map_start.elapsed();
546 | 
547 |             if !map_proc_out.status.success() {
548 |                 bail!("mapping failed with exit status {:?}", map_proc_out.status);
549 |             }
550 | 
551 |             // alevin-fry generate permit list
552 |             let mut alevin_gpl_cmd = std::process::Command::new(&alevin_fry_prog.exe_path);
553 | 
554 |             alevin_gpl_cmd.arg("generate-permit-list");
555 |             alevin_gpl_cmd.arg("-i").arg(&map_output);
556 |             alevin_gpl_cmd.arg("-d").arg("fw");
557 | 
558 |             // add the filter mode
559 |             add_to_args(&filter_meth, &mut alevin_gpl_cmd);
560 | 
561 |             let gpl_output = output.join("af_quant");
562 |             alevin_gpl_cmd.arg("-o").arg(&gpl_output);
563 | 
564 |             info!("cmd : {:?}", alevin_gpl_cmd);
565 | 
566 |             let gpl_start = Instant::now();
567 |             let gpl_proc_out = alevin_gpl_cmd
568 |                 .output()
569 |                 .context("could not execute [generate permit list]")?;
570 |             let gpl_duration = gpl_start.elapsed();
571 | 
572 |             if !gpl_proc_out.status.success() {
573 |                 bail!(
574 |                     "generate-permit-list failed with exit status {:?}",
575 |                     gpl_proc_out.status
576 |                 );
577 |             }
578 | 
579 |             //
580 |             // collate
581 |             //
582 |             let mut alevin_collate_cmd = std::process::Command::new(&alevin_fry_prog.exe_path);
583 | 
584 |             alevin_collate_cmd.arg("collate");
585 |             alevin_collate_cmd.arg("-i").arg(&gpl_output);
586 |             alevin_collate_cmd.arg("-r").arg(&map_output);
587 |             alevin_collate_cmd.arg("-t").arg(threads.to_string());
588 | 
589 |             info!("cmd : {:?}", alevin_collate_cmd);
590 |             let collate_start = Instant::now();
591 |             let collate_proc_out = alevin_collate_cmd
592 |                 .output()
593 |                 .context("could not execute [collate]")?;
594 |             let collate_duration = collate_start.elapsed();
595 | 
596 |             if !collate_proc_out.status.success() {
597 |                 bail!(
598 |                     "collate failed with exit status {:?}",
599 |                     collate_proc_out.status
600 |                 );
601 |             }
602 | 
603 |             //
604 |             // quant
605 |             //
606 |             let mut alevin_quant_cmd = std::process::Command::new(&alevin_fry_prog.exe_path);
607 | 
608 |             alevin_quant_cmd
609 |                 .arg("quant")
610 |                 .arg("-i")
611 |                 .arg(&gpl_output)
612 |                 .arg("-o")
613 |                 .arg(&gpl_output);
614 |             alevin_quant_cmd.arg("-t").arg(threads.to_string());
615 |             alevin_quant_cmd.arg("-m").arg(t2g_map);
616 |             alevin_quant_cmd.arg("-r").arg(resolution);
617 | 
618 |             info!("cmd : {:?}", alevin_quant_cmd);
619 |             let quant_start = Instant::now();
620 |             let quant_proc_out = alevin_quant_cmd
621 |                 .output()
622 |                 .context("could not execute [quant]")?;
623 |             let quant_duration = quant_start.elapsed();
624 | 
625 |             if !quant_proc_out.status.success() {
626 |                 bail!("quant failed with exit status {:?}", quant_proc_out.status);
627 |             }
628 | 
629 |             let af_quant_info_file = output.join("simpleaf_quant_log.json");
630 |             let af_quant_info = json!({
631 |                 "time_info" : {
632 |                 "map_time" : map_duration,
633 |                 "gpl_time" : gpl_duration,
634 |                 "collate_time" : collate_duration,
635 |                 "quant_time" : quant_duration
636 |                 }
637 |             });
638 | 
639 |             std::fs::write(
640 |                 &af_quant_info_file,
641 |                 serde_json::to_string_pretty(&af_quant_info).unwrap(),
642 |             )
643 |             .with_context(|| format!("could not write {}", af_quant_info_file.display()))?;
644 |         }
645 |     }
646 |     Ok(())
647 | }
648 | 


--------------------------------------------------------------------------------
/simpleaf/src/utils/af_utils.rs:
--------------------------------------------------------------------------------
 1 | #[derive(Debug, Clone)]
 2 | pub enum CellFilterMethod { // filtering strategy; `add_to_args` turns it into command-line arguments
 3 |     /// cut off at this cell in
 4 |     /// the frequency sorted list
 5 |     ForceCells(usize),
 6 |     /// use this cell as a hint in
 7 |     /// the frequency sorted list
 8 |     ExpectCells(usize),
 9 |     /// correct all cells in an
10 |     /// edit distance of 1 of these
11 |     /// barcodes (the string is the path of the list)
12 |     ExplicitList(String),
13 |     /// barcodes will be provided in the
14 |     /// form of an *unfiltered* external permit list at the given
15 |     /// path; the count is the value passed as `--min-reads`
16 |     UnfilteredExternalList(String, usize),
17 |     /// use the distance method to
18 |     /// automatically find the knee
19 |     /// in the curve
20 |     KneeFinding,
21 | }
22 | 
23 | /// Appends to `cmd` the alevin-fry `generate-permit-list` options that select `fm`.
24 | ///
25 | /// Each variant maps to its own full-length option name. `ExpectCells` previously
26 | /// emitted the same flag as `ForceCells`, so the two modes were indistinguishable
27 | /// to alevin-fry, and the abbreviated `--force` / `--knee` spellings only worked if
28 | /// the program accepted option-name prefixes.
29 | ///
30 | /// The list paths and counts carried by `fm` are passed through unchanged, each as
31 | /// the value following its option.
32 | pub fn add_to_args(fm: &CellFilterMethod, cmd: &mut std::process::Command) {
33 |     match fm {
34 |         CellFilterMethod::ForceCells(nc) => cmd.arg("--force-cells").arg(nc.to_string()),
35 |         CellFilterMethod::ExpectCells(nc) => cmd.arg("--expect-cells").arg(nc.to_string()),
36 |         CellFilterMethod::ExplicitList(l) => cmd.arg("--valid-bc").arg(l),
37 |         CellFilterMethod::UnfilteredExternalList(l, m) => cmd
38 |             .arg("--unfiltered-pl")
39 |             .arg(l)
40 |             .arg("--min-reads")
41 |             .arg(m.to_string()),
42 |         CellFilterMethod::KneeFinding => cmd.arg("--knee-distance"),
43 |     };
44 | }
45 | 


--------------------------------------------------------------------------------
/simpleaf/src/utils/prog_utils.rs:
--------------------------------------------------------------------------------
  1 | use anyhow::{anyhow, Result};
  2 | use cmd_lib::run_fun;
  3 | use semver::{Version, VersionReq};
  4 | use serde::{Deserialize, Serialize};
  5 | use std::env;
  6 | use std::path::PathBuf;
  7 | use which::which;
  8 | 
  9 | /// Location and version string of one external program.
 10 | #[derive(Debug, Serialize, Deserialize, Clone)]
 11 | pub struct ProgInfo {
 12 |     pub exe_path: PathBuf,
 13 |     pub version: String,
 14 | }
 15 | 
 16 | impl Default for ProgInfo {
 17 |     /// An empty path paired with the placeholder version "0.0.0".
 18 |     fn default() -> Self {
 19 |         let version = "0.0.0".to_owned();
 20 |         Self { exe_path: PathBuf::new(), version }
 21 |     }
 22 | }
 23 | 
 24 | /// Holds the paths (and versions) of
 25 | /// the programs we'll need to run
 26 | /// the tool.
 27 | #[derive(Debug, Serialize, Deserialize)]
 28 | pub struct ReqProgs {
 29 |     pub salmon: Option<ProgInfo>, // salmon executable and version, when known
 30 |     pub alevin_fry: Option<ProgInfo>, // alevin-fry executable and version, when known
 31 |     pub pyroe: Option<ProgInfo>, // pyroe executable and version, when known
 32 | }
 33 | 
 34 | /// Checks that the version a program reports satisfies the requirement `req_string`.
 35 | ///
 36 | /// The last whitespace-separated token of `prog_output` is taken as the version. A failed
 37 | /// run, empty output, or a version/requirement that is not valid semver yields an error
 38 | /// instead of a panic, as does a version that falls outside the requirement.
 39 | pub fn check_version_constraints<S1: AsRef<str>>(
 40 |     req_string: S1,
 41 |     prog_output: std::result::Result<String, std::io::Error>,
 42 | ) -> Result<Version> {
 43 |     let vs = prog_output.map_err(|e| anyhow!("could not run the program: {}", e))?;
 44 |     let version = vs
 45 |         .split_whitespace()
 46 |         .last()
 47 |         .ok_or_else(|| anyhow!("invalid version string"))?;
 48 |     let parsed_version = Version::parse(version)
 49 |         .map_err(|e| anyhow!("could not parse version {:?}: {}", version, e))?;
 50 |     let req = VersionReq::parse(req_string.as_ref())
 51 |         .map_err(|e| anyhow!("invalid version requirement {:?}: {}", req_string.as_ref(), e))?;
 52 |     if req.matches(&parsed_version) {
 53 |         Ok(parsed_version)
 54 |     } else {
 55 |         Err(anyhow!(
 56 |             "parsed version {:?} does not satisfy constraints {:?}",
 57 |             version,
 58 |             req
 59 |         ))
 60 |     }
 61 | }
 62 | 
 63 | /// Looks up `prog_name` on the `PATH` and returns where it was found.
 64 | ///
 65 | /// On success the discovered location is also printed to stdout, so the
 66 | /// user can see which executable was picked up.
 67 | ///
 68 | /// # Errors
 69 | /// If the lookup fails, the error names the program and includes the reason
 70 | /// reported by `which`.
 71 | pub fn get_which_executable(prog_name: &str) -> Result<PathBuf> {
 72 |     let located = which(prog_name)
 73 |         .map_err(|e| anyhow!("could not find `{}` in your path: {}", prog_name, e))?;
 74 | 
 75 |     println!("found `{}` in the PATH at {}", prog_name, located.display());
 76 |     Ok(located)
 77 | }
 78 | 
 79 | /// Finds an executable via an environment variable, falling back to the `PATH`.
 80 | /// When `env_key` can be read its value is returned as a path without further checks;
 81 | /// otherwise a note is written to stderr and `prog_name` is looked up on the `PATH`.
 82 | #[allow(dead_code)]
 83 | pub fn search_for_executable(env_key: &str, prog_name: &str) -> Result<PathBuf> {
 84 |     env::var(env_key).map(PathBuf::from).or_else(|e| {
 85 |         eprintln!("${} is unset {}, trying default path.", env_key, e);
 86 |         eprintln!(
 87 |             "If a satisfactory version is not found, consider setting the ${} variable.",
 88 |             env_key
 89 |         );
 90 |         get_which_executable(prog_name)
 91 |     })
 92 | }
 93 | 
 94 | /// Resolves salmon, alevin-fry and pyroe and verifies the version each one reports.
 95 | ///
 96 | /// A path supplied by the caller is used as given; a program passed as `None` is
 97 | /// looked up on the `PATH`. Each executable is then run with `--version` and the
 98 | /// reported version is checked against the range accepted here:
 99 | ///
100 | /// * salmon: `>=1.5.1, <2.0.0`
101 | /// * alevin-fry: `>=0.4.1, <1.0.0`
102 | /// * pyroe: `>=0.6.2, <1.0.0`
103 | ///
104 | /// On success all three fields of the returned `ReqProgs` are `Some`.
105 | ///
106 | /// # Errors
107 | /// The first failed lookup or version check is returned as the error.
108 | pub fn get_required_progs_from_paths(
109 |     salmon_exe: Option<PathBuf>,
110 |     alevin_fry_exe: Option<PathBuf>,
111 |     pyroe_exe: Option<PathBuf>,
112 | ) -> Result<ReqProgs> {
113 |     // use the given path if we have it, otherwise check `which`
114 |     let salmon = match salmon_exe {
115 |         Some(p) => p,
116 |         None => get_which_executable("salmon")?,
117 |     };
118 |     let alevin_fry = match alevin_fry_exe {
119 |         Some(p) => p,
120 |         None => get_which_executable("alevin-fry")?,
121 |     };
122 |     let pyroe = match pyroe_exe {
123 |         Some(p) => p,
124 |         None => get_which_executable("pyroe")?,
125 |     };
126 | 
127 |     // salmon
128 |     let salmon_cmd = salmon.display().to_string();
129 |     let salmon_out = run_fun!($salmon_cmd --version);
130 |     let salmon_version = check_version_constraints(">=1.5.1, <2.0.0", salmon_out)?;
131 |     let salmon_info = ProgInfo {
132 |         exe_path: salmon,
133 |         version: salmon_version.to_string(),
134 |     };
135 | 
136 |     // alevin-fry
137 |     let alevin_fry_cmd = alevin_fry.display().to_string();
138 |     let alevin_fry_out = run_fun!($alevin_fry_cmd --version);
139 |     let alevin_fry_version = check_version_constraints(">=0.4.1, <1.0.0", alevin_fry_out)?;
140 |     let alevin_fry_info = ProgInfo {
141 |         exe_path: alevin_fry,
142 |         version: alevin_fry_version.to_string(),
143 |     };
144 | 
145 |     // pyroe
146 |     let pyroe_cmd = pyroe.display().to_string();
147 |     let pyroe_out = run_fun!($pyroe_cmd --version);
148 |     let pyroe_version = check_version_constraints(">=0.6.2, <1.0.0", pyroe_out)?;
149 |     let pyroe_info = ProgInfo {
150 |         exe_path: pyroe,
151 |         version: pyroe_version.to_string(),
152 |     };
153 | 
154 |     // every check passed, so all three entries are present
155 |     Ok(ReqProgs {
156 |         salmon: Some(salmon_info),
157 |         alevin_fry: Some(alevin_fry_info),
158 |         pyroe: Some(pyroe_info),
159 |     })
160 | }
161 | 
162 | /// Resolves the three programs from the `SALMON`, `ALEVIN_FRY` and `PYROE` environment
163 | /// variables, falling back to the `PATH`, and then validates their versions.
164 | #[allow(dead_code)]
165 | pub fn get_required_progs() -> Result<ReqProgs> {
166 |     let salmon = search_for_executable("SALMON", "salmon")?;
167 |     let alevin_fry = search_for_executable("ALEVIN_FRY", "alevin-fry")?;
168 |     let pyroe = search_for_executable("PYROE", "pyroe")?;
169 | 
170 |     get_required_progs_from_paths(Some(salmon), Some(alevin_fry), Some(pyroe))
171 | }
172 | 


--------------------------------------------------------------------------------
/simpleaf_conda_env.yml:
--------------------------------------------------------------------------------
 1 | name: simpleaf
 2 | 
 3 | channels:
 4 |   - conda-forge
 5 |   - bioconda
 6 |   - defaults
 7 | 
 8 | dependencies:
 9 |   # - python>=3.9
10 |   - bioconda::salmon>=1.8.0
11 |   - bioconda::alevin-fry>=0.6.0
12 |   - bioconda::pyroe>=0.6.2
13 |   - bioconda::bedtools>=2.30.0
14 |   - bioconda::gffread>=0.12.7
15 |   - conda-forge::r-essentials>=4.1
16 |   - conda-forge::r-devtools
17 |   - conda-forge::r-argparser
18 |   - conda-forge::r-biocmanager
19 |   - bioconductor-genomicfeatures
20 |   - bioconductor-eisaR
21 |   - bioconductor-bsgenome
22 |   - bioconductor-fishpond
23 | 


--------------------------------------------------------------------------------