├── .gitignore ├── LICENSE ├── README.md ├── benchmarks ├── data │ └── scripts │ │ ├── generate.sh │ │ └── makeLoom.py ├── memory.jpg ├── parsers │ ├── readCsv.R │ ├── readEds.R │ ├── readH5.R │ ├── readLoom.R │ └── readMtx.R ├── profile.sh ├── size.jpg ├── stats │ ├── ltime.tex │ ├── memory.tex │ └── size.tex └── time.jpg ├── eds.jpg ├── src-cpp └── readEDS.cpp └── src-rs ├── Cargo.toml └── src ├── csv.rs ├── eds.rs ├── h5.rs ├── main.rs ├── mtx.rs └── utils.rs /.gitignore: -------------------------------------------------------------------------------- 1 | # Generated by Cargo 2 | # will have compiled files and executables 3 | /target/ 4 | 5 | # Remove Cargo.lock from gitignore if creating an executable, leave it for libraries 6 | # More information here https://doc.rust-lang.org/cargo/guide/cargo-toml-vs-cargo-lock.html 7 | Cargo.lock 8 | 9 | # These are backup files generated by rustfmt 10 | **/*.rs.bk 11 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2019, COMBINE-lab 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | 1. Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | 2. Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | 3. Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 
19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## What's EDS ? 2 | EDS is an acronym for Efficient single cell binary Data Storage format for the cell-feature count matrices. 3 | 4 | ![EDS](https://github.com/COMBINE-lab/EDS/blob/master/eds.jpg) 5 | 6 | ## Why we need a new storage format ? 7 | Recent advancements in single-cell technologies have seen a rapid increase in the amount of data. Most single-cell studies generate cell-by-feature (e.g., gene) count matrices, where the number of cells is now reaching into the millions. Traditional Single-cell quantification pipelines use matrix market exchange (mtx) format (sometimes gzipped) for sharing the count matrices. However, the textual representation of mtx format makes it bigger in size compared to a compressed binary format. Our quantification tool [alevin](https://combine-lab.github.io/alevin-tutorial/) dumps the output in EDS format which saves storage space. 8 | 9 | 10 | ## What are the caveats ?
11 | There are other formats (such as [loom](https://github.com/linnarsson-lab/loompy)) which are designed for optimizing the query of the matrix. EDS is primarily designed to improve the storage efficiency rather than query and currently doesn't support random access to a cell (row). 12 | 13 | ## How to convert eds to mtx format ? 14 | We have a simple Rust program inside `src-rs`; it can be built using `cargo build --release` and can be used as `./target/release/eds convert -i --[mtx | eds | h5 | csv] -c -f `. 15 | 16 | ## Benchmarks 17 | * Size on disk. 18 | ![Disk Space](https://github.com/COMBINE-lab/EDS/blob/master/benchmarks/size.jpg) 19 | 20 | * Matrix loading into memory time. 21 | ![Loading time](https://github.com/COMBINE-lab/EDS/blob/master/benchmarks/time.jpg) 22 | 23 | * Memory required to load the matrix. 24 | ![Memory Usage](https://github.com/COMBINE-lab/EDS/blob/master/benchmarks/memory.jpg) 25 | 26 | ## Future 27 | - [ ] Support delayedArray R object 28 | - [ ] Random access through `EDS index` 29 | 30 | ## Contributors 31 | - Avi Srivastava 32 | - Mike Love 33 | - Rob Patro 34 | -------------------------------------------------------------------------------- /benchmarks/data/scripts/generate.sh: -------------------------------------------------------------------------------- 1 | name=$1 2 | cells=$2 3 | feats=$3 4 | 5 | bin="/mnt/scratch1/avi/anton/alevin_r/EDS/src-rs/target/release/eds" 6 | dpath="/mnt/scratch1/avi/anton/alevin_r/EDS/benchmarks/data/"$name"/quants_mat.eds.gz" 7 | RUST_BACKTRACE=1 RUST_LOG=trace $bin convert -i $dpath --csv -c $cells -f $feats 8 | RUST_BACKTRACE=1 RUST_LOG=trace $bin convert -i $dpath --mtx -c $cells -f $feats 9 | RUST_BACKTRACE=1 RUST_LOG=trace $bin convert -i $dpath --h5 -c $cells -f $feats 10 | -------------------------------------------------------------------------------- /benchmarks/data/scripts/makeLoom.py: -------------------------------------------------------------------------------- 1 | import gzip
2 | import loompy 3 | import numpy as np 4 | import sys 5 | from scipy.io import mmread 6 | 7 | data = sys.argv[1] 8 | mtx_file = "/mnt/scratch1/avi/anton/alevin_r/EDS/benchmarks/data/" + data + "/quants_mat.mtx.gz" 9 | out_file = "/mnt/scratch1/avi/anton/alevin_r/EDS/benchmarks/data/" + data + "/quants_mat.loom" 10 | 11 | data = mmread( gzip.open(mtx_file) ) 12 | (cells, feats) = data.shape 13 | 14 | data = data.T 15 | 16 | row_names = {} 17 | row_names['rname'] = np.array(range(feats)) 18 | 19 | col_names = {} 20 | col_names['cname'] = np.array( range(cells) ) 21 | 22 | loompy.create(out_file, data, row_names, col_names) 23 | -------------------------------------------------------------------------------- /benchmarks/memory.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/COMBINE-lab/EDS/a66a0b4ec96c25ea6f2b25f1566e6c089aff6cda/benchmarks/memory.jpg -------------------------------------------------------------------------------- /benchmarks/parsers/readCsv.R: -------------------------------------------------------------------------------- 1 | args = commandArgs(trailingOnly=TRUE) 2 | 3 | data <- args[1] 4 | fpath <- paste0("/mnt/scratch1/avi/anton/alevin_r/EDS/benchmarks/data/", data, "/quants_mat.csv.gz") 5 | 6 | system.time({ 7 | csv <- read.table( gzfile( fpath ), sep="," ) 8 | }) 9 | 10 | print(dim(csv)) 11 | -------------------------------------------------------------------------------- /benchmarks/parsers/readEds.R: -------------------------------------------------------------------------------- 1 | library(Rcpp) 2 | library(Matrix) 3 | 4 | args = commandArgs(trailingOnly=TRUE) 5 | sourceCpp("/mnt/scratch1/avi/anton/alevin_r/EDS/src-cpp/readEDS.cpp") 6 | 7 | data <- args[1] 8 | num.cells <- as.integer(args[2]) 9 | num.genes <- as.integer(args[3]) 10 | fpath <- paste0("/mnt/scratch1/avi/anton/alevin_r/EDS/benchmarks/data/", data, "/quants_mat.eds.gz") 11 | 12 | system.time({ 13 | pos <- 
getSparseMatrix( num.genes, num.cells, fpath ) 14 | }) 15 | 16 | str(pos) 17 | -------------------------------------------------------------------------------- /benchmarks/parsers/readH5.R: -------------------------------------------------------------------------------- 1 | library(hdf5r) 2 | library(Matrix) 3 | 4 | args = commandArgs(trailingOnly=TRUE) 5 | data <- args[1] 6 | fpath <- paste0("/mnt/scratch1/avi/anton/alevin_r/EDS/benchmarks/data/", data, "/quants_mat.h5") 7 | 8 | system.time({ 9 | infile <- hdf5r::H5File$new(filename = fpath, mode = 'r') 10 | genome <- "/matrix" 11 | 12 | counts <- infile[[paste0(genome, '/data')]] 13 | indices <- infile[[paste0(genome, '/indices')]] 14 | indptr <- infile[[paste0(genome, '/indptr')]] 15 | shp <- infile[[paste0(genome, '/shape')]] 16 | 17 | sparse.mat <- sparseMatrix( 18 | i = indices[] + 1, 19 | p = indptr[], 20 | x = as.numeric(x = counts[]), 21 | dims = shp[], 22 | giveCsparse = TRUE 23 | ) 24 | }) 25 | 26 | str(sparse.mat) 27 | -------------------------------------------------------------------------------- /benchmarks/parsers/readLoom.R: -------------------------------------------------------------------------------- 1 | library(loomR) 2 | 3 | args = commandArgs(trailingOnly=TRUE) 4 | # inparts taked from https://satijalab.org/loomR/loomR_tutorial.html 5 | data <- args[1] 6 | fpath <- paste0("/mnt/scratch1/avi/anton/alevin_r/EDS/benchmarks/data/", data, "/quants_mat.loom") 7 | 8 | system.time({ 9 | lfile <- connect(filename = fpath, mode = "r+") 10 | full.matrix <- lfile$matrix[, ] 11 | }) 12 | 13 | dim( full.matrix ) 14 | -------------------------------------------------------------------------------- /benchmarks/parsers/readMtx.R: -------------------------------------------------------------------------------- 1 | library(Matrix) 2 | 3 | args = commandArgs(trailingOnly=TRUE) 4 | data <- args[1] 5 | fpath <- paste0("/mnt/scratch1/avi/anton/alevin_r/EDS/benchmarks/data/", data, "/quants_mat.mtx.gz") 6 | 7 | 
system.time({ 8 | mm <- readMM( gzfile( fpath ) ) 9 | }) 10 | 11 | print(dim(mm)) 12 | -------------------------------------------------------------------------------- /benchmarks/profile.sh: -------------------------------------------------------------------------------- 1 | datas=("neurons_450k_random" "pbmc_40k_random" "neurons_1m" "neurons_900" "neurons_2k" "pbmc_4k" "pbmc_8k" "neurons_9k" "pbmc_40k" "neurons_450k") 2 | cells=(456400 43400 1000000 931 2022 4340 8381 9128 43400 456400) 3 | feats=(50686 58278 27998 50686 50686 58278 58278 50686 58278 50686) 4 | 5 | for id in {0..0}; do 6 | data=${datas[$id]} 7 | cell=${cells[$id]} 8 | feat=${feats[$id]} 9 | 10 | echo $data $cell $feat 11 | echo "EDS" 12 | /usr/bin/time Rscript --vanilla parsers/readEds.R $data $cell $feat && 13 | 14 | echo "H5" && 15 | /usr/bin/time Rscript --vanilla parsers/readH5.R $data && 16 | 17 | echo "Mtx" && 18 | /usr/bin/time Rscript --vanilla parsers/readMtx.R $data && 19 | 20 | echo "loom" && 21 | /usr/bin/time Rscript --vanilla parsers/readLoom.R $data && 22 | 23 | echo "CSV" 24 | /usr/bin/time Rscript --vanilla parsers/readCsv.R $data 25 | done 26 | 27 | -------------------------------------------------------------------------------- /benchmarks/size.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/COMBINE-lab/EDS/a66a0b4ec96c25ea6f2b25f1566e6c089aff6cda/benchmarks/size.jpg -------------------------------------------------------------------------------- /benchmarks/stats/ltime.tex: -------------------------------------------------------------------------------- 1 | \begin{center} 2 | \begin{tabular}{||c || c | c | c | c | c | c | c | c | c | c ||} 3 | \hline 4 | \vtop{\hbox{\strut Loading}\hbox{\strut (Second)}} & \vtop{\hbox{\strut neurons}\hbox{\strut 900}} & \vtop{\hbox{\strut neurons}\hbox{\strut 2k}} & \vtop{\hbox{\strut pbmc}\hbox{\strut 4k}} & \vtop{\hbox{\strut pbmc}\hbox{\strut 8k}} & \vtop{\hbox{\strut 
neurons}\hbox{\strut 9k}} & \vtop{\hbox{\strut pbmc}\hbox{\strut 4k*10}} & \vtop{\hbox{\strut rand}\hbox{\strut 4k*10}} & \vtop{\hbox{\strut neurons}\hbox{\strut 9k*50}} & \vtop{\hbox{\strut rand}\hbox{\strut 9k*50}} & \vtop{\hbox{\strut neurons}\hbox{\strut 1M}} \\ [0.5ex] 5 | \hline\hline 6 | eds.gz & 0.28 & 0.86 & 1.49 & 2.30 & 3.24 & 10.04 & 9.93 & 119.61 & 112.88 & 159.77 \\ 7 | \hline 8 | csv.gz & 28.92 & 40.15 & 75.30 & 134.44 & 114.96 & 680.1 & 439.47 & 6768.7 & 6524.1 & N/A \\ 9 | \hline 10 | h5 & 0.83 & 1.56 & 1.65 & 3.88 & 5.72 & 10.11 & 10.20 & 276.36 & 338.91 & 705.56 \\ 11 | \hline 12 | mtx.gz & 1.93 & 2.37 & 3.53 & 6.87 & 12.45 & 26.84 & 23.60 & 536.91 & 489.76 & 904.35 \\ 13 | \hline 14 | loom & 0.98 & 1.82 & 4.08 & 7.67 & 8.05 & 44.82 & 36.92 & 400.99 & 378.02 & N/A \\ [1ex] 15 | \hline 16 | \end{tabular} 17 | \end{center} 18 | -------------------------------------------------------------------------------- /benchmarks/stats/memory.tex: -------------------------------------------------------------------------------- 1 | \begin{center} 2 | \begin{tabular}{||c || c | c | c | c | c | c | c | c | c | c ||} 3 | \hline 4 | \vtop{\hbox{\strut Memory}\hbox{\strut (Gb)}} & \vtop{\hbox{\strut neurons}\hbox{\strut 900}} & \vtop{\hbox{\strut neurons}\hbox{\strut 2k}} & \vtop{\hbox{\strut pbmc}\hbox{\strut 4k}} & \vtop{\hbox{\strut pbmc}\hbox{\strut 8k}} & \vtop{\hbox{\strut neurons}\hbox{\strut 9k}} & \vtop{\hbox{\strut pbmc}\hbox{\strut 4k*10}} & \vtop{\hbox{\strut rand}\hbox{\strut 4k*10}} & \vtop{\hbox{\strut neurons}\hbox{\strut 9k*50}} & \vtop{\hbox{\strut rand}\hbox{\strut 9k*50}} & \vtop{\hbox{\strut neurons}\hbox{\strut 1M}} \\ [0.5ex] 5 | \hline\hline 6 | eds.gz & 0.32 & 0.33 & 0.33 & 0.36 & 0.49 & 0.78 & 0.77 & 11.5 & 11.5 & 22.9 \\ 7 | \hline 8 | csv.gz & 0.73 & 2.40 & 4.80 & 9.87 & 8.14 & 42.12 & 41.08 & 322 & 317.2 & N/A \\ 9 | \hline 10 | h5 & 0.47 & 0.59 & 0.69 & 1.10 & 1.76 & 3.17 & 3.12 & 56.8 & 56.77 & 114.29 \\ 11 | \hline 12 | mtx.gz & 0.37 
& 0.38 & 0.48 & 0.74 & 1.10 & 1.75 & 1.72 & 30.3 & 30.33 & 61 \\ 13 | \hline 14 | loom & 0.68 & 1.13 & 2.47 & 4.53 & 4.30 & 22.447 & 22.40 & 203.3 & 203.2 & N/A \\ [1ex] 15 | \hline 16 | \end{tabular} 17 | \end{center} 18 | -------------------------------------------------------------------------------- /benchmarks/stats/size.tex: -------------------------------------------------------------------------------- 1 | \begin{center} 2 | \begin{tabular}{||c || c | c | c | c | c | c | c | c | c | c ||} 3 | \hline 4 | Size & \vtop{\hbox{\strut neurons}\hbox{\strut 900}} & \vtop{\hbox{\strut neurons}\hbox{\strut 2k}} & \vtop{\hbox{\strut pbmc}\hbox{\strut 4k}} & \vtop{\hbox{\strut pbmc}\hbox{\strut 8k}} & \vtop{\hbox{\strut neurons}\hbox{\strut 9k}} & \vtop{\hbox{\strut pbmc}\hbox{\strut 4k*10}} & \vtop{\hbox{\strut rand}\hbox{\strut 4k*10}} & \vtop{\hbox{\strut neurons}\hbox{\strut 9k*50}} & \vtop{\hbox{\strut rand}\hbox{\strut 9k*50}} & \vtop{\hbox{\strut neurons}\hbox{\strut 1M}} \\ [0.5ex] 5 | \hline\hline 6 | eds.gz & 5.4M & 9.8M & 15M & 29M & 42M & 108M & 116M & 1.8G & 1.9G & 2.6G \\ 7 | \hline 8 | csv.gz & 6.6M & 13M & 19M & 37M & 51M & 142M & 142M & 2.3G & 2.3G & 3.1G \\ 9 | \hline 10 | h5 & 7.6M & 14M & 18M & 36M & 59M & 97M & 135M & 1.8G & 2.5G & 4.0G \\ 11 | \hline 12 | mtx.gz & 11M & 20M & 26M & 52M & 86M & 186M & 192M & 3.6G & 3.6G & 5.8G \\ 13 | \hline 14 | loom & 12M & 23M & 40M & 78M & 97M & 297M & 355M & 3.6G & 4.8G & 6.1G \\ [1ex] 15 | \hline 16 | \end{tabular} 17 | \end{center} 18 | -------------------------------------------------------------------------------- /benchmarks/time.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/COMBINE-lab/EDS/a66a0b4ec96c25ea6f2b25f1566e6c089aff6cda/benchmarks/time.jpg -------------------------------------------------------------------------------- /eds.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/COMBINE-lab/EDS/a66a0b4ec96c25ea6f2b25f1566e6c089aff6cda/eds.jpg -------------------------------------------------------------------------------- /src-cpp/readEDS.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Alevin Efficient Data Storage (EDS) reader 3 | * 4 | * Author: Avi Srivastava 5 | * Last modified: August 13, 2019 6 | * License: LGPL (>= 3) 7 | * 8 | */ 9 | 10 | #include 11 | #include 12 | 13 | using namespace Rcpp; 14 | 15 | // C++ internal function to figure out the spaces to reserve 16 | size_t getReserveSpaces(size_t numOfGenes, size_t numOfOriginalCells, 17 | Rcpp::IntegerVector& bitVecLengths, 18 | std::string& countMatFilename) { 19 | 20 | // opening gzipped compressed stream 21 | gzFile fileHandler = gzopen(countMatFilename.c_str(), "rb") ; 22 | 23 | // We are storing the bit vector in u8 so total number of u8 = numGenes/8 24 | size_t numFlags = std::ceil(numOfGenes / 8.0); 25 | 26 | // vector for storing the bitvector flags 27 | std::vector alphasFlag (numFlags, 0); 28 | 29 | // getting the sizs of u8 and float 32 30 | size_t flagSize = sizeof(decltype(alphasFlag)::value_type); 31 | size_t elSize = sizeof(float); 32 | size_t totalSpace { 0 }; 33 | 34 | // iterating over cells 35 | for (size_t cellId = 0 ; cellId < numOfOriginalCells ; ++cellId) { 36 | // reading bitvectors 37 | gzread(fileHandler, reinterpret_cast(alphasFlag.data()), flagSize * numFlags); 38 | size_t numOfExpGenes { 0 }; 39 | 40 | for (size_t j = 0; j < alphasFlag.size(); j++) { 41 | uint8_t flag = alphasFlag[j]; 42 | 43 | for (size_t i = 0; i < 8; i++){ 44 | // counting positions only if the flag is set 45 | if (flag & (128 >> i)) { 46 | numOfExpGenes += 1; 47 | } 48 | } 49 | } 50 | 51 | // skipping the expression values and saving the counts for numOfExpGenes 52 | gzseek(fileHandler, elSize * numOfExpGenes, SEEK_CUR); 53 | bitVecLengths[ cellId + 1 ] = ( numOfExpGenes + bitVecLengths[ cellId ] 
); 54 | totalSpace += numOfExpGenes; 55 | } 56 | 57 | return totalSpace; 58 | } 59 | 60 | // [[Rcpp::export]] 61 | SEXP getSparseMatrix(size_t numOfGenes, size_t numOfOriginalCells, std::string countMatFilename) { 62 | Rcpp::S4 mat("dgCMatrix"); 63 | 64 | // initializing vector to store bitvecSpaces 65 | Rcpp::IntegerVector bitVecLengths(numOfOriginalCells + 1, 0); 66 | size_t totalSpace = getReserveSpaces( numOfGenes, numOfOriginalCells, 67 | bitVecLengths, countMatFilename ); 68 | 69 | // initializing sparse matrix 70 | typedef Rcpp::NumericVector ValuesT; 71 | ValuesT values(totalSpace, 0.0); 72 | 73 | // initializing sparse matrix indices 74 | typedef Rcpp::IntegerVector IndicesT; 75 | IndicesT indices(totalSpace, 0); 76 | 77 | // opening gzipped compressed stream 78 | gzFile fileHandler = gzopen(countMatFilename.c_str(), "rb") ; 79 | 80 | // We are storing the bit vector in u8 so total number of u8 = numGenes/8 81 | size_t numFlags = std::ceil(numOfGenes / 8.0); 82 | 83 | // vector for storing the bitvector flags 84 | std::vector alphasFlag (numFlags, 0); 85 | 86 | // getting the sizs of u8 and float 32 87 | size_t flagSize = sizeof(decltype(alphasFlag)::value_type); 88 | size_t elSize = sizeof(float); 89 | 90 | size_t valCounter { 0 }; 91 | // iterating over cells 92 | for (size_t cellId = 0 ; cellId < numOfOriginalCells ; ++cellId) { 93 | // reading bitvectors 94 | gzread(fileHandler, reinterpret_cast(alphasFlag.data()), flagSize * numFlags); 95 | 96 | // iterating over u8 flags for bitvectors 97 | size_t numExpGenes { 0 }; 98 | for (size_t j = 0; j < alphasFlag.size(); j++) { 99 | uint8_t flag = alphasFlag[j]; 100 | 101 | for (size_t i = 0; i < 8; i++){ 102 | // extracting positions only if the flag is set 103 | if (flag & (128 >> i)) { 104 | if ( valCounter + numExpGenes >= totalSpace ) { 105 | return Rcpp::List(); 106 | } 107 | 108 | size_t offset = i + (8 * j); 109 | indices[ valCounter + numExpGenes ] = offset; 110 | numExpGenes += 1; 111 | } 112 | } 
113 | } 114 | 115 | // reading in the expression 116 | std::vector alphasSparse(numExpGenes); 117 | gzread(fileHandler, reinterpret_cast(alphasSparse.data()), elSize * numExpGenes); 118 | 119 | // saving the positions and expression 120 | for (size_t i = 0; i < numExpGenes; i++) { 121 | if ( valCounter >= totalSpace ) { 122 | return Rcpp::List(); 123 | } 124 | 125 | values[valCounter] = alphasSparse[i]; 126 | valCounter += 1; 127 | } 128 | } 129 | 130 | // code in-parts taken from https://github.com/LTLA/beachmat/blob/master/inst/include/beachmat/output/Csparse_writer.h#L268 131 | mat.slot("Dim") = Rcpp::IntegerVector::create(numOfGenes, numOfOriginalCells); 132 | 133 | // Setting p 134 | mat.slot("p") = bitVecLengths; 135 | 136 | // Setting 'x'. 137 | mat.slot("x") = values; 138 | 139 | // Setting 'i'. 140 | mat.slot("i") = indices; 141 | 142 | return SEXP(mat); 143 | } 144 | 145 | -------------------------------------------------------------------------------- /src-rs/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "eds" 3 | version = "0.1.0" 4 | authors = ["avisrivastava , Mike Love, Rob Patro"] 5 | edition = "2018" 6 | 7 | [dependencies] 8 | pretty_env_logger = "0.3.0" 9 | clap = "2.33.0" 10 | log = "0.4.6" 11 | byteorder = "1.3.1" 12 | flate2 = "1.0.7" 13 | libmath = "0.2.1" 14 | hdf5 = "0.5.2" 15 | rand = "0.7.0" -------------------------------------------------------------------------------- /src-rs/src/csv.rs: -------------------------------------------------------------------------------- 1 | use flate2::write::GzEncoder; 2 | use flate2::Compression; 3 | use std::fs::File; 4 | use std::io; 5 | use std::io::Write; 6 | 7 | pub fn writer( 8 | path_str: String, 9 | expressions: Vec>, 10 | bit_vecs: Vec>, 11 | _num_cells: usize, 12 | num_features: usize, 13 | ) -> Result { 14 | let file_handle = File::create(path_str)?; 15 | let mut file = GzEncoder::new(file_handle, Compression::default()); 
16 | 17 | let mut header = "\"\"".to_string(); 18 | for gid in 1..num_features + 1 { 19 | header.push_str(&format!(",gene{}", gid)); 20 | } 21 | header.push_str(&format!("\n")); 22 | file.write_all(header.as_bytes())?; 23 | 24 | let mut mtx_data: String; 25 | assert!( 26 | bit_vecs.len() == expressions.len(), 27 | "length of bit vec and expression is not same" 28 | ); 29 | for (cell_id, exp) in expressions.into_iter().enumerate() { 30 | let bit_vec = &bit_vecs[cell_id]; 31 | let mut fids: Vec = Vec::new(); 32 | 33 | for (feature_id, flag) in bit_vec.into_iter().enumerate() { 34 | if *flag != 0 { 35 | for (offset, j) in format!("{:8b}", flag).chars().enumerate() { 36 | match j { 37 | '1' => fids.push((8 * feature_id) + offset), 38 | _ => (), 39 | }; 40 | } 41 | } 42 | } 43 | 44 | assert!( 45 | fids.len() == exp.len(), 46 | format!("#positions {} doesn't match with #expressed features {}", 47 | fids.len(), exp.len()) 48 | ); 49 | mtx_data = format!("cell{}", cell_id + 1); 50 | let mut zero_counter = 0; 51 | for (index, count) in exp.into_iter().enumerate() { 52 | assert!( 53 | fids[index] < num_features, 54 | format!("{} position > {}", fids[index], num_features) 55 | ); 56 | 57 | while zero_counter != fids[index] { 58 | zero_counter += 1; 59 | mtx_data.push_str(&format!(",0")); 60 | } 61 | 62 | zero_counter += 1; 63 | mtx_data.push_str(&format!(",{}", count)); 64 | } 65 | 66 | while zero_counter < num_features { 67 | zero_counter += 1; 68 | mtx_data.push_str(&format!(",0")); 69 | } 70 | 71 | mtx_data.push_str(&format!("\n")); 72 | file.write_all(mtx_data.as_bytes())?; 73 | } 74 | 75 | Ok(true) 76 | } 77 | -------------------------------------------------------------------------------- /src-rs/src/eds.rs: -------------------------------------------------------------------------------- 1 | use std::fs::File; 2 | use std::io; 3 | use std::io::{Read, Write}; 4 | 5 | use byteorder::{ByteOrder, LittleEndian}; 6 | use flate2::read::GzDecoder; 7 | use math::round; 8 | 9 | 
use flate2::write::GzEncoder; 10 | use flate2::Compression; 11 | 12 | pub fn reader( 13 | input: &str, 14 | num_cells: usize, 15 | num_genes: usize, 16 | expr: &mut Vec>, 17 | bit_vecs: &mut Vec>, 18 | ) -> Result { 19 | info!("Using {} as input EDS file\n", input); 20 | info!( 21 | "Using {} Rows (cells) and {} Columns (features)", 22 | num_cells, num_genes 23 | ); 24 | 25 | let num_bit_vecs: usize = round::ceil(num_genes as f64 / 8.0, 0) as usize; 26 | let mut total_molecules = 0.0; 27 | let mut total_exp_values = 0; 28 | 29 | { 30 | let mut count = 0; 31 | let file_handle = File::open(input)?; 32 | let mut file = GzDecoder::new(file_handle); 33 | 34 | for _ in 0..num_cells { 35 | let mut bit_vec = vec![0; num_bit_vecs]; 36 | file.read_exact(&mut bit_vec[..])?; 37 | let mut num_ones = 0; 38 | for bits in bit_vec.iter() { 39 | num_ones += bits.count_ones(); 40 | } 41 | bit_vecs.push(bit_vec); 42 | 43 | let mut expression: Vec = vec![0; 4 * (num_ones as usize)]; 44 | let mut float_buffer: Vec = vec![0.0_f32; num_ones as usize]; 45 | file.read_exact(&mut expression[..])?; 46 | LittleEndian::read_f32_into(&expression, &mut float_buffer); 47 | 48 | let cell_count: f32 = float_buffer.iter().sum(); 49 | total_molecules += cell_count; 50 | expr.push(float_buffer); 51 | 52 | count += 1; 53 | total_exp_values += num_ones; 54 | if count % 100 == 0 { 55 | print!("\r Done Reading {} cells", count); 56 | io::stdout().flush()?; 57 | } 58 | } 59 | } 60 | 61 | println!("\n"); 62 | assert!( 63 | expr.len() == num_cells, 64 | "rows and quants file size mismatch" 65 | ); 66 | 67 | info!("Found Total {:.2} molecules", total_molecules); 68 | info!("Found Total {:.2} expressed entries", total_exp_values); 69 | info!( 70 | "w/ {:.2} Molecules/cell", 71 | total_molecules / num_cells as f32 72 | ); 73 | Ok(true) 74 | } 75 | 76 | pub fn writer( 77 | path_str: String, 78 | expressions: Vec>, 79 | bit_vecs: Vec>, 80 | _num_cells: usize, 81 | _num_features: usize, 82 | ) -> Result { 83 | let 
file_handle = File::create(path_str)?; 84 | let mut file = GzEncoder::new(file_handle, Compression::default()); 85 | 86 | assert!(expressions.len() == bit_vecs.len()); 87 | for (exp, bvec) in expressions.into_iter().zip(bit_vecs.into_iter()) { 88 | file.write_all(&bvec)?; 89 | 90 | let mut bin_exp: Vec = vec![0_u8; exp.len() * 4]; 91 | LittleEndian::write_f32_into(&exp, &mut bin_exp); 92 | file.write_all(&bin_exp)?; 93 | } 94 | 95 | Ok(true) 96 | } 97 | -------------------------------------------------------------------------------- /src-rs/src/h5.rs: -------------------------------------------------------------------------------- 1 | use hdf5; 2 | use std::io; 3 | 4 | pub fn writer( 5 | path_str: String, 6 | expressions: Vec>, 7 | bit_vecs: Vec>, 8 | num_cells: usize, 9 | num_features: usize, 10 | ) -> Result { 11 | let file = hdf5::File::open(path_str, "w").expect("can't create output file"); 12 | 13 | let group = file 14 | .create_group("matrix") 15 | .expect("can't create group in h5"); 16 | 17 | let shape = group 18 | .new_dataset::() 19 | .gzip(6) 20 | .create("shape", 2) 21 | .expect("can't write shape in h5"); 22 | 23 | shape 24 | .write(&[num_features, num_cells]) 25 | .expect("error writing shape"); 26 | 27 | assert!( 28 | bit_vecs.len() == expressions.len(), 29 | "length of bit vec and expression is not same" 30 | ); 31 | 32 | let total_entries; 33 | { 34 | let mut cumm_sum = 0; 35 | let mut indptr_vals: Vec = vec![cumm_sum]; 36 | for exp in expressions.iter() { 37 | cumm_sum += exp.len() as u32; 38 | indptr_vals.push(cumm_sum); 39 | } 40 | 41 | total_entries = indptr_vals.last().expect("indptr empty").clone(); 42 | assert!( 43 | indptr_vals.len() == num_cells + 1, 44 | "num cells doesn't match" 45 | ); 46 | 47 | let indptr = group 48 | .new_dataset::() 49 | .gzip(6) 50 | .create("indptr", indptr_vals.len()) 51 | .expect("can't write indptr in h5"); 52 | 53 | indptr 54 | .write_raw(&indptr_vals) 55 | .expect("error writing indptr"); 56 | } // end writing 
indptr 57 | 58 | { 59 | let data = group 60 | .new_dataset::() 61 | .gzip(6) 62 | .create("data", total_entries as usize) 63 | .expect("can't write data in h5"); 64 | 65 | let flatten_data: Vec = expressions 66 | .iter() 67 | .flat_map(|array| array.iter()) 68 | .cloned() 69 | .collect(); 70 | 71 | assert!( 72 | flatten_data.len() == total_entries as usize, 73 | "different number of entries" 74 | ); 75 | data.write_raw(&flatten_data).expect("can't write data"); 76 | } // end writing data 77 | 78 | { 79 | let indices = group 80 | .new_dataset::() 81 | .gzip(6) 82 | .create("indices", total_entries as usize) 83 | .expect("can't write positions in h5"); 84 | 85 | let mut positions: Vec = Vec::new(); 86 | for bit_vec in bit_vecs { 87 | for (feature_id, flag) in bit_vec.into_iter().enumerate() { 88 | if flag != 0 { 89 | for (offset, j) in format!("{:8b}", flag).chars().enumerate() { 90 | match j { 91 | '1' => positions.push((8 * feature_id) as u32 + offset as u32), 92 | _ => (), 93 | }; 94 | } 95 | } 96 | } 97 | } // end-for 98 | 99 | assert!( 100 | positions.len() == total_entries as usize, 101 | "different number of entries" 102 | ); 103 | indices.write_raw(&positions).expect("can't write indices"); 104 | } // end writing indices 105 | 106 | Ok(true) 107 | } 108 | -------------------------------------------------------------------------------- /src-rs/src/main.rs: -------------------------------------------------------------------------------- 1 | extern crate byteorder; 2 | extern crate clap; 3 | extern crate flate2; 4 | extern crate hdf5; 5 | extern crate math; 6 | extern crate pretty_env_logger; 7 | 8 | #[macro_use] 9 | extern crate log; 10 | 11 | mod csv; 12 | mod eds; 13 | mod h5; 14 | mod mtx; 15 | mod utils; 16 | 17 | use clap::{App, Arg, ArgMatches, SubCommand}; 18 | use std::io; 19 | use utils::FileType; 20 | 21 | fn randomize_file(sub_m: &ArgMatches) -> Result<(), io::Error> { 22 | let input_file_path = sub_m.value_of("input").unwrap(); 23 | let 
output_file_type = FileType::Dummy(".random".to_string()); 24 | 25 | let (input_file_type, output_file_path) = 26 | utils::get_output_path(input_file_path, output_file_type.clone()); 27 | 28 | let num_cells: usize = sub_m 29 | .value_of("cells") 30 | .expect("can't find #cells") 31 | .parse() 32 | .unwrap(); 33 | 34 | let num_features = sub_m 35 | .value_of("features") 36 | .expect("can't find #features") 37 | .parse() 38 | .unwrap(); 39 | 40 | let (bit_vecs, alphas) = utils::read_file(input_file_path, 41 | input_file_type.clone(), 42 | num_cells, 43 | num_features)?; 44 | 45 | let (bit_vecs, alphas) = utils::randomize( bit_vecs, alphas )?; 46 | utils::write_file( output_file_path, input_file_type, 47 | bit_vecs, alphas, num_cells, num_features)?; 48 | 49 | info!("All Done!"); 50 | Ok(()) 51 | } 52 | 53 | fn convert_file(sub_m: &ArgMatches) -> Result<(), io::Error> { 54 | let input_file_path = sub_m.value_of("input").unwrap(); 55 | let output_file_type = utils::find_output_format(sub_m); 56 | 57 | let (input_file_type, output_file_path) = 58 | utils::get_output_path(input_file_path, output_file_type.clone()); 59 | 60 | let num_cells: usize = sub_m 61 | .value_of("cells") 62 | .expect("can't find #cells") 63 | .parse() 64 | .unwrap(); 65 | 66 | let num_features = sub_m 67 | .value_of("features") 68 | .expect("can't find #features") 69 | .parse() 70 | .unwrap(); 71 | 72 | let (bit_vecs, alphas) = utils::read_file(input_file_path, 73 | input_file_type, 74 | num_cells, 75 | num_features)?; 76 | 77 | utils::write_file( output_file_path, output_file_type, 78 | bit_vecs, alphas, num_cells, num_features)?; 79 | 80 | info!("All Done!"); 81 | Ok(()) 82 | } 83 | 84 | fn main() -> io::Result<()> { 85 | let matches = App::new("EDS") 86 | .version("0.1.0") 87 | .author("Avi Srivastava, Mike Love and Rob Patro") 88 | .about("Efficient scData Storage format") 89 | .subcommand( 90 | SubCommand::with_name("randomize") 91 | .about("randomize the order of cells") 92 | .arg( 93 | 
Arg::with_name("cells") 94 | .long("cells") 95 | .short("c") 96 | .takes_value(true) 97 | .help("Number of cells"), 98 | ) 99 | .arg( 100 | Arg::with_name("features") 101 | .long("features") 102 | .short("f") 103 | .takes_value(true) 104 | .help("Number of features"), 105 | ) 106 | .arg( 107 | Arg::with_name("input") 108 | .long("input") 109 | .short("i") 110 | .takes_value(true) 111 | .requires("cells") 112 | .requires("features") 113 | .help("path to input file"), 114 | ), 115 | ) 116 | .subcommand( 117 | SubCommand::with_name("convert") 118 | .about("comnvert from eds data format to csv or mtx format") 119 | .arg( 120 | Arg::with_name("mtx") 121 | .long("mtx") 122 | .conflicts_with("eds") 123 | .conflicts_with("csv") 124 | .conflicts_with("h5") 125 | .help("convert to matrix market exchange file"), 126 | ) 127 | .arg( 128 | Arg::with_name("h5") 129 | .long("h5") 130 | .conflicts_with("eds") 131 | .conflicts_with("csv") 132 | .conflicts_with("mtx") 133 | .help("convert to h5 wrapped csc file"), 134 | ) 135 | .arg( 136 | Arg::with_name("csv") 137 | .long("csv") 138 | .conflicts_with("eds") 139 | .conflicts_with("mtx") 140 | .conflicts_with("h5") 141 | .help("convert to comma separated file"), 142 | ) 143 | .arg( 144 | Arg::with_name("eds") 145 | .long("eds") 146 | .conflicts_with("csv") 147 | .conflicts_with("mtx") 148 | .conflicts_with("h5") 149 | .help("convert to EDS file"), 150 | ) 151 | .arg( 152 | Arg::with_name("cells") 153 | .long("cells") 154 | .short("c") 155 | .takes_value(true) 156 | .help("Number of cells"), 157 | ) 158 | .arg( 159 | Arg::with_name("features") 160 | .long("features") 161 | .short("f") 162 | .takes_value(true) 163 | .help("Number of features"), 164 | ) 165 | .arg( 166 | Arg::with_name("input") 167 | .long("input") 168 | .short("i") 169 | .takes_value(true) 170 | .requires("cells") 171 | .requires("features") 172 | .help("path to input file"), 173 | ), 174 | ) 175 | .get_matches(); 176 | 177 | pretty_env_logger::init_timed(); 178 | 
match matches.subcommand_matches("convert") { 179 | Some(sub_m) => { 180 | let ret = convert_file(&sub_m); 181 | return ret; 182 | } 183 | None => (), 184 | }; 185 | 186 | match matches.subcommand_matches("randomize") { 187 | Some(sub_m) => { 188 | let ret = randomize_file(&sub_m); 189 | return ret; 190 | } 191 | None => (), 192 | }; 193 | 194 | Ok(()) 195 | } 196 | -------------------------------------------------------------------------------- /src-rs/src/mtx.rs: -------------------------------------------------------------------------------- 1 | use flate2::write::GzEncoder; 2 | use flate2::read::GzDecoder; 3 | use flate2::Compression; 4 | use std::fs::File; 5 | use std::io; 6 | use std::collections::HashMap; 7 | use std::io::{Write, BufReader, BufRead}; 8 | 9 | use crate::utils::triplets_to_eds; 10 | pub fn reader( 11 | input: &str, 12 | num_cells: usize, 13 | num_genes: usize, 14 | expr: &mut Vec>, 15 | bit_vecs: &mut Vec>, 16 | ) -> Result { 17 | info!("Using {} as input MTX file\n", input); 18 | info!( 19 | "Using {} Rows (cells) and {} Columns (features)", 20 | num_cells, num_genes 21 | ); 22 | 23 | let file_handle = File::open(input)?; 24 | let file = BufReader::new( GzDecoder::new(file_handle) ); 25 | 26 | let cell_by_gene = true; 27 | let (cell_index, gene_index) = match cell_by_gene { 28 | true => (0, 1), 29 | false => (1, 0), 30 | }; 31 | 32 | let mut found_first = false; 33 | let mut triplets: Vec> = vec![ HashMap::new(); num_cells ]; 34 | 35 | for line in file.lines() { 36 | let record = line?; 37 | if record.chars().nth(0).unwrap() == '%' { 38 | continue; 39 | } 40 | 41 | let vals: Vec<&str> = record.split("\t") 42 | .collect(); 43 | 44 | let gid = vals[gene_index].parse::() 45 | .expect("can't convert gid"); 46 | let cid = vals[cell_index].parse::() 47 | .expect("can't convert cid"); 48 | let value = vals[2].parse::() 49 | .expect("can't convert value"); 50 | 51 | if ! 
found_first { 52 | found_first = true; 53 | 54 | assert!(num_cells == cid ); 55 | assert!(num_genes == gid as usize); 56 | continue; 57 | } 58 | 59 | triplets[cid - 1].insert(gid - 1, value); 60 | } 61 | 62 | triplets_to_eds(&triplets, expr, bit_vecs, num_genes); 63 | Ok(true) 64 | } 65 | 66 | pub fn writer( 67 | path_str: String, 68 | expressions: Vec>, 69 | bit_vecs: Vec>, 70 | num_cells: usize, 71 | num_features: usize, 72 | ) -> Result { 73 | let mut tot_expressed_features = 0; 74 | expressions 75 | .iter() 76 | .for_each(|x| tot_expressed_features += x.len()); 77 | 78 | let file_handle = File::create(path_str)?; 79 | let mut file = GzEncoder::new(file_handle, Compression::default()); 80 | 81 | let mut header = "%%MatrixMarket\tmatrix\tcoordinate\treal\tgeneral\n".to_string(); 82 | header.push_str(&format!( 83 | "{}\t{}\t{}\n", 84 | num_cells, num_features, tot_expressed_features 85 | )); 86 | file.write_all(header.as_bytes())?; 87 | 88 | assert!( 89 | bit_vecs.len() == expressions.len(), 90 | "length of bit vec and expression is not same" 91 | ); 92 | for (cell_id, exp) in expressions.into_iter().enumerate() { 93 | let bit_vec = &bit_vecs[cell_id]; 94 | let mut fids: Vec = Vec::new(); 95 | 96 | for (feature_id, flag) in bit_vec.into_iter().enumerate() { 97 | if *flag != 0 { 98 | for (offset, j) in format!("{:8b}", flag).chars().enumerate() { 99 | match j { 100 | '1' => fids.push((8 * feature_id) + offset), 101 | _ => (), 102 | }; 103 | } 104 | } 105 | } 106 | 107 | assert!( 108 | fids.len() == exp.len(), 109 | "#positions doesn't match with #expressed features" 110 | ); 111 | let mut mtx_data = "".to_string(); 112 | for (index, count) in exp.into_iter().enumerate() { 113 | mtx_data.push_str(&format!( 114 | "{}\t{}\t{}\n", 115 | cell_id + 1, 116 | fids[index] + 1, 117 | count 118 | )); 119 | } 120 | 121 | file.write_all(mtx_data.as_bytes())?; 122 | } 123 | 124 | Ok(true) 125 | } 126 | 
--------------------------------------------------------------------------------
/src-rs/src/utils.rs:
--------------------------------------------------------------------------------
use math::round;
use clap::ArgMatches;
use std::collections::HashMap;

use std;
use std::io;
use rand::thread_rng;
use rand::seq::SliceRandom;

use crate::{h5, mtx, csv, eds};

/// Supported on-disk matrix formats.
///
/// `Dummy` carries a raw filename suffix and is used internally (e.g.
/// by the `randomize` subcommand, which appends ".random" to the input
/// path) — it is never a real read/write format, hence the
/// `unreachable!()` arms in `read_file`/`write_file`.
#[derive(Clone, Debug, PartialEq)]
pub enum FileType {
    EDS,
    MTX,
    H5,
    CSV,
    Dummy(String),
}

/// Dispatches the in-memory matrix (presence bit vectors + expression
/// values) to the writer for `file_type`.
///
/// Panics via `unreachable!()` on `FileType::Dummy`, which callers
/// never pass here.
pub fn write_file(file_path: String,
                  file_type: FileType,
                  bit_vecs: Vec<Vec<u8>>,
                  alphas: Vec<Vec<f32>>,
                  num_cells: usize,
                  num_features: usize,
) -> Result<bool, io::Error> {
    info!("Writing Output into file path: {}", file_path);

    match file_type {
        FileType::MTX => mtx::writer(file_path, alphas, bit_vecs, num_cells, num_features)?,
        FileType::CSV => csv::writer(file_path, alphas, bit_vecs, num_cells, num_features)?,
        FileType::H5 => h5::writer(file_path, alphas, bit_vecs, num_cells, num_features)?,
        FileType::EDS => eds::writer(file_path, alphas, bit_vecs, num_cells, num_features)?,
        _ => unreachable!(),
    };

    Ok(true)
}

/// Reads `file_path` in the given format and returns
/// `(bit_vecs, alphas)` — per-cell presence bit vectors and per-cell
/// expression values.
///
/// Only EDS and MTX are readable inputs; any other `FileType` hits
/// `unreachable!()` (H5/CSV are write-only in this tool).
pub fn read_file(file_path: &str,
                 file_type: FileType,
                 num_cells: usize,
                 num_features: usize,
) -> Result<(Vec<Vec<u8>>, Vec<Vec<f32>>), io::Error> {
    let mut alphas: Vec<Vec<f32>> = Vec::new();
    let mut bit_vecs: Vec<Vec<u8>> = Vec::new();

    match file_type {
        FileType::EDS => eds::reader(
            file_path,
            num_cells,
            num_features,
            &mut alphas,
            &mut bit_vecs,
        )?,
        FileType::MTX => mtx::reader(
            file_path,
            num_cells,
            num_features,
            &mut alphas,
            &mut bit_vecs,
        )?,
        _ => unreachable!(),
    };

    info!("Done Reading Input file");
    Ok((bit_vecs, alphas))
}

/// Returns the input matrices with their cell (row) order shuffled by
/// a fresh random permutation; bit vectors and alphas are permuted in
/// lock-step so each cell keeps its own data.
pub fn randomize(bit_vecs: Vec<Vec<u8>>,
                 alphas: Vec<Vec<f32>>,
) -> Result<(Vec<Vec<u8>>, Vec<Vec<f32>>), io::Error> {
    info!("Randomizing order");
    assert!(bit_vecs.len() == alphas.len());

    let num_elem = bit_vecs.len() as u32;
    let mut order: Vec<u32> = (0..num_elem).collect();
    order.shuffle(&mut thread_rng());

    let mut shuf_bvecs = vec![Vec::new(); bit_vecs.len()];
    let mut shuf_alphas = vec![Vec::new(); bit_vecs.len()];
    for (nindex, oindex) in order.into_iter().enumerate() {
        shuf_bvecs[nindex] = bit_vecs[oindex as usize].clone();
        shuf_alphas[nindex] = alphas[oindex as usize].clone();
    }

    Ok((shuf_bvecs, shuf_alphas))
}

/// Maps the presence of exactly one of the --mtx/--csv/--h5/--eds CLI
/// flags to the corresponding `FileType`.
///
/// Panics if zero or more than one flag is present (clap's
/// `conflicts_with` rules should already prevent the latter).
pub fn find_output_format(sub_m: &ArgMatches) -> FileType {
    let mut out_file_type: Option<FileType> = None;
    let mut found_file_types = 0;

    out_file_type = match sub_m.is_present("mtx") {
        true => {
            found_file_types += 1;
            Some(FileType::MTX)
        }
        false => out_file_type,
    };

    out_file_type = match sub_m.is_present("csv") {
        true => {
            found_file_types += 1;
            Some(FileType::CSV)
        }
        false => out_file_type,
    };

    out_file_type = match sub_m.is_present("h5") {
        true => {
            found_file_types += 1;
            Some(FileType::H5)
        }
        false => out_file_type,
    };

    out_file_type = match sub_m.is_present("eds") {
        true => {
            found_file_types += 1;
            Some(FileType::EDS)
        }
        false => out_file_type,
    };

    assert!(found_file_types == 1, "found unexpected not 1 file types");
    return out_file_type.expect("can't find output format type");
}

/// Infers the input `FileType` from the extension found in
/// `input_path` and derives the output path by replacing everything
/// from that extension onward with the suffix for `otype`.
///
/// Returns `(input_type, output_path)`. Panics if no recognized
/// extension is found (EDS is the fallback only when ".eds" matched)
/// or if input and output formats are identical.
///
/// NOTE(review): `str::find` matches the *first* occurrence anywhere
/// in the path, so a directory component containing ".mtx" etc. would
/// be picked up — presumably paths here are simple filenames; verify
/// against callers before relying on this with nested paths.
pub fn get_output_path(input_path: &str, otype: FileType) -> (FileType, String) {
    let mut itype: FileType = FileType::EDS;
    let mut opath = input_path.to_string();
    let mut offset: usize = opath.len();
    let mut found_file_types = 0;

    // ".eds" deliberately does not bump found_file_types: EDS is the
    // default itype, and the assert below accepts that case explicitly.
    match opath.find(".eds") {
        Some(val) => {
            offset = val;
        }
        _ => (),
    };

    match opath.find(".mtx") {
        Some(val) => {
            offset = val;
            itype = FileType::MTX;
            found_file_types += 1;
        }
        _ => (),
    };

    match opath.find(".h5") {
        Some(val) => {
            offset = val;
            itype = FileType::H5;
            found_file_types += 1;
        }
        _ => (),
    };

    match opath.find(".csv") {
        Some(val) => {
            offset = val;
            itype = FileType::CSV;
            found_file_types += 1;
        }
        _ => (),
    };

    assert!(
        found_file_types == 1 || itype == FileType::EDS,
        " Can't find right input file type "
    );
    assert!(
        itype != otype,
        "Found same input and output file file format"
    );

    info!(" Found {:?} as input file type ", itype);
    info!(" Found {:?} as output file type ", otype);

    // Replace the extension (and anything after it) with the suffix
    // for the requested output format.
    match otype {
        FileType::MTX => opath.replace_range(offset.., ".mtx.gz"),
        FileType::CSV => opath.replace_range(offset.., ".csv.gz"),
        FileType::H5 => opath.replace_range(offset.., ".h5"),
        FileType::EDS => opath.replace_range(offset.., ".eds.gz"),
        FileType::Dummy(name) => opath.replace_range(offset.., &name),
    }

    (itype, opath)
}

/// Converts per-cell sparse (gene -> value) maps into the EDS layout:
/// for each cell, a dense value list sorted by gene id (`expr`) and a
/// packed presence bit vector (`bit_vecs`) where bit 7 (MSB) of byte
/// `b` marks gene `8*b`.
pub fn triplets_to_eds(triplets: &Vec<HashMap<usize, f32>>,
                       expr: &mut Vec<Vec<f32>>,
                       bit_vecs: &mut Vec<Vec<u8>>,
                       num_genes: usize,
) {
    for cell_data in triplets {
        // Sort gene ids so values line up with ascending bit order.
        let mut keys: Vec<usize> = cell_data.keys()
            .cloned()
            .collect();
        keys.sort();

        let values: Vec<f32> = keys.iter().map(|key| cell_data[key])
            .collect();

        expr.push(values);

        let num_exp_genes = keys.len();
        // ceil(num_genes / 8) bytes hold one presence bit per gene.
        let num_bit_vecs: usize = round::ceil(num_genes as f64 / 8.0, 0) as usize;
        let mut bit_vec: Vec<u8> = vec![0; num_bit_vecs];

        // [min_processed_close, max_processed_open) is the gene-id
        // window covered by the byte currently being accumulated in
        // `flag`; `curr_index` is that byte's position in `bit_vec`.
        let mut min_processed_close = 0;
        let mut max_processed_open = 8;
        let mut curr_index = 0;
        let mut flag: u8 = 0;

        for key in keys {
            // Keys are sorted, so we never revisit an earlier window.
            assert!(key >= min_processed_close);
            assert!(curr_index < num_bit_vecs);

            let offset: u8 = (key % 8) as u8;
            if key < max_processed_open {
                // Same byte: set this gene's bit (MSB-first).
                flag |= 128u8 >> offset;
            } else {
                // Flush the finished byte, then advance the window
                // (possibly skipping fully-zero bytes) to cover `key`.
                bit_vec[curr_index] = flag;

                while key >= max_processed_open {
                    curr_index += 1;
                    min_processed_close = max_processed_open;
                    max_processed_open += 8;
                }
                flag = 128u8 >> offset;
            }
        }
        // Flush the last (possibly empty) byte.
        bit_vec[curr_index] = flag;

        // Sanity check: popcount of the bit vector must equal the
        // number of expressed genes.
        let mut num_ones = 0;
        for bits in bit_vec.iter() {
            num_ones += bits.count_ones();
        }
        // Fixed: `assert!(cond, format!(..))` is a non-literal panic
        // message (error in edition 2021); pass the format args directly.
        assert!(num_ones as usize == num_exp_genes,
                "{:?} {:?}", num_ones, num_exp_genes);

        bit_vecs.push(bit_vec);
    }
}
--------------------------------------------------------------------------------