├── .gitignore
├── LICENSE
├── README.md
├── benchmarks
    ├── data
    │   └── scripts
    │   │   ├── generate.sh
    │   │   └── makeLoom.py
    ├── memory.jpg
    ├── parsers
    │   ├── readCsv.R
    │   ├── readEds.R
    │   ├── readH5.R
    │   ├── readLoom.R
    │   └── readMtx.R
    ├── profile.sh
    ├── size.jpg
    ├── stats
    │   ├── ltime.tex
    │   ├── memory.tex
    │   └── size.tex
    └── time.jpg
├── eds.jpg
├── src-cpp
    └── readEDS.cpp
└── src-rs
    ├── Cargo.toml
    └── src
        ├── csv.rs
        ├── eds.rs
        ├── h5.rs
        ├── main.rs
        ├── mtx.rs
        └── utils.rs


/.gitignore:
--------------------------------------------------------------------------------
 1 | # Generated by Cargo
 2 | # will have compiled files and executables
 3 | /target/
 4 | 
 5 | # Remove Cargo.lock from gitignore if creating an executable, leave it for libraries
 6 | # More information here https://doc.rust-lang.org/cargo/guide/cargo-toml-vs-cargo-lock.html
 7 | Cargo.lock
 8 | 
 9 | # These are backup files generated by rustfmt
10 | **/*.rs.bk
11 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | BSD 3-Clause License
 2 | 
 3 | Copyright (c) 2019, COMBINE-lab
 4 | All rights reserved.
 5 | 
 6 | Redistribution and use in source and binary forms, with or without
 7 | modification, are permitted provided that the following conditions are met:
 8 | 
 9 | 1. Redistributions of source code must retain the above copyright notice, this
10 |    list of conditions and the following disclaimer.
11 | 
12 | 2. Redistributions in binary form must reproduce the above copyright notice,
13 |    this list of conditions and the following disclaimer in the documentation
14 |    and/or other materials provided with the distribution.
15 | 
16 | 3. Neither the name of the copyright holder nor the names of its
17 |    contributors may be used to endorse or promote products derived from
18 |    this software without specific prior written permission.
19 | 
20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | ## What's EDS ?
 2 | EDS is an accronym for Efficient single cell binary Data Storage format for the cell-feature count matrices.
 3 | 
 4 | ![EDS](https://github.com/COMBINE-lab/EDS/blob/master/eds.jpg)
 5 | 
 6 | ## Why we need a new storage format ?
 7 | Recent advancements in single-cell technologies have seen rapid increase in the amount of data. Most single-cell studies generate a cell by feature (can be gene) count matrices, where the number of cells are now reaching towards millions. Traditional Single-cell quantification pipelines use matrix market exchange (mtx) format (sometimes gzipped) for sharing the count matrices. However, the textual representation of mtx format makes it bigger in size compared to a compressed binary format. Our quantification tool [alevin](https://combine-lab.github.io/alevin-tutorial/) dumps the output in EDS format which saves storage space.
 8 | 
 9 | 
10 | ## What are the caveats ?
11 | There are other formats (such as [loom](https://github.com/linnarsson-lab/loompy)) which are designed for optimizing the query of the matrix. EDS is primarily designed to improve the storage efficiency rather than query and currently don't support random access to a cell (row).
12 | 
13 | ## How to convert eds to mtx format ?
14 | We have a simple rust code inside the `src-rs`, it can be installed using `cargo build --release` and can be used as `./target/release/eds convert -i <input gzipped file currently [eds.gz | mtx.gz]> --[mtx | eds | h5 | csv] -c <num_cells> -f <num_features>`.
15 | 
16 | ## Benchmarks
17 | * Size on disk.
18 | ![Disk Space](https://github.com/COMBINE-lab/EDS/blob/master/benchmarks/size.jpg)
19 | 
20 | * Matrix loading into memory time.
21 | ![Loading time](https://github.com/COMBINE-lab/EDS/blob/master/benchmarks/time.jpg)
22 | 
23 | * Memory required to load the matrix.
24 | ![Memory Usage](https://github.com/COMBINE-lab/EDS/blob/master/benchmarks/memory.jpg)
25 | 
26 | ## Future 
27 | - [ ] Support delayedArray R object
28 | - [ ] Random access through `EDS index`
29 | 
30 | ## Contributors
31 | - Avi Srivastava
32 | - Mike Love
33 | - Rob Patro
34 | 


--------------------------------------------------------------------------------
/benchmarks/data/scripts/generate.sh:
--------------------------------------------------------------------------------
 1 | name=$1
 2 | cells=$2
 3 | feats=$3
 4 | 
 5 | bin="/mnt/scratch1/avi/anton/alevin_r/EDS/src-rs/target/release/eds"
 6 | dpath="/mnt/scratch1/avi/anton/alevin_r/EDS/benchmarks/data/"$name"/quants_mat.eds.gz"
 7 | RUST_BACKTRACE=1 RUST_LOG=trace $bin convert -i $dpath --csv -c $cells -f $feats
 8 | RUST_BACKTRACE=1 RUST_LOG=trace $bin convert -i $dpath --mtx -c $cells -f $feats
 9 | RUST_BACKTRACE=1 RUST_LOG=trace $bin convert -i $dpath --h5 -c $cells -f $feats
10 | 


--------------------------------------------------------------------------------
/benchmarks/data/scripts/makeLoom.py:
--------------------------------------------------------------------------------
 1 | import gzip
 2 | import loompy
 3 | import numpy as np
 4 | import sys
 5 | from scipy.io import mmread
 6 | 
 7 | data = sys.argv[1]
 8 | mtx_file = "/mnt/scratch1/avi/anton/alevin_r/EDS/benchmarks/data/" + data + "/quants_mat.mtx.gz"
 9 | out_file = "/mnt/scratch1/avi/anton/alevin_r/EDS/benchmarks/data/" + data + "/quants_mat.loom"
10 | 
11 | data = mmread( gzip.open(mtx_file) )
12 | (cells, feats) = data.shape
13 | 
14 | data = data.T
15 | 
16 | row_names = {}
17 | row_names['rname'] = np.array(range(feats))
18 | 
19 | col_names = {}
20 | col_names['cname'] = np.array( range(cells) )
21 | 
22 | loompy.create(out_file, data, row_names, col_names)
23 | 


--------------------------------------------------------------------------------
/benchmarks/memory.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/COMBINE-lab/EDS/a66a0b4ec96c25ea6f2b25f1566e6c089aff6cda/benchmarks/memory.jpg


--------------------------------------------------------------------------------
/benchmarks/parsers/readCsv.R:
--------------------------------------------------------------------------------
 1 | args = commandArgs(trailingOnly=TRUE)
 2 | 
 3 | data <- args[1]
 4 | fpath <- paste0("/mnt/scratch1/avi/anton/alevin_r/EDS/benchmarks/data/", data, "/quants_mat.csv.gz")
 5 | 
 6 | system.time({
 7 | csv <- read.table( gzfile( fpath ), sep="," )
 8 | })
 9 | 
10 | print(dim(csv))
11 | 


--------------------------------------------------------------------------------
/benchmarks/parsers/readEds.R:
--------------------------------------------------------------------------------
 1 | library(Rcpp)
 2 | library(Matrix)
 3 | 
 4 | args = commandArgs(trailingOnly=TRUE)
 5 | sourceCpp("/mnt/scratch1/avi/anton/alevin_r/EDS/src-cpp/readEDS.cpp")
 6 | 
 7 | data <- args[1]
 8 | num.cells <- as.integer(args[2])
 9 | num.genes <- as.integer(args[3])
10 | fpath <- paste0("/mnt/scratch1/avi/anton/alevin_r/EDS/benchmarks/data/", data, "/quants_mat.eds.gz")
11 | 
12 | system.time({
13 | pos <- getSparseMatrix( num.genes, num.cells, fpath )
14 | })
15 | 
16 | str(pos)
17 | 


--------------------------------------------------------------------------------
/benchmarks/parsers/readH5.R:
--------------------------------------------------------------------------------
 1 | library(hdf5r)
 2 | library(Matrix)
 3 | 
 4 | args = commandArgs(trailingOnly=TRUE)
 5 | data <- args[1]
 6 | fpath <- paste0("/mnt/scratch1/avi/anton/alevin_r/EDS/benchmarks/data/", data, "/quants_mat.h5")
 7 | 
 8 | system.time({
 9 | infile <- hdf5r::H5File$new(filename = fpath, mode = 'r')
10 | genome <- "/matrix"
11 | 
12 | counts <- infile[[paste0(genome, '/data')]]
13 | indices <- infile[[paste0(genome, '/indices')]]
14 | indptr <- infile[[paste0(genome, '/indptr')]]
15 | shp <- infile[[paste0(genome, '/shape')]]
16 | 
17 | sparse.mat <- sparseMatrix(
18 |   i = indices[] + 1,
19 |   p = indptr[],
20 |   x = as.numeric(x = counts[]),
21 |   dims = shp[],
22 |   giveCsparse = TRUE
23 | )
24 | })
25 | 
26 | str(sparse.mat)
27 | 


--------------------------------------------------------------------------------
/benchmarks/parsers/readLoom.R:
--------------------------------------------------------------------------------
 1 | library(loomR)
 2 | 
 3 | args = commandArgs(trailingOnly=TRUE)
 4 | # inparts taked from https://satijalab.org/loomR/loomR_tutorial.html
 5 | data <- args[1]
 6 | fpath <- paste0("/mnt/scratch1/avi/anton/alevin_r/EDS/benchmarks/data/", data, "/quants_mat.loom")
 7 | 
 8 | system.time({ 
 9 | lfile <- connect(filename = fpath, mode = "r+")
10 | full.matrix <- lfile$matrix[, ]
11 | })
12 | 
13 | dim( full.matrix )
14 | 


--------------------------------------------------------------------------------
/benchmarks/parsers/readMtx.R:
--------------------------------------------------------------------------------
 1 | library(Matrix)
 2 | 
 3 | args = commandArgs(trailingOnly=TRUE)
 4 | data <- args[1]
 5 | fpath <- paste0("/mnt/scratch1/avi/anton/alevin_r/EDS/benchmarks/data/", data, "/quants_mat.mtx.gz")
 6 | 
 7 | system.time({
 8 | mm <- readMM( gzfile( fpath ) )
 9 | })
10 | 
11 | print(dim(mm))
12 | 


--------------------------------------------------------------------------------
/benchmarks/profile.sh:
--------------------------------------------------------------------------------
 1 | datas=("neurons_450k_random" "pbmc_40k_random" "neurons_1m" "neurons_900" "neurons_2k" "pbmc_4k" "pbmc_8k" "neurons_9k" "pbmc_40k" "neurons_450k")
 2 | cells=(456400 43400 1000000 931 2022 4340 8381 9128 43400 456400)
 3 | feats=(50686 58278 27998 50686 50686 58278 58278 50686 58278 50686)
 4 | 
 5 | for id in {0..0}; do
 6 | 	data=${datas[$id]}
 7 | 	cell=${cells[$id]}
 8 | 	feat=${feats[$id]}
 9 | 
10 | 	echo $data $cell $feat
11 | 	echo "EDS"
12 | 	/usr/bin/time Rscript --vanilla parsers/readEds.R $data $cell $feat &&
13 | 	
14 | 	echo "H5" &&
15 | 	/usr/bin/time Rscript --vanilla parsers/readH5.R $data &&
16 | 
17 | 	echo "Mtx" &&
18 | 	/usr/bin/time Rscript --vanilla parsers/readMtx.R $data &&
19 | 
20 | 	echo "loom" &&
21 | 	/usr/bin/time Rscript --vanilla parsers/readLoom.R $data &&
22 | 
23 | 	echo "CSV" 
24 | 	/usr/bin/time Rscript --vanilla parsers/readCsv.R $data
25 | done
26 | 
27 | 


--------------------------------------------------------------------------------
/benchmarks/size.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/COMBINE-lab/EDS/a66a0b4ec96c25ea6f2b25f1566e6c089aff6cda/benchmarks/size.jpg


--------------------------------------------------------------------------------
/benchmarks/stats/ltime.tex:
--------------------------------------------------------------------------------
 1 | \begin{center}
 2 |  \begin{tabular}{||c || c | c | c | c | c | c | c | c | c | c ||}
 3 |  \hline
 4 |  \vtop{\hbox{\strut Loading}\hbox{\strut (Second)}} & \vtop{\hbox{\strut neurons}\hbox{\strut 900}} & \vtop{\hbox{\strut neurons}\hbox{\strut 2k}} & \vtop{\hbox{\strut pbmc}\hbox{\strut 4k}} & \vtop{\hbox{\strut pbmc}\hbox{\strut 8k}} & \vtop{\hbox{\strut neurons}\hbox{\strut 9k}} & \vtop{\hbox{\strut pbmc}\hbox{\strut 4k*10}} & \vtop{\hbox{\strut rand}\hbox{\strut 4k*10}} & \vtop{\hbox{\strut neurons}\hbox{\strut 9k*50}} & \vtop{\hbox{\strut rand}\hbox{\strut 9k*50}} & \vtop{\hbox{\strut neurons}\hbox{\strut 1M}} \\ [0.5ex]
 5 |  \hline\hline
 6 |  eds.gz & 0.28 & 0.86 & 1.49 & 2.30 & 3.24 & 10.04 & 9.93 & 119.61 & 112.88 & 159.77 \\
 7 |  \hline
 8 |  csv.gz & 28.92 & 40.15 & 75.30 & 134.44 & 114.96 & 680.1 & 439.47 & 6768.7 & 6524.1 & N/A \\
 9 |  \hline
10 |  h5 & 0.83 & 1.56 & 1.65 & 3.88 & 5.72 & 10.11 & 10.20 & 276.36 & 338.91 & 705.56 \\
11 |  \hline
12 |  mtx.gz & 1.93 & 2.37 & 3.53 & 6.87 & 12.45 & 26.84 & 23.60 & 536.91 & 489.76 & 904.35 \\
13 |  \hline
14 |  loom & 0.98 & 1.82 & 4.08 & 7.67 & 8.05 & 44.82 & 36.92 & 400.99 & 378.02 & N/A \\ [1ex]
15 |  \hline
16 | \end{tabular}
17 | \end{center}
18 | 


--------------------------------------------------------------------------------
/benchmarks/stats/memory.tex:
--------------------------------------------------------------------------------
 1 | \begin{center}
 2 |  \begin{tabular}{||c || c | c | c | c | c | c | c | c | c | c ||}
 3 |  \hline
 4 |  \vtop{\hbox{\strut Memory}\hbox{\strut (Gb)}} & \vtop{\hbox{\strut neurons}\hbox{\strut 900}} & \vtop{\hbox{\strut neurons}\hbox{\strut 2k}} & \vtop{\hbox{\strut pbmc}\hbox{\strut 4k}} & \vtop{\hbox{\strut pbmc}\hbox{\strut 8k}} & \vtop{\hbox{\strut neurons}\hbox{\strut 9k}} & \vtop{\hbox{\strut pbmc}\hbox{\strut 4k*10}} & \vtop{\hbox{\strut rand}\hbox{\strut 4k*10}} & \vtop{\hbox{\strut neurons}\hbox{\strut 9k*50}} & \vtop{\hbox{\strut rand}\hbox{\strut 9k*50}} & \vtop{\hbox{\strut neurons}\hbox{\strut 1M}} \\ [0.5ex]
 5 |  \hline\hline
 6 |  eds.gz & 0.32 & 0.33 & 0.33 & 0.36 & 0.49 & 0.78 & 0.77 & 11.5 & 11.5 & 22.9 \\
 7 |  \hline
 8 |  csv.gz & 0.73 & 2.40 & 4.80 & 9.87 & 8.14 & 42.12 & 41.08 & 322 & 317.2 & N/A \\
 9 |  \hline
10 |  h5 & 0.47 & 0.59 & 0.69 & 1.10 & 1.76 & 3.17 & 3.12 & 56.8 & 56.77 & 114.29 \\
11 |  \hline
12 |  mtx.gz & 0.37 & 0.38 & 0.48 & 0.74 & 1.10 & 1.75 & 1.72 & 30.3 & 30.33 & 61 \\
13 |  \hline
14 |  loom & 0.68 & 1.13 & 2.47 & 4.53 & 4.30 & 22.447 & 22.40 & 203.3 & 203.2 & N/A \\ [1ex]
15 |  \hline
16 | \end{tabular}
17 | \end{center}
18 | 


--------------------------------------------------------------------------------
/benchmarks/stats/size.tex:
--------------------------------------------------------------------------------
 1 | \begin{center}
 2 |  \begin{tabular}{||c || c | c | c | c | c | c | c | c | c | c ||}
 3 |  \hline
 4 |  Size & \vtop{\hbox{\strut neurons}\hbox{\strut 900}} & \vtop{\hbox{\strut neurons}\hbox{\strut 2k}} & \vtop{\hbox{\strut pbmc}\hbox{\strut 4k}} & \vtop{\hbox{\strut pbmc}\hbox{\strut 8k}} & \vtop{\hbox{\strut neurons}\hbox{\strut 9k}} & \vtop{\hbox{\strut pbmc}\hbox{\strut 4k*10}} & \vtop{\hbox{\strut rand}\hbox{\strut 4k*10}} & \vtop{\hbox{\strut neurons}\hbox{\strut 9k*50}} & \vtop{\hbox{\strut rand}\hbox{\strut 9k*50}} & \vtop{\hbox{\strut neurons}\hbox{\strut 1M}} \\ [0.5ex]
 5 |  \hline\hline
 6 |  eds.gz & 5.4M & 9.8M & 15M & 29M & 42M & 108M & 116M & 1.8G & 1.9G & 2.6G \\
 7 |  \hline
 8 |  csv.gz & 6.6M & 13M & 19M & 37M & 51M & 142M & 142M & 2.3G & 2.3G & 3.1G \\
 9 |  \hline
10 |  h5 & 7.6M & 14M & 18M & 36M & 59M & 97M & 135M & 1.8G & 2.5G & 4.0G \\
11 |  \hline
12 |  mtx.gz & 11M & 20M & 26M & 52M & 86M & 186M & 192M & 3.6G & 3.6G & 5.8G \\
13 |  \hline
14 |  loom & 12M & 23M & 40M & 78M & 97M & 297M & 355M & 3.6G & 4.8G & 6.1G \\ [1ex]
15 |  \hline
16 | \end{tabular}
17 | \end{center}
18 | 


--------------------------------------------------------------------------------
/benchmarks/time.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/COMBINE-lab/EDS/a66a0b4ec96c25ea6f2b25f1566e6c089aff6cda/benchmarks/time.jpg


--------------------------------------------------------------------------------
/eds.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/COMBINE-lab/EDS/a66a0b4ec96c25ea6f2b25f1566e6c089aff6cda/eds.jpg


--------------------------------------------------------------------------------
/src-cpp/readEDS.cpp:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Alevin Efficient Data Storage (EDS) reader
  3 |  *
  4 |  * Author: Avi Srivastava
  5 |  * Last modified: August 13, 2019
  6 |  * License: LGPL (>= 3)
  7 |  *
  8 |  */
  9 | 
 10 | #include <Rcpp.h>
 11 | #include <zlib.h>
 12 | 
 13 | using namespace Rcpp;
 14 | 
 15 | // C++ internal function to figure out the spaces to reserve
 16 | size_t getReserveSpaces(size_t numOfGenes, size_t numOfOriginalCells,
 17 |                       Rcpp::IntegerVector& bitVecLengths,
 18 |                       std::string& countMatFilename) {
 19 | 
 20 |   // opening gzipped compressed stream
 21 |   gzFile fileHandler = gzopen(countMatFilename.c_str(), "rb") ;
 22 |   
 23 |   // We are storing the bit vector in u8 so total number of u8 = numGenes/8
 24 |   size_t numFlags = std::ceil(numOfGenes / 8.0);
 25 | 
 26 |   // vector for storing the bitvector flags
 27 |   std::vector<uint8_t> alphasFlag (numFlags, 0);
 28 | 
 29 |   // getting the sizs of u8 and float 32
 30 |   size_t flagSize = sizeof(decltype(alphasFlag)::value_type);
 31 |   size_t elSize = sizeof(float);
 32 |   size_t totalSpace { 0 };
 33 | 
 34 |   // iterating over cells
 35 |   for (size_t cellId = 0 ; cellId < numOfOriginalCells ; ++cellId) {
 36 |     // reading bitvectors
 37 |     gzread(fileHandler, reinterpret_cast<char*>(alphasFlag.data()), flagSize * numFlags);
 38 |     size_t numOfExpGenes { 0 };
 39 | 
 40 |     for (size_t j = 0; j < alphasFlag.size(); j++) {
 41 |       uint8_t flag = alphasFlag[j];
 42 | 
 43 |       for (size_t i = 0; i < 8; i++){
 44 |         // counting positions only if the flag is set
 45 |         if (flag & (128 >> i)) {
 46 |           numOfExpGenes += 1;
 47 |         }
 48 |       }
 49 |     }
 50 | 
 51 |     // skipping the expression values and saving the counts for numOfExpGenes
 52 |     gzseek(fileHandler, elSize * numOfExpGenes, SEEK_CUR);
 53 |     bitVecLengths[ cellId + 1 ] = ( numOfExpGenes + bitVecLengths[ cellId ] );
 54 |     totalSpace += numOfExpGenes;
 55 |   }
 56 | 
 57 |   return totalSpace;
 58 | }
 59 | 
 60 | // [[Rcpp::export]]
 61 | SEXP getSparseMatrix(size_t numOfGenes, size_t numOfOriginalCells, std::string countMatFilename) {
 62 |   Rcpp::S4 mat("dgCMatrix");
 63 |   
 64 |   // initializing vector to store bitvecSpaces
 65 |   Rcpp::IntegerVector bitVecLengths(numOfOriginalCells + 1, 0);
 66 |   size_t totalSpace = getReserveSpaces( numOfGenes, numOfOriginalCells, 
 67 |                                         bitVecLengths, countMatFilename );
 68 | 
 69 |   // initializing sparse matrix
 70 |   typedef Rcpp::NumericVector ValuesT;
 71 |   ValuesT values(totalSpace, 0.0);
 72 | 
 73 |   // initializing sparse matrix indices
 74 |   typedef Rcpp::IntegerVector IndicesT;
 75 |   IndicesT indices(totalSpace, 0);
 76 | 
 77 |   // opening gzipped compressed stream
 78 |   gzFile fileHandler = gzopen(countMatFilename.c_str(), "rb") ;
 79 |   
 80 |   // We are storing the bit vector in u8 so total number of u8 = numGenes/8
 81 |   size_t numFlags = std::ceil(numOfGenes / 8.0);
 82 | 
 83 |   // vector for storing the bitvector flags
 84 |   std::vector<uint8_t> alphasFlag (numFlags, 0);
 85 | 
 86 |   // getting the sizs of u8 and float 32
 87 |   size_t flagSize = sizeof(decltype(alphasFlag)::value_type);
 88 |   size_t elSize = sizeof(float);
 89 | 
 90 |   size_t valCounter { 0 };
 91 |   // iterating over cells
 92 |   for (size_t cellId = 0 ; cellId < numOfOriginalCells ; ++cellId) {
 93 |     // reading bitvectors
 94 |     gzread(fileHandler, reinterpret_cast<char*>(alphasFlag.data()), flagSize * numFlags);
 95 | 
 96 |     // iterating over u8 flags for bitvectors
 97 |     size_t numExpGenes { 0 };
 98 |     for (size_t j = 0; j < alphasFlag.size(); j++) {
 99 |       uint8_t flag = alphasFlag[j];
100 | 
101 |       for (size_t i = 0; i < 8; i++){
102 |         // extracting positions only if the flag is set
103 |         if (flag & (128 >> i)) {
104 |           if ( valCounter + numExpGenes >= totalSpace ) { 
105 |             return Rcpp::List(); 
106 |           }
107 |           
108 |           size_t offset = i + (8 * j);
109 |           indices[ valCounter + numExpGenes ] = offset;
110 |           numExpGenes += 1;
111 |         }
112 |       }
113 |     }
114 |     
115 |     // reading in the expression
116 |     std::vector<float> alphasSparse(numExpGenes);
117 |     gzread(fileHandler, reinterpret_cast<char*>(alphasSparse.data()), elSize * numExpGenes);
118 | 
119 |     // saving the positions and expression
120 |     for (size_t i = 0; i < numExpGenes; i++) {
121 |       if ( valCounter >= totalSpace ) {
122 |         return Rcpp::List();
123 |       }
124 | 
125 |       values[valCounter] = alphasSparse[i];
126 |       valCounter += 1;
127 |     }
128 |   }
129 | 
130 |   // code in-parts taken from https://github.com/LTLA/beachmat/blob/master/inst/include/beachmat/output/Csparse_writer.h#L268
131 |   mat.slot("Dim") = Rcpp::IntegerVector::create(numOfGenes, numOfOriginalCells);
132 | 
133 |   // Setting p
134 |   mat.slot("p") = bitVecLengths;
135 | 
136 |   // Setting 'x'.
137 |   mat.slot("x") = values;
138 | 
139 |   // Setting 'i'.
140 |   mat.slot("i") = indices;
141 | 
142 |   return SEXP(mat);
143 | }
144 | 
145 | 


--------------------------------------------------------------------------------
/src-rs/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [package]
 2 | name = "eds"
 3 | version = "0.1.0"
 4 | authors = ["avisrivastava <asrivastava@cs.stonybrook.edu>, Mike Love, Rob Patro"]
 5 | edition = "2018"
 6 | 
 7 | [dependencies]
 8 | pretty_env_logger = "0.3.0"
 9 | clap = "2.33.0"
10 | log = "0.4.6"
11 | byteorder = "1.3.1"
12 | flate2 = "1.0.7"
13 | libmath = "0.2.1"
14 | hdf5 = "0.5.2"
15 | rand = "0.7.0"


--------------------------------------------------------------------------------
/src-rs/src/csv.rs:
--------------------------------------------------------------------------------
 1 | use flate2::write::GzEncoder;
 2 | use flate2::Compression;
 3 | use std::fs::File;
 4 | use std::io;
 5 | use std::io::Write;
 6 | 
 7 | pub fn writer(
 8 |     path_str: String,
 9 |     expressions: Vec<Vec<f32>>,
10 |     bit_vecs: Vec<Vec<u8>>,
11 |     _num_cells: usize,
12 |     num_features: usize,
13 | ) -> Result<bool, io::Error> {
14 |     let file_handle = File::create(path_str)?;
15 |     let mut file = GzEncoder::new(file_handle, Compression::default());
16 | 
17 |     let mut header = "\"\"".to_string();
18 |     for gid in 1..num_features + 1 {
19 |         header.push_str(&format!(",gene{}", gid));
20 |     }
21 |     header.push_str(&format!("\n"));
22 |     file.write_all(header.as_bytes())?;
23 | 
24 |     let mut mtx_data: String;
25 |     assert!(
26 |         bit_vecs.len() == expressions.len(),
27 |         "length of bit vec and expression is not same"
28 |     );
29 |     for (cell_id, exp) in expressions.into_iter().enumerate() {
30 |         let bit_vec = &bit_vecs[cell_id];
31 |         let mut fids: Vec<usize> = Vec::new();
32 | 
33 |         for (feature_id, flag) in bit_vec.into_iter().enumerate() {
34 |             if *flag != 0 {
35 |                 for (offset, j) in format!("{:8b}", flag).chars().enumerate() {
36 |                     match j {
37 |                         '1' => fids.push((8 * feature_id) + offset),
38 |                         _ => (),
39 |                     };
40 |                 }
41 |             }
42 |         }
43 | 
44 |         assert!(
45 |             fids.len() == exp.len(),
46 |             format!("#positions {} doesn't match with #expressed features {}",
47 |                     fids.len(), exp.len())
48 |         );
49 |         mtx_data = format!("cell{}", cell_id + 1);
50 |         let mut zero_counter = 0;
51 |         for (index, count) in exp.into_iter().enumerate() {
52 |             assert!(
53 |                 fids[index] < num_features,
54 |                 format!("{} position > {}", fids[index], num_features)
55 |             );
56 | 
57 |             while zero_counter != fids[index] {
58 |                 zero_counter += 1;
59 |                 mtx_data.push_str(&format!(",0"));
60 |             }
61 | 
62 |             zero_counter += 1;
63 |             mtx_data.push_str(&format!(",{}", count));
64 |         }
65 | 
66 |         while zero_counter < num_features {
67 |             zero_counter += 1;
68 |             mtx_data.push_str(&format!(",0"));
69 |         }
70 | 
71 |         mtx_data.push_str(&format!("\n"));
72 |         file.write_all(mtx_data.as_bytes())?;
73 |     }
74 | 
75 |     Ok(true)
76 | }
77 | 


--------------------------------------------------------------------------------
/src-rs/src/eds.rs:
--------------------------------------------------------------------------------
 1 | use std::fs::File;
 2 | use std::io;
 3 | use std::io::{Read, Write};
 4 | 
 5 | use byteorder::{ByteOrder, LittleEndian};
 6 | use flate2::read::GzDecoder;
 7 | use math::round;
 8 | 
 9 | use flate2::write::GzEncoder;
10 | use flate2::Compression;
11 | 
12 | pub fn reader(
13 |     input: &str,
14 |     num_cells: usize,
15 |     num_genes: usize,
16 |     expr: &mut Vec<Vec<f32>>,
17 |     bit_vecs: &mut Vec<Vec<u8>>,
18 | ) -> Result<bool, io::Error> {
19 |     info!("Using {} as input EDS file\n", input);
20 |     info!(
21 |         "Using {} Rows (cells) and {} Columns (features)",
22 |         num_cells, num_genes
23 |     );
24 | 
25 |     let num_bit_vecs: usize = round::ceil(num_genes as f64 / 8.0, 0) as usize;
26 |     let mut total_molecules = 0.0;
27 |     let mut total_exp_values = 0;
28 | 
29 |     {
30 |         let mut count = 0;
31 |         let file_handle = File::open(input)?;
32 |         let mut file = GzDecoder::new(file_handle);
33 | 
34 |         for _ in 0..num_cells {
35 |             let mut bit_vec = vec![0; num_bit_vecs];
36 |             file.read_exact(&mut bit_vec[..])?;
37 |             let mut num_ones = 0;
38 |             for bits in bit_vec.iter() {
39 |                 num_ones += bits.count_ones();
40 |             }
41 |             bit_vecs.push(bit_vec);
42 | 
43 |             let mut expression: Vec<u8> = vec![0; 4 * (num_ones as usize)];
44 |             let mut float_buffer: Vec<f32> = vec![0.0_f32; num_ones as usize];
45 |             file.read_exact(&mut expression[..])?;
46 |             LittleEndian::read_f32_into(&expression, &mut float_buffer);
47 | 
48 |             let cell_count: f32 = float_buffer.iter().sum();
49 |             total_molecules += cell_count;
50 |             expr.push(float_buffer);
51 | 
52 |             count += 1;
53 |             total_exp_values += num_ones;
54 |             if count % 100 == 0 {
55 |                 print!("\r Done Reading {} cells", count);
56 |                 io::stdout().flush()?;
57 |             }
58 |         }
59 |     }
60 | 
61 |     println!("\n");
62 |     assert!(
63 |         expr.len() == num_cells,
64 |         "rows and quants file size mismatch"
65 |     );
66 | 
67 |     info!("Found Total {:.2} molecules", total_molecules);
68 |     info!("Found Total {:.2} expressed entries", total_exp_values);
69 |     info!(
70 |         "w/ {:.2} Molecules/cell",
71 |         total_molecules / num_cells as f32
72 |     );
73 |     Ok(true)
74 | }
75 | 
76 | pub fn writer(
77 |     path_str: String,
78 |     expressions: Vec<Vec<f32>>,
79 |     bit_vecs: Vec<Vec<u8>>,
80 |     _num_cells: usize,
81 |     _num_features: usize,
82 | ) -> Result<bool, io::Error> {
83 |     let file_handle = File::create(path_str)?;
84 |     let mut file = GzEncoder::new(file_handle, Compression::default());
85 | 
86 |     assert!(expressions.len() == bit_vecs.len());
87 |     for (exp, bvec) in expressions.into_iter().zip(bit_vecs.into_iter()) {
88 |         file.write_all(&bvec)?;
89 | 
90 |         let mut bin_exp: Vec<u8> = vec![0_u8; exp.len() * 4];
91 |         LittleEndian::write_f32_into(&exp, &mut bin_exp);
92 |         file.write_all(&bin_exp)?;
93 |     }
94 | 
95 |     Ok(true)
96 | }
97 | 


--------------------------------------------------------------------------------
/src-rs/src/h5.rs:
--------------------------------------------------------------------------------
  1 | use hdf5;
  2 | use std::io;
  3 | 
  4 | pub fn writer(
  5 |     path_str: String,
  6 |     expressions: Vec<Vec<f32>>,
  7 |     bit_vecs: Vec<Vec<u8>>,
  8 |     num_cells: usize,
  9 |     num_features: usize,
 10 | ) -> Result<bool, io::Error> {
 11 |     let file = hdf5::File::open(path_str, "w").expect("can't create output file");
 12 | 
 13 |     let group = file
 14 |         .create_group("matrix")
 15 |         .expect("can't create group in h5");
 16 | 
 17 |     let shape = group
 18 |         .new_dataset::<u64>()
 19 |         .gzip(6)
 20 |         .create("shape", 2)
 21 |         .expect("can't write shape in h5");
 22 | 
 23 |     shape
 24 |         .write(&[num_features, num_cells])
 25 |         .expect("error writing shape");
 26 | 
 27 |     assert!(
 28 |         bit_vecs.len() == expressions.len(),
 29 |         "length of bit vec and expression is not same"
 30 |     );
 31 | 
 32 |     let total_entries;
 33 |     {
 34 |         let mut cumm_sum = 0;
 35 |         let mut indptr_vals: Vec<u32> = vec![cumm_sum];
 36 |         for exp in expressions.iter() {
 37 |             cumm_sum += exp.len() as u32;
 38 |             indptr_vals.push(cumm_sum);
 39 |         }
 40 | 
 41 |         total_entries = indptr_vals.last().expect("indptr empty").clone();
 42 |         assert!(
 43 |             indptr_vals.len() == num_cells + 1,
 44 |             "num cells doesn't match"
 45 |         );
 46 | 
 47 |         let indptr = group
 48 |             .new_dataset::<u32>()
 49 |             .gzip(6)
 50 |             .create("indptr", indptr_vals.len())
 51 |             .expect("can't write indptr in h5");
 52 | 
 53 |         indptr
 54 |             .write_raw(&indptr_vals)
 55 |             .expect("error writing indptr");
 56 |     } // end writing indptr
 57 | 
 58 |     {
 59 |         let data = group
 60 |             .new_dataset::<f32>()
 61 |             .gzip(6)
 62 |             .create("data", total_entries as usize)
 63 |             .expect("can't write data in h5");
 64 | 
 65 |         let flatten_data: Vec<f32> = expressions
 66 |             .iter()
 67 |             .flat_map(|array| array.iter())
 68 |             .cloned()
 69 |             .collect();
 70 | 
 71 |         assert!(
 72 |             flatten_data.len() == total_entries as usize,
 73 |             "different number of entries"
 74 |         );
 75 |         data.write_raw(&flatten_data).expect("can't write data");
 76 |     } // end writing data
 77 | 
 78 |     {
 79 |         let indices = group
 80 |             .new_dataset::<u32>()
 81 |             .gzip(6)
 82 |             .create("indices", total_entries as usize)
 83 |             .expect("can't write positions in h5");
 84 | 
 85 |         let mut positions: Vec<u32> = Vec::new();
 86 |         for bit_vec in bit_vecs {
 87 |             for (feature_id, flag) in bit_vec.into_iter().enumerate() {
 88 |                 if flag != 0 {
 89 |                     for (offset, j) in format!("{:8b}", flag).chars().enumerate() {
 90 |                         match j {
 91 |                             '1' => positions.push((8 * feature_id) as u32 + offset as u32),
 92 |                             _ => (),
 93 |                         };
 94 |                     }
 95 |                 }
 96 |             }
 97 |         } // end-for
 98 | 
 99 |         assert!(
100 |             positions.len() == total_entries as usize,
101 |             "different number of entries"
102 |         );
103 |         indices.write_raw(&positions).expect("can't write indices");
104 |     } // end writing indices
105 | 
106 |     Ok(true)
107 | }
108 | 


--------------------------------------------------------------------------------
/src-rs/src/main.rs:
--------------------------------------------------------------------------------
  1 | extern crate byteorder;
  2 | extern crate clap;
  3 | extern crate flate2;
  4 | extern crate hdf5;
  5 | extern crate math;
  6 | extern crate pretty_env_logger;
  7 | 
  8 | #[macro_use]
  9 | extern crate log;
 10 | 
 11 | mod csv;
 12 | mod eds;
 13 | mod h5;
 14 | mod mtx;
 15 | mod utils;
 16 | 
 17 | use clap::{App, Arg, ArgMatches, SubCommand};
 18 | use std::io;
 19 | use utils::FileType;
 20 | 
 21 | fn randomize_file(sub_m: &ArgMatches) -> Result<(), io::Error> {
 22 |     let input_file_path = sub_m.value_of("input").unwrap();
 23 |     let output_file_type = FileType::Dummy(".random".to_string());
 24 | 
 25 |     let (input_file_type, output_file_path) =
 26 |         utils::get_output_path(input_file_path, output_file_type.clone());
 27 | 
 28 |     let num_cells: usize = sub_m
 29 |         .value_of("cells")
 30 |         .expect("can't find #cells")
 31 |         .parse()
 32 |         .unwrap();
 33 | 
 34 |     let num_features = sub_m
 35 |         .value_of("features")
 36 |         .expect("can't find #features")
 37 |         .parse()
 38 |         .unwrap();
 39 | 
 40 |     let (bit_vecs, alphas) = utils::read_file(input_file_path,
 41 |                                               input_file_type.clone(),
 42 |                                               num_cells,
 43 |                                               num_features)?;
 44 | 
 45 |     let (bit_vecs, alphas) = utils::randomize( bit_vecs, alphas )?;
 46 |     utils::write_file( output_file_path, input_file_type,
 47 |                        bit_vecs, alphas, num_cells, num_features)?;
 48 | 
 49 |     info!("All Done!");
 50 |     Ok(())
 51 | }
 52 | 
 53 | fn convert_file(sub_m: &ArgMatches) -> Result<(), io::Error> {
 54 |     let input_file_path = sub_m.value_of("input").unwrap();
 55 |     let output_file_type = utils::find_output_format(sub_m);
 56 | 
 57 |     let (input_file_type, output_file_path) =
 58 |         utils::get_output_path(input_file_path, output_file_type.clone());
 59 | 
 60 |     let num_cells: usize = sub_m
 61 |         .value_of("cells")
 62 |         .expect("can't find #cells")
 63 |         .parse()
 64 |         .unwrap();
 65 | 
 66 |     let num_features = sub_m
 67 |         .value_of("features")
 68 |         .expect("can't find #features")
 69 |         .parse()
 70 |         .unwrap();
 71 | 
 72 |     let (bit_vecs, alphas) = utils::read_file(input_file_path,
 73 |                                               input_file_type,
 74 |                                               num_cells,
 75 |                                               num_features)?;
 76 | 
 77 |     utils::write_file( output_file_path, output_file_type,
 78 |                        bit_vecs, alphas, num_cells, num_features)?;
 79 | 
 80 |     info!("All Done!");
 81 |     Ok(())
 82 | }
 83 | 
 84 | fn main() -> io::Result<()> {
 85 |     let matches = App::new("EDS")
 86 |         .version("0.1.0")
 87 |         .author("Avi Srivastava, Mike Love and Rob Patro")
 88 |         .about("Efficient scData Storage format")
 89 |         .subcommand(
 90 |             SubCommand::with_name("randomize")
 91 |                 .about("randomize the order of cells")
 92 |                 .arg(
 93 |                     Arg::with_name("cells")
 94 |                         .long("cells")
 95 |                         .short("c")
 96 |                         .takes_value(true)
 97 |                         .help("Number of cells"),
 98 |                 )
 99 |                 .arg(
100 |                     Arg::with_name("features")
101 |                         .long("features")
102 |                         .short("f")
103 |                         .takes_value(true)
104 |                         .help("Number of features"),
105 |                 )
106 |                 .arg(
107 |                     Arg::with_name("input")
108 |                         .long("input")
109 |                         .short("i")
110 |                         .takes_value(true)
111 |                         .requires("cells")
112 |                         .requires("features")
113 |                         .help("path to input file"),
114 |                 ),
115 |         )
116 |         .subcommand(
117 |             SubCommand::with_name("convert")
118 |                 .about("comnvert from eds data format to csv or mtx format")
119 |                 .arg(
120 |                     Arg::with_name("mtx")
121 |                         .long("mtx")
122 |                         .conflicts_with("eds")
123 |                         .conflicts_with("csv")
124 |                         .conflicts_with("h5")
125 |                         .help("convert to matrix market exchange file"),
126 |                 )
127 |                 .arg(
128 |                     Arg::with_name("h5")
129 |                         .long("h5")
130 |                         .conflicts_with("eds")
131 |                         .conflicts_with("csv")
132 |                         .conflicts_with("mtx")
133 |                         .help("convert to h5 wrapped csc file"),
134 |                 )
135 |                 .arg(
136 |                     Arg::with_name("csv")
137 |                         .long("csv")
138 |                         .conflicts_with("eds")
139 |                         .conflicts_with("mtx")
140 |                         .conflicts_with("h5")
141 |                         .help("convert to comma separated file"),
142 |                 )
143 |                 .arg(
144 |                     Arg::with_name("eds")
145 |                         .long("eds")
146 |                         .conflicts_with("csv")
147 |                         .conflicts_with("mtx")
148 |                         .conflicts_with("h5")
149 |                         .help("convert to EDS file"),
150 |                 )
151 |                 .arg(
152 |                     Arg::with_name("cells")
153 |                         .long("cells")
154 |                         .short("c")
155 |                         .takes_value(true)
156 |                         .help("Number of cells"),
157 |                 )
158 |                 .arg(
159 |                     Arg::with_name("features")
160 |                         .long("features")
161 |                         .short("f")
162 |                         .takes_value(true)
163 |                         .help("Number of features"),
164 |                 )
165 |                 .arg(
166 |                     Arg::with_name("input")
167 |                         .long("input")
168 |                         .short("i")
169 |                         .takes_value(true)
170 |                         .requires("cells")
171 |                         .requires("features")
172 |                         .help("path to input file"),
173 |                 ),
174 |         )
175 |         .get_matches();
176 | 
177 |     pretty_env_logger::init_timed();
178 |     match matches.subcommand_matches("convert") {
179 |         Some(sub_m) => {
180 |             let ret = convert_file(&sub_m);
181 |             return ret;
182 |         }
183 |         None => (),
184 |     };
185 | 
186 |     match matches.subcommand_matches("randomize") {
187 |         Some(sub_m) => {
188 |             let ret = randomize_file(&sub_m);
189 |             return ret;
190 |         }
191 |         None => (),
192 |     };
193 | 
194 |     Ok(())
195 | }
196 | 


--------------------------------------------------------------------------------
/src-rs/src/mtx.rs:
--------------------------------------------------------------------------------
  1 | use flate2::write::GzEncoder;
  2 | use flate2::read::GzDecoder;
  3 | use flate2::Compression;
  4 | use std::fs::File;
  5 | use std::io;
  6 | use std::collections::HashMap;
  7 | use std::io::{Write, BufReader, BufRead};
  8 | 
  9 | use crate::utils::triplets_to_eds;
 10 | pub fn reader(
 11 |     input: &str,
 12 |     num_cells: usize,
 13 |     num_genes: usize,
 14 |     expr: &mut Vec<Vec<f32>>,
 15 |     bit_vecs: &mut Vec<Vec<u8>>,
 16 | ) -> Result<bool, io::Error> {
 17 |     info!("Using {} as input MTX file\n", input);
 18 |     info!(
 19 |         "Using {} Rows (cells) and {} Columns (features)",
 20 |         num_cells, num_genes
 21 |     );
 22 | 
 23 |     let file_handle = File::open(input)?;
 24 |     let file = BufReader::new( GzDecoder::new(file_handle) );
 25 | 
 26 |     let cell_by_gene = true;
 27 |     let (cell_index, gene_index) = match cell_by_gene {
 28 |         true => (0, 1),
 29 |         false => (1, 0),
 30 |     };
 31 | 
 32 |     let mut found_first = false;
 33 |     let mut triplets: Vec<HashMap<u32, f32>> = vec![ HashMap::new(); num_cells ];
 34 | 
 35 |     for line in file.lines() {
 36 |         let record = line?;
 37 |         if record.chars().nth(0).unwrap() == '%' {
 38 |             continue;
 39 |         }
 40 | 
 41 |         let vals: Vec<&str> = record.split("\t")
 42 |             .collect();
 43 | 
 44 |         let gid = vals[gene_index].parse::<u32>()
 45 |             .expect("can't convert gid");
 46 |         let cid = vals[cell_index].parse::<usize>()
 47 |             .expect("can't convert cid");
 48 |         let value = vals[2].parse::<f32>()
 49 |             .expect("can't convert value");
 50 | 
 51 |         if ! found_first {
 52 |             found_first = true;
 53 | 
 54 |             assert!(num_cells == cid );
 55 |             assert!(num_genes == gid as usize);
 56 |             continue;
 57 |         }
 58 | 
 59 |         triplets[cid - 1].insert(gid - 1, value);
 60 |     }
 61 | 
 62 |     triplets_to_eds(&triplets, expr, bit_vecs, num_genes);
 63 |     Ok(true)
 64 | }
 65 | 
 66 | pub fn writer(
 67 |     path_str: String,
 68 |     expressions: Vec<Vec<f32>>,
 69 |     bit_vecs: Vec<Vec<u8>>,
 70 |     num_cells: usize,
 71 |     num_features: usize,
 72 | ) -> Result<bool, io::Error> {
 73 |     let mut tot_expressed_features = 0;
 74 |     expressions
 75 |         .iter()
 76 |         .for_each(|x| tot_expressed_features += x.len());
 77 | 
 78 |     let file_handle = File::create(path_str)?;
 79 |     let mut file = GzEncoder::new(file_handle, Compression::default());
 80 | 
 81 |     let mut header = "%%MatrixMarket\tmatrix\tcoordinate\treal\tgeneral\n".to_string();
 82 |     header.push_str(&format!(
 83 |         "{}\t{}\t{}\n",
 84 |         num_cells, num_features, tot_expressed_features
 85 |     ));
 86 |     file.write_all(header.as_bytes())?;
 87 | 
 88 |     assert!(
 89 |         bit_vecs.len() == expressions.len(),
 90 |         "length of bit vec and expression is not same"
 91 |     );
 92 |     for (cell_id, exp) in expressions.into_iter().enumerate() {
 93 |         let bit_vec = &bit_vecs[cell_id];
 94 |         let mut fids: Vec<usize> = Vec::new();
 95 | 
 96 |         for (feature_id, flag) in bit_vec.into_iter().enumerate() {
 97 |             if *flag != 0 {
 98 |                 for (offset, j) in format!("{:8b}", flag).chars().enumerate() {
 99 |                     match j {
100 |                         '1' => fids.push((8 * feature_id) + offset),
101 |                         _ => (),
102 |                     };
103 |                 }
104 |             }
105 |         }
106 | 
107 |         assert!(
108 |             fids.len() == exp.len(),
109 |             "#positions doesn't match with #expressed features"
110 |         );
111 |         let mut mtx_data = "".to_string();
112 |         for (index, count) in exp.into_iter().enumerate() {
113 |             mtx_data.push_str(&format!(
114 |                 "{}\t{}\t{}\n",
115 |                 cell_id + 1,
116 |                 fids[index] + 1,
117 |                 count
118 |             ));
119 |         }
120 | 
121 |         file.write_all(mtx_data.as_bytes())?;
122 |     }
123 | 
124 |     Ok(true)
125 | }
126 | 


--------------------------------------------------------------------------------
/src-rs/src/utils.rs:
--------------------------------------------------------------------------------
  1 | use math::round;
  2 | use clap::ArgMatches;
  3 | use std::collections::HashMap;
  4 | 
  5 | use std;
  6 | use std::io;
  7 | use rand::thread_rng;
  8 | use rand::seq::SliceRandom;
  9 | 
 10 | use crate::{h5, mtx, csv, eds};
 11 | 
 12 | #[derive(Clone, Debug, PartialEq)]
 13 | pub enum FileType {
 14 |     EDS,
 15 |     MTX,
 16 |     H5,
 17 |     CSV,
 18 |     Dummy(String),
 19 | }
 20 | 
 21 | pub fn write_file( file_path: String,
 22 |                    file_type: FileType,
 23 |                    bit_vecs: Vec<Vec<u8>>,
 24 |                    alphas: Vec<Vec<f32>>,
 25 |                    num_cells: usize,
 26 |                    num_features: usize,
 27 | ) -> Result<bool, io::Error> {
 28 |     info!("Writing Output into file path: {}", file_path);
 29 | 
 30 |     match file_type {
 31 |         FileType::MTX => mtx::writer(file_path, alphas, bit_vecs, num_cells, num_features)?,
 32 |         FileType::CSV => csv::writer(file_path, alphas, bit_vecs, num_cells, num_features)?,
 33 |         FileType::H5 => h5::writer(file_path, alphas, bit_vecs, num_cells, num_features)?,
 34 |         FileType::EDS => eds::writer(file_path, alphas, bit_vecs, num_cells, num_features)?,
 35 |         _ => unreachable!(),
 36 |     };
 37 | 
 38 |     Ok(true)
 39 | }
 40 | 
 41 | pub fn read_file(file_path: &str,
 42 |                  file_type: FileType,
 43 |                  num_cells: usize,
 44 |                  num_features: usize,
 45 | ) -> Result<(Vec<Vec<u8>>, Vec<Vec<f32>>), io::Error> {
 46 |     let mut alphas: Vec<Vec<f32>> = Vec::new();
 47 |     let mut bit_vecs: Vec<Vec<u8>> = Vec::new();
 48 | 
 49 |     match file_type {
 50 |         FileType::EDS => eds::reader(
 51 |             file_path,
 52 |             num_cells,
 53 |             num_features,
 54 |             &mut alphas,
 55 |             &mut bit_vecs,
 56 |         )?,
 57 |         FileType::MTX => mtx::reader(
 58 |             file_path,
 59 |             num_cells,
 60 |             num_features,
 61 |             &mut alphas,
 62 |             &mut bit_vecs,
 63 |         )?,
 64 |         _ => unreachable!(),
 65 |     };
 66 | 
 67 |     info!("Done Reading Input file");
 68 |     Ok((bit_vecs, alphas))
 69 | }
 70 | 
 71 | pub fn randomize(bit_vecs: Vec<Vec<u8>>,
 72 |                  alphas: Vec<Vec<f32>>,
 73 | ) -> Result<(Vec<Vec<u8>>, Vec<Vec<f32>>), io::Error> {
 74 |     info!("Randomizing order");
 75 |     assert!( bit_vecs.len() == alphas.len() );
 76 | 
 77 |     let num_elem = bit_vecs.len() as u32;
 78 |     let mut order: Vec<u32> = (0..num_elem).collect();
 79 |     order.shuffle(&mut thread_rng());
 80 | 
 81 |     let mut shuf_bvecs = vec![Vec::new(); bit_vecs.len()];
 82 |     let mut shuf_alphas = vec![Vec::new(); bit_vecs.len()];
 83 |     for (nindex, oindex) in order.into_iter().enumerate() {
 84 |         shuf_bvecs[nindex] = bit_vecs[oindex as usize].clone();
 85 |         shuf_alphas[nindex] = alphas[oindex as usize].clone();
 86 |     }
 87 | 
 88 |     Ok((shuf_bvecs, shuf_alphas))
 89 | }
 90 | 
 91 | pub fn find_output_format(sub_m: &ArgMatches) -> FileType {
 92 |     let mut out_file_type: Option<FileType> = None;
 93 |     let mut found_file_types = 0;
 94 | 
 95 |     out_file_type = match sub_m.is_present("mtx") {
 96 |         true => {
 97 |             found_file_types += 1;
 98 |             Some(FileType::MTX)
 99 |         }
100 |         false => out_file_type,
101 |     };
102 | 
103 |     out_file_type = match sub_m.is_present("csv") {
104 |         true => {
105 |             found_file_types += 1;
106 |             Some(FileType::CSV)
107 |         }
108 |         false => out_file_type,
109 |     };
110 | 
111 |     out_file_type = match sub_m.is_present("h5") {
112 |         true => {
113 |             found_file_types += 1;
114 |             Some(FileType::H5)
115 |         }
116 |         false => out_file_type,
117 |     };
118 | 
119 |     out_file_type = match sub_m.is_present("eds") {
120 |         true => {
121 |             found_file_types += 1;
122 |             Some(FileType::EDS)
123 |         }
124 |         false => out_file_type,
125 |     };
126 | 
127 |     assert!(found_file_types == 1, "found unexpected not 1 file types");
128 |     return out_file_type.expect("can't find output format type");
129 | }
130 | 
131 | pub fn get_output_path(input_path: &str, otype: FileType) -> (FileType, String) {
132 |     let mut itype: FileType = FileType::EDS;
133 |     let mut opath = input_path.to_string();
134 |     let mut offset: usize = opath.len();
135 |     let mut found_file_types = 0;
136 | 
137 |     match opath.find(".eds") {
138 |         Some(val) => {
139 |             offset = val;
140 |         }
141 |         _ => (),
142 |     };
143 | 
144 |     match opath.find(".mtx") {
145 |         Some(val) => {
146 |             offset = val;
147 |             itype = FileType::MTX;
148 |             found_file_types += 1;
149 |         }
150 |         _ => (),
151 |     };
152 | 
153 |     match opath.find(".h5") {
154 |         Some(val) => {
155 |             offset = val;
156 |             itype = FileType::H5;
157 |             found_file_types += 1;
158 |         }
159 |         _ => (),
160 |     };
161 | 
162 |     match opath.find(".csv") {
163 |         Some(val) => {
164 |             offset = val;
165 |             itype = FileType::CSV;
166 |             found_file_types += 1;
167 |         }
168 |         _ => (),
169 |     };
170 | 
171 |     assert!(
172 |         found_file_types == 1 || itype == FileType::EDS,
173 |         " Can't find right input file type "
174 |     );
175 |     assert!(
176 |         itype != otype,
177 |         "Found same input and output file file format"
178 |     );
179 | 
180 |     info!(" Found {:?} as input file type ", itype);
181 |     info!(" Found {:?} as output file type ", otype);
182 | 
183 |     match otype {
184 |         FileType::MTX => opath.replace_range(offset.., ".mtx.gz"),
185 |         FileType::CSV => opath.replace_range(offset.., ".csv.gz"),
186 |         FileType::H5 => opath.replace_range(offset.., ".h5"),
187 |         FileType::EDS => opath.replace_range(offset.., ".eds.gz"),
188 |         FileType::Dummy(name) => opath.replace_range(offset.., &name),
189 |     }
190 | 
191 |     (itype, opath)
192 | }
193 | 
194 | pub fn triplets_to_eds(triplets: &Vec<HashMap<u32, f32>>,
195 |                        expr: &mut Vec<Vec<f32>>,
196 |                        bit_vecs: &mut Vec<Vec<u8>>,
197 |                        num_genes: usize,
198 | ) {
199 |     for cell_data in triplets {
200 |         let mut keys: Vec<u32> = cell_data.keys()
201 |             .cloned()
202 |             .collect();
203 |         keys.sort();
204 | 
205 |         let values: Vec<f32> = keys.iter().map( |key| cell_data[key] )
206 |             .collect();
207 | 
208 |         expr.push(values);
209 | 
210 |         let num_exp_genes = keys.len();
211 |         let num_bit_vecs: usize = round::ceil(num_genes as f64 / 8.0, 0) as usize;
212 |         let mut bit_vec: Vec<u8> = vec![0; num_bit_vecs];
213 | 
214 |         let mut min_processed_close = 0;
215 |         let mut max_processed_open = 8;
216 |         let mut curr_index = 0;
217 |         let mut flag: u8 = 0;
218 | 
219 |         for key in keys {
220 |             assert!(key >= min_processed_close);
221 |             assert!(curr_index < num_bit_vecs);
222 | 
223 |             let offset: u8 = (key % 8) as u8;
224 |             if key < max_processed_open {
225 |                 flag |= 128u8 >> offset;
226 |             } else {
227 |                 bit_vec[curr_index] = flag;
228 | 
229 |                 while key >= max_processed_open {
230 |                     curr_index += 1;
231 |                     min_processed_close = max_processed_open;
232 |                     max_processed_open += 8;
233 |                 }
234 |                 flag = 128u8 >> offset;
235 |             }
236 |         }
237 |         bit_vec[curr_index] = flag;
238 | 
239 |         let mut num_ones = 0;
240 |         for bits in bit_vec.iter() {
241 |             num_ones += bits.count_ones();
242 |         }
243 |         assert!(num_ones as usize == num_exp_genes,
244 |                 format!("{:?} {:?}", num_ones, num_exp_genes));
245 | 
246 |         bit_vecs.push(bit_vec);
247 |     }
248 | }
249 | 


--------------------------------------------------------------------------------