├── .gitignore
├── LICENSE
├── README.md
├── benchmarks
│   ├── data
│   │   └── scripts
│   │       ├── generate.sh
│   │       └── makeLoom.py
│   ├── memory.jpg
│   ├── parsers
│   │   ├── readCsv.R
│   │   ├── readEds.R
│   │   ├── readH5.R
│   │   ├── readLoom.R
│   │   └── readMtx.R
│   ├── profile.sh
│   ├── size.jpg
│   ├── stats
│   │   ├── ltime.tex
│   │   ├── memory.tex
│   │   └── size.tex
│   └── time.jpg
├── eds.jpg
├── src-cpp
│   └── readEDS.cpp
└── src-rs
    ├── Cargo.toml
    └── src
        ├── csv.rs
        ├── eds.rs
        ├── h5.rs
        ├── main.rs
        ├── mtx.rs
        └── utils.rs
/.gitignore:
--------------------------------------------------------------------------------
1 | # Generated by Cargo
2 | # will have compiled files and executables
3 | /target/
4 |
5 | # Remove Cargo.lock from gitignore if creating an executable, leave it for libraries
6 | # More information here https://doc.rust-lang.org/cargo/guide/cargo-toml-vs-cargo-lock.html
7 | Cargo.lock
8 |
9 | # These are backup files generated by rustfmt
10 | **/*.rs.bk
11 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | BSD 3-Clause License
2 |
3 | Copyright (c) 2019, COMBINE-lab
4 | All rights reserved.
5 |
6 | Redistribution and use in source and binary forms, with or without
7 | modification, are permitted provided that the following conditions are met:
8 |
9 | 1. Redistributions of source code must retain the above copyright notice, this
10 | list of conditions and the following disclaimer.
11 |
12 | 2. Redistributions in binary form must reproduce the above copyright notice,
13 | this list of conditions and the following disclaimer in the documentation
14 | and/or other materials provided with the distribution.
15 |
16 | 3. Neither the name of the copyright holder nor the names of its
17 | contributors may be used to endorse or promote products derived from
18 | this software without specific prior written permission.
19 |
20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ## What is EDS?
2 | EDS is an acronym for Efficient single-cell binary Data Storage, a format for cell-by-feature count matrices.
3 |
4 | 
5 |
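The figure above shows the layout. For illustration, here is a minimal Python sketch (not part of this repository) of how the per-cell records can be parsed; it mirrors the readers in `src-rs/src/eds.rs` and `src-cpp/readEDS.cpp` and assumes the number of cells and genes is known up front:

```python
import gzip
import struct

def read_eds(path, num_cells, num_genes):
    """Yield (gene_indices, counts) for each cell of a gzipped EDS matrix."""
    num_flag_bytes = (num_genes + 7) // 8
    with gzip.open(path, "rb") as fh:
        for _ in range(num_cells):
            # per-cell bit vector: one bit per gene, most-significant bit first
            flags = fh.read(num_flag_bytes)
            gene_ids = [8 * byte_id + bit
                        for byte_id, byte in enumerate(flags)
                        for bit in range(8)
                        if byte & (128 >> bit)]
            # one little-endian float32 count per expressed gene
            counts = struct.unpack("<%df" % len(gene_ids),
                                   fh.read(4 * len(gene_ids)))
            yield gene_ids, counts
```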
6 | ## Why do we need a new storage format?
7 | Recent advances in single-cell technologies have led to a rapid increase in the amount of data. Most single-cell studies generate a cell-by-feature (for example, gene) count matrix, and the number of cells is now approaching the millions. Traditional single-cell quantification pipelines use the matrix market exchange (mtx) format (sometimes gzipped) to share count matrices. However, the textual representation of the mtx format makes it larger on disk than a compressed binary format. Our quantification tool [alevin](https://combine-lab.github.io/alevin-tutorial/) writes its output in the EDS format, which saves storage space.
8 |
9 |
10 | ## What are the caveats?
11 | Other formats (such as [loom](https://github.com/linnarsson-lab/loompy)) are designed to optimize querying of the matrix. EDS is primarily designed to improve storage efficiency rather than query speed, and it currently does not support random access to a cell (row).
12 |
13 | ## How to convert EDS to mtx format?
14 | A simple Rust tool lives inside `src-rs`; it can be built with `cargo build --release` and used as `./target/release/eds convert -i <input> --[mtx | eds | h5 | csv] -c <num_cells> -f <num_features>`.
15 |
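Once converted, the gzipped mtx output can be loaded with standard tools, for example with `scipy` as in `benchmarks/data/scripts/makeLoom.py` (the path below is a placeholder):

```python
import gzip
from scipy.io import mmread

# load the converted cell-by-feature matrix (placeholder path)
mat = mmread(gzip.open("quants_mat.mtx.gz"))
print(mat.shape)
```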
16 | ## Benchmarks
17 | * Size on disk.
18 | 
19 |
20 | * Time to load the matrix into memory.
21 | 
22 |
23 | * Memory required to load the matrix.
24 | 
25 |
26 | ## Future
27 | - [ ] Support the DelayedArray R object
28 | - [ ] Random access through `EDS index`
29 |
30 | ## Contributors
31 | - Avi Srivastava
32 | - Mike Love
33 | - Rob Patro
34 |
--------------------------------------------------------------------------------
/benchmarks/data/scripts/generate.sh:
--------------------------------------------------------------------------------
1 | name=$1
2 | cells=$2
3 | feats=$3
4 |
5 | bin="/mnt/scratch1/avi/anton/alevin_r/EDS/src-rs/target/release/eds"
6 | dpath="/mnt/scratch1/avi/anton/alevin_r/EDS/benchmarks/data/"$name"/quants_mat.eds.gz"
7 | RUST_BACKTRACE=1 RUST_LOG=trace $bin convert -i $dpath --csv -c $cells -f $feats
8 | RUST_BACKTRACE=1 RUST_LOG=trace $bin convert -i $dpath --mtx -c $cells -f $feats
9 | RUST_BACKTRACE=1 RUST_LOG=trace $bin convert -i $dpath --h5 -c $cells -f $feats
10 |
--------------------------------------------------------------------------------
/benchmarks/data/scripts/makeLoom.py:
--------------------------------------------------------------------------------
1 | import gzip
2 | import loompy
3 | import numpy as np
4 | import sys
5 | from scipy.io import mmread
6 |
7 | data = sys.argv[1]
8 | mtx_file = "/mnt/scratch1/avi/anton/alevin_r/EDS/benchmarks/data/" + data + "/quants_mat.mtx.gz"
9 | out_file = "/mnt/scratch1/avi/anton/alevin_r/EDS/benchmarks/data/" + data + "/quants_mat.loom"
10 |
11 | data = mmread( gzip.open(mtx_file) )
12 | (cells, feats) = data.shape
13 |
14 | data = data.T
15 |
16 | row_names = {}
17 | row_names['rname'] = np.array(range(feats))
18 |
19 | col_names = {}
20 | col_names['cname'] = np.array( range(cells) )
21 |
22 | loompy.create(out_file, data, row_names, col_names)
23 |
--------------------------------------------------------------------------------
/benchmarks/memory.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/COMBINE-lab/EDS/a66a0b4ec96c25ea6f2b25f1566e6c089aff6cda/benchmarks/memory.jpg
--------------------------------------------------------------------------------
/benchmarks/parsers/readCsv.R:
--------------------------------------------------------------------------------
1 | args = commandArgs(trailingOnly=TRUE)
2 |
3 | data <- args[1]
4 | fpath <- paste0("/mnt/scratch1/avi/anton/alevin_r/EDS/benchmarks/data/", data, "/quants_mat.csv.gz")
5 |
6 | system.time({
7 | csv <- read.table( gzfile( fpath ), sep="," )
8 | })
9 |
10 | print(dim(csv))
11 |
--------------------------------------------------------------------------------
/benchmarks/parsers/readEds.R:
--------------------------------------------------------------------------------
1 | library(Rcpp)
2 | library(Matrix)
3 |
4 | args = commandArgs(trailingOnly=TRUE)
5 | sourceCpp("/mnt/scratch1/avi/anton/alevin_r/EDS/src-cpp/readEDS.cpp")
6 |
7 | data <- args[1]
8 | num.cells <- as.integer(args[2])
9 | num.genes <- as.integer(args[3])
10 | fpath <- paste0("/mnt/scratch1/avi/anton/alevin_r/EDS/benchmarks/data/", data, "/quants_mat.eds.gz")
11 |
12 | system.time({
13 | pos <- getSparseMatrix( num.genes, num.cells, fpath )
14 | })
15 |
16 | str(pos)
17 |
--------------------------------------------------------------------------------
/benchmarks/parsers/readH5.R:
--------------------------------------------------------------------------------
1 | library(hdf5r)
2 | library(Matrix)
3 |
4 | args = commandArgs(trailingOnly=TRUE)
5 | data <- args[1]
6 | fpath <- paste0("/mnt/scratch1/avi/anton/alevin_r/EDS/benchmarks/data/", data, "/quants_mat.h5")
7 |
8 | system.time({
9 | infile <- hdf5r::H5File$new(filename = fpath, mode = 'r')
10 | genome <- "/matrix"
11 |
12 | counts <- infile[[paste0(genome, '/data')]]
13 | indices <- infile[[paste0(genome, '/indices')]]
14 | indptr <- infile[[paste0(genome, '/indptr')]]
15 | shp <- infile[[paste0(genome, '/shape')]]
16 |
17 | sparse.mat <- sparseMatrix(
18 | i = indices[] + 1,
19 | p = indptr[],
20 | x = as.numeric(x = counts[]),
21 | dims = shp[],
22 | giveCsparse = TRUE
23 | )
24 | })
25 |
26 | str(sparse.mat)
27 |
--------------------------------------------------------------------------------
/benchmarks/parsers/readLoom.R:
--------------------------------------------------------------------------------
1 | library(loomR)
2 |
3 | args = commandArgs(trailingOnly=TRUE)
4 | # in parts taken from https://satijalab.org/loomR/loomR_tutorial.html
5 | data <- args[1]
6 | fpath <- paste0("/mnt/scratch1/avi/anton/alevin_r/EDS/benchmarks/data/", data, "/quants_mat.loom")
7 |
8 | system.time({
9 | lfile <- connect(filename = fpath, mode = "r+")
10 | full.matrix <- lfile$matrix[, ]
11 | })
12 |
13 | dim( full.matrix )
14 |
--------------------------------------------------------------------------------
/benchmarks/parsers/readMtx.R:
--------------------------------------------------------------------------------
1 | library(Matrix)
2 |
3 | args = commandArgs(trailingOnly=TRUE)
4 | data <- args[1]
5 | fpath <- paste0("/mnt/scratch1/avi/anton/alevin_r/EDS/benchmarks/data/", data, "/quants_mat.mtx.gz")
6 |
7 | system.time({
8 | mm <- readMM( gzfile( fpath ) )
9 | })
10 |
11 | print(dim(mm))
12 |
--------------------------------------------------------------------------------
/benchmarks/profile.sh:
--------------------------------------------------------------------------------
1 | datas=("neurons_450k_random" "pbmc_40k_random" "neurons_1m" "neurons_900" "neurons_2k" "pbmc_4k" "pbmc_8k" "neurons_9k" "pbmc_40k" "neurons_450k")
2 | cells=(456400 43400 1000000 931 2022 4340 8381 9128 43400 456400)
3 | feats=(50686 58278 27998 50686 50686 58278 58278 50686 58278 50686)
4 |
5 | for id in {0..0}; do
6 | data=${datas[$id]}
7 | cell=${cells[$id]}
8 | feat=${feats[$id]}
9 |
10 | echo $data $cell $feat
11 | echo "EDS"
12 | /usr/bin/time Rscript --vanilla parsers/readEds.R $data $cell $feat &&
13 |
14 | echo "H5" &&
15 | /usr/bin/time Rscript --vanilla parsers/readH5.R $data &&
16 |
17 | echo "Mtx" &&
18 | /usr/bin/time Rscript --vanilla parsers/readMtx.R $data &&
19 |
20 | echo "loom" &&
21 | /usr/bin/time Rscript --vanilla parsers/readLoom.R $data &&
22 |
23 | echo "CSV"
24 | /usr/bin/time Rscript --vanilla parsers/readCsv.R $data
25 | done
26 |
27 |
--------------------------------------------------------------------------------
/benchmarks/size.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/COMBINE-lab/EDS/a66a0b4ec96c25ea6f2b25f1566e6c089aff6cda/benchmarks/size.jpg
--------------------------------------------------------------------------------
/benchmarks/stats/ltime.tex:
--------------------------------------------------------------------------------
1 | \begin{center}
2 | \begin{tabular}{||c || c | c | c | c | c | c | c | c | c | c ||}
3 | \hline
4 | \vtop{\hbox{\strut Loading}\hbox{\strut (Second)}} & \vtop{\hbox{\strut neurons}\hbox{\strut 900}} & \vtop{\hbox{\strut neurons}\hbox{\strut 2k}} & \vtop{\hbox{\strut pbmc}\hbox{\strut 4k}} & \vtop{\hbox{\strut pbmc}\hbox{\strut 8k}} & \vtop{\hbox{\strut neurons}\hbox{\strut 9k}} & \vtop{\hbox{\strut pbmc}\hbox{\strut 4k*10}} & \vtop{\hbox{\strut rand}\hbox{\strut 4k*10}} & \vtop{\hbox{\strut neurons}\hbox{\strut 9k*50}} & \vtop{\hbox{\strut rand}\hbox{\strut 9k*50}} & \vtop{\hbox{\strut neurons}\hbox{\strut 1M}} \\ [0.5ex]
5 | \hline\hline
6 | eds.gz & 0.28 & 0.86 & 1.49 & 2.30 & 3.24 & 10.04 & 9.93 & 119.61 & 112.88 & 159.77 \\
7 | \hline
8 | csv.gz & 28.92 & 40.15 & 75.30 & 134.44 & 114.96 & 680.1 & 439.47 & 6768.7 & 6524.1 & N/A \\
9 | \hline
10 | h5 & 0.83 & 1.56 & 1.65 & 3.88 & 5.72 & 10.11 & 10.20 & 276.36 & 338.91 & 705.56 \\
11 | \hline
12 | mtx.gz & 1.93 & 2.37 & 3.53 & 6.87 & 12.45 & 26.84 & 23.60 & 536.91 & 489.76 & 904.35 \\
13 | \hline
14 | loom & 0.98 & 1.82 & 4.08 & 7.67 & 8.05 & 44.82 & 36.92 & 400.99 & 378.02 & N/A \\ [1ex]
15 | \hline
16 | \end{tabular}
17 | \end{center}
18 |
--------------------------------------------------------------------------------
/benchmarks/stats/memory.tex:
--------------------------------------------------------------------------------
1 | \begin{center}
2 | \begin{tabular}{||c || c | c | c | c | c | c | c | c | c | c ||}
3 | \hline
4 | \vtop{\hbox{\strut Memory}\hbox{\strut (Gb)}} & \vtop{\hbox{\strut neurons}\hbox{\strut 900}} & \vtop{\hbox{\strut neurons}\hbox{\strut 2k}} & \vtop{\hbox{\strut pbmc}\hbox{\strut 4k}} & \vtop{\hbox{\strut pbmc}\hbox{\strut 8k}} & \vtop{\hbox{\strut neurons}\hbox{\strut 9k}} & \vtop{\hbox{\strut pbmc}\hbox{\strut 4k*10}} & \vtop{\hbox{\strut rand}\hbox{\strut 4k*10}} & \vtop{\hbox{\strut neurons}\hbox{\strut 9k*50}} & \vtop{\hbox{\strut rand}\hbox{\strut 9k*50}} & \vtop{\hbox{\strut neurons}\hbox{\strut 1M}} \\ [0.5ex]
5 | \hline\hline
6 | eds.gz & 0.32 & 0.33 & 0.33 & 0.36 & 0.49 & 0.78 & 0.77 & 11.5 & 11.5 & 22.9 \\
7 | \hline
8 | csv.gz & 0.73 & 2.40 & 4.80 & 9.87 & 8.14 & 42.12 & 41.08 & 322 & 317.2 & N/A \\
9 | \hline
10 | h5 & 0.47 & 0.59 & 0.69 & 1.10 & 1.76 & 3.17 & 3.12 & 56.8 & 56.77 & 114.29 \\
11 | \hline
12 | mtx.gz & 0.37 & 0.38 & 0.48 & 0.74 & 1.10 & 1.75 & 1.72 & 30.3 & 30.33 & 61 \\
13 | \hline
14 | loom & 0.68 & 1.13 & 2.47 & 4.53 & 4.30 & 22.447 & 22.40 & 203.3 & 203.2 & N/A \\ [1ex]
15 | \hline
16 | \end{tabular}
17 | \end{center}
18 |
--------------------------------------------------------------------------------
/benchmarks/stats/size.tex:
--------------------------------------------------------------------------------
1 | \begin{center}
2 | \begin{tabular}{||c || c | c | c | c | c | c | c | c | c | c ||}
3 | \hline
4 | Size & \vtop{\hbox{\strut neurons}\hbox{\strut 900}} & \vtop{\hbox{\strut neurons}\hbox{\strut 2k}} & \vtop{\hbox{\strut pbmc}\hbox{\strut 4k}} & \vtop{\hbox{\strut pbmc}\hbox{\strut 8k}} & \vtop{\hbox{\strut neurons}\hbox{\strut 9k}} & \vtop{\hbox{\strut pbmc}\hbox{\strut 4k*10}} & \vtop{\hbox{\strut rand}\hbox{\strut 4k*10}} & \vtop{\hbox{\strut neurons}\hbox{\strut 9k*50}} & \vtop{\hbox{\strut rand}\hbox{\strut 9k*50}} & \vtop{\hbox{\strut neurons}\hbox{\strut 1M}} \\ [0.5ex]
5 | \hline\hline
6 | eds.gz & 5.4M & 9.8M & 15M & 29M & 42M & 108M & 116M & 1.8G & 1.9G & 2.6G \\
7 | \hline
8 | csv.gz & 6.6M & 13M & 19M & 37M & 51M & 142M & 142M & 2.3G & 2.3G & 3.1G \\
9 | \hline
10 | h5 & 7.6M & 14M & 18M & 36M & 59M & 97M & 135M & 1.8G & 2.5G & 4.0G \\
11 | \hline
12 | mtx.gz & 11M & 20M & 26M & 52M & 86M & 186M & 192M & 3.6G & 3.6G & 5.8G \\
13 | \hline
14 | loom & 12M & 23M & 40M & 78M & 97M & 297M & 355M & 3.6G & 4.8G & 6.1G \\ [1ex]
15 | \hline
16 | \end{tabular}
17 | \end{center}
18 |
--------------------------------------------------------------------------------
/benchmarks/time.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/COMBINE-lab/EDS/a66a0b4ec96c25ea6f2b25f1566e6c089aff6cda/benchmarks/time.jpg
--------------------------------------------------------------------------------
/eds.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/COMBINE-lab/EDS/a66a0b4ec96c25ea6f2b25f1566e6c089aff6cda/eds.jpg
--------------------------------------------------------------------------------
/src-cpp/readEDS.cpp:
--------------------------------------------------------------------------------
1 | /*
2 | * Alevin Efficient Data Storage (EDS) reader
3 | *
4 | * Author: Avi Srivastava
5 | * Last modified: August 13, 2019
6 | * License: LGPL (>= 3)
7 | *
8 | */
9 |
10 | #include <Rcpp.h>
11 | #include <zlib.h>
12 |
13 | using namespace Rcpp;
14 |
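// EDS on-disk layout (gzip-compressed): for every cell, ceil(numOfGenes / 8)
// bytes of bit flags (most-significant bit first; a set bit marks an expressed
// gene), followed by one 4-byte float count per set bit.
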
15 | // C++ internal function to figure out the spaces to reserve
16 | size_t getReserveSpaces(size_t numOfGenes, size_t numOfOriginalCells,
17 | Rcpp::IntegerVector& bitVecLengths,
18 | std::string& countMatFilename) {
19 |
20 | // opening gzipped compressed stream
21 | gzFile fileHandler = gzopen(countMatFilename.c_str(), "rb") ;
22 |
23 | // We are storing the bit vector in u8 so total number of u8 = numGenes/8
24 | size_t numFlags = std::ceil(numOfGenes / 8.0);
25 |
26 | // vector for storing the bitvector flags
27 | std::vector<uint8_t> alphasFlag (numFlags, 0);
28 |
29 | // getting the sizs of u8 and float 32
30 | size_t flagSize = sizeof(decltype(alphasFlag)::value_type);
31 | size_t elSize = sizeof(float);
32 | size_t totalSpace { 0 };
33 |
34 | // iterating over cells
35 | for (size_t cellId = 0 ; cellId < numOfOriginalCells ; ++cellId) {
36 | // reading bitvectors
37 | gzread(fileHandler, reinterpret_cast<char*>(alphasFlag.data()), flagSize * numFlags);
38 | size_t numOfExpGenes { 0 };
39 |
40 | for (size_t j = 0; j < alphasFlag.size(); j++) {
41 | uint8_t flag = alphasFlag[j];
42 |
43 | for (size_t i = 0; i < 8; i++){
44 | // counting positions only if the flag is set
45 | if (flag & (128 >> i)) {
46 | numOfExpGenes += 1;
47 | }
48 | }
49 | }
50 |
51 | // skipping the expression values and saving the counts for numOfExpGenes
52 | gzseek(fileHandler, elSize * numOfExpGenes, SEEK_CUR);
53 | bitVecLengths[ cellId + 1 ] = ( numOfExpGenes + bitVecLengths[ cellId ] );
54 | totalSpace += numOfExpGenes;
55 | }
56 |
57 | return totalSpace;
58 | }
59 |
60 | // [[Rcpp::export]]
61 | SEXP getSparseMatrix(size_t numOfGenes, size_t numOfOriginalCells, std::string countMatFilename) {
62 | Rcpp::S4 mat("dgCMatrix");
63 |
64 | // initializing vector to store bitvecSpaces
65 | Rcpp::IntegerVector bitVecLengths(numOfOriginalCells + 1, 0);
66 | size_t totalSpace = getReserveSpaces( numOfGenes, numOfOriginalCells,
67 | bitVecLengths, countMatFilename );
68 |
69 | // initializing sparse matrix
70 | typedef Rcpp::NumericVector ValuesT;
71 | ValuesT values(totalSpace, 0.0);
72 |
73 | // initializing sparse matrix indices
74 | typedef Rcpp::IntegerVector IndicesT;
75 | IndicesT indices(totalSpace, 0);
76 |
77 | // opening gzipped compressed stream
78 | gzFile fileHandler = gzopen(countMatFilename.c_str(), "rb") ;
79 |
80 | // We are storing the bit vector in u8 so total number of u8 = numGenes/8
81 | size_t numFlags = std::ceil(numOfGenes / 8.0);
82 |
83 | // vector for storing the bitvector flags
84 | std::vector<uint8_t> alphasFlag (numFlags, 0);
85 |
86 | // getting the sizs of u8 and float 32
87 | size_t flagSize = sizeof(decltype(alphasFlag)::value_type);
88 | size_t elSize = sizeof(float);
89 |
90 | size_t valCounter { 0 };
91 | // iterating over cells
92 | for (size_t cellId = 0 ; cellId < numOfOriginalCells ; ++cellId) {
93 | // reading bitvectors
94 | gzread(fileHandler, reinterpret_cast<char*>(alphasFlag.data()), flagSize * numFlags);
95 |
96 | // iterating over u8 flags for bitvectors
97 | size_t numExpGenes { 0 };
98 | for (size_t j = 0; j < alphasFlag.size(); j++) {
99 | uint8_t flag = alphasFlag[j];
100 |
101 | for (size_t i = 0; i < 8; i++){
102 | // extracting positions only if the flag is set
103 | if (flag & (128 >> i)) {
104 | if ( valCounter + numExpGenes >= totalSpace ) {
105 | return Rcpp::List();
106 | }
107 |
108 | size_t offset = i + (8 * j);
109 | indices[ valCounter + numExpGenes ] = offset;
110 | numExpGenes += 1;
111 | }
112 | }
113 | }
114 |
115 | // reading in the expression
116 | std::vector<float> alphasSparse(numExpGenes);
117 | gzread(fileHandler, reinterpret_cast<char*>(alphasSparse.data()), elSize * numExpGenes);
118 |
119 | // saving the positions and expression
120 | for (size_t i = 0; i < numExpGenes; i++) {
121 | if ( valCounter >= totalSpace ) {
122 | return Rcpp::List();
123 | }
124 |
125 | values[valCounter] = alphasSparse[i];
126 | valCounter += 1;
127 | }
128 | }
129 |
130 | // code in-parts taken from https://github.com/LTLA/beachmat/blob/master/inst/include/beachmat/output/Csparse_writer.h#L268
131 | mat.slot("Dim") = Rcpp::IntegerVector::create(numOfGenes, numOfOriginalCells);
132 |
133 | // Setting p
134 | mat.slot("p") = bitVecLengths;
135 |
136 | // Setting 'x'.
137 | mat.slot("x") = values;
138 |
139 | // Setting 'i'.
140 | mat.slot("i") = indices;
141 |
142 | return SEXP(mat);
143 | }
144 |
145 |
--------------------------------------------------------------------------------
/src-rs/Cargo.toml:
--------------------------------------------------------------------------------
1 | [package]
2 | name = "eds"
3 | version = "0.1.0"
4 | authors = ["avisrivastava , Mike Love, Rob Patro"]
5 | edition = "2018"
6 |
7 | [dependencies]
8 | pretty_env_logger = "0.3.0"
9 | clap = "2.33.0"
10 | log = "0.4.6"
11 | byteorder = "1.3.1"
12 | flate2 = "1.0.7"
13 | libmath = "0.2.1"
14 | hdf5 = "0.5.2"
15 | rand = "0.7.0"
--------------------------------------------------------------------------------
/src-rs/src/csv.rs:
--------------------------------------------------------------------------------
1 | use flate2::write::GzEncoder;
2 | use flate2::Compression;
3 | use std::fs::File;
4 | use std::io;
5 | use std::io::Write;
6 |
7 | pub fn writer(
8 | path_str: String,
9 | expressions: Vec<Vec<f32>>,
10 | bit_vecs: Vec<Vec<u8>>,
11 | _num_cells: usize,
12 | num_features: usize,
13 | ) -> Result<bool, io::Error> {
14 | let file_handle = File::create(path_str)?;
15 | let mut file = GzEncoder::new(file_handle, Compression::default());
16 |
17 | let mut header = "\"\"".to_string();
18 | for gid in 1..num_features + 1 {
19 | header.push_str(&format!(",gene{}", gid));
20 | }
21 | header.push_str(&format!("\n"));
22 | file.write_all(header.as_bytes())?;
23 |
24 | let mut mtx_data: String;
25 | assert!(
26 | bit_vecs.len() == expressions.len(),
27 | "length of bit vec and expression is not same"
28 | );
29 | for (cell_id, exp) in expressions.into_iter().enumerate() {
30 | let bit_vec = &bit_vecs[cell_id];
31 | let mut fids: Vec<usize> = Vec::new();
32 |
33 | for (feature_id, flag) in bit_vec.into_iter().enumerate() {
34 | if *flag != 0 {
35 | for (offset, j) in format!("{:8b}", flag).chars().enumerate() {
36 | match j {
37 | '1' => fids.push((8 * feature_id) + offset),
38 | _ => (),
39 | };
40 | }
41 | }
42 | }
43 |
44 | assert!(
45 | fids.len() == exp.len(),
46 | format!("#positions {} doesn't match with #expressed features {}",
47 | fids.len(), exp.len())
48 | );
49 | mtx_data = format!("cell{}", cell_id + 1);
50 | let mut zero_counter = 0;
51 | for (index, count) in exp.into_iter().enumerate() {
52 | assert!(
53 | fids[index] < num_features,
54 | format!("{} position > {}", fids[index], num_features)
55 | );
56 |
57 | while zero_counter != fids[index] {
58 | zero_counter += 1;
59 | mtx_data.push_str(&format!(",0"));
60 | }
61 |
62 | zero_counter += 1;
63 | mtx_data.push_str(&format!(",{}", count));
64 | }
65 |
66 | while zero_counter < num_features {
67 | zero_counter += 1;
68 | mtx_data.push_str(&format!(",0"));
69 | }
70 |
71 | mtx_data.push_str(&format!("\n"));
72 | file.write_all(mtx_data.as_bytes())?;
73 | }
74 |
75 | Ok(true)
76 | }
77 |
--------------------------------------------------------------------------------
/src-rs/src/eds.rs:
--------------------------------------------------------------------------------
1 | use std::fs::File;
2 | use std::io;
3 | use std::io::{Read, Write};
4 |
5 | use byteorder::{ByteOrder, LittleEndian};
6 | use flate2::read::GzDecoder;
7 | use math::round;
8 |
9 | use flate2::write::GzEncoder;
10 | use flate2::Compression;
11 |
12 | pub fn reader(
13 | input: &str,
14 | num_cells: usize,
15 | num_genes: usize,
16 | expr: &mut Vec<Vec<f32>>,
17 | bit_vecs: &mut Vec<Vec<u8>>,
18 | ) -> Result<bool, io::Error> {
19 | info!("Using {} as input EDS file\n", input);
20 | info!(
21 | "Using {} Rows (cells) and {} Columns (features)",
22 | num_cells, num_genes
23 | );
24 |
25 | let num_bit_vecs: usize = round::ceil(num_genes as f64 / 8.0, 0) as usize;
26 | let mut total_molecules = 0.0;
27 | let mut total_exp_values = 0;
28 |
29 | {
30 | let mut count = 0;
31 | let file_handle = File::open(input)?;
32 | let mut file = GzDecoder::new(file_handle);
33 |
34 | for _ in 0..num_cells {
35 | let mut bit_vec = vec![0; num_bit_vecs];
36 | file.read_exact(&mut bit_vec[..])?;
37 | let mut num_ones = 0;
38 | for bits in bit_vec.iter() {
39 | num_ones += bits.count_ones();
40 | }
41 | bit_vecs.push(bit_vec);
42 |
43 | let mut expression: Vec<u8> = vec![0; 4 * (num_ones as usize)];
44 | let mut float_buffer: Vec<f32> = vec![0.0_f32; num_ones as usize];
45 | file.read_exact(&mut expression[..])?;
46 | LittleEndian::read_f32_into(&expression, &mut float_buffer);
47 |
48 | let cell_count: f32 = float_buffer.iter().sum();
49 | total_molecules += cell_count;
50 | expr.push(float_buffer);
51 |
52 | count += 1;
53 | total_exp_values += num_ones;
54 | if count % 100 == 0 {
55 | print!("\r Done Reading {} cells", count);
56 | io::stdout().flush()?;
57 | }
58 | }
59 | }
60 |
61 | println!("\n");
62 | assert!(
63 | expr.len() == num_cells,
64 | "rows and quants file size mismatch"
65 | );
66 |
67 | info!("Found Total {:.2} molecules", total_molecules);
68 | info!("Found Total {:.2} expressed entries", total_exp_values);
69 | info!(
70 | "w/ {:.2} Molecules/cell",
71 | total_molecules / num_cells as f32
72 | );
73 | Ok(true)
74 | }
75 |
76 | pub fn writer(
77 | path_str: String,
78 | expressions: Vec<Vec<f32>>,
79 | bit_vecs: Vec<Vec<u8>>,
80 | _num_cells: usize,
81 | _num_features: usize,
82 | ) -> Result<bool, io::Error> {
83 | let file_handle = File::create(path_str)?;
84 | let mut file = GzEncoder::new(file_handle, Compression::default());
85 |
86 | assert!(expressions.len() == bit_vecs.len());
87 | for (exp, bvec) in expressions.into_iter().zip(bit_vecs.into_iter()) {
88 | file.write_all(&bvec)?;
89 |
90 | let mut bin_exp: Vec<u8> = vec![0_u8; exp.len() * 4];
91 | LittleEndian::write_f32_into(&exp, &mut bin_exp);
92 | file.write_all(&bin_exp)?;
93 | }
94 |
95 | Ok(true)
96 | }
97 |
--------------------------------------------------------------------------------
/src-rs/src/h5.rs:
--------------------------------------------------------------------------------
1 | use hdf5;
2 | use std::io;
3 |
4 | pub fn writer(
5 | path_str: String,
6 | expressions: Vec<Vec<f32>>,
7 | bit_vecs: Vec<Vec<u8>>,
8 | num_cells: usize,
9 | num_features: usize,
10 | ) -> Result<bool, io::Error> {
11 | let file = hdf5::File::open(path_str, "w").expect("can't create output file");
12 |
13 | let group = file
14 | .create_group("matrix")
15 | .expect("can't create group in h5");
16 |
17 | let shape = group
18 | .new_dataset::<usize>()
19 | .gzip(6)
20 | .create("shape", 2)
21 | .expect("can't write shape in h5");
22 |
23 | shape
24 | .write(&[num_features, num_cells])
25 | .expect("error writing shape");
26 |
27 | assert!(
28 | bit_vecs.len() == expressions.len(),
29 | "length of bit vec and expression is not same"
30 | );
31 |
32 | let total_entries;
33 | {
34 | let mut cumm_sum = 0;
35 | let mut indptr_vals: Vec<u32> = vec![cumm_sum];
36 | for exp in expressions.iter() {
37 | cumm_sum += exp.len() as u32;
38 | indptr_vals.push(cumm_sum);
39 | }
40 |
41 | total_entries = indptr_vals.last().expect("indptr empty").clone();
42 | assert!(
43 | indptr_vals.len() == num_cells + 1,
44 | "num cells doesn't match"
45 | );
46 |
47 | let indptr = group
48 | .new_dataset::<u32>()
49 | .gzip(6)
50 | .create("indptr", indptr_vals.len())
51 | .expect("can't write indptr in h5");
52 |
53 | indptr
54 | .write_raw(&indptr_vals)
55 | .expect("error writing indptr");
56 | } // end writing indptr
57 |
58 | {
59 | let data = group
60 | .new_dataset::<f32>()
61 | .gzip(6)
62 | .create("data", total_entries as usize)
63 | .expect("can't write data in h5");
64 |
65 | let flatten_data: Vec<f32> = expressions
66 | .iter()
67 | .flat_map(|array| array.iter())
68 | .cloned()
69 | .collect();
70 |
71 | assert!(
72 | flatten_data.len() == total_entries as usize,
73 | "different number of entries"
74 | );
75 | data.write_raw(&flatten_data).expect("can't write data");
76 | } // end writing data
77 |
78 | {
79 | let indices = group
80 | .new_dataset::<u32>()
81 | .gzip(6)
82 | .create("indices", total_entries as usize)
83 | .expect("can't write positions in h5");
84 |
85 | let mut positions: Vec<u32> = Vec::new();
86 | for bit_vec in bit_vecs {
87 | for (feature_id, flag) in bit_vec.into_iter().enumerate() {
88 | if flag != 0 {
89 | for (offset, j) in format!("{:8b}", flag).chars().enumerate() {
90 | match j {
91 | '1' => positions.push((8 * feature_id) as u32 + offset as u32),
92 | _ => (),
93 | };
94 | }
95 | }
96 | }
97 | } // end-for
98 |
99 | assert!(
100 | positions.len() == total_entries as usize,
101 | "different number of entries"
102 | );
103 | indices.write_raw(&positions).expect("can't write indices");
104 | } // end writing indices
105 |
106 | Ok(true)
107 | }
108 |
--------------------------------------------------------------------------------
/src-rs/src/main.rs:
--------------------------------------------------------------------------------
1 | extern crate byteorder;
2 | extern crate clap;
3 | extern crate flate2;
4 | extern crate hdf5;
5 | extern crate math;
6 | extern crate pretty_env_logger;
7 |
8 | #[macro_use]
9 | extern crate log;
10 |
11 | mod csv;
12 | mod eds;
13 | mod h5;
14 | mod mtx;
15 | mod utils;
16 |
17 | use clap::{App, Arg, ArgMatches, SubCommand};
18 | use std::io;
19 | use utils::FileType;
20 |
21 | fn randomize_file(sub_m: &ArgMatches) -> Result<(), io::Error> {
22 | let input_file_path = sub_m.value_of("input").unwrap();
23 | let output_file_type = FileType::Dummy(".random".to_string());
24 |
25 | let (input_file_type, output_file_path) =
26 | utils::get_output_path(input_file_path, output_file_type.clone());
27 |
28 | let num_cells: usize = sub_m
29 | .value_of("cells")
30 | .expect("can't find #cells")
31 | .parse()
32 | .unwrap();
33 |
34 | let num_features = sub_m
35 | .value_of("features")
36 | .expect("can't find #features")
37 | .parse()
38 | .unwrap();
39 |
40 | let (bit_vecs, alphas) = utils::read_file(input_file_path,
41 | input_file_type.clone(),
42 | num_cells,
43 | num_features)?;
44 |
45 | let (bit_vecs, alphas) = utils::randomize( bit_vecs, alphas )?;
46 | utils::write_file( output_file_path, input_file_type,
47 | bit_vecs, alphas, num_cells, num_features)?;
48 |
49 | info!("All Done!");
50 | Ok(())
51 | }
52 |
53 | fn convert_file(sub_m: &ArgMatches) -> Result<(), io::Error> {
54 | let input_file_path = sub_m.value_of("input").unwrap();
55 | let output_file_type = utils::find_output_format(sub_m);
56 |
57 | let (input_file_type, output_file_path) =
58 | utils::get_output_path(input_file_path, output_file_type.clone());
59 |
60 | let num_cells: usize = sub_m
61 | .value_of("cells")
62 | .expect("can't find #cells")
63 | .parse()
64 | .unwrap();
65 |
66 | let num_features = sub_m
67 | .value_of("features")
68 | .expect("can't find #features")
69 | .parse()
70 | .unwrap();
71 |
72 | let (bit_vecs, alphas) = utils::read_file(input_file_path,
73 | input_file_type,
74 | num_cells,
75 | num_features)?;
76 |
77 | utils::write_file( output_file_path, output_file_type,
78 | bit_vecs, alphas, num_cells, num_features)?;
79 |
80 | info!("All Done!");
81 | Ok(())
82 | }
83 |
84 | fn main() -> io::Result<()> {
85 | let matches = App::new("EDS")
86 | .version("0.1.0")
87 | .author("Avi Srivastava, Mike Love and Rob Patro")
88 | .about("Efficient scData Storage format")
89 | .subcommand(
90 | SubCommand::with_name("randomize")
91 | .about("randomize the order of cells")
92 | .arg(
93 | Arg::with_name("cells")
94 | .long("cells")
95 | .short("c")
96 | .takes_value(true)
97 | .help("Number of cells"),
98 | )
99 | .arg(
100 | Arg::with_name("features")
101 | .long("features")
102 | .short("f")
103 | .takes_value(true)
104 | .help("Number of features"),
105 | )
106 | .arg(
107 | Arg::with_name("input")
108 | .long("input")
109 | .short("i")
110 | .takes_value(true)
111 | .requires("cells")
112 | .requires("features")
113 | .help("path to input file"),
114 | ),
115 | )
116 | .subcommand(
117 | SubCommand::with_name("convert")
118 | .about("comnvert from eds data format to csv or mtx format")
119 | .arg(
120 | Arg::with_name("mtx")
121 | .long("mtx")
122 | .conflicts_with("eds")
123 | .conflicts_with("csv")
124 | .conflicts_with("h5")
125 | .help("convert to matrix market exchange file"),
126 | )
127 | .arg(
128 | Arg::with_name("h5")
129 | .long("h5")
130 | .conflicts_with("eds")
131 | .conflicts_with("csv")
132 | .conflicts_with("mtx")
133 | .help("convert to h5 wrapped csc file"),
134 | )
135 | .arg(
136 | Arg::with_name("csv")
137 | .long("csv")
138 | .conflicts_with("eds")
139 | .conflicts_with("mtx")
140 | .conflicts_with("h5")
141 | .help("convert to comma separated file"),
142 | )
143 | .arg(
144 | Arg::with_name("eds")
145 | .long("eds")
146 | .conflicts_with("csv")
147 | .conflicts_with("mtx")
148 | .conflicts_with("h5")
149 | .help("convert to EDS file"),
150 | )
151 | .arg(
152 | Arg::with_name("cells")
153 | .long("cells")
154 | .short("c")
155 | .takes_value(true)
156 | .help("Number of cells"),
157 | )
158 | .arg(
159 | Arg::with_name("features")
160 | .long("features")
161 | .short("f")
162 | .takes_value(true)
163 | .help("Number of features"),
164 | )
165 | .arg(
166 | Arg::with_name("input")
167 | .long("input")
168 | .short("i")
169 | .takes_value(true)
170 | .requires("cells")
171 | .requires("features")
172 | .help("path to input file"),
173 | ),
174 | )
175 | .get_matches();
176 |
177 | pretty_env_logger::init_timed();
178 | match matches.subcommand_matches("convert") {
179 | Some(sub_m) => {
180 | let ret = convert_file(&sub_m);
181 | return ret;
182 | }
183 | None => (),
184 | };
185 |
186 | match matches.subcommand_matches("randomize") {
187 | Some(sub_m) => {
188 | let ret = randomize_file(&sub_m);
189 | return ret;
190 | }
191 | None => (),
192 | };
193 |
194 | Ok(())
195 | }
196 |
--------------------------------------------------------------------------------
/src-rs/src/mtx.rs:
--------------------------------------------------------------------------------
1 | use flate2::write::GzEncoder;
2 | use flate2::read::GzDecoder;
3 | use flate2::Compression;
4 | use std::fs::File;
5 | use std::io;
6 | use std::collections::HashMap;
7 | use std::io::{Write, BufReader, BufRead};
8 |
9 | use crate::utils::triplets_to_eds;
10 | pub fn reader(
11 | input: &str,
12 | num_cells: usize,
13 | num_genes: usize,
14 | expr: &mut Vec<Vec<f32>>,
15 | bit_vecs: &mut Vec<Vec<u8>>,
16 | ) -> Result<bool, io::Error> {
17 | info!("Using {} as input MTX file\n", input);
18 | info!(
19 | "Using {} Rows (cells) and {} Columns (features)",
20 | num_cells, num_genes
21 | );
22 |
23 | let file_handle = File::open(input)?;
24 | let file = BufReader::new( GzDecoder::new(file_handle) );
25 |
26 | let cell_by_gene = true;
27 | let (cell_index, gene_index) = match cell_by_gene {
28 | true => (0, 1),
29 | false => (1, 0),
30 | };
31 |
32 | let mut found_first = false;
33 | let mut triplets: Vec<HashMap<usize, f32>> = vec![ HashMap::new(); num_cells ];
34 |
35 | for line in file.lines() {
36 | let record = line?;
37 | if record.chars().nth(0).unwrap() == '%' {
38 | continue;
39 | }
40 |
41 | let vals: Vec<&str> = record.split("\t")
42 | .collect();
43 |
44 | let gid = vals[gene_index].parse::<usize>()
45 | .expect("can't convert gid");
46 | let cid = vals[cell_index].parse::<usize>()
47 | .expect("can't convert cid");
48 | let value = vals[2].parse::<f32>()
49 | .expect("can't convert value");
50 |
51 | if ! found_first {
52 | found_first = true;
53 |
54 | assert!(num_cells == cid );
55 | assert!(num_genes == gid as usize);
56 | continue;
57 | }
58 |
59 | triplets[cid - 1].insert(gid - 1, value);
60 | }
61 |
62 | triplets_to_eds(&triplets, expr, bit_vecs, num_genes);
63 | Ok(true)
64 | }
65 |
66 | pub fn writer(
67 | path_str: String,
68 | expressions: Vec<Vec<f32>>,
69 | bit_vecs: Vec<Vec<u8>>,
70 | num_cells: usize,
71 | num_features: usize,
72 | ) -> Result<bool, io::Error> {
73 | let mut tot_expressed_features = 0;
74 | expressions
75 | .iter()
76 | .for_each(|x| tot_expressed_features += x.len());
77 |
78 | let file_handle = File::create(path_str)?;
79 | let mut file = GzEncoder::new(file_handle, Compression::default());
80 |
81 | let mut header = "%%MatrixMarket\tmatrix\tcoordinate\treal\tgeneral\n".to_string();
82 | header.push_str(&format!(
83 | "{}\t{}\t{}\n",
84 | num_cells, num_features, tot_expressed_features
85 | ));
86 | file.write_all(header.as_bytes())?;
87 |
88 | assert!(
89 | bit_vecs.len() == expressions.len(),
90 | "length of bit vec and expression is not same"
91 | );
92 | for (cell_id, exp) in expressions.into_iter().enumerate() {
93 | let bit_vec = &bit_vecs[cell_id];
94 | let mut fids: Vec<usize> = Vec::new();
95 |
96 | for (feature_id, flag) in bit_vec.into_iter().enumerate() {
97 | if *flag != 0 {
98 | for (offset, j) in format!("{:8b}", flag).chars().enumerate() {
99 | match j {
100 | '1' => fids.push((8 * feature_id) + offset),
101 | _ => (),
102 | };
103 | }
104 | }
105 | }
106 |
107 | assert!(
108 | fids.len() == exp.len(),
109 | "#positions doesn't match with #expressed features"
110 | );
111 | let mut mtx_data = "".to_string();
112 | for (index, count) in exp.into_iter().enumerate() {
113 | mtx_data.push_str(&format!(
114 | "{}\t{}\t{}\n",
115 | cell_id + 1,
116 | fids[index] + 1,
117 | count
118 | ));
119 | }
120 |
121 | file.write_all(mtx_data.as_bytes())?;
122 | }
123 |
124 | Ok(true)
125 | }
126 |
--------------------------------------------------------------------------------
/src-rs/src/utils.rs:
--------------------------------------------------------------------------------
1 | use math::round;
2 | use clap::ArgMatches;
3 | use std::collections::HashMap;
4 |
5 | use std;
6 | use std::io;
7 | use rand::thread_rng;
8 | use rand::seq::SliceRandom;
9 |
10 | use crate::{h5, mtx, csv, eds};
11 |
12 | #[derive(Clone, Debug, PartialEq)]
13 | pub enum FileType {
14 | EDS,
15 | MTX,
16 | H5,
17 | CSV,
18 | Dummy(String),
19 | }
20 |
21 | pub fn write_file( file_path: String,
22 | file_type: FileType,
23 | bit_vecs: Vec<Vec<u8>>,
24 | alphas: Vec<Vec<f32>>,
25 | num_cells: usize,
26 | num_features: usize,
27 | ) -> Result<bool, io::Error> {
28 | info!("Writing Output into file path: {}", file_path);
29 |
30 | match file_type {
31 | FileType::MTX => mtx::writer(file_path, alphas, bit_vecs, num_cells, num_features)?,
32 | FileType::CSV => csv::writer(file_path, alphas, bit_vecs, num_cells, num_features)?,
33 | FileType::H5 => h5::writer(file_path, alphas, bit_vecs, num_cells, num_features)?,
34 | FileType::EDS => eds::writer(file_path, alphas, bit_vecs, num_cells, num_features)?,
35 | _ => unreachable!(),
36 | };
37 |
38 | Ok(true)
39 | }
40 |
41 | pub fn read_file(file_path: &str,
42 | file_type: FileType,
43 | num_cells: usize,
44 | num_features: usize,
45 | ) -> Result<(Vec<Vec<u8>>, Vec<Vec<f32>>), io::Error> {
46 | let mut alphas: Vec<Vec<f32>> = Vec::new();
47 | let mut bit_vecs: Vec<Vec<u8>> = Vec::new();
48 |
49 | match file_type {
50 | FileType::EDS => eds::reader(
51 | file_path,
52 | num_cells,
53 | num_features,
54 | &mut alphas,
55 | &mut bit_vecs,
56 | )?,
57 | FileType::MTX => mtx::reader(
58 | file_path,
59 | num_cells,
60 | num_features,
61 | &mut alphas,
62 | &mut bit_vecs,
63 | )?,
64 | _ => unreachable!(),
65 | };
66 |
67 | info!("Done Reading Input file");
68 | Ok((bit_vecs, alphas))
69 | }
70 |
71 | pub fn randomize(bit_vecs: Vec<Vec<u8>>,
72 | alphas: Vec<Vec<f32>>,
73 | ) -> Result<(Vec<Vec<u8>>, Vec<Vec<f32>>), io::Error> {
74 | info!("Randomizing order");
75 | assert!( bit_vecs.len() == alphas.len() );
76 |
77 | let num_elem = bit_vecs.len() as u32;
78 | let mut order: Vec<u32> = (0..num_elem).collect();
79 | order.shuffle(&mut thread_rng());
80 |
81 | let mut shuf_bvecs = vec![Vec::new(); bit_vecs.len()];
82 | let mut shuf_alphas = vec![Vec::new(); bit_vecs.len()];
83 | for (nindex, oindex) in order.into_iter().enumerate() {
84 | shuf_bvecs[nindex] = bit_vecs[oindex as usize].clone();
85 | shuf_alphas[nindex] = alphas[oindex as usize].clone();
86 | }
87 |
88 | Ok((shuf_bvecs, shuf_alphas))
89 | }
90 |
91 | pub fn find_output_format(sub_m: &ArgMatches) -> FileType {
92 | let mut out_file_type: Option<FileType> = None;
93 | let mut found_file_types = 0;
94 |
95 | out_file_type = match sub_m.is_present("mtx") {
96 | true => {
97 | found_file_types += 1;
98 | Some(FileType::MTX)
99 | }
100 | false => out_file_type,
101 | };
102 |
103 | out_file_type = match sub_m.is_present("csv") {
104 | true => {
105 | found_file_types += 1;
106 | Some(FileType::CSV)
107 | }
108 | false => out_file_type,
109 | };
110 |
111 | out_file_type = match sub_m.is_present("h5") {
112 | true => {
113 | found_file_types += 1;
114 | Some(FileType::H5)
115 | }
116 | false => out_file_type,
117 | };
118 |
119 | out_file_type = match sub_m.is_present("eds") {
120 | true => {
121 | found_file_types += 1;
122 | Some(FileType::EDS)
123 | }
124 | false => out_file_type,
125 | };
126 |
127 | assert!(found_file_types == 1, "expected exactly one output format flag");
128 | return out_file_type.expect("can't find output format type");
129 | }
130 |
131 | pub fn get_output_path(input_path: &str, otype: FileType) -> (FileType, String) {
132 | let mut itype: FileType = FileType::EDS;
133 | let mut opath = input_path.to_string();
134 | let mut offset: usize = opath.len();
135 | let mut found_file_types = 0;
136 |
137 | match opath.find(".eds") {
138 | Some(val) => {
139 | offset = val;
140 | }
141 | _ => (),
142 | };
143 |
144 | match opath.find(".mtx") {
145 | Some(val) => {
146 | offset = val;
147 | itype = FileType::MTX;
148 | found_file_types += 1;
149 | }
150 | _ => (),
151 | };
152 |
153 | match opath.find(".h5") {
154 | Some(val) => {
155 | offset = val;
156 | itype = FileType::H5;
157 | found_file_types += 1;
158 | }
159 | _ => (),
160 | };
161 |
162 | match opath.find(".csv") {
163 | Some(val) => {
164 | offset = val;
165 | itype = FileType::CSV;
166 | found_file_types += 1;
167 | }
168 | _ => (),
169 | };
170 |
171 | assert!(
172 | found_file_types == 1 || itype == FileType::EDS,
173 | " Can't find right input file type "
174 | );
175 | assert!(
176 | itype != otype,
177 | "Found same input and output file file format"
178 | );
179 |
180 | info!(" Found {:?} as input file type ", itype);
181 | info!(" Found {:?} as output file type ", otype);
182 |
183 | match otype {
184 | FileType::MTX => opath.replace_range(offset.., ".mtx.gz"),
185 | FileType::CSV => opath.replace_range(offset.., ".csv.gz"),
186 | FileType::H5 => opath.replace_range(offset.., ".h5"),
187 | FileType::EDS => opath.replace_range(offset.., ".eds.gz"),
188 | FileType::Dummy(name) => opath.replace_range(offset.., &name),
189 | }
190 |
191 | (itype, opath)
192 | }
193 |
194 | pub fn triplets_to_eds(triplets: &Vec<HashMap<usize, f32>>,
195 | expr: &mut Vec<Vec<f32>>,
196 | bit_vecs: &mut Vec<Vec<u8>>,
197 | num_genes: usize,
198 | ) {
199 | for cell_data in triplets {
200 | let mut keys: Vec<usize> = cell_data.keys()
201 | .cloned()
202 | .collect();
203 | keys.sort();
204 |
205 | let values: Vec<f32> = keys.iter().map( |key| cell_data[key] )
206 | .collect();
207 |
208 | expr.push(values);
209 |
210 | let num_exp_genes = keys.len();
211 | let num_bit_vecs: usize = round::ceil(num_genes as f64 / 8.0, 0) as usize;
212 | let mut bit_vec: Vec<u8> = vec![0; num_bit_vecs];
213 |
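// Pack the sorted gene indices into the bit vector: gene `key` sets bit
// (128 >> (key % 8)) of byte key / 8, i.e. 8 genes per byte with the
// most-significant bit first.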
214 | let mut min_processed_close = 0;
215 | let mut max_processed_open = 8;
216 | let mut curr_index = 0;
217 | let mut flag: u8 = 0;
218 |
219 | for key in keys {
220 | assert!(key >= min_processed_close);
221 | assert!(curr_index < num_bit_vecs);
222 |
223 | let offset: u8 = (key % 8) as u8;
224 | if key < max_processed_open {
225 | flag |= 128u8 >> offset;
226 | } else {
227 | bit_vec[curr_index] = flag;
228 |
229 | while key >= max_processed_open {
230 | curr_index += 1;
231 | min_processed_close = max_processed_open;
232 | max_processed_open += 8;
233 | }
234 | flag = 128u8 >> offset;
235 | }
236 | }
237 | bit_vec[curr_index] = flag;
238 |
239 | let mut num_ones = 0;
240 | for bits in bit_vec.iter() {
241 | num_ones += bits.count_ones();
242 | }
243 | assert!(num_ones as usize == num_exp_genes,
244 | format!("{:?} {:?}", num_ones, num_exp_genes));
245 |
246 | bit_vecs.push(bit_vec);
247 | }
248 | }
249 |
--------------------------------------------------------------------------------