├── R
│   ├── lock.R
│   ├── todo-list.R
│   ├── methods-typeof.R
│   ├── methods-subtract.R
│   ├── misc-docs.R
│   ├── aaa-docs.R
│   ├── internal_misc.R
│   ├── write.R
│   ├── aaa.R
│   ├── methods-apply.R
│   ├── filearray-package.R
│   ├── RcppExports.R
│   ├── header.R
│   ├── helpers.R
│   ├── mapreduce.R
│   ├── methods-subset.R
│   └── methods-subsetAssign.R
├── .github
│   ├── .gitignore
│   └── workflows
│       ├── pkgdown.yaml
│       ├── R-CMD-check.yaml
│       └── rhub.yaml
├── adhoc
│   ├── .gitignore
│   ├── other docs
│   │   ├── comparisons.md
│   │   ├── .gitignore
│   │   ├── comparison-threads-coldstart.png
│   │   ├── comparison-threads-warmstart.png
│   │   ├── comparison-singlethread-coldstart.png
│   │   ├── readme-figure.R
│   │   ├── comparison-singlethread.R
│   │   └── comparison-threads.R
│   ├── readme-speed.png
│   ├── rchk.sh
│   ├── debug1.R
│   ├── performance.R
│   ├── performance-write.R
│   ├── debug.R
│   ├── map-test.R
│   ├── gctorture.R
│   ├── profiling2.R
│   ├── profiling.R
│   └── readme-figure.R
├── inst
│   ├── WORDLIST
│   ├── hexbadge.png
│   └── include
│       ├── TinyParallel.h
│       └── TinyParallel
│           ├── Common.h
│           ├── TinyThread.h
│           └── Timer.h
├── src
│   ├── .gitignore
│   ├── Makevars
│   ├── common.h
│   ├── threadSettings.h
│   ├── defs.h
│   ├── load.h
│   ├── threadSettings.cpp
│   ├── utils.h
│   ├── serialize.h
│   ├── interfaces.cpp
│   ├── save.h
│   ├── core.h
│   ├── conversion.h
│   └── serialize.cpp
├── vignettes
│   └── .gitignore
├── cran-comments.md
├── _pkgdown.yml
├── CRAN-SUBMISSION
├── .Rbuildignore
├── tests
│   ├── testthat.R
│   └── testthat
│       ├── test-method_subset.R
│       ├── test-method_subsetAssign.R
│       ├── test-as_filearray.R
│       ├── test-methods_simple.R
│       ├── test-methods.R
│       ├── test-dimnames.R
│       ├── test-helpers.R
│       ├── test-bind.R
│       ├── test-map.R
│       └── test-method_sub.R
├── filearray.Rproj
├── .gitignore
├── man
│   ├── filearray_threads.Rd
│   ├── typeof.Rd
│   ├── apply.Rd
│   ├── FileArray-class.Rd
│   ├── fwhich.Rd
│   ├── S3-filearray.Rd
│   ├── filearray_bind.Rd
│   ├── mapreduce.Rd
│   ├── fmap.Rd
│   └── filearray.Rd
├── DESCRIPTION
├── NAMESPACE
├── NEWS.md
└── README.md
/R/lock.R: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.github/.gitignore: -------------------------------------------------------------------------------- 1 | *.html 2 | -------------------------------------------------------------------------------- /adhoc/.gitignore: -------------------------------------------------------------------------------- 1 | Dockerfile 2 | -------------------------------------------------------------------------------- /adhoc/other docs/comparisons.md: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /inst/WORDLIST: -------------------------------------------------------------------------------- 1 | FileArray 2 | OpenMP 3 | -------------------------------------------------------------------------------- /adhoc/other docs/.gitignore: -------------------------------------------------------------------------------- 1 | *.RData 2 | -------------------------------------------------------------------------------- /src/.gitignore: -------------------------------------------------------------------------------- 1 | *.o 2 | *.so 3 | *.dll 4 | -------------------------------------------------------------------------------- /vignettes/.gitignore: -------------------------------------------------------------------------------- 1 | *.html 2 | *.R 3 | -------------------------------------------------------------------------------- /src/Makevars: 
-------------------------------------------------------------------------------- 1 | PKG_CPPFLAGS= -I../inst/include 2 | PKG_LIBS= 3 | -------------------------------------------------------------------------------- /cran-comments.md: -------------------------------------------------------------------------------- 1 | ## R CMD check results 2 | 3 | 0 errors | 0 warnings | 0 notes 4 | -------------------------------------------------------------------------------- /inst/hexbadge.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dipterix/filearray/HEAD/inst/hexbadge.png -------------------------------------------------------------------------------- /_pkgdown.yml: -------------------------------------------------------------------------------- 1 | url: https://dipterix.org/filearray/ 2 | template: 3 | bootstrap: 5 4 | 5 | -------------------------------------------------------------------------------- /adhoc/readme-speed.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dipterix/filearray/HEAD/adhoc/readme-speed.png -------------------------------------------------------------------------------- /CRAN-SUBMISSION: -------------------------------------------------------------------------------- 1 | Version: 0.1.9 2 | Date: 2024-11-08 16:38:30 UTC 3 | SHA: 5f59eb137a26b7e2915a36884d8742407ce9fa72 4 | -------------------------------------------------------------------------------- /src/common.h: -------------------------------------------------------------------------------- 1 | #ifndef FARR_COMMON_H 2 | #define FARR_COMMON_H 3 | 4 | #include 5 | 6 | #endif // FARR_COMMON_H -------------------------------------------------------------------------------- /adhoc/other docs/comparison-threads-coldstart.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dipterix/filearray/HEAD/adhoc/other docs/comparison-threads-coldstart.png -------------------------------------------------------------------------------- /adhoc/other docs/comparison-threads-warmstart.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dipterix/filearray/HEAD/adhoc/other docs/comparison-threads-warmstart.png -------------------------------------------------------------------------------- /R/todo-list.R: -------------------------------------------------------------------------------- 1 | 2 | # message("TODO: add test functions for `apply` on proxy arrays") 3 | # message("TODO: add test functions for `mapreduce` on proxy arrays") 4 | -------------------------------------------------------------------------------- /adhoc/other docs/comparison-singlethread-coldstart.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dipterix/filearray/HEAD/adhoc/other docs/comparison-singlethread-coldstart.png -------------------------------------------------------------------------------- /adhoc/rchk.sh: -------------------------------------------------------------------------------- 1 | # rm "$HOME/Dropbox/projects/filearray_0.1.3.9001.tar.gz" 2 | docker run -v "$HOME/Dropbox/projects":/projects kalibera/rchk:latest "/projects/filearray_0.1.3.9001.tar.gz" 3 | -------------------------------------------------------------------------------- /.Rbuildignore: 
-------------------------------------------------------------------------------- 1 | ^.*\.Rproj$ 2 | ^\.Rproj\.user$ 3 | tests2 4 | README.md 5 | ^adhoc 6 | ^\.github$ 7 | ^_pkgdown\.yml$ 8 | ^docs$ 9 | ^pkgdown$ 10 | ^cran-comments\.md$ 11 | ^CRAN-RELEASE$ 12 | ^inst/hexbadge\.png$ 13 | ^CRAN-SUBMISSION$ 14 | -------------------------------------------------------------------------------- /tests/testthat.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | library(filearray) 3 | 4 | # Sys.setenv("FILEARRAY_SKIP_COLLAPSE" = "TRUE") 5 | 6 | cat(utils::capture.output({ 7 | print(Sys.getenv()) 8 | }), sep = "\n", file = stderr()) 9 | 10 | test_check("filearray") 11 | 12 | filearray:::clear_cache() 13 | -------------------------------------------------------------------------------- /src/threadSettings.h: -------------------------------------------------------------------------------- 1 | #ifndef FARR_OPENMP_H 2 | #define FARR_OPENMP_H 3 | 4 | #ifdef _OPENMP 5 | #include 6 | #include 7 | #define FARR_HAS_OPENMP true 8 | #else 9 | #define omp_get_thread_num() 0 10 | #define omp_get_max_threads() 1 11 | #define FARR_HAS_OPENMP false 12 | #endif 13 | 14 | #include "common.h" 15 | #include 16 | 17 | 18 | int getThreads(const bool& max = false); 19 | 20 | 21 | #endif // FARR_OPENMP_H 22 | -------------------------------------------------------------------------------- /filearray.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | ProjectId: 21f88fad-b806-4cce-a5b4-b11d735e9125 3 | 4 | RestoreWorkspace: No 5 | SaveWorkspace: No 6 | AlwaysSaveHistory: No 7 | 8 | EnableCodeIndexing: Yes 9 | UseSpacesForTab: Yes 10 | NumSpacesForTab: 4 11 | Encoding: UTF-8 12 | 13 | RnwWeave: knitr 14 | LaTeX: pdfLaTeX 15 | 16 | BuildType: Package 17 | PackageUseDevtools: Yes 18 | PackageInstallArgs: --no-multiarch --with-keep.source 19 | PackageCheckArgs: --as-cran 20 | PackageRoxygenize: rd,collate,namespace,vignette 21 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # History files 2 | .Rhistory 3 | .Rapp.history 4 | # Session Data files 5 | .RData 6 | # Example code in package build process 7 | *-Ex.R 8 | # RStudio files 9 | .Rproj.user/ 10 | # produced vignettes 11 | vignettes/*.html 12 | vignettes/*.pdf 13 | vignettes/UsingFilematrices_cache/* 14 | # OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3 15 | .httr-oauth 16 | .Rproj.user 17 | # extra tests 18 | tests2/ 19 | *.DS_Store 20 | docs 21 | CRAN-RELEASE 22 | inst/doc 23 | # C++ related 24 | *.a 25 | *.dll 26 | *.o 27 | *.so 28 | *.dll 29 | -------------------------------------------------------------------------------- /man/filearray_threads.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/misc-docs.R 3 | \name{filearray_threads} 4 | \alias{filearray_threads} 5 | \title{Set or get file array threads} 6 | \usage{ 7 | filearray_threads(n, ...) 8 | } 9 | \arguments{ 10 | \item{n}{number of threads to set. 
If \code{n} is negative, 11 | then default to the number of cores that computer has.} 12 | 13 | \item{...}{internally used} 14 | } 15 | \value{ 16 | An integer of current number of threads 17 | } 18 | \description{ 19 | Will enable/disable multi-threaded reading or writing 20 | at \code{C++} level. 21 | } 22 | -------------------------------------------------------------------------------- /src/defs.h: -------------------------------------------------------------------------------- 1 | #ifndef FARR_DEF_H 2 | #define FARR_DEF_H 3 | 4 | #ifndef NA_INTEGER64 5 | #define NA_INTEGER64 LLONG_MIN 6 | #endif 7 | 8 | #ifndef FARR_HEADER_LENGTH 9 | #define FARR_HEADER_LENGTH 1024 10 | #endif 11 | 12 | /********************************************************** 13 | * Extended types 14 | ***********************************************************/ 15 | 16 | #define FLTSXP 26 17 | #define FLOAT(x) ((float*) INTEGER(x)) 18 | 19 | const static float NA_FLOAT = NAN; 20 | const static Rbyte NA_RBYTE = 2; 21 | 22 | #define INT64SXP REALSXP 23 | #define INTEGER64(x) ((int64_t*) REAL(x)) 24 | 25 | #endif // FARR_DEF_H -------------------------------------------------------------------------------- /man/typeof.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/methods-typeof.R 3 | \name{typeof} 4 | \alias{typeof} 5 | \alias{typeof,FileArray-method} 6 | \alias{typeof,FileArrayProxy-method} 7 | \title{The type of a file array (extended)} 8 | \usage{ 9 | typeof(x) 10 | 11 | \S4method{typeof}{FileArray}(x) 12 | 13 | \S4method{typeof}{FileArrayProxy}(x) 14 | } 15 | \arguments{ 16 | \item{x}{any file array} 17 | } 18 | \value{ 19 | A character string. The possible values are \code{"double"}, 20 | \code{"integer"}, \code{"logical"}, and \code{"raw"} 21 | } 22 | \description{ 23 | The type of a file array (extended) 24 | } 25 | -------------------------------------------------------------------------------- /src/load.h: -------------------------------------------------------------------------------- 1 | #ifndef FARR_LOAD_H 2 | #define FARR_LOAD_H 3 | 4 | #include "common.h" 5 | 6 | SEXP FARR_subset2( 7 | const std::string& filebase, 8 | const SEXP listOrEnv, 9 | const SEXP reshape, 10 | const bool drop, 11 | const bool use_dimnames, 12 | size_t thread_buffer, 13 | int split_dim, 14 | const int strict 15 | ); 16 | 17 | SEXP FARR_subset_sequential( 18 | const std::string& filebase, 19 | const int64_t& unit_partlen, 20 | SEXP cum_partsizes, 21 | SEXPTYPE array_type, 22 | SEXP ret, 23 | const int64_t from, 24 | const int64_t len 25 | ); 26 | 27 | #endif // FARR_LOAD_H -------------------------------------------------------------------------------- /src/threadSettings.cpp: -------------------------------------------------------------------------------- 1 | #include "threadSettings.h" 2 | 3 | #include 4 | #include 5 | 6 | // [[Rcpp::export]] 7 | SEXP getDefaultNumThreads() { 8 | SEXP threadsSEXP = PROTECT(Rf_allocVector(INTSXP, 1)); 9 | INTEGER(threadsSEXP)[0] = tthread::thread::hardware_concurrency(); 10 | UNPROTECT(1); 11 | return threadsSEXP; 12 | } 13 | 14 | // [[Rcpp::export]] 15 | int getThreads(const bool& max){ 16 | int maxThreads = tthread::thread::hardware_concurrency(); 17 | if( max ) { 18 | return( maxThreads ); 19 | } 20 | int n = TinyParallel::resolveValue("FILEARRAY_NUM_THREADS", -1, maxThreads); 21 | if( n <= 0 || n > maxThreads ) { 22 | n = maxThreads; 23 | } 24 | return( n ); 25 | } 26 | 
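
A minimal usage sketch of the thread controls above, driven from R (an illustration, not part of the package sources; it only uses the exported filearray_threads() wrapper documented in man/filearray_threads.Rd, and the returned counts depend on the machine):

library(filearray)

filearray_threads()    # query the current number of threads
filearray_threads(2)   # request 2 threads for multi-threaded reads/writes
                       # (stored in the FILEARRAY_NUM_THREADS environment variable,
                       #  capped at the hardware concurrency resolved in threadSettings.cpp)
filearray_threads(-1)  # any negative value restores the default: all available cores
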
-------------------------------------------------------------------------------- /R/methods-typeof.R: -------------------------------------------------------------------------------- 1 | 2 | #' @title The type of a file array (extended) 3 | #' @param x any file array 4 | #' @return A character string. The possible values are \code{"double"}, 5 | #' \code{"integer"}, \code{"logical"}, and \code{"raw"} 6 | #' @export 7 | setGeneric("typeof") 8 | 9 | #' @rdname typeof 10 | setMethod('typeof', signature(x = "FileArray"), function(x){ 11 | if(!x$valid()){ 12 | stop("Invalid file array") 13 | } 14 | x$type() 15 | }) 16 | 17 | #' @rdname typeof 18 | setMethod('typeof', signature(x = "FileArrayProxy"), function(x){ 19 | if(!x$valid()){ 20 | stop("Invalid file array") 21 | } 22 | if( length(x$.ops) ) { 23 | final_op <- x$.ops[[length(x$.ops)]] 24 | return( final_op$output_type ) 25 | } else { 26 | x$type() 27 | } 28 | }) 29 | -------------------------------------------------------------------------------- /src/utils.h: -------------------------------------------------------------------------------- 1 | #ifndef FARR_UTILS_H 2 | #define FARR_UTILS_H 3 | 4 | #include "common.h" 5 | #include "defs.h" 6 | #include "core.h" 7 | 8 | int guess_splitdim(SEXP dim, int elem_size, size_t buffer_bytes); 9 | 10 | void set_buffer(SEXP dim, int elem_size, size_t buffer_bytes, int split_dim); 11 | 12 | SEXPTYPE file_buffer_sxptype(SEXPTYPE array_type); 13 | SEXPTYPE array_memory_sxptype(SEXPTYPE array_type); 14 | 15 | int file_element_size(SEXPTYPE array_type); 16 | int memory_element_size(SEXPTYPE array_type); 17 | 18 | int kinda_sorted(SEXP idx, int64_t min_, int64_t buffer_count); 19 | 20 | SEXP check_missing_dots(const SEXP env); 21 | 22 | SEXP sub_vec(SEXP x, SEXP idx_int64); 23 | SEXP sub_vec_range(SEXP x, const R_xlen_t& min_, const R_xlen_t& len_); 24 | 25 | SEXP reshape_or_drop(SEXP x, SEXP reshape, bool drop); 26 | 27 | SEXP subset_dimnames(SEXP dimnames, SEXP sliceIdx); 28 | 29 | 30 | #endif // FARR_UTILS_H -------------------------------------------------------------------------------- /src/serialize.h: -------------------------------------------------------------------------------- 1 | #ifndef FARR_UNSERIALIZE_H 2 | #define FARR_UNSERIALIZE_H 3 | 4 | #include "common.h" 5 | 6 | // buffer type 7 | typedef struct { 8 | size_t length; 9 | size_t pos; 10 | unsigned char *data; 11 | } buffer_t; 12 | 13 | int read_byte(R_inpstream_t stream); 14 | void read_bytes(R_inpstream_t stream, void *dst, int length); 15 | 16 | SEXP unserialize_raw(SEXP x); 17 | SEXP unserialize_connection(FILE* conn, size_t len); 18 | 19 | /********************************************************** 20 | * Endianess 21 | ***********************************************************/ 22 | bool isLittleEndian(); 23 | 24 | void swap_endianess(void *ptr, const size_t& size, const size_t& nmemb); 25 | 26 | size_t lendian_fwrite(void *ptr, size_t size, size_t nmemb, FILE *stream); 27 | 28 | size_t lendian_fread(void *ptr, size_t size, size_t nmemb, FILE *stream); 29 | 30 | void lendian_assign(void* dst, const void* src, const size_t& elem_size, const size_t& nelems = 1); 31 | 32 | #endif // FARR_UNSERIALIZE_H -------------------------------------------------------------------------------- /man/apply.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/methods-apply.R 3 | \name{apply} 4 | \alias{apply} 5 | 
\alias{apply,FileArray-method} 6 | \alias{apply,FileArrayProxy-method} 7 | \title{Apply functions over file array margins (extended)} 8 | \usage{ 9 | apply(X, MARGIN, FUN, ..., simplify = TRUE) 10 | 11 | \S4method{apply}{FileArray}(X, MARGIN, FUN, ..., simplify = TRUE) 12 | 13 | \S4method{apply}{FileArrayProxy}(X, MARGIN, FUN, ..., simplify = TRUE) 14 | } 15 | \arguments{ 16 | \item{X}{a file array} 17 | 18 | \item{MARGIN}{scalar giving the subscripts which the function will be applied over. Current implementation only allows margin size to be one} 19 | 20 | \item{FUN}{the function to be applied} 21 | 22 | \item{...}{optional arguments to \code{FUN}} 23 | 24 | \item{simplify}{a logical indicating whether results should be simplified if possible} 25 | } 26 | \value{ 27 | See Section 'Value' in \code{\link[base]{apply}}; 28 | } 29 | \description{ 30 | Apply functions over file array margins (extended) 31 | } 32 | -------------------------------------------------------------------------------- /R/methods-subtract.R: -------------------------------------------------------------------------------- 1 | # 2 | # fa_subtract <- function(e1, e2) { 3 | # call <- match.call() 4 | # call[[1]] <- quote(`-`) 5 | # label <- sprintf("Calculating: %s (fa_subtract)", deparse1(call)) 6 | # 7 | # fa_pairwise_operator(e1, e2, op = "-", label = label) 8 | # } 9 | # 10 | # 11 | # 12 | # setMethod('-', signature(e1 = "FileArray", e2 = "FileArray"), fa_subtract) 13 | # 14 | # 15 | # setMethod('-', signature(e1 = "FileArray", e2 = "numeric"), fa_subtract) 16 | # 17 | # setMethod('-', signature(e1 = "numeric", e2 = "FileArray"), fa_subtract) 18 | # 19 | # 20 | # setMethod('-', signature(e1 = "FileArray", e2 = "complex"), fa_subtract) 21 | # 22 | # setMethod('-', signature(e1 = "complex", e2 = "FileArray"), fa_subtract) 23 | # 24 | # 25 | # setMethod('-', signature(e1 = "FileArray", e2 = "logical"), fa_subtract) 26 | # 27 | # setMethod('-', signature(e1 = "logical", e2 = "FileArray"), fa_subtract) 28 | # 29 | # 30 | # setMethod('-', signature(e1 = "FileArray", e2 = "array"), fa_subtract) 31 | # 32 | # setMethod('-', signature(e1 = "array", e2 = "FileArray"), fa_subtract) 33 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: filearray 2 | Type: Package 3 | Title: File-Backed Array for Out-of-Memory Computation 4 | Version: 0.2.0 5 | Language: en-US 6 | Encoding: UTF-8 7 | License: LGPL-3 8 | URL: https://dipterix.org/filearray/, 9 | https://github.com/dipterix/filearray 10 | BugReports: https://github.com/dipterix/filearray/issues 11 | Authors@R: c( 12 | person( 13 | "Zhengjia", "Wang", role = c("aut", "cre", "cph"), 14 | email = "dipterix.wang@gmail.com" 15 | )) 16 | Description: Stores large arrays in files to avoid occupying large 17 | memories. Implemented with super fast gigabyte-level multi-threaded 18 | reading/writing via 'OpenMP'. Supports multiple non-character data 19 | types (double, float, complex, integer, logical, and raw). 
20 | Imports: 21 | digest, 22 | fastmap (>= 1.1.1), 23 | methods, 24 | Rcpp, 25 | uuid (>= 1.1.0) 26 | Suggests: 27 | bit64, 28 | knitr, 29 | rmarkdown, 30 | testthat (>= 3.0.0) 31 | RoxygenNote: 7.3.2 32 | LinkingTo: 33 | BH, 34 | Rcpp 35 | Config/testthat/edition: 3 36 | VignetteBuilder: knitr 37 | -------------------------------------------------------------------------------- /src/interfaces.cpp: -------------------------------------------------------------------------------- 1 | #include "core.h" 2 | #include "save.h" 3 | #include "load.h" 4 | using namespace Rcpp; 5 | 6 | // [[Rcpp:interfaces(cpp)]] 7 | 8 | // [[Rcpp::export]] 9 | SEXP filearray_meta( 10 | const std::string& filebase 11 | ){ 12 | List re = FARR_meta(filebase); 13 | return( re ); 14 | } 15 | 16 | // [[Rcpp::export]] 17 | SEXP filearray_assign( 18 | const std::string& filebase, SEXP value, 19 | const SEXP position_indices 20 | ) { 21 | size_t thread_buffer = get_buffer_size(); 22 | FARR_subset_assign2( 23 | filebase, value, position_indices, 24 | thread_buffer, 0); 25 | return(R_NilValue); 26 | } 27 | 28 | // [[Rcpp::export]] 29 | SEXP filearray_subset( 30 | const std::string& filebase, 31 | const SEXP position_indices, 32 | const bool drop = true, 33 | const bool use_dimnames = true, 34 | const SEXP reshape = R_NilValue 35 | ) { 36 | size_t thread_buffer = get_buffer_size(); 37 | SEXP ret = PROTECT(FARR_subset2(filebase, position_indices, reshape, drop, 38 | use_dimnames, thread_buffer, 0, 1)); 39 | UNPROTECT(1); 40 | return( ret ); 41 | } -------------------------------------------------------------------------------- /src/save.h: -------------------------------------------------------------------------------- 1 | #ifndef FARR_SAVE_H 2 | #define FARR_SAVE_H 3 | 4 | #include "common.h" 5 | /** 6 | * @param filebase root path of array, must be absolute and "corrected" (ends with "/" or "\\", depending on system) 7 | * @param sch returned from `schedule`, scheduled indices 8 | * @param value value to set, must be coerced 9 | * @param buff_ptrs buffer points: length of `buff_ptrs` are the number of cores; each element must be at least buffer size in bytes (get_buffer_size()) 10 | */ 11 | 12 | SEXP FARR_subset_assign2( 13 | const std::string& filebase, 14 | SEXP value, 15 | const SEXP listOrEnv, 16 | const size_t thread_buffer, 17 | int split_dim 18 | ); 19 | 20 | SEXP FARR_subset_assign_sequential( 21 | const std::string& filebase, 22 | const int64_t& unit_partlen, 23 | SEXP cum_partsizes, 24 | SEXPTYPE array_type, 25 | SEXP value, 26 | const int64_t from 27 | ); 28 | 29 | SEXP FARR_subset_assign_sequential_bare( 30 | const std::string& filebase, 31 | const int64_t& unit_partlen, 32 | SEXP cum_partsizes, 33 | SEXPTYPE array_type, 34 | SEXP value_, 35 | const int64_t from 36 | ); 37 | 38 | #endif //FARR_SAVE_H 39 | 40 | -------------------------------------------------------------------------------- /inst/include/TinyParallel.h: -------------------------------------------------------------------------------- 1 | #ifndef __FILEARRAY_PARALLEL__ 2 | #define __FILEARRAY_PARALLEL__ 3 | 4 | // TinyThread implementation 5 | #include "TinyParallel/TinyThread.h" 6 | 7 | namespace TinyParallel { 8 | 9 | inline void parallelFor(std::size_t begin, 10 | std::size_t end, 11 | Worker& worker, 12 | std::size_t grainSize = 1, 13 | int numThreads = -1) 14 | { 15 | grainSize = resolveValue("FILEARRAY_GRAIN_SIZE", grainSize, 1u); 16 | numThreads = resolveValue("FILEARRAY_NUM_THREADS", numThreads, -1); 17 | 18 | ttParallelFor(begin, end, 
worker, grainSize); 19 | } 20 | 21 | template 22 | inline void parallelReduce(std::size_t begin, 23 | std::size_t end, 24 | Reducer& reducer, 25 | std::size_t grainSize = 1, 26 | int numThreads = -1) 27 | { 28 | grainSize = resolveValue("FILEARRAY_GRAIN_SIZE", grainSize, 1); 29 | numThreads = resolveValue("FILEARRAY_NUM_THREADS", numThreads, -1); 30 | 31 | ttParallelReduce(begin, end, reducer, grainSize); 32 | } 33 | 34 | } // end namespace TinyParallel 35 | 36 | #endif // __FILEARRAY_PARALLEL__ 37 | -------------------------------------------------------------------------------- /tests/testthat/test-method_subset.R: -------------------------------------------------------------------------------- 1 | test_that("subset filearray-proxy", { 2 | 3 | # normal indexing 4 | x <- as_filearray(1:120, dimension = c(10,12)) 5 | dimnames(x) <- list(A = 1:10, B = 1:12) 6 | y <- x + 1L 7 | z <- y + y 8 | 9 | idx1 <- c(3,3,4,5,5,4,3,2,1,1,2,3,3,2,2) 10 | idx2 <- idx1 * 2 11 | 12 | expect_equal(y[idx1, idx2], x[idx1, idx2] + 1) 13 | expect_equal(z[idx1, idx2], 2 * y[idx1, idx2]) 14 | 15 | expect_equal(y[, idx2], x[, idx2] + 1) 16 | expect_equal(z[, idx2], 2 * y[, idx2]) 17 | 18 | expect_equal(y[idx1, ], x[idx1, ] + 1) 19 | expect_equal(z[idx1, ], 2 * y[idx1, ]) 20 | 21 | # filearray as index 22 | i <- array(FALSE, dim(x)) 23 | i[idx1, idx2] <- TRUE 24 | j <- as_filearray(i) 25 | expect_equal(typeof(j), "logical") 26 | 27 | expect_equal(x[i], x[][i]) 28 | expect_equal(x[i], x[][j[]]) 29 | 30 | expect_equal(y[i], x[i] + 1L) 31 | expect_equal(y[j], y[i]) 32 | 33 | expect_equal(z[i], y[i] * 2) 34 | expect_equal(z[j], z[i]) 35 | 36 | # integer index 37 | idx <- rep(which(i), 2) 38 | 39 | # expect_equal(x[idx], x[][idx]) 40 | # expect_equal(y[idx], y[idx]) 41 | # expect_equal(z[idx], z[idx]) 42 | expect_error(x[idx]) 43 | 44 | }) 45 | -------------------------------------------------------------------------------- /adhoc/debug1.R: -------------------------------------------------------------------------------- 1 | devtools::load_all() 2 | loadNamespace("bit64") 3 | unlink('~/Desktop/junk/arr', recursive = TRUE) 4 | arr <- filearray::filearray_create( 5 | filebase = '~/Desktop/junk/arr', 6 | dimension = c(861584, 233), 7 | type = "integer", partition_size = 1 8 | ) 9 | 10 | parition_size <- 9001 11 | niters <- 96 12 | begin <- 216025 13 | end <- 225025 14 | by <- 9001 15 | # arr[begin-1 + seq_len(9000),] <- rep(1:233, 9000) 16 | arr[begin-1 + seq_len(9001),] <- rep(1:233, 9001) 17 | 18 | 0x102204000 19 | 0x1021fb360 20 | 21 | filearray::filearray_threads(1) 22 | 23 | arr[1,] <- 1:233 24 | 25 | arr[seq_len(by),] <- rep(1:233,by) 26 | arr[end,] <- rep(1:233,1) 27 | i <- begin 28 | arr[i-1 + 1:100,] <- rep(1:233,100) 29 | 30 | 31 | # 0, skip=0 32 | # idx start-end: 0 - 0 33 | # [1] "/Users/dipterix/Desktop/junk/arr/0.farr" 34 | # block_size: 861584, idx1len: 9001, idx1_start: 216024, idx2_start: 0, idx2_len: 1 35 | # ### idx2ii:0, start_loc: 0, buf pos: 0, idx1_start: 216024 36 | 37 | # *idx1ptr: 216024, idx1_start: 216024 38 | # *idx1ptr: 225024, idx1_start: 216024 39 | arr[begin-1 + seq_len(by),] <- rep(1:233,by) 40 | 41 | 42 | for(i in seq(begin, 861584-by+1, by = by)) { 43 | print(i) 44 | try({ arr[i-1 + seq_len(by),] <- rep(1:233,by) }) 45 | } 46 | 225024 - 216024 47 | 48 | file.size(arr$partition_path(1)) 49 | (3447360 - 1024) / 4 50 | -------------------------------------------------------------------------------- /R/misc-docs.R: 
-------------------------------------------------------------------------------- 1 | #' @title Set or get file array threads 2 | #' @description Will enable/disable multi-threaded reading or writing 3 | #' at \code{C++} level. 4 | #' @param n number of threads to set. If \code{n} is negative, 5 | #' then default to the number of cores that computer has. 6 | #' @param ... internally used 7 | #' @return An integer of current number of threads 8 | #' @export 9 | filearray_threads <- function(n, ...){ 10 | 11 | if(!missing(n)){ 12 | setThreads(n, ...) 13 | } 14 | 15 | return(getThreads(FALSE)) 16 | } 17 | 18 | setThreads <- function (n = "auto", stack_size = "auto", ...) { 19 | if (identical(n, "auto")) { 20 | n <- -1L 21 | } else { 22 | n <- as.integer(n) 23 | if (length(n) != 1 || is.na(n) || !is.numeric(n)) { 24 | stop("n must be an integer") 25 | } 26 | } 27 | if (identical(stack_size, "auto")) { 28 | stack_size <- 0L 29 | } else if (!is.numeric(stack_size)) { 30 | stop("stack_size must be an integer") 31 | } else { 32 | stack_size <- as.integer(stack_size) 33 | } 34 | if (n == -1L) { 35 | Sys.unsetenv("FILEARRAY_NUM_THREADS") 36 | } else { 37 | Sys.setenv(FILEARRAY_NUM_THREADS = n) 38 | } 39 | if (stack_size == 0L) { 40 | Sys.unsetenv("FILEARRAY_STACK_SIZE") 41 | } else { 42 | Sys.setenv(FILEARRAY_STACK_SIZE = stack_size) 43 | } 44 | invisible() 45 | } 46 | -------------------------------------------------------------------------------- /R/aaa-docs.R: -------------------------------------------------------------------------------- 1 | ## usethis namespace: start 2 | #' @importFrom Rcpp sourceCpp 3 | #' @useDynLib filearray, .registration = TRUE 4 | ## usethis namespace: end 5 | NULL 6 | 7 | #' @title 'S3' methods for 'FileArray' 8 | #' @name S3-filearray 9 | #' @description These are 'S3' methods for 'FileArray' 10 | #' @param x a file array 11 | #' @param drop whether to drop dimensions; see topic \code{\link[base]{Extract}} 12 | #' @param reshape a new dimension to set before returning subset results; default is \code{NULL} (use default dimensions) 13 | #' @param strict whether to allow indices to exceed bound; currently only accept \code{TRUE} 14 | #' @param dimnames whether to preserve \code{\link[base]{dimnames}} 15 | #' @param value value to substitute or set 16 | #' @param na.rm whether to remove \code{NA} values during the calculation 17 | #' @param split_dim internally used; split dimension and calculate indices to 18 | #' manually speed up the subset; value ranged from 0 to size of dimension minus 19 | #' one. 20 | #' @param lazy whether to lazy-evaluate the method, only works when assigning 21 | #' arrays with logical array index 22 | #' @param i,... index set, or passed to other methods 23 | #' @param .env environment to evaluate formula when evaluating subset margin indices. 24 | NULL 25 | 26 | #' @name S4-filearray 27 | #' @title 'S4' methods for \code{FileArray} 28 | #' @param x,z,e1,e2 \code{FileArray} or compatible data 29 | #' @param base,digits,... 
passed to other methods 30 | #' @returns See \code{\link[methods]{S4groupGeneric}} 31 | NULL 32 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | S3method("[",FileArray) 4 | S3method("[<-",FileArray) 5 | S3method("[[",FileArray) 6 | S3method("dimnames<-",FileArray) 7 | S3method(as.array,FileArray) 8 | S3method(as_filearray,FileArray) 9 | S3method(as_filearray,FileArrayProxy) 10 | S3method(as_filearray,character) 11 | S3method(as_filearray,default) 12 | S3method(as_filearrayproxy,FileArray) 13 | S3method(as_filearrayproxy,FileArrayProxy) 14 | S3method(as_filearrayproxy,default) 15 | S3method(dim,FileArray) 16 | S3method(dimnames,FileArray) 17 | S3method(fwhich,FileArray) 18 | S3method(fwhich,default) 19 | S3method(length,FileArray) 20 | S3method(max,FileArray) 21 | S3method(min,FileArray) 22 | S3method(range,FileArray) 23 | S3method(subset,FileArray) 24 | S3method(sum,FileArray) 25 | export(apply) 26 | export(as_filearray) 27 | export(as_filearrayproxy) 28 | export(filearray_bind) 29 | export(filearray_checkload) 30 | export(filearray_create) 31 | export(filearray_load) 32 | export(filearray_load_or_create) 33 | export(filearray_threads) 34 | export(fmap) 35 | export(fmap2) 36 | export(fmap_element_wise) 37 | export(fwhich) 38 | export(mapreduce) 39 | export(typeof) 40 | exportClasses(FileArray) 41 | exportClasses(FileArrayProxy) 42 | exportMethods(apply) 43 | exportMethods(mapreduce) 44 | importFrom(Rcpp,sourceCpp) 45 | importFrom(methods,new) 46 | importFrom(methods,setGeneric) 47 | importFrom(methods,setMethod) 48 | importFrom(methods,setRefClass) 49 | importFrom(methods,signature) 50 | useDynLib(filearray, .registration = TRUE) 51 | -------------------------------------------------------------------------------- /.github/workflows/pkgdown.yaml: -------------------------------------------------------------------------------- 1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples 2 | # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help 3 | on: 4 | push: 5 | branches: [main, master] 6 | pull_request: 7 | branches: [main, master] 8 | release: 9 | types: [published] 10 | workflow_dispatch: 11 | 12 | name: pkgdown 13 | 14 | permissions: read-all 15 | 16 | jobs: 17 | pkgdown: 18 | runs-on: ubuntu-latest 19 | # Only restrict concurrency for non-PR jobs 20 | concurrency: 21 | group: pkgdown-${{ github.event_name != 'pull_request' || github.run_id }} 22 | env: 23 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 24 | permissions: 25 | contents: write 26 | steps: 27 | - uses: actions/checkout@v4 28 | 29 | - uses: r-lib/actions/setup-pandoc@v2 30 | 31 | - uses: r-lib/actions/setup-r@v2 32 | with: 33 | use-public-rspm: true 34 | 35 | - uses: r-lib/actions/setup-r-dependencies@v2 36 | with: 37 | extra-packages: any::pkgdown, local::. 
38 | needs: website 39 | 40 | - name: Build site 41 | run: pkgdown::build_site_github_pages(new_process = FALSE, install = FALSE) 42 | shell: Rscript {0} 43 | 44 | - name: Deploy to GitHub pages 🚀 45 | if: github.event_name != 'pull_request' 46 | uses: JamesIves/github-pages-deploy-action@v4.5.0 47 | with: 48 | clean: false 49 | branch: gh-pages 50 | folder: docs 51 | -------------------------------------------------------------------------------- /src/core.h: -------------------------------------------------------------------------------- 1 | #ifndef FARR_CORE_H 2 | #define FARR_CORE_H 3 | 4 | #include "common.h" 5 | #include "defs.h" 6 | 7 | /********************************************************** 8 | * Buffer 9 | ***********************************************************/ 10 | 11 | int system_buffer_size(); 12 | 13 | int set_buffer_size(int size); 14 | 15 | int get_buffer_size(); 16 | 17 | /********************************************************** 18 | * Load meta 19 | ***********************************************************/ 20 | 21 | std::string correct_filebase(const std::string& filebase); 22 | 23 | Rcpp::List FARR_meta(const std::string& filebase); 24 | 25 | /********************************************************** 26 | * Schedule reading 27 | ***********************************************************/ 28 | SEXP locationList(const SEXP listOrEnv, const Rcpp::NumericVector& dim, const int strict); 29 | 30 | SEXP addCycle(SEXP x, SEXP ret, const R_xlen_t step = 1, const R_xlen_t mag = 1); 31 | 32 | SEXP loc2idx(const Rcpp::List sliceIdx, const Rcpp::NumericVector& dim); 33 | 34 | Rcpp::List schedule(const SEXP listOrEnv, 35 | const Rcpp::NumericVector& dim, 36 | const Rcpp::NumericVector& cum_part_sizes, 37 | const int split_dim, const int strict = 1); 38 | 39 | /********************************************************** 40 | * Utils 41 | ***********************************************************/ 42 | 43 | SEXP seq_len_int64(const R_xlen_t len); 44 | double prod_double(const Rcpp::NumericVector& x); 45 | 46 | #endif // FARR_CORE_H -------------------------------------------------------------------------------- /R/internal_misc.R: -------------------------------------------------------------------------------- 1 | # misc internal functions that are byte-compiled 2 | 3 | # move here to byte-compile 4 | .max_mapper1 <- function(data, size, idx){ 5 | if(!length(size)){ 6 | return(numeric(0)) 7 | } 8 | max(data, na.rm = TRUE) 9 | } 10 | .max_mapper2 <- function(data, size, idx){ 11 | if(!length(size)){ 12 | return(numeric(0)) 13 | } 14 | if(length(data) != size){ 15 | data <- data[seq_len(size)] 16 | } 17 | max(data, na.rm = FALSE) 18 | } 19 | .min_mapper1 <- function(data, size, idx){ 20 | if(!length(size)){ 21 | return(numeric(0)) 22 | } 23 | min(data, na.rm = TRUE) 24 | } 25 | .min_mapper2 <- function(data, size, idx){ 26 | if(!length(size)){ 27 | return(numeric(0)) 28 | } 29 | if(length(data) != size){ 30 | data <- data[seq_len(size)] 31 | } 32 | min(data, na.rm = FALSE) 33 | } 34 | .range_mapper1 <- function(data, size, idx){ 35 | if(!length(size)){ 36 | return(numeric(0)) 37 | } 38 | range(data, na.rm = TRUE) 39 | } 40 | .range_mapper2 <- function(data, size, idx){ 41 | if(!length(size)){ 42 | return(numeric(0)) 43 | } 44 | if(length(data) != size){ 45 | data <- data[seq_len(size)] 46 | } 47 | range(data, na.rm = FALSE) 48 | } 49 | 50 | .sum_mapper1 <- function(data, size, idx){ 51 | if(!length(size)){ 52 | return(0) 53 | } 54 | sum(data, na.rm = TRUE) 55 | } 56 | 
.sum_mapper2 <- function(data, size, idx){ 57 | if(!length(size)){ 58 | return(0) 59 | } 60 | if(length(data) != size){ 61 | data <- data[seq_len(size)] 62 | } 63 | sum(data, na.rm = FALSE) 64 | } 65 | -------------------------------------------------------------------------------- /adhoc/performance.R: -------------------------------------------------------------------------------- 1 | devtools::load_all() 2 | 3 | # 800 MB data 4 | dim3 <- 100 5 | 6 | set.seed(1) 7 | file <- tempfile() 8 | unlink(file, recursive = TRUE) 9 | 10 | dim <- c(400, 500, dim3, 5) 11 | 12 | arr <- filearray_create(file, dim, type = 'integer', partition_size = 2) 13 | x <- array(as.integer(seq_len(prod(dim))), dim) 14 | for(ii in 1:5){ 15 | x[1,1,1,ii] <- NA 16 | } 17 | 18 | system.time({ 19 | arr$set_partition(1, x[,,,1:2], lock = FALSE) 20 | arr$set_partition(2, x[,,,3:4], lock = FALSE) 21 | arr$set_partition(3, x[,,,5],,,,1,lock = FALSE) 22 | }) 23 | 24 | basefile <- paste0(normalizePath(arr$.filebase), arr$.sep) 25 | 26 | sample1 = function(x, ...){x} 27 | sample1 <- base::sample 28 | sample2 <- base::sample 29 | replace <- TRUE 30 | set.seed(1) 31 | locs <- list( 32 | quote(x), 33 | drop=F, 34 | sample1(c(1:dim[[1]]), replace = replace), 35 | sample1(c(1:dim[[2]]), replace = replace), 36 | sample2(c(1:dim3), replace = replace), 37 | sample2(c(1:5), replace = replace) 38 | ) 39 | # get_buffer_size() 40 | # set_buffer_size(1048576)->bs; bs; log10(bs) 41 | # set_buffer_size(2^16) 42 | getThreads() 43 | setThreads(1) 44 | 45 | system.time({ 46 | b <- do.call('[', locs) 47 | }) 48 | 49 | # system.time({ 50 | # a <- FARR_subset(filebase = basefile, 51 | # listOrEnv = locs[-(1:2)], 52 | # dim = dim, 53 | # cum_part_sizes = 1:5, 54 | # type = type_to_sexp('integer'), 55 | # split_dim = 3)}) 56 | 57 | system.time({ 58 | a <- local({ 59 | x <- arr 60 | do.call('[', locs) 61 | }) 62 | }) 63 | 64 | 65 | identical(a, b) 66 | -------------------------------------------------------------------------------- /adhoc/performance-write.R: -------------------------------------------------------------------------------- 1 | devtools::load_all() 2 | set.seed(1) 3 | dim <- c(500, 400, 100, 5) 4 | file <- tempfile(); unlink(file, recursive = TRUE); 5 | x <- filearray_create(file, dim, type = 'integer', partition_size = 1) 6 | # na <- NA; storage.mode(na) <- 'integer' 7 | na <- as.integer(NA) 8 | v <- array(na, dim) 9 | 10 | filebase <- paste0(x$.filebase, x$.sep) 11 | pna <- local({ 12 | dim[[length(dim)]] <- x$partition_size() 13 | rep(na, prod(dim)) 14 | }) 15 | reset <- function(){ 16 | system.time({ 17 | lapply(seq_len(ceiling(dim[[length(dim)]] / x$partition_size())), function(i){ 18 | dim[[length(dim)]] <- x$partition_size() 19 | write_partition( 20 | file = x$partition_path(i), 21 | partition = i, 22 | dimension = dim, 23 | type = 'integer', 24 | value = pna 25 | ) 26 | }) 27 | }) 28 | } 29 | 30 | sample <- function(x, ...){x} 31 | sample <- base::sample 32 | 33 | listOrEnv <- function(seed){ 34 | set.seed(seed) 35 | lapply(dim, function(d){ 36 | sample(seq_len(d), size = sample(seq_len(d), size = 1), replace = FALSE) 37 | }) 38 | } 39 | 40 | 41 | seed = 1 42 | locs <- listOrEnv(seed) 43 | vals <- 1:prod(sapply(locs, length)) 44 | system.time({ 45 | b <- local({ 46 | locs <- listOrEnv(seed) 47 | v[locs[[1]], locs[[2]], locs[[3]], locs[[4]]] <- as.integer(vals) 48 | v 49 | }) 50 | }) 51 | 52 | locs <- listOrEnv(seed) 53 | reset(); gc() 54 | system.time({ 55 | setThreads(5) 56 | locs <- listOrEnv(seed) 57 | x[locs[[1]], 
locs[[2]], locs[[3]], locs[[4]]] <- as.double(vals) 58 | }, gcFirst = TRUE) 59 | system.time({ 60 | x[,,,] 61 | }) 62 | 63 | 64 | identical(x[], b) 65 | -------------------------------------------------------------------------------- /adhoc/debug.R: -------------------------------------------------------------------------------- 1 | f <- function(seed){ 2 | require(testthat) 3 | require(bit64) 4 | bsz <- get_buffer_size() 5 | on.exit({ 6 | set_buffer_size(bsz) 7 | max_buffer_size(2097152) 8 | }) 9 | set_buffer_size(16L) 10 | max_buffer_size(64L) 11 | 12 | set.seed(seed) 13 | file <- tempfile() 14 | unlink(file, recursive = TRUE) 15 | dim <- 33:35 16 | x <- filearray_create(file, dim, partition_size = 1, type = "double") 17 | 18 | expect_equal(x[[2]], x$.na) 19 | 20 | y <- array(x$.na, dim) 21 | tmp <- 1:(prod(dim)) 22 | x[] <- tmp 23 | y[] <- tmp 24 | 25 | eps <- 10^(ceiling(log10(max(abs(y)))) - 7) 26 | # y[] <- x[] 27 | locs <- 28 | lapply(dim, function(d) { 29 | d1 <- sample(c(1,3:d), 10, replace = TRUE ) 30 | d2 <- NULL#c(NA, NA) 31 | sort(as.double(sample(c(d1,d2))), decreasing = TRUE) 32 | }) 33 | assign('locs', locs, envir = globalenv()) 34 | 35 | a <- x[locs[[1]], locs[[2]], locs[[3]]] 36 | b <- y[locs[[1]], locs[[2]], locs[[3]]] 37 | assign('x', x, envir = globalenv()) 38 | assign('y', y, envir = globalenv()) 39 | eps <- 1e-6 40 | sel <- !is.na(a-b) & (a-b) > eps 41 | 42 | assign('sel', sel, envir = globalenv()) 43 | assign('a', a, envir = globalenv()) 44 | assign('b', b, envir = globalenv()) 45 | 46 | expect_equal(is.na(a), is.na(b)) 47 | 48 | 49 | if(length(a[sel])){ 50 | # fail the test 51 | print(a[sel]) 52 | print(b[sel]) 53 | expect_length(object = (a-b)[sel], n = 0) 54 | stop() 55 | } 56 | 57 | unlink(file, recursive = TRUE) 58 | } 59 | # setThreads(1) 60 | for(i in 1:1000){ 61 | print(i) 62 | f(i) 63 | } 64 | 65 | -------------------------------------------------------------------------------- /tests/testthat/test-method_subsetAssign.R: -------------------------------------------------------------------------------- 1 | test_that("subset-assign filearray-proxy", { 2 | 3 | # normal indexing 4 | x0 <- array(1:120, dim = c(10,12)) 5 | dimnames(x0) <- list(A = 1:10, B = 1:12) 6 | x <- as_filearray(x0) 7 | y <- x + 1L 8 | y[,1] <- rep(0, 10) 9 | y[1,] <- rep(0, 12) 10 | z <- y + y 11 | 12 | idx1 <- c(3,4,5,2,1) 13 | idx2 <- idx1 * 2 14 | 15 | expect_equal(x[idx1, idx2], x0[idx1, idx2]) 16 | expect_equal(y[1,1:12, dimnames = NULL], rep(0L, 12)) 17 | expect_equal(y[1:10,1, dimnames = NULL], rep(0L, 10)) 18 | expect_equal(y[2:10,2:10], x0[2:10,2:10] + 1L) 19 | expect_equal(z[, idx2], 2 * y[, idx2]) 20 | 21 | expect_equal(y[2:10, 2:12], x[2:10, 2:12] + 1) 22 | expect_equal(z[], 2 * y[]) 23 | 24 | clear_cache() 25 | 26 | # Array indexing 27 | idx1 <- c(3,4,5,2,1) 28 | idx2 <- idx1 * 2 29 | 30 | i <- array(FALSE, dim(x)) 31 | i[idx1, idx2] <- TRUE 32 | j <- as_filearray(i) 33 | expect_equal(typeof(j), "logical") 34 | 35 | x0 <- array(1:120, dim = c(10,12)) 36 | dimnames(x0) <- list(A = 1:10, B = 1:12) 37 | x <- as_filearray(x0) 38 | y <- x + 1L 39 | y[j,lazy = TRUE] <- 0 40 | z <- y + y 41 | z[j, lazy = TRUE] <- 1 42 | 43 | expect_equal(y$.filebase, x$.filebase) 44 | expect_equal(z$.filebase, x$.filebase) 45 | 46 | # mature z 47 | expect_equal(z[j], rep(1, sum(j))) 48 | expect_equal(y[j], rep(0, sum(j))) 49 | 50 | expect_equal(x[!i], x0[!i]) 51 | expect_equal(y[!i], x[!i] + 1L) 52 | expect_equal(z[!i], y[!i] * 2) 53 | 54 | expect_equal(x[!j], x0[!i]) 55 | expect_equal(y[!j], x[!i] + 
1L) 56 | expect_equal(z[!j], y[!i] * 2) 57 | 58 | clear_cache() 59 | }) 60 | -------------------------------------------------------------------------------- /adhoc/map-test.R: -------------------------------------------------------------------------------- 1 | require(filearray) 2 | dim <- c(100,100,10,40) 3 | set.seed(1); 4 | tmp <- seq_len(1e5) 5 | xs <- lapply(1:2, function(i){ 6 | file <- tempfile(); unlink(file, recursive = TRUE) 7 | x <- filearray_create(file, dim, type = 'double') 8 | for(i in 1:40){ 9 | x[,,,i] <- tmp 10 | } 11 | x 12 | }) 13 | set.seed(2); file <- tempfile(); unlink(file, recursive = TRUE) 14 | y <- filearray_create(file, c(100,10,40), type = 'double') 15 | y$initialize_partition() 16 | 17 | 18 | system.time({ 19 | z <- xs[[1]]$collapse(keep = c(2,3,4), method = "sum") 20 | }) 21 | system.time({ 22 | fmap(xs, function(x, a){ 23 | # a <<- c(a, Sys.time() - now) 24 | z <- x[[1]] + x[[2]] 25 | dim(z) <- c(100,100,10) 26 | z <- dipsaus::collapse(z, keep = c(2,3)) 27 | as.list(z + a) 28 | }, .y = y, .input_size = 100000, .output_size = 1000, 29 | a = 2 30 | ) 31 | }) 32 | range(y[] - z*2) 33 | 34 | # aa <- c() 35 | system.time({ 36 | now <- Sys.time() 37 | # aa <<- c(aa, Sys.time() - now) 38 | y[] <- xs[[1]][] + xs[[2]][] 39 | }) 40 | 41 | # filearray:::setThreads(1) 42 | filearray:::set_buffer_size(2097152) 43 | filearray:::set_buffer_size(8000000) 44 | filearray:::get_buffer_size() 45 | filearray_threads(8) 46 | env <- new.env(parent = emptyenv()) 47 | env$a <- matrix(NA_real_, nrow = 4, ncol = ceiling(length(y) / filearray:::get_buffer_size() * 8)) 48 | env$count = 1; 49 | system.time({ 50 | now <- Sys.time() 51 | fmap_element_wise(xs, function(input) { 52 | input[[1]] + input[[2]] 53 | }, y 54 | , profile = function(){ 55 | env$a[[env$count]] <- Sys.time() - now 56 | now <<- Sys.time() 57 | env$count <- env$count + 1 58 | } 59 | ) 60 | }) 61 | b <- t(env$a) 62 | colSums(b, na.rm = TRUE) 63 | summary(b) 64 | 65 | range(y[] - xs[[1]][] - xs[[2]][]) 66 | -------------------------------------------------------------------------------- /adhoc/gctorture.R: -------------------------------------------------------------------------------- 1 | devtools::load_all() 2 | set.seed(1) 3 | dim <- c(5, 4, 2, 5) 4 | file <- tempfile(); unlink(file, recursive = TRUE); 5 | x <- filearray_create(file, dim, type = 'integer', partition_size = 1) 6 | # na <- NA; storage.mode(na) <- 'integer' 7 | na <- as.integer(NA) 8 | v <- array(na, dim) 9 | 10 | filebase <- paste0(x$.filebase, x$.sep) 11 | pna <- local({ 12 | dim[[length(dim)]] <- x$partition_size() 13 | rep(na, prod(dim)) 14 | }) 15 | reset <- function(){ 16 | system.time({ 17 | lapply(seq_len(ceiling(dim[[length(dim)]] / x$partition_size())), function(i){ 18 | dim[[length(dim)]] <- x$partition_size() 19 | write_partition( 20 | file = x$partition_path(i), 21 | partition = i, 22 | dimension = dim, 23 | type = 'integer', 24 | value = pna 25 | ) 26 | }) 27 | }) 28 | } 29 | 30 | sample <- function(x, ...){x} 31 | sample <- base::sample 32 | 33 | listOrEnv <- function(seed){ 34 | set.seed(seed) 35 | lapply(dim, function(d){ 36 | sample(seq_len(d), size = sample(seq_len(d), size = 1), replace = FALSE) 37 | }) 38 | } 39 | 40 | 41 | seed = 1 42 | locs <- listOrEnv(seed) 43 | vals <- 1:prod(sapply(locs, length)) 44 | system.time({ 45 | b <- local({ 46 | locs <- listOrEnv(seed) 47 | v[locs[[1]], locs[[2]], locs[[3]], locs[[4]]] <- as.integer(vals) 48 | v 49 | }) 50 | }) 51 | 52 | locs <- listOrEnv(seed) 53 | reset(); gc() 54 | system.time({ 55 
| setThreads(5) 56 | locs <- listOrEnv(seed) 57 | gctorture2(1) 58 | x[locs[[1]], locs[[2]], locs[[3]], locs[[4]]] <- as.double(vals) 59 | gctorture2(0) 60 | }, gcFirst = TRUE) 61 | system.time({ 62 | gctorture2(1) 63 | tmp <- x[,,,] 64 | gctorture2(0) 65 | identical(tmp, b) 66 | }) 67 | 68 | 69 | identical(x[], b) 70 | -------------------------------------------------------------------------------- /.github/workflows/R-CMD-check.yaml: -------------------------------------------------------------------------------- 1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples 2 | # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help 3 | # 4 | # NOTE: This workflow is overkill for most R packages and 5 | # check-standard.yaml is likely a better choice. 6 | # usethis::use_github_action("check-standard") will install it. 7 | on: 8 | push: 9 | branches: [main, master] 10 | pull_request: 11 | branches: [main, master] 12 | 13 | name: R-CMD-check 14 | 15 | permissions: read-all 16 | 17 | jobs: 18 | R-CMD-check: 19 | runs-on: ${{ matrix.config.os }} 20 | 21 | name: ${{ matrix.config.os }} (${{ matrix.config.r }}) 22 | 23 | strategy: 24 | fail-fast: false 25 | matrix: 26 | config: 27 | - {os: macos-latest, r: 'release'} 28 | 29 | - {os: windows-latest, r: 'release'} 30 | # use 4.1 to check with rtools40's older compiler 31 | - {os: windows-latest, r: '4.1'} 32 | 33 | - {os: ubuntu-latest, r: 'devel', http-user-agent: 'release'} 34 | - {os: ubuntu-latest, r: 'release'} 35 | - {os: ubuntu-latest, r: 'oldrel-1'} 36 | - {os: ubuntu-latest, r: 'oldrel-2'} 37 | - {os: ubuntu-latest, r: 'oldrel-3'} 38 | 39 | env: 40 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 41 | R_KEEP_PKG_SOURCE: yes 42 | 43 | steps: 44 | - uses: actions/checkout@v4 45 | 46 | - uses: r-lib/actions/setup-pandoc@v2 47 | 48 | - uses: r-lib/actions/setup-r@v2 49 | with: 50 | r-version: ${{ matrix.config.r }} 51 | http-user-agent: ${{ matrix.config.http-user-agent }} 52 | use-public-rspm: true 53 | 54 | - uses: r-lib/actions/setup-r-dependencies@v2 55 | with: 56 | extra-packages: any::rcmdcheck 57 | needs: check 58 | 59 | - uses: r-lib/actions/check-r-package@v2 60 | with: 61 | upload-snapshots: true 62 | build_args: 'c("--no-manual","--compact-vignettes=gs+qpdf")' 63 | -------------------------------------------------------------------------------- /inst/include/TinyParallel/Common.h: -------------------------------------------------------------------------------- 1 | #ifndef __FILEARRAY_PARALLEL_COMMON__ 2 | #define __FILEARRAY_PARALLEL_COMMON__ 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | namespace TinyParallel { 9 | 10 | template 11 | inline int resolveValue(const char* envvar, 12 | T requestedValue, 13 | U defaultValue) 14 | { 15 | // if the requested value is non-zero and not the default, we can use it 16 | if (requestedValue != defaultValue && requestedValue > 0) 17 | return requestedValue; 18 | 19 | // otherwise, try reading the default from associated envvar 20 | // if the environment variable is unset, use the default 21 | const char* var = getenv(envvar); 22 | if (var == NULL) 23 | return defaultValue; 24 | 25 | // try to convert the string to a number 26 | // if an error occurs during conversion, just use default 27 | errno = 0; 28 | char* end; 29 | long value = strtol(var, &end, 10); 30 | 31 | // check for conversion failure 32 | if (end == var || *end != '\0' || errno == ERANGE) 33 | return defaultValue; 34 | 35 | // okay, return the parsed environment variable 
value 36 | return value; 37 | } 38 | 39 | // Work executed within a background thread. We implement dynamic 40 | // dispatch using vtables so we can have a stable type to cast 41 | // to from the void* passed to the worker thread (required because 42 | // the tinythreads interface allows to pass only a void* to the 43 | // thread main rather than a generic type / template) 44 | 45 | struct Worker 46 | { 47 | // construct and destruct (delete virtually) 48 | Worker() {} 49 | virtual ~Worker() {} 50 | 51 | // dispatch work over a range of values 52 | virtual void operator()(std::size_t begin, std::size_t end) = 0; 53 | 54 | // disable copying and assignment 55 | private: 56 | Worker(const Worker&); 57 | void operator=(const Worker&); 58 | }; 59 | 60 | // Tag type used for disambiguating splitting constructors 61 | 62 | struct Split {}; 63 | 64 | } // namespace TinyParallel 65 | 66 | 67 | #endif // __FILEARRAY_PARALLEL_COMMON__ 68 | -------------------------------------------------------------------------------- /tests/testthat/test-as_filearray.R: -------------------------------------------------------------------------------- 1 | test_that("as_filearray", { 2 | options("filearray.operator.precision" = "float") 3 | on.exit({ 4 | options("filearray.operator.precision" = NULL) 5 | }, add = TRUE) 6 | 7 | x <- rnorm(24) 8 | dim(x) <- c(2, 3, 4) 9 | 10 | # as_filearray.default 11 | y <- as_filearray(x) 12 | testthat::expect_equal(y$type(), getOption("filearray.operator.precision")) 13 | testthat::expect_equal(y[], x, tolerance = 1e-5, ignore_attr = TRUE) 14 | testthat::expect_equal(y$.mode, "readwrite") 15 | 16 | # as_filearray.character 17 | y <- as_filearray(y$.filebase) 18 | testthat::expect_equal(y[dimnames = FALSE], x, tolerance = 1e-5, ignore_attr = TRUE) 19 | testthat::expect_equal(y$.mode, "readonly") 20 | 21 | # as_filearray.FileArray(Proxy) 22 | z <- as_filearray(y) 23 | testthat::expect_identical(z, y) 24 | 25 | z <- as_filearray(as_filearrayproxy(y)) 26 | testthat::expect_equal(typeof(z), getOption("filearray.operator.precision")) 27 | testthat::expect_equal(z[], x, tolerance = 1e-5, ignore_attr = TRUE) 28 | 29 | y$.mode <- "readwrite" 30 | y$delete() 31 | 32 | testthat::expect_false(z$valid()) 33 | 34 | }) 35 | 36 | 37 | test_that("as_filearrayproxy", { 38 | options("filearray.operator.precision" = "float") 39 | on.exit({ 40 | options("filearray.operator.precision" = NULL) 41 | }, add = TRUE) 42 | 43 | x <- rnorm(24) 44 | dim(x) <- c(2, 3, 4) 45 | dimnames(x) <- list(A = 1:2, B = 1:3, C = letters[1:4]) 46 | 47 | # as_filearrayproxy.default 48 | y <- as_filearrayproxy(x) 49 | testthat::expect_equal(typeof(y), getOption("filearray.operator.precision")) 50 | testthat::expect_equal(y[], x, tolerance = 1e-5) 51 | testthat::expect_equal(y$.mode, "readwrite") 52 | 53 | # as_filearrayproxy.character 54 | y <- as_filearrayproxy(y$.filebase) 55 | testthat::expect_equal(y[], x, tolerance = 1e-5) 56 | testthat::expect_equal(y$.mode, "readwrite") 57 | 58 | # as_filearrayproxy.FileArrayProxy 59 | z <- as_filearrayproxy(y) 60 | testthat::expect_false(identical(z, y)) 61 | testthat::expect_equal(y[], z[]) 62 | 63 | y$delete() 64 | 65 | testthat::expect_false(z$valid()) 66 | 67 | }) 68 | -------------------------------------------------------------------------------- /man/FileArray-class.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/class-filearray.R 3 | \name{FileArray-class} 
4 | \alias{FileArray-class} 5 | \title{Definition of file array} 6 | \description{ 7 | \code{S4} class definition of \code{FileArray}. Please 8 | use \code{\link{filearray_create}} and \code{\link{filearray_load}} 9 | to create instances. 10 | } 11 | \section{Public Methods}{ 12 | 13 | \describe{ 14 | \item{\code{get_header(key, default = NULL)}}{Get header information; returns \code{default} if \code{key} is missing} 15 | \item{\code{set_header(key, value)}}{Set header information; the extra headers will be stored in meta file. Please do not store large headers as they will be loaded into memory frequently.} 16 | \item{\code{can_write()}}{Whether the array data can be altered} 17 | \item{\code{create(filebase, dimension, type = "double", partition_size = 1)}}{Create a file array instance} 18 | \item{\code{delete(force = FALSE)}}{Remove array from local file system and reset} 19 | \item{\code{dimension()}}{Get dimension vector} 20 | \item{\code{dimnames(v)}}{Set/get dimension names} 21 | \item{\code{element_size()}}{Internal storage: bytes per element} 22 | \item{\code{fill_partition(part, value)}}{Fill a partition with given scalar} 23 | \item{\code{get_partition(part, reshape = NULL)}}{Get partition data, and reshape (if not null) to desired dimension} 24 | \item{\code{expand(n)}}{Expand array along the last margin; returns true if expanded; if the \code{dimnames} have been assigned prior to expansion, the last dimension names will be filled with \code{NA}} 25 | \item{\code{initialize_partition()}}{Make sure a partition file exists; if not, create one and fill with \code{NA}s or 0 (\code{type='raw'})} 26 | \item{\code{load(filebase, mode = c("readwrite", "readonly"))}}{Load file array from existing directory} 27 | \item{\code{partition_path(part)}}{Get partition file path} 28 | \item{\code{partition_size()}}{Get partition size; see \code{\link{filearray}}} 29 | \item{\code{set_partition(part, value, ..., strict = TRUE)}}{Set partition value} 30 | \item{\code{sexp_type()}}{Get data \code{SEXP} type; see R internal manuals} 31 | \item{\code{show()}}{Print information} 32 | \item{\code{type()}}{Get data type} 33 | \item{\code{valid()}}{Check if the array is valid.} 34 | } 35 | } 36 | 37 | \seealso{ 38 | \code{\link{filearray}} 39 | } 40 | -------------------------------------------------------------------------------- /man/fwhich.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/methods.R 3 | \name{fwhich} 4 | \alias{fwhich} 5 | \alias{fwhich.default} 6 | \alias{fwhich.FileArray} 7 | \title{A generic function of \code{which} that is \code{'FileArray'} compatible} 8 | \usage{ 9 | fwhich(x, val, arr.ind = FALSE, ret.values = FALSE, ...) 10 | 11 | \method{fwhich}{default}(x, val, arr.ind = FALSE, ret.values = FALSE, ...) 12 | 13 | \method{fwhich}{FileArray}(x, val, arr.ind = FALSE, ret.values = FALSE, ...) 
14 | } 15 | \arguments{ 16 | \item{x}{any R vector, matrix, array or file-array} 17 | 18 | \item{val}{values to find, or a function taking one argument (a slice of 19 | data vector) and returns either logical vector with the same length as the 20 | slice or index of the slice; see 'Examples'} 21 | 22 | \item{arr.ind}{logical; should array indices be 23 | returned when \code{x} is an array?} 24 | 25 | \item{ret.values}{whether to return the values of corresponding indices as 26 | an attributes; default is false} 27 | 28 | \item{...}{passed to \code{val} if \code{val} is a function} 29 | } 30 | \value{ 31 | The indices of \code{x} elements that are listed in \code{val}. 32 | } 33 | \description{ 34 | A generic function of \code{which} that is \code{'FileArray'} compatible 35 | } 36 | \examples{ 37 | 38 | 39 | # ---- Default case ------------------------------------ 40 | x <- array(1:27 + 2, rep(3,3)) 41 | 42 | # find index of `x` equal to either 4 or 5 43 | fwhich(x, c(4,5)) 44 | res <- fwhich(x, c(4,5), ret.values = TRUE) 45 | res 46 | attr(res, "values") 47 | 48 | # ---- file-array case -------------------------------- 49 | arr <- filearray_create(tempfile(), dim(x)) 50 | arr[] <- x 51 | fwhich(arr, c(4,5)) 52 | fwhich(arr, c(4,5), arr.ind = TRUE, ret.values = TRUE) 53 | 54 | arr[2:3, 1, 1] 55 | 56 | # Clean up this example 57 | arr$delete() 58 | 59 | # ---- `val` is a function ---------------------------- 60 | x <- as_filearray(c(sample(15), 15), dimension = c(4,4)) 61 | 62 | ret <- fwhich(x, val = which.max, 63 | ret.values = TRUE, arr.ind = FALSE) 64 | 65 | # ret is the index 66 | ret == which.max(x[]) 67 | 68 | # attr(ret, "values") is the max value 69 | max(x[]) == attr(ret, "values") 70 | 71 | # customize `val` 72 | fwhich(x, ret.values = TRUE, arr.ind = FALSE, 73 | val = function( slice ) { 74 | slice > 10 # or which(slice > 10) 75 | }) 76 | 77 | 78 | } 79 | -------------------------------------------------------------------------------- /R/write.R: -------------------------------------------------------------------------------- 1 | 2 | ensure_partition <- function( 3 | file, partition, dimension, 4 | type = c("double","integer","logical","raw"), size = NULL){ 5 | 6 | type <- match.arg(type) 7 | if(is.null(size)){ 8 | size <- get_elem_size(type) 9 | } else { 10 | size <- as.integer(size) 11 | } 12 | 13 | if( !file.exists(file) ){ 14 | fid <- file(description = file, open = "w+b") 15 | write_header(fid, partition, dimension, type, size) 16 | close(fid) 17 | } 18 | 19 | header <- validate_header(file) 20 | 21 | expected_type <- sexp_to_type(header$sexp_type) 22 | 23 | if( type != expected_type ){ 24 | stop(sprintf("Partition data type mismatch: %s != %s", expected_type, type)) 25 | } 26 | 27 | if( header$partition != partition ){ 28 | quiet_warning(sprintf("Partition number mismatch: %s != %s", header$partition, partition)) 29 | } 30 | if( prod(dimension) != header$partition_size ){ 31 | quiet_warning(sprintf("Partition size mismatch: %s != %s", header$partition_size, prod(dimension))) 32 | } 33 | 34 | return(header) 35 | 36 | } 37 | 38 | sexp_to_type <- function(sexp){ 39 | switch( 40 | as.character(sexp), 41 | '14' = 'double', 42 | '13' = 'integer', 43 | '10' = 'logical', 44 | '24' = 'raw', 45 | '15' = 'complex', 46 | '26' = 'float', 47 | stop("Unknown SEXP code: ", sexp) 48 | ) 49 | } 50 | 51 | type_to_sexp <- function(type){ 52 | switch( 53 | type, 54 | double = 14L, 55 | integer = 13L, 56 | logical = 10L, 57 | raw = 24L, 58 | complex = 15L, 59 | float = 26L, 60 | stop("Unknown 
data type: ", type) 61 | ) 62 | } 63 | 64 | load_partition <- function(file, dim){ 65 | stopifnot(file.exists(file)) 66 | fid <- file(description = file, open = "rb") 67 | on.exit({ 68 | close(fid) 69 | }) 70 | header <- validate_header(fid = fid) 71 | type <- sexp_to_type(header$sexp_type) 72 | 73 | if( missing(dim) ){ 74 | dim <- header$partition_dim 75 | } else { 76 | stopifnot(prod(header$partition_dim) == prod(dim)) 77 | } 78 | 79 | structure( 80 | readBin(con = fid, what = type, size = header$unit_bytes, 81 | n = header$content_length, endian = header$endianness), 82 | dim = dim 83 | ) 84 | } 85 | -------------------------------------------------------------------------------- /tests/testthat/test-methods_simple.R: -------------------------------------------------------------------------------- 1 | test_that("Simple operators", { 2 | 3 | # logical 4 | x <- as_filearrayproxy(1:24, dimension = c(4,6), type = "double") 5 | x[3:4,3] <- rep(NA, 2) 6 | y <- !is.na(x) 7 | testthat::expect_s4_class(y, "FileArrayProxy") 8 | testthat::expect_equal(y[], !is.na(x[])) 9 | 10 | # math to double 11 | x0 <- x[] 12 | suppressWarnings({ 13 | for(op in FILEARRAY_SIMPLE_OPS$math) { 14 | y <- do.call(op, list(x)) 15 | testthat::expect_s4_class(y, "FileArrayProxy") 16 | testthat::expect_equal(y[], do.call(op, list(x0))) 17 | } 18 | }) 19 | 20 | # math to complex 21 | x0 <- array(rnorm(20) + rnorm(20) * 1i, c(5,4)) 22 | x <- as_filearrayproxy(x0) 23 | suppressWarnings({ 24 | for(op in FILEARRAY_SIMPLE_OPS$math) { 25 | valid_op <- tryCatch({ 26 | y0 <- do.call(op, list(x0)) 27 | TRUE 28 | }, error = function(e) { 29 | FALSE 30 | }) 31 | 32 | if( valid_op ) { 33 | y <- do.call(op, list(x)) 34 | testthat::expect_s4_class(y, "FileArrayProxy") 35 | dif <- y[dimnames = NULL] - y0 36 | mag <- Mod(y[dimnames = NULL] + y0) 37 | mag[mag < 1] <- 1 38 | dif <- dif / Mod(mag) 39 | testthat::expect_lt(max(abs(Re(dif))), 1e-5, label = sprintf("Operator `%s` on complex", op)) 40 | testthat::expect_lt(max(abs(Im(dif))), 1e-5, label = sprintf("Operator `%s` on complex", op)) 41 | } else { 42 | testthat::expect_error(do.call(op, list(x)), label = sprintf("Operator `%s` on complex should fail, but", op)) 43 | } 44 | 45 | 46 | } 47 | }) 48 | 49 | # complex 50 | for(op in FILEARRAY_SIMPLE_OPS$complex) { 51 | y0 <- do.call(op, list(x0)) 52 | y <- do.call(op, list(x)) 53 | testthat::expect_s4_class(y, "FileArrayProxy") 54 | dif <- y[dimnames = NULL] - y0 55 | mag <- Mod(y[dimnames = NULL] + y0) 56 | mag[mag < 1] <- 1 57 | dif <- dif / Mod(mag) 58 | testthat::expect_lt(max(abs(Re(dif))), 1e-5, label = sprintf("Operator `%s` on complex", op)) 59 | testthat::expect_lt(max(abs(Im(dif))), 1e-5, label = sprintf("Operator `%s` on complex", op)) 60 | } 61 | 62 | }) 63 | -------------------------------------------------------------------------------- /tests/testthat/test-methods.R: -------------------------------------------------------------------------------- 1 | test_that("R: FileArray-class", { 2 | set.seed(2) 3 | file <- tempfile() 4 | unlink(file, recursive = TRUE) 5 | dim <- 3:5 6 | x <- filearray_create(file, dim, partition_size = 3, initialize = FALSE) 7 | 8 | expect_equal(x$dimension(), dim) 9 | expect_true(x$can_write()) 10 | expect_equal(x$type(), 'double') 11 | expect_equal(x$element_size(), get_elem_size(x$type())) 12 | expect_error(x$fill_partition(-1, 1)) 13 | expect_error(x$fill_partition(NA, 1)) 14 | expect_error(x$fill_partition(0, 1)) 15 | expect_warning(x$fill_partition(1, 1:10)) 16 | x$fill_partition(2, 2) 17 | 
expect_equal(x[[60]], 2) 18 | 19 | unlink(file, recursive = TRUE) 20 | x <- filearray_create(file, dim, partition_size = 1, initialize = FALSE) 21 | expect_equal(x$.mode, "readwrite") 22 | x$fill_partition(1, 2) 23 | 24 | x <- filearray_load(file, mode = 'readonly') 25 | expect_equal(x$.mode, "readonly") 26 | expect_error(x$fill_partition(1, 1)) 27 | 28 | x$initialize_partition(1:2) 29 | expect_equal(x[1,1,1:2], c(2,NA)) 30 | expect_equal(file.exists(x$partition_path(1:3)), c(TRUE, TRUE, FALSE)) 31 | expect_equal(x$.mode, "readonly") 32 | 33 | expect_error(x[] <- 1:60) 34 | 35 | unlink(file, recursive = TRUE) 36 | # not removed by $delete 37 | expect_true(x$.valid) 38 | expect_false(x$valid()) 39 | expect_error(x[]) 40 | expect_error(as.array(x)) 41 | expect_error(x[[1]]) 42 | expect_error(mapreduce(x, I)) 43 | expect_error(typeof(x)) 44 | expect_error(max(x)) 45 | 46 | x$delete() 47 | expect_false(x$.valid) 48 | 49 | unlink(file, recursive = TRUE) 50 | }) 51 | 52 | 53 | test_that("R: S3 methods", { 54 | set.seed(3) 55 | file <- tempfile() 56 | unlink(file, recursive = TRUE) 57 | dim <- 3:5 58 | x <- filearray_create(file, dim, partition_size = 3, initialize = FALSE) 59 | expect_equal(file.exists(x$partition_path(1:3)), c(FALSE, FALSE, FALSE)) 60 | x[,,4] <- 1:12 61 | expect_equal(as.integer(x[,,4]), 1:12) 62 | expect_equal(file.exists(x$partition_path(1:3)), c(FALSE, TRUE, FALSE)) 63 | 64 | expect_equal(typeof(x), 'double') 65 | expect_equal(range(x, na.rm = TRUE), as.double(c(1, 12))) 66 | expect_equal(file.exists(x$partition_path(1:3)), c(FALSE, TRUE, FALSE)) 67 | expect_equal(range(x), as.double(c(NA, NA))) 68 | expect_equal(file.exists(x$partition_path(1:3)), c(TRUE, TRUE, FALSE)) 69 | 70 | unlink(file, recursive = TRUE) 71 | }) 72 | -------------------------------------------------------------------------------- /R/aaa.R: -------------------------------------------------------------------------------- 1 | #' @importFrom methods new 2 | #' @importFrom methods signature 3 | #' @importFrom methods setGeneric 4 | #' @importFrom methods setRefClass 5 | #' @importFrom methods setMethod 6 | NULL 7 | HEADER_SIZE <- 1024 8 | FILE_VER <- c( 1L, 1L, 0L ) 9 | HEADER_VER <- 1L 10 | RESERVED_HEADERS <- c("endianness", "version", "sexp_type", 11 | "unit_bytes", "partition", "partition_size", 12 | "partition_dim", "header_bytes", "header_version", 13 | "dimnames", "content_length") 14 | 15 | # The saved files are always little endian 16 | ENDIANNESS <- "little" 17 | 18 | max_buffer_size <- local({ 19 | # By default, maximum of 2MB buffer size 20 | size <- 2097152 21 | function(v){ 22 | if(!missing(v)){ 23 | if(v < 64){ 24 | stop("Maximum buffer size is too small.") 25 | } 26 | v <- 2^ceiling(log2(v)) 27 | if(v > 2^30){ 28 | stop("Maximum buffer size is too large.") 29 | } 30 | size <<- v 31 | } 32 | return(size) 33 | } 34 | }) 35 | 36 | quiet_warning <- function(..., call. = FALSE){ 37 | if(!getOption("filearray.quiet", FALSE)){ 38 | warning(..., '\n\n* To suppress this message, set `options("filearray.quiet" = TRUE)`', call. = call.) 
39 | } 40 | } 41 | 42 | get_os <- function(){ 43 | if("windows" %in% tolower(.Platform$OS.type)){ 44 | return("windows") 45 | } 46 | os <- tolower(R.version$os) 47 | if(startsWith(os, "darwin")){ 48 | return('darwin') 49 | } 50 | if(startsWith(os, "linux")){ 51 | return('linux') 52 | } 53 | if(startsWith(os, "solaris")){ 54 | return('solaris') 55 | } 56 | if(startsWith(os, "win")){ 57 | return('windows') 58 | } 59 | return('unknown') 60 | } 61 | 62 | 63 | deparse1 <- function (expr, collapse = " ") { 64 | paste(deparse(expr), collapse = collapse) 65 | } 66 | 67 | temp_dir <- function(check = FALSE) { 68 | re <- file.path(getOption("filearray.temporary.path", tempdir()), "_filearray_tempdir") 69 | if(check && !dir.exists(re)) { 70 | dir.create(re, showWarnings = FALSE, recursive = TRUE) 71 | } 72 | re 73 | } 74 | 75 | temp_path <- function(pattern = "tmpfilearray", fileext = ".farr", check = FALSE) { 76 | tempfile(pattern = pattern, tmpdir = temp_dir(check = check), fileext = fileext) 77 | } 78 | 79 | clear_cache <- function() { 80 | tdir <- temp_dir() 81 | if(dir.exists(tdir)) { 82 | unlink(tdir, recursive = TRUE) 83 | } 84 | } 85 | -------------------------------------------------------------------------------- /adhoc/profiling2.R: -------------------------------------------------------------------------------- 1 | library(filearray) 2 | set.seed(1) 3 | file <- tempfile(); unlink(file, recursive = TRUE) 4 | dim <- c(100, 100, 50, 100) 5 | x <- filearray_create(file, dim, type = "complex") 6 | xlen <- length(x) 7 | y <- rnorm(xlen) + 1i * rnorm(xlen) 8 | 9 | x$initialize_partition() 10 | 11 | # converting 800MB to 400MB and write 12 | system.time({ 13 | x[] <- y 14 | }) 15 | # user system elapsed 16 | # 0.268 0.770 0.368 17 | 18 | y <- x[] 19 | 20 | # Load 400MB, then convert to 800MB 21 | system.time({ 22 | x[] 23 | }) 24 | # user system elapsed 25 | # 0.228 0.342 0.351 26 | locs <- lapply(dim, function(d){ 27 | sample(1:d, size = sample(1:d, size = 1)) 28 | }) 29 | microbenchmark::microbenchmark( 30 | filearray = { 31 | x[locs[[1]], locs[[2]], locs[[3]], locs[[4]]] 32 | }, 33 | native = { 34 | y[locs[[1]], locs[[2]], locs[[3]], locs[[4]]] 35 | }, times = 10, unit = "ms" 36 | ) 37 | # Unit: milliseconds 38 | # expr min lq mean median uq max neval 39 | # filearray 8.401187 8.48126 9.764138 8.759076 9.615115 16.98675 10 40 | # native 19.753554 19.82092 23.815748 20.394937 21.714994 53.37421 10 41 | 42 | keep <- c(3, 4) 43 | microbenchmark::microbenchmark( 44 | filearray = { 45 | x$collapse(keep = keep, method = "sum") 46 | }, 47 | native = { 48 | apply(y, keep, sum) 49 | }, 50 | dipsaus = { 51 | dipsaus::collapse(y, keep, average = FALSE) 52 | }, 53 | times = 5, unit = "s" 54 | ) 55 | 56 | # Unit: seconds 57 | # expr min lq mean median uq max neval 58 | # filearray 0.3544366 0.3599713 0.4557022 0.4204501 0.5650227 0.5786305 5 59 | # native 2.0568676 2.1961080 2.3898924 2.2768022 2.6102381 2.8094461 5 60 | # dipsaus 0.3548668 0.4553942 0.5307459 0.4588728 0.6577143 0.7268814 5 61 | 62 | 63 | keep <- c(3, 2) 64 | microbenchmark::microbenchmark( 65 | filearray = { 66 | x$collapse(keep = keep, method = "sum") 67 | }, 68 | native = { 69 | apply(y, keep, sum) 70 | }, 71 | dipsaus = { 72 | dipsaus::collapse(y, keep, average = FALSE) 73 | }, 74 | times = 5, unit = "s" 75 | ) 76 | 77 | # Unit: seconds 78 | # expr min lq mean median uq max neval 79 | # filearray 0.3574229 0.4281074 0.4975226 0.4827639 0.5061053 0.7132133 5 80 | # native 0.7374846 1.3993005 1.4811525 1.5285852 1.5995152 2.1408772 5 81 
| # dipsaus 0.3303778 0.4295939 0.6026428 0.6484702 0.7453199 0.8594522 5 82 | 83 | 84 | 85 | 86 | -------------------------------------------------------------------------------- /tests/testthat/test-dimnames.R: -------------------------------------------------------------------------------- 1 | test_that("Subset with dimnames", { 2 | x <- filearray_create(tempfile(), dimension = c(3,4,5,6)) 3 | on.exit({ 4 | x$delete() 5 | }) 6 | y <- array(as.double(1:240), c(3,4,5,6)) 7 | x[] <- y 8 | expect_identical(dimnames(x[]), dimnames(y)) 9 | 10 | dnames1 <- list( 11 | A = 1:3, 12 | B = 1:4, 13 | NULL, 14 | D = 1:6 15 | ) 16 | dnames2 <- list( 17 | A = 1:3 18 | ) 19 | dnames3 <- list( 20 | A = 1:3, 21 | B = 1:4, 22 | C = NULL, 23 | D = NULL 24 | ) 25 | dnames4 <- list( 26 | A = 1:3, 27 | B = 1:4, 28 | C = 1:5, 29 | D = 1:6 30 | ) 31 | 32 | expect_error({ 33 | dimnames(x) <- list( 34 | A = 1:4 35 | ) 36 | }) 37 | 38 | 39 | dimnames(y) <- dnames1 40 | dimnames(x) <- dnames1 41 | 42 | expect_identical(dimnames(x[]), dimnames(y)) 43 | expect_identical(dimnames(x[1,1:2,2:3,1:4]), dimnames(y[1,1:2,2:3,1:4])) 44 | expect_identical(dimnames(x[1,1:2,2:3,1:4,drop=FALSE]), 45 | dimnames(y[1,1:2,2:3,1:4,drop=FALSE])) 46 | expect_identical(names(x[2,1,1,1]), names(y[2,1,1,1])) 47 | 48 | dimnames(y) <- dnames2 49 | dimnames(x) <- dnames2 50 | 51 | expect_identical(dimnames(x[]), dimnames(y)) 52 | expect_identical(dimnames(x[1,1:2,2:3,1:4]), dimnames(y[1,1:2,2:3,1:4])) 53 | expect_identical(dimnames(x[1,1:2,2:3,1:4,drop=FALSE]), 54 | dimnames(y[1,1:2,2:3,1:4,drop=FALSE])) 55 | expect_identical(names(x[2,1,1,1]), names(y[2,1,1,1])) 56 | 57 | dimnames(y) <- dnames3 58 | dimnames(x) <- dnames3 59 | 60 | expect_identical(dimnames(x[]), dimnames(y)) 61 | expect_identical(dimnames(x[1,1:2,2:3,1:4]), dimnames(y[1,1:2,2:3,1:4])) 62 | expect_identical(dimnames(x[1,1:2,2:3,1:4,drop=FALSE]), 63 | dimnames(y[1,1:2,2:3,1:4,drop=FALSE])) 64 | expect_identical(names(x[2,1,1,1]), names(y[2,1,1,1])) 65 | 66 | 67 | dimnames(y) <- dnames4 68 | dimnames(x) <- dnames4 69 | 70 | expect_identical(dimnames(x[]), dimnames(y)) 71 | expect_identical(dimnames(x[1,1:2,2:3,1:4]), dimnames(y[1,1:2,2:3,1:4])) 72 | expect_identical(dimnames(x[1,1:2,2:3,1:4,drop=FALSE]), 73 | dimnames(y[1,1:2,2:3,1:4,drop=FALSE])) 74 | expect_identical(names(x[2,1,1,1]), names(y[2,1,1,1])) 75 | 76 | 77 | # expand 78 | dimnames(x) <- dnames4 79 | x$expand(10) 80 | expect_equal(dimnames(x)$D, c(dnames4$D, rep(NA_integer_, 4))) 81 | 82 | }) 83 | -------------------------------------------------------------------------------- /R/methods-apply.R: -------------------------------------------------------------------------------- 1 | 2 | #' @title Apply functions over file array margins (extended) 3 | #' @param X a file array 4 | #' @param MARGIN scalar giving the subscripts which the function will be applied over. Current implementation only allows margin size to be one 5 | #' @param FUN the function to be applied 6 | #' @param ... 
optional arguments to \code{FUN} 7 | #' @param simplify a logical indicating whether results should be simplified if possible 8 | #' @return See Section 'Value' in \code{\link[base]{apply}}; 9 | #' @export 10 | setGeneric("apply") 11 | 12 | apply_filearray <- function(X, MARGIN, FUN, ..., simplify = TRUE){ 13 | if(!X$valid()){ 14 | stop("Invalid file array") 15 | } 16 | dim <- X$dimension() 17 | 18 | FUN <- match.fun(FUN) 19 | simplify <- isTRUE(simplify) 20 | d <- dim(X) 21 | dl <- length(d) 22 | dn <- dimnames(X) 23 | ds <- seq_len(dl) 24 | if (is.character(MARGIN)) { 25 | dnn <- names(dn) 26 | if (is.null(dnn)) 27 | stop("'X' must have named dimnames") 28 | MARGIN <- match(MARGIN, dnn) 29 | if (anyNA(MARGIN)) 30 | stop("not all elements of 'MARGIN' are names of dimensions") 31 | } 32 | d.call <- d[-MARGIN] 33 | d.ans <- d[MARGIN] 34 | if (anyNA(d.call) || anyNA(d.ans)) { 35 | stop("'MARGIN' does not match dim(X)") 36 | } 37 | s.call <- ds[-MARGIN] 38 | s.ans <- ds[MARGIN] 39 | if(length(s.ans) != 1){ 40 | stop("`apply` on FileArray margin size can only be 1.") 41 | } 42 | dn.call <- dn[-MARGIN] 43 | dn.ans <- dn[MARGIN] 44 | d2 <- prod(d.ans) 45 | if (d2 == 0L) { 46 | newX <- array(vector(typeof(X), 1L), 47 | dim = c(prod(d.call), 1L)) 48 | if (length(d.call) < 2L) { 49 | tmp <- newX[, 1] 50 | } else { 51 | tmp <- array(newX[, 1L], d.call, dn.call) 52 | } 53 | ans <- forceAndCall(1, FUN, tmp, ...) 54 | if(is.null(ans)){ 55 | return(ans) 56 | } else if (length(d.ans) < 2L) { 57 | return(ans[1L][-1L]) 58 | } else { 59 | return(array(ans, d.ans, dn.ans)) 60 | } 61 | } 62 | 63 | tmp <- rep("", dl) 64 | tmp[[s.ans]] <- ".__i__." 65 | f <- sprintf("function(.__i__., ...){ FUN(X[%s], ...) }", paste(tmp, collapse = ",")) 66 | f <- eval(parse(text = f)) 67 | 68 | sapply(seq_len(d[[s.ans]]), f, ..., simplify = simplify) 69 | } 70 | 71 | #' @rdname apply 72 | #' @export 73 | setMethod('apply', signature(X = "FileArray"), apply_filearray) 74 | 75 | #' @rdname apply 76 | #' @export 77 | setMethod('apply', signature(X = "FileArrayProxy"), apply_filearray) 78 | 79 | -------------------------------------------------------------------------------- /.github/workflows/rhub.yaml: -------------------------------------------------------------------------------- 1 | # R-hub's generic GitHub Actions workflow file. It's canonical location is at 2 | # https://github.com/r-hub/actions/blob/v1/workflows/rhub.yaml 3 | # You can update this file to a newer version using the rhub2 package: 4 | # 5 | # rhub::rhub_setup() 6 | # 7 | # It is unlikely that you need to modify this file manually. 8 | 9 | name: R-hub 10 | run-name: "${{ github.event.inputs.id }}: ${{ github.event.inputs.name || format('Manually run by {0}', github.triggering_actor) }}" 11 | 12 | on: 13 | workflow_dispatch: 14 | inputs: 15 | config: 16 | description: 'A comma separated list of R-hub platforms to use.' 17 | type: string 18 | default: 'linux,windows,macos' 19 | name: 20 | description: 'Run name. You can leave this empty now.' 21 | type: string 22 | id: 23 | description: 'Unique ID. You can leave this empty now.' 
24 | type: string 25 | 26 | jobs: 27 | 28 | setup: 29 | runs-on: ubuntu-latest 30 | outputs: 31 | containers: ${{ steps.rhub-setup.outputs.containers }} 32 | platforms: ${{ steps.rhub-setup.outputs.platforms }} 33 | 34 | steps: 35 | # NO NEED TO CHECKOUT HERE 36 | - uses: r-hub/actions/setup@v1 37 | with: 38 | config: ${{ github.event.inputs.config }} 39 | id: rhub-setup 40 | 41 | linux-containers: 42 | needs: setup 43 | if: ${{ needs.setup.outputs.containers != '[]' }} 44 | runs-on: ubuntu-latest 45 | name: ${{ matrix.config.label }} 46 | strategy: 47 | fail-fast: false 48 | matrix: 49 | config: ${{ fromJson(needs.setup.outputs.containers) }} 50 | container: 51 | image: ${{ matrix.config.container }} 52 | 53 | steps: 54 | - uses: r-hub/actions/checkout@v1 55 | - uses: r-hub/actions/platform-info@v1 56 | with: 57 | token: ${{ secrets.RHUB_TOKEN }} 58 | job-config: ${{ matrix.config.job-config }} 59 | - uses: r-hub/actions/setup-deps@v1 60 | with: 61 | token: ${{ secrets.RHUB_TOKEN }} 62 | job-config: ${{ matrix.config.job-config }} 63 | - uses: r-hub/actions/run-check@v1 64 | with: 65 | token: ${{ secrets.RHUB_TOKEN }} 66 | job-config: ${{ matrix.config.job-config }} 67 | 68 | other-platforms: 69 | needs: setup 70 | if: ${{ needs.setup.outputs.platforms != '[]' }} 71 | runs-on: ${{ matrix.config.os }} 72 | name: ${{ matrix.config.label }} 73 | strategy: 74 | fail-fast: false 75 | matrix: 76 | config: ${{ fromJson(needs.setup.outputs.platforms) }} 77 | 78 | steps: 79 | - uses: r-hub/actions/checkout@v1 80 | - uses: r-hub/actions/setup-r@v1 81 | with: 82 | job-config: ${{ matrix.config.job-config }} 83 | token: ${{ secrets.RHUB_TOKEN }} 84 | - uses: r-hub/actions/platform-info@v1 85 | with: 86 | token: ${{ secrets.RHUB_TOKEN }} 87 | job-config: ${{ matrix.config.job-config }} 88 | - uses: r-hub/actions/setup-deps@v1 89 | with: 90 | job-config: ${{ matrix.config.job-config }} 91 | token: ${{ secrets.RHUB_TOKEN }} 92 | - uses: r-hub/actions/run-check@v1 93 | with: 94 | job-config: ${{ matrix.config.job-config }} 95 | token: ${{ secrets.RHUB_TOKEN }} 96 | -------------------------------------------------------------------------------- /src/conversion.h: -------------------------------------------------------------------------------- 1 | #ifndef FARR_CONVERSION_H 2 | #define FARR_CONVERSION_H 3 | 4 | #include "common.h" 5 | #include "serialize.h" 6 | #include "defs.h" 7 | 8 | double na_cplx_dbl(); 9 | SEXP realToInt64(Rcpp::NumericVector x, const double min_ = NA_REAL, const double max_ = NA_REAL, const int strict = 1); 10 | SEXP realToInt64_inplace(SEXP x, const double min_ = NA_REAL, const double max_ = NA_REAL, const int strict = 1); 11 | SEXP convert_as(SEXP x, SEXPTYPE type); 12 | SEXP convert_as2(SEXP x, SEXP y, SEXPTYPE type); 13 | 14 | // void realToCplx(double* x, Rcomplex* y, size_t nelem); 15 | void realToCplx(const double* x, Rcomplex* y, const size_t& nelem, const bool swap_endian = false); 16 | 17 | void cplxToReal(Rcomplex* x, double* y, size_t nelem); 18 | 19 | void realToFloat(double* x, float* y, size_t nelem); 20 | 21 | void floatToReal(float* x, double* y, size_t nelem); 22 | 23 | /********************************************************** 24 | * Transform functions 25 | ***********************************************************/ 26 | 27 | template 28 | inline void transform_asis(const T* x, T* y, const bool& swap_endian = false) { 29 | if( swap_endian ){ 30 | const size_t size = sizeof(T); 31 | T tmp; 32 | 33 | unsigned char *buffer_src = (unsigned char*) x; 34 | unsigned 
char *buffer_dst = (unsigned char*) (&tmp); 35 | 36 | for (size_t ix = 0; ix < size; ix++) { 37 | *(buffer_dst + (size - ix - 1)) = *(buffer_src + ix); 38 | } 39 | *y = tmp; 40 | } else { 41 | memcpy(y, x, sizeof(T)); 42 | // *y = *x; 43 | } 44 | 45 | } 46 | void transform_float(const float* x, double* y, const bool& swap_endian = false); 47 | void transform_logical(const Rbyte* x, int* y, const bool& swap_endian = false); 48 | void transform_complex(const double* x, Rcomplex* y, const bool& swap_endian = false); 49 | 50 | 51 | template 52 | inline void transforms_asis(const T* x, T* y, const int& nelem, const bool& swap_endian = false){ 53 | memcpy(y, x, nelem * sizeof(T)); 54 | if( swap_endian ){ 55 | swap_endianess(y, sizeof(T), nelem); 56 | } 57 | } 58 | 59 | void transforms_float(const float* x, double* y, const int& nelem, const bool& swap_endian = false); 60 | void transforms_logical(const Rbyte* x, int* y, const int& nelem, const bool& swap_endian = false); 61 | void transforms_complex(const double* x, Rcomplex* y, const int& nelem, const bool& swap_endian = false); 62 | 63 | template 64 | inline void transforms(const T* x, B* y, const int& nelem, const bool& swap_endian = false){ 65 | if(std::is_same::value){ 66 | transforms_asis(x, (T*) y, nelem, swap_endian); 67 | return; 68 | } 69 | if(std::is_same::value){ 70 | transforms_float((const float*) x, (double*) y, nelem, swap_endian); 71 | return; 72 | } 73 | 74 | if(std::is_same::value){ 75 | transforms_logical((const Rbyte*) x, (int*) y, nelem, swap_endian); 76 | return; 77 | } 78 | if(std::is_same::value){ 79 | transforms_complex((const double*) x, (Rcomplex*) y, nelem, swap_endian); 80 | return; 81 | } 82 | Rcpp::stop("Unknown type in `transforms`."); 83 | 84 | } 85 | 86 | #endif // FARR_CONVERSION_H -------------------------------------------------------------------------------- /man/S3-filearray.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/aaa-docs.R, R/methods-subset.R, 3 | % R/methods-subsetAssign.R, R/methods.R 4 | \name{S3-filearray} 5 | \alias{S3-filearray} 6 | \alias{[.FileArray} 7 | \alias{[<-.FileArray} 8 | \alias{[[.FileArray} 9 | \alias{as.array.FileArray} 10 | \alias{dim.FileArray} 11 | \alias{dimnames.FileArray} 12 | \alias{dimnames<-.FileArray} 13 | \alias{length.FileArray} 14 | \alias{max.FileArray} 15 | \alias{min.FileArray} 16 | \alias{range.FileArray} 17 | \alias{sum.FileArray} 18 | \alias{subset.FileArray} 19 | \title{'S3' methods for 'FileArray'} 20 | \usage{ 21 | \method{[}{FileArray}( 22 | x, 23 | i, 24 | ..., 25 | drop = TRUE, 26 | reshape = NULL, 27 | strict = TRUE, 28 | dimnames = TRUE, 29 | split_dim = 0 30 | ) 31 | 32 | \method{[}{FileArray}(x, i, ..., lazy = FALSE) <- value 33 | 34 | \method{[[}{FileArray}(x, i) 35 | 36 | \method{as.array}{FileArray}(x, reshape = NULL, drop = FALSE, ...) 37 | 38 | \method{dim}{FileArray}(x) 39 | 40 | \method{dimnames}{FileArray}(x) 41 | 42 | \method{dimnames}{FileArray}(x) <- value 43 | 44 | \method{length}{FileArray}(x) 45 | 46 | \method{max}{FileArray}(x, na.rm = FALSE, ...) 47 | 48 | \method{min}{FileArray}(x, na.rm = FALSE, ...) 49 | 50 | \method{range}{FileArray}(x, na.rm = FALSE, ...) 51 | 52 | \method{sum}{FileArray}(x, na.rm = FALSE, ...) 
53 | 54 | \method{subset}{FileArray}(x, ..., drop = FALSE, .env = parent.frame()) 55 | } 56 | \arguments{ 57 | \item{x}{a file array} 58 | 59 | \item{i, ...}{index set, or passed to other methods} 60 | 61 | \item{drop}{whether to drop dimensions; see topic \code{\link[base]{Extract}}} 62 | 63 | \item{reshape}{a new dimension to set before returning subset results; default is \code{NULL} (use default dimensions)} 64 | 65 | \item{strict}{whether to allow indices to exceed bound; currently only accept \code{TRUE}} 66 | 67 | \item{dimnames}{whether to preserve \code{\link[base]{dimnames}}} 68 | 69 | \item{split_dim}{internally used; split dimension and calculate indices to 70 | manually speed up the subset; value ranged from 0 to size of dimension minus 71 | one.} 72 | 73 | \item{lazy}{whether to lazy-evaluate the method, only works when assigning 74 | arrays with logical array index} 75 | 76 | \item{value}{value to substitute or set} 77 | 78 | \item{na.rm}{whether to remove \code{NA} values during the calculation} 79 | 80 | \item{.env}{environment to evaluate formula when evaluating subset margin indices.} 81 | } 82 | \description{ 83 | These are 'S3' methods for 'FileArray' 84 | } 85 | \section{Functions}{ 86 | \itemize{ 87 | \item \code{[}: subset array 88 | 89 | \item \code{`[`(FileArray) <- value}: subset assign array 90 | 91 | \item \code{[[}: get element by index 92 | 93 | \item \code{as.array(FileArray)}: converts file array to native array in R 94 | 95 | \item \code{dim(FileArray)}: get dimensions 96 | 97 | \item \code{dimnames(FileArray)}: get dimension names 98 | 99 | \item \code{dimnames(FileArray) <- value}: set dimension names 100 | 101 | \item \code{length(FileArray)}: get array length 102 | 103 | \item \code{max(FileArray)}: get max value 104 | 105 | \item \code{min(FileArray)}: get min value 106 | 107 | \item \code{range(FileArray)}: get value range 108 | 109 | \item \code{sum(FileArray)}: get summation 110 | 111 | \item \code{subset(FileArray)}: get subset file array with formulae 112 | 113 | }} 114 | -------------------------------------------------------------------------------- /R/filearray-package.R: -------------------------------------------------------------------------------- 1 | in_rcmdcheck <- function (...) 
{ 2 | evidences <- list() 3 | args <- commandArgs() 4 | is_vanilla <- is.element("--vanilla", args) 5 | if(!is_vanilla){ 6 | return(FALSE) 7 | } 8 | pwd <- getwd() 9 | dirname <- basename(pwd) 10 | parent <- basename(dirname(pwd)) 11 | pattern <- ".+[.]Rcheck$" 12 | in_test <- (grepl(pattern, parent) && grepl("^tests(|_.*)$", dirname)) 13 | if(!(in_test || grepl(pattern, dirname))){ 14 | return(FALSE) 15 | } 16 | in_examples <- is.element("CheckExEnv", search()) 17 | in_win_builder <- (.Platform$OS.type == "windows" && grepl("Rterm[.]exe$", args[1])) 18 | if(in_win_builder){ 19 | n <- length(args) 20 | if (!all(c("--no-save", "--no-restore", "--no-site-file", 21 | "--no-init-file") %in% args)) { 22 | return(FALSE) 23 | } 24 | if (!grepl(pattern, parent)) { 25 | return(FALSE) 26 | } 27 | } 28 | if(in_test){ 29 | return(structure(TRUE, status = "tests")) 30 | } 31 | if(in_examples){ 32 | return(structure(TRUE, status = "examples")) 33 | } 34 | return(FALSE) 35 | } 36 | 37 | symlink_enabled <- local({ 38 | enabled <- NA 39 | function(){ 40 | if(!is.na(enabled)){ return(enabled) } 41 | tempdir(check = TRUE) 42 | f1 <- temp_path(pattern = 'filearray_simlink_test_from') 43 | f2 <- temp_path(pattern = 'filearray_simlink_test_to') 44 | on.exit({ 45 | if(file.exists(f1)){ 46 | unlink(f1) 47 | } 48 | if(file.exists(f2)){ 49 | unlink(f2) 50 | } 51 | }, add = FALSE) 52 | s <- paste(sample(LETTERS), collapse = "") 53 | writeLines(s, con = f1) 54 | file.symlink(f1, to = f2) 55 | en <- tryCatch({ 56 | if(identical(readLines(f2), s)){ 57 | TRUE 58 | } else { 59 | FALSE 60 | } 61 | }, error = function(e){ 62 | FALSE 63 | }, warning = function(e){ 64 | FALSE 65 | }) 66 | enabled <<- en 67 | 68 | if(file.exists(f1)){ 69 | unlink(f1) 70 | } 71 | if(file.exists(f2)){ 72 | unlink(f2) 73 | } 74 | on.exit({}, add = FALSE) 75 | 76 | return(enabled) 77 | } 78 | }) 79 | 80 | 81 | .onLoad <- function(libname, pkgname){ 82 | # Check if in R CMD check mode 83 | if(Sys.getenv("_R_CHECK_LIMIT_CORES_") == "TRUE"){ 84 | # R CMD check with --as-cran 85 | n <- 2L 86 | } else if(in_rcmdcheck()){ 87 | # R CMD check (without CRAN) 88 | n <- 2L 89 | } else { 90 | n <- filearray_threads(-1) 91 | if(n > 8L){ 92 | n <- 8L 93 | } 94 | } 95 | 96 | filearray_threads(n) 97 | ns <- asNamespace(pkgname) 98 | ns$NA_float_ <- get_float_na() 99 | } 100 | 101 | .onAttach <- function(libname, pkgname){ 102 | if(Sys.getenv("_R_CHECK_LIMIT_CORES_") == "TRUE"){ 103 | packageStartupMessage( 104 | "Found environment variable `_R_CHECK_LIMIT_CORES_`=TRUE. Using ", 105 | filearray_threads(), " threads.") 106 | } else if(in_rcmdcheck()){ 107 | packageStartupMessage("R CMD check mode. 
Using ", 108 | filearray_threads(), " threads.") 109 | } else { 110 | packageStartupMessage( 111 | "Filearray is using ", filearray_threads(), " threads.") 112 | } 113 | } 114 | -------------------------------------------------------------------------------- /adhoc/profiling.R: -------------------------------------------------------------------------------- 1 | # performance test 2 | 3 | filearray::filearray_threads() 4 | # 2.56GB file 5 | 6 | set.seed(1) 7 | file <- tempfile() 8 | unlink(file, recursive = TRUE) 9 | 10 | x <- filearray::filearray_create(file, c(128, 128, 128, 128)) 11 | 12 | # One-time initialize partition files with NA 13 | system.time({ 14 | x$initialize_partition() 15 | }) 16 | # user system elapsed 17 | # 0.236 2.476 2.864 18 | 19 | # -------------------- WRITE - single threaded ------------------------ 20 | # write to fast margin - single threaded 21 | filearray::filearray_threads(1) 22 | tmp <- rnorm(128^3) 23 | system.time({ 24 | for(i in 1:128){ 25 | x[,,,i] <- tmp 26 | } 27 | }) 28 | # user system elapsed 29 | # 0.220 1.097 2.089 30 | 31 | # 1.2 GB/s 32 | 33 | 34 | system.time({ 35 | for(i in 1:128){ 36 | x[,,i,] <- tmp 37 | } 38 | }) 39 | # user system elapsed 40 | # 1.044 1.400 4.795 41 | 42 | # 500 MB/s 43 | 44 | # write to slow margin - Never recommended to write on single 45 | # (see next section - block write) 46 | system.time({ 47 | # for(i in 1:128){ 48 | x[1,,,] <- tmp 49 | # } 50 | }) 51 | # user system elapsed 52 | # 0.057 1.048 2.097 53 | 54 | # Using memory map boosts the speed 55 | # 1.2 GB/s 56 | 57 | # -------------------- WRITE (block) - 4-threaded ------------------------ 58 | # write to fast margin - 4 threaded 59 | filearray::filearray_threads(4) 60 | 61 | # 1GB data 62 | tmp <- rnorm(128^3 * 50) 63 | idx <- sample(1:128, 50) 64 | system.time({ 65 | x[,,,idx] <- tmp 66 | }) 67 | # user system elapsed 68 | # 0.135 0.642 0.730 69 | 70 | # 1.4 GB/s 71 | 72 | 73 | system.time({ 74 | x[,,idx,] <- tmp 75 | }) 76 | # user system elapsed 77 | # 0.288 0.707 1.165 78 | 79 | # 1 GB/s 80 | 81 | system.time({ 82 | x[,idx,,] <- tmp 83 | }) 84 | # user system elapsed 85 | # 0.166 1.379 2.082 86 | 87 | # 500 MB/s 88 | 89 | system.time({ 90 | x[idx,,,] <- tmp 91 | }) 92 | # user system elapsed 93 | # 0.152 1.555 1.995 94 | 95 | # 500 MB/s 96 | 97 | rm(tmp); gc() 98 | # -------------------- READ - single-threaded ------------------------ 99 | ### single-threaded 100 | filearray::filearray_threads(1) 101 | 102 | # read 1GB subset 103 | 104 | #### ordered indices #### 105 | idx <- sort(sample(1:128, 50)) 106 | 107 | # partition margin 108 | system.time({ 109 | x[,,,idx] 110 | }) 111 | # user system elapsed 112 | # 0.135 0.267 0.620 113 | 114 | # 1.6 GB/s 115 | 116 | # fast margin 117 | system.time({ 118 | x[,,idx,] 119 | }) 120 | # user system elapsed 121 | # 0.135 0.414 0.955 122 | 123 | # 1 GB/s 124 | 125 | # slow margin 126 | system.time({ 127 | x[idx,,,] 128 | }) 129 | # user system elapsed 130 | # 0.141 0.376 0.950 131 | 132 | # 1 GB/s 133 | 134 | 135 | #### random indices #### 136 | idx <- sample(1:128, 50) 137 | 138 | # partition margin 139 | system.time({ 140 | x[,,,idx] 141 | }) 142 | # user system elapsed 143 | # 0.127 0.219 0.507 144 | 145 | # 2 GB/s 146 | 147 | # fast margin 148 | system.time({ 149 | x[,,idx,] 150 | }) 151 | # user system elapsed 152 | # 0.133 0.355 0.796 153 | 154 | # 1.4 GB/s 155 | 156 | # slow margin 157 | system.time({ 158 | x[idx,,,] 159 | }) 160 | # user system elapsed 161 | # 0.141 0.347 0.953 162 | 163 | # 1 GB/s 164 | 165 | 
166 | 167 | # -------------------- READ - 4-threaded ------------------------ 168 | filearray::filearray_threads(8) 169 | idx <- sample(1:128, 50) 170 | 171 | # partition margin 172 | system.time({ 173 | x[,,,idx] 174 | }) 175 | # user system elapsed 176 | # 0.175 0.264 0.289 177 | 178 | # 3.4 GB/s 179 | 180 | # fast margin 181 | system.time({ 182 | x[,,idx,] 183 | }) 184 | # user system elapsed 185 | # 0.378 0.324 0.248 186 | 187 | # 4 GB/s 188 | 189 | # slow margin 190 | system.time({ 191 | x[idx,,,] 192 | }) 193 | # user system elapsed 194 | # 0.174 0.448 0.681 195 | 196 | # 1.5 GB/s 197 | 198 | 199 | system.time({ 200 | x[1:65,,,] 201 | }) 202 | system.time({ 203 | x[c(1, 67:128),,,] 204 | }) 205 | -------------------------------------------------------------------------------- /tests/testthat/test-helpers.R: -------------------------------------------------------------------------------- 1 | test_that("is_same_dim", { 2 | 3 | # sanity check 4 | testthat::expect_true(is_same_dim(array(1:12, c(3,4)), matrix(NA, nrow = 3, ncol = 4))) 5 | testthat::expect_false(is_same_dim(array(1:12, c(3,4)), matrix(NA, nrow = 3, ncol = 3))) 6 | testthat::expect_false(is_same_dim(array(1:12, c(3,4)), array(NA, c(3,4,1)))) 7 | testthat::expect_false(is_same_dim(array(1:12, c(3,4)), array(NA, c(4, 3)))) 8 | 9 | testthat::expect_false(is_same_dim(array(NA, c(3,4)), list())) 10 | testthat::expect_false(is_same_dim(array(NA, c(3,4)), data.frame())) 11 | testthat::expect_false(is_same_dim(array(NA, c(3,4)), NULL)) 12 | testthat::expect_false(is_same_dim(array(NA, c(3,4)), 1)) 13 | 14 | # number of margins <= 1 15 | testthat::expect_true(is_same_dim(1, 1)) 16 | testthat::expect_false(is_same_dim(1, 1:3)) 17 | testthat::expect_false(is_same_dim(1, 1:3)) 18 | testthat::expect_true(is_same_dim(NULL, numeric())) 19 | 20 | # margin =1 but with dim 21 | x <- 1:5 22 | dim(x) <- 5 23 | testthat::expect_true(is_same_dim(x, 1:5)) 24 | 25 | }) 26 | 27 | test_that("operation_output_type", { 28 | double_type <- "float" 29 | options("filearray.operator.precision" = double_type) 30 | on.exit({ 31 | options("filearray.operator.precision" = NULL) 32 | }) 33 | 34 | # complex 35 | type1 <- "complex" 36 | for(type2 in c("complex", "double", "float", "integer", "logical")) { 37 | testthat::expect_equal(operation_output_type(type1, type2), type1) 38 | } 39 | testthat::expect_equal(operation_output_type(type1, 'raw', raw = "integer"), type1) 40 | testthat::expect_error(operation_output_type(type1, "raw")) 41 | 42 | # double/float 43 | type1 <- "double" 44 | for(type2 in c("double", "float", "integer", "logical")) { 45 | testthat::expect_equal(operation_output_type(type1, type2), double_type) 46 | } 47 | testthat::expect_equal(operation_output_type(type1, 'raw', raw = "integer"), double_type) 48 | testthat::expect_error(operation_output_type(type1, "raw")) 49 | 50 | type1 <- "float" 51 | for(type2 in c("double", "float", "integer", "logical")) { 52 | testthat::expect_equal(operation_output_type(type1, type2), double_type) 53 | } 54 | testthat::expect_equal(operation_output_type(type1, 'raw', raw = "integer"), double_type) 55 | testthat::expect_error(operation_output_type(type1, "raw")) 56 | 57 | # integer 58 | type1 <- "integer" 59 | for(type2 in c("integer", "logical")) { 60 | testthat::expect_equal(operation_output_type(type1, type2), type1) 61 | } 62 | testthat::expect_equal(operation_output_type(type1, 'raw', raw = "integer"), type1) 63 | testthat::expect_error(operation_output_type(type1, "raw")) 64 | 65 | # logical 66 | 
testthat::expect_equal(operation_output_type("logical", 'logical'), 'integer') 67 | testthat::expect_equal(operation_output_type("logical", 'logical', logical = "logical"), 'logical') 68 | testthat::expect_equal(operation_output_type('logical', 'raw', raw = "integer"), 'integer') 69 | testthat::expect_error(operation_output_type('logical', "raw")) 70 | 71 | # raw 72 | testthat::expect_equal(operation_output_type('raw', 'raw', raw = "integer"), 'integer') 73 | testthat::expect_error(operation_output_type('raw', "raw")) 74 | 75 | }) 76 | 77 | test_that("fmap_buffer", { 78 | 79 | buffer_large <- get_buffer_size() * 16 80 | 81 | testthat::expect_equal(common_fmap_buffer_count(84, 12, 96), 1) 82 | testthat::expect_equal(common_fmap_buffer_count(84, 12, 97), 1) 83 | 84 | testthat::expect_equal(common_fmap_buffer_count(buffer_large*2, 12, 1024), 2) 85 | testthat::expect_equal(common_fmap_buffer_count(buffer_large*2 + 2, 12, 1024), 2) 86 | testthat::expect_equal(common_fmap_buffer_count(buffer_large*4096, buffer_large, 1024), 1024) 87 | testthat::expect_equal(common_fmap_buffer_count(buffer_large*2, 13, 1024), 1) 88 | 89 | expect_true(validate_fmap_buffer_count(12, input_lens = c(84, 12, 96))) 90 | expect_true(validate_fmap_buffer_count(1, input_lens = c(84, 12, 96))) 91 | expect_false(validate_fmap_buffer_count(15, input_lens = c(84, 12, 96))) 92 | expect_true(validate_fmap_buffer_count(1, input_lens = c(84, 12, 97))) 93 | 94 | }) -------------------------------------------------------------------------------- /man/filearray_bind.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/bind.R 3 | \name{filearray_bind} 4 | \alias{filearray_bind} 5 | \title{Merge and bind homogeneous file arrays} 6 | \usage{ 7 | filearray_bind( 8 | ..., 9 | .list = list(), 10 | filebase = tempfile(), 11 | symlink = FALSE, 12 | overwrite = FALSE, 13 | cache_ok = FALSE 14 | ) 15 | } 16 | \arguments{ 17 | \item{..., .list}{file array instances} 18 | 19 | \item{filebase}{where to create merged array} 20 | 21 | \item{symlink}{whether to use \code{\link[base]{file.symlink}}; if true, 22 | then partition files will be symbolic-linked to the original arrays, 23 | otherwise the partition files will be copied over. If you want your data 24 | to be portable, do not use symbolic-links. The default value is \code{FALSE}} 25 | 26 | \item{overwrite}{whether to overwrite when \code{filebase} already exists; 27 | default is false, which raises errors} 28 | 29 | \item{cache_ok}{see 'Details', only used if \code{overwrite} is true.} 30 | } 31 | \value{ 32 | A bound array in \code{'FileArray'} class. 33 | } 34 | \description{ 35 | The file arrays to be merged must be homogeneous: 36 | same data type, partition size, and partition length 37 | } 38 | \details{ 39 | The input arrays must share the same data type and partition size. 40 | The dimension for each partition should also be the same. For example 41 | an array \code{x1} has dimension \eqn{100x20x30} with partition size 42 | \code{1}, then each partition dimension is \eqn{100x20x1}, and there are 43 | \code{30} partitions. \code{x1} can bind with another array of the same 44 | partition size. This means if \code{x2} has dimension 45 | \eqn{100x20x40} and each partition size is \code{1}, then \code{x1} and 46 | \code{x2} can be merged. 47 | 48 | If \code{filebase} exists and \code{overwrite} is \code{FALSE}, an error will 49 | always raise. 
If \code{overwrite=TRUE} and \code{cache_ok=FALSE}, then 50 | the existing \code{filebase} will be erased and any data stored within will 51 | be lost. 52 | If both \code{overwrite} and \code{cache_ok} are \code{TRUE}, then, 53 | before erasing \code{filebase}, the function validates the existing 54 | array header and compares the header signatures. If the existing header 55 | signature is the same as that of the array to be created, then the existing array 56 | will be returned. The \code{cache_ok} option can be extremely useful when 57 | binding large arrays with \code{symlink=FALSE}, as the cache may avoid 58 | moving files around. However, \code{cache_ok} should be enabled with caution, 59 | because only the header information is compared; the 60 | partition data is not. If the existing array was generated from 61 | old versions of the source arrays, but the data in the source arrays 62 | has since been altered, then \code{cache_ok=TRUE} is rarely appropriate, as the cache 63 | is outdated. 64 | 65 | The \code{symlink} option should be used with extra caution. Creating 66 | symbolic links is definitely faster than copying partition files. However, 67 | since the partition files are simply linked to the original partition files, 68 | changes to the input arrays will also affect the merged arrays, and 69 | vice versa; see 'Examples'. Also, for arrays created from symbolic links, if 70 | the original 71 | arrays are deleted, the merged arrays will not be invalidated, but 72 | the corresponding partitions will no longer be accessible. Attempts to 73 | set deleted partitions will likely fail. Therefore 74 | \code{symlink} should only be set to true when the merged arrays are 75 | temporary and read-only, and when speed or disk space is a 76 | concern. For extended reading, please check \code{\link[base]{files}} 77 | for details.
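As an illustrative sketch (not taken from the package examples; it assumes \code{x1} and \code{x2} are the small arrays defined in the 'Examples' section below), a cached merge can be reused like this:
\preformatted{
tf <- tempfile()
# First call writes the merged partitions under `tf`
y3 <- filearray_bind(x1, x2, filebase = tf)
# Re-running the same merge may reuse the cached result, because the
# header signature of the existing array is unchanged
y4 <- filearray_bind(x1, x2, filebase = tf, overwrite = TRUE, cache_ok = TRUE)
}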
78 | } 79 | \examples{ 80 | partition_size <- 1 81 | type <- "double" 82 | x1 <- filearray_create( 83 | tempfile(), c(2,2), type = type, 84 | partition_size = partition_size) 85 | x1[] <- 1:4 86 | x2 <- filearray_create( 87 | tempfile(), c(2,1), type = type, 88 | partition_size = partition_size) 89 | x2[] <- 5:6 90 | 91 | y1 <- filearray_bind(x1, x2, symlink = FALSE) 92 | y2 <- filearray_bind(x1, x2) 93 | 94 | # y1 copies partition files, and y2 simply creates links 95 | # if symlink is supported 96 | 97 | y1[] - y2[] 98 | 99 | # change x1 100 | x1[1,1] <- NA 101 | 102 | # y1 is not affected 103 | y1[] 104 | 105 | # y2 changes 106 | y2[] 107 | 108 | 109 | } 110 | -------------------------------------------------------------------------------- /R/RcppExports.R: -------------------------------------------------------------------------------- 1 | # Generated by using Rcpp::compileAttributes() -> do not edit by hand 2 | # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 3 | 4 | FARR_collapse <- function(filebase, dim, keep, cum_part, array_type, method = 1L, remove_na = FALSE, scale = 1.0) { 5 | .Call(`_filearray_FARR_collapse`, filebase, dim, keep, cum_part, array_type, method, remove_na, scale) 6 | } 7 | 8 | FARR_collapse_complex <- function(filebase, dim, keep, cum_part, method = 1L, remove_na = FALSE, scale = 1.0) { 9 | .Call(`_filearray_FARR_collapse_complex`, filebase, dim, keep, cum_part, method, remove_na, scale) 10 | } 11 | 12 | realToInt64 <- function(x, min_, max_, strict) { 13 | .Call(`_filearray_realToInt64`, x, min_, max_, strict) 14 | } 15 | 16 | cplxToReal2 <- function(x) { 17 | .Call(`_filearray_cplxToReal2`, x) 18 | } 19 | 20 | realToCplx2 <- function(x) { 21 | .Call(`_filearray_realToCplx2`, x) 22 | } 23 | 24 | realToFloat2 <- function(x) { 25 | .Call(`_filearray_realToFloat2`, x) 26 | } 27 | 28 | floatToReal2 <- function(x) { 29 | .Call(`_filearray_floatToReal2`, x) 30 | } 31 | 32 | get_float_na <- function() { 33 | .Call(`_filearray_get_float_na`) 34 | } 35 | 36 | set_buffer_size <- function(size) { 37 | .Call(`_filearray_set_buffer_size`, size) 38 | } 39 | 40 | get_buffer_size <- function() { 41 | .Call(`_filearray_get_buffer_size`) 42 | } 43 | 44 | FARR_meta <- function(filebase) { 45 | .Call(`_filearray_FARR_meta`, filebase) 46 | } 47 | 48 | loc2idx <- function(sliceIdx, dim) { 49 | .Call(`_filearray_loc2idx`, sliceIdx, dim) 50 | } 51 | 52 | locationList <- function(listOrEnv, dim, strict) { 53 | .Call(`_filearray_locationList`, listOrEnv, dim, strict) 54 | } 55 | 56 | schedule <- function(listOrEnv, dim, cum_part_sizes, split_dim, strict) { 57 | .Call(`_filearray_schedule`, listOrEnv, dim, cum_part_sizes, split_dim, strict) 58 | } 59 | 60 | filearray_meta <- function(filebase) { 61 | .Call(`_filearray_filearray_meta`, filebase) 62 | } 63 | 64 | filearray_assign <- function(filebase, value, position_indices) { 65 | .Call(`_filearray_filearray_assign`, filebase, value, position_indices) 66 | } 67 | 68 | filearray_subset <- function(filebase, position_indices, drop = TRUE, use_dimnames = TRUE, reshape = NULL) { 69 | .Call(`_filearray_filearray_subset`, filebase, position_indices, drop, use_dimnames, reshape) 70 | } 71 | 72 | FARR_subset_sequential <- function(filebase, unit_partlen, cum_partsizes, array_type, ret, from = 0L, len = 1L) { 73 | .Call(`_filearray_FARR_subset_sequential`, filebase, unit_partlen, cum_partsizes, array_type, ret, from, len) 74 | } 75 | 76 | FARR_subset2 <- function(filebase, listOrEnv, reshape = NULL, drop = FALSE, use_dimnames = TRUE, 
thread_buffer = 0L, split_dim = 0L, strict = 1L) { 77 | .Call(`_filearray_FARR_subset2`, filebase, listOrEnv, reshape, drop, use_dimnames, thread_buffer, split_dim, strict) 78 | } 79 | 80 | FARR_buffer_map <- function(input_filebases, output_filebase, map, buffer_nelems, result_nelems = 0L) { 81 | .Call(`_filearray_FARR_buffer_map`, input_filebases, output_filebase, map, buffer_nelems, result_nelems) 82 | } 83 | 84 | FARR_buffer_map2 <- function(input_filebases, map, buffer_nelems) { 85 | .Call(`_filearray_FARR_buffer_map2`, input_filebases, map, buffer_nelems) 86 | } 87 | 88 | FARR_buffer_mapreduce <- function(filebase, map, reduce, buffer_nelems) { 89 | .Call(`_filearray_FARR_buffer_mapreduce`, filebase, map, reduce, buffer_nelems) 90 | } 91 | 92 | FARR_subset_assign_sequential <- function(filebase, unit_partlen, cum_partsizes, array_type, value, from) { 93 | .Call(`_filearray_FARR_subset_assign_sequential`, filebase, unit_partlen, cum_partsizes, array_type, value, from) 94 | } 95 | 96 | FARR_subset_assign2 <- function(filebase, value, listOrEnv, thread_buffer = 2097152L, split_dim = 0L) { 97 | .Call(`_filearray_FARR_subset_assign2`, filebase, value, listOrEnv, thread_buffer, split_dim) 98 | } 99 | 100 | getDefaultNumThreads <- function() { 101 | .Call(`_filearray_getDefaultNumThreads`) 102 | } 103 | 104 | getThreads <- function(max) { 105 | .Call(`_filearray_getThreads`, max) 106 | } 107 | 108 | kinda_sorted <- function(idx, min_, buffer_count) { 109 | .Call(`_filearray_kinda_sorted`, idx, min_, buffer_count) 110 | } 111 | 112 | check_missing_dots <- function(env) { 113 | .Call(`_filearray_check_missing_dots`, env) 114 | } 115 | 116 | reshape_or_drop <- function(x, reshape, drop) { 117 | .Call(`_filearray_reshape_or_drop`, x, reshape, drop) 118 | } 119 | 120 | -------------------------------------------------------------------------------- /src/serialize.cpp: -------------------------------------------------------------------------------- 1 | #include "serialize.h" 2 | using namespace Rcpp; 3 | 4 | int read_byte(R_inpstream_t stream){ 5 | buffer_t *buf = (buffer_t *)stream->data; 6 | if (buf->pos >= buf->length) { 7 | stop("Overflow in `read_byte()`"); 8 | } 9 | return buf->data[buf->pos++]; 10 | } 11 | 12 | void read_bytes(R_inpstream_t stream, void *dst, int length) { 13 | buffer_t *buf = (buffer_t *)stream->data; 14 | if (buf->pos + length > buf->length) { 15 | stop("Overflow in `read_bytes()`"); 16 | } 17 | memcpy(dst, buf->data + buf->pos, length); 18 | buf->pos += length; 19 | } 20 | 21 | SEXP unserialize_raw(SEXP x) { 22 | if (TYPEOF(x) != RAWSXP) { 23 | stop("`unserialize_raw` requires raw input"); 24 | } 25 | Rbyte *vec = RAW(x); 26 | R_xlen_t len = Rf_xlength(x); 27 | 28 | buffer_t *buf = (buffer_t*) malloc(sizeof(buffer_t)); 29 | if (buf == NULL) { 30 | stop("`unserialize_raw` Cannot allocate memory for buffer"); 31 | } 32 | buf->length = len; 33 | buf->pos = 0; 34 | buf->data = (unsigned char *) vec; 35 | 36 | // Treat the data buffer as an input stream 37 | struct R_inpstream_st input_stream; 38 | 39 | R_InitInPStream( 40 | &input_stream, 41 | (R_pstream_data_t) buf, 42 | R_pstream_any_format, 43 | read_byte, 44 | read_bytes, 45 | NULL, 46 | NULL 47 | ); 48 | 49 | // Unserialize! 
50 | SEXP ret = PROTECT(R_Unserialize(&input_stream)); 51 | 52 | free(buf); 53 | buf = NULL; 54 | UNPROTECT(1); 55 | return ret; 56 | } 57 | 58 | SEXP unserialize_connection(FILE* conn, size_t len) { 59 | SEXP raw = PROTECT(Rf_allocVector(RAWSXP, len)); 60 | lendian_fread(RAW(raw), 1, len, conn); 61 | SEXP re = PROTECT(unserialize_raw(raw)); 62 | UNPROTECT(2); 63 | return(re); 64 | } 65 | 66 | bool isLittleEndian(){ 67 | int x = 1; 68 | bool is_little = *((char*)&x) == 1; 69 | return ( is_little ); 70 | // DEBUG test big endianess 71 | // return(!is_little); 72 | } 73 | 74 | void swap_endianess_old(void *ptr, size_t size, size_t nmemb){ 75 | unsigned char *buffer_src = (unsigned char*)ptr; 76 | unsigned char *buffer_dst = new unsigned char[size]; 77 | size_t ix = 0; 78 | for (size_t i = 0; i < nmemb; i++, buffer_src += size) { 79 | for (ix = 0; ix < size; ix++) { 80 | *(buffer_dst + (size - 1 - ix)) = *(buffer_src + ix); 81 | } 82 | memcpy(buffer_src, buffer_dst, size); 83 | } 84 | delete[] buffer_dst; 85 | } 86 | 87 | void swap_endianess(void *ptr, const size_t& size, const size_t& nmemb){ 88 | if( size <= 1 || nmemb <= 0 ){ return; } 89 | unsigned char *buffer_src1 = (unsigned char*)ptr; 90 | unsigned char *buffer_src2 = buffer_src1 + (size - 1); 91 | 92 | unsigned char tmp = 0; 93 | unsigned char* tmpptr = &(tmp); 94 | 95 | size_t ix = 0; 96 | const size_t half_size = size / 2; 97 | for (size_t i = 0; i < nmemb; i++) { 98 | for(ix = 0; ix < half_size; ix++){ 99 | *tmpptr = *buffer_src1; 100 | *buffer_src1++ = *buffer_src2; 101 | *buffer_src2-- = *tmpptr; 102 | } 103 | buffer_src1 += half_size; 104 | buffer_src2 += half_size + size; 105 | } 106 | } 107 | 108 | size_t lendian_fread(void *ptr, size_t size, size_t nmemb, FILE *stream) { 109 | const size_t len = fread(ptr, size, nmemb, stream); 110 | if( !isLittleEndian() ){ 111 | // little endian to big endian 112 | swap_endianess(ptr, size, nmemb); 113 | } 114 | return( len ); 115 | } 116 | 117 | size_t lendian_fwrite(void *ptr, size_t size, size_t nmemb, FILE *stream) { 118 | if( !isLittleEndian() ){ 119 | // big endian to little endian 120 | swap_endianess(ptr, size, nmemb); 121 | } 122 | return( fwrite(ptr, size, nmemb, stream) ); 123 | } 124 | 125 | 126 | void lendian_assign(void* dst, const void* src, const size_t& elem_size, const size_t& nelems){ 127 | if( !isLittleEndian() ){ 128 | const unsigned char *buffer_src = (const unsigned char*)src; 129 | unsigned char *buffer_dst = (unsigned char*)dst; 130 | size_t i = 0; 131 | for(size_t idx = 0; idx < nelems; idx++){ 132 | for(i = 0; i < elem_size; i++, buffer_dst++){ 133 | *buffer_dst = *(buffer_src + (idx * elem_size + elem_size - i - 1)); 134 | } 135 | } 136 | } else { 137 | memcpy(dst, src, elem_size * nelems); 138 | } 139 | } 140 | 141 | -------------------------------------------------------------------------------- /man/mapreduce.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/mapreduce.R 3 | \name{mapreduce} 4 | \alias{mapreduce} 5 | \alias{mapreduce,FileArray,ANY,function-method} 6 | \alias{mapreduce,FileArray,ANY,NULL-method} 7 | \alias{mapreduce,FileArray,ANY,missing-method} 8 | \title{A map-reduce method to iterate blocks of file-array data with little memory usage} 9 | \usage{ 10 | mapreduce(x, map, reduce, ...) 11 | 12 | \S4method{mapreduce}{FileArray,ANY,function}(x, map, reduce, buffer_size = NA, ...) 
13 | 14 | \S4method{mapreduce}{FileArray,ANY,NULL}(x, map, reduce, buffer_size = NA, ...) 15 | 16 | \S4method{mapreduce}{FileArray,ANY,missing}(x, map, reduce, buffer_size = NA, ...) 17 | } 18 | \arguments{ 19 | \item{x}{a file array object} 20 | 21 | \item{map}{mapping function that receives 3 arguments; see 'Details'} 22 | 23 | \item{reduce}{\code{NULL}, or a function that takes a list as input} 24 | 25 | \item{...}{passed to other methods} 26 | 27 | \item{buffer_size}{controls how the array is split; see 'Details'} 28 | } 29 | \value{ 30 | If \code{reduce} is \code{NULL}, the mapped results are returned; otherwise 31 | the reduced results from the \code{reduce} function are returned 32 | } 33 | \description{ 34 | A map-reduce method to iterate blocks of file-array data with little memory usage 35 | } 36 | \details{ 37 | When handling out-of-memory arrays, it is recommended to load 38 | a block of the array at a time and execute on the block level. See 39 | \code{\link{apply}} for an implementation. When an array is too large 40 | and there are too many blocks, this operation becomes 41 | very slow if computer memory is low, 42 | because R will perform garbage collection frequently. 43 | Implemented in \code{C++}, \code{mapreduce} creates a buffer to store 44 | the block data. By reusing the memory over and over again, it is possible 45 | to iterate through the array with minimal garbage collection. Many 46 | statistics, including \code{min}, \code{max}, \code{sum}, 47 | and \code{mean}, can be calculated efficiently in this 48 | way. 49 | 50 | The function \code{map} takes three arguments: \code{data} (mandatory), 51 | \code{size} (optional), and \code{first_index} (optional). 52 | The \code{data} argument is the buffer, 53 | whose length is consistent across iterations. \code{size} indicates 54 | the effective size of the buffer. If the partition size 55 | is not divisible by the buffer size, only the first \code{size} elements of 56 | the data come from the array, and the remaining elements will be \code{NA}. 57 | This situation can only occur when \code{buffer_size} is manually 58 | specified. By default, all of \code{data} belongs to the array. 59 | The last argument, \code{first_index}, is the index of the first element 60 | \code{data[1]} in the whole array. It is useful when positional data 61 | is needed. 62 | 63 | The buffer size, specified by \code{buffer_size}, is an 64 | additional optional argument in \code{...}. Its default is \code{NA}, 65 | which means it will be calculated automatically. If manually specified, a 66 | large buffer size is desirable to speed up the calculation. 67 | The default buffer size will not exceed \eqn{nThreads x 2MB}, where 68 | \code{nThreads} is the number of threads set by \code{\link{filearray_threads}}. 69 | When the partition length cannot be divided by the buffer size, instead of 70 | trimming the buffer, the buffer is padded with \code{NA}s before being 71 | passed to the \code{map} function; see the previous paragraph for how to handle this. 72 | 73 | The function \code{mapreduce} ignores missing partitions. That means 74 | if a partition is missing, its data will be neither read nor passed to 75 | the \code{map} function. Please run \code{x$initialize_partition()} to make sure 76 | partition files exist.
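For instance, the following sketch (illustrative only; it assumes \code{x} is a small file array such as the one created in the 'Examples' section) passes an explicit \code{buffer_size} and drops the \code{NA} padding inside \code{map}:
\preformatted{
mapreduce(
  x,
  map = function(data, size, first_index) {
    # only the first `size` elements are guaranteed to come from the array
    sum(data[seq_len(size)], na.rm = TRUE)
  },
  reduce = function(mapped) { do.call(sum, mapped) },
  buffer_size = 1024
)
}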
77 | } 78 | \examples{ 79 | 80 | 81 | x <- filearray_create(tempfile(), c(100, 100, 10)) 82 | x[] <- rnorm(1e5) 83 | 84 | ## calculate summation 85 | # identical to sum(x[]), but is more feasible in large cases 86 | 87 | mapreduce(x, map = function(data, size){ 88 | # make sure `data` is all from array 89 | if(length(data) != size){ 90 | data <- data[1:size] 91 | } 92 | sum(data) 93 | }, reduce = function(mapped_list){ 94 | do.call(sum, mapped_list) 95 | }) 96 | 97 | 98 | ## Find elements are less than -3 99 | positions <- mapreduce( 100 | x, 101 | map = function(data, size, first_index) { 102 | if (length(data) != size) { 103 | data <- data[1:size] 104 | } 105 | which(data < -3) + (first_index - 1) 106 | }, 107 | reduce = function(mapped_list) { 108 | do.call(c, mapped_list) 109 | } 110 | ) 111 | 112 | if(length(positions)){ 113 | x[[positions[1]]] 114 | } 115 | 116 | 117 | } 118 | -------------------------------------------------------------------------------- /adhoc/other docs/readme-figure.R: -------------------------------------------------------------------------------- 1 | set.seed(1) 2 | dim <- c(100,200,200,100) 3 | lazyx <- lazyarray::create_lazyarray( 4 | tempfile(), storage_format = 'double', dim = dim) 5 | lazyx[] <- NA 6 | filex <- filearray::filearray_create( 7 | tempfile(), dim, 'double') 8 | filex$initialize_partition() 9 | xs <- list(lazyx, filex) 10 | 11 | lazyarray:::set_lazy_threads(8) 12 | filearray::filearray_threads(8) 13 | 14 | set.seed(1) 15 | tmp <- rnorm(4e7) 16 | 17 | speed1 <- rowMeans(replicate(1, { 18 | res <- sapply(xs, function(x){ 19 | gc() 20 | system.time({ 21 | for(i in 1:10){ 22 | x[,,,(i-1)*10 +1:10] <- tmp 23 | } 24 | }, gcFirst = TRUE) 25 | }) 26 | speed <- prod(dim) *8e-6 / res[3,]; speed 27 | })); speed1 28 | 29 | speed2 <- rowMeans(replicate(1, { 30 | res <- sapply(xs, function(x){ 31 | gc() 32 | system.time({ 33 | for(i in 1:10){ 34 | x[,,,(i-1)*10 +1:10] 35 | } 36 | }, gcFirst = TRUE) 37 | }) 38 | speed <- prod(dim) *8e-6 / res[3,]; speed 39 | })); speed2 40 | 41 | set.seed(1) 42 | locs <- lapply(dim, function(d){ 43 | sample(1:d, replace = FALSE, size = sample(50:d, 1)) 44 | }) 45 | 46 | speed3 <- rowMeans(replicate(10, { 47 | res <- sapply(xs, function(x){ 48 | gc() 49 | system.time({ 50 | x[locs[[1]],locs[[2]],locs[[3]],locs[[4]]] 51 | }, gcFirst = TRUE) 52 | }) 53 | speed <- prod(sapply(locs, length)) *8e-6 / res[3,]; speed 54 | })); speed3 55 | 56 | tmp <- rnorm(prod(sapply(locs, length))) 57 | speed4 <- rowMeans(replicate(10, { 58 | res <- sapply(xs, function(x){ 59 | gc() 60 | system.time({ 61 | x[locs[[1]],locs[[2]],locs[[3]],locs[[4]]] <- tmp 62 | }, gcFirst = TRUE) 63 | }) 64 | speed <- prod(sapply(locs, length)) *8e-6 / res[3,]; speed 65 | })); speed4 66 | 67 | 68 | rm(tmp, sudo_pwd); gc() 69 | 70 | f <- function(txt_cex = 1.2){ 71 | 72 | cols <- c("orange", "dodgerblue3") 73 | speed <- cbind( 74 | speed1, 75 | speed4, 76 | speed2, 77 | speed3 78 | ) 79 | rownames(speed) <-c("lazyarray", "filearray") 80 | colnames(speed) <- NULL 81 | 82 | txt_cex2 = txt_cex 83 | 84 | par(mfrow = c(1,2)) 85 | 86 | plt <- barplot.default( 87 | speed[,1:2], beside = TRUE, 88 | ylab = "Speed (MB/s)", 89 | col = dipsaus::col2hexStr(cols, alpha = 0.5), 90 | ylim = c(0, 1000), las = 1, yaxt = "n", 91 | border = NA, 92 | main = "Single threaded\nmemory purged", 93 | cex.names = 1.4, cex.lab = 1.4, cex.main = 1.4 94 | ) 95 | axis(2, c(0, 350, 700), las = 1) 96 | axis(1, colMeans(plt), las = 1, label = c( 97 | "Write 10GB Data\n(Sequential)", 98 | "Substitute 
800MB Data\n(Random)" 99 | # "Read 10GB Data\n(Sequential)", 100 | # "Subset 800MB Data\n(Random)" 101 | ), tick = FALSE) 102 | text.default( 103 | x = plt, 104 | y = speed[,1:2], 105 | labels = sprintf(c( 106 | "\n%.0f MB/s", 107 | "\n%.0f MB/s" 108 | ), speed[,1:2]), 109 | cex = txt_cex, 110 | col = 'white' 111 | ) 112 | text.default( 113 | x = plt, 114 | y = speed[,1:2], 115 | labels = sprintf("%s\n", rownames(speed)), 116 | cex = txt_cex2, 117 | col = cols 118 | ) 119 | 120 | plt <- barplot.default( 121 | speed[,3:4], beside = TRUE, 122 | ylab = "Speed (GB/s)", 123 | col = dipsaus::col2hexStr(cols, alpha = 0.5), 124 | ylim = c(0, 2500), las = 1, yaxt = "n", 125 | border = NA, 126 | main = "Single threaded\nmemory purged", 127 | cex.names = 1.4, cex.lab = 1.4, cex.main = 1.4 128 | ) 129 | axis(2, c(0, 1000, 2000), las = 1, labels = c(0,1,2)) 130 | axis(1, colMeans(plt), las = 1, label = c( 131 | # "Write 10GB Data\n(Sequential)", 132 | # "Substitute 800MB Data\n(Random)" 133 | "Read 10GB Data\n(Sequential)", 134 | "Subset 800MB Data\n(Random)" 135 | ), tick = FALSE) 136 | text.default( 137 | x = plt, 138 | y = speed[,3:4], 139 | labels = sprintf(c( 140 | "\n%.0f MB/s", 141 | "\n%.0f MB/s" 142 | ), speed[,3:4]), 143 | cex = txt_cex, 144 | col = 'white' 145 | ) 146 | text.default( 147 | x = plt, 148 | y = speed[,3:4], 149 | labels = sprintf("%s\n", rownames(speed)), 150 | cex = txt_cex2, 151 | col = cols 152 | ) 153 | 154 | 155 | } 156 | 157 | # png("./adhoc/other docs/comparison-simple.png", width = 4267, height = 1600, res = 300) 158 | f(0.7) 159 | dev.off() 160 | -------------------------------------------------------------------------------- /tests/testthat/test-bind.R: -------------------------------------------------------------------------------- 1 | test_that("bind (FileArray)", { 2 | 3 | 4 | x <- array(rnorm(120), 2:5) 5 | y <- filearray_create(tempfile(), dimension = c(2,3,4,10), partition_size = 3L) 6 | z <- filearray_create(tempfile(), dimension = c(2,3,4,10), partition_size = 3L) 7 | 8 | options("filearray.quiet" = FALSE) 9 | on.exit({ 10 | options("filearray.quiet" = FALSE) 11 | y$delete(force = TRUE) 12 | z$delete(force = TRUE) 13 | }, add = TRUE) 14 | lapply(1:10, function(ii){ 15 | if(ii %% 2 == 0){ 16 | y[,,,ii] <- x[,,,ii / 2] 17 | z[,,,ii] <- x[,,,ii / 2] 18 | } 19 | }) 20 | 21 | testthat::expect_warning({ 22 | w <- filearray_bind(y, z, symlink = FALSE) 23 | w$delete() 24 | }, regexp = "^One or more arrays have last margin size.+") 25 | 26 | 27 | 28 | options("filearray.quiet" = TRUE) 29 | y$expand(n = 12) 30 | z$expand(n = 12) 31 | w <- filearray_bind(y, z, symlink = TRUE) 32 | l <- filearray_load(w$.filebase, mode = "readonly") 33 | 34 | on.exit({ 35 | w$.mode <- "readwrite" 36 | w$delete() 37 | l$.mode <- "readwrite" 38 | l$delete() 39 | }, add = TRUE) 40 | 41 | expect_null({ 42 | filearray_checkload(filebase = w$.filebase, symlink_ok = TRUE) 43 | filearray_checkload(filebase = w$.filebase, partition = 3) 44 | NULL 45 | }) 46 | 47 | if(w$.header$filearray_bind$symlink){ 48 | expect_error({ 49 | filearray_checkload(filebase = w$.filebase, symlink_ok = FALSE) 50 | }) 51 | } 52 | 53 | expect_identical(w[], l[]) 54 | expect_identical(w[,,,seq(2,10,2)], x) 55 | expect_identical(w[,,,seq(2,10,2) + 12], x) 56 | expect_identical(l[,,,seq(2,10,2)], x) 57 | expect_identical(l[,,,seq(2,10,2) + 12], x) 58 | 59 | expect_identical(l[,,,seq(2,10,2) + c(0, 12, 12, 0, NA)], x[,,,c(1:4, NA)]) 60 | 61 | expect_equal( 62 | l$collapse(keep = c(2, 3), method = "sum", na.rm = TRUE), 63 | 
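        # `y` and `z` each hold one copy of `x` (on the even slices) and `w` binds
        # them along the last margin, so the collapsed sums are exactly twice the
        # plain-array sums: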
apply(x, c(2,3), sum) * 2 64 | ) 65 | 66 | # Check if cached bind works 67 | l <- filearray_bind(y, z, filebase = w$.filebase, symlink = w$.header$filearray_bind$symlink, overwrite = TRUE, cache_ok = TRUE) 68 | 69 | expect_true(attr(l, "cached_bind")) 70 | 71 | }) 72 | 73 | 74 | test_that("bind (FileArrayProxy)", { 75 | 76 | 77 | y <- filearray_create(tempfile(), dimension = c(2,3,4,10), partition_size = 3L, type = "integer") 78 | z <- filearray_create(tempfile(), dimension = c(2,3,4,10), partition_size = 3L, type = "double") 79 | 80 | options("filearray.quiet" = FALSE) 81 | on.exit({ 82 | options("filearray.quiet" = FALSE) 83 | y$delete(force = TRUE) 84 | z$delete(force = TRUE) 85 | }, add = TRUE) 86 | 87 | y[] <- rep(0L, 240) 88 | z[] <- rep(0L, 240) 89 | x <- array(rnorm(240), c(2,3,4,10)) 90 | 91 | y <- y + x 92 | z <- z + x 93 | 94 | testthat::expect_error(y$expand(n = 12)) 95 | 96 | options("filearray.quiet" = TRUE) 97 | w <- filearray_bind(y, z, symlink = TRUE) 98 | l <- filearray_load(w$.filebase, mode = "readonly") 99 | 100 | on.exit({ 101 | w$.mode <- "readwrite" 102 | w$delete() 103 | l$.mode <- "readwrite" 104 | l$delete() 105 | }, add = TRUE) 106 | 107 | expect_null({ 108 | filearray_checkload(filebase = w$.filebase, symlink_ok = TRUE) 109 | filearray_checkload(filebase = w$.filebase, partition = 3) 110 | NULL 111 | }) 112 | 113 | if(w$.header$filearray_bind$symlink){ 114 | expect_error({ 115 | filearray_checkload(filebase = w$.filebase, symlink_ok = FALSE) 116 | }) 117 | } 118 | 119 | expect_identical(w[], l[]) 120 | expect_identical(w[,,,1:10, dimnames = NULL], x) 121 | expect_identical(w[,,,1:10 + 12, dimnames = NULL], x) 122 | expect_identical(l[,,,1:10, dimnames = NULL], x) 123 | expect_identical(l[,,,1:10 + 12, dimnames = NULL], x) 124 | 125 | expect_identical(l[,,,seq(2,10,2) + c(0, 12, 12, 0, NA), dimnames = NULL], x[,,,c(1:4*2, NA)]) 126 | 127 | expect_equal( 128 | l$collapse(keep = c(2, 3), method = "sum", na.rm = TRUE), 129 | apply(x, c(2,3), sum) * 2 130 | ) 131 | 132 | expect_equal( 133 | y$collapse(keep = c(2, 3), method = "sum", na.rm = TRUE), 134 | apply(x, c(2,3), sum) 135 | ) 136 | expect_equal( 137 | z$collapse(keep = c(2, 3), method = "sum", na.rm = TRUE), 138 | apply(x, c(2,3), sum) 139 | ) 140 | 141 | 142 | # Check if cached bind works 143 | # l <- filearray_bind(y, z, filebase = w$.filebase, symlink = w$.header$filearray_bind$symlink, overwrite = TRUE, cache_ok = TRUE) 144 | # 145 | # expect_true(attr(l, "cached_bind")) 146 | 147 | }) 148 | -------------------------------------------------------------------------------- /adhoc/other docs/comparison-singlethread.R: -------------------------------------------------------------------------------- 1 | sudo_pwd <- rstudioapi::askForPassword("sudo password") 2 | cold_start <- TRUE 3 | purge_memory <- function(){ 4 | if(cold_start){ 5 | system("sudo -kS purge", input = sudo_pwd) 6 | Sys.sleep(1) 7 | } 8 | } 9 | 10 | set.seed(1) 11 | dim <- c(100,200,200,100) 12 | lazyx <- lazyarray::create_lazyarray( 13 | tempfile(), storage_format = 'double', dim = dim) 14 | lazyx[] <- NA 15 | filex <- filearray::filearray_create( 16 | tempfile(), dim, 'double') 17 | filex$initialize_partition() 18 | xs <- list(lazyx, filex) 19 | 20 | lazyarray:::set_lazy_threads(1) 21 | filearray::filearray_threads(1) 22 | 23 | set.seed(1) 24 | tmp <- rnorm(4e7) 25 | 26 | 27 | speed1 <- rowMeans(replicate(5, { 28 | res <- sapply(xs, function(x){ 29 | gc() 30 | purge_memory() 31 | system.time({ 32 | for(i in 1:10){ 33 | x[,,,(i-1)*10 +1:10] <- 
tmp 34 | } 35 | }, gcFirst = TRUE) 36 | }) 37 | speed <- prod(dim) *8e-6 / res[3,]; speed 38 | })); speed1 39 | 40 | speed2 <- rowMeans(replicate(5, { 41 | res <- sapply(xs, function(x){ 42 | gc() 43 | purge_memory() 44 | system.time({ 45 | for(i in 1:10){ 46 | x[,,,(i-1)*10 +1:10] 47 | } 48 | }, gcFirst = TRUE) 49 | }) 50 | speed <- prod(dim) *8e-6 / res[3,]; speed 51 | })); speed2 52 | 53 | set.seed(1) 54 | locs <- lapply(dim, function(d){ 55 | sample(1:d, replace = FALSE, size = 100) 56 | }) 57 | 58 | speed3 <- rowMeans(replicate(5, { 59 | res <- sapply(xs, function(x){ 60 | gc() 61 | purge_memory() 62 | system.time({ 63 | x[locs[[1]],locs[[2]],locs[[3]],locs[[4]]] 64 | }, gcFirst = TRUE) 65 | }) 66 | speed <- 800 / res[3,]; speed 67 | })); speed3 68 | 69 | tmp <- rnorm(prod(sapply(locs, length))) 70 | speed4 <- rowMeans(replicate(5, { 71 | res <- sapply(xs, function(x){ 72 | gc() 73 | purge_memory() 74 | system.time({ 75 | x[locs[[1]],locs[[2]],locs[[3]],locs[[4]]] <- tmp 76 | }, gcFirst = TRUE) 77 | }) 78 | speed <- 800 / res[3,]; speed 79 | })); speed4 80 | 81 | 82 | rm(tmp, sudo_pwd); gc() 83 | 84 | f <- function(){ 85 | 86 | cols <- c("orange", "dodgerblue3") 87 | speed <- cbind( 88 | speed1, 89 | speed4, 90 | speed2, 91 | speed3 92 | ) 93 | rownames(speed) <-c("lazyarray", "filearray") 94 | colnames(speed) <- NULL 95 | 96 | txt_cex = 1.2 97 | txt_cex2 = 1.2 98 | 99 | par(mfrow = c(1,2)) 100 | 101 | plt <- barplot.default( 102 | speed[,1:2], beside = TRUE, 103 | ylab = "Speed (MB/s)", 104 | col = dipsaus::col2hexStr(cols, alpha = 0.5), 105 | ylim = c(0, 1000), las = 1, yaxt = "n", 106 | border = NA, 107 | main = sprintf("Single threaded\n%s", ifelse(cold_start, "memory purged", "")), 108 | cex.names = 1.4, cex.lab = 1.4, cex.main = 1.4 109 | ) 110 | axis(2, c(0, 350, 700), las = 1) 111 | axis(1, colMeans(plt), las = 1, label = c( 112 | "Write 10GB Data\n(Sequential)", 113 | "Substitute 800MB Data\n(Random)" 114 | # "Read 10GB Data\n(Sequential)", 115 | # "Subset 800MB Data\n(Random)" 116 | ), tick = FALSE) 117 | text.default( 118 | x = plt, 119 | y = speed[,1:2], 120 | labels = sprintf(c( 121 | "\n%.0f MB/s", 122 | "\n%.0f MB/s" 123 | ), speed[,1:2]), 124 | cex = txt_cex, 125 | col = 'white' 126 | ) 127 | text.default( 128 | x = plt, 129 | y = speed[,1:2], 130 | labels = sprintf("%s\n", rownames(speed)), 131 | cex = txt_cex2, 132 | col = cols 133 | ) 134 | 135 | plt <- barplot.default( 136 | speed[,3:4], beside = TRUE, 137 | ylab = "Speed (GB/s)", 138 | col = dipsaus::col2hexStr(cols, alpha = 0.5), 139 | ylim = c(0, 2500), las = 1, yaxt = "n", 140 | border = NA, 141 | main = sprintf("Single threaded\n%s", ifelse(cold_start, "memory purged", "")), 142 | cex.names = 1.4, cex.lab = 1.4, cex.main = 1.4 143 | ) 144 | axis(2, c(0, 1000, 2000), las = 1, labels = c(0,1,2)) 145 | axis(1, colMeans(plt), las = 1, label = c( 146 | # "Write 10GB Data\n(Sequential)", 147 | # "Substitute 800MB Data\n(Random)" 148 | "Read 10GB Data\n(Sequential)", 149 | "Subset 800MB Data\n(Random)" 150 | ), tick = FALSE) 151 | text.default( 152 | x = plt, 153 | y = speed[,3:4], 154 | labels = sprintf(c( 155 | "\n%.0f MB/s", 156 | "\n%.0f MB/s" 157 | ), speed[,3:4]), 158 | cex = txt_cex, 159 | col = 'white' 160 | ) 161 | text.default( 162 | x = plt, 163 | y = speed[,3:4], 164 | labels = sprintf("%s\n", rownames(speed)), 165 | cex = txt_cex2, 166 | col = cols 167 | ) 168 | 169 | 170 | } 171 | 172 | png(sprintf("./adhoc/other docs/comparison-singlethread-%s.png", ifelse(cold_start, "coldstart", "warmstart")), width 
= 4267, height = 1600, res = 300) 173 | f() 174 | dev.off() 175 | -------------------------------------------------------------------------------- /man/fmap.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/method-map.R 3 | \name{fmap} 4 | \alias{fmap} 5 | \alias{fmap2} 6 | \alias{fmap_element_wise} 7 | \title{Map multiple file arrays and save results} 8 | \usage{ 9 | fmap( 10 | x, 11 | fun, 12 | .y = NULL, 13 | .buffer_count = NA_integer_, 14 | .output_size = NA_integer_, 15 | ... 16 | ) 17 | 18 | fmap2(x, fun, .buffer_count = NA, .simplify = TRUE, ...) 19 | 20 | fmap_element_wise(x, fun, .y, ..., .input_size = NA) 21 | } 22 | \arguments{ 23 | \item{x}{a list of file arrays to map; each element of \code{x} must 24 | share the same dimensions.} 25 | 26 | \item{fun}{function that takes one list} 27 | 28 | \item{.y}{a file array object, used to save results} 29 | 30 | \item{.buffer_count}{number of total buffers (chunks) to run} 31 | 32 | \item{.output_size}{\code{fun} output vector length} 33 | 34 | \item{...}{other arguments passing to \code{fun}} 35 | 36 | \item{.simplify}{whether to apply \code{\link[base]{simplify2array}} to 37 | the result} 38 | 39 | \item{.input_size}{number of elements to read from each array of \code{x}} 40 | } 41 | \value{ 42 | File array instance \code{.y} 43 | } 44 | \description{ 45 | Advanced mapping function for multiple file arrays. \code{fmap} 46 | runs the mapping functions and stores the results in file arrays. 47 | \code{fmap2} stores results in memory. This 48 | feature is experimental. There are several constraints to the input. 49 | Failure to meet these constraints may result in undefined results, or 50 | even crashes. Please read Section 'Details' carefully before using 51 | this function. 52 | } 53 | \details{ 54 | Denote the first argument of \code{fun} as \code{input}, The length 55 | of \code{input} equals the length of \code{x}. The size of each 56 | element of \code{input} is defined by \code{.input_size}, except for the 57 | last loop. For example, given dimension of each input array as 58 | \eqn{10x10x10x10}, if \code{.input_size=100}, then 59 | \code{length(input[[1]])=100}. The total number of runs equals to 60 | \code{length(x[[1]])/100}. If \code{.input_size=300}, then 61 | \code{length(input[[1]])} will be \code{300} except for the last run. 62 | This is because \eqn{10000} cannot be divided by \code{300}. 63 | The element length of the last run will be \code{100}. 64 | 65 | The returned variable length of \code{fun} will be checked by 66 | \code{.output_size}. If the output length exceed \code{.output_size}, 67 | an error will be raised. 68 | 69 | Please make sure that \code{length(.y)/length(x[[1]])} equals to 70 | \code{.output_size/.input_size}. 71 | 72 | For \code{fmap_element_wise}, the \code{input[[1]]} and output length 73 | must be the consistent. 
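As a worked illustration of the size bookkeeping above (the numbers here are
made up for this sketch and are not taken from the package itself):
\preformatted{
## each input array: 10 x 10 x 10 x 10  -> 10000 elements
## .input_size  = 100  ->  10000 / 100 = 100 calls to `fun`
## .output_size = 2    ->  `.y` must have length 100 * 2 = 200
## check: length(.y) / length(x[[1]]) == .output_size / .input_size
##        200 / 10000  ==  2 / 100
}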
74 | } 75 | \examples{ 76 | 77 | 78 | set.seed(1) 79 | x1 <- filearray_create(tempfile(), dimension = c(100,20,3)) 80 | x1[] <- rnorm(6000) 81 | x2 <- filearray_create(tempfile(), dimension = c(100,20,3)) 82 | x2[] <- rnorm(6000) 83 | 84 | # Add two arrays 85 | output <- filearray_create(tempfile(), dimension = c(100,20,3)) 86 | fmap(list(x1, x2), function(input){ 87 | input[[1]] + input[[2]] 88 | }, output) 89 | 90 | # check 91 | range(output[] - (x1[] + x2[])) 92 | 93 | output$delete() 94 | 95 | # Calculate the maximum of x1/x2 for every 100 elements 96 | # total 60 batches/loops (`.buffer_count`) 97 | output <- filearray_create(tempfile(), dimension = c(20,3)) 98 | fmap(list(x1, x2), function(input){ 99 | max(input[[1]] / input[[2]]) 100 | }, .y = output, .buffer_count = 60) 101 | 102 | # check 103 | range(output[] - apply(x1[] / x2[], c(2,3), max)) 104 | 105 | output$delete() 106 | 107 | # A large array example 108 | if(interactive()){ 109 | x <- filearray_create(tempfile(), dimension = c(287, 100, 301, 4)) 110 | dimnames(x) <- list( 111 | Trial = 1:287, 112 | Marker = 1:100, 113 | Time = 1:301, 114 | Location = 1:4 115 | ) 116 | 117 | for(i in 1:4){ 118 | x[,,,i] <- runif(8638700) 119 | } 120 | # Step 1: 121 | # for each location, trial, and marker, calibrate (baseline) 122 | # according to first 50 time-points 123 | 124 | output <- filearray_create(tempfile(), dimension = dim(x)) 125 | 126 | # baseline-percentage change 127 | fmap( 128 | list(x), 129 | function(input){ 130 | # get locational data 131 | location_data <- input[[1]] 132 | dim(location_data) <- c(287, 100, 301) 133 | 134 | # collapse over first 50 time points for 135 | # each trial, and marker 136 | baseline <- apply(location_data[,,1:50], c(1,2), mean) 137 | 138 | # calibrate 139 | calibrated <- sweep(location_data, c(1,2), baseline, 140 | FUN = function(data, bl){ 141 | (data / bl - 1) * 100 142 | }) 143 | return(calibrated) 144 | }, 145 | 146 | .y = output, 147 | 148 | # input dimension is 287 x 100 x 301 for each location 149 | # hence 4 loops in total 150 | .buffer_count = 4 151 | ) 152 | 153 | # cleanup 154 | x$delete() 155 | 156 | } 157 | 158 | # cleanup 159 | x1$delete() 160 | x2$delete() 161 | output$delete() 162 | } 163 | -------------------------------------------------------------------------------- /NEWS.md: -------------------------------------------------------------------------------- 1 | # filearray 0.2.0 2 | 3 | * Additional `audo_set_headers` to function `filearray_load_or_create` to avoid automatically setting headers, with default being `TRUE` for compatibility concerns 4 | 5 | # filearray 0.1.9 6 | 7 | * Fixed `CRAN` check issue brought by `testthat` update, which requires `bit64` when checking `int64` arrays. Added `bit64` to `Suggests` field to mitigate this check error 8 | 9 | # filearray 0.1.8 10 | 11 | * Fixed a bug where `C++` errors are not correctly thrown due to a recent `Rcpp` update. 
This bug may cause erroneous code to execute ignoring try-catch statement and imbalanced protecting stacks (`fwhich`, `mapreduce`) 12 | 13 | # filearray 0.1.7 14 | 15 | * Removed non-API R C interface 16 | * Allows `subsetAssign` to assign when value length is 1 (thanks `@talegari` issue `#9`) 17 | * Allows more flexible `fwhich` function with functional `val` input and options to include corresponding values as part of return (thanks `@talegari` issue `#10`) 18 | * Fixed `mapreduce` bug when partition length is way larger than the buffer size, causing buffer to stop updating 19 | * Fixed other `C++` issues, such as unused buffer allocation in subset-assign, and `min_` offset in `sub_vec_range`. These issues are not considered bugs since they don't affect the results, but the performance is slightly increased after fix. 20 | * Replaced `dipsaus` with more updated `ravetools` in the vignette performance profiling 21 | * Updated examples to get rid of warning messages and out-dated arguments. 22 | 23 | # filearray 0.1.6 24 | 25 | * Removed `c++11` from system requirement and `Makevars` 26 | * Used `TinyThreads` instead of `OpenMP` to get parallel working on `OSX` 27 | * Added `as_filearray` method, and support optional `float` 28 | * Added array proxy class, allowing arrays to lazy-evaluate simple operators 29 | * Allows user-defined temporary file array path 30 | * Fixed `fmap` issues, using better guesses for default `.input_size` 31 | * Fixed a memory bug caused when partition margin has elements greater than 1 and when `FARR_subset_sequential` is used 32 | * Used `fastmap` to avoid environment look-up 33 | * `fa_eval_ops` allows dimension names 34 | * Larger default buffer size to allow `2^20` in single partition 35 | * Disabled single indexing 36 | * Fixed `endian` issue on `big-endian` platforms 37 | 38 | # filearray 0.1.5 39 | 40 | * Fixed a bug when trying to read array data sequentially. The bug is caused by buffer size being greater than the array length, making in a pointer that controls the partition number exceed the end of vector, resulting in undefined behavior. The functions affected are: `fmap`, `fmap2`. The bug has been fixed and passed `valgrind` memory check. 41 | 42 | # filearray 0.1.4 43 | 44 | * Fixed a bug when allocated memory is one byte short than requested. The bug would crash R when triggered in certain cases. 45 | * Removed limit to the maximum number of partitions when writing. The previous implementation creates and opens related file descriptors all at once before writing. This setup will raise errors when the number of connections reach to certain limit, often defined by the operating systems. This update only opens the connection on demand. The performance might be impacted when writing to disk, but in return, the program will be more robust 46 | * Fixed `subset` function environment not resolved correctly when using formula 47 | * Added `filearray_load_or_create` as an alternative to `filearray_checkload` by automatically replace existing obsolete array files if the headers, dimensions, or data types don't match. Also `on_missing` argument is provided to allow array initialization if new array is created. 
48 | 49 | # filearray 0.1.3 50 | 51 | * Automatically detect whether symbolic-link works and show warnings 52 | * Warnings can be suppressed 53 | * Allow extra headers to be set in `meta` file 54 | * Added header signature method 55 | * Fixed symbolic-link issues on `Windows` when partition sizes are 0 56 | * Added check-load function `filearray_checkload` to validate header 57 | * Fixed collapse method when `dimnames` are set 58 | * Fixed an unprotected variable in `C++` code 59 | * `filearray_bind` can use cache if the header signatures agree 60 | * `filearray_bind` can choose to force overwrite 61 | * Added package `digest` to `Imports` 62 | * Fixed a typo and several small bugs 63 | 64 | 65 | # filearray 0.1.2 66 | 67 | * Removed `flush` in saving data to let system decide when to flush to hard drive 68 | * Allowed array to expand along the partition margin 69 | * Fixed dimension name getting dropped under certain situations 70 | * Use 2 cores by default when `R CMD check` is detected 71 | 72 | # filearray 0.1.1 73 | 74 | * Added `OpenMP` flag in the `MakeVars` 75 | * Fixed critical bugs that could cause `segfaults` 76 | * Can store `complex` and `float` data types 77 | * Re-implemented read/write functions to use memory map 78 | * Allowed `dimnames` to be set 79 | * Added generics `subset` to subset using `dimnames` 80 | * Added vignette to compare performance 81 | * Added speed comparisons in `README.md` 82 | * Added `collapse` to calculate marginal summation with little memory overhead 83 | * Added `fmap`, `fmap2` to apply functions to one or multiple file arrays with little memory overhead (also very fast) 84 | * Fixed 'unprotected' issues warned by `rchk` 85 | 86 | # filearray 0.1.0 87 | 88 | * Added a `NEWS.md` file to track changes to the package. 
89 | * Initial implementation 90 | -------------------------------------------------------------------------------- /inst/include/TinyParallel/TinyThread.h: -------------------------------------------------------------------------------- 1 | #ifndef __FILEARRAY_PARALLEL_TINYTHREAD__ 2 | #define __FILEARRAY_PARALLEL_TINYTHREAD__ 3 | 4 | #include 5 | #include 6 | 7 | #include "Common.h" 8 | 9 | #include 10 | 11 | 12 | #include 13 | 14 | namespace TinyParallel { 15 | 16 | namespace { 17 | 18 | // Class which represents a range of indexes to perform work on 19 | // (worker functions are passed this range so they know which 20 | // elements are safe to read/write to) 21 | class IndexRange { 22 | public: 23 | 24 | // Initizlize with a begin and (exclusive) end index 25 | IndexRange(std::size_t begin, std::size_t end) 26 | : begin_(begin), end_(end) 27 | { 28 | } 29 | 30 | // Access begin() and end() 31 | std::size_t begin() const { return begin_; } 32 | std::size_t end() const { return end_; } 33 | std::size_t size() const { return end_ - begin_ ; } 34 | 35 | private: 36 | std::size_t begin_; 37 | std::size_t end_; 38 | }; 39 | 40 | 41 | // Because tinythread allows us to pass only a plain C function 42 | // we need to pass our worker and range within a struct that we 43 | // can cast to/from void* 44 | struct Work { 45 | Work(IndexRange range, Worker& worker) 46 | : range(range), worker(worker) 47 | { 48 | } 49 | IndexRange range; 50 | Worker& worker; 51 | }; 52 | 53 | // Thread which performs work (then deletes the work object 54 | // when it's done) 55 | extern "C" inline void workerThread(void* data) { 56 | try 57 | { 58 | Work* pWork = static_cast(data); 59 | pWork->worker(pWork->range.begin(), pWork->range.end()); 60 | delete pWork; 61 | } 62 | catch(...) 
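    // Swallow any exception thrown by the worker: letting it escape a
    // worker thread would terminate the whole process, so errors must be
    // handled inside the Worker itself.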
63 | { 64 | } 65 | } 66 | 67 | // Function to calculate the ranges for a given input 68 | std::vector splitInputRange(const IndexRange& range, 69 | std::size_t grainSize) { 70 | 71 | // determine max number of threads 72 | std::size_t threads = tthread::thread::hardware_concurrency(); 73 | char* numThreads = ::getenv("FILEARRAY_NUM_THREADS"); 74 | if (numThreads != NULL) { 75 | int parsedThreads = ::atoi(numThreads); 76 | if (parsedThreads > 0) 77 | threads = parsedThreads; 78 | } 79 | // printf("Number of threads: %ld\n", threads); 80 | // std::cout << "Number of threads: " << threads << "\n"; 81 | 82 | // compute grainSize (including enforcing requested minimum) 83 | std::size_t length = range.end() - range.begin(); 84 | if (threads == 1) 85 | grainSize = length; 86 | else if ((length % threads) == 0) // perfect division 87 | grainSize = std::max(length / threads, grainSize); 88 | else // imperfect division, divide by threads - 1 89 | grainSize = std::max(length / (threads-1), grainSize); 90 | 91 | // allocate ranges 92 | std::vector ranges; 93 | std::size_t begin = range.begin(); 94 | std::size_t end = begin; 95 | while (begin < range.end()) { 96 | if ((range.end() - (begin + grainSize)) < grainSize) 97 | end = range.end(); 98 | else 99 | end = std::min(begin + grainSize, range.end()); 100 | 101 | ranges.push_back(IndexRange(begin, end)); 102 | begin = end; 103 | } 104 | 105 | // return ranges 106 | return ranges; 107 | } 108 | 109 | } // anonymous namespace 110 | 111 | // Execute the Worker over the IndexRange in parallel 112 | inline void ttParallelFor(std::size_t begin, 113 | std::size_t end, 114 | Worker& worker, 115 | std::size_t grainSize = 1) 116 | { 117 | // split the work 118 | IndexRange inputRange(begin, end); 119 | std::vector ranges = splitInputRange(inputRange, grainSize); 120 | 121 | // create threads 122 | std::vector threads; 123 | for (std::size_t i = 0; ijoin(); 130 | delete threads[i]; 131 | } 132 | } 133 | 134 | // Execute the IWorker over the range in parallel then join results 135 | template 136 | inline void ttParallelReduce(std::size_t begin, 137 | std::size_t end, 138 | Reducer& reducer, 139 | std::size_t grainSize = 1) 140 | { 141 | // split the work 142 | IndexRange inputRange(begin, end); 143 | std::vector ranges = splitInputRange(inputRange, grainSize); 144 | 145 | // create threads (split for each thread and track the allocated workers) 146 | std::vector threads; 147 | std::vector workers; 148 | for (std::size_t i = 0; ijoin(); 158 | 159 | // join the results 160 | reducer.join(static_cast(*workers[i])); 161 | 162 | // delete the worker (which we split above) and the thread 163 | delete workers[i]; 164 | delete threads[i]; 165 | } 166 | } 167 | 168 | } // namespace TinyParallel 169 | 170 | #endif // __FILEARRAY_PARALLEL_TINYTHREAD__ 171 | -------------------------------------------------------------------------------- /adhoc/other docs/comparison-threads.R: -------------------------------------------------------------------------------- 1 | sudo_pwd <- rstudioapi::askForPassword("sudo password") 2 | cold_start <- TRUE 3 | 4 | set.seed(1) 5 | dim <- c(100,200,200,100) 6 | filex <- filearray::filearray_create( 7 | tempfile(), dim, 'double') 8 | filex$initialize_partition() 9 | 10 | set.seed(1) 11 | tmp <- rnorm(1e8) 12 | 13 | threads <- c(1,2,4,8) 14 | 15 | # run in order to make sure we have cold start 16 | speed1 <- rowMeans(replicate(5, { 17 | res <- sapply(threads, function(thread){ 18 | filearray::filearray_threads(thread) 19 | gc() 20 | 
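    # flush the OS disk cache between timings (macOS `purge`) so each run starts cold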
if(cold_start){system("sudo -kS purge", input = sudo_pwd)} 21 | Sys.sleep(1) 22 | system.time({ 23 | filex[,,,1:25 + (log2(thread) * 25)] <- tmp 24 | }) 25 | }) 26 | speed <- length(filex) *8e-6 / 4 / res[3,]; speed 27 | })); speed1 28 | 29 | rm(tmp); gc() 30 | 31 | 32 | speed2 <- rowMeans(replicate(5, { 33 | res <- sapply(threads, function(thread){ 34 | filearray::filearray_threads(thread) 35 | gc() 36 | if(cold_start){system("sudo -kS purge", input = sudo_pwd)} 37 | Sys.sleep(1) 38 | system.time({ 39 | filex[,,,1:25 + (log2(thread) * 25)] 40 | }) 41 | }) 42 | speed <- length(filex) *8e-6 / 4 / res[3,]; speed 43 | })); speed2 44 | 45 | 46 | # 800 MB data indices 47 | set.seed(1) 48 | locs <- lapply(dim, function(d){ 49 | sample(1:d, replace = FALSE, size = 100) 50 | }) 51 | 52 | # CMD: sudo purge 53 | speed3 <- rowMeans(replicate(5, { 54 | res <- sapply(threads, function(thread){ 55 | filearray::filearray_threads(thread) 56 | gc() 57 | if(cold_start){system("sudo -kS purge", input = sudo_pwd)} 58 | Sys.sleep(1) 59 | system.time({ 60 | filex[locs[[1]],locs[[2]],locs[[3]],locs[[4]]] 61 | }, gcFirst = TRUE) 62 | }) 63 | speed <- 800 / res[3,]; speed 64 | })); speed3 65 | 66 | 67 | tmp <- rnorm(prod(sapply(locs, length))) 68 | # filex <- filearray::filearray_load(filex$.filebase) 69 | speed4 <- rowMeans(replicate(5, { 70 | res <- sapply(threads, function(thread){ 71 | filearray::filearray_threads(thread) 72 | stopifnot(filearray::filearray_threads() == thread) 73 | gc() 74 | if(cold_start){system("sudo -kS purge", input = sudo_pwd)} 75 | Sys.sleep(1) 76 | system.time({ 77 | filex[locs[[1]],locs[[2]],locs[[3]],locs[[4]]] <- tmp 78 | }, gcFirst = TRUE) 79 | }) 80 | speed <- 800 / res[3,]; speed 81 | })); speed4 82 | 83 | rm(tmp, sudo_pwd); gc() 84 | # save.image("./adhoc/other docs/comparison-threads.RData") 85 | 86 | f <- function(txt_cex = 0.7){ 87 | 88 | par(mfrow = c(1,2)) 89 | 90 | cols <- c("orangered", "orange", "darkgreen", "dodgerblue3") 91 | speed <- cbind( 92 | speed1, 93 | speed4, 94 | speed2, 95 | speed3 96 | ) / 1024 97 | rownames(speed) <-c("1-thread", paste0(c(2,4,8), "-threads")) 98 | colnames(speed) <- NULL 99 | 100 | 101 | txt_cex2 = txt_cex 102 | 103 | idx <- 1:2 104 | plt <- barplot.default( 105 | speed[,idx], beside = TRUE, 106 | ylab = "Speed (GB/s)", 107 | col = dipsaus::col2hexStr(cols, alpha = 0.5), 108 | ylim = c(0, 1.2), las = 1, yaxt = "n", 109 | border = NA, 110 | main = "Speed Comparisons (Different Threads, memory purged)", 111 | cex.names = 1.4, cex.lab = 1.4, cex.main = 1.4 112 | ) 113 | axis(2, c(0,0.5,1), las = 1) 114 | axis(1, colMeans(plt), las = 1, label = c( 115 | "Write 800MB Data\n(Sequential)", 116 | "Substitute 800MB Data\n(Random)" 117 | # "Read 800MB Data\n(Sequential)", 118 | # "Subset 800MB Data\n(Random)" 119 | ), tick = FALSE) 120 | text.default( 121 | x = plt, 122 | y = speed[,idx], 123 | labels = sprintf(c( 124 | "\n%.0f MB/s", 125 | "\n%.0f MB/s" 126 | ), speed[,idx] * 1024), 127 | cex = txt_cex, 128 | col = 'white' 129 | ) 130 | text.default( 131 | x = plt, 132 | y = speed[,idx], 133 | labels = sprintf("%s\n", rownames(speed)), 134 | cex = txt_cex2, 135 | col = cols 136 | ) 137 | 138 | 139 | idx <- 3:4 140 | plt <- barplot.default( 141 | speed[,idx], beside = TRUE, 142 | ylab = "Speed (GB/s)", 143 | col = dipsaus::col2hexStr(cols, alpha = 0.5), 144 | ylim = c(0, 3.5), las = 1, yaxt = "n", 145 | border = NA, 146 | main = "Speed Comparisons (Different Threads, memory purged)", 147 | cex.names = 1.4, cex.lab = 1.4, cex.main = 1.4 148 | ) 149 | 
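  # `speed` was converted to GB/s above; the axes are drawn in GB/s while the
  # bar labels below convert back to MB/s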
axis(2, c(0:3), las = 1) 150 | axis(1, colMeans(plt), las = 1, label = c( 151 | # "Write 800MB Data\n(Sequential)", 152 | # "Substitute 800MB Data\n(Random)" 153 | "Read 800MB Data\n(Sequential)", 154 | "Subset 800MB Data\n(Random)" 155 | ), tick = FALSE) 156 | text.default( 157 | x = plt, 158 | y = speed[,idx], 159 | labels = sprintf(c( 160 | "\n%.0f MB/s", 161 | "\n%.0f MB/s" 162 | ), speed[,idx] * 1024), 163 | cex = txt_cex, 164 | col = 'white' 165 | ) 166 | text.default( 167 | x = plt, 168 | y = speed[,idx], 169 | labels = sprintf("%s\n", rownames(speed)), 170 | cex = txt_cex2, 171 | col = cols 172 | ) 173 | 174 | 175 | } 176 | 177 | png(sprintf("./adhoc/other docs/comparison-threads-%s.png", ifelse(cold_start, "coldstart", "warmstart")), width = 4267, height = 1600, res = 300) 178 | f() 179 | dev.off() 180 | -------------------------------------------------------------------------------- /man/filearray.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/class-filearray.R, R/class-proxy.R, R/load.R 3 | \docType{class} 4 | \name{as_filearray} 5 | \alias{as_filearray} 6 | \alias{FileArrayProxy-class} 7 | \alias{FileArrayProxy} 8 | \alias{as_filearrayproxy} 9 | \alias{filearray} 10 | \alias{filearray_create} 11 | \alias{filearray_load} 12 | \alias{filearray_checkload} 13 | \alias{filearray_load_or_create} 14 | \title{Create or load existing file arrays} 15 | \usage{ 16 | as_filearray(x, ...) 17 | 18 | as_filearrayproxy(x, ...) 19 | 20 | filearray_create( 21 | filebase, 22 | dimension, 23 | type = c("double", "float", "integer", "logical", "raw", "complex"), 24 | partition_size = NA, 25 | initialize = FALSE, 26 | ... 27 | ) 28 | 29 | filearray_load(filebase, mode = c("readwrite", "readonly")) 30 | 31 | filearray_checkload( 32 | filebase, 33 | mode = c("readonly", "readwrite"), 34 | ..., 35 | symlink_ok = TRUE 36 | ) 37 | 38 | filearray_load_or_create( 39 | filebase, 40 | dimension, 41 | on_missing = NULL, 42 | type = NA, 43 | ..., 44 | mode = c("readonly", "readwrite"), 45 | symlink_ok = TRUE, 46 | initialize = FALSE, 47 | partition_size = NA, 48 | verbose = FALSE, 49 | auto_set_headers = TRUE 50 | ) 51 | } 52 | \arguments{ 53 | \item{x}{R object such as array, file array proxy, or character that can be 54 | transformed into file array} 55 | 56 | \item{...}{additional headers to check used by \code{filearray_checkload} 57 | (see 'Details'). This argument is ignored by \code{filearray_create}, 58 | reserved for future compatibility.} 59 | 60 | \item{filebase}{a directory path to store arrays in the local file 61 | system. When creating an array, the path must not exist.} 62 | 63 | \item{dimension}{dimension of the array, at least length of 2} 64 | 65 | \item{type}{storage type of the array; default is \code{'double'}. Other 66 | options include \code{'integer'}, \code{'logical'}, and \code{'raw'}.} 67 | 68 | \item{partition_size}{positive partition size for the last margin, or 69 | \code{NA} to automatically guess; see 'Details'.} 70 | 71 | \item{initialize}{whether to initialize partition files; default is false 72 | for performance considerations. 
However, if the array is dense, it is 73 | recommended to set to true} 74 | 75 | \item{mode}{whether allows writing to the file; choices are 76 | \code{'readwrite'} and \code{'readonly'}.} 77 | 78 | \item{symlink_ok}{whether arrays with symbolic-link partitions can pass 79 | the test; this is usually used on bound arrays with symbolic-links; see 80 | \code{\link{filearray_bind}};} 81 | 82 | \item{on_missing}{function to handle file array (such as initialization) 83 | when a new array is created; must take only one argument, the array object} 84 | 85 | \item{verbose}{whether to print out some debug messages} 86 | 87 | \item{auto_set_headers}{whether to automatically set headers if array is 88 | missing or to be created; default is true} 89 | } 90 | \value{ 91 | A \code{\link{FileArray-class}} instance. 92 | } 93 | \description{ 94 | Create or load existing file arrays 95 | } 96 | \details{ 97 | The file arrays partition out-of-memory array objects and store them 98 | separately in local file systems. Since R stores matrices/arrays 99 | in column-major style, file array uses the slowest margin (the 100 | last margin) to slice the partitions. This helps to align the elements 101 | within the files with the corresponding memory order. An array with 102 | dimension \code{100x200x300x400} has 4 margins. The length of the 103 | last margin is 400, which is also the maximum number of potential 104 | partitions. The number of partitions are determined by the last margin 105 | size divided by \code{partition_size}. For example, if the partition 106 | size is 1, then there will be 400 partitions. If the partition size 107 | if 3, there will be 134 partitions. The default partition sizes 108 | are determined internally following these priorities: 109 | \describe{ 110 | \item{1. }{the file size of each partition does not exceed \code{1GB}} 111 | \item{2. }{the number of partitions do not exceed 100} 112 | } 113 | These two rules are not hard requirements. The goal is to reduce the 114 | numbers of partitions as much as possible. 115 | 116 | The arguments \code{...} in \code{filearray_checkload} should be named 117 | arguments that provide additional checks for the header information. 118 | The check will fail if at least one header is not identical. For example, 119 | if an array contains header key-signature pair, one can use 120 | \code{filearray_checkload(..., key = signature)} to validate the signature. 121 | Note the comparison will be rigid, meaning the storage type of the headers 122 | will be considered as well. If the signature stored in the array is an 123 | integer while provided is a double, then the check will result in failure. 
124 | } 125 | 126 | \examples{ 127 | 128 | 129 | # Prepare 130 | library(filearray) 131 | filebase <- tempfile() 132 | if(file.exists(filebase)){ unlink(filebase, TRUE) } 133 | 134 | # create array 135 | x <- filearray_create(filebase, dimension = c(200, 30, 8)) 136 | print(x) 137 | 138 | # Assign values 139 | x[] <- rnorm(48000) 140 | 141 | # Subset 142 | x[1,2,] 143 | 144 | # load existing array 145 | filearray_load(filebase) 146 | 147 | x$set_header("signature", "tom") 148 | filearray_checkload(filebase, signature = "tom") 149 | 150 | \dontrun{ 151 | # Trying to load with wrong signature 152 | filearray_checkload(filebase, signature = "jerry") 153 | } 154 | 155 | 156 | # check-load, and create a new array if fail 157 | x <- filearray_load_or_create( 158 | filebase = filebase, dimension = c(200, 30, 8), 159 | verbose = FALSE, signature = "henry" 160 | ) 161 | x$get_header("signature") 162 | 163 | # check-load with initialization 164 | x <- filearray_load_or_create( 165 | filebase = filebase, 166 | dimension = c(3, 4, 5), 167 | verbose = FALSE, mode = "readonly", 168 | on_missing = function(array) { 169 | array[] <- seq_len(60) 170 | } 171 | ) 172 | 173 | x[1:3,1,1] 174 | 175 | # Clean up 176 | unlink(filebase, recursive = TRUE) 177 | 178 | } 179 | \author{ 180 | Zhengjia Wang 181 | } 182 | -------------------------------------------------------------------------------- /R/header.R: -------------------------------------------------------------------------------- 1 | #' Position (size): description - example value 2 | #' 0 (8): determine endianness - 1.0 (double) 3 | #' 8 (4 x 3): version number - [1, 0, 0] (int[3]) 4 | #' 20 (4): SEXP type - 14L (SEXPTYPE, data are double type) 5 | #' 24 (4): element size - 8L (int, data element size is 8) 6 | #' 28 (8): number of last dimensions per partition - 1 (double) 7 | #' 36 (8): total number of elements - 24 (double) 8 | #' 44 (4): number of dimensions (ndims) - 3 (int) 9 | #' 48 (8 x ndims): dimensions - [2, 3, 4] (double) 10 | #' 11 | #' ... 
12 | #' 13 | #' 1012 (4): size of the header - 72 (int) 14 | #' 1016 (8): number of elements in the file - (double) 15 | #' 16 | #' @noRd 17 | NULL 18 | 19 | write_header <- function(fid, partition, dimension, type, size){ 20 | seek(con = fid, where = 0, rw = "write") 21 | 22 | # 0 (8): determine endianness - 1.0 (double) 23 | writeBin(con = fid, object = 1.0, size = 8L, endian = ENDIANNESS) 24 | 25 | # 8 (4 x 3): version number - [1, 0, 0] (int[3]) 26 | writeBin(con = fid, object = FILE_VER, size = 4L, endian = ENDIANNESS) 27 | 28 | sexp_type <- type_to_sexp(type) 29 | 30 | # 20 (4): SEXP type - 14L (SEXPTYPE, data are double type) 31 | writeBin(con = fid, object = sexp_type, size = 4L, 32 | endian = ENDIANNESS) 33 | 34 | # 24 (4): element size - 8L (int, data element size is 8) 35 | writeBin(con = fid, object = as.integer(size), size = 4L, 36 | endian = ENDIANNESS) 37 | 38 | # partition number 39 | partition <- as.double(partition) 40 | 41 | # 28 (8): number of last dimensions per partition - 1 (double) 42 | writeBin(con = fid, object = partition, size = 8L, endian = ENDIANNESS) 43 | 44 | ## # write length of dimension 45 | dimension <- as.double(dimension) 46 | len <- prod(dimension) 47 | 48 | # 36 (8): total number of elements - 24 (double) 49 | writeBin(con = fid, object = len, size = 8L, endian = ENDIANNESS) 50 | 51 | # 44 (4): number of dimensions (ndims) - 3 (int) 52 | writeBin(con = fid, object = length(dimension), size = 4L, 53 | endian = ENDIANNESS) 54 | 55 | # 48 (8 x ndims): dimensions - [2, 3, 4] (double) 56 | writeBin(con = fid, object = dimension, size = 8L, 57 | endian = ENDIANNESS) 58 | header_len <- 48L + 8L * length(dimension) 59 | writeBin(con = fid, object = rep(0L, (HEADER_SIZE - header_len) / 4 - 3), 60 | size = 4L, endian = ENDIANNESS) 61 | writeBin(con = fid, object = header_len, size = 4L, endian = ENDIANNESS) 62 | writeBin(con = fid, object = 0.0, size = 8L, endian = ENDIANNESS) 63 | seek(con = fid, where = 0, rw = "write") 64 | } 65 | 66 | read_header <- function(fid){ 67 | seek(con = fid, where = 0, rw = "read") 68 | 69 | one <- readBin(fid, n = 1, size = 8L, what = 'double', endian = ENDIANNESS) 70 | native <- one == 1 71 | 72 | if( native ){ 73 | endian <- ENDIANNESS 74 | } else { 75 | if( ENDIANNESS == "little" ){ 76 | endian <- 'big' 77 | # TODO: support big-endian file? 
maybe not 78 | stop("The file endianess is not little?") 79 | } else { 80 | endian <- "little" 81 | } 82 | } 83 | 84 | version <- readBin(fid, what = 'int', n = 3, size = 4L, endian = endian) 85 | 86 | # type, size 87 | sexp_type <- readBin(fid, what = 'int', size = 4L, endian = endian) 88 | vsize <- readBin(fid, what = 'int', size = 4L, endian = endian) 89 | 90 | # partition number 91 | partition <- readBin(con = fid, what = 'double', size = 8L, endian = endian) 92 | 93 | # dimension 94 | size <- readBin(con = fid, what = 'double', size = 8L, endian = endian) 95 | ndims <- readBin(con = fid, what = 'int', size = 4L, endian = endian) 96 | dim <- readBin(con = fid, what = 'double', size = 8L, n = ndims, 97 | endian = endian) 98 | 99 | seek(con = fid, where = HEADER_SIZE - 12L, rw = "read") 100 | header_bytes <- readBin(con = fid, what = 'int', size = 4L, 101 | endian = endian) 102 | 103 | content_length <- readBin(con = fid, what = 'double', size = 8L, 104 | endian = endian) 105 | list( 106 | endianness = endian, 107 | version = version, 108 | sexp_type = sexp_type, 109 | unit_bytes = vsize, 110 | partition = partition, 111 | partition_size = size, 112 | partition_dim = dim, 113 | header_bytes = header_bytes, 114 | content_length = content_length 115 | ) 116 | } 117 | 118 | validate_header <- function(file, fid){ 119 | fz <- -1 120 | if(!missing(file)){ 121 | if(!file.exists(file)){ 122 | stop("File is missing") 123 | } 124 | fz <- file.size(file) 125 | if(fz < HEADER_SIZE){ 126 | # Might be on windows and partition files are symlinked 127 | if(get_os() != "windows" || fz != 0){ 128 | stop("Invalid `filearray` partition. File size too small:\n ", file) 129 | } 130 | } 131 | fid <- file(description = file, open = "rb") 132 | on.exit({ 133 | close(fid) 134 | }) 135 | } 136 | 137 | header <- read_header(fid) 138 | if( header$header_bytes != 48 + 8 * length(header$partition_dim) ){ 139 | stop("Filearray partition header is corrupted.") 140 | } 141 | if( fz > 0 && header$content_length + HEADER_SIZE > fz ){ 142 | stop("Filearray data is corrupted") 143 | } 144 | 145 | return(header) 146 | } 147 | 148 | set_meta_content <- function(meta_file, data){ 149 | stopifnot(file.exists(meta_file)) 150 | data <- as.list(data) 151 | data$header_version <- HEADER_VER 152 | 153 | keys <- names(data) 154 | names(data) <- sprintf("__%s__", keys) 155 | 156 | conn <- rawConnection(raw(), "w+b") 157 | saveRDS(file = conn, data, ascii = FALSE) 158 | v <- rawConnectionValue(conn) 159 | close(conn) 160 | 161 | fid <- file(meta_file, "r+b") 162 | on.exit({ close(fid) }) 163 | seek(con = fid, where = HEADER_SIZE, origin = "start", rw = "write") 164 | writeBin(object = v, con = fid, endian = ENDIANNESS) 165 | 166 | seek(con = fid, where = HEADER_SIZE - 8L, origin = "start", rw = "write") 167 | writeBin(con = fid, object = as.double(length(v)), size = 8L, endian = ENDIANNESS) 168 | seek(con = fid, where = 0, rw = "write") 169 | } 170 | 171 | -------------------------------------------------------------------------------- /tests/testthat/test-map.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | 3 | 4 | test_that("map with proxy", { 5 | 6 | set.seed(10) 7 | 8 | # A large array example 9 | x1 <- filearray_create(temp_path(check = TRUE), dimension = c(28, 100, 3, 4), initialize = FALSE, partition_size = 3L) 10 | x1[] <- rnorm(33600) 11 | x2 <- x1 + 1 12 | x3 <- x1 + x2 13 | x4 <- as_filearray(matrix(1:12, nrow = 4)) 14 | 15 | x <- list(x1, x2, x3, x4) 16 | # check 
common input size 17 | 18 | bc <- 12 19 | re <- fmap2(x, function(input) { 20 | testthat::expect_length(input, 4) 21 | testthat::expect_length(input[[1]], length(x1) / bc) 22 | testthat::expect_length(input[[2]], length(x2) / bc) 23 | testthat::expect_length(input[[3]], length(x3) / bc) 24 | testthat::expect_length(input[[4]], length(x4) / bc) 25 | testthat::expect_equal(input[[1]] + 1, input[[2]]) 26 | testthat::expect_equal(input[[3]], input[[2]] + input[[1]]) 27 | 28 | sum(input[[4]]) + sum(input[[3]] - input[[2]] - input[[1]]) 29 | }, .buffer_count = bc) 30 | expect_equal(re, colSums(matrix(x4[], ncol = bc))) 31 | 32 | 33 | bc <- 4 34 | re <- fmap2(x, function(input) { 35 | testthat::expect_length(input, 4) 36 | testthat::expect_length(input[[1]], length(x1) / bc) 37 | testthat::expect_length(input[[2]], length(x2) / bc) 38 | testthat::expect_length(input[[3]], length(x3) / bc) 39 | testthat::expect_length(input[[4]], length(x4) / bc) 40 | testthat::expect_equal(input[[1]] + 1, input[[2]]) 41 | testthat::expect_equal(input[[3]], input[[2]] + input[[1]]) 42 | 43 | sum(input[[4]]) + sum(input[[3]] - input[[2]] - input[[1]]) 44 | }, .buffer_count = bc) 45 | expect_equal(re, colSums(matrix(x4[], ncol = bc))) 46 | 47 | bc <- 1 48 | re <- fmap2(x, function(input) { 49 | testthat::expect_length(input, 4) 50 | testthat::expect_length(input[[1]], length(x1) / bc) 51 | testthat::expect_length(input[[2]], length(x2) / bc) 52 | testthat::expect_length(input[[3]], length(x3) / bc) 53 | testthat::expect_length(input[[4]], length(x4) / bc) 54 | testthat::expect_equal(input[[1]] + 1, input[[2]]) 55 | testthat::expect_equal(input[[3]], input[[2]] + input[[1]]) 56 | 57 | sum(input[[4]]) + sum(input[[3]] - input[[2]] - input[[1]]) 58 | }, .buffer_count = bc) 59 | expect_equal(re, colSums(matrix(x4[], ncol = bc))) 60 | 61 | 62 | # check fmap 63 | bc <- 12 64 | 65 | y <- filearray_create(temp_path(), dimension = c(12,1)) 66 | fmap(x, function(input) { 67 | testthat::expect_length(input, 4) 68 | testthat::expect_length(input[[1]], length(x1) / bc) 69 | testthat::expect_length(input[[2]], length(x2) / bc) 70 | testthat::expect_length(input[[3]], length(x3) / bc) 71 | testthat::expect_length(input[[4]], length(x4) / bc) 72 | testthat::expect_equal(input[[1]] + 1, input[[2]]) 73 | testthat::expect_equal(input[[3]], input[[2]] + input[[1]]) 74 | 75 | input[[4]] + sum(input[[3]] - input[[2]] - input[[1]]) 76 | }, .buffer_count = bc, .y = y) 77 | expect_equal(as.vector(y[]), as.vector(x4[])) 78 | 79 | clear_cache() 80 | }) 81 | 82 | 83 | test_that("map filearrays", { 84 | 85 | # A large array example 86 | x <- filearray_create(temp_path(check = TRUE), dimension = c(28, 100, 301, 4), initialize = FALSE, partition_size = 3L) 87 | dnames <- list( 88 | Trial = sample(c("A", "B"), 28, replace = TRUE), 89 | Marker = 1:100, 90 | Time = seq(-1,2,0.01), 91 | Location = 1:4 92 | ) 93 | dimnames(x) <- dnames 94 | 95 | expect_equal(dimnames(x), dnames) 96 | 97 | y <- array(rnorm(length(x)), dim(x)) 98 | x[] <- y 99 | 100 | output <- filearray_create(temp_path(check = TRUE), dimension = dim(x), initialize = FALSE, partition_size = 4L) 101 | 102 | f <- function(input){ 103 | # get locational data 104 | if(is.list(input)){ 105 | location_data <- input[[1]] 106 | } else { 107 | location_data <- input 108 | } 109 | 110 | dim(location_data) <- c(28, 100, 301) 111 | 112 | # collapse over first 50 time points for 113 | # each trial, and marker 114 | baseline <- apply(location_data[,,1:50], c(1,2), mean) 115 | 116 | # calibrate 117 | 
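        # percent signal change relative to the per-(trial, marker) baseline:
        # (value / baseline - 1) * 100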
calibrated <- sweep(location_data, c(1,2), baseline, 118 | FUN = function(data, bl){ 119 | (data / bl - 1) * 100 120 | }) 121 | return(calibrated) 122 | } 123 | 124 | fmap(x, f, .y = output, .buffer_count = 4) 125 | 126 | b <- apply(y, 4, f) 127 | dim(b) <- dim(y) 128 | 129 | expect_equal(output[], b) 130 | 131 | d <- fmap2(x, f, .buffer_count = 4, .simplify = TRUE) 132 | expect_equal(d, b) 133 | 134 | x$delete() 135 | output$delete() 136 | clear_cache() 137 | }) 138 | 139 | 140 | test_that("fwhich", { 141 | x <- filearray_create(temp_path(check = TRUE), dimension = c(28, 100, 301, 4), initialize = FALSE, partition_size = 3L, type = "complex") 142 | dnames <- list( 143 | Trial = sample(c("A", "B"), 28, replace = TRUE), 144 | Marker = 1:100, 145 | Time = seq(-1,2,0.01), 146 | Location = 1:4 147 | ) 148 | dimnames(x) <- dnames 149 | 150 | y <- array(rnorm(length(x)), dim(x)) 151 | x[] <- y 152 | x[1,c(1,3),301,4] <- c(80 + 10i, 80 + 10i) 153 | y <- x[] 154 | 155 | idx <- fwhich(x, val = 80 + 10i, arr.ind = FALSE, ret.values = FALSE) 156 | expect_equal(idx, which(y == (80 + 10i))) 157 | 158 | idx <- fwhich(x, val = 80 + 10i, arr.ind = FALSE, ret.values = TRUE) 159 | expect_equal(attr(idx, "values"), rep(80 + 10i, length(idx))) 160 | 161 | idx1 <- fwhich(x, val = 80 + 10i, arr.ind = TRUE, ret.values = TRUE) 162 | idx2 <- fwhich(y, val = 80 + 10i, arr.ind = TRUE, ret.values = TRUE) 163 | expect_equal(idx1, idx2) 164 | 165 | # val is a function 166 | impl <- function(z) { Re(z) > 4 } 167 | idx1 <- fwhich(x, impl, arr.ind = FALSE, ret.values = TRUE) 168 | idx2 <- fwhich(x[], impl, arr.ind = FALSE, ret.values = TRUE) 169 | expect_equal(idx1, idx2) 170 | 171 | impl <- function(z) { Im(z) < -100 } 172 | idx1 <- fwhich(x, impl, arr.ind = TRUE, ret.values = TRUE) 173 | idx2 <- fwhich(x[], impl, arr.ind = TRUE, ret.values = TRUE) 174 | expect_equal(idx1, idx2) 175 | 176 | x$delete() 177 | }) 178 | 179 | 180 | 181 | -------------------------------------------------------------------------------- /R/helpers.R: -------------------------------------------------------------------------------- 1 | # ---- UUID generator ------------------------------------------------------- 2 | new_uuid <- function(prefix = 0L) { 3 | sprintf("%04d-%s", prefix, uuid::UUIDgenerate(output = "string")) 4 | } 5 | 6 | # ---- Index helpers -------------------------------------------------------- 7 | 8 | # Check if two arrays share the same dimension 9 | is_same_dim <- function(x, y) { 10 | dx <- dim(x) 11 | if(length(dx) >= 2L) { 12 | dy <- dim(y) 13 | if(length(dx) != length(dy)) { return(FALSE) } 14 | # valid # of margins and dimensions are consistent 15 | if(all(dx == dy)) { return(TRUE) } 16 | } else if( length(x) == length(y) ) { 17 | # dx dy might not exist, check length 18 | return(TRUE) 19 | } 20 | return(FALSE) 21 | } 22 | 23 | # Guess partition size from given dimensions and element size. 
24 | # This function is used to estimate a decent partition size when creating 25 | # arrays 26 | guess_partition <- function(dim, elem_size){ 27 | last_margin <- dim[[length(dim)]] 28 | unit_size <- prod(dim) / last_margin * elem_size 29 | 30 | # 1: partition size cannot go beyond 1GB 31 | max_ <- floor(2^30 / unit_size) 32 | if(max_ <= 1L){ 33 | return(1L) 34 | } 35 | # 2: n partitions <= 100 36 | if(last_margin <= 100){ 37 | return(1L) 38 | } 39 | # 3: at most max_ units, at least fct units 40 | fct <- ceiling(last_margin / max_) 41 | if(fct > 50){ 42 | return(max_) 43 | } 44 | while(fct <= 50){ 45 | max_ <- max_ - 1L 46 | fct <- ceiling(last_margin / max_) 47 | if(max_ <= 1L){ 48 | return(1L) 49 | } 50 | } 51 | return(max_) 52 | } 53 | 54 | 55 | guess_fmap_buffer_size <- function(dim, element_size = 16L) { 56 | buffer_len <- get_buffer_size() / element_size 57 | buffer_large <- get_buffer_size() * 16L # default is 8.5MB, 1024 x 1024 58 | 59 | 60 | dimcprod <- cumprod(dim) 61 | len <- dimcprod[[length(dimcprod)]] 62 | min_partition_size <- dimcprod[[length(dimcprod) - 1]] 63 | 64 | # tiny array 65 | if( buffer_len >= len ) { return( as.integer(len) ) } 66 | 67 | # small array 68 | if( buffer_large >= min_partition_size ) { return( as.integer(min_partition_size) ) } 69 | 70 | # large array 71 | dimcprod <- dimcprod[dimcprod <= buffer_large] 72 | if(!length(dimcprod)) { return( dim[[1]] * dim[[2]] ) } 73 | 74 | # mid array 75 | unit_size <- dimcprod[[length(dimcprod)]] 76 | fct <- floor(buffer_large / unit_size) 77 | if(fct < 1) { fct <- 1 } 78 | return( as.integer(unit_size * fct) ) 79 | } 80 | 81 | common_fmap_buffer_count <- function(..., .list = NULL) { 82 | stopifnot(...length() + length(.list) > 0) 83 | buffer_large <- get_buffer_size() * 16L 84 | sizes <- vapply(c(list(...), .list), function(x){ 85 | as.double(prod(x)) 86 | }, FUN.VALUE = 0.0) 87 | if(length(sizes) == 1L) { return( sizes ) } 88 | if(all(sizes <= buffer_large)) { return(1) } 89 | 90 | gcd <- Reduce(function(a, b) { 91 | if( a <= 0 || b <= 0 ) { 92 | stop("Cannot find proper input buffer size when data has zero length") 93 | } 94 | rem <- a %% b 95 | if( rem == 0 ) { return( b ) } 96 | return( Recall(b, rem) ) 97 | }, sizes) 98 | max_buffer <- max(sizes / gcd) 99 | 100 | max_mult <- floor(buffer_large / max_buffer) 101 | if(max_mult <= 1) { return(gcd) } 102 | 103 | guess_factors <- seq_len(min(max_mult, gcd)) 104 | guess_factors <- guess_factors[floor(gcd / guess_factors) * guess_factors == gcd] 105 | return(gcd / max(guess_factors)) 106 | } 107 | 108 | validate_fmap_buffer_count <- function(count, input_lens) { 109 | count <- count[[1]] 110 | if( is.na(count) || count != round(count) ) { 111 | return(FALSE) 112 | } 113 | 114 | sizes <- input_lens[input_lens > 0] 115 | if( !length(sizes) ) { return(TRUE) } 116 | 117 | # # elems in buffer is input_lens / count 118 | buffer_nelems <- input_lens / count 119 | 120 | if(any( buffer_nelems != round(buffer_nelems))) { 121 | return(FALSE) 122 | } 123 | return(structure(TRUE, buffer_nelems = buffer_nelems)) 124 | } 125 | 126 | # ---- data type helpers ---------------------------------------------------- 127 | 128 | is_filearray <- function(object, proxy_ok = TRUE){ 129 | if(!isS4(object)){ return(FALSE) } 130 | if( !proxy_ok && is_fileproxy(object) ) { 131 | return( FALSE ) 132 | } 133 | return(inherits(object, c("FileArray", "FileArrayProxy"))) 134 | } 135 | 136 | is_fileproxy <- function(object){ 137 | if(!isS4(object)){ return(FALSE) } 138 | return(inherits(object, 
"FileArrayProxy")) 139 | } 140 | 141 | 142 | # Guess output data types for meth operators such as \code{+-*/} 143 | operation_output_type <- function( 144 | type1, type2, 145 | logical = c("integer", "logical"), raw = c("error", "integer"), 146 | float = getOption("filearray.operator.precision", "double") 147 | ) { 148 | 149 | raw <- match.arg(raw) 150 | has_raw <- identical(type1, "raw") || identical(type2, "raw") 151 | 152 | if( has_raw && identical(raw, "error") ) { 153 | stop("non-numeric argument to binary operator") 154 | } 155 | 156 | if( identical(type1, "complex") ) { return("complex") } 157 | if( identical(type2, "complex") ) { return("complex") } 158 | 159 | if( identical(type1, "double") ) { return(float) } 160 | if( identical(type2, "double") ) { return(float) } 161 | 162 | if( identical(type1, "float") ) { return(float) } 163 | if( identical(type2, "float") ) { return(float) } 164 | 165 | if( identical(type1, "integer") ) { return("integer") } 166 | if( identical(type2, "integer") ) { return("integer") } 167 | 168 | if( identical(type1, "raw") ) { return("integer") } 169 | if( identical(type2, "raw") ) { return("integer") } 170 | 171 | logical <- match.arg(logical) 172 | if( identical(type1, "logical") ) { return(logical) } 173 | if( identical(type2, "logical") ) { return(logical) } 174 | 175 | stop("unrecognized combination of types: ", type1, ", ", type2) 176 | } 177 | 178 | 179 | 180 | # ---- function calls --------------------------------------------------- 181 | parent_call <- function(def, deparse = FALSE, env = parent.frame()) { 182 | tryCatch({ 183 | call <- with(env, {match.call(expand.dots = FALSE)}) 184 | if(!missing(def)) { 185 | def <- substitute(def) 186 | call[[1]] <- def 187 | } 188 | if(deparse) { 189 | call <- deparse1(call) 190 | } 191 | call 192 | }, error = function(e) { NULL }) 193 | } -------------------------------------------------------------------------------- /inst/include/TinyParallel/Timer.h: -------------------------------------------------------------------------------- 1 | #ifndef __FILEARRAY_PARALLEL_TIMER__ 2 | #define __FILEARRAY_PARALLEL_TIMER__ 3 | 4 | namespace TinyParallel { 5 | typedef uint64_t nanotime_t; 6 | 7 | template 8 | class ProportionTimer { 9 | public: 10 | ProportionTimer() : 11 | timer(), n(0), id(0) 12 | {} 13 | 14 | ProportionTimer( nanotime_t origin, int id_ ) : 15 | timer(origin), n(0), id(id_) 16 | {} 17 | 18 | inline operator SEXP() const { 19 | Rcpp::NumericVector out = (SEXP)timer ; 20 | out.attr("n") = n ; 21 | return out ; 22 | } 23 | 24 | inline nanotime_t origin() const{ 25 | return timer.origin() ; 26 | } 27 | 28 | inline int get_n() const { 29 | return n ; 30 | } 31 | 32 | inline void step( const std::string& name) { 33 | timer.step(name) ; 34 | } 35 | 36 | Timer timer ; 37 | int n ; 38 | int id ; 39 | 40 | } ; 41 | 42 | template 43 | class SingleTimer { 44 | public: 45 | SingleTimer() : timer(){} 46 | 47 | inline operator SEXP(){ 48 | Rcpp::List out = Rcpp::List::create(timer) ; 49 | out.attr("class") = Rcpp::CharacterVector::create( "SingleTimer", "Timer" ); 50 | return out ; 51 | } 52 | 53 | inline void step( const char* name ){ 54 | timer.step(name) ; 55 | } 56 | 57 | private: 58 | Timer timer ; 59 | } ; 60 | 61 | template 62 | class FixedSizeTimers { 63 | public: 64 | FixedSizeTimers( int n, int ndata_ ) : 65 | timers(n), ndata(ndata_) 66 | {} 67 | 68 | inline ProportionTimer& get(int i) { 69 | return timers[i] ; 70 | } 71 | 72 | inline ProportionTimer& operator[](int i){ 73 | return timers[i]; 74 | } 75 
| 76 | inline operator SEXP(){ 77 | Rcpp::List out = wrap(timers) ; 78 | out.attr("class") = Rcpp::CharacterVector::create("FixedSizeTimers", "Timer") ; 79 | out.attr("n") = ndata ; 80 | return out ; 81 | } 82 | 83 | private: 84 | std::vector< ProportionTimer > timers ; 85 | int ndata ; 86 | } ; 87 | 88 | template 89 | class TimersList { 90 | public: 91 | 92 | TimersList(int n_): 93 | timers(), origin(Rcpp::get_nanotime()), n(n_) 94 | { 95 | timers.push_back( ProportionTimer( origin, 0 ) ) ; 96 | childs.push_back( std::vector() ); 97 | } 98 | 99 | ProportionTimer& front(){ 100 | return timers.front() ; 101 | } 102 | 103 | ProportionTimer& get_new_timer(int parent_id){ 104 | Locker lock(mutex) ; 105 | int id = timers.size() ; 106 | timers.push_back( ProportionTimer( origin, id ) ) ; 107 | childs.push_back( std::vector() ); 108 | std::list< std::vector >::iterator it = childs.begin() ; 109 | for(int i=0; ipush_back(id) ; 111 | return timers.back() ; 112 | } 113 | 114 | inline operator SEXP(){ 115 | int nt = timers.size() ; 116 | Rcpp::List data(nt) ; 117 | typename std::list >::const_iterator timers_it = timers.begin() ; 118 | std::list >::const_iterator childs_it = childs.begin() ; 119 | for( int i=0; i > timers ; 131 | std::list< std::vector > childs ; 132 | Mutex mutex ; 133 | nanotime_t origin ; 134 | int n ; 135 | 136 | private: 137 | TimersList( const TimersList& ) ; 138 | 139 | } ; 140 | 141 | template 142 | class TimedReducer : public Worker { 143 | public: 144 | typedef TimersList Timers ; 145 | 146 | Reducer* reducer ; 147 | Timers& timers ; 148 | ProportionTimer& timer ; 149 | bool owner ; 150 | 151 | TimedReducer( Reducer& reducer_, Timers& timers_) : 152 | reducer(&reducer_), 153 | timers(timers_), 154 | timer(timers.get_new_timer(0)), 155 | owner(false) 156 | { 157 | timer.step("init"); 158 | } 159 | 160 | TimedReducer( const TimedReducer& other, Split s) : 161 | reducer( new Reducer(*other.reducer, s) ), 162 | timers( other.timers ), 163 | timer( timers.get_new_timer(other.timer.id) ), 164 | owner(true) 165 | { 166 | timer.step("init") ; 167 | } 168 | 169 | ~TimedReducer(){ 170 | if(owner && reducer) { 171 | delete reducer ; 172 | } 173 | reducer = 0 ; 174 | } 175 | 176 | inline void operator()( size_t begin, size_t end){ 177 | timer.n += (end-begin) ; 178 | timer.step("start") ; 179 | reducer->operator()(begin, end) ; 180 | timer.step("work") ; 181 | } 182 | 183 | inline void join(const TimedReducer& rhs){ 184 | rhs.timer.step("start") ; 185 | reducer->join(*rhs.reducer) ; 186 | rhs.timer.step("join") ; 187 | } 188 | 189 | inline Rcpp::List get() const { 190 | timers.front().step("start") ; 191 | OUT out = reducer->get() ; 192 | timers.front().step("structure") ; 193 | 194 | Rcpp::List res = Rcpp::List::create( (SEXP)timers, out ); 195 | return res ; 196 | } 197 | 198 | private: 199 | // just to be on the safe side, making sure these are not called 200 | TimedReducer() ; 201 | TimedReducer( const TimedReducer& ) ; 202 | } ; 203 | 204 | 205 | } // namespace TinyParallel 206 | 207 | 208 | #endif // __FILEARRAY_PARALLEL_COMMON__ 209 | -------------------------------------------------------------------------------- /R/mapreduce.R: -------------------------------------------------------------------------------- 1 | #' @title A map-reduce method to iterate blocks of file-array data with little memory usage 2 | #' @param x a file array object 3 | #' @param map mapping function that receives 3 arguments; see 'Details' 4 | #' @param reduce \code{NULL}, or a function that takes a list 
as input 5 | #' @param buffer_size control how we split the array; see 'Details' 6 | #' @param ... passed to other methods 7 | #' @return If \code{reduce} is \code{NULL}, return the mapped results, otherwise 8 | #' return the reduced results from the \code{reduce} function 9 | #' 10 | #' @details When handling out-of-memory arrays, it is recommended to load 11 | #' one block of the array at a time and operate on the block level. See 12 | #' \code{\link{apply}} for an implementation. When an array is too large, 13 | #' and when there are too many blocks, this operation can become 14 | #' very slow if computer memory is low, 15 | #' because R will perform garbage collection frequently. 16 | #' Implemented in \code{C++}, \code{mapreduce} creates a buffer to store 17 | #' the block data. By reusing the memory over and over again, it is possible 18 | #' to iterate through the array with minimal garbage collection. Many 19 | #' statistics, including \code{min}, \code{max}, \code{sum}, and 20 | #' \code{mean}, can be calculated efficiently in this 21 | #' way. 22 | #' 23 | #' The function \code{map} takes up to three arguments: \code{data} (mandatory), 24 | #' \code{size} (optional), and \code{first_index} (optional). 25 | #' The \code{data} is the buffer, 26 | #' whose length is consistent across iterations. \code{size} indicates 27 | #' the effective size of the buffer. If the partition size 28 | #' is not divisible by the buffer size, only the first \code{size} elements of 29 | #' the data come from the array, and the remaining elements will be \code{NA}. 30 | #' This situation can only occur when \code{buffer_size} is manually 31 | #' specified. By default, all of \code{data} belongs to the array. 32 | #' The last argument \code{first_index} is the index of the first element 33 | #' \code{data[1]} in the whole array. It is useful when positional data 34 | #' is needed. 35 | #' 36 | #' The buffer size, specified by \code{buffer_size}, is an 37 | #' additional optional argument in \code{...}. Its default is \code{NA}, 38 | #' and it will be calculated automatically. If manually specified, a 39 | #' large buffer size is desirable to speed up the calculation. 40 | #' The default buffer size will not exceed \eqn{nThreads x 2MB}, where 41 | #' \code{nThreads} is the number of threads set by \code{\link{filearray_threads}}. 42 | #' When the partition length is not divisible by the buffer size, instead of 43 | #' trimming the buffer, \code{NA}s will be used to fill the rest of the buffer 44 | #' before it is passed to the \code{map} function; see the previous paragraph for treatments. 45 | #' 46 | #' The function \code{mapreduce} ignores missing partitions. That means 47 | #' if a partition is missing, its data will not be read nor passed to 48 | #' the \code{map} function. Please run \code{x$initialize_partition()} to make sure 49 | #' partition files exist. 
50 | #' 51 | #' @examples 52 | #' 53 | #' 54 | #' x <- filearray_create(tempfile(), c(100, 100, 10)) 55 | #' x[] <- rnorm(1e5) 56 | #' 57 | #' ## calculate summation 58 | #' # identical to sum(x[]), but is more feasible in large cases 59 | #' 60 | #' mapreduce(x, map = function(data, size){ 61 | #' # make sure `data` is all from array 62 | #' if(length(data) != size){ 63 | #' data <- data[1:size] 64 | #' } 65 | #' sum(data) 66 | #' }, reduce = function(mapped_list){ 67 | #' do.call(sum, mapped_list) 68 | #' }) 69 | #' 70 | #' 71 | #' ## Find elements are less than -3 72 | #' positions <- mapreduce( 73 | #' x, 74 | #' map = function(data, size, first_index) { 75 | #' if (length(data) != size) { 76 | #' data <- data[1:size] 77 | #' } 78 | #' which(data < -3) + (first_index - 1) 79 | #' }, 80 | #' reduce = function(mapped_list) { 81 | #' do.call(c, mapped_list) 82 | #' } 83 | #' ) 84 | #' 85 | #' if(length(positions)){ 86 | #' x[[positions[1]]] 87 | #' } 88 | #' 89 | #' 90 | #' @export 91 | setGeneric("mapreduce", function(x, map, reduce, ...){ 92 | standardGeneric("mapreduce") 93 | }) 94 | 95 | buffer_mapreduce <- function(x, map, reduce = NULL, buffer_size = NA){ 96 | # TODO: edit for proxy arrays 97 | if(!x$valid()){ 98 | stop("Invalid file array") 99 | } 100 | if( is_fileproxy(x) ) { 101 | x <- fa_eval_ops(x) 102 | } 103 | 104 | current_bsz <- get_buffer_size() 105 | on.exit({ 106 | set_buffer_size(current_bsz) 107 | }) 108 | set_buffer_size(max_buffer_size()) 109 | 110 | argnames <- names(formals(map)) 111 | if(length(argnames) < 3 && !'...' %in% argnames){ 112 | if( length(argnames) == 1 ){ 113 | map_ <- function(data, size, idx){ 114 | map(data) 115 | } 116 | } else { 117 | map_ <- function(data, size, idx){ 118 | map(data, size) 119 | } 120 | } 121 | } else { 122 | map_ <- function(data, size, idx){ 123 | map(data, size, idx) 124 | } 125 | } 126 | 127 | if(is.function(reduce) && !length(formals(reduce))){ 128 | stop("Reduce function must contain at least one argument") 129 | } 130 | 131 | dim <- x$dimension() 132 | cum_partlen <- x$.partition_info[,3] 133 | sexp_type <- x$sexp_type() 134 | 135 | if(is.na(buffer_size)){ 136 | elem_size <- get_elem_size(typeof(x)) 137 | mbsz <- max_buffer_size() * getThreads(FALSE) / elem_size 138 | 139 | sel <- cumprod(dim) <= mbsz 140 | if(any(sel)){ 141 | buffer_size <- prod(dim[sel]) 142 | } else { 143 | buffer_size <- dim[[1]] 144 | } 145 | } 146 | 147 | if(buffer_size < 1){ 148 | buffer_size <- 1 149 | } 150 | 151 | # filebase <- paste0(x$.filebase, x$.sep) 152 | # FARR_buffer_mapreduce(filebase, map_, reduce, dim, 153 | # cum_partlen, buffer_size, sexp_type) 154 | FARR_buffer_mapreduce( 155 | x$.filebase, 156 | map_, reduce, 157 | buffer_size 158 | ) 159 | } 160 | 161 | #' @rdname mapreduce 162 | #' @export 163 | setMethod( 164 | mapreduce, signature(x = "FileArray", reduce = "function"), 165 | function(x, map, reduce, buffer_size = NA, ...){ 166 | buffer_mapreduce(x, map, reduce, buffer_size) 167 | } 168 | ) 169 | 170 | #' @rdname mapreduce 171 | #' @export 172 | setMethod( 173 | mapreduce, signature(x = "FileArray", reduce = "NULL"), 174 | function(x, map, reduce, buffer_size = NA, ...){ 175 | buffer_mapreduce(x, map, NULL, buffer_size, ...) 176 | } 177 | ) 178 | 179 | #' @rdname mapreduce 180 | #' @export 181 | setMethod( 182 | mapreduce, signature(x = "FileArray", reduce = "missing"), 183 | function(x, map, reduce, buffer_size = NA, ...){ 184 | buffer_mapreduce(x, map, NULL, buffer_size, ...) 
185 | } 186 | ) 187 | -------------------------------------------------------------------------------- /R/methods-subset.R: -------------------------------------------------------------------------------- 1 | # DIPSAUS DEBUG START 2 | # verbose <- TRUE 3 | # z = filearray_create(temp_path(), c(2,3,4)) 4 | # z[] <- 1:24 5 | # aaa <- z+1 6 | # print(aaa[,,4]) 7 | # idx <- as_filearray(z[] > 2) 8 | # 9 | # print(z[idx]) 10 | # print(aaa[idx]) 11 | 12 | # Normal subset via indices 13 | fa_subset1 <- function(x, ..., drop = TRUE, reshape = NULL, strict = TRUE, dimnames = TRUE, split_dim = 0) { 14 | if(!x$valid()){ 15 | stop("Invalid file array") 16 | } 17 | drop <- isTRUE(drop) 18 | # file <- temp_path(); x <- filearray_create(file, c(300, 400, 100, 1)) 19 | arglen <- ...length() 20 | elem_size <- x$element_size() 21 | dim <- x$dimension() 22 | 23 | listOrEnv <- list() 24 | if(arglen == 1){ 25 | tmp <- tryCatch({ 26 | ...elt(1) 27 | }, error = function(e){ 28 | NULL 29 | }) 30 | if(length(tmp)){ 31 | stop("Subset FileArray only allows x[], x[i:j] or x[i,j,...] (single logical index like x[c(TRUE, ...)] is not allowed)") 32 | } 33 | 34 | 35 | } else if(arglen > 1){ 36 | if(arglen != length(dim)){ 37 | stop("Subset FileArray dimension mismatch.") 38 | } 39 | missing_args <- check_missing_dots(environment()) 40 | 41 | for(ii in seq_len(arglen)){ 42 | if( missing_args[[ii]] ){ 43 | listOrEnv[[ii]] <- seq_len(dim[[ii]]) 44 | } else { 45 | tmp <- ...elt(ii) 46 | if(!length(tmp)){ 47 | tmp <- integer(0L) 48 | } else if(is.logical(tmp)){ 49 | if(length(tmp) > dim[[ii]]){ 50 | stop("(subscript) logical subscript too long") 51 | } 52 | tmp <- rep(tmp, ceiling(dim[[ii]] / length(tmp))) 53 | tmp <- tmp[seq_len(dim[[ii]])] 54 | tmp <- seq_along(tmp)[tmp] 55 | } 56 | listOrEnv[[ii]] <- tmp 57 | } 58 | } 59 | } 60 | 61 | # guess split dim 62 | max_buffer <- get_buffer_size() / elem_size 63 | 64 | if(length(listOrEnv) == length(dim)){ 65 | idxrange <- sapply(listOrEnv, function(x){ 66 | if(!length(x) || all(is.na(x))){ return(1L) } 67 | rg <- range(x, na.rm = TRUE) 68 | return(rg[2] - rg[1] + 1) 69 | }) 70 | } else { 71 | idxrange <- dim 72 | } 73 | split_dim <- as.integer(split_dim) 74 | if(is.na(split_dim) || split_dim <= 0 || split_dim >= length(dim)){ 75 | # worst-case time-complexity 76 | time_complexity <- 77 | sapply(seq_len(length(dim) - 1), function(split_dim) { 78 | dim[[length(dim)]] <- 1 79 | idx1dim <- dim[seq_len(split_dim)] 80 | idx1dim[[split_dim]] <- idxrange[[split_dim]] 81 | idx1len <- prod(idx1dim) 82 | idx2len <- prod(dim[-seq_len(split_dim)]) 83 | buffer_sz <- 84 | ifelse(idx1len > max_buffer, max_buffer, idx1len) 85 | nloops <- ceiling(idx1len / buffer_sz) 86 | (idx1len * nloops + idx2len) * idx2len 87 | }) 88 | split_dim <- which.min(time_complexity) 89 | split_dim <- split_dim[[length(split_dim)]] 90 | } 91 | 92 | 93 | FARR_subset2( 94 | filebase = x$.filebase, 95 | listOrEnv = listOrEnv, 96 | reshape = reshape, 97 | drop = drop, 98 | use_dimnames = isTRUE(dimnames), 99 | thread_buffer = get_buffer_size(), 100 | split_dim = split_dim, 101 | strict = isTRUE(strict) 102 | ) 103 | 104 | } 105 | 106 | # Special subset using another array 107 | fa_subset2 <- function(x, i, ...) 
{ 108 | 109 | single_index <- FALSE 110 | if(!is_filearray(i)) { 111 | if(is.array(i) && is.logical(i)) { 112 | if(!is_same_dim(x, i)) { 113 | stop("x[i]: `x` and `i` must share the same dimension when i is a logical array.") 114 | } 115 | i <- as_filearray(i) 116 | } else if (is.numeric(i) && is.vector(i)) { 117 | single_index <- TRUE 118 | } else { 119 | stop("x[i]: Cannot determine subset mode from `i`: must be a logical array or integer vector") 120 | } 121 | } 122 | 123 | if( single_index ) { 124 | stop("Single vector indexing (e.g. x[c(1,2,3,4,...)]) hasn't been implemented yet.") 125 | # re <- fastmap::fastqueue() 126 | # idx <- fastmap::fastmap() 127 | # idx$set("start_idx", 0L) 128 | # idx$set("idx_left", length(i)) 129 | # 130 | # 131 | # fa_eval_ops(x, addon = function(env, data, uuid) { 132 | # idx_left <- idx$get("idx_left") 133 | # 134 | # if( idx_left < 0 ) { 135 | # return(NULL) 136 | # } 137 | # start_idx <- idx$get("start_idx") 138 | # 139 | # vec <- env[[ uuid ]] 140 | # 141 | # end_idx <- start_idx + length(vec) 142 | # sel <- i[ i > start_idx & i <= end_idx] - start_idx 143 | # if(length(sel)) { 144 | # if(idx_left <= length(sel)) { 145 | # sel <- sel[seq_len(idx_left)] 146 | # re$add(vec[sel]) 147 | # idx_left <- 0L 148 | # } else { 149 | # re$add(vec[sel]) 150 | # } 151 | # idx$set("idx_left", idx_left - length(sel)) 152 | # } 153 | # idx$set("start_idx", start_idx) 154 | # }) 155 | # re <- unlist(re$as_list()) 156 | # 157 | } else { 158 | re <- unlist( 159 | fmap2( 160 | x = list(x, i), 161 | fun = function(v) { 162 | v[[1]][v[[2]]] 163 | }, 164 | .simplify = FALSE 165 | ) 166 | ) 167 | } 168 | 169 | expected_mode <- operation_output_type( 170 | typeof(x), typeof(x), float = "double", logical = "logical") 171 | if(!identical(expected_mode, mode(re))) { 172 | mode(re) <- expected_mode 173 | } 174 | return(re) 175 | 176 | } 177 | 178 | #' @describeIn S3-filearray subset array 179 | #' @export 180 | `[.FileArray` <- function(x, i, ..., drop = TRUE, reshape = NULL, strict = TRUE, dimnames = TRUE, split_dim = 0) { 181 | x <- fa_eval_ops(x) 182 | if(missing(i)) { 183 | return(fa_subset1(x, , ..., drop = drop, reshape = reshape, strict = strict, dimnames = dimnames, split_dim = split_dim)) 184 | } else { 185 | if(is_filearray(i) || is.array(i) || 186 | ( ...length() == 0 && !is.logical(i) )) { 187 | return(fa_subset2(x, i)) 188 | } else { 189 | return(fa_subset1(x, i, ..., drop = drop, reshape = reshape, strict = strict, dimnames = dimnames, split_dim = split_dim)) 190 | } 191 | } 192 | } 193 | 194 | 195 | 196 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # File-Backed Array for Out-of-memory Computation 4 | 5 | 6 | 7 | [![R-check](https://github.com/dipterix/filearray/actions/workflows/R-CMD-check.yaml/badge.svg)](https://github.com/dipterix/filearray/actions/workflows/R-CMD-check.yaml) 8 | [![CRAN status](https://www.r-pkg.org/badges/version/filearray)](https://CRAN.R-project.org/package=filearray) 9 | [![Develop](https://dipterix.r-universe.dev/badges/filearray)](https://dipterix.r-universe.dev/) 10 | 11 | 12 | Stores large arrays in files to avoid occupying large memories. Implemented with super fast gigabyte-level multi-threaded reading/writing via `OpenMP`. Supports multiple non-character data types (double, float, integer, complex, logical and raw). 
13 | 14 | ![](https://raw.githubusercontent.com/dipterix/filearray/main/adhoc/readme-speed.png) 15 | 16 | Speed comparisons with `lazyarray` (`zstd`-compressed out-of-memory array), and in-memory operation. The speed test was conducted on a `MacBook Air (M1, 2020, 8GB RAM)` with 8 threads. `filearray` is uniformly faster than `lazyarray`. Random access has almost the same speed as the native array operation in R. *(The actual speed may vary depending on the storage type and memory size)* 17 | 18 | ## Installation 19 | 20 | ```r 21 | install.packages("filearray") 22 | ``` 23 | 24 | ### Install Develop Version 25 | 26 | The internal functions are written in `C++`. To avoid compiling the package, you can install from my personal repository. It's automatically updated every hour. Currently available on `Windows` and `osx (Intel chip)` only. 27 | 28 | ```r 29 | options(repos = c( 30 | dipterix = 'https://dipterix.r-universe.dev', 31 | CRAN = 'https://cloud.r-project.org')) 32 | 33 | install.packages('filearray') 34 | ``` 35 | 36 | Alternatively, you can compile from the `Github` repository. This requires proper compilers (`rtools` on `windows`, or `xcode-select --install` on `osx`, or `build-essentials` on `linux`). 37 | 38 | ```r 39 | # install.packages("remotes") 40 | remotes::install_github("dipterix/filearray") 41 | ``` 42 | 43 | ## Basic Usage 44 | 45 | #### Create/load file array 46 | 47 | ```r 48 | library(filearray) 49 | file <- tempfile() 50 | x <- filearray_create(file, c(100, 100, 100, 100)) 51 | 52 | # load existing 53 | x <- filearray_load(file) 54 | ``` 55 | 56 | See more: `help("filearray")` 57 | 58 | #### Assign & subset array 59 | 60 | ```r 61 | x[,,,1] <- rnorm(1e6) 62 | x[1:10,1,1,1] 63 | ``` 64 | 65 | #### Generics 66 | 67 | ```r 68 | typeof(x) 69 | max(x, na.rm = TRUE) 70 | apply(x, 3, min, na.rm = TRUE) 71 | 72 | val = x[1,1,5,1] 73 | fwhich(x, val, arr.ind = TRUE) 74 | ``` 75 | 76 | See more: `help("S3-filearray")`, `help("fwhich")` 77 | 78 | #### Map-reduce 79 | 80 | Process segments of the array and reduce the results to save memory. 81 | 82 | ``` 83 | # Identical to sum(x, na.rm = TRUE) 84 | mapreduce(x, 85 | map = \(data){ sum(data, na.rm = TRUE) }, 86 | reduce = \(mapped){ do.call(sum, mapped) }) 87 | ``` 88 | 89 | See more: `help("mapreduce")` 90 | 91 | #### Collapse 92 | 93 | Transform data, and collapse (calculate the sum or mean) along margins. 94 | 95 | ``` 96 | a <- x$collapse(keep = 4, method = "mean", transform = "asis") 97 | 98 | # equivalent to 99 | b <- apply(x[], 4, mean) 100 | 101 | a[1] - b[1] 102 | ``` 103 | 104 | Available `transform` options for double/integer numbers are: 105 | 106 | * `asis`: no transform 107 | * `10log10`: `10 * log10(v)` 108 | * `square`: `v * v` 109 | * `sqrt`: `sqrt(v)` 110 | 111 | For complex numbers, `transform` is a little bit different: 112 | 113 | * `asis`: no transform 114 | * `10log10`: `10 * log10(|x|^2)` (power to decibel unit) 115 | * `square`: `|x|^2` 116 | * `sqrt`: `|x|` (modulus) 117 | * `normalize`: `x / |x|` (unit length) 118 | 119 | ## Notes 120 | 121 | #### I. Notes on precision 122 | 123 | 1. `complex` numbers: In native `R`, complex numbers are a combination of two `double` numbers - real and imaginary (16 bytes total). In `filearray`, complex numbers are coerced to two `float` numbers, storing each complex value in 8 bytes. This conversion gains speed but loses precision at around the 8th significant digit. For example, `1.0000001` will be stored as `1`, or `123456789` will be stored as `123456792` (the first 7 digits are accurate). 
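   A minimal illustration of this behavior (a sketch for this note only, not taken from the package documentation; the exact rounded value may vary by platform):

   ```r
   library(filearray)
   x <- filearray_create(tempfile(), c(3, 2), type = "complex")
   x[] <- 123456789 + 0i   # each complex value is stored as two 4-byte floats
   x[[1]]                  # likely prints 123456792+0i -- only ~7 digits survive
   x$delete()
   ```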
124 | 125 | 2. `float` type: Native R does not have a float type; all numeric values are stored in double precision. Since float numbers use half the space, float arrays can be faster when hard drive speed is the bottleneck (see [performance comparisons](https://dipterix.org/filearray/articles/performance.html)). However, coercing double to float comes at a cost: 126 | a). float numbers have less precision 127 | b). float numbers have a smaller range ($3.4\times 10^{38}$) than double ($1.7\times 10^{308}$) 128 | hence use with caution when the data needs high precision or the maximum value is very large. 129 | 130 | 3. `collapse` function: when the data range is large (say `x[[1]]=1`, but `x[[2]]=10^20`), the `collapse` method might lose precision. This is because `double` only uses 8 bytes of memory. When calculating summations, R internally uses `long double` to prevent precision loss, but the current `filearray` implementation uses `double`, causing floating-point error around the 16th decimal place. 131 | 132 | #### II. Cold-start vs warm-start 133 | 134 | As of version `0.1.1`, most file read/write operations were switched from `fopen` to memory mapping, for two reasons: to simplify the logic (buffer size, kernel cache...), and to boost writing speed and some types of reading speed. While the speed of reading large blocks of data dropped from 2.4GB/s to 1.7GB/s, the writing speed was boosted from 300MB/s to 700MB/s, and the speed of randomly accessing small slices of data increased from 900MB/s to 2.5GB/s. As a result, some functions can reach very high speeds (close to in-memory calls) while using much less memory. 135 | 136 | The additional performance improvements brought by the memory-mapping approach might be impacted by a "cold" start. When reading/writing files, most modern systems will cache the files so that they can be loaded faster next time; I personally call the first, uncached access a cold start. Memory mapping has a little extra overhead during a cold start, resulting in decreased performance (but it is still fast). Accessing the same data after the cold start is called a warm start. When operating with warm starts, `filearray` is as fast as native R arrays (sometimes even faster due to the indexing method and fewer garbage collections). This means `filearray` reaches its best performance when the arrays are re-used. 137 | 138 | #### III. Using traditional HDD? 139 | 140 | `filearray` relies on `SSD` storage, especially `NVMe SSD`, which allows fast random access to disk addresses. If you use an `HDD`, `filearray` can provide only limited improvement. 141 | 142 | If you use `filearray` to directly access an HDD, please set the number of threads to `1` via `filearray::filearray_threads(1)` at startup, or set the system environment variable `FILEARRAY_NUM_THREADS` to `"1"`. 143 | 144 | 145 | -------------------------------------------------------------------------------- /R/methods-subsetAssign.R: -------------------------------------------------------------------------------- 1 | fa_subsetAssign1 <- function(x, ..., value){ 2 | if(!x$valid()){ 3 | stop("Invalid file array") 4 | } 5 | if(isTRUE(x$.mode == 'readonly')){ 6 | stop("File array is read-only") 7 | } 8 | 9 | buf_bytes <- get_buffer_size() 10 | on.exit({ 11 | set_buffer_size(buf_bytes) 12 | }) 13 | 14 | # parse ... 15 | dim <- x$dimension() 16 | arglen <- ...length() 17 | 18 | 19 | locs <- list() 20 | if(arglen <= 1){ 21 | if( arglen == 1 ){ 22 | missing_args <- check_missing_dots(environment()) 23 | if(!missing_args){ 24 | stop("SubsetAssign FileArray only allows x[] <- v or x[i,j,...] 
<- v (single index not allowed)") 25 | } 26 | } 27 | if(!length(value) %in% c(1, prod(dim))){ 28 | stop("SubsetAssign FileArray `value` length mismatch: `value` length must be either 1 or the same length of the subset.") 29 | } 30 | target_dim <- dim 31 | x$initialize_partition(x$.partition_info[, 1]) 32 | } else if(arglen > 1){ 33 | if(arglen != length(dim)){ 34 | stop("SubsetAssign FileArray dimension mismatch.") 35 | } 36 | 37 | missing_args <- check_missing_dots(environment()) 38 | for(ii in seq_len(arglen)){ 39 | if(missing_args[[ii]]){ 40 | locs[[ii]] <- seq_len(dim[[ii]]) 41 | } else { 42 | tmp <- ...elt(ii) 43 | if( !length(tmp) ){ 44 | return(x) 45 | } 46 | if(any(is.na(tmp))){ 47 | stop("SubsetAssign cannot contain duplicated or invalid indices.") 48 | } 49 | if(is.logical(tmp)){ 50 | if(length(tmp) > dim[[ii]]){ 51 | stop("(subscript) logical subscript too long") 52 | } 53 | tmp <- rep(tmp, ceiling(dim[[ii]] / length(tmp))) 54 | tmp <- which(tmp[seq_len(dim[[ii]])]) 55 | } else if(any(tmp > dim[[ii]]) || 56 | any(tmp <= 0) || 57 | any(duplicated(tmp))){ 58 | stop("SubsetAssign cannot contain duplicated or invalid indices.") 59 | } 60 | locs[[ii]] <- tmp 61 | } 62 | } 63 | 64 | target_dim <- sapply(locs, length) 65 | if(!length(value) %in% c(1, prod(target_dim))){ 66 | stop("SubsetAssign FileArray `value` length mismatch: `value` length must be either 1 or the same length of the subset.") 67 | } 68 | 69 | # make sure partitions exist 70 | tmp <- locs[[length(locs)]] 71 | sapply(tmp, function(i){ 72 | sel <- x$.partition_info[,3] <= i 73 | if(any(sel)){ 74 | sel <- max(x$.partition_info[sel,1]) 75 | if(x$.partition_info[sel, 3] < i){ 76 | sel <- sel + 1 77 | } 78 | } else { 79 | sel <- 1 80 | } 81 | x$initialize_partition(sel) 82 | }) 83 | } 84 | 85 | if(prod(target_dim) == 0){ 86 | return(invisible(x)) 87 | } 88 | 89 | # decide split_dim 90 | buffer_sz <- buf_bytes / x$element_size() 91 | cprod <- cumprod(dim) 92 | if(length(locs) == length(dim)){ 93 | tmp <- sapply(locs, function(x){ 94 | if(!length(x) || all(is.na(x))){ return(1L) } 95 | rg <- range(x, na.rm = TRUE) 96 | return(rg[2] - rg[1] + 1L) 97 | }) 98 | cprod <- cprod / dim * tmp 99 | } 100 | cprod <- cprod[-length(cprod)] 101 | if(all(cprod > buffer_sz)){ 102 | split_dim <- 1 103 | } else { 104 | split_dim <- max(which(cprod <= buffer_sz)) 105 | } 106 | 107 | # filebase <- paste0(x$.filebase, x$.sep) 108 | # FARR_subset_assign( 109 | # filebase, 110 | # listOrEnv = locs, 111 | # dim = x$dimension(), 112 | # cum_part_sizes = x$.partition_info[, 3], 113 | # split_dim = split_dim, 114 | # type = x$sexp_type(), 115 | # value_ = value 116 | # ) 117 | FARR_subset_assign2( 118 | filebase = x$.filebase, 119 | value = value, 120 | listOrEnv = locs, 121 | split_dim = split_dim, 122 | thread_buffer = buf_bytes 123 | ) 124 | invisible(x) 125 | } 126 | 127 | # setMethod("[<-", 128 | # signature(x = "Filearray"), 129 | # function (x, i, j, ..., value) 130 | # { 131 | # stop("need a definition for the method here") 132 | # } 133 | # ) 134 | 135 | # DIPSAUS DEBUG START 136 | # verbose <- TRUE 137 | # z = filearray_create(temp_path(), c(2,3,4)) 138 | # z[] <- 1:24 139 | # x <- z+1 140 | # i <- as_filearray(z[] < 5) 141 | 142 | # `i` is a filearray 143 | fa_subsetAssign2 <- function(x, i, value, label = "subset-assign (lazy)") { 144 | stopifnot(is_filearray(x) && is_filearray(i)) 145 | 146 | if(typeof(i) != "logical") { 147 | stop("`fa_subsetAssign2`: subset index filearray must be logical") 148 | } 149 | 150 | e1 <- as_filearrayproxy(x) 
151 | e2 <- as_filearrayproxy(i) 152 | uuid1 <- e1$uuid() 153 | uuid2 <- e2$uuid() 154 | e1$link_proxy( e2 ) 155 | 156 | # check input types 157 | out_type <- typeof(e1) 158 | 159 | globals <- fastmap::fastmap() 160 | globals$set("starting_idx", 0L) 161 | value_len <- length(value) 162 | 163 | op_func <- function(value_list, ...) { 164 | data <- value_list[[ uuid1 ]] 165 | 166 | idx <- value_list[[ uuid2 ]] 167 | n_assigned <- sum(idx) 168 | if(n_assigned == 0) { return(data) } 169 | 170 | if( value_len == 1L ) { 171 | data[ idx ] <- value 172 | } else { 173 | starting_idx <- globals$get("starting_idx") 174 | data[ idx ] <- value[starting_idx + seq_len(n_assigned)] 175 | globals$set("starting_idx", starting_idx + n_assigned) 176 | } 177 | 178 | return(data) 179 | } 180 | 181 | e1$add_operator(op_func, 182 | out_type = out_type, 183 | context = "array_subset_assign", 184 | label = label) 185 | 186 | return( e1 ) 187 | } 188 | 189 | 190 | #' @describeIn S3-filearray subset assign array 191 | #' @export 192 | `[<-.FileArray` <- function(x, i, ..., lazy = FALSE, value) { 193 | if(!x$valid()){ 194 | stop("Invalid file array") 195 | } 196 | 197 | if(missing(i)) { 198 | if(is_fileproxy(x) && length(x$.ops) > 0) { 199 | x <- fa_eval_ops(x) 200 | x$.mode <- "readwrite" 201 | } 202 | return(fa_subsetAssign1(x, , ..., value = value)) 203 | } else { 204 | if(is_filearray(i) || is.array(i) || 205 | ( ...length() == 0 && !is.logical(i) )) { 206 | re <- fa_subsetAssign2(x, i, value = value) 207 | if(!lazy) { 208 | re <- fa_eval_ops(re) 209 | re$.mode <- "readwrite" 210 | } 211 | return(re) 212 | } else { 213 | if(is_fileproxy(x) && length(x$.ops) > 0) { 214 | x <- fa_eval_ops(x) 215 | x$.mode <- "readwrite" 216 | } 217 | return(fa_subsetAssign1(x, i, ..., value = value)) 218 | } 219 | } 220 | } -------------------------------------------------------------------------------- /adhoc/readme-figure.R: -------------------------------------------------------------------------------- 1 | set.seed(1) 2 | dim <- c(100,200,200,100) 3 | lazyx <- lazyarray::create_lazyarray( 4 | tempfile(), storage_format = 'double', dim = dim) 5 | lazyx[] <- NA 6 | filex <- filearray::filearray_create( 7 | tempfile(), dim, 'double') 8 | filex$initialize_partition() 9 | 10 | set.seed(1) 11 | tmp <- rnorm(4e7) 12 | res1 <- microbenchmark::microbenchmark( 13 | lazyarray = { 14 | for(i in 1:10){ 15 | lazyx[,,,(i-1)*10 +1:10] <- tmp 16 | } 17 | }, 18 | filearray = { 19 | for(i in 1:10){ 20 | filex[,,,(i-1)*10 +1:10] <- tmp 21 | } 22 | }, times = 5, setup = quote(gc()) 23 | ); res1 24 | 25 | 26 | 27 | speed1 <- sapply(split(res1, res1$expr), function(res){ 28 | speed <- length(filex) *8000 / res$time 29 | c(mean(speed), sd(speed) / 2) 30 | }); speed1 31 | 32 | 33 | res2 <- microbenchmark::microbenchmark( 34 | lazyarray = { 35 | for(i in 1:10){ 36 | lazyx[,,,1:10 + (i-1) * 10] 37 | } 38 | }, 39 | filex = { 40 | for(i in 1:10){ 41 | filex[,,,1:10 + (i-1) * 10] 42 | } 43 | }, times = 5, setup = quote(gc()) 44 | ) 45 | speed2 <- sapply(split(res2, res2$expr), function(res){ 46 | speed <- length(filex) *8000 / res$time 47 | c(mean(speed), sd(speed) / 3) 48 | }); speed2 49 | 50 | 51 | 52 | set.seed(1) 53 | locs <- lapply(dim, function(d){ 54 | sample(1:d, replace = FALSE, size = sample(ifelse(d>50,50,1):d, 1)) 55 | }) 56 | 57 | res3 <- microbenchmark::microbenchmark( 58 | lazyarray = { 59 | lazyx[locs[[1]],locs[[2]],locs[[3]],locs[[4]]] 60 | }, 61 | filearray = { 62 | filex[locs[[1]],locs[[2]],locs[[3]],locs[[4]]] 63 | }, 64 | times = 10, 
setup = quote(gc())) 65 | 66 | speed3 <- sapply(split(res3, res3$expr), function(res){ 67 | speed <- prod(sapply(locs, length)) * 8000 / res$time 68 | c(mean(speed), sd(speed) / 3) 69 | }); speed3 70 | 71 | 72 | z <- filex[] 73 | res5 <- microbenchmark::microbenchmark( 74 | nativeR = { 75 | z[locs[[1]],locs[[2]],locs[[3]],locs[[4]]] 76 | }, 77 | times = 10, setup = quote(gc())) 78 | speed3 <- cbind(speed3, sapply(split(res5, res5$expr), function(res){ 79 | speed <- prod(sapply(locs, length)) * 8000 / res$time 80 | c(mean(speed), sd(speed) / 3) 81 | })) 82 | 83 | 84 | rm(z); gc() 85 | tmp <- rnorm(prod(sapply(locs, length))) 86 | res4 <- microbenchmark::microbenchmark( 87 | lazyarray = { 88 | lazyx[locs[[1]],locs[[2]],locs[[3]],locs[[4]]] <- tmp 89 | }, 90 | filearray = { 91 | filex[locs[[1]],locs[[2]],locs[[3]],locs[[4]]] <- tmp 92 | }, times = 10, setup = quote(gc()) 93 | ) 94 | speed4 <- sapply(split(res4, res4$expr), function(res){ 95 | speed <- prod(sapply(locs, length)) * 8000 / res$time 96 | c(mean(speed), sd(speed) / 3) 97 | }); speed4 98 | 99 | z <- filex[] 100 | res5 <- microbenchmark::microbenchmark( 101 | nativeR = { 102 | z[locs[[1]],locs[[2]],locs[[3]],locs[[4]]] <- tmp 103 | }, times = 10, setup = quote(gc()) 104 | ) 105 | rm(z); gc() 106 | speed4 <- cbind(speed4, sapply(split(res5, res5$expr), function(res){ 107 | speed <- prod(sapply(locs, length)) * 8000 / res$time 108 | c(mean(speed), sd(speed) / 3) 109 | })) 110 | mean(prod(sapply(locs, length)) / 1e8 * 800 / res5$time) * 1e9 111 | 112 | f <- function(){ 113 | 114 | par(mfrow = c(1,4)) 115 | 116 | cols <- ravebuiltins:::group_colors[1:2] 117 | cols2 <- ravebuiltins:::group_colors[1:3] 118 | 119 | speed <- cbind( 120 | speed1[1,], 121 | speed2[1,] 122 | ) / 1024 123 | rownames(speed) <- c("lazyarray", "filearray") 124 | colnames(speed) <- c("Write", "Read") 125 | 126 | txt_cex = 1.2 127 | txt_cex2 = 1.2 128 | 129 | plt <- barplot.default( 130 | speed[,1,drop=FALSE], beside = TRUE, 131 | ylab = "Speed (GB/s)", 132 | col = dipsaus::col2hexStr(cols, alpha = 0.5), 133 | ylim = c(0, 3), las = 1, yaxt = "n", 134 | border = NA, 135 | main = "Write 3GB Data", 136 | cex.names = 1.4, cex.lab = 1.4, cex.main = 1.4 137 | ) 138 | axis(2, 0:3, las = 1) 139 | text.default(x = plt[,], y = speed[,1], 140 | labels = sprintf(c("lazyarray\n%.0f MB/s","filearray\n%.0f MB/s"), speed[,1] * 1024), cex = txt_cex, 141 | col = 'white') 142 | text.default(x = plt[,], y = speed[,1], 143 | labels = c("lazyarray\n","filearray\n"), 144 | cex = txt_cex2, 145 | col = cols) 146 | 147 | 148 | plt <- barplot.default( 149 | speed[,2,drop=FALSE], beside = TRUE, 150 | ylab = "Speed (GB/s)", 151 | col = dipsaus::col2hexStr(cols, alpha = 0.5), 152 | ylim = c(0, 3), las = 1, yaxt = "n", 153 | border = NA, 154 | main = "Read 3GB Data" 155 | ) 156 | axis(2, c(0, 1, 2, 3), las = 1) 157 | text.default(x = plt[,], y = speed[,2], 158 | labels = sprintf(c("lazyarray\n%.0f MB/s","filearray\n%.0f MB/s"), speed[,2] * 1024), cex = txt_cex, 159 | col = 'white') 160 | text.default(x = plt[,], y = speed[,2], 161 | labels = c("lazyarray\n","filearray\n"), 162 | cex = txt_cex2, 163 | col = cols) 164 | 165 | speed <- cbind( 166 | speed3[1,], 167 | speed4[1,] 168 | ) / 1024 169 | rownames(speed) <- c("lazyarray", "filearray", "in-memory") 170 | colnames(speed) <- c("Subset", "SubsetAssign") 171 | plt <- barplot.default( 172 | speed[,1,drop=FALSE], beside = TRUE, 173 | ylab = "Speed (GB/s)", 174 | col = dipsaus::col2hexStr(cols2, alpha = 0.5), 175 | ylim = c(0, 3), las = 1, yaxt = 
"n", 176 | border = NA, 177 | main = "Randomly Subset\n800MB Data" 178 | ) 179 | axis(2, c(0, 1, 2, 3), las = 1) 180 | text.default(x = plt, y = speed[,1], 181 | labels = sprintf( 182 | c("lazyarray\n%.0f MB/s", 183 | "filearray\n%.0f MB/s", 184 | "in-memory\n%.0f MB/s"), 185 | speed[,1]*1024), cex = txt_cex, 186 | col = 'white') 187 | text.default(x = plt[,], y = speed[,1], 188 | labels = c("lazyarray\n", 189 | "filearray\n", 190 | "in-memory\n"), 191 | cex = txt_cex2, 192 | col = cols2) 193 | 194 | 195 | plt <- barplot.default( 196 | speed[,2,drop=FALSE], beside = TRUE, 197 | ylab = "Speed (GB/s)", 198 | col = dipsaus::col2hexStr(cols2, alpha = 0.5), 199 | ylim = c(0, 3), las = 1, yaxt = "n", 200 | border = NA, 201 | main = "Randomly Replace\n800MB Data" 202 | ) 203 | axis(2, c(0, 1, 2, 3), las = 1) 204 | text.default(x = plt, y = speed[,2], 205 | labels = sprintf( 206 | c("lazyarray\n%.0f MB/s", 207 | "filearray\n%.0f MB/s", 208 | "in-memory\n%.0f MB/s"), 209 | speed[,2]*1024), cex = txt_cex, 210 | col = 'white') 211 | text.default(x = plt[,], y = speed[,2], 212 | labels = c("lazyarray\n", 213 | "filearray\n", 214 | "in-memory\n"), 215 | cex = txt_cex2, 216 | col = cols2) 217 | 218 | } 219 | 220 | png("./adhoc/readme-speed.png", width = 4267, height = 1600, res = 300) 221 | f() 222 | dev.off() 223 | -------------------------------------------------------------------------------- /tests/testthat/test-method_sub.R: -------------------------------------------------------------------------------- 1 | test_that("method: addition", { 2 | on.exit({ 3 | options("filearray.operator.precision" = NULL) 4 | clear_cache() 5 | }) 6 | 7 | dm <- c(2,3,4, 10) 8 | len <- prod(dm) 9 | 10 | x_dbl <- array(rnorm(len), dim = dm) 11 | x_int <- array(seq_len(len), dim = dm) 12 | x_lgl <- x_dbl > 0 13 | 14 | arr_dbl <- as_filearray(x_dbl, type = "double", partition_size = 2) 15 | arr_int <- as_filearray(x_int, type = "integer", partition_size = 3) 16 | arr_lgl <- as_filearray(x_lgl, type = "logical", partition_size = 4) 17 | 18 | 19 | proxy_dbl <- as_filearrayproxy(arr_dbl) 20 | proxy_int <- as_filearrayproxy(arr_int) 21 | proxy_lgl <- as_filearrayproxy(arr_lgl) 22 | 23 | # sanity 24 | expect_equal(arr_dbl[dimnames = NULL], x_dbl, tolerance = 1e-5) 25 | expect_equal(arr_int[dimnames = NULL], x_int) 26 | expect_equal(arr_lgl[dimnames = NULL], x_lgl) 27 | 28 | expect_equal(proxy_dbl[dimnames = NULL], x_dbl, tolerance = 1e-5) 29 | expect_equal(proxy_int[dimnames = NULL], x_int) 30 | expect_equal(proxy_lgl[dimnames = NULL], x_lgl) 31 | 32 | check_add <- function(e1, e2, d1, d2, type, tolerance = .Machine$double.eps) { 33 | y <- e1-e2 34 | res <- y[dimnames = NULL] 35 | 36 | if(!missing(type)) { 37 | expect_identical(typeof(y), type) 38 | } 39 | 40 | expect_equal(res, d1 - d2, tolerance = tolerance) 41 | } 42 | 43 | options("filearray.operator.precision" = "double") 44 | 45 | # filearrayproxy - Double 46 | check_add(proxy_dbl, proxy_dbl, x_dbl, x_dbl, "double", 1e-5) 47 | check_add(proxy_dbl, proxy_int, x_dbl, x_int, "double", 1e-5) 48 | check_add(proxy_dbl, proxy_lgl, x_dbl, x_lgl, "double", 1e-5) 49 | 50 | check_add(proxy_dbl, arr_dbl, x_dbl, x_dbl, "double", 1e-5) 51 | check_add(proxy_dbl, arr_int, x_dbl, x_int, "double", 1e-5) 52 | check_add(proxy_dbl, arr_lgl, x_dbl, x_lgl, "double", 1e-5) 53 | 54 | check_add(proxy_dbl, 1.0, x_dbl, 1.0, "double", 1e-5) 55 | check_add(proxy_dbl, 1L, x_dbl, 1L, "double", 1e-5) 56 | check_add(proxy_dbl, TRUE, x_dbl, TRUE, "double", 1e-5) 57 | 58 | # filearrayproxy - Int 59 | 
check_add(proxy_int, proxy_dbl, x_int, x_dbl, "double", 1e-5) 60 | check_add(proxy_int, proxy_int, x_int, x_int, "integer", 1e-5) 61 | check_add(proxy_int, proxy_lgl, x_int, x_lgl, "integer", 1e-5) 62 | 63 | check_add(proxy_int, arr_dbl, x_int, x_dbl, "double", 1e-5) 64 | check_add(proxy_int, arr_int, x_int, x_int, "integer", 1e-5) 65 | check_add(proxy_int, arr_lgl, x_int, x_lgl, "integer", 1e-5) 66 | 67 | check_add(proxy_int, 1.0, x_int, 1.0, "double", 1e-5) 68 | check_add(proxy_int, 1L, x_int, 1L, "integer", 1e-5) 69 | check_add(proxy_int, TRUE, x_int, TRUE, "integer", 1e-5) 70 | 71 | # filearrayproxy - Logical 72 | check_add(proxy_lgl, proxy_dbl, x_lgl, x_dbl, "double", 1e-5) 73 | check_add(proxy_lgl, proxy_int, x_lgl, x_int, "integer", 1e-5) 74 | check_add(proxy_lgl, proxy_lgl, x_lgl, x_lgl, "integer", 1e-5) 75 | 76 | check_add(proxy_lgl, arr_dbl, x_lgl, x_dbl, "double", 1e-5) 77 | check_add(proxy_lgl, arr_int, x_lgl, x_int, "integer", 1e-5) 78 | check_add(proxy_lgl, arr_lgl, x_lgl, x_lgl, "integer", 1e-5) 79 | 80 | check_add(proxy_lgl, 1.0, x_lgl, 1.0, "double", 1e-5) 81 | check_add(proxy_lgl, 1L, x_lgl, 1L, "integer", 1e-5) 82 | check_add(proxy_lgl, TRUE, x_lgl, TRUE, "integer", 1e-5) 83 | 84 | # filearray - Double 85 | check_add(arr_dbl, proxy_dbl, x_dbl, x_dbl, "double", 1e-5) 86 | check_add(arr_dbl, proxy_int, x_dbl, x_int, "double", 1e-5) 87 | check_add(arr_dbl, proxy_lgl, x_dbl, x_lgl, "double", 1e-5) 88 | 89 | check_add(arr_dbl, arr_dbl, x_dbl, x_dbl, "double", 1e-5) 90 | check_add(arr_dbl, arr_int, x_dbl, x_int, "double", 1e-5) 91 | check_add(arr_dbl, arr_lgl, x_dbl, x_lgl, "double", 1e-5) 92 | 93 | check_add(arr_dbl, 1.0, x_dbl, 1.0, "double", 1e-5) 94 | check_add(arr_dbl, 1L, x_dbl, 1L, "double", 1e-5) 95 | check_add(arr_dbl, TRUE, x_dbl, TRUE, "double", 1e-5) 96 | 97 | # filearray - Int 98 | check_add(arr_int, proxy_dbl, x_int, x_dbl, "double", 1e-5) 99 | check_add(arr_int, proxy_int, x_int, x_int, "integer", 1e-5) 100 | check_add(arr_int, proxy_lgl, x_int, x_lgl, "integer", 1e-5) 101 | 102 | check_add(arr_int, arr_dbl, x_int, x_dbl, "double", 1e-5) 103 | check_add(arr_int, arr_int, x_int, x_int, "integer", 1e-5) 104 | check_add(arr_int, arr_lgl, x_int, x_lgl, "integer", 1e-5) 105 | 106 | check_add(arr_int, 1.0, x_int, 1.0, "double", 1e-5) 107 | check_add(arr_int, 1L, x_int, 1L, "integer", 1e-5) 108 | check_add(arr_int, TRUE, x_int, TRUE, "integer", 1e-5) 109 | 110 | # filearray - Logical 111 | check_add(arr_lgl, proxy_dbl, x_lgl, x_dbl, "double", 1e-5) 112 | check_add(arr_lgl, proxy_int, x_lgl, x_int, "integer", 1e-5) 113 | check_add(arr_lgl, proxy_lgl, x_lgl, x_lgl, "integer", 1e-5) 114 | 115 | check_add(arr_lgl, arr_dbl, x_lgl, x_dbl, "double", 1e-5) 116 | check_add(arr_lgl, arr_int, x_lgl, x_int, "integer", 1e-5) 117 | check_add(arr_lgl, arr_lgl, x_lgl, x_lgl, "integer", 1e-5) 118 | 119 | check_add(arr_lgl, 1.0, x_lgl, 1.0, "double", 1e-5) 120 | check_add(arr_lgl, 1L, x_lgl, 1L, "integer", 1e-5) 121 | check_add(arr_lgl, TRUE, x_lgl, TRUE, "integer", 1e-5) 122 | 123 | 124 | # scalar - double 125 | check_add(1.0, proxy_dbl, 1.0, x_dbl, "double", 1e-5) 126 | check_add(1.0, proxy_int, 1.0, x_int, "double", 1e-5) 127 | check_add(1.0, proxy_lgl, 1.0, x_lgl, "double", 1e-5) 128 | 129 | check_add(1.0, arr_dbl, 1.0, x_dbl, "double", 1e-5) 130 | check_add(1.0, arr_int, 1.0, x_int, "double", 1e-5) 131 | check_add(1.0, arr_lgl, 1.0, x_lgl, "double", 1e-5) 132 | 133 | # scalar - int 134 | check_add(1L, proxy_dbl, 1L, x_dbl, "double", 1e-5) 135 | check_add(1L, proxy_int, 1L, x_int, 
"integer", 1e-5) 136 | check_add(1L, proxy_lgl, 1L, x_lgl, "integer", 1e-5) 137 | 138 | check_add(1L, arr_dbl, 1L, x_dbl, "double", 1e-5) 139 | check_add(1L, arr_int, 1L, x_int, "integer", 1e-5) 140 | check_add(1L, arr_lgl, 1L, x_lgl, "integer", 1e-5) 141 | 142 | # scalar - lgl 143 | check_add(FALSE, proxy_dbl, FALSE, x_dbl, "double", 1e-5) 144 | check_add(FALSE, proxy_int, FALSE, x_int, "integer", 1e-5) 145 | check_add(FALSE, proxy_lgl, FALSE, x_lgl, "integer", 1e-5) 146 | 147 | check_add(FALSE, arr_dbl, FALSE, x_dbl, "double", 1e-5) 148 | check_add(FALSE, arr_int, FALSE, x_int, "integer", 1e-5) 149 | check_add(FALSE, arr_lgl, FALSE, x_lgl, "integer", 1e-5) 150 | 151 | # wrong dimensions 152 | testthat::expect_error(arr_int - array(0L, c(10,20,1))) 153 | testthat::expect_error(arr_int - 1:10) 154 | tmp <- arr_int[] 155 | dm <- dim(arr_int) 156 | dm <- c(dm[1] * dm[2], dm[-c(1,2)]) 157 | dim(tmp) <- dm 158 | tmp_arr <- as_filearray(tmp) 159 | testthat::expect_error( arr_int - tmp ) 160 | testthat::expect_error( tmp - arr_int ) 161 | testthat::expect_error( arr_int - tmp_arr ) 162 | testthat::expect_error( tmp_arr - arr_int ) 163 | testthat::expect_equal( 164 | (arr_int - seq_len(length(arr_int)))[dimnames = NULL], 165 | arr_int[dimnames = NULL] - seq_len(length(arr_int)) 166 | ) 167 | testthat::expect_equal( 168 | (seq_len(length(arr_int)) - arr_int)[dimnames = NULL], 169 | seq_len(length(arr_int)) - arr_int[dimnames = NULL] 170 | ) 171 | 172 | }) 173 | --------------------------------------------------------------------------------