├── R
    ├── lock.R
    ├── todo-list.R
    ├── methods-typeof.R
    ├── methods-subtract.R
    ├── misc-docs.R
    ├── aaa-docs.R
    ├── internal_misc.R
    ├── write.R
    ├── aaa.R
    ├── methods-apply.R
    ├── filearray-package.R
    ├── RcppExports.R
    ├── header.R
    ├── helpers.R
    ├── mapreduce.R
    ├── methods-subset.R
    └── methods-subsetAssign.R
├── .github
    ├── .gitignore
    └── workflows
    │   ├── pkgdown.yaml
    │   ├── R-CMD-check.yaml
    │   └── rhub.yaml
├── adhoc
    ├── .gitignore
    ├── other docs
    │   ├── comparisons.md
    │   ├── .gitignore
    │   ├── comparison-threads-coldstart.png
    │   ├── comparison-threads-warmstart.png
    │   ├── comparison-singlethread-coldstart.png
    │   ├── readme-figure.R
    │   ├── comparison-singlethread.R
    │   └── comparison-threads.R
    ├── readme-speed.png
    ├── rchk.sh
    ├── debug1.R
    ├── performance.R
    ├── performance-write.R
    ├── debug.R
    ├── map-test.R
    ├── gctorture.R
    ├── profiling2.R
    ├── profiling.R
    └── readme-figure.R
├── inst
    ├── WORDLIST
    ├── hexbadge.png
    └── include
    │   ├── TinyParallel.h
    │   └── TinyParallel
    │       ├── Common.h
    │       ├── TinyThread.h
    │       └── Timer.h
├── src
    ├── .gitignore
    ├── Makevars
    ├── common.h
    ├── threadSettings.h
    ├── defs.h
    ├── load.h
    ├── threadSettings.cpp
    ├── utils.h
    ├── serialize.h
    ├── interfaces.cpp
    ├── save.h
    ├── core.h
    ├── conversion.h
    └── serialize.cpp
├── vignettes
    └── .gitignore
├── cran-comments.md
├── _pkgdown.yml
├── CRAN-SUBMISSION
├── .Rbuildignore
├── tests
    ├── testthat.R
    └── testthat
    │   ├── test-method_subset.R
    │   ├── test-method_subsetAssign.R
    │   ├── test-as_filearray.R
    │   ├── test-methods_simple.R
    │   ├── test-methods.R
    │   ├── test-dimnames.R
    │   ├── test-helpers.R
    │   ├── test-bind.R
    │   ├── test-map.R
    │   └── test-method_sub.R
├── filearray.Rproj
├── .gitignore
├── man
    ├── filearray_threads.Rd
    ├── typeof.Rd
    ├── apply.Rd
    ├── FileArray-class.Rd
    ├── fwhich.Rd
    ├── S3-filearray.Rd
    ├── filearray_bind.Rd
    ├── mapreduce.Rd
    ├── fmap.Rd
    └── filearray.Rd
├── DESCRIPTION
├── NAMESPACE
├── NEWS.md
└── README.md


/R/lock.R:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/.github/.gitignore:
--------------------------------------------------------------------------------
1 | *.html
2 | 


--------------------------------------------------------------------------------
/adhoc/.gitignore:
--------------------------------------------------------------------------------
1 | Dockerfile
2 | 


--------------------------------------------------------------------------------
/adhoc/other docs/comparisons.md:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/inst/WORDLIST:
--------------------------------------------------------------------------------
1 | FileArray
2 | OpenMP
3 | 


--------------------------------------------------------------------------------
/adhoc/other docs/.gitignore:
--------------------------------------------------------------------------------
1 | *.RData
2 | 


--------------------------------------------------------------------------------
/src/.gitignore:
--------------------------------------------------------------------------------
1 | *.o
2 | *.so
3 | *.dll
4 | 


--------------------------------------------------------------------------------
/vignettes/.gitignore:
--------------------------------------------------------------------------------
1 | *.html
2 | *.R
3 | 


--------------------------------------------------------------------------------
/src/Makevars:
--------------------------------------------------------------------------------
1 | PKG_CPPFLAGS= -I../inst/include
2 | PKG_LIBS=
3 | 


--------------------------------------------------------------------------------
/cran-comments.md:
--------------------------------------------------------------------------------
1 | ## R CMD check results
2 | 
3 | 0 errors | 0 warnings | 0 notes
4 | 


--------------------------------------------------------------------------------
/inst/hexbadge.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dipterix/filearray/HEAD/inst/hexbadge.png


--------------------------------------------------------------------------------
/_pkgdown.yml:
--------------------------------------------------------------------------------
1 | url: https://dipterix.org/filearray/
2 | template:
3 |   bootstrap: 5
4 | 
5 | 


--------------------------------------------------------------------------------
/adhoc/readme-speed.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dipterix/filearray/HEAD/adhoc/readme-speed.png


--------------------------------------------------------------------------------
/CRAN-SUBMISSION:
--------------------------------------------------------------------------------
1 | Version: 0.1.9
2 | Date: 2024-11-08 16:38:30 UTC
3 | SHA: 5f59eb137a26b7e2915a36884d8742407ce9fa72
4 | 


--------------------------------------------------------------------------------
/src/common.h:
--------------------------------------------------------------------------------
1 | #ifndef FARR_COMMON_H
2 | #define FARR_COMMON_H
3 | 
4 | #include <Rcpp.h>
5 | 
6 | #endif // FARR_COMMON_H


--------------------------------------------------------------------------------
/adhoc/other docs/comparison-threads-coldstart.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dipterix/filearray/HEAD/adhoc/other docs/comparison-threads-coldstart.png


--------------------------------------------------------------------------------
/adhoc/other docs/comparison-threads-warmstart.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dipterix/filearray/HEAD/adhoc/other docs/comparison-threads-warmstart.png


--------------------------------------------------------------------------------
/R/todo-list.R:
--------------------------------------------------------------------------------
1 | 
2 | # message("TODO: add test functions for `apply` on proxy arrays")
3 | # message("TODO: add test functions for `mapreduce` on proxy arrays")
4 | 


--------------------------------------------------------------------------------
/adhoc/other docs/comparison-singlethread-coldstart.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dipterix/filearray/HEAD/adhoc/other docs/comparison-singlethread-coldstart.png


--------------------------------------------------------------------------------
/adhoc/rchk.sh:
--------------------------------------------------------------------------------
1 | # rm "$HOME/Dropbox/projects/filearray_0.1.3.9001.tar.gz"
2 | docker run -v "$HOME/Dropbox/projects":/projects kalibera/rchk:latest "/projects/filearray_0.1.3.9001.tar.gz"
3 | 


--------------------------------------------------------------------------------
/.Rbuildignore:
--------------------------------------------------------------------------------
 1 | ^.*\.Rproj$
 2 | ^\.Rproj\.user$
 3 | tests2
 4 | README.md
 5 | ^adhoc
 6 | ^\.github$
 7 | ^_pkgdown\.yml$
 8 | ^docs$
 9 | ^pkgdown$
10 | ^cran-comments\.md$
11 | ^CRAN-RELEASE$
12 | ^inst/hexbadge\.png$
13 | ^CRAN-SUBMISSION$
14 | 


--------------------------------------------------------------------------------
/tests/testthat.R:
--------------------------------------------------------------------------------
 1 | library(testthat)
 2 | library(filearray)
 3 | 
 4 | # Sys.setenv("FILEARRAY_SKIP_COLLAPSE" = "TRUE")
 5 | 
 6 | cat(utils::capture.output({
 7 |     print(Sys.getenv())
 8 | }), sep = "\n", file = stderr())
 9 | 
10 | test_check("filearray")
11 | 
12 | filearray:::clear_cache()
13 | 


--------------------------------------------------------------------------------
/src/threadSettings.h:
--------------------------------------------------------------------------------
 1 | #ifndef FARR_OPENMP_H
 2 | #define FARR_OPENMP_H
 3 | 
 4 | #ifdef _OPENMP
 5 | #include <omp.h>
 6 | #include <pthread.h>
 7 | #define FARR_HAS_OPENMP true
 8 | #else
 9 | #define omp_get_thread_num() 0
10 | #define omp_get_max_threads() 1
11 | #define FARR_HAS_OPENMP false
12 | #endif
13 | 
14 | #include "common.h"
15 | #include <algorithm>
16 | 
17 | 
18 | int getThreads(const bool& max = false);
19 | 
20 | 
21 | #endif  // FARR_OPENMP_H
22 | 


--------------------------------------------------------------------------------
/filearray.Rproj:
--------------------------------------------------------------------------------
 1 | Version: 1.0
 2 | ProjectId: 21f88fad-b806-4cce-a5b4-b11d735e9125
 3 | 
 4 | RestoreWorkspace: No
 5 | SaveWorkspace: No
 6 | AlwaysSaveHistory: No
 7 | 
 8 | EnableCodeIndexing: Yes
 9 | UseSpacesForTab: Yes
10 | NumSpacesForTab: 4
11 | Encoding: UTF-8
12 | 
13 | RnwWeave: knitr
14 | LaTeX: pdfLaTeX
15 | 
16 | BuildType: Package
17 | PackageUseDevtools: Yes
18 | PackageInstallArgs: --no-multiarch --with-keep.source
19 | PackageCheckArgs: --as-cran
20 | PackageRoxygenize: rd,collate,namespace,vignette
21 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # History files
 2 | .Rhistory
 3 | .Rapp.history
 4 | # Session Data files
 5 | .RData
 6 | # Example code in package build process
 7 | *-Ex.R
 8 | # RStudio files
 9 | .Rproj.user/
10 | # produced vignettes
11 | vignettes/*.html
12 | vignettes/*.pdf
13 | vignettes/UsingFilematrices_cache/*
14 | # OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3
15 | .httr-oauth
16 | .Rproj.user
17 | # extra tests
18 | tests2/
19 | *.DS_Store
20 | docs
21 | CRAN-RELEASE
22 | inst/doc
23 | # C++ related
24 | *.a
25 | *.dll
26 | *.o
27 | *.so
28 | *.dll
29 | 


--------------------------------------------------------------------------------
/man/filearray_threads.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/misc-docs.R
 3 | \name{filearray_threads}
 4 | \alias{filearray_threads}
 5 | \title{Set or get file array threads}
 6 | \usage{
 7 | filearray_threads(n, ...)
 8 | }
 9 | \arguments{
10 | \item{n}{number of threads to set. If \code{n} is negative,
11 | then default to the number of cores that computer has.}
12 | 
13 | \item{...}{internally used}
14 | }
15 | \value{
16 | An integer of current number of threads
17 | }
18 | \description{
19 | Will enable/disable multi-threaded reading or writing
20 | at \code{C++} level.
21 | }
22 | 


--------------------------------------------------------------------------------
/src/defs.h:
--------------------------------------------------------------------------------
 1 | #ifndef FARR_DEF_H
 2 | #define FARR_DEF_H
 3 | 
 4 | #ifndef NA_INTEGER64
 5 | #define NA_INTEGER64 LLONG_MIN
 6 | #endif
 7 | 
 8 | #ifndef FARR_HEADER_LENGTH
 9 | #define FARR_HEADER_LENGTH 1024
10 | #endif
11 | 
12 | /**********************************************************
13 |  * Extended types
14 |  ***********************************************************/
15 | 
16 | #define FLTSXP 26
17 | #define FLOAT(x) ((float*) INTEGER(x))
18 | 
19 | const static float NA_FLOAT = NAN;
20 | const static Rbyte NA_RBYTE = 2;
21 | 
22 | #define INT64SXP REALSXP
23 | #define INTEGER64(x) ((int64_t*) REAL(x))
24 | 
25 | #endif  // FARR_DEF_H


--------------------------------------------------------------------------------
/man/typeof.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/methods-typeof.R
 3 | \name{typeof}
 4 | \alias{typeof}
 5 | \alias{typeof,FileArray-method}
 6 | \alias{typeof,FileArrayProxy-method}
 7 | \title{The type of a file array (extended)}
 8 | \usage{
 9 | typeof(x)
10 | 
11 | \S4method{typeof}{FileArray}(x)
12 | 
13 | \S4method{typeof}{FileArrayProxy}(x)
14 | }
15 | \arguments{
16 | \item{x}{any file array}
17 | }
18 | \value{
19 | A character string. The possible values are \code{"double"},
20 | \code{"integer"}, \code{"logical"}, and \code{"raw"}
21 | }
22 | \description{
23 | The type of a file array (extended)
24 | }
25 | 


--------------------------------------------------------------------------------
/src/load.h:
--------------------------------------------------------------------------------
 1 | #ifndef FARR_LOAD_H
 2 | #define FARR_LOAD_H
 3 | 
 4 | #include "common.h"
 5 | 
 6 | SEXP FARR_subset2(
 7 |         const std::string& filebase,
 8 |         const SEXP listOrEnv,
 9 |         const SEXP reshape,
10 |         const bool drop,
11 |         const bool use_dimnames,
12 |         size_t thread_buffer,
13 |         int split_dim,
14 |         const int strict
15 | );
16 | 
17 | SEXP FARR_subset_sequential(
18 |         const std::string& filebase, 
19 |         const int64_t& unit_partlen, 
20 |         SEXP cum_partsizes, 
21 |         SEXPTYPE array_type,
22 |         SEXP ret, 
23 |         const int64_t from, 
24 |         const int64_t len
25 | );
26 |     
27 | #endif // FARR_LOAD_H


--------------------------------------------------------------------------------
/src/threadSettings.cpp:
--------------------------------------------------------------------------------
 1 | #include "threadSettings.h"
 2 | 
 3 | #include <tthread/tinythread.h>
 4 | #include <TinyParallel.h>
 5 | 
 6 | // [[Rcpp::export]]
 7 | SEXP getDefaultNumThreads() {
 8 |     SEXP threadsSEXP = PROTECT(Rf_allocVector(INTSXP, 1));
 9 |     INTEGER(threadsSEXP)[0] = tthread::thread::hardware_concurrency();
10 |     UNPROTECT(1);
11 |     return threadsSEXP;
12 | }
13 | 
14 | // [[Rcpp::export]]
15 | int getThreads(const bool& max){
16 |     int maxThreads = tthread::thread::hardware_concurrency();
17 |     if( max ) {
18 |         return( maxThreads );
19 |     }
20 |     int n = TinyParallel::resolveValue("FILEARRAY_NUM_THREADS", -1, maxThreads);
21 |     if( n <= 0 || n > maxThreads ) {
22 |         n = maxThreads;
23 |     }
24 |     return( n );
25 | }
26 | 


--------------------------------------------------------------------------------
/R/methods-typeof.R:
--------------------------------------------------------------------------------
 1 | 
 2 | #' @title The type of a file array (extended)
 3 | #' @param x any file array
 4 | #' @return A character string. The possible values are \code{"double"},
 5 | #' \code{"integer"}, \code{"logical"}, and \code{"raw"}
 6 | #' @export
 7 | setGeneric("typeof")
 8 | 
 9 | #' @rdname typeof
10 | setMethod('typeof', signature(x = "FileArray"), function(x){
11 |     if(!x$valid()){
12 |         stop("Invalid file array")
13 |     }
14 |     x$type()
15 | })
16 | 
17 | #' @rdname typeof
18 | setMethod('typeof', signature(x = "FileArrayProxy"), function(x){
19 |     if(!x$valid()){
20 |         stop("Invalid file array")
21 |     }
22 |     if( length(x$.ops) ) {
23 |         final_op <- x$.ops[[length(x$.ops)]]
24 |         return( final_op$output_type )
25 |     } else {
26 |         x$type()
27 |     }
28 | })
29 | 


--------------------------------------------------------------------------------
/src/utils.h:
--------------------------------------------------------------------------------
 1 | #ifndef FARR_UTILS_H
 2 | #define FARR_UTILS_H
 3 | 
 4 | #include "common.h"
 5 | #include "defs.h"
 6 | #include "core.h"
 7 | 
 8 | int guess_splitdim(SEXP dim, int elem_size, size_t buffer_bytes);
 9 | 
10 | void set_buffer(SEXP dim, int elem_size, size_t buffer_bytes, int split_dim);
11 | 
12 | SEXPTYPE file_buffer_sxptype(SEXPTYPE array_type);
13 | SEXPTYPE array_memory_sxptype(SEXPTYPE array_type);
14 | 
15 | int file_element_size(SEXPTYPE array_type);
16 | int memory_element_size(SEXPTYPE array_type);
17 | 
18 | int kinda_sorted(SEXP idx, int64_t min_, int64_t buffer_count);
19 | 
20 | SEXP check_missing_dots(const SEXP env);
21 | 
22 | SEXP sub_vec(SEXP x, SEXP idx_int64);
23 | SEXP sub_vec_range(SEXP x, const R_xlen_t& min_, const R_xlen_t& len_);
24 | 
25 | SEXP reshape_or_drop(SEXP x, SEXP reshape, bool drop);
26 | 
27 | SEXP subset_dimnames(SEXP dimnames, SEXP sliceIdx);
28 | 
29 | 
30 | #endif  // FARR_UTILS_H


--------------------------------------------------------------------------------
/src/serialize.h:
--------------------------------------------------------------------------------
 1 | #ifndef FARR_UNSERIALIZE_H
 2 | #define FARR_UNSERIALIZE_H
 3 | 
 4 | #include "common.h"
 5 | 
 6 | // buffer type
 7 | typedef struct {
 8 |     size_t length;
 9 |     size_t pos;
10 |     unsigned char *data;
11 | } buffer_t;
12 | 
13 | int read_byte(R_inpstream_t stream);
14 | void read_bytes(R_inpstream_t stream, void *dst, int length);
15 | 
16 | SEXP unserialize_raw(SEXP x);
17 | SEXP unserialize_connection(FILE* conn, size_t len);
18 | 
19 | /**********************************************************
20 |  * Endianess
21 |  ***********************************************************/
22 | bool isLittleEndian();
23 | 
24 | void swap_endianess(void *ptr, const size_t& size, const size_t& nmemb);
25 | 
26 | size_t lendian_fwrite(void *ptr, size_t size, size_t nmemb, FILE *stream);
27 | 
28 | size_t lendian_fread(void *ptr, size_t size, size_t nmemb, FILE *stream);
29 | 
30 | void lendian_assign(void* dst, const void* src, const size_t& elem_size, const size_t& nelems = 1);
31 | 
32 | #endif // FARR_UNSERIALIZE_H


--------------------------------------------------------------------------------
/man/apply.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/methods-apply.R
 3 | \name{apply}
 4 | \alias{apply}
 5 | \alias{apply,FileArray-method}
 6 | \alias{apply,FileArrayProxy-method}
 7 | \title{Apply functions over file array margins (extended)}
 8 | \usage{
 9 | apply(X, MARGIN, FUN, ..., simplify = TRUE)
10 | 
11 | \S4method{apply}{FileArray}(X, MARGIN, FUN, ..., simplify = TRUE)
12 | 
13 | \S4method{apply}{FileArrayProxy}(X, MARGIN, FUN, ..., simplify = TRUE)
14 | }
15 | \arguments{
16 | \item{X}{a file array}
17 | 
18 | \item{MARGIN}{scalar giving the subscripts which the function will be applied over. Current implementation only allows margin size to be one}
19 | 
20 | \item{FUN}{the function to be applied}
21 | 
22 | \item{...}{optional arguments to \code{FUN}}
23 | 
24 | \item{simplify}{a logical indicating whether results should be simplified if possible}
25 | }
26 | \value{
27 | See Section 'Value' in \code{\link[base]{apply}};
28 | }
29 | \description{
30 | Apply functions over file array margins (extended)
31 | }
32 | 


--------------------------------------------------------------------------------
/R/methods-subtract.R:
--------------------------------------------------------------------------------
 1 | # 
 2 | # fa_subtract <- function(e1, e2) {
 3 | #     call <- match.call()
 4 | #     call[[1]] <- quote(`-`)
 5 | #     label <- sprintf("Calculating: %s (fa_subtract)", deparse1(call))
 6 | #     
 7 | #     fa_pairwise_operator(e1, e2, op = "-", label = label)
 8 | # }
 9 | # 
10 | # 
11 | # 
12 | # setMethod('-', signature(e1 = "FileArray", e2 = "FileArray"), fa_subtract)
13 | # 
14 | # 
15 | # setMethod('-', signature(e1 = "FileArray", e2 = "numeric"), fa_subtract)
16 | # 
17 | # setMethod('-', signature(e1 = "numeric", e2 = "FileArray"), fa_subtract)
18 | # 
19 | # 
20 | # setMethod('-', signature(e1 = "FileArray", e2 = "complex"), fa_subtract)
21 | # 
22 | # setMethod('-', signature(e1 = "complex", e2 = "FileArray"), fa_subtract)
23 | # 
24 | # 
25 | # setMethod('-', signature(e1 = "FileArray", e2 = "logical"), fa_subtract)
26 | # 
27 | # setMethod('-', signature(e1 = "logical", e2 = "FileArray"), fa_subtract)
28 | # 
29 | # 
30 | # setMethod('-', signature(e1 = "FileArray", e2 = "array"), fa_subtract)
31 | # 
32 | # setMethod('-', signature(e1 = "array", e2 = "FileArray"), fa_subtract)
33 | 


--------------------------------------------------------------------------------
/DESCRIPTION:
--------------------------------------------------------------------------------
 1 | Package: filearray
 2 | Type: Package
 3 | Title: File-Backed Array for Out-of-Memory Computation
 4 | Version: 0.2.0
 5 | Language: en-US
 6 | Encoding: UTF-8
 7 | License: LGPL-3
 8 | URL: https://dipterix.org/filearray/,
 9 |     https://github.com/dipterix/filearray
10 | BugReports: https://github.com/dipterix/filearray/issues
11 | Authors@R: c(
12 |     person(
13 |         "Zhengjia", "Wang", role = c("aut", "cre", "cph"),
14 |         email = "dipterix.wang@gmail.com"
15 |     ))
16 | Description: Stores large arrays in files to avoid occupying large
17 |     memories. Implemented with super fast gigabyte-level multi-threaded 
18 |     reading/writing via 'OpenMP'. Supports multiple non-character data 
19 |     types (double, float, complex, integer, logical, and raw).
20 | Imports: 
21 |     digest,
22 |     fastmap (>= 1.1.1),
23 |     methods,
24 |     Rcpp,
25 |     uuid (>= 1.1.0)
26 | Suggests: 
27 |     bit64,
28 |     knitr,
29 |     rmarkdown,
30 |     testthat (>= 3.0.0)
31 | RoxygenNote: 7.3.2
32 | LinkingTo: 
33 |     BH,
34 |     Rcpp
35 | Config/testthat/edition: 3
36 | VignetteBuilder: knitr
37 | 


--------------------------------------------------------------------------------
/src/interfaces.cpp:
--------------------------------------------------------------------------------
 1 | #include "core.h"
 2 | #include "save.h"
 3 | #include "load.h"
 4 | using namespace Rcpp;
 5 | 
 6 | // [[Rcpp:interfaces(cpp)]]
 7 | 
 8 | // [[Rcpp::export]]
 9 | SEXP filearray_meta(
10 |         const std::string& filebase
11 | ){
12 |     List re = FARR_meta(filebase);
13 |     return( re );
14 | }
15 | 
16 | // [[Rcpp::export]]
17 | SEXP filearray_assign(
18 |         const std::string& filebase, SEXP value,
19 |         const SEXP position_indices
20 | ) {
21 |     size_t thread_buffer = get_buffer_size();
22 |     FARR_subset_assign2(
23 |         filebase, value, position_indices, 
24 |         thread_buffer, 0);
25 |     return(R_NilValue);
26 | }
27 | 
28 | // [[Rcpp::export]]
29 | SEXP filearray_subset(
30 |         const std::string& filebase,
31 |         const SEXP position_indices,
32 |         const bool drop = true,
33 |         const bool use_dimnames = true,
34 |         const SEXP reshape = R_NilValue
35 | ) {
36 |     size_t thread_buffer = get_buffer_size();
37 |     SEXP ret = PROTECT(FARR_subset2(filebase, position_indices, reshape, drop,
38 |                                     use_dimnames, thread_buffer, 0, 1));
39 |     UNPROTECT(1);
40 |     return( ret );
41 | }


--------------------------------------------------------------------------------
/src/save.h:
--------------------------------------------------------------------------------
 1 | #ifndef FARR_SAVE_H
 2 | #define FARR_SAVE_H
 3 | 
 4 | #include "common.h"
 5 | /**
 6 |  * @param filebase root path of array, must be absolute and "corrected" (ends with "/" or "\\", depending on system)
 7 |  * @param sch returned from `schedule`, scheduled indices
 8 |  * @param value value to set, must be coerced
 9 |  * @param buff_ptrs buffer points: length of `buff_ptrs` are the number of cores; each element must be at least buffer size in bytes (get_buffer_size())
10 |  */
11 | 
12 | SEXP FARR_subset_assign2(
13 |         const std::string& filebase,
14 |         SEXP value,
15 |         const SEXP listOrEnv,
16 |         const size_t thread_buffer,
17 |         int split_dim
18 | );
19 | 
20 | SEXP FARR_subset_assign_sequential(
21 |         const std::string& filebase, 
22 |         const int64_t& unit_partlen, 
23 |         SEXP cum_partsizes, 
24 |         SEXPTYPE array_type,
25 |         SEXP value, 
26 |         const int64_t from
27 | );
28 | 
29 | SEXP FARR_subset_assign_sequential_bare(
30 |         const std::string& filebase, 
31 |         const int64_t& unit_partlen, 
32 |         SEXP cum_partsizes, 
33 |         SEXPTYPE array_type,
34 |         SEXP value_, 
35 |         const int64_t from
36 | );
37 | 
38 | #endif //FARR_SAVE_H
39 |     
40 | 


--------------------------------------------------------------------------------
/inst/include/TinyParallel.h:
--------------------------------------------------------------------------------
 1 | #ifndef __FILEARRAY_PARALLEL__
 2 | #define __FILEARRAY_PARALLEL__
 3 | 
 4 | // TinyThread implementation
 5 | #include "TinyParallel/TinyThread.h"
 6 | 
 7 | namespace TinyParallel {
 8 | 
 9 | inline void parallelFor(std::size_t begin,
10 |                         std::size_t end,
11 |                         Worker& worker,
12 |                         std::size_t grainSize = 1,
13 |                         int numThreads = -1)
14 | {
15 |    grainSize = resolveValue("FILEARRAY_GRAIN_SIZE", grainSize, 1u);
16 |    numThreads = resolveValue("FILEARRAY_NUM_THREADS", numThreads, -1);
17 | 
18 |    ttParallelFor(begin, end, worker, grainSize);
19 | }
20 | 
21 | template <typename Reducer>
22 | inline void parallelReduce(std::size_t begin,
23 |                            std::size_t end,
24 |                            Reducer& reducer,
25 |                            std::size_t grainSize = 1,
26 |                            int numThreads = -1)
27 | {
28 |    grainSize = resolveValue("FILEARRAY_GRAIN_SIZE", grainSize, 1);
29 |    numThreads = resolveValue("FILEARRAY_NUM_THREADS", numThreads, -1);
30 | 
31 |    ttParallelReduce(begin, end, reducer, grainSize);
32 | }
33 | 
34 | } // end namespace TinyParallel
35 | 
36 | #endif // __FILEARRAY_PARALLEL__
37 | 


--------------------------------------------------------------------------------
/tests/testthat/test-method_subset.R:
--------------------------------------------------------------------------------
 1 | test_that("subset filearray-proxy", {
 2 | 
 3 |     # normal indexing
 4 |     x <- as_filearray(1:120, dimension = c(10,12))
 5 |     dimnames(x) <- list(A = 1:10, B = 1:12)
 6 |     y <- x + 1L
 7 |     z <- y + y
 8 |     
 9 |     idx1 <- c(3,3,4,5,5,4,3,2,1,1,2,3,3,2,2)
10 |     idx2 <- idx1 * 2
11 |     
12 |     expect_equal(y[idx1, idx2], x[idx1, idx2] + 1)
13 |     expect_equal(z[idx1, idx2], 2 * y[idx1, idx2])
14 |     
15 |     expect_equal(y[, idx2], x[, idx2] + 1)
16 |     expect_equal(z[, idx2], 2 * y[, idx2])
17 |     
18 |     expect_equal(y[idx1, ], x[idx1, ] + 1)
19 |     expect_equal(z[idx1, ], 2 * y[idx1, ])
20 |     
21 |     # filearray as index
22 |     i <- array(FALSE, dim(x))
23 |     i[idx1, idx2] <- TRUE
24 |     j <- as_filearray(i)
25 |     expect_equal(typeof(j), "logical")
26 |     
27 |     expect_equal(x[i], x[][i])
28 |     expect_equal(x[i], x[][j[]])
29 |     
30 |     expect_equal(y[i], x[i] + 1L)
31 |     expect_equal(y[j], y[i])
32 |     
33 |     expect_equal(z[i], y[i] * 2)
34 |     expect_equal(z[j], z[i])
35 |     
36 |     # integer index
37 |     idx <- rep(which(i), 2)
38 |     
39 |     # expect_equal(x[idx], x[][idx])
40 |     # expect_equal(y[idx], y[idx])
41 |     # expect_equal(z[idx], z[idx])
42 |     expect_error(x[idx])
43 |     
44 | })
45 | 


--------------------------------------------------------------------------------
/adhoc/debug1.R:
--------------------------------------------------------------------------------
 1 | devtools::load_all()
 2 | loadNamespace("bit64")
 3 | unlink('~/Desktop/junk/arr', recursive = TRUE)
 4 | arr <- filearray::filearray_create(
 5 |     filebase = '~/Desktop/junk/arr',
 6 |     dimension = c(861584, 233),
 7 |     type = "integer", partition_size = 1
 8 | )
 9 | 
10 | parition_size <- 9001
11 | niters <- 96
12 | begin <- 216025
13 | end <- 225025
14 | by <- 9001
15 | # arr[begin-1 + seq_len(9000),] <- rep(1:233, 9000)
16 | arr[begin-1 + seq_len(9001),] <- rep(1:233, 9001)
17 | 
18 | 0x102204000
19 | 0x1021fb360
20 | 
21 | filearray::filearray_threads(1)
22 | 
23 | arr[1,] <- 1:233
24 | 
25 | arr[seq_len(by),] <- rep(1:233,by)
26 | arr[end,] <- rep(1:233,1)
27 | i <- begin
28 | arr[i-1 + 1:100,] <- rep(1:233,100)
29 | 
30 | 
31 | # 0, skip=0
32 | # idx start-end: 0 - 0
33 | # [1] "/Users/dipterix/Desktop/junk/arr/0.farr"
34 | # block_size: 861584, idx1len: 9001, idx1_start: 216024, idx2_start: 0, idx2_len: 1
35 | # ### idx2ii:0, start_loc: 0, buf pos: 0, idx1_start: 216024
36 | 
37 | # *idx1ptr: 216024, idx1_start: 216024
38 | # *idx1ptr: 225024, idx1_start: 216024
39 | arr[begin-1 + seq_len(by),] <- rep(1:233,by)
40 | 
41 | 
42 | for(i in seq(begin, 861584-by+1, by = by)) {
43 |     print(i)
44 |     try({ arr[i-1 + seq_len(by),] <- rep(1:233,by) })
45 | }
46 | 225024 - 216024
47 | 
48 | file.size(arr$partition_path(1))
49 | (3447360 - 1024) / 4
50 | 


--------------------------------------------------------------------------------
/R/misc-docs.R:
--------------------------------------------------------------------------------
 1 | #' @title Set or get file array threads
 2 | #' @description Will enable/disable multi-threaded reading or writing
 3 | #' at \code{C++} level. 
 4 | #' @param n number of threads to set. If \code{n} is negative,
 5 | #' then default to the number of cores that computer has. 
 6 | #' @param ... internally used
 7 | #' @return An integer of current number of threads
 8 | #' @export
 9 | filearray_threads <- function(n, ...){
10 |     
11 |     if(!missing(n)){
12 |         setThreads(n, ...)
13 |     }
14 |     
15 |     return(getThreads(FALSE))
16 | }
17 | 
18 | setThreads <- function (n = "auto", stack_size = "auto", ...) {
19 |     if (identical(n, "auto")) {
20 |         n <- -1L
21 |     } else {
22 |         n <- as.integer(n)
23 |         if (length(n) != 1 || is.na(n) || !is.numeric(n)) {
24 |             stop("n must be an integer")
25 |         }
26 |     }
27 |     if (identical(stack_size, "auto")) {
28 |         stack_size <- 0L
29 |     } else if (!is.numeric(stack_size)) {
30 |         stop("stack_size must be an integer")
31 |     } else {
32 |         stack_size <- as.integer(stack_size)
33 |     }
34 |     if (n == -1L) {
35 |         Sys.unsetenv("FILEARRAY_NUM_THREADS")
36 |     } else {
37 |         Sys.setenv(FILEARRAY_NUM_THREADS = n)
38 |     }
39 |     if (stack_size == 0L) {
40 |         Sys.unsetenv("FILEARRAY_STACK_SIZE")
41 |     } else {
42 |         Sys.setenv(FILEARRAY_STACK_SIZE = stack_size)
43 |     }
44 |     invisible()
45 | }
46 | 


--------------------------------------------------------------------------------
/R/aaa-docs.R:
--------------------------------------------------------------------------------
 1 | ## usethis namespace: start
 2 | #' @importFrom Rcpp sourceCpp
 3 | #' @useDynLib filearray, .registration = TRUE
 4 | ## usethis namespace: end
 5 | NULL
 6 | 
 7 | #' @title 'S3' methods for 'FileArray'
 8 | #' @name S3-filearray
 9 | #' @description These are 'S3' methods for 'FileArray'
10 | #' @param x a file array
11 | #' @param drop whether to drop dimensions; see topic \code{\link[base]{Extract}}
12 | #' @param reshape a new dimension to set before returning subset results; default is \code{NULL} (use default dimensions)
13 | #' @param strict whether to allow indices to exceed bound; currently only accept \code{TRUE}
14 | #' @param dimnames whether to preserve \code{\link[base]{dimnames}}
15 | #' @param value value to substitute or set
16 | #' @param na.rm whether to remove \code{NA} values during the calculation
17 | #' @param split_dim internally used; split dimension and calculate indices to
18 | #' manually speed up the subset; value ranged from 0 to size of dimension minus
19 | #' one.
20 | #' @param lazy whether to lazy-evaluate the method, only works when assigning
21 | #' arrays with logical array index
22 | #' @param i,... index set, or passed to other methods
23 | #' @param .env environment to evaluate formula when evaluating subset margin indices.
24 | NULL
25 | 
26 | #' @name S4-filearray
27 | #' @title 'S4' methods for \code{FileArray}
28 | #' @param x,z,e1,e2 \code{FileArray} or compatible data
29 | #' @param base,digits,... passed to other methods
30 | #' @returns See \code{\link[methods]{S4groupGeneric}}
31 | NULL
32 | 


--------------------------------------------------------------------------------
/NAMESPACE:
--------------------------------------------------------------------------------
 1 | # Generated by roxygen2: do not edit by hand
 2 | 
 3 | S3method("[",FileArray)
 4 | S3method("[<-",FileArray)
 5 | S3method("[[",FileArray)
 6 | S3method("dimnames<-",FileArray)
 7 | S3method(as.array,FileArray)
 8 | S3method(as_filearray,FileArray)
 9 | S3method(as_filearray,FileArrayProxy)
10 | S3method(as_filearray,character)
11 | S3method(as_filearray,default)
12 | S3method(as_filearrayproxy,FileArray)
13 | S3method(as_filearrayproxy,FileArrayProxy)
14 | S3method(as_filearrayproxy,default)
15 | S3method(dim,FileArray)
16 | S3method(dimnames,FileArray)
17 | S3method(fwhich,FileArray)
18 | S3method(fwhich,default)
19 | S3method(length,FileArray)
20 | S3method(max,FileArray)
21 | S3method(min,FileArray)
22 | S3method(range,FileArray)
23 | S3method(subset,FileArray)
24 | S3method(sum,FileArray)
25 | export(apply)
26 | export(as_filearray)
27 | export(as_filearrayproxy)
28 | export(filearray_bind)
29 | export(filearray_checkload)
30 | export(filearray_create)
31 | export(filearray_load)
32 | export(filearray_load_or_create)
33 | export(filearray_threads)
34 | export(fmap)
35 | export(fmap2)
36 | export(fmap_element_wise)
37 | export(fwhich)
38 | export(mapreduce)
39 | export(typeof)
40 | exportClasses(FileArray)
41 | exportClasses(FileArrayProxy)
42 | exportMethods(apply)
43 | exportMethods(mapreduce)
44 | importFrom(Rcpp,sourceCpp)
45 | importFrom(methods,new)
46 | importFrom(methods,setGeneric)
47 | importFrom(methods,setMethod)
48 | importFrom(methods,setRefClass)
49 | importFrom(methods,signature)
50 | useDynLib(filearray, .registration = TRUE)
51 | 


--------------------------------------------------------------------------------
/.github/workflows/pkgdown.yaml:
--------------------------------------------------------------------------------
 1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples
 2 | # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
 3 | on:
 4 |   push:
 5 |     branches: [main, master]
 6 |   pull_request:
 7 |     branches: [main, master]
 8 |   release:
 9 |     types: [published]
10 |   workflow_dispatch:
11 | 
12 | name: pkgdown
13 | 
14 | permissions: read-all
15 | 
16 | jobs:
17 |   pkgdown:
18 |     runs-on: ubuntu-latest
19 |     # Only restrict concurrency for non-PR jobs
20 |     concurrency:
21 |       group: pkgdown-${{ github.event_name != 'pull_request' || github.run_id }}
22 |     env:
23 |       GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
24 |     permissions:
25 |       contents: write
26 |     steps:
27 |       - uses: actions/checkout@v4
28 | 
29 |       - uses: r-lib/actions/setup-pandoc@v2
30 | 
31 |       - uses: r-lib/actions/setup-r@v2
32 |         with:
33 |           use-public-rspm: true
34 | 
35 |       - uses: r-lib/actions/setup-r-dependencies@v2
36 |         with:
37 |           extra-packages: any::pkgdown, local::.
38 |           needs: website
39 | 
40 |       - name: Build site
41 |         run: pkgdown::build_site_github_pages(new_process = FALSE, install = FALSE)
42 |         shell: Rscript {0}
43 | 
44 |       - name: Deploy to GitHub pages 🚀
45 |         if: github.event_name != 'pull_request'
46 |         uses: JamesIves/github-pages-deploy-action@v4.5.0
47 |         with:
48 |           clean: false
49 |           branch: gh-pages
50 |           folder: docs
51 | 


--------------------------------------------------------------------------------
/src/core.h:
--------------------------------------------------------------------------------
 1 | #ifndef FARR_CORE_H
 2 | #define FARR_CORE_H
 3 | 
 4 | #include "common.h"
 5 | #include "defs.h"
 6 | 
 7 | /**********************************************************
 8 |  * Buffer
 9 |  ***********************************************************/
10 | 
11 | int system_buffer_size();
12 | 
13 | int set_buffer_size(int size);
14 | 
15 | int get_buffer_size();
16 | 
17 | /**********************************************************
18 |  * Load meta
19 |  ***********************************************************/
20 | 
21 | std::string correct_filebase(const std::string& filebase);
22 | 
23 | Rcpp::List FARR_meta(const std::string& filebase);
24 | 
25 | /**********************************************************
26 |  * Schedule reading
27 |  ***********************************************************/
28 | SEXP locationList(const SEXP listOrEnv, const Rcpp::NumericVector& dim, const int strict);
29 | 
30 | SEXP addCycle(SEXP x, SEXP ret, const R_xlen_t step = 1, const R_xlen_t mag = 1);
31 | 
32 | SEXP loc2idx(const Rcpp::List sliceIdx, const Rcpp::NumericVector& dim);
33 | 
34 | Rcpp::List schedule(const SEXP listOrEnv, 
35 |                     const Rcpp::NumericVector& dim,
36 |                     const Rcpp::NumericVector& cum_part_sizes,
37 |                     const int split_dim, const int strict = 1);
38 | 
39 | /**********************************************************
40 |  * Utils
41 |  ***********************************************************/
42 | 
43 | SEXP seq_len_int64(const R_xlen_t len);
44 | double prod_double(const Rcpp::NumericVector& x);
45 | 
46 | #endif // FARR_CORE_H


--------------------------------------------------------------------------------
/R/internal_misc.R:
--------------------------------------------------------------------------------
 1 | # misc internal functions that are byte-compiled
 2 | 
 3 | # move here to byte-compile
 4 | .max_mapper1 <- function(data, size, idx){
 5 |     if(!length(size)){
 6 |         return(numeric(0))
 7 |     }
 8 |     max(data, na.rm = TRUE)
 9 | }
10 | .max_mapper2 <- function(data, size, idx){
11 |     if(!length(size)){
12 |         return(numeric(0))
13 |     }
14 |     if(length(data) != size){
15 |         data <- data[seq_len(size)]
16 |     }
17 |     max(data, na.rm = FALSE)
18 | }
19 | .min_mapper1 <- function(data, size, idx){
20 |     if(!length(size)){
21 |         return(numeric(0))
22 |     }
23 |     min(data, na.rm = TRUE)
24 | }
25 | .min_mapper2 <- function(data, size, idx){
26 |     if(!length(size)){
27 |         return(numeric(0))
28 |     }
29 |     if(length(data) != size){
30 |         data <- data[seq_len(size)]
31 |     }
32 |     min(data, na.rm = FALSE)
33 | }
34 | .range_mapper1 <- function(data, size, idx){
35 |     if(!length(size)){
36 |         return(numeric(0))
37 |     }
38 |     range(data, na.rm = TRUE)
39 | }
40 | .range_mapper2 <- function(data, size, idx){
41 |     if(!length(size)){
42 |         return(numeric(0))
43 |     }
44 |     if(length(data) != size){
45 |         data <- data[seq_len(size)]
46 |     }
47 |     range(data, na.rm = FALSE)
48 | }
49 | 
50 | .sum_mapper1 <- function(data, size, idx){
51 |     if(!length(size)){
52 |         return(0)
53 |     }
54 |     sum(data, na.rm = TRUE)
55 | }
56 | .sum_mapper2 <- function(data, size, idx){
57 |     if(!length(size)){
58 |         return(0)
59 |     }
60 |     if(length(data) != size){
61 |         data <- data[seq_len(size)]
62 |     }
63 |     sum(data, na.rm = FALSE)
64 | }
65 | 


--------------------------------------------------------------------------------
/adhoc/performance.R:
--------------------------------------------------------------------------------
 1 | devtools::load_all()
 2 | 
 3 | # 800 MB data
 4 | dim3 <- 100
 5 | 
 6 | set.seed(1)
 7 | file <- tempfile()
 8 | unlink(file, recursive = TRUE)
 9 | 
10 | dim <- c(400, 500, dim3, 5)
11 | 
12 | arr <- filearray_create(file, dim, type = 'integer', partition_size = 2)
13 | x <- array(as.integer(seq_len(prod(dim))), dim)
14 | for(ii in 1:5){
15 |     x[1,1,1,ii] <- NA
16 | }
17 | 
18 | system.time({
19 |     arr$set_partition(1, x[,,,1:2], lock = FALSE)
20 |     arr$set_partition(2, x[,,,3:4], lock = FALSE)
21 |     arr$set_partition(3, x[,,,5],,,,1,lock = FALSE)
22 | })
23 | 
24 | basefile <- paste0(normalizePath(arr$.filebase), arr$.sep)
25 | 
26 | sample1 = function(x, ...){x}
27 | sample1 <- base::sample
28 | sample2 <- base::sample
29 | replace <- TRUE
30 | set.seed(1)
31 | locs <- list(
32 |     quote(x),
33 |     drop=F,
34 |     sample1(c(1:dim[[1]]), replace = replace),
35 |     sample1(c(1:dim[[2]]), replace = replace),
36 |     sample2(c(1:dim3), replace = replace),
37 |     sample2(c(1:5), replace = replace)
38 | )
39 | # get_buffer_size()
40 | # set_buffer_size(1048576)->bs; bs; log10(bs)
41 | # set_buffer_size(2^16)
42 | getThreads()
43 | setThreads(1)
44 | 
45 | system.time({
46 |     b <- do.call('[', locs)
47 | })
48 | 
49 | # system.time({
50 | #     a <- FARR_subset(filebase = basefile, 
51 | #                     listOrEnv = locs[-(1:2)], 
52 | #                     dim = dim, 
53 | #                     cum_part_sizes = 1:5, 
54 | #                     type = type_to_sexp('integer'), 
55 | #                     split_dim = 3)})
56 | 
57 | system.time({
58 |     a <- local({
59 |         x <- arr
60 |         do.call('[', locs)
61 |     })
62 | })
63 | 
64 | 
65 | identical(a, b)
66 | 


--------------------------------------------------------------------------------
/adhoc/performance-write.R:
--------------------------------------------------------------------------------
 1 | devtools::load_all()
 2 | set.seed(1)
 3 | dim <- c(500, 400, 100, 5)
 4 | file <- tempfile(); unlink(file, recursive = TRUE);
 5 | x <- filearray_create(file, dim, type = 'integer', partition_size = 1)
 6 | # na <- NA; storage.mode(na) <- 'integer'
 7 | na <- as.integer(NA)
 8 | v <- array(na, dim)
 9 | 
10 | filebase <- paste0(x$.filebase, x$.sep)
11 | pna <- local({
12 |     dim[[length(dim)]] <- x$partition_size()
13 |     rep(na, prod(dim))
14 | })
15 | reset <- function(){
16 |     system.time({
17 |         lapply(seq_len(ceiling(dim[[length(dim)]] / x$partition_size())), function(i){
18 |             dim[[length(dim)]] <- x$partition_size()
19 |             write_partition(
20 |                 file = x$partition_path(i),
21 |                 partition = i,
22 |                 dimension = dim,
23 |                 type = 'integer',
24 |                 value = pna
25 |             )
26 |         })
27 |     })
28 | }
29 | 
30 | sample <- function(x, ...){x}
31 | sample <- base::sample
32 | 
33 | listOrEnv <- function(seed){
34 |     set.seed(seed)
35 |     lapply(dim, function(d){
36 |         sample(seq_len(d), size = sample(seq_len(d), size = 1), replace = FALSE)
37 |     })
38 | }
39 | 
40 | 
41 | seed = 1
42 | locs <- listOrEnv(seed)
43 | vals <- 1:prod(sapply(locs, length))
44 | system.time({
45 |     b <- local({
46 |         locs <- listOrEnv(seed)
47 |         v[locs[[1]], locs[[2]], locs[[3]], locs[[4]]] <- as.integer(vals)
48 |         v
49 |     })
50 | })
51 | 
52 | locs <- listOrEnv(seed)
53 | reset(); gc()
54 | system.time({
55 |     setThreads(5)
56 |     locs <- listOrEnv(seed)
57 |     x[locs[[1]], locs[[2]], locs[[3]], locs[[4]]] <- as.double(vals)
58 | }, gcFirst = TRUE)
59 | system.time({
60 |     x[,,,]
61 | })
62 | 
63 | 
64 | identical(x[], b)
65 | 


--------------------------------------------------------------------------------
/adhoc/debug.R:
--------------------------------------------------------------------------------
 1 | f <- function(seed){
 2 |     require(testthat)
 3 |     require(bit64)
 4 |     bsz <- get_buffer_size()
 5 |     on.exit({
 6 |         set_buffer_size(bsz)
 7 |         max_buffer_size(2097152)
 8 |     })
 9 |     set_buffer_size(16L)
10 |     max_buffer_size(64L)
11 |     
12 |     set.seed(seed)
13 |     file <- tempfile()
14 |     unlink(file, recursive = TRUE)
15 |     dim <- 33:35
16 |     x <- filearray_create(file, dim, partition_size = 1, type = "double")
17 |     
18 |     expect_equal(x[[2]], x$.na)
19 |     
20 |     y <- array(x$.na, dim)
21 |     tmp <- 1:(prod(dim))
22 |     x[] <- tmp
23 |     y[] <- tmp
24 |     
25 |     eps <- 10^(ceiling(log10(max(abs(y)))) - 7)
26 |     # y[] <- x[]
27 |     locs <-
28 |         lapply(dim, function(d) {
29 |             d1 <- sample(c(1,3:d), 10, replace = TRUE )
30 |             d2 <- NULL#c(NA, NA)
31 |             sort(as.double(sample(c(d1,d2))), decreasing = TRUE)
32 |         })
33 |     assign('locs', locs, envir = globalenv())
34 |     
35 |     a <- x[locs[[1]], locs[[2]], locs[[3]]]
36 |     b <- y[locs[[1]], locs[[2]], locs[[3]]]
37 |     assign('x', x, envir = globalenv())
38 |     assign('y', y, envir = globalenv())
39 |     eps <- 1e-6
40 |     sel <- !is.na(a-b) & (a-b) > eps
41 |     
42 |     assign('sel', sel, envir = globalenv())
43 |     assign('a', a, envir = globalenv())
44 |     assign('b', b, envir = globalenv())
45 |     
46 |     expect_equal(is.na(a), is.na(b))
47 |     
48 |     
49 |     if(length(a[sel])){
50 |         # fail the test
51 |         print(a[sel])
52 |         print(b[sel])
53 |         expect_length(object = (a-b)[sel], n = 0)
54 |         stop()
55 |     }
56 |     
57 |     unlink(file, recursive = TRUE)
58 | }
59 | # setThreads(1)
60 | for(i in 1:1000){
61 |     print(i)
62 |     f(i)
63 | }
64 | 
65 | 


--------------------------------------------------------------------------------
/tests/testthat/test-method_subsetAssign.R:
--------------------------------------------------------------------------------
 1 | test_that("subset-assign filearray-proxy", {
 2 |     
 3 |     # normal indexing
 4 |     x0 <- array(1:120, dim = c(10,12))
 5 |     dimnames(x0) <- list(A = 1:10, B = 1:12)
 6 |     x <- as_filearray(x0)
 7 |     y <- x + 1L
 8 |     y[,1] <- rep(0, 10)
 9 |     y[1,] <- rep(0, 12)
10 |     z <- y + y
11 |     
12 |     idx1 <- c(3,4,5,2,1)
13 |     idx2 <- idx1 * 2
14 |     
15 |     expect_equal(x[idx1, idx2], x0[idx1, idx2])
16 |     expect_equal(y[1,1:12, dimnames = NULL], rep(0L, 12))
17 |     expect_equal(y[1:10,1, dimnames = NULL], rep(0L, 10))
18 |     expect_equal(y[2:10,2:10], x0[2:10,2:10] + 1L)
19 |     expect_equal(z[, idx2], 2 * y[, idx2])
20 |     
21 |     expect_equal(y[2:10, 2:12], x[2:10, 2:12] + 1)
22 |     expect_equal(z[], 2 * y[])
23 |     
24 |     clear_cache()
25 |     
26 |     # Array indexing
27 |     idx1 <- c(3,4,5,2,1)
28 |     idx2 <- idx1 * 2
29 |     
30 |     i <- array(FALSE, dim(x))
31 |     i[idx1, idx2] <- TRUE
32 |     j <- as_filearray(i)
33 |     expect_equal(typeof(j), "logical")
34 |     
35 |     x0 <- array(1:120, dim = c(10,12))
36 |     dimnames(x0) <- list(A = 1:10, B = 1:12)
37 |     x <- as_filearray(x0)
38 |     y <- x + 1L
39 |     y[j,lazy = TRUE] <- 0
40 |     z <- y + y
41 |     z[j, lazy = TRUE] <- 1
42 |     
43 |     expect_equal(y$.filebase, x$.filebase)
44 |     expect_equal(z$.filebase, x$.filebase)
45 |     
46 |     # mature z
47 |     expect_equal(z[j], rep(1, sum(j)))
48 |     expect_equal(y[j], rep(0, sum(j)))
49 |     
50 |     expect_equal(x[!i], x0[!i])
51 |     expect_equal(y[!i], x[!i] + 1L)
52 |     expect_equal(z[!i], y[!i] * 2)
53 |     
54 |     expect_equal(x[!j], x0[!i])
55 |     expect_equal(y[!j], x[!i] + 1L)
56 |     expect_equal(z[!j], y[!i] * 2)
57 |     
58 |     clear_cache()
59 | })
60 | 


--------------------------------------------------------------------------------
/adhoc/map-test.R:
--------------------------------------------------------------------------------
 1 | require(filearray)
 2 | dim <- c(100,100,10,40)
 3 | set.seed(1); 
 4 | tmp <- seq_len(1e5)
 5 | xs <- lapply(1:2, function(i){
 6 |     file <- tempfile(); unlink(file, recursive = TRUE)
 7 |     x <- filearray_create(file, dim, type = 'double')
 8 |     for(i in 1:40){
 9 |         x[,,,i] <- tmp
10 |     }
11 |     x
12 | })
13 | set.seed(2); file <- tempfile(); unlink(file, recursive = TRUE)
14 | y <- filearray_create(file, c(100,10,40), type = 'double')
15 | y$initialize_partition()
16 | 
17 | 
18 | system.time({
19 |     z <- xs[[1]]$collapse(keep = c(2,3,4), method = "sum")
20 | })
21 | system.time({
22 |     fmap(xs, function(x, a){
23 |         # a <<- c(a, Sys.time() - now)
24 |         z <- x[[1]] + x[[2]]
25 |         dim(z) <- c(100,100,10)
26 |         z <- dipsaus::collapse(z, keep = c(2,3))
27 |         as.list(z + a)
28 |     }, .y = y, .input_size = 100000, .output_size = 1000,
29 |     a = 2
30 |     )
31 | })
32 | range(y[] - z*2)
33 | 
34 | # aa <- c()
35 | system.time({
36 |     now <- Sys.time()
37 |         # aa <<- c(aa, Sys.time() - now)
38 |         y[] <- xs[[1]][] + xs[[2]][]
39 | })
40 | 
41 | # filearray:::setThreads(1)
42 | filearray:::set_buffer_size(2097152)
43 | filearray:::set_buffer_size(8000000)
44 | filearray:::get_buffer_size()
45 | filearray_threads(8)
46 | env <- new.env(parent = emptyenv())
47 | env$a <- matrix(NA_real_, nrow = 4, ncol = ceiling(length(y) / filearray:::get_buffer_size() * 8))
48 | env$count = 1;
49 | system.time({
50 |     now <- Sys.time()
51 |     fmap_element_wise(xs, function(input) {
52 |         input[[1]] + input[[2]]
53 |     }, y
54 |     , profile = function(){
55 |         env$a[[env$count]] <- Sys.time() - now
56 |         now <<- Sys.time()
57 |         env$count <- env$count + 1
58 |     }
59 |     )
60 | })
61 | b <- t(env$a)
62 | colSums(b, na.rm = TRUE)
63 | summary(b)
64 | 
65 | range(y[] - xs[[1]][] - xs[[2]][])
66 | 


--------------------------------------------------------------------------------
/adhoc/gctorture.R:
--------------------------------------------------------------------------------
 1 | devtools::load_all()
 2 | set.seed(1)
 3 | dim <- c(5, 4, 2, 5)
 4 | file <- tempfile(); unlink(file, recursive = TRUE);
 5 | x <- filearray_create(file, dim, type = 'integer', partition_size = 1)
 6 | # na <- NA; storage.mode(na) <- 'integer'
 7 | na <- as.integer(NA)
 8 | v <- array(na, dim)
 9 | 
10 | filebase <- paste0(x$.filebase, x$.sep)
11 | pna <- local({
12 |     dim[[length(dim)]] <- x$partition_size()
13 |     rep(na, prod(dim))
14 | })
15 | reset <- function(){
16 |     system.time({
17 |         lapply(seq_len(ceiling(dim[[length(dim)]] / x$partition_size())), function(i){
18 |             dim[[length(dim)]] <- x$partition_size()
19 |             write_partition(
20 |                 file = x$partition_path(i),
21 |                 partition = i,
22 |                 dimension = dim,
23 |                 type = 'integer',
24 |                 value = pna
25 |             )
26 |         })
27 |     })
28 | }
29 | 
30 | sample <- function(x, ...){x}
31 | sample <- base::sample
32 | 
33 | listOrEnv <- function(seed){
34 |     set.seed(seed)
35 |     lapply(dim, function(d){
36 |         sample(seq_len(d), size = sample(seq_len(d), size = 1), replace = FALSE)
37 |     })
38 | }
39 | 
40 | 
41 | seed = 1
42 | locs <- listOrEnv(seed)
43 | vals <- 1:prod(sapply(locs, length))
44 | system.time({
45 |     b <- local({
46 |         locs <- listOrEnv(seed)
47 |         v[locs[[1]], locs[[2]], locs[[3]], locs[[4]]] <- as.integer(vals)
48 |         v
49 |     })
50 | })
51 | 
52 | locs <- listOrEnv(seed)
53 | reset(); gc()
54 | system.time({
55 |     setThreads(5)
56 |     locs <- listOrEnv(seed)
57 |     gctorture2(1)
58 |     x[locs[[1]], locs[[2]], locs[[3]], locs[[4]]] <- as.double(vals)
59 |     gctorture2(0)
60 | }, gcFirst = TRUE)
61 | system.time({
62 |     gctorture2(1)
63 |     tmp <- x[,,,]
64 |     gctorture2(0)
65 |     identical(tmp, b)
66 | })
67 | 
68 | 
69 | identical(x[], b)
70 | 


--------------------------------------------------------------------------------
/.github/workflows/R-CMD-check.yaml:
--------------------------------------------------------------------------------
 1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples
 2 | # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
 3 | #
 4 | # NOTE: This workflow is overkill for most R packages and
 5 | # check-standard.yaml is likely a better choice.
 6 | # usethis::use_github_action("check-standard") will install it.
 7 | on:
 8 |   push:
 9 |     branches: [main, master]
10 |   pull_request:
11 |     branches: [main, master]
12 | 
13 | name: R-CMD-check
14 | 
15 | permissions: read-all
16 | 
17 | jobs:
18 |   R-CMD-check:
19 |     runs-on: ${{ matrix.config.os }}
20 | 
21 |     name: ${{ matrix.config.os }} (${{ matrix.config.r }})
22 | 
23 |     strategy:
24 |       fail-fast: false
25 |       matrix:
26 |         config:
27 |           - {os: macos-latest,   r: 'release'}
28 | 
29 |           - {os: windows-latest, r: 'release'}
30 |           # use 4.1 to check with rtools40's older compiler
31 |           - {os: windows-latest, r: '4.1'}
32 | 
33 |           - {os: ubuntu-latest,  r: 'devel', http-user-agent: 'release'}
34 |           - {os: ubuntu-latest,  r: 'release'}
35 |           - {os: ubuntu-latest,  r: 'oldrel-1'}
36 |           - {os: ubuntu-latest,  r: 'oldrel-2'}
37 |           - {os: ubuntu-latest,  r: 'oldrel-3'}
38 | 
39 |     env:
40 |       GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
41 |       R_KEEP_PKG_SOURCE: yes
42 | 
43 |     steps:
44 |       - uses: actions/checkout@v4
45 | 
46 |       - uses: r-lib/actions/setup-pandoc@v2
47 | 
48 |       - uses: r-lib/actions/setup-r@v2
49 |         with:
50 |           r-version: ${{ matrix.config.r }}
51 |           http-user-agent: ${{ matrix.config.http-user-agent }}
52 |           use-public-rspm: true
53 | 
54 |       - uses: r-lib/actions/setup-r-dependencies@v2
55 |         with:
56 |           extra-packages: any::rcmdcheck
57 |           needs: check
58 | 
59 |       - uses: r-lib/actions/check-r-package@v2
60 |         with:
61 |           upload-snapshots: true
62 |           build_args: 'c("--no-manual","--compact-vignettes=gs+qpdf")'
63 | 


--------------------------------------------------------------------------------
/inst/include/TinyParallel/Common.h:
--------------------------------------------------------------------------------
 1 | #ifndef __FILEARRAY_PARALLEL_COMMON__
 2 | #define __FILEARRAY_PARALLEL_COMMON__
 3 | 
 4 | #include <cerrno>
 5 | #include <cstddef>
 6 | #include <cstdlib>
 7 | 
 8 | namespace TinyParallel {
 9 | 
10 | template <typename T, typename U>
11 | inline int resolveValue(const char* envvar,
12 |                         T requestedValue,
13 |                         U defaultValue)
14 | {
15 |    // if the requested value is non-zero and not the default, we can use it
16 |    if (requestedValue != defaultValue && requestedValue > 0)
17 |       return requestedValue;
18 |    
19 |    // otherwise, try reading the default from associated envvar
20 |    // if the environment variable is unset, use the default
21 |    const char* var = getenv(envvar);
22 |    if (var == NULL)
23 |       return defaultValue;
24 |    
25 |    // try to convert the string to a number
26 |    // if an error occurs during conversion, just use default
27 |    errno = 0;
28 |    char* end;
29 |    long value = strtol(var, &end, 10);
30 |    
31 |    // check for conversion failure
32 |    if (end == var || *end != '\0' || errno == ERANGE)
33 |       return defaultValue;
34 |    
35 |    // okay, return the parsed environment variable value   
36 |    return value;
37 | }
38 | 
39 | // Work executed within a background thread. We implement dynamic
40 | // dispatch using vtables so we can have a stable type to cast
41 | // to from the void* passed to the worker thread (required because
42 | // the tinythreads interface allows to pass only a void* to the
43 | // thread main rather than a generic type / template)
44 | 
45 | struct Worker 
46 | {  
47 |    // construct and destruct (delete virtually)
48 |    Worker() {}
49 |    virtual ~Worker() {}
50 |    
51 |    // dispatch work over a range of values
52 |    virtual void operator()(std::size_t begin, std::size_t end) = 0;   
53 |    
54 |    // disable copying and assignment
55 | private:
56 |    Worker(const Worker&);
57 |    void operator=(const Worker&);
58 | };
59 | 
60 | // Tag type used for disambiguating splitting constructors
61 | 
62 | struct Split {};
63 | 
64 | } // namespace TinyParallel
65 | 
66 | 
67 | #endif // __FILEARRAY_PARALLEL_COMMON__
68 | 


--------------------------------------------------------------------------------
/tests/testthat/test-as_filearray.R:
--------------------------------------------------------------------------------
 1 | test_that("as_filearray", {
 2 |     options("filearray.operator.precision" = "float")
 3 |     on.exit({
 4 |         options("filearray.operator.precision" = NULL)
 5 |     }, add = TRUE)
 6 |     
 7 |     x <- rnorm(24)
 8 |     dim(x) <- c(2, 3, 4)
 9 |     
10 |     # as_filearray.default
11 |     y <- as_filearray(x)
12 |     testthat::expect_equal(y$type(), getOption("filearray.operator.precision"))
13 |     testthat::expect_equal(y[], x, tolerance = 1e-5, ignore_attr = TRUE)
14 |     testthat::expect_equal(y$.mode, "readwrite")
15 |     
16 |     # as_filearray.character
17 |     y <- as_filearray(y$.filebase)
18 |     testthat::expect_equal(y[dimnames = FALSE], x, tolerance = 1e-5, ignore_attr = TRUE)
19 |     testthat::expect_equal(y$.mode, "readonly")
20 |     
21 |     # as_filearray.FileArray(Proxy)
22 |     z <- as_filearray(y)
23 |     testthat::expect_identical(z, y)
24 |     
25 |     z <- as_filearray(as_filearrayproxy(y))
26 |     testthat::expect_equal(typeof(z), getOption("filearray.operator.precision"))
27 |     testthat::expect_equal(z[], x, tolerance = 1e-5, ignore_attr = TRUE)
28 |     
29 |     y$.mode <- "readwrite"
30 |     y$delete()
31 |     
32 |     testthat::expect_false(z$valid())
33 |     
34 | })
35 | 
36 | 
37 | test_that("as_filearrayproxy", {
38 |     options("filearray.operator.precision" = "float")
39 |     on.exit({
40 |         options("filearray.operator.precision" = NULL)
41 |     }, add = TRUE)
42 |     
43 |     x <- rnorm(24)
44 |     dim(x) <- c(2, 3, 4)
45 |     dimnames(x) <- list(A = 1:2, B = 1:3, C = letters[1:4])
46 |     
47 |     # as_filearrayproxy.default
48 |     y <- as_filearrayproxy(x)
49 |     testthat::expect_equal(typeof(y), getOption("filearray.operator.precision"))
50 |     testthat::expect_equal(y[], x, tolerance = 1e-5)
51 |     testthat::expect_equal(y$.mode, "readwrite")
52 |     
53 |     # as_filearrayproxy.character
54 |     y <- as_filearrayproxy(y$.filebase)
55 |     testthat::expect_equal(y[], x, tolerance = 1e-5)
56 |     testthat::expect_equal(y$.mode, "readwrite")
57 |     
58 |     # as_filearrayproxy.FileArrayProxy
59 |     z <- as_filearrayproxy(y)
60 |     testthat::expect_false(identical(z, y))
61 |     testthat::expect_equal(y[], z[])
62 |     
63 |     y$delete()
64 |     
65 |     testthat::expect_false(z$valid())
66 |     
67 | })
68 | 


--------------------------------------------------------------------------------
/man/FileArray-class.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/class-filearray.R
 3 | \name{FileArray-class}
 4 | \alias{FileArray-class}
 5 | \title{Definition of file array}
 6 | \description{
 7 | \code{S4} class definition of \code{FileArray}. Please 
 8 | use \code{\link{filearray_create}} and \code{\link{filearray_load}}
 9 | to create instances.
10 | }
11 | \section{Public Methods}{
12 | 
13 | \describe{
14 | \item{\code{get_header(key, default = NULL)}}{Get header information; returns \code{default} if \code{key} is missing}
15 | \item{\code{set_header(key, value)}}{Set header information; the extra headers will be stored in meta file. Please do not store large headers as they will be loaded into memory frequently.}
16 | \item{\code{can_write()}}{Whether the array data can be altered}
17 | \item{\code{create(filebase, dimension, type = "double", partition_size = 1)}}{Create a file array instance}
18 | \item{\code{delete(force = FALSE)}}{Remove array from local file system and reset}
19 | \item{\code{dimension()}}{Get dimension vector}
20 | \item{\code{dimnames(v)}}{Set/get dimension names}
21 | \item{\code{element_size()}}{Internal storage: bytes per element}
22 | \item{\code{fill_partition(part, value)}}{Fill a partition with given scalar}
23 | \item{\code{get_partition(part, reshape = NULL)}}{Get partition data, and reshape (if not null) to desired dimension}
24 | \item{\code{expand(n)}}{Expand array along the last margin; returns true if expanded; if the \code{dimnames} have been assigned prior to expansion, the last dimension names will be filled with \code{NA}}
25 | \item{\code{initialize_partition()}}{Make sure a partition file exists; if not, create one and fill with \code{NA}s or 0 (\code{type='raw'})}
26 | \item{\code{load(filebase, mode = c("readwrite", "readonly"))}}{Load file array from existing directory}
27 | \item{\code{partition_path(part)}}{Get partition file path}
28 | \item{\code{partition_size()}}{Get partition size; see \code{\link{filearray}}}
29 | \item{\code{set_partition(part, value, ..., strict = TRUE)}}{Set partition value}
30 | \item{\code{sexp_type()}}{Get data \code{SEXP} type; see R internal manuals}
31 | \item{\code{show()}}{Print information}
32 | \item{\code{type()}}{Get data type}
33 | \item{\code{valid()}}{Check if the array is valid.}
34 | }
35 | }
36 | 
37 | \seealso{
38 | \code{\link{filearray}}
39 | }
40 | 


--------------------------------------------------------------------------------
/man/fwhich.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/methods.R
 3 | \name{fwhich}
 4 | \alias{fwhich}
 5 | \alias{fwhich.default}
 6 | \alias{fwhich.FileArray}
 7 | \title{A generic function of \code{which} that is \code{'FileArray'} compatible}
 8 | \usage{
 9 | fwhich(x, val, arr.ind = FALSE, ret.values = FALSE, ...)
10 | 
11 | \method{fwhich}{default}(x, val, arr.ind = FALSE, ret.values = FALSE, ...)
12 | 
13 | \method{fwhich}{FileArray}(x, val, arr.ind = FALSE, ret.values = FALSE, ...)
14 | }
15 | \arguments{
16 | \item{x}{any R vector, matrix, array or file-array}
17 | 
18 | \item{val}{values to find, or a function taking one argument (a slice of 
19 | data vector) and returns either logical vector with the same length as the 
20 | slice or index of the slice; see 'Examples'}
21 | 
22 | \item{arr.ind}{logical; should array indices be 
23 | returned when \code{x} is an array?}
24 | 
25 | \item{ret.values}{whether to return the values of corresponding indices as 
26 | an attributes; default is false}
27 | 
28 | \item{...}{passed to \code{val} if \code{val} is a function}
29 | }
30 | \value{
31 | The indices of \code{x} elements that are listed in \code{val}.
32 | }
33 | \description{
34 | A generic function of \code{which} that is \code{'FileArray'} compatible
35 | }
36 | \examples{
37 | 
38 | 
39 | # ---- Default case ------------------------------------
40 | x <- array(1:27 + 2, rep(3,3))
41 | 
42 | # find index of `x` equal to either 4 or 5
43 | fwhich(x, c(4,5))
44 | res <- fwhich(x, c(4,5), ret.values = TRUE)
45 | res
46 | attr(res, "values")
47 | 
48 | # ---- file-array case --------------------------------
49 | arr <- filearray_create(tempfile(), dim(x))
50 | arr[] <- x
51 | fwhich(arr, c(4,5))
52 | fwhich(arr, c(4,5), arr.ind = TRUE, ret.values = TRUE)
53 | 
54 | arr[2:3, 1, 1]
55 | 
56 | # Clean up this example
57 | arr$delete()
58 | 
59 | # ---- `val` is a function ----------------------------
60 | x <- as_filearray(c(sample(15), 15), dimension = c(4,4))
61 | 
62 | ret <- fwhich(x, val = which.max, 
63 |               ret.values = TRUE, arr.ind = FALSE)
64 | 
65 | # ret is the index
66 | ret == which.max(x[])
67 | 
68 | # attr(ret, "values") is the max value
69 | max(x[]) == attr(ret, "values")
70 | 
71 | # customize `val`
72 | fwhich(x, ret.values = TRUE, arr.ind = FALSE,
73 |        val = function( slice ) {
74 |            slice > 10 # or which(slice > 10)
75 |        })
76 | 
77 | 
78 | }
79 | 


--------------------------------------------------------------------------------
/R/write.R:
--------------------------------------------------------------------------------
 1 | 
 2 | ensure_partition <- function(
 3 |     file, partition, dimension, 
 4 |     type = c("double","integer","logical","raw"), size = NULL){
 5 |     
 6 |     type <- match.arg(type)
 7 |     if(is.null(size)){
 8 |         size <- get_elem_size(type)
 9 |     } else {
10 |         size <- as.integer(size)
11 |     }
12 |     
13 |     if( !file.exists(file) ){
14 |         fid <- file(description = file, open = "w+b")
15 |         write_header(fid, partition, dimension, type, size)
16 |         close(fid)
17 |     }
18 |     
19 |     header <- validate_header(file)
20 |     
21 |     expected_type <- sexp_to_type(header$sexp_type)
22 |     
23 |     if( type != expected_type ){
24 |         stop(sprintf("Partition data type mismatch: %s != %s", expected_type, type))
25 |     }
26 |     
27 |     if( header$partition != partition ){
28 |         quiet_warning(sprintf("Partition number mismatch: %s != %s", header$partition, partition))
29 |     }
30 |     if( prod(dimension) != header$partition_size ){
31 |         quiet_warning(sprintf("Partition size mismatch: %s != %s", header$partition_size, prod(dimension)))
32 |     }
33 |     
34 |     return(header)
35 |     
36 | }
37 | 
38 | sexp_to_type <- function(sexp){
39 |     switch(
40 |         as.character(sexp),
41 |         '14' = 'double',
42 |         '13' = 'integer',
43 |         '10' = 'logical',
44 |         '24' = 'raw',
45 |         '15' = 'complex',
46 |         '26' = 'float',
47 |         stop("Unknown SEXP code: ", sexp)
48 |     )
49 | }
50 | 
51 | type_to_sexp <- function(type){
52 |     switch(
53 |         type,
54 |         double = 14L,
55 |         integer = 13L,
56 |         logical = 10L,
57 |         raw = 24L,
58 |         complex = 15L,
59 |         float = 26L,
60 |         stop("Unknown data type: ", type)
61 |     )
62 | }
63 | 
64 | load_partition <- function(file, dim){
65 |     stopifnot(file.exists(file))
66 |     fid <- file(description = file, open = "rb")
67 |     on.exit({
68 |         close(fid)
69 |     })
70 |     header <- validate_header(fid = fid)
71 |     type <- sexp_to_type(header$sexp_type)
72 |     
73 |     if( missing(dim) ){
74 |         dim <- header$partition_dim
75 |     } else {
76 |         stopifnot(prod(header$partition_dim) == prod(dim))
77 |     }
78 |     
79 |     structure(
80 |         readBin(con = fid, what = type, size = header$unit_bytes, 
81 |                 n = header$content_length, endian = header$endianness),
82 |         dim = dim
83 |     )
84 | }
85 | 


--------------------------------------------------------------------------------
/tests/testthat/test-methods_simple.R:
--------------------------------------------------------------------------------
 1 | test_that("Simple operators", {
 2 |   
 3 |     # logical
 4 |     x <- as_filearrayproxy(1:24, dimension = c(4,6), type = "double")
 5 |     x[3:4,3] <- rep(NA, 2)
 6 |     y <- !is.na(x)
 7 |     testthat::expect_s4_class(y, "FileArrayProxy")
 8 |     testthat::expect_equal(y[], !is.na(x[]))
 9 |     
10 |     # math to double
11 |     x0 <- x[]
12 |     suppressWarnings({
13 |         for(op in FILEARRAY_SIMPLE_OPS$math) {
14 |             y <- do.call(op, list(x))
15 |             testthat::expect_s4_class(y, "FileArrayProxy")
16 |             testthat::expect_equal(y[], do.call(op, list(x0)))
17 |         }
18 |     })
19 |     
20 |     # math to complex
21 |     x0 <- array(rnorm(20) + rnorm(20) * 1i, c(5,4))
22 |     x <- as_filearrayproxy(x0)
23 |     suppressWarnings({
24 |         for(op in FILEARRAY_SIMPLE_OPS$math) {
25 |             valid_op <- tryCatch({
26 |                 y0 <- do.call(op, list(x0))
27 |                 TRUE
28 |             }, error = function(e) {
29 |                 FALSE
30 |             })
31 |             
32 |             if( valid_op ) {
33 |                 y <- do.call(op, list(x))
34 |                 testthat::expect_s4_class(y, "FileArrayProxy")
35 |                 dif <- y[dimnames = NULL] - y0
36 |                 mag <- Mod(y[dimnames = NULL] + y0)
37 |                 mag[mag < 1] <- 1
38 |                 dif <- dif / Mod(mag)
39 |                 testthat::expect_lt(max(abs(Re(dif))), 1e-5, label = sprintf("Operator `%s` on complex", op))
40 |                 testthat::expect_lt(max(abs(Im(dif))), 1e-5, label = sprintf("Operator `%s` on complex", op))
41 |             } else {
42 |                 testthat::expect_error(do.call(op, list(x)), label = sprintf("Operator `%s` on complex should fail, but", op))
43 |             }
44 |             
45 |             
46 |         }
47 |     })
48 | 
49 |     # complex
50 |     for(op in FILEARRAY_SIMPLE_OPS$complex) {
51 |         y0 <- do.call(op, list(x0))
52 |         y <- do.call(op, list(x))
53 |         testthat::expect_s4_class(y, "FileArrayProxy")
54 |         dif <- y[dimnames = NULL] - y0
55 |         mag <- Mod(y[dimnames = NULL] + y0)
56 |         mag[mag < 1] <- 1
57 |         dif <- dif / Mod(mag)
58 |         testthat::expect_lt(max(abs(Re(dif))), 1e-5, label = sprintf("Operator `%s` on complex", op))
59 |         testthat::expect_lt(max(abs(Im(dif))), 1e-5, label = sprintf("Operator `%s` on complex", op))
60 |     }
61 |     
62 | })
63 | 


--------------------------------------------------------------------------------
/tests/testthat/test-methods.R:
--------------------------------------------------------------------------------
 1 | test_that("R: FileArray-class", {
 2 |     set.seed(2)
 3 |     file <- tempfile()
 4 |     unlink(file, recursive = TRUE)
 5 |     dim <- 3:5
 6 |     x <- filearray_create(file, dim, partition_size = 3, initialize = FALSE)
 7 |     
 8 |     expect_equal(x$dimension(), dim)
 9 |     expect_true(x$can_write())
10 |     expect_equal(x$type(), 'double')
11 |     expect_equal(x$element_size(), get_elem_size(x$type()))
12 |     expect_error(x$fill_partition(-1, 1))
13 |     expect_error(x$fill_partition(NA, 1))
14 |     expect_error(x$fill_partition(0, 1))
15 |     expect_warning(x$fill_partition(1, 1:10))
16 |     x$fill_partition(2, 2)
17 |     expect_equal(x[[60]], 2)
18 |     
19 |     unlink(file, recursive = TRUE)
20 |     x <- filearray_create(file, dim, partition_size = 1, initialize = FALSE)
21 |     expect_equal(x$.mode, "readwrite")
22 |     x$fill_partition(1, 2)
23 |     
24 |     x <- filearray_load(file, mode = 'readonly')
25 |     expect_equal(x$.mode, "readonly")
26 |     expect_error(x$fill_partition(1, 1))
27 |     
28 |     x$initialize_partition(1:2)
29 |     expect_equal(x[1,1,1:2], c(2,NA))
30 |     expect_equal(file.exists(x$partition_path(1:3)), c(TRUE, TRUE, FALSE))
31 |     expect_equal(x$.mode, "readonly")
32 |     
33 |     expect_error(x[] <- 1:60)
34 |     
35 |     unlink(file, recursive = TRUE)
36 |     # not removed by $delete
37 |     expect_true(x$.valid)
38 |     expect_false(x$valid())
39 |     expect_error(x[])
40 |     expect_error(as.array(x))
41 |     expect_error(x[[1]])
42 |     expect_error(mapreduce(x, I))
43 |     expect_error(typeof(x))
44 |     expect_error(max(x))
45 |     
46 |     x$delete()
47 |     expect_false(x$.valid)
48 |     
49 |     unlink(file, recursive = TRUE)
50 | })
51 | 
52 | 
53 | test_that("R: S3 methods", {
54 |     set.seed(3)
55 |     file <- tempfile()
56 |     unlink(file, recursive = TRUE)
57 |     dim <- 3:5
58 |     x <- filearray_create(file, dim, partition_size = 3, initialize = FALSE)
59 |     expect_equal(file.exists(x$partition_path(1:3)), c(FALSE, FALSE, FALSE))
60 |     x[,,4] <- 1:12
61 |     expect_equal(as.integer(x[,,4]), 1:12)
62 |     expect_equal(file.exists(x$partition_path(1:3)), c(FALSE, TRUE, FALSE))
63 |     
64 |     expect_equal(typeof(x), 'double')
65 |     expect_equal(range(x, na.rm = TRUE), as.double(c(1, 12)))
66 |     expect_equal(file.exists(x$partition_path(1:3)), c(FALSE, TRUE, FALSE))
67 |     expect_equal(range(x), as.double(c(NA, NA)))
68 |     expect_equal(file.exists(x$partition_path(1:3)), c(TRUE, TRUE, FALSE))
69 |     
70 |     unlink(file, recursive = TRUE)
71 | })
72 | 


--------------------------------------------------------------------------------
/R/aaa.R:
--------------------------------------------------------------------------------
 1 | #' @importFrom methods new
 2 | #' @importFrom methods signature
 3 | #' @importFrom methods setGeneric
 4 | #' @importFrom methods setRefClass
 5 | #' @importFrom methods setMethod
 6 | NULL
 7 | HEADER_SIZE <- 1024
 8 | FILE_VER <- c( 1L, 1L, 0L )
 9 | HEADER_VER <- 1L
10 | RESERVED_HEADERS <- c("endianness", "version", "sexp_type", 
11 |                       "unit_bytes", "partition", "partition_size", 
12 |                       "partition_dim", "header_bytes", "header_version",
13 |                       "dimnames", "content_length")
14 | 
15 | # The saved files are always little endian
16 | ENDIANNESS <- "little"
17 | 
18 | max_buffer_size <- local({
19 |     # By default, maximum of 2MB buffer size
20 |     size <- 2097152
21 |     function(v){
22 |         if(!missing(v)){
23 |             if(v < 64){
24 |                 stop("Maximum buffer size is too small.")
25 |             }
26 |             v <- 2^ceiling(log2(v))
27 |             if(v > 2^30){
28 |                 stop("Maximum buffer size is too large.")
29 |             }
30 |             size <<- v
31 |         }
32 |         return(size)
33 |     }
34 | })
35 | 
36 | quiet_warning <- function(..., call. = FALSE){
37 |     if(!getOption("filearray.quiet", FALSE)){
38 |         warning(..., '\n\n* To suppress this message, set `options("filearray.quiet" = TRUE)`', call. = call.)
39 |     }
40 | }
41 | 
42 | get_os <- function(){
43 |     if("windows" %in% tolower(.Platform$OS.type)){
44 |         return("windows")
45 |     }
46 |     os <- tolower(R.version$os)
47 |     if(startsWith(os, "darwin")){
48 |         return('darwin')
49 |     }
50 |     if(startsWith(os, "linux")){
51 |         return('linux')
52 |     }
53 |     if(startsWith(os, "solaris")){
54 |         return('solaris')
55 |     }
56 |     if(startsWith(os, "win")){
57 |         return('windows')
58 |     }
59 |     return('unknown')
60 | }
61 | 
62 | 
63 | deparse1 <- function (expr, collapse = " ") {
64 |     paste(deparse(expr), collapse = collapse)
65 | }
66 | 
67 | temp_dir <- function(check = FALSE) {
68 |     re <- file.path(getOption("filearray.temporary.path", tempdir()), "_filearray_tempdir")
69 |     if(check && !dir.exists(re)) {
70 |         dir.create(re, showWarnings = FALSE, recursive = TRUE)
71 |     }
72 |     re
73 | }
74 | 
75 | temp_path <- function(pattern = "tmpfilearray", fileext = ".farr", check = FALSE) {
76 |     tempfile(pattern = pattern, tmpdir = temp_dir(check = check), fileext = fileext)
77 | }
78 | 
79 | clear_cache <- function() {
80 |     tdir <- temp_dir()
81 |     if(dir.exists(tdir)) {
82 |         unlink(tdir, recursive = TRUE)
83 |     }
84 | }
85 | 


--------------------------------------------------------------------------------
/adhoc/profiling2.R:
--------------------------------------------------------------------------------
 1 | library(filearray)
 2 | set.seed(1)
 3 | file <- tempfile(); unlink(file, recursive = TRUE)
 4 | dim <- c(100, 100, 50, 100)
 5 | x <- filearray_create(file, dim, type = "complex")
 6 | xlen <- length(x)
 7 | y <- rnorm(xlen) + 1i * rnorm(xlen)
 8 | 
 9 | x$initialize_partition()
10 | 
11 | # converting 800MB to 400MB and write
12 | system.time({
13 |     x[] <- y
14 | })
15 | #   user  system elapsed 
16 | #  0.268   0.770   0.368
17 | 
18 | y <- x[]
19 | 
20 | # Load 400MB, then convert to 800MB
21 | system.time({
22 |     x[]
23 | })
24 | #   user  system elapsed 
25 | #  0.228   0.342   0.351
26 | locs <- lapply(dim, function(d){
27 |     sample(1:d, size = sample(1:d, size = 1))
28 | })
29 | microbenchmark::microbenchmark(
30 |     filearray = {
31 |         x[locs[[1]], locs[[2]], locs[[3]], locs[[4]]]
32 |     },
33 |     native = {
34 |         y[locs[[1]], locs[[2]], locs[[3]], locs[[4]]]
35 |     }, times = 10, unit = "ms"
36 | )
37 | # Unit: milliseconds
38 | #       expr       min       lq      mean    median        uq      max neval
39 | #  filearray  8.401187  8.48126  9.764138  8.759076  9.615115 16.98675    10
40 | #     native 19.753554 19.82092 23.815748 20.394937 21.714994 53.37421    10
41 | 
42 | keep <- c(3, 4)
43 | microbenchmark::microbenchmark(
44 |     filearray = {
45 |         x$collapse(keep = keep, method = "sum")
46 |     },
47 |     native = {
48 |         apply(y, keep, sum)
49 |     }, 
50 |     dipsaus = {
51 |         dipsaus::collapse(y, keep, average = FALSE)
52 |     }, 
53 |     times = 5, unit = "s"
54 | )
55 | 
56 | # Unit: seconds
57 | #       expr       min        lq      mean    median        uq       max neval
58 | #  filearray 0.3544366 0.3599713 0.4557022 0.4204501 0.5650227 0.5786305     5
59 | #     native 2.0568676 2.1961080 2.3898924 2.2768022 2.6102381 2.8094461     5
60 | #    dipsaus 0.3548668 0.4553942 0.5307459 0.4588728 0.6577143 0.7268814     5
61 | 
62 | 
63 | keep <- c(3, 2)
64 | microbenchmark::microbenchmark(
65 |     filearray = {
66 |         x$collapse(keep = keep, method = "sum")
67 |     },
68 |     native = {
69 |         apply(y, keep, sum)
70 |     }, 
71 |     dipsaus = {
72 |         dipsaus::collapse(y, keep, average = FALSE)
73 |     }, 
74 |     times = 5, unit = "s"
75 | )
76 | 
77 | # Unit: seconds
78 | #       expr       min        lq      mean    median        uq       max neval
79 | #  filearray 0.3574229 0.4281074 0.4975226 0.4827639 0.5061053 0.7132133     5
80 | #     native 0.7374846 1.3993005 1.4811525 1.5285852 1.5995152 2.1408772     5
81 | #    dipsaus 0.3303778 0.4295939 0.6026428 0.6484702 0.7453199 0.8594522     5
82 | 
83 | 
84 | 
85 | 
86 | 


--------------------------------------------------------------------------------
/tests/testthat/test-dimnames.R:
--------------------------------------------------------------------------------
 1 | test_that("Subset with dimnames", {
 2 |     x <- filearray_create(tempfile(), dimension = c(3,4,5,6))
 3 |     on.exit({
 4 |         x$delete()
 5 |     })
 6 |     y <- array(as.double(1:240), c(3,4,5,6))
 7 |     x[] <- y
 8 |     expect_identical(dimnames(x[]), dimnames(y))
 9 |     
10 |     dnames1 <- list(
11 |         A = 1:3,
12 |         B = 1:4,
13 |         NULL,
14 |         D = 1:6
15 |     )
16 |     dnames2 <- list(
17 |         A = 1:3
18 |     )
19 |     dnames3 <- list(
20 |         A = 1:3,
21 |         B = 1:4,
22 |         C = NULL,
23 |         D = NULL
24 |     )
25 |     dnames4 <- list(
26 |         A = 1:3,
27 |         B = 1:4,
28 |         C = 1:5,
29 |         D = 1:6
30 |     )
31 |     
32 |     expect_error({
33 |         dimnames(x) <- list(
34 |             A = 1:4
35 |         )
36 |     })
37 |     
38 |     
39 |     dimnames(y) <- dnames1
40 |     dimnames(x) <- dnames1
41 |     
42 |     expect_identical(dimnames(x[]), dimnames(y))
43 |     expect_identical(dimnames(x[1,1:2,2:3,1:4]), dimnames(y[1,1:2,2:3,1:4]))
44 |     expect_identical(dimnames(x[1,1:2,2:3,1:4,drop=FALSE]),
45 |                      dimnames(y[1,1:2,2:3,1:4,drop=FALSE]))
46 |     expect_identical(names(x[2,1,1,1]), names(y[2,1,1,1]))
47 |     
48 |     dimnames(y) <- dnames2
49 |     dimnames(x) <- dnames2
50 |     
51 |     expect_identical(dimnames(x[]), dimnames(y))
52 |     expect_identical(dimnames(x[1,1:2,2:3,1:4]), dimnames(y[1,1:2,2:3,1:4]))
53 |     expect_identical(dimnames(x[1,1:2,2:3,1:4,drop=FALSE]),
54 |                      dimnames(y[1,1:2,2:3,1:4,drop=FALSE]))
55 |     expect_identical(names(x[2,1,1,1]), names(y[2,1,1,1]))
56 |     
57 |     dimnames(y) <- dnames3
58 |     dimnames(x) <- dnames3
59 |     
60 |     expect_identical(dimnames(x[]), dimnames(y))
61 |     expect_identical(dimnames(x[1,1:2,2:3,1:4]), dimnames(y[1,1:2,2:3,1:4]))
62 |     expect_identical(dimnames(x[1,1:2,2:3,1:4,drop=FALSE]),
63 |                      dimnames(y[1,1:2,2:3,1:4,drop=FALSE]))
64 |     expect_identical(names(x[2,1,1,1]), names(y[2,1,1,1]))
65 |     
66 |     
67 |     dimnames(y) <- dnames4
68 |     dimnames(x) <- dnames4
69 |     
70 |     expect_identical(dimnames(x[]), dimnames(y))
71 |     expect_identical(dimnames(x[1,1:2,2:3,1:4]), dimnames(y[1,1:2,2:3,1:4]))
72 |     expect_identical(dimnames(x[1,1:2,2:3,1:4,drop=FALSE]),
73 |                      dimnames(y[1,1:2,2:3,1:4,drop=FALSE]))
74 |     expect_identical(names(x[2,1,1,1]), names(y[2,1,1,1]))
75 |     
76 |     
77 |     # expand
78 |     dimnames(x) <- dnames4
79 |     x$expand(10)
80 |     expect_equal(dimnames(x)$D, c(dnames4$D, rep(NA_integer_, 4)))
81 |     
82 | })
83 | 


--------------------------------------------------------------------------------
/R/methods-apply.R:
--------------------------------------------------------------------------------
 1 | 
 2 | #' @title Apply functions over file array margins (extended)
 3 | #' @param X a file array
 4 | #' @param MARGIN scalar giving the subscripts which the function will be applied over. Current implementation only allows margin size to be one
 5 | #' @param FUN the function to be applied
 6 | #' @param ... optional arguments to \code{FUN}
 7 | #' @param simplify a logical indicating whether results should be simplified if possible
 8 | #' @return See Section 'Value' in \code{\link[base]{apply}};
 9 | #' @export
10 | setGeneric("apply")
11 | 
12 | apply_filearray <- function(X, MARGIN, FUN, ..., simplify = TRUE){
13 |     if(!X$valid()){
14 |         stop("Invalid file array")
15 |     }
16 |     dim <- X$dimension()
17 |     
18 |     FUN <- match.fun(FUN)
19 |     simplify <- isTRUE(simplify)
20 |     d <- dim(X)
21 |     dl <- length(d)
22 |     dn <- dimnames(X)
23 |     ds <- seq_len(dl)
24 |     if (is.character(MARGIN)) {
25 |         dnn <- names(dn)
26 |         if (is.null(dnn)) 
27 |             stop("'X' must have named dimnames")
28 |         MARGIN <- match(MARGIN, dnn)
29 |         if (anyNA(MARGIN)) 
30 |             stop("not all elements of 'MARGIN' are names of dimensions")
31 |     }
32 |     d.call <- d[-MARGIN]
33 |     d.ans <- d[MARGIN]
34 |     if (anyNA(d.call) || anyNA(d.ans)) {
35 |         stop("'MARGIN' does not match dim(X)")
36 |     }
37 |     s.call <- ds[-MARGIN]
38 |     s.ans <- ds[MARGIN]
39 |     if(length(s.ans) != 1){
40 |         stop("`apply` on FileArray margin size can only be 1.")
41 |     }
42 |     dn.call <- dn[-MARGIN]
43 |     dn.ans <- dn[MARGIN]
44 |     d2 <- prod(d.ans)
45 |     if (d2 == 0L) {
46 |         newX <- array(vector(typeof(X), 1L), 
47 |                       dim = c(prod(d.call), 1L))
48 |         if (length(d.call) < 2L) {
49 |             tmp <- newX[, 1]
50 |         } else {
51 |             tmp <- array(newX[, 1L], d.call, dn.call)
52 |         }
53 |         ans <- forceAndCall(1, FUN, tmp, ...)
54 |         if(is.null(ans)){
55 |             return(ans)
56 |         } else if (length(d.ans) < 2L) {
57 |             return(ans[1L][-1L])
58 |         } else {
59 |             return(array(ans, d.ans, dn.ans))
60 |         }
61 |     }
62 |     
63 |     tmp <- rep("", dl)
64 |     tmp[[s.ans]] <- ".__i__."
65 |     f <- sprintf("function(.__i__., ...){ FUN(X[%s], ...) }", paste(tmp, collapse = ","))
66 |     f <- eval(parse(text = f))
67 |     
68 |     sapply(seq_len(d[[s.ans]]), f, ..., simplify = simplify)
69 | }
70 | 
71 | #' @rdname apply
72 | #' @export
73 | setMethod('apply', signature(X = "FileArray"), apply_filearray)
74 | 
75 | #' @rdname apply
76 | #' @export
77 | setMethod('apply', signature(X = "FileArrayProxy"), apply_filearray)
78 | 
79 | 


--------------------------------------------------------------------------------
/.github/workflows/rhub.yaml:
--------------------------------------------------------------------------------
 1 | # R-hub's generic GitHub Actions workflow file. It's canonical location is at
 2 | # https://github.com/r-hub/actions/blob/v1/workflows/rhub.yaml
 3 | # You can update this file to a newer version using the rhub2 package:
 4 | #
 5 | # rhub::rhub_setup()
 6 | #
 7 | # It is unlikely that you need to modify this file manually.
 8 | 
 9 | name: R-hub
10 | run-name: "${{ github.event.inputs.id }}: ${{ github.event.inputs.name || format('Manually run by {0}', github.triggering_actor) }}"
11 | 
12 | on:
13 |   workflow_dispatch:
14 |     inputs:
15 |       config:
16 |         description: 'A comma separated list of R-hub platforms to use.'
17 |         type: string
18 |         default: 'linux,windows,macos'
19 |       name:
20 |         description: 'Run name. You can leave this empty now.'
21 |         type: string
22 |       id:
23 |         description: 'Unique ID. You can leave this empty now.'
24 |         type: string
25 | 
26 | jobs:
27 | 
28 |   setup:
29 |     runs-on: ubuntu-latest
30 |     outputs:
31 |       containers: ${{ steps.rhub-setup.outputs.containers }}
32 |       platforms: ${{ steps.rhub-setup.outputs.platforms }}
33 | 
34 |     steps:
35 |     # NO NEED TO CHECKOUT HERE
36 |     - uses: r-hub/actions/setup@v1
37 |       with:
38 |         config: ${{ github.event.inputs.config }}
39 |       id: rhub-setup
40 | 
41 |   linux-containers:
42 |     needs: setup
43 |     if: ${{ needs.setup.outputs.containers != '[]' }}
44 |     runs-on: ubuntu-latest
45 |     name: ${{ matrix.config.label }}
46 |     strategy:
47 |       fail-fast: false
48 |       matrix:
49 |         config: ${{ fromJson(needs.setup.outputs.containers) }}
50 |     container:
51 |       image: ${{ matrix.config.container }}
52 | 
53 |     steps:
54 |       - uses: r-hub/actions/checkout@v1
55 |       - uses: r-hub/actions/platform-info@v1
56 |         with:
57 |           token: ${{ secrets.RHUB_TOKEN }}
58 |           job-config: ${{ matrix.config.job-config }}
59 |       - uses: r-hub/actions/setup-deps@v1
60 |         with:
61 |           token: ${{ secrets.RHUB_TOKEN }}
62 |           job-config: ${{ matrix.config.job-config }}
63 |       - uses: r-hub/actions/run-check@v1
64 |         with:
65 |           token: ${{ secrets.RHUB_TOKEN }}
66 |           job-config: ${{ matrix.config.job-config }}
67 | 
68 |   other-platforms:
69 |     needs: setup
70 |     if: ${{ needs.setup.outputs.platforms != '[]' }}
71 |     runs-on: ${{ matrix.config.os }}
72 |     name: ${{ matrix.config.label }}
73 |     strategy:
74 |       fail-fast: false
75 |       matrix:
76 |         config: ${{ fromJson(needs.setup.outputs.platforms) }}
77 | 
78 |     steps:
79 |       - uses: r-hub/actions/checkout@v1
80 |       - uses: r-hub/actions/setup-r@v1
81 |         with:
82 |           job-config: ${{ matrix.config.job-config }}
83 |           token: ${{ secrets.RHUB_TOKEN }}
84 |       - uses: r-hub/actions/platform-info@v1
85 |         with:
86 |           token: ${{ secrets.RHUB_TOKEN }}
87 |           job-config: ${{ matrix.config.job-config }}
88 |       - uses: r-hub/actions/setup-deps@v1
89 |         with:
90 |           job-config: ${{ matrix.config.job-config }}
91 |           token: ${{ secrets.RHUB_TOKEN }}
92 |       - uses: r-hub/actions/run-check@v1
93 |         with:
94 |           job-config: ${{ matrix.config.job-config }}
95 |           token: ${{ secrets.RHUB_TOKEN }}
96 | 


--------------------------------------------------------------------------------
/src/conversion.h:
--------------------------------------------------------------------------------
 1 | #ifndef FARR_CONVERSION_H
 2 | #define FARR_CONVERSION_H
 3 | 
 4 | #include "common.h"
 5 | #include "serialize.h"
 6 | #include "defs.h"
 7 | 
 8 | double na_cplx_dbl();
 9 | SEXP realToInt64(Rcpp::NumericVector x, const double min_ = NA_REAL, const double max_ = NA_REAL, const int strict = 1);
10 | SEXP realToInt64_inplace(SEXP x, const double min_ = NA_REAL, const double max_ = NA_REAL, const int strict = 1);
11 | SEXP convert_as(SEXP x, SEXPTYPE type);
12 | SEXP convert_as2(SEXP x, SEXP y, SEXPTYPE type);
13 | 
14 | // void realToCplx(double* x, Rcomplex* y, size_t nelem);
15 | void realToCplx(const double* x, Rcomplex* y, const size_t& nelem, const bool swap_endian = false);
16 | 
17 | void cplxToReal(Rcomplex* x, double* y, size_t nelem);
18 | 
19 | void realToFloat(double* x, float* y, size_t nelem);
20 | 
21 | void floatToReal(float* x, double* y, size_t nelem);
22 | 
23 | /**********************************************************
24 |  * Transform functions
25 |  ***********************************************************/
26 | 
27 | template <typename T>
28 | inline void transform_asis(const T* x, T* y, const bool& swap_endian = false) {
29 |     if( swap_endian ){
30 |         const size_t size = sizeof(T);
31 |         T tmp;
32 |         
33 |         unsigned char *buffer_src = (unsigned char*) x;
34 |         unsigned char *buffer_dst = (unsigned char*) (&tmp);
35 |         
36 |         for (size_t ix = 0; ix < size; ix++) {
37 |             *(buffer_dst + (size - ix - 1)) = *(buffer_src + ix);
38 |         }
39 |         *y = tmp;
40 |     } else {
41 |         memcpy(y, x, sizeof(T));
42 |         // *y = *x;
43 |     }
44 |     
45 | }
46 | void transform_float(const float* x, double* y, const bool& swap_endian = false);
47 | void transform_logical(const Rbyte* x, int* y, const bool& swap_endian = false);
48 | void transform_complex(const double* x, Rcomplex* y, const bool& swap_endian = false);
49 | 
50 | 
51 | template <typename T>
52 | inline void transforms_asis(const T* x, T* y, const int& nelem, const bool& swap_endian = false){
53 |     memcpy(y, x, nelem * sizeof(T));
54 |     if( swap_endian ){
55 |         swap_endianess(y, sizeof(T), nelem);
56 |     }
57 | }
58 | 
59 | void transforms_float(const float* x, double* y, const int& nelem, const bool& swap_endian = false);
60 | void transforms_logical(const Rbyte* x, int* y, const int& nelem, const bool& swap_endian = false);
61 | void transforms_complex(const double* x, Rcomplex* y, const int& nelem, const bool& swap_endian = false);
62 | 
63 | template <typename T, typename B>
64 | inline void transforms(const T* x, B* y, const int& nelem, const bool& swap_endian = false){
65 |     if(std::is_same<T, B>::value){
66 |         transforms_asis<T>(x, (T*) y, nelem, swap_endian);
67 |         return;
68 |     }
69 |     if(std::is_same<T, float>::value){
70 |         transforms_float((const float*) x, (double*) y, nelem, swap_endian);
71 |         return;
72 |     }
73 |     
74 |     if(std::is_same<T, Rbyte>::value){
75 |         transforms_logical((const Rbyte*) x, (int*) y, nelem, swap_endian);
76 |         return;
77 |     }
78 |     if(std::is_same<B, Rcomplex>::value){
79 |         transforms_complex((const double*) x, (Rcomplex*) y, nelem, swap_endian);
80 |         return;
81 |     }
82 |     Rcpp::stop("Unknown type in `transforms`.");
83 |     
84 | }
85 |     
86 | #endif  // FARR_CONVERSION_H


--------------------------------------------------------------------------------
/man/S3-filearray.Rd:
--------------------------------------------------------------------------------
  1 | % Generated by roxygen2: do not edit by hand
  2 | % Please edit documentation in R/aaa-docs.R, R/methods-subset.R,
  3 | %   R/methods-subsetAssign.R, R/methods.R
  4 | \name{S3-filearray}
  5 | \alias{S3-filearray}
  6 | \alias{[.FileArray}
  7 | \alias{[<-.FileArray}
  8 | \alias{[[.FileArray}
  9 | \alias{as.array.FileArray}
 10 | \alias{dim.FileArray}
 11 | \alias{dimnames.FileArray}
 12 | \alias{dimnames<-.FileArray}
 13 | \alias{length.FileArray}
 14 | \alias{max.FileArray}
 15 | \alias{min.FileArray}
 16 | \alias{range.FileArray}
 17 | \alias{sum.FileArray}
 18 | \alias{subset.FileArray}
 19 | \title{'S3' methods for 'FileArray'}
 20 | \usage{
 21 | \method{[}{FileArray}(
 22 |   x,
 23 |   i,
 24 |   ...,
 25 |   drop = TRUE,
 26 |   reshape = NULL,
 27 |   strict = TRUE,
 28 |   dimnames = TRUE,
 29 |   split_dim = 0
 30 | )
 31 | 
 32 | \method{[}{FileArray}(x, i, ..., lazy = FALSE) <- value
 33 | 
 34 | \method{[[}{FileArray}(x, i)
 35 | 
 36 | \method{as.array}{FileArray}(x, reshape = NULL, drop = FALSE, ...)
 37 | 
 38 | \method{dim}{FileArray}(x)
 39 | 
 40 | \method{dimnames}{FileArray}(x)
 41 | 
 42 | \method{dimnames}{FileArray}(x) <- value
 43 | 
 44 | \method{length}{FileArray}(x)
 45 | 
 46 | \method{max}{FileArray}(x, na.rm = FALSE, ...)
 47 | 
 48 | \method{min}{FileArray}(x, na.rm = FALSE, ...)
 49 | 
 50 | \method{range}{FileArray}(x, na.rm = FALSE, ...)
 51 | 
 52 | \method{sum}{FileArray}(x, na.rm = FALSE, ...)
 53 | 
 54 | \method{subset}{FileArray}(x, ..., drop = FALSE, .env = parent.frame())
 55 | }
 56 | \arguments{
 57 | \item{x}{a file array}
 58 | 
 59 | \item{i, ...}{index set, or passed to other methods}
 60 | 
 61 | \item{drop}{whether to drop dimensions; see topic \code{\link[base]{Extract}}}
 62 | 
 63 | \item{reshape}{a new dimension to set before returning subset results; default is \code{NULL} (use default dimensions)}
 64 | 
 65 | \item{strict}{whether to allow indices to exceed bound; currently only accept \code{TRUE}}
 66 | 
 67 | \item{dimnames}{whether to preserve \code{\link[base]{dimnames}}}
 68 | 
 69 | \item{split_dim}{internally used; split dimension and calculate indices to
 70 | manually speed up the subset; value ranged from 0 to size of dimension minus
 71 | one.}
 72 | 
 73 | \item{lazy}{whether to lazy-evaluate the method, only works when assigning
 74 | arrays with logical array index}
 75 | 
 76 | \item{value}{value to substitute or set}
 77 | 
 78 | \item{na.rm}{whether to remove \code{NA} values during the calculation}
 79 | 
 80 | \item{.env}{environment to evaluate formula when evaluating subset margin indices.}
 81 | }
 82 | \description{
 83 | These are 'S3' methods for 'FileArray'
 84 | }
 85 | \section{Functions}{
 86 | \itemize{
 87 | \item \code{[}: subset array
 88 | 
 89 | \item \code{`[`(FileArray) <- value}: subset assign array
 90 | 
 91 | \item \code{[[}: get element by index
 92 | 
 93 | \item \code{as.array(FileArray)}: converts file array to native array in R
 94 | 
 95 | \item \code{dim(FileArray)}: get dimensions
 96 | 
 97 | \item \code{dimnames(FileArray)}: get dimension names
 98 | 
 99 | \item \code{dimnames(FileArray) <- value}: set dimension names
100 | 
101 | \item \code{length(FileArray)}: get array length
102 | 
103 | \item \code{max(FileArray)}: get max value
104 | 
105 | \item \code{min(FileArray)}: get min value
106 | 
107 | \item \code{range(FileArray)}: get value range
108 | 
109 | \item \code{sum(FileArray)}: get summation
110 | 
111 | \item \code{subset(FileArray)}: get subset file array with formulae
112 | 
113 | }}
114 | 


--------------------------------------------------------------------------------
/R/filearray-package.R:
--------------------------------------------------------------------------------
  1 | in_rcmdcheck <- function (...) {
  2 |     evidences <- list()
  3 |     args <- commandArgs()
  4 |     is_vanilla <- is.element("--vanilla", args)
  5 |     if(!is_vanilla){
  6 |         return(FALSE)
  7 |     }
  8 |     pwd <- getwd()
  9 |     dirname <- basename(pwd)
 10 |     parent <- basename(dirname(pwd))
 11 |     pattern <- ".+[.]Rcheck$"
 12 |     in_test <- (grepl(pattern, parent) && grepl("^tests(|_.*)$", dirname))
 13 |     if(!(in_test || grepl(pattern, dirname))){
 14 |         return(FALSE)
 15 |     }
 16 |     in_examples <- is.element("CheckExEnv", search())
 17 |     in_win_builder <- (.Platform$OS.type == "windows" && grepl("Rterm[.]exe$", args[1]))
 18 |     if(in_win_builder){
 19 |         n <- length(args)
 20 |         if (!all(c("--no-save", "--no-restore", "--no-site-file", 
 21 |                   "--no-init-file") %in% args)) {
 22 |             return(FALSE)
 23 |         }
 24 |         if (!grepl(pattern, parent)) {
 25 |             return(FALSE)
 26 |         }
 27 |     }
 28 |     if(in_test){
 29 |         return(structure(TRUE, status = "tests"))
 30 |     }
 31 |     if(in_examples){
 32 |         return(structure(TRUE, status = "examples"))
 33 |     }
 34 |     return(FALSE)
 35 | }
 36 | 
 37 | symlink_enabled <- local({
 38 |     enabled <- NA
 39 |     function(){
 40 |         if(!is.na(enabled)){ return(enabled) }
 41 |         tempdir(check = TRUE)
 42 |         f1 <- temp_path(pattern = 'filearray_simlink_test_from')
 43 |         f2 <- temp_path(pattern = 'filearray_simlink_test_to')
 44 |         on.exit({
 45 |             if(file.exists(f1)){
 46 |                 unlink(f1)
 47 |             }
 48 |             if(file.exists(f2)){
 49 |                 unlink(f2)
 50 |             }
 51 |         }, add = FALSE)
 52 |         s <- paste(sample(LETTERS), collapse = "")
 53 |         writeLines(s, con = f1)
 54 |         file.symlink(f1, to = f2)
 55 |         en <- tryCatch({
 56 |             if(identical(readLines(f2), s)){
 57 |                 TRUE
 58 |             } else {
 59 |                 FALSE
 60 |             }
 61 |         }, error = function(e){
 62 |             FALSE
 63 |         }, warning = function(e){
 64 |             FALSE
 65 |         })
 66 |         enabled <<- en
 67 |         
 68 |         if(file.exists(f1)){
 69 |             unlink(f1)
 70 |         }
 71 |         if(file.exists(f2)){
 72 |             unlink(f2)
 73 |         }
 74 |         on.exit({}, add = FALSE)
 75 |         
 76 |         return(enabled)
 77 |     }
 78 | })
 79 | 
 80 | 
 81 | .onLoad <- function(libname, pkgname){
 82 |     # Check if in R CMD check mode
 83 |     if(Sys.getenv("_R_CHECK_LIMIT_CORES_") == "TRUE"){
 84 |         #  R CMD check with --as-cran
 85 |         n <- 2L
 86 |     } else if(in_rcmdcheck()){
 87 |         #  R CMD check (without CRAN)
 88 |         n <- 2L
 89 |     } else {
 90 |         n <- filearray_threads(-1)
 91 |         if(n > 8L){
 92 |             n <- 8L
 93 |         }
 94 |     }
 95 |     
 96 |     filearray_threads(n)
 97 |     ns <- asNamespace(pkgname)
 98 |     ns$NA_float_ <- get_float_na()
 99 | }
100 | 
101 | .onAttach <- function(libname, pkgname){
102 |     if(Sys.getenv("_R_CHECK_LIMIT_CORES_") == "TRUE"){
103 |         packageStartupMessage(
104 |             "Found environment variable `_R_CHECK_LIMIT_CORES_`=TRUE. Using ",
105 |             filearray_threads(), " threads.")
106 |     } else if(in_rcmdcheck()){
107 |         packageStartupMessage("R CMD check mode. Using ", 
108 |                               filearray_threads(), " threads.")
109 |     } else {
110 |         packageStartupMessage(
111 |             "Filearray is using ", filearray_threads(), " threads.")
112 |     }
113 | }
114 | 


--------------------------------------------------------------------------------
/adhoc/profiling.R:
--------------------------------------------------------------------------------
  1 | # performance test
  2 | 
  3 | filearray::filearray_threads()
  4 | # 2.56GB file
  5 | 
  6 | set.seed(1)
  7 | file <- tempfile()
  8 | unlink(file, recursive = TRUE)
  9 | 
 10 | x <- filearray::filearray_create(file, c(128, 128, 128, 128))
 11 | 
 12 | # One-time initialize partition files with NA
 13 | system.time({
 14 |     x$initialize_partition()
 15 | })
 16 | #   user  system elapsed 
 17 | #  0.236   2.476   2.864
 18 | 
 19 | # -------------------- WRITE - single threaded ------------------------
 20 | # write to fast margin - single threaded
 21 | filearray::filearray_threads(1)
 22 | tmp <- rnorm(128^3)
 23 | system.time({
 24 |     for(i in 1:128){
 25 |         x[,,,i] <- tmp
 26 |     }
 27 | })
 28 | #   user  system elapsed 
 29 | #  0.220   1.097   2.089
 30 | 
 31 | # 1.2 GB/s
 32 | 
 33 | 
 34 | system.time({
 35 |     for(i in 1:128){
 36 |         x[,,i,] <- tmp
 37 |     }
 38 | })
 39 | #   user  system elapsed 
 40 | #  1.044   1.400   4.795
 41 | 
 42 | # 500 MB/s
 43 | 
 44 | # write to slow margin - Never recommended to write on single 
 45 | # (see next section - block write)
 46 | system.time({
 47 |     # for(i in 1:128){
 48 |         x[1,,,] <- tmp
 49 |     # }
 50 | })
 51 | #   user  system elapsed 
 52 | #  0.057   1.048   2.097
 53 | 
 54 | # Using memory map boosts the speed
 55 | # 1.2 GB/s
 56 | 
 57 | # -------------------- WRITE (block) - 4-threaded ------------------------
 58 | # write to fast margin - 4 threaded
 59 | filearray::filearray_threads(4)
 60 | 
 61 | # 1GB data
 62 | tmp <- rnorm(128^3 * 50)
 63 | idx <- sample(1:128, 50)
 64 | system.time({
 65 |     x[,,,idx] <- tmp
 66 | })
 67 | #   user  system elapsed 
 68 | #  0.135   0.642   0.730
 69 | 
 70 | # 1.4 GB/s
 71 | 
 72 | 
 73 | system.time({
 74 |     x[,,idx,] <- tmp
 75 | })
 76 | #   user  system elapsed 
 77 | #  0.288   0.707   1.165
 78 | 
 79 | # 1 GB/s
 80 | 
 81 | system.time({
 82 |     x[,idx,,] <- tmp
 83 | })
 84 | #   user  system elapsed 
 85 | #  0.166   1.379   2.082
 86 | 
 87 | # 500 MB/s
 88 | 
 89 | system.time({
 90 |     x[idx,,,] <- tmp
 91 | })
 92 | #   user  system elapsed 
 93 | #  0.152   1.555   1.995
 94 | 
 95 | # 500 MB/s
 96 | 
 97 | rm(tmp); gc()
 98 | # -------------------- READ - single-threaded ------------------------
 99 | ### single-threaded
100 | filearray::filearray_threads(1)
101 | 
102 | # read 1GB subset
103 | 
104 | #### ordered indices ####
105 | idx <- sort(sample(1:128, 50))
106 | 
107 | # partition margin 
108 | system.time({
109 |     x[,,,idx]
110 | })
111 | #   user  system elapsed 
112 | #  0.135   0.267   0.620 
113 | 
114 | # 1.6 GB/s
115 | 
116 | # fast margin
117 | system.time({
118 |     x[,,idx,]
119 | })
120 | #   user  system elapsed 
121 | #  0.135   0.414   0.955
122 | 
123 | # 1 GB/s
124 | 
125 | # slow margin
126 | system.time({
127 |     x[idx,,,]
128 | })
129 | #   user  system elapsed 
130 | #  0.141   0.376   0.950 
131 | 
132 | # 1 GB/s
133 | 
134 | 
135 | #### random indices ####
136 | idx <- sample(1:128, 50)
137 | 
138 | # partition margin 
139 | system.time({
140 |     x[,,,idx]
141 | })
142 | #   user  system elapsed 
143 | #  0.127   0.219   0.507 
144 | 
145 | # 2 GB/s
146 | 
147 | # fast margin
148 | system.time({
149 |     x[,,idx,]
150 | })
151 | #   user  system elapsed 
152 | #  0.133   0.355   0.796
153 | 
154 | # 1.4 GB/s
155 | 
156 | # slow margin
157 | system.time({
158 |     x[idx,,,]
159 | })
160 | #   user  system elapsed 
161 | #  0.141   0.347   0.953
162 | 
163 | # 1 GB/s
164 | 
165 | 
166 | 
167 | # -------------------- READ - 4-threaded ------------------------
168 | filearray::filearray_threads(8)
169 | idx <- sample(1:128, 50)
170 | 
171 | # partition margin 
172 | system.time({
173 |     x[,,,idx]
174 | })
175 | #   user  system elapsed 
176 | #  0.175   0.264   0.289
177 | 
178 | # 3.4 GB/s
179 | 
180 | # fast margin
181 | system.time({
182 |     x[,,idx,]
183 | })
184 | #   user  system elapsed 
185 | #  0.378   0.324   0.248 
186 | 
187 | # 4 GB/s
188 | 
189 | # slow margin
190 | system.time({
191 |     x[idx,,,]
192 | })
193 | #   user  system elapsed 
194 | #  0.174   0.448   0.681
195 | 
196 | # 1.5 GB/s
197 | 
198 | 
199 | system.time({
200 |     x[1:65,,,]
201 | })
202 | system.time({
203 |     x[c(1, 67:128),,,]
204 | })
205 | 


--------------------------------------------------------------------------------
/tests/testthat/test-helpers.R:
--------------------------------------------------------------------------------
 1 | test_that("is_same_dim", {
 2 |     
 3 |     # sanity check
 4 |     testthat::expect_true(is_same_dim(array(1:12, c(3,4)), matrix(NA, nrow = 3, ncol = 4)))
 5 |     testthat::expect_false(is_same_dim(array(1:12, c(3,4)), matrix(NA, nrow = 3, ncol = 3)))
 6 |     testthat::expect_false(is_same_dim(array(1:12, c(3,4)), array(NA, c(3,4,1))))
 7 |     testthat::expect_false(is_same_dim(array(1:12, c(3,4)), array(NA, c(4, 3))))
 8 |     
 9 |     testthat::expect_false(is_same_dim(array(NA, c(3,4)), list()))
10 |     testthat::expect_false(is_same_dim(array(NA, c(3,4)), data.frame()))
11 |     testthat::expect_false(is_same_dim(array(NA, c(3,4)), NULL))
12 |     testthat::expect_false(is_same_dim(array(NA, c(3,4)), 1))
13 |     
14 |     # number of margins <= 1
15 |     testthat::expect_true(is_same_dim(1, 1))
16 |     testthat::expect_false(is_same_dim(1, 1:3))
17 |     testthat::expect_false(is_same_dim(1, 1:3))
18 |     testthat::expect_true(is_same_dim(NULL, numeric()))
19 |     
20 |     # margin =1 but with dim
21 |     x <- 1:5
22 |     dim(x) <- 5
23 |     testthat::expect_true(is_same_dim(x, 1:5))
24 |     
25 | })
26 | 
27 | test_that("operation_output_type", {
28 |     double_type <- "float"
29 |     options("filearray.operator.precision" = double_type)
30 |     on.exit({
31 |         options("filearray.operator.precision" = NULL)
32 |     })
33 |     
34 |     # complex
35 |     type1 <- "complex"
36 |     for(type2 in c("complex", "double", "float", "integer", "logical")) {
37 |         testthat::expect_equal(operation_output_type(type1, type2), type1)
38 |     }
39 |     testthat::expect_equal(operation_output_type(type1, 'raw', raw = "integer"), type1)
40 |     testthat::expect_error(operation_output_type(type1, "raw"))
41 |     
42 |     # double/float
43 |     type1 <- "double"
44 |     for(type2 in c("double", "float", "integer", "logical")) {
45 |         testthat::expect_equal(operation_output_type(type1, type2), double_type)
46 |     }
47 |     testthat::expect_equal(operation_output_type(type1, 'raw', raw = "integer"), double_type)
48 |     testthat::expect_error(operation_output_type(type1, "raw"))
49 |     
50 |     type1 <- "float"
51 |     for(type2 in c("double", "float", "integer", "logical")) {
52 |         testthat::expect_equal(operation_output_type(type1, type2), double_type)
53 |     }
54 |     testthat::expect_equal(operation_output_type(type1, 'raw', raw = "integer"), double_type)
55 |     testthat::expect_error(operation_output_type(type1, "raw"))
56 |     
57 |     # integer
58 |     type1 <- "integer"
59 |     for(type2 in c("integer", "logical")) {
60 |         testthat::expect_equal(operation_output_type(type1, type2), type1)
61 |     }
62 |     testthat::expect_equal(operation_output_type(type1, 'raw', raw = "integer"), type1)
63 |     testthat::expect_error(operation_output_type(type1, "raw"))
64 |     
65 |     # logical
66 |     testthat::expect_equal(operation_output_type("logical", 'logical'), 'integer')
67 |     testthat::expect_equal(operation_output_type("logical", 'logical', logical = "logical"), 'logical')
68 |     testthat::expect_equal(operation_output_type('logical', 'raw', raw = "integer"), 'integer')
69 |     testthat::expect_error(operation_output_type('logical', "raw"))
70 |     
71 |     # raw
72 |     testthat::expect_equal(operation_output_type('raw', 'raw', raw = "integer"), 'integer')
73 |     testthat::expect_error(operation_output_type('raw', "raw"))
74 |     
75 | })
76 | 
77 | test_that("fmap_buffer", {
78 |     
79 |     buffer_large <- get_buffer_size() * 16
80 |     
81 |     testthat::expect_equal(common_fmap_buffer_count(84, 12, 96), 1)
82 |     testthat::expect_equal(common_fmap_buffer_count(84, 12, 97), 1)
83 |     
84 |     testthat::expect_equal(common_fmap_buffer_count(buffer_large*2, 12, 1024), 2)
85 |     testthat::expect_equal(common_fmap_buffer_count(buffer_large*2 + 2, 12, 1024), 2)
86 |     testthat::expect_equal(common_fmap_buffer_count(buffer_large*4096, buffer_large, 1024), 1024)
87 |     testthat::expect_equal(common_fmap_buffer_count(buffer_large*2, 13, 1024), 1)
88 |     
89 |     expect_true(validate_fmap_buffer_count(12, input_lens = c(84, 12, 96)))
90 |     expect_true(validate_fmap_buffer_count(1, input_lens = c(84, 12, 96)))
91 |     expect_false(validate_fmap_buffer_count(15, input_lens = c(84, 12, 96)))
92 |     expect_true(validate_fmap_buffer_count(1, input_lens = c(84, 12, 97)))
93 |     
94 | })


--------------------------------------------------------------------------------
/man/filearray_bind.Rd:
--------------------------------------------------------------------------------
  1 | % Generated by roxygen2: do not edit by hand
  2 | % Please edit documentation in R/bind.R
  3 | \name{filearray_bind}
  4 | \alias{filearray_bind}
  5 | \title{Merge and bind homogeneous file arrays}
  6 | \usage{
  7 | filearray_bind(
  8 |   ...,
  9 |   .list = list(),
 10 |   filebase = tempfile(),
 11 |   symlink = FALSE,
 12 |   overwrite = FALSE,
 13 |   cache_ok = FALSE
 14 | )
 15 | }
 16 | \arguments{
 17 | \item{..., .list}{file array instances}
 18 | 
 19 | \item{filebase}{where to create merged array}
 20 | 
 21 | \item{symlink}{whether to use \code{\link[base]{file.symlink}}; if true,
 22 | then partition files will be symbolic-linked to the original arrays,
 23 | otherwise the partition files will be copied over. If you want your data
 24 | to be portable, do not use symbolic-links. The default value is \code{FALSE}}
 25 | 
 26 | \item{overwrite}{whether to overwrite when \code{filebase} already exists;
 27 | default is false, which raises errors}
 28 | 
 29 | \item{cache_ok}{see 'Details', only used if \code{overwrite} is true.}
 30 | }
 31 | \value{
 32 | A bound array in \code{'FileArray'} class.
 33 | }
 34 | \description{
 35 | The file arrays to be merged must be homogeneous:
 36 | same data type, partition size, and partition length
 37 | }
 38 | \details{
 39 | The input arrays must share the same data type and partition size.
 40 | The dimension for each partition should also be the same. For example
 41 | an array \code{x1} has dimension \eqn{100x20x30} with partition size 
 42 | \code{1}, then each partition dimension is \eqn{100x20x1}, and there are 
 43 | \code{30} partitions. \code{x1} can bind with another array of the same 
 44 | partition size. This means if \code{x2} has dimension
 45 | \eqn{100x20x40} and each partition size is \code{1}, then \code{x1} and 
 46 | \code{x2} can be merged.
 47 | 
 48 | If \code{filebase} exists and \code{overwrite} is \code{FALSE}, an error will 
 49 | always raise. If \code{overwrite=TRUE} and \code{cache_ok=FALSE}, then
 50 | the existing \code{filebase} will be erased and any data stored within will
 51 | be lost. 
 52 | If both \code{overwrite} and \code{cache_ok} are \code{TRUE}, then 
 53 | , before erasing \code{filebase}, the function validates the existing
 54 | array header and compare the header signatures. If the existing header
 55 | signature is the same as the array to be created, then the existing array 
 56 | will be returned. This \code{cache_ok} could be extremely useful when
 57 | binding large arrays with \code{symlink=FALSE} as the cache might avoid
 58 | moving files around. However, \code{cache_ok} should be enabled with caution.
 59 | This is because only the header information will be compared, but the 
 60 | partition data will not be compared. If the existing array was generated from
 61 | an old versions of the source arrays, but the data from the source arrays
 62 | has been altered, then the \code{cache_ok=TRUE} is rarely proper as the cache
 63 | is outdated.
 64 | 
 65 | The \code{symlink} option should be used with extra caution. Creating 
 66 | symbolic links is definitely faster than copying partition files. However, 
 67 | since the partition files are simply linked to the original partition files, 
 68 | changing to the input arrays will also affect the merged arrays, and 
 69 | vice versa; see 'Examples'. Also for arrays created from symbolic links, if 
 70 | the original 
 71 | arrays are deleted, while the merged arrays will not be invalidated, 
 72 | the corresponding partitions will no longer be accessible. Attempts to 
 73 | set deleted partitions will likely result in failure. Therefore
 74 | \code{symlink} should be set to true when creating merged arrays are
 75 | temporary for read-only purpose, and when speed and disk space is in
 76 | consideration. For extended reading, please check \code{\link[base]{files}} 
 77 | for details.
 78 | }
 79 | \examples{
 80 | partition_size <- 1
 81 | type <- "double"
 82 | x1 <- filearray_create(
 83 |     tempfile(), c(2,2), type = type,
 84 |     partition_size = partition_size)
 85 | x1[] <- 1:4
 86 | x2 <- filearray_create(
 87 |     tempfile(), c(2,1), type = type,
 88 |     partition_size = partition_size)
 89 | x2[] <- 5:6
 90 | 
 91 | y1 <- filearray_bind(x1, x2, symlink = FALSE)
 92 | y2 <- filearray_bind(x1, x2)
 93 | 
 94 | # y1 copies partition files, and y2 simply creates links 
 95 | # if symlink is supported
 96 | 
 97 | y1[] - y2[]
 98 | 
 99 | # change x1
100 | x1[1,1] <- NA
101 | 
102 | # y1 is not affected
103 | y1[]
104 | 
105 | # y2 changes 
106 | y2[]
107 | 
108 | 
109 | }
110 | 


--------------------------------------------------------------------------------
/R/RcppExports.R:
--------------------------------------------------------------------------------
  1 | # Generated by using Rcpp::compileAttributes() -> do not edit by hand
  2 | # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393
  3 | 
  4 | FARR_collapse <- function(filebase, dim, keep, cum_part, array_type, method = 1L, remove_na = FALSE, scale = 1.0) {
  5 |     .Call(`_filearray_FARR_collapse`, filebase, dim, keep, cum_part, array_type, method, remove_na, scale)
  6 | }
  7 | 
  8 | FARR_collapse_complex <- function(filebase, dim, keep, cum_part, method = 1L, remove_na = FALSE, scale = 1.0) {
  9 |     .Call(`_filearray_FARR_collapse_complex`, filebase, dim, keep, cum_part, method, remove_na, scale)
 10 | }
 11 | 
 12 | realToInt64 <- function(x, min_, max_, strict) {
 13 |     .Call(`_filearray_realToInt64`, x, min_, max_, strict)
 14 | }
 15 | 
 16 | cplxToReal2 <- function(x) {
 17 |     .Call(`_filearray_cplxToReal2`, x)
 18 | }
 19 | 
 20 | realToCplx2 <- function(x) {
 21 |     .Call(`_filearray_realToCplx2`, x)
 22 | }
 23 | 
 24 | realToFloat2 <- function(x) {
 25 |     .Call(`_filearray_realToFloat2`, x)
 26 | }
 27 | 
 28 | floatToReal2 <- function(x) {
 29 |     .Call(`_filearray_floatToReal2`, x)
 30 | }
 31 | 
 32 | get_float_na <- function() {
 33 |     .Call(`_filearray_get_float_na`)
 34 | }
 35 | 
 36 | set_buffer_size <- function(size) {
 37 |     .Call(`_filearray_set_buffer_size`, size)
 38 | }
 39 | 
 40 | get_buffer_size <- function() {
 41 |     .Call(`_filearray_get_buffer_size`)
 42 | }
 43 | 
 44 | FARR_meta <- function(filebase) {
 45 |     .Call(`_filearray_FARR_meta`, filebase)
 46 | }
 47 | 
 48 | loc2idx <- function(sliceIdx, dim) {
 49 |     .Call(`_filearray_loc2idx`, sliceIdx, dim)
 50 | }
 51 | 
 52 | locationList <- function(listOrEnv, dim, strict) {
 53 |     .Call(`_filearray_locationList`, listOrEnv, dim, strict)
 54 | }
 55 | 
 56 | schedule <- function(listOrEnv, dim, cum_part_sizes, split_dim, strict) {
 57 |     .Call(`_filearray_schedule`, listOrEnv, dim, cum_part_sizes, split_dim, strict)
 58 | }
 59 | 
 60 | filearray_meta <- function(filebase) {
 61 |     .Call(`_filearray_filearray_meta`, filebase)
 62 | }
 63 | 
 64 | filearray_assign <- function(filebase, value, position_indices) {
 65 |     .Call(`_filearray_filearray_assign`, filebase, value, position_indices)
 66 | }
 67 | 
 68 | filearray_subset <- function(filebase, position_indices, drop = TRUE, use_dimnames = TRUE, reshape = NULL) {
 69 |     .Call(`_filearray_filearray_subset`, filebase, position_indices, drop, use_dimnames, reshape)
 70 | }
 71 | 
 72 | FARR_subset_sequential <- function(filebase, unit_partlen, cum_partsizes, array_type, ret, from = 0L, len = 1L) {
 73 |     .Call(`_filearray_FARR_subset_sequential`, filebase, unit_partlen, cum_partsizes, array_type, ret, from, len)
 74 | }
 75 | 
 76 | FARR_subset2 <- function(filebase, listOrEnv, reshape = NULL, drop = FALSE, use_dimnames = TRUE, thread_buffer = 0L, split_dim = 0L, strict = 1L) {
 77 |     .Call(`_filearray_FARR_subset2`, filebase, listOrEnv, reshape, drop, use_dimnames, thread_buffer, split_dim, strict)
 78 | }
 79 | 
 80 | FARR_buffer_map <- function(input_filebases, output_filebase, map, buffer_nelems, result_nelems = 0L) {
 81 |     .Call(`_filearray_FARR_buffer_map`, input_filebases, output_filebase, map, buffer_nelems, result_nelems)
 82 | }
 83 | 
 84 | FARR_buffer_map2 <- function(input_filebases, map, buffer_nelems) {
 85 |     .Call(`_filearray_FARR_buffer_map2`, input_filebases, map, buffer_nelems)
 86 | }
 87 | 
 88 | FARR_buffer_mapreduce <- function(filebase, map, reduce, buffer_nelems) {
 89 |     .Call(`_filearray_FARR_buffer_mapreduce`, filebase, map, reduce, buffer_nelems)
 90 | }
 91 | 
 92 | FARR_subset_assign_sequential <- function(filebase, unit_partlen, cum_partsizes, array_type, value, from) {
 93 |     .Call(`_filearray_FARR_subset_assign_sequential`, filebase, unit_partlen, cum_partsizes, array_type, value, from)
 94 | }
 95 | 
 96 | FARR_subset_assign2 <- function(filebase, value, listOrEnv, thread_buffer = 2097152L, split_dim = 0L) {
 97 |     .Call(`_filearray_FARR_subset_assign2`, filebase, value, listOrEnv, thread_buffer, split_dim)
 98 | }
 99 | 
100 | getDefaultNumThreads <- function() {
101 |     .Call(`_filearray_getDefaultNumThreads`)
102 | }
103 | 
104 | getThreads <- function(max) {
105 |     .Call(`_filearray_getThreads`, max)
106 | }
107 | 
108 | kinda_sorted <- function(idx, min_, buffer_count) {
109 |     .Call(`_filearray_kinda_sorted`, idx, min_, buffer_count)
110 | }
111 | 
112 | check_missing_dots <- function(env) {
113 |     .Call(`_filearray_check_missing_dots`, env)
114 | }
115 | 
116 | reshape_or_drop <- function(x, reshape, drop) {
117 |     .Call(`_filearray_reshape_or_drop`, x, reshape, drop)
118 | }
119 | 
120 | 


--------------------------------------------------------------------------------
/src/serialize.cpp:
--------------------------------------------------------------------------------
  1 | #include "serialize.h"
  2 | using namespace Rcpp;
  3 | 
  4 | int read_byte(R_inpstream_t stream){
  5 |     buffer_t *buf = (buffer_t *)stream->data;
  6 |     if (buf->pos >= buf->length) {
  7 |         stop("Overflow in `read_byte()`");
  8 |     }
  9 |     return buf->data[buf->pos++];
 10 | }
 11 | 
 12 | void read_bytes(R_inpstream_t stream, void *dst, int length) {
 13 |     buffer_t *buf = (buffer_t *)stream->data;
 14 |     if (buf->pos + length > buf->length) {
 15 |         stop("Overflow in `read_bytes()`");
 16 |     }
 17 |     memcpy(dst, buf->data + buf->pos, length);
 18 |     buf->pos += length;
 19 | }
 20 | 
 21 | SEXP unserialize_raw(SEXP x) {
 22 |     if (TYPEOF(x) != RAWSXP) {
 23 |         stop("`unserialize_raw` requires raw input");
 24 |     }
 25 |     Rbyte *vec = RAW(x);
 26 |     R_xlen_t len = Rf_xlength(x);
 27 |     
 28 |     buffer_t *buf = (buffer_t*) malloc(sizeof(buffer_t));
 29 |     if (buf == NULL) {
 30 |         stop("`unserialize_raw` Cannot allocate memory for buffer");
 31 |     }
 32 |     buf->length = len;
 33 |     buf->pos    = 0;
 34 |     buf->data   = (unsigned char *) vec;
 35 |     
 36 |     // Treat the data buffer as an input stream
 37 |     struct R_inpstream_st input_stream;
 38 |     
 39 |     R_InitInPStream(
 40 |         &input_stream,                 
 41 |         (R_pstream_data_t) buf,      
 42 |         R_pstream_any_format,        
 43 |         read_byte, 
 44 |         read_bytes, 
 45 |         NULL,       
 46 |         NULL        
 47 |     );
 48 |     
 49 |     // Unserialize!
 50 |     SEXP ret  = PROTECT(R_Unserialize(&input_stream));
 51 |     
 52 |     free(buf);
 53 |     buf = NULL;
 54 |     UNPROTECT(1);
 55 |     return ret;
 56 | }
 57 | 
 58 | SEXP unserialize_connection(FILE* conn, size_t len) {
 59 |     SEXP raw = PROTECT(Rf_allocVector(RAWSXP, len));
 60 |     lendian_fread(RAW(raw), 1, len, conn);
 61 |     SEXP re = PROTECT(unserialize_raw(raw));
 62 |     UNPROTECT(2);
 63 |     return(re);
 64 | }
 65 | 
 66 | bool isLittleEndian(){
 67 |     int x = 1;
 68 |     bool is_little = *((char*)&x) == 1;
 69 |     return ( is_little );
 70 |     // DEBUG test big endianess
 71 |     // return(!is_little);
 72 | }
 73 | 
 74 | void swap_endianess_old(void *ptr, size_t size, size_t nmemb){
 75 |     unsigned char *buffer_src = (unsigned char*)ptr;
 76 |     unsigned char *buffer_dst = new unsigned char[size];
 77 |     size_t ix = 0;
 78 |     for (size_t i = 0; i < nmemb; i++, buffer_src += size) {
 79 |         for (ix = 0; ix < size; ix++) {
 80 |             *(buffer_dst + (size - 1 - ix)) = *(buffer_src + ix);
 81 |         }
 82 |         memcpy(buffer_src, buffer_dst, size);
 83 |     }
 84 |     delete[] buffer_dst;
 85 | }
 86 | 
 87 | void swap_endianess(void *ptr, const size_t& size, const size_t& nmemb){
 88 |     if( size <= 1 || nmemb <= 0 ){ return; }
 89 |     unsigned char *buffer_src1 = (unsigned char*)ptr;
 90 |     unsigned char *buffer_src2 = buffer_src1 + (size - 1);
 91 |     
 92 |     unsigned char tmp = 0;
 93 |     unsigned char* tmpptr = &(tmp);
 94 |     
 95 |     size_t ix = 0;
 96 |     const size_t half_size = size / 2;
 97 |     for (size_t i = 0; i < nmemb; i++) {
 98 |         for(ix = 0; ix < half_size; ix++){
 99 |             *tmpptr = *buffer_src1;
100 |             *buffer_src1++ = *buffer_src2;
101 |             *buffer_src2-- = *tmpptr;
102 |         }
103 |         buffer_src1 += half_size;
104 |         buffer_src2 += half_size + size;
105 |     }
106 | }
107 | 
108 | size_t lendian_fread(void *ptr, size_t size, size_t nmemb, FILE *stream) {
109 |     const size_t len = fread(ptr, size, nmemb, stream);
110 |     if( !isLittleEndian() ){
111 |         // little endian to big endian
112 |         swap_endianess(ptr, size, nmemb);
113 |     }
114 |     return( len );
115 | }
116 | 
117 | size_t lendian_fwrite(void *ptr, size_t size, size_t nmemb, FILE *stream) {
118 |     if( !isLittleEndian() ){
119 |         // big endian to little endian
120 |         swap_endianess(ptr, size, nmemb);
121 |     }
122 |     return( fwrite(ptr, size, nmemb, stream) );
123 | }
124 | 
125 | 
126 | void lendian_assign(void* dst, const void* src, const size_t& elem_size, const size_t& nelems){
127 |     if( !isLittleEndian() ){
128 |         const unsigned char *buffer_src = (const unsigned char*)src;
129 |         unsigned char *buffer_dst = (unsigned char*)dst;
130 |         size_t i = 0;
131 |         for(size_t idx = 0; idx < nelems; idx++){
132 |             for(i = 0; i < elem_size; i++, buffer_dst++){
133 |                 *buffer_dst = *(buffer_src + (idx * elem_size + elem_size - i - 1));
134 |             }
135 |         }
136 |     } else {
137 |         memcpy(dst, src, elem_size * nelems);
138 |     }
139 | }
140 | 
141 | 


--------------------------------------------------------------------------------
/man/mapreduce.Rd:
--------------------------------------------------------------------------------
  1 | % Generated by roxygen2: do not edit by hand
  2 | % Please edit documentation in R/mapreduce.R
  3 | \name{mapreduce}
  4 | \alias{mapreduce}
  5 | \alias{mapreduce,FileArray,ANY,function-method}
  6 | \alias{mapreduce,FileArray,ANY,NULL-method}
  7 | \alias{mapreduce,FileArray,ANY,missing-method}
  8 | \title{A map-reduce method to iterate blocks of file-array data with little memory usage}
  9 | \usage{
 10 | mapreduce(x, map, reduce, ...)
 11 | 
 12 | \S4method{mapreduce}{FileArray,ANY,function}(x, map, reduce, buffer_size = NA, ...)
 13 | 
 14 | \S4method{mapreduce}{FileArray,ANY,NULL}(x, map, reduce, buffer_size = NA, ...)
 15 | 
 16 | \S4method{mapreduce}{FileArray,ANY,missing}(x, map, reduce, buffer_size = NA, ...)
 17 | }
 18 | \arguments{
 19 | \item{x}{a file array object}
 20 | 
 21 | \item{map}{mapping function that receives 3 arguments; see 'Details'}
 22 | 
 23 | \item{reduce}{\code{NULL}, or a function that takes a list as input}
 24 | 
 25 | \item{...}{passed to other methods}
 26 | 
 27 | \item{buffer_size}{control how we split the array; see 'Details'}
 28 | }
 29 | \value{
 30 | If \code{reduce} is \code{NULL}, return mapped results, otherwise
 31 | return reduced results from \code{reduce} function
 32 | }
 33 | \description{
 34 | A map-reduce method to iterate blocks of file-array data with little memory usage
 35 | }
 36 | \details{
 37 | When handling out-of-memory arrays, it is recommended to load
 38 | a block of array at a time and execute on block level. See 
 39 | \code{\link{apply}} for a implementation. When an array is too large,
 40 | and when there are too many blocks, this operation will become 
 41 | very slow if computer memory is low. 
 42 | This is because the R will perform garbage collection frequently. 
 43 | Implemented in \code{C++}, \code{mapreduce} creates a buffer to store
 44 | the block data. By reusing the memory over and over again, it is possible
 45 | to iterate through the array with minimal garbage collections. Many 
 46 | statistics, including \code{min}, \code{max}, \code{sum}, 
 47 | \code{mean}, ... These statistics can be calculated in this 
 48 | way efficiently.
 49 | 
 50 | The function \code{map} contains three arguments: \code{data} (mandate), 
 51 | \code{size} (optional), and \code{first_index} (optional). 
 52 | The \code{data} is the buffer,
 53 | whose length is consistent across iterations. \code{size} indicates
 54 | the effective size of the buffer. If the partition size
 55 | is not divisible by the buffer size, only first \code{size} elements of
 56 | the data are from array, and the rest elements will be \code{NA}. 
 57 | This situation could only occurs when \code{buffer_size} is manually 
 58 | specified. By default, all of \code{data} should belong to arrays.
 59 | The last argument \code{first_index} is the index of the first element
 60 | \code{data[1]} in the whole array. It is useful when positional data 
 61 | is needed. 
 62 | 
 63 | The buffer size, specified by \code{buffer_size} is an 
 64 | additional optional argument in \code{...}. Its default is \code{NA},
 65 | and will be calculated automatically. If manually specified, a
 66 | large buffer size would be desired to speed up the calculation.
 67 | The default buffer size will not exceed \eqn{nThreads x 2MB}, where 
 68 | \code{nThreads} is the number of threads set by \code{\link{filearray_threads}}.
 69 | When partition length cannot be divided by the buffer size, instead of
 70 | trimming the buffer, \code{NA}s will be filled to the buffer, 
 71 | passed to \code{map} function; see previous paragraph for treatments.
 72 | 
 73 | The function \code{mapreduce} ignores the missing partitions. That means
 74 | if a partition is missing, its data will not be read nor passed to 
 75 | \code{map} function. Please run \code{x$initialize_partition()} to make sure
 76 | partition files exist.
 77 | }
 78 | \examples{
 79 | 
 80 | 
 81 | x <- filearray_create(tempfile(), c(100, 100, 10))
 82 | x[] <- rnorm(1e5)
 83 | 
 84 | ## calculate summation
 85 | # identical to sum(x[]), but is more feasible in large cases
 86 | 
 87 | mapreduce(x, map = function(data, size){
 88 |     # make sure `data` is all from array
 89 |     if(length(data) != size){
 90 |         data <- data[1:size]
 91 |     }
 92 |     sum(data)
 93 | }, reduce = function(mapped_list){
 94 |     do.call(sum, mapped_list)
 95 | })
 96 | 
 97 | 
 98 | ## Find elements are less than -3
 99 | positions <- mapreduce(
100 |     x,
101 |     map = function(data, size, first_index) {
102 |         if (length(data) != size) {
103 |             data <- data[1:size]
104 |         }
105 |         which(data < -3) + (first_index - 1)
106 |     },
107 |     reduce = function(mapped_list) {
108 |         do.call(c, mapped_list)
109 |     }
110 | )
111 | 
112 | if(length(positions)){
113 |     x[[positions[1]]]
114 | }
115 | 
116 | 
117 | }
118 | 


--------------------------------------------------------------------------------
/adhoc/other docs/readme-figure.R:
--------------------------------------------------------------------------------
  1 | set.seed(1)
  2 | dim <- c(100,200,200,100)
  3 | lazyx <- lazyarray::create_lazyarray(
  4 |     tempfile(), storage_format = 'double', dim = dim)
  5 | lazyx[] <- NA
  6 | filex <- filearray::filearray_create(
  7 |     tempfile(), dim, 'double')
  8 | filex$initialize_partition()
  9 | xs <- list(lazyx, filex)
 10 | 
 11 | lazyarray:::set_lazy_threads(8)
 12 | filearray::filearray_threads(8)
 13 | 
 14 | set.seed(1)
 15 | tmp <- rnorm(4e7)
 16 | 
 17 | speed1 <- rowMeans(replicate(1, {
 18 |     res <- sapply(xs, function(x){
 19 |         gc()
 20 |         system.time({
 21 |             for(i in 1:10){
 22 |                 x[,,,(i-1)*10 +1:10] <- tmp
 23 |             }
 24 |         }, gcFirst = TRUE)
 25 |     })
 26 |     speed <- prod(dim) *8e-6 / res[3,]; speed
 27 | })); speed1
 28 | 
 29 | speed2 <- rowMeans(replicate(1, {
 30 |     res <- sapply(xs, function(x){
 31 |         gc()
 32 |         system.time({
 33 |             for(i in 1:10){
 34 |                 x[,,,(i-1)*10 +1:10]
 35 |             }
 36 |         }, gcFirst = TRUE)
 37 |     })
 38 |     speed <- prod(dim) *8e-6 / res[3,]; speed
 39 | })); speed2
 40 | 
 41 | set.seed(1)
 42 | locs <- lapply(dim, function(d){
 43 |     sample(1:d, replace = FALSE, size = sample(50:d, 1))
 44 | })
 45 | 
 46 | speed3 <- rowMeans(replicate(10, {
 47 |     res <- sapply(xs, function(x){
 48 |         gc()
 49 |         system.time({
 50 |             x[locs[[1]],locs[[2]],locs[[3]],locs[[4]]]
 51 |         }, gcFirst = TRUE)
 52 |     })
 53 |     speed <- prod(sapply(locs, length)) *8e-6 / res[3,]; speed
 54 | })); speed3
 55 | 
 56 | tmp <- rnorm(prod(sapply(locs, length)))
 57 | speed4 <- rowMeans(replicate(10, {
 58 |     res <- sapply(xs, function(x){
 59 |         gc()
 60 |         system.time({
 61 |             x[locs[[1]],locs[[2]],locs[[3]],locs[[4]]] <- tmp
 62 |         }, gcFirst = TRUE)
 63 |     })
 64 |     speed <- prod(sapply(locs, length)) *8e-6 / res[3,]; speed
 65 | })); speed4
 66 | 
 67 | 
 68 | rm(tmp, sudo_pwd); gc()
 69 | 
 70 | f <- function(txt_cex = 1.2){
 71 |     
 72 |     cols <- c("orange", "dodgerblue3")
 73 |     speed <- cbind(
 74 |         speed1,
 75 |         speed4,
 76 |         speed2,
 77 |         speed3
 78 |     )
 79 |     rownames(speed) <-c("lazyarray", "filearray")
 80 |     colnames(speed) <- NULL
 81 |     
 82 |     txt_cex2 = txt_cex
 83 |     
 84 |     par(mfrow = c(1,2))
 85 |     
 86 |     plt <- barplot.default(
 87 |         speed[,1:2], beside = TRUE,
 88 |         ylab = "Speed (MB/s)", 
 89 |         col = dipsaus::col2hexStr(cols, alpha = 0.5),
 90 |         ylim = c(0, 1000), las = 1, yaxt = "n", 
 91 |         border = NA,
 92 |         main = "Single threaded\nmemory purged", 
 93 |         cex.names = 1.4, cex.lab = 1.4, cex.main = 1.4
 94 |     )
 95 |     axis(2, c(0, 350, 700), las = 1)
 96 |     axis(1, colMeans(plt), las = 1, label = c(
 97 |         "Write 10GB Data\n(Sequential)",
 98 |         "Substitute 800MB Data\n(Random)"
 99 |         # "Read 10GB Data\n(Sequential)",
100 |         # "Subset 800MB Data\n(Random)"
101 |     ), tick = FALSE)
102 |     text.default(
103 |         x = plt,
104 |         y = speed[,1:2],
105 |         labels = sprintf(c(
106 |             "\n%.0f MB/s", 
107 |             "\n%.0f MB/s"
108 |         ), speed[,1:2]),
109 |         cex = txt_cex,
110 |         col = 'white'
111 |     )
112 |     text.default(
113 |         x = plt,
114 |         y = speed[,1:2],
115 |         labels = sprintf("%s\n", rownames(speed)),
116 |         cex = txt_cex2,
117 |         col = cols
118 |     )
119 |     
120 |     plt <- barplot.default(
121 |         speed[,3:4], beside = TRUE,
122 |         ylab = "Speed (GB/s)", 
123 |         col = dipsaus::col2hexStr(cols, alpha = 0.5),
124 |         ylim = c(0, 2500), las = 1, yaxt = "n", 
125 |         border = NA,
126 |         main = "Single threaded\nmemory purged", 
127 |         cex.names = 1.4, cex.lab = 1.4, cex.main = 1.4
128 |     )
129 |     axis(2, c(0, 1000, 2000), las = 1, labels = c(0,1,2))
130 |     axis(1, colMeans(plt), las = 1, label = c(
131 |         # "Write 10GB Data\n(Sequential)",
132 |         # "Substitute 800MB Data\n(Random)"
133 |         "Read 10GB Data\n(Sequential)",
134 |         "Subset 800MB Data\n(Random)"
135 |     ), tick = FALSE)
136 |     text.default(
137 |         x = plt,
138 |         y = speed[,3:4],
139 |         labels = sprintf(c(
140 |             "\n%.0f MB/s", 
141 |             "\n%.0f MB/s"
142 |         ), speed[,3:4]),
143 |         cex = txt_cex,
144 |         col = 'white'
145 |     )
146 |     text.default(
147 |         x = plt,
148 |         y = speed[,3:4],
149 |         labels = sprintf("%s\n", rownames(speed)),
150 |         cex = txt_cex2,
151 |         col = cols
152 |     )
153 |     
154 |     
155 | }
156 | 
157 | # png("./adhoc/other docs/comparison-simple.png", width = 4267, height = 1600, res = 300)
158 | f(0.7)
159 | dev.off()
160 | 


--------------------------------------------------------------------------------
/tests/testthat/test-bind.R:
--------------------------------------------------------------------------------
  1 | test_that("bind (FileArray)", {
  2 |     
  3 |     
  4 |     x <- array(rnorm(120), 2:5)
  5 |     y <- filearray_create(tempfile(), dimension = c(2,3,4,10), partition_size = 3L)
  6 |     z <- filearray_create(tempfile(), dimension = c(2,3,4,10), partition_size = 3L)
  7 |     
  8 |     options("filearray.quiet" = FALSE)
  9 |     on.exit({
 10 |         options("filearray.quiet" = FALSE)
 11 |         y$delete(force = TRUE)
 12 |         z$delete(force = TRUE)
 13 |     }, add = TRUE)
 14 |     lapply(1:10, function(ii){
 15 |         if(ii %% 2 == 0){
 16 |             y[,,,ii] <- x[,,,ii / 2]
 17 |             z[,,,ii] <- x[,,,ii / 2]
 18 |         }
 19 |     })
 20 |     
 21 |     testthat::expect_warning({
 22 |         w <- filearray_bind(y, z, symlink = FALSE)
 23 |         w$delete()
 24 |     }, regexp = "^One or more arrays have last margin size.+")
 25 |     
 26 |     
 27 |     
 28 |     options("filearray.quiet" = TRUE)
 29 |     y$expand(n = 12)
 30 |     z$expand(n = 12)
 31 |     w <- filearray_bind(y, z, symlink = TRUE)
 32 |     l <- filearray_load(w$.filebase, mode = "readonly")
 33 |     
 34 |     on.exit({
 35 |         w$.mode <- "readwrite"
 36 |         w$delete()
 37 |         l$.mode <- "readwrite"
 38 |         l$delete()
 39 |     }, add = TRUE)
 40 |     
 41 |     expect_null({
 42 |         filearray_checkload(filebase = w$.filebase, symlink_ok = TRUE)
 43 |         filearray_checkload(filebase = w$.filebase, partition = 3)
 44 |         NULL
 45 |     })
 46 |     
 47 |     if(w$.header$filearray_bind$symlink){
 48 |         expect_error({
 49 |             filearray_checkload(filebase = w$.filebase, symlink_ok = FALSE)
 50 |         })
 51 |     }
 52 |     
 53 |     expect_identical(w[], l[])
 54 |     expect_identical(w[,,,seq(2,10,2)], x)
 55 |     expect_identical(w[,,,seq(2,10,2) + 12], x)
 56 |     expect_identical(l[,,,seq(2,10,2)], x)
 57 |     expect_identical(l[,,,seq(2,10,2) + 12], x)
 58 |     
 59 |     expect_identical(l[,,,seq(2,10,2) + c(0, 12, 12, 0, NA)], x[,,,c(1:4, NA)])
 60 |     
 61 |     expect_equal(
 62 |         l$collapse(keep = c(2, 3), method = "sum", na.rm = TRUE),
 63 |         apply(x, c(2,3), sum) * 2
 64 |     )
 65 |     
 66 |     # Check if cached bind works
 67 |     l <- filearray_bind(y, z, filebase = w$.filebase, symlink = w$.header$filearray_bind$symlink, overwrite = TRUE, cache_ok = TRUE)
 68 |     
 69 |     expect_true(attr(l, "cached_bind"))
 70 |     
 71 | })
 72 | 
 73 | 
 74 | test_that("bind (FileArrayProxy)", {
 75 |     
 76 |     
 77 |     y <- filearray_create(tempfile(), dimension = c(2,3,4,10), partition_size = 3L, type = "integer")
 78 |     z <- filearray_create(tempfile(), dimension = c(2,3,4,10), partition_size = 3L, type = "double")
 79 |     
 80 |     options("filearray.quiet" = FALSE)
 81 |     on.exit({
 82 |         options("filearray.quiet" = FALSE)
 83 |         y$delete(force = TRUE)
 84 |         z$delete(force = TRUE)
 85 |     }, add = TRUE)
 86 |     
 87 |     y[] <- rep(0L, 240)
 88 |     z[] <- rep(0L, 240)
 89 |     x <- array(rnorm(240), c(2,3,4,10))
 90 |     
 91 |     y <- y + x
 92 |     z <- z + x
 93 |     
 94 |     testthat::expect_error(y$expand(n = 12))
 95 |     
 96 |     options("filearray.quiet" = TRUE)
 97 |     w <- filearray_bind(y, z, symlink = TRUE)
 98 |     l <- filearray_load(w$.filebase, mode = "readonly")
 99 |     
100 |     on.exit({
101 |         w$.mode <- "readwrite"
102 |         w$delete()
103 |         l$.mode <- "readwrite"
104 |         l$delete()
105 |     }, add = TRUE)
106 |     
107 |     expect_null({
108 |         filearray_checkload(filebase = w$.filebase, symlink_ok = TRUE)
109 |         filearray_checkload(filebase = w$.filebase, partition = 3)
110 |         NULL
111 |     })
112 |     
113 |     if(w$.header$filearray_bind$symlink){
114 |         expect_error({
115 |             filearray_checkload(filebase = w$.filebase, symlink_ok = FALSE)
116 |         })
117 |     }
118 |     
119 |     expect_identical(w[], l[])
120 |     expect_identical(w[,,,1:10, dimnames = NULL], x)
121 |     expect_identical(w[,,,1:10 + 12, dimnames = NULL], x)
122 |     expect_identical(l[,,,1:10, dimnames = NULL], x)
123 |     expect_identical(l[,,,1:10 + 12, dimnames = NULL], x)
124 |     
125 |     expect_identical(l[,,,seq(2,10,2) + c(0, 12, 12, 0, NA), dimnames = NULL], x[,,,c(1:4*2, NA)])
126 |     
127 |     expect_equal(
128 |         l$collapse(keep = c(2, 3), method = "sum", na.rm = TRUE),
129 |         apply(x, c(2,3), sum) * 2
130 |     )
131 |     
132 |     expect_equal(
133 |         y$collapse(keep = c(2, 3), method = "sum", na.rm = TRUE),
134 |         apply(x, c(2,3), sum)
135 |     )
136 |     expect_equal(
137 |         z$collapse(keep = c(2, 3), method = "sum", na.rm = TRUE),
138 |         apply(x, c(2,3), sum)
139 |     )
140 |     
141 |     
142 |     # Check if cached bind works
143 |     # l <- filearray_bind(y, z, filebase = w$.filebase, symlink = w$.header$filearray_bind$symlink, overwrite = TRUE, cache_ok = TRUE)
144 |     # 
145 |     # expect_true(attr(l, "cached_bind"))
146 |     
147 | })
148 | 


--------------------------------------------------------------------------------
/adhoc/other docs/comparison-singlethread.R:
--------------------------------------------------------------------------------
  1 | sudo_pwd <- rstudioapi::askForPassword("sudo password")
  2 | cold_start <- TRUE
  3 | purge_memory <- function(){
  4 |     if(cold_start){
  5 |         system("sudo -kS purge", input = sudo_pwd)
  6 |         Sys.sleep(1)
  7 |     }
  8 | }
  9 | 
 10 | set.seed(1)
 11 | dim <- c(100,200,200,100)
 12 | lazyx <- lazyarray::create_lazyarray(
 13 |     tempfile(), storage_format = 'double', dim = dim)
 14 | lazyx[] <- NA
 15 | filex <- filearray::filearray_create(
 16 |     tempfile(), dim, 'double')
 17 | filex$initialize_partition()
 18 | xs <- list(lazyx, filex)
 19 | 
 20 | lazyarray:::set_lazy_threads(1)
 21 | filearray::filearray_threads(1)
 22 | 
 23 | set.seed(1)
 24 | tmp <- rnorm(4e7)
 25 | 
 26 | 
 27 | speed1 <- rowMeans(replicate(5, {
 28 |     res <- sapply(xs, function(x){
 29 |         gc()
 30 |         purge_memory()
 31 |         system.time({
 32 |             for(i in 1:10){
 33 |                 x[,,,(i-1)*10 +1:10] <- tmp
 34 |             }
 35 |         }, gcFirst = TRUE)
 36 |     })
 37 |     speed <- prod(dim) *8e-6 / res[3,]; speed
 38 | })); speed1
 39 | 
 40 | speed2 <- rowMeans(replicate(5, {
 41 |     res <- sapply(xs, function(x){
 42 |         gc()
 43 |         purge_memory()
 44 |         system.time({
 45 |             for(i in 1:10){
 46 |                 x[,,,(i-1)*10 +1:10]
 47 |             }
 48 |         }, gcFirst = TRUE)
 49 |     })
 50 |     speed <- prod(dim) *8e-6 / res[3,]; speed
 51 | })); speed2
 52 | 
 53 | set.seed(1)
 54 | locs <- lapply(dim, function(d){
 55 |     sample(1:d, replace = FALSE, size = 100)
 56 | })
 57 | 
 58 | speed3 <- rowMeans(replicate(5, {
 59 |     res <- sapply(xs, function(x){
 60 |         gc()
 61 |         purge_memory()
 62 |         system.time({
 63 |             x[locs[[1]],locs[[2]],locs[[3]],locs[[4]]]
 64 |         }, gcFirst = TRUE)
 65 |     })
 66 |     speed <- 800 / res[3,]; speed
 67 | })); speed3
 68 | 
 69 | tmp <- rnorm(prod(sapply(locs, length)))
 70 | speed4 <- rowMeans(replicate(5, {
 71 |     res <- sapply(xs, function(x){
 72 |         gc()
 73 |         purge_memory()
 74 |         system.time({
 75 |             x[locs[[1]],locs[[2]],locs[[3]],locs[[4]]] <- tmp
 76 |         }, gcFirst = TRUE)
 77 |     })
 78 |     speed <- 800 / res[3,]; speed
 79 | })); speed4
 80 | 
 81 | 
 82 | rm(tmp, sudo_pwd); gc()
 83 | 
 84 | f <- function(){
 85 |     
 86 |     cols <- c("orange", "dodgerblue3")
 87 |     speed <- cbind(
 88 |         speed1,
 89 |         speed4,
 90 |         speed2,
 91 |         speed3
 92 |     )
 93 |     rownames(speed) <-c("lazyarray", "filearray")
 94 |     colnames(speed) <- NULL
 95 |     
 96 |     txt_cex = 1.2
 97 |     txt_cex2 = 1.2
 98 |     
 99 |     par(mfrow = c(1,2))
100 |     
101 |     plt <- barplot.default(
102 |         speed[,1:2], beside = TRUE,
103 |         ylab = "Speed (MB/s)", 
104 |         col = dipsaus::col2hexStr(cols, alpha = 0.5),
105 |         ylim = c(0, 1000), las = 1, yaxt = "n", 
106 |         border = NA,
107 |         main = sprintf("Single threaded\n%s", ifelse(cold_start, "memory purged", "")), 
108 |         cex.names = 1.4, cex.lab = 1.4, cex.main = 1.4
109 |     )
110 |     axis(2, c(0, 350, 700), las = 1)
111 |     axis(1, colMeans(plt), las = 1, label = c(
112 |         "Write 10GB Data\n(Sequential)",
113 |         "Substitute 800MB Data\n(Random)"
114 |         # "Read 10GB Data\n(Sequential)",
115 |         # "Subset 800MB Data\n(Random)"
116 |     ), tick = FALSE)
117 |     text.default(
118 |         x = plt,
119 |         y = speed[,1:2],
120 |         labels = sprintf(c(
121 |             "\n%.0f MB/s", 
122 |             "\n%.0f MB/s"
123 |         ), speed[,1:2]),
124 |         cex = txt_cex,
125 |         col = 'white'
126 |     )
127 |     text.default(
128 |         x = plt,
129 |         y = speed[,1:2],
130 |         labels = sprintf("%s\n", rownames(speed)),
131 |         cex = txt_cex2,
132 |         col = cols
133 |     )
134 |     
135 |     plt <- barplot.default(
136 |         speed[,3:4], beside = TRUE,
137 |         ylab = "Speed (GB/s)", 
138 |         col = dipsaus::col2hexStr(cols, alpha = 0.5),
139 |         ylim = c(0, 2500), las = 1, yaxt = "n", 
140 |         border = NA,
141 |         main = sprintf("Single threaded\n%s", ifelse(cold_start, "memory purged", "")), 
142 |         cex.names = 1.4, cex.lab = 1.4, cex.main = 1.4
143 |     )
144 |     axis(2, c(0, 1000, 2000), las = 1, labels = c(0,1,2))
145 |     axis(1, colMeans(plt), las = 1, label = c(
146 |         # "Write 10GB Data\n(Sequential)",
147 |         # "Substitute 800MB Data\n(Random)"
148 |         "Read 10GB Data\n(Sequential)",
149 |         "Subset 800MB Data\n(Random)"
150 |     ), tick = FALSE)
151 |     text.default(
152 |         x = plt,
153 |         y = speed[,3:4],
154 |         labels = sprintf(c(
155 |             "\n%.0f MB/s", 
156 |             "\n%.0f MB/s"
157 |         ), speed[,3:4]),
158 |         cex = txt_cex,
159 |         col = 'white'
160 |     )
161 |     text.default(
162 |         x = plt,
163 |         y = speed[,3:4],
164 |         labels = sprintf("%s\n", rownames(speed)),
165 |         cex = txt_cex2,
166 |         col = cols
167 |     )
168 |     
169 |     
170 | }
171 | 
172 | png(sprintf("./adhoc/other docs/comparison-singlethread-%s.png", ifelse(cold_start, "coldstart", "warmstart")), width = 4267, height = 1600, res = 300)
173 | f()
174 | dev.off()
175 | 


--------------------------------------------------------------------------------
/man/fmap.Rd:
--------------------------------------------------------------------------------
  1 | % Generated by roxygen2: do not edit by hand
  2 | % Please edit documentation in R/method-map.R
  3 | \name{fmap}
  4 | \alias{fmap}
  5 | \alias{fmap2}
  6 | \alias{fmap_element_wise}
  7 | \title{Map multiple file arrays and save results}
  8 | \usage{
  9 | fmap(
 10 |   x,
 11 |   fun,
 12 |   .y = NULL,
 13 |   .buffer_count = NA_integer_,
 14 |   .output_size = NA_integer_,
 15 |   ...
 16 | )
 17 | 
 18 | fmap2(x, fun, .buffer_count = NA, .simplify = TRUE, ...)
 19 | 
 20 | fmap_element_wise(x, fun, .y, ..., .input_size = NA)
 21 | }
 22 | \arguments{
 23 | \item{x}{a list of file arrays to map; each element of \code{x} must 
 24 | share the same dimensions.}
 25 | 
 26 | \item{fun}{function that takes one list}
 27 | 
 28 | \item{.y}{a file array object, used to save results}
 29 | 
 30 | \item{.buffer_count}{number of total buffers (chunks) to run}
 31 | 
 32 | \item{.output_size}{\code{fun} output vector length}
 33 | 
 34 | \item{...}{other arguments passing to \code{fun}}
 35 | 
 36 | \item{.simplify}{whether to apply \code{\link[base]{simplify2array}} to 
 37 | the result}
 38 | 
 39 | \item{.input_size}{number of elements to read from each array of \code{x}}
 40 | }
 41 | \value{
 42 | File array instance \code{.y}
 43 | }
 44 | \description{
 45 | Advanced mapping function for multiple file arrays. \code{fmap}
 46 | runs the mapping functions and stores the results in file arrays. 
 47 | \code{fmap2} stores results in memory. This 
 48 | feature is experimental. There are several constraints to the input. 
 49 | Failure to meet these constraints may result in undefined results, or 
 50 | even crashes. Please read Section 'Details' carefully before using 
 51 | this function.
 52 | }
 53 | \details{
 54 | Denote the first argument of \code{fun} as \code{input}, The length 
 55 | of \code{input} equals the length of \code{x}. The size of each
 56 | element of \code{input} is defined by \code{.input_size}, except for the
 57 | last loop. For example, given dimension of each input array as 
 58 | \eqn{10x10x10x10}, if \code{.input_size=100}, then 
 59 | \code{length(input[[1]])=100}. The total number of runs equals to
 60 | \code{length(x[[1]])/100}. If \code{.input_size=300}, then 
 61 | \code{length(input[[1]])} will be \code{300} except for the last run. 
 62 | This is because \eqn{10000} cannot be divided by \code{300}. 
 63 | The element length of the last run will be \code{100}. 
 64 | 
 65 | The returned variable length of \code{fun} will be checked by 
 66 | \code{.output_size}. If the output length exceed \code{.output_size}, 
 67 | an error will be raised.
 68 | 
 69 | Please make sure that \code{length(.y)/length(x[[1]])} equals to 
 70 | \code{.output_size/.input_size}.
 71 | 
 72 | For \code{fmap_element_wise}, the \code{input[[1]]} and output length 
 73 | must be the consistent.
 74 | }
 75 | \examples{
 76 | 
 77 | 
 78 | set.seed(1)
 79 | x1 <- filearray_create(tempfile(), dimension = c(100,20,3))
 80 | x1[] <- rnorm(6000)
 81 | x2 <- filearray_create(tempfile(), dimension = c(100,20,3))
 82 | x2[] <- rnorm(6000)
 83 | 
 84 | # Add two arrays
 85 | output <- filearray_create(tempfile(), dimension = c(100,20,3))
 86 | fmap(list(x1, x2), function(input){
 87 |     input[[1]] + input[[2]]
 88 | }, output)
 89 | 
 90 | # check
 91 | range(output[] - (x1[] + x2[]))
 92 | 
 93 | output$delete()
 94 | 
 95 | # Calculate the maximum of x1/x2 for every 100 elements
 96 | # total 60 batches/loops (`.buffer_count`)
 97 | output <- filearray_create(tempfile(), dimension = c(20,3))
 98 | fmap(list(x1, x2), function(input){
 99 |     max(input[[1]] / input[[2]])
100 | }, .y = output, .buffer_count = 60)
101 | 
102 | # check
103 | range(output[] - apply(x1[] / x2[], c(2,3), max))
104 | 
105 | output$delete()
106 | 
107 | # A large array example
108 | if(interactive()){
109 |     x <- filearray_create(tempfile(), dimension = c(287, 100, 301, 4))
110 |     dimnames(x) <- list(
111 |         Trial = 1:287,
112 |         Marker = 1:100,
113 |         Time = 1:301,
114 |         Location = 1:4
115 |     )
116 | 
117 |     for(i in 1:4){
118 |         x[,,,i] <- runif(8638700)
119 |     }
120 |     # Step 1:
121 |     # for each location, trial, and marker, calibrate (baseline)
122 |     # according to first 50 time-points
123 | 
124 |     output <- filearray_create(tempfile(), dimension = dim(x))
125 | 
126 |     # baseline-percentage change
127 |     fmap(
128 |         list(x),
129 |         function(input){
130 |             # get locational data
131 |             location_data <- input[[1]]
132 |             dim(location_data) <- c(287, 100, 301)
133 | 
134 |             # collapse over first 50 time points for
135 |             # each trial, and marker
136 |             baseline <- apply(location_data[,,1:50], c(1,2), mean)
137 | 
138 |             # calibrate
139 |             calibrated <- sweep(location_data, c(1,2), baseline,
140 |                                 FUN = function(data, bl){
141 |                                     (data / bl - 1) * 100
142 |                                 })
143 |             return(calibrated)
144 |         },
145 | 
146 |         .y = output,
147 | 
148 |         # input dimension is 287 x 100 x 301 for each location
149 |         # hence 4 loops in total
150 |         .buffer_count = 4
151 |     )
152 | 
153 |     # cleanup
154 |     x$delete()
155 | 
156 | }
157 | 
158 | # cleanup
159 | x1$delete()
160 | x2$delete()
161 | output$delete()
162 | }
163 | 


--------------------------------------------------------------------------------
/NEWS.md:
--------------------------------------------------------------------------------
 1 | # filearray 0.2.0
 2 | 
 3 | * Additional `audo_set_headers` to function `filearray_load_or_create` to avoid automatically setting headers, with default being `TRUE` for compatibility concerns
 4 | 
 5 | # filearray 0.1.9
 6 | 
 7 | * Fixed `CRAN` check issue brought by `testthat` update, which requires `bit64` when checking `int64` arrays. Added `bit64` to `Suggests` field to mitigate this check error
 8 | 
 9 | # filearray 0.1.8
10 | 
11 | * Fixed a bug where `C++` errors are not correctly thrown due to a recent `Rcpp` update. This bug may cause erroneous code to execute ignoring try-catch statement and imbalanced protecting stacks (`fwhich`, `mapreduce`)
12 | 
13 | # filearray 0.1.7
14 | 
15 | * Removed non-API R C interface
16 | * Allows `subsetAssign` to assign when value length is 1 (thanks `@talegari` issue `#9`)
17 | * Allows more flexible `fwhich` function with functional `val` input and options to include corresponding values as part of return (thanks `@talegari` issue `#10`)
18 | * Fixed `mapreduce` bug when partition length is way larger than the buffer size, causing buffer to stop updating
19 | * Fixed other `C++` issues, such as unused buffer allocation in subset-assign, and `min_` offset in `sub_vec_range`. These issues are not considered bugs since they don't affect the results, but the performance is slightly increased after fix.
20 | * Replaced `dipsaus` with more updated `ravetools` in the vignette performance profiling
21 | * Updated examples to get rid of warning messages and out-dated arguments.
22 | 
23 | # filearray 0.1.6
24 | 
25 | * Removed `c++11` from system requirement and `Makevars`
26 | * Used `TinyThreads` instead of `OpenMP` to get parallel working on `OSX`
27 | * Added `as_filearray` method, and support optional `float`
28 | * Added array proxy class, allowing arrays to lazy-evaluate simple operators
29 | * Allows user-defined temporary file array path
30 | * Fixed `fmap` issues, using better guesses for default `.input_size`
31 | * Fixed a memory bug caused when partition margin has elements greater than 1 and when `FARR_subset_sequential` is used
32 | * Used `fastmap` to avoid environment look-up
33 | * `fa_eval_ops` allows dimension names
34 | * Larger default buffer size to allow `2^20` in single partition
35 | * Disabled single indexing
36 | * Fixed `endian` issue on `big-endian` platforms
37 | 
38 | # filearray 0.1.5
39 | 
40 | * Fixed a bug when trying to read array data sequentially. The bug is caused by buffer size being greater than the array length, making in a pointer that controls the partition number exceed the end of vector, resulting in undefined behavior. The functions affected are: `fmap`, `fmap2`. The bug has been fixed and passed `valgrind` memory check.
41 | 
42 | # filearray 0.1.4
43 | 
44 | * Fixed a bug when allocated memory is one byte short than requested. The bug would crash R when triggered in certain cases.
45 | * Removed limit to the maximum number of partitions when writing. The previous implementation creates and opens related file descriptors all at once before writing. This setup will raise errors when the number of connections reach to certain limit, often defined by the operating systems. This update only opens the connection on demand. The performance might be impacted when writing to disk, but in return, the program will be more robust
46 | * Fixed `subset` function environment not resolved correctly when using formula
47 | * Added `filearray_load_or_create` as an alternative to `filearray_checkload` by automatically replace existing obsolete array files if the headers, dimensions, or data types don't match. Also `on_missing` argument is provided to allow array initialization if new array is created.
48 | 
49 | # filearray 0.1.3
50 | 
51 | * Automatically detect whether symbolic-link works and show warnings
52 | * Warnings can be suppressed
53 | * Allow extra headers to be set in `meta` file
54 | * Added header signature method
55 | * Fixed symbolic-link issues on `Windows` when partition sizes are 0
56 | * Added check-load function `filearray_checkload` to validate header
57 | * Fixed collapse method when `dimnames` are set
58 | * Fixed an unprotected variable in `C++` code
59 | * `filearray_bind` can use cache if the header signatures agree
60 | * `filearray_bind` can choose to force overwrite
61 | * Added package `digest` to `Imports`
62 | * Fixed a typo and several small bugs
63 | 
64 | 
65 | # filearray 0.1.2
66 | 
67 | * Removed `flush` in saving data to let system decide when to flush to hard drive
68 | * Allowed array to expand along the partition margin
69 | * Fixed dimension name getting dropped under certain situations
70 | * Use 2 cores by default when `R CMD check` is detected
71 | 
72 | # filearray 0.1.1
73 | 
74 | * Added `OpenMP` flag in the `MakeVars`
75 | * Fixed critical bugs that could cause `segfaults`
76 | * Can store `complex` and `float` data types
77 | * Re-implemented read/write functions to use memory map
78 | * Allowed `dimnames` to be set
79 | * Added generics `subset` to subset using `dimnames`
80 | * Added vignette to compare performance
81 | * Added speed comparisons in `README.md`
82 | * Added `collapse` to calculate marginal summation with little memory overhead
83 | * Added `fmap`, `fmap2` to apply functions to one or multiple file arrays with little memory overhead (also very fast)
84 | * Fixed 'unprotected' issues warned by `rchk`
85 | 
86 | # filearray 0.1.0
87 | 
88 | * Added a `NEWS.md` file to track changes to the package.
89 | * Initial implementation
90 | 


--------------------------------------------------------------------------------
/inst/include/TinyParallel/TinyThread.h:
--------------------------------------------------------------------------------
  1 | #ifndef __FILEARRAY_PARALLEL_TINYTHREAD__
  2 | #define __FILEARRAY_PARALLEL_TINYTHREAD__
  3 | 
  4 | #include <cstdlib>
  5 | #include <cstdio>
  6 | 
  7 | #include "Common.h"
  8 | 
  9 | #include <tthread/tinythread.h>
 10 | 
 11 | 
 12 | #include <vector>
 13 | 
 14 | namespace TinyParallel {
 15 | 
 16 | namespace {
 17 | 
 18 | // Class which represents a range of indexes to perform work on
 19 | // (worker functions are passed this range so they know which
 20 | // elements are safe to read/write to)
 21 | class IndexRange {
 22 | public:
 23 | 
 24 |    // Initizlize with a begin and (exclusive) end index
 25 |    IndexRange(std::size_t begin, std::size_t end)
 26 |     : begin_(begin), end_(end)
 27 |    {
 28 |    }
 29 | 
 30 |    // Access begin() and end()
 31 |    std::size_t begin() const { return begin_; }
 32 |    std::size_t end() const { return end_; }
 33 |    std::size_t size() const { return end_ - begin_ ; }
 34 | 
 35 | private:
 36 |    std::size_t begin_;
 37 |    std::size_t end_;
 38 | };
 39 | 
 40 | 
 41 | // Because tinythread allows us to pass only a plain C function
 42 | // we need to pass our worker and range within a struct that we
 43 | // can cast to/from void*
 44 | struct Work {
 45 |    Work(IndexRange range, Worker& worker)
 46 |     :  range(range), worker(worker)
 47 |    {
 48 |    }
 49 |    IndexRange range;
 50 |    Worker& worker;
 51 | };
 52 | 
 53 | // Thread which performs work (then deletes the work object
 54 | // when it's done)
 55 | extern "C" inline void workerThread(void* data) {
 56 |    try
 57 |    {
 58 |       Work* pWork = static_cast<Work*>(data);
 59 |       pWork->worker(pWork->range.begin(), pWork->range.end());
 60 |       delete pWork;
 61 |    }
 62 |    catch(...)
 63 |    {
 64 |    }
 65 | }
 66 | 
 67 | // Function to calculate the ranges for a given input
 68 | std::vector<IndexRange> splitInputRange(const IndexRange& range,
 69 |                                         std::size_t grainSize) {
 70 | 
 71 |    // determine max number of threads
 72 |    std::size_t threads = tthread::thread::hardware_concurrency();
 73 |    char* numThreads = ::getenv("FILEARRAY_NUM_THREADS");
 74 |    if (numThreads != NULL) {
 75 |       int parsedThreads = ::atoi(numThreads);
 76 |       if (parsedThreads > 0)
 77 |          threads = parsedThreads;
 78 |    }
 79 |    // printf("Number of threads: %ld\n", threads);
 80 |    // std::cout << "Number of threads: " << threads << "\n";
 81 | 
 82 |    // compute grainSize (including enforcing requested minimum)
 83 |    std::size_t length = range.end() - range.begin();
 84 |    if (threads == 1)
 85 |       grainSize = length;
 86 |    else if ((length % threads) == 0) // perfect division
 87 |       grainSize = std::max(length / threads, grainSize);
 88 |    else // imperfect division, divide by threads - 1
 89 |       grainSize = std::max(length / (threads-1), grainSize);
 90 | 
 91 |    // allocate ranges
 92 |    std::vector<IndexRange> ranges;
 93 |    std::size_t begin = range.begin();
 94 |    std::size_t end = begin;
 95 |    while (begin < range.end()) {
 96 |      if ((range.end() - (begin + grainSize)) < grainSize)
 97 |        end = range.end();
 98 |      else
 99 |        end = std::min(begin + grainSize, range.end());
100 | 
101 |      ranges.push_back(IndexRange(begin, end));
102 |      begin = end;
103 |    }
104 | 
105 |    // return ranges
106 |    return ranges;
107 | }
108 | 
109 | } // anonymous namespace
110 | 
111 | // Execute the Worker over the IndexRange in parallel
112 | inline void ttParallelFor(std::size_t begin,
113 |                           std::size_t end,
114 |                           Worker& worker,
115 |                           std::size_t grainSize = 1)
116 | {
117 |    // split the work
118 |    IndexRange inputRange(begin, end);
119 |    std::vector<IndexRange> ranges = splitInputRange(inputRange, grainSize);
120 | 
121 |    // create threads
122 |    std::vector<tthread::thread*> threads;
123 |    for (std::size_t i = 0; i<ranges.size(); ++i) {
124 |       threads.push_back(new tthread::thread(workerThread, new Work(ranges[i], worker)));
125 |    }
126 | 
127 |    // join and delete them
128 |    for (std::size_t i = 0; i<threads.size(); ++i) {
129 |       threads[i]->join();
130 |       delete threads[i];
131 |    }
132 | }
133 | 
134 | // Execute the IWorker over the range in parallel then join results
135 | template <typename Reducer>
136 | inline void ttParallelReduce(std::size_t begin,
137 |                              std::size_t end,
138 |                              Reducer& reducer,
139 |                              std::size_t grainSize = 1)
140 | {
141 |    // split the work
142 |    IndexRange inputRange(begin, end);
143 |    std::vector<IndexRange> ranges = splitInputRange(inputRange, grainSize);
144 | 
145 |    // create threads (split for each thread and track the allocated workers)
146 |    std::vector<tthread::thread*> threads;
147 |    std::vector<Worker*> workers;
148 |    for (std::size_t i = 0; i<ranges.size(); ++i) {
149 |       Reducer* pReducer = new Reducer(reducer, TinyParallel::Split());
150 |       workers.push_back(pReducer);
151 |       threads.push_back(new tthread::thread(workerThread, new Work(ranges[i], *pReducer)));
152 |    }
153 | 
154 |    // wait for each thread, join it's results, then delete the worker & thread
155 |    for (std::size_t i = 0; i<threads.size(); ++i) {
156 |       // wait for thread
157 |       threads[i]->join();
158 | 
159 |       // join the results
160 |       reducer.join(static_cast<Reducer&>(*workers[i]));
161 | 
162 |       // delete the worker (which we split above) and the thread
163 |       delete workers[i];
164 |       delete threads[i];
165 |    }
166 | }
167 | 
168 | } // namespace TinyParallel
169 | 
170 | #endif // __FILEARRAY_PARALLEL_TINYTHREAD__
171 | 


--------------------------------------------------------------------------------
/adhoc/other docs/comparison-threads.R:
--------------------------------------------------------------------------------
  1 | sudo_pwd <- rstudioapi::askForPassword("sudo password")
  2 | cold_start <- TRUE
  3 | 
  4 | set.seed(1)
  5 | dim <- c(100,200,200,100)
  6 | filex <- filearray::filearray_create(
  7 |     tempfile(), dim, 'double')
  8 | filex$initialize_partition()
  9 | 
 10 | set.seed(1)
 11 | tmp <- rnorm(1e8)
 12 | 
 13 | threads <- c(1,2,4,8)
 14 | 
 15 | # run in order to make sure we have cold start
 16 | speed1 <- rowMeans(replicate(5, {
 17 |     res <- sapply(threads, function(thread){
 18 |         filearray::filearray_threads(thread)
 19 |         gc()
 20 |         if(cold_start){system("sudo -kS purge", input = sudo_pwd)}
 21 |         Sys.sleep(1)
 22 |         system.time({
 23 |             filex[,,,1:25 + (log2(thread) * 25)] <- tmp
 24 |         })
 25 |     })
 26 |     speed <- length(filex) *8e-6 / 4 / res[3,]; speed
 27 | })); speed1
 28 | 
 29 | rm(tmp); gc()
 30 | 
 31 | 
 32 | speed2 <- rowMeans(replicate(5, {
 33 |     res <- sapply(threads, function(thread){
 34 |         filearray::filearray_threads(thread)
 35 |         gc()
 36 |         if(cold_start){system("sudo -kS purge", input = sudo_pwd)}
 37 |         Sys.sleep(1)
 38 |         system.time({
 39 |             filex[,,,1:25 + (log2(thread) * 25)]
 40 |         })
 41 |     })
 42 |     speed <- length(filex) *8e-6 / 4 / res[3,]; speed
 43 | })); speed2
 44 | 
 45 | 
 46 | # 800 MB data indices
 47 | set.seed(1)
 48 | locs <- lapply(dim, function(d){
 49 |     sample(1:d, replace = FALSE, size = 100)
 50 | })
 51 | 
 52 | # CMD: sudo purge
 53 | speed3 <- rowMeans(replicate(5, {
 54 |     res <- sapply(threads, function(thread){
 55 |         filearray::filearray_threads(thread)
 56 |         gc()
 57 |         if(cold_start){system("sudo -kS purge", input = sudo_pwd)}
 58 |         Sys.sleep(1)
 59 |         system.time({
 60 |             filex[locs[[1]],locs[[2]],locs[[3]],locs[[4]]]
 61 |         }, gcFirst = TRUE)
 62 |     })
 63 |     speed <- 800 / res[3,]; speed
 64 | })); speed3
 65 | 
 66 | 
 67 | tmp <- rnorm(prod(sapply(locs, length)))
 68 | # filex <- filearray::filearray_load(filex$.filebase)
 69 | speed4 <- rowMeans(replicate(5, {
 70 |     res <- sapply(threads, function(thread){
 71 |         filearray::filearray_threads(thread)
 72 |         stopifnot(filearray::filearray_threads() == thread)
 73 |         gc()
 74 |         if(cold_start){system("sudo -kS purge", input = sudo_pwd)}
 75 |         Sys.sleep(1)
 76 |         system.time({
 77 |             filex[locs[[1]],locs[[2]],locs[[3]],locs[[4]]] <- tmp
 78 |         }, gcFirst = TRUE)
 79 |     })
 80 |     speed <- 800 / res[3,]; speed
 81 | })); speed4
 82 | 
 83 | rm(tmp, sudo_pwd); gc()
 84 | # save.image("./adhoc/other docs/comparison-threads.RData")
 85 | 
 86 | f <- function(txt_cex = 0.7){
 87 |     
 88 |     par(mfrow = c(1,2))
 89 |     
 90 |     cols <- c("orangered", "orange", "darkgreen", "dodgerblue3")
 91 |     speed <- cbind(
 92 |         speed1,
 93 |         speed4,
 94 |         speed2,
 95 |         speed3
 96 |     ) / 1024
 97 |     rownames(speed) <-c("1-thread", paste0(c(2,4,8), "-threads"))
 98 |     colnames(speed) <- NULL
 99 |     
100 |     
101 |     txt_cex2 = txt_cex
102 |     
103 |     idx <- 1:2
104 |     plt <- barplot.default(
105 |         speed[,idx], beside = TRUE,
106 |         ylab = "Speed (GB/s)", 
107 |         col = dipsaus::col2hexStr(cols, alpha = 0.5),
108 |         ylim = c(0, 1.2), las = 1, yaxt = "n", 
109 |         border = NA,
110 |         main = "Speed Comparisons (Different Threads, memory purged)", 
111 |         cex.names = 1.4, cex.lab = 1.4, cex.main = 1.4
112 |     )
113 |     axis(2, c(0,0.5,1), las = 1)
114 |     axis(1, colMeans(plt), las = 1, label = c(
115 |         "Write 800MB Data\n(Sequential)",
116 |         "Substitute 800MB Data\n(Random)"
117 |         # "Read 800MB Data\n(Sequential)",
118 |         # "Subset 800MB Data\n(Random)"
119 |     ), tick = FALSE)
120 |     text.default(
121 |         x = plt,
122 |         y = speed[,idx],
123 |         labels = sprintf(c(
124 |             "\n%.0f MB/s", 
125 |             "\n%.0f MB/s"
126 |         ), speed[,idx] * 1024),
127 |         cex = txt_cex,
128 |         col = 'white'
129 |     )
130 |     text.default(
131 |         x = plt,
132 |         y = speed[,idx],
133 |         labels = sprintf("%s\n", rownames(speed)),
134 |         cex = txt_cex2,
135 |         col = cols
136 |     )
137 |     
138 |     
139 |     idx <- 3:4
140 |     plt <- barplot.default(
141 |         speed[,idx], beside = TRUE,
142 |         ylab = "Speed (GB/s)", 
143 |         col = dipsaus::col2hexStr(cols, alpha = 0.5),
144 |         ylim = c(0, 3.5), las = 1, yaxt = "n", 
145 |         border = NA,
146 |         main = "Speed Comparisons (Different Threads, memory purged)", 
147 |         cex.names = 1.4, cex.lab = 1.4, cex.main = 1.4
148 |     )
149 |     axis(2, c(0:3), las = 1)
150 |     axis(1, colMeans(plt), las = 1, label = c(
151 |         # "Write 800MB Data\n(Sequential)",
152 |         # "Substitute 800MB Data\n(Random)"
153 |         "Read 800MB Data\n(Sequential)",
154 |         "Subset 800MB Data\n(Random)"
155 |     ), tick = FALSE)
156 |     text.default(
157 |         x = plt,
158 |         y = speed[,idx],
159 |         labels = sprintf(c(
160 |             "\n%.0f MB/s", 
161 |             "\n%.0f MB/s"
162 |         ), speed[,idx] * 1024),
163 |         cex = txt_cex,
164 |         col = 'white'
165 |     )
166 |     text.default(
167 |         x = plt,
168 |         y = speed[,idx],
169 |         labels = sprintf("%s\n", rownames(speed)),
170 |         cex = txt_cex2,
171 |         col = cols
172 |     )
173 |     
174 |     
175 | }
176 | 
177 | png(sprintf("./adhoc/other docs/comparison-threads-%s.png", ifelse(cold_start, "coldstart", "warmstart")), width = 4267, height = 1600, res = 300)
178 | f()
179 | dev.off()
180 | 


--------------------------------------------------------------------------------
/man/filearray.Rd:
--------------------------------------------------------------------------------
  1 | % Generated by roxygen2: do not edit by hand
  2 | % Please edit documentation in R/class-filearray.R, R/class-proxy.R, R/load.R
  3 | \docType{class}
  4 | \name{as_filearray}
  5 | \alias{as_filearray}
  6 | \alias{FileArrayProxy-class}
  7 | \alias{FileArrayProxy}
  8 | \alias{as_filearrayproxy}
  9 | \alias{filearray}
 10 | \alias{filearray_create}
 11 | \alias{filearray_load}
 12 | \alias{filearray_checkload}
 13 | \alias{filearray_load_or_create}
 14 | \title{Create or load existing file arrays}
 15 | \usage{
 16 | as_filearray(x, ...)
 17 | 
 18 | as_filearrayproxy(x, ...)
 19 | 
 20 | filearray_create(
 21 |   filebase,
 22 |   dimension,
 23 |   type = c("double", "float", "integer", "logical", "raw", "complex"),
 24 |   partition_size = NA,
 25 |   initialize = FALSE,
 26 |   ...
 27 | )
 28 | 
 29 | filearray_load(filebase, mode = c("readwrite", "readonly"))
 30 | 
 31 | filearray_checkload(
 32 |   filebase,
 33 |   mode = c("readonly", "readwrite"),
 34 |   ...,
 35 |   symlink_ok = TRUE
 36 | )
 37 | 
 38 | filearray_load_or_create(
 39 |   filebase,
 40 |   dimension,
 41 |   on_missing = NULL,
 42 |   type = NA,
 43 |   ...,
 44 |   mode = c("readonly", "readwrite"),
 45 |   symlink_ok = TRUE,
 46 |   initialize = FALSE,
 47 |   partition_size = NA,
 48 |   verbose = FALSE,
 49 |   auto_set_headers = TRUE
 50 | )
 51 | }
 52 | \arguments{
 53 | \item{x}{R object such as array, file array proxy, or character that can be 
 54 | transformed into file array}
 55 | 
 56 | \item{...}{additional headers to check used by \code{filearray_checkload}
 57 | (see 'Details'). This argument is ignored by \code{filearray_create}, 
 58 | reserved for future compatibility.}
 59 | 
 60 | \item{filebase}{a directory path to store arrays in the local file 
 61 | system. When creating an array, the path must not exist.}
 62 | 
 63 | \item{dimension}{dimension of the array, at least length of 2}
 64 | 
 65 | \item{type}{storage type of the array; default is \code{'double'}. Other
 66 | options include \code{'integer'}, \code{'logical'}, and \code{'raw'}.}
 67 | 
 68 | \item{partition_size}{positive partition size for the last margin, or
 69 | \code{NA} to automatically guess; see 'Details'.}
 70 | 
 71 | \item{initialize}{whether to initialize partition files; default is false
 72 | for performance considerations. However, if the array is dense, it is 
 73 | recommended to set to true}
 74 | 
 75 | \item{mode}{whether allows writing to the file; choices are 
 76 | \code{'readwrite'} and \code{'readonly'}.}
 77 | 
 78 | \item{symlink_ok}{whether arrays with symbolic-link partitions can pass 
 79 | the test; this is usually used on bound arrays with symbolic-links; see 
 80 | \code{\link{filearray_bind}};}
 81 | 
 82 | \item{on_missing}{function to handle file array (such as initialization)
 83 | when a new array is created; must take only one argument, the array object}
 84 | 
 85 | \item{verbose}{whether to print out some debug messages}
 86 | 
 87 | \item{auto_set_headers}{whether to automatically set headers if array is 
 88 | missing or to be created; default is true}
 89 | }
 90 | \value{
 91 | A \code{\link{FileArray-class}} instance.
 92 | }
 93 | \description{
 94 | Create or load existing file arrays
 95 | }
 96 | \details{
 97 | The file arrays partition out-of-memory array objects and store them 
 98 | separately in local file systems. Since R stores matrices/arrays 
 99 | in column-major style, file array uses the slowest margin (the 
100 | last margin) to slice the partitions. This helps to align the elements
101 | within the files with the corresponding memory order. An array with 
102 | dimension \code{100x200x300x400} has 4 margins. The length of the 
103 | last margin is 400, which is also the maximum number of potential
104 | partitions. The number of partitions are determined by the last margin
105 | size divided by \code{partition_size}. For example, if the partition
106 | size is 1, then there will be 400 partitions. If the partition size 
107 | if 3, there will be 134 partitions. The default partition sizes 
108 | are determined internally following these priorities:
109 | \describe{
110 | \item{1. }{the file size of each partition does not exceed \code{1GB}}
111 | \item{2. }{the number of partitions do not exceed 100}
112 | }
113 | These two rules are not hard requirements. The goal is to reduce the
114 | numbers of partitions as much as possible. 
115 | 
116 | The arguments \code{...} in \code{filearray_checkload} should be named
117 | arguments that provide additional checks for the header information. 
118 | The check will fail if at least one header is not identical. For example,
119 | if an array contains header key-signature pair, one can use 
120 | \code{filearray_checkload(..., key = signature)} to validate the signature.
121 | Note the comparison will be rigid, meaning the storage type of the headers 
122 | will be considered as well. If the signature stored in the array is an 
123 | integer while provided is a double, then the check will result in failure.
124 | }
125 | 
126 | \examples{
127 | 
128 | 
129 | # Prepare
130 | library(filearray)
131 | filebase <- tempfile()
132 | if(file.exists(filebase)){ unlink(filebase, TRUE) }
133 | 
134 | # create array
135 | x <- filearray_create(filebase, dimension = c(200, 30, 8))
136 | print(x)
137 | 
138 | # Assign values
139 | x[] <- rnorm(48000)
140 | 
141 | # Subset
142 | x[1,2,]
143 | 
144 | # load existing array
145 | filearray_load(filebase)
146 | 
147 | x$set_header("signature", "tom")
148 | filearray_checkload(filebase, signature = "tom")
149 | 
150 | \dontrun{
151 | # Trying to load with wrong signature
152 | filearray_checkload(filebase, signature = "jerry")
153 | }
154 | 
155 | 
156 | # check-load, and create a new array if fail
157 | x <- filearray_load_or_create(
158 |     filebase = filebase, dimension = c(200, 30, 8),
159 |     verbose = FALSE, signature = "henry"
160 | )
161 | x$get_header("signature")
162 | 
163 | # check-load with initialization
164 | x <- filearray_load_or_create(
165 |     filebase = filebase,
166 |     dimension = c(3, 4, 5),
167 |     verbose = FALSE, mode = "readonly",
168 |     on_missing = function(array) {
169 |         array[] <- seq_len(60)
170 |     }
171 | )
172 | 
173 | x[1:3,1,1]
174 | 
175 | # Clean up
176 | unlink(filebase, recursive = TRUE)
177 | 
178 | }
179 | \author{
180 | Zhengjia Wang
181 | }
182 | 


--------------------------------------------------------------------------------
/R/header.R:
--------------------------------------------------------------------------------
  1 | #' Position (size): description - example value
  2 | #' 0 (8): determine endianness - 1.0 (double)
  3 | #' 8 (4 x 3): version number - [1, 0, 0] (int[3])
  4 | #' 20 (4): SEXP type - 14L (SEXPTYPE, data are double type)
  5 | #' 24 (4): element size - 8L (int, data element size is 8)
  6 | #' 28 (8): number of last dimensions per partition - 1 (double)
  7 | #' 36 (8): total number of elements - 24 (double)
  8 | #' 44 (4): number of dimensions (ndims) - 3 (int)
  9 | #' 48 (8 x ndims): dimensions - [2, 3, 4] (double)
 10 | #' 
 11 | #' ...
 12 | #' 
 13 | #' 1012 (4): size of the header - 72 (int)
 14 | #' 1016 (8): number of elements in the file - (double)
 15 | #' 
 16 | #' @noRd
 17 | NULL
 18 | 
 19 | write_header <- function(fid, partition, dimension, type, size){
 20 |     seek(con = fid, where = 0, rw = "write")
 21 |     
 22 |     # 0 (8): determine endianness - 1.0 (double)
 23 |     writeBin(con = fid, object = 1.0, size = 8L, endian = ENDIANNESS)
 24 |     
 25 |     # 8 (4 x 3): version number - [1, 0, 0] (int[3])
 26 |     writeBin(con = fid, object = FILE_VER, size = 4L, endian = ENDIANNESS)
 27 |     
 28 |     sexp_type <- type_to_sexp(type)
 29 |     
 30 |     # 20 (4): SEXP type - 14L (SEXPTYPE, data are double type)
 31 |     writeBin(con = fid, object = sexp_type, size = 4L,
 32 |              endian = ENDIANNESS)
 33 |     
 34 |     # 24 (4): element size - 8L (int, data element size is 8)
 35 |     writeBin(con = fid, object = as.integer(size), size = 4L,
 36 |              endian = ENDIANNESS)
 37 |     
 38 |     # partition number
 39 |     partition <- as.double(partition)
 40 |     
 41 |     # 28 (8): number of last dimensions per partition - 1 (double)
 42 |     writeBin(con = fid, object = partition, size = 8L, endian = ENDIANNESS)
 43 |     
 44 |     ## # write length of dimension
 45 |     dimension <- as.double(dimension)
 46 |     len <- prod(dimension)
 47 |     
 48 |     # 36 (8): total number of elements - 24 (double)
 49 |     writeBin(con = fid, object = len, size = 8L, endian = ENDIANNESS)
 50 |     
 51 |     # 44 (4): number of dimensions (ndims) - 3 (int)
 52 |     writeBin(con = fid, object = length(dimension), size = 4L, 
 53 |              endian = ENDIANNESS)
 54 |     
 55 |     # 48 (8 x ndims): dimensions - [2, 3, 4] (double)
 56 |     writeBin(con = fid, object = dimension, size = 8L, 
 57 |              endian = ENDIANNESS)
 58 |     header_len <- 48L + 8L * length(dimension)
 59 |     writeBin(con = fid, object = rep(0L, (HEADER_SIZE - header_len) / 4 - 3),
 60 |              size = 4L, endian = ENDIANNESS)
 61 |     writeBin(con = fid, object = header_len, size = 4L, endian = ENDIANNESS)
 62 |     writeBin(con = fid, object = 0.0, size = 8L, endian = ENDIANNESS)
 63 |     seek(con = fid, where = 0, rw = "write")
 64 | }
 65 | 
 66 | read_header <- function(fid){
 67 |     seek(con = fid, where = 0, rw = "read")
 68 |     
 69 |     one <- readBin(fid, n = 1, size = 8L, what = 'double', endian = ENDIANNESS)
 70 |     native <- one == 1
 71 |     
 72 |     if( native ){
 73 |         endian <- ENDIANNESS
 74 |     } else {
 75 |         if( ENDIANNESS == "little" ){
 76 |             endian <- 'big'
 77 |             # TODO: support big-endian file? maybe not
 78 |             stop("The file endianess is not little?")
 79 |         } else {
 80 |             endian <- "little"
 81 |         }
 82 |     }
 83 |     
 84 |     version <- readBin(fid, what = 'int', n = 3, size = 4L, endian = endian)
 85 |     
 86 |     # type, size
 87 |     sexp_type <- readBin(fid, what = 'int', size = 4L, endian = endian)
 88 |     vsize <- readBin(fid, what = 'int', size = 4L, endian = endian)
 89 |     
 90 |     # partition number
 91 |     partition <- readBin(con = fid, what = 'double', size = 8L, endian = endian)
 92 |     
 93 |     # dimension
 94 |     size <- readBin(con = fid, what = 'double', size = 8L, endian = endian)
 95 |     ndims <- readBin(con = fid, what = 'int', size = 4L, endian = endian)
 96 |     dim <- readBin(con = fid, what = 'double', size = 8L, n = ndims, 
 97 |                    endian = endian)
 98 |     
 99 |     seek(con = fid, where = HEADER_SIZE - 12L, rw = "read")
100 |     header_bytes <- readBin(con = fid, what = 'int', size = 4L, 
101 |                             endian = endian)
102 |     
103 |     content_length <- readBin(con = fid, what = 'double', size = 8L, 
104 |                               endian = endian)
105 |     list(
106 |         endianness = endian,
107 |         version = version,
108 |         sexp_type = sexp_type,
109 |         unit_bytes = vsize,
110 |         partition = partition,
111 |         partition_size = size,
112 |         partition_dim = dim,
113 |         header_bytes = header_bytes,
114 |         content_length = content_length
115 |     )
116 | }
117 | 
118 | validate_header <- function(file, fid){
119 |     fz <- -1
120 |     if(!missing(file)){
121 |         if(!file.exists(file)){
122 |             stop("File is missing")
123 |         }
124 |         fz <- file.size(file)
125 |         if(fz < HEADER_SIZE){
126 |             # Might be on windows and partition files are symlinked
127 |             if(get_os() != "windows" || fz != 0){
128 |                 stop("Invalid `filearray` partition. File size too small:\n  ", file)
129 |             }
130 |         }
131 |         fid <- file(description = file, open = "rb")
132 |         on.exit({
133 |             close(fid)
134 |         })
135 |     }
136 |     
137 |     header <- read_header(fid)
138 |     if( header$header_bytes != 48 + 8 * length(header$partition_dim) ){
139 |         stop("Filearray partition header is corrupted.")
140 |     }
141 |     if( fz > 0 && header$content_length + HEADER_SIZE > fz ){
142 |         stop("Filearray data is corrupted")
143 |     }
144 |     
145 |     return(header)
146 | }
147 | 
148 | set_meta_content <- function(meta_file, data){
149 |     stopifnot(file.exists(meta_file))
150 |     data <- as.list(data)
151 |     data$header_version <- HEADER_VER
152 |     
153 |     keys <- names(data)
154 |     names(data) <- sprintf("__%s__", keys)
155 |     
156 |     conn <- rawConnection(raw(), "w+b")
157 |     saveRDS(file = conn, data, ascii = FALSE)
158 |     v <- rawConnectionValue(conn)
159 |     close(conn)
160 |     
161 |     fid <- file(meta_file, "r+b")
162 |     on.exit({ close(fid) })
163 |     seek(con = fid, where = HEADER_SIZE, origin = "start", rw = "write")
164 |     writeBin(object = v, con = fid, endian = ENDIANNESS)
165 |     
166 |     seek(con = fid, where = HEADER_SIZE - 8L, origin = "start", rw = "write")
167 |     writeBin(con = fid, object = as.double(length(v)), size = 8L, endian = ENDIANNESS)
168 |     seek(con = fid, where = 0, rw = "write")
169 | }
170 | 
171 | 


--------------------------------------------------------------------------------
/tests/testthat/test-map.R:
--------------------------------------------------------------------------------
  1 | library(testthat)
  2 | 
  3 | 
  4 | test_that("map with proxy", {
  5 |     
  6 |     set.seed(10)
  7 |     
  8 |     # A large array example
  9 |     x1 <- filearray_create(temp_path(check = TRUE), dimension = c(28, 100, 3, 4), initialize = FALSE, partition_size = 3L)
 10 |     x1[] <- rnorm(33600)
 11 |     x2 <- x1 + 1
 12 |     x3 <- x1 + x2
 13 |     x4 <- as_filearray(matrix(1:12, nrow = 4))
 14 |     
 15 |     x <- list(x1, x2, x3, x4)
 16 |     # check common input size
 17 |     
 18 |     bc <- 12
 19 |     re <- fmap2(x, function(input) {
 20 |         testthat::expect_length(input, 4)
 21 |         testthat::expect_length(input[[1]], length(x1) / bc)
 22 |         testthat::expect_length(input[[2]], length(x2) / bc)
 23 |         testthat::expect_length(input[[3]], length(x3) / bc)
 24 |         testthat::expect_length(input[[4]], length(x4) / bc)
 25 |         testthat::expect_equal(input[[1]] + 1, input[[2]])
 26 |         testthat::expect_equal(input[[3]], input[[2]] + input[[1]])
 27 |         
 28 |         sum(input[[4]]) + sum(input[[3]] - input[[2]] - input[[1]])
 29 |     }, .buffer_count = bc)
 30 |     expect_equal(re, colSums(matrix(x4[], ncol = bc)))
 31 |     
 32 |     
 33 |     bc <- 4
 34 |     re <- fmap2(x, function(input) {
 35 |         testthat::expect_length(input, 4)
 36 |         testthat::expect_length(input[[1]], length(x1) / bc)
 37 |         testthat::expect_length(input[[2]], length(x2) / bc)
 38 |         testthat::expect_length(input[[3]], length(x3) / bc)
 39 |         testthat::expect_length(input[[4]], length(x4) / bc)
 40 |         testthat::expect_equal(input[[1]] + 1, input[[2]])
 41 |         testthat::expect_equal(input[[3]], input[[2]] + input[[1]])
 42 |         
 43 |         sum(input[[4]]) + sum(input[[3]] - input[[2]] - input[[1]])
 44 |     }, .buffer_count = bc)
 45 |     expect_equal(re, colSums(matrix(x4[], ncol = bc)))
 46 |     
 47 |     bc <- 1
 48 |     re <- fmap2(x, function(input) {
 49 |         testthat::expect_length(input, 4)
 50 |         testthat::expect_length(input[[1]], length(x1) / bc)
 51 |         testthat::expect_length(input[[2]], length(x2) / bc)
 52 |         testthat::expect_length(input[[3]], length(x3) / bc)
 53 |         testthat::expect_length(input[[4]], length(x4) / bc)
 54 |         testthat::expect_equal(input[[1]] + 1, input[[2]])
 55 |         testthat::expect_equal(input[[3]], input[[2]] + input[[1]])
 56 |         
 57 |         sum(input[[4]]) + sum(input[[3]] - input[[2]] - input[[1]])
 58 |     }, .buffer_count = bc)
 59 |     expect_equal(re, colSums(matrix(x4[], ncol = bc)))    
 60 |     
 61 |     
 62 |     # check fmap
 63 |     bc <- 12
 64 |     
 65 |     y <- filearray_create(temp_path(), dimension = c(12,1))
 66 |     fmap(x, function(input) {
 67 |         testthat::expect_length(input, 4)
 68 |         testthat::expect_length(input[[1]], length(x1) / bc)
 69 |         testthat::expect_length(input[[2]], length(x2) / bc)
 70 |         testthat::expect_length(input[[3]], length(x3) / bc)
 71 |         testthat::expect_length(input[[4]], length(x4) / bc)
 72 |         testthat::expect_equal(input[[1]] + 1, input[[2]])
 73 |         testthat::expect_equal(input[[3]], input[[2]] + input[[1]])
 74 |         
 75 |         input[[4]] + sum(input[[3]] - input[[2]] - input[[1]])
 76 |     }, .buffer_count = bc, .y = y)
 77 |     expect_equal(as.vector(y[]), as.vector(x4[]))
 78 |     
 79 |     clear_cache()
 80 | })
 81 | 
 82 | 
 83 | test_that("map filearrays", {
 84 |     
 85 |     # A large array example
 86 |     x <- filearray_create(temp_path(check = TRUE), dimension = c(28, 100, 301, 4), initialize = FALSE, partition_size = 3L)
 87 |     dnames <- list(
 88 |         Trial = sample(c("A", "B"), 28, replace = TRUE),
 89 |         Marker = 1:100,
 90 |         Time = seq(-1,2,0.01),
 91 |         Location = 1:4
 92 |     )
 93 |     dimnames(x) <- dnames
 94 |     
 95 |     expect_equal(dimnames(x), dnames)
 96 |     
 97 |     y <- array(rnorm(length(x)), dim(x))
 98 |     x[] <- y
 99 |     
100 |     output <- filearray_create(temp_path(check = TRUE), dimension = dim(x), initialize = FALSE, partition_size = 4L)
101 |     
102 |     f <- function(input){
103 |         # get locational data
104 |         if(is.list(input)){
105 |             location_data <- input[[1]]
106 |         } else {
107 |             location_data <- input
108 |         }
109 |         
110 |         dim(location_data) <- c(28, 100, 301)
111 |         
112 |         # collapse over first 50 time points for
113 |         # each trial, and marker
114 |         baseline <- apply(location_data[,,1:50], c(1,2), mean)
115 |         
116 |         # calibrate
117 |         calibrated <- sweep(location_data, c(1,2), baseline,
118 |                             FUN = function(data, bl){
119 |                                 (data / bl - 1) * 100
120 |                             })
121 |         return(calibrated)
122 |     }
123 |     
124 |     fmap(x, f, .y = output, .buffer_count = 4)
125 |     
126 |     b <- apply(y, 4, f)
127 |     dim(b) <- dim(y)
128 |     
129 |     expect_equal(output[], b)
130 |     
131 |     d <- fmap2(x, f, .buffer_count = 4, .simplify = TRUE)
132 |     expect_equal(d, b)
133 |     
134 |     x$delete()
135 |     output$delete()
136 |     clear_cache()
137 | })
138 | 
139 | 
140 | test_that("fwhich", {
141 |     x <- filearray_create(temp_path(check = TRUE), dimension = c(28, 100, 301, 4), initialize = FALSE, partition_size = 3L, type = "complex")
142 |     dnames <- list(
143 |         Trial = sample(c("A", "B"), 28, replace = TRUE),
144 |         Marker = 1:100,
145 |         Time = seq(-1,2,0.01),
146 |         Location = 1:4
147 |     )
148 |     dimnames(x) <- dnames
149 |     
150 |     y <- array(rnorm(length(x)), dim(x))
151 |     x[] <- y
152 |     x[1,c(1,3),301,4] <- c(80 + 10i, 80 + 10i)
153 |     y <- x[]
154 |     
155 |     idx <- fwhich(x, val = 80 + 10i, arr.ind = FALSE, ret.values = FALSE)
156 |     expect_equal(idx, which(y == (80 + 10i)))
157 |     
158 |     idx <- fwhich(x, val = 80 + 10i, arr.ind = FALSE, ret.values = TRUE)
159 |     expect_equal(attr(idx, "values"), rep(80 + 10i, length(idx)))
160 |     
161 |     idx1 <- fwhich(x, val = 80 + 10i, arr.ind = TRUE, ret.values = TRUE)
162 |     idx2 <- fwhich(y, val = 80 + 10i, arr.ind = TRUE, ret.values = TRUE)
163 |     expect_equal(idx1, idx2)
164 |     
165 |     # val is a function
166 |     impl <- function(z) { Re(z) > 4 }
167 |     idx1 <- fwhich(x, impl, arr.ind = FALSE, ret.values = TRUE)
168 |     idx2 <- fwhich(x[], impl, arr.ind = FALSE, ret.values = TRUE)
169 |     expect_equal(idx1, idx2)
170 |     
171 |     impl <- function(z) { Im(z) < -100 }
172 |     idx1 <- fwhich(x, impl, arr.ind = TRUE, ret.values = TRUE)
173 |     idx2 <- fwhich(x[], impl, arr.ind = TRUE, ret.values = TRUE)
174 |     expect_equal(idx1, idx2)
175 |     
176 |     x$delete()
177 | })
178 | 
179 | 
180 | 
181 | 


--------------------------------------------------------------------------------
/R/helpers.R:
--------------------------------------------------------------------------------
  1 | # ---- UUID generator -------------------------------------------------------
  2 | new_uuid <- function(prefix = 0L) {
  3 |     sprintf("%04d-%s", prefix, uuid::UUIDgenerate(output = "string"))
  4 | }
  5 | 
  6 | # ---- Index helpers --------------------------------------------------------
  7 | 
  8 | # Check if two arrays share the same dimension
  9 | is_same_dim <- function(x, y) {
 10 |     dx <- dim(x)
 11 |     if(length(dx) >= 2L) {
 12 |         dy <- dim(y)
 13 |         if(length(dx) != length(dy)) { return(FALSE) }
 14 |         # valid # of margins and dimensions are consistent
 15 |         if(all(dx == dy)) { return(TRUE) }
 16 |     } else if( length(x) == length(y) ) {
 17 |         # dx dy might not exist, check length
 18 |         return(TRUE)
 19 |     }
 20 |     return(FALSE)
 21 | }
 22 | 
 23 | # Guess partition size from given dimensions and element size.
 24 | # This function is used to estimate a decent partition size when creating 
 25 | # arrays
 26 | guess_partition <- function(dim, elem_size){
 27 |     last_margin <- dim[[length(dim)]]
 28 |     unit_size <- prod(dim) / last_margin * elem_size
 29 |     
 30 |     # 1: partition size cannot go beyond 1GB
 31 |     max_ <- floor(2^30 / unit_size)
 32 |     if(max_ <= 1L){
 33 |         return(1L)
 34 |     }
 35 |     # 2: n partitions <= 100
 36 |     if(last_margin <= 100){
 37 |         return(1L)
 38 |     }
 39 |     # 3: at most max_ units, at least fct units
 40 |     fct <- ceiling(last_margin / max_)
 41 |     if(fct > 50){
 42 |         return(max_)
 43 |     }
 44 |     while(fct <= 50){
 45 |         max_ <- max_ - 1L
 46 |         fct <- ceiling(last_margin / max_)
 47 |         if(max_ <= 1L){
 48 |             return(1L)
 49 |         }
 50 |     }
 51 |     return(max_)
 52 | }
 53 | 
 54 | 
 55 | guess_fmap_buffer_size <- function(dim, element_size = 16L) {
 56 |     buffer_len <- get_buffer_size() / element_size
 57 |     buffer_large <- get_buffer_size() * 16L # default is 8.5MB, 1024 x 1024
 58 |     
 59 |     
 60 |     dimcprod <- cumprod(dim)
 61 |     len <- dimcprod[[length(dimcprod)]]
 62 |     min_partition_size <- dimcprod[[length(dimcprod) - 1]]
 63 |     
 64 |     # tiny array
 65 |     if( buffer_len >= len ) { return( as.integer(len) ) }
 66 |     
 67 |     # small array
 68 |     if( buffer_large >= min_partition_size ) { return( as.integer(min_partition_size) ) }
 69 |     
 70 |     # large array
 71 |     dimcprod <- dimcprod[dimcprod <= buffer_large]
 72 |     if(!length(dimcprod)) { return( dim[[1]] * dim[[2]] ) }
 73 |     
 74 |     # mid array
 75 |     unit_size <- dimcprod[[length(dimcprod)]]
 76 |     fct <- floor(buffer_large / unit_size)
 77 |     if(fct < 1) { fct <- 1 }
 78 |     return( as.integer(unit_size * fct) )
 79 | }
 80 | 
 81 | common_fmap_buffer_count <- function(..., .list = NULL) {
 82 |     stopifnot(...length() + length(.list) > 0)
 83 |     buffer_large <- get_buffer_size() * 16L
 84 |     sizes <- vapply(c(list(...), .list), function(x){
 85 |         as.double(prod(x))
 86 |     }, FUN.VALUE = 0.0)
 87 |     if(length(sizes) == 1L) { return( sizes ) }
 88 |     if(all(sizes <= buffer_large)) { return(1) }
 89 |     
 90 |     gcd <- Reduce(function(a, b) {
 91 |         if( a <= 0 || b <= 0 ) {
 92 |             stop("Cannot find proper input buffer size when data has zero length")
 93 |         }
 94 |         rem <- a %% b
 95 |         if( rem == 0 ) { return( b ) }
 96 |         return( Recall(b, rem) )
 97 |     }, sizes)
 98 |     max_buffer <- max(sizes / gcd)
 99 |     
100 |     max_mult <- floor(buffer_large / max_buffer)
101 |     if(max_mult <= 1) { return(gcd) }
102 |     
103 |     guess_factors <- seq_len(min(max_mult, gcd))
104 |     guess_factors <- guess_factors[floor(gcd / guess_factors) * guess_factors == gcd]
105 |     return(gcd / max(guess_factors))
106 | }
107 | 
108 | validate_fmap_buffer_count <- function(count, input_lens) {
109 |     count <- count[[1]]
110 |     if( is.na(count) || count != round(count) ) {
111 |         return(FALSE)
112 |     }
113 |     
114 |     sizes <- input_lens[input_lens > 0]
115 |     if( !length(sizes) ) { return(TRUE) }
116 |     
117 |     # # elems in buffer is input_lens / count
118 |     buffer_nelems <- input_lens / count
119 |     
120 |     if(any( buffer_nelems != round(buffer_nelems))) {
121 |         return(FALSE)
122 |     }
123 |     return(structure(TRUE, buffer_nelems = buffer_nelems))
124 | }
125 | 
126 | # ---- data type helpers ----------------------------------------------------
127 | 
128 | is_filearray <- function(object, proxy_ok = TRUE){
129 |     if(!isS4(object)){ return(FALSE) }
130 |     if( !proxy_ok && is_fileproxy(object) ) {
131 |         return( FALSE )
132 |     }
133 |     return(inherits(object, c("FileArray", "FileArrayProxy")))
134 | }
135 | 
136 | is_fileproxy <- function(object){
137 |     if(!isS4(object)){ return(FALSE) }
138 |     return(inherits(object, "FileArrayProxy"))
139 | }
140 | 
141 | 
142 | # Guess output data types for meth operators such as \code{+-*/}
143 | operation_output_type <- function(
144 |         type1, type2, 
145 |         logical = c("integer", "logical"), raw = c("error", "integer"),
146 |         float = getOption("filearray.operator.precision", "double")
147 | ) {
148 |     
149 |     raw <- match.arg(raw)
150 |     has_raw <- identical(type1, "raw") || identical(type2, "raw")
151 |     
152 |     if( has_raw && identical(raw, "error") ) {
153 |         stop("non-numeric argument to binary operator")
154 |     }
155 |     
156 |     if( identical(type1, "complex") ) { return("complex") }
157 |     if( identical(type2, "complex") ) { return("complex") }
158 |     
159 |     if( identical(type1, "double") ) { return(float) }
160 |     if( identical(type2, "double") ) { return(float) }
161 |     
162 |     if( identical(type1, "float") ) { return(float) }
163 |     if( identical(type2, "float") ) { return(float) }
164 |     
165 |     if( identical(type1, "integer") ) { return("integer") }
166 |     if( identical(type2, "integer") ) { return("integer") }
167 |     
168 |     if( identical(type1, "raw") ) { return("integer") }
169 |     if( identical(type2, "raw") ) { return("integer") }
170 |     
171 |     logical <- match.arg(logical)
172 |     if( identical(type1, "logical") ) { return(logical) }
173 |     if( identical(type2, "logical") ) { return(logical) }
174 |     
175 |     stop("unrecognized combination of types: ", type1, ", ", type2)
176 | }
177 | 
178 | 
179 | 
180 | # ---- function calls ---------------------------------------------------
181 | parent_call <- function(def, deparse = FALSE, env = parent.frame()) {
182 |     tryCatch({
183 |         call <- with(env, {match.call(expand.dots = FALSE)})
184 |         if(!missing(def)) {
185 |             def <- substitute(def)
186 |             call[[1]] <- def
187 |         }
188 |         if(deparse) {
189 |             call <- deparse1(call)
190 |         }
191 |         call
192 |     }, error = function(e) { NULL })
193 | }


--------------------------------------------------------------------------------
/inst/include/TinyParallel/Timer.h:
--------------------------------------------------------------------------------
  1 | #ifndef __FILEARRAY_PARALLEL_TIMER__
  2 | #define __FILEARRAY_PARALLEL_TIMER__
  3 | 
  4 | namespace TinyParallel {
  5 |     typedef uint64_t nanotime_t;
  6 | 
  7 |     template <typename Timer>
  8 |     class ProportionTimer {
  9 |     public:
 10 |         ProportionTimer() : 
 11 |             timer(), n(0), id(0)
 12 |         {}
 13 |         
 14 |         ProportionTimer( nanotime_t origin, int id_ ) : 
 15 |             timer(origin), n(0), id(id_)
 16 |         {}
 17 |         
 18 |         inline operator SEXP() const {
 19 |             Rcpp::NumericVector out = (SEXP)timer ;
 20 |             out.attr("n") = n ;
 21 |             return out ;
 22 |         }
 23 |         
 24 |         inline nanotime_t origin() const{
 25 |             return timer.origin() ;
 26 |         }
 27 |         
 28 |         inline int get_n() const {
 29 |             return n ;    
 30 |         }
 31 |         
 32 |         inline void step( const std::string& name) {
 33 |             timer.step(name) ;    
 34 |         }
 35 |         
 36 |         Timer timer ;
 37 |         int n ;
 38 |         int id ;
 39 |         
 40 |     } ;
 41 |     
 42 |     template <typename Timer>
 43 |     class SingleTimer {
 44 |     public:
 45 |         SingleTimer() : timer(){}
 46 |         
 47 |         inline operator SEXP(){
 48 |             Rcpp::List out = Rcpp::List::create(timer) ;
 49 |             out.attr("class") = Rcpp::CharacterVector::create( "SingleTimer", "Timer" );
 50 |             return out ;
 51 |         }
 52 |         
 53 |         inline void step( const char* name ){
 54 |             timer.step(name) ;    
 55 |         }
 56 |         
 57 |     private:
 58 |         Timer timer ;
 59 |     } ;
 60 |     
 61 |     template <typename Timer>
 62 |     class FixedSizeTimers {
 63 |     public:
 64 |         FixedSizeTimers( int n, int ndata_ ) : 
 65 |             timers(n), ndata(ndata_)
 66 |         {}
 67 |         
 68 |         inline ProportionTimer<Timer>& get(int i) {
 69 |             return timers[i] ;
 70 |         }
 71 |         
 72 |         inline ProportionTimer<Timer>& operator[](int i){ 
 73 |             return timers[i]; 
 74 |         }
 75 |         
 76 |         inline operator SEXP(){
 77 |             Rcpp::List out = wrap(timers) ;
 78 |             out.attr("class") = Rcpp::CharacterVector::create("FixedSizeTimers", "Timer") ;
 79 |             out.attr("n") = ndata ;
 80 |             return out ;
 81 |         }
 82 |         
 83 |     private:
 84 |         std::vector< ProportionTimer<Timer> > timers ;
 85 |         int ndata ;
 86 |     } ;
 87 |     
 88 |     template <typename Timer, typename Mutex, typename Locker>
 89 |     class TimersList {
 90 |     public:
 91 |         
 92 |         TimersList(int n_): 
 93 |             timers(), origin(Rcpp::get_nanotime()), n(n_)
 94 |         {
 95 |             timers.push_back( ProportionTimer<Timer>( origin, 0 ) ) ;
 96 |             childs.push_back( std::vector<int>() );
 97 |         }
 98 |         
 99 |         ProportionTimer<Timer>& front(){
100 |             return timers.front() ;
101 |         }
102 |         
103 |         ProportionTimer<Timer>& get_new_timer(int parent_id){
104 |             Locker lock(mutex) ;
105 |             int id = timers.size() ;
106 |             timers.push_back( ProportionTimer<Timer>( origin, id ) ) ;
107 |             childs.push_back( std::vector<int>() );
108 |             std::list< std::vector<int> >::iterator it = childs.begin() ;
109 |             for(int i=0; i<parent_id; i++) ++it ;
110 |             it->push_back(id) ;
111 |             return timers.back() ;
112 |         }
113 |         
114 |         inline operator SEXP(){
115 |             int nt = timers.size() ;
116 |             Rcpp::List data(nt) ;
117 |             typename std::list<ProportionTimer<Timer> >::const_iterator timers_it = timers.begin() ;
118 |             std::list<std::vector<int> >::const_iterator childs_it = childs.begin() ;
119 |             for( int i=0; i<nt; i++, ++timers_it, ++childs_it ){
120 |                 Rcpp::NumericVector x = (SEXP)*timers_it ;
121 |                 Rcpp::IntegerVector kids = Rcpp::wrap(*childs_it) ;
122 |                 x.attr("childs") = kids + 1 ;
123 |                 data[i] = x ;
124 |             }
125 |             data.attr("class") = Rcpp::CharacterVector::create( "TimersList" , "Timer" ) ;
126 |             data.attr("n") = n ;
127 |             return data ;
128 |         }
129 |         
130 |         std::list< ProportionTimer<Timer> > timers ;
131 |         std::list< std::vector<int> > childs ;
132 |         Mutex mutex ;
133 |         nanotime_t origin ;
134 |         int n ;
135 |         
136 |     private:
137 |         TimersList( const TimersList& ) ;
138 |     
139 |     } ;
140 |     
141 |     template <typename Reducer, typename Timer, typename Mutex, typename Locker, typename OUT = Rcpp::List>
142 |     class TimedReducer : public Worker {
143 |     public:
144 |         typedef TimersList<Timer,Mutex,Locker> Timers ; 
145 |         
146 |         Reducer* reducer ;
147 |         Timers& timers ;
148 |         ProportionTimer<Timer>& timer ;
149 |         bool owner ;
150 |         
151 |         TimedReducer( Reducer& reducer_, Timers& timers_) : 
152 |             reducer(&reducer_), 
153 |             timers(timers_), 
154 |             timer(timers.get_new_timer(0)), 
155 |             owner(false)
156 |         {
157 |             timer.step("init");
158 |         }
159 |         
160 |         TimedReducer( const TimedReducer& other, Split s) : 
161 |             reducer( new Reducer(*other.reducer, s) ), 
162 |             timers( other.timers ), 
163 |             timer( timers.get_new_timer(other.timer.id) ),
164 |             owner(true)
165 |         {
166 |             timer.step("init") ;
167 |         }
168 |         
169 |         ~TimedReducer(){
170 |             if(owner && reducer) {
171 |                 delete reducer ;
172 |             }
173 |             reducer = 0 ;
174 |         }
175 |         
176 |         inline void operator()( size_t begin, size_t end){
177 |             timer.n += (end-begin) ;
178 |             timer.step("start") ;
179 |             reducer->operator()(begin, end) ;
180 |             timer.step("work") ;
181 |         }
182 |         
183 |         inline void join(const TimedReducer& rhs){
184 |             rhs.timer.step("start") ;
185 |             reducer->join(*rhs.reducer) ;
186 |             rhs.timer.step("join") ;
187 |         }
188 |         
189 |         inline Rcpp::List get() const {
190 |             timers.front().step("start") ;
191 |             OUT out = reducer->get() ;
192 |             timers.front().step("structure") ;
193 |             
194 |             Rcpp::List res = Rcpp::List::create( (SEXP)timers, out );
195 |             return res ;
196 |         }
197 |         
198 |     private:
199 |         // just to be on the safe side, making sure these are not called
200 |         TimedReducer() ;
201 |         TimedReducer( const TimedReducer& ) ;
202 |     } ;
203 |     
204 |     
205 | } // namespace TinyParallel
206 | 
207 | 
208 | #endif // __FILEARRAY_PARALLEL_COMMON__
209 | 


--------------------------------------------------------------------------------
/R/mapreduce.R:
--------------------------------------------------------------------------------
  1 | #' @title A map-reduce method to iterate blocks of file-array data with little memory usage
  2 | #' @param x a file array object
  3 | #' @param map mapping function that receives 3 arguments; see 'Details'
  4 | #' @param reduce \code{NULL}, or a function that takes a list as input
  5 | #' @param buffer_size control how we split the array; see 'Details'
  6 | #' @param ... passed to other methods
  7 | #' @return If \code{reduce} is \code{NULL}, return mapped results, otherwise
  8 | #' return reduced results from \code{reduce} function
  9 | #' 
 10 | #' @details When handling out-of-memory arrays, it is recommended to load
 11 | #' a block of array at a time and execute on block level. See 
 12 | #' \code{\link{apply}} for a implementation. When an array is too large,
 13 | #' and when there are too many blocks, this operation will become 
 14 | #' very slow if computer memory is low. 
 15 | #' This is because the R will perform garbage collection frequently. 
 16 | #' Implemented in \code{C++}, \code{mapreduce} creates a buffer to store
 17 | #' the block data. By reusing the memory over and over again, it is possible
 18 | #' to iterate through the array with minimal garbage collections. Many 
 19 | #' statistics, including \code{min}, \code{max}, \code{sum}, 
 20 | #' \code{mean}, ... These statistics can be calculated in this 
 21 | #' way efficiently.
 22 | #' 
 23 | #' The function \code{map} contains three arguments: \code{data} (mandate), 
 24 | #' \code{size} (optional), and \code{first_index} (optional). 
 25 | #' The \code{data} is the buffer,
 26 | #' whose length is consistent across iterations. \code{size} indicates
 27 | #' the effective size of the buffer. If the partition size
 28 | #' is not divisible by the buffer size, only first \code{size} elements of
 29 | #' the data are from array, and the rest elements will be \code{NA}. 
 30 | #' This situation could only occurs when \code{buffer_size} is manually 
 31 | #' specified. By default, all of \code{data} should belong to arrays.
 32 | #' The last argument \code{first_index} is the index of the first element
 33 | #' \code{data[1]} in the whole array. It is useful when positional data 
 34 | #' is needed. 
 35 | #' 
 36 | #' The buffer size, specified by \code{buffer_size} is an 
 37 | #' additional optional argument in \code{...}. Its default is \code{NA},
 38 | #' and will be calculated automatically. If manually specified, a
 39 | #' large buffer size would be desired to speed up the calculation.
 40 | #' The default buffer size will not exceed \eqn{nThreads x 2MB}, where 
 41 | #' \code{nThreads} is the number of threads set by \code{\link{filearray_threads}}.
 42 | #' When partition length cannot be divided by the buffer size, instead of
 43 | #' trimming the buffer, \code{NA}s will be filled to the buffer, 
 44 | #' passed to \code{map} function; see previous paragraph for treatments.
 45 | #' 
 46 | #' The function \code{mapreduce} ignores the missing partitions. That means
 47 | #' if a partition is missing, its data will not be read nor passed to 
 48 | #' \code{map} function. Please run \code{x$initialize_partition()} to make sure
 49 | #' partition files exist.
 50 | #' 
 51 | #' @examples 
 52 | #' 
 53 | #' 
 54 | #' x <- filearray_create(tempfile(), c(100, 100, 10))
 55 | #' x[] <- rnorm(1e5)
 56 | #' 
 57 | #' ## calculate summation
 58 | #' # identical to sum(x[]), but is more feasible in large cases
 59 | #' 
 60 | #' mapreduce(x, map = function(data, size){
 61 | #'     # make sure `data` is all from array
 62 | #'     if(length(data) != size){
 63 | #'         data <- data[1:size]
 64 | #'     }
 65 | #'     sum(data)
 66 | #' }, reduce = function(mapped_list){
 67 | #'     do.call(sum, mapped_list)
 68 | #' })
 69 | #' 
 70 | #' 
 71 | #' ## Find elements are less than -3
 72 | #' positions <- mapreduce(
 73 | #'     x,
 74 | #'     map = function(data, size, first_index) {
 75 | #'         if (length(data) != size) {
 76 | #'             data <- data[1:size]
 77 | #'         }
 78 | #'         which(data < -3) + (first_index - 1)
 79 | #'     },
 80 | #'     reduce = function(mapped_list) {
 81 | #'         do.call(c, mapped_list)
 82 | #'     }
 83 | #' )
 84 | #' 
 85 | #' if(length(positions)){
 86 | #'     x[[positions[1]]]
 87 | #' }
 88 | #' 
 89 | #' 
 90 | #' @export
 91 | setGeneric("mapreduce", function(x, map, reduce, ...){
 92 |     standardGeneric("mapreduce")
 93 | })
 94 | 
 95 | buffer_mapreduce <- function(x, map, reduce = NULL, buffer_size = NA){
 96 |     # TODO: edit for proxy arrays
 97 |     if(!x$valid()){
 98 |         stop("Invalid file array")
 99 |     }
100 |     if( is_fileproxy(x) ) {
101 |         x <- fa_eval_ops(x)
102 |     }
103 |     
104 |     current_bsz <- get_buffer_size()
105 |     on.exit({
106 |         set_buffer_size(current_bsz)
107 |     })
108 |     set_buffer_size(max_buffer_size())
109 |     
110 |     argnames <- names(formals(map))
111 |     if(length(argnames) < 3 && !'...' %in% argnames){
112 |         if( length(argnames) == 1 ){
113 |             map_ <- function(data, size, idx){
114 |                 map(data)
115 |             }
116 |         } else {
117 |             map_ <- function(data, size, idx){
118 |                 map(data, size)
119 |             }
120 |         }
121 |     } else {
122 |         map_ <- function(data, size, idx){
123 |             map(data, size, idx)
124 |         }
125 |     }
126 |     
127 |     if(is.function(reduce) && !length(formals(reduce))){
128 |         stop("Reduce function must contain at least one argument")
129 |     }
130 |     
131 |     dim <- x$dimension()
132 |     cum_partlen <- x$.partition_info[,3]
133 |     sexp_type <- x$sexp_type()
134 |     
135 |     if(is.na(buffer_size)){
136 |         elem_size <- get_elem_size(typeof(x))
137 |         mbsz <- max_buffer_size() * getThreads(FALSE) / elem_size
138 |         
139 |         sel <- cumprod(dim) <= mbsz
140 |         if(any(sel)){
141 |             buffer_size <- prod(dim[sel])
142 |         } else {
143 |             buffer_size <- dim[[1]]
144 |         }
145 |     }
146 |     
147 |     if(buffer_size < 1){
148 |         buffer_size <- 1
149 |     }
150 |     
151 |     # filebase <- paste0(x$.filebase, x$.sep)
152 |     # FARR_buffer_mapreduce(filebase, map_, reduce, dim, 
153 |     #                       cum_partlen, buffer_size, sexp_type)
154 |     FARR_buffer_mapreduce(
155 |         x$.filebase,
156 |         map_, reduce,
157 |         buffer_size
158 |     )
159 | }
160 | 
161 | #' @rdname mapreduce
162 | #' @export
163 | setMethod(
164 |     mapreduce, signature(x = "FileArray", reduce = "function"),
165 |     function(x, map, reduce, buffer_size = NA, ...){
166 |         buffer_mapreduce(x, map, reduce, buffer_size)
167 |     }
168 | )
169 | 
170 | #' @rdname mapreduce
171 | #' @export
172 | setMethod(
173 |     mapreduce, signature(x = "FileArray", reduce = "NULL"),
174 |     function(x, map, reduce, buffer_size = NA, ...){
175 |         buffer_mapreduce(x, map, NULL, buffer_size, ...)
176 |     }
177 | )
178 | 
179 | #' @rdname mapreduce
180 | #' @export
181 | setMethod(
182 |     mapreduce, signature(x = "FileArray", reduce = "missing"),
183 |     function(x, map, reduce, buffer_size = NA, ...){
184 |         buffer_mapreduce(x, map, NULL, buffer_size, ...)
185 |     }
186 | )
187 | 


--------------------------------------------------------------------------------
/R/methods-subset.R:
--------------------------------------------------------------------------------
  1 | # DIPSAUS DEBUG START
  2 | # verbose <- TRUE
  3 | # z = filearray_create(temp_path(), c(2,3,4))
  4 | # z[] <- 1:24
  5 | # aaa <- z+1
  6 | # print(aaa[,,4])
  7 | # idx <- as_filearray(z[] > 2)
  8 | # 
  9 | # print(z[idx])
 10 | # print(aaa[idx])
 11 | 
 12 | # Normal subset via indices
 13 | fa_subset1 <- function(x, ..., drop = TRUE, reshape = NULL, strict = TRUE, dimnames = TRUE, split_dim = 0) {
 14 |     if(!x$valid()){
 15 |         stop("Invalid file array")
 16 |     }
 17 |     drop <- isTRUE(drop)
 18 |     # file <- temp_path(); x <- filearray_create(file, c(300, 400, 100, 1))
 19 |     arglen <- ...length()
 20 |     elem_size <- x$element_size()
 21 |     dim <- x$dimension()
 22 |     
 23 |     listOrEnv <- list()
 24 |     if(arglen == 1){
 25 |         tmp <- tryCatch({
 26 |             ...elt(1)
 27 |         }, error = function(e){
 28 |             NULL
 29 |         })
 30 |         if(length(tmp)){
 31 |             stop("Subset FileArray only allows x[], x[i:j] or x[i,j,...] (single logical index like x[c(TRUE, ...)] is not allowed)")
 32 |         }
 33 |         
 34 |         
 35 |     } else if(arglen > 1){
 36 |         if(arglen != length(dim)){
 37 |             stop("Subset FileArray dimension mismatch.")
 38 |         }
 39 |         missing_args <- check_missing_dots(environment())
 40 |         
 41 |         for(ii in seq_len(arglen)){
 42 |             if( missing_args[[ii]] ){
 43 |                 listOrEnv[[ii]] <- seq_len(dim[[ii]])
 44 |             } else {
 45 |                 tmp <- ...elt(ii)
 46 |                 if(!length(tmp)){
 47 |                     tmp <- integer(0L)
 48 |                 } else if(is.logical(tmp)){
 49 |                     if(length(tmp) > dim[[ii]]){
 50 |                         stop("(subscript) logical subscript too long")
 51 |                     }
 52 |                     tmp <- rep(tmp, ceiling(dim[[ii]] / length(tmp)))
 53 |                     tmp <- tmp[seq_len(dim[[ii]])]
 54 |                     tmp <- seq_along(tmp)[tmp]
 55 |                 }
 56 |                 listOrEnv[[ii]] <- tmp
 57 |             }
 58 |         }
 59 |     }
 60 |     
 61 |     # guess split dim
 62 |     max_buffer <- get_buffer_size() / elem_size
 63 |     
 64 |     if(length(listOrEnv) == length(dim)){
 65 |         idxrange <- sapply(listOrEnv, function(x){
 66 |             if(!length(x) || all(is.na(x))){ return(1L) }
 67 |             rg <- range(x, na.rm = TRUE)
 68 |             return(rg[2] - rg[1] + 1)
 69 |         })
 70 |     } else {
 71 |         idxrange <- dim
 72 |     }
 73 |     split_dim <- as.integer(split_dim)
 74 |     if(is.na(split_dim) || split_dim <= 0 || split_dim >= length(dim)){
 75 |         # worst-case time-complexity
 76 |         time_complexity <-
 77 |             sapply(seq_len(length(dim) - 1), function(split_dim) {
 78 |                 dim[[length(dim)]] <- 1
 79 |                 idx1dim <- dim[seq_len(split_dim)]
 80 |                 idx1dim[[split_dim]] <- idxrange[[split_dim]]
 81 |                 idx1len <- prod(idx1dim)
 82 |                 idx2len <- prod(dim[-seq_len(split_dim)])
 83 |                 buffer_sz <-
 84 |                     ifelse(idx1len > max_buffer, max_buffer, idx1len)
 85 |                 nloops <- ceiling(idx1len / buffer_sz)
 86 |                 (idx1len * nloops + idx2len) * idx2len
 87 |             })
 88 |         split_dim <- which.min(time_complexity)
 89 |         split_dim <- split_dim[[length(split_dim)]]
 90 |     }
 91 |     
 92 |     
 93 |     FARR_subset2(
 94 |         filebase = x$.filebase,
 95 |         listOrEnv = listOrEnv,
 96 |         reshape = reshape,
 97 |         drop = drop,
 98 |         use_dimnames = isTRUE(dimnames),
 99 |         thread_buffer = get_buffer_size(), 
100 |         split_dim = split_dim,
101 |         strict = isTRUE(strict)
102 |     )
103 |     
104 | }
105 | 
106 | # Special subset using another array
107 | fa_subset2 <- function(x, i, ...) {
108 |     
109 |     single_index <- FALSE
110 |     if(!is_filearray(i)) {
111 |         if(is.array(i) && is.logical(i)) {
112 |             if(!is_same_dim(x, i)) {
113 |                 stop("x[i]: `x` and `i` must share the same dimension when i is a logical array.")
114 |             }
115 |             i <- as_filearray(i)
116 |         } else if (is.numeric(i) && is.vector(i)) {
117 |             single_index <- TRUE
118 |         } else {
119 |             stop("x[i]: Cannot determine subset mode from `i`: must be a logical array or integer vector")
120 |         }
121 |     }
122 |     
123 |     if( single_index ) {
124 |         stop("Single vector indexing (e.g. x[c(1,2,3,4,...)]) hasn't been implemented yet.")
125 |         # re <- fastmap::fastqueue()
126 |         # idx <- fastmap::fastmap()
127 |         # idx$set("start_idx", 0L)
128 |         # idx$set("idx_left", length(i))
129 |         # 
130 |         # 
131 |         # fa_eval_ops(x, addon = function(env, data, uuid) {
132 |         #     idx_left <- idx$get("idx_left")
133 |         #     
134 |         #     if( idx_left < 0 ) {
135 |         #         return(NULL)
136 |         #     }
137 |         #     start_idx <- idx$get("start_idx")
138 |         #     
139 |         #     vec <- env[[ uuid ]]
140 |         #     
141 |         #     end_idx <- start_idx + length(vec)
142 |         #     sel <- i[ i > start_idx & i <= end_idx] - start_idx
143 |         #     if(length(sel)) {
144 |         #         if(idx_left <= length(sel)) {
145 |         #             sel <- sel[seq_len(idx_left)]
146 |         #             re$add(vec[sel])
147 |         #             idx_left <- 0L
148 |         #         } else {
149 |         #             re$add(vec[sel])
150 |         #         }
151 |         #         idx$set("idx_left", idx_left - length(sel))
152 |         #     }
153 |         #     idx$set("start_idx", start_idx)
154 |         # })
155 |         # re <- unlist(re$as_list())
156 |         # 
157 |     } else {
158 |         re <- unlist(
159 |             fmap2(
160 |                 x = list(x, i),
161 |                 fun = function(v) {
162 |                     v[[1]][v[[2]]]
163 |                 }, 
164 |                 .simplify = FALSE
165 |             )
166 |         )
167 |     }
168 |     
169 |     expected_mode <- operation_output_type(
170 |         typeof(x), typeof(x), float = "double", logical = "logical")
171 |     if(!identical(expected_mode, mode(re))) {
172 |         mode(re) <- expected_mode
173 |     }
174 |     return(re)
175 |     
176 | }
177 | 
178 | #' @describeIn S3-filearray subset array
179 | #' @export
180 | `[.FileArray` <- function(x, i, ..., drop = TRUE, reshape = NULL, strict = TRUE, dimnames = TRUE, split_dim = 0) {
181 |     x <- fa_eval_ops(x)
182 |     if(missing(i)) {
183 |         return(fa_subset1(x, , ..., drop = drop, reshape = reshape, strict = strict, dimnames = dimnames, split_dim = split_dim))
184 |     } else {
185 |         if(is_filearray(i) || is.array(i) || 
186 |            ( ...length() == 0 && !is.logical(i) )) {
187 |             return(fa_subset2(x, i))
188 |         } else {
189 |             return(fa_subset1(x, i, ..., drop = drop, reshape = reshape, strict = strict, dimnames = dimnames, split_dim = split_dim))
190 |         }
191 |     }
192 | }
193 | 
194 | 
195 | 
196 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | <img src="https://raw.githubusercontent.com/dipterix/filearray/main/inst/hexbadge.png" height="160px" align="right" />
  2 | 
  3 | # File-Backed Array for Out-of-memory Computation
  4 | 
  5 | 
  6 | <!-- badges: start -->
  7 | [![R-check](https://github.com/dipterix/filearray/actions/workflows/R-CMD-check.yaml/badge.svg)](https://github.com/dipterix/filearray/actions/workflows/R-CMD-check.yaml)
  8 | [![CRAN status](https://www.r-pkg.org/badges/version/filearray)](https://CRAN.R-project.org/package=filearray)
  9 | [![Develop](https://dipterix.r-universe.dev/badges/filearray)](https://dipterix.r-universe.dev/)
 10 | <!-- badges: end -->
 11 | 
 12 | Stores large arrays in files to avoid occupying large memories. Implemented with super fast gigabyte-level multi-threaded reading/writing via `OpenMP`. Supports multiple non-character data types (double, float, integer, complex, logical and raw).
 13 | 
 14 | ![](https://raw.githubusercontent.com/dipterix/filearray/main/adhoc/readme-speed.png)
 15 | 
 16 | <small> Speed comparisons with `lazyarray` (`zstd`-compressed out-of-memory array), and in-memory operation. The speed test was conducted on an `MacBook Air (M1, 2020, 8GB RAM)`, with 8-threads. `filearray` is uniformly faster than `lazyarray`. Random access has almost the same speed as the native array operation in R. *(The actual speed may vary depending on the storage type and memory size)* </small>
 17 | 
 18 | ## Installation
 19 | 
 20 | ```r
 21 | install.packages("filearray")
 22 | ```
 23 | 
 24 | ### Install Develop Version
 25 | 
 26 | The internal functions are written in `C++`. To avoid compiling the packages, you can install from my personal repository. It's automatically updated every hour. Currently available on `Windows` and `osx (Intel chip)` only.
 27 | 
 28 | ```r
 29 | options(repos = c(
 30 |     dipterix = 'https://dipterix.r-universe.dev',
 31 |     CRAN = 'https://cloud.r-project.org'))
 32 | 
 33 | install.packages('filearray')
 34 | ```
 35 | 
 36 | Alternatively, you can compile from `Github` repository. This requires proper compilers (`rtools` on `windows`, or `xcode-select --install` on `osx`, or `build-essentials` on `linux`).
 37 | 
 38 | ```r
 39 | # install.packages("remotes")
 40 | remotes::install_github("dipterix/filearray")
 41 | ```
 42 | 
 43 | ## Basic Usage
 44 | 
 45 | #### Create/load file array
 46 | 
 47 | ```r
 48 | library(filearray)
 49 | file <- tempfile()
 50 | x <- filearray_create(file, c(100, 100, 100, 100))
 51 | 
 52 | # load existing
 53 | x <- filearray_load(file)
 54 | ```
 55 | 
 56 | See more: `help("filearray")`
 57 | 
 58 | #### Assign & subset array
 59 | 
 60 | ```r
 61 | x[,,,1] <- rnorm(1e6)
 62 | x[1:10,1,1,1]
 63 | ```
 64 | 
 65 | #### Generics
 66 | 
 67 | ```r
 68 | typeof(x)
 69 | max(x, na.rm = TRUE)
 70 | apply(x, 3, min, na.rm = TRUE)
 71 | 
 72 | val = x[1,1,5,1]
 73 | fwhich(x, val, arr.ind = TRUE)
 74 | ```
 75 | 
 76 | See more: `help("S3-filearray")`, `help("fwhich")`
 77 | 
 78 | #### Map-reduce
 79 | 
 80 | Process segments of array and reduce to save memories.
 81 | 
 82 | ```
 83 | # Identical to sum(x, na.rm = TRUE)
 84 | mapreduce(x, 
 85 |           map = \(data){ sum(data, na.rm = TRUE) }, 
 86 |           reduce = \(mapped){ do.call(sum, mapped) })
 87 | ```
 88 | 
 89 | See more: `help("mapreduce")`
 90 | 
 91 | #### Collapse
 92 | 
 93 | Transform data, and collapse (calculate sum or mean) along margins.
 94 | 
 95 | ```
 96 | a <- x$collapse(keep = 4, method = "mean", transform = "asis")
 97 | 
 98 | # equivalent to
 99 | b <- apply(x[], 4, mean)
100 | 
101 | a[1] - b[1]
102 | ```
103 | 
104 | Available `transform` for double/integer numbers are:
105 | 
106 | * `asis`: no transform
107 | * `10log10`: `10 * log10(v)`
108 | * `square`: `v * v` 
109 | * `sqrt`: `sqrt(v)`
110 | 
111 | For complex numbers, `transform` is a little bit different:
112 | 
113 | * `asis`: no transform
114 | * `10log10`: `10 * log10(|x|^2)` (power to decibel unit)
115 | * `square`: `|x|^2` 
116 | * `sqrt`: `|x|` (modulus)
117 | * `normalize`: `x / |x|` (unit length)
118 | 
119 | ## Notes
120 | 
121 | #### I. Notes on precision
122 | 
123 | 1. `complex` numbers: In native `R`, complex numbers are combination of two `double` numbers - real and imaginary (total 16 bytes). In `filearray`, complex numbers are coerced to two `float` numbers and store each number in 8 bytes. This conversion will gain performance speed, but lose precision at around 8 decimal place. For example, `1.0000001` will be store as `1`, or `123456789` will be stored as `123456792` (first 7 digits are accurate).
124 | 
125 | 2. `float` type: Native R does not have float type. All numeric values are stored in double precision. Since float numbers use half of the space, float arrays can be faster when hard drive speed is the bottle-neck (see [performance comparisons](https://dipterix.org/filearray/articles/performance.html)). However coercing double to float comes at costs:
126 |   a). float number has less precision 
127 |   b). float number has smaller range ($3.4\times 10^{38}$) than double ($1.7\times 10^{308}$)
128 | hence use with caution when data needs high precision or the max is super large.
129 | 
130 | 3. `collapse` function: when data range is large (say `x[[1]]=1`, but `x[[2]]=10^20`), `collapse` method might lose precision. This is `double` only uses 8 bytes of memory space. When calculating summations, R internally uses `long double` to prevent precision loss, but current `filearray` implementation uses `double`, causing floating error around 16 decimal place. 
131 | 
132 | #### II. Cold-start vs warm-start
133 | 
134 | As of version `0.1.1`, most file read/write operations are switched from `fopen` to memory map for two simplify the logic (buffer size, kernel cache...), and to boost the writing/some types of reading speed. While sacrificing the speed of reading large block of data from 2.4GB/s to 1.7GB/s, the writing speed was boosted from 300MB/s to 700MB/s, and the speed of random accessing small slices of data was increased from 900MB/s to 2.5GB/s. As a result, some functions can reach to really high speed (close to in-memory calls) while using much less memory. 
135 | 
136 | The additional performance improvements brought by the memory mapping approach might be impacted by "cold" start. When reading/writing files, most modern systems will cache the files so that it can load up these files faster next time. I personally call it a cold start. Memory mapping have a little bit extra overhead during the cold start, resulting in decreased performance (but it's still fast). Accessing the same data after the cold start is called warm start. When operating with warm starts, `filearray` is as fast as native R arrays (sometimes even faster due to the indexing method and fewer garbage collections). This means `filearray` reaches its best performance when the arrays are re-used.
137 | 
138 | #### III. Using traditional HDD?
139 | 
140 | `filearray` relies on `SSD`, especially `NVMe SSD` that allows you to fast-access random hard disk address. If you use `HDD`, `filearray` can provide very limited improvement. 
141 | 
142 | If you use `filearray` to direct access to HDD, please set number of threads to `1` via `filearray::filearray_threads(1)` at start up, or set system environment `FILEARRAY_NUM_THREADS` to `"1"`.
143 | 
144 | 
145 | 


--------------------------------------------------------------------------------
/R/methods-subsetAssign.R:
--------------------------------------------------------------------------------
  1 | fa_subsetAssign1 <- function(x, ..., value){
  2 |     if(!x$valid()){
  3 |         stop("Invalid file array")
  4 |     }
  5 |     if(isTRUE(x$.mode == 'readonly')){
  6 |         stop("File array is read-only")
  7 |     }
  8 |     
  9 |     buf_bytes <- get_buffer_size()
 10 |     on.exit({
 11 |         set_buffer_size(buf_bytes)
 12 |     })
 13 |     
 14 |     # parse ...
 15 |     dim <- x$dimension()
 16 |     arglen <- ...length()
 17 |     
 18 |     
 19 |     locs <- list()
 20 |     if(arglen <= 1){
 21 |         if( arglen == 1 ){
 22 |             missing_args <- check_missing_dots(environment())
 23 |             if(!missing_args){
 24 |                 stop("SubsetAssign FileArray only allows x[] <- v or x[i,j,...] <- v (single index not allowed)")
 25 |             }
 26 |         }
 27 |         if(!length(value) %in% c(1, prod(dim))){
 28 |             stop("SubsetAssign FileArray `value` length mismatch: `value` length must be either 1 or the same length of the subset.")
 29 |         }
 30 |         target_dim <- dim
 31 |         x$initialize_partition(x$.partition_info[, 1])
 32 |     } else if(arglen > 1){
 33 |         if(arglen != length(dim)){
 34 |             stop("SubsetAssign FileArray dimension mismatch.")
 35 |         }
 36 |         
 37 |         missing_args <- check_missing_dots(environment())
 38 |         for(ii in seq_len(arglen)){
 39 |             if(missing_args[[ii]]){
 40 |                 locs[[ii]] <- seq_len(dim[[ii]])
 41 |             } else {
 42 |                 tmp <- ...elt(ii)
 43 |                 if( !length(tmp) ){
 44 |                     return(x)
 45 |                 }
 46 |                 if(any(is.na(tmp))){
 47 |                     stop("SubsetAssign cannot contain duplicated or invalid indices.")
 48 |                 }
 49 |                 if(is.logical(tmp)){
 50 |                     if(length(tmp) > dim[[ii]]){
 51 |                         stop("(subscript) logical subscript too long")
 52 |                     }
 53 |                     tmp <- rep(tmp, ceiling(dim[[ii]] / length(tmp)))
 54 |                     tmp <- which(tmp[seq_len(dim[[ii]])])
 55 |                 } else if(any(tmp > dim[[ii]]) ||
 56 |                           any(tmp <= 0) ||
 57 |                           any(duplicated(tmp))){
 58 |                     stop("SubsetAssign cannot contain duplicated or invalid indices.")
 59 |                 }
 60 |                 locs[[ii]] <- tmp
 61 |             }
 62 |         }
 63 |         
 64 |         target_dim <- sapply(locs, length)
 65 |         if(!length(value) %in% c(1, prod(target_dim))){
 66 |             stop("SubsetAssign FileArray `value` length mismatch: `value` length must be either 1 or the same length of the subset.")
 67 |         }
 68 |         
 69 |         # make sure partitions exist
 70 |         tmp <- locs[[length(locs)]]
 71 |         sapply(tmp, function(i){
 72 |             sel <- x$.partition_info[,3] <= i
 73 |             if(any(sel)){
 74 |                 sel <- max(x$.partition_info[sel,1])
 75 |                 if(x$.partition_info[sel, 3] < i){
 76 |                     sel <- sel + 1
 77 |                 }
 78 |             } else {
 79 |                 sel <- 1
 80 |             }
 81 |             x$initialize_partition(sel)
 82 |         })
 83 |     }
 84 |     
 85 |     if(prod(target_dim) == 0){
 86 |         return(invisible(x))
 87 |     }
 88 |     
 89 |     # decide split_dim
 90 |     buffer_sz <- buf_bytes / x$element_size()
 91 |     cprod <- cumprod(dim)
 92 |     if(length(locs) == length(dim)){
 93 |         tmp <- sapply(locs, function(x){
 94 |             if(!length(x) || all(is.na(x))){ return(1L) }
 95 |             rg <- range(x, na.rm = TRUE)
 96 |             return(rg[2] - rg[1] + 1L)
 97 |         })
 98 |         cprod <- cprod / dim * tmp
 99 |     }
100 |     cprod <- cprod[-length(cprod)]
101 |     if(all(cprod > buffer_sz)){
102 |         split_dim <- 1
103 |     } else {
104 |         split_dim <- max(which(cprod <= buffer_sz))
105 |     }
106 |     
107 |     # filebase <- paste0(x$.filebase, x$.sep)
108 |     # FARR_subset_assign(
109 |     #     filebase,
110 |     #     listOrEnv = locs,
111 |     #     dim = x$dimension(),
112 |     #     cum_part_sizes = x$.partition_info[, 3],
113 |     #     split_dim = split_dim,
114 |     #     type = x$sexp_type(),
115 |     #     value_ = value
116 |     # )
117 |     FARR_subset_assign2(
118 |         filebase = x$.filebase, 
119 |         value = value,
120 |         listOrEnv = locs,
121 |         split_dim = split_dim,
122 |         thread_buffer = buf_bytes
123 |     ) 
124 |     invisible(x)
125 | }
126 | 
127 | # setMethod("[<-",
128 | #     signature(x = "Filearray"),
129 | #     function (x, i, j, ..., value) 
130 | #     {
131 | #         stop("need a definition for the method here")
132 | #     }
133 | # )
134 | 
135 | # DIPSAUS DEBUG START
136 | # verbose <- TRUE
137 | # z = filearray_create(temp_path(), c(2,3,4))
138 | # z[] <- 1:24
139 | # x <- z+1
140 | # i <- as_filearray(z[] < 5)
141 | 
142 | # `i` is a filearray
143 | fa_subsetAssign2 <- function(x, i, value, label = "subset-assign (lazy)") {
144 |     stopifnot(is_filearray(x) && is_filearray(i))
145 |     
146 |     if(typeof(i) != "logical") {
147 |         stop("`fa_subsetAssign2`: subset index filearray must be logical")
148 |     }
149 |     
150 |     e1 <- as_filearrayproxy(x)
151 |     e2 <- as_filearrayproxy(i)
152 |     uuid1 <- e1$uuid()
153 |     uuid2 <- e2$uuid()
154 |     e1$link_proxy( e2 )
155 |     
156 |     # check input types 
157 |     out_type <- typeof(e1)
158 |     
159 |     globals <- fastmap::fastmap()
160 |     globals$set("starting_idx", 0L)
161 |     value_len <- length(value)
162 |     
163 |     op_func <- function(value_list, ...) {
164 |         data <- value_list[[ uuid1 ]]
165 |         
166 |         idx <- value_list[[ uuid2 ]]
167 |         n_assigned <- sum(idx)
168 |         if(n_assigned == 0) { return(data) }
169 |         
170 |         if( value_len == 1L ) {
171 |             data[ idx ] <- value
172 |         } else {
173 |             starting_idx <- globals$get("starting_idx")
174 |             data[ idx ] <- value[starting_idx + seq_len(n_assigned)]
175 |             globals$set("starting_idx", starting_idx + n_assigned)
176 |         }
177 |         
178 |         return(data)
179 |     }
180 |     
181 |     e1$add_operator(op_func,
182 |                     out_type = out_type,
183 |                     context = "array_subset_assign",
184 |                     label = label)
185 |     
186 |     return( e1 )
187 | }
188 | 
189 | 
190 | #' @describeIn S3-filearray subset assign array
191 | #' @export
192 | `[<-.FileArray` <- function(x, i, ..., lazy = FALSE, value) {
193 |     if(!x$valid()){
194 |         stop("Invalid file array")
195 |     }
196 |     
197 |     if(missing(i)) {
198 |         if(is_fileproxy(x) && length(x$.ops) > 0) {
199 |             x <- fa_eval_ops(x)
200 |             x$.mode <- "readwrite"
201 |         }
202 |         return(fa_subsetAssign1(x, , ..., value = value))
203 |     } else {
204 |         if(is_filearray(i) || is.array(i) || 
205 |            ( ...length() == 0 && !is.logical(i) )) {
206 |             re <- fa_subsetAssign2(x, i, value = value)
207 |             if(!lazy) {
208 |                 re <- fa_eval_ops(re)
209 |                 re$.mode <- "readwrite"
210 |             }
211 |             return(re)
212 |         } else {
213 |             if(is_fileproxy(x) && length(x$.ops) > 0) {
214 |                 x <- fa_eval_ops(x)
215 |                 x$.mode <- "readwrite"
216 |             }
217 |             return(fa_subsetAssign1(x, i, ..., value = value))
218 |         }
219 |     }
220 | }


--------------------------------------------------------------------------------
/adhoc/readme-figure.R:
--------------------------------------------------------------------------------
  1 | set.seed(1)
  2 | dim <- c(100,200,200,100)
  3 | lazyx <- lazyarray::create_lazyarray(
  4 |     tempfile(), storage_format = 'double', dim = dim)
  5 | lazyx[] <- NA
  6 | filex <- filearray::filearray_create(
  7 |     tempfile(), dim, 'double')
  8 | filex$initialize_partition()
  9 | 
 10 | set.seed(1)
 11 | tmp <- rnorm(4e7)
 12 | res1 <- microbenchmark::microbenchmark(
 13 |     lazyarray = {
 14 |         for(i in 1:10){
 15 |             lazyx[,,,(i-1)*10 +1:10] <- tmp
 16 |         }
 17 |     },
 18 |     filearray = {
 19 |         for(i in 1:10){
 20 |             filex[,,,(i-1)*10 +1:10] <- tmp
 21 |         }
 22 |     }, times = 5, setup = quote(gc())
 23 | ); res1
 24 | 
 25 | 
 26 | 
 27 | speed1 <- sapply(split(res1, res1$expr), function(res){
 28 |     speed <- length(filex) *8000 / res$time
 29 |     c(mean(speed), sd(speed) / 2)
 30 | }); speed1
 31 | 
 32 | 
 33 | res2 <- microbenchmark::microbenchmark(
 34 |     lazyarray = {
 35 |         for(i in 1:10){
 36 |             lazyx[,,,1:10 + (i-1) * 10]
 37 |         }
 38 |     },
 39 |     filex = {
 40 |         for(i in 1:10){
 41 |             filex[,,,1:10 + (i-1) * 10]
 42 |         }
 43 |     }, times = 5, setup = quote(gc())
 44 | )
 45 | speed2 <- sapply(split(res2, res2$expr), function(res){
 46 |     speed <- length(filex) *8000 / res$time
 47 |     c(mean(speed), sd(speed) / 3)
 48 | }); speed2
 49 | 
 50 | 
 51 | 
 52 | set.seed(1)
 53 | locs <- lapply(dim, function(d){
 54 |     sample(1:d, replace = FALSE, size = sample(ifelse(d>50,50,1):d, 1))
 55 | })
 56 | 
 57 | res3 <- microbenchmark::microbenchmark(
 58 |     lazyarray = {
 59 |         lazyx[locs[[1]],locs[[2]],locs[[3]],locs[[4]]]
 60 |     },
 61 |     filearray = {
 62 |         filex[locs[[1]],locs[[2]],locs[[3]],locs[[4]]]
 63 |     }, 
 64 |     times = 10, setup = quote(gc()))
 65 | 
 66 | speed3 <- sapply(split(res3, res3$expr), function(res){
 67 |     speed <- prod(sapply(locs, length)) * 8000 / res$time
 68 |     c(mean(speed), sd(speed) / 3)
 69 | }); speed3
 70 | 
 71 | 
 72 | z <- filex[]
 73 | res5 <- microbenchmark::microbenchmark(
 74 |     nativeR = {
 75 |         z[locs[[1]],locs[[2]],locs[[3]],locs[[4]]]
 76 |     }, 
 77 |     times = 10, setup = quote(gc()))
 78 | speed3 <- cbind(speed3, sapply(split(res5, res5$expr), function(res){
 79 |     speed <- prod(sapply(locs, length)) * 8000 / res$time
 80 |     c(mean(speed), sd(speed) / 3)
 81 | }))
 82 | 
 83 | 
 84 | rm(z); gc()
 85 | tmp <- rnorm(prod(sapply(locs, length)))
 86 | res4 <- microbenchmark::microbenchmark(
 87 |     lazyarray = {
 88 |         lazyx[locs[[1]],locs[[2]],locs[[3]],locs[[4]]] <- tmp
 89 |     },
 90 |     filearray = {
 91 |         filex[locs[[1]],locs[[2]],locs[[3]],locs[[4]]] <- tmp
 92 |     }, times = 10, setup = quote(gc())
 93 | )
 94 | speed4 <- sapply(split(res4, res4$expr), function(res){
 95 |     speed <- prod(sapply(locs, length)) * 8000 / res$time
 96 |     c(mean(speed), sd(speed) / 3)
 97 | }); speed4
 98 | 
 99 | z <- filex[]
100 | res5 <- microbenchmark::microbenchmark(
101 |     nativeR = {
102 |         z[locs[[1]],locs[[2]],locs[[3]],locs[[4]]] <- tmp
103 |     }, times = 10, setup = quote(gc())
104 | )
105 | rm(z); gc()
106 | speed4 <- cbind(speed4, sapply(split(res5, res5$expr), function(res){
107 |     speed <- prod(sapply(locs, length)) * 8000 / res$time
108 |     c(mean(speed), sd(speed) / 3)
109 | }))
110 | mean(prod(sapply(locs, length)) / 1e8 * 800 / res5$time) * 1e9
111 | 
112 | f <- function(){
113 |     
114 |     par(mfrow = c(1,4))
115 |     
116 |     cols <- ravebuiltins:::group_colors[1:2]
117 |     cols2 <- ravebuiltins:::group_colors[1:3]
118 |     
119 |     speed <- cbind(
120 |         speed1[1,],
121 |         speed2[1,]
122 |     ) / 1024
123 |     rownames(speed) <- c("lazyarray", "filearray")
124 |     colnames(speed) <- c("Write", "Read")
125 |     
126 |     txt_cex = 1.2
127 |     txt_cex2 = 1.2
128 |     
129 |     plt <- barplot.default(
130 |         speed[,1,drop=FALSE], beside = TRUE,
131 |         ylab = "Speed (GB/s)", 
132 |         col = dipsaus::col2hexStr(cols, alpha = 0.5),
133 |         ylim = c(0, 3), las = 1, yaxt = "n", 
134 |         border = NA,
135 |         main = "Write 3GB Data", 
136 |         cex.names = 1.4, cex.lab = 1.4, cex.main = 1.4
137 |     )
138 |     axis(2, 0:3, las = 1)
139 |     text.default(x = plt[,], y = speed[,1], 
140 |                  labels = sprintf(c("lazyarray\n%.0f MB/s","filearray\n%.0f MB/s"), speed[,1] * 1024), cex = txt_cex,
141 |                  col = 'white')
142 |     text.default(x = plt[,], y = speed[,1], 
143 |                  labels = c("lazyarray\n","filearray\n"),
144 |                  cex = txt_cex2,
145 |                  col = cols)
146 |     
147 |     
148 |     plt <- barplot.default(
149 |         speed[,2,drop=FALSE], beside = TRUE,
150 |         ylab = "Speed (GB/s)", 
151 |         col = dipsaus::col2hexStr(cols, alpha = 0.5),
152 |         ylim = c(0, 3), las = 1, yaxt = "n", 
153 |         border = NA,
154 |         main = "Read 3GB Data"
155 |     )
156 |     axis(2, c(0, 1, 2, 3), las = 1)
157 |     text.default(x = plt[,], y = speed[,2], 
158 |                  labels = sprintf(c("lazyarray\n%.0f MB/s","filearray\n%.0f MB/s"), speed[,2] * 1024), cex = txt_cex,
159 |                  col = 'white')
160 |     text.default(x = plt[,], y = speed[,2], 
161 |                  labels = c("lazyarray\n","filearray\n"),
162 |                  cex = txt_cex2,
163 |                  col = cols)
164 |     
165 |     speed <- cbind(
166 |         speed3[1,],
167 |         speed4[1,]
168 |     ) / 1024
169 |     rownames(speed) <- c("lazyarray", "filearray", "in-memory")
170 |     colnames(speed) <- c("Subset", "SubsetAssign")
171 |     plt <- barplot.default(
172 |         speed[,1,drop=FALSE], beside = TRUE,
173 |         ylab = "Speed (GB/s)", 
174 |         col = dipsaus::col2hexStr(cols2, alpha = 0.5),
175 |         ylim = c(0, 3), las = 1, yaxt = "n", 
176 |         border = NA,
177 |         main = "Randomly Subset\n800MB Data"
178 |     )
179 |     axis(2, c(0, 1, 2, 3), las = 1)
180 |     text.default(x = plt, y = speed[,1], 
181 |                  labels = sprintf(
182 |                      c("lazyarray\n%.0f MB/s",
183 |                        "filearray\n%.0f MB/s",
184 |                        "in-memory\n%.0f MB/s"), 
185 |                      speed[,1]*1024), cex = txt_cex,
186 |                  col = 'white')
187 |     text.default(x = plt[,], y = speed[,1], 
188 |                  labels = c("lazyarray\n", 
189 |                             "filearray\n",
190 |                             "in-memory\n"),
191 |                  cex = txt_cex2,
192 |                  col = cols2)
193 |     
194 |     
195 |     plt <- barplot.default(
196 |         speed[,2,drop=FALSE], beside = TRUE,
197 |         ylab = "Speed (GB/s)", 
198 |         col = dipsaus::col2hexStr(cols2, alpha = 0.5),
199 |         ylim = c(0, 3), las = 1, yaxt = "n", 
200 |         border = NA,
201 |         main = "Randomly Replace\n800MB Data"
202 |     )
203 |     axis(2, c(0, 1, 2, 3), las = 1)
204 |     text.default(x = plt, y = speed[,2], 
205 |                  labels = sprintf(
206 |                      c("lazyarray\n%.0f MB/s",
207 |                        "filearray\n%.0f MB/s",
208 |                        "in-memory\n%.0f MB/s"), 
209 |                      speed[,2]*1024), cex = txt_cex,
210 |                  col = 'white')
211 |     text.default(x = plt[,], y = speed[,2], 
212 |                  labels = c("lazyarray\n", 
213 |                             "filearray\n",
214 |                             "in-memory\n"),
215 |                  cex = txt_cex2,
216 |                  col = cols2)
217 |     
218 | }
219 | 
220 | png("./adhoc/readme-speed.png", width = 4267, height = 1600, res = 300)
221 | f()
222 | dev.off()
223 | 


--------------------------------------------------------------------------------
/tests/testthat/test-method_sub.R:
--------------------------------------------------------------------------------
  1 | test_that("method: addition", {
  2 |     on.exit({
  3 |         options("filearray.operator.precision" = NULL)
  4 |         clear_cache()
  5 |     })
  6 |     
  7 |     dm <- c(2,3,4, 10)
  8 |     len <- prod(dm)
  9 |     
 10 |     x_dbl <- array(rnorm(len), dim = dm)
 11 |     x_int <- array(seq_len(len), dim = dm)
 12 |     x_lgl <- x_dbl > 0
 13 |     
 14 |     arr_dbl <- as_filearray(x_dbl, type = "double", partition_size = 2)
 15 |     arr_int <- as_filearray(x_int, type = "integer", partition_size = 3)
 16 |     arr_lgl <- as_filearray(x_lgl, type = "logical", partition_size = 4)
 17 |     
 18 |     
 19 |     proxy_dbl <- as_filearrayproxy(arr_dbl)
 20 |     proxy_int <- as_filearrayproxy(arr_int)
 21 |     proxy_lgl <- as_filearrayproxy(arr_lgl)
 22 |     
 23 |     # sanity
 24 |     expect_equal(arr_dbl[dimnames = NULL], x_dbl, tolerance = 1e-5)
 25 |     expect_equal(arr_int[dimnames = NULL], x_int)
 26 |     expect_equal(arr_lgl[dimnames = NULL], x_lgl)
 27 |     
 28 |     expect_equal(proxy_dbl[dimnames = NULL], x_dbl, tolerance = 1e-5)
 29 |     expect_equal(proxy_int[dimnames = NULL], x_int)
 30 |     expect_equal(proxy_lgl[dimnames = NULL], x_lgl)
 31 |     
 32 |     check_add <- function(e1, e2, d1, d2, type, tolerance = .Machine$double.eps) {
 33 |         y <- e1-e2
 34 |         res <- y[dimnames = NULL]
 35 |         
 36 |         if(!missing(type)) {
 37 |             expect_identical(typeof(y), type)
 38 |         }
 39 |         
 40 |         expect_equal(res, d1 - d2, tolerance = tolerance)
 41 |     }
 42 |     
 43 |     options("filearray.operator.precision" = "double")
 44 |     
 45 |     # filearrayproxy - Double
 46 |     check_add(proxy_dbl, proxy_dbl, x_dbl, x_dbl, "double", 1e-5)
 47 |     check_add(proxy_dbl, proxy_int, x_dbl, x_int, "double", 1e-5)
 48 |     check_add(proxy_dbl, proxy_lgl, x_dbl, x_lgl, "double", 1e-5)
 49 |     
 50 |     check_add(proxy_dbl, arr_dbl, x_dbl, x_dbl, "double", 1e-5)
 51 |     check_add(proxy_dbl, arr_int, x_dbl, x_int, "double", 1e-5)
 52 |     check_add(proxy_dbl, arr_lgl, x_dbl, x_lgl, "double", 1e-5)
 53 |     
 54 |     check_add(proxy_dbl, 1.0, x_dbl, 1.0, "double", 1e-5)
 55 |     check_add(proxy_dbl, 1L, x_dbl, 1L, "double", 1e-5)
 56 |     check_add(proxy_dbl, TRUE, x_dbl, TRUE, "double", 1e-5)
 57 |     
 58 |     # filearrayproxy - Int
 59 |     check_add(proxy_int, proxy_dbl, x_int, x_dbl, "double", 1e-5)
 60 |     check_add(proxy_int, proxy_int, x_int, x_int, "integer", 1e-5)
 61 |     check_add(proxy_int, proxy_lgl, x_int, x_lgl, "integer", 1e-5)
 62 |     
 63 |     check_add(proxy_int, arr_dbl, x_int, x_dbl, "double", 1e-5)
 64 |     check_add(proxy_int, arr_int, x_int, x_int, "integer", 1e-5)
 65 |     check_add(proxy_int, arr_lgl, x_int, x_lgl, "integer", 1e-5)
 66 |     
 67 |     check_add(proxy_int, 1.0, x_int, 1.0, "double", 1e-5)
 68 |     check_add(proxy_int, 1L, x_int, 1L, "integer", 1e-5)
 69 |     check_add(proxy_int, TRUE, x_int, TRUE, "integer", 1e-5)
 70 |     
 71 |     # filearrayproxy - Logical
 72 |     check_add(proxy_lgl, proxy_dbl, x_lgl, x_dbl, "double", 1e-5)
 73 |     check_add(proxy_lgl, proxy_int, x_lgl, x_int, "integer", 1e-5)
 74 |     check_add(proxy_lgl, proxy_lgl, x_lgl, x_lgl, "integer", 1e-5)
 75 |     
 76 |     check_add(proxy_lgl, arr_dbl, x_lgl, x_dbl, "double", 1e-5)
 77 |     check_add(proxy_lgl, arr_int, x_lgl, x_int, "integer", 1e-5)
 78 |     check_add(proxy_lgl, arr_lgl, x_lgl, x_lgl, "integer", 1e-5)
 79 |     
 80 |     check_add(proxy_lgl, 1.0, x_lgl, 1.0, "double", 1e-5)
 81 |     check_add(proxy_lgl, 1L, x_lgl, 1L, "integer", 1e-5)
 82 |     check_add(proxy_lgl, TRUE, x_lgl, TRUE, "integer", 1e-5)
 83 |     
 84 |     # filearray - Double
 85 |     check_add(arr_dbl, proxy_dbl, x_dbl, x_dbl, "double", 1e-5)
 86 |     check_add(arr_dbl, proxy_int, x_dbl, x_int, "double", 1e-5)
 87 |     check_add(arr_dbl, proxy_lgl, x_dbl, x_lgl, "double", 1e-5)
 88 |     
 89 |     check_add(arr_dbl, arr_dbl, x_dbl, x_dbl, "double", 1e-5)
 90 |     check_add(arr_dbl, arr_int, x_dbl, x_int, "double", 1e-5)
 91 |     check_add(arr_dbl, arr_lgl, x_dbl, x_lgl, "double", 1e-5)
 92 |     
 93 |     check_add(arr_dbl, 1.0, x_dbl, 1.0, "double", 1e-5)
 94 |     check_add(arr_dbl, 1L, x_dbl, 1L, "double", 1e-5)
 95 |     check_add(arr_dbl, TRUE, x_dbl, TRUE, "double", 1e-5)
 96 |     
 97 |     # filearray - Int
 98 |     check_add(arr_int, proxy_dbl, x_int, x_dbl, "double", 1e-5)
 99 |     check_add(arr_int, proxy_int, x_int, x_int, "integer", 1e-5)
100 |     check_add(arr_int, proxy_lgl, x_int, x_lgl, "integer", 1e-5)
101 |     
102 |     check_add(arr_int, arr_dbl, x_int, x_dbl, "double", 1e-5)
103 |     check_add(arr_int, arr_int, x_int, x_int, "integer", 1e-5)
104 |     check_add(arr_int, arr_lgl, x_int, x_lgl, "integer", 1e-5)
105 |     
106 |     check_add(arr_int, 1.0, x_int, 1.0, "double", 1e-5)
107 |     check_add(arr_int, 1L, x_int, 1L, "integer", 1e-5)
108 |     check_add(arr_int, TRUE, x_int, TRUE, "integer", 1e-5)
109 |     
110 |     # filearray - Logical
111 |     check_add(arr_lgl, proxy_dbl, x_lgl, x_dbl, "double", 1e-5)
112 |     check_add(arr_lgl, proxy_int, x_lgl, x_int, "integer", 1e-5)
113 |     check_add(arr_lgl, proxy_lgl, x_lgl, x_lgl, "integer", 1e-5)
114 |     
115 |     check_add(arr_lgl, arr_dbl, x_lgl, x_dbl, "double", 1e-5)
116 |     check_add(arr_lgl, arr_int, x_lgl, x_int, "integer", 1e-5)
117 |     check_add(arr_lgl, arr_lgl, x_lgl, x_lgl, "integer", 1e-5)
118 |     
119 |     check_add(arr_lgl, 1.0, x_lgl, 1.0, "double", 1e-5)
120 |     check_add(arr_lgl, 1L, x_lgl, 1L, "integer", 1e-5)
121 |     check_add(arr_lgl, TRUE, x_lgl, TRUE, "integer", 1e-5)
122 |     
123 |     
124 |     # scalar - double
125 |     check_add(1.0, proxy_dbl, 1.0, x_dbl, "double", 1e-5)
126 |     check_add(1.0, proxy_int, 1.0, x_int, "double", 1e-5)
127 |     check_add(1.0, proxy_lgl, 1.0, x_lgl, "double", 1e-5)
128 |     
129 |     check_add(1.0, arr_dbl, 1.0, x_dbl, "double", 1e-5)
130 |     check_add(1.0, arr_int, 1.0, x_int, "double", 1e-5)
131 |     check_add(1.0, arr_lgl, 1.0, x_lgl, "double", 1e-5)
132 |     
133 |     # scalar - int
134 |     check_add(1L, proxy_dbl, 1L, x_dbl, "double", 1e-5)
135 |     check_add(1L, proxy_int, 1L, x_int, "integer", 1e-5)
136 |     check_add(1L, proxy_lgl, 1L, x_lgl, "integer", 1e-5)
137 |     
138 |     check_add(1L, arr_dbl, 1L, x_dbl, "double", 1e-5)
139 |     check_add(1L, arr_int, 1L, x_int, "integer", 1e-5)
140 |     check_add(1L, arr_lgl, 1L, x_lgl, "integer", 1e-5)
141 |     
142 |     # scalar - lgl
143 |     check_add(FALSE, proxy_dbl, FALSE, x_dbl, "double", 1e-5)
144 |     check_add(FALSE, proxy_int, FALSE, x_int, "integer", 1e-5)
145 |     check_add(FALSE, proxy_lgl, FALSE, x_lgl, "integer", 1e-5)
146 |     
147 |     check_add(FALSE, arr_dbl, FALSE, x_dbl, "double", 1e-5)
148 |     check_add(FALSE, arr_int, FALSE, x_int, "integer", 1e-5)
149 |     check_add(FALSE, arr_lgl, FALSE, x_lgl, "integer", 1e-5)
150 |     
151 |     # wrong dimensions
152 |     testthat::expect_error(arr_int - array(0L, c(10,20,1)))
153 |     testthat::expect_error(arr_int - 1:10)
154 |     tmp <- arr_int[]
155 |     dm <- dim(arr_int)
156 |     dm <- c(dm[1] * dm[2], dm[-c(1,2)])
157 |     dim(tmp) <- dm
158 |     tmp_arr <- as_filearray(tmp)
159 |     testthat::expect_error( arr_int - tmp )
160 |     testthat::expect_error( tmp - arr_int )
161 |     testthat::expect_error( arr_int - tmp_arr )
162 |     testthat::expect_error( tmp_arr - arr_int )
163 |     testthat::expect_equal(
164 |         (arr_int - seq_len(length(arr_int)))[dimnames = NULL],
165 |         arr_int[dimnames = NULL] - seq_len(length(arr_int))
166 |     )
167 |     testthat::expect_equal(
168 |         (seq_len(length(arr_int)) - arr_int)[dimnames = NULL],
169 |         seq_len(length(arr_int)) - arr_int[dimnames = NULL]
170 |     )
171 |     
172 | })
173 | 


--------------------------------------------------------------------------------