├── .Rbuildignore ├── .github ├── .gitignore └── workflows │ ├── R-CMD-check.yaml │ └── pkgdown.yaml ├── .gitignore ├── .travis.yml ├── CRAN-RELEASE ├── DESCRIPTION ├── Makefile ├── NAMESPACE ├── NEWS.md ├── R ├── .gitignore ├── RcppExports.R ├── aaa.R ├── class-abstract.R ├── class-filearray.R ├── class-fstarray.R ├── generics.R ├── lazyarray.R ├── model-lm.R ├── utils.R └── zzz.R ├── cran-comments.md ├── experiment ├── class-lazyarray.R ├── class-lazymatrix.R ├── create_array.R ├── data │ └── .gitignore ├── funcs-lm.R ├── junk.sh ├── lazyarray.R ├── lazymatrix.R ├── loader2ext.cpp ├── loader2ext.h ├── old │ ├── bigmemory-example.R │ ├── bmWrapper.cpp │ ├── bmWrapper.h │ ├── classLazyArray.h │ ├── comparison-bigmemory.R │ ├── comparison-lazyarray.R │ ├── filearrsub.R │ ├── int64_double.cpp │ ├── junk.R │ ├── memcpy.cpp │ ├── new_subset.R │ ├── old_scripts.R │ ├── profile.R │ ├── s4-definition.R │ ├── s4-generics.R │ ├── speed test.R │ ├── speed_test.R │ ├── test_io.R │ └── write.R ├── performance-fst-vs-file.R ├── s3-lazyarray.R └── s3-lazymatrix.R ├── inst ├── WORDLIST └── include │ ├── interfaces │ ├── BMArray.h │ ├── FstArray.h │ ├── FstMatrix.h │ ├── LazyArrayBase.h │ └── entry.h │ ├── lazyarray.h │ └── lazyarray_RcppExports.h ├── lazyarray.Rproj ├── man ├── .gitignore ├── auto_clear_lazyarray.Rd ├── chunk_map.Rd ├── crossprod.Rd ├── lazy_parallel.Rd ├── lazyarray-threads.Rd ├── lazyarray.Rd ├── lazylm.Rd ├── partition_map.Rd ├── partition_table.Rd └── typeof-AbstractLazyArray-method.Rd ├── readme.md ├── src ├── .gitignore ├── Makevars ├── RcppExports.cpp ├── classIndexSchedule.cpp ├── classIndexSchedule.h ├── classLazyArray.cpp ├── common.cpp ├── common.h ├── fstWrapper.cpp ├── fstWrapper.h ├── indexConvert.cpp ├── indexConvert.h ├── lazyarray-ext.cpp ├── lazycommon.h ├── loader1.cpp ├── loader1.h ├── loader2.cpp ├── loader2.h ├── loader3.cpp ├── old │ ├── loader2ext.cpp │ ├── loader2ext.h │ └── loader3.cpp ├── openMPInterface.cpp ├── openMPInterface.h ├── playground.cpp ├── reshape.cpp ├── reshape.h ├── saver2.cpp ├── saver2.h ├── saver2ext.cpp ├── saver2ext.h ├── utils.cpp └── utils.h ├── tests ├── testthat.R └── testthat │ ├── test-cpp_io2.R │ ├── test-cpp_loc2idx.R │ ├── test-cpp_parseAndScheduleBlocks.R │ ├── test-cpp_subsetIdx.R │ ├── test-generics.R │ ├── test-lazylm.R │ ├── test-matmul.R │ ├── test-subset.R │ ├── test.cpp_io.R │ └── test.setget.R └── vignettes ├── .gitignore └── basic-usage-of-lazyarray.Rmd /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^.*\.Rproj$ 2 | ^\.Rproj\.user$ 3 | ^experiment/ 4 | ^docs/ 5 | ^readme\.md$ 6 | ^cran-comments\.md$ 7 | ^CRAN-RELEASE$ 8 | ^Makefile$ 9 | ^\.travis\.yml$ 10 | ^\.github$ 11 | -------------------------------------------------------------------------------- /.github/.gitignore: -------------------------------------------------------------------------------- 1 | *.html 2 | -------------------------------------------------------------------------------- /.github/workflows/R-CMD-check.yaml: -------------------------------------------------------------------------------- 1 | # NOTE: This workflow is overkill for most R packages 2 | # check-standard.yaml is likely a better choice 3 | # usethis::use_github_action("check-standard") will install it. 4 | # 5 | # For help debugging build failures open an issue on the RStudio community with the 'github-actions' tag. 6 | # https://community.rstudio.com/new-topic?category=Package%20development&tags=github-actions 7 | on: 8 | push: 9 | branches: 10 | - main 11 | - master 12 | pull_request: 13 | branches: 14 | - main 15 | - master 16 | 17 | name: R-CMD-check 18 | 19 | jobs: 20 | R-CMD-check: 21 | runs-on: ${{ matrix.config.os }} 22 | 23 | name: ${{ matrix.config.os }} (${{ matrix.config.r }}) 24 | 25 | strategy: 26 | fail-fast: false 27 | matrix: 28 | config: 29 | - {os: macOS-latest, r: 'release'} 30 | - {os: windows-latest, r: 'release'} 31 | - {os: windows-latest, r: '3.6'} 32 | - {os: ubuntu-16.04, r: 'devel', rspm: "https://packagemanager.rstudio.com/cran/__linux__/xenial/latest", http-user-agent: "R/4.0.0 (ubuntu-16.04) R (4.0.0 x86_64-pc-linux-gnu x86_64 linux-gnu) on GitHub Actions" } 33 | - {os: ubuntu-16.04, r: 'release', rspm: "https://packagemanager.rstudio.com/cran/__linux__/xenial/latest"} 34 | - {os: ubuntu-16.04, r: 'oldrel', rspm: "https://packagemanager.rstudio.com/cran/__linux__/xenial/latest"} 35 | - {os: ubuntu-16.04, r: '3.5', rspm: "https://packagemanager.rstudio.com/cran/__linux__/xenial/latest"} 36 | 37 | env: 38 | R_REMOTES_NO_ERRORS_FROM_WARNINGS: true 39 | RSPM: ${{ matrix.config.rspm }} 40 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 41 | 42 | steps: 43 | - uses: actions/checkout@v2 44 | 45 | - uses: r-lib/actions/setup-r@v1 46 | with: 47 | r-version: ${{ matrix.config.r }} 48 | http-user-agent: ${{ matrix.config.http-user-agent }} 49 | 50 | - uses: r-lib/actions/setup-pandoc@v1 51 | 52 | - name: Query dependencies 53 | run: | 54 | install.packages('remotes') 55 | saveRDS(remotes::dev_package_deps(dependencies = TRUE), ".github/depends.Rds", version = 2) 56 | writeLines(sprintf("R-%i.%i", getRversion()$major, getRversion()$minor), ".github/R-version") 57 | shell: Rscript {0} 58 | 59 | - name: Cache R packages 60 | if: runner.os != 'Windows' 61 | uses: actions/cache@v2 62 | with: 63 | path: ${{ env.R_LIBS_USER }} 64 | key: ${{ runner.os }}-${{ hashFiles('.github/R-version') }}-1-${{ hashFiles('.github/depends.Rds') }} 65 | restore-keys: ${{ runner.os }}-${{ hashFiles('.github/R-version') }}-1- 66 | 67 | - name: Install system dependencies 68 | if: runner.os == 'Linux' 69 | run: | 70 | while read -r cmd 71 | do 72 | eval sudo $cmd 73 | done < <(Rscript -e 'writeLines(remotes::system_requirements("ubuntu", "16.04"))') 74 | 75 | - name: Install dependencies 76 | run: | 77 | remotes::install_deps(dependencies = TRUE) 78 | remotes::install_cran("rcmdcheck") 79 | shell: Rscript {0} 80 | 81 | - name: Session info 82 | run: | 83 | options(width = 100) 84 | pkgs <- installed.packages()[, "Package"] 85 | sessioninfo::session_info(pkgs, include_base = TRUE) 86 | shell: Rscript {0} 87 | 88 | - name: Check 89 | env: 90 | _R_CHECK_CRAN_INCOMING_: false 91 | run: rcmdcheck::rcmdcheck(args = c("--no-manual", "--as-cran"), error_on = "warning", check_dir = "check") 92 | shell: Rscript {0} 93 | 94 | - name: Show testthat output 95 | if: always() 96 | run: find check -name 'testthat.Rout*' -exec cat '{}' \; || true 97 | shell: bash 98 | 99 | - name: Upload check results 100 | if: failure() 101 | uses: actions/upload-artifact@main 102 | with: 103 | name: ${{ runner.os }}-r${{ matrix.config.r }}-results 104 | path: check 105 | -------------------------------------------------------------------------------- /.github/workflows/pkgdown.yaml: -------------------------------------------------------------------------------- 1 | on: 2 | push: 3 | branches: 4 | - main 5 | - master 6 | 7 | name: pkgdown 8 | 9 | jobs: 10 | pkgdown: 11 | runs-on: macOS-latest 12 | env: 13 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 14 | steps: 15 | - uses: actions/checkout@v2 16 | 17 | - uses: r-lib/actions/setup-r@v1 18 | 19 | - uses: r-lib/actions/setup-pandoc@v1 20 | 21 | - name: Query dependencies 22 | run: | 23 | install.packages('remotes') 24 | saveRDS(remotes::dev_package_deps(dependencies = TRUE), ".github/depends.Rds", version = 2) 25 | writeLines(sprintf("R-%i.%i", getRversion()$major, getRversion()$minor), ".github/R-version") 26 | shell: Rscript {0} 27 | 28 | - name: Cache R packages 29 | uses: actions/cache@v2 30 | with: 31 | path: ${{ env.R_LIBS_USER }} 32 | key: ${{ runner.os }}-${{ hashFiles('.github/R-version') }}-1-${{ hashFiles('.github/depends.Rds') }} 33 | restore-keys: ${{ runner.os }}-${{ hashFiles('.github/R-version') }}-1- 34 | 35 | - name: Install dependencies 36 | run: | 37 | remotes::install_deps(dependencies = TRUE) 38 | install.packages("pkgdown", type = "binary") 39 | shell: Rscript {0} 40 | 41 | - name: Install package 42 | run: R CMD INSTALL . 43 | 44 | - name: Deploy package 45 | run: | 46 | git config --local user.email "actions@github.com" 47 | git config --local user.name "GitHub Actions" 48 | Rscript -e 'pkgdown::deploy_to_branch(new_process = FALSE)' 49 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .Rproj.user 2 | .Rhistory 3 | .RData 4 | .Ruserdata 5 | src/*.o 6 | src/*.so 7 | src/*.dll 8 | inst/doc 9 | .DS_Store 10 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | # R for travis: see documentation at https://docs.travis-ci.com/user/languages/r 2 | 3 | language: R 4 | warnings_are_errors: false 5 | os: 6 | - linux 7 | compiler: 8 | - gcc 9 | - clang 10 | env: 11 | - _R_CHECK_SYSTEM_CLOCK_=0 12 | cache: 13 | directories: 14 | - $TRAVIS_BUILD_DIR/packrat/src 15 | - $TRAVIS_BUILD_DIR/packrat/lib 16 | packages: true 17 | r: 18 | - release 19 | - devel 20 | - oldrel 21 | -------------------------------------------------------------------------------- /CRAN-RELEASE: -------------------------------------------------------------------------------- 1 | This package was submitted to CRAN on 2020-07-17. 2 | Once it is accepted, delete this file and tag the release (commit c3bd1bcb2e). 3 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: lazyarray 2 | Type: Package 3 | Title: Persistent Large Data Array with Lazy-Loading on Demand 4 | Version: 1.1.0.9000 5 | Language: en-US 6 | License: AGPL-3 7 | Encoding: UTF-8 8 | SystemRequirements: 9 | C++11 10 | little-endian platform 11 | RoxygenNote: 7.1.1 12 | URL: https://github.com/dipterix/lazyarray 13 | BugReports: https://github.com/dipterix/lazyarray/issues 14 | Authors@R: c( 15 | person("Zhengjia", "Wang", email = "dipterix.wang@gmail.com", role = c("aut", "cre", "cph")), 16 | person("Mark", "Klik", email = "markklik@gmail.com", role = c("ctb", "cph"), 17 | comment = "Copyright holder of fstcore package") 18 | ) 19 | Description: Multi-threaded serialization of compressed array that 20 | fully utilizes modern solid state drives. It allows 21 | to store and load extremely large data on demand within seconds 22 | without occupying too much memories. With data stored on hard drive, 23 | a lazy-array data can be loaded, shared across multiple R sessions. 24 | For arrays with partition mode on, multiple R sessions can write to 25 | a same array simultaneously along the last dimension (partition). 26 | The internal storage format is provided by 'fstcore' package geared by 27 | 'LZ4' and 'ZSTD' compressors. 28 | Imports: 29 | utils, 30 | methods, 31 | parallel, 32 | stats, 33 | Rcpp (>= 1.0.5), 34 | R6, 35 | fstcore, 36 | yaml, 37 | filematrix 38 | LinkingTo: 39 | Rcpp, 40 | fstcore, 41 | Suggests: 42 | future, 43 | future.callr, 44 | testthat, 45 | knitr, 46 | fst, 47 | rmarkdown, 48 | dipsaus (>= 0.0.8) 49 | VignetteBuilder: knitr 50 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # h/t to @jimhester and @yihui for this parse block: 2 | # https://github.com/yihui/knitr/blob/dc5ead7bcfc0ebd2789fe99c527c7d91afb3de4a/Makefile#L1-L4 3 | # Note the portability change as suggested in the manual: 4 | # https://cran.r-project.org/doc/manuals/r-release/R-exts.html#Writing-portable-packages 5 | PKGNAME = `sed -n "s/Package: *\([^ ]*\)/\1/p" DESCRIPTION` 6 | PKGVERS = `sed -n "s/Version: *\([^ ]*\)/\1/p" DESCRIPTION` 7 | 8 | 9 | all: check 10 | 11 | build: 12 | R CMD build . 13 | 14 | check: build 15 | R CMD check --no-manual $(PKGNAME)_$(PKGVERS).tar.gz 16 | 17 | install_deps: 18 | Rscript \ 19 | -e 'if (!requireNamespace("remotes")) install.packages("remotes")' \ 20 | -e 'remotes::install_deps(dependencies = TRUE)' 21 | 22 | install: install_deps build 23 | R CMD INSTALL $(PKGNAME)_$(PKGVERS).tar.gz 24 | 25 | clean: 26 | @rm -rf $(PKGNAME)_$(PKGVERS).tar.gz $(PKGNAME).Rcheck 27 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | S3method("[",FileArray) 4 | S3method("[",FstArray) 5 | S3method("[<-",FileArray) 6 | S3method("[<-",FstArray) 7 | S3method("dim<-",AbstractLazyArray) 8 | S3method("dimnames<-",AbstractLazyArray) 9 | S3method(as.lazyarray,AbstractLazyArray) 10 | S3method(as.lazyarray,default) 11 | S3method(as.lazymatrix,AbstractLazyArray) 12 | S3method(as.lazymatrix,default) 13 | S3method(chunk_map,AbstractLazyArray) 14 | S3method(dim,AbstractLazyArray) 15 | S3method(dimnames,AbstractLazyArray) 16 | S3method(fitted,lazylm) 17 | S3method(head,AbstractLazyArray) 18 | S3method(length,AbstractLazyArray) 19 | S3method(max,AbstractLazyArray) 20 | S3method(mean,AbstractLazyArray) 21 | S3method(min,AbstractLazyArray) 22 | S3method(partition_map,AbstractLazyArray) 23 | S3method(partition_map,array) 24 | S3method(partition_table,AbstractLazyArray) 25 | S3method(partition_table,array) 26 | S3method(print,"LazyArray-summary") 27 | S3method(print,summary.lazylm) 28 | S3method(range,AbstractLazyArray) 29 | S3method(residuals,lazylm) 30 | S3method(subset,AbstractLazyArray) 31 | S3method(sum,AbstractLazyArray) 32 | S3method(summary,AbstractLazyArray) 33 | S3method(summary,lazylm) 34 | S3method(tail,AbstractLazyArray) 35 | export(as.lazyarray) 36 | export(as.lazymatrix) 37 | export(auto_clear_lazyarray) 38 | export(chunk_map) 39 | export(filearray) 40 | export(fstarray) 41 | export(get_lazy_threads) 42 | export(lazy_parallel) 43 | export(lazyarray) 44 | export(lazylm) 45 | export(partition_map) 46 | export(partition_table) 47 | export(set_lazy_threads) 48 | exportMethods(crossprod) 49 | exportMethods(typeof) 50 | import(stats) 51 | import(utils) 52 | importFrom(R6,R6Class) 53 | importFrom(Rcpp,evalCpp) 54 | importFrom(Rcpp,sourceCpp) 55 | importFrom(methods,setGeneric) 56 | importFrom(methods,setMethod) 57 | importFrom(methods,signature) 58 | importFrom(yaml,read_yaml) 59 | importFrom(yaml,write_yaml) 60 | useDynLib(lazyarray, .registration = TRUE) 61 | -------------------------------------------------------------------------------- /NEWS.md: -------------------------------------------------------------------------------- 1 | lazyarray 1.1.0 2 | ======= 3 | 4 | * Renamed `LazyArray` to `ClassLazyArray` 5 | * Added `lazyarray` function to create, load or import lazy arrays in one function 6 | * Added `auto_clear_lazyarray` to automatically remove data on hard disks upon garbage collecting (require `dipsaus` package installed, optional) 7 | * Supported customized multi-part file names 8 | 9 | lazyarray 1.0.0 10 | ======= 11 | 12 | * Initial CRAN release! 13 | 14 | lazyarray 0.0.0 15 | ======= 16 | 17 | * Initial private beta release! 18 | * Imported `fstcore` as major back-end and implemented c++ interface 19 | -------------------------------------------------------------------------------- /R/.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | -------------------------------------------------------------------------------- /R/RcppExports.R: -------------------------------------------------------------------------------- 1 | # Generated by using Rcpp::compileAttributes() -> do not edit by hand 2 | # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 3 | 4 | setLazyBlockSize <- function(size) { 5 | .Call(`_lazyarray_setLazyBlockSize`, size) 6 | } 7 | 8 | getLazyBlockSize <- function() { 9 | .Call(`_lazyarray_getLazyBlockSize`) 10 | } 11 | 12 | fstMeta <- function(fileName) { 13 | .Call(`_lazyarray_fstMeta`, fileName) 14 | } 15 | 16 | fstRetrieve <- function(fileName, colSel, start, end) { 17 | .Call(`_lazyarray_fstRetrieve`, fileName, colSel, start, end) 18 | } 19 | 20 | fstStore <- function(fileName, table, compression, uniformEncoding) { 21 | .Call(`_lazyarray_fstStore`, fileName, table, compression, uniformEncoding) 22 | } 23 | 24 | checkFstMeta <- function(file, expect_nrows, cnames) { 25 | .Call(`_lazyarray_checkFstMeta`, file, expect_nrows, cnames) 26 | } 27 | 28 | loc2idx3 <- function(locations, parent_dim) { 29 | .Call(`_lazyarray_loc2idx3`, locations, parent_dim) 30 | } 31 | 32 | extractSlices <- function(listOrEnv, ndims) { 33 | .Call(`_lazyarray_extractSlices`, listOrEnv, ndims) 34 | } 35 | 36 | parseSlices <- function(listOrEnv, dim, pos_subscript = TRUE) { 37 | .Call(`_lazyarray_parseSlices`, listOrEnv, dim, pos_subscript) 38 | } 39 | 40 | parseAndScheduleBlocks2 <- function(sliceIdx, dim, forceSchedule = FALSE) { 41 | .Call(`_lazyarray_parseAndScheduleBlocks2`, sliceIdx, dim, forceSchedule) 42 | } 43 | 44 | reshapeOrDrop <- function(x, reshape = NULL, drop = FALSE) { 45 | .Call(`_lazyarray_reshapeOrDrop`, x, reshape, drop) 46 | } 47 | 48 | cpp_create_lazyarray <- function(x, dim, fileName, compression, uniformEncoding) { 49 | .Call(`_lazyarray_cpp_create_lazyarray`, x, dim, fileName, compression, uniformEncoding) 50 | } 51 | 52 | lazyMapReduceByPartition <- function(fileName, colSel, start, end = NULL, custom_func = NULL, reshape = NULL) { 53 | .Call(`_lazyarray_lazyMapReduceByPartition`, fileName, colSel, start, end, custom_func, reshape) 54 | } 55 | 56 | lazyLoadOld <- function(files, partition_locations, partition_dim, ndim, value_type) { 57 | .Call(`_lazyarray_lazyLoadOld`, files, partition_locations, partition_dim, ndim, value_type) 58 | } 59 | 60 | subsetFST <- function(rootPath, listOrEnv, dim, dtype, reshape = NULL, drop = FALSE) { 61 | .Call(`_lazyarray_subsetFST`, rootPath, listOrEnv, dim, dtype, reshape, drop) 62 | } 63 | 64 | scheduleFST <- function(listOrEnv, dim, forceSchedule = FALSE, hint = -1L) { 65 | .Call(`_lazyarray_scheduleFST`, listOrEnv, dim, forceSchedule, hint) 66 | } 67 | 68 | executeScheduleFST <- function(rootPath, dtype, reshape, drop, partition) { 69 | .Call(`_lazyarray_executeScheduleFST`, rootPath, dtype, reshape, drop, partition) 70 | } 71 | 72 | scheduleExistsFST <- function() { 73 | .Call(`_lazyarray_scheduleExistsFST`) 74 | } 75 | 76 | freeScheduleFST <- function() { 77 | .Call(`_lazyarray_freeScheduleFST`) 78 | } 79 | 80 | subsetFM <- function(rootPath, listOrEnv, dim, dtype, reshape, drop) { 81 | .Call(`_lazyarray_subsetFM`, rootPath, listOrEnv, dim, dtype, reshape, drop) 82 | } 83 | 84 | getLazyThread <- function(max = FALSE) { 85 | .Call(`_lazyarray_getLazyThread`, max) 86 | } 87 | 88 | setLazyThread <- function(n, reset_after_fork = NULL) { 89 | .Call(`_lazyarray_setLazyThread`, n, reset_after_fork) 90 | } 91 | 92 | hasOpenMP <- function() { 93 | .Call(`_lazyarray_hasOpenMP`) 94 | } 95 | 96 | subsetAssignFST <- function(values, file, listOrEnv, dim, dtype, compression = 50L, uniformEncoding = TRUE) { 97 | .Call(`_lazyarray_subsetAssignFST`, values, file, listOrEnv, dim, dtype, compression, uniformEncoding) 98 | } 99 | 100 | dropDimension <- function(x) { 101 | .Call(`_lazyarray_dropDimension`, x) 102 | } 103 | 104 | prod2 <- function(x, na_rm = FALSE) { 105 | .Call(`_lazyarray_prod2`, x, na_rm) 106 | } 107 | 108 | parseDots <- function(env, eval) { 109 | .Call(`_lazyarray_parseDots`, env, eval) 110 | } 111 | 112 | stopIfNot <- function(isValid, message, stopIfError = TRUE) { 113 | .Call(`_lazyarray_stopIfNot`, isValid, message, stopIfError) 114 | } 115 | 116 | getSexpType <- function(x) { 117 | .Call(`_lazyarray_getSexpType`, x) 118 | } 119 | 120 | tik <- function() { 121 | .Call(`_lazyarray_tik`) 122 | } 123 | 124 | tok <- function(msg, stop = FALSE) { 125 | .Call(`_lazyarray_tok`, msg, stop) 126 | } 127 | 128 | subsetAssignVector <- function(x, start, value) { 129 | .Call(`_lazyarray_subsetAssignVector`, x, start, value) 130 | } 131 | 132 | # Register entry points for exported C++ functions 133 | methods::setLoadAction(function(ns) { 134 | .Call('_lazyarray_RcppExport_registerCCallable', PACKAGE = 'lazyarray') 135 | }) 136 | -------------------------------------------------------------------------------- /R/aaa.R: -------------------------------------------------------------------------------- 1 | #' @importFrom R6 R6Class 2 | #' @importFrom Rcpp evalCpp 3 | #' @importFrom Rcpp sourceCpp 4 | #' @importFrom yaml read_yaml 5 | #' @importFrom yaml write_yaml 6 | #' @import utils 7 | #' @import stats 8 | #' @importFrom methods setGeneric 9 | #' @importFrom methods setMethod 10 | #' @importFrom methods signature 11 | #' @useDynLib lazyarray, .registration = TRUE 12 | NULL 13 | 14 | has_dipsaus <- function(){ 15 | system.file('', package = 'dipsaus') != '' 16 | } 17 | 18 | 19 | #' @title Set Number of Threads for Lazy Arrays 20 | #' @description Set number of threads used by 'OpenMP' for both \code{lazyarray} 21 | #' and \code{fstcore} packages. 22 | #' @param nr_of_threads number of CPU cores to use, or \code{NULL} to 23 | #' stay unchanged, default is \code{getOption('lazyarray.nthreads')} 24 | #' @param reset_after_fork whether to reset after forked process 25 | #' @param max whether return maximum available threads 26 | #' @return Number of cores currently used. 27 | #' @seealso \code{\link[fstcore]{threads_fstlib}} 28 | #' @name lazyarray-threads 29 | #' @export 30 | set_lazy_threads <- function(nr_of_threads = getOption('lazyarray.nthreads'), reset_after_fork = NULL){ 31 | if(!is.null(reset_after_fork)){ 32 | reset_after_fork <- isTRUE(reset_after_fork) 33 | } 34 | if(is.null(nr_of_threads) || !is.numeric(nr_of_threads) || nr_of_threads == 0){ 35 | nr_of_threads <- max(fstcore::threads_fstlib(), getLazyThread()) 36 | } 37 | nr_of_threads = max(nr_of_threads, 1) 38 | setLazyThread(n = nr_of_threads, reset_after_fork = reset_after_fork) 39 | fstcore::threads_fstlib(nr_of_threads = nr_of_threads, reset_after_fork = reset_after_fork) 40 | getLazyThread() 41 | } 42 | 43 | #' @rdname lazyarray-threads 44 | #' @export 45 | get_lazy_threads <- function(max = FALSE){ 46 | getLazyThread(max = isTRUE(max)) 47 | } 48 | 49 | 50 | get_missing_value <- function(){ 51 | (function(...){ 52 | parseDots(environment(), FALSE)[[1]] 53 | })(,) 54 | } 55 | 56 | 57 | rand_string <- function(length = 50){ 58 | paste(sample(c(letters, LETTERS, 0:9), length, replace = TRUE), collapse = '') 59 | } 60 | 61 | 62 | #' Automatically remove array data 63 | #' @author Zhengjia Wang 64 | #' @description Remove the files containing array data once no 65 | #' 'lazyarray' instance is using the folder. Require 66 | #' installation of \code{dipsaus} package (at least version 0.0.8). 67 | #' @param x 'lazyarray' instance 68 | #' @param onexit passed to \code{\link{reg.finalizer}} 69 | #' 70 | #' @details \code{auto_clear_lazyarray} attempts to remove the entire folder 71 | #' containing array data. However, if some files are not created by the 72 | #' array, only partition data and meta file will be removed, all the 73 | #' artifacts will remain and warning will be displayed. One exception is 74 | #' if all files left in the array directory are \code{*.meta} files, 75 | #' all these meta files will be removed along with the folder. 76 | #' 77 | #' @examples 78 | #' 79 | #' path <- tempfile() 80 | #' arr_dbl <- lazyarray(path, storage_format = 'double', 81 | #' dim = 2:4, meta_name = 'meta-dbl.meta') 82 | #' arr_dbl[] <- 1:24 83 | #' auto_clear_lazyarray(arr_dbl) 84 | #' 85 | #' arr_chr <- lazyarray(path, storage_format = 'character', 86 | #' dim = 2:4, meta_name = 'meta-chr.meta') 87 | #' auto_clear_lazyarray(arr_chr) 88 | #' 89 | #' # remove either one, the directory still exists 90 | #' rm(arr_dbl); invisible(gc(verbose = FALSE)) 91 | #' 92 | #' arr_chr[1,1,1] 93 | #' 94 | #' # Remove the other one, and path will be removed 95 | #' rm(arr_chr); invisible(gc(verbose = FALSE)) 96 | #' 97 | #' dir.exists(path) 98 | #' arr_check <- lazyarray(path, storage_format = 'character', 99 | #' dim = 2:4, meta_name = 'meta-chr') 100 | #' 101 | #' # data is removed, so there should be no data (NAs) 102 | #' arr_check[] 103 | #' 104 | #' @export 105 | auto_clear_lazyarray <- function(x, onexit = FALSE){ 106 | if(requireNamespace('dipsaus', quietly = TRUE)){ 107 | path <- x$storage_path 108 | path <- normalizePath(path) 109 | dipsaus::shared_finalizer(x, key = path, function(e){ 110 | e$remove_data(force = TRUE) 111 | }, onexit = onexit) 112 | rm(path) 113 | } 114 | rm(x, onexit) 115 | invisible() 116 | } 117 | -------------------------------------------------------------------------------- /R/class-filearray.R: -------------------------------------------------------------------------------- 1 | #' @noRd 2 | #' @title Internal Class definition for \code{FstArray} 3 | #' @author Zhengjia Wang 4 | #' @description Internal class definition of lazy array objects 5 | FileArray <- R6::R6Class( 6 | classname = "FileArray", 7 | portable = TRUE, 8 | inherit = AbstractLazyArray, 9 | private = list( 10 | .backend = "filearray" 11 | ), 12 | public = list( 13 | print = function(...){ 14 | cat(" (", private$.storage_format, ')\n', sep = '') 15 | cat('Dimension:\t', paste(sprintf('%d ', private$.dim), collapse = 'x '), '\n') 16 | cat('File format: bmat\n') 17 | invisible(self) 18 | }, 19 | 20 | initialize = function(path, dim, storage_format = c('double', 'integer'), 21 | read_only = TRUE, meta_name = 'lazyarray.meta'){ 22 | private$.file_format <- "bmat" 23 | storage_format <- match.arg(storage_format) 24 | if(missing(dim)){ 25 | super$initialize(path = path, storage_format = storage_format, 26 | read_only = read_only, meta_name = meta_name) 27 | } else { 28 | super$initialize(path = path, dim = dim, storage_format = storage_format, 29 | read_only = read_only, meta_name = meta_name) 30 | } 31 | # cannot convert types 32 | if(length(self$raw_meta$storage_format) && storage_format != self$raw_meta$storage_format){ 33 | stop("Data format inconsistent (cannot type-convert). Header info: ", 34 | self$raw_meta$storage_format, "; provided: ", storage_format) 35 | } 36 | }, 37 | 38 | get_partition_fpath = function(part, full_path = TRUE, summary_file = FALSE, type = c('data', 'desc', 'combined')){ 39 | type <- match.arg(type) 40 | if(missing(part)){ 41 | part <- seq_len(self$npart) 42 | } else { 43 | part <- as.integer(part) 44 | if(base::anyNA(part) || any(part <= 0)){ 45 | stop("partition number must be all positive: ", part) 46 | } 47 | } 48 | res <- sprintf('%s%s', part, self$get_file_format()) 49 | if(full_path){ 50 | res <- file.path(private$.path, res) 51 | } 52 | if(summary_file){ 53 | res <- sprintf('%s.summary', res) 54 | return(res) 55 | } 56 | if(type == 'desc'){ 57 | res <- sprintf('%s.desc.txt', part) 58 | } else if(type == 'combined'){ 59 | res <- sprintf('%s', part) 60 | } else { 61 | res <- sprintf('%s%s', part, self$get_file_format()) 62 | } 63 | if(full_path){ 64 | res <- file.path(private$.path, res) 65 | } 66 | res 67 | }, 68 | 69 | has_partition = function(part){ 70 | stopifnot(length(part) == 1) 71 | file <- self$get_partition_fpath(part, full_path = TRUE) 72 | desc <- self$get_partition_fpath(part, full_path = TRUE, type = 'desc') 73 | if(file.exists(file) && file.exists(desc)){ 74 | return(TRUE) 75 | } 76 | return(FALSE) 77 | }, 78 | initialize_partition = function(part, nofill = FALSE){ 79 | if(!self$has_partition(part)){ 80 | file <- self$get_partition_fpath(part, full_path = TRUE, type = 'combined') 81 | ptr <- filematrix::fm.create(file, self$partition_length, 1, type = self$storage_format) 82 | if(!nofill){ 83 | ptr[] <- rep(self$sample_na, self$partition_length) 84 | } 85 | filematrix::close(ptr) 86 | return(TRUE) 87 | } 88 | return(FALSE) 89 | }, 90 | get_partition_data = function(part, reshape = NULL){ 91 | if(self$has_partition(part)){ 92 | file <- self$get_partition_fpath(part, full_path = TRUE, type = 'combined') 93 | ptr <- filematrix::fm.open(file) 94 | re <- ptr[] 95 | filematrix::close(ptr) 96 | if(is.null(reshape)){ 97 | reshapeOrDrop(re, reshape = self$partition_dim(), drop = FALSE) 98 | } else { 99 | reshapeOrDrop(re, reshape = as.numeric(reshape), drop = FALSE) 100 | } 101 | 102 | return(re) 103 | } else { 104 | array(self$sample_na, self$partition_dim()) 105 | } 106 | } 107 | ) 108 | ) 109 | 110 | 111 | #' @export 112 | `[.FileArray` <- function(x, ..., drop = TRUE, reshape = NULL){ 113 | if(!x$is_valid){ 114 | stop("`[.FileArray`: x is no longer valid (data has been removed).") 115 | } 116 | if(!is.null(reshape)){ 117 | reshape <- as.numeric(reshape) 118 | stopifnot(all(reshape>=0)) 119 | } 120 | drop <- isTRUE(drop) 121 | 122 | # set block size to be the first margin to maximize reading speed 123 | block_size <- dim(x)[[1]] 124 | block_size <- max(block_size, 1) 125 | block_size <- min(block_size, 16384) 126 | setLazyBlockSize(block_size) 127 | 128 | on.exit({ 129 | # reset block size for fst arrays 130 | block_size <- getOption('lazyarray.fstarray.blocksize', -1) 131 | if(block_size <= 1){ 132 | block_size <- -1 133 | } 134 | setLazyBlockSize(block_size) 135 | }) 136 | 137 | subsetFM(rootPath = x$storage_path,listOrEnv = environment(), 138 | dim = x$dim,dtype = x$sexptype,reshape = reshape,drop = drop) 139 | } 140 | 141 | 142 | #' @export 143 | `[<-.FileArray` <- function(x, ..., value){ 144 | if(!x$is_valid){ 145 | stop("`[<-.FileArray`: x is no longer valid (data has been removed).") 146 | } 147 | if(!x$can_write){ 148 | stop("`[<-.FileArray`: x is read-only") 149 | } 150 | 151 | parsed <- parseAndScheduleBlocks2(environment(), x$dim, TRUE) 152 | # parsed <- parseAndScheduleBlocks2(list(1:10,2:10,3:10,4:10), x$dim, TRUE) 153 | # parsed <- parseAndScheduleBlocks2(list(1,1,1,1), x$dim, TRUE) 154 | 155 | if(parsed$subset_mode == 1){ 156 | stop("FstArray does not support single subscript (x[i]<-v), try x[]<-v or x[i,j,k,...]<-v") 157 | } 158 | partition_length <- prod(x$partition_dim()) 159 | 160 | # x[] 161 | if(parsed$subset_mode == 2){ 162 | value <- array(value, dim = x$dim) 163 | fake_idx <- lapply(x$dim, function(x){ get_missing_value() }) 164 | slice_value <- function(ii){ 165 | fake_idx[[x$ndim]] <- ii 166 | do.call(`[`, c(list(quote(value)), fake_idx)) 167 | } 168 | # copy all to re inplace 169 | for(ii in seq_len(x$npart)){ 170 | x$initialize_partition(part = ii, nofill = TRUE) 171 | file <- x$get_partition_fpath(ii, full_path = TRUE, type = 'combined') 172 | ptr_file <- filematrix::fm.open(file) 173 | ptr_file[] <- slice_value(ii) 174 | filematrix::close(ptr_file) 175 | } 176 | } else { 177 | # x[i,j,k] 178 | loc <- parsed$location_indices 179 | if(!is.numeric(loc[[x$ndim]])){ 180 | # missing, all partitions 181 | partitions <- seq_len(x$npart) 182 | } else { 183 | partitions <- loc[[x$ndim]] 184 | } 185 | # check if the schedule is made 186 | schedule <- parsed$schedule 187 | block_ndims <- schedule$block_ndims 188 | 189 | ptr <- 1 190 | blocksize <- schedule$block_expected_length 191 | 192 | if(schedule$block_indexed){ 193 | value <- array(value, dim = c(blocksize, length(schedule$schedule_index), length(partitions))) 194 | } else { 195 | 196 | value <- array(value, dim = c(parsed$expected_length / length(partitions), length(partitions))) 197 | } 198 | 199 | for(ff in seq_along(partitions)){ 200 | file_ii <- partitions[[ff]] 201 | # No file, NA 202 | x$initialize_partition(part = file_ii) 203 | file <- x$get_partition_fpath(file_ii, full_path = TRUE, type = 'combined') 204 | ptr_file <- filematrix::fm.open(file, readonly = FALSE) 205 | 206 | if(schedule$block_indexed){ 207 | # file exists 208 | for(ii in seq_along(schedule$schedule_index)){ 209 | schedule_ii <- schedule$schedule_index[[ii]] 210 | row_number <- blocksize * (schedule_ii-1) + schedule$block_schedule 211 | sel <- row_number > 0 212 | ptr_file[row_number[sel], 1] <- value[sel,ii,ff] 213 | } 214 | } else { 215 | # ndim == 2 216 | row_number <- loc[[1]] 217 | tryCatch({ 218 | sel <- row_number > 0 219 | ptr_file[row_number[sel], 1] <- value[sel,ff] 220 | }, error = function(e){ 221 | ptr_file[, 1] <- value[,ff] 222 | }) 223 | } 224 | filematrix::close(ptr_file) 225 | 226 | } 227 | 228 | } 229 | 230 | invisible(x) 231 | } 232 | 233 | -------------------------------------------------------------------------------- /R/class-fstarray.R: -------------------------------------------------------------------------------- 1 | #' @noRd 2 | #' @title Internal Class definition for \code{FstArray} 3 | #' @author Zhengjia Wang 4 | #' @description Internal class definition of lazy array objects 5 | FstArray <- R6::R6Class( 6 | classname = "FstArray", 7 | portable = TRUE, 8 | inherit = AbstractLazyArray, 9 | private = list( 10 | .compress_level = 50, 11 | .backend = "fstarray" 12 | ), 13 | public = list( 14 | print = function(...){ 15 | cat(" (", private$.storage_format, ')\n', sep = '') 16 | cat('Dimension:\t', paste(sprintf('%d ', private$.dim), collapse = 'x '), '\n') 17 | cat('File format: fst\n') 18 | invisible(self) 19 | }, 20 | 21 | initialize = function(path, dim, storage_format = c('double', 'integer', 'complex', 'character'), 22 | read_only = TRUE, meta_name = 'lazyarray.meta'){ 23 | private$.file_format <- "fst" 24 | private$.compress_level <- 50 25 | if(missing(dim)){ 26 | super$initialize(path = path, storage_format = storage_format, 27 | read_only = read_only, meta_name = meta_name) 28 | } else { 29 | super$initialize(path = path, dim = dim, storage_format = storage_format, 30 | read_only = read_only, meta_name = meta_name) 31 | } 32 | 33 | }, 34 | has_partition = function(part){ 35 | stopifnot(length(part) == 1) 36 | file <- self$get_partition_fpath(part, full_path = TRUE) 37 | cname <- "V1" 38 | if(self$storage_format == 'complex'){ 39 | cname <- c("V1R", "V1I") 40 | } 41 | if(file.exists(file)){ 42 | try({ 43 | return(checkFstMeta(file, expect_nrows = self$partition_length, cnames = cname)); 44 | }, silent = TRUE) 45 | } 46 | return(FALSE) 47 | }, 48 | get_partition_data = function(part, reshape = NULL){ 49 | if(self$has_partition(part)){ 50 | file <- self$get_partition_fpath(part, full_path = TRUE) 51 | 52 | if(self$storage_format == 'complex'){ 53 | cname <- c("V1R", "V1I") 54 | re <- fstRetrieve(file, cname, 1L, NULL) 55 | re <- re$resTable$V1R + (re$resTable$V1I) * 1i 56 | } else { 57 | cname <- "V1" 58 | re <- fstRetrieve(file, cname, 1L, NULL) 59 | re <- re$resTable$V1 60 | } 61 | if(is.null(reshape)){ 62 | reshapeOrDrop(re, reshape = self$partition_dim(), drop = FALSE) 63 | } else { 64 | reshapeOrDrop(re, reshape = as.numeric(reshape), drop = FALSE) 65 | } 66 | 67 | return(re) 68 | } else { 69 | array(self$sample_na, self$partition_dim()) 70 | } 71 | }, 72 | 73 | `@chunk_map` = function( 74 | map_function, max_nchunks = 50, partitions = 'all', ... 75 | ){ 76 | 77 | if(!is.function(map_function)){ 78 | stop("map_function must be a function") 79 | } 80 | if(length(formals(map_function)) < 2){ 81 | map_f <- function(data, chunk, idx){ 82 | map_function(data) 83 | } 84 | } else if(length(formals(map_function)) < 2){ 85 | map_f <- function(data, chunk, idx){ 86 | map_function(data, chunk) 87 | } 88 | } else { 89 | map_f <- map_function 90 | } 91 | 92 | nrows <- self$partition_length 93 | 94 | # get chunk size 95 | chunkf <- make_chunks(nrows, max_nchunks = max_nchunks, ...) 96 | if(isTRUE(partitions == 'all')){ 97 | files <- self$get_partition_fpath() 98 | # ncols <- self$npart 99 | } else { 100 | files <- self$get_partition_fpath(partitions) 101 | # ncols <- length 102 | } 103 | 104 | # partition_locations <- list( 105 | # numeric(0), 106 | # seq_len(ncols) 107 | # ) 108 | 109 | sdata <- self$sample_na 110 | 111 | lapply2(seq_len(chunkf$nchunks), function(ii){ 112 | idx_range <- chunkf$get_indices(ii, as_numeric = TRUE)[[1]] 113 | chunk_data <- lazyLoadOld(files = files, partition_dim = c(nrows, 1), 114 | partition_locations = list(seq.int(idx_range[[1]], idx_range[[2]]), 1L), 115 | ndim = 2L, value_type = sdata) 116 | map_f(chunk_data, ii, idx_range) 117 | }) 118 | } 119 | 120 | 121 | 122 | ), 123 | active = list( 124 | 125 | compress_level = function(v){ 126 | if(!missing(v)){ 127 | stopifnot(v >= 0 & v <= 100) 128 | private$.compress_level <- v 129 | self$save_meta() 130 | } 131 | private$.compress_level 132 | }, 133 | 134 | meta = function(){ 135 | list( 136 | dim = self$dim, 137 | dimnames = self$dimnames, 138 | storage_format = self$storage_format, 139 | file_format = private$.file_format, 140 | compress_level = private$.compress_level 141 | ) 142 | } 143 | 144 | 145 | 146 | ) 147 | ) 148 | 149 | 150 | 151 | 152 | #' @export 153 | `[.FstArray` <- function(x, ..., drop = TRUE, reshape = NULL){ 154 | if(!x$is_valid){ 155 | stop("`[.FstArray`: x is no longer valid (data has been removed).") 156 | } 157 | if(!is.null(reshape)){ 158 | reshape <- as.numeric(reshape) 159 | stopifnot(all(reshape>=0)) 160 | } 161 | drop <- isTRUE(drop) 162 | 163 | bsize <- getOption('lazyarray.fstarray.blocksize', -1) 164 | if(bsize <= 1) { 165 | bsize = -1 166 | } 167 | setLazyBlockSize(bsize) 168 | 169 | subsetFST(rootPath = x$storage_path,listOrEnv = environment(), 170 | dim = x$dim,dtype = x$sexptype,reshape = reshape,drop = drop) 171 | } 172 | 173 | #' @export 174 | `[<-.FstArray` <- function(x, ..., value){ 175 | if(!x$is_valid){ 176 | stop("`[<-.FstArray`: x is no longer valid (data has been removed).") 177 | } 178 | if(!x$can_write){ 179 | stop("`[<-.FstArray`: x is read-only") 180 | } 181 | 182 | parts <- subsetAssignFST(values = value, file = x$storage_path, listOrEnv = environment(), 183 | dim = x$dim, dtype = x$sexptype, 184 | compression = as.integer(x$compress_level),uniformEncoding = TRUE) 185 | if(isTRUE(any(parts == -1))){ 186 | x$generate_summary() 187 | } else { 188 | parts <- parts[!is.na(parts) & parts > 0 & parts <= x$npart] 189 | if(length(parts)){ 190 | x$generate_summary(parts) 191 | } 192 | } 193 | # # get 194 | # 195 | # x$generate_summary() 196 | invisible(x) 197 | } 198 | 199 | -------------------------------------------------------------------------------- /R/generics.R: -------------------------------------------------------------------------------- 1 | 2 | #' Generate partition summary statistics for array objects along the last 3 | #' dimension 4 | #' @param x an array or \code{LazyArray} 5 | #' @param na.rm whether to remove \code{NA} when calculating summary statistics 6 | #' @param ... passed to other methods or ignored 7 | #' @return A data frame with the following possible columns: \code{Min}, 8 | #' \code{Max}, \code{Mean}, \code{Standard Deviation}, \code{NAs} (total number 9 | #' of \code{NA}), and \code{Length}. 10 | #' @name partition_table 11 | #' @examples 12 | #' 13 | #' # R array 14 | #' x <- array(1:27, c(3,3,3)) 15 | #' partition_table(x) 16 | #' 17 | #' # LazyArray 18 | #' x <- lazyarray(tempfile(), storage_format = 'double', dim = c(3,3,3)) 19 | #' x[] <- 1:27 20 | #' partition_table(x, quiet=TRUE) 21 | #' 22 | #' @export 23 | partition_table <- function(x, na.rm = FALSE, ...){ 24 | UseMethod('partition_table') 25 | } 26 | 27 | #' @rdname partition_table 28 | #' @export 29 | partition_table.array <- function(x, na.rm = FALSE, ...){ 30 | dim <- dim(x) 31 | suppressWarnings({ 32 | smry <- t(apply(x, length(dim), function(d){ 33 | c( 34 | min(d, na.rm = na.rm), 35 | max(d, na.rm = na.rm), 36 | mean(d, na.rm = na.rm), 37 | sd(d, na.rm = na.rm), 38 | sum(is.na(d)), 39 | length(d) 40 | ) 41 | })) 42 | }) 43 | smry <- as.data.frame(smry) 44 | names(smry) <- c('Min', 'Max', 'Mean', 'Standard Deviation', 'NAs', 'Length') 45 | smry 46 | } 47 | 48 | #' @rdname partition_table 49 | #' @export 50 | partition_table.AbstractLazyArray <- function(x, na.rm = FALSE, ...){ 51 | smry <- summary(x, na.rm = FALSE, ...) 52 | re <- smry$partitions 53 | re$Count <- re$Length - re$NAs 54 | re 55 | } 56 | 57 | 58 | #' Apply function along the last dimension of an array and aggregate the results 59 | #' @name partition_map 60 | #' @param x R array or \code{LazyArray} 61 | #' @param map_fun function that takes in a slice of array and an optional 62 | #' argument indicating current partition number 63 | #' @param reduce function that accept a list of results returned by 64 | #' \code{map_fun}, can be missing 65 | #' @param partitions integers of partitions, i.e. the slices of array to be 66 | #' applied to, can be missing. If missing, then applies to all partitions 67 | #' @param ... internally used 68 | #' @return If \code{reduce} is missing, returns a list of results. Each result 69 | #' is returned by \code{map_fun}, and the total length equals to number of 70 | #' partitions mapped. If \code{reduce} is a function, that list of results will 71 | #' be passed to \code{reduce} and \code{partition_map} returns the results 72 | #' generated from \code{reduce}. 73 | #' @examples 74 | #' 75 | #' # -------------------------- Ordinary R array --------------------------- 76 | #' 77 | #' x <- array(1:24, c(2,3,4)) 78 | #' partition_map(x, function(slice, part){ 79 | #' sum(slice) 80 | #' }) 81 | #' 82 | #' # When reduce and partitions are missing, the following code is equivalent 83 | #' as.list(apply(x, 3, sum)) 84 | #' 85 | #' # When reduce is present 86 | #' partition_map(x, function(slice, part){ 87 | #' sum(slice) 88 | #' }, function(slice_sum){ 89 | #' max(unlist(slice_sum)) 90 | #' }) 91 | #' 92 | #' # equivalently, we could call 93 | #' slice_sum <- partition_map(x, function(slice, part){ 94 | #' sum(slice) 95 | #' }) 96 | #' max(unlist(slice_sum)) 97 | #' 98 | #' # When partition is specified 99 | #' # Partition 1, 2, and 4 exist but 5 is missing 100 | #' # when a partition is missing, the missing slice will be NA 101 | #' partition_map(x, function(slice, part){ 102 | #' sum(slice) 103 | #' }, partitions = c(1,2,4,5)) 104 | #' 105 | #' # -------------------------- LazyArray --------------------------- 106 | #' x <- lazyarray(tempfile(), storage_format = 'complex', dim = c(2,3,4)) 107 | #' x[] <- 1:24 + (24:1) * 1i 108 | #' 109 | #' partition_map(x, function(slice, part){ 110 | #' slice[1, ,] * slice[2, ,] 111 | #' }, reduce = function(mapped_prod){ 112 | #' mean(unlist(mapped_prod)) 113 | #' }) 114 | #' 115 | #' 116 | #' 117 | #' @export 118 | partition_map <- function(x, map_fun, reduce, partitions, ...){ 119 | UseMethod('partition_map') 120 | } 121 | 122 | #' @export 123 | partition_map.array <- function(x, map_fun, reduce, partitions, ...){ 124 | if(length(formals(map_fun)) == 1){ 125 | mfun <- function(x, part){ 126 | map_fun(x) 127 | } 128 | } else { 129 | mfun <- map_fun 130 | } 131 | dim <- dim(x) 132 | available_partitions <- seq_len(dim[[length(dim)]]) 133 | substr <- paste(rep('', length(dim)), collapse = ',') 134 | 135 | if(missing(partitions)){ 136 | partitions <- available_partitions 137 | } 138 | 139 | res <- lapply(partitions, function(part){ 140 | if(part %in% available_partitions){ 141 | slice <- eval(parse(text = sprintf('x[%s%d,drop=FALSE]', substr, part))) 142 | } else { 143 | slice <- array(NA, c(dim[-length(dim)], 1)) 144 | } 145 | mfun(slice, part) 146 | }) 147 | 148 | if(!missing(reduce) && is.function(reduce)){ 149 | res <- reduce(res) 150 | } 151 | 152 | res 153 | } 154 | 155 | #' @export 156 | partition_map.AbstractLazyArray <- function(x, map_fun, reduce, partitions, further_split = FALSE, ...){ 157 | if(missing(partitions)){ 158 | partitions <- seq_len(x$npart) 159 | } else { 160 | partitions <- as.integer(partitions) 161 | partitions <- partitions[partitions > 0 & partitions <= x$npart] 162 | } 163 | 164 | if(length(formals(map_fun)) == 1){ 165 | mfun <- function(x, part){ 166 | map_fun(x) 167 | } 168 | } else { 169 | mfun <- map_fun 170 | } 171 | 172 | mapped <- lapply2(partitions, function(part){ 173 | mfun(x$get_partition_data(part), part) 174 | }) 175 | 176 | if(!missing(reduce)){ 177 | mapped <- reduce(mapped) 178 | } 179 | mapped 180 | 181 | } 182 | 183 | 184 | #' Apply functions to all partitions, but small chunks each time 185 | #' @seealso \code{\link{partition_map}} 186 | #' @param x a \code{LazyArray} or R array 187 | #' @param map_fun function to apply to each chunk 188 | #' @param reduce similar to \code{reduce} in \code{\link{partition_map}} 189 | #' @param max_nchunks maximum number of chunks. If number of chunks is too 190 | #' large, then \code{chunk_size} will be re-calculated. 191 | #' @param chunk_size integer chunk size. If \code{chunk_size} is too small, it 192 | #' will be ignored 193 | #' @param ... ignored or passed to other methods 194 | #' @return If \code{reduce} is missing, returns a list of results. Each result 195 | #' is returned by \code{map_fun}, and the total length equals to number of 196 | #' chunks mapped. If \code{reduce} is a function, that list of results will 197 | #' be passed to \code{reduce} and \code{chunk_map} returns the results 198 | #' generated from \code{reduce}. 199 | #' @details The difference between \code{chunk_map} and 200 | #' \code{partition_map} is the margin or direction to apply mapping 201 | #' functions. In \code{partition_map}, mapping function is applied to 202 | #' each partition. If \code{x} is a matrix, this means applying to each column. 203 | #' \code{chunk_map} generate small chunks along all dimensions except the last, 204 | #' and apply mapping functions to each chunks. If \code{x} is a matrix, it 205 | #' make chunks along rows and apply mapping functions along rows. 206 | #' @examples 207 | #' 208 | #' x <- as.lazymatrix(matrix(1:100, ncol = 2)) 209 | #' x 210 | #' 211 | #' # Set max_nchunks=Inf and chunk_size=10 to force total number of chunks 212 | #' # is around nrow(x)/10 and each chunk contains at most 10 rows 213 | #' chunk_map(x, function(chunk){chunk[1:2,]}, chunk_size = 10, max_nchunks = Inf) 214 | #' 215 | #' # For each chunks, calculate mean, then calculate the mean of chunk mean 216 | #' chunk_map(x, function(chunk) { 217 | #' colMeans(chunk) 218 | #' }, function(chunk_means) { 219 | #' Reduce('+', chunk_means) / length(chunk_means) 220 | #' }) 221 | #' 222 | #' colMeans(x[]) 223 | #' 224 | #' 225 | #' @export 226 | chunk_map <- function(x, map_fun, reduce, max_nchunks, chunk_size, ...){ 227 | UseMethod('chunk_map') 228 | } 229 | 230 | 231 | #' @export 232 | chunk_map.AbstractLazyArray <- function(x, map_fun, reduce, max_nchunks, chunk_size, partitions = 'all', ...){ 233 | 234 | if(missing(max_nchunks)){ 235 | # calculate such that each chunk size is at most 0.5GB 236 | max_nchunks <- auto_chunks(x) 237 | } 238 | new_x <- as.lazymatrix(x) 239 | new_x$make_readonly() 240 | 241 | if(missing(chunk_size)){ 242 | chunk_size <- 1024L 243 | } 244 | mapped <- x$`@chunk_map`(map_function = map_fun, max_nchunks = max_nchunks, chunk_size = chunk_size, partitions = partitions) 245 | 246 | if(!missing(reduce)){ 247 | mapped <- reduce(mapped) 248 | } 249 | 250 | return(mapped) 251 | } 252 | 253 | 254 | 255 | -------------------------------------------------------------------------------- /R/utils.R: -------------------------------------------------------------------------------- 1 | dir_create <- function(path, showWarnings = FALSE, recursive = TRUE, ...){ 2 | dir.create(path = path, showWarnings = showWarnings, recursive = recursive, ...) 3 | } 4 | 5 | load_yaml <- function(path, ...){ 6 | read_yaml(path, ...) 7 | } 8 | 9 | 10 | save_yaml <- function(x, path, ...){ 11 | write_yaml(x, path, ...) 12 | } 13 | 14 | 15 | deparse1 <- function(..., collapse = ''){ 16 | paste0(deparse(...), collapse = collapse) 17 | } 18 | 19 | rand_string <- function(length = 10){ 20 | paste(sample(c(letters, LETTERS, 0:9), length, replace = TRUE), collapse = '') 21 | } 22 | 23 | make_chunks <- function(dim, chunk_size, max_nchunks = 200, recursive = FALSE){ 24 | max_nchunks <- floor(max_nchunks) 25 | len <- prod(dim) 26 | drange <- lapply(dim, function(d){ c(1, d) }) 27 | 28 | 29 | if(len == 0){ 30 | return(list(nchunks = 0, get_indices = function(i, as_numeric = FALSE){ 31 | if(as_numeric){ return(NULL) } 32 | paste(rep('', length(dim)), collapse = ',') 33 | })) 34 | } 35 | 36 | if(missing(chunk_size)){ 37 | chunk_size <- getOption('lazyarray.chunk_memory', 80) * 125000 38 | } 39 | 40 | 41 | if(len <= chunk_size ){ 42 | return(list(nchunks = 1, get_indices = function(i, as_numeric = FALSE){ 43 | if(as_numeric){ return( drange ) } 44 | paste(rep('', length(dim)), collapse = ',') 45 | })) 46 | } 47 | 48 | lastdim <- dim[length(dim)] 49 | 50 | if( len < chunk_size * max_nchunks ){ 51 | max_nchunks <- ceiling(len / chunk_size); 52 | if( chunk_size * max_nchunks < len ){ 53 | max_nchunks <- max_nchunks + 1 54 | } 55 | } 56 | 57 | if(!recursive && lastdim < max_nchunks){ 58 | max_nchunks <- lastdim 59 | } 60 | 61 | if(lastdim >= max_nchunks){ 62 | nchunks <- max_nchunks 63 | m <- ceiling(lastdim / max_nchunks) 64 | x2 <- m * nchunks - lastdim 65 | x1 <- nchunks - x2 66 | return(list( 67 | nchunks = nchunks, 68 | get_indices = function(i, as_numeric = FALSE){ 69 | if( i <= x1 ){ 70 | s <- (i - 1) * m + 1 71 | e <- i * m 72 | } else { 73 | s <- x1 * m + (i-x1-1) * (m-1) + 1 74 | e <- x1 * m + (i-x1) * (m-1) 75 | } 76 | if(as_numeric){ 77 | re <- drange 78 | re[[length(dim)]] <- c(s, e) 79 | return(re) 80 | } else { 81 | re <- paste(rep('', length(dim)), collapse = ',') 82 | if(s == e){ 83 | return(sprintf('%s%d', re, s)) 84 | } else { 85 | return(sprintf('%s%d:%d', re, s, e)) 86 | } 87 | } 88 | 89 | 90 | } 91 | )) 92 | } 93 | 94 | # lastdim < max_nchunks and recursive 95 | if( lastdim > max_nchunks / 2 ){ 96 | return(list( 97 | nchunks = lastdim, 98 | get_indices = function(i, as_numeric = FALSE){ 99 | if(as_numeric){ 100 | re <- drange 101 | re[[length(dim)]] <- c(i, i) 102 | return(re) 103 | } else { 104 | re <- paste(rep('', length(dim)), collapse = ',') 105 | sprintf('%s%d', re, i) 106 | } 107 | } 108 | )) 109 | } 110 | re <- 111 | Recall( 112 | dim[-length(dim)], 113 | chunk_size = chunk_size, 114 | max_nchunks = max_nchunks / lastdim, 115 | recursive = FALSE 116 | ) 117 | 118 | nchunks = re$nchunks * lastdim 119 | get_indices <- function(i, as_numeric = FALSE){ 120 | i1 <- floor((i - 1) / lastdim) + 1 121 | i2 <- i - lastdim * (i1-1) 122 | 123 | s <- re$get_indices(i1, as_numeric = as_numeric) 124 | 125 | if(as_numeric){ 126 | s[[length(dim)]] <- c(i2, i2) 127 | return(s) 128 | } else { 129 | return(sprintf('%s,%d', s, i2)) 130 | } 131 | 132 | } 133 | 134 | return(list( 135 | nchunks = nchunks, 136 | get_indices = get_indices 137 | )) 138 | 139 | } 140 | 141 | lapply2 <- function(x, FUN, ...){ 142 | if( length(x) > 1 && has_dipsaus() ){ 143 | dipsaus::lapply_async2(x, FUN, FUN.args = list(...), plan = getOption('lazyarray.parallel.strategy', FALSE)) 144 | } else { 145 | lapply(x, FUN, ...) 146 | } 147 | } 148 | 149 | 150 | auto_chunks <- function(x, limit = 0.5){ 151 | files <- x$get_partition_fpath() 152 | if(length(files)){ 153 | fct <- mean(file.exists(files)) 154 | } else { 155 | fct <- 1 156 | } 157 | 158 | max_nchunks <- x$filesize / limit 159 | if( fct > 0 ){ 160 | max_nchunks <- max_nchunks / fct 161 | } 162 | max_nchunks <- max(ceiling(max_nchunks), 1L) 163 | max_nchunks 164 | } 165 | -------------------------------------------------------------------------------- /R/zzz.R: -------------------------------------------------------------------------------- 1 | # Rcpp::loadModule('LazyArrayModules', TRUE) 2 | 3 | .onUnload <- function (libpath) { 4 | library.dynam.unload("lazyarray", libpath) 5 | } 6 | 7 | 8 | .onLoad <- function(libname, pkgname){ 9 | options('lazyarray.parallel.strategy' = FALSE) 10 | options('lazyarray.chunk_memory' = 80) 11 | options('lazyarray.fstarray.blocksize' = -1) 12 | 13 | ncores <- parallel::detectCores(logical = TRUE) 14 | options('lazyarray.nthreads' = ncores) 15 | set_lazy_threads(ncores, TRUE) 16 | } 17 | 18 | 19 | #' Schedule parallel processes for \code{LazyArray} 20 | #' @description Enable parallel processing, need \code{dipsaus} to be installed. 21 | #' For \code{"callr"} strategy, please also install \code{future.callr}. 22 | #' @param enabled whether multiple-process strategy is enabled 23 | #' @param strategy strategies to apply, see \code{\link[future]{plan}} for 24 | #' some of the details. For \code{"callr"} plan, please install package 25 | #' @param workers number of 'CPU' cores to use. 26 | #' \code{future.callr}. 27 | #' @param workers positive integer or \code{"auto"}, number of 'CPU' to use. 28 | #' The default value is \code{"auto"}, i.e. \code{future::availableCores()} 29 | #' @param ... Further passed to \code{\link[future]{plan}} 30 | #' 31 | #' @export 32 | lazy_parallel <- function( 33 | strategy = c( 34 | 'multisession', 'multicore', 35 | 'multiprocess', 'cluster', 'remote', 'callr'), 36 | enabled = TRUE, workers = 'auto', 37 | ... 38 | ){ 39 | 40 | options('lazyarray.parallel.strategy' = FALSE) 41 | strategy <- match.arg(strategy) 42 | if(!has_dipsaus()){ 43 | stop('Package dipsaus not detected. Please install.packages("dipsaus")') 44 | } 45 | 46 | if(isTRUE(workers == 'auto')){ 47 | # get maximum available workers 48 | workers <- future::availableCores() 49 | } 50 | 51 | if(enabled){ 52 | 53 | if(strategy == 'multicore'){ 54 | dipsaus::make_forked_clusters(..., workers = workers) 55 | } else if(strategy == 'callr'){ 56 | future::plan(future.callr::callr, ..., workers = workers) 57 | } else { 58 | args <- list(...) 59 | tryCatch({ 60 | future::plan(strategy, ..., workers = workers) 61 | }, error = function(e){ 62 | do.call(future::plan, c(list(strategy), args)) 63 | }) 64 | } 65 | 66 | } else { 67 | future::plan('sequential') 68 | } 69 | 70 | invisible() 71 | } 72 | 73 | setOldClass(c('FstArray', 'AbstractLazyArray', 'R6')) 74 | setOldClass(c('FileArray', 'AbstractLazyArray', 'R6')) 75 | 76 | 77 | setGeneric("typeof") 78 | 79 | 80 | #' Type of \code{LazyArray} 81 | #' @param x a \code{LazyArray} or an R object 82 | #' @return The type of data stored in the input 83 | #' @exportMethod typeof 84 | setMethod("typeof", signature(x="AbstractLazyArray"), function(x){ 85 | x$storage_format 86 | }) 87 | 88 | 89 | setGeneric("crossprod") 90 | # setGeneric("tcrossprod") 91 | 92 | #' Matrix Crossproduct 93 | #' @param x a \code{LazyArray} or an R matrix 94 | #' @param y \code{NULL} or matrix 95 | #' @param weights numeric vector used as weight 96 | #' @param ... passed to further methods 97 | #' @return Matrix of cross product if data is small, or \code{LazyMatrix} if 98 | #' matrix is too large 99 | #' @name crossprod 100 | #' 101 | #' @examples 102 | #' 103 | #' x <- matrix(1:100, 50) 104 | #' crossprod(x) 105 | #' 106 | #' lazy_x <- as.lazymatrix(x) 107 | #' crossprod(lazy_x)[] 108 | #' 109 | #' weights <- (1:50)/50 110 | #' 111 | #' t(x) %*% diag(weights) %*% x 112 | #' crossprod(lazy_x, weights = weights) 113 | #' 114 | #' \dontrun{ 115 | #' 116 | #' # large data set ~ 1.6GB 117 | #' x <- as.lazymatrix(matrix(rnorm(2e8), ncol = 2)) 118 | #' 119 | #' crossprod(x) 120 | #' } 121 | #' 122 | #' 123 | NULL 124 | 125 | #' @rdname crossprod 126 | #' @exportMethod crossprod 127 | setMethod("crossprod", signature(x="AbstractLazyArray", y = 'AbstractLazyArray'), function(x, y = NULL, weights = NULL, ...){ 128 | lazy_crossprod(x, y, weights = weights, ...) 129 | }) 130 | 131 | #' @rdname crossprod 132 | #' @exportMethod crossprod 133 | setMethod("crossprod", signature(x="AbstractLazyArray", y = 'NULL'), function(x, y = NULL, weights = NULL, ...){ 134 | lazy_crossprod(x, NULL, weights = weights, ...) 135 | }) 136 | 137 | #' @rdname crossprod 138 | #' @exportMethod crossprod 139 | setMethod("crossprod", signature(x="AbstractLazyArray", y = "missing"), function(x, y = NULL, weights = NULL, ...){ 140 | lazy_crossprod(x, NULL, weights = weights, ...) 141 | }) 142 | 143 | #' @rdname crossprod 144 | #' @exportMethod crossprod 145 | setMethod("crossprod", signature(x="AbstractLazyArray", y = 'matrix'), function(x, y = NULL, weights = NULL, ...){ 146 | if(!is.null(weights)){ 147 | stopifnot(length(weights) == x$partition_length) 148 | res <- lapply(seq_len(x$npart), function(ii){ 149 | x$get_partition_data(ii, reshape = c(1, x$partition_length)) %*% (y * weights) 150 | }) 151 | } else { 152 | res <- lapply(seq_len(x$npart), function(ii){ 153 | x$get_partition_data(ii, reshape = c(1, x$partition_length)) %*% y 154 | }) 155 | } 156 | 157 | do.call('rbind', res) 158 | }) 159 | 160 | 161 | lazy_crossprod <- function(x, y = NULL, weights = NULL, ...){ 162 | 163 | if(!is.null(weights)){ 164 | stopifnot(length(weights) == x$partition_length) 165 | } 166 | 167 | new_x <- as.lazymatrix(x) 168 | new_x$make_readonly() 169 | if(is.null(y)){ 170 | yisx <- TRUE 171 | new_y <- new_x 172 | } else { 173 | yisx <- isTRUE(x$storage_path == y$storage_path && x$get_file_format() == y$get_file_format()) 174 | new_y <- as.lazymatrix(y) 175 | } 176 | 177 | if(length(weights)){ 178 | ftile <- filematrix::fm.create(tempfile(), nrow = length(weights), ncol = 1) 179 | ftile[] <- weights 180 | on.exit(filematrix::close(ftile)) 181 | 182 | chunk_map(new_x, map_fun = function(data, ii, idx_range){ 183 | idx <- seq.int(idx_range[[1]], idx_range[[2]]) 184 | if(yisx){ 185 | return(crossprod(data, data * as.vector(ftile[idx,1]))) 186 | } else { 187 | sub_y <- y[idx,,drop=FALSE] * as.vector(ftile[idx,1]) 188 | return(crossprod(data, sub_y)) 189 | } 190 | }, reduce = function(mapped){ 191 | Reduce('+', mapped) 192 | }, ...) 193 | 194 | } else { 195 | chunk_map(new_x, map_fun = function(data, ii, idx_range){ 196 | 197 | if(yisx){ 198 | return(crossprod(data)) 199 | } else { 200 | sub_y <- y[seq.int(idx_range[[1]], idx_range[[2]]),,drop=FALSE] 201 | return(crossprod(data, sub_y)) 202 | } 203 | }, reduce = function(mapped){ 204 | Reduce('+', mapped) 205 | }, ...) 206 | } 207 | 208 | } 209 | -------------------------------------------------------------------------------- /cran-comments.md: -------------------------------------------------------------------------------- 1 | Current submission 1.1.0: 2 | 3 | Self check, 0 errors, 0 warnings, 0 notes 4 | 5 | Changes: 6 | * Explicitly import `R6Class` to clear the note on [CRAN check website](https://cran.r-project.org/web/checks/check_results_lazyarray.html) 7 | * Added `little-endian platform` to `SystemRequirements` 8 | * Added the compression method and backend into `DESCRIPTION`: 9 | > The internal storage format is provided by 'fstcore' package geared by 'LZ4' and 'ZSTD' compressors. 10 | 11 | 12 | Last submission 1.0.0: passed 13 | 14 | Comments 15 | ``` 16 | For your next update: 17 | If there are references describing the (theoretical background of) 18 | methods or algorithms in your package, please add these in the 19 | Description field of your DESCRIPTION file in the form 20 | authors (year) 21 | authors (year) 22 | authors (year, ISBN:...) 23 | with no space after 'doi:', 'arXiv:' and angle brackets for auto-linking. 24 | ``` 25 | 26 | -------------------------------------------------------------------------------- /experiment/create_array.R: -------------------------------------------------------------------------------- 1 | # create arrays on disk 2 | 3 | #' @title Create a lazy-array with given format and dimension 4 | #' @author Zhengjia Wang 5 | #' @description Create a directory to store lazy-array. The path must be missing. See \code{\link{load_lazyarray}} for more details 6 | #' @param path path to a local drive to store array data 7 | #' @param storage_format data type, choices are \code{"double"}, 8 | #' \code{"integer"}, \code{"character"}, and \code{"complex"} 9 | #' @param dim integer vector, dimension of array, see \code{\link{dim}} 10 | #' @param dimnames list of vectors, names of each dimension, see \code{\link{dimnames}} 11 | #' @param compress_level 0 to 100, level of compression. 0 means 12 | #' no compression, 100 means maximum compression. For persistent data, 13 | #' it's recommended to set 100. Default is 50. 14 | #' @param meta_name header file name, default is \code{"lazyarray.meta"} 15 | #' @return A \code{ClassLazyArray} instance 16 | #' @details Lazy array stores array into hard drive, and load them on 17 | #' demand. It uses multi-thread which gains significant 18 | #' speed boost on solid state drives. 19 | #' 20 | #' For examples, see \code{\link{lazyarray}}. 21 | #' 22 | #' @export 23 | create_lazyarray <- function( 24 | path, storage_format, dim, dimnames = NULL, compress_level = 50L, 25 | meta_name = 'lazyarray.meta'){ 26 | 27 | if(dir.exists(path)){ 28 | stop("Path already exists.") 29 | } 30 | multipart <- TRUE 31 | multipart_mode <- 1L 32 | 33 | stopifnot(compress_level <= 100 & compress_level >= 0) 34 | 35 | if(length(dim) < 1){ 36 | stop("length(dim) must be at least 2") 37 | } 38 | 39 | if(!is.list(dimnames) && !is.null(dimnames)){ 40 | stop("dimnames must be a list or NULL") 41 | } 42 | 43 | # check if dim matches with dimnames 44 | if(!is.null(dimnames)){ 45 | dnl <- sapply(dimnames, length) 46 | if(length(dnl) != length(dim) || !all(dnl - dim == 0)){ 47 | stop("Invalid dimension") 48 | } 49 | } 50 | 51 | stopifnot(storage_format %in% c('character', 'double', 'integer', 'complex')) 52 | 53 | part_dimension <- dim 54 | part_dimension[length(dim)] <- 1 55 | 56 | ##### 57 | meta <- list( 58 | lazyarray_version = 0, 59 | file_format = 'fst', 60 | storage_format = storage_format, 61 | dim = dim, 62 | dimnames = dimnames, 63 | # partitioned = multipart, 64 | # prefix = prefix, 65 | part_dimension = part_dimension, 66 | postfix = '.fst', 67 | compress_level = compress_level 68 | # file_names = file_names 69 | ) 70 | 71 | dir.create(path, showWarnings = TRUE, recursive = TRUE) 72 | path <- normalizePath(path, mustWork = TRUE) 73 | 74 | meta_path <- file.path(path, meta_name) 75 | save_yaml(meta, meta_path) 76 | 77 | ClassLazyArray$new(path = path, read_only = FALSE, meta_name = meta_name) 78 | 79 | } 80 | 81 | 82 | 83 | #' @title Load Lazy Array from Given Path 84 | #' @author Zhengjia Wang 85 | #' @param path character, path of the array 86 | #' @param read_only whether setting data is allowed 87 | #' @param meta_name header file name, default is \code{"lazyarray.meta"} 88 | #' @return A \code{ClassLazyArray} instance 89 | #' @examples 90 | #' 91 | #' path <- tempfile() 92 | #' create_lazyarray(path, 'double', dim = c(3,4,5)) 93 | #' 94 | #' x <- load_lazyarray(path, read_only = FALSE) 95 | #' x[2,3:4, 2:1] <- 1:4 96 | #' x[ , , 2] 97 | #' 98 | #' # Changing dimension for multiple partition data only 99 | #' dim(x) <- c(3,4,6) 100 | #' dimnames(x) <- list(dim1 = as.character(1:3), 101 | #' dim2 = letters[1:4], 102 | #' dim3 = LETTERS[1:6]) 103 | #' names(dimnames(x)) 104 | #' subset(x, dim1 ~ dim1 == '2', dim2 ~ dim2 %in% c('a', 'c'), drop = TRUE) 105 | #' 106 | #' # Free up space 107 | #' x$remove_data() 108 | #' 109 | #' \donttest{ 110 | #' 111 | #' # This example needs at least 4 GB hard disk space and it takes 112 | #' # time to run for performance profile 113 | #' 114 | #' # Speed test 115 | #' path <- tempfile() 116 | #' x <- create_lazyarray(path, 'complex', dim = c(100,200,300,20)) 117 | #' 118 | #' # automatically call x$remove_data() upon garbage collection 119 | #' x$flag_auto_clean(TRUE) 120 | #' 121 | #' 122 | #' # set data (4 GB data) using 4 cores, compression level 50 123 | #' # data creation ~10 s, disk IO ~15-20 seconds, ~250MB/s 124 | #' 125 | #' system.time({ 126 | #' lapply(1:20, function(ii){ 127 | #' # Generating partition data (~10 sec) 128 | #' tmp <- rnorm(100*200*300) * (1+2i) 129 | #' 130 | #' # Write to disk (~16 sec) 131 | #' x[,,,ii] <- tmp 132 | #' NULL 133 | #' }) 134 | #' }) 135 | #' 136 | #' # Reading 64 MB data using 4 cores 137 | #' # ~0.25 seconds 138 | #' 139 | #' system.time({ 140 | #' x[1:100, sample(200, 200), 100:1, 2:4] 141 | #' }) 142 | #' 143 | #' # This call requires 4GB of RAM 144 | #' # Reading all 4GB data using 4 cores 145 | #' # ~4 seconds (1 GB/s) 146 | #' 147 | #' system.time({ 148 | #' x[] 149 | #' }) 150 | #' 151 | #' } 152 | #' 153 | #' @export 154 | load_lazyarray <- function(path, read_only = TRUE, meta_name = 'lazyarray.meta'){ 155 | path <- normalizePath(path, mustWork = TRUE) 156 | ClassLazyArray$new(path = path, read_only = read_only, meta_name = meta_name) 157 | } 158 | -------------------------------------------------------------------------------- /experiment/data/.gitignore: -------------------------------------------------------------------------------- 1 | *.rds 2 | -------------------------------------------------------------------------------- /experiment/funcs-lm.R: -------------------------------------------------------------------------------- 1 | # x <- rnorm(100); dim(x) = c(20,5) 2 | # x <- as.lazymatrix(x) 3 | # colnames(x) <- paste0('V', 1:5) 4 | # rownames(x) <- 1:20 5 | # dimnames(x) 6 | # data = x 7 | # formula <- V1+V3 * V1 ~ (.-V3)*V4 8 | # weights = NULL 9 | 10 | #' @name lazy_lm 11 | #' @title Perform linear regression on \code{LazyMatrix} object 12 | #' @param data \code{LazyMatrix} object 13 | #' @param formula R formula 14 | #' @param yidx column indices for response variable 15 | #' @param xidx column indices for predictor 16 | #' @param weights weights of observations 17 | #' @param intercept whether to include intercept 18 | #' @param na.action see \code{\link[stats]{lm}} 19 | #' @param center,scale logical or numeric vectors with length equaling to 20 | #' data columns 21 | #' @param residuals whether to return residuals 22 | #' @param ... passed to \code{\link{chunk_map}} 23 | #' @return A \code{"lm"} or \code{"mlm"} object 24 | #' 25 | #' @examples 26 | #' 27 | #' a <- matrix(rnorm(100), ncol = 5) 28 | #' x <- as.lazymatrix(a) 29 | #' 30 | #' 31 | #' stats_lm <- stats::lm.fit(x = x[,-1], y = x[,1])$coefficients 32 | #' 33 | #' lazy_lms <- lazy_lm_simple(x, yidx = 1, intercept = FALSE) 34 | #' lazy_lms 35 | #' 36 | #' 37 | #' # lazy_lm requires column names 38 | #' colnames(x) <- c('y', 'x1', 'x2', 'x3', 'x4') 39 | #' lazy_lm <- lazy_lm(y~.-1, data = x) 40 | #' lazy_lm 41 | #' 42 | #' 43 | #' 44 | #' @export 45 | lazy_lm <- function(formula, data, weights = NULL, na.action = 'na.pass', 46 | center = FALSE, scale = FALSE, residuals = FALSE,...){ 47 | 48 | data <- as.lazymatrix(data, read_only = TRUE, storage_format = 'double') 49 | 50 | if(data$`@transposed`){ 51 | stop('data must be column-major to perform lazy_lm (otherwise this would be a very slow process)') 52 | } 53 | 54 | cnames <- dimnames(data) 55 | cnames <- cnames[[length(cnames)]] 56 | if(!length(cnames)){ 57 | stop("lazy_lm cannot find column or partition names") 58 | } 59 | 60 | na.omit = TRUE 61 | smry_table <- partition_table(data) 62 | if(isTRUE(center)){ 63 | center <- smry_table$Mean 64 | } 65 | 66 | if(isTRUE(scale)){ 67 | scale <- smry_table$`Standard Deviation` 68 | } 69 | 70 | nobs <- nrow(data) 71 | force(na.action) 72 | # Create fake data to build model 73 | fake_data <- data[seq_len(ncol(data)),,drop=FALSE] 74 | fake_data <- as.data.frame(fake_data) 75 | names(fake_data) <- cnames 76 | mf <- stats::model.frame(formula, data = fake_data, na.action = na.action) 77 | mt <- attr(mf, "terms") 78 | y <- model.response(mf, "numeric") 79 | 80 | mlm <- is.matrix(y) 81 | ny <- if (mlm) nrow(y) else length(y) 82 | nyr <- if (mlm) ncol(y) else 1L 83 | 84 | if (is.empty.model(mt)) { 85 | x <- NULL 86 | z <- list(coefficients = if (mlm) matrix(NA_real_, 0, ny) else numeric(), rank = 0L) 87 | 88 | # TODO: add termsand formula 89 | return(z) 90 | } 91 | 92 | simple_fm <- formula 93 | simple_fm[[3]] <- quote(0) 94 | 95 | if(!is.null(weights)){ 96 | weights <- as.lazymatrix(weights, storage_format = 'double') 97 | } 98 | 99 | 100 | map_f <- function(chunk_x, chunk_number, chunk_idxrange){ 101 | if(!isFALSE(center) || !isFALSE(scale)){ 102 | chunk_x <- t(chunk_x) 103 | if( !isFALSE(center) ){ 104 | # need to center chunk 105 | chunk_x <- chunk_x - center 106 | } 107 | if( !isFALSE(scale) ){ 108 | # need to scale 109 | chunk_x <- chunk_x / scale 110 | } 111 | chunk_x <- t(chunk_x) 112 | } 113 | chunk_x <- as.data.frame(chunk_x) 114 | names(chunk_x) <- cnames 115 | 116 | mf <- stats::model.frame(simple_fm, data = chunk_x, na.action = na.action) 117 | y <- model.response(mf, "numeric") 118 | suppressWarnings({ 119 | x <- model.matrix(mt, chunk_x, contrasts) 120 | }) 121 | 122 | chunk_x <- cbind(y, x) 123 | 124 | if(!is.null(weights)){ 125 | cross <- crossprod(chunk_x, chunk_x * (weights[seq.int(chunk_idxrange[1], chunk_idxrange[2]), 1, drop = TRUE] / nobs)) 126 | } else { 127 | cross <- crossprod(chunk_x, chunk_x / nobs) 128 | } 129 | cross 130 | 131 | } 132 | 133 | cross <- chunk_map(data, map_function = map_f, 134 | reduce = function(mapped){ 135 | Reduce('+', mapped) 136 | }, ...) 137 | coef <- solve(cross[-seq_len(nyr), -seq_len(nyr)]) %*% cross[-seq_len(nyr),seq_len(nyr)] 138 | 139 | ret.residuals <- residuals 140 | residuals <- NULL 141 | if(ret.residuals){ 142 | residuals <- chunk_map(data, function(chunk_x, chunk_number, chunk_idxrange){ 143 | if(!isFALSE(center) || !isFALSE(scale)){ 144 | chunk_x <- t(chunk_x) 145 | if( !isFALSE(center) ){ 146 | # need to center chunk 147 | chunk_x <- chunk_x - center 148 | } 149 | if( !isFALSE(scale) ){ 150 | # need to scale 151 | chunk_x <- chunk_x / scale 152 | } 153 | chunk_x <- t(chunk_x) 154 | } 155 | chunk_x <- as.data.frame(chunk_x) 156 | names(chunk_x) <- cnames 157 | 158 | mf <- stats::model.frame(simple_fm, data = chunk_x, na.action = na.action) 159 | y <- model.response(mf, "numeric") 160 | suppressWarnings({ 161 | x <- model.matrix(mt, chunk_x, contrasts) 162 | }) 163 | 164 | y - x %*% coef 165 | }, function(res){ 166 | unlist(res) 167 | }, ...) 168 | } 169 | 170 | 171 | z <- list( 172 | coefficients = drop(coef), 173 | na.action = na.action, 174 | offset = 0, 175 | call = match.call(), 176 | terms = mt, 177 | residuals = residuals 178 | ) 179 | class(z) <- c(if (mlm) "mlm", "lm") 180 | z 181 | } 182 | 183 | #' @rdname lazy_lm 184 | #' @export 185 | lazy_lm_simple <- function(data, yidx, xidx, intercept = TRUE, weights = NULL, residuals = FALSE, ...){ 186 | 187 | data <- as.lazymatrix(data, read_only = TRUE, storage_format = 'double') 188 | 189 | if(data$`@transposed`){ 190 | stop('data must be column-major to perform lazy_lm_simple (otherwise this would be a very slow process)') 191 | } 192 | 193 | 194 | cidx <- seq_len(ncol(data)) 195 | nobs <- nrow(data) 196 | 197 | stopifnot(all(yidx %in% cidx)) 198 | 199 | if(missing(xidx)){ 200 | xidx <- cidx[!cidx %in% yidx] 201 | } 202 | 203 | if(!length(xidx) && !intercept){ 204 | x <- NULL 205 | z <- list(coefficients = if (length(yidx) > 1) matrix(NA_real_, 0, length(yidx)) else numeric(), rank = 0L) 206 | 207 | return(z) 208 | } 209 | 210 | 211 | if(!is.null(weights)){ 212 | stopifnot(length(weights) == nobs) 213 | weights <- as.lazymatrix(weights, storage_format = 'double') 214 | } 215 | 216 | cross <- chunk_map(data, function(chunk_x, ii, chunk_idxrange){ 217 | chunk_x <- chunk_x[, c(yidx, xidx)] 218 | if(intercept){ 219 | chunk_x <- cbind(chunk_x, 1) 220 | } 221 | if(!is.null(weights)){ 222 | cross <- crossprod(chunk_x, chunk_x * (weights[seq.int(chunk_idxrange[1], chunk_idxrange[2]), 1, drop = TRUE] / nobs)) 223 | } else { 224 | cross <- crossprod(chunk_x, chunk_x / nobs) 225 | } 226 | cross 227 | }, reduce = function(mapped){ 228 | Reduce('+', mapped) 229 | }, ...) 230 | 231 | coef <- solve(cross[-seq_along(yidx), -seq_along(yidx)]) %*% cross[-seq_along(yidx),seq_along(yidx)] 232 | 233 | ret.residuals <- residuals 234 | residuals <- NULL 235 | if(ret.residuals){ 236 | residuals <- chunk_map(data, function(chunk_x, chunk_number, chunk_idxrange){ 237 | x <- chunk_x[,xidx] 238 | if(intercept){ 239 | x <- cbind(x, 1) 240 | } 241 | chunk_x[,yidx] - x %*% coef 242 | }, function(res){ 243 | unlist(res) 244 | }, ...) 245 | } 246 | 247 | dnames <- colnames(data) 248 | if(!length(dnames)){ 249 | dnames <- sprintf('V%d', seq_len(ncol(data))) 250 | } 251 | if(intercept){ 252 | coef <- c(coef[length(coef)], coef[-length(coef)]) 253 | names(coef) <- c('(Intercept)', dnames[xidx]) 254 | } else { 255 | coef <- drop(coef) 256 | names(coef) <- dnames[xidx] 257 | } 258 | 259 | 260 | z <- list( 261 | coefficients = coef, 262 | na.action = 'na.pass', 263 | offset = 0, 264 | call = match.call(), 265 | residuals = residuals 266 | ) 267 | class(z) <- c(if (length(yidx) > 1) "mlm", "lm") 268 | z 269 | 270 | } 271 | 272 | 273 | -------------------------------------------------------------------------------- /experiment/junk.sh: -------------------------------------------------------------------------------- 1 | devtools::load_all(); 2 | dim = c(2,3,4,5) 3 | x = array(1:(prod(dim)), dim)# + 1i * array(rnorm(prod(dim)), dim) 4 | a = as.lazyarray(x, path = tempfile()) 5 | idx <- c(1:120, 1:120)#sample(length(x), size = 200, replace = TRUE) 6 | # idx[sample(200, 20)] = NA 7 | re <- a[idx] 8 | cp <- x[idx] 9 | expect_equivalent(re, cp) 10 | 11 | tik() 12 | gctorture2(1) 13 | a[1:2] -> re 14 | 15 | # a[] -> re 16 | gctorture(FALSE) 17 | as.data.frame(tok("", stop=TRUE)) 18 | -------------------------------------------------------------------------------- /experiment/lazymatrix.R: -------------------------------------------------------------------------------- 1 | #' @rdname lazyarray 2 | #' @export 3 | lazymatrix <- function( 4 | path, storage_format, dim, dimnames = NULL, 5 | compress_level = 50L, meta_name = 'lazyarray.meta', 6 | read_only = FALSE, quiet = FALSE, ... 7 | ){ 8 | if(file.exists(path) && !dir.exists(path)){ 9 | stop('lazymatrix path must be a directory path, but a file was found.') 10 | } 11 | 12 | if(!dir.exists(path)){ 13 | if(length(dim) != 2){ 14 | stop('dim must have length of 2 for matrix') 15 | } 16 | # not exists, create a new one 17 | arr <- create_lazyarray( 18 | path = path, storage_format = storage_format, dim = dim, 19 | dimnames = dimnames, compress_level = compress_level, meta_name = meta_name) 20 | return(as.lazymatrix(arr, read_only = read_only)) 21 | } 22 | 23 | # path exists, locate meta_name 24 | if(file.exists(file.path(path, meta_name))){ 25 | path <- normalizePath(path, mustWork = TRUE) 26 | arr <- ClassLazyMatrix$new(path = path, read_only = read_only, meta_name = meta_name) 27 | return(arr) 28 | } 29 | 30 | if(!quiet){ 31 | message('meta file not found, create one with existing files') 32 | } 33 | 34 | 35 | # Otherwise meta_name does not exist 36 | if(length(dim) != 2){ 37 | stop('dim must have length of 2 for matrix') 38 | } 39 | 40 | lazyarray( 41 | path = path, storage_format = storage_format, dim = dim, dimnames = dimnames, 42 | compress_level = compress_level, meta_name = meta_name, 43 | read_only = read_only, quiet = quiet 44 | ) 45 | 46 | ClassLazyMatrix$new(path = path, read_only = read_only, meta_name = meta_name) 47 | } 48 | 49 | 50 | 51 | #' @rdname lazyarray 52 | #' @export 53 | as.lazymatrix <- function(x, read_only = FALSE, ...){ 54 | UseMethod('as.lazymatrix') 55 | } 56 | 57 | #' @rdname lazyarray 58 | #' @export 59 | as.lazymatrix.default <- function(x, read_only = FALSE, storage_format, path = tempfile(), ...){ 60 | x <- unlist(x) 61 | call <- match.call() 62 | call[[1]] <- quote(as.lazymatrix.array) 63 | eval(call) 64 | } 65 | 66 | #' @rdname lazyarray 67 | #' @export 68 | as.lazymatrix.array <- function(x, read_only = FALSE, storage_format, path = tempfile(), ...){ 69 | if(dir.exists(path)){ 70 | stop("path exists, please specify a different path") 71 | } 72 | if(missing(storage_format)){ 73 | storage_format <- storage.mode(x) 74 | } 75 | 76 | dm <- dim(x) 77 | if(length(dm) < 2){ 78 | dm <- c(length(x), 1) 79 | } else { 80 | dm <- c( prod(dm) / dm[[length(dm)]], dm[[length(dm)]] ) 81 | } 82 | dim(x) <- dm 83 | 84 | if(dm[[1]] < dm[[2]]){ 85 | dm = rev(dm) 86 | re <- lazyarray(path = path, storage_format = storage_format, dim = dm) 87 | 88 | ii <- 1 89 | for(ii in seq_len(dm[[2]])){ 90 | re[,ii] <- x[ii,] 91 | } 92 | re <- as.lazymatrix.LazyArray(re, read_only = read_only, ...) 93 | re <- t(re) 94 | } else { 95 | re <- lazyarray(path = path, storage_format = storage_format, dim = dm) 96 | 97 | ii <- 1 98 | for(ii in seq_len(dm[[2]])){ 99 | re[,ii] <- x[,ii] 100 | } 101 | 102 | re <- as.lazymatrix.LazyArray(re, read_only = read_only, ...) 103 | } 104 | re 105 | 106 | } 107 | 108 | #' @rdname lazyarray 109 | #' @export 110 | as.lazymatrix.LazyArray <- function(x, read_only = FALSE, storage_format, ...){ 111 | path <- dirname(x$storage_path) 112 | meta_name <- x$meta_name 113 | if(is.na(read_only) || !is.logical(read_only)){ 114 | read_only <- !x$can_write 115 | } 116 | 117 | if(missing(storage_format)){ 118 | storage_format <- x$get_storage_format() 119 | } 120 | 121 | stopifnot(storage_format %in% x$storage_formats_avail) 122 | 123 | meta_name <- sprintf("%s_version-%s", storage_format, x$meta_name) 124 | 125 | # create header 126 | meta <- load_yaml(x$storage_path) 127 | meta$storage_format <- storage_format 128 | meta$dim <- c(length(x) / x$npart, x$npart) 129 | meta$part_dimension <- c(meta$dim[[1]], 1L) 130 | 131 | if(length(meta$dimnames) == length(x$dim)){ 132 | dn <- meta$dimnames 133 | ndim <- length(dn) 134 | dk <- names(dn) 135 | if(length(dk == ndim)){ 136 | meta$dimnames <- structure(list(NULL, dn[[ndim]]), names = c('', names(dn)[[ndim]])) 137 | } else { 138 | meta$dimnames <- list(NULL, dn[[ndim]]) 139 | } 140 | 141 | } else { 142 | meta$dimnames <- NULL 143 | } 144 | 145 | save_yaml(meta, file.path(path, meta_name)) 146 | 147 | ClassLazyMatrix$new(path = path, read_only = read_only, meta_name = meta_name) 148 | } 149 | 150 | #' @rdname lazyarray 151 | #' @export 152 | as.lazymatrix.LazyMatrix <- function(x, read_only = FALSE, storage_format, ...){ 153 | re <- as.lazymatrix.LazyArray(x, read_only, storage_format, ...) 154 | re$`@transposed` <- x$`@transposed` 155 | re 156 | } 157 | -------------------------------------------------------------------------------- /experiment/loader2ext.h: -------------------------------------------------------------------------------- 1 | // [[Rcpp::plugins("cpp11")]] 2 | 3 | #ifndef LAZYARRAY_LOADER2_MULTIPART_DOUBLE_H 4 | #define LAZYARRAY_LOADER2_MULTIPART_DOUBLE_H 5 | 6 | #include "Rcpp.h" 7 | 8 | SEXP subsetFST_double(const std::string& rootPath, const Rcpp::NumericVector& dim, const Rcpp::List& subparsed); 9 | 10 | SEXP subsetFST_integer(const std::string& rootPath, const Rcpp::NumericVector& dim, const Rcpp::List& subparsed); 11 | 12 | SEXP subsetFST_character(const std::string& rootPath, const Rcpp::NumericVector& dim, const Rcpp::List& subparsed); 13 | 14 | SEXP subsetFST_complex(const std::string& rootPath, const Rcpp::NumericVector& dim, const Rcpp::List& subparsed); 15 | 16 | #endif // LAZYARRAY_LOADER2_MULTIPART_DOUBLE_H 17 | -------------------------------------------------------------------------------- /experiment/old/bigmemory-example.R: -------------------------------------------------------------------------------- 1 | path <- '~/Desktop/junk/' 2 | f <- 'bigmemory-ieeg.testfile' 3 | # unlink(file.path(path, f)) 4 | dir.create(path) 5 | ncol = 84 6 | x <- bigmemory::attach.resource(file.path(path, 'bigmemory-ieeg.testfile.desc')) 7 | # x <- big.matrix(34497400, ncol = ncol, type = 'double', backingfile = f, backingpath = path) 8 | # for(ii in 1:ncol){ 9 | # x[,ii] <- y[,ii] 10 | # } 11 | system.time({x[]}) 12 | -------------------------------------------------------------------------------- /experiment/old/bmWrapper.h: -------------------------------------------------------------------------------- 1 | #ifndef LAZYARRAY_BM_SUBSET_H 2 | #define LAZYARRAY_BM_SUBSET_H 3 | 4 | #include 5 | // [[Rcpp::depends(BH, bigmemory)]] 6 | #include 7 | #include 8 | 9 | // [[Rcpp::interfaces(cpp)]] 10 | 11 | // [[Rcpp::export]] 12 | SEXP subsetBM(SEXP pBigMat, SEXP listOrEnv, Rcpp::NumericVector dim, SEXPTYPE dtype,SEXP reshape = R_NilValue, bool drop = false); 13 | 14 | #endif // LAZYARRAY_BM_SUBSET_H 15 | -------------------------------------------------------------------------------- /experiment/old/classLazyArray.h: -------------------------------------------------------------------------------- 1 | #ifndef LAZYARRAY_CLASS_H 2 | #define LAZYARRAY_CLASS_H 3 | 4 | # include 5 | #include "common.h" 6 | #include "utils.h" 7 | using namespace Rcpp; 8 | 9 | class FstLazyArray { 10 | 11 | public: 12 | enum DataType {DOUBLE=REALSXP}; 13 | 14 | // Constructors and field getter/setter 15 | public: 16 | FstLazyArray( 17 | std::vector fstFiles, std::vector dimension, int dataType 18 | ): 19 | fstFiles(fstFiles), dimension(dimension), _dataType(dataType), _readOnly(true) 20 | { 21 | _nparts = *(dimension.end() - 1); 22 | _totalLen = std::accumulate(dimension.begin(), dimension.end(), INTEGER64_ONE, std::multiplies()); 23 | _partLen = _totalLen / _nparts; 24 | validate(); 25 | } 26 | 27 | virtual ~FstLazyArray(){} 28 | 29 | std::vector fstFiles; 30 | std::vector dimension; 31 | 32 | // getter 33 | int64_t nparts() const { return _nparts; } 34 | int64_t partLen() const { return _partLen; } 35 | int dataType() const { return _dataType; } 36 | const bool readOnly() const { return _readOnly; } 37 | 38 | // setter 39 | void readOnly(const bool isReadOnly){ 40 | _readOnly = isReadOnly; 41 | } 42 | 43 | bool validate(bool stopIfError = true); 44 | 45 | 46 | protected: 47 | 48 | int64_t _nparts; 49 | int64_t _partLen; 50 | int64_t _totalLen; 51 | int _dataType; 52 | bool _readOnly; 53 | }; 54 | 55 | 56 | 57 | 58 | // Define methods inline 59 | 60 | inline bool FstLazyArray::validate(bool stopIfError) { 61 | //_nparts _totalLen fstFiles dimension 62 | bool isValid = true; 63 | isValid = isValid || stopIfNot(dimension.size() >= 2, "FstLazyArray must dimension >= 2", stopIfError); 64 | isValid = isValid || stopIfNot(*(dimension.end() - 1) == _nparts, "FstLazyArray dimensions inconsistent with number of partitions", stopIfError); 65 | isValid = isValid || stopIfNot(fstFiles.size() == _nparts, "FstLazyArray file counts inconsistent with number of partitions", stopIfError); 66 | 67 | int64_t expectedLen = std::accumulate(dimension.begin(), dimension.end(), INTEGER64_ONE, std::multiplies()); 68 | isValid = isValid || stopIfNot(expectedLen == _totalLen, "FstLazyArray file counts inconsistent with number of partitions", stopIfError); 69 | return isValid; 70 | } 71 | 72 | 73 | 74 | 75 | #endif // LAZYARRAY_CLASS_H 76 | 77 | -------------------------------------------------------------------------------- /experiment/old/comparison-bigmemory.R: -------------------------------------------------------------------------------- 1 | # Installation (dev version) 2 | remotes::install_github('dipterix/dipsaus') 3 | remotes::install_github('dipterix/lazyarray') 4 | 5 | library(bigmemory) 6 | library(biganalytics) 7 | library(dipsaus) 8 | library(lazyarray) 9 | library(future) 10 | 11 | 12 | path <- '~/Desktop/junk/lazyarray_test2' 13 | arr <- lazyarray::lazyarray(path, dim = c(99072112, 5), storage_format = 'double') 14 | arr <- as.lazymatrix(arr) 15 | colnames(arr) <- c('movie', 'customer', 'rating', 'year', 'month') 16 | # for(ii in 1:5){ 17 | # arr[,ii] <- x[,ii] 18 | # } 19 | system.time({ 20 | arr[,1] 21 | }) 22 | 23 | 24 | path <- '~/Desktop/junk/' 25 | f <- 'bigmemory.testfile' 26 | # unlink(file.path(path, f)) 27 | dir.create(path) 28 | ncol = 5 29 | x <- bigmemory::attach.resource(file.path(path, 'bigmemory.testfile.desc')) 30 | # x <- big.matrix(99072112, ncol = ncol, type = 'double', backingfile = f, backingpath = path) 31 | # for(ii in 1:ncol){ 32 | # x[,ii] <- arr[,ii] 33 | # } 34 | system.time({x[,2]}) 35 | 36 | # The challenge is calculation of X^T*X and X^T*y 37 | { 38 | options(bigmemory.allow.dimnames=TRUE) 39 | colnames(x) <- c('movie', 'customer', 'rating', 'year', 'month') 40 | a <- proc.time() 41 | res <- biglm.big.matrix(rating ~ movie + customer + year + month - 1, data = x) 42 | b <- proc.time(); b - a 43 | } 44 | # Time difference of 26.069 secs 45 | 46 | biglm::biglm(rating ~ movie + customer + year + month - 1, data = x) 47 | 48 | 49 | future::plan('multisession') 50 | { 51 | a <- proc.time() 52 | ll <- lazy_lm_simple(arr, yidx = 3, intercept = FALSE) 53 | b <- proc.time(); b - a 54 | } 55 | 56 | coefficients(ll) / coefficients(res) 57 | 58 | future::plan('sequential') 59 | { 60 | a <- proc.time() 61 | lazyres <- lazy_lm(rating ~ movie + customer + year + month - 1, data = arr) 62 | b <- proc.time(); b - a 63 | } 64 | coefficients(lazyres) / coefficients(res) 65 | -------------------------------------------------------------------------------- /experiment/old/comparison-lazyarray.R: -------------------------------------------------------------------------------- 1 | # lazyarray (col major) 2 | path <- tempfile() 3 | arr <- lazyarray::lazyarray(path, dim = c(99072112, 5), storage_format = 'double') 4 | x <- as.lazymatrix(arr) 5 | y <- x$transpose() 6 | for(ii in 1:5){ 7 | arr[,ii] <- rnorm(99072112) 8 | } 9 | 10 | { 11 | a = Sys.time() 12 | # fstcore::threads_fstlib(4) 13 | # dipsaus::make_forked_clusters(4) 14 | future::plan(future::multisession) 15 | nrows <- 99072112 16 | # xtx <- dipsaus::async_works(1:500, function(ii, ...){ 17 | # arr <- lazyarray::lazyarray(path, dim = c(99072112, 5), storage_format = 'double') 18 | # idx <- seq.int((ii-1) * 200000 + 1, ii * 200000) 19 | # idx <- idx[idx <= nrows] 20 | # if(!length(idx)){ return(list( 21 | # xtx = matrix(0, 5,5), 22 | # xty = rep(0, 5) 23 | # )) } 24 | # 25 | # tmp <- cbind(1, arr[idx, c(1,2,4,5)]) 26 | # 27 | # list( 28 | # xtx = t(tmp) %*% tmp, 29 | # xty = t(tmp) %*% arr[idx, 3] 30 | # ) 31 | # 32 | # }, .globals = list(nrows = nrows, path = path), .nworkers = 4) 33 | 34 | xtx <- dipsaus::lapply_async2(1:500, function(ii, ...){ 35 | arr <- lazyarray::lazyarray(path, dim = c(99072112, 5), storage_format = 'double') 36 | idx <- seq.int((ii-1) * 200000 + 1, ii * 200000) 37 | idx <- idx[idx <= nrows] 38 | if(!length(idx)){ return(list( 39 | xtx = matrix(0, 5,5), 40 | xty = rep(0, 5) 41 | )) } 42 | 43 | tmp <- cbind(1, arr[idx, c(1,2,4,5)]) 44 | 45 | list( 46 | xtx = t(tmp) %*% tmp, 47 | xty = t(tmp) %*% arr[idx, 3] 48 | ) 49 | 50 | }) 51 | 52 | xty <- Reduce('+', lapply(1:500, function(ii){ 53 | xtx[[ii]]$xty 54 | })) 55 | xtx <- Reduce('+', lapply(1:500, function(ii){ 56 | xtx[[ii]]$xtx 57 | })) 58 | coef <- solve(xtx) %*% xty 59 | b <- Sys.time(); b - a 60 | } 61 | 62 | # single thread 63 | # Time difference of 15.4888 secs 64 | 65 | # future, 4 cores, no openmp 66 | # Time difference of 6.66791 secs 67 | 68 | # openmp, 4 cores, one process 69 | # Time difference of 18.39945 secs 70 | 71 | # --------------------------------------------------------------------- 72 | # lazyarray (row major) 73 | path <- tempfile() 74 | arr <- lazyarray::lazyarray(path, dim = c(200000, 500, 5), storage_format = 'double', multipart_mode = 2) 75 | for(ii in 1:5){ 76 | print(ii) 77 | arr[,,ii] <- rnorm(200000*500) 78 | } 79 | 80 | { 81 | # dipsaus::make_forked_clusters(4) 82 | future::plan(future::multisession) 83 | a = Sys.time() 84 | 85 | nrows <- 99072112 86 | xtx <- dipsaus::lapply_async2(1:500, function(ii, nrows, arr){ 87 | idx <- seq.int((ii-1) * 200000 + 1, ii * 200000) 88 | idx <- idx <= nrows 89 | if(!any(idx)){ return(list( 90 | xtx = matrix(0, 5,5), 91 | xty = rep(0, 5) 92 | )) } 93 | tmp <- cbind(1, arr[idx, ii, c(1,2,4,5), drop = TRUE]) 94 | list( 95 | xtx = t(tmp) %*% tmp, 96 | xty = t(tmp) %*% arr[idx, ii, 3, drop = TRUE] 97 | ) 98 | }, plan = FALSE, FUN.args = list(nrows = nrows, arr = arr))#, .globals = list(nrows = nrows, arr = arr), .rs = TRUE) 99 | 100 | xty <- Reduce('+', lapply(1:500, function(ii){ 101 | xtx[[ii]]$xty 102 | })) 103 | xtx <- Reduce('+', lapply(1:500, function(ii){ 104 | xtx[[ii]]$xtx 105 | })) 106 | 107 | coef <- solve(xtx) %*% xty 108 | b <- Sys.time(); b - a 109 | } 110 | 111 | # Time difference of 13.36511 secs 112 | 113 | coef - coefficients(res) 114 | 115 | -------------------------------------------------------------------------------- /experiment/old/filearrsub.R: -------------------------------------------------------------------------------- 1 | `[.FileArray` <- function(x, ..., drop = TRUE, reshape = NULL){ 2 | if(!x$is_valid){ 3 | stop("`[.FileArray`: x is no longer valid (data has been removed).") 4 | } 5 | if(!is.null(reshape)){ 6 | reshape <- as.numeric(reshape) 7 | stopifnot(all(reshape>=0)) 8 | } 9 | drop <- isTRUE(drop) 10 | # get schedule 11 | 12 | parsed <- parseAndScheduleBlocks(environment(), x$dim, TRUE) 13 | # parsed <- parseAndScheduleBlocks(list(1:10,2:10,3:10,4:10), x$dim, TRUE) 14 | # parsed <- parseAndScheduleBlocks(list(1,1,1,1), x$dim, TRUE) 15 | 16 | if(parsed$subset_mode == 1){ 17 | stop("FstArray does not support single subscript (x[i]), try x[] or x[i,j,k,...]") 18 | } 19 | 20 | re <- array(x$sample_na, parsed$target_dimension) 21 | 22 | if(parsed$expected_length == 0){ 23 | reshapeOrDrop(re, reshape, drop) 24 | return(re) 25 | } 26 | 27 | partition_length <- prod(x$partition_dim()) 28 | 29 | # x[] 30 | if(parsed$subset_mode == 2){ 31 | 32 | blocksize <- partition_length 33 | blocksize[[length(blocksize)]] <- 1 34 | blocksize <- prod2(blocksize) 35 | 36 | # copy all to re inplace 37 | for(ii in seq_len(x$npart)){ 38 | if(x$has_partition(ii)){ 39 | sub <- x$get_partition_data(ii) 40 | subsetAssignVector(re, blocksize * (ii-1) + 1, sub) 41 | } 42 | } 43 | } else { 44 | # x[i,j,k] 45 | loc <- parsed$location_indices 46 | if(!is.numeric(loc[[4]])){ 47 | # missing, all partitions 48 | partitions <- seq_len(x$npart) 49 | } else { 50 | partitions <- loc[[4]] 51 | } 52 | # check if the schedule is made 53 | schedule <- parsed$schedule 54 | block_ndims <- schedule$block_ndims 55 | 56 | ptr <- 1 57 | blocksize <- schedule$block_expected_length 58 | 59 | for(file_ii in partitions){ 60 | # No file, NA 61 | if(!x$has_partition(file_ii)){ 62 | ptr = ptr + blocksize * schedule$schedule_counts_per_part 63 | next 64 | } 65 | 66 | file <- x$get_partition_fpath(file_ii, full_path = TRUE, type = 'combined') 67 | ptr_file <- filematrix::fm.open(file) 68 | 69 | if(schedule$block_indexed){ 70 | # file exists 71 | for(schedule_ii in schedule$schedule_index){ 72 | row_number <- blocksize * (schedule_ii-1) + schedule$block_schedule 73 | subsetAssignVector(re, ptr, ptr_file[row_number, 1]) 74 | ptr = ptr + blocksize 75 | } 76 | } else { 77 | # ndim == 2 78 | row_number <- loc[[1]] 79 | if(is.numeric(row_number)){ 80 | # x[i,j] 81 | buffer <- ptr_file[row_number, 1] 82 | } else { 83 | # x[i,j] 84 | buffer <- ptr_file[, 1] 85 | } 86 | subsetAssignVector(re, ptr, buffer) 87 | ptr = ptr + length(buffer) 88 | } 89 | 90 | 91 | } 92 | 93 | } 94 | 95 | reshapeOrDrop(re, reshape, drop) 96 | re 97 | } 98 | -------------------------------------------------------------------------------- /experiment/old/int64_double.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | using namespace Rcpp; 3 | 4 | 5 | SEXP int64t2Sexp(const std::vector& x) { 6 | const R_xlen_t len = x.size(); 7 | SEXP re = PROTECT(Rf_allocVector(REALSXP, len)); 8 | std::memcpy(REAL(re), &(x[0]), len * sizeof(double)); 9 | Rf_setAttrib(re, wrap("class"), wrap("integer64")); 10 | return re; 11 | } 12 | 13 | 14 | std::vector numericVector2Int64tVec(const NumericVector& x){ 15 | const R_xlen_t len = x.size(); 16 | std::vector re( len ); 17 | std::memcpy(&(re[0]), &(x[0]), len * sizeof(int64_t)); 18 | return(re); 19 | } 20 | 21 | int64_t doubleInt64t(const double& x){ 22 | int64_t re; 23 | std::memcpy(&(re), &(x), sizeof(int64_t)); 24 | return(re); 25 | } 26 | 27 | 28 | double int64t2double(const int64_t& x){ 29 | double re; 30 | std::memcpy(&(re), &(x), sizeof(double)); 31 | return(re); 32 | } 33 | 34 | NumericVector int64tVec2NumericVector(const std::vector& x){ 35 | const R_xlen_t len = x.size(); 36 | NumericVector re = no_init( len ); 37 | std::memcpy(&(re[0]), &(x[0]), len * sizeof(double)); 38 | return(re); 39 | } 40 | 41 | std::vector sexp2Int64tVec(SEXP x){ 42 | const R_xlen_t len = Rf_xlength(x); 43 | std::vector re(len); 44 | if(TYPEOF(x) == REALSXP){ 45 | std::memcpy(&(re[0]), REAL(x), len * sizeof(int64_t)); 46 | }else{ 47 | SEXP y = PROTECT(Rf_coerceVector(x, REALSXP)); 48 | std::memcpy(&(re[0]), REAL(y), len * sizeof(int64_t)); 49 | UNPROTECT(1); 50 | } 51 | return re; 52 | } 53 | -------------------------------------------------------------------------------- /experiment/old/junk.R: -------------------------------------------------------------------------------- 1 | library(lazyarray) 2 | path <- "~/Desktop/lazyarray_data/" 3 | dimension <- c(287, 200, 601, 84) 4 | x <- lazyarray(path, storage_format = "double", dim = dimension) 5 | x$make_readonly() 6 | system.time(x[,,,1]) 7 | gc() 8 | prod(dim(x)) 9 | 10 | 11 | 12 | -------------------------------------------------------------------------------- /experiment/old/memcpy.cpp: -------------------------------------------------------------------------------- 1 | #include "common.h" 2 | #include "util.h" 3 | 4 | // [[Rcpp::export]] 5 | Rcpp::List cpp_array_to_list_integer(IntegerVector x, IntegerVector cutoff){ 6 | 7 | _rcpp_timer.step("start cpp_array_to_list_integer"); 8 | 9 | Rcpp::List re(cutoff.size() - 1); 10 | 11 | // memory copy x 12 | R_xlen_t size = x.size(); 13 | int* array = new int[size]; 14 | ::memcpy(array, &(*x.begin()), size * sizeof(int)); 15 | 16 | _rcpp_timer.step("memory-copy"); 17 | 18 | // cutoff 19 | int *p1, *p2; 20 | p1 = array + cutoff[0] - 1; 21 | p2 = p1; 22 | for( R_xlen_t ii = 0; ii < cutoff.size() - 1; ii++ ){ 23 | p2 += cutoff[ii + 1] - cutoff[ii]; 24 | re[ii] = IntegerVector(p1, p2); 25 | p1 = p2; 26 | _rcpp_timer.step("split-" + std::to_string(ii)); 27 | } 28 | _rcpp_timer.step("split-finished"); 29 | 30 | // return NumericVector(data,data+sizeof(data)/sizeof(int)); 31 | 32 | if( LAZYARRAY_DEBUG ){ 33 | 34 | NumericVector _res(_rcpp_timer); 35 | _res = _res / 1000.0; 36 | Rcpp::print(_res); 37 | } 38 | 39 | return re; 40 | } 41 | 42 | /*** R 43 | timesTwo(42) 44 | */ 45 | -------------------------------------------------------------------------------- /experiment/old/new_subset.R: -------------------------------------------------------------------------------- 1 | devtools::load_all() 2 | arr <- lazyarray:::as.lazyarray(array(1:prod(1:4), 1:4)) 3 | a <- function(i, ...) { 4 | files = arr$get_partition_fpath() 5 | lazyarray:::subsetFST(files, environment(), dim(arr), getSexpType(0.1)) 6 | } 7 | dipsaus::lapply_async2(1:4, function(i, ...){ 8 | a(,,,1) 9 | lazyarray:::getLazyThread() 10 | }) 11 | 12 | 13 | 14 | 15 | a(, , , 1) 16 | -------------------------------------------------------------------------------- /experiment/old/old_scripts.R: -------------------------------------------------------------------------------- 1 | `[.LazyArray` <- function(x, ..., drop = TRUE){ 2 | 3 | # check dimensions 4 | nidx <- ...length() 5 | idx <- list() 6 | dim <- x$dim 7 | if(nidx == length(dim)){ 8 | for(ii in seq_len(nidx)){ 9 | idx[[ ii ]] <- tryCatch({ 10 | tmp <- ...elt(ii) 11 | if(is.logical(tmp)){ 12 | if(length(tmp) < dim[ii]){ 13 | tmp <- rep(tmp, floor(dim[ii] / length(tmp))) 14 | tmp <- tmp[seq_len(dim[ii])] 15 | } 16 | if(length(tmp) > dim[ii]){ 17 | stop("index out of bound at index ", ii) 18 | } 19 | which(tmp) 20 | } else if (is.numeric(tmp)){ 21 | tmp 22 | } else if (is.character(tmp)){ 23 | unlist(lapply(tmp, function(s){ 24 | re <- which(x$dimnames[[ii]] == s) 25 | if(length(re)){ re[[1]] } else { -1 } 26 | })) 27 | } 28 | }, error = function(e){ 29 | seq_len(dim[[ii]]) 30 | }) 31 | } 32 | target_dim <- sapply(idx, length) 33 | if(prod(target_dim) == 0){ 34 | if(drop){ 35 | return(x$`@sample_data`()[NULL]) 36 | } else { 37 | return(array(x$`@sample_data`(), dim = target_dim)) 38 | } 39 | 40 | } 41 | 42 | idx$drop <- drop 43 | return(do.call(x$`@get_data`, idx)) 44 | } 45 | 46 | has_idx <- FALSE 47 | if(...length() == 1){ 48 | tryCatch({ 49 | idx <- ...elt(1) 50 | has_idx <- TRUE 51 | }, error = function(e){}) 52 | } 53 | 54 | if(has_idx){ 55 | if(!length(idx)){ 56 | return(logical(0)) 57 | } else { 58 | # stop('lazyarray x[a:b] is not supported right now') 59 | 60 | # idx to each partition? 61 | dm <- dim(x) 62 | part_size<- length(x) / dm[[length(dm)]] 63 | 64 | partition_idx <- ((idx - 1) %% part_size) + 1 65 | partition <- (idx - partition_idx) / part_size + 1 66 | 67 | if(isTRUE(x$`@transposed`)){ 68 | tmp <- partition 69 | partition <- partition_idx 70 | partition_idx <- tmp 71 | } 72 | 73 | re <- partition_map(x, function(slice, part){ 74 | sel <- partition == part 75 | list( 76 | data = slice[partition_idx[sel]], 77 | sel = sel 78 | ) 79 | }, reduce = function(l){ 80 | re <- rep(NA, length(partition)) 81 | for(ii in seq_along(l)){ 82 | re[l[[ii]]$sel] <- l[[ii]]$data 83 | } 84 | re 85 | }) 86 | return(re) 87 | } 88 | 89 | } 90 | 91 | 92 | x$`@get_data`(drop = drop) 93 | 94 | } 95 | -------------------------------------------------------------------------------- /experiment/old/profile.R: -------------------------------------------------------------------------------- 1 | devtools::load_all() 2 | library(lazyarray) 3 | 4 | x <- as.lazyarray(array(1:27,c(3,3,3)), storage_format = 'double') 5 | lazyarray:::setLazyBlockSize(2) 6 | x[,,1] 7 | loc2idx2(list(c(NA,1,0), c(2,NA,1), 1), dim(x)) 8 | 9 | lazyarray:::setLazyBlockSize(-1) 10 | path <- "~/Desktop/lazyarray_data/" 11 | path <- "~/Desktop/junk/lazyarray_test2/" 12 | dimension <- c(287, 200, 601, 84) 13 | x <- lazyarray(path, storage_format = "double", dim = dimension) 14 | x 15 | a=x[,,,1:10] 16 | 17 | subf <- function(i, ...){ 18 | .Call(lazyarray:::`_lazyarray_subsetFST`, x$get_partition_fpath(), environment(), dim(x), 19 | getSexpType(x$`@sample_data`()), reshape=NULL, drop=FALSE) 20 | } 21 | subf <- function(i, ...){ 22 | lazyarray:::subsetFST(x$get_partition_fpath(), environment(), dim(x), 23 | getSexpType(x$`@sample_data`()), reshape=NULL, drop=FALSE) 24 | } 25 | 26 | (subf(,,,1)) 27 | lazyarray:::setLazyBlockSize(-1) 28 | system.time(subf(,,,1:10)) 29 | lazyarray:::setLazyBlockSize(31250000) 30 | system.time(subf(,,,1:10)) 31 | system.time(x$`@get_data`(1:287, 1:200, 1, 1:10)) 32 | 33 | lazyarray::set_lazy_threads(4, reset_after_fork = T) 34 | -------------------------------------------------------------------------------- /experiment/old/s4-definition.R: -------------------------------------------------------------------------------- 1 | ensure_path <- function(x){ 2 | if(!dir.exists(x)){ 3 | dir_create(x) 4 | } 5 | x 6 | } 7 | 8 | #' @export 9 | register_lazyarray <- function(x){ 10 | x@method_list$validate <- function(stopIfError = TRUE){ 11 | x@method_list$with_instance(function(pointer){ 12 | .Call("LazyArrayBase__validate", pointer, stopIfError) 13 | }) 14 | } 15 | x@method_list$subset <- function(env, reshape = NULL, drop = TRUE){ 16 | stopifnot(is.environment(env) || is.list(env)) 17 | x@method_list$with_instance(function(pointer){ 18 | .Call("LazyArrayBase__subset", pointer, env, reshape, isTRUE(drop)) 19 | }) 20 | } 21 | x@method_list$subsetAssign <- function(env, value){ 22 | stopifnot(is.environment(env) || is.list(env)) 23 | x@method_list$with_instance(function(pointer){ 24 | .Call("LazyArrayBase__subsetAssign", pointer, value, env) 25 | }) 26 | } 27 | 28 | 29 | x@binding_list$nparts <- function(){ 30 | x@method_list$with_instance(function(pointer){ 31 | .Call("LazyArrayBase__nparts", pointer) 32 | }) 33 | } 34 | x@binding_list$read_only <- function(v){ 35 | if(missing(v)) v <- NULL 36 | x@method_list$with_instance(function(pointer){ 37 | .Call("LazyArrayBase__readOnly", pointer, v) 38 | }) 39 | } 40 | x@binding_list$dim <- function(){ 41 | x@method_list$with_instance(function(pointer){ 42 | .Call("LazyArrayBase__getDim", pointer) 43 | }) 44 | } 45 | x@binding_list$length_per_part <- function(){ 46 | x@method_list$with_instance(function(pointer){ 47 | .Call("LazyArrayBase__partLen", pointer) 48 | }) 49 | } 50 | x@binding_list$data_type <- function(){ 51 | x@method_list$with_instance(function(pointer){ 52 | .Call("LazyArrayBase__dataType", pointer) 53 | }) 54 | } 55 | invisible(x) 56 | } 57 | 58 | make_methods_lazymatrix <- function(x){ 59 | invisible(x) 60 | } 61 | 62 | #' @export 63 | check_data_type <- function(dataType){ 64 | if(is.numeric(dataType)){ 65 | dataType <- as.integer(dataType) 66 | } else if(is.character(dataType)){ 67 | dataType <- list("int" = 13L, "integer" = 13L, "numeric" = 14L, "double" = 14L, "complex" = 15L, "character" = 16L, "string" = 16L)[[dataType]] 68 | } 69 | if(!isTRUE(is.integer(dataType))){ 70 | stop("dataType must be character or integer. Choices are: integer(13), double(14), complex(15), or string(16)") 71 | } 72 | dataType 73 | } 74 | 75 | 76 | #' @title Base S4 class for 'LazyArray' 77 | #' @export 78 | setClass("LazyArrayBase", slots = c( 79 | classname = "character", method_list = "list", 80 | binding_list = "list", dimension = "numeric", dataType = "integer" 81 | ), contains = 'oldClass') 82 | 83 | #' @export 84 | setClass("FstArray", contains = "LazyArrayBase", slots = c(rootPath = "character", compression = "integer", uniformEncoding = "logical")) 85 | 86 | #' @export 87 | setClass("FstMatrix", contains = "FstArray", slots = c(transposed = "logical")) 88 | 89 | setMethod( 90 | "initialize", "LazyArrayBase", 91 | function(.Object, ...) { 92 | stop("LazyArrayBase is an abstract class. Please create from sub-classes") 93 | } 94 | ) 95 | 96 | 97 | setMethod( 98 | "initialize", "FstArray", 99 | function(.Object, rootPath, dimension, dataType = "double", compression = 50, uniformEncoding = TRUE) { 100 | stopifnot(is.numeric(dimension)) 101 | .Object@classname <- "FstArray" 102 | .Object@dimension <- dimension 103 | .Object@dataType <- check_data_type(dataType)[[1]] 104 | .Object@compression <- as.integer(compression)[[1]] 105 | .Object@uniformEncoding <- isTRUE(uniformEncoding)[[1]] 106 | .Object@rootPath <- as.character(rootPath)[[1]] 107 | 108 | if(!is.list(.Object@method_list)){ 109 | .Object@method_list <- list() 110 | } 111 | if(!is.list(.Object@binding_list)){ 112 | .Object@binding_list <- list() 113 | } 114 | .Object@method_list$with_instance <- function(FUN){ 115 | ensure_path(.Object@rootPath) 116 | pointer <- .Call("FstArray__new", .Object@rootPath, .Object@dimension, .Object@dataType, .Object@compression, .Object@uniformEncoding) 117 | FUN(pointer) 118 | } 119 | .Object <- register_lazyarray(.Object) 120 | .Object 121 | } 122 | ) 123 | 124 | setMethod( 125 | "initialize", "FstMatrix", 126 | function(.Object, rootPath, dimension, transposed = FALSE, dataType = "double", compression = 50, uniformEncoding = TRUE) { 127 | stopifnot(is.numeric(dimension)) 128 | .Object@classname <- "FstMatrix" 129 | .Object@dimension = dimension 130 | .Object@dataType <- check_data_type(dataType)[[1]] 131 | .Object@compression <- as.integer(compression)[[1]] 132 | .Object@uniformEncoding <- isTRUE(uniformEncoding)[[1]] 133 | .Object@rootPath <- as.character(rootPath)[[1]] 134 | if( transposed ){ transposed <- TRUE } else { transposed <- FALSE } 135 | .Object@transposed <- transposed; 136 | 137 | if(!is.list(.Object@method_list)){ 138 | .Object@method_list <- list() 139 | } 140 | if(!is.list(.Object@binding_list)){ 141 | .Object@binding_list <- list() 142 | } 143 | .Object@method_list$with_instance <- function(FUN){ 144 | ensure_path(.Object@rootPath) 145 | FUN(.Call("FstMatrix__new", .Object@rootPath, .Object@dimension, 146 | .Object@transposed, .Object@dataType, .Object@compression, .Object@uniformEncoding)) 147 | } 148 | .Object <- register_lazyarray(.Object) 149 | .Object <- make_methods_lazymatrix(.Object) 150 | .Object 151 | } 152 | ) 153 | 154 | setOldClass(c("FstMatrix", "FstArray", "LazyArrayBase")) 155 | 156 | #' @export 157 | lazyarray2 <- function(dim, storage_mode = "double", ..., type = 'fstarray'){ 158 | UseMethod("lazyarray2", structure(type, class = type)) 159 | } 160 | 161 | #' @export 162 | lazyarray2.fstarray <- function(path, dim, storage_mode = "double", ..., type){ 163 | x <- new("FstArray", path, dim, storage_mode, ...) 164 | # new_s3class <- c(oldClass(x), "LazyArrayBase") 165 | # attributes(new_s3class) <- attributes(oldClass(x)) 166 | # oldClass(x) <- new_s3class 167 | x 168 | } 169 | 170 | 171 | -------------------------------------------------------------------------------- /experiment/old/s4-generics.R: -------------------------------------------------------------------------------- 1 | setGeneric("$") 2 | setMethod( 3 | "$", "LazyArrayBase", function(x, name){ 4 | if(name %in% names(x@binding_list)){ 5 | return(x@binding_list[[name]]()) 6 | } 7 | return(x@method_list[[name]]) 8 | } 9 | ) 10 | 11 | setGeneric("names") 12 | setMethod( 13 | "names", "LazyArrayBase", function(x){ 14 | c(names(x@binding_list), names(x@method_list)) 15 | } 16 | ) 17 | 18 | #' @export 19 | `[.LazyArrayBase` <- function(x, ..., reshape = NULL, drop = TRUE){ 20 | drop <- as.logical(drop) 21 | if(!is.null(reshape)){ 22 | reshape <- as.numeric(reshape) 23 | } 24 | x$subset(environment(), reshape = reshape, drop = drop) 25 | } 26 | 27 | #' @export 28 | `[<-.LazyArrayBase` <- function(x, ..., value){ 29 | x$subsetAssign(environment(), value = value) 30 | } 31 | 32 | 33 | -------------------------------------------------------------------------------- /experiment/old/speed test.R: -------------------------------------------------------------------------------- 1 | library(lazyarray) 2 | library(dipsaus) 3 | library(future) 4 | options(future.fork.enable = TRUE) 5 | options('width' = 75) 6 | path <- "~/Desktop/lazyarray_data/" 7 | dimension <- c(287, 200, 601, 84) 8 | x <- lazyarray(path, storage_format = "double", dim = dimension) 9 | 10 | 11 | subarray <- x[,,,1,drop=FALSE] 12 | 13 | tempf <- tempfile() 14 | temp_rds <- paste0(tempf, '.rds') 15 | temp_h5 <- paste0(tempf, '.h5') 16 | temp_lazy <- paste0(tempf, '.lazyarray') 17 | 18 | unlink(temp_rds) 19 | unlink(temp_h5) 20 | unlink(temp_lazy, recursive = TRUE) 21 | 22 | rds <- bench::mark( 23 | { 24 | saveRDS(subarray, temp_rds) 25 | }, 26 | { 27 | raveio::save_h5(subarray, file = temp_h5, name = '/data', chunk = c(287, 1, 10, 1)) 28 | }, 29 | { 30 | y <- lazyarray(temp_lazy, storage_format = 'double', dim = dim(subarray)) 31 | y[] <- subarray 32 | }, memory = TRUE, time_unit = 's', iterations = 1, check = FALSE 33 | ) 34 | 35 | 36 | -------------------------------------------------------------------------------- /experiment/old/speed_test.R: -------------------------------------------------------------------------------- 1 | library(lazyarray) 2 | library(dipsaus) 3 | library(future) 4 | options(future.fork.enable = TRUE) 5 | options('width' = 75) 6 | path <- "~/Desktop/lazyarray_data/" 7 | dimension <- c(287, 200, 601, 84) 8 | x <- lazyarray(path, storage_format = "double", dim = dimension) 9 | 10 | tempf <- tempfile() 11 | 12 | prof <- lapply(1:10, function(ii){ 13 | print(ii) 14 | subarray <- x[,,,1:ii,drop=FALSE] 15 | 16 | temp_rds <- paste0(tempf, '.rds') 17 | temp_h5 <- paste0(tempf, '.h5') 18 | temp_lazy <- paste0(tempf, '.lazyarray') 19 | 20 | unlink(temp_rds) 21 | unlink(temp_h5) 22 | unlink(temp_lazy, recursive = TRUE) 23 | 24 | # pause for a while 25 | Sys.sleep(2) 26 | 27 | plan('sequential') 28 | 29 | rds <- bench::mark( 30 | { 31 | saveRDS(subarray, temp_rds) 32 | }, 33 | { 34 | raveio::save_h5(subarray, file = temp_h5, name = '/data', chunk = c(100, 1, 10, 1)) 35 | }, 36 | { 37 | lazyarray::set_lazy_threads(1L) 38 | y <- lazyarray(temp_lazy, storage_format = 'double', dim = dim(subarray)) 39 | y[] <- subarray 40 | }, 41 | { 42 | lazyarray::set_lazy_threads(2L) 43 | unlink(temp_lazy, recursive = TRUE) 44 | y <- lazyarray(temp_lazy, storage_format = 'double', dim = dim(subarray)) 45 | y[] <- subarray 46 | }, 47 | { 48 | lazyarray::set_lazy_threads(3L) 49 | unlink(temp_lazy, recursive = TRUE) 50 | y <- lazyarray(temp_lazy, storage_format = 'double', dim = dim(subarray)) 51 | y[] <- subarray 52 | }, 53 | { 54 | lazyarray::set_lazy_threads(4L) 55 | unlink(temp_lazy, recursive = TRUE) 56 | y <- lazyarray(temp_lazy, storage_format = 'double', dim = dim(subarray)) 57 | y[] <- subarray 58 | }, 59 | memory = TRUE, time_unit = 's', iterations = 1, check = FALSE, filter_gc = FALSE 60 | ) 61 | 62 | saveRDS(object = rds, file = sprintf('~/Dropbox/projects/lazyarray/experiment/data/%d.rds', ii)) 63 | rds 64 | }) 65 | 66 | 67 | 68 | temp_rds <- paste0(tempf, '.rds') 69 | temp_h5 <- paste0(tempf, '.h5') 70 | temp_lazy <- paste0(tempf, '.lazyarray') 71 | 72 | plan('sequential') 73 | 74 | lapply(1:10, function(ii){ 75 | rds <- bench::mark( 76 | { 77 | raveio::load_h5(file = temp_h5, name = '/data')[,,,1:ii] 78 | }, 79 | { 80 | lazyarray::set_lazy_threads(1L) 81 | y <- lazyarray(temp_lazy, storage_format = 'double', dim = dim(subarray)) 82 | y[,,,1:ii] 83 | }, 84 | { 85 | lazyarray::set_lazy_threads(2L) 86 | y <- lazyarray(temp_lazy, storage_format = 'double', dim = dim(subarray)) 87 | y[,,,1:ii] 88 | }, 89 | { 90 | lazyarray::set_lazy_threads(3L) 91 | y <- lazyarray(temp_lazy, storage_format = 'double', dim = dim(subarray)) 92 | y[,,,1:ii] 93 | }, 94 | { 95 | lazyarray::set_lazy_threads(4L) 96 | y <- lazyarray(temp_lazy, storage_format = 'double', dim = dim(subarray)) 97 | y[,,,1:ii] 98 | }, 99 | memory = TRUE, time_unit = 's', iterations = 1, check = FALSE, filter_gc = FALSE 100 | ) 101 | 102 | saveRDS(object = rds, file = sprintf('~/Dropbox/projects/lazyarray/experiment/data/read_%d.rds', ii)) 103 | }) 104 | 105 | 106 | 107 | -------------------------------------------------------------------------------- /experiment/old/test_io.R: -------------------------------------------------------------------------------- 1 | 2 | require(testthat) 3 | 4 | context("cpp - IO base") 5 | 6 | 7 | dim <- c(10,20,50); 8 | 9 | f <- normalizePath(tempfile(), mustWork = FALSE) 10 | 11 | test_that("IO - double, NumericVector", { 12 | 13 | x <- rnorm(10000); dim(x) <- dim 14 | unlink(f) 15 | test_data <- x; dim(test_data) <- c(200, 50); test_data <- as.data.frame(test_data) 16 | .Call(fstcore:::`_fstcore_fststore`, f, test_data, 100L, TRUE) 17 | expect_true(file.exists(f), label = "fstcore can write to file") 18 | 19 | unlink(f) 20 | # write to a file 21 | file.create(f) 22 | f <- normalizePath(f, mustWork = FALSE) 23 | unlink(f) 24 | lazyarray:::cpp_create_lazyarray(x, dim, f, 100L, TRUE); 25 | 26 | expect_true(file.exists(f), label = "cpp_create_lazyarray can write to file") 27 | 28 | 29 | # Make sure we have invalid indices 30 | idx_loc <- list( 31 | as.integer(sample(12) - 1), 32 | as.integer(sample(22)-1), 33 | as.integer(sample(52)-1) 34 | ) 35 | target_dim = sapply(idx_loc, length) 36 | y1 <- lazyarray:::lazyLoadOld(f, idx_loc, dim, 0.1) 37 | 38 | a = idx_loc[[1]]; a[(a<1) | (a >10)] = NA 39 | b = idx_loc[[2]]; b[b<1 | b > 20] = NA 40 | c = idx_loc[[3]]; c[c<1 | c > 50] = NA 41 | y2 <- x[a,b,c] 42 | 43 | expect_equal(sum(abs(is.na(y2) - is.na(y1))), 0, label = 'lazyarray subset (double) vs base -> index') 44 | expect_equal(range(y2 - y1, na.rm = TRUE), c(0,0), label = 'lazyarray subset (double) vs base -> value') 45 | 46 | # trying other loader 47 | y3 <- lazyarray:::lazyLoadOld(f, idx_loc, dim, 0L) 48 | expect_equal(range(as.integer(y2) - y3, na.rm = TRUE), c(0,0), label = 'lazyarray stored: double -> loader: int') 49 | 50 | y3 <- lazyarray:::lazyLoadOld(f, idx_loc, dim, '') 51 | expect_equal(local({y2 <- as.character(y2); dim(y2) = dim(y3); y2}), y3, label = 'lazyarray stored: double -> loader: char') 52 | 53 | }) 54 | 55 | 56 | test_that("IO - double, IntegerVector", { 57 | x <- as.integer(sample(10000)); dim(x) <- dim 58 | unlink(f) 59 | lazyarray:::cpp_create_lazyarray(x, dim, f, 100L, TRUE); 60 | 61 | expect_true(file.exists(f), label = "cpp_create_lazyarray can write to file") 62 | 63 | idx_loc <- list( 64 | as.integer(sample(12) - 1), 65 | as.integer(sample(22)-1), 66 | as.integer(sample(52)-1) 67 | ) 68 | y1 <- lazyarray:::lazyLoadOld(f, idx_loc, dim, 9L) 69 | 70 | a = idx_loc[[1]]; a[(a<1) | (a >10)] = NA 71 | b = idx_loc[[2]]; b[b<1 | b > 20] = NA 72 | c = idx_loc[[3]]; c[c<1 | c > 50] = NA 73 | y2 <- x[a,b,c] 74 | 75 | expect_equal(sum(abs(is.na(y2) - is.na(y1))), 0, label = 'lazyarray subset (int) vs base -> index') 76 | expect_equal(range(y2 - y1, na.rm = TRUE), c(0,0), label = 'lazyarray subset (int) vs base -> value') 77 | expect_true(is.integer(y1), label = "check if it's integer") 78 | 79 | # trying other loaders 80 | y3 <- lazyarray:::lazyLoadOld(f, idx_loc, dim, 0.0) 81 | expect_equal(range(y2 - y3, na.rm = TRUE), c(0,0), label = 'lazyarray stored: int -> loader: double') 82 | 83 | y3 <- lazyarray:::lazyLoadOld(f, idx_loc, dim, '') 84 | expect_equal(local({y2 <- as.character(y2); dim(y2) = dim(y3); y2}), y3, label = 'lazyarray stored: int -> loader: char') 85 | 86 | }) 87 | 88 | 89 | 90 | test_that("IO - double, CharacterVector", { 91 | 92 | x <- paste(sample(LETTERS, 10000, replace = TRUE), " asdasd"); dim(x) <- dim 93 | unlink(f) 94 | lazyarray:::cpp_create_lazyarray(x, dim, f, 100L, TRUE); 95 | 96 | expect_true(file.exists(f), label = "cpp_create_lazyarray can write to file") 97 | 98 | idx_loc <- list( 99 | as.integer(sample(12) - 1), 100 | as.integer(sample(22)-1), 101 | as.integer(sample(52)-1) 102 | ) 103 | y1 <- lazyarray:::lazyLoadOld(f, idx_loc, dim, '') 104 | 105 | a = idx_loc[[1]]; a[(a<1) | (a >10)] = NA 106 | b = idx_loc[[2]]; b[b<1 | b > 20] = NA 107 | c = idx_loc[[3]]; c[c<1 | c > 50] = NA 108 | y2 <- x[a,b,c] 109 | 110 | expect_equal(sum(abs(is.na(y2) - is.na(y1))), 0, label = 'lazyarray subset (char) vs base -> index') 111 | expect_equal(y1, y2, label = 'lazyarray subset (char) vs base -> value') 112 | expect_true(is.character(y1), label = "check if it's integer") 113 | 114 | # trying other loaders 115 | expect_error(lazyarray:::lazyLoadOld(f, idx_loc, dim, 0.0)) 116 | 117 | expect_error(lazyarray:::lazyLoadOld(f, idx_loc, dim, 0L)) 118 | 119 | x <- paste0(sample(0:9, 10000, replace = TRUE), ""); dim(x) <- dim 120 | unlink(f) 121 | lazyarray:::cpp_create_lazyarray(x, dim, f, 100L, TRUE); 122 | 123 | expect_error(lazyarray:::lazyLoadOld(f, idx_loc, dim, 0.0)) 124 | 125 | expect_error(lazyarray:::lazyLoadOld(f, idx_loc, dim, 0L)) 126 | 127 | }) 128 | -------------------------------------------------------------------------------- /experiment/old/write.R: -------------------------------------------------------------------------------- 1 | f = tempfile() 2 | data = list( 3 | a = 1:10, 4 | b = 1:2 5 | ) 6 | 7 | 8 | .Call(fstcore:::`_fstcore_fststore`, normalizePath(f, mustWork = FALSE), data, 20L, TRUE) 9 | tbl = fst::fst(f) 10 | tbl 11 | 12 | 13 | 14 | 15 | 16 | -------------------------------------------------------------------------------- /experiment/performance-fst-vs-file.R: -------------------------------------------------------------------------------- 1 | # remotes::install_github('dipterix/lazyarray@filearray') 2 | 3 | require(lazyarray) 4 | 5 | path1 <- tempfile() 6 | path2 <- tempfile() 7 | 8 | dim <- c(100, 100, 100, 100) 9 | data <- rnorm(prod(dim)) 10 | pryr::object_size(data) 11 | 12 | fstarray <- fstarray(path1, dim) 13 | filearray <- lazyarray(path2, dim) 14 | 15 | fstarray[] <- data 16 | filearray[] <- data 17 | 18 | system.time({fstarray[,,,]}) 19 | 20 | # user system elapsed 21 | # 2.380 1.052 1.296 22 | 23 | system.time({filearray[,,,]}) 24 | 25 | # user system elapsed 26 | # 0.377 0.322 0.281 27 | 28 | 29 | # range(fstarray[,,,] - filearray[,,,]) 30 | lazyarray:::setLazyBlockSize(100) 31 | -------------------------------------------------------------------------------- /inst/WORDLIST: -------------------------------------------------------------------------------- 1 | cloneable 2 | fstcore 3 | LazyArray 4 | LZ 5 | OpenMP 6 | readRDS 7 | ZSTD 8 | -------------------------------------------------------------------------------- /inst/include/interfaces/BMArray.h: -------------------------------------------------------------------------------- 1 | #ifndef API_LAZYARRAY_BMCLASS_H 2 | #define API_LAZYARRAY_BMCLASS_H 3 | 4 | #include "entry.h" 5 | #include "LazyArrayBase.h" 6 | 7 | namespace lazyarray { 8 | 9 | class BMArray : public LazyArrayBase { 10 | 11 | // Constructors and field getter/setter 12 | public: 13 | BMArray( 14 | BigMatrix *pMat, std::vector dimension, SEXPTYPE dataType 15 | ): 16 | LazyArrayBase(dimension, dataType), _xpMat( pMat ) 17 | { 18 | validate(); 19 | } 20 | 21 | // To be used by Rcpp modules 22 | virtual ~BMArray(){} 23 | 24 | // methods 25 | bool validate(bool stopIfError = true) override { 26 | 27 | // Rcpp::XPtr _xpMat(_addr); 28 | 29 | //_nparts _totalLen fstFiles dimension 30 | bool isValid = true; 31 | isValid = isValid && stopIfNot(_dimension.size() >= 2, "BMArray must dimension >= 2", stopIfError); 32 | isValid = isValid && stopIfNot(*(_dimension.end() - 1) == _nparts, "BMArray dimensions inconsistent with number of partitions", stopIfError); 33 | 34 | 35 | int64_t ncols = _xpMat->ncol(); 36 | int64_t nrows = _xpMat->nrow(); 37 | isValid = isValid && stopIfNot(ncols == _nparts, "BMArray internal matrix column inconsistent with number of partitions", stopIfError); 38 | 39 | int64_t expectedLen = std::accumulate(_dimension.begin(), _dimension.end(), INTEGER64_ONE, std::multiplies()); 40 | isValid = isValid && stopIfNot((expectedLen == _totalLen) && (nrows * ncols == _totalLen), 41 | "BMArray file counts inconsistent with number of partitions", stopIfError); 42 | 43 | // _dataType 13:int, 14:double, 15: complex, 16: string 44 | isValid = isValid && stopIfNot( 45 | _dataType == INTSXP || _dataType == REALSXP || _dataType == STRSXP, 46 | "BMArray data type invalid. Supported are: int(13), double(14), string(16)", stopIfError); 47 | 48 | return isValid; 49 | } 50 | 51 | 52 | SEXP subset(SEXP listOrEnv, SEXP reshape = R_NilValue, bool drop = false) override { 53 | NumericVector dim = getDim(); 54 | Rcpp::XPtr ptr(_xpMat); 55 | return subsetBM(Rcpp::Shield(Rcpp::wrap(ptr)), listOrEnv, dim, _dataType, reshape, drop); 56 | // return R_NilValue; 57 | }; 58 | 59 | private: 60 | BigMatrix* _xpMat; 61 | }; 62 | 63 | } 64 | 65 | #endif // API_LAZYARRAY_BMCLASS_H 66 | -------------------------------------------------------------------------------- /inst/include/interfaces/FstArray.h: -------------------------------------------------------------------------------- 1 | #ifndef API_LAZYARRAY_FSTCLASS_H 2 | #define API_LAZYARRAY_FSTCLASS_H 3 | 4 | #include 5 | #include "LazyArrayBase.h" 6 | 7 | namespace lazyarray { 8 | 9 | class FstArray : public LazyArrayBase { 10 | 11 | // Constructors and field getter/setter 12 | public: 13 | FstArray( 14 | const std::string rootPath, std::vector dimension, 15 | SEXPTYPE dataType, int& compression, bool& uniformEncoding 16 | ): 17 | LazyArrayBase(dimension, dataType), 18 | _compression(compression), _uniformEncoding(uniformEncoding) 19 | { 20 | if(rootPath.size() == 0){ 21 | _rootPath = "./"; 22 | } else { 23 | std::string ending = "/"; 24 | if(std::equal(ending.rbegin(), ending.rend(), rootPath.rbegin())){ 25 | _rootPath = rootPath; 26 | } else { 27 | _rootPath = rootPath + ending; 28 | } 29 | } 30 | validate(); 31 | } 32 | 33 | virtual ~FstArray(){ destroy(); } 34 | 35 | // methods 36 | inline bool validate(bool stopIfError = true) { 37 | //_nparts _totalLen fstFiles dimension 38 | bool isValid = LazyArrayBase::validate(stopIfError); 39 | isValid = isValid && stopIfNot( 40 | _dataType == INTSXP || _dataType == REALSXP || _dataType == CPLXSXP || _dataType == STRSXP, 41 | "FstArray/FstMatrix data type invalid. Supported are: int(13), double(14), complex(15), string(16)", stopIfError); 42 | return isValid; 43 | } 44 | 45 | inline SEXP subset(SEXP listOrEnv, SEXP reshape = R_NilValue, bool drop = false) override { 46 | tok("S subset"); 47 | SEXP res = subsetFST(_rootPath, listOrEnv, _dimension, _dataType, reshape, drop);; 48 | tok("E subset"); 49 | return res; 50 | }; 51 | 52 | inline SEXP subsetAssign(SEXP values, SEXP listOrEnv) override { 53 | tok("S subsetAssign"); 54 | subsetAssignFST(values, _rootPath, listOrEnv, _dimension, _dataType, _compression, _uniformEncoding); 55 | tok("E subsetAssign"); 56 | return R_NilValue; 57 | } 58 | 59 | inline Rcpp::StringVector get_partition_path(SEXP part = R_NilValue){ 60 | tok("S get_partition_path"); 61 | Rcpp::StringVector fstFiles; 62 | if(Rf_isNull(part)){ 63 | fstFiles = Rcpp::StringVector(_nparts); 64 | Rcpp::StringVector::iterator ptr_fstFiles = fstFiles.begin(); 65 | for(int64_t ii = 1; ptr_fstFiles != fstFiles.end(); ii++, ptr_fstFiles++){ 66 | *ptr_fstFiles = _rootPath + std::to_string(ii) + ".fst"; 67 | } 68 | } else { 69 | std::vector part_alt = as>(part); 70 | fstFiles = Rcpp::StringVector(part_alt.size()); 71 | Rcpp::StringVector::iterator ptr_fstFiles = fstFiles.begin(); 72 | std::vector::iterator ptr_part_alt = part_alt.begin(); 73 | for(; ptr_fstFiles != fstFiles.end(); ptr_part_alt++, ptr_fstFiles++){ 74 | *ptr_fstFiles = _rootPath + std::to_string(*ptr_part_alt) + ".fst"; 75 | } 76 | } 77 | tok("E get_partition_path"); 78 | return fstFiles; 79 | } 80 | 81 | 82 | protected: 83 | int _compression; 84 | bool _uniformEncoding; 85 | std::string _rootPath; 86 | 87 | 88 | }; 89 | 90 | 91 | } 92 | 93 | 94 | #endif // API_LAZYARRAY_FSTCLASS_H 95 | -------------------------------------------------------------------------------- /inst/include/interfaces/FstMatrix.h: -------------------------------------------------------------------------------- 1 | #ifndef API_LAZYARRAY_FSTMATRIX_H 2 | #define API_LAZYARRAY_FSTMATRIX_H 3 | 4 | #include "FstArray.h" 5 | 6 | namespace lazyarray { 7 | 8 | class FstMatrix : public FstArray { 9 | 10 | // Constructors and field getter/setter 11 | public: 12 | FstMatrix( 13 | const std::string rootPath, std::vector dimension, bool& transposed, 14 | SEXPTYPE dataType, int& compression, bool& uniformEncoding 15 | ): FstArray(rootPath, dimension, dataType, compression, uniformEncoding), _transposed(transposed) {} 16 | 17 | virtual ~FstMatrix(){ destroy(); } 18 | 19 | bool validate(bool stopIfError = true) { 20 | //_nparts _totalLen fstFiles dimension 21 | bool isValid = FstArray::validate(stopIfError); 22 | isValid = isValid && stopIfNot(_dimension.size() == 2, "FstMatrix must be two dimension", stopIfError); 23 | return isValid; 24 | } 25 | 26 | NumericVector getDim() override { 27 | NumericVector dim(2); 28 | if( _transposed ){ 29 | dim[0] = _dimension[1]; 30 | dim[1] = _dimension[0]; 31 | dim.attr("is_transposed") = true; 32 | } else { 33 | dim[0] = _dimension[0]; 34 | dim[1] = _dimension[1]; 35 | } 36 | return dim; 37 | } 38 | 39 | // SEXP subset(SEXP listOrEnv, SEXP reshape = R_NilValue, bool drop = false) override { 40 | // validate(); 41 | // 42 | // List sliceIdx = extractSlices(listOrEnv, 2); 43 | // bool resultNeedTranspose = false; 44 | // if(_transposed){ 45 | // resultNeedTranspose = true; 46 | // if( sliceIdx.size() == 2 ){ 47 | // SEXP tmp = sliceIdx[0]; 48 | // sliceIdx[0] = sliceIdx[1]; 49 | // sliceIdx[1] = tmp; 50 | // } else if( sliceIdx.size() == 1 && sliceIdx[0] != R_MissingArg ){ 51 | // // x[i] -> calculate row & col for i, switch back and calculate new i 52 | // resultNeedTranspose = false; 53 | // std::vector idx = sliceIdx[0]; 54 | // int64_t tmp; 55 | // int64_t nrow = _dimension[1]; // column is row now 56 | // int64_t ncol = _dimension[0]; 57 | // for(std::vector::iterator ptr_idx = idx.begin(); ptr_idx != idx.end(); ptr_idx++ ){ 58 | // if(*ptr_idx != NA_REAL && *ptr_idx != LLONG_MIN){ 59 | // tmp = ((*ptr_idx) - 1) % nrow; 60 | // *ptr_idx = ((*ptr_idx) - 1 - tmp) / nrow + tmp * ncol + 1; 61 | // } 62 | // } 63 | // } 64 | // } 65 | // NumericVector dim = int64t2NumericVector(_dimension); 66 | // const List subparsed = parseAndScheduleBlocks(sliceIdx, dim); 67 | // Rcpp::checkUserInterrupt(); 68 | // 69 | // Rcpp::StringVector fstFiles = get_partition_path(); 70 | // 71 | // SEXP res = subsetFSTBare(fstFiles, subparsed, dim, _dataType); 72 | // if( resultNeedTranspose ){ 73 | // Environment base = Environment::base_env(); 74 | // res = as(base["t.default"])(res); 75 | // } 76 | // return res; 77 | // }; 78 | 79 | // SEXP subsetAssign(SEXP values, SEXP listOrEnv) override { 80 | // validate(); 81 | // // List sliceIdx = extractSlices(listOrEnv, 2); 82 | // // bool resultNeedTranspose = false; 83 | // // if(_transposed){ 84 | // // resultNeedTranspose = true; 85 | // // if( sliceIdx.size() == 2 ){ 86 | // // SEXP tmp = sliceIdx[0]; 87 | // // sliceIdx[0] = sliceIdx[1]; 88 | // // sliceIdx[1] = tmp; 89 | // // } else if( sliceIdx.size() == 1 && sliceIdx[0] != R_MissingArg ){ 90 | // // // x[i] -> calculate row & col for i, switch back and calculate new i 91 | // // resultNeedTranspose = false; 92 | // // std::vector idx = sliceIdx[0]; 93 | // // int64_t tmp; 94 | // // int64_t nrow = _dimension[1]; // column is row now 95 | // // int64_t ncol = _dimension[0]; 96 | // // for(std::vector::iterator ptr_idx = idx.begin(); ptr_idx != idx.end(); ptr_idx++ ){ 97 | // // if(*ptr_idx != NA_REAL && *ptr_idx != NA_INTEGER64){ 98 | // // tmp = ((*ptr_idx) - 1) % nrow; 99 | // // *ptr_idx = ((*ptr_idx) - 1 - tmp) / nrow + tmp * ncol + 1; 100 | // // } 101 | // // } 102 | // // } 103 | // // } 104 | // // NumericVector dim = int64t2NumericVector(_dimension); 105 | // // const List subparsed = parseAndScheduleBlocks(sliceIdx, dim); 106 | // // Rcpp::checkUserInterrupt(); 107 | // // 108 | // // SEXP res = subsetFSTBare(fstFiles, subparsed, dim, _dataType); 109 | // // if( resultNeedTranspose ){ 110 | // // Environment base = Environment::base_env(); 111 | // // res = as(base["t.default"])(res); 112 | // // } 113 | // // return res; 114 | // // subsetAssignFST(values, fstFiles, listOrEnv, dim, _dataType, _compression, _uniformEncoding); 115 | // return R_NilValue; 116 | // } 117 | 118 | protected: 119 | bool _transposed; 120 | 121 | 122 | }; 123 | 124 | 125 | } 126 | 127 | 128 | #endif // API_LAZYARRAY_FSTMATRIX_H 129 | -------------------------------------------------------------------------------- /inst/include/interfaces/LazyArrayBase.h: -------------------------------------------------------------------------------- 1 | #ifndef API_LAZYARRAY_BASECLASS_H 2 | #define API_LAZYARRAY_BASECLASS_H 3 | 4 | #include "entry.h" 5 | 6 | namespace lazyarray { 7 | 8 | namespace { 9 | const int64_t INTEGER64_ONE = 1; 10 | 11 | 12 | inline Rcpp::NumericVector int64t2NumericVector(std::vector x){ 13 | Rcpp::NumericVector re = Rcpp::NumericVector(x.begin(), x.end()); 14 | return(re); 15 | } 16 | } 17 | 18 | class LazyArrayBase { 19 | 20 | public: 21 | LazyArrayBase( 22 | std::vector dimension, SEXPTYPE dataType 23 | ): 24 | _dimension(dimension), _dataType(dataType), _readOnly(true) 25 | { 26 | _nparts = *(_dimension.end() - 1); 27 | _totalLen = std::accumulate(_dimension.begin(), _dimension.end(), INTEGER64_ONE, std::multiplies()); 28 | _partLen = _totalLen / _nparts; 29 | // validate(); 30 | } 31 | 32 | virtual ~LazyArrayBase(){ destroy(); } 33 | 34 | // dimension of array 35 | 36 | 37 | // getter 38 | int64_t nparts() const { return _nparts; } 39 | int64_t partLen() const { return _partLen; } 40 | int dataType() const { return static_cast(_dataType); } 41 | bool getReadOnly() const { return _readOnly; } 42 | virtual NumericVector getDim() { return int64t2NumericVector(_dimension); } 43 | 44 | // setter 45 | bool setReadOnly(const bool isReadOnly){ 46 | _readOnly = isReadOnly; 47 | return _readOnly; 48 | } 49 | 50 | // abstract methods 51 | 52 | virtual SEXP subset(SEXP listOrEnv, SEXP reshape = R_NilValue, bool drop = false) { 53 | stop("c++: LazyArrayBase::subset not implemented"); 54 | }; 55 | virtual SEXP subsetAssign(SEXP values, SEXP listOrEnv) { 56 | stop("c++: LazyArrayBase::subsetAssign not implemented"); 57 | } 58 | 59 | // pre-defined members 60 | /** 61 | * Validate whether the array has errors 62 | * @param stopIfError if true, stop or raise exceptions when errors are detected 63 | * @return true or false whether the array passes the validation test 64 | */ 65 | bool validate(bool stopIfError = true) { 66 | bool isValid = true; 67 | isValid = isValid && stopIfNot(_dimension.size() >= 2, "LazyArray must have dimension size >= 2", stopIfError); 68 | isValid = isValid && stopIfNot(*(_dimension.end() - 1) == _nparts, "LazyArray dimensions inconsistent with number of partitions", stopIfError); 69 | int64_t expectedLen = std::accumulate(_dimension.begin(), _dimension.end(), INTEGER64_ONE, std::multiplies()); 70 | isValid = isValid && stopIfNot(expectedLen == _totalLen, "LazyArray file counts inconsistent with number of partitions", stopIfError); 71 | return isValid; 72 | }; 73 | 74 | /** 75 | * Parse indices and prepare the subset 76 | * @param sliceIdx a list (VECSXP), its length should match with dimension size, 77 | * and each element should be REALSXP (int64_t) or INTSXP (int), 78 | * or just R_MissingValue indicating the indices to subset. 79 | * For example, 80 | * x[1,2,3] - sliceIdx=list(1, 2, 3) 81 | * x[, c(0:10,NA)] - sliceIdx=list(get_missing_value(), c(0:10,NA_real_)) 82 | * x[] - sliceIdx=list() or list(get_missing_value()) 83 | * x[-(1,2)] - sliceIdx=list(c(-1,-1)) 84 | * @return A list containing parsed information: 85 | * - subset_mode (int): subset mode: 0 means x[i,j,k], 1 means x[i], 2 means x[] 86 | * - target_dimension (std::vector or NumericVector): expected dimension of results 87 | * - expected_length (int64_t): expected result length (= prod(target_dimension)) 88 | * - negative_subscript: whether this margin should be negative sliced (currectly all false) 89 | * - location_indices: a list of cleaned sliceIdx. For example: 90 | * [expression] => [location_indices] 91 | * x[1,2,3] => list(1,2,3) 92 | * x[, c(0:10,NA)] => list({R_MissingValue}, c(1:10,NA_REAL)) 93 | * x[] => list({R_MissingValue}) 94 | * x[-(0:1),2] => list(2:3, 2), suppose dimension is c(3,4) 95 | * - schedule: a list or R_NilValue. If subset=0, schedule will schedule indexing 96 | * blocks to read from array. 97 | */ 98 | inline Rcpp::List scheduleBlocks(SEXP sliceIdx) { 99 | NumericVector dim = int64t2NumericVector(_dimension); 100 | return parseAndScheduleBlocks2(sliceIdx, dim); 101 | }; 102 | 103 | 104 | 105 | 106 | protected: 107 | 108 | std::vector _dimension; 109 | 110 | // Number of partitions - last dimension 111 | int64_t _nparts; 112 | 113 | // Partition size 114 | int64_t _partLen; 115 | 116 | // 117 | int64_t _totalLen; 118 | SEXPTYPE _dataType; 119 | bool _readOnly; 120 | 121 | protected: 122 | virtual void destroy(){} 123 | 124 | }; 125 | 126 | } 127 | 128 | #endif // API_LAZYARRAY_BASECLASS_H 129 | -------------------------------------------------------------------------------- /inst/include/interfaces/entry.h: -------------------------------------------------------------------------------- 1 | #ifndef API_LAZYARRAY_ENTRY 2 | #define API_LAZYARRAY_ENTRY 3 | 4 | // This first 5 | // migrated to BMArray 6 | // include 7 | 8 | #include 9 | 10 | #include "lazyarray_RcppExports.h" 11 | 12 | #endif // API_LAZYARRAY_ENTRY 13 | -------------------------------------------------------------------------------- /inst/include/lazyarray.h: -------------------------------------------------------------------------------- 1 | #ifndef RCPP_lazyarray_H_GEN_ 2 | #define RCPP_lazyarray_H_GEN_ 3 | 4 | #include "interfaces/entry.h" 5 | #include "interfaces/LazyArrayBase.h" 6 | #include "interfaces/FstArray.h" 7 | #include "interfaces/FstMatrix.h" 8 | //include "interfaces/BMArray.h" 9 | 10 | // namespace lazyarray { 11 | // namespace { 12 | // const int64_t INTEGER64_ONE = 1; 13 | // 14 | // 15 | // inline Rcpp::NumericVector int64t2NumericVector(std::vector x){ 16 | // Rcpp::NumericVector re = Rcpp::NumericVector(x.begin(), x.end()); 17 | // return(re); 18 | // } 19 | // } 20 | // 21 | // 22 | // // Implement class members here 23 | // Rcpp::List scheduleBlocks(SEXP sliceIdx); 24 | // LazyArrayBase::bool validate(bool stopIfError = true); 25 | // 26 | // 27 | // } 28 | 29 | #endif // RCPP_lazyarray_H_GEN_ 30 | -------------------------------------------------------------------------------- /lazyarray.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: Default 4 | SaveWorkspace: Default 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: knitr 13 | LaTeX: pdfLaTeX 14 | 15 | AutoAppendNewline: Yes 16 | 17 | BuildType: Package 18 | PackageUseDevtools: Yes 19 | PackageInstallArgs: --no-multiarch --with-keep.source 20 | PackageCheckArgs: --as-cran 21 | PackageRoxygenize: rd,collate,namespace,vignette 22 | -------------------------------------------------------------------------------- /man/.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | -------------------------------------------------------------------------------- /man/auto_clear_lazyarray.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/aaa.R 3 | \name{auto_clear_lazyarray} 4 | \alias{auto_clear_lazyarray} 5 | \title{Automatically remove array data} 6 | \usage{ 7 | auto_clear_lazyarray(x, onexit = FALSE) 8 | } 9 | \arguments{ 10 | \item{x}{'lazyarray' instance} 11 | 12 | \item{onexit}{passed to \code{\link{reg.finalizer}}} 13 | } 14 | \description{ 15 | Remove the files containing array data once no 16 | 'lazyarray' instance is using the folder. Require 17 | installation of \code{dipsaus} package (at least version 0.0.8). 18 | } 19 | \details{ 20 | \code{auto_clear_lazyarray} attempts to remove the entire folder 21 | containing array data. However, if some files are not created by the 22 | array, only partition data and meta file will be removed, all the 23 | artifacts will remain and warning will be displayed. One exception is 24 | if all files left in the array directory are \code{*.meta} files, 25 | all these meta files will be removed along with the folder. 26 | } 27 | \examples{ 28 | 29 | path <- tempfile() 30 | arr_dbl <- lazyarray(path, storage_format = 'double', 31 | dim = 2:4, meta_name = 'meta-dbl.meta') 32 | arr_dbl[] <- 1:24 33 | auto_clear_lazyarray(arr_dbl) 34 | 35 | arr_chr <- lazyarray(path, storage_format = 'character', 36 | dim = 2:4, meta_name = 'meta-chr.meta') 37 | auto_clear_lazyarray(arr_chr) 38 | 39 | # remove either one, the directory still exists 40 | rm(arr_dbl); invisible(gc(verbose = FALSE)) 41 | 42 | arr_chr[1,1,1] 43 | 44 | # Remove the other one, and path will be removed 45 | rm(arr_chr); invisible(gc(verbose = FALSE)) 46 | 47 | dir.exists(path) 48 | arr_check <- lazyarray(path, storage_format = 'character', 49 | dim = 2:4, meta_name = 'meta-chr') 50 | 51 | # data is removed, so there should be no data (NAs) 52 | arr_check[] 53 | 54 | } 55 | \author{ 56 | Zhengjia Wang 57 | } 58 | -------------------------------------------------------------------------------- /man/chunk_map.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/generics.R 3 | \name{chunk_map} 4 | \alias{chunk_map} 5 | \title{Apply functions to all partitions, but small chunks each time} 6 | \usage{ 7 | chunk_map(x, map_fun, reduce, max_nchunks, chunk_size, ...) 8 | } 9 | \arguments{ 10 | \item{x}{a \code{LazyArray} or R array} 11 | 12 | \item{map_fun}{function to apply to each chunk} 13 | 14 | \item{reduce}{similar to \code{reduce} in \code{\link{partition_map}}} 15 | 16 | \item{max_nchunks}{maximum number of chunks. If number of chunks is too 17 | large, then \code{chunk_size} will be re-calculated.} 18 | 19 | \item{chunk_size}{integer chunk size. If \code{chunk_size} is too small, it 20 | will be ignored} 21 | 22 | \item{...}{ignored or passed to other methods} 23 | } 24 | \value{ 25 | If \code{reduce} is missing, returns a list of results. Each result 26 | is returned by \code{map_fun}, and the total length equals to number of 27 | chunks mapped. If \code{reduce} is a function, that list of results will 28 | be passed to \code{reduce} and \code{chunk_map} returns the results 29 | generated from \code{reduce}. 30 | } 31 | \description{ 32 | Apply functions to all partitions, but small chunks each time 33 | } 34 | \details{ 35 | The difference between \code{chunk_map} and 36 | \code{partition_map} is the margin or direction to apply mapping 37 | functions. In \code{partition_map}, mapping function is applied to 38 | each partition. If \code{x} is a matrix, this means applying to each column. 39 | \code{chunk_map} generate small chunks along all dimensions except the last, 40 | and apply mapping functions to each chunks. If \code{x} is a matrix, it 41 | make chunks along rows and apply mapping functions along rows. 42 | } 43 | \examples{ 44 | 45 | x <- as.lazymatrix(matrix(1:100, ncol = 2)) 46 | x 47 | 48 | # Set max_nchunks=Inf and chunk_size=10 to force total number of chunks 49 | # is around nrow(x)/10 and each chunk contains at most 10 rows 50 | chunk_map(x, function(chunk){chunk[1:2,]}, chunk_size = 10, max_nchunks = Inf) 51 | 52 | # For each chunks, calculate mean, then calculate the mean of chunk mean 53 | chunk_map(x, function(chunk) { 54 | colMeans(chunk) 55 | }, function(chunk_means) { 56 | Reduce('+', chunk_means) / length(chunk_means) 57 | }) 58 | 59 | colMeans(x[]) 60 | 61 | 62 | } 63 | \seealso{ 64 | \code{\link{partition_map}} 65 | } 66 | -------------------------------------------------------------------------------- /man/crossprod.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/zzz.R 3 | \name{crossprod} 4 | \alias{crossprod} 5 | \alias{crossprod,AbstractLazyArray,AbstractLazyArray-method} 6 | \alias{crossprod,AbstractLazyArray,NULL-method} 7 | \alias{crossprod,AbstractLazyArray,missing-method} 8 | \alias{crossprod,AbstractLazyArray,matrix-method} 9 | \title{Matrix Crossproduct} 10 | \usage{ 11 | \S4method{crossprod}{AbstractLazyArray,AbstractLazyArray}(x, y = NULL, weights = NULL, ...) 12 | 13 | \S4method{crossprod}{AbstractLazyArray,`NULL`}(x, y = NULL, weights = NULL, ...) 14 | 15 | \S4method{crossprod}{AbstractLazyArray,missing}(x, y = NULL, weights = NULL, ...) 16 | 17 | \S4method{crossprod}{AbstractLazyArray,matrix}(x, y = NULL, weights = NULL, ...) 18 | } 19 | \arguments{ 20 | \item{x}{a \code{LazyArray} or an R matrix} 21 | 22 | \item{y}{\code{NULL} or matrix} 23 | 24 | \item{weights}{numeric vector used as weight} 25 | 26 | \item{...}{passed to further methods} 27 | } 28 | \value{ 29 | Matrix of cross product if data is small, or \code{LazyMatrix} if 30 | matrix is too large 31 | } 32 | \description{ 33 | Matrix Crossproduct 34 | } 35 | \examples{ 36 | 37 | x <- matrix(1:100, 50) 38 | crossprod(x) 39 | 40 | lazy_x <- as.lazymatrix(x) 41 | crossprod(lazy_x)[] 42 | 43 | weights <- (1:50)/50 44 | 45 | t(x) \%*\% diag(weights) \%*\% x 46 | crossprod(lazy_x, weights = weights) 47 | 48 | \dontrun{ 49 | 50 | # large data set ~ 1.6GB 51 | x <- as.lazymatrix(matrix(rnorm(2e8), ncol = 2)) 52 | 53 | crossprod(x) 54 | } 55 | 56 | 57 | } 58 | -------------------------------------------------------------------------------- /man/lazy_parallel.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/zzz.R 3 | \name{lazy_parallel} 4 | \alias{lazy_parallel} 5 | \title{Schedule parallel processes for \code{LazyArray}} 6 | \usage{ 7 | lazy_parallel( 8 | strategy = c("multisession", "multicore", "multiprocess", "cluster", "remote", 9 | "callr"), 10 | enabled = TRUE, 11 | workers = "auto", 12 | ... 13 | ) 14 | } 15 | \arguments{ 16 | \item{strategy}{strategies to apply, see \code{\link[future]{plan}} for 17 | some of the details. For \code{"callr"} plan, please install package} 18 | 19 | \item{enabled}{whether multiple-process strategy is enabled} 20 | 21 | \item{workers}{positive integer or \code{"auto"}, number of 'CPU' to use. 22 | The default value is \code{"auto"}, i.e. \code{future::availableCores()}} 23 | 24 | \item{...}{Further passed to \code{\link[future]{plan}}} 25 | } 26 | \description{ 27 | Enable parallel processing, need \code{dipsaus} to be installed. 28 | For \code{"callr"} strategy, please also install \code{future.callr}. 29 | } 30 | -------------------------------------------------------------------------------- /man/lazyarray-threads.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/aaa.R 3 | \name{lazyarray-threads} 4 | \alias{lazyarray-threads} 5 | \alias{set_lazy_threads} 6 | \alias{get_lazy_threads} 7 | \title{Set Number of Threads for Lazy Arrays} 8 | \usage{ 9 | set_lazy_threads( 10 | nr_of_threads = getOption("lazyarray.nthreads"), 11 | reset_after_fork = NULL 12 | ) 13 | 14 | get_lazy_threads(max = FALSE) 15 | } 16 | \arguments{ 17 | \item{nr_of_threads}{number of CPU cores to use, or \code{NULL} to 18 | stay unchanged, default is \code{getOption('lazyarray.nthreads')}} 19 | 20 | \item{reset_after_fork}{whether to reset after forked process} 21 | 22 | \item{max}{whether return maximum available threads} 23 | } 24 | \value{ 25 | Number of cores currently used. 26 | } 27 | \description{ 28 | Set number of threads used by 'OpenMP' for both \code{lazyarray} 29 | and \code{fstcore} packages. 30 | } 31 | \seealso{ 32 | \code{\link[fstcore]{threads_fstlib}} 33 | } 34 | -------------------------------------------------------------------------------- /man/lazyarray.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/lazyarray.R 3 | \name{lazyarray} 4 | \alias{lazyarray} 5 | \alias{fstarray} 6 | \alias{filearray} 7 | \alias{as.lazymatrix} 8 | \alias{as.lazyarray} 9 | \title{Create or load a \code{lazyarray} instance} 10 | \usage{ 11 | lazyarray( 12 | path, 13 | dim, 14 | read_only = FALSE, 15 | type = c("filearray", "fstarray"), 16 | storage_format = c("double", "integer", "complex", "character"), 17 | meta_name = "lazyarray.meta" 18 | ) 19 | 20 | fstarray( 21 | path, 22 | dim, 23 | read_only = FALSE, 24 | storage_format = c("double", "integer", "complex", "character"), 25 | meta_name = "lazyarray.meta" 26 | ) 27 | 28 | filearray( 29 | path, 30 | dim, 31 | read_only = FALSE, 32 | storage_format = c("double", "integer"), 33 | meta_name = "lazyarray.meta" 34 | ) 35 | 36 | as.lazymatrix(x, ...) 37 | 38 | as.lazyarray(x, path, type = "filearray", ...) 39 | } 40 | \arguments{ 41 | \item{path}{path to a local drive where array data should be stored} 42 | 43 | \item{dim}{integer vector, dimension of array, see \code{\link{dim}}} 44 | 45 | \item{read_only}{whether created array is read-only} 46 | 47 | \item{type}{the back-end implementation of the array; choices are 48 | \code{"filearray"} and \code{"fstarray"}.} 49 | 50 | \item{storage_format}{data type, choices are \code{"double"}, 51 | \code{"integer"}, \code{"character"}, and \code{"complex"}; see details} 52 | 53 | \item{meta_name}{header file name, default is \code{"lazyarray.meta"}} 54 | 55 | \item{x}{An R matrix or array} 56 | 57 | \item{...}{passed into \code{lazyarray}} 58 | } 59 | \value{ 60 | An \code{R6} class of \code{lazyarray}. The class name is either 61 | \code{FstArray} or \code{FileArray}, depending on \code{type} specified. 62 | Both inherit \code{AbstractLazyArray}. 63 | } 64 | \description{ 65 | Creates or load a \code{lazyarray} that stores data on the hard 66 | disks. The data content is load on demand. 67 | } 68 | \details{ 69 | The function \code{lazyarray()} can either create or load an array 70 | on the hard drives. When \code{path} exists as a directory, and there is 71 | a valid array instance stored, \code{lazyarray} will ignore other parameters 72 | such as \code{storage_format}, \code{type}, and sometimes \code{dim} (see 73 | Section "Array Partitions"). The function will try to load the existing array 74 | given by the descriptive meta file. When \code{path} is missing or there is 75 | no valid array files inside of the directory, then a new array will be 76 | spawned, and \code{path} will be created automatically if it is missing. 77 | 78 | There are two back-end implementations for \code{lazyarray()}: 79 | \code{"filearray"} and \code{"fstarray"}. You can use \code{type} to 80 | specify which implementation serves your needs. There are some differences 81 | between these two types. Each one has its own strengths and weaknesses. 82 | Please see Section "Array Types" for more details. 83 | 84 | The argument \code{meta_name} specifies the name of file which stores 85 | all the attribute information such as the total dimension, partition size, 86 | file format, and storage format etc. There could be multiple meta files for 87 | the same array object; see Section "Array Partitions" for details. 88 | } 89 | \section{Array Types}{ 90 | 91 | 92 | Type \code{filearray} stores data in its binary form "as-is" to the local 93 | drives. This format is compatible with the package \code{filematrix}. 94 | The data types supported are integers and double-float numbers. 95 | 96 | Type \code{fstarray} stores data in \code{fst} format defined by the 97 | package \code{fstcore} using 'ZSTD' compression technique. Unlike 98 | \code{filearray}, \code{fstarray} supports complex numbers and string 99 | characters in addition to integer and double numbers. 100 | 101 | The performance on solid-state drives mounted on 'NVMe' shows 102 | \code{filearray} can reach up to 3 GB per second for reading speed and 103 | \code{fstarray} can reach up to 1 GB per second. 104 | 105 | By default, \code{filearray} will be used if the storage format is supported, 106 | and \code{fstarray} is the back-up option. However, if the array data is 107 | structured or ordered, or the storage size is a major concern, 108 | \code{fstarray} might achieve a better performance because it compresses 109 | data before writing to hard drive. 110 | 111 | To explicitly create file array, use the function \code{filearray()}. 112 | Similarly, use \code{fstarray()} to create \code{fst}-based array. 113 | } 114 | 115 | \section{Array Partitions}{ 116 | 117 | 118 | A \code{lazyarray} partitions data in two ways: file partitions and in-file 119 | blocks. 120 | 121 | 1. File-level Partition: 122 | 123 | The number of file partitions matches with the last array margin. 124 | Given a \eqn{100 x 200 x 30 x 4} array, there will be 4 partitions, each 125 | partition stores a slice of data containing a \eqn{100 x 200 x 30} 126 | sub-array, or \eqn{2,400,000} elements. 127 | 128 | Once an array is created, the length of each partition does not change 129 | anymore. However, the shape of each partition can be changed. The number of 130 | partitions can grow or trim. To change these, you just need to create a 131 | new meta file and specify the new dimension at no additional cost. Use 132 | the previous example. The partition sub-dimension can be 133 | \eqn{10000 x 60}, \eqn{2000 x 300}, or \eqn{1000 x 200 x 3} as 134 | long as the total length matches. The total partitions can change to 135 | 3, 5, or 100, or any positive integer. To change the total dimension to 136 | \eqn{2400000 x 100}, you can call \code{lazyarray} with the new dimension ( 137 | see examples). Please make sure the \code{type} and \code{meta_name} are 138 | specified. 139 | 140 | 2. In-file Blocks: 141 | 142 | Within each file, the data are stored in blocks. When reading the data, if 143 | an element within each block is used, then the whole block gets read. 144 | 145 | For \code{filearray}, the block size equals to the first margin. For 146 | example, a \eqn{100 x 200 x 3} file array will have 3 file partitions, 147 | 200 blocks, each block has 100 elements 148 | 149 | As for \code{fstarray}, the lower bound of block size can be set by 150 | \code{options(lazyarray.fstarray.blocksize=...)}. By default, this number is 151 | 16,384. For a \eqn{100 x 200 x 3} array, each partition only has one block 152 | and block number if 20,000. 153 | } 154 | 155 | \section{Indexing and Recommended Dimension Settings}{ 156 | 157 | 158 | If there is a dimension that defines the unit of analysis, then make it the 159 | last margin index. If a margin is rarely indexed, put it in the first margin. 160 | This is because indexing along the last margin is the fastest, and indexing 161 | along the first margin is the slowest. 162 | 163 | If \code{x} has \eqn{200 x 200 x 200} dimension, \code{x[,,i]} is the 164 | fastest, then \code{x[,i,]}, then \code{x[i,,]}. 165 | } 166 | 167 | \examples{ 168 | 169 | library(lazyarray) 170 | 171 | path <- tempfile() 172 | 173 | # ---------------- case 1: Create new array ------------------ 174 | arr <- lazyarray(path, storage_format = 'double', dim = c(2,3,4)) 175 | arr[] <- 1:24 176 | 177 | # Subset and get the first partition 178 | arr[,,1] 179 | 180 | # Partition file path (total 4 partitions) 181 | arr$get_partition_fpath() 182 | 183 | # Removing array doesn't clear the data 184 | rm(arr); gc() 185 | 186 | # ---------------- Case 2: Load from existing directory ---------------- 187 | # Load from existing path, no need to specify other params 188 | arr <- lazyarray(path, read_only = TRUE) 189 | 190 | summary(arr, quiet = TRUE) 191 | 192 | # ---------------- Case 3: Import from existing data ---------------- 193 | 194 | # Change dimension to 6 x 20 195 | 196 | arr1 <- lazyarray(path, dim = c(6,20), meta_name = "arr_6x20.meta") 197 | 198 | arr1[,1:5] 199 | 200 | arr1[,1:6] <- rnorm(36) 201 | 202 | # arr also changes 203 | arr[,,1] 204 | 205 | 206 | # ---------------- Case 4: Converting from R arrays ---------------- 207 | 208 | x <- matrix(1:16, 4) 209 | x <- as.lazymatrix(x, type = 'fstarray', storage_format = "complex") 210 | x[,] # or x[] 211 | 212 | 213 | 214 | } 215 | \author{ 216 | Zhengjia Wang 217 | } 218 | -------------------------------------------------------------------------------- /man/lazylm.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/model-lm.R 3 | \name{lazylm} 4 | \alias{lazylm} 5 | \title{Fitting linear models using \code{lazyarray}} 6 | \usage{ 7 | lazylm( 8 | formula, 9 | data, 10 | fitted = FALSE, 11 | weights = NULL, 12 | offset = NULL, 13 | contrasts = NULL, 14 | na.action = getOption("na.action"), 15 | qr.tol = 1e-07, 16 | ... 17 | ) 18 | } 19 | \arguments{ 20 | \item{formula}{an object of class \code{\link[stats]{formula}}. For variable 21 | names to be used, see 'Details'} 22 | 23 | \item{data}{a \code{lazyarray} object} 24 | 25 | \item{fitted}{whether to calculate fitted data and residuals. This may 26 | take time and memory if data is large} 27 | 28 | \item{weights, offset, contrasts, na.action}{see \code{\link[stats]{lm}}} 29 | 30 | \item{qr.tol}{the tolerance for detecting linear dependencies in the 31 | partitions of \code{data}; see \code{\link{qr}}} 32 | 33 | \item{...}{passed to \code{\link{chunk_map}}} 34 | } 35 | \value{ 36 | An object of class \code{c("lazylm", "lm")} or for multiple 37 | responses of class \code{c("lazylm", "mlm")}. 38 | } 39 | \description{ 40 | Fitting linear models using \code{lazyarray} 41 | } 42 | \details{ 43 | The array will be reshaped to a matrix first before fitting the 44 | linear models. A \eqn{100 x 20 x 5} array will be reshaped to a 45 | \eqn{2000 x 5} lazy matrix. 46 | The variables are the partitions of the array. If \code{dimnames} 47 | are set for the last margin index, then those will be used as variable 48 | names, otherwise \code{lazylm} automatically assign 49 | \eqn{"V1", "V2", "V3", ...} as each partition names. 50 | } 51 | \examples{ 52 | 53 | library(lazyarray) 54 | arr <- array(rnorm(72), c(6,3,4)) 55 | arr[1,1,1] <- NA # Allow NA to be treated 56 | offset = rnorm(18) # offset and weights are optional 57 | weights = runif(18) 58 | 59 | formula <- V1 ~ .-V2-1 + (V2 > 0) 60 | 61 | data <- as.lazyarray(arr, type = 'file') 62 | object <- lazylm(formula, data, weights = weights, offset = offset) 63 | 64 | 65 | # Compare to stats::lm 66 | dim(arr) <- c(18, 4) 67 | lm_data <- as.data.frame(arr) 68 | flm <- lm(formula, lm_data, weights = weights, offset = offset) 69 | 70 | cbind(coef(object), coef(flm)) 71 | cbind(resid(object), resid(flm)) 72 | cbind(fitted(object), fitted(flm)) 73 | summary(object) 74 | summary(flm) 75 | 76 | } 77 | -------------------------------------------------------------------------------- /man/partition_map.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/generics.R 3 | \name{partition_map} 4 | \alias{partition_map} 5 | \title{Apply function along the last dimension of an array and aggregate the results} 6 | \usage{ 7 | partition_map(x, map_fun, reduce, partitions, ...) 8 | } 9 | \arguments{ 10 | \item{x}{R array or \code{LazyArray}} 11 | 12 | \item{map_fun}{function that takes in a slice of array and an optional 13 | argument indicating current partition number} 14 | 15 | \item{reduce}{function that accept a list of results returned by 16 | \code{map_fun}, can be missing} 17 | 18 | \item{partitions}{integers of partitions, i.e. the slices of array to be 19 | applied to, can be missing. If missing, then applies to all partitions} 20 | 21 | \item{...}{internally used} 22 | } 23 | \value{ 24 | If \code{reduce} is missing, returns a list of results. Each result 25 | is returned by \code{map_fun}, and the total length equals to number of 26 | partitions mapped. If \code{reduce} is a function, that list of results will 27 | be passed to \code{reduce} and \code{partition_map} returns the results 28 | generated from \code{reduce}. 29 | } 30 | \description{ 31 | Apply function along the last dimension of an array and aggregate the results 32 | } 33 | \examples{ 34 | 35 | # -------------------------- Ordinary R array --------------------------- 36 | 37 | x <- array(1:24, c(2,3,4)) 38 | partition_map(x, function(slice, part){ 39 | sum(slice) 40 | }) 41 | 42 | # When reduce and partitions are missing, the following code is equivalent 43 | as.list(apply(x, 3, sum)) 44 | 45 | # When reduce is present 46 | partition_map(x, function(slice, part){ 47 | sum(slice) 48 | }, function(slice_sum){ 49 | max(unlist(slice_sum)) 50 | }) 51 | 52 | # equivalently, we could call 53 | slice_sum <- partition_map(x, function(slice, part){ 54 | sum(slice) 55 | }) 56 | max(unlist(slice_sum)) 57 | 58 | # When partition is specified 59 | # Partition 1, 2, and 4 exist but 5 is missing 60 | # when a partition is missing, the missing slice will be NA 61 | partition_map(x, function(slice, part){ 62 | sum(slice) 63 | }, partitions = c(1,2,4,5)) 64 | 65 | # -------------------------- LazyArray --------------------------- 66 | x <- lazyarray(tempfile(), storage_format = 'complex', dim = c(2,3,4)) 67 | x[] <- 1:24 + (24:1) * 1i 68 | 69 | partition_map(x, function(slice, part){ 70 | slice[1, ,] * slice[2, ,] 71 | }, reduce = function(mapped_prod){ 72 | mean(unlist(mapped_prod)) 73 | }) 74 | 75 | 76 | 77 | } 78 | -------------------------------------------------------------------------------- /man/partition_table.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/generics.R 3 | \name{partition_table} 4 | \alias{partition_table} 5 | \alias{partition_table.array} 6 | \alias{partition_table.AbstractLazyArray} 7 | \title{Generate partition summary statistics for array objects along the last 8 | dimension} 9 | \usage{ 10 | partition_table(x, na.rm = FALSE, ...) 11 | 12 | \method{partition_table}{array}(x, na.rm = FALSE, ...) 13 | 14 | \method{partition_table}{AbstractLazyArray}(x, na.rm = FALSE, ...) 15 | } 16 | \arguments{ 17 | \item{x}{an array or \code{LazyArray}} 18 | 19 | \item{na.rm}{whether to remove \code{NA} when calculating summary statistics} 20 | 21 | \item{...}{passed to other methods or ignored} 22 | } 23 | \value{ 24 | A data frame with the following possible columns: \code{Min}, 25 | \code{Max}, \code{Mean}, \code{Standard Deviation}, \code{NAs} (total number 26 | of \code{NA}), and \code{Length}. 27 | } 28 | \description{ 29 | Generate partition summary statistics for array objects along the last 30 | dimension 31 | } 32 | \examples{ 33 | 34 | # R array 35 | x <- array(1:27, c(3,3,3)) 36 | partition_table(x) 37 | 38 | # LazyArray 39 | x <- lazyarray(tempfile(), storage_format = 'double', dim = c(3,3,3)) 40 | x[] <- 1:27 41 | partition_table(x, quiet=TRUE) 42 | 43 | } 44 | -------------------------------------------------------------------------------- /man/typeof-AbstractLazyArray-method.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/zzz.R 3 | \name{typeof,AbstractLazyArray-method} 4 | \alias{typeof,AbstractLazyArray-method} 5 | \title{Type of \code{LazyArray}} 6 | \usage{ 7 | \S4method{typeof}{AbstractLazyArray}(x) 8 | } 9 | \arguments{ 10 | \item{x}{a \code{LazyArray} or an R object} 11 | } 12 | \value{ 13 | The type of data stored in the input 14 | } 15 | \description{ 16 | Type of \code{LazyArray} 17 | } 18 | -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | # Lazyarray - R package to store/read large array on hard drive 2 | 3 | This package is NOT active anymore. Please consider [filearray](https://CRAN.R-project.org/package=filearray), a long-term project. 4 | 5 | You might see `lazyarray` on CRAN, this is because I have some legacy projects that depend on this package. This package will be off CRAN anytime soon. (Sorry...) 6 | -------------------------------------------------------------------------------- /src/.gitignore: -------------------------------------------------------------------------------- 1 | *.o 2 | *.so 3 | *.dll 4 | -------------------------------------------------------------------------------- /src/Makevars: -------------------------------------------------------------------------------- 1 | CXX_STD = CXX11 2 | PKG_CPPFLAGS = -I../inst/include/ 3 | PKG_CXXFLAGS = $(SHLIB_OPENMP_CXXFLAGS) 4 | PKG_LIBS = $(SHLIB_OPENMP_CXXFLAGS) 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /src/classIndexSchedule.h: -------------------------------------------------------------------------------- 1 | #ifndef DIP_LAZYARRAY_SCHEDULE_H 2 | #define DIP_LAZYARRAY_SCHEDULE_H 3 | 4 | #include "common.h" 5 | 6 | class ScheduledIndex { 7 | 8 | public: 9 | 10 | ScheduledIndex() { 11 | block_indexed = false; 12 | } 13 | 14 | ScheduledIndex( 15 | const bool _block_indexed, 16 | const std::vector _dimension, 17 | const std::vector, bool>> _block_location, 18 | const std::vector& _partition_index, 19 | const std::vector _schedule_index, 20 | const std::vector _schedule_dimension, 21 | const std::vector _block_dimension, 22 | const std::vector _block_schedule, 23 | const int64_t _block_schedule_start, 24 | const int64_t _block_schedule_end 25 | ); 26 | 27 | ScheduledIndex(SEXP locations, const std::vector& dim, bool forceSchedule = false, int64_t hint = -1); 28 | 29 | ~ScheduledIndex(){ 30 | #ifdef LAZYARRAY_DEBUG 31 | print(wrap("A ScheduledIndex is destroyed")); 32 | #endif 33 | } 34 | 35 | bool block_indexed; // whether block_schedule can be trusted 36 | std::vector dimension; 37 | // partition level 38 | int64_t partition_counts; // the last dimension - n files to iterate through 39 | std::vector partition_index; // detailed indexes - always exists 40 | 41 | // schedule level 42 | int64_t schedule_counts_per_part; // for each partition, number of blocks to run 43 | std::vector schedule_index; // indices to schedule run blocks 44 | std::vector schedule_dimension; // [schedule dim, partition counts] 45 | 46 | // block level 47 | int64_t block_ndims; // length(block dim) 48 | std::vector block_dimension; // [block dim], full version 49 | std::vector block_prod_dim; // prod([1, block dim]), used to locate indices when block is too large to index 50 | std::vector block_schedule; // given a flattened block (full version), which indices to subset? 51 | int64_t block_schedule_start; 52 | int64_t block_schedule_end; // min, max of block_schedule 53 | 54 | int64_t block_length; // # elements in a block (full version) = prod(block_dimension) 55 | int64_t block_expected_length;// # elements in a block (subset version) = length(block_schedule) 56 | 57 | std::vector, bool>> block_location; // subset of locational indices of blocks 58 | 59 | 60 | Rcpp::List asList(); 61 | }; 62 | 63 | class ParsedIndex { 64 | 65 | public: 66 | ParsedIndex( 67 | const int& subset_mode, const std::vector& target_dimension, 68 | const std::vector& negative_subscript, 69 | const std::vector, bool>>& location_indices, 70 | ScheduledIndex* schedule 71 | ): 72 | subset_mode(subset_mode), 73 | target_dimension(target_dimension), 74 | negative_subscript(negative_subscript), 75 | location_indices(location_indices), 76 | schedule(schedule) { 77 | expected_length = std::accumulate(target_dimension.begin(), target_dimension.end(), INTEGER64_ONE, std::multiplies()); 78 | } 79 | 80 | ~ParsedIndex(){ 81 | if(this->schedule != nullptr){ 82 | delete schedule; 83 | this->schedule = nullptr; 84 | } 85 | #ifdef LAZYARRAY_DEBUG 86 | print(wrap("A ParsedIndex is destroyed")); 87 | #endif 88 | } 89 | 90 | ParsedIndex(const SEXP listOrEnv, const std::vector& dim, bool pos_subscript); 91 | 92 | 93 | int subset_mode; 94 | std::vector target_dimension; 95 | std::vector negative_subscript; 96 | std::vector, bool>> location_indices; 97 | 98 | int64_t expected_length; 99 | ScheduledIndex *schedule; 100 | 101 | Rcpp::List asList(); 102 | 103 | }; 104 | 105 | 106 | #endif // DIP_LAZYARRAY_SCHEDULE_H 107 | -------------------------------------------------------------------------------- /src/common.cpp: -------------------------------------------------------------------------------- 1 | #include "common.h" 2 | using namespace Rcpp; 3 | 4 | R_xlen_t setLazyBlockSize(R_xlen_t size){ 5 | if( size < 0 ){ 6 | BLOCKSIZE = 16384; 7 | } else if( size > 0 ){ 8 | BLOCKSIZE = size; 9 | } 10 | 11 | return BLOCKSIZE; 12 | } 13 | 14 | R_xlen_t getLazyBlockSize(){ 15 | if(BLOCKSIZE < 1){ 16 | BLOCKSIZE = 16384; 17 | } 18 | return BLOCKSIZE; 19 | } 20 | 21 | -------------------------------------------------------------------------------- /src/common.h: -------------------------------------------------------------------------------- 1 | #ifndef DIP_LAZYARRAY_COMMON_H 2 | #define DIP_LAZYARRAY_COMMON_H 3 | 4 | // Common header that's required by all (most) files 5 | 6 | //include 7 | //include 8 | //include 9 | //include 10 | //include 11 | //include 12 | // #include 13 | 14 | #include 15 | //include 16 | 17 | using namespace Rcpp; 18 | 19 | /* 20 | * Number of bytes fst uses to compress as a unit 21 | * We use it differently, basically 4x or 8x or 16x this number as our block size 22 | * to avoid repeating too many blocks 23 | */ 24 | #ifndef NA_INTEGER64 25 | //undef NA_INTEGER64 26 | #define NA_INTEGER64 LLONG_MIN 27 | #endif // NA_INTEGER64 28 | 29 | // Lazyarray subset_mode - No index 30 | #ifndef LASUBMOD_NOIDX 31 | #define LASUBMOD_NOIDX 2 32 | #endif 33 | 34 | #ifndef LASUBMOD_SINGLE 35 | #define LASUBMOD_SINGLE 1 36 | #endif 37 | 38 | #ifndef LASUBMOD_MULTI 39 | #define LASUBMOD_MULTI 0 40 | #endif 41 | 42 | #ifdef LAZYARRAY_DEBUG 43 | #undef LAZYARRAY_DEBUG 44 | #endif 45 | 46 | /* 47 | * For array with dimension [287 x 200 x 601 x 84] 48 | * BLOCKSIZE decides the size of block to read into from fst file 49 | * because fst internally stores data in blocks, it's recommended to read blocks with size > 16384 50 | * 51 | * by default, this array will be split into 3 parts 52 | * [287 x 200 x 601 x 84] => [287 x 200] x [601 x 1] x [84] 53 | * 54 | * 84 is # of partition /files 55 | * for each file, read in sub chunk of length 287 x 200 (> BLOCKSIZE) 56 | * total number of chunks to read is 601 per file 57 | * 58 | * loc2idx3 calculates indices within each sub-chunks so that it's easy to find then once data is loaded 59 | * 60 | * However, of sub-block is too large, for example [1e30 x 5] matrix, sub-block size is 1e30, loc2idx3 generates too many 61 | * indices but the indices come with cost of memory (this means super large index set). We wish to calculate 62 | * indices on the fly. The boundary is set by BLOCKLARGE. 63 | * 64 | * If # of indices > BLOCKLARGE, then don't pre-generate indices 65 | * 66 | */ 67 | 68 | // Used to partition to sub-blocks 69 | static R_xlen_t BLOCKSIZE = 16384; 70 | // If sub-block size is too large, don't calculate indices (memory inefficient) 71 | // ~ 250 MB index set 72 | static R_xlen_t BLOCKLARGE = 31250000; 73 | 74 | const static int64_t INTEGER64_ONE = 1; 75 | const static R_xlen_t INTEGER_XLEN_ONE = 1; 76 | 77 | // [[Rcpp::interfaces(r, cpp)]] 78 | // [[Rcpp::export]] 79 | R_xlen_t setLazyBlockSize(R_xlen_t size); 80 | 81 | // [[Rcpp::export]] 82 | R_xlen_t getLazyBlockSize(); 83 | 84 | 85 | #endif // DIP_LAZYARRAY_COMMON_H 86 | -------------------------------------------------------------------------------- /src/fstWrapper.cpp: -------------------------------------------------------------------------------- 1 | #include "fstWrapper.h" 2 | 3 | // [[Rcpp::plugins("cpp11")]] 4 | 5 | #include 6 | #include "common.h" 7 | #include "utils.h" 8 | 9 | using namespace Rcpp; 10 | 11 | 12 | SEXP fstMeta(String fileName){ 13 | return fstcore::fstmetadata(fileName); 14 | } 15 | 16 | /* 17 | * Slightly different than fscore::fstretrieve. It supports 18 | * 19 | */ 20 | SEXP fstRetrieve(String fileName, SEXP colSel, SEXP start, SEXP end){ 21 | #ifdef LAZYARRAY_DEBUG 22 | print(wrap("Reading from fst...")); 23 | #endif 24 | return fstcore::fstretrieve(fileName, colSel, start, end); 25 | } 26 | 27 | SEXP fstStore(String fileName, SEXP table, SEXP compression, SEXP uniformEncoding){ 28 | return fstcore::fststore(fileName, table, compression, uniformEncoding); 29 | } 30 | 31 | 32 | bool checkFstMeta(const String file, const int64_t expect_nrows, StringVector cnames){ 33 | List meta = fstcore::fstmetadata( file ); 34 | if( Rf_inherits(meta, "fst_error") || 35 | !meta.containsElementNamed("nrOfRows") || 36 | !meta.containsElementNamed("colNames")){ 37 | return(false); 38 | } 39 | int64_t nrows = as(meta["nrOfRows"]); 40 | if( expect_nrows >= 0 && nrows != expect_nrows ){ 41 | stop("Unable to read from file: partition size not matches with expected size."); 42 | } 43 | StringVector colNames = as(meta["colNames"]); 44 | 45 | if(colNames.size() < cnames.size()){ 46 | return(false); 47 | } 48 | 49 | R_xlen_t touched = 0; 50 | for(R_xlen_t ii = 0; ii < cnames.size(); ii++ ){ 51 | const String cn = cnames[ii]; 52 | for(StringVector::iterator ptr = colNames.begin(); ptr != colNames.end(); ptr++){ 53 | if((*ptr) == cn){ 54 | touched++; 55 | continue; 56 | } 57 | } 58 | } 59 | 60 | if(touched >= cnames.size()){ 61 | return(true); 62 | } 63 | 64 | return(false); 65 | } 66 | 67 | -------------------------------------------------------------------------------- /src/fstWrapper.h: -------------------------------------------------------------------------------- 1 | // [[Rcpp::plugins("cpp11")]] 2 | 3 | #ifndef LAZYARRAY_FST_H 4 | #define LAZYARRAY_FST_H 5 | 6 | #include "lazycommon.h" 7 | 8 | // [[Rcpp::export]] 9 | SEXP fstMeta(Rcpp::String fileName); 10 | 11 | // [[Rcpp::export]] 12 | SEXP fstRetrieve(Rcpp::String fileName, SEXP colSel, SEXP start, SEXP end); 13 | 14 | // [[Rcpp::export]] 15 | SEXP fstStore(Rcpp::String fileName, SEXP table, SEXP compression, SEXP uniformEncoding); 16 | 17 | /* 18 | * Validate fst header information 19 | * @param file fst file path 20 | * @param expect_nrows expected number of rows 21 | * @param cnames column names that must contain 22 | */ 23 | // [[Rcpp::plugins("cpp11")]] 24 | // [[Rcpp::export]] 25 | bool checkFstMeta(const Rcpp::String file, const int64_t expect_nrows, Rcpp::StringVector cnames); 26 | 27 | #endif // LAZYARRAY_FST_H 28 | -------------------------------------------------------------------------------- /src/indexConvert.h: -------------------------------------------------------------------------------- 1 | // [[Rcpp::plugins("cpp11")]] 2 | 3 | #ifndef LAZYARRAY_INDEX_H 4 | #define LAZYARRAY_INDEX_H 5 | 6 | #include "Rcpp.h" 7 | #include "classIndexSchedule.h" 8 | 9 | // [[Rcpp::interfaces(r, cpp)]] 10 | 11 | // Though efficient, but no longer used as it errors when indexing >= 2^31 length 12 | Rcpp::IntegerVector loc2idx(Rcpp::List& locations, Rcpp::IntegerVector& parent_dim); 13 | 14 | // Same behavior as loc2idx3, but uses NumericVector, which converts int64_t back to double 15 | // no need to do so 16 | Rcpp::NumericVector loc2idx2(Rcpp::List& locations, Rcpp::NumericVector& parent_dim); 17 | 18 | // [[Rcpp::export]] 19 | std::vector loc2idx3(SEXP locations, std::vector& parent_dim); 20 | 21 | 22 | // subsetIdx and subsetIdx2 should not be used directly as parseSlices combines them all 23 | SEXP subsetIdx(Rcpp::Environment expr_env, Rcpp::NumericVector dim, bool pos_subscript = false); 24 | 25 | // [[Rcpp::export]] 26 | Rcpp::List extractSlices(SEXP listOrEnv, const R_xlen_t& ndims); 27 | 28 | // parseSlices = subsetIdx or subsetIdx2 29 | // WARNING: Always use pos_subscript if you want to use subset or subsetAssign functions in lazyarray 30 | // pos_subscript=false subset is not implemented 31 | // [[Rcpp::export]] 32 | Rcpp::List parseSlices(SEXP listOrEnv, const std::vector& dim, bool pos_subscript = true); 33 | 34 | // parseAndScheduleBlocks = parseSlices + scheduleIndexing 35 | // [[Rcpp::export]] 36 | Rcpp::List parseAndScheduleBlocks2(SEXP sliceIdx, Rcpp::NumericVector dim, bool forceSchedule = false); 37 | 38 | ParsedIndex* parseAndScheduleBlocks(SEXP listOrEnv, const std::vector& dim, bool forceSchedule = false, int64_t hint = -1); 39 | 40 | // [[Rcpp::export]] 41 | SEXP reshapeOrDrop(SEXP x, SEXP reshape = R_NilValue, bool drop = false); 42 | 43 | #endif // LAZYARRAY_INDEX_H 44 | -------------------------------------------------------------------------------- /src/lazyarray-ext.cpp: -------------------------------------------------------------------------------- 1 | // [[Rcpp::plugins("cpp11")]] 2 | 3 | #include "utils.h" 4 | #include "reshape.h" 5 | #include "loader1.h" 6 | #include "loader2.h" 7 | #include "fstWrapper.h" 8 | 9 | #include "common.h" 10 | #include "indexConvert.h" 11 | 12 | using namespace Rcpp; 13 | 14 | /** 15 | * Write array to fst files 16 | * @param x array or vector 17 | * @param dim array dimension 18 | * @param fileName,compression,uniformEncoding passed to fstStore 19 | */ 20 | // [[Rcpp::export]] 21 | SEXP cpp_create_lazyarray(SEXP& x, IntegerVector& dim, SEXP fileName, 22 | SEXP compression, SEXP uniformEncoding){ 23 | 24 | // Obtain the array dimension, dimension length must >= 1 25 | R_xlen_t ndim = dim.size(); 26 | 27 | if( ndim < 1 ){ 28 | // shouldn't enter here 29 | Rcpp::stop("cpp_create_lazyarray needs at least one dim. It can be the length."); 30 | } 31 | 32 | /* 33 | * Reshape dimension to two dimension: [first_dim, last_dim] 34 | * Case 1: dim has length 1, e.g. dim = c(10), last_dim is always 1, and first_dim is 10 35 | * Case 2: dim = c(10,10,2), last_dim is always the last dim, i.e. 2, first_dim is prod of the rest, i.e. 100 36 | */ 37 | int64_t last_dim = 1; 38 | int64_t total_length = std::accumulate(dim.begin(), dim.end(), INTEGER64_ONE, std::multiplies()); 39 | if( ndim >= 2 ){ 40 | // case 2 41 | last_dim = *(dim.end() - 1); 42 | } 43 | int64_t first_dim = total_length / last_dim; 44 | 45 | if( total_length == 0 ){ 46 | // no need to write to disk? 47 | // FIXME 48 | } 49 | 50 | // Need to reshape x to a data.frame/list i.e. dim(x) = c(first_dim, last_dim) 51 | Rcpp::List table = arr2df(x, first_dim, last_dim); 52 | // return table; 53 | // Rcpp::print(table); 54 | fstStore(fileName, table, compression, uniformEncoding); 55 | return R_NilValue; 56 | } 57 | 58 | 59 | /** 60 | * @param fileName,colSel,start,end Parameter required to locate each partition for each file 61 | * @param custom_func Function to apply to each partition 62 | * @param reshape Dimension to reshape partition data to 63 | */ 64 | // [[Rcpp::export]] 65 | SEXP lazyMapReduceByPartition( 66 | Rcpp::String fileName, CharacterVector colSel, SEXP start, SEXP end = R_NilValue, 67 | Rcpp::Nullable custom_func = R_NilValue, 68 | Rcpp::Nullable reshape = R_NilValue){ 69 | 70 | SEXP tmp; 71 | tmp = fstRetrieve(fileName, wrap(colSel), start, end); 72 | 73 | tmp = getListElement(tmp, "resTable"); 74 | 75 | SEXP data; 76 | R_xlen_t s; 77 | 78 | // if colSel.size() == 1, non-complex data 79 | bool is_complex = false; 80 | if( colSel.size() == 1 ){ 81 | data = getListElement(tmp, colSel[0]); 82 | s = Rf_xlength(data); 83 | } else { 84 | is_complex = true; 85 | NumericVector re = as(getListElement(tmp, colSel[0])); 86 | NumericVector im = as(getListElement(tmp, colSel[1])); 87 | s = Rf_xlength(re); 88 | 89 | data = PROTECT(Rf_allocVector(CPLXSXP, s)); 90 | 91 | Rcomplex *ptr_data = COMPLEX(data); 92 | NumericVector::iterator ptr_re = re.begin(); 93 | NumericVector::iterator ptr_im = im.begin(); 94 | 95 | for(R_xlen_t ii = 0; ii < s; ii++ ){ 96 | (*ptr_data).r = *ptr_re; 97 | (*ptr_data).i = *ptr_im; 98 | ptr_data++; 99 | ptr_re++; 100 | ptr_im++; 101 | } 102 | 103 | } 104 | 105 | SEXP re; 106 | 107 | if(reshape != R_NilValue){ 108 | Rf_setAttrib(data, wrap("dim"), wrap(reshape)); 109 | } 110 | 111 | if( custom_func.isNotNull() ){ 112 | re = Rcpp::as(custom_func)( data ); 113 | } else { 114 | re = R_NilValue; 115 | } 116 | 117 | if( re != R_NilValue ){ 118 | // re.attr("chunk_length") = s; 119 | Rf_setAttrib(re, wrap("chunk_length"), wrap(s)); 120 | } 121 | 122 | if( is_complex ){ 123 | UNPROTECT(1); 124 | } 125 | 126 | return re; 127 | } 128 | 129 | 130 | 131 | 132 | /*** R 133 | # devtools::load_all() 134 | require(lazyarray) 135 | x <- array(seq_len(prod(c(3000,7,20,8))), c(3000,7,20,8)) 136 | x <- as.lazyarray(x) 137 | 138 | # unlink(x$get_partition_fpath(3)) 139 | # fst::write_fst(data.frame(V2 = 1:8), x$get_partition_fpath(3)) 140 | 141 | b <- function(i,...){ 142 | subsetIdx(environment(), dim(x), TRUE) 143 | } 144 | invisible(b(1:5,-c(1:2,NA),c(1,NA),)) 145 | 146 | files <- x$get_partition_fpath() 147 | a <- function(i,...){ 148 | subsetFST(files, environment(), dim(x), 0.1) 149 | } 150 | # e = a(c(3,1,7,NA,2,1, 27, 16, 15,14,NA, 27)) 151 | e = a(1:5,c(1:2,NA),,) 152 | range(e - x[][1:5,c(1:2,NA),,], na.rm = TRUE) 153 | 154 | a(c(1:20, NA)) 155 | 156 | # array(1:27, c(3,3,3))[c(3,1,7,NA,2,1)] 157 | 158 | 159 | # f <- tempfile() 160 | # fst::write_fst(data.frame(V1R = 1:10, V1I = 10:1), path = f) 161 | # tmp <- fst:::fstretrieve(normalizePath(f), c('V1R', 'V1I'), 1L, NULL) 162 | # tmp 163 | # 164 | # path <- normalizePath(f) 165 | # lazyMapReduceByPartition(path, c('V1R', 'V1I'), 1L, NULL, sum) 166 | # lazyMapReduceByPartition(path, c('V1R'), 1L, NULL, sum) 167 | # 168 | # x <- lazyarray(tempfile(), 'complex', c(2,3,4)) 169 | # x[] <- 1:24 + (24:1)*1i 170 | # 171 | # partition_map(x, function(slice, part){ 172 | # slice 173 | # }) 174 | 175 | 176 | # path = '~/Desktop/lazyarray_data' 177 | # dimension <- c(287, 200, 601, 84) 178 | # x <- lazyarray(path, storage_format = "double", dim = dimension) 179 | # part_loc <- list(1:287L, 1:200L, 1:601L, 1L) 180 | # a <- bench::mark({ 181 | # cpp_load_lazyarray(x$get_partition_fpath(1), part_loc, c(287L, 200L, 601L, 1L), 4, 0.1) 182 | # }, iterations = 1) 183 | # a$memory 184 | # 185 | # path <- tempfile() 186 | # a = data.frame(V1=c(1:10 + rnorm(10), rep(NA,2))) 187 | # fst::write_fst(a, path) 188 | # path <- normalizePath(path) 189 | # lazyMapReduceByPartition(path, 'V1', 1L, NULL, sum) 190 | # lazyMapReduceByPartition(path, 'V1', 1L, NULL, function(x){mean(x, na.rm = TRUE)}) 191 | # lazyMapReduceByPartition(path, 'V1', 1L, NULL, length) 192 | # lazyMapReduceByPartition(path, 'V1', 1L, NULL, function(x){NULL}) 193 | # lazyMapReduceByPartition(path, 'V1', 1L, NULL, function(x){dim(x)}, c(3L,4L)) 194 | 195 | # a = 1:3; b = 4:6+0.5 196 | # pryr::address(a) 197 | # c =join_vectors(a, b) 198 | # a 199 | # pryr::address(a) 200 | # 201 | # f <- normalizePath(tempfile(), mustWork = FALSE) 202 | # unlink(f) 203 | # dim <- c(10,20,50); 204 | # x <- rnorm(10000); dim(x) <- dim 205 | # x[sample(10000, 2000)] = NA 206 | # 207 | # lazyarray:::cpp_create_lazyarray(x, dim, f, 100L, TRUE); 208 | # 209 | # expect_true(file.exists(f), label = "cpp_create_lazyarray can write to file") 210 | # 211 | # 212 | # # Make sure we have invalid indices 213 | # idx_loc <- list( 214 | # as.integer(sample(12) - 1), 215 | # as.integer(sample(22)-1), 216 | # as.integer(sample(52)-1) 217 | # ) 218 | # target_dim = sapply(idx_loc, length) 219 | # cpp_load_lazyarray(f, idx_loc, dim, length(dim), 0.1) 220 | */ 221 | -------------------------------------------------------------------------------- /src/lazycommon.h: -------------------------------------------------------------------------------- 1 | #ifndef LAZYARRAY_H_ 2 | #define LAZYARRAY_H_ 3 | 4 | #include 5 | 6 | #endif // LAZYARRAY_H_ 7 | -------------------------------------------------------------------------------- /src/loader1.h: -------------------------------------------------------------------------------- 1 | // [[Rcpp::plugins("cpp11")]] 2 | 3 | #ifndef LAZYARRAY_LOADER1_H 4 | #define LAZYARRAY_LOADER1_H 5 | 6 | #include 7 | 8 | // [[Rcpp::export]] 9 | SEXP lazyLoadOld(Rcpp::StringVector& files, Rcpp::List& partition_locations, 10 | Rcpp::IntegerVector& partition_dim, R_xlen_t ndim, SEXP value_type); 11 | 12 | 13 | #endif // LAZYARRAY_LOADER1_H 14 | -------------------------------------------------------------------------------- /src/loader2.h: -------------------------------------------------------------------------------- 1 | // [[Rcpp::plugins("cpp11")]] 2 | 3 | #ifndef LAZYARRAY_LOADER2_H 4 | #define LAZYARRAY_LOADER2_H 5 | 6 | #include "Rcpp.h" 7 | #include "classIndexSchedule.h" 8 | 9 | // [[Rcpp::interfaces(r,cpp)]] 10 | 11 | SEXP subsetFSTBare(const std::string& rootPath, const ParsedIndex* parsed, const ScheduledIndex& schedule, 12 | const std::vector& dim, const SEXPTYPE& dtype); 13 | 14 | // [[Rcpp::export]] 15 | SEXP subsetFST(const std::string& rootPath, SEXP listOrEnv, const std::vector& dim, 16 | SEXPTYPE dtype, SEXP reshape = R_NilValue, bool drop = false); 17 | 18 | 19 | // [[Rcpp::export]] 20 | SEXP scheduleFST(SEXP listOrEnv, const std::vector& dim, bool forceSchedule = false, int64_t hint = -1); 21 | 22 | // [[Rcpp::export]] 23 | SEXP executeScheduleFST(const std::string& rootPath, SEXPTYPE dtype, SEXP reshape, bool drop, int64_t partition); 24 | 25 | // [[Rcpp::export]] 26 | SEXP scheduleExistsFST(); 27 | 28 | // [[Rcpp::export]] 29 | SEXP freeScheduleFST(); 30 | 31 | #endif // LAZYARRAY_LOADER2_H 32 | -------------------------------------------------------------------------------- /src/old/loader2ext.h: -------------------------------------------------------------------------------- 1 | // [[Rcpp::plugins("cpp11")]] 2 | 3 | #ifndef LAZYARRAY_LOADER2_MULTIPART_DOUBLE_H 4 | #define LAZYARRAY_LOADER2_MULTIPART_DOUBLE_H 5 | 6 | #include "Rcpp.h" 7 | 8 | // SEXP subsetFST_double(const std::string& rootPath, const Rcpp::NumericVector& dim, const Rcpp::List& subparsed); 9 | // 10 | // SEXP subsetFST_integer(const std::string& rootPath, const Rcpp::NumericVector& dim, const Rcpp::List& subparsed); 11 | // 12 | // SEXP subsetFST_character(const std::string& rootPath, const Rcpp::NumericVector& dim, const Rcpp::List& subparsed); 13 | // 14 | // SEXP subsetFST_complex(const std::string& rootPath, const Rcpp::NumericVector& dim, const Rcpp::List& subparsed); 15 | 16 | template 17 | SEXP subsetFSTtemplate(const std::string& rootPath, const std::vector& dim, const Rcpp::List& subparsed); 18 | 19 | #endif // LAZYARRAY_LOADER2_MULTIPART_DOUBLE_H 20 | -------------------------------------------------------------------------------- /src/openMPInterface.cpp: -------------------------------------------------------------------------------- 1 | #include "openMPInterface.h" 2 | 3 | #include 4 | 5 | #ifdef _OPENMP 6 | #include 7 | #endif 8 | 9 | 10 | int getLazyThread(bool max){ 11 | #ifdef _OPENMP 12 | if(detectFork){ 13 | return 1; 14 | } 15 | if( max ){ 16 | return omp_get_max_threads(); 17 | } 18 | int t = lazyThreads < 0 ? omp_get_max_threads() : std::min(lazyThreads, omp_get_max_threads()); 19 | return std::max(t, 1); 20 | #else 21 | return 1; 22 | #endif 23 | } 24 | 25 | int setLazyThread(int n, SEXP reset_after_fork){ 26 | #ifdef _OPENMP 27 | if(!detectFork){ 28 | lazyThreads = n; 29 | } 30 | if( reset_after_fork != R_NilValue ){ 31 | if( Rf_asLogical(reset_after_fork) ){ 32 | reset_forked = true; 33 | } else { 34 | reset_forked = false; 35 | } 36 | } 37 | 38 | return n; 39 | #else 40 | return 1; 41 | #endif 42 | } 43 | 44 | 45 | bool hasOpenMP(){ 46 | return LAZYARRAY_HAS_OPENMP; 47 | } 48 | 49 | void onForked(){ 50 | detectFork = true; 51 | } 52 | void onLeaveFork(){ 53 | if(!reset_forked){ 54 | lazyThreads = 1; 55 | } 56 | detectFork = false; 57 | } 58 | 59 | int detectForked(DllInfo *dll){ 60 | // To disable openmp if fork is detected 61 | #ifdef _OPENMP 62 | return pthread_atfork(&onForked, &onLeaveFork, NULL); 63 | #endif 64 | 65 | return 0; 66 | } 67 | -------------------------------------------------------------------------------- /src/openMPInterface.h: -------------------------------------------------------------------------------- 1 | #ifndef LAZYARRAY_OPENMP_H 2 | #define LAZYARRAY_OPENMP_H 3 | 4 | // [[Rcpp::interfaces(r, cpp)]] 5 | // [[Rcpp::plugins(openmp)]] 6 | 7 | #ifdef _OPENMP 8 | #include 9 | #define LAZYARRAY_HAS_OPENMP true 10 | #else 11 | #define omp_get_thread_num() 0 12 | #define omp_get_max_threads() 1 13 | #define LAZYARRAY_HAS_OPENMP false 14 | #endif 15 | 16 | #include 17 | 18 | static int lazyThreads = 0; 19 | 20 | // stores n threads when fork occurs 21 | static bool detectFork = false; 22 | static int reset_forked = true; 23 | 24 | 25 | // [[Rcpp::export]] 26 | int getLazyThread(bool max = false); 27 | 28 | // [[Rcpp::export]] 29 | int setLazyThread(int n, SEXP reset_after_fork = R_NilValue); 30 | 31 | // [[Rcpp::export]] 32 | bool hasOpenMP(); 33 | 34 | // [[Rcpp::init]] 35 | int detectForked(DllInfo *dll); 36 | 37 | #endif // OPEN_MP_HELPER_H 38 | -------------------------------------------------------------------------------- /src/playground.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include "utils.h" 3 | using namespace Rcpp; 4 | 5 | 6 | 7 | /*** R 8 | a = 100000 9 | b = 1024 10 | bench::mark( 11 | mod2(a, b), 12 | mod3(a, b), 13 | mod4(a, b) 14 | ) 15 | 16 | */ 17 | -------------------------------------------------------------------------------- /src/reshape.cpp: -------------------------------------------------------------------------------- 1 | #include "reshape.h" 2 | 3 | #include "common.h" 4 | #include "indexConvert.h" 5 | 6 | using namespace Rcpp; 7 | 8 | template 9 | List cpp_array_to_list_template(T x, int64_t nrows, int64_t ncols){ 10 | // length of x has been checked so assume length(x) = nrows * ncols 11 | List re = List::create(); 12 | String colname; 13 | I ptr1_x = x.begin(); 14 | I ptr2_x = x.begin(); 15 | 16 | 17 | for(int64_t ii = 0; ii < ncols; ii++ ){ 18 | ptr2_x += nrows; 19 | colname = "V" + std::to_string(ii + 1); 20 | T slice = T(ptr1_x, ptr2_x); 21 | re.push_back(slice, colname); 22 | ptr1_x = ptr2_x; 23 | } 24 | 25 | 26 | return re; 27 | } 28 | 29 | 30 | List cpp_array_to_list_complex(ComplexVector x, int64_t nrows, int64_t ncols){ 31 | 32 | 33 | List re = List::create(); 34 | String colname; 35 | ComplexVector::iterator ptr1_x = x.begin(); 36 | ComplexVector::iterator ptr2_x = x.begin(); 37 | 38 | for(int64_t ii = 0; ii < ncols; ii++ ){ 39 | ptr2_x += nrows; 40 | const ComplexVector slice = ComplexVector(ptr1_x, ptr2_x); 41 | 42 | colname = "V" + std::to_string(ii + 1) + "R"; 43 | re.push_back(Armor(Rcpp::Re(slice)), colname); 44 | 45 | colname = "V" + std::to_string(ii + 1) + "I"; 46 | re.push_back(Armor(Rcpp::Im(slice)), colname); 47 | 48 | ptr1_x = ptr2_x; 49 | } 50 | 51 | return re; 52 | } 53 | 54 | 55 | /** 56 | * Conver vector x to data.frame with dimension c(first_dim, last_dim) 57 | * 58 | * 2020-09-02: 59 | * 1. renamed from cpp_array_to_list to arr2df 60 | * 2. changed argument, explicitly add nrows and ncols to be memory efficient 61 | * 3. Added length check 62 | */ 63 | Rcpp::List arr2df(SEXP x, int64_t nrows, int64_t ncols){ 64 | // User explicitly tells which storage type of x should be 65 | // 9 CHARSXP internal character strings 66 | // 10 LGLSXP logical vectors 67 | // 13 INTSXP integer vectors 68 | // 14 REALSXP numeric vectors 69 | // 15 CPLXSXP complex vectors 70 | // 16 STRSXP character vectors 71 | // 24 RAWSXP raw vector 72 | // 73 | 74 | 75 | // check length of x 76 | if( nrows * ncols - Rf_xlength(x) != 0 ){ 77 | stop("Cannot reshape array to data.frame, dimension not match"); 78 | } 79 | 80 | Rcpp::List re; 81 | switch (TYPEOF(x)) { 82 | case STRSXP: 83 | case CHARSXP: 84 | re = cpp_array_to_list_template(x, nrows, ncols); 85 | break; 86 | case LGLSXP: 87 | re = cpp_array_to_list_template(x, nrows, ncols); 88 | break; 89 | case INTSXP: 90 | re = cpp_array_to_list_template(x, nrows, ncols); 91 | break; 92 | case REALSXP: 93 | re = cpp_array_to_list_template(x, nrows, ncols); 94 | break; 95 | case CPLXSXP: 96 | re = cpp_array_to_list_complex(x, nrows, ncols); 97 | break; 98 | default: 99 | Rcpp::stop("Unsupported data type, only logical, numeric, complex, character types are supported."); 100 | } 101 | return re; 102 | } 103 | 104 | -------------------------------------------------------------------------------- /src/reshape.h: -------------------------------------------------------------------------------- 1 | // [[Rcpp::plugins("cpp11")]] 2 | 3 | #ifndef LAZYARRAY_RESHAPE_H 4 | #define LAZYARRAY_RESHAPE_H 5 | 6 | #include "Rcpp.h" 7 | 8 | Rcpp::List arr2df(SEXP x, int64_t nrows, int64_t ncols); 9 | 10 | #endif // LAZYARRAY_RESHAPE_H 11 | -------------------------------------------------------------------------------- /src/saver2.h: -------------------------------------------------------------------------------- 1 | // [[Rcpp::plugins("cpp11")]] 2 | 3 | #ifndef LAZYARRAY_SAVER2_H 4 | #define LAZYARRAY_SAVER2_H 5 | 6 | #include "Rcpp.h" 7 | #include "utils.h" 8 | 9 | // [[Rcpp::interfaces(r, cpp)]] 10 | 11 | template 12 | SEXP writeFstPartition(const Rcpp::Vector& values, const std::string& file, 13 | const std::vector& dim, const Rcpp::List& subparsed, 14 | int compression, bool uniformEncoding); 15 | 16 | // [[Rcpp::export]] 17 | SEXP subsetAssignFST(const SEXP values, const std::string& file, SEXP listOrEnv, 18 | const std::vector& dim, const SEXPTYPE& dtype, 19 | int compression = 50, bool uniformEncoding = true); 20 | 21 | #endif // LAZYARRAY_SAVER2_H 22 | -------------------------------------------------------------------------------- /src/saver2ext.cpp: -------------------------------------------------------------------------------- 1 | #include "saver2ext.h" 2 | 3 | #include "common.h" 4 | #include "utils.h" 5 | #include "fstWrapper.h" 6 | #include "indexConvert.h" 7 | using namespace Rcpp; 8 | 9 | // returns: -1 all partitions affected, otherwise the indices of partitions 10 | 11 | 12 | /*** R 13 | x <- lazyarray:::as.lazyarray(array(rep(1,27), rep(3,3))) 14 | unlink(x$get_partition_fpath(3)) 15 | res <- lazyarray:::parseAndScheduleBlocks(list(lazyarray:::get_missing_value(),1,1:3), dim(x)) 16 | writeFstPartition_double(1.1, x$get_partition_fpath(), dim(x), res, 50L, TRUE) 17 | x[] 18 | x[,1,] 19 | */ 20 | -------------------------------------------------------------------------------- /src/saver2ext.h: -------------------------------------------------------------------------------- 1 | // [[Rcpp::plugins("cpp11")]] 2 | 3 | #ifndef LAZYARRAY_SAVER2EXT_H 4 | #define LAZYARRAY_SAVER2EXT_H 5 | 6 | #include "Rcpp.h" 7 | 8 | 9 | 10 | #endif // LAZYARRAY_SAVER2EXT_H 11 | -------------------------------------------------------------------------------- /src/utils.h: -------------------------------------------------------------------------------- 1 | 2 | #ifndef DIP_LAZYARRAY_UTILS_H 3 | #define DIP_LAZYARRAY_UTILS_H 4 | 5 | #include "Rcpp.h" 6 | 7 | // [[Rcpp::interfaces(r, cpp)]] 8 | 9 | template 10 | bool contains(T vec, SEXP el); 11 | 12 | SEXP getListElement(SEXP list, const char *str); 13 | SEXP getListElement2(SEXP list, const char *str, const SEXP ifNull); 14 | 15 | // [[Rcpp::export]] 16 | SEXP dropDimension(SEXP x); 17 | 18 | // [[Rcpp::export]] 19 | int64_t prod2(SEXP x, bool na_rm = false); 20 | 21 | // [[Rcpp::export]] 22 | SEXP parseDots(Rcpp::Environment& env, bool eval); 23 | 24 | // [[Rcpp::export]] 25 | bool stopIfNot(const bool isValid, const std::string& message, bool stopIfError = true); 26 | 27 | // [[Rcpp::export]] 28 | SEXPTYPE getSexpType(SEXP x); 29 | 30 | // [[Rcpp::export]] 31 | SEXP tik(); 32 | 33 | // [[Rcpp::export]] 34 | SEXP tok(std::string msg, bool stop = false); 35 | 36 | std::string as_dirpath(std::string x); 37 | 38 | SEXP captureException( const std::exception& e ); 39 | SEXP makeException( std::string msg ); 40 | 41 | // [[Rcpp::export]] 42 | SEXP subsetAssignVector(SEXP x, int64_t start, SEXP value); 43 | 44 | template 45 | inline std::vector seq_len3(int64_t n){ 46 | std::vector re = std::vector(n); 47 | T v = 1; 48 | for(auto it = re.begin(); it != re.end(); it++){ 49 | *it = v++; 50 | } 51 | return re; 52 | } 53 | 54 | void setReIm(Rcpp::ComplexVector x, Rcpp::NumericVector v, bool is_real); 55 | 56 | inline void r_gc(){ 57 | Rcpp::Environment env = Rcpp::Environment::base_env(); 58 | Rcpp::Function gc = env["gc"]; 59 | gc(); 60 | } 61 | 62 | #endif // DIP_LAZYARRAY_UTILS_H 63 | -------------------------------------------------------------------------------- /tests/testthat.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | library(lazyarray) 3 | 4 | test_check("lazyarray") 5 | -------------------------------------------------------------------------------- /tests/testthat/test-cpp_io2.R: -------------------------------------------------------------------------------- 1 | # devtools::load_all() 2 | 3 | require(testthat) 4 | 5 | context("Loader2 no sub-blocks") 6 | 7 | dim <- c(10,20,50) 8 | 9 | f <- normalizePath(tempfile(), mustWork = FALSE) 10 | on.exit({ 11 | setLazyBlockSize(-1) 12 | unlink(f) 13 | }) 14 | 15 | x = array(1:prod(dim), dim) 16 | a = as.lazyarray(x, path = f, type = 'fstarray') 17 | loader_f <- function(..., samp, reshape, drop){ 18 | lazyarray:::subsetFST(a$storage_path, environment(), dim, getSexpType(samp), reshape, drop) 19 | } 20 | 21 | lazy_test_unit <- function(samp_data, x_alt){ 22 | loader_double = function(..., reshape = NULL, drop = FALSE){ loader_f(samp = samp_data, ..., reshape = reshape, drop = drop) } 23 | if(missing(x_alt)){ 24 | x <- x; storage.mode(x) <- storage.mode(samp_data) 25 | } else { 26 | x <- x_alt; storage.mode(x) <- storage.mode(samp_data) 27 | } 28 | 29 | # 1. a() 30 | re <- loader_double() 31 | expect_equal(storage.mode(re), storage.mode(samp_data)) 32 | expect_equivalent(re, x) 33 | expect_equivalent(dim(re), dim(x)) 34 | 35 | # 2. a(i) 36 | idx <- sample(length(x), size = 200, replace = TRUE) 37 | idx[sample(200, 20)] = NA 38 | re <- loader_double(idx) 39 | cp <- x[idx] 40 | expect_equivalent(re, cp) 41 | 42 | # 2. a(-i) 43 | idx <- -sample(length(x), size = 200, replace = TRUE) 44 | idx[sample(200, 20)] = NA 45 | re <- loader_double(idx) 46 | cp <- x[idx[!is.na(idx)]] 47 | expect_equivalent(re, cp) 48 | 49 | # 2.1 large indices 50 | re <- loader_double(10000000) 51 | cp <- x[10000000] 52 | expect_equivalent(re, cp) 53 | expect_true(is.na(re)) 54 | re <- loader_double(-10000000) 55 | cp <- x[-10000000] 56 | expect_equal(re, cp) 57 | expect_equal(dim(re), NULL) 58 | 59 | 60 | 61 | # 3. a(-i:i) 62 | expect_error(loader_double(-1:1)) 63 | 64 | # 4. a(i,j,k) 65 | idx <- lapply(dim, sample, replace = TRUE, size = 6) 66 | re <- do.call(loader_double, idx) 67 | cp <- eval(as.call(c(list(quote(`[`), quote(x)), idx))) 68 | expect_equivalent(re, cp) 69 | expect_equivalent(dim(re), dim(cp)) 70 | 71 | # 5. negative subscripts 72 | ii <- sample(2)[[1]] + 1 73 | idx[[ii]] <- -idx[[ii]] 74 | re <- do.call(loader_double, idx) 75 | cp <- eval(as.call(c(list(quote(`[`), quote(x)), idx))) 76 | expect_equivalent(re, cp) 77 | expect_equivalent(dim(re), dim(cp)) 78 | ii <- 1 79 | idx[[ii]] <- -idx[[ii]] 80 | re <- do.call(loader_double, idx) 81 | cp <- eval(as.call(c(list(quote(`[`), quote(x)), idx))) 82 | expect_equivalent(re, cp) 83 | expect_equivalent(dim(re), dim(cp)) 84 | 85 | # 6. With missing 86 | re <- loader_double(,c(NA,1:0),c(2,NA,1)) 87 | cp <- x[,c(NA,1:0),c(2,NA,1),drop = FALSE] 88 | expect_equivalent(re, cp) 89 | expect_equivalent(dim(re), dim(cp)) 90 | 91 | re <- loader_double(c(NA,1:0),c(2,NA,1),) 92 | cp <- x[c(NA,1:0),c(2,NA,1),,drop = FALSE] 93 | expect_equivalent(re, cp) 94 | expect_equivalent(dim(re), dim(cp)) 95 | 96 | re <- loader_double(c(NA,1:0),,c(2,NA,1)) 97 | cp <- x[c(NA,1:0),,c(2,NA,1),drop = FALSE] 98 | expect_equivalent(re, cp) 99 | expect_equivalent(dim(re), dim(cp)) 100 | 101 | re <- loader_double(,,) 102 | cp <- x[,,,drop = FALSE] 103 | expect_equivalent(re, cp) 104 | expect_equivalent(dim(re), dim(cp)) 105 | 106 | # 7. drop 107 | re <- loader_double(,c(NA,1:0),c(2,NA,1), drop = TRUE) 108 | cp <- x[,c(NA,1:0),c(2,NA,1),drop = TRUE] 109 | expect_equivalent(re, cp) 110 | expect_equivalent(dim(re), dim(cp)) 111 | re <- loader_double(,c(NA,1:0),c(3), drop = TRUE) 112 | cp <- x[,c(NA,1:0),c(3),drop = TRUE] 113 | expect_equivalent(re, cp) 114 | expect_equivalent(dim(re), dim(cp)) 115 | 116 | # 8. reshape 117 | re <- loader_double(,c(NA,1:0),c(2,NA,1), reshape = c(20, 3)) 118 | cp <- x[,c(NA,1:0),c(2,NA,1),drop = TRUE] 119 | expect_equivalent(as.vector(re), as.vector(cp)) 120 | expect_equivalent(dim(re), c(20, 3)) 121 | re <- loader_double(,c(NA,1:0),c(2,NA,1), reshape = c(60)) 122 | expect_equivalent(dim(re), NULL) 123 | 124 | # 9. Negative with missing 125 | re <- loader_double(,-c(NA,1:0),c(2,NA,1)) 126 | cp <- x[,-c(1:0),c(2,NA,1),drop = FALSE] 127 | expect_equivalent(re, cp) 128 | expect_equivalent(dim(re), dim(cp)) 129 | 130 | re <- loader_double(-c(NA,1:0),c(2,NA,1),) 131 | cp <- x[-c(1:0),c(2,NA,1),,drop = FALSE] 132 | expect_equivalent(re, cp) 133 | expect_equivalent(dim(re), dim(cp)) 134 | 135 | re <- loader_double(c(NA,1:0),,-c(2,NA,1)) 136 | cp <- x[c(NA,1:0),,-c(2,1),drop = FALSE] 137 | expect_equivalent(re, cp) 138 | expect_equivalent(dim(re), dim(cp)) 139 | 140 | re <- loader_double(,,-1000) 141 | cp <- x[,,-1000,drop = FALSE] 142 | expect_equivalent(re, cp) 143 | expect_equivalent(dim(re), dim(cp)) 144 | 145 | re <- loader_double(,-1000,1) 146 | cp <- x[,-1000,1,drop = FALSE] 147 | expect_equivalent(re, cp) 148 | expect_equivalent(dim(re), dim(cp)) 149 | 150 | re <- loader_double(-1000,,) 151 | cp <- x[-1000,,,drop = FALSE] 152 | expect_equivalent(re, cp) 153 | expect_equivalent(dim(re), dim(cp)) 154 | 155 | 156 | # Wrong usages 157 | expect_error(loader_double(100,,)) 158 | expect_error(loader_double(1,1,1,1)) 159 | expect_error(loader_double(1,1)) 160 | expect_error(loader_double(,1,1,)) 161 | expect_error(loader_double(,)) 162 | expect_error(loader_double(,1000,)) 163 | expect_error(loader_double(,,1000)) 164 | expect_error(loader_double(1000,,1000)) 165 | expect_error(loader_double(1000,1000,)) 166 | expect_error(loader_double(,1000,1000)) 167 | 168 | } 169 | 170 | test_that("Loader2 no sub-blocks", { 171 | lazy_test_unit(0.1) 172 | lazy_test_unit(1L) 173 | # lazy_test_unit("") 174 | }) 175 | 176 | 177 | context("Loader2 sub-blocks") 178 | 179 | test_that("Loader2 sub-blocks", { 180 | setLazyBlockSize(1) 181 | lazy_test_unit(0.1) 182 | lazy_test_unit(1L) 183 | # lazy_test_unit("") 184 | 185 | setLazyBlockSize(11) 186 | lazy_test_unit(0.1) 187 | lazy_test_unit(1L) 188 | # lazy_test_unit("") 189 | 190 | setLazyBlockSize(201) 191 | lazy_test_unit(0.1) 192 | lazy_test_unit(1L) 193 | # lazy_test_unit("") 194 | 195 | setLazyBlockSize(-1) 196 | }) 197 | 198 | context("Loader2 with NAs") 199 | 200 | test_that("Loader2 with NAs", { 201 | unlink(a$get_partition_fpath(2)) 202 | setLazyBlockSize(1) 203 | x[,,2] <- NA 204 | expect_equal(x[], a[]) 205 | lazy_test_unit(0.1, x) 206 | lazy_test_unit(1L, x) 207 | # lazy_test_unit("", x) 208 | 209 | setLazyBlockSize(-1) 210 | x[,,2] <- NA 211 | expect_equal(x[], a[]) 212 | lazy_test_unit(0.1, x) 213 | lazy_test_unit(1L, x) 214 | # lazy_test_unit("", x) 215 | }) 216 | 217 | 218 | context("Loader2 with complex data") 219 | 220 | x = array(rnorm(prod(dim)), dim) + 1i * array(rnorm(prod(dim)), dim) 221 | unlink(f, recursive = TRUE) 222 | a = as.lazyarray(x, path = f, type = 'fstarray') 223 | 224 | test_that("Loader2 complex", { 225 | setLazyBlockSize(-1) 226 | lazy_test_unit(x[[1]]) 227 | 228 | setLazyBlockSize(1) 229 | lazy_test_unit(x[[1]]) 230 | 231 | setLazyBlockSize(11) 232 | lazy_test_unit(x[[1]]) 233 | 234 | setLazyBlockSize(201) 235 | lazy_test_unit(x[[1]]) 236 | 237 | setLazyBlockSize(1) 238 | unlink(a$get_partition_fpath(2)) 239 | x[,,2] <- NA 240 | expect_equal(x[], a[]) 241 | lazy_test_unit(x[[1]], x) 242 | 243 | setLazyBlockSize(-1) 244 | unlink(a$get_partition_fpath(2)) 245 | x[,,2] <- NA 246 | expect_equal(x[], a[]) 247 | lazy_test_unit(x[[1]], x) 248 | }) 249 | 250 | context("Loader2 with matrix") 251 | test_that("Loader2 with matrix", { 252 | x <- matrix(1:16,4) 253 | a <- as.lazyarray(x, type = 'fstarray') 254 | loader_f <- function(...){ 255 | lazyarray:::subsetFST(a$storage_path, environment(), dim(x), getSexpType(1L), NULL, FALSE) 256 | } 257 | 258 | expect_equivalent(loader_f(), x) 259 | expect_equivalent(loader_f(,1), x[,1,drop=TRUE]) 260 | expect_equivalent(loader_f(1,), x[1,,drop=TRUE]) 261 | 262 | }) 263 | -------------------------------------------------------------------------------- /tests/testthat/test-cpp_loc2idx.R: -------------------------------------------------------------------------------- 1 | test_that("Test loc2idx3 - validate", { 2 | # Case 1: small dset with NAs 3 | x = array(1:8,c(2,2,2)) 4 | dim = dim(x) 5 | locs = list(0:1L,1L, 3:1) 6 | target_dim = sapply(locs, length) 7 | tmp = loc2idx3(locs, dim);# tmp 8 | tmp[tmp < -9e18] <- NA 9 | # validate in R 10 | mfactor <- c(1, cumprod(dim))[seq_along(dim)] 11 | scaled_loc <- lapply(seq_along(locs), function(ii){ 12 | x <- as.integer(locs[[ii]]) 13 | d <- dim[[ii]] 14 | x[x < 1 | x > d] <- NA 15 | (x - 1) * mfactor[[ii]] 16 | }) 17 | t1 <- Reduce(function(a, b){ outer(a, b, '+') }, scaled_loc) + 1 18 | expect_equal(sum(is.na(t1)) - sum(is.na(tmp)), 0) 19 | expect_equal(sum(xor(is.na(t1), is.na(tmp))), 0) 20 | expect_equal(range(t1 - tmp, na.rm = TRUE), c(0,0)) 21 | 22 | # Case 2: small dset with 0 length 23 | x = array(1:8,c(2,2,2)) 24 | dim = dim(x) 25 | locs = list(0:1L,integer(0), 3:1) 26 | target_dim = sapply(locs, length) 27 | tmp = loc2idx3(locs, dim);# tmp 28 | tmp[tmp < -9e18] <- NA 29 | # validate in R 30 | mfactor <- c(1, cumprod(dim))[seq_along(dim)] 31 | scaled_loc <- lapply(seq_along(locs), function(ii){ 32 | x <- as.integer(locs[[ii]]) 33 | d <- dim[[ii]] 34 | x[x < 1 | x > d] <- NA 35 | (x - 1) * mfactor[[ii]] 36 | }) 37 | t1 <- Reduce(function(a, b){ outer(a, b, '+') }, scaled_loc) + 1 38 | expect_equal(sum(is.na(t1)) - sum(is.na(tmp)), 0) 39 | expect_equal(sum(xor(is.na(t1), is.na(tmp))), 0) 40 | expect_length(t1, 0) 41 | expect_length(tmp, 0) 42 | 43 | # case 3: random set 44 | x = (1:5000000) 45 | # arr2df(x, c(0L, 25L, 50L), 9L) 46 | dim = c(50000L, 10L, 10L) 47 | locs = list( 48 | as.integer(sample(100000L, 300) - 2000), 49 | as.integer(sample(12)-1), 50 | as.integer(sample(12)-1) 51 | ) 52 | target_dim = sapply(locs, length) 53 | tmp = loc2idx3(locs, dim);# tmp 54 | tmp[tmp < -9e18] <- NA 55 | dim(tmp) = target_dim 56 | 57 | # validate in R 58 | mfactor <- c(1, cumprod(dim))[seq_along(dim)] 59 | scaled_loc <- lapply(seq_along(locs), function(ii){ 60 | x <- as.integer(locs[[ii]]) 61 | d <- dim[[ii]] 62 | x[x < 1 | x > d] <- NA 63 | (x - 1) * mfactor[[ii]] 64 | }) 65 | t1 <- Reduce(function(a, b){ outer(a, b, '+') }, scaled_loc) + 1 66 | expect_equal(sum(is.na(t1)) - sum(is.na(tmp)), 0) 67 | expect_equal(sum(xor(is.na(t1), is.na(tmp))), 0) 68 | expect_equal(range(t1 - tmp, na.rm = TRUE), c(0,0)) 69 | 70 | dim(x) = c(50000L, 10L, 10L) 71 | a = locs[[1]]; a[(a<1) | (a >50000)] = NA 72 | b = locs[[2]]; b[b<1 | b > 10] = NA 73 | c = locs[[3]]; c[c<1 | c > 10] = NA 74 | y2 <- x[a,b,c] 75 | expect_equal(range(y2 - tmp, na.rm = TRUE), c(0,0)) 76 | 77 | y3 = tmp 78 | expect_equal(sum(abs(is.na(y2) - is.na(y3))), 0) 79 | expect_length(which(abs(is.na(y2) - is.na(y3)) > 0, arr.ind = TRUE), 0) 80 | expect_equal(range(y2 - y3, na.rm = TRUE), c(0,0)) 81 | 82 | }) 83 | 84 | test_that("Test loc2idx3 - validate", { 85 | # Case 1: small dset with NAs 86 | x = array(1:8,c(2,2,2)) 87 | dim = dim(x) 88 | locs = list(0:1L,1L, 3:1) 89 | target_dim = sapply(locs, length) 90 | tmp = loc2idx3(locs, dim);# tmp 91 | tmp[tmp < -9e18] <- NA 92 | # validate in R 93 | mfactor <- c(1, cumprod(dim))[seq_along(dim)] 94 | scaled_loc <- lapply(seq_along(locs), function(ii){ 95 | x <- as.integer(locs[[ii]]) 96 | d <- dim[[ii]] 97 | x[x < 1 | x > d] <- NA 98 | (x - 1) * mfactor[[ii]] 99 | }) 100 | t1 <- Reduce(function(a, b){ outer(a, b, '+') }, scaled_loc) + 1 101 | expect_equal(sum(is.na(t1)) - sum(is.na(tmp)), 0) 102 | expect_equal(sum(xor(is.na(t1), is.na(tmp))), 0) 103 | expect_equal(range(t1 - tmp, na.rm = TRUE), c(0,0)) 104 | 105 | # Case 2: small dset with 0 length 106 | x = array(1:8,c(2,2,2)) 107 | dim = dim(x) 108 | locs = list(0:1L,integer(0), 3:1) 109 | target_dim = sapply(locs, length) 110 | tmp = loc2idx3(locs, dim);# tmp 111 | tmp[tmp < -9e18] <- NA 112 | # validate in R 113 | mfactor <- c(1, cumprod(dim))[seq_along(dim)] 114 | scaled_loc <- lapply(seq_along(locs), function(ii){ 115 | x <- as.integer(locs[[ii]]) 116 | d <- dim[[ii]] 117 | x[x < 1 | x > d] <- NA 118 | (x - 1) * mfactor[[ii]] 119 | }) 120 | t1 <- Reduce(function(a, b){ outer(a, b, '+') }, scaled_loc) + 1 121 | expect_equal(sum(is.na(t1)) - sum(is.na(tmp)), 0) 122 | expect_equal(sum(xor(is.na(t1), is.na(tmp))), 0) 123 | expect_length(t1, 0) 124 | expect_length(tmp, 0) 125 | 126 | # case 3: random set 127 | x = (1:5000000) 128 | # arr2df(x, c(0L, 25L, 50L), 9L) 129 | dim = c(50000L, 10L, 10L) 130 | locs = list( 131 | as.integer(sample(100000L, 300) - 2000), 132 | as.integer(sample(12)-1), 133 | as.integer(sample(12)-1) 134 | ) 135 | target_dim = sapply(locs, length) 136 | tmp = loc2idx3(locs, dim);# tmp 137 | tmp[tmp < -9e18] <- NA 138 | dim(tmp) = target_dim 139 | 140 | # validate in R 141 | mfactor <- c(1, cumprod(dim))[seq_along(dim)] 142 | scaled_loc <- lapply(seq_along(locs), function(ii){ 143 | x <- as.integer(locs[[ii]]) 144 | d <- dim[[ii]] 145 | x[x < 1 | x > d] <- NA 146 | (x - 1) * mfactor[[ii]] 147 | }) 148 | t1 <- Reduce(function(a, b){ outer(a, b, '+') }, scaled_loc) + 1 149 | expect_equal(sum(is.na(t1)) - sum(is.na(tmp)), 0) 150 | expect_equal(sum(xor(is.na(t1), is.na(tmp))), 0) 151 | expect_equal(range(t1 - tmp, na.rm = TRUE), c(0,0)) 152 | 153 | dim(x) = c(50000L, 10L, 10L) 154 | a = locs[[1]]; a[(a<1) | (a >50000)] = NA 155 | b = locs[[2]]; b[b<1 | b > 10] = NA 156 | c = locs[[3]]; c[c<1 | c > 10] = NA 157 | y2 <- x[a,b,c] 158 | expect_equal(range(y2 - tmp, na.rm = TRUE), c(0,0)) 159 | 160 | y3 = tmp 161 | expect_equal(sum(abs(is.na(y2) - is.na(y3))), 0) 162 | expect_length(which(abs(is.na(y2) - is.na(y3)) > 0, arr.ind = TRUE), 0) 163 | expect_equal(range(y2 - y3, na.rm = TRUE), c(0,0)) 164 | 165 | }) 166 | -------------------------------------------------------------------------------- /tests/testthat/test-cpp_parseAndScheduleBlocks.R: -------------------------------------------------------------------------------- 1 | require(testthat) 2 | 3 | # devtools::load_all() 4 | dim <- c(5,7,8,10) 5 | slice <- list(c(1,1,2),1,2:3,4:5) 6 | 7 | check_schedule <- function(slice, dim){ 8 | slice_copy <- lapply(slice, I); force(slice_copy) 9 | res <- parseAndScheduleBlocks2(slice_copy, dim) 10 | re <- res$schedule 11 | block_lb <- getLazyBlockSize() 12 | block_ub <- 31250000 13 | ndims <- length(dim) 14 | 15 | valid_slices <- lapply(seq_along(slice), function(ii){ 16 | x <- slice[[ii]] 17 | if(isTRUE(x == get_missing_value())){ 18 | x <- seq_len(dim[[ii]]) 19 | } else { 20 | x <- x[is.na(x) | x > 0] 21 | } 22 | x 23 | }) 24 | 25 | get_missing_value() 26 | 27 | # redo this procedure in R 28 | cum_dim <- cumprod(dim) 29 | block_ndims <- which(cum_dim > block_lb) 30 | if(!length(block_ndims)){ block_ndims <- ndims } 31 | block_ndims <- block_ndims[[1]] 32 | if( block_ndims == ndims ){ block_ndims = block_ndims - 1 } 33 | 34 | # [3000,7,3] if buffer_margin=1 => [3000] x [7,1] x [3] 35 | # [3000,7,3] if buffer_margin=2 => [3000,7] x [1,1] x [3] 36 | # [100,100,100,100,1] buffer_margin=2 => [100,100] x [100,100,1] x [1] 37 | block_dimension <- dim[seq_len(block_ndims)] 38 | schedule_dimension <- dim[-seq_len(block_ndims)] 39 | partition_index <- slice[[ndims]] 40 | partition_index <- partition_index[is.na(partition_index) | partition_index > 0] 41 | partition_counts <- length(partition_index) 42 | 43 | # make schedules 44 | block_expected_length <- prod(sapply(valid_slices[seq_len(block_ndims)], length)) 45 | schedule_counts_per_part <- prod(sapply(valid_slices, length)) / partition_counts / block_expected_length 46 | block_indexed <- FALSE 47 | if(block_ndims >= 2 && block_expected_length <= block_ub){ 48 | block_schedule <- loc2idx3(valid_slices[seq_len(block_ndims)], dim[seq_len(block_ndims)]) 49 | block_indexed <- TRUE 50 | 51 | 52 | expect_equal(re$block_schedule, block_schedule) 53 | block_schedule[block_schedule < 0] <- NA 54 | expect_equal(re$block_schedule_start, min(block_schedule, na.rm = TRUE)) 55 | expect_equal(re$block_schedule_end, max(block_schedule, na.rm = TRUE)) 56 | expect_equal(re$block_expected_length, block_expected_length) 57 | } else { 58 | # do we need accurate block_expected_length? as it's not actually used if not block indexed 59 | # block_expected_length <- prod(dim[seq_len(block_ndims)]) 60 | block_schedule_start <- 1 61 | block_schedule_end <- prod(dim[seq_len(block_ndims)]) 62 | expect_equal(re$block_schedule_start, block_schedule_start) 63 | expect_equal(re$block_schedule_end, block_schedule_end) 64 | expect_equal(re$block_expected_length, block_expected_length) 65 | } 66 | 67 | block_prod_dim <- c(1, cumprod(block_dimension))[seq_along(block_dimension)] 68 | block_length <- prod(block_dimension) 69 | 70 | # schedule blocks 71 | schedule_dimension_alt <- schedule_dimension 72 | schedule_dimension_alt[[length(schedule_dimension_alt)]] <- 1 73 | tmp <- valid_slices[-seq_len(block_ndims)] 74 | tmp[[length(tmp)]] <- 1 75 | schedule_index <- loc2idx3(tmp, schedule_dimension_alt) 76 | 77 | # checks 78 | expect_equal(re$dimension, dim) 79 | expect_equal(re$block_ndims, block_ndims) 80 | expect_equal(re$block_dimension, block_dimension) 81 | expect_equal(re$schedule_dimension, schedule_dimension) 82 | # re$partition_index[re$partition_index < -9e18] <- NA_real_ 83 | expect_equal(as.numeric(re$partition_index), partition_index) 84 | expect_equal(re$partition_counts, partition_counts) 85 | expect_equal(re$schedule_counts_per_part, schedule_counts_per_part) 86 | expect_equal(re$block_prod_dim, block_prod_dim) 87 | expect_equal(re$block_indexed, block_indexed) 88 | expect_equal(re$block_length, block_length) 89 | expect_equal(re$schedule_index, schedule_index) 90 | } 91 | 92 | 93 | context("subset scheduler-normal case") 94 | 95 | test_that("subset scheduler-normal case", { 96 | dim <- c(5,7,8,10) 97 | slice <- list(c(1,1,2),1,2:3,4:5) 98 | 99 | setLazyBlockSize(1) 100 | check_schedule(slice, dim) 101 | setLazyBlockSize(30) 102 | check_schedule(slice, dim) 103 | setLazyBlockSize(300) 104 | check_schedule(slice, dim) 105 | setLazyBlockSize(10000) 106 | check_schedule(slice, dim) 107 | 108 | }) 109 | 110 | context("subset scheduler-NA & 0 cases") 111 | test_that("subset scheduler-NA & 0 cases", { 112 | dim <- c(5,7,8,10) 113 | 114 | slice <- lapply(dim, function(x){ 115 | x <- sample(x, x, replace = TRUE) 116 | x[sample(length(x), 2)] <- NA 117 | x[sample(length(x), 1)] <- 0 118 | x 119 | }) 120 | 121 | setLazyBlockSize(1) 122 | check_schedule(slice, dim) 123 | setLazyBlockSize(30) 124 | check_schedule(slice, dim) 125 | setLazyBlockSize(300) 126 | check_schedule(slice, dim) 127 | setLazyBlockSize(10000) 128 | check_schedule(slice, dim) 129 | }) 130 | 131 | 132 | -------------------------------------------------------------------------------- /tests/testthat/test-generics.R: -------------------------------------------------------------------------------- 1 | test_that("summary statistics", { 2 | x <- 3 | lazyarray::lazyarray(tempfile(), 4 | dim = c(100, 100), 5 | storage_format = 'double') 6 | x[, 1] <- 1:100 7 | x[, 2] <- NA 8 | x[, 3] <- c(0, rep(NA, 99)) 9 | 10 | 11 | testthat::expect_true(is.na(min(x))) 12 | testthat::expect_true(is.na(max(x))) 13 | 14 | testthat::expect_equivalent(min(x, na.rm=TRUE), 0) 15 | testthat::expect_equivalent(max(x, na.rm=TRUE), 100) 16 | testthat::expect_equivalent(range(x, na.rm=TRUE), c(0, 100)) 17 | testthat::expect_equivalent(range(x), c(NA, NA)) 18 | 19 | testthat::expect_equivalent(mean(x, na.rm = TRUE), 50) 20 | testthat::expect_equivalent(sum(x, na.rm = TRUE), 5050) 21 | 22 | # partition_apply(x, mean) 23 | 24 | }) 25 | -------------------------------------------------------------------------------- /tests/testthat/test-lazylm.R: -------------------------------------------------------------------------------- 1 | if(interactive()){ 2 | require(testthat) 3 | require(lazyarray) 4 | require(stats) 5 | } 6 | 7 | test_that("lazylm", { 8 | 9 | # use a baseenv to evaluate 10 | env <- new.env(parent = asNamespace('lazyarray')) 11 | with(env, { 12 | lazylm( V1 ~ .-V2-1 + (V2 > 0), data = local({ 13 | arr <- array(rnorm(72), c(6,3,4)) 14 | arr[1,1,1] <- NA 15 | as.lazyarray(arr, type = 'file') 16 | }), weights = runif(18), offset = rnorm(18) , na.action = 'na.exclude') 17 | }) 18 | 19 | 20 | arr <- array(rnorm(72), c(6,3,4)) 21 | arr[1,1,1] <- NA # Allow NA to be treated 22 | offset = rnorm(18) # offsets and weights are optional 23 | weights = runif(18) 24 | na_action <- 'na.omit' 25 | 26 | formula <- V1 ~ .-V2-1 + (V2 > 0) 27 | 28 | data <- as.lazyarray(arr, type = 'file') 29 | 30 | dim(arr) <- c(18, 4) 31 | lm_data <- as.data.frame(arr) 32 | 33 | e <- eigen(crossprod(arr[complete.cases(arr), ])) 34 | # if(!min(e$values > 1e-7)){ 35 | # skip("Generated eigenvalue for lazylm is too small, just skip this time") 36 | # } 37 | 38 | object <- lazylm(formula, data, weights = weights, offset = offset, na.action = na_action, fitted = TRUE) 39 | 40 | 41 | # Compare to stats::lm 42 | flm <- lm(formula, lm_data, weights = weights, offset = offset, na.action = na_action) 43 | 44 | assign('object', object, envir = globalenv()) 45 | assign('flm', flm, envir = globalenv()) 46 | 47 | expect_lt(max(abs(coef(object) - coef(flm))), 1e-7) 48 | expect_lt(max(abs(resid(object) - resid(flm))), 1e-7) 49 | expect_lt(max(abs(fitted(object) - fitted(flm))), 1e-7) 50 | 51 | s1 <- summary(object) 52 | s2 <- summary(flm) 53 | 54 | expect_lt(max(abs(s1$coefficients - s2$coefficients)), 1e-7) 55 | 56 | expect_lt(max(abs(s1$r.squared - s2$r.squared)), 1e-7) 57 | expect_equivalent(s1$df, s2$df) 58 | 59 | }) 60 | -------------------------------------------------------------------------------- /tests/testthat/test-matmul.R: -------------------------------------------------------------------------------- 1 | test_that("multiplication works", { 2 | on.exit({options('lazyarray.chunk_memory' = 80) 3 | lazyarray::set_lazy_threads(0)}) 4 | 5 | lazyarray::set_lazy_threads(1) 6 | 7 | 8 | options('lazyarray.chunk_memory' = 0.00001) 9 | 10 | orig <- rnorm(1e4) 11 | 12 | orig1 <- orig; dim(orig1) <- c(10, 1000) 13 | 14 | weights <- rnorm(1000) 15 | 16 | # 0.001 17 | system.time({ 18 | xy0 <- orig1 %*% (t(orig1) * weights) 19 | }) 20 | 21 | # x <- as.lazymatrix(orig1) 22 | # y <- as.lazymatrix(t(orig1)) 23 | 24 | 25 | # # 0.130 26 | # system.time({ 27 | # xy <- lazy_matmul(x, y, weights = weights, hybrid_limit = 0) 28 | # }) 29 | # expect_equal(range(xy[] - xy0), c(0,0)) 30 | # 31 | # y <- t(as.lazymatrix(orig1)) 32 | # 33 | # # 0.188 34 | # a <- system.time({ 35 | # xy <- lazy_matmul(x, y, weights = weights, hybrid_limit = 0) 36 | # }) 37 | # expect_equal(range(xy[] - xy0), c(0,0)) 38 | # 39 | # x <- t(as.lazymatrix(t(orig1))) 40 | # 41 | # # 0.098 42 | # system.time({ 43 | # xy <- lazy_matmul(x, y, weights = weights, hybrid_limit = 0) 44 | # }) 45 | # expect_equal(range(xy[] - xy0), c(0,0)) 46 | 47 | y <- as.lazymatrix(t(orig1)) 48 | 49 | # # 0.017 50 | # system.time({ 51 | # xy <- lazy_matmul(x, y, weights = weights, hybrid_limit = 0) 52 | # }) 53 | # expect_equal(range(xy[] - xy0), c(0,0)) 54 | 55 | x <- as.lazymatrix(t(orig1)) 56 | 57 | # 0.002 58 | system.time({ 59 | xy <- crossprod(x, weights = weights) 60 | }) 61 | expect_equal(range(xy[] - xy0), c(0,0)) 62 | 63 | # x <- t(as.lazymatrix(orig1)) 64 | # 65 | # # 0.162 66 | # system.time({ 67 | # xy <- lazy_crossprod(x, weights = weights) 68 | # }) 69 | # expect_equal(range(xy[] - xy0), c(0,0)) 70 | # 71 | # # bench::mark({ 72 | # # lazy_matmul(x, y) 73 | # # }, { 74 | # # x[] %*% y[] 75 | # # }, check = F, iterations = 1) 76 | # # 77 | }) 78 | 79 | 80 | 81 | # test_that("multiplication works", { 82 | # skip_on_cran() 83 | # skip_on_travis() 84 | # skip_on_appveyor() 85 | # skip_on_bioc() 86 | # skip_on_ci() 87 | # skip_on_covr() 88 | # skip_if_not(dir.exists('~/Desktop/lazyarray_data/')) 89 | # 90 | # self <- lazyarray('~/Desktop/lazyarray_data/') 91 | # self <- as.lazymatrix(self) 92 | # 93 | # future::plan('multisession') 94 | # 95 | # # pryr::mem_used() 96 | # system.time({ 97 | # a <- lazy_crossprod(self) 98 | # }) 99 | # 100 | # }) 101 | -------------------------------------------------------------------------------- /tests/testthat/test-subset.R: -------------------------------------------------------------------------------- 1 | test_that("Subset lazyarray & lazymatrix", { 2 | a <- array(rnorm(80), c(2,4,2,5)) 3 | x <- as.lazyarray(a, type = 'fstarray') 4 | 5 | expect_equal(x[], a) 6 | 7 | idx <- sample(length(x), length(x) * 10, replace = TRUE) 8 | expect_equal(x[idx], a[idx]) 9 | 10 | # matrix 11 | x <- as.lazymatrix(a, type = 'fstarray') 12 | b <- x[] 13 | 14 | expect_equivalent(as.integer(dim(b)), c(16,5)) 15 | 16 | expect_equal(x[], b) 17 | 18 | idx <- sample(length(x), length(x) * 10, replace = TRUE) 19 | expect_equal(x[idx], b[idx]) 20 | 21 | expect_equivalent(x[,1], b[,1]) 22 | expect_equivalent(x[,1,drop=FALSE], b[,1,drop=FALSE]) 23 | 24 | # x <- t(x) 25 | # b <- t(b) 26 | # 27 | # expect_equal(dim(x), c(5,16)) 28 | # 29 | # expect_equal(x[], b) 30 | # 31 | # idx <- sample(length(x), length(x) * 10, replace = TRUE) 32 | # expect_equal(x[idx], b[idx]) 33 | # 34 | # expect_equal(x[,1], b[,1]) 35 | # expect_equal(x[,1,drop=FALSE], b[,1,drop=FALSE]) 36 | 37 | }) 38 | 39 | test_that("Subset filearray", { 40 | a <- array(rnorm(80), c(2,4,2,5)) 41 | f <- tempfile() 42 | x <- FileArray$new(f, dim = dim(a), storage_format = 'double', read_only = FALSE) 43 | x[] <- a 44 | 45 | expect_equal(x[], a) 46 | 47 | i <- sample(4,replace = TRUE) 48 | i[sample(4,2)] <- NA 49 | expect_equal(x[,i,,i], a[,i,,i]) 50 | 51 | expect_equal(x[,,,-i], a[,,,-i[!is.na(i)]]) 52 | 53 | j <- sample(2,replace = TRUE) 54 | i <- sample(5,replace = TRUE) 55 | # TODO: R array drop=TRUE with 0 length doesn't actually drop the array 56 | 57 | i <- 1:5 58 | expect_equal(x[,j,,-i], a[,j,,-i[!is.na(i)]]) 59 | # expect_equal(x[j,j,j,-i], a[j,j,j,-i[!is.na(i)]]) 60 | 61 | expect_equal(x[,j,,-i,drop=FALSE], a[,j,,-i[!is.na(i)],drop=FALSE]) 62 | expect_equal(x[j,j,j,-i,drop=FALSE], a[j,j,j,-i[!is.na(i)],drop=FALSE]) 63 | 64 | 65 | x[,,,1] <- a[,,,2] 66 | a[,,,1] <- a[,,,2] 67 | 68 | expect_equal(x[], a) 69 | 70 | i <- sample(5,replace = TRUE) 71 | i[sample(5,2)] <- NA 72 | expect_equal(x[,,,i], a[,,,i]) 73 | 74 | expect_equal(x[,,,-i], a[,,,-i[!is.na(i)]]) 75 | 76 | j <- sample(2,replace = TRUE) 77 | i <- sample(5,replace = TRUE) 78 | expect_equal(x[,j,,-i], a[,j,,-i[!is.na(i)]]) 79 | expect_equal(x[j,j,j,-i], a[j,j,j,-i[!is.na(i)]]) 80 | 81 | expect_error(x[,,,6]) 82 | 83 | expect_equal(x[,,,-6], a) 84 | 85 | }) 86 | 87 | test_that("Subset fstarray", { 88 | a <- array(rnorm(80), c(2,4,2,5)) 89 | f <- tempfile() 90 | x <- FstArray$new(f, dim = dim(a), storage_format = 'double', read_only = FALSE) 91 | x[] <- a 92 | 93 | expect_equal(x[], a) 94 | 95 | i <- sample(5,replace = TRUE) 96 | i[sample(5,2)] <- NA 97 | expect_equal(x[,,,i], a[,,,i]) 98 | 99 | expect_equal(x[,,,-i], a[,,,-i[!is.na(i)]]) 100 | 101 | j <- sample(2,replace = TRUE) 102 | i <- sample(5,replace = TRUE) 103 | expect_equal(x[,j,,-i], a[,j,,-i[!is.na(i)]]) 104 | 105 | i <- 1:5 106 | expect_equal(x[,j,,-i], a[,j,,-i[!is.na(i)]]) 107 | 108 | x[,,,1] <- a[,,,2] 109 | a[,,,1] <- a[,,,2] 110 | 111 | expect_equal(x[], a) 112 | 113 | i <- sample(5,replace = TRUE) 114 | i[sample(5,2)] <- NA 115 | expect_equal(x[,,,i], a[,,,i]) 116 | 117 | expect_equal(x[,,,-i], a[,,,-i[!is.na(i)]]) 118 | 119 | j <- sample(2,replace = TRUE) 120 | expect_equal(x[,j,,-i], a[,j,,-i[!is.na(i)]]) 121 | expect_equal(x[j,j,j,-i], a[j,j,j,-i[!is.na(i)]]) 122 | 123 | }) 124 | -------------------------------------------------------------------------------- /tests/testthat/test.setget.R: -------------------------------------------------------------------------------- 1 | 2 | context("Getter/Setter mode = 2") 3 | 4 | test_that("Getter/Setter mode = 2", { 5 | skip("mode = 2 is not supported anymore") 6 | # path <- tempfile() 7 | # suppressWarnings({ 8 | # self <- create_lazyarray(path, 'double', c(1,3,4), multipart = TRUE, multipart_mode = 2) 9 | # }) 10 | # private <- self$.__enclos_env__$private 11 | # value <- array(1:4, c(1,2,2)) 12 | # 13 | # expect_true(self$can_write) 14 | # 15 | # self$`@set_data`(value, 1.0, 1:2, 1:2) 16 | # 17 | # expect_equal(self$`@get_data`(1, 2, 1), 2) 18 | # 19 | # expect_equal(self$`@get_data`(1, 1:3, 1), c(1,2,NA)) 20 | # 21 | # expect_identical(self[drop = FALSE], array(c(1,2,NA, 3,4,NA, rep(NA, 6)), c(1,3,4))) 22 | # 23 | # self[1, 1:2, c(3,10)] <- 1:4 24 | # 25 | # # change to read_only 26 | # private$read_only <- TRUE 27 | # expect_false(self$can_write) 28 | # 29 | # expect_identical(self[drop = FALSE], array(c(1,2,NA, 3,4,NA, 1,2,NA, rep(NA, 3)), c(1,3,4))) 30 | # 31 | # expect_error({self[1, 1:2, c(3,10)] <- 1:4}) 32 | # 33 | # # clean up 34 | # self$remove_data() 35 | # 36 | # expect_false(dir.exists(private$.dir)) 37 | # 38 | # # operations will result in error 39 | # expect_error(self[1,1,1]) 40 | 41 | }) 42 | 43 | 44 | 45 | 46 | context("Getter/Setter mode = 1") 47 | 48 | test_that("Getter/Setter mode = 1", { 49 | 50 | self <- lazyarray(tempfile(), storage_format = 'double', dim = c(1,3,4), type = 'fstarray')#, multipart = TRUE, multipart_mode = 1) 51 | private <- self$.__enclos_env__$private 52 | value <- array(1:4, c(1,2,2)) 53 | 54 | expect_true(self$can_write) 55 | 56 | subsetAssignFST(value, file = self$storage_path, listOrEnv = list(1.0, 1:2, 1:2), dim = self$dim, dtype = 14L) 57 | # self$`@set_data`(value, 1.0, 1:2, 1:2) 58 | 59 | expect_equal(subsetFST(self$storage_path, list(1,2,1), self$dim, 14L, drop=TRUE), 2) 60 | # expect_equal(self$`@get_data`(1, 2, 1), 2) 61 | 62 | expect_equal(subsetFST(self$storage_path, list(1, 1:3, 1), self$dim, 14L, drop=TRUE), c(1,2,NA)) 63 | # expect_equal(self$`@get_data`(1, 1:3, 1), c(1,2,NA)) 64 | 65 | expect_equivalent(self[drop = FALSE], array(c(1,2,NA, 3,4,NA, rep(NA, 6)), c(1,3,4))) 66 | 67 | expect_error({ self[1, 1:2, c(3,10)] <- 1:4 }) 68 | self[1, 1:2, c(3,4)] <- 1:4 69 | 70 | # change to read_only 71 | private$.read_only <- TRUE 72 | expect_false(self$can_write) 73 | 74 | expect_equivalent(self[drop = FALSE], array(c(1,2,NA, 3,4,NA, 1,2,NA, 3,4,NA), c(1,3,4))) 75 | 76 | expect_error({self[1, 1:2, c(3,4)] <- 1:4}) 77 | 78 | # clean up 79 | self$remove_data() 80 | 81 | expect_false(dir.exists(private$.path)) 82 | 83 | # operations will result in error 84 | expect_error(self[1,1,1]) 85 | 86 | }) 87 | 88 | 89 | 90 | 91 | 92 | context("Getter/Setter no partition") 93 | 94 | test_that("Getter/Setter no partition", { 95 | skip("No partition is not supported anymore") 96 | # suppressWarnings({ 97 | # self <- create_lazyarray(tempfile(), 'double', c(1,3,4), multipart = FALSE) 98 | # }) 99 | # private <- self$.__enclos_env__$private 100 | # value <- array(1:4, c(1,2,2)) 101 | # 102 | # expect_true(self$can_write) 103 | # 104 | # self$`@set_data`(value, 1.0, 1:2, 1:2) 105 | # 106 | # expect_equal(self$`@get_data`(1, 2, 1), 2) 107 | # 108 | # expect_equal(self$`@get_data`(1, 1:3, 1), c(1,2,NA)) 109 | # 110 | # expect_identical(self[drop = FALSE], array(c(1,2,NA, 3,4,NA, rep(NA, 6)), c(1,3,4))) 111 | # 112 | # expect_error({ 113 | # self[1, 1:2, c(3,10)] <- 1:4 114 | # }) 115 | # 116 | # # change to read_only 117 | # private$read_only <- TRUE 118 | # expect_false(self$can_write) 119 | # 120 | # expect_identical(self[drop = FALSE], array(c(1,2,NA, 3,4,NA, rep(NA, 6)), c(1,3,4))) 121 | # 122 | # expect_error({self[1, 1:2, 3] <- 1:2}) 123 | # 124 | # # clean up 125 | # self$remove_data() 126 | # 127 | # expect_false(dir.exists(private$.dir)) 128 | # 129 | # # operations will result in error 130 | # expect_error(self[1,1,1]) 131 | 132 | }) 133 | 134 | 135 | 136 | 137 | 138 | -------------------------------------------------------------------------------- /vignettes/.gitignore: -------------------------------------------------------------------------------- 1 | *.html 2 | *.R 3 | -------------------------------------------------------------------------------- /vignettes/basic-usage-of-lazyarray.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "A Basic Usage of 'LazyArray'" 3 | output: rmarkdown::html_vignette 4 | author: Zhengjia Wang 5 | vignette: > 6 | %\VignetteIndexEntry{A Basic Usage of 'LazyArray'} 7 | %\VignetteEngine{knitr::rmarkdown} 8 | %\VignetteEncoding{UTF-8} 9 | --- 10 | 11 | ```{r, include = FALSE} 12 | knitr::opts_chunk$set( 13 | collapse = TRUE, 14 | comment = "#>" 15 | ) 16 | ``` 17 | 18 | ```{r setup} 19 | library(lazyarray) 20 | ``` 21 | 22 | ### Initialize 23 | 24 | Create a blank array and assign R object 25 | 26 | ```{r} 27 | # Sample data (~24 MB) 28 | x <- rnorm(3e6); dim(x) <- c(10, 100, 100, 30) 29 | 30 | # Save array to a path 31 | path <- tempfile() 32 | arr <- lazyarray(path, dim = dim(x), storage_format = 'double') 33 | arr[] <- x 34 | ``` 35 | 36 | Load existing array 37 | 38 | ```{r} 39 | # Load existing array 40 | arr <- lazyarray(path) 41 | ``` 42 | 43 | To protect array from further changes, make it read-only. 44 | 45 | ```{r} 46 | arr$make_readonly() 47 | arr$can_write 48 | ``` 49 | 50 | To make a read-only array writable: 51 | 52 | ```{r} 53 | arr$make_writable() 54 | arr$can_write 55 | ``` 56 | 57 | ### S3 methods 58 | 59 | 1. Set dimension names 60 | 61 | ```{r} 62 | arr$make_writable() 63 | dimnames(arr) <- list( 64 | A = 1:10, 65 | B = 1:100, 66 | C = 1:100, 67 | D = 1:30 68 | ) 69 | ``` 70 | 71 | 2. Subset and subset assign 72 | 73 | ```{r} 74 | # Subset/read array 75 | y1 <- arr[] 76 | y2 <- arr[,,,3] 77 | 78 | # Write to slice of data, writing to slices along the 79 | # last dimension is optimized 80 | arr[,,,1] <- seq_len(1e5) 81 | ``` 82 | 83 | 3. Subset by formula 84 | 85 | ```{r} 86 | sub <- subset(arr, A ~ A <= 2, B ~ B == 10) 87 | dim(sub) 88 | ``` 89 | 90 | ### Remove Arrays 91 | 92 | Data created via `lazyarray` does not remove automatically. You need to finalize array by yourself. This is because multiple lazy array instances might point to a same dataset. If one of the object is garbage collected, you might not want to remove the data on hard drive as this will invalidate the other instances. To manually remove data, use 93 | 94 | ```{r} 95 | arr$remove_data() 96 | ``` 97 | --------------------------------------------------------------------------------