├── .Rbuildignore
├── .github
    ├── .gitignore
    └── workflows
    │   ├── R-CMD-check.yaml
    │   └── pkgdown.yaml
├── .gitignore
├── .travis.yml
├── CRAN-RELEASE
├── DESCRIPTION
├── Makefile
├── NAMESPACE
├── NEWS.md
├── R
    ├── .gitignore
    ├── RcppExports.R
    ├── aaa.R
    ├── class-abstract.R
    ├── class-filearray.R
    ├── class-fstarray.R
    ├── generics.R
    ├── lazyarray.R
    ├── model-lm.R
    ├── utils.R
    └── zzz.R
├── cran-comments.md
├── experiment
    ├── class-lazyarray.R
    ├── class-lazymatrix.R
    ├── create_array.R
    ├── data
    │   └── .gitignore
    ├── funcs-lm.R
    ├── junk.sh
    ├── lazyarray.R
    ├── lazymatrix.R
    ├── loader2ext.cpp
    ├── loader2ext.h
    ├── old
    │   ├── bigmemory-example.R
    │   ├── bmWrapper.cpp
    │   ├── bmWrapper.h
    │   ├── classLazyArray.h
    │   ├── comparison-bigmemory.R
    │   ├── comparison-lazyarray.R
    │   ├── filearrsub.R
    │   ├── int64_double.cpp
    │   ├── junk.R
    │   ├── memcpy.cpp
    │   ├── new_subset.R
    │   ├── old_scripts.R
    │   ├── profile.R
    │   ├── s4-definition.R
    │   ├── s4-generics.R
    │   ├── speed test.R
    │   ├── speed_test.R
    │   ├── test_io.R
    │   └── write.R
    ├── performance-fst-vs-file.R
    ├── s3-lazyarray.R
    └── s3-lazymatrix.R
├── inst
    ├── WORDLIST
    └── include
    │   ├── interfaces
    │       ├── BMArray.h
    │       ├── FstArray.h
    │       ├── FstMatrix.h
    │       ├── LazyArrayBase.h
    │       └── entry.h
    │   ├── lazyarray.h
    │   └── lazyarray_RcppExports.h
├── lazyarray.Rproj
├── man
    ├── .gitignore
    ├── auto_clear_lazyarray.Rd
    ├── chunk_map.Rd
    ├── crossprod.Rd
    ├── lazy_parallel.Rd
    ├── lazyarray-threads.Rd
    ├── lazyarray.Rd
    ├── lazylm.Rd
    ├── partition_map.Rd
    ├── partition_table.Rd
    └── typeof-AbstractLazyArray-method.Rd
├── readme.md
├── src
    ├── .gitignore
    ├── Makevars
    ├── RcppExports.cpp
    ├── classIndexSchedule.cpp
    ├── classIndexSchedule.h
    ├── classLazyArray.cpp
    ├── common.cpp
    ├── common.h
    ├── fstWrapper.cpp
    ├── fstWrapper.h
    ├── indexConvert.cpp
    ├── indexConvert.h
    ├── lazyarray-ext.cpp
    ├── lazycommon.h
    ├── loader1.cpp
    ├── loader1.h
    ├── loader2.cpp
    ├── loader2.h
    ├── loader3.cpp
    ├── old
    │   ├── loader2ext.cpp
    │   ├── loader2ext.h
    │   └── loader3.cpp
    ├── openMPInterface.cpp
    ├── openMPInterface.h
    ├── playground.cpp
    ├── reshape.cpp
    ├── reshape.h
    ├── saver2.cpp
    ├── saver2.h
    ├── saver2ext.cpp
    ├── saver2ext.h
    ├── utils.cpp
    └── utils.h
├── tests
    ├── testthat.R
    └── testthat
    │   ├── test-cpp_io2.R
    │   ├── test-cpp_loc2idx.R
    │   ├── test-cpp_parseAndScheduleBlocks.R
    │   ├── test-cpp_subsetIdx.R
    │   ├── test-generics.R
    │   ├── test-lazylm.R
    │   ├── test-matmul.R
    │   ├── test-subset.R
    │   ├── test.cpp_io.R
    │   └── test.setget.R
└── vignettes
    ├── .gitignore
    └── basic-usage-of-lazyarray.Rmd


/.Rbuildignore:
--------------------------------------------------------------------------------
 1 | ^.*\.Rproj$
 2 | ^\.Rproj\.user$
 3 | ^experiment/
 4 | ^docs/
 5 | ^readme\.md$
 6 | ^cran-comments\.md$
 7 | ^CRAN-RELEASE$
 8 | ^Makefile$
 9 | ^\.travis\.yml$
10 | ^\.github$
11 | 


--------------------------------------------------------------------------------
/.github/.gitignore:
--------------------------------------------------------------------------------
1 | *.html
2 | 


--------------------------------------------------------------------------------
/.github/workflows/R-CMD-check.yaml:
--------------------------------------------------------------------------------
  1 | # NOTE: This workflow is overkill for most R packages
  2 | # check-standard.yaml is likely a better choice
  3 | # usethis::use_github_action("check-standard") will install it.
  4 | #
  5 | # For help debugging build failures open an issue on the RStudio community with the 'github-actions' tag.
  6 | # https://community.rstudio.com/new-topic?category=Package%20development&tags=github-actions
  7 | on:
  8 |   push:
  9 |     branches:
 10 |       - main
 11 |       - master
 12 |   pull_request:
 13 |     branches:
 14 |       - main
 15 |       - master
 16 | 
 17 | name: R-CMD-check
 18 | 
 19 | jobs:
 20 |   R-CMD-check:
 21 |     runs-on: ${{ matrix.config.os }}
 22 | 
 23 |     name: ${{ matrix.config.os }} (${{ matrix.config.r }})
 24 | 
 25 |     strategy:
 26 |       fail-fast: false
 27 |       matrix:
 28 |         config:
 29 |           - {os: macOS-latest,   r: 'release'}
 30 |           - {os: windows-latest, r: 'release'}
 31 |           - {os: windows-latest, r: '3.6'}
 32 |           - {os: ubuntu-16.04,   r: 'devel', rspm: "https://packagemanager.rstudio.com/cran/__linux__/xenial/latest", http-user-agent: "R/4.0.0 (ubuntu-16.04) R (4.0.0 x86_64-pc-linux-gnu x86_64 linux-gnu) on GitHub Actions" }
 33 |           - {os: ubuntu-16.04,   r: 'release', rspm: "https://packagemanager.rstudio.com/cran/__linux__/xenial/latest"}
 34 |           - {os: ubuntu-16.04,   r: 'oldrel',  rspm: "https://packagemanager.rstudio.com/cran/__linux__/xenial/latest"}
 35 |           - {os: ubuntu-16.04,   r: '3.5',     rspm: "https://packagemanager.rstudio.com/cran/__linux__/xenial/latest"}
 36 | 
 37 |     env:
 38 |       R_REMOTES_NO_ERRORS_FROM_WARNINGS: true
 39 |       RSPM: ${{ matrix.config.rspm }}
 40 |       GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
 41 | 
 42 |     steps:
 43 |       - uses: actions/checkout@v2
 44 | 
 45 |       - uses: r-lib/actions/setup-r@v1
 46 |         with:
 47 |           r-version: ${{ matrix.config.r }}
 48 |           http-user-agent: ${{ matrix.config.http-user-agent }}
 49 | 
 50 |       - uses: r-lib/actions/setup-pandoc@v1
 51 | 
 52 |       - name: Query dependencies
 53 |         run: |
 54 |           install.packages('remotes')
 55 |           saveRDS(remotes::dev_package_deps(dependencies = TRUE), ".github/depends.Rds", version = 2)
 56 |           writeLines(sprintf("R-%i.%i", getRversion()$major, getRversion()$minor), ".github/R-version")
 57 |         shell: Rscript {0}
 58 | 
 59 |       - name: Cache R packages
 60 |         if: runner.os != 'Windows'
 61 |         uses: actions/cache@v2
 62 |         with:
 63 |           path: ${{ env.R_LIBS_USER }}
 64 |           key: ${{ runner.os }}-${{ hashFiles('.github/R-version') }}-1-${{ hashFiles('.github/depends.Rds') }}
 65 |           restore-keys: ${{ runner.os }}-${{ hashFiles('.github/R-version') }}-1-
 66 | 
 67 |       - name: Install system dependencies
 68 |         if: runner.os == 'Linux'
 69 |         run: |
 70 |           while read -r cmd
 71 |           do
 72 |             eval sudo $cmd
 73 |           done < <(Rscript -e 'writeLines(remotes::system_requirements("ubuntu", "16.04"))')
 74 | 
 75 |       - name: Install dependencies
 76 |         run: |
 77 |           remotes::install_deps(dependencies = TRUE)
 78 |           remotes::install_cran("rcmdcheck")
 79 |         shell: Rscript {0}
 80 | 
 81 |       - name: Session info
 82 |         run: |
 83 |           options(width = 100)
 84 |           pkgs <- installed.packages()[, "Package"]
 85 |           sessioninfo::session_info(pkgs, include_base = TRUE)
 86 |         shell: Rscript {0}
 87 | 
 88 |       - name: Check
 89 |         env:
 90 |           _R_CHECK_CRAN_INCOMING_: false
 91 |         run: rcmdcheck::rcmdcheck(args = c("--no-manual", "--as-cran"), error_on = "warning", check_dir = "check")
 92 |         shell: Rscript {0}
 93 | 
 94 |       - name: Show testthat output
 95 |         if: always()
 96 |         run: find check -name 'testthat.Rout*' -exec cat '{}' \; || true
 97 |         shell: bash
 98 | 
 99 |       - name: Upload check results
100 |         if: failure()
101 |         uses: actions/upload-artifact@main
102 |         with:
103 |           name: ${{ runner.os }}-r${{ matrix.config.r }}-results
104 |           path: check
105 | 


--------------------------------------------------------------------------------
/.github/workflows/pkgdown.yaml:
--------------------------------------------------------------------------------
 1 | on:
 2 |   push:
 3 |     branches:
 4 |       - main
 5 |       - master
 6 | 
 7 | name: pkgdown
 8 | 
 9 | jobs:
10 |   pkgdown:
11 |     runs-on: macOS-latest
12 |     env:
13 |       GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
14 |     steps:
15 |       - uses: actions/checkout@v2
16 | 
17 |       - uses: r-lib/actions/setup-r@v1
18 | 
19 |       - uses: r-lib/actions/setup-pandoc@v1
20 | 
21 |       - name: Query dependencies
22 |         run: |
23 |           install.packages('remotes')
24 |           saveRDS(remotes::dev_package_deps(dependencies = TRUE), ".github/depends.Rds", version = 2)
25 |           writeLines(sprintf("R-%i.%i", getRversion()$major, getRversion()$minor), ".github/R-version")
26 |         shell: Rscript {0}
27 | 
28 |       - name: Cache R packages
29 |         uses: actions/cache@v2
30 |         with:
31 |           path: ${{ env.R_LIBS_USER }}
32 |           key: ${{ runner.os }}-${{ hashFiles('.github/R-version') }}-1-${{ hashFiles('.github/depends.Rds') }}
33 |           restore-keys: ${{ runner.os }}-${{ hashFiles('.github/R-version') }}-1-
34 | 
35 |       - name: Install dependencies
36 |         run: |
37 |           remotes::install_deps(dependencies = TRUE)
38 |           install.packages("pkgdown", type = "binary")
39 |         shell: Rscript {0}
40 | 
41 |       - name: Install package
42 |         run: R CMD INSTALL .
43 | 
44 |       - name: Deploy package
45 |         run: |
46 |           git config --local user.email "actions@github.com"
47 |           git config --local user.name "GitHub Actions"
48 |           Rscript -e 'pkgdown::deploy_to_branch(new_process = FALSE)'
49 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | .Rproj.user
 2 | .Rhistory
 3 | .RData
 4 | .Ruserdata
 5 | src/*.o
 6 | src/*.so
 7 | src/*.dll
 8 | inst/doc
 9 | .DS_Store
10 | 


--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
 1 | # R for travis: see documentation at https://docs.travis-ci.com/user/languages/r
 2 | 
 3 | language: R
 4 | warnings_are_errors: false
 5 | os:
 6 |   - linux
 7 | compiler:
 8 |   - gcc
 9 |   - clang
10 | env:
11 |   - _R_CHECK_SYSTEM_CLOCK_=0
12 | cache:
13 |   directories:
14 |     - $TRAVIS_BUILD_DIR/packrat/src
15 |     - $TRAVIS_BUILD_DIR/packrat/lib
16 |   packages: true
17 | r:
18 |   - release
19 |   - devel
20 |   - oldrel
21 | 


--------------------------------------------------------------------------------
/CRAN-RELEASE:
--------------------------------------------------------------------------------
1 | This package was submitted to CRAN on 2020-07-17.
2 | Once it is accepted, delete this file and tag the release (commit c3bd1bcb2e).
3 | 


--------------------------------------------------------------------------------
/DESCRIPTION:
--------------------------------------------------------------------------------
 1 | Package: lazyarray
 2 | Type: Package
 3 | Title: Persistent Large Data Array with Lazy-Loading on Demand
 4 | Version: 1.1.0.9000
 5 | Language: en-US
 6 | License: AGPL-3
 7 | Encoding: UTF-8
 8 | SystemRequirements: 
 9 |     C++11
10 |     little-endian platform
11 | RoxygenNote: 7.1.1
12 | URL: https://github.com/dipterix/lazyarray
13 | BugReports: https://github.com/dipterix/lazyarray/issues
14 | Authors@R: c(
15 |     person("Zhengjia", "Wang", email = "dipterix.wang@gmail.com", role = c("aut", "cre", "cph")),
16 |     person("Mark", "Klik", email = "markklik@gmail.com", role = c("ctb", "cph"),
17 |       comment = "Copyright holder of fstcore package")
18 |     )
19 | Description: Multi-threaded serialization of compressed array that 
20 |     fully utilizes modern solid state drives. It allows 
21 |     to store and load extremely large data on demand within seconds
22 |     without occupying too much memories. With data stored on hard drive, 
23 |     a lazy-array data can be loaded, shared across multiple R sessions.
24 |     For arrays with partition mode on, multiple R sessions can write to 
25 |     a same array simultaneously along the last dimension (partition). 
26 |     The internal storage format is provided by 'fstcore' package geared by 
27 |     'LZ4' and 'ZSTD' compressors.
28 | Imports: 
29 |     utils,
30 |     methods,
31 |     parallel,
32 |     stats,
33 |     Rcpp (>= 1.0.5),
34 |     R6,
35 |     fstcore,
36 |     yaml,
37 |     filematrix
38 | LinkingTo: 
39 |     Rcpp,
40 |     fstcore,
41 | Suggests: 
42 |     future,
43 |     future.callr,
44 |     testthat,
45 |     knitr,
46 |     fst,
47 |     rmarkdown,
48 |     dipsaus (>= 0.0.8)
49 | VignetteBuilder: knitr
50 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | # h/t to @jimhester and @yihui for this parse block:
 2 | # https://github.com/yihui/knitr/blob/dc5ead7bcfc0ebd2789fe99c527c7d91afb3de4a/Makefile#L1-L4
 3 | # Note the portability change as suggested in the manual:
 4 | # https://cran.r-project.org/doc/manuals/r-release/R-exts.html#Writing-portable-packages
 5 | PKGNAME = `sed -n "s/Package: *\([^ ]*\)/\1/p" DESCRIPTION`
 6 | PKGVERS = `sed -n "s/Version: *\([^ ]*\)/\1/p" DESCRIPTION`
 7 | 
 8 | 
 9 | all: check
10 | 
11 | build:
12 | 	R CMD build .
13 | 
14 | check: build
15 | 	R CMD check --no-manual $(PKGNAME)_$(PKGVERS).tar.gz
16 | 
17 | install_deps:
18 | 	Rscript \
19 | 	-e 'if (!requireNamespace("remotes")) install.packages("remotes")' \
20 | 	-e 'remotes::install_deps(dependencies = TRUE)'
21 | 
22 | install: install_deps build
23 | 	R CMD INSTALL $(PKGNAME)_$(PKGVERS).tar.gz
24 | 
25 | clean:
26 | 	@rm -rf $(PKGNAME)_$(PKGVERS).tar.gz $(PKGNAME).Rcheck
27 | 


--------------------------------------------------------------------------------
/NAMESPACE:
--------------------------------------------------------------------------------
 1 | # Generated by roxygen2: do not edit by hand
 2 | 
 3 | S3method("[",FileArray)
 4 | S3method("[",FstArray)
 5 | S3method("[<-",FileArray)
 6 | S3method("[<-",FstArray)
 7 | S3method("dim<-",AbstractLazyArray)
 8 | S3method("dimnames<-",AbstractLazyArray)
 9 | S3method(as.lazyarray,AbstractLazyArray)
10 | S3method(as.lazyarray,default)
11 | S3method(as.lazymatrix,AbstractLazyArray)
12 | S3method(as.lazymatrix,default)
13 | S3method(chunk_map,AbstractLazyArray)
14 | S3method(dim,AbstractLazyArray)
15 | S3method(dimnames,AbstractLazyArray)
16 | S3method(fitted,lazylm)
17 | S3method(head,AbstractLazyArray)
18 | S3method(length,AbstractLazyArray)
19 | S3method(max,AbstractLazyArray)
20 | S3method(mean,AbstractLazyArray)
21 | S3method(min,AbstractLazyArray)
22 | S3method(partition_map,AbstractLazyArray)
23 | S3method(partition_map,array)
24 | S3method(partition_table,AbstractLazyArray)
25 | S3method(partition_table,array)
26 | S3method(print,"LazyArray-summary")
27 | S3method(print,summary.lazylm)
28 | S3method(range,AbstractLazyArray)
29 | S3method(residuals,lazylm)
30 | S3method(subset,AbstractLazyArray)
31 | S3method(sum,AbstractLazyArray)
32 | S3method(summary,AbstractLazyArray)
33 | S3method(summary,lazylm)
34 | S3method(tail,AbstractLazyArray)
35 | export(as.lazyarray)
36 | export(as.lazymatrix)
37 | export(auto_clear_lazyarray)
38 | export(chunk_map)
39 | export(filearray)
40 | export(fstarray)
41 | export(get_lazy_threads)
42 | export(lazy_parallel)
43 | export(lazyarray)
44 | export(lazylm)
45 | export(partition_map)
46 | export(partition_table)
47 | export(set_lazy_threads)
48 | exportMethods(crossprod)
49 | exportMethods(typeof)
50 | import(stats)
51 | import(utils)
52 | importFrom(R6,R6Class)
53 | importFrom(Rcpp,evalCpp)
54 | importFrom(Rcpp,sourceCpp)
55 | importFrom(methods,setGeneric)
56 | importFrom(methods,setMethod)
57 | importFrom(methods,signature)
58 | importFrom(yaml,read_yaml)
59 | importFrom(yaml,write_yaml)
60 | useDynLib(lazyarray, .registration = TRUE)
61 | 


--------------------------------------------------------------------------------
/NEWS.md:
--------------------------------------------------------------------------------
 1 | lazyarray 1.1.0
 2 | =======
 3 | 
 4 | * Renamed `LazyArray` to `ClassLazyArray`
 5 | * Added `lazyarray` function to create, load or import lazy arrays in one function
 6 | * Added `auto_clear_lazyarray` to automatically remove data on hard disks upon garbage collecting (require `dipsaus` package installed, optional)
 7 | * Supported customized multi-part file names
 8 | 
 9 | lazyarray 1.0.0
10 | =======
11 | 
12 | * Initial CRAN release!
13 | 
14 | lazyarray 0.0.0
15 | =======
16 | 
17 | * Initial private beta release!
18 | * Imported `fstcore` as major back-end and implemented c++ interface
19 | 


--------------------------------------------------------------------------------
/R/.gitignore:
--------------------------------------------------------------------------------
1 | .DS_Store
2 | 


--------------------------------------------------------------------------------
/R/RcppExports.R:
--------------------------------------------------------------------------------
  1 | # Generated by using Rcpp::compileAttributes() -> do not edit by hand
  2 | # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393
  3 | 
  4 | setLazyBlockSize <- function(size) {
  5 |     .Call(`_lazyarray_setLazyBlockSize`, size)
  6 | }
  7 | 
  8 | getLazyBlockSize <- function() {
  9 |     .Call(`_lazyarray_getLazyBlockSize`)
 10 | }
 11 | 
 12 | fstMeta <- function(fileName) {
 13 |     .Call(`_lazyarray_fstMeta`, fileName)
 14 | }
 15 | 
 16 | fstRetrieve <- function(fileName, colSel, start, end) {
 17 |     .Call(`_lazyarray_fstRetrieve`, fileName, colSel, start, end)
 18 | }
 19 | 
 20 | fstStore <- function(fileName, table, compression, uniformEncoding) {
 21 |     .Call(`_lazyarray_fstStore`, fileName, table, compression, uniformEncoding)
 22 | }
 23 | 
 24 | checkFstMeta <- function(file, expect_nrows, cnames) {
 25 |     .Call(`_lazyarray_checkFstMeta`, file, expect_nrows, cnames)
 26 | }
 27 | 
 28 | loc2idx3 <- function(locations, parent_dim) {
 29 |     .Call(`_lazyarray_loc2idx3`, locations, parent_dim)
 30 | }
 31 | 
 32 | extractSlices <- function(listOrEnv, ndims) {
 33 |     .Call(`_lazyarray_extractSlices`, listOrEnv, ndims)
 34 | }
 35 | 
 36 | parseSlices <- function(listOrEnv, dim, pos_subscript = TRUE) {
 37 |     .Call(`_lazyarray_parseSlices`, listOrEnv, dim, pos_subscript)
 38 | }
 39 | 
 40 | parseAndScheduleBlocks2 <- function(sliceIdx, dim, forceSchedule = FALSE) {
 41 |     .Call(`_lazyarray_parseAndScheduleBlocks2`, sliceIdx, dim, forceSchedule)
 42 | }
 43 | 
 44 | reshapeOrDrop <- function(x, reshape = NULL, drop = FALSE) {
 45 |     .Call(`_lazyarray_reshapeOrDrop`, x, reshape, drop)
 46 | }
 47 | 
 48 | cpp_create_lazyarray <- function(x, dim, fileName, compression, uniformEncoding) {
 49 |     .Call(`_lazyarray_cpp_create_lazyarray`, x, dim, fileName, compression, uniformEncoding)
 50 | }
 51 | 
 52 | lazyMapReduceByPartition <- function(fileName, colSel, start, end = NULL, custom_func = NULL, reshape = NULL) {
 53 |     .Call(`_lazyarray_lazyMapReduceByPartition`, fileName, colSel, start, end, custom_func, reshape)
 54 | }
 55 | 
 56 | lazyLoadOld <- function(files, partition_locations, partition_dim, ndim, value_type) {
 57 |     .Call(`_lazyarray_lazyLoadOld`, files, partition_locations, partition_dim, ndim, value_type)
 58 | }
 59 | 
 60 | subsetFST <- function(rootPath, listOrEnv, dim, dtype, reshape = NULL, drop = FALSE) {
 61 |     .Call(`_lazyarray_subsetFST`, rootPath, listOrEnv, dim, dtype, reshape, drop)
 62 | }
 63 | 
 64 | scheduleFST <- function(listOrEnv, dim, forceSchedule = FALSE, hint = -1L) {
 65 |     .Call(`_lazyarray_scheduleFST`, listOrEnv, dim, forceSchedule, hint)
 66 | }
 67 | 
 68 | executeScheduleFST <- function(rootPath, dtype, reshape, drop, partition) {
 69 |     .Call(`_lazyarray_executeScheduleFST`, rootPath, dtype, reshape, drop, partition)
 70 | }
 71 | 
 72 | scheduleExistsFST <- function() {
 73 |     .Call(`_lazyarray_scheduleExistsFST`)
 74 | }
 75 | 
 76 | freeScheduleFST <- function() {
 77 |     .Call(`_lazyarray_freeScheduleFST`)
 78 | }
 79 | 
 80 | subsetFM <- function(rootPath, listOrEnv, dim, dtype, reshape, drop) {
 81 |     .Call(`_lazyarray_subsetFM`, rootPath, listOrEnv, dim, dtype, reshape, drop)
 82 | }
 83 | 
 84 | getLazyThread <- function(max = FALSE) {
 85 |     .Call(`_lazyarray_getLazyThread`, max)
 86 | }
 87 | 
 88 | setLazyThread <- function(n, reset_after_fork = NULL) {
 89 |     .Call(`_lazyarray_setLazyThread`, n, reset_after_fork)
 90 | }
 91 | 
 92 | hasOpenMP <- function() {
 93 |     .Call(`_lazyarray_hasOpenMP`)
 94 | }
 95 | 
 96 | subsetAssignFST <- function(values, file, listOrEnv, dim, dtype, compression = 50L, uniformEncoding = TRUE) {
 97 |     .Call(`_lazyarray_subsetAssignFST`, values, file, listOrEnv, dim, dtype, compression, uniformEncoding)
 98 | }
 99 | 
100 | dropDimension <- function(x) {
101 |     .Call(`_lazyarray_dropDimension`, x)
102 | }
103 | 
104 | prod2 <- function(x, na_rm = FALSE) {
105 |     .Call(`_lazyarray_prod2`, x, na_rm)
106 | }
107 | 
108 | parseDots <- function(env, eval) {
109 |     .Call(`_lazyarray_parseDots`, env, eval)
110 | }
111 | 
112 | stopIfNot <- function(isValid, message, stopIfError = TRUE) {
113 |     .Call(`_lazyarray_stopIfNot`, isValid, message, stopIfError)
114 | }
115 | 
116 | getSexpType <- function(x) {
117 |     .Call(`_lazyarray_getSexpType`, x)
118 | }
119 | 
120 | tik <- function() {
121 |     .Call(`_lazyarray_tik`)
122 | }
123 | 
124 | tok <- function(msg, stop = FALSE) {
125 |     .Call(`_lazyarray_tok`, msg, stop)
126 | }
127 | 
128 | subsetAssignVector <- function(x, start, value) {
129 |     .Call(`_lazyarray_subsetAssignVector`, x, start, value)
130 | }
131 | 
132 | # Register entry points for exported C++ functions
133 | methods::setLoadAction(function(ns) {
134 |     .Call('_lazyarray_RcppExport_registerCCallable', PACKAGE = 'lazyarray')
135 | })
136 | 


--------------------------------------------------------------------------------
/R/aaa.R:
--------------------------------------------------------------------------------
  1 | #' @importFrom R6 R6Class
  2 | #' @importFrom Rcpp evalCpp
  3 | #' @importFrom Rcpp sourceCpp
  4 | #' @importFrom yaml read_yaml
  5 | #' @importFrom yaml write_yaml
  6 | #' @import utils
  7 | #' @import stats
  8 | #' @importFrom methods setGeneric
  9 | #' @importFrom methods setMethod
 10 | #' @importFrom methods signature
 11 | #' @useDynLib lazyarray, .registration = TRUE
 12 | NULL
 13 | 
 14 | has_dipsaus <- function(){
 15 |   system.file('', package = 'dipsaus') != ''
 16 | }
 17 | 
 18 | 
 19 | #' @title Set Number of Threads for Lazy Arrays
 20 | #' @description Set number of threads used by 'OpenMP' for both \code{lazyarray}
 21 | #' and \code{fstcore} packages.
 22 | #' @param nr_of_threads number of CPU cores to use, or \code{NULL} to 
 23 | #' stay unchanged, default is \code{getOption('lazyarray.nthreads')}
 24 | #' @param reset_after_fork whether to reset after forked process
 25 | #' @param max whether return maximum available threads
 26 | #' @return Number of cores currently used.
 27 | #' @seealso \code{\link[fstcore]{threads_fstlib}}
 28 | #' @name lazyarray-threads
 29 | #' @export
 30 | set_lazy_threads <- function(nr_of_threads = getOption('lazyarray.nthreads'), reset_after_fork = NULL){
 31 |   if(!is.null(reset_after_fork)){
 32 |     reset_after_fork <- isTRUE(reset_after_fork)
 33 |   } 
 34 |   if(is.null(nr_of_threads) || !is.numeric(nr_of_threads) || nr_of_threads == 0){
 35 |     nr_of_threads <- max(fstcore::threads_fstlib(), getLazyThread())
 36 |   }
 37 |   nr_of_threads = max(nr_of_threads, 1)
 38 |   setLazyThread(n = nr_of_threads, reset_after_fork = reset_after_fork)
 39 |   fstcore::threads_fstlib(nr_of_threads = nr_of_threads, reset_after_fork = reset_after_fork)
 40 |   getLazyThread()
 41 | }
 42 | 
 43 | #' @rdname lazyarray-threads
 44 | #' @export
 45 | get_lazy_threads <- function(max = FALSE){
 46 |   getLazyThread(max = isTRUE(max))
 47 | }
 48 | 
 49 | 
 50 | get_missing_value <- function(){
 51 |   (function(...){
 52 |     parseDots(environment(), FALSE)[[1]]
 53 |   })(,)
 54 | }
 55 | 
 56 | 
 57 | rand_string <- function(length = 50){
 58 |   paste(sample(c(letters, LETTERS, 0:9), length, replace = TRUE), collapse = '')
 59 | }
 60 | 
 61 | 
 62 | #' Automatically remove array data 
 63 | #' @author Zhengjia Wang
 64 | #' @description Remove the files containing array data once no
 65 | #' 'lazyarray' instance is using the folder. Require
 66 | #' installation of \code{dipsaus} package (at least version 0.0.8).
 67 | #' @param x 'lazyarray' instance
 68 | #' @param onexit passed to \code{\link{reg.finalizer}}
 69 | #' 
 70 | #' @details \code{auto_clear_lazyarray} attempts to remove the entire folder
 71 | #' containing array data. However, if some files are not created by the
 72 | #' array, only partition data and meta file will be removed, all the 
 73 | #' artifacts will remain and warning will be displayed. One exception is
 74 | #' if all files left in the array directory are \code{*.meta} files, 
 75 | #' all these meta files will be removed along with the folder.
 76 | #' 
 77 | #' @examples 
 78 | #' 
 79 | #' path <- tempfile()
 80 | #' arr_dbl <- lazyarray(path, storage_format = 'double',
 81 | #'                      dim = 2:4, meta_name = 'meta-dbl.meta')
 82 | #' arr_dbl[] <- 1:24
 83 | #' auto_clear_lazyarray(arr_dbl)
 84 | #' 
 85 | #' arr_chr <- lazyarray(path, storage_format = 'character',
 86 | #'                      dim = 2:4, meta_name = 'meta-chr.meta')
 87 | #' auto_clear_lazyarray(arr_chr)
 88 | #' 
 89 | #' # remove either one, the directory still exists
 90 | #' rm(arr_dbl); invisible(gc(verbose = FALSE))
 91 | #' 
 92 | #' arr_chr[1,1,1]
 93 | #' 
 94 | #' # Remove the other one, and path will be removed
 95 | #' rm(arr_chr); invisible(gc(verbose = FALSE))
 96 | #' 
 97 | #' dir.exists(path)
 98 | #' arr_check <- lazyarray(path, storage_format = 'character',
 99 | #'                        dim = 2:4, meta_name = 'meta-chr')
100 | #' 
101 | #' # data is removed, so there should be no data (NAs)
102 | #' arr_check[]
103 | #' 
104 | #' @export
105 | auto_clear_lazyarray <- function(x, onexit = FALSE){
106 |   if(requireNamespace('dipsaus', quietly = TRUE)){
107 |     path <- x$storage_path
108 |     path <- normalizePath(path)
109 |     dipsaus::shared_finalizer(x, key = path, function(e){
110 |       e$remove_data(force = TRUE)
111 |     }, onexit = onexit)
112 |     rm(path)
113 |   }
114 |   rm(x, onexit)
115 |   invisible()
116 | }
117 | 


--------------------------------------------------------------------------------
/R/class-filearray.R:
--------------------------------------------------------------------------------
  1 | #' @noRd
  2 | #' @title Internal Class definition for \code{FstArray}
  3 | #' @author Zhengjia Wang
  4 | #' @description Internal class definition of lazy array objects
  5 | FileArray <- R6::R6Class(
  6 |   classname = "FileArray",
  7 |   portable = TRUE,
  8 |   inherit = AbstractLazyArray,
  9 |   private = list(
 10 |     .backend = "filearray"
 11 |   ),
 12 |   public = list(
 13 |     print = function(...){
 14 |       cat("<FileArray> (", private$.storage_format, ')\n', sep = '')
 15 |       cat('Dimension:\t', paste(sprintf('%d ', private$.dim), collapse = 'x '), '\n')
 16 |       cat('File format: bmat\n')
 17 |       invisible(self)
 18 |     },
 19 |     
 20 |     initialize = function(path, dim, storage_format = c('double', 'integer'),
 21 |                           read_only = TRUE, meta_name = 'lazyarray.meta'){
 22 |       private$.file_format <- "bmat"
 23 |       storage_format <- match.arg(storage_format)
 24 |       if(missing(dim)){
 25 |         super$initialize(path = path, storage_format = storage_format, 
 26 |                          read_only = read_only, meta_name = meta_name)
 27 |       } else {
 28 |         super$initialize(path = path, dim = dim, storage_format = storage_format, 
 29 |                          read_only = read_only, meta_name = meta_name)
 30 |       }
 31 |       # cannot convert types
 32 |       if(length(self$raw_meta$storage_format) && storage_format != self$raw_meta$storage_format){
 33 |         stop("Data format inconsistent (cannot type-convert). Header info: ", 
 34 |              self$raw_meta$storage_format, "; provided: ", storage_format)
 35 |       }
 36 |     },
 37 |     
 38 |     get_partition_fpath = function(part, full_path = TRUE, summary_file = FALSE, type = c('data', 'desc', 'combined')){
 39 |       type <- match.arg(type)
 40 |       if(missing(part)){
 41 |         part <- seq_len(self$npart)
 42 |       } else {
 43 |         part <- as.integer(part)
 44 |         if(base::anyNA(part) || any(part <= 0)){
 45 |           stop("partition number must be all positive: ", part)
 46 |         }
 47 |       }
 48 |       res <- sprintf('%s%s', part, self$get_file_format())
 49 |       if(full_path){
 50 |         res <- file.path(private$.path, res)
 51 |       }
 52 |       if(summary_file){
 53 |         res <- sprintf('%s.summary', res)
 54 |         return(res)
 55 |       }
 56 |       if(type == 'desc'){
 57 |         res <- sprintf('%s.desc.txt', part)
 58 |       } else if(type == 'combined'){
 59 |         res <- sprintf('%s', part)
 60 |       } else {
 61 |         res <- sprintf('%s%s', part, self$get_file_format())
 62 |       }
 63 |       if(full_path){
 64 |         res <- file.path(private$.path, res)
 65 |       }
 66 |       res
 67 |     },
 68 |     
 69 |     has_partition = function(part){
 70 |       stopifnot(length(part) == 1)
 71 |       file <- self$get_partition_fpath(part, full_path = TRUE)
 72 |       desc <- self$get_partition_fpath(part, full_path = TRUE, type = 'desc')
 73 |       if(file.exists(file) && file.exists(desc)){
 74 |         return(TRUE)
 75 |       }
 76 |       return(FALSE)
 77 |     },
 78 |     initialize_partition = function(part, nofill = FALSE){
 79 |       if(!self$has_partition(part)){
 80 |         file <- self$get_partition_fpath(part, full_path = TRUE, type = 'combined')
 81 |         ptr <- filematrix::fm.create(file, self$partition_length, 1, type = self$storage_format)
 82 |         if(!nofill){
 83 |           ptr[] <- rep(self$sample_na, self$partition_length)
 84 |         }
 85 |         filematrix::close(ptr)
 86 |         return(TRUE)
 87 |       }
 88 |       return(FALSE)
 89 |     },
 90 |     get_partition_data = function(part, reshape = NULL){
 91 |       if(self$has_partition(part)){
 92 |         file <- self$get_partition_fpath(part, full_path = TRUE, type = 'combined')
 93 |         ptr <- filematrix::fm.open(file)
 94 |         re <- ptr[]
 95 |         filematrix::close(ptr)
 96 |         if(is.null(reshape)){
 97 |           reshapeOrDrop(re, reshape = self$partition_dim(), drop = FALSE)
 98 |         } else {
 99 |           reshapeOrDrop(re, reshape = as.numeric(reshape), drop = FALSE)
100 |         }
101 |         
102 |         return(re)
103 |       } else {
104 |         array(self$sample_na, self$partition_dim())
105 |       }
106 |     }
107 |   )
108 | )
109 | 
110 | 
111 | #' @export
112 | `[.FileArray` <- function(x, ..., drop = TRUE, reshape = NULL){
113 |   if(!x$is_valid){
114 |     stop("`[.FileArray`: x is no longer valid (data has been removed).")
115 |   }
116 |   if(!is.null(reshape)){
117 |     reshape <- as.numeric(reshape)
118 |     stopifnot(all(reshape>=0))
119 |   }
120 |   drop <- isTRUE(drop)
121 |   
122 |   # set block size to be the first margin to maximize reading speed
123 |   block_size <- dim(x)[[1]]
124 |   block_size <- max(block_size, 1)
125 |   block_size <- min(block_size, 16384)
126 |   setLazyBlockSize(block_size)
127 |   
128 |   on.exit({
129 |     # reset block size for fst arrays
130 |     block_size <- getOption('lazyarray.fstarray.blocksize', -1)
131 |     if(block_size <= 1){
132 |       block_size <- -1
133 |     }
134 |     setLazyBlockSize(block_size)
135 |   })
136 |   
137 |   subsetFM(rootPath = x$storage_path,listOrEnv = environment(),
138 |             dim = x$dim,dtype = x$sexptype,reshape = reshape,drop = drop)
139 | }
140 | 
141 | 
142 | #' @export
143 | `[<-.FileArray` <- function(x, ..., value){
144 |   if(!x$is_valid){
145 |     stop("`[<-.FileArray`: x is no longer valid (data has been removed).")
146 |   }
147 |   if(!x$can_write){
148 |     stop("`[<-.FileArray`: x is read-only")
149 |   }
150 |   
151 |   parsed <- parseAndScheduleBlocks2(environment(), x$dim, TRUE)
152 |   # parsed <- parseAndScheduleBlocks2(list(1:10,2:10,3:10,4:10), x$dim, TRUE)
153 |   # parsed <- parseAndScheduleBlocks2(list(1,1,1,1), x$dim, TRUE)
154 |   
155 |   if(parsed$subset_mode == 1){
156 |     stop("FstArray does not support single subscript (x[i]<-v), try x[]<-v or x[i,j,k,...]<-v")
157 |   }
158 |   partition_length <- prod(x$partition_dim())
159 |   
160 |   # x[]
161 |   if(parsed$subset_mode == 2){
162 |     value <- array(value, dim = x$dim)
163 |     fake_idx <- lapply(x$dim, function(x){ get_missing_value() })
164 |     slice_value <- function(ii){
165 |       fake_idx[[x$ndim]] <- ii
166 |       do.call(`[`, c(list(quote(value)), fake_idx))
167 |     }
168 |     # copy all to re inplace
169 |     for(ii in seq_len(x$npart)){
170 |       x$initialize_partition(part = ii, nofill = TRUE)
171 |       file <- x$get_partition_fpath(ii, full_path = TRUE, type = 'combined')
172 |       ptr_file <- filematrix::fm.open(file)
173 |       ptr_file[] <- slice_value(ii)
174 |       filematrix::close(ptr_file)
175 |     }
176 |   } else {
177 |     # x[i,j,k]
178 |     loc <- parsed$location_indices
179 |     if(!is.numeric(loc[[x$ndim]])){
180 |       # missing, all partitions
181 |       partitions <- seq_len(x$npart)
182 |     } else {
183 |       partitions <- loc[[x$ndim]]
184 |     }
185 |     # check if the schedule is made
186 |     schedule <- parsed$schedule
187 |     block_ndims <- schedule$block_ndims
188 |     
189 |     ptr <- 1
190 |     blocksize <- schedule$block_expected_length
191 |     
192 |     if(schedule$block_indexed){
193 |       value <- array(value, dim = c(blocksize, length(schedule$schedule_index), length(partitions)))
194 |     } else {
195 |       
196 |       value <- array(value, dim = c(parsed$expected_length / length(partitions), length(partitions)))
197 |     }
198 |     
199 |     for(ff in seq_along(partitions)){
200 |       file_ii <- partitions[[ff]]
201 |       # No file, NA
202 |       x$initialize_partition(part = file_ii)
203 |       file <- x$get_partition_fpath(file_ii, full_path = TRUE, type = 'combined')
204 |       ptr_file <- filematrix::fm.open(file, readonly = FALSE)
205 |       
206 |       if(schedule$block_indexed){
207 |         # file exists
208 |         for(ii in seq_along(schedule$schedule_index)){
209 |           schedule_ii <- schedule$schedule_index[[ii]]
210 |           row_number <- blocksize * (schedule_ii-1) + schedule$block_schedule
211 |           sel <- row_number > 0
212 |           ptr_file[row_number[sel], 1] <- value[sel,ii,ff]
213 |         }
214 |       } else {
215 |         # ndim == 2
216 |         row_number <- loc[[1]]
217 |         tryCatch({
218 |           sel <- row_number > 0
219 |           ptr_file[row_number[sel], 1] <- value[sel,ff]
220 |         }, error = function(e){
221 |           ptr_file[, 1] <- value[,ff]
222 |         })
223 |       }
224 |       filematrix::close(ptr_file)
225 |       
226 |     }
227 |     
228 |   }
229 |   
230 |   invisible(x)
231 | }
232 | 
233 | 


--------------------------------------------------------------------------------
/R/class-fstarray.R:
--------------------------------------------------------------------------------
  1 | #' @noRd
  2 | #' @title Internal Class definition for \code{FstArray}
  3 | #' @author Zhengjia Wang
  4 | #' @description Internal class definition of lazy array objects
  5 | FstArray <- R6::R6Class(
  6 |   classname = "FstArray",
  7 |   portable = TRUE,
  8 |   inherit = AbstractLazyArray,
  9 |   private = list(
 10 |     .compress_level = 50,
 11 |     .backend = "fstarray"
 12 |   ),
 13 |   public = list(
 14 |     print = function(...){
 15 |       cat("<FstArray> (", private$.storage_format, ')\n', sep = '')
 16 |       cat('Dimension:\t', paste(sprintf('%d ', private$.dim), collapse = 'x '), '\n')
 17 |       cat('File format: fst\n')
 18 |       invisible(self)
 19 |     },
 20 |     
 21 |     initialize = function(path, dim, storage_format = c('double', 'integer', 'complex', 'character'),
 22 |                           read_only = TRUE, meta_name = 'lazyarray.meta'){
 23 |       private$.file_format <- "fst"
 24 |       private$.compress_level <- 50
 25 |       if(missing(dim)){
 26 |         super$initialize(path = path, storage_format = storage_format, 
 27 |                          read_only = read_only, meta_name = meta_name)
 28 |       } else {
 29 |         super$initialize(path = path, dim = dim, storage_format = storage_format, 
 30 |                          read_only = read_only, meta_name = meta_name)
 31 |       }
 32 |       
 33 |     },
 34 |     has_partition = function(part){
 35 |       stopifnot(length(part) == 1)
 36 |       file <- self$get_partition_fpath(part, full_path = TRUE)
 37 |       cname <- "V1"
 38 |       if(self$storage_format == 'complex'){
 39 |         cname <- c("V1R", "V1I")
 40 |       }
 41 |       if(file.exists(file)){
 42 |         try({
 43 |           return(checkFstMeta(file, expect_nrows = self$partition_length, cnames = cname));
 44 |         }, silent = TRUE)
 45 |       }
 46 |       return(FALSE)
 47 |     },
 48 |     get_partition_data = function(part, reshape = NULL){
 49 |       if(self$has_partition(part)){
 50 |         file <- self$get_partition_fpath(part, full_path = TRUE)
 51 |         
 52 |         if(self$storage_format == 'complex'){
 53 |           cname <- c("V1R", "V1I")
 54 |           re <- fstRetrieve(file, cname, 1L, NULL)
 55 |           re <- re$resTable$V1R + (re$resTable$V1I) * 1i
 56 |         } else {
 57 |           cname <- "V1"
 58 |           re <- fstRetrieve(file, cname, 1L, NULL)
 59 |           re <- re$resTable$V1
 60 |         }
 61 |         if(is.null(reshape)){
 62 |           reshapeOrDrop(re, reshape = self$partition_dim(), drop = FALSE)
 63 |         } else {
 64 |           reshapeOrDrop(re, reshape = as.numeric(reshape), drop = FALSE)
 65 |         }
 66 |         
 67 |         return(re)
 68 |       } else {
 69 |         array(self$sample_na, self$partition_dim())
 70 |       }
 71 |     },
 72 |     
 73 |     `@chunk_map` = function(
 74 |       map_function, max_nchunks = 50, partitions = 'all', ...
 75 |     ){
 76 |       
 77 |       if(!is.function(map_function)){
 78 |         stop("map_function must be a function")
 79 |       }
 80 |       if(length(formals(map_function)) < 2){
 81 |         map_f <- function(data, chunk, idx){
 82 |           map_function(data)
 83 |         }
 84 |       } else if(length(formals(map_function)) < 2){
 85 |         map_f <- function(data, chunk, idx){
 86 |           map_function(data, chunk)
 87 |         }
 88 |       } else {
 89 |         map_f <- map_function
 90 |       }
 91 |       
 92 |       nrows <- self$partition_length
 93 |       
 94 |       # get chunk size
 95 |       chunkf <- make_chunks(nrows, max_nchunks = max_nchunks, ...)
 96 |       if(isTRUE(partitions == 'all')){
 97 |         files <- self$get_partition_fpath()
 98 |         # ncols <- self$npart
 99 |       } else {
100 |         files <- self$get_partition_fpath(partitions)
101 |         # ncols <- length
102 |       }
103 |       
104 |       # partition_locations <- list(
105 |       #   numeric(0),
106 |       #   seq_len(ncols)
107 |       # )
108 |       
109 |       sdata <- self$sample_na
110 |       
111 |       lapply2(seq_len(chunkf$nchunks), function(ii){
112 |         idx_range <- chunkf$get_indices(ii, as_numeric = TRUE)[[1]]
113 |         chunk_data <- lazyLoadOld(files = files, partition_dim = c(nrows, 1), 
114 |                                   partition_locations = list(seq.int(idx_range[[1]], idx_range[[2]]), 1L), 
115 |                                   ndim = 2L, value_type = sdata)
116 |         map_f(chunk_data, ii, idx_range)
117 |       })
118 |     }
119 |     
120 |     
121 |     
122 |   ),
123 |   active = list(
124 |     
125 |     compress_level = function(v){
126 |       if(!missing(v)){
127 |         stopifnot(v >= 0 & v <= 100)
128 |         private$.compress_level <- v
129 |         self$save_meta()
130 |       }
131 |       private$.compress_level
132 |     },
133 |     
134 |     meta = function(){
135 |       list(
136 |         dim = self$dim,
137 |         dimnames = self$dimnames,
138 |         storage_format = self$storage_format,
139 |         file_format = private$.file_format,
140 |         compress_level = private$.compress_level
141 |       )
142 |     }
143 |     
144 |     
145 |     
146 |   )
147 | )
148 | 
149 | 
150 | 
151 | 
152 | #' @export
153 | `[.FstArray` <- function(x, ..., drop = TRUE, reshape = NULL){
154 |   if(!x$is_valid){
155 |     stop("`[.FstArray`: x is no longer valid (data has been removed).")
156 |   }
157 |   if(!is.null(reshape)){
158 |     reshape <- as.numeric(reshape)
159 |     stopifnot(all(reshape>=0))
160 |   }
161 |   drop <- isTRUE(drop)
162 |   
163 |   bsize <- getOption('lazyarray.fstarray.blocksize', -1)
164 |   if(bsize <= 1) {
165 |     bsize = -1
166 |   }
167 |   setLazyBlockSize(bsize)
168 |   
169 |   subsetFST(rootPath = x$storage_path,listOrEnv = environment(),
170 |             dim = x$dim,dtype = x$sexptype,reshape = reshape,drop = drop)
171 | }
172 | 
173 | #' @export
174 | `[<-.FstArray` <- function(x, ..., value){
175 |   if(!x$is_valid){
176 |     stop("`[<-.FstArray`: x is no longer valid (data has been removed).")
177 |   }
178 |   if(!x$can_write){
179 |     stop("`[<-.FstArray`: x is read-only")
180 |   }
181 |   
182 |   parts <- subsetAssignFST(values = value, file = x$storage_path, listOrEnv = environment(),
183 |                   dim = x$dim, dtype = x$sexptype,
184 |                   compression = as.integer(x$compress_level),uniformEncoding = TRUE)
185 |   if(isTRUE(any(parts == -1))){
186 |     x$generate_summary()
187 |   } else {
188 |     parts <- parts[!is.na(parts) & parts > 0 & parts <= x$npart]
189 |     if(length(parts)){
190 |       x$generate_summary(parts)
191 |     }
192 |   }
193 |   # # get 
194 |   # 
195 |   # x$generate_summary()
196 |   invisible(x)
197 | }
198 | 
199 | 


--------------------------------------------------------------------------------
/R/generics.R:
--------------------------------------------------------------------------------
  1 | 
  2 | #' Generate partition summary statistics for array objects along the last 
  3 | #' dimension
  4 | #' @param x an array or \code{LazyArray}
  5 | #' @param na.rm whether to remove \code{NA} when calculating summary statistics
  6 | #' @param ... passed to other methods or ignored
  7 | #' @return A data frame with the following possible columns: \code{Min}, 
  8 | #' \code{Max}, \code{Mean}, \code{Standard Deviation}, \code{NAs} (total number
  9 | #'  of \code{NA}), and \code{Length}.
 10 | #' @name partition_table
 11 | #' @examples 
 12 | #' 
 13 | #' # R array
 14 | #' x <- array(1:27, c(3,3,3))
 15 | #' partition_table(x)
 16 | #' 
 17 | #' # LazyArray
 18 | #' x <- lazyarray(tempfile(), storage_format = 'double', dim = c(3,3,3))
 19 | #' x[] <- 1:27
 20 | #' partition_table(x, quiet=TRUE)
 21 | #' 
 22 | #' @export
 23 | partition_table <- function(x, na.rm = FALSE, ...){
 24 |   UseMethod('partition_table')
 25 | }
 26 | 
 27 | #' @rdname partition_table
 28 | #' @export
 29 | partition_table.array <- function(x, na.rm = FALSE, ...){
 30 |   dim <- dim(x)
 31 |   suppressWarnings({
 32 |     smry <- t(apply(x, length(dim), function(d){
 33 |       c(
 34 |         min(d, na.rm = na.rm),
 35 |         max(d, na.rm = na.rm),
 36 |         mean(d, na.rm = na.rm),
 37 |         sd(d, na.rm = na.rm),
 38 |         sum(is.na(d)),
 39 |         length(d)
 40 |       )
 41 |     }))
 42 |   })
 43 |   smry <- as.data.frame(smry)
 44 |   names(smry) <- c('Min', 'Max', 'Mean', 'Standard Deviation', 'NAs', 'Length')
 45 |   smry
 46 | }
 47 | 
 48 | #' @rdname partition_table
 49 | #' @export
 50 | partition_table.AbstractLazyArray <- function(x, na.rm = FALSE, ...){
 51 |   smry <- summary(x, na.rm = FALSE, ...)
 52 |   re <- smry$partitions
 53 |   re$Count <- re$Length - re$NAs
 54 |   re
 55 | }
 56 | 
 57 | 
 58 | #' Apply function along the last dimension of an array and aggregate the results
 59 | #' @name partition_map
 60 | #' @param x R array or \code{LazyArray}
 61 | #' @param map_fun function that takes in a slice of array and an optional 
 62 | #' argument indicating current partition number
 63 | #' @param reduce function that accept a list of results returned by 
 64 | #' \code{map_fun}, can be missing
 65 | #' @param partitions integers of partitions, i.e. the slices of array to be 
 66 | #' applied to, can be missing. If missing, then applies to all partitions
 67 | #' @param ... internally used
 68 | #' @return If \code{reduce} is missing, returns a list of results. Each result
 69 | #' is returned by \code{map_fun}, and the total length equals to number of 
 70 | #' partitions mapped. If \code{reduce} is a function, that list of results will
 71 | #' be passed to \code{reduce} and \code{partition_map} returns the results 
 72 | #' generated from \code{reduce}.
 73 | #' @examples 
 74 | #' 
 75 | #' # -------------------------- Ordinary R array ---------------------------
 76 | #' 
 77 | #' x <- array(1:24, c(2,3,4))
 78 | #' partition_map(x, function(slice, part){
 79 | #'   sum(slice)
 80 | #' })
 81 | #' 
 82 | #' # When reduce and partitions are missing, the following code is equivalent
 83 | #' as.list(apply(x, 3, sum))
 84 | #' 
 85 | #' # When reduce is present
 86 | #' partition_map(x, function(slice, part){
 87 | #'   sum(slice)
 88 | #' }, function(slice_sum){
 89 | #'   max(unlist(slice_sum))
 90 | #' })
 91 | #' 
 92 | #' # equivalently, we could call
 93 | #' slice_sum <- partition_map(x, function(slice, part){
 94 | #'   sum(slice)
 95 | #' })
 96 | #' max(unlist(slice_sum))
 97 | #' 
 98 | #' # When partition is specified
 99 | #' # Partition 1, 2, and 4 exist but 5 is missing
100 | #' # when a partition is missing, the missing slice will be NA
101 | #' partition_map(x, function(slice, part){
102 | #'   sum(slice)
103 | #' }, partitions = c(1,2,4,5))
104 | #' 
105 | #' # -------------------------- LazyArray ---------------------------
106 | #' x <- lazyarray(tempfile(), storage_format = 'complex', dim = c(2,3,4))
107 | #' x[] <- 1:24 + (24:1) * 1i
108 | #' 
109 | #' partition_map(x, function(slice, part){
110 | #'   slice[1, ,] * slice[2, ,]
111 | #' }, reduce = function(mapped_prod){
112 | #'   mean(unlist(mapped_prod))
113 | #' })
114 | #' 
115 | #' 
116 | #' 
117 | #' @export
118 | partition_map <- function(x, map_fun, reduce, partitions, ...){
119 |   UseMethod('partition_map')
120 | }
121 | 
122 | #' @export
123 | partition_map.array <- function(x, map_fun, reduce, partitions, ...){
124 |   if(length(formals(map_fun)) == 1){
125 |     mfun <- function(x, part){
126 |       map_fun(x)
127 |     }
128 |   } else {
129 |     mfun <- map_fun
130 |   }
131 |   dim <- dim(x)
132 |   available_partitions <- seq_len(dim[[length(dim)]])
133 |   substr <- paste(rep('', length(dim)), collapse = ',')
134 |   
135 |   if(missing(partitions)){
136 |     partitions <- available_partitions
137 |   }
138 |   
139 |   res <- lapply(partitions, function(part){
140 |     if(part %in% available_partitions){
141 |       slice <- eval(parse(text = sprintf('x[%s%d,drop=FALSE]', substr, part)))
142 |     } else {
143 |       slice <- array(NA, c(dim[-length(dim)], 1))
144 |     }
145 |     mfun(slice, part)
146 |   })
147 |   
148 |   if(!missing(reduce) && is.function(reduce)){
149 |     res <- reduce(res)
150 |   }
151 |   
152 |   res
153 | }
154 | 
155 | #' @export
156 | partition_map.AbstractLazyArray <- function(x, map_fun, reduce, partitions, further_split = FALSE, ...){
157 |   if(missing(partitions)){
158 |     partitions <- seq_len(x$npart)
159 |   } else {
160 |     partitions <- as.integer(partitions)
161 |     partitions <- partitions[partitions > 0 & partitions <= x$npart]
162 |   }
163 |   
164 |   if(length(formals(map_fun)) == 1){
165 |     mfun <- function(x, part){
166 |       map_fun(x)
167 |     }
168 |   } else {
169 |     mfun <- map_fun
170 |   }
171 |   
172 |   mapped <- lapply2(partitions, function(part){
173 |     mfun(x$get_partition_data(part), part)
174 |   })
175 |   
176 |   if(!missing(reduce)){
177 |     mapped <- reduce(mapped)
178 |   }
179 |   mapped
180 |   
181 | }
182 | 
183 | 
184 | #' Apply functions to all partitions, but small chunks each time
185 | #' @seealso \code{\link{partition_map}}
186 | #' @param x a \code{LazyArray} or R array
187 | #' @param map_fun function to apply to each chunk
188 | #' @param reduce similar to \code{reduce} in \code{\link{partition_map}}
189 | #' @param max_nchunks maximum number of chunks. If number of chunks is too 
190 | #' large, then \code{chunk_size} will be re-calculated.
191 | #' @param chunk_size integer chunk size. If \code{chunk_size} is too small, it 
192 | #' will be ignored
193 | #' @param ... ignored or passed to other methods
194 | #' @return If \code{reduce} is missing, returns a list of results. Each result
195 | #' is returned by \code{map_fun}, and the total length equals to number of 
196 | #' chunks mapped. If \code{reduce} is a function, that list of results will
197 | #' be passed to \code{reduce} and \code{chunk_map} returns the results 
198 | #' generated from \code{reduce}.
199 | #' @details The difference between \code{chunk_map} and 
200 | #' \code{partition_map} is the margin or direction to apply mapping
201 | #' functions. In \code{partition_map}, mapping function is applied to 
202 | #' each partition. If \code{x} is a matrix, this means applying to each column.
203 | #' \code{chunk_map} generate small chunks along all dimensions except the last,
204 | #' and apply mapping functions to each chunks. If \code{x} is a matrix, it 
205 | #' make chunks along rows and apply mapping functions along rows. 
206 | #' @examples 
207 | #' 
208 | #' x <- as.lazymatrix(matrix(1:100, ncol = 2))
209 | #' x
210 | #' 
211 | #' # Set max_nchunks=Inf and chunk_size=10 to force total number of chunks
212 | #' # is around nrow(x)/10 and each chunk contains at most 10 rows
213 | #' chunk_map(x, function(chunk){chunk[1:2,]}, chunk_size = 10, max_nchunks = Inf)
214 | #' 
215 | #' # For each chunks, calculate mean, then calculate the mean of chunk mean
216 | #' chunk_map(x, function(chunk) {
217 | #'   colMeans(chunk)
218 | #' }, function(chunk_means) {
219 | #'   Reduce('+', chunk_means) / length(chunk_means)
220 | #' })
221 | #' 
222 | #' colMeans(x[])
223 | #' 
224 | #' 
225 | #' @export
226 | chunk_map <- function(x, map_fun, reduce, max_nchunks, chunk_size, ...){
227 |   UseMethod('chunk_map')
228 | }
229 | 
230 | 
231 | #' @export
232 | chunk_map.AbstractLazyArray <- function(x, map_fun, reduce, max_nchunks, chunk_size, partitions = 'all', ...){
233 |   
234 |   if(missing(max_nchunks)){
235 |     # calculate such that each chunk size is at most 0.5GB
236 |     max_nchunks <- auto_chunks(x)
237 |   }
238 |   new_x <- as.lazymatrix(x)
239 |   new_x$make_readonly()
240 |   
241 |   if(missing(chunk_size)){
242 |     chunk_size <- 1024L
243 |   }
244 |   mapped <- x$`@chunk_map`(map_function = map_fun, max_nchunks = max_nchunks, chunk_size = chunk_size, partitions = partitions)
245 |   
246 |   if(!missing(reduce)){
247 |     mapped <- reduce(mapped)
248 |   }
249 |   
250 |   return(mapped)
251 | }
252 | 
253 | 
254 | 
255 | 


--------------------------------------------------------------------------------
/R/utils.R:
--------------------------------------------------------------------------------
  1 | dir_create <- function(path, showWarnings = FALSE, recursive = TRUE, ...){
  2 |   dir.create(path = path, showWarnings = showWarnings, recursive = recursive, ...)
  3 | }
  4 | 
  5 | load_yaml <- function(path, ...){
  6 |   read_yaml(path, ...)
  7 | }
  8 | 
  9 | 
 10 | save_yaml <- function(x, path, ...){
 11 |   write_yaml(x, path, ...)
 12 | }
 13 | 
 14 | 
 15 | deparse1 <- function(..., collapse = ''){
 16 |   paste0(deparse(...), collapse = collapse)
 17 | }
 18 | 
 19 | rand_string <- function(length = 10){
 20 |   paste(sample(c(letters, LETTERS, 0:9), length, replace = TRUE), collapse = '')
 21 | }
 22 | 
 23 | make_chunks <- function(dim, chunk_size, max_nchunks = 200, recursive = FALSE){
 24 |   max_nchunks <- floor(max_nchunks)
 25 |   len <- prod(dim)
 26 |   drange <- lapply(dim, function(d){ c(1, d) })
 27 |   
 28 |   
 29 |   if(len == 0){
 30 |     return(list(nchunks = 0, get_indices = function(i, as_numeric = FALSE){
 31 |       if(as_numeric){ return(NULL) }
 32 |       paste(rep('', length(dim)), collapse = ',')
 33 |     }))
 34 |   }
 35 |   
 36 |   if(missing(chunk_size)){
 37 |     chunk_size <- getOption('lazyarray.chunk_memory', 80) * 125000
 38 |   }
 39 |   
 40 |   
 41 |   if(len <= chunk_size ){
 42 |     return(list(nchunks = 1, get_indices = function(i, as_numeric = FALSE){
 43 |       if(as_numeric){ return( drange ) }
 44 |       paste(rep('', length(dim)), collapse = ',')
 45 |     }))
 46 |   }
 47 |   
 48 |   lastdim <- dim[length(dim)]
 49 |   
 50 |   if( len < chunk_size * max_nchunks ){
 51 |     max_nchunks <- ceiling(len / chunk_size);
 52 |     if( chunk_size * max_nchunks < len ){
 53 |       max_nchunks <- max_nchunks + 1
 54 |     }
 55 |   }
 56 |   
 57 |   if(!recursive && lastdim < max_nchunks){
 58 |     max_nchunks <- lastdim
 59 |   }
 60 |   
 61 |   if(lastdim >= max_nchunks){
 62 |     nchunks <- max_nchunks
 63 |     m <- ceiling(lastdim / max_nchunks)
 64 |     x2 <- m * nchunks - lastdim
 65 |     x1 <- nchunks - x2
 66 |     return(list(
 67 |       nchunks = nchunks,
 68 |       get_indices = function(i, as_numeric = FALSE){
 69 |         if( i <= x1 ){
 70 |           s <- (i - 1) * m + 1
 71 |           e <- i * m
 72 |         } else {
 73 |           s <- x1 * m + (i-x1-1) * (m-1) + 1
 74 |           e <- x1 * m + (i-x1) * (m-1)
 75 |         }
 76 |         if(as_numeric){
 77 |           re <- drange
 78 |           re[[length(dim)]] <- c(s, e)
 79 |           return(re)
 80 |         } else {
 81 |           re <- paste(rep('', length(dim)), collapse = ',')
 82 |           if(s == e){
 83 |             return(sprintf('%s%d', re, s))
 84 |           } else {
 85 |             return(sprintf('%s%d:%d', re, s, e))
 86 |           }
 87 |         }
 88 |         
 89 |         
 90 |       }
 91 |     ))
 92 |   }
 93 |   
 94 |   # lastdim < max_nchunks and recursive
 95 |   if( lastdim > max_nchunks / 2 ){
 96 |     return(list(
 97 |       nchunks = lastdim,
 98 |       get_indices = function(i, as_numeric = FALSE){
 99 |         if(as_numeric){
100 |           re <- drange
101 |           re[[length(dim)]] <- c(i, i)
102 |           return(re)
103 |         } else {
104 |           re <- paste(rep('', length(dim)), collapse = ',')
105 |           sprintf('%s%d', re, i)
106 |         }
107 |       }
108 |     ))
109 |   }
110 |   re <-
111 |     Recall(
112 |       dim[-length(dim)],
113 |       chunk_size = chunk_size,
114 |       max_nchunks = max_nchunks / lastdim,
115 |       recursive = FALSE
116 |     )
117 |   
118 |   nchunks = re$nchunks * lastdim
119 |   get_indices <- function(i, as_numeric = FALSE){
120 |     i1 <- floor((i - 1) / lastdim) + 1
121 |     i2 <- i - lastdim * (i1-1)
122 |     
123 |     s <- re$get_indices(i1, as_numeric = as_numeric)
124 |     
125 |     if(as_numeric){
126 |       s[[length(dim)]] <- c(i2, i2)
127 |       return(s)
128 |     } else {
129 |       return(sprintf('%s,%d', s, i2))
130 |     }
131 |     
132 |   }
133 |   
134 |   return(list(
135 |     nchunks = nchunks,
136 |     get_indices = get_indices
137 |   ))
138 |   
139 | }
140 | 
141 | lapply2 <- function(x, FUN, ...){
142 |   if( length(x) > 1 && has_dipsaus() ){
143 |     dipsaus::lapply_async2(x, FUN, FUN.args = list(...), plan = getOption('lazyarray.parallel.strategy', FALSE))
144 |   } else {
145 |     lapply(x, FUN, ...)
146 |   }
147 | }
148 | 
149 | 
150 | auto_chunks <- function(x, limit = 0.5){
151 |   files <- x$get_partition_fpath()
152 |   if(length(files)){
153 |     fct <- mean(file.exists(files))
154 |   } else {
155 |     fct <- 1
156 |   }
157 |   
158 |   max_nchunks <- x$filesize / limit
159 |   if( fct > 0 ){
160 |     max_nchunks <- max_nchunks / fct
161 |   }
162 |   max_nchunks <- max(ceiling(max_nchunks), 1L)
163 |   max_nchunks
164 | }
165 | 


--------------------------------------------------------------------------------
/R/zzz.R:
--------------------------------------------------------------------------------
  1 | # Rcpp::loadModule('LazyArrayModules', TRUE)
  2 | 
  3 | .onUnload <- function (libpath) {
  4 |   library.dynam.unload("lazyarray", libpath)
  5 | }
  6 | 
  7 | 
  8 | .onLoad <- function(libname, pkgname){
  9 |   options('lazyarray.parallel.strategy' = FALSE)
 10 |   options('lazyarray.chunk_memory' = 80)
 11 |   options('lazyarray.fstarray.blocksize' = -1)
 12 |   
 13 |   ncores <- parallel::detectCores(logical = TRUE)
 14 |   options('lazyarray.nthreads' = ncores)
 15 |   set_lazy_threads(ncores, TRUE)
 16 | }
 17 | 
 18 | 
 19 | #' Schedule parallel processes for \code{LazyArray}
 20 | #' @description Enable parallel processing, need \code{dipsaus} to be installed.
 21 | #' For \code{"callr"} strategy, please also install \code{future.callr}.
 22 | #' @param enabled whether multiple-process strategy is enabled
 23 | #' @param strategy strategies to apply, see \code{\link[future]{plan}} for
 24 | #' some of the details. For \code{"callr"} plan, please install package
 25 | #' @param workers number of 'CPU' cores to use. 
 26 | #' \code{future.callr}.
 27 | #' @param workers positive integer or \code{"auto"}, number of 'CPU' to use.
 28 | #' The default value is \code{"auto"}, i.e. \code{future::availableCores()}
 29 | #' @param ... Further passed to \code{\link[future]{plan}}
 30 | #' 
 31 | #' @export
 32 | lazy_parallel <- function(
 33 |   strategy = c(
 34 |     'multisession', 'multicore', 
 35 |     'multiprocess', 'cluster', 'remote', 'callr'),
 36 |   enabled = TRUE, workers = 'auto',
 37 |   ...
 38 |   ){
 39 |   
 40 |   options('lazyarray.parallel.strategy' = FALSE)
 41 |   strategy <- match.arg(strategy)
 42 |   if(!has_dipsaus()){
 43 |     stop('Package dipsaus not detected. Please install.packages("dipsaus")')
 44 |   }
 45 |   
 46 |   if(isTRUE(workers == 'auto')){
 47 |     # get maximum available workers
 48 |     workers <- future::availableCores()
 49 |   }
 50 |   
 51 |   if(enabled){
 52 |     
 53 |     if(strategy == 'multicore'){
 54 |       dipsaus::make_forked_clusters(..., workers = workers)
 55 |     } else if(strategy == 'callr'){
 56 |       future::plan(future.callr::callr, ..., workers = workers)
 57 |     } else {
 58 |       args <- list(...)
 59 |       tryCatch({
 60 |         future::plan(strategy, ..., workers = workers)
 61 |       }, error = function(e){
 62 |         do.call(future::plan, c(list(strategy), args))
 63 |       })
 64 |     }
 65 |     
 66 |   } else {
 67 |     future::plan('sequential')
 68 |   }
 69 |   
 70 |   invisible()
 71 | }
 72 | 
 73 | setOldClass(c('FstArray', 'AbstractLazyArray', 'R6'))
 74 | setOldClass(c('FileArray', 'AbstractLazyArray', 'R6'))
 75 | 
 76 | 
 77 | setGeneric("typeof")
 78 | 
 79 | 
 80 | #' Type of \code{LazyArray}
 81 | #' @param x a \code{LazyArray} or an R object
 82 | #' @return The type of data stored in the input
 83 | #' @exportMethod typeof
 84 | setMethod("typeof", signature(x="AbstractLazyArray"), function(x){
 85 |   x$storage_format
 86 | })
 87 | 
 88 | 
 89 | setGeneric("crossprod")
 90 | # setGeneric("tcrossprod")
 91 | 
 92 | #' Matrix Crossproduct
 93 | #' @param x a \code{LazyArray} or an R matrix
 94 | #' @param y \code{NULL} or matrix
 95 | #' @param weights numeric vector used as weight
 96 | #' @param ... passed to further methods
 97 | #' @return Matrix of cross product if data is small, or \code{LazyMatrix} if
 98 | #' matrix is too large
 99 | #' @name crossprod
100 | #' 
101 | #' @examples 
102 | #' 
103 | #' x <- matrix(1:100, 50)
104 | #' crossprod(x)
105 | #' 
106 | #' lazy_x <- as.lazymatrix(x)
107 | #' crossprod(lazy_x)[]
108 | #' 
109 | #' weights <- (1:50)/50
110 | #' 
111 | #' t(x) %*% diag(weights) %*% x
112 | #' crossprod(lazy_x, weights = weights)
113 | #' 
114 | #' \dontrun{
115 | #' 
116 | #' # large data set ~ 1.6GB
117 | #' x <- as.lazymatrix(matrix(rnorm(2e8), ncol = 2))
118 | #' 
119 | #' crossprod(x)
120 | #' }
121 | #' 
122 | #' 
123 | NULL
124 | 
125 | #' @rdname crossprod
126 | #' @exportMethod crossprod
127 | setMethod("crossprod", signature(x="AbstractLazyArray", y = 'AbstractLazyArray'), function(x, y = NULL, weights = NULL, ...){
128 |   lazy_crossprod(x, y, weights = weights, ...)
129 | })
130 | 
131 | #' @rdname crossprod
132 | #' @exportMethod crossprod
133 | setMethod("crossprod", signature(x="AbstractLazyArray", y = 'NULL'), function(x, y = NULL, weights = NULL, ...){
134 |   lazy_crossprod(x, NULL, weights = weights, ...)
135 | })
136 | 
137 | #' @rdname crossprod
138 | #' @exportMethod crossprod
139 | setMethod("crossprod", signature(x="AbstractLazyArray", y = "missing"), function(x, y = NULL, weights = NULL, ...){
140 |   lazy_crossprod(x, NULL, weights = weights, ...)
141 | })
142 | 
143 | #' @rdname crossprod
144 | #' @exportMethod crossprod
145 | setMethod("crossprod", signature(x="AbstractLazyArray", y = 'matrix'), function(x, y = NULL, weights = NULL, ...){
146 |   if(!is.null(weights)){
147 |     stopifnot(length(weights) == x$partition_length)
148 |     res <- lapply(seq_len(x$npart), function(ii){
149 |       x$get_partition_data(ii, reshape = c(1, x$partition_length)) %*% (y * weights)
150 |     })
151 |   } else {
152 |     res <- lapply(seq_len(x$npart), function(ii){
153 |       x$get_partition_data(ii, reshape = c(1, x$partition_length)) %*% y
154 |     })
155 |   }
156 |   
157 |   do.call('rbind', res)
158 | })
159 | 
160 | 
161 | lazy_crossprod <- function(x, y = NULL, weights = NULL, ...){
162 |   
163 |   if(!is.null(weights)){
164 |     stopifnot(length(weights) == x$partition_length)
165 |   }
166 |   
167 |   new_x <- as.lazymatrix(x)
168 |   new_x$make_readonly()
169 |   if(is.null(y)){
170 |     yisx <- TRUE
171 |     new_y <- new_x
172 |   } else {
173 |     yisx <- isTRUE(x$storage_path == y$storage_path && x$get_file_format() == y$get_file_format())
174 |     new_y <- as.lazymatrix(y)
175 |   }
176 |   
177 |   if(length(weights)){
178 |     ftile <- filematrix::fm.create(tempfile(), nrow = length(weights), ncol = 1)
179 |     ftile[] <- weights
180 |     on.exit(filematrix::close(ftile))
181 |     
182 |     chunk_map(new_x, map_fun = function(data, ii, idx_range){
183 |       idx <- seq.int(idx_range[[1]], idx_range[[2]])
184 |       if(yisx){
185 |         return(crossprod(data, data * as.vector(ftile[idx,1])))
186 |       } else {
187 |         sub_y <- y[idx,,drop=FALSE] * as.vector(ftile[idx,1])
188 |         return(crossprod(data, sub_y))
189 |       }
190 |     }, reduce = function(mapped){
191 |       Reduce('+', mapped)
192 |     }, ...)
193 |     
194 |   } else {
195 |     chunk_map(new_x, map_fun = function(data, ii, idx_range){
196 |       
197 |       if(yisx){
198 |         return(crossprod(data))
199 |       } else {
200 |         sub_y <- y[seq.int(idx_range[[1]], idx_range[[2]]),,drop=FALSE]
201 |         return(crossprod(data, sub_y))
202 |       }
203 |     }, reduce = function(mapped){
204 |       Reduce('+', mapped)
205 |     }, ...)
206 |   }
207 |   
208 | }
209 | 


--------------------------------------------------------------------------------
/cran-comments.md:
--------------------------------------------------------------------------------
 1 | Current submission 1.1.0:
 2 | 
 3 | Self check, 0 errors, 0 warnings, 0 notes
 4 | 
 5 | Changes:
 6 | * Explicitly import `R6Class` to clear the note on [CRAN check website](https://cran.r-project.org/web/checks/check_results_lazyarray.html)
 7 | * Added `little-endian platform` to `SystemRequirements`
 8 | * Added the compression method and backend into `DESCRIPTION`: 
 9 | > The internal storage format is provided by 'fstcore' package geared by 'LZ4' and 'ZSTD' compressors.
10 | 
11 | 
12 | Last submission 1.0.0: passed
13 | 
14 | Comments
15 | ```
16 | For your next update:
17 | If there are references describing the (theoretical background of)
18 | methods or algorithms in your package, please add these in the
19 | Description field of your DESCRIPTION file in the form
20 | authors (year) <doi:...>
21 | authors (year) <arXiv:...>
22 | authors (year, ISBN:...)
23 | with no space after 'doi:', 'arXiv:' and angle brackets for auto-linking.
24 | ```
25 | 
26 | 


--------------------------------------------------------------------------------
/experiment/create_array.R:
--------------------------------------------------------------------------------
  1 | # create arrays on disk
  2 | 
  3 | #' @title Create a lazy-array with given format and dimension
  4 | #' @author Zhengjia Wang
  5 | #' @description Create a directory to store lazy-array. The path must be missing. See \code{\link{load_lazyarray}} for more details
  6 | #' @param path path to a local drive to store array data
  7 | #' @param storage_format data type, choices are \code{"double"}, 
  8 | #' \code{"integer"}, \code{"character"}, and \code{"complex"}
  9 | #' @param dim integer vector, dimension of array, see \code{\link{dim}}
 10 | #' @param dimnames list of vectors, names of each dimension, see \code{\link{dimnames}}
 11 | #' @param compress_level 0 to 100, level of compression. 0 means
 12 | #' no compression, 100 means maximum compression. For persistent data,
 13 | #' it's recommended to set 100. Default is 50.
 14 | #' @param meta_name header file name, default is \code{"lazyarray.meta"}
 15 | #' @return A \code{ClassLazyArray} instance
 16 | #' @details Lazy array stores array into hard drive, and load them on
 17 | #' demand. It uses multi-thread which gains significant 
 18 | #' speed boost on solid state drives. 
 19 | #' 
 20 | #' For examples, see \code{\link{lazyarray}}.
 21 | #' 
 22 | #' @export
 23 | create_lazyarray <- function(
 24 |   path, storage_format, dim, dimnames = NULL, compress_level = 50L, 
 25 |   meta_name = 'lazyarray.meta'){
 26 |   
 27 |   if(dir.exists(path)){
 28 |     stop("Path already exists.")
 29 |   }
 30 |   multipart <- TRUE
 31 |   multipart_mode <- 1L
 32 |   
 33 |   stopifnot(compress_level <= 100 & compress_level >= 0)
 34 |   
 35 |   if(length(dim) < 1){
 36 |     stop("length(dim) must be at least 2")
 37 |   }
 38 |   
 39 |   if(!is.list(dimnames) && !is.null(dimnames)){
 40 |     stop("dimnames must be a list or NULL")
 41 |   }
 42 |   
 43 |   # check if dim matches with dimnames
 44 |   if(!is.null(dimnames)){
 45 |     dnl <- sapply(dimnames, length)
 46 |     if(length(dnl) != length(dim) || !all(dnl - dim == 0)){
 47 |       stop("Invalid dimension")
 48 |     }
 49 |   }
 50 |   
 51 |   stopifnot(storage_format %in% c('character', 'double', 'integer', 'complex'))
 52 |   
 53 |   part_dimension <- dim
 54 |   part_dimension[length(dim)] <- 1
 55 |   
 56 |   #####
 57 |   meta <- list(
 58 |     lazyarray_version = 0,
 59 |     file_format = 'fst',
 60 |     storage_format = storage_format,
 61 |     dim = dim,
 62 |     dimnames = dimnames,
 63 |     # partitioned = multipart,
 64 |     # prefix = prefix,
 65 |     part_dimension = part_dimension,
 66 |     postfix = '.fst',
 67 |     compress_level = compress_level
 68 |     # file_names = file_names
 69 |   )
 70 |   
 71 |   dir.create(path, showWarnings = TRUE, recursive = TRUE)
 72 |   path <- normalizePath(path, mustWork = TRUE)
 73 |   
 74 |   meta_path <- file.path(path, meta_name)
 75 |   save_yaml(meta, meta_path)
 76 |   
 77 |   ClassLazyArray$new(path = path, read_only = FALSE, meta_name = meta_name)
 78 |   
 79 | }
 80 | 
 81 | 
 82 | 
 83 | #' @title Load Lazy Array from Given Path
 84 | #' @author Zhengjia Wang
 85 | #' @param path character, path of the array
 86 | #' @param read_only whether setting data is allowed
 87 | #' @param meta_name header file name, default is \code{"lazyarray.meta"}
 88 | #' @return A \code{ClassLazyArray} instance
 89 | #' @examples 
 90 | #' 
 91 | #' path <- tempfile()
 92 | #' create_lazyarray(path, 'double', dim = c(3,4,5))
 93 | #' 
 94 | #' x <- load_lazyarray(path, read_only = FALSE)
 95 | #' x[2,3:4, 2:1] <- 1:4
 96 | #' x[ , , 2]
 97 | #' 
 98 | #' # Changing dimension for multiple partition data only
 99 | #' dim(x) <- c(3,4,6)
100 | #' dimnames(x) <- list(dim1 = as.character(1:3),
101 | #'                     dim2 = letters[1:4], 
102 | #'                     dim3 = LETTERS[1:6])
103 | #' names(dimnames(x))
104 | #' subset(x, dim1 ~ dim1 == '2', dim2 ~ dim2 %in% c('a', 'c'), drop = TRUE)
105 | #' 
106 | #' # Free up space
107 | #' x$remove_data()
108 | #' 
109 | #' \donttest{
110 | #' 
111 | #' # This example needs at least 4 GB hard disk space and it takes
112 | #' # time to run for performance profile
113 | #' 
114 | #' # Speed test
115 | #' path <- tempfile()
116 | #' x <- create_lazyarray(path, 'complex', dim = c(100,200,300,20))
117 | #' 
118 | #' # automatically call x$remove_data() upon garbage collection
119 | #' x$flag_auto_clean(TRUE)
120 | #' 
121 | #' 
122 | #' # set data (4 GB data) using 4 cores, compression level 50
123 | #' # data creation ~10 s, disk IO ~15-20 seconds, ~250MB/s
124 | #' 
125 | #' system.time({
126 | #'   lapply(1:20, function(ii){
127 | #'     # Generating partition data (~10 sec)
128 | #'     tmp <- rnorm(100*200*300) * (1+2i)
129 | #'     
130 | #'     # Write to disk (~16 sec)
131 | #'     x[,,,ii] <- tmp
132 | #'     NULL
133 | #'   })
134 | #' })
135 | #' 
136 | #' # Reading 64 MB data using 4 cores
137 | #' # ~0.25 seconds
138 | #' 
139 | #' system.time({
140 | #'   x[1:100, sample(200, 200), 100:1, 2:4]
141 | #' })
142 | #' 
143 | #' # This call requires 4GB of RAM
144 | #' # Reading all 4GB data using 4 cores
145 | #' # ~4 seconds (1 GB/s)
146 | #' 
147 | #' system.time({
148 | #'   x[]
149 | #' })
150 | #' 
151 | #' }
152 | #' 
153 | #' @export
154 | load_lazyarray <- function(path, read_only = TRUE, meta_name = 'lazyarray.meta'){
155 |   path <- normalizePath(path, mustWork = TRUE)
156 |   ClassLazyArray$new(path = path, read_only = read_only, meta_name = meta_name)
157 | }
158 | 


--------------------------------------------------------------------------------
/experiment/data/.gitignore:
--------------------------------------------------------------------------------
1 | *.rds
2 | 


--------------------------------------------------------------------------------
/experiment/funcs-lm.R:
--------------------------------------------------------------------------------
  1 | # x <- rnorm(100); dim(x) = c(20,5)
  2 | # x <- as.lazymatrix(x)
  3 | # colnames(x) <- paste0('V', 1:5)
  4 | # rownames(x) <- 1:20
  5 | # dimnames(x)
  6 | # data = x
  7 | # formula <- V1+V3 * V1 ~ (.-V3)*V4
  8 | # weights = NULL
  9 | 
 10 | #' @name lazy_lm
 11 | #' @title Perform linear regression on \code{LazyMatrix} object
 12 | #' @param data \code{LazyMatrix} object
 13 | #' @param formula R formula
 14 | #' @param yidx column indices for response variable
 15 | #' @param xidx column indices for predictor
 16 | #' @param weights weights of observations
 17 | #' @param intercept whether to include intercept
 18 | #' @param na.action see \code{\link[stats]{lm}}
 19 | #' @param center,scale logical or numeric vectors with length equaling to 
 20 | #' data columns
 21 | #' @param residuals whether to return residuals
 22 | #' @param ... passed to \code{\link{chunk_map}}
 23 | #' @return A \code{"lm"} or \code{"mlm"} object
 24 | #' 
 25 | #' @examples 
 26 | #' 
 27 | #' a <- matrix(rnorm(100), ncol = 5)
 28 | #' x <- as.lazymatrix(a)
 29 | #' 
 30 | #' 
 31 | #' stats_lm <- stats::lm.fit(x = x[,-1], y = x[,1])$coefficients
 32 | #' 
 33 | #' lazy_lms <- lazy_lm_simple(x, yidx = 1, intercept = FALSE)
 34 | #' lazy_lms
 35 | #' 
 36 | #' 
 37 | #' # lazy_lm requires column names
 38 | #' colnames(x) <- c('y', 'x1', 'x2', 'x3', 'x4')
 39 | #' lazy_lm <- lazy_lm(y~.-1, data = x)
 40 | #' lazy_lm
 41 | #' 
 42 | #' 
 43 | #' 
 44 | #' @export
 45 | lazy_lm <- function(formula, data, weights = NULL, na.action = 'na.pass', 
 46 |                     center = FALSE, scale = FALSE, residuals = FALSE,...){
 47 |   
 48 |   data <- as.lazymatrix(data, read_only = TRUE, storage_format = 'double')
 49 |   
 50 |   if(data$`@transposed`){
 51 |     stop('data must be column-major to perform lazy_lm (otherwise this would be a very slow process)')
 52 |   }
 53 |   
 54 |   cnames <- dimnames(data)
 55 |   cnames <- cnames[[length(cnames)]]
 56 |   if(!length(cnames)){
 57 |     stop("lazy_lm cannot find column or partition names")
 58 |   }
 59 |   
 60 |   na.omit = TRUE
 61 |   smry_table <- partition_table(data)
 62 |   if(isTRUE(center)){
 63 |     center <- smry_table$Mean
 64 |   }
 65 |   
 66 |   if(isTRUE(scale)){
 67 |     scale <- smry_table$`Standard Deviation`
 68 |   }
 69 |   
 70 |   nobs <- nrow(data)
 71 |   force(na.action)
 72 |   # Create fake data to build model
 73 |   fake_data <- data[seq_len(ncol(data)),,drop=FALSE]
 74 |   fake_data <- as.data.frame(fake_data)
 75 |   names(fake_data) <- cnames
 76 |   mf <- stats::model.frame(formula, data = fake_data, na.action = na.action)
 77 |   mt <- attr(mf, "terms")
 78 |   y <- model.response(mf, "numeric")
 79 |   
 80 |   mlm <- is.matrix(y)
 81 |   ny <- if (mlm)  nrow(y) else length(y)
 82 |   nyr <- if (mlm)  ncol(y) else 1L
 83 |   
 84 |   if (is.empty.model(mt)) {
 85 |     x <- NULL
 86 |     z <- list(coefficients = if (mlm) matrix(NA_real_, 0, ny) else numeric(), rank = 0L)
 87 |     
 88 |     # TODO: add termsand formula
 89 |     return(z)
 90 |   }
 91 |   
 92 |   simple_fm <- formula
 93 |   simple_fm[[3]] <- quote(0)
 94 |   
 95 |   if(!is.null(weights)){
 96 |     weights <- as.lazymatrix(weights, storage_format = 'double')
 97 |   }
 98 |   
 99 |   
100 |   map_f <- function(chunk_x, chunk_number, chunk_idxrange){
101 |     if(!isFALSE(center) || !isFALSE(scale)){
102 |       chunk_x <- t(chunk_x)
103 |       if( !isFALSE(center) ){
104 |         # need to center chunk
105 |         chunk_x <- chunk_x - center
106 |       }
107 |       if( !isFALSE(scale) ){
108 |         # need to scale
109 |         chunk_x <- chunk_x / scale
110 |       }
111 |       chunk_x <- t(chunk_x)
112 |     }
113 |     chunk_x <- as.data.frame(chunk_x)
114 |     names(chunk_x) <- cnames
115 |     
116 |     mf <- stats::model.frame(simple_fm, data = chunk_x, na.action = na.action)
117 |     y <- model.response(mf, "numeric")
118 |     suppressWarnings({
119 |       x <- model.matrix(mt, chunk_x, contrasts)
120 |     })
121 |     
122 |     chunk_x <- cbind(y, x)
123 |     
124 |     if(!is.null(weights)){
125 |       cross <- crossprod(chunk_x, chunk_x * (weights[seq.int(chunk_idxrange[1], chunk_idxrange[2]), 1, drop = TRUE] / nobs))
126 |     } else {
127 |       cross <- crossprod(chunk_x, chunk_x / nobs)
128 |     }
129 |     cross
130 |     
131 |   }
132 |   
133 |   cross <- chunk_map(data, map_function = map_f, 
134 |                      reduce = function(mapped){
135 |                        Reduce('+', mapped)
136 |                      }, ...)
137 |   coef <- solve(cross[-seq_len(nyr), -seq_len(nyr)]) %*% cross[-seq_len(nyr),seq_len(nyr)]
138 |   
139 |   ret.residuals <- residuals
140 |   residuals <- NULL
141 |   if(ret.residuals){
142 |     residuals <- chunk_map(data, function(chunk_x, chunk_number, chunk_idxrange){
143 |       if(!isFALSE(center) || !isFALSE(scale)){
144 |         chunk_x <- t(chunk_x)
145 |         if( !isFALSE(center) ){
146 |           # need to center chunk
147 |           chunk_x <- chunk_x - center
148 |         }
149 |         if( !isFALSE(scale) ){
150 |           # need to scale
151 |           chunk_x <- chunk_x / scale
152 |         }
153 |         chunk_x <- t(chunk_x)
154 |       }
155 |       chunk_x <- as.data.frame(chunk_x)
156 |       names(chunk_x) <- cnames
157 |       
158 |       mf <- stats::model.frame(simple_fm, data = chunk_x, na.action = na.action)
159 |       y <- model.response(mf, "numeric")
160 |       suppressWarnings({
161 |         x <- model.matrix(mt, chunk_x, contrasts)
162 |       })
163 |       
164 |       y - x %*% coef
165 |     }, function(res){
166 |       unlist(res)
167 |     }, ...)
168 |   }
169 |   
170 |   
171 |   z <- list(
172 |     coefficients = drop(coef),
173 |     na.action = na.action,
174 |     offset = 0,
175 |     call = match.call(),
176 |     terms = mt,
177 |     residuals = residuals
178 |   )
179 |   class(z) <- c(if (mlm) "mlm", "lm")
180 |   z
181 | }
182 | 
183 | #' @rdname lazy_lm
184 | #' @export
185 | lazy_lm_simple <- function(data, yidx, xidx, intercept = TRUE, weights = NULL, residuals = FALSE, ...){
186 |   
187 |   data <- as.lazymatrix(data, read_only = TRUE, storage_format = 'double')
188 |   
189 |   if(data$`@transposed`){
190 |     stop('data must be column-major to perform lazy_lm_simple (otherwise this would be a very slow process)')
191 |   }
192 |   
193 |   
194 |   cidx <- seq_len(ncol(data))
195 |   nobs <- nrow(data)
196 |   
197 |   stopifnot(all(yidx %in% cidx))
198 |   
199 |   if(missing(xidx)){
200 |     xidx <- cidx[!cidx %in% yidx]
201 |   }
202 |   
203 |   if(!length(xidx) && !intercept){
204 |     x <- NULL
205 |     z <- list(coefficients = if (length(yidx) > 1) matrix(NA_real_, 0, length(yidx)) else numeric(), rank = 0L)
206 |     
207 |     return(z)
208 |   }
209 |   
210 |   
211 |   if(!is.null(weights)){
212 |     stopifnot(length(weights) == nobs)
213 |     weights <- as.lazymatrix(weights, storage_format = 'double')
214 |   }
215 |   
216 |   cross <- chunk_map(data, function(chunk_x, ii, chunk_idxrange){
217 |     chunk_x <- chunk_x[, c(yidx, xidx)]
218 |     if(intercept){
219 |       chunk_x <- cbind(chunk_x, 1)
220 |     }
221 |     if(!is.null(weights)){
222 |       cross <- crossprod(chunk_x, chunk_x * (weights[seq.int(chunk_idxrange[1], chunk_idxrange[2]), 1, drop = TRUE] / nobs))
223 |     } else {
224 |       cross <- crossprod(chunk_x, chunk_x / nobs)
225 |     }
226 |     cross
227 |   }, reduce = function(mapped){
228 |     Reduce('+', mapped)
229 |   }, ...)
230 |   
231 |   coef <- solve(cross[-seq_along(yidx), -seq_along(yidx)]) %*% cross[-seq_along(yidx),seq_along(yidx)]
232 |   
233 |   ret.residuals <- residuals
234 |   residuals <- NULL
235 |   if(ret.residuals){
236 |     residuals <- chunk_map(data, function(chunk_x, chunk_number, chunk_idxrange){
237 |       x <- chunk_x[,xidx]
238 |       if(intercept){
239 |         x <- cbind(x, 1)
240 |       }
241 |       chunk_x[,yidx] - x %*% coef
242 |     }, function(res){
243 |       unlist(res)
244 |     }, ...)
245 |   }
246 |   
247 |   dnames <- colnames(data)
248 |   if(!length(dnames)){
249 |     dnames <- sprintf('V%d', seq_len(ncol(data)))
250 |   }
251 |   if(intercept){
252 |     coef <- c(coef[length(coef)], coef[-length(coef)])
253 |     names(coef) <- c('(Intercept)', dnames[xidx])
254 |   } else {
255 |     coef <- drop(coef)
256 |     names(coef) <- dnames[xidx]
257 |   }
258 |   
259 |   
260 |   z <- list(
261 |     coefficients = coef,
262 |     na.action = 'na.pass',
263 |     offset = 0,
264 |     call = match.call(),
265 |     residuals = residuals
266 |   )
267 |   class(z) <- c(if (length(yidx) > 1) "mlm", "lm")
268 |   z
269 |   
270 | }
271 | 
272 | 
273 | 


--------------------------------------------------------------------------------
/experiment/junk.sh:
--------------------------------------------------------------------------------
 1 | devtools::load_all(); 
 2 | dim = c(2,3,4,5)
 3 | x = array(1:(prod(dim)), dim)# + 1i * array(rnorm(prod(dim)), dim)
 4 | a = as.lazyarray(x, path = tempfile())
 5 | idx <- c(1:120, 1:120)#sample(length(x), size = 200, replace = TRUE)
 6 | # idx[sample(200, 20)] = NA
 7 | re <- a[idx]
 8 | cp <- x[idx]
 9 | expect_equivalent(re, cp)
10 | 
11 | tik()
12 | gctorture2(1)
13 | a[1:2] -> re
14 | 
15 | # a[] -> re
16 | gctorture(FALSE)
17 | as.data.frame(tok("", stop=TRUE))
18 | 


--------------------------------------------------------------------------------
/experiment/lazymatrix.R:
--------------------------------------------------------------------------------
  1 | #' @rdname lazyarray
  2 | #' @export
  3 | lazymatrix <- function(
  4 |   path, storage_format, dim, dimnames = NULL, 
  5 |   compress_level = 50L, meta_name = 'lazyarray.meta', 
  6 |   read_only = FALSE, quiet = FALSE, ...
  7 | ){
  8 |   if(file.exists(path) && !dir.exists(path)){
  9 |     stop('lazymatrix path must be a directory path, but a file was found.')
 10 |   }
 11 |   
 12 |   if(!dir.exists(path)){
 13 |     if(length(dim) != 2){
 14 |       stop('dim must have length of 2 for matrix')
 15 |     }
 16 |     # not exists, create a new one
 17 |     arr <- create_lazyarray(
 18 |       path = path, storage_format = storage_format, dim = dim,
 19 |       dimnames = dimnames,  compress_level = compress_level, meta_name = meta_name)
 20 |     return(as.lazymatrix(arr, read_only = read_only))
 21 |   }
 22 |   
 23 |   # path exists, locate meta_name
 24 |   if(file.exists(file.path(path, meta_name))){
 25 |     path <- normalizePath(path, mustWork = TRUE)
 26 |     arr <- ClassLazyMatrix$new(path = path, read_only = read_only, meta_name = meta_name)
 27 |     return(arr)
 28 |   }
 29 |   
 30 |   if(!quiet){
 31 |     message('meta file not found, create one with existing files')
 32 |   }
 33 |   
 34 |   
 35 |   # Otherwise meta_name does not exist
 36 |   if(length(dim) != 2){
 37 |     stop('dim must have length of 2 for matrix')
 38 |   }
 39 |   
 40 |   lazyarray(
 41 |     path = path, storage_format = storage_format, dim = dim, dimnames = dimnames, 
 42 |     compress_level = compress_level, meta_name = meta_name, 
 43 |     read_only = read_only, quiet = quiet
 44 |   )
 45 |   
 46 |   ClassLazyMatrix$new(path = path, read_only = read_only, meta_name = meta_name)
 47 | }
 48 | 
 49 | 
 50 | 
 51 | #' @rdname lazyarray
 52 | #' @export
 53 | as.lazymatrix <- function(x, read_only = FALSE, ...){
 54 |   UseMethod('as.lazymatrix')
 55 | }
 56 | 
 57 | #' @rdname lazyarray
 58 | #' @export
 59 | as.lazymatrix.default <- function(x, read_only = FALSE, storage_format, path = tempfile(), ...){
 60 |   x <- unlist(x)
 61 |   call <- match.call()
 62 |   call[[1]] <- quote(as.lazymatrix.array)
 63 |   eval(call)
 64 | }
 65 | 
 66 | #' @rdname lazyarray
 67 | #' @export
 68 | as.lazymatrix.array <- function(x, read_only = FALSE, storage_format, path = tempfile(), ...){
 69 |   if(dir.exists(path)){
 70 |     stop("path exists, please specify a different path")
 71 |   }
 72 |   if(missing(storage_format)){
 73 |     storage_format <- storage.mode(x)
 74 |   }
 75 |   
 76 |   dm <- dim(x)
 77 |   if(length(dm) < 2){
 78 |     dm <- c(length(x), 1)
 79 |   } else {
 80 |     dm <- c( prod(dm) / dm[[length(dm)]], dm[[length(dm)]] )
 81 |   }
 82 |   dim(x) <- dm
 83 |   
 84 |   if(dm[[1]] < dm[[2]]){
 85 |     dm = rev(dm)
 86 |     re <- lazyarray(path = path, storage_format = storage_format, dim = dm)
 87 |     
 88 |     ii <- 1
 89 |     for(ii in seq_len(dm[[2]])){
 90 |       re[,ii] <- x[ii,]
 91 |     }
 92 |     re <- as.lazymatrix.LazyArray(re, read_only = read_only, ...)
 93 |     re <- t(re)
 94 |   } else {
 95 |     re <- lazyarray(path = path, storage_format = storage_format, dim = dm)
 96 |     
 97 |     ii <- 1
 98 |     for(ii in seq_len(dm[[2]])){
 99 |       re[,ii] <- x[,ii]
100 |     }
101 |     
102 |     re <- as.lazymatrix.LazyArray(re, read_only = read_only, ...)
103 |   }
104 |   re
105 |   
106 | }
107 | 
108 | #' @rdname lazyarray
109 | #' @export
110 | as.lazymatrix.LazyArray <- function(x, read_only = FALSE, storage_format, ...){
111 |   path <- dirname(x$storage_path)
112 |   meta_name <- x$meta_name
113 |   if(is.na(read_only) || !is.logical(read_only)){
114 |     read_only <- !x$can_write
115 |   }
116 |   
117 |   if(missing(storage_format)){
118 |     storage_format <- x$get_storage_format()
119 |   }
120 |   
121 |   stopifnot(storage_format %in% x$storage_formats_avail)
122 |   
123 |   meta_name <- sprintf("%s_version-%s", storage_format, x$meta_name)
124 |   
125 |   # create header
126 |   meta <- load_yaml(x$storage_path)
127 |   meta$storage_format <- storage_format
128 |   meta$dim <- c(length(x) / x$npart, x$npart)
129 |   meta$part_dimension <- c(meta$dim[[1]], 1L)
130 |   
131 |   if(length(meta$dimnames) == length(x$dim)){
132 |     dn <- meta$dimnames
133 |     ndim <- length(dn)
134 |     dk <- names(dn)
135 |     if(length(dk == ndim)){
136 |       meta$dimnames <- structure(list(NULL, dn[[ndim]]), names = c('', names(dn)[[ndim]]))
137 |     } else {
138 |       meta$dimnames <- list(NULL, dn[[ndim]])
139 |     }
140 |     
141 |   } else {
142 |     meta$dimnames <- NULL
143 |   }
144 |   
145 |   save_yaml(meta, file.path(path, meta_name))
146 |   
147 |   ClassLazyMatrix$new(path = path, read_only = read_only, meta_name = meta_name)
148 | }
149 | 
150 | #' @rdname lazyarray
151 | #' @export
152 | as.lazymatrix.LazyMatrix <- function(x, read_only = FALSE, storage_format, ...){
153 |   re <- as.lazymatrix.LazyArray(x, read_only, storage_format, ...)
154 |   re$`@transposed` <- x$`@transposed`
155 |   re
156 | }
157 | 


--------------------------------------------------------------------------------
/experiment/loader2ext.h:
--------------------------------------------------------------------------------
 1 | // [[Rcpp::plugins("cpp11")]]
 2 | 
 3 | #ifndef LAZYARRAY_LOADER2_MULTIPART_DOUBLE_H
 4 | #define LAZYARRAY_LOADER2_MULTIPART_DOUBLE_H
 5 | 
 6 | #include "Rcpp.h"
 7 | 
 8 | SEXP subsetFST_double(const std::string& rootPath, const Rcpp::NumericVector& dim, const Rcpp::List& subparsed);
 9 | 
10 | SEXP subsetFST_integer(const std::string& rootPath, const Rcpp::NumericVector& dim, const Rcpp::List& subparsed);
11 | 
12 | SEXP subsetFST_character(const std::string& rootPath, const Rcpp::NumericVector& dim, const Rcpp::List& subparsed);
13 | 
14 | SEXP subsetFST_complex(const std::string& rootPath, const Rcpp::NumericVector& dim, const Rcpp::List& subparsed);
15 | 
16 | #endif // LAZYARRAY_LOADER2_MULTIPART_DOUBLE_H
17 | 


--------------------------------------------------------------------------------
/experiment/old/bigmemory-example.R:
--------------------------------------------------------------------------------
 1 | path <- '~/Desktop/junk/'
 2 | f <- 'bigmemory-ieeg.testfile'
 3 | # unlink(file.path(path, f))
 4 | dir.create(path)
 5 | ncol = 84
 6 | x <- bigmemory::attach.resource(file.path(path, 'bigmemory-ieeg.testfile.desc'))
 7 | # x <- big.matrix(34497400, ncol = ncol, type = 'double', backingfile = f, backingpath = path)
 8 | # for(ii in 1:ncol){
 9 | #   x[,ii] <- y[,ii]
10 | # }
11 | system.time({x[]})
12 | 


--------------------------------------------------------------------------------
/experiment/old/bmWrapper.h:
--------------------------------------------------------------------------------
 1 | #ifndef LAZYARRAY_BM_SUBSET_H
 2 | #define LAZYARRAY_BM_SUBSET_H
 3 | 
 4 | #include <Rcpp.h>
 5 | // [[Rcpp::depends(BH, bigmemory)]]
 6 | #include <bigmemory/MatrixAccessor.hpp>
 7 | #include <numeric>
 8 | 
 9 | // [[Rcpp::interfaces(cpp)]]
10 | 
11 | // [[Rcpp::export]]
12 | SEXP subsetBM(SEXP pBigMat, SEXP listOrEnv, Rcpp::NumericVector dim, SEXPTYPE dtype,SEXP reshape = R_NilValue, bool drop = false);
13 | 
14 | #endif // LAZYARRAY_BM_SUBSET_H
15 | 


--------------------------------------------------------------------------------
/experiment/old/classLazyArray.h:
--------------------------------------------------------------------------------
 1 | #ifndef LAZYARRAY_CLASS_H
 2 | #define LAZYARRAY_CLASS_H
 3 | 
 4 | # include <Rcpp.h>
 5 | #include "common.h"
 6 | #include "utils.h"
 7 | using namespace Rcpp;
 8 | 
 9 | class FstLazyArray {
10 |   
11 | public:
12 |   enum DataType {DOUBLE=REALSXP};
13 |   
14 |   // Constructors and field getter/setter
15 | public:
16 |   FstLazyArray(
17 |     std::vector<std::string> fstFiles, std::vector<int64_t> dimension, int dataType
18 |   ): 
19 |   fstFiles(fstFiles), dimension(dimension), _dataType(dataType), _readOnly(true)
20 |   {
21 |     _nparts = *(dimension.end() - 1);
22 |     _totalLen = std::accumulate(dimension.begin(), dimension.end(), INTEGER64_ONE, std::multiplies<int64_t>());
23 |     _partLen = _totalLen / _nparts;
24 |     validate();
25 |   }
26 |   
27 |   virtual ~FstLazyArray(){}
28 |   
29 |   std::vector<std::string> fstFiles;
30 |   std::vector<int64_t> dimension;
31 |   
32 |   // getter
33 |   int64_t nparts() const { return _nparts; }
34 |   int64_t partLen() const { return _partLen; }
35 |   int dataType() const { return _dataType; }
36 |   const bool readOnly() const { return _readOnly; }
37 |   
38 |   // setter
39 |   void readOnly(const bool isReadOnly){
40 |     _readOnly = isReadOnly;
41 |   }
42 |   
43 |   bool validate(bool stopIfError = true);
44 |   
45 |   
46 | protected:
47 |   
48 |   int64_t              _nparts;
49 |   int64_t              _partLen;
50 |   int64_t              _totalLen;
51 |   int                  _dataType;
52 |   bool                 _readOnly;
53 | };
54 | 
55 | 
56 | 
57 | 
58 | // Define methods inline
59 | 
60 | inline bool FstLazyArray::validate(bool stopIfError) {
61 |   //_nparts _totalLen fstFiles dimension
62 |   bool isValid = true;
63 |   isValid = isValid || stopIfNot(dimension.size() >= 2, "FstLazyArray must dimension >= 2", stopIfError);
64 |   isValid = isValid || stopIfNot(*(dimension.end() - 1) == _nparts, "FstLazyArray dimensions inconsistent with number of partitions", stopIfError);
65 |   isValid = isValid || stopIfNot(fstFiles.size() == _nparts, "FstLazyArray file counts inconsistent with number of partitions", stopIfError);
66 |   
67 |   int64_t expectedLen = std::accumulate(dimension.begin(), dimension.end(), INTEGER64_ONE, std::multiplies<int64_t>());
68 |   isValid = isValid || stopIfNot(expectedLen == _totalLen, "FstLazyArray file counts inconsistent with number of partitions", stopIfError);
69 |   return isValid;
70 | }
71 | 
72 | 
73 | 
74 | 
75 | #endif // LAZYARRAY_CLASS_H
76 | 
77 | 


--------------------------------------------------------------------------------
/experiment/old/comparison-bigmemory.R:
--------------------------------------------------------------------------------
 1 | # Installation (dev version)
 2 | remotes::install_github('dipterix/dipsaus')
 3 | remotes::install_github('dipterix/lazyarray')
 4 | 
 5 | library(bigmemory)
 6 | library(biganalytics)
 7 | library(dipsaus)
 8 | library(lazyarray)
 9 | library(future)
10 | 
11 | 
12 | path <- '~/Desktop/junk/lazyarray_test2'
13 | arr <- lazyarray::lazyarray(path, dim = c(99072112, 5), storage_format = 'double')
14 | arr <- as.lazymatrix(arr)
15 | colnames(arr) <- c('movie', 'customer', 'rating', 'year', 'month')
16 | # for(ii in 1:5){
17 | #   arr[,ii] <- x[,ii]
18 | # }
19 | system.time({
20 |   arr[,1]
21 | })
22 | 
23 | 
24 | path <- '~/Desktop/junk/'
25 | f <- 'bigmemory.testfile'
26 | # unlink(file.path(path, f))
27 | dir.create(path)
28 | ncol = 5
29 | x <- bigmemory::attach.resource(file.path(path, 'bigmemory.testfile.desc'))
30 | # x <- big.matrix(99072112, ncol = ncol, type = 'double', backingfile = f, backingpath = path)
31 | # for(ii in 1:ncol){
32 | #   x[,ii] <- arr[,ii]
33 | # }
34 | system.time({x[,2]})
35 | 
36 | # The challenge is calculation of X^T*X and X^T*y
37 | {
38 |   options(bigmemory.allow.dimnames=TRUE)
39 |   colnames(x) <- c('movie', 'customer', 'rating', 'year', 'month')
40 |   a <- proc.time()
41 |   res <- biglm.big.matrix(rating ~ movie + customer + year + month - 1, data = x)
42 |   b <- proc.time(); b - a
43 | }
44 | # Time difference of 26.069 secs
45 | 
46 | biglm::biglm(rating ~ movie + customer + year + month - 1, data = x)
47 | 
48 | 
49 | future::plan('multisession')
50 | {
51 |   a <- proc.time()
52 |   ll <- lazy_lm_simple(arr, yidx = 3, intercept = FALSE)
53 |   b <- proc.time(); b - a
54 | }
55 | 
56 | coefficients(ll) / coefficients(res)
57 | 
58 | future::plan('sequential')
59 | {
60 |   a <- proc.time()
61 |   lazyres <- lazy_lm(rating ~ movie + customer + year + month - 1, data = arr)
62 |   b <- proc.time(); b - a
63 | }
64 | coefficients(lazyres) / coefficients(res)
65 | 


--------------------------------------------------------------------------------
/experiment/old/comparison-lazyarray.R:
--------------------------------------------------------------------------------
  1 | # lazyarray (col major)
  2 | path <- tempfile()
  3 | arr <- lazyarray::lazyarray(path, dim = c(99072112, 5), storage_format = 'double')
  4 | x <- as.lazymatrix(arr)
  5 | y <- x$transpose()
  6 | for(ii in 1:5){
  7 |   arr[,ii] <- rnorm(99072112)
  8 | }
  9 | 
 10 | {
 11 |   a = Sys.time()
 12 |   # fstcore::threads_fstlib(4)
 13 |   # dipsaus::make_forked_clusters(4)
 14 |   future::plan(future::multisession)
 15 |   nrows <- 99072112
 16 |   # xtx <- dipsaus::async_works(1:500, function(ii, ...){
 17 |   #   arr <- lazyarray::lazyarray(path, dim = c(99072112, 5), storage_format = 'double')
 18 |   #   idx <- seq.int((ii-1) * 200000 + 1, ii * 200000)
 19 |   #   idx <- idx[idx <= nrows]
 20 |   #   if(!length(idx)){ return(list(
 21 |   #     xtx = matrix(0, 5,5),
 22 |   #     xty = rep(0, 5)
 23 |   #   )) }
 24 |   #   
 25 |   #   tmp <- cbind(1, arr[idx, c(1,2,4,5)])
 26 |   #   
 27 |   #   list(
 28 |   #     xtx = t(tmp) %*% tmp,
 29 |   #     xty = t(tmp) %*% arr[idx, 3]
 30 |   #   )
 31 |   #   
 32 |   # }, .globals =  list(nrows = nrows, path = path), .nworkers = 4)
 33 |   
 34 |   xtx <- dipsaus::lapply_async2(1:500, function(ii, ...){
 35 |     arr <- lazyarray::lazyarray(path, dim = c(99072112, 5), storage_format = 'double')
 36 |     idx <- seq.int((ii-1) * 200000 + 1, ii * 200000)
 37 |     idx <- idx[idx <= nrows]
 38 |     if(!length(idx)){ return(list(
 39 |       xtx = matrix(0, 5,5),
 40 |       xty = rep(0, 5)
 41 |     )) }
 42 |     
 43 |     tmp <- cbind(1, arr[idx, c(1,2,4,5)])
 44 |     
 45 |     list(
 46 |       xtx = t(tmp) %*% tmp,
 47 |       xty = t(tmp) %*% arr[idx, 3]
 48 |     )
 49 |     
 50 |   })
 51 |   
 52 |   xty <- Reduce('+', lapply(1:500, function(ii){
 53 |     xtx[[ii]]$xty
 54 |   }))
 55 |   xtx <- Reduce('+', lapply(1:500, function(ii){
 56 |     xtx[[ii]]$xtx
 57 |   }))
 58 |   coef <- solve(xtx) %*% xty
 59 |   b <- Sys.time(); b - a
 60 | }
 61 | 
 62 | # single thread
 63 | # Time difference of 15.4888 secs
 64 | 
 65 | # future, 4 cores, no openmp
 66 | # Time difference of 6.66791 secs
 67 | 
 68 | # openmp, 4 cores, one process
 69 | # Time difference of 18.39945 secs
 70 | 
 71 | # ---------------------------------------------------------------------
 72 | # lazyarray (row major)
 73 | path <- tempfile()
 74 | arr <- lazyarray::lazyarray(path, dim = c(200000, 500, 5), storage_format = 'double', multipart_mode = 2)
 75 | for(ii in 1:5){
 76 |   print(ii)
 77 |   arr[,,ii] <- rnorm(200000*500)
 78 | }
 79 | 
 80 | {
 81 |   # dipsaus::make_forked_clusters(4)
 82 |   future::plan(future::multisession)
 83 |   a = Sys.time()
 84 |   
 85 |   nrows <- 99072112
 86 |   xtx <- dipsaus::lapply_async2(1:500, function(ii, nrows, arr){
 87 |     idx <- seq.int((ii-1) * 200000 + 1, ii * 200000)
 88 |     idx <- idx <= nrows
 89 |     if(!any(idx)){ return(list(
 90 |       xtx = matrix(0, 5,5),
 91 |       xty = rep(0, 5)
 92 |     )) }
 93 |     tmp <- cbind(1, arr[idx, ii, c(1,2,4,5), drop = TRUE])
 94 |     list(
 95 |       xtx = t(tmp) %*% tmp,
 96 |       xty = t(tmp) %*% arr[idx, ii, 3, drop = TRUE]
 97 |     )
 98 |   }, plan = FALSE, FUN.args = list(nrows = nrows, arr = arr))#, .globals = list(nrows = nrows, arr = arr), .rs = TRUE)
 99 |   
100 |   xty <- Reduce('+', lapply(1:500, function(ii){
101 |     xtx[[ii]]$xty
102 |   }))
103 |   xtx <- Reduce('+', lapply(1:500, function(ii){
104 |     xtx[[ii]]$xtx
105 |   }))
106 |   
107 |   coef <- solve(xtx) %*% xty
108 |   b <- Sys.time(); b - a
109 | }
110 | 
111 | # Time difference of 13.36511 secs
112 | 
113 | coef - coefficients(res)
114 | 
115 | 


--------------------------------------------------------------------------------
/experiment/old/filearrsub.R:
--------------------------------------------------------------------------------
 1 | `[.FileArray` <- function(x, ..., drop = TRUE, reshape = NULL){
 2 |   if(!x$is_valid){
 3 |     stop("`[.FileArray`: x is no longer valid (data has been removed).")
 4 |   }
 5 |   if(!is.null(reshape)){
 6 |     reshape <- as.numeric(reshape)
 7 |     stopifnot(all(reshape>=0))
 8 |   }
 9 |   drop <- isTRUE(drop)
10 |   # get schedule
11 |   
12 |   parsed <- parseAndScheduleBlocks(environment(), x$dim, TRUE)
13 |   # parsed <- parseAndScheduleBlocks(list(1:10,2:10,3:10,4:10), x$dim, TRUE)
14 |   # parsed <- parseAndScheduleBlocks(list(1,1,1,1), x$dim, TRUE)
15 |   
16 |   if(parsed$subset_mode == 1){
17 |     stop("FstArray does not support single subscript (x[i]), try x[] or x[i,j,k,...]")
18 |   }
19 |   
20 |   re <- array(x$sample_na, parsed$target_dimension)
21 |   
22 |   if(parsed$expected_length == 0){
23 |     reshapeOrDrop(re, reshape, drop)
24 |     return(re)
25 |   }
26 |   
27 |   partition_length <- prod(x$partition_dim())
28 |   
29 |   # x[]
30 |   if(parsed$subset_mode == 2){
31 |     
32 |     blocksize <- partition_length
33 |     blocksize[[length(blocksize)]] <- 1
34 |     blocksize <- prod2(blocksize)
35 |     
36 |     # copy all to re inplace
37 |     for(ii in seq_len(x$npart)){
38 |       if(x$has_partition(ii)){
39 |         sub <- x$get_partition_data(ii)
40 |         subsetAssignVector(re, blocksize * (ii-1) + 1, sub)
41 |       }
42 |     }
43 |   } else {
44 |     # x[i,j,k]
45 |     loc <- parsed$location_indices
46 |     if(!is.numeric(loc[[4]])){
47 |       # missing, all partitions
48 |       partitions <- seq_len(x$npart)
49 |     } else {
50 |       partitions <- loc[[4]]
51 |     }
52 |     # check if the schedule is made
53 |     schedule <- parsed$schedule
54 |     block_ndims <- schedule$block_ndims
55 |     
56 |     ptr <- 1
57 |     blocksize <- schedule$block_expected_length
58 |     
59 |     for(file_ii in partitions){
60 |       # No file, NA
61 |       if(!x$has_partition(file_ii)){
62 |         ptr = ptr + blocksize * schedule$schedule_counts_per_part
63 |         next
64 |       }
65 |       
66 |       file <- x$get_partition_fpath(file_ii, full_path = TRUE, type = 'combined')
67 |       ptr_file <- filematrix::fm.open(file)
68 |       
69 |       if(schedule$block_indexed){
70 |         # file exists
71 |         for(schedule_ii in schedule$schedule_index){
72 |           row_number <- blocksize * (schedule_ii-1) + schedule$block_schedule
73 |           subsetAssignVector(re, ptr, ptr_file[row_number, 1])
74 |           ptr = ptr + blocksize
75 |         }
76 |       } else {
77 |         # ndim == 2
78 |         row_number <- loc[[1]]
79 |         if(is.numeric(row_number)){
80 |           # x[i,j]
81 |           buffer <- ptr_file[row_number, 1]
82 |         } else {
83 |           # x[i,j]
84 |           buffer <- ptr_file[, 1]
85 |         }
86 |         subsetAssignVector(re, ptr, buffer)
87 |         ptr = ptr + length(buffer)
88 |       }
89 |       
90 |       
91 |     }
92 |     
93 |   }
94 |   
95 |   reshapeOrDrop(re, reshape, drop)
96 |   re
97 | }
98 | 


--------------------------------------------------------------------------------
/experiment/old/int64_double.cpp:
--------------------------------------------------------------------------------
 1 | #include <Rcpp.h>
 2 | using namespace Rcpp;
 3 | 
 4 | 
 5 | SEXP int64t2Sexp(const std::vector<int64_t>& x) {
 6 |   const R_xlen_t len = x.size();
 7 |   SEXP re = PROTECT(Rf_allocVector(REALSXP, len));
 8 |   std::memcpy(REAL(re), &(x[0]), len * sizeof(double));
 9 |   Rf_setAttrib(re, wrap("class"), wrap("integer64"));
10 |   return re;
11 | }
12 | 
13 | 
14 | std::vector<int64_t> numericVector2Int64tVec(const NumericVector& x){
15 |   const R_xlen_t len = x.size();
16 |   std::vector<int64_t> re( len );
17 |   std::memcpy(&(re[0]), &(x[0]), len * sizeof(int64_t));
18 |   return(re);
19 | }
20 | 
21 | int64_t doubleInt64t(const double& x){
22 |   int64_t re;
23 |   std::memcpy(&(re), &(x), sizeof(int64_t));
24 |   return(re);
25 | }
26 | 
27 | 
28 | double int64t2double(const int64_t& x){
29 |   double re;
30 |   std::memcpy(&(re), &(x), sizeof(double));
31 |   return(re);
32 | }
33 | 
34 | NumericVector int64tVec2NumericVector(const std::vector<int64_t>& x){
35 |   const R_xlen_t len = x.size();
36 |   NumericVector re = no_init( len );
37 |   std::memcpy(&(re[0]), &(x[0]), len * sizeof(double));
38 |   return(re);
39 | }
40 | 
41 | std::vector<int64_t> sexp2Int64tVec(SEXP x){
42 |   const R_xlen_t len = Rf_xlength(x);
43 |   std::vector<int64_t> re(len);
44 |   if(TYPEOF(x) == REALSXP){
45 |     std::memcpy(&(re[0]), REAL(x), len * sizeof(int64_t));
46 |   }else{
47 |     SEXP y = PROTECT(Rf_coerceVector(x, REALSXP));
48 |     std::memcpy(&(re[0]), REAL(y), len * sizeof(int64_t));
49 |     UNPROTECT(1);
50 |   }
51 |   return re;
52 | }
53 | 


--------------------------------------------------------------------------------
/experiment/old/junk.R:
--------------------------------------------------------------------------------
 1 | library(lazyarray)
 2 | path <- "~/Desktop/lazyarray_data/"
 3 | dimension <- c(287, 200, 601, 84)
 4 | x <- lazyarray(path, storage_format = "double", dim = dimension)
 5 | x$make_readonly()
 6 | system.time(x[,,,1])
 7 | gc()
 8 | prod(dim(x))
 9 | 
10 | 
11 | 
12 | 


--------------------------------------------------------------------------------
/experiment/old/memcpy.cpp:
--------------------------------------------------------------------------------
 1 | #include "common.h"
 2 | #include "util.h"
 3 | 
 4 | // [[Rcpp::export]]
 5 | Rcpp::List cpp_array_to_list_integer(IntegerVector x, IntegerVector cutoff){
 6 |   
 7 |   _rcpp_timer.step("start cpp_array_to_list_integer");
 8 |   
 9 |   Rcpp::List re(cutoff.size() - 1);
10 |   
11 |   // memory copy x
12 |   R_xlen_t size = x.size();
13 |   int* array = new int[size];
14 |   ::memcpy(array, &(*x.begin()), size * sizeof(int));
15 |   
16 |   _rcpp_timer.step("memory-copy");
17 |   
18 |   // cutoff
19 |   int *p1, *p2;
20 |   p1 = array + cutoff[0] - 1;
21 |   p2 = p1;
22 |   for( R_xlen_t ii = 0; ii < cutoff.size() - 1; ii++ ){
23 |     p2 += cutoff[ii + 1] - cutoff[ii];
24 |     re[ii] = IntegerVector(p1, p2);
25 |     p1 = p2; 
26 |     _rcpp_timer.step("split-" + std::to_string(ii));
27 |   }
28 |   _rcpp_timer.step("split-finished");
29 |   
30 |   // return NumericVector(data,data+sizeof(data)/sizeof(int));
31 |   
32 |   if( LAZYARRAY_DEBUG ){
33 |     
34 |     NumericVector _res(_rcpp_timer);
35 |     _res = _res / 1000.0;
36 |     Rcpp::print(_res);
37 |   }
38 |   
39 |   return re;
40 | }
41 | 
42 | /*** R
43 | timesTwo(42)
44 | */
45 | 


--------------------------------------------------------------------------------
/experiment/old/new_subset.R:
--------------------------------------------------------------------------------
 1 | devtools::load_all()
 2 | arr <- lazyarray:::as.lazyarray(array(1:prod(1:4), 1:4))
 3 | a <- function(i, ...) {
 4 |   files = arr$get_partition_fpath()
 5 |   lazyarray:::subsetFST(files, environment(), dim(arr), getSexpType(0.1))
 6 | }
 7 | dipsaus::lapply_async2(1:4, function(i, ...){
 8 |   a(,,,1)
 9 |   lazyarray:::getLazyThread()
10 | })
11 | 
12 | 
13 |   
14 | 
15 | a(, , , 1)
16 | 


--------------------------------------------------------------------------------
/experiment/old/old_scripts.R:
--------------------------------------------------------------------------------
 1 | `[.LazyArray` <- function(x, ..., drop = TRUE){
 2 |   
 3 |   # check dimensions
 4 |   nidx <- ...length()
 5 |   idx <- list()
 6 |   dim <- x$dim
 7 |   if(nidx == length(dim)){
 8 |     for(ii in seq_len(nidx)){
 9 |       idx[[ ii ]] <- tryCatch({
10 |         tmp <- ...elt(ii)
11 |         if(is.logical(tmp)){
12 |           if(length(tmp) < dim[ii]){
13 |             tmp <- rep(tmp, floor(dim[ii] / length(tmp)))
14 |             tmp <- tmp[seq_len(dim[ii])]
15 |           }
16 |           if(length(tmp) > dim[ii]){
17 |             stop("index out of bound at index ", ii)
18 |           }
19 |           which(tmp)
20 |         } else if (is.numeric(tmp)){
21 |           tmp
22 |         } else if (is.character(tmp)){
23 |           unlist(lapply(tmp, function(s){
24 |             re <- which(x$dimnames[[ii]] == s)
25 |             if(length(re)){ re[[1]] } else { -1 }
26 |           }))
27 |         }
28 |       }, error = function(e){
29 |         seq_len(dim[[ii]])
30 |       })
31 |     }
32 |     target_dim <- sapply(idx, length)
33 |     if(prod(target_dim) == 0){
34 |       if(drop){
35 |         return(x$`@sample_data`()[NULL])
36 |       } else {
37 |         return(array(x$`@sample_data`(), dim = target_dim))
38 |       }
39 |       
40 |     }
41 |     
42 |     idx$drop <- drop
43 |     return(do.call(x$`@get_data`, idx))
44 |   }
45 |   
46 |   has_idx <- FALSE
47 |   if(...length() == 1){
48 |     tryCatch({
49 |       idx <- ...elt(1)
50 |       has_idx <- TRUE
51 |     }, error = function(e){})
52 |   }
53 |   
54 |   if(has_idx){
55 |     if(!length(idx)){
56 |       return(logical(0))
57 |     } else {
58 |       # stop('lazyarray x[a:b] is not supported right now')
59 |       
60 |       # idx to each partition?
61 |       dm <- dim(x)
62 |       part_size<- length(x) / dm[[length(dm)]]
63 |       
64 |       partition_idx <- ((idx - 1) %% part_size) + 1
65 |       partition <- (idx - partition_idx) / part_size + 1
66 |       
67 |       if(isTRUE(x$`@transposed`)){
68 |         tmp <- partition
69 |         partition <- partition_idx
70 |         partition_idx <- tmp
71 |       }
72 |       
73 |       re <- partition_map(x, function(slice, part){
74 |         sel <- partition == part
75 |         list(
76 |           data = slice[partition_idx[sel]],
77 |           sel = sel
78 |         )
79 |       }, reduce = function(l){
80 |         re <- rep(NA, length(partition))
81 |         for(ii in seq_along(l)){
82 |           re[l[[ii]]$sel] <- l[[ii]]$data
83 |         }
84 |         re
85 |       })
86 |       return(re)
87 |     }
88 |     
89 |   }
90 |   
91 |   
92 |   x$`@get_data`(drop = drop)
93 |   
94 | }
95 | 


--------------------------------------------------------------------------------
/experiment/old/profile.R:
--------------------------------------------------------------------------------
 1 | devtools::load_all()
 2 | library(lazyarray)
 3 | 
 4 | x <- as.lazyarray(array(1:27,c(3,3,3)), storage_format = 'double')
 5 | lazyarray:::setLazyBlockSize(2)
 6 | x[,,1]
 7 | loc2idx2(list(c(NA,1,0), c(2,NA,1), 1), dim(x))
 8 | 
 9 | lazyarray:::setLazyBlockSize(-1)
10 | path <- "~/Desktop/lazyarray_data/"
11 | path <- "~/Desktop/junk/lazyarray_test2/"
12 | dimension <- c(287, 200, 601, 84)
13 | x <- lazyarray(path, storage_format = "double", dim = dimension)
14 | x
15 | a=x[,,,1:10]
16 | 
17 | subf <- function(i, ...){
18 |   .Call(lazyarray:::`_lazyarray_subsetFST`, x$get_partition_fpath(), environment(), dim(x),
19 |         getSexpType(x$`@sample_data`()), reshape=NULL, drop=FALSE)
20 | }
21 | subf <- function(i, ...){
22 |   lazyarray:::subsetFST(x$get_partition_fpath(), environment(), dim(x),
23 |                          getSexpType(x$`@sample_data`()), reshape=NULL, drop=FALSE)
24 | }
25 | 
26 | (subf(,,,1))
27 | lazyarray:::setLazyBlockSize(-1)
28 | system.time(subf(,,,1:10))
29 | lazyarray:::setLazyBlockSize(31250000)
30 | system.time(subf(,,,1:10))
31 | system.time(x$`@get_data`(1:287, 1:200, 1, 1:10))
32 | 
33 | lazyarray::set_lazy_threads(4, reset_after_fork = T)
34 | 


--------------------------------------------------------------------------------
/experiment/old/s4-definition.R:
--------------------------------------------------------------------------------
  1 | ensure_path <- function(x){
  2 |   if(!dir.exists(x)){
  3 |     dir_create(x)
  4 |   }
  5 |   x
  6 | }
  7 | 
  8 | #' @export
  9 | register_lazyarray <- function(x){
 10 |   x@method_list$validate <- function(stopIfError = TRUE){
 11 |     x@method_list$with_instance(function(pointer){
 12 |       .Call("LazyArrayBase__validate", pointer, stopIfError)
 13 |     })
 14 |   }
 15 |   x@method_list$subset <- function(env, reshape = NULL, drop = TRUE){ 
 16 |     stopifnot(is.environment(env) || is.list(env))
 17 |     x@method_list$with_instance(function(pointer){
 18 |       .Call("LazyArrayBase__subset", pointer, env, reshape, isTRUE(drop))
 19 |     })
 20 |   }
 21 |   x@method_list$subsetAssign <- function(env, value){ 
 22 |     stopifnot(is.environment(env) || is.list(env))
 23 |     x@method_list$with_instance(function(pointer){
 24 |       .Call("LazyArrayBase__subsetAssign", pointer, value, env)
 25 |     })
 26 |   }
 27 |   
 28 |   
 29 |   x@binding_list$nparts <- function(){
 30 |     x@method_list$with_instance(function(pointer){
 31 |       .Call("LazyArrayBase__nparts", pointer)
 32 |     })
 33 |   }
 34 |   x@binding_list$read_only <- function(v){
 35 |     if(missing(v)) v <- NULL
 36 |     x@method_list$with_instance(function(pointer){
 37 |       .Call("LazyArrayBase__readOnly", pointer, v)
 38 |     })
 39 |   }
 40 |   x@binding_list$dim <- function(){
 41 |     x@method_list$with_instance(function(pointer){
 42 |       .Call("LazyArrayBase__getDim", pointer)
 43 |     })
 44 |   }
 45 |   x@binding_list$length_per_part <- function(){
 46 |     x@method_list$with_instance(function(pointer){
 47 |       .Call("LazyArrayBase__partLen", pointer) 
 48 |     })
 49 |   }
 50 |   x@binding_list$data_type <- function(){
 51 |     x@method_list$with_instance(function(pointer){
 52 |       .Call("LazyArrayBase__dataType", pointer) 
 53 |     })
 54 |   }
 55 |   invisible(x)
 56 | }
 57 | 
 58 | make_methods_lazymatrix <- function(x){
 59 |   invisible(x)
 60 | }
 61 | 
 62 | #' @export
 63 | check_data_type <- function(dataType){
 64 |   if(is.numeric(dataType)){
 65 |     dataType <- as.integer(dataType)
 66 |   } else if(is.character(dataType)){
 67 |     dataType <- list("int" = 13L, "integer" = 13L, "numeric" = 14L, "double" = 14L, "complex" = 15L, "character" = 16L, "string" = 16L)[[dataType]]
 68 |   }
 69 |   if(!isTRUE(is.integer(dataType))){
 70 |     stop("dataType must be character or integer. Choices are: integer(13), double(14), complex(15), or string(16)")
 71 |   }
 72 |   dataType
 73 | }
 74 | 
 75 | 
 76 | #' @title Base S4 class for 'LazyArray'
 77 | #' @export
 78 | setClass("LazyArrayBase", slots = c(
 79 |   classname = "character", method_list = "list", 
 80 |   binding_list = "list", dimension = "numeric", dataType = "integer"
 81 | ), contains = 'oldClass')
 82 | 
 83 | #' @export
 84 | setClass("FstArray", contains = "LazyArrayBase", slots = c(rootPath = "character", compression = "integer", uniformEncoding = "logical"))
 85 | 
 86 | #' @export
 87 | setClass("FstMatrix", contains = "FstArray", slots = c(transposed = "logical"))
 88 | 
 89 | setMethod(
 90 |   "initialize", "LazyArrayBase",
 91 |   function(.Object, ...) {
 92 |     stop("LazyArrayBase is an abstract class. Please create from sub-classes")
 93 |   }
 94 | )
 95 | 
 96 | 
 97 | setMethod(
 98 |   "initialize", "FstArray",
 99 |   function(.Object, rootPath, dimension, dataType = "double", compression = 50, uniformEncoding = TRUE) {
100 |     stopifnot(is.numeric(dimension))
101 |     .Object@classname <- "FstArray"
102 |     .Object@dimension <- dimension
103 |     .Object@dataType <- check_data_type(dataType)[[1]]
104 |     .Object@compression <- as.integer(compression)[[1]]
105 |     .Object@uniformEncoding <- isTRUE(uniformEncoding)[[1]]
106 |     .Object@rootPath <- as.character(rootPath)[[1]]
107 |     
108 |     if(!is.list(.Object@method_list)){
109 |       .Object@method_list <- list()
110 |     }
111 |     if(!is.list(.Object@binding_list)){
112 |       .Object@binding_list <- list()
113 |     }
114 |     .Object@method_list$with_instance <- function(FUN){
115 |       ensure_path(.Object@rootPath)
116 |       pointer <- .Call("FstArray__new", .Object@rootPath, .Object@dimension, .Object@dataType, .Object@compression, .Object@uniformEncoding)
117 |       FUN(pointer)
118 |     }
119 |     .Object <- register_lazyarray(.Object)
120 |     .Object
121 |   }
122 | )
123 | 
124 | setMethod(
125 |   "initialize", "FstMatrix",
126 |   function(.Object, rootPath, dimension, transposed = FALSE, dataType = "double", compression = 50, uniformEncoding = TRUE) {
127 |     stopifnot(is.numeric(dimension))
128 |     .Object@classname <- "FstMatrix"
129 |     .Object@dimension = dimension
130 |     .Object@dataType <- check_data_type(dataType)[[1]]
131 |     .Object@compression <- as.integer(compression)[[1]]
132 |     .Object@uniformEncoding <- isTRUE(uniformEncoding)[[1]]
133 |     .Object@rootPath <- as.character(rootPath)[[1]]
134 |     if( transposed ){ transposed <- TRUE } else { transposed <- FALSE }
135 |     .Object@transposed <- transposed;
136 |     
137 |     if(!is.list(.Object@method_list)){
138 |       .Object@method_list <- list()
139 |     }
140 |     if(!is.list(.Object@binding_list)){
141 |       .Object@binding_list <- list()
142 |     }
143 |     .Object@method_list$with_instance <- function(FUN){
144 |       ensure_path(.Object@rootPath)
145 |       FUN(.Call("FstMatrix__new", .Object@rootPath, .Object@dimension, 
146 |                 .Object@transposed, .Object@dataType, .Object@compression, .Object@uniformEncoding))
147 |     }
148 |     .Object <- register_lazyarray(.Object)
149 |     .Object <- make_methods_lazymatrix(.Object)
150 |     .Object
151 |   }
152 | )
153 | 
154 | setOldClass(c("FstMatrix", "FstArray", "LazyArrayBase"))
155 | 
156 | #' @export
157 | lazyarray2 <- function(dim, storage_mode = "double", ..., type = 'fstarray'){
158 |   UseMethod("lazyarray2", structure(type, class = type))
159 | }
160 | 
161 | #' @export
162 | lazyarray2.fstarray <- function(path, dim, storage_mode = "double", ..., type){
163 |   x <- new("FstArray", path, dim, storage_mode, ...)
164 |   # new_s3class <- c(oldClass(x), "LazyArrayBase")
165 |   # attributes(new_s3class) <- attributes(oldClass(x))
166 |   # oldClass(x) <- new_s3class
167 |   x
168 | }
169 | 
170 | 
171 | 


--------------------------------------------------------------------------------
/experiment/old/s4-generics.R:
--------------------------------------------------------------------------------
 1 | setGeneric("$")
 2 | setMethod(
 3 |   "$", "LazyArrayBase", function(x, name){
 4 |     if(name %in% names(x@binding_list)){
 5 |       return(x@binding_list[[name]]())
 6 |     }
 7 |     return(x@method_list[[name]])
 8 |   }
 9 | )
10 | 
11 | setGeneric("names")
12 | setMethod(
13 |   "names", "LazyArrayBase", function(x){
14 |     c(names(x@binding_list), names(x@method_list))
15 |   }
16 | )
17 | 
18 | #' @export
19 | `[.LazyArrayBase` <- function(x, ..., reshape = NULL, drop = TRUE){
20 |   drop <- as.logical(drop)
21 |   if(!is.null(reshape)){
22 |     reshape <- as.numeric(reshape)
23 |   }
24 |   x$subset(environment(), reshape = reshape, drop = drop)
25 | }
26 | 
27 | #' @export
28 | `[<-.LazyArrayBase` <- function(x, ..., value){
29 |   x$subsetAssign(environment(), value = value)
30 | }
31 | 
32 | 
33 | 


--------------------------------------------------------------------------------
/experiment/old/speed test.R:
--------------------------------------------------------------------------------
 1 | library(lazyarray)
 2 | library(dipsaus)
 3 | library(future)
 4 | options(future.fork.enable = TRUE)
 5 | options('width' = 75)
 6 | path <- "~/Desktop/lazyarray_data/"
 7 | dimension <- c(287, 200, 601, 84)
 8 | x <- lazyarray(path, storage_format = "double", dim = dimension)
 9 | 
10 | 
11 | subarray <- x[,,,1,drop=FALSE]
12 | 
13 | tempf <- tempfile()
14 | temp_rds <- paste0(tempf, '.rds')
15 | temp_h5 <- paste0(tempf, '.h5')
16 | temp_lazy <- paste0(tempf, '.lazyarray')
17 | 
18 | unlink(temp_rds)
19 | unlink(temp_h5)
20 | unlink(temp_lazy, recursive = TRUE)
21 | 
22 | rds <- bench::mark(
23 |   {
24 |     saveRDS(subarray, temp_rds)
25 |   },
26 |   {
27 |     raveio::save_h5(subarray, file = temp_h5, name = '/data', chunk = c(287, 1, 10, 1))
28 |   },
29 |   {
30 |     y <- lazyarray(temp_lazy, storage_format = 'double', dim = dim(subarray))
31 |     y[] <- subarray
32 |   }, memory = TRUE, time_unit = 's', iterations = 1, check = FALSE
33 | )
34 | 
35 | 
36 | 


--------------------------------------------------------------------------------
/experiment/old/speed_test.R:
--------------------------------------------------------------------------------
  1 | library(lazyarray)
  2 | library(dipsaus)
  3 | library(future)
  4 | options(future.fork.enable = TRUE)
  5 | options('width' = 75)
  6 | path <- "~/Desktop/lazyarray_data/"
  7 | dimension <- c(287, 200, 601, 84)
  8 | x <- lazyarray(path, storage_format = "double", dim = dimension)
  9 | 
 10 | tempf <- tempfile()
 11 | 
 12 | prof <- lapply(1:10, function(ii){
 13 |   print(ii)
 14 |   subarray <- x[,,,1:ii,drop=FALSE]
 15 |   
 16 |   temp_rds <- paste0(tempf, '.rds')
 17 |   temp_h5 <- paste0(tempf, '.h5')
 18 |   temp_lazy <- paste0(tempf, '.lazyarray')
 19 |   
 20 |   unlink(temp_rds)
 21 |   unlink(temp_h5)
 22 |   unlink(temp_lazy, recursive = TRUE)
 23 |   
 24 |   # pause for a while
 25 |   Sys.sleep(2)
 26 |   
 27 |   plan('sequential')
 28 |   
 29 |   rds <- bench::mark(
 30 |     {
 31 |       saveRDS(subarray, temp_rds)
 32 |     },
 33 |     {
 34 |       raveio::save_h5(subarray, file = temp_h5, name = '/data', chunk = c(100, 1, 10, 1))
 35 |     },
 36 |     {
 37 |       lazyarray::set_lazy_threads(1L)
 38 |       y <- lazyarray(temp_lazy, storage_format = 'double', dim = dim(subarray))
 39 |       y[] <- subarray
 40 |     }, 
 41 |     {
 42 |       lazyarray::set_lazy_threads(2L)
 43 |       unlink(temp_lazy, recursive = TRUE)
 44 |       y <- lazyarray(temp_lazy, storage_format = 'double', dim = dim(subarray))
 45 |       y[] <- subarray
 46 |     }, 
 47 |     {
 48 |       lazyarray::set_lazy_threads(3L)
 49 |       unlink(temp_lazy, recursive = TRUE)
 50 |       y <- lazyarray(temp_lazy, storage_format = 'double', dim = dim(subarray))
 51 |       y[] <- subarray
 52 |     }, 
 53 |     {
 54 |       lazyarray::set_lazy_threads(4L)
 55 |       unlink(temp_lazy, recursive = TRUE)
 56 |       y <- lazyarray(temp_lazy, storage_format = 'double', dim = dim(subarray))
 57 |       y[] <- subarray
 58 |     }, 
 59 |     memory = TRUE, time_unit = 's', iterations = 1, check = FALSE, filter_gc = FALSE
 60 |   )
 61 |   
 62 |   saveRDS(object = rds, file = sprintf('~/Dropbox/projects/lazyarray/experiment/data/%d.rds', ii))
 63 |   rds
 64 | })
 65 | 
 66 | 
 67 | 
 68 | temp_rds <- paste0(tempf, '.rds')
 69 | temp_h5 <- paste0(tempf, '.h5')
 70 | temp_lazy <- paste0(tempf, '.lazyarray')
 71 | 
 72 | plan('sequential')
 73 | 
 74 | lapply(1:10, function(ii){
 75 |   rds <- bench::mark(
 76 |     {
 77 |       raveio::load_h5(file = temp_h5, name = '/data')[,,,1:ii]
 78 |     },
 79 |     {
 80 |       lazyarray::set_lazy_threads(1L)
 81 |       y <- lazyarray(temp_lazy, storage_format = 'double', dim = dim(subarray))
 82 |       y[,,,1:ii]
 83 |     }, 
 84 |     {
 85 |       lazyarray::set_lazy_threads(2L)
 86 |       y <- lazyarray(temp_lazy, storage_format = 'double', dim = dim(subarray))
 87 |       y[,,,1:ii]
 88 |     }, 
 89 |     {
 90 |       lazyarray::set_lazy_threads(3L)
 91 |       y <- lazyarray(temp_lazy, storage_format = 'double', dim = dim(subarray))
 92 |       y[,,,1:ii]
 93 |     }, 
 94 |     {
 95 |       lazyarray::set_lazy_threads(4L)
 96 |       y <- lazyarray(temp_lazy, storage_format = 'double', dim = dim(subarray))
 97 |       y[,,,1:ii]
 98 |     }, 
 99 |     memory = TRUE, time_unit = 's', iterations = 1, check = FALSE, filter_gc = FALSE
100 |   )
101 |   
102 |   saveRDS(object = rds, file = sprintf('~/Dropbox/projects/lazyarray/experiment/data/read_%d.rds', ii))
103 | })
104 | 
105 | 
106 | 
107 | 


--------------------------------------------------------------------------------
/experiment/old/test_io.R:
--------------------------------------------------------------------------------
  1 | 
  2 | require(testthat)
  3 | 
  4 | context("cpp - IO base")
  5 | 
  6 | 
  7 | dim <- c(10,20,50); 
  8 | 
  9 | f <- normalizePath(tempfile(), mustWork = FALSE)
 10 | 
 11 | test_that("IO - double, NumericVector", {
 12 |   
 13 |   x <- rnorm(10000); dim(x) <- dim
 14 |   unlink(f)
 15 |   test_data <- x; dim(test_data) <- c(200, 50); test_data <- as.data.frame(test_data)
 16 |   .Call(fstcore:::`_fstcore_fststore`, f, test_data, 100L, TRUE)
 17 |   expect_true(file.exists(f), label = "fstcore can write to file")
 18 |   
 19 |   unlink(f)
 20 |   # write to a file
 21 |   file.create(f)
 22 |   f <- normalizePath(f, mustWork = FALSE)
 23 |   unlink(f)
 24 |   lazyarray:::cpp_create_lazyarray(x, dim, f, 100L, TRUE);
 25 |   
 26 |   expect_true(file.exists(f), label = "cpp_create_lazyarray can write to file")
 27 |   
 28 |   
 29 |   # Make sure we have invalid indices
 30 |   idx_loc <- list(
 31 |     as.integer(sample(12) - 1),
 32 |     as.integer(sample(22)-1),
 33 |     as.integer(sample(52)-1)
 34 |   )
 35 |   target_dim = sapply(idx_loc, length)
 36 |   y1 <- lazyarray:::lazyLoadOld(f, idx_loc, dim, 0.1)
 37 |   
 38 |   a = idx_loc[[1]]; a[(a<1) | (a >10)] = NA
 39 |   b = idx_loc[[2]]; b[b<1 | b > 20] = NA
 40 |   c = idx_loc[[3]]; c[c<1 | c > 50] = NA
 41 |   y2 <- x[a,b,c]
 42 |   
 43 |   expect_equal(sum(abs(is.na(y2) - is.na(y1))), 0, label = 'lazyarray subset (double) vs base -> index')
 44 |   expect_equal(range(y2 - y1, na.rm = TRUE), c(0,0), label = 'lazyarray subset (double) vs base -> value')
 45 |   
 46 |   # trying other loader
 47 |   y3 <- lazyarray:::lazyLoadOld(f, idx_loc, dim, 0L)
 48 |   expect_equal(range(as.integer(y2) - y3, na.rm = TRUE), c(0,0), label = 'lazyarray stored: double -> loader: int')
 49 |   
 50 |   y3 <- lazyarray:::lazyLoadOld(f, idx_loc, dim, '')
 51 |   expect_equal(local({y2 <- as.character(y2); dim(y2) = dim(y3); y2}), y3, label = 'lazyarray stored: double -> loader: char')
 52 |   
 53 | })
 54 | 
 55 | 
 56 | test_that("IO - double, IntegerVector", {
 57 |   x <- as.integer(sample(10000)); dim(x) <- dim
 58 |   unlink(f)
 59 |   lazyarray:::cpp_create_lazyarray(x, dim, f, 100L, TRUE);
 60 |   
 61 |   expect_true(file.exists(f), label = "cpp_create_lazyarray can write to file")
 62 |   
 63 |   idx_loc <- list(
 64 |     as.integer(sample(12) - 1),
 65 |     as.integer(sample(22)-1),
 66 |     as.integer(sample(52)-1)
 67 |   )
 68 |   y1 <- lazyarray:::lazyLoadOld(f, idx_loc, dim, 9L)
 69 |   
 70 |   a = idx_loc[[1]]; a[(a<1) | (a >10)] = NA
 71 |   b = idx_loc[[2]]; b[b<1 | b > 20] = NA
 72 |   c = idx_loc[[3]]; c[c<1 | c > 50] = NA
 73 |   y2 <- x[a,b,c]
 74 |   
 75 |   expect_equal(sum(abs(is.na(y2) - is.na(y1))), 0, label = 'lazyarray subset (int) vs base -> index')
 76 |   expect_equal(range(y2 - y1, na.rm = TRUE), c(0,0), label = 'lazyarray subset (int) vs base -> value')
 77 |   expect_true(is.integer(y1), label = "check if it's integer")
 78 |   
 79 |   # trying other loaders
 80 |   y3 <- lazyarray:::lazyLoadOld(f, idx_loc, dim, 0.0)
 81 |   expect_equal(range(y2 - y3, na.rm = TRUE), c(0,0), label = 'lazyarray stored: int -> loader: double')
 82 |   
 83 |   y3 <- lazyarray:::lazyLoadOld(f, idx_loc, dim, '')
 84 |   expect_equal(local({y2 <- as.character(y2); dim(y2) = dim(y3); y2}), y3, label = 'lazyarray stored: int -> loader: char')
 85 |   
 86 | })
 87 | 
 88 | 
 89 | 
 90 | test_that("IO - double, CharacterVector", {
 91 |   
 92 |   x <- paste(sample(LETTERS, 10000, replace = TRUE), " asdasd"); dim(x) <- dim
 93 |   unlink(f)
 94 |   lazyarray:::cpp_create_lazyarray(x, dim, f, 100L, TRUE);
 95 |   
 96 |   expect_true(file.exists(f), label = "cpp_create_lazyarray can write to file")
 97 |   
 98 |   idx_loc <- list(
 99 |     as.integer(sample(12) - 1),
100 |     as.integer(sample(22)-1),
101 |     as.integer(sample(52)-1)
102 |   )
103 |   y1 <- lazyarray:::lazyLoadOld(f, idx_loc, dim, '')
104 |   
105 |   a = idx_loc[[1]]; a[(a<1) | (a >10)] = NA
106 |   b = idx_loc[[2]]; b[b<1 | b > 20] = NA
107 |   c = idx_loc[[3]]; c[c<1 | c > 50] = NA
108 |   y2 <- x[a,b,c]
109 |   
110 |   expect_equal(sum(abs(is.na(y2) - is.na(y1))), 0, label = 'lazyarray subset (char) vs base -> index')
111 |   expect_equal(y1, y2, label = 'lazyarray subset (char) vs base -> value')
112 |   expect_true(is.character(y1), label = "check if it's integer")
113 |   
114 |   # trying other loaders
115 |   expect_error(lazyarray:::lazyLoadOld(f, idx_loc, dim, 0.0))
116 |   
117 |   expect_error(lazyarray:::lazyLoadOld(f, idx_loc, dim, 0L))
118 |   
119 |   x <- paste0(sample(0:9, 10000, replace = TRUE), ""); dim(x) <- dim
120 |   unlink(f)
121 |   lazyarray:::cpp_create_lazyarray(x, dim, f, 100L, TRUE);
122 |   
123 |   expect_error(lazyarray:::lazyLoadOld(f, idx_loc, dim, 0.0))
124 |   
125 |   expect_error(lazyarray:::lazyLoadOld(f, idx_loc, dim, 0L))
126 |   
127 | })
128 | 


--------------------------------------------------------------------------------
/experiment/old/write.R:
--------------------------------------------------------------------------------
 1 | f = tempfile()
 2 | data = list(
 3 |   a = 1:10,
 4 |   b = 1:2
 5 | )
 6 | 
 7 | 
 8 | .Call(fstcore:::`_fstcore_fststore`, normalizePath(f, mustWork = FALSE), data, 20L, TRUE)
 9 | tbl = fst::fst(f)
10 | tbl
11 | 
12 | 
13 | 
14 | 
15 | 
16 | 


--------------------------------------------------------------------------------
/experiment/performance-fst-vs-file.R:
--------------------------------------------------------------------------------
 1 | # remotes::install_github('dipterix/lazyarray@filearray')
 2 | 
 3 | require(lazyarray)
 4 | 
 5 | path1 <- tempfile()
 6 | path2 <- tempfile()
 7 | 
 8 | dim <- c(100, 100, 100, 100)
 9 | data <- rnorm(prod(dim))
10 | pryr::object_size(data)
11 | 
12 | fstarray <- fstarray(path1, dim)
13 | filearray <- lazyarray(path2, dim)
14 | 
15 | fstarray[] <- data
16 | filearray[] <- data
17 | 
18 | system.time({fstarray[,,,]})
19 | 
20 | #  user  system elapsed 
21 | # 2.380   1.052   1.296
22 | 
23 | system.time({filearray[,,,]})
24 | 
25 | #  user  system elapsed 
26 | # 0.377   0.322   0.281
27 | 
28 | 
29 | # range(fstarray[,,,] - filearray[,,,])
30 | lazyarray:::setLazyBlockSize(100)
31 | 


--------------------------------------------------------------------------------
/inst/WORDLIST:
--------------------------------------------------------------------------------
1 | cloneable
2 | fstcore
3 | LazyArray
4 | LZ
5 | OpenMP
6 | readRDS
7 | ZSTD
8 | 


--------------------------------------------------------------------------------
/inst/include/interfaces/BMArray.h:
--------------------------------------------------------------------------------
 1 | #ifndef API_LAZYARRAY_BMCLASS_H
 2 | #define API_LAZYARRAY_BMCLASS_H
 3 | 
 4 | #include "entry.h"
 5 | #include "LazyArrayBase.h"
 6 | 
 7 | namespace lazyarray {
 8 | 
 9 | class BMArray : public LazyArrayBase {
10 |   
11 |   // Constructors and field getter/setter
12 | public:
13 |   BMArray(
14 |     BigMatrix *pMat, std::vector<int64_t> dimension, SEXPTYPE dataType
15 |   ): 
16 |   LazyArrayBase(dimension, dataType), _xpMat( pMat )
17 |   {
18 |     validate();
19 |   }
20 |   
21 |   // To be used by Rcpp modules
22 |   virtual ~BMArray(){}
23 |   
24 |   // methods
25 |   bool validate(bool stopIfError = true) override {
26 |     
27 |     // Rcpp::XPtr<BigMatrix> _xpMat(_addr);
28 |     
29 |     //_nparts _totalLen fstFiles dimension
30 |     bool isValid = true;
31 |     isValid = isValid && stopIfNot(_dimension.size() >= 2, "BMArray must dimension >= 2", stopIfError);
32 |     isValid = isValid && stopIfNot(*(_dimension.end() - 1) == _nparts, "BMArray dimensions inconsistent with number of partitions", stopIfError);
33 |     
34 |     
35 |     int64_t ncols = _xpMat->ncol();
36 |     int64_t nrows = _xpMat->nrow();
37 |     isValid = isValid && stopIfNot(ncols == _nparts, "BMArray internal matrix column inconsistent with number of partitions", stopIfError);
38 |     
39 |     int64_t expectedLen = std::accumulate(_dimension.begin(), _dimension.end(), INTEGER64_ONE, std::multiplies<int64_t>());
40 |     isValid = isValid && stopIfNot((expectedLen == _totalLen) && (nrows * ncols == _totalLen), 
41 |                                    "BMArray file counts inconsistent with number of partitions", stopIfError);
42 |     
43 |     // _dataType 13:int, 14:double, 15: complex, 16: string
44 |     isValid = isValid && stopIfNot(
45 |       _dataType == INTSXP || _dataType == REALSXP || _dataType == STRSXP,
46 |       "BMArray data type invalid. Supported are: int(13), double(14), string(16)", stopIfError);
47 |     
48 |     return isValid;
49 |   }
50 |   
51 |   
52 |   SEXP subset(SEXP listOrEnv, SEXP reshape = R_NilValue, bool drop = false) override {
53 |     NumericVector dim = getDim();
54 |     Rcpp::XPtr<BigMatrix> ptr(_xpMat);
55 |     return subsetBM(Rcpp::Shield<SEXP>(Rcpp::wrap(ptr)), listOrEnv, dim, _dataType, reshape, drop);
56 |     // return R_NilValue;
57 |   };
58 |   
59 | private:
60 |   BigMatrix* _xpMat;
61 | };
62 | 
63 | }
64 | 
65 | #endif // API_LAZYARRAY_BMCLASS_H
66 | 


--------------------------------------------------------------------------------
/inst/include/interfaces/FstArray.h:
--------------------------------------------------------------------------------
 1 | #ifndef API_LAZYARRAY_FSTCLASS_H
 2 | #define API_LAZYARRAY_FSTCLASS_H
 3 | 
 4 | #include <string>
 5 | #include "LazyArrayBase.h"
 6 | 
 7 | namespace lazyarray {
 8 | 
 9 | class FstArray : public LazyArrayBase {
10 |   
11 |   // Constructors and field getter/setter
12 | public:
13 |   FstArray(
14 |     const std::string rootPath, std::vector<int64_t> dimension, 
15 |     SEXPTYPE dataType, int& compression, bool& uniformEncoding
16 |   ): 
17 |   LazyArrayBase(dimension, dataType), 
18 |   _compression(compression), _uniformEncoding(uniformEncoding)
19 |   {
20 |     if(rootPath.size() == 0){
21 |       _rootPath = "./";
22 |     } else {
23 |       std::string ending = "/";
24 |       if(std::equal(ending.rbegin(), ending.rend(), rootPath.rbegin())){
25 |         _rootPath = rootPath;
26 |       } else {
27 |         _rootPath = rootPath + ending;
28 |       }
29 |     }
30 |     validate();
31 |   }
32 |   
33 |   virtual ~FstArray(){ destroy(); }
34 |   
35 |   // methods
36 |   inline bool validate(bool stopIfError = true) {
37 |     //_nparts _totalLen fstFiles dimension
38 |     bool isValid = LazyArrayBase::validate(stopIfError);
39 |     isValid = isValid && stopIfNot(
40 |       _dataType == INTSXP || _dataType == REALSXP || _dataType == CPLXSXP || _dataType == STRSXP,
41 |       "FstArray/FstMatrix data type invalid. Supported are: int(13), double(14), complex(15), string(16)", stopIfError);
42 |     return isValid;
43 |   }
44 |   
45 |   inline SEXP subset(SEXP listOrEnv, SEXP reshape = R_NilValue, bool drop = false) override {
46 |     tok("S subset");
47 |     SEXP res = subsetFST(_rootPath, listOrEnv, _dimension, _dataType, reshape, drop);;
48 |     tok("E subset");
49 |     return res;
50 |   };
51 |   
52 |   inline SEXP subsetAssign(SEXP values, SEXP listOrEnv) override {
53 |     tok("S subsetAssign");
54 |     subsetAssignFST(values, _rootPath, listOrEnv, _dimension, _dataType, _compression, _uniformEncoding);
55 |     tok("E subsetAssign");
56 |     return R_NilValue;
57 |   }
58 |   
59 |   inline Rcpp::StringVector get_partition_path(SEXP part = R_NilValue){
60 |     tok("S get_partition_path");
61 |     Rcpp::StringVector fstFiles;
62 |     if(Rf_isNull(part)){
63 |       fstFiles = Rcpp::StringVector(_nparts);
64 |       Rcpp::StringVector::iterator ptr_fstFiles = fstFiles.begin();
65 |       for(int64_t ii = 1; ptr_fstFiles != fstFiles.end(); ii++, ptr_fstFiles++){
66 |         *ptr_fstFiles = _rootPath + std::to_string(ii) + ".fst";
67 |       }
68 |     } else {
69 |       std::vector<int64_t> part_alt = as<std::vector<int64_t>>(part);
70 |       fstFiles = Rcpp::StringVector(part_alt.size());
71 |       Rcpp::StringVector::iterator ptr_fstFiles = fstFiles.begin();
72 |       std::vector<int64_t>::iterator ptr_part_alt = part_alt.begin();
73 |       for(; ptr_fstFiles != fstFiles.end(); ptr_part_alt++, ptr_fstFiles++){
74 |         *ptr_fstFiles = _rootPath + std::to_string(*ptr_part_alt) + ".fst";
75 |       }
76 |     }
77 |     tok("E get_partition_path");
78 |     return fstFiles;
79 |   }
80 |   
81 |   
82 | protected:
83 |   int  _compression;
84 |   bool _uniformEncoding;
85 |   std::string _rootPath;
86 | 
87 | 
88 | };
89 | 
90 | 
91 | }
92 | 
93 | 
94 | #endif // API_LAZYARRAY_FSTCLASS_H
95 | 


--------------------------------------------------------------------------------
/inst/include/interfaces/FstMatrix.h:
--------------------------------------------------------------------------------
  1 | #ifndef API_LAZYARRAY_FSTMATRIX_H
  2 | #define API_LAZYARRAY_FSTMATRIX_H
  3 | 
  4 | #include "FstArray.h"
  5 | 
  6 | namespace lazyarray {
  7 | 
  8 | class FstMatrix : public FstArray {
  9 |   
 10 |   // Constructors and field getter/setter
 11 | public:
 12 |   FstMatrix(
 13 |     const std::string rootPath, std::vector<int64_t> dimension, bool& transposed,
 14 |     SEXPTYPE dataType, int& compression, bool& uniformEncoding
 15 |   ): FstArray(rootPath, dimension, dataType, compression, uniformEncoding), _transposed(transposed) {}
 16 |   
 17 |   virtual ~FstMatrix(){ destroy(); }
 18 |   
 19 |   bool validate(bool stopIfError = true) {
 20 |     //_nparts _totalLen fstFiles dimension
 21 |     bool isValid = FstArray::validate(stopIfError);
 22 |     isValid = isValid && stopIfNot(_dimension.size() == 2, "FstMatrix must be two dimension", stopIfError);
 23 |     return isValid;
 24 |   }
 25 |   
 26 |   NumericVector getDim() override {
 27 |     NumericVector dim(2);
 28 |     if( _transposed ){
 29 |       dim[0] = _dimension[1];
 30 |       dim[1] = _dimension[0];
 31 |       dim.attr("is_transposed") = true;
 32 |     } else {
 33 |       dim[0] = _dimension[0];
 34 |       dim[1] = _dimension[1];
 35 |     }
 36 |     return dim;
 37 |   }
 38 |   
 39 |   // SEXP subset(SEXP listOrEnv, SEXP reshape = R_NilValue, bool drop = false) override {
 40 |   //   validate();
 41 |   //   
 42 |   //   List sliceIdx = extractSlices(listOrEnv, 2);
 43 |   //   bool resultNeedTranspose = false;
 44 |   //   if(_transposed){
 45 |   //     resultNeedTranspose = true;
 46 |   //     if( sliceIdx.size() == 2 ){
 47 |   //       SEXP tmp = sliceIdx[0];
 48 |   //       sliceIdx[0] = sliceIdx[1];
 49 |   //       sliceIdx[1] = tmp;
 50 |   //     } else if( sliceIdx.size() == 1 && sliceIdx[0] != R_MissingArg ){
 51 |   //       // x[i] -> calculate row & col for i, switch back and calculate new i
 52 |   //       resultNeedTranspose = false;
 53 |   //       std::vector<int64_t> idx = sliceIdx[0];
 54 |   //       int64_t tmp;
 55 |   //       int64_t nrow = _dimension[1]; // column is row now
 56 |   //       int64_t ncol = _dimension[0];
 57 |   //       for(std::vector<int64_t>::iterator ptr_idx = idx.begin(); ptr_idx != idx.end(); ptr_idx++ ){
 58 |   //         if(*ptr_idx != NA_REAL && *ptr_idx != LLONG_MIN){
 59 |   //           tmp = ((*ptr_idx) - 1) % nrow;
 60 |   //           *ptr_idx = ((*ptr_idx) - 1 - tmp) / nrow + tmp * ncol + 1;
 61 |   //         }
 62 |   //       }
 63 |   //     }
 64 |   //   }
 65 |   //   NumericVector dim = int64t2NumericVector(_dimension);
 66 |   //   const List subparsed = parseAndScheduleBlocks(sliceIdx, dim);
 67 |   //   Rcpp::checkUserInterrupt();
 68 |   //   
 69 |   //   Rcpp::StringVector fstFiles = get_partition_path();
 70 |   //   
 71 |   //   SEXP res = subsetFSTBare(fstFiles, subparsed, dim, _dataType);
 72 |   //   if( resultNeedTranspose ){
 73 |   //     Environment base = Environment::base_env();
 74 |   //     res = as<Function>(base["t.default"])(res);
 75 |   //   }
 76 |   //   return res;
 77 |   // };
 78 |   
 79 |   // SEXP subsetAssign(SEXP values, SEXP listOrEnv) override {
 80 |   //   validate();
 81 |   //   // List sliceIdx = extractSlices(listOrEnv, 2);
 82 |   //   // bool resultNeedTranspose = false;
 83 |   //   // if(_transposed){
 84 |   //   //   resultNeedTranspose = true;
 85 |   //   //   if( sliceIdx.size() == 2 ){
 86 |   //   //     SEXP tmp = sliceIdx[0];
 87 |   //   //     sliceIdx[0] = sliceIdx[1];
 88 |   //   //     sliceIdx[1] = tmp;
 89 |   //   //   } else if( sliceIdx.size() == 1 && sliceIdx[0] != R_MissingArg ){
 90 |   //   //     // x[i] -> calculate row & col for i, switch back and calculate new i
 91 |   //   //     resultNeedTranspose = false;
 92 |   //   //     std::vector<int64_t> idx = sliceIdx[0];
 93 |   //   //     int64_t tmp;
 94 |   //   //     int64_t nrow = _dimension[1]; // column is row now
 95 |   //   //     int64_t ncol = _dimension[0];
 96 |   //   //     for(std::vector<int64_t>::iterator ptr_idx = idx.begin(); ptr_idx != idx.end(); ptr_idx++ ){
 97 |   //   //       if(*ptr_idx != NA_REAL && *ptr_idx != NA_INTEGER64){
 98 |   //   //         tmp = ((*ptr_idx) - 1) % nrow;
 99 |   //   //         *ptr_idx = ((*ptr_idx) - 1 - tmp) / nrow + tmp * ncol + 1;
100 |   //   //       }
101 |   //   //     }
102 |   //   //   }
103 |   //   // }
104 |   //   // NumericVector dim = int64t2NumericVector(_dimension);
105 |   //   // const List subparsed = parseAndScheduleBlocks(sliceIdx, dim);
106 |   //   // Rcpp::checkUserInterrupt();
107 |   //   // 
108 |   //   // SEXP res = subsetFSTBare(fstFiles, subparsed, dim, _dataType);
109 |   //   // if( resultNeedTranspose ){
110 |   //   //   Environment base = Environment::base_env();
111 |   //   //   res = as<Function>(base["t.default"])(res);
112 |   //   // }
113 |   //   // return res;
114 |   //   // subsetAssignFST(values, fstFiles, listOrEnv, dim, _dataType, _compression, _uniformEncoding);
115 |   //   return R_NilValue;
116 |   // }
117 |   
118 | protected:
119 |   bool _transposed;
120 |   
121 | 
122 | };
123 | 
124 | 
125 | }
126 | 
127 | 
128 | #endif // API_LAZYARRAY_FSTMATRIX_H
129 | 


--------------------------------------------------------------------------------
/inst/include/interfaces/LazyArrayBase.h:
--------------------------------------------------------------------------------
  1 | #ifndef API_LAZYARRAY_BASECLASS_H
  2 | #define API_LAZYARRAY_BASECLASS_H
  3 | 
  4 | #include "entry.h"
  5 | 
  6 | namespace lazyarray {
  7 | 
  8 |   namespace {
  9 |     const int64_t INTEGER64_ONE = 1;
 10 |     
 11 |     
 12 |     inline Rcpp::NumericVector int64t2NumericVector(std::vector<int64_t> x){
 13 |       Rcpp::NumericVector re = Rcpp::NumericVector(x.begin(), x.end());
 14 |       return(re);
 15 |     }
 16 |   }
 17 | 
 18 | class LazyArrayBase {
 19 |   
 20 | public:
 21 |   LazyArrayBase(
 22 |     std::vector<int64_t> dimension, SEXPTYPE dataType
 23 |   ): 
 24 |   _dimension(dimension), _dataType(dataType), _readOnly(true)
 25 |   {
 26 |     _nparts = *(_dimension.end() - 1);
 27 |     _totalLen = std::accumulate(_dimension.begin(), _dimension.end(), INTEGER64_ONE, std::multiplies<int64_t>());
 28 |     _partLen = _totalLen / _nparts;
 29 |     // validate();
 30 |   }
 31 |   
 32 |   virtual ~LazyArrayBase(){ destroy(); }
 33 |   
 34 |   // dimension of array
 35 |   
 36 |   
 37 |   // getter
 38 |   int64_t nparts() const { return _nparts; }
 39 |   int64_t partLen() const { return _partLen; }
 40 |   int dataType() const { return static_cast<int>(_dataType); }
 41 |   bool getReadOnly() const { return _readOnly; }
 42 |   virtual NumericVector getDim() { return int64t2NumericVector(_dimension); }
 43 |   
 44 |   // setter
 45 |   bool setReadOnly(const bool isReadOnly){
 46 |     _readOnly = isReadOnly;
 47 |     return _readOnly;
 48 |   }
 49 |   
 50 |   // abstract methods
 51 |   
 52 |   virtual SEXP subset(SEXP listOrEnv, SEXP reshape = R_NilValue, bool drop = false) {
 53 |     stop("c++: LazyArrayBase::subset not implemented");
 54 |   };
 55 |   virtual SEXP subsetAssign(SEXP values, SEXP listOrEnv) {
 56 |     stop("c++: LazyArrayBase::subsetAssign not implemented");
 57 |   }
 58 |   
 59 |   // pre-defined members
 60 |   /**
 61 |    * Validate whether the array has errors
 62 |    * @param stopIfError if true, stop or raise exceptions when errors are detected
 63 |    * @return true or false whether the array passes the validation test
 64 |    */
 65 |   bool validate(bool stopIfError = true) {
 66 |     bool isValid = true;
 67 |     isValid = isValid && stopIfNot(_dimension.size() >= 2, "LazyArray must have dimension size >= 2", stopIfError);
 68 |     isValid = isValid && stopIfNot(*(_dimension.end() - 1) == _nparts, "LazyArray dimensions inconsistent with number of partitions", stopIfError);
 69 |     int64_t expectedLen = std::accumulate(_dimension.begin(), _dimension.end(), INTEGER64_ONE, std::multiplies<int64_t>());
 70 |     isValid = isValid && stopIfNot(expectedLen == _totalLen, "LazyArray file counts inconsistent with number of partitions", stopIfError);
 71 |     return isValid;
 72 |   };
 73 |   
 74 |   /**
 75 |    * Parse indices and prepare the subset
 76 |    * @param sliceIdx  a list (VECSXP), its length should match with dimension size,
 77 |    *                  and each element should be REALSXP (int64_t) or INTSXP (int),
 78 |    *                  or just R_MissingValue indicating the indices to subset. 
 79 |    *                  For example, 
 80 |    *   x[1,2,3]         - sliceIdx=list(1, 2, 3)
 81 |    *   x[, c(0:10,NA)]  - sliceIdx=list(get_missing_value(), c(0:10,NA_real_))
 82 |    *   x[]              - sliceIdx=list() or list(get_missing_value())
 83 |    *   x[-(1,2)]        - sliceIdx=list(c(-1,-1))
 84 |    * @return A list containing parsed information:
 85 |    *   - subset_mode (int): subset mode: 0 means x[i,j,k], 1 means x[i], 2 means x[]
 86 |    *   - target_dimension (std::vector<int64_t> or NumericVector): expected dimension of results
 87 |    *   - expected_length (int64_t): expected result length (= prod(target_dimension))
 88 |    *   - negative_subscript: whether this margin should be negative sliced (currectly all false)
 89 |    *   - location_indices: a list of cleaned sliceIdx. For example:
 90 |    *       [expression]      => [location_indices]
 91 |    *       x[1,2,3]          => list(1,2,3)
 92 |    *       x[, c(0:10,NA)]   => list({R_MissingValue}, c(1:10,NA_REAL))
 93 |    *       x[]               => list({R_MissingValue})
 94 |    *       x[-(0:1),2]       => list(2:3, 2), suppose dimension is c(3,4)
 95 |    *   - schedule: a list or R_NilValue. If subset=0, schedule will schedule indexing
 96 |    *                  blocks to read from array. 
 97 |    */
 98 |   inline Rcpp::List scheduleBlocks(SEXP sliceIdx) {
 99 |     NumericVector dim = int64t2NumericVector(_dimension);
100 |     return parseAndScheduleBlocks2(sliceIdx, dim);
101 |   };
102 |   
103 |   
104 |   
105 |   
106 | protected:
107 |   
108 |   std::vector<int64_t> _dimension;
109 |   
110 |   // Number of partitions - last dimension
111 |   int64_t              _nparts;
112 |   
113 |   // Partition size
114 |   int64_t              _partLen;
115 |   
116 |   // 
117 |   int64_t              _totalLen;
118 |   SEXPTYPE             _dataType;
119 |   bool                 _readOnly;
120 |   
121 | protected:
122 |   virtual void destroy(){}
123 |   
124 | };
125 | 
126 | }
127 | 
128 | #endif // API_LAZYARRAY_BASECLASS_H
129 | 


--------------------------------------------------------------------------------
/inst/include/interfaces/entry.h:
--------------------------------------------------------------------------------
 1 | #ifndef API_LAZYARRAY_ENTRY
 2 | #define API_LAZYARRAY_ENTRY
 3 | 
 4 | // This first
 5 | // migrated to BMArray
 6 | // include <bigmemory/MatrixAccessor.hpp>
 7 | 
 8 | #include <Rcpp.h>
 9 | 
10 | #include "lazyarray_RcppExports.h"
11 | 
12 | #endif // API_LAZYARRAY_ENTRY
13 | 


--------------------------------------------------------------------------------
/inst/include/lazyarray.h:
--------------------------------------------------------------------------------
 1 | #ifndef RCPP_lazyarray_H_GEN_
 2 | #define RCPP_lazyarray_H_GEN_
 3 | 
 4 | #include "interfaces/entry.h"
 5 | #include "interfaces/LazyArrayBase.h"
 6 | #include "interfaces/FstArray.h"
 7 | #include "interfaces/FstMatrix.h"
 8 | //include "interfaces/BMArray.h"
 9 | 
10 | // namespace lazyarray {
11 | //   namespace {
12 | //     const int64_t INTEGER64_ONE = 1;
13 | //     
14 | //     
15 | //     inline Rcpp::NumericVector int64t2NumericVector(std::vector<int64_t> x){
16 | //       Rcpp::NumericVector re = Rcpp::NumericVector(x.begin(), x.end());
17 | //       return(re);
18 | //     }
19 | //   }
20 | // 
21 | // 
22 | // // Implement class members here
23 | // Rcpp::List scheduleBlocks(SEXP sliceIdx);
24 | // LazyArrayBase::bool validate(bool stopIfError = true);
25 | // 
26 | // 
27 | // }
28 | 
29 | #endif // RCPP_lazyarray_H_GEN_
30 | 


--------------------------------------------------------------------------------
/lazyarray.Rproj:
--------------------------------------------------------------------------------
 1 | Version: 1.0
 2 | 
 3 | RestoreWorkspace: Default
 4 | SaveWorkspace: Default
 5 | AlwaysSaveHistory: Default
 6 | 
 7 | EnableCodeIndexing: Yes
 8 | UseSpacesForTab: Yes
 9 | NumSpacesForTab: 2
10 | Encoding: UTF-8
11 | 
12 | RnwWeave: knitr
13 | LaTeX: pdfLaTeX
14 | 
15 | AutoAppendNewline: Yes
16 | 
17 | BuildType: Package
18 | PackageUseDevtools: Yes
19 | PackageInstallArgs: --no-multiarch --with-keep.source
20 | PackageCheckArgs: --as-cran
21 | PackageRoxygenize: rd,collate,namespace,vignette
22 | 


--------------------------------------------------------------------------------
/man/.gitignore:
--------------------------------------------------------------------------------
1 | .DS_Store
2 | 


--------------------------------------------------------------------------------
/man/auto_clear_lazyarray.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/aaa.R
 3 | \name{auto_clear_lazyarray}
 4 | \alias{auto_clear_lazyarray}
 5 | \title{Automatically remove array data}
 6 | \usage{
 7 | auto_clear_lazyarray(x, onexit = FALSE)
 8 | }
 9 | \arguments{
10 | \item{x}{'lazyarray' instance}
11 | 
12 | \item{onexit}{passed to \code{\link{reg.finalizer}}}
13 | }
14 | \description{
15 | Remove the files containing array data once no
16 | 'lazyarray' instance is using the folder. Require
17 | installation of \code{dipsaus} package (at least version 0.0.8).
18 | }
19 | \details{
20 | \code{auto_clear_lazyarray} attempts to remove the entire folder
21 | containing array data. However, if some files are not created by the
22 | array, only partition data and meta file will be removed, all the 
23 | artifacts will remain and warning will be displayed. One exception is
24 | if all files left in the array directory are \code{*.meta} files, 
25 | all these meta files will be removed along with the folder.
26 | }
27 | \examples{
28 | 
29 | path <- tempfile()
30 | arr_dbl <- lazyarray(path, storage_format = 'double',
31 |                      dim = 2:4, meta_name = 'meta-dbl.meta')
32 | arr_dbl[] <- 1:24
33 | auto_clear_lazyarray(arr_dbl)
34 | 
35 | arr_chr <- lazyarray(path, storage_format = 'character',
36 |                      dim = 2:4, meta_name = 'meta-chr.meta')
37 | auto_clear_lazyarray(arr_chr)
38 | 
39 | # remove either one, the directory still exists
40 | rm(arr_dbl); invisible(gc(verbose = FALSE))
41 | 
42 | arr_chr[1,1,1]
43 | 
44 | # Remove the other one, and path will be removed
45 | rm(arr_chr); invisible(gc(verbose = FALSE))
46 | 
47 | dir.exists(path)
48 | arr_check <- lazyarray(path, storage_format = 'character',
49 |                        dim = 2:4, meta_name = 'meta-chr')
50 | 
51 | # data is removed, so there should be no data (NAs)
52 | arr_check[]
53 | 
54 | }
55 | \author{
56 | Zhengjia Wang
57 | }
58 | 


--------------------------------------------------------------------------------
/man/chunk_map.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/generics.R
 3 | \name{chunk_map}
 4 | \alias{chunk_map}
 5 | \title{Apply functions to all partitions, but small chunks each time}
 6 | \usage{
 7 | chunk_map(x, map_fun, reduce, max_nchunks, chunk_size, ...)
 8 | }
 9 | \arguments{
10 | \item{x}{a \code{LazyArray} or R array}
11 | 
12 | \item{map_fun}{function to apply to each chunk}
13 | 
14 | \item{reduce}{similar to \code{reduce} in \code{\link{partition_map}}}
15 | 
16 | \item{max_nchunks}{maximum number of chunks. If number of chunks is too 
17 | large, then \code{chunk_size} will be re-calculated.}
18 | 
19 | \item{chunk_size}{integer chunk size. If \code{chunk_size} is too small, it 
20 | will be ignored}
21 | 
22 | \item{...}{ignored or passed to other methods}
23 | }
24 | \value{
25 | If \code{reduce} is missing, returns a list of results. Each result
26 | is returned by \code{map_fun}, and the total length equals to number of 
27 | chunks mapped. If \code{reduce} is a function, that list of results will
28 | be passed to \code{reduce} and \code{chunk_map} returns the results 
29 | generated from \code{reduce}.
30 | }
31 | \description{
32 | Apply functions to all partitions, but small chunks each time
33 | }
34 | \details{
35 | The difference between \code{chunk_map} and 
36 | \code{partition_map} is the margin or direction to apply mapping
37 | functions. In \code{partition_map}, mapping function is applied to 
38 | each partition. If \code{x} is a matrix, this means applying to each column.
39 | \code{chunk_map} generate small chunks along all dimensions except the last,
40 | and apply mapping functions to each chunks. If \code{x} is a matrix, it 
41 | make chunks along rows and apply mapping functions along rows.
42 | }
43 | \examples{
44 | 
45 | x <- as.lazymatrix(matrix(1:100, ncol = 2))
46 | x
47 | 
48 | # Set max_nchunks=Inf and chunk_size=10 to force total number of chunks
49 | # is around nrow(x)/10 and each chunk contains at most 10 rows
50 | chunk_map(x, function(chunk){chunk[1:2,]}, chunk_size = 10, max_nchunks = Inf)
51 | 
52 | # For each chunks, calculate mean, then calculate the mean of chunk mean
53 | chunk_map(x, function(chunk) {
54 |   colMeans(chunk)
55 | }, function(chunk_means) {
56 |   Reduce('+', chunk_means) / length(chunk_means)
57 | })
58 | 
59 | colMeans(x[])
60 | 
61 | 
62 | }
63 | \seealso{
64 | \code{\link{partition_map}}
65 | }
66 | 


--------------------------------------------------------------------------------
/man/crossprod.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/zzz.R
 3 | \name{crossprod}
 4 | \alias{crossprod}
 5 | \alias{crossprod,AbstractLazyArray,AbstractLazyArray-method}
 6 | \alias{crossprod,AbstractLazyArray,NULL-method}
 7 | \alias{crossprod,AbstractLazyArray,missing-method}
 8 | \alias{crossprod,AbstractLazyArray,matrix-method}
 9 | \title{Matrix Crossproduct}
10 | \usage{
11 | \S4method{crossprod}{AbstractLazyArray,AbstractLazyArray}(x, y = NULL, weights = NULL, ...)
12 | 
13 | \S4method{crossprod}{AbstractLazyArray,`NULL`}(x, y = NULL, weights = NULL, ...)
14 | 
15 | \S4method{crossprod}{AbstractLazyArray,missing}(x, y = NULL, weights = NULL, ...)
16 | 
17 | \S4method{crossprod}{AbstractLazyArray,matrix}(x, y = NULL, weights = NULL, ...)
18 | }
19 | \arguments{
20 | \item{x}{a \code{LazyArray} or an R matrix}
21 | 
22 | \item{y}{\code{NULL} or matrix}
23 | 
24 | \item{weights}{numeric vector used as weight}
25 | 
26 | \item{...}{passed to further methods}
27 | }
28 | \value{
29 | Matrix of cross product if data is small, or \code{LazyMatrix} if
30 | matrix is too large
31 | }
32 | \description{
33 | Matrix Crossproduct
34 | }
35 | \examples{
36 | 
37 | x <- matrix(1:100, 50)
38 | crossprod(x)
39 | 
40 | lazy_x <- as.lazymatrix(x)
41 | crossprod(lazy_x)[]
42 | 
43 | weights <- (1:50)/50
44 | 
45 | t(x) \%*\% diag(weights) \%*\% x
46 | crossprod(lazy_x, weights = weights)
47 | 
48 | \dontrun{
49 | 
50 | # large data set ~ 1.6GB
51 | x <- as.lazymatrix(matrix(rnorm(2e8), ncol = 2))
52 | 
53 | crossprod(x)
54 | }
55 | 
56 | 
57 | }
58 | 


--------------------------------------------------------------------------------
/man/lazy_parallel.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/zzz.R
 3 | \name{lazy_parallel}
 4 | \alias{lazy_parallel}
 5 | \title{Schedule parallel processes for \code{LazyArray}}
 6 | \usage{
 7 | lazy_parallel(
 8 |   strategy = c("multisession", "multicore", "multiprocess", "cluster", "remote",
 9 |     "callr"),
10 |   enabled = TRUE,
11 |   workers = "auto",
12 |   ...
13 | )
14 | }
15 | \arguments{
16 | \item{strategy}{strategies to apply, see \code{\link[future]{plan}} for
17 | some of the details. For \code{"callr"} plan, please install package}
18 | 
19 | \item{enabled}{whether multiple-process strategy is enabled}
20 | 
21 | \item{workers}{positive integer or \code{"auto"}, number of 'CPU' to use.
22 | The default value is \code{"auto"}, i.e. \code{future::availableCores()}}
23 | 
24 | \item{...}{Further passed to \code{\link[future]{plan}}}
25 | }
26 | \description{
27 | Enable parallel processing, need \code{dipsaus} to be installed.
28 | For \code{"callr"} strategy, please also install \code{future.callr}.
29 | }
30 | 


--------------------------------------------------------------------------------
/man/lazyarray-threads.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/aaa.R
 3 | \name{lazyarray-threads}
 4 | \alias{lazyarray-threads}
 5 | \alias{set_lazy_threads}
 6 | \alias{get_lazy_threads}
 7 | \title{Set Number of Threads for Lazy Arrays}
 8 | \usage{
 9 | set_lazy_threads(
10 |   nr_of_threads = getOption("lazyarray.nthreads"),
11 |   reset_after_fork = NULL
12 | )
13 | 
14 | get_lazy_threads(max = FALSE)
15 | }
16 | \arguments{
17 | \item{nr_of_threads}{number of CPU cores to use, or \code{NULL} to 
18 | stay unchanged, default is \code{getOption('lazyarray.nthreads')}}
19 | 
20 | \item{reset_after_fork}{whether to reset after forked process}
21 | 
22 | \item{max}{whether return maximum available threads}
23 | }
24 | \value{
25 | Number of cores currently used.
26 | }
27 | \description{
28 | Set number of threads used by 'OpenMP' for both \code{lazyarray}
29 | and \code{fstcore} packages.
30 | }
31 | \seealso{
32 | \code{\link[fstcore]{threads_fstlib}}
33 | }
34 | 


--------------------------------------------------------------------------------
/man/lazyarray.Rd:
--------------------------------------------------------------------------------
  1 | % Generated by roxygen2: do not edit by hand
  2 | % Please edit documentation in R/lazyarray.R
  3 | \name{lazyarray}
  4 | \alias{lazyarray}
  5 | \alias{fstarray}
  6 | \alias{filearray}
  7 | \alias{as.lazymatrix}
  8 | \alias{as.lazyarray}
  9 | \title{Create or load a \code{lazyarray} instance}
 10 | \usage{
 11 | lazyarray(
 12 |   path,
 13 |   dim,
 14 |   read_only = FALSE,
 15 |   type = c("filearray", "fstarray"),
 16 |   storage_format = c("double", "integer", "complex", "character"),
 17 |   meta_name = "lazyarray.meta"
 18 | )
 19 | 
 20 | fstarray(
 21 |   path,
 22 |   dim,
 23 |   read_only = FALSE,
 24 |   storage_format = c("double", "integer", "complex", "character"),
 25 |   meta_name = "lazyarray.meta"
 26 | )
 27 | 
 28 | filearray(
 29 |   path,
 30 |   dim,
 31 |   read_only = FALSE,
 32 |   storage_format = c("double", "integer"),
 33 |   meta_name = "lazyarray.meta"
 34 | )
 35 | 
 36 | as.lazymatrix(x, ...)
 37 | 
 38 | as.lazyarray(x, path, type = "filearray", ...)
 39 | }
 40 | \arguments{
 41 | \item{path}{path to a local drive where array data should be stored}
 42 | 
 43 | \item{dim}{integer vector, dimension of array, see \code{\link{dim}}}
 44 | 
 45 | \item{read_only}{whether created array is read-only}
 46 | 
 47 | \item{type}{the back-end implementation of the array; choices are 
 48 | \code{"filearray"} and \code{"fstarray"}.}
 49 | 
 50 | \item{storage_format}{data type, choices are \code{"double"}, 
 51 | \code{"integer"}, \code{"character"}, and \code{"complex"}; see details}
 52 | 
 53 | \item{meta_name}{header file name, default is \code{"lazyarray.meta"}}
 54 | 
 55 | \item{x}{An R matrix or array}
 56 | 
 57 | \item{...}{passed into \code{lazyarray}}
 58 | }
 59 | \value{
 60 | An \code{R6} class of \code{lazyarray}. The class name is either
 61 | \code{FstArray} or \code{FileArray}, depending on \code{type} specified.
 62 | Both inherit \code{AbstractLazyArray}.
 63 | }
 64 | \description{
 65 | Creates or load a \code{lazyarray} that stores data on the hard 
 66 | disks. The data content is load on demand.
 67 | }
 68 | \details{
 69 | The function \code{lazyarray()} can either create or load an array
 70 | on the hard drives. When \code{path} exists as a directory, and there is
 71 | a valid array instance stored, \code{lazyarray} will ignore other parameters
 72 | such as \code{storage_format}, \code{type}, and sometimes \code{dim} (see 
 73 | Section "Array Partitions"). The function will try to load the existing array
 74 | given by the descriptive meta file. When \code{path} is missing or there is 
 75 | no valid array files inside of the directory, then a new array will be 
 76 | spawned, and \code{path} will be created automatically if it is missing.
 77 | 
 78 | There are two back-end implementations for \code{lazyarray()}:
 79 | \code{"filearray"} and \code{"fstarray"}. You can use \code{type} to 
 80 | specify which implementation serves your needs. There are some differences
 81 | between these two types. Each one has its own strengths and weaknesses. 
 82 | Please see Section "Array Types" for more details.
 83 | 
 84 | The argument \code{meta_name} specifies the name of file which stores
 85 | all the attribute information such as the total dimension, partition size,
 86 | file format, and storage format etc. There could be multiple meta files for 
 87 | the same array object; see Section "Array Partitions" for details.
 88 | }
 89 | \section{Array Types}{
 90 | 
 91 | 
 92 | Type \code{filearray} stores data in its binary form "as-is" to the local
 93 | drives. This format is compatible with the package \code{filematrix}.
 94 | The data types supported are integers and double-float numbers.
 95 | 
 96 | Type \code{fstarray} stores data in \code{fst} format defined by the 
 97 | package \code{fstcore} using 'ZSTD' compression technique. Unlike 
 98 | \code{filearray}, \code{fstarray} supports complex numbers and string 
 99 | characters in addition to integer and double numbers. 
100 | 
101 | The performance on solid-state drives mounted on 'NVMe' shows 
102 | \code{filearray} can reach up to 3 GB per second for reading speed and 
103 | \code{fstarray} can reach up to 1 GB per second. 
104 | 
105 | By default, \code{filearray} will be used if the storage format is supported,
106 | and \code{fstarray} is the back-up option. However, if the array data is
107 | structured or ordered, or the storage size is a major concern, 
108 | \code{fstarray} might achieve a better performance because it compresses 
109 | data before writing to hard drive.
110 | 
111 | To explicitly create file array, use the function \code{filearray()}. 
112 | Similarly, use \code{fstarray()} to create \code{fst}-based array.
113 | }
114 | 
115 | \section{Array Partitions}{
116 | 
117 | 
118 | A \code{lazyarray} partitions data in two ways: file partitions and in-file
119 | blocks. 
120 | 
121 | 1. File-level Partition:
122 | 
123 | The number of file partitions matches with the last array margin.
124 | Given a \eqn{100 x 200 x 30 x 4} array, there will be 4 partitions, each
125 | partition stores a slice of data containing a \eqn{100 x 200 x 30} 
126 | sub-array, or \eqn{2,400,000} elements. 
127 | 
128 | Once an array is created, the length of each partition does not change 
129 | anymore. However, the shape of each partition can be changed. The number of
130 | partitions can grow or trim. To change these, you just need to create a 
131 | new meta file and specify the new dimension at no additional cost. Use 
132 | the previous example. The partition sub-dimension can be 
133 | \eqn{10000 x 60}, \eqn{2000 x 300}, or \eqn{1000 x 200 x 3} as
134 | long as the total length matches. The total partitions can change to 
135 | 3, 5, or 100, or any positive integer. To change the total dimension to 
136 | \eqn{2400000 x 100}, you can call \code{lazyarray} with the new dimension (
137 | see examples). Please make sure the \code{type} and \code{meta_name} are 
138 | specified.
139 | 
140 | 2. In-file Blocks:
141 | 
142 | Within each file, the data are stored in blocks. When reading the data, if
143 | an element within each block is used, then the whole block gets read. 
144 | 
145 | For \code{filearray}, the block size equals to the first margin. For 
146 | example, a \eqn{100 x 200 x 3} file array will have 3 file partitions,
147 | 200 blocks, each block has 100 elements
148 | 
149 | As for \code{fstarray}, the lower bound of block size can be set by 
150 | \code{options(lazyarray.fstarray.blocksize=...)}. By default, this number is
151 | 16,384. For a \eqn{100 x 200 x 3} array, each partition only has one block
152 | and block number if 20,000.
153 | }
154 | 
155 | \section{Indexing and Recommended Dimension Settings}{
156 | 
157 | 
158 | If there is a dimension that defines the unit of analysis, then make it the 
159 | last margin index. If a margin is rarely indexed, put it in the first margin.
160 | This is because indexing along the last margin is the fastest, and indexing
161 | along the first margin is the slowest. 
162 | 
163 | If \code{x} has \eqn{200 x 200 x 200} dimension, \code{x[,,i]} is the 
164 | fastest, then \code{x[,i,]}, then \code{x[i,,]}.
165 | }
166 | 
167 | \examples{
168 | 
169 | library(lazyarray)
170 | 
171 | path <- tempfile()
172 | 
173 | # ---------------- case 1: Create new array ------------------
174 | arr <- lazyarray(path, storage_format = 'double', dim = c(2,3,4))
175 | arr[] <- 1:24
176 | 
177 | # Subset and get the first partition
178 | arr[,,1]
179 | 
180 | # Partition file path (total 4 partitions)
181 | arr$get_partition_fpath()
182 | 
183 | # Removing array doesn't clear the data
184 | rm(arr); gc()
185 | 
186 | # ---------------- Case 2: Load from existing directory ----------------
187 | # Load from existing path, no need to specify other params
188 | arr <- lazyarray(path, read_only = TRUE)
189 | 
190 | summary(arr, quiet = TRUE)
191 | 
192 | # ---------------- Case 3: Import from existing data ----------------
193 | 
194 | # Change dimension to 6 x 20
195 | 
196 | arr1 <- lazyarray(path, dim = c(6,20), meta_name = "arr_6x20.meta")
197 | 
198 | arr1[,1:5]
199 | 
200 | arr1[,1:6] <- rnorm(36)
201 | 
202 | # arr also changes
203 | arr[,,1]
204 | 
205 | 
206 | # ---------------- Case 4: Converting from R arrays ----------------
207 | 
208 | x <- matrix(1:16, 4)
209 | x <- as.lazymatrix(x, type = 'fstarray', storage_format = "complex")
210 | x[,]  # or x[]
211 | 
212 | 
213 | 
214 | }
215 | \author{
216 | Zhengjia Wang
217 | }
218 | 


--------------------------------------------------------------------------------
/man/lazylm.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/model-lm.R
 3 | \name{lazylm}
 4 | \alias{lazylm}
 5 | \title{Fitting linear models using \code{lazyarray}}
 6 | \usage{
 7 | lazylm(
 8 |   formula,
 9 |   data,
10 |   fitted = FALSE,
11 |   weights = NULL,
12 |   offset = NULL,
13 |   contrasts = NULL,
14 |   na.action = getOption("na.action"),
15 |   qr.tol = 1e-07,
16 |   ...
17 | )
18 | }
19 | \arguments{
20 | \item{formula}{an object of class \code{\link[stats]{formula}}. For variable
21 | names to be used, see 'Details'}
22 | 
23 | \item{data}{a \code{lazyarray} object}
24 | 
25 | \item{fitted}{whether to calculate fitted data and residuals. This may
26 | take time and memory if data is large}
27 | 
28 | \item{weights, offset, contrasts, na.action}{see \code{\link[stats]{lm}}}
29 | 
30 | \item{qr.tol}{the tolerance for detecting linear dependencies in the 
31 | partitions of \code{data}; see \code{\link{qr}}}
32 | 
33 | \item{...}{passed to \code{\link{chunk_map}}}
34 | }
35 | \value{
36 | An object of class \code{c("lazylm", "lm")} or for multiple 
37 | responses of class \code{c("lazylm", "mlm")}.
38 | }
39 | \description{
40 | Fitting linear models using \code{lazyarray}
41 | }
42 | \details{
43 | The array will be reshaped to a matrix first before fitting the 
44 | linear models. A \eqn{100 x 20 x 5} array will be reshaped to a 
45 | \eqn{2000 x 5} lazy matrix.
46 | The variables are the partitions of the array. If \code{dimnames}
47 | are set for the last margin index, then those will be used as variable 
48 | names, otherwise \code{lazylm} automatically assign 
49 | \eqn{"V1", "V2", "V3", ...} as each partition names.
50 | }
51 | \examples{
52 | 
53 | library(lazyarray)
54 | arr <- array(rnorm(72), c(6,3,4))
55 | arr[1,1,1] <- NA        # Allow NA to be treated
56 | offset = rnorm(18)    # offset and weights are optional
57 | weights = runif(18)
58 | 
59 | formula <- V1 ~ .-V2-1 + (V2 > 0)
60 | 
61 | data <- as.lazyarray(arr, type = 'file')
62 | object <- lazylm(formula, data, weights = weights, offset = offset)
63 | 
64 | 
65 | # Compare to stats::lm
66 | dim(arr) <- c(18, 4)
67 | lm_data <- as.data.frame(arr)
68 | flm <- lm(formula, lm_data, weights = weights, offset = offset)
69 | 
70 | cbind(coef(object), coef(flm))
71 | cbind(resid(object), resid(flm))
72 | cbind(fitted(object), fitted(flm))
73 | summary(object)
74 | summary(flm)
75 | 
76 | }
77 | 


--------------------------------------------------------------------------------
/man/partition_map.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/generics.R
 3 | \name{partition_map}
 4 | \alias{partition_map}
 5 | \title{Apply function along the last dimension of an array and aggregate the results}
 6 | \usage{
 7 | partition_map(x, map_fun, reduce, partitions, ...)
 8 | }
 9 | \arguments{
10 | \item{x}{R array or \code{LazyArray}}
11 | 
12 | \item{map_fun}{function that takes in a slice of array and an optional 
13 | argument indicating current partition number}
14 | 
15 | \item{reduce}{function that accept a list of results returned by 
16 | \code{map_fun}, can be missing}
17 | 
18 | \item{partitions}{integers of partitions, i.e. the slices of array to be 
19 | applied to, can be missing. If missing, then applies to all partitions}
20 | 
21 | \item{...}{internally used}
22 | }
23 | \value{
24 | If \code{reduce} is missing, returns a list of results. Each result
25 | is returned by \code{map_fun}, and the total length equals to number of 
26 | partitions mapped. If \code{reduce} is a function, that list of results will
27 | be passed to \code{reduce} and \code{partition_map} returns the results 
28 | generated from \code{reduce}.
29 | }
30 | \description{
31 | Apply function along the last dimension of an array and aggregate the results
32 | }
33 | \examples{
34 | 
35 | # -------------------------- Ordinary R array ---------------------------
36 | 
37 | x <- array(1:24, c(2,3,4))
38 | partition_map(x, function(slice, part){
39 |   sum(slice)
40 | })
41 | 
42 | # When reduce and partitions are missing, the following code is equivalent
43 | as.list(apply(x, 3, sum))
44 | 
45 | # When reduce is present
46 | partition_map(x, function(slice, part){
47 |   sum(slice)
48 | }, function(slice_sum){
49 |   max(unlist(slice_sum))
50 | })
51 | 
52 | # equivalently, we could call
53 | slice_sum <- partition_map(x, function(slice, part){
54 |   sum(slice)
55 | })
56 | max(unlist(slice_sum))
57 | 
58 | # When partition is specified
59 | # Partition 1, 2, and 4 exist but 5 is missing
60 | # when a partition is missing, the missing slice will be NA
61 | partition_map(x, function(slice, part){
62 |   sum(slice)
63 | }, partitions = c(1,2,4,5))
64 | 
65 | # -------------------------- LazyArray ---------------------------
66 | x <- lazyarray(tempfile(), storage_format = 'complex', dim = c(2,3,4))
67 | x[] <- 1:24 + (24:1) * 1i
68 | 
69 | partition_map(x, function(slice, part){
70 |   slice[1, ,] * slice[2, ,]
71 | }, reduce = function(mapped_prod){
72 |   mean(unlist(mapped_prod))
73 | })
74 | 
75 | 
76 | 
77 | }
78 | 


--------------------------------------------------------------------------------
/man/partition_table.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/generics.R
 3 | \name{partition_table}
 4 | \alias{partition_table}
 5 | \alias{partition_table.array}
 6 | \alias{partition_table.AbstractLazyArray}
 7 | \title{Generate partition summary statistics for array objects along the last 
 8 | dimension}
 9 | \usage{
10 | partition_table(x, na.rm = FALSE, ...)
11 | 
12 | \method{partition_table}{array}(x, na.rm = FALSE, ...)
13 | 
14 | \method{partition_table}{AbstractLazyArray}(x, na.rm = FALSE, ...)
15 | }
16 | \arguments{
17 | \item{x}{an array or \code{LazyArray}}
18 | 
19 | \item{na.rm}{whether to remove \code{NA} when calculating summary statistics}
20 | 
21 | \item{...}{passed to other methods or ignored}
22 | }
23 | \value{
24 | A data frame with the following possible columns: \code{Min}, 
25 | \code{Max}, \code{Mean}, \code{Standard Deviation}, \code{NAs} (total number
26 |  of \code{NA}), and \code{Length}.
27 | }
28 | \description{
29 | Generate partition summary statistics for array objects along the last 
30 | dimension
31 | }
32 | \examples{
33 | 
34 | # R array
35 | x <- array(1:27, c(3,3,3))
36 | partition_table(x)
37 | 
38 | # LazyArray
39 | x <- lazyarray(tempfile(), storage_format = 'double', dim = c(3,3,3))
40 | x[] <- 1:27
41 | partition_table(x, quiet=TRUE)
42 | 
43 | }
44 | 


--------------------------------------------------------------------------------
/man/typeof-AbstractLazyArray-method.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/zzz.R
 3 | \name{typeof,AbstractLazyArray-method}
 4 | \alias{typeof,AbstractLazyArray-method}
 5 | \title{Type of \code{LazyArray}}
 6 | \usage{
 7 | \S4method{typeof}{AbstractLazyArray}(x)
 8 | }
 9 | \arguments{
10 | \item{x}{a \code{LazyArray} or an R object}
11 | }
12 | \value{
13 | The type of data stored in the input
14 | }
15 | \description{
16 | Type of \code{LazyArray}
17 | }
18 | 


--------------------------------------------------------------------------------
/readme.md:
--------------------------------------------------------------------------------
1 | # Lazyarray - R package to store/read large array on hard drive
2 | 
3 | This package is NOT active anymore. Please consider [filearray](https://CRAN.R-project.org/package=filearray), a long-term project.
4 | 
5 | You might see `lazyarray` on CRAN, this is because I have some legacy projects that depend on this package. This package will be off CRAN anytime soon. (Sorry...)
6 | 


--------------------------------------------------------------------------------
/src/.gitignore:
--------------------------------------------------------------------------------
1 | *.o
2 | *.so
3 | *.dll
4 | 


--------------------------------------------------------------------------------
/src/Makevars:
--------------------------------------------------------------------------------
1 | CXX_STD      = CXX11
2 | PKG_CPPFLAGS =	-I../inst/include/
3 | PKG_CXXFLAGS = $(SHLIB_OPENMP_CXXFLAGS)
4 | PKG_LIBS     = $(SHLIB_OPENMP_CXXFLAGS)
5 | 
6 | 
7 | 
8 | 


--------------------------------------------------------------------------------
/src/classIndexSchedule.h:
--------------------------------------------------------------------------------
  1 | #ifndef DIP_LAZYARRAY_SCHEDULE_H
  2 | #define DIP_LAZYARRAY_SCHEDULE_H
  3 | 
  4 | #include "common.h"
  5 | 
  6 | class ScheduledIndex {
  7 |   
  8 | public:
  9 |   
 10 |   ScheduledIndex() {
 11 |     block_indexed = false;
 12 |   }
 13 |   
 14 |   ScheduledIndex(
 15 |     const bool _block_indexed,
 16 |     const std::vector<int64_t> _dimension,
 17 |     const std::vector<std::pair<std::vector<int64_t>, bool>> _block_location,
 18 |     const std::vector<int64_t>& _partition_index,
 19 |     const std::vector<int64_t> _schedule_index,
 20 |     const std::vector<int64_t> _schedule_dimension,
 21 |     const std::vector<int64_t> _block_dimension,
 22 |     const std::vector<int64_t> _block_schedule,
 23 |     const int64_t _block_schedule_start,
 24 |     const int64_t _block_schedule_end
 25 |   );
 26 |   
 27 |   ScheduledIndex(SEXP locations, const std::vector<int64_t>& dim, bool forceSchedule = false, int64_t hint = -1);
 28 |   
 29 |   ~ScheduledIndex(){
 30 | #ifdef LAZYARRAY_DEBUG
 31 |     print(wrap("A ScheduledIndex is destroyed"));
 32 | #endif
 33 |   }
 34 |   
 35 |   bool block_indexed;                   // whether block_schedule can be trusted
 36 |   std::vector<int64_t> dimension;
 37 |   // partition level
 38 |   int64_t partition_counts;               // the last dimension - n files to iterate through
 39 |   std::vector<int64_t> partition_index;    // detailed indexes   - always exists
 40 |   
 41 |   // schedule level
 42 |   int64_t schedule_counts_per_part; // for each partition, number of blocks to run
 43 |   std::vector<int64_t> schedule_index;      // indices to schedule run blocks
 44 |   std::vector<int64_t> schedule_dimension; // [schedule dim, partition counts]
 45 |   
 46 |   // block level
 47 |   int64_t block_ndims;                    // length(block dim)
 48 |   std::vector<int64_t> block_dimension; // [block dim], full version
 49 |   std::vector<int64_t> block_prod_dim; // prod([1, block dim]), used to locate indices when block is too large to index
 50 |   std::vector<int64_t> block_schedule; // given a flattened block (full version), which indices to subset?
 51 |   int64_t block_schedule_start;
 52 |   int64_t block_schedule_end;      // min, max of block_schedule
 53 |   
 54 |   int64_t block_length;                  // # elements in a block (full version) = prod(block_dimension)
 55 |   int64_t block_expected_length;// # elements in a block (subset version) = length(block_schedule)
 56 |   
 57 |   std::vector<std::pair<std::vector<int64_t>, bool>> block_location;           // subset of locational indices of blocks
 58 |   
 59 |   
 60 |   Rcpp::List asList();
 61 | };
 62 | 
 63 | class ParsedIndex {
 64 |   
 65 | public:
 66 |   ParsedIndex(
 67 |     const int& subset_mode, const std::vector<int64_t>& target_dimension,
 68 |     const std::vector<bool>& negative_subscript,
 69 |     const std::vector<std::pair<std::vector<int64_t>, bool>>& location_indices,
 70 |     ScheduledIndex* schedule
 71 |   ):
 72 |   subset_mode(subset_mode), 
 73 |   target_dimension(target_dimension),
 74 |   negative_subscript(negative_subscript), 
 75 |   location_indices(location_indices),
 76 |   schedule(schedule) {
 77 |     expected_length = std::accumulate(target_dimension.begin(), target_dimension.end(), INTEGER64_ONE, std::multiplies<int64_t>());
 78 |   }
 79 |   
 80 |   ~ParsedIndex(){
 81 |     if(this->schedule != nullptr){
 82 |       delete schedule;
 83 |       this->schedule = nullptr;
 84 |     }
 85 | #ifdef LAZYARRAY_DEBUG
 86 |     print(wrap("A ParsedIndex is destroyed"));
 87 | #endif
 88 |   }
 89 |   
 90 |   ParsedIndex(const SEXP listOrEnv, const std::vector<int64_t>& dim, bool pos_subscript);
 91 |   
 92 |   
 93 |   int subset_mode;
 94 |   std::vector<int64_t> target_dimension;
 95 |   std::vector<bool> negative_subscript;
 96 |   std::vector<std::pair<std::vector<int64_t>, bool>> location_indices;
 97 |   
 98 |   int64_t expected_length;
 99 |   ScheduledIndex *schedule;
100 |   
101 |   Rcpp::List asList();
102 |   
103 | };
104 | 
105 | 
106 | #endif // DIP_LAZYARRAY_SCHEDULE_H
107 | 


--------------------------------------------------------------------------------
/src/common.cpp:
--------------------------------------------------------------------------------
 1 | #include "common.h"
 2 | using namespace Rcpp;
 3 | 
 4 | R_xlen_t setLazyBlockSize(R_xlen_t size){
 5 |   if( size < 0 ){
 6 |     BLOCKSIZE = 16384;
 7 |   } else if( size > 0 ){
 8 |     BLOCKSIZE = size;
 9 |   }
10 |   
11 |   return BLOCKSIZE;
12 | }
13 | 
14 | R_xlen_t getLazyBlockSize(){
15 |   if(BLOCKSIZE < 1){
16 |     BLOCKSIZE = 16384;
17 |   }
18 |   return BLOCKSIZE;
19 | }
20 | 
21 | 


--------------------------------------------------------------------------------
/src/common.h:
--------------------------------------------------------------------------------
 1 | #ifndef DIP_LAZYARRAY_COMMON_H
 2 | #define DIP_LAZYARRAY_COMMON_H
 3 | 
 4 | // Common header that's required by all (most) files
 5 | 
 6 | //include <vector>
 7 | //include <iostream>
 8 | //include <iterator>
 9 | //include <complex>
10 | //include <cmath>
11 | //include <cstring>
12 | // #include <string>
13 | 
14 | #include <Rcpp.h>
15 | //include <Rcpp/Benchmark/Timer.h>
16 | 
17 | using namespace Rcpp;
18 | 
19 | /*
20 |  * Number of bytes fst uses to compress as a unit
21 |  * We use it differently, basically 4x or 8x or 16x this number as our block size
22 |  * to avoid repeating too many blocks
23 |  */ 
24 | #ifndef NA_INTEGER64
25 | //undef NA_INTEGER64
26 | #define NA_INTEGER64 LLONG_MIN
27 | #endif // NA_INTEGER64
28 | 
29 | // Lazyarray subset_mode - No index
30 | #ifndef LASUBMOD_NOIDX
31 | #define LASUBMOD_NOIDX 2
32 | #endif
33 | 
34 | #ifndef LASUBMOD_SINGLE
35 | #define LASUBMOD_SINGLE 1
36 | #endif
37 | 
38 | #ifndef LASUBMOD_MULTI
39 | #define LASUBMOD_MULTI 0
40 | #endif
41 | 
42 | #ifdef LAZYARRAY_DEBUG
43 | #undef LAZYARRAY_DEBUG
44 | #endif
45 | 
46 | /*
47 |  * For array with dimension [287 x 200 x 601 x 84]
48 |  * BLOCKSIZE decides the size of block to read into from fst file
49 |  * because fst internally stores data in blocks, it's recommended to read blocks with size > 16384
50 |  * 
51 |  * by default, this array will be split into 3 parts
52 |  * [287 x 200 x 601 x 84] => [287 x 200] x [601 x 1] x [84]
53 |  * 
54 |  * 84 is # of partition /files
55 |  * for each file, read in sub chunk of length 287 x 200 (> BLOCKSIZE)
56 |  * total number of chunks to read is 601 per file
57 |  * 
58 |  * loc2idx3 calculates indices within each sub-chunks so that it's easy to find then once data is loaded
59 |  * 
60 |  * However, of sub-block is too large, for example [1e30 x 5] matrix, sub-block size is 1e30, loc2idx3 generates too many
61 |  * indices but the indices come with cost of memory (this means super large index set). We wish to calculate 
62 |  * indices on the fly. The boundary is set by BLOCKLARGE.
63 |  * 
64 |  * If # of indices > BLOCKLARGE, then don't pre-generate indices
65 |  * 
66 |  */
67 | 
68 | // Used to partition to sub-blocks
69 | static R_xlen_t BLOCKSIZE = 16384;
70 | // If sub-block size is too large, don't calculate indices (memory inefficient)
71 | // ~ 250 MB index set
72 | static R_xlen_t BLOCKLARGE = 31250000;
73 | 
74 | const static int64_t INTEGER64_ONE = 1;
75 | const static R_xlen_t INTEGER_XLEN_ONE = 1;
76 | 
77 | // [[Rcpp::interfaces(r, cpp)]]
78 | // [[Rcpp::export]]
79 | R_xlen_t setLazyBlockSize(R_xlen_t size);
80 | 
81 | // [[Rcpp::export]]
82 | R_xlen_t getLazyBlockSize();
83 | 
84 | 
85 | #endif // DIP_LAZYARRAY_COMMON_H
86 | 


--------------------------------------------------------------------------------
/src/fstWrapper.cpp:
--------------------------------------------------------------------------------
 1 | #include "fstWrapper.h"
 2 | 
 3 | // [[Rcpp::plugins("cpp11")]]
 4 | 
 5 | #include <fstcore.h>
 6 | #include "common.h"
 7 | #include "utils.h"
 8 | 
 9 | using namespace Rcpp;
10 | 
11 | 
12 | SEXP fstMeta(String fileName){
13 |   return fstcore::fstmetadata(fileName);
14 | }
15 | 
16 | /*
17 |  * Slightly different than fscore::fstretrieve. It supports 
18 |  * 
19 |  */
20 | SEXP fstRetrieve(String fileName, SEXP colSel, SEXP start, SEXP end){
21 | #ifdef LAZYARRAY_DEBUG
22 |   print(wrap("Reading from fst..."));
23 | #endif
24 |   return fstcore::fstretrieve(fileName, colSel, start, end);
25 | }
26 | 
27 | SEXP fstStore(String fileName, SEXP table, SEXP compression, SEXP uniformEncoding){
28 |   return fstcore::fststore(fileName, table, compression, uniformEncoding);
29 | }
30 | 
31 | 
32 | bool checkFstMeta(const String file, const int64_t expect_nrows, StringVector cnames){
33 |   List meta = fstcore::fstmetadata( file );
34 |   if( Rf_inherits(meta, "fst_error") || 
35 |       !meta.containsElementNamed("nrOfRows") ||
36 |       !meta.containsElementNamed("colNames")){
37 |       return(false);
38 |   }
39 |   int64_t nrows = as<int64_t>(meta["nrOfRows"]);
40 |   if( expect_nrows >= 0 && nrows != expect_nrows ){
41 |     stop("Unable to read from file: partition size not matches with expected size.");
42 |   }
43 |   StringVector colNames = as<StringVector>(meta["colNames"]);
44 |   
45 |   if(colNames.size() < cnames.size()){
46 |     return(false);
47 |   }
48 |   
49 |   R_xlen_t touched = 0;
50 |   for(R_xlen_t ii = 0; ii < cnames.size(); ii++ ){
51 |     const String cn = cnames[ii];
52 |     for(StringVector::iterator ptr = colNames.begin(); ptr != colNames.end(); ptr++){
53 |       if((*ptr) == cn){
54 |         touched++;
55 |         continue;
56 |       }
57 |     }
58 |   }
59 |   
60 |   if(touched >= cnames.size()){
61 |     return(true);
62 |   }
63 |   
64 |   return(false);
65 | }
66 | 
67 | 


--------------------------------------------------------------------------------
/src/fstWrapper.h:
--------------------------------------------------------------------------------
 1 | // [[Rcpp::plugins("cpp11")]]
 2 | 
 3 | #ifndef LAZYARRAY_FST_H
 4 | #define LAZYARRAY_FST_H
 5 | 
 6 | #include "lazycommon.h"
 7 | 
 8 | // [[Rcpp::export]]
 9 | SEXP fstMeta(Rcpp::String fileName);
10 | 
11 | // [[Rcpp::export]]
12 | SEXP fstRetrieve(Rcpp::String fileName, SEXP colSel, SEXP start, SEXP end);
13 | 
14 | // [[Rcpp::export]]
15 | SEXP fstStore(Rcpp::String fileName, SEXP table, SEXP compression, SEXP uniformEncoding);
16 | 
17 | /*
18 |  * Validate fst header information
19 |  * @param file fst file path 
20 |  * @param expect_nrows expected number of rows
21 |  * @param cnames column names that must contain
22 |  */
23 | // [[Rcpp::plugins("cpp11")]]
24 | // [[Rcpp::export]]
25 | bool checkFstMeta(const Rcpp::String file, const int64_t expect_nrows, Rcpp::StringVector cnames);
26 | 
27 | #endif // LAZYARRAY_FST_H
28 | 


--------------------------------------------------------------------------------
/src/indexConvert.h:
--------------------------------------------------------------------------------
 1 | // [[Rcpp::plugins("cpp11")]]
 2 | 
 3 | #ifndef LAZYARRAY_INDEX_H
 4 | #define LAZYARRAY_INDEX_H
 5 | 
 6 | #include "Rcpp.h"
 7 | #include "classIndexSchedule.h"
 8 | 
 9 | // [[Rcpp::interfaces(r, cpp)]]
10 | 
11 | // Though efficient, but no longer used as it errors when indexing >= 2^31 length
12 | Rcpp::IntegerVector loc2idx(Rcpp::List& locations, Rcpp::IntegerVector& parent_dim);
13 | 
14 | // Same behavior as loc2idx3, but uses NumericVector, which converts int64_t back to double
15 | // no need to do so
16 | Rcpp::NumericVector loc2idx2(Rcpp::List& locations, Rcpp::NumericVector& parent_dim);
17 | 
18 | // [[Rcpp::export]]
19 | std::vector<int64_t> loc2idx3(SEXP locations, std::vector<int64_t>& parent_dim);
20 | 
21 | 
22 | // subsetIdx and subsetIdx2 should not be used directly as parseSlices combines them all
23 | SEXP subsetIdx(Rcpp::Environment expr_env, Rcpp::NumericVector dim, bool pos_subscript = false);
24 | 
25 | // [[Rcpp::export]]
26 | Rcpp::List extractSlices(SEXP listOrEnv, const R_xlen_t& ndims);
27 | 
28 | // parseSlices = subsetIdx or subsetIdx2
29 | // WARNING: Always use pos_subscript if you want to use subset or subsetAssign functions in lazyarray
30 | // pos_subscript=false subset is not implemented
31 | // [[Rcpp::export]]
32 | Rcpp::List parseSlices(SEXP listOrEnv, const std::vector<int64_t>& dim, bool pos_subscript = true);
33 | 
34 | // parseAndScheduleBlocks = parseSlices + scheduleIndexing
35 | // [[Rcpp::export]]
36 | Rcpp::List parseAndScheduleBlocks2(SEXP sliceIdx, Rcpp::NumericVector dim, bool forceSchedule = false);
37 | 
38 | ParsedIndex* parseAndScheduleBlocks(SEXP listOrEnv, const std::vector<int64_t>& dim, bool forceSchedule = false, int64_t hint = -1);
39 | 
40 | // [[Rcpp::export]]
41 | SEXP reshapeOrDrop(SEXP x, SEXP reshape = R_NilValue, bool drop = false);
42 |   
43 | #endif // LAZYARRAY_INDEX_H
44 | 


--------------------------------------------------------------------------------
/src/lazyarray-ext.cpp:
--------------------------------------------------------------------------------
  1 | // [[Rcpp::plugins("cpp11")]]
  2 | 
  3 | #include "utils.h"
  4 | #include "reshape.h"
  5 | #include "loader1.h"
  6 | #include "loader2.h"
  7 | #include "fstWrapper.h"
  8 | 
  9 | #include "common.h"
 10 | #include "indexConvert.h"
 11 | 
 12 | using namespace Rcpp; 
 13 | 
 14 | /**
 15 |  * Write array to fst files
 16 |  * @param x array or vector
 17 |  * @param dim array dimension
 18 |  * @param fileName,compression,uniformEncoding passed to fstStore
 19 |  */
 20 | // [[Rcpp::export]]
 21 | SEXP cpp_create_lazyarray(SEXP& x, IntegerVector& dim, SEXP fileName,
 22 |                           SEXP compression, SEXP uniformEncoding){
 23 |   
 24 |   // Obtain the array dimension, dimension length must >= 1
 25 |   R_xlen_t ndim = dim.size();
 26 | 
 27 |   if( ndim < 1 ){
 28 |     // shouldn't enter here
 29 |     Rcpp::stop("cpp_create_lazyarray needs at least one dim. It can be the length.");
 30 |   }
 31 | 
 32 |   /*
 33 |    * Reshape dimension to two dimension: [first_dim, last_dim]
 34 |    * Case 1: dim has length 1, e.g. dim = c(10), last_dim is always 1, and first_dim is 10
 35 |    * Case 2: dim = c(10,10,2), last_dim is always the last dim, i.e. 2, first_dim is prod of the rest, i.e. 100
 36 |    */
 37 |   int64_t last_dim = 1;
 38 |   int64_t total_length = std::accumulate(dim.begin(), dim.end(), INTEGER64_ONE, std::multiplies<int64_t>());
 39 |   if( ndim >= 2 ){
 40 |     // case 2
 41 |     last_dim = *(dim.end() - 1);
 42 |   }
 43 |   int64_t first_dim = total_length / last_dim;
 44 |   
 45 |   if( total_length == 0 ){
 46 |     // no need to write to disk?
 47 |     // FIXME
 48 |   }
 49 |   
 50 |   // Need to reshape x to a data.frame/list i.e. dim(x) = c(first_dim, last_dim)
 51 |   Rcpp::List table = arr2df(x, first_dim, last_dim);
 52 |   // return table;
 53 |   // Rcpp::print(table);
 54 |   fstStore(fileName, table, compression, uniformEncoding);
 55 |   return R_NilValue;
 56 | }
 57 | 
 58 | 
 59 | /**
 60 |  * @param fileName,colSel,start,end Parameter required to locate each partition for each file
 61 |  * @param custom_func Function to apply to each partition
 62 |  * @param reshape Dimension to reshape partition data to
 63 |  */
 64 | // [[Rcpp::export]]
 65 | SEXP lazyMapReduceByPartition(
 66 |     Rcpp::String fileName, CharacterVector colSel, SEXP start, SEXP end = R_NilValue, 
 67 |     Rcpp::Nullable<Rcpp::Function> custom_func = R_NilValue,
 68 |     Rcpp::Nullable<IntegerVector> reshape = R_NilValue){
 69 |   
 70 |   SEXP tmp;
 71 |   tmp = fstRetrieve(fileName, wrap(colSel), start, end);
 72 |   
 73 |   tmp = getListElement(tmp, "resTable");
 74 |   
 75 |   SEXP data;
 76 |   R_xlen_t s;
 77 |   
 78 |   // if colSel.size() == 1, non-complex data
 79 |   bool is_complex = false;
 80 |   if( colSel.size() == 1 ){
 81 |     data = getListElement(tmp, colSel[0]);
 82 |     s = Rf_xlength(data);
 83 |   } else {
 84 |     is_complex = true;
 85 |     NumericVector re = as<NumericVector>(getListElement(tmp, colSel[0]));
 86 |     NumericVector im = as<NumericVector>(getListElement(tmp, colSel[1]));
 87 |     s = Rf_xlength(re);
 88 |     
 89 |     data = PROTECT(Rf_allocVector(CPLXSXP, s));
 90 |     
 91 |     Rcomplex *ptr_data = COMPLEX(data);
 92 |     NumericVector::iterator ptr_re = re.begin();
 93 |     NumericVector::iterator ptr_im = im.begin();
 94 |     
 95 |     for(R_xlen_t ii = 0; ii < s; ii++ ){
 96 |       (*ptr_data).r = *ptr_re;
 97 |       (*ptr_data).i = *ptr_im;
 98 |       ptr_data++;
 99 |       ptr_re++;
100 |       ptr_im++;
101 |     }
102 |     
103 |   }
104 |   
105 |   SEXP re;
106 |   
107 |   if(reshape != R_NilValue){
108 |     Rf_setAttrib(data, wrap("dim"), wrap(reshape));
109 |   }
110 |   
111 |   if( custom_func.isNotNull() ){
112 |     re = Rcpp::as<Rcpp::Function>(custom_func)( data );
113 |   } else {
114 |     re = R_NilValue;
115 |   }
116 |   
117 |   if( re != R_NilValue ){
118 |     // re.attr("chunk_length") = s;
119 |     Rf_setAttrib(re, wrap("chunk_length"), wrap(s));
120 |   }
121 |   
122 |   if( is_complex ){
123 |     UNPROTECT(1);
124 |   }
125 |   
126 |   return re;
127 | }
128 | 
129 | 
130 | 
131 | 
132 | /*** R
133 | # devtools::load_all()
134 | require(lazyarray)
135 | x <- array(seq_len(prod(c(3000,7,20,8))), c(3000,7,20,8))
136 | x <- as.lazyarray(x)
137 | 
138 | # unlink(x$get_partition_fpath(3))
139 | # fst::write_fst(data.frame(V2 = 1:8), x$get_partition_fpath(3))
140 | 
141 | b <- function(i,...){
142 |   subsetIdx(environment(), dim(x), TRUE)
143 | }
144 | invisible(b(1:5,-c(1:2,NA),c(1,NA),))
145 | 
146 | files <- x$get_partition_fpath()
147 | a <- function(i,...){
148 |   subsetFST(files, environment(), dim(x), 0.1)
149 | }
150 | # e = a(c(3,1,7,NA,2,1, 27, 16, 15,14,NA, 27))
151 | e = a(1:5,c(1:2,NA),,)
152 | range(e - x[][1:5,c(1:2,NA),,], na.rm = TRUE)
153 | 
154 | a(c(1:20, NA))
155 | 
156 | # array(1:27, c(3,3,3))[c(3,1,7,NA,2,1)]
157 | 
158 | 
159 | # f <- tempfile()
160 | # fst::write_fst(data.frame(V1R = 1:10, V1I = 10:1), path = f)
161 | # tmp <- fst:::fstretrieve(normalizePath(f), c('V1R', 'V1I'), 1L, NULL)
162 | # tmp
163 | # 
164 | # path <- normalizePath(f)
165 | # lazyMapReduceByPartition(path, c('V1R', 'V1I'), 1L, NULL, sum)
166 | # lazyMapReduceByPartition(path, c('V1R'), 1L, NULL, sum)
167 | # 
168 | # x <- lazyarray(tempfile(), 'complex', c(2,3,4))
169 | # x[] <- 1:24 + (24:1)*1i
170 | # 
171 | # partition_map(x, function(slice, part){
172 | #   slice
173 | # })
174 | 
175 | 
176 | # path = '~/Desktop/lazyarray_data'
177 | # dimension <- c(287, 200, 601, 84)
178 | # x <- lazyarray(path, storage_format = "double", dim = dimension)
179 | # part_loc <- list(1:287L, 1:200L, 1:601L, 1L)
180 | # a <- bench::mark({
181 | #   cpp_load_lazyarray(x$get_partition_fpath(1), part_loc, c(287L, 200L, 601L, 1L), 4, 0.1)
182 | # }, iterations = 1)
183 | # a$memory
184 | # 
185 | # path <- tempfile()
186 | # a = data.frame(V1=c(1:10 + rnorm(10), rep(NA,2)))
187 | # fst::write_fst(a, path)
188 | # path <- normalizePath(path)
189 | # lazyMapReduceByPartition(path, 'V1', 1L, NULL, sum)
190 | # lazyMapReduceByPartition(path, 'V1', 1L, NULL, function(x){mean(x, na.rm = TRUE)})
191 | # lazyMapReduceByPartition(path, 'V1', 1L, NULL, length)
192 | # lazyMapReduceByPartition(path, 'V1', 1L, NULL, function(x){NULL})
193 | # lazyMapReduceByPartition(path, 'V1', 1L, NULL, function(x){dim(x)}, c(3L,4L))
194 | 
195 | # a = 1:3; b = 4:6+0.5
196 | # pryr::address(a)
197 | # c =join_vectors(a, b)
198 | # a
199 | # pryr::address(a)
200 | # 
201 | # f <- normalizePath(tempfile(), mustWork = FALSE)
202 | # unlink(f)
203 | # dim <- c(10,20,50);
204 | # x <- rnorm(10000); dim(x) <- dim
205 | # x[sample(10000, 2000)] = NA
206 | # 
207 | # lazyarray:::cpp_create_lazyarray(x, dim, f, 100L, TRUE);
208 | # 
209 | # expect_true(file.exists(f), label = "cpp_create_lazyarray can write to file")
210 | # 
211 | # 
212 | # # Make sure we have invalid indices
213 | # idx_loc <- list(
214 | #   as.integer(sample(12) - 1),
215 | #   as.integer(sample(22)-1),
216 | #   as.integer(sample(52)-1)
217 | # )
218 | # target_dim = sapply(idx_loc, length)
219 | # cpp_load_lazyarray(f, idx_loc, dim, length(dim), 0.1)
220 | */
221 | 


--------------------------------------------------------------------------------
/src/lazycommon.h:
--------------------------------------------------------------------------------
1 | #ifndef LAZYARRAY_H_
2 | #define LAZYARRAY_H_
3 | 
4 | #include <Rcpp.h>
5 | 
6 | #endif // LAZYARRAY_H_
7 | 


--------------------------------------------------------------------------------
/src/loader1.h:
--------------------------------------------------------------------------------
 1 | // [[Rcpp::plugins("cpp11")]]
 2 | 
 3 | #ifndef LAZYARRAY_LOADER1_H
 4 | #define LAZYARRAY_LOADER1_H
 5 | 
 6 | #include <Rcpp.h>
 7 | 
 8 | // [[Rcpp::export]]
 9 | SEXP lazyLoadOld(Rcpp::StringVector& files, Rcpp::List& partition_locations, 
10 |                         Rcpp::IntegerVector& partition_dim, R_xlen_t ndim, SEXP value_type);
11 | 
12 | 
13 | #endif // LAZYARRAY_LOADER1_H
14 | 


--------------------------------------------------------------------------------
/src/loader2.h:
--------------------------------------------------------------------------------
 1 | // [[Rcpp::plugins("cpp11")]]
 2 | 
 3 | #ifndef LAZYARRAY_LOADER2_H
 4 | #define LAZYARRAY_LOADER2_H
 5 | 
 6 | #include "Rcpp.h"
 7 | #include "classIndexSchedule.h"
 8 | 
 9 | // [[Rcpp::interfaces(r,cpp)]]
10 | 
11 | SEXP subsetFSTBare(const std::string& rootPath, const ParsedIndex* parsed, const ScheduledIndex& schedule,
12 |                    const std::vector<int64_t>& dim, const SEXPTYPE& dtype);
13 | 
14 | // [[Rcpp::export]]
15 | SEXP subsetFST(const std::string& rootPath, SEXP listOrEnv, const std::vector<int64_t>& dim, 
16 |                 SEXPTYPE dtype, SEXP reshape = R_NilValue, bool drop = false);
17 | 
18 | 
19 | // [[Rcpp::export]]
20 | SEXP scheduleFST(SEXP listOrEnv, const std::vector<int64_t>& dim, bool forceSchedule = false, int64_t hint = -1);
21 | 
22 | // [[Rcpp::export]]
23 | SEXP executeScheduleFST(const std::string& rootPath, SEXPTYPE dtype, SEXP reshape, bool drop, int64_t partition);
24 | 
25 | // [[Rcpp::export]]
26 | SEXP scheduleExistsFST();
27 | 
28 | // [[Rcpp::export]]
29 | SEXP freeScheduleFST();
30 | 
31 | #endif // LAZYARRAY_LOADER2_H
32 | 


--------------------------------------------------------------------------------
/src/old/loader2ext.h:
--------------------------------------------------------------------------------
 1 | // [[Rcpp::plugins("cpp11")]]
 2 | 
 3 | #ifndef LAZYARRAY_LOADER2_MULTIPART_DOUBLE_H
 4 | #define LAZYARRAY_LOADER2_MULTIPART_DOUBLE_H
 5 | 
 6 | #include "Rcpp.h"
 7 | 
 8 | // SEXP subsetFST_double(const std::string& rootPath, const Rcpp::NumericVector& dim, const Rcpp::List& subparsed);
 9 | // 
10 | // SEXP subsetFST_integer(const std::string& rootPath, const Rcpp::NumericVector& dim, const Rcpp::List& subparsed);
11 | // 
12 | // SEXP subsetFST_character(const std::string& rootPath, const Rcpp::NumericVector& dim, const Rcpp::List& subparsed);
13 | // 
14 | // SEXP subsetFST_complex(const std::string& rootPath, const Rcpp::NumericVector& dim, const Rcpp::List& subparsed);
15 | 
16 | template <SEXPTYPE RTYPE>
17 | SEXP subsetFSTtemplate(const std::string& rootPath, const std::vector<int64_t>& dim, const Rcpp::List& subparsed);
18 | 
19 | #endif // LAZYARRAY_LOADER2_MULTIPART_DOUBLE_H
20 | 


--------------------------------------------------------------------------------
/src/openMPInterface.cpp:
--------------------------------------------------------------------------------
 1 | #include "openMPInterface.h"
 2 | 
 3 | #include <algorithm>
 4 | 
 5 | #ifdef _OPENMP
 6 | #include <pthread.h>
 7 | #endif
 8 | 
 9 | 
10 | int getLazyThread(bool max){
11 | #ifdef _OPENMP
12 |   if(detectFork){
13 |     return 1;
14 |   }
15 |   if( max ){
16 |     return omp_get_max_threads();
17 |   }
18 |   int t = lazyThreads < 0 ? omp_get_max_threads() : std::min(lazyThreads, omp_get_max_threads());
19 |   return std::max(t, 1);
20 | #else
21 |   return 1;
22 | #endif
23 | }
24 | 
25 | int setLazyThread(int n, SEXP reset_after_fork){
26 | #ifdef _OPENMP
27 |   if(!detectFork){
28 |     lazyThreads = n;
29 |   }
30 |   if( reset_after_fork != R_NilValue ){
31 |     if( Rf_asLogical(reset_after_fork) ){
32 |       reset_forked = true;
33 |     } else {
34 |       reset_forked = false;
35 |     }
36 |   }
37 |   
38 |   return n; 
39 | #else
40 |   return 1;
41 | #endif
42 | }
43 | 
44 | 
45 | bool hasOpenMP(){
46 |   return LAZYARRAY_HAS_OPENMP;
47 | }
48 | 
49 | void onForked(){ 
50 |   detectFork = true;
51 | }
52 | void onLeaveFork(){
53 |   if(!reset_forked){
54 |     lazyThreads = 1;
55 |   }
56 |   detectFork = false;
57 | }
58 | 
59 | int detectForked(DllInfo *dll){
60 |   // To disable openmp if fork is detected
61 | #ifdef _OPENMP
62 |   return pthread_atfork(&onForked, &onLeaveFork, NULL);
63 | #endif
64 |   
65 |   return 0;
66 | }
67 | 


--------------------------------------------------------------------------------
/src/openMPInterface.h:
--------------------------------------------------------------------------------
 1 | #ifndef LAZYARRAY_OPENMP_H
 2 | #define LAZYARRAY_OPENMP_H
 3 | 
 4 | // [[Rcpp::interfaces(r, cpp)]]
 5 | // [[Rcpp::plugins(openmp)]]
 6 | 
 7 | #ifdef _OPENMP
 8 | #include <omp.h>
 9 | #define LAZYARRAY_HAS_OPENMP true
10 | #else
11 | #define omp_get_thread_num() 0
12 | #define omp_get_max_threads() 1
13 | #define LAZYARRAY_HAS_OPENMP false
14 | #endif
15 | 
16 | #include <Rcpp.h>
17 | 
18 | static int lazyThreads = 0;
19 | 
20 | // stores n threads when fork occurs 
21 | static bool detectFork = false;
22 | static int reset_forked = true;
23 | 
24 | 
25 | // [[Rcpp::export]]
26 | int getLazyThread(bool max = false);
27 | 
28 | // [[Rcpp::export]]
29 | int setLazyThread(int n, SEXP reset_after_fork = R_NilValue);
30 | 
31 | // [[Rcpp::export]]
32 | bool hasOpenMP();
33 | 
34 | // [[Rcpp::init]]
35 | int detectForked(DllInfo *dll);
36 | 
37 | #endif  // OPEN_MP_HELPER_H
38 | 


--------------------------------------------------------------------------------
/src/playground.cpp:
--------------------------------------------------------------------------------
 1 | #include <Rcpp.h>
 2 | #include "utils.h"
 3 | using namespace Rcpp;
 4 | 
 5 | 
 6 | 
 7 | /*** R
 8 | a = 100000
 9 | b = 1024
10 | bench::mark(
11 |   mod2(a, b),
12 |   mod3(a, b),
13 |   mod4(a, b)
14 | )
15 | 
16 | */
17 | 


--------------------------------------------------------------------------------
/src/reshape.cpp:
--------------------------------------------------------------------------------
  1 | #include "reshape.h"
  2 | 
  3 | #include "common.h"
  4 | #include "indexConvert.h"
  5 | 
  6 | using namespace Rcpp; 
  7 | 
  8 | template <class T, typename I>
  9 | List cpp_array_to_list_template(T x, int64_t nrows, int64_t ncols){
 10 |   // length of x has been checked so assume length(x) = nrows * ncols
 11 |   List re = List::create();
 12 |   String colname;
 13 |   I ptr1_x = x.begin();
 14 |   I ptr2_x = x.begin();
 15 |   
 16 |   
 17 |   for(int64_t ii = 0; ii < ncols; ii++ ){
 18 |     ptr2_x += nrows;
 19 |     colname = "V" + std::to_string(ii + 1);
 20 |     T slice = T(ptr1_x, ptr2_x);
 21 |     re.push_back(slice, colname);
 22 |     ptr1_x = ptr2_x;
 23 |   }
 24 |   
 25 |   
 26 |   return re;
 27 | }
 28 | 
 29 | 
 30 | List cpp_array_to_list_complex(ComplexVector x, int64_t nrows, int64_t ncols){
 31 |   
 32 |   
 33 |   List re = List::create();
 34 |   String colname;
 35 |   ComplexVector::iterator ptr1_x = x.begin();
 36 |   ComplexVector::iterator ptr2_x = x.begin();
 37 |   
 38 |   for(int64_t ii = 0; ii < ncols; ii++ ){
 39 |     ptr2_x += nrows;
 40 |     const ComplexVector slice = ComplexVector(ptr1_x, ptr2_x);
 41 |     
 42 |     colname = "V" + std::to_string(ii + 1) + "R";
 43 |     re.push_back(Armor<NumericVector>(Rcpp::Re(slice)), colname);
 44 |     
 45 |     colname = "V" + std::to_string(ii + 1) + "I";
 46 |     re.push_back(Armor<NumericVector>(Rcpp::Im(slice)), colname);
 47 |     
 48 |     ptr1_x = ptr2_x;
 49 |   }
 50 |   
 51 |   return re;
 52 | }
 53 | 
 54 | 
 55 | /**
 56 |  * Conver vector x to data.frame with dimension c(first_dim, last_dim)
 57 |  * 
 58 |  * 2020-09-02: 
 59 |  * 1. renamed from cpp_array_to_list to arr2df
 60 |  * 2. changed argument, explicitly add nrows and ncols to be memory efficient
 61 |  * 3. Added length check
 62 |  */
 63 | Rcpp::List arr2df(SEXP x, int64_t nrows, int64_t ncols){
 64 |   // User explicitly tells which storage type of x should be
 65 |   // 9	CHARSXP	internal character strings
 66 |   // 10	LGLSXP	logical vectors
 67 |   // 13	INTSXP	integer vectors
 68 |   // 14	REALSXP	numeric vectors
 69 |   // 15	CPLXSXP	complex vectors
 70 |   // 16	STRSXP	character vectors
 71 |   // 24	RAWSXP	raw vector
 72 |   // 
 73 |   
 74 |   
 75 |   // check length of x
 76 |   if( nrows * ncols - Rf_xlength(x) != 0 ){
 77 |     stop("Cannot reshape array to data.frame, dimension not match");
 78 |   }
 79 |   
 80 |   Rcpp::List re; 
 81 |   switch (TYPEOF(x)) {
 82 |   case STRSXP:
 83 |   case CHARSXP:
 84 |     re = cpp_array_to_list_template<CharacterVector, CharacterVector::iterator>(x, nrows, ncols);
 85 |     break;
 86 |   case LGLSXP:
 87 |     re = cpp_array_to_list_template<LogicalVector, LogicalVector::iterator>(x, nrows, ncols);
 88 |     break;
 89 |   case INTSXP:
 90 |     re = cpp_array_to_list_template<IntegerVector, IntegerVector::iterator>(x, nrows, ncols);
 91 |     break;
 92 |   case REALSXP:
 93 |     re = cpp_array_to_list_template<NumericVector, NumericVector::iterator>(x, nrows, ncols);
 94 |     break;
 95 |   case CPLXSXP:
 96 |     re = cpp_array_to_list_complex(x, nrows, ncols);
 97 |     break;
 98 |   default:
 99 |     Rcpp::stop("Unsupported data type, only logical, numeric, complex, character types are supported.");
100 |   }
101 |   return re;
102 | }
103 | 
104 | 


--------------------------------------------------------------------------------
/src/reshape.h:
--------------------------------------------------------------------------------
 1 | // [[Rcpp::plugins("cpp11")]]
 2 | 
 3 | #ifndef LAZYARRAY_RESHAPE_H
 4 | #define LAZYARRAY_RESHAPE_H
 5 | 
 6 | #include "Rcpp.h"
 7 | 
 8 | Rcpp::List arr2df(SEXP x, int64_t nrows, int64_t ncols);
 9 | 
10 | #endif // LAZYARRAY_RESHAPE_H
11 | 


--------------------------------------------------------------------------------
/src/saver2.h:
--------------------------------------------------------------------------------
 1 | // [[Rcpp::plugins("cpp11")]]
 2 | 
 3 | #ifndef LAZYARRAY_SAVER2_H
 4 | #define LAZYARRAY_SAVER2_H
 5 | 
 6 | #include "Rcpp.h"
 7 | #include "utils.h"
 8 | 
 9 | // [[Rcpp::interfaces(r, cpp)]]
10 | 
11 | template <SEXPTYPE RTYPE>
12 | SEXP writeFstPartition(const Rcpp::Vector<RTYPE>& values, const std::string& file, 
13 |                        const std::vector<int64_t>& dim, const Rcpp::List& subparsed,
14 |                        int compression, bool uniformEncoding);
15 | 
16 | // [[Rcpp::export]]
17 | SEXP subsetAssignFST(const SEXP values, const std::string& file, SEXP listOrEnv,
18 |                      const std::vector<int64_t>& dim, const SEXPTYPE& dtype,
19 |                      int compression = 50, bool uniformEncoding = true);
20 | 
21 | #endif  // LAZYARRAY_SAVER2_H
22 | 


--------------------------------------------------------------------------------
/src/saver2ext.cpp:
--------------------------------------------------------------------------------
 1 | #include "saver2ext.h"
 2 | 
 3 | #include "common.h"
 4 | #include "utils.h"
 5 | #include "fstWrapper.h"
 6 | #include "indexConvert.h"
 7 | using namespace Rcpp;
 8 | 
 9 | // returns: -1 all partitions affected, otherwise the indices of partitions
10 | 
11 | 
12 | /*** R
13 | x <- lazyarray:::as.lazyarray(array(rep(1,27), rep(3,3)))
14 | unlink(x$get_partition_fpath(3))
15 | res <- lazyarray:::parseAndScheduleBlocks(list(lazyarray:::get_missing_value(),1,1:3), dim(x))
16 | writeFstPartition_double(1.1, x$get_partition_fpath(), dim(x), res, 50L, TRUE)
17 | x[]
18 | x[,1,]
19 | */
20 | 


--------------------------------------------------------------------------------
/src/saver2ext.h:
--------------------------------------------------------------------------------
 1 | // [[Rcpp::plugins("cpp11")]]
 2 | 
 3 | #ifndef LAZYARRAY_SAVER2EXT_H
 4 | #define LAZYARRAY_SAVER2EXT_H
 5 | 
 6 | #include "Rcpp.h"
 7 | 
 8 | 
 9 | 
10 | #endif  // LAZYARRAY_SAVER2EXT_H
11 | 


--------------------------------------------------------------------------------
/src/utils.h:
--------------------------------------------------------------------------------
 1 | 
 2 | #ifndef DIP_LAZYARRAY_UTILS_H 
 3 | #define DIP_LAZYARRAY_UTILS_H
 4 | 
 5 | #include "Rcpp.h"
 6 | 
 7 | // [[Rcpp::interfaces(r, cpp)]]
 8 | 
 9 | template <typename T, typename I>
10 | bool contains(T vec, SEXP el);
11 | 
12 | SEXP getListElement(SEXP list, const char *str); 
13 | SEXP getListElement2(SEXP list, const char *str, const SEXP ifNull); 
14 | 
15 | // [[Rcpp::export]]
16 | SEXP dropDimension(SEXP x);
17 | 
18 | // [[Rcpp::export]]
19 | int64_t prod2(SEXP x, bool na_rm = false);
20 | 
21 | // [[Rcpp::export]]
22 | SEXP parseDots(Rcpp::Environment& env, bool eval);
23 | 
24 | // [[Rcpp::export]]
25 | bool stopIfNot(const bool isValid, const std::string& message, bool stopIfError = true);
26 | 
27 | // [[Rcpp::export]]
28 | SEXPTYPE getSexpType(SEXP x);
29 | 
30 | // [[Rcpp::export]]
31 | SEXP tik();
32 | 
33 | // [[Rcpp::export]]
34 | SEXP tok(std::string msg, bool stop = false);
35 | 
36 | std::string as_dirpath(std::string x);
37 | 
38 | SEXP captureException( const std::exception& e );
39 | SEXP makeException( std::string msg );
40 | 
41 | // [[Rcpp::export]]
42 | SEXP subsetAssignVector(SEXP x, int64_t start, SEXP value);
43 | 
44 | template <typename T>
45 | inline std::vector<T> seq_len3(int64_t n){
46 |   std::vector<T> re = std::vector<T>(n);
47 |   T v = 1;
48 |   for(auto it = re.begin(); it != re.end(); it++){
49 |     *it = v++;
50 |   }
51 |   return re;
52 | }
53 | 
54 | void setReIm(Rcpp::ComplexVector x, Rcpp::NumericVector v, bool is_real);
55 | 
56 | inline void r_gc(){
57 |   Rcpp::Environment env = Rcpp::Environment::base_env();
58 |   Rcpp::Function gc = env["gc"];
59 |   gc();
60 | }
61 | 
62 | #endif // DIP_LAZYARRAY_UTILS_H
63 | 


--------------------------------------------------------------------------------
/tests/testthat.R:
--------------------------------------------------------------------------------
1 | library(testthat)
2 | library(lazyarray)
3 | 
4 | test_check("lazyarray")
5 | 


--------------------------------------------------------------------------------
/tests/testthat/test-cpp_io2.R:
--------------------------------------------------------------------------------
  1 | # devtools::load_all()
  2 | 
  3 | require(testthat)
  4 | 
  5 | context("Loader2 no sub-blocks")
  6 | 
  7 | dim <- c(10,20,50) 
  8 | 
  9 | f <- normalizePath(tempfile(), mustWork = FALSE)
 10 | on.exit({
 11 |   setLazyBlockSize(-1)
 12 |   unlink(f)
 13 | })
 14 | 
 15 | x = array(1:prod(dim), dim)
 16 | a = as.lazyarray(x, path = f, type = 'fstarray')
 17 | loader_f <- function(..., samp, reshape, drop){
 18 |   lazyarray:::subsetFST(a$storage_path, environment(), dim, getSexpType(samp), reshape, drop)
 19 | }
 20 | 
 21 | lazy_test_unit <- function(samp_data, x_alt){
 22 |   loader_double = function(..., reshape = NULL, drop = FALSE){ loader_f(samp = samp_data, ..., reshape = reshape, drop = drop) }
 23 |   if(missing(x_alt)){
 24 |     x <- x; storage.mode(x) <- storage.mode(samp_data)
 25 |   } else {
 26 |     x <- x_alt; storage.mode(x) <- storage.mode(samp_data)
 27 |   }
 28 |   
 29 |   # 1. a()
 30 |   re <- loader_double()
 31 |   expect_equal(storage.mode(re), storage.mode(samp_data))
 32 |   expect_equivalent(re, x)
 33 |   expect_equivalent(dim(re), dim(x))
 34 |   
 35 |   # 2. a(i)
 36 |   idx <- sample(length(x), size = 200, replace = TRUE)
 37 |   idx[sample(200, 20)] = NA
 38 |   re <- loader_double(idx)
 39 |   cp <- x[idx]
 40 |   expect_equivalent(re, cp)
 41 |   
 42 |   # 2. a(-i)
 43 |   idx <- -sample(length(x), size = 200, replace = TRUE)
 44 |   idx[sample(200, 20)] = NA
 45 |   re <- loader_double(idx)
 46 |   cp <- x[idx[!is.na(idx)]]
 47 |   expect_equivalent(re, cp)
 48 |   
 49 |   # 2.1 large indices
 50 |   re <- loader_double(10000000)
 51 |   cp <- x[10000000]
 52 |   expect_equivalent(re, cp)
 53 |   expect_true(is.na(re))
 54 |   re <- loader_double(-10000000)
 55 |   cp <- x[-10000000]
 56 |   expect_equal(re, cp)
 57 |   expect_equal(dim(re), NULL)
 58 |   
 59 |   
 60 |   
 61 |   # 3. a(-i:i)  
 62 |   expect_error(loader_double(-1:1))
 63 |   
 64 |   # 4. a(i,j,k)
 65 |   idx <- lapply(dim, sample, replace = TRUE, size = 6)
 66 |   re <- do.call(loader_double, idx)
 67 |   cp <- eval(as.call(c(list(quote(`[`), quote(x)), idx)))
 68 |   expect_equivalent(re, cp)
 69 |   expect_equivalent(dim(re), dim(cp))
 70 |   
 71 |   # 5. negative subscripts
 72 |   ii <- sample(2)[[1]] + 1
 73 |   idx[[ii]] <- -idx[[ii]]
 74 |   re <- do.call(loader_double, idx)
 75 |   cp <- eval(as.call(c(list(quote(`[`), quote(x)), idx)))
 76 |   expect_equivalent(re, cp)
 77 |   expect_equivalent(dim(re), dim(cp))
 78 |   ii <- 1
 79 |   idx[[ii]] <- -idx[[ii]]
 80 |   re <- do.call(loader_double, idx)
 81 |   cp <- eval(as.call(c(list(quote(`[`), quote(x)), idx)))
 82 |   expect_equivalent(re, cp)
 83 |   expect_equivalent(dim(re), dim(cp))
 84 |   
 85 |   # 6. With missing
 86 |   re <- loader_double(,c(NA,1:0),c(2,NA,1))
 87 |   cp <- x[,c(NA,1:0),c(2,NA,1),drop = FALSE]
 88 |   expect_equivalent(re, cp)
 89 |   expect_equivalent(dim(re), dim(cp))
 90 |   
 91 |   re <- loader_double(c(NA,1:0),c(2,NA,1),)
 92 |   cp <- x[c(NA,1:0),c(2,NA,1),,drop = FALSE]
 93 |   expect_equivalent(re, cp)
 94 |   expect_equivalent(dim(re), dim(cp))
 95 |   
 96 |   re <- loader_double(c(NA,1:0),,c(2,NA,1))
 97 |   cp <- x[c(NA,1:0),,c(2,NA,1),drop = FALSE]
 98 |   expect_equivalent(re, cp)
 99 |   expect_equivalent(dim(re), dim(cp))
100 |   
101 |   re <- loader_double(,,)
102 |   cp <- x[,,,drop = FALSE]
103 |   expect_equivalent(re, cp)
104 |   expect_equivalent(dim(re), dim(cp))
105 |   
106 |   # 7. drop
107 |   re <- loader_double(,c(NA,1:0),c(2,NA,1), drop = TRUE)
108 |   cp <- x[,c(NA,1:0),c(2,NA,1),drop = TRUE]
109 |   expect_equivalent(re, cp)
110 |   expect_equivalent(dim(re), dim(cp))
111 |   re <- loader_double(,c(NA,1:0),c(3), drop = TRUE)
112 |   cp <- x[,c(NA,1:0),c(3),drop = TRUE]
113 |   expect_equivalent(re, cp)
114 |   expect_equivalent(dim(re), dim(cp))
115 |   
116 |   # 8. reshape
117 |   re <- loader_double(,c(NA,1:0),c(2,NA,1), reshape = c(20, 3))
118 |   cp <- x[,c(NA,1:0),c(2,NA,1),drop = TRUE]
119 |   expect_equivalent(as.vector(re), as.vector(cp))
120 |   expect_equivalent(dim(re), c(20, 3))
121 |   re <- loader_double(,c(NA,1:0),c(2,NA,1), reshape = c(60))
122 |   expect_equivalent(dim(re), NULL)
123 |   
124 |   # 9. Negative with missing
125 |   re <- loader_double(,-c(NA,1:0),c(2,NA,1))
126 |   cp <- x[,-c(1:0),c(2,NA,1),drop = FALSE]
127 |   expect_equivalent(re, cp)
128 |   expect_equivalent(dim(re), dim(cp))
129 |   
130 |   re <- loader_double(-c(NA,1:0),c(2,NA,1),)
131 |   cp <- x[-c(1:0),c(2,NA,1),,drop = FALSE]
132 |   expect_equivalent(re, cp)
133 |   expect_equivalent(dim(re), dim(cp))
134 |   
135 |   re <- loader_double(c(NA,1:0),,-c(2,NA,1))
136 |   cp <- x[c(NA,1:0),,-c(2,1),drop = FALSE]
137 |   expect_equivalent(re, cp)
138 |   expect_equivalent(dim(re), dim(cp))
139 |   
140 |   re <- loader_double(,,-1000)
141 |   cp <- x[,,-1000,drop = FALSE]
142 |   expect_equivalent(re, cp)
143 |   expect_equivalent(dim(re), dim(cp))
144 |   
145 |   re <- loader_double(,-1000,1)
146 |   cp <- x[,-1000,1,drop = FALSE]
147 |   expect_equivalent(re, cp)
148 |   expect_equivalent(dim(re), dim(cp))
149 |   
150 |   re <- loader_double(-1000,,)
151 |   cp <- x[-1000,,,drop = FALSE]
152 |   expect_equivalent(re, cp)
153 |   expect_equivalent(dim(re), dim(cp))
154 |   
155 |   
156 |   # Wrong usages
157 |   expect_error(loader_double(100,,))
158 |   expect_error(loader_double(1,1,1,1))
159 |   expect_error(loader_double(1,1))
160 |   expect_error(loader_double(,1,1,))
161 |   expect_error(loader_double(,))
162 |   expect_error(loader_double(,1000,))
163 |   expect_error(loader_double(,,1000))
164 |   expect_error(loader_double(1000,,1000))
165 |   expect_error(loader_double(1000,1000,))
166 |   expect_error(loader_double(,1000,1000))
167 |   
168 | }
169 | 
170 | test_that("Loader2 no sub-blocks", {
171 |   lazy_test_unit(0.1)
172 |   lazy_test_unit(1L)
173 |   # lazy_test_unit("")
174 | })
175 | 
176 | 
177 | context("Loader2 sub-blocks")
178 | 
179 | test_that("Loader2 sub-blocks", {
180 |   setLazyBlockSize(1)
181 |   lazy_test_unit(0.1)
182 |   lazy_test_unit(1L)
183 |   # lazy_test_unit("")
184 |   
185 |   setLazyBlockSize(11)
186 |   lazy_test_unit(0.1)
187 |   lazy_test_unit(1L)
188 |   # lazy_test_unit("")
189 |   
190 |   setLazyBlockSize(201)
191 |   lazy_test_unit(0.1)
192 |   lazy_test_unit(1L)
193 |   # lazy_test_unit("")
194 |   
195 |   setLazyBlockSize(-1)
196 | })
197 | 
198 | context("Loader2 with NAs")
199 | 
200 | test_that("Loader2 with NAs", {
201 |   unlink(a$get_partition_fpath(2))
202 |   setLazyBlockSize(1)
203 |   x[,,2] <- NA
204 |   expect_equal(x[], a[])
205 |   lazy_test_unit(0.1, x)
206 |   lazy_test_unit(1L, x)
207 |   # lazy_test_unit("", x)
208 |   
209 |   setLazyBlockSize(-1)
210 |   x[,,2] <- NA
211 |   expect_equal(x[], a[])
212 |   lazy_test_unit(0.1, x)
213 |   lazy_test_unit(1L, x)
214 |   # lazy_test_unit("", x)
215 | })
216 | 
217 | 
218 | context("Loader2 with complex data")
219 | 
220 | x = array(rnorm(prod(dim)), dim) + 1i * array(rnorm(prod(dim)), dim)
221 | unlink(f, recursive = TRUE)
222 | a = as.lazyarray(x, path = f, type = 'fstarray')
223 | 
224 | test_that("Loader2 complex", {
225 |   setLazyBlockSize(-1)
226 |   lazy_test_unit(x[[1]])
227 |   
228 |   setLazyBlockSize(1)
229 |   lazy_test_unit(x[[1]])
230 |   
231 |   setLazyBlockSize(11)
232 |   lazy_test_unit(x[[1]])
233 |   
234 |   setLazyBlockSize(201)
235 |   lazy_test_unit(x[[1]])
236 |   
237 |   setLazyBlockSize(1)
238 |   unlink(a$get_partition_fpath(2))
239 |   x[,,2] <- NA
240 |   expect_equal(x[], a[])
241 |   lazy_test_unit(x[[1]], x)
242 |   
243 |   setLazyBlockSize(-1)
244 |   unlink(a$get_partition_fpath(2))
245 |   x[,,2] <- NA
246 |   expect_equal(x[], a[])
247 |   lazy_test_unit(x[[1]], x)
248 | })
249 | 
250 | context("Loader2 with matrix")
251 | test_that("Loader2 with matrix", {
252 |   x <- matrix(1:16,4)
253 |   a <- as.lazyarray(x, type = 'fstarray')
254 |   loader_f <- function(...){
255 |     lazyarray:::subsetFST(a$storage_path, environment(), dim(x), getSexpType(1L), NULL, FALSE)
256 |   }
257 |   
258 |   expect_equivalent(loader_f(), x)
259 |   expect_equivalent(loader_f(,1), x[,1,drop=TRUE])
260 |   expect_equivalent(loader_f(1,), x[1,,drop=TRUE])
261 |   
262 | })
263 | 


--------------------------------------------------------------------------------
/tests/testthat/test-cpp_loc2idx.R:
--------------------------------------------------------------------------------
  1 | test_that("Test loc2idx3 - validate", {
  2 |   # Case 1: small dset with NAs
  3 |   x = array(1:8,c(2,2,2))
  4 |   dim = dim(x)
  5 |   locs = list(0:1L,1L, 3:1)
  6 |   target_dim = sapply(locs, length)
  7 |   tmp = loc2idx3(locs, dim);# tmp
  8 |   tmp[tmp < -9e18] <- NA
  9 |   # validate in R
 10 |   mfactor <- c(1, cumprod(dim))[seq_along(dim)]
 11 |   scaled_loc <- lapply(seq_along(locs), function(ii){
 12 |     x <- as.integer(locs[[ii]])
 13 |     d <- dim[[ii]]
 14 |     x[x < 1 | x > d] <- NA 
 15 |     (x - 1) * mfactor[[ii]]
 16 |   })
 17 |   t1 <- Reduce(function(a, b){ outer(a, b, '+') }, scaled_loc) + 1
 18 |   expect_equal(sum(is.na(t1)) - sum(is.na(tmp)), 0)
 19 |   expect_equal(sum(xor(is.na(t1), is.na(tmp))), 0)
 20 |   expect_equal(range(t1 - tmp, na.rm = TRUE), c(0,0))
 21 |   
 22 |   # Case 2: small dset with 0 length
 23 |   x = array(1:8,c(2,2,2))
 24 |   dim = dim(x)
 25 |   locs = list(0:1L,integer(0), 3:1)
 26 |   target_dim = sapply(locs, length)
 27 |   tmp = loc2idx3(locs, dim);# tmp
 28 |   tmp[tmp < -9e18] <- NA
 29 |   # validate in R
 30 |   mfactor <- c(1, cumprod(dim))[seq_along(dim)]
 31 |   scaled_loc <- lapply(seq_along(locs), function(ii){
 32 |     x <- as.integer(locs[[ii]])
 33 |     d <- dim[[ii]]
 34 |     x[x < 1 | x > d] <- NA 
 35 |     (x - 1) * mfactor[[ii]]
 36 |   })
 37 |   t1 <- Reduce(function(a, b){ outer(a, b, '+') }, scaled_loc) + 1
 38 |   expect_equal(sum(is.na(t1)) - sum(is.na(tmp)), 0)
 39 |   expect_equal(sum(xor(is.na(t1), is.na(tmp))), 0)
 40 |   expect_length(t1, 0)
 41 |   expect_length(tmp, 0)
 42 |   
 43 |   # case 3: random set
 44 |   x = (1:5000000)
 45 |   # arr2df(x, c(0L, 25L, 50L), 9L)
 46 |   dim = c(50000L, 10L, 10L)
 47 |   locs = list(
 48 |     as.integer(sample(100000L, 300) - 2000),
 49 |     as.integer(sample(12)-1),
 50 |     as.integer(sample(12)-1)
 51 |   )
 52 |   target_dim = sapply(locs, length)
 53 |   tmp = loc2idx3(locs, dim);# tmp
 54 |   tmp[tmp < -9e18] <- NA
 55 |   dim(tmp) = target_dim
 56 |   
 57 |   # validate in R
 58 |   mfactor <- c(1, cumprod(dim))[seq_along(dim)]
 59 |   scaled_loc <- lapply(seq_along(locs), function(ii){
 60 |     x <- as.integer(locs[[ii]])
 61 |     d <- dim[[ii]]
 62 |     x[x < 1 | x > d] <- NA 
 63 |     (x - 1) * mfactor[[ii]]
 64 |   })
 65 |   t1 <- Reduce(function(a, b){ outer(a, b, '+') }, scaled_loc) + 1
 66 |   expect_equal(sum(is.na(t1)) - sum(is.na(tmp)), 0)
 67 |   expect_equal(sum(xor(is.na(t1), is.na(tmp))), 0)
 68 |   expect_equal(range(t1 - tmp, na.rm = TRUE), c(0,0))
 69 |   
 70 |   dim(x) = c(50000L, 10L, 10L)
 71 |   a = locs[[1]]; a[(a<1) | (a >50000)] = NA
 72 |   b = locs[[2]]; b[b<1 | b > 10] = NA
 73 |   c = locs[[3]]; c[c<1 | c > 10] = NA
 74 |   y2 <- x[a,b,c]
 75 |   expect_equal(range(y2 - tmp, na.rm = TRUE), c(0,0))
 76 |   
 77 |   y3 = tmp
 78 |   expect_equal(sum(abs(is.na(y2) - is.na(y3))), 0)
 79 |   expect_length(which(abs(is.na(y2) - is.na(y3)) > 0, arr.ind = TRUE), 0)
 80 |   expect_equal(range(y2 - y3, na.rm = TRUE), c(0,0))
 81 |   
 82 | })
 83 | 
 84 | test_that("Test loc2idx3 - validate", {
 85 |   # Case 1: small dset with NAs
 86 |   x = array(1:8,c(2,2,2))
 87 |   dim = dim(x)
 88 |   locs = list(0:1L,1L, 3:1)
 89 |   target_dim = sapply(locs, length)
 90 |   tmp = loc2idx3(locs, dim);# tmp
 91 |   tmp[tmp < -9e18] <- NA
 92 |   # validate in R
 93 |   mfactor <- c(1, cumprod(dim))[seq_along(dim)]
 94 |   scaled_loc <- lapply(seq_along(locs), function(ii){
 95 |     x <- as.integer(locs[[ii]])
 96 |     d <- dim[[ii]]
 97 |     x[x < 1 | x > d] <- NA 
 98 |     (x - 1) * mfactor[[ii]]
 99 |   })
100 |   t1 <- Reduce(function(a, b){ outer(a, b, '+') }, scaled_loc) + 1
101 |   expect_equal(sum(is.na(t1)) - sum(is.na(tmp)), 0)
102 |   expect_equal(sum(xor(is.na(t1), is.na(tmp))), 0)
103 |   expect_equal(range(t1 - tmp, na.rm = TRUE), c(0,0))
104 |   
105 |   # Case 2: small dset with 0 length
106 |   x = array(1:8,c(2,2,2))
107 |   dim = dim(x)
108 |   locs = list(0:1L,integer(0), 3:1)
109 |   target_dim = sapply(locs, length)
110 |   tmp = loc2idx3(locs, dim);# tmp
111 |   tmp[tmp < -9e18] <- NA
112 |   # validate in R
113 |   mfactor <- c(1, cumprod(dim))[seq_along(dim)]
114 |   scaled_loc <- lapply(seq_along(locs), function(ii){
115 |     x <- as.integer(locs[[ii]])
116 |     d <- dim[[ii]]
117 |     x[x < 1 | x > d] <- NA 
118 |     (x - 1) * mfactor[[ii]]
119 |   })
120 |   t1 <- Reduce(function(a, b){ outer(a, b, '+') }, scaled_loc) + 1
121 |   expect_equal(sum(is.na(t1)) - sum(is.na(tmp)), 0)
122 |   expect_equal(sum(xor(is.na(t1), is.na(tmp))), 0)
123 |   expect_length(t1, 0)
124 |   expect_length(tmp, 0)
125 |   
126 |   # case 3: random set
127 |   x = (1:5000000)
128 |   # arr2df(x, c(0L, 25L, 50L), 9L)
129 |   dim = c(50000L, 10L, 10L)
130 |   locs = list(
131 |     as.integer(sample(100000L, 300) - 2000),
132 |     as.integer(sample(12)-1),
133 |     as.integer(sample(12)-1)
134 |   )
135 |   target_dim = sapply(locs, length)
136 |   tmp = loc2idx3(locs, dim);# tmp
137 |   tmp[tmp < -9e18] <- NA
138 |   dim(tmp) = target_dim
139 |   
140 |   # validate in R
141 |   mfactor <- c(1, cumprod(dim))[seq_along(dim)]
142 |   scaled_loc <- lapply(seq_along(locs), function(ii){
143 |     x <- as.integer(locs[[ii]])
144 |     d <- dim[[ii]]
145 |     x[x < 1 | x > d] <- NA 
146 |     (x - 1) * mfactor[[ii]]
147 |   })
148 |   t1 <- Reduce(function(a, b){ outer(a, b, '+') }, scaled_loc) + 1
149 |   expect_equal(sum(is.na(t1)) - sum(is.na(tmp)), 0)
150 |   expect_equal(sum(xor(is.na(t1), is.na(tmp))), 0)
151 |   expect_equal(range(t1 - tmp, na.rm = TRUE), c(0,0))
152 |   
153 |   dim(x) = c(50000L, 10L, 10L)
154 |   a = locs[[1]]; a[(a<1) | (a >50000)] = NA
155 |   b = locs[[2]]; b[b<1 | b > 10] = NA
156 |   c = locs[[3]]; c[c<1 | c > 10] = NA
157 |   y2 <- x[a,b,c]
158 |   expect_equal(range(y2 - tmp, na.rm = TRUE), c(0,0))
159 |   
160 |   y3 = tmp
161 |   expect_equal(sum(abs(is.na(y2) - is.na(y3))), 0)
162 |   expect_length(which(abs(is.na(y2) - is.na(y3)) > 0, arr.ind = TRUE), 0)
163 |   expect_equal(range(y2 - y3, na.rm = TRUE), c(0,0))
164 |   
165 | })
166 | 


--------------------------------------------------------------------------------
/tests/testthat/test-cpp_parseAndScheduleBlocks.R:
--------------------------------------------------------------------------------
  1 | require(testthat)
  2 | 
  3 | # devtools::load_all()
  4 | dim <- c(5,7,8,10)
  5 | slice <- list(c(1,1,2),1,2:3,4:5)
  6 | 
  7 | check_schedule <- function(slice, dim){
  8 |   slice_copy <- lapply(slice, I); force(slice_copy)
  9 |   res <- parseAndScheduleBlocks2(slice_copy, dim)
 10 |   re <- res$schedule
 11 |   block_lb <- getLazyBlockSize()
 12 |   block_ub <- 31250000
 13 |   ndims <- length(dim)
 14 |   
 15 |   valid_slices <- lapply(seq_along(slice), function(ii){
 16 |     x <- slice[[ii]]
 17 |     if(isTRUE(x == get_missing_value())){
 18 |       x <- seq_len(dim[[ii]])
 19 |     } else {
 20 |       x <- x[is.na(x) | x > 0]
 21 |     }
 22 |     x
 23 |   })
 24 |   
 25 |   get_missing_value()
 26 |   
 27 |   # redo this procedure in R
 28 |   cum_dim <- cumprod(dim)
 29 |   block_ndims <- which(cum_dim > block_lb)
 30 |   if(!length(block_ndims)){ block_ndims <- ndims }
 31 |   block_ndims <- block_ndims[[1]]
 32 |   if( block_ndims == ndims ){ block_ndims = block_ndims - 1 }
 33 |   
 34 |   # [3000,7,3] if buffer_margin=1 => [3000] x [7,1] x [3]
 35 |   # [3000,7,3] if buffer_margin=2 => [3000,7] x [1,1] x [3]
 36 |   # [100,100,100,100,1] buffer_margin=2 => [100,100] x [100,100,1] x [1]
 37 |   block_dimension <- dim[seq_len(block_ndims)]
 38 |   schedule_dimension <- dim[-seq_len(block_ndims)]
 39 |   partition_index <- slice[[ndims]]
 40 |   partition_index <- partition_index[is.na(partition_index) | partition_index > 0]
 41 |   partition_counts <- length(partition_index)
 42 |   
 43 |   # make schedules
 44 |   block_expected_length <- prod(sapply(valid_slices[seq_len(block_ndims)], length))
 45 |   schedule_counts_per_part <- prod(sapply(valid_slices, length)) / partition_counts / block_expected_length
 46 |   block_indexed <- FALSE
 47 |   if(block_ndims >= 2 && block_expected_length <= block_ub){
 48 |     block_schedule <- loc2idx3(valid_slices[seq_len(block_ndims)], dim[seq_len(block_ndims)])
 49 |     block_indexed <- TRUE
 50 |     
 51 |     
 52 |     expect_equal(re$block_schedule, block_schedule)
 53 |     block_schedule[block_schedule < 0] <- NA
 54 |     expect_equal(re$block_schedule_start, min(block_schedule, na.rm = TRUE))
 55 |     expect_equal(re$block_schedule_end, max(block_schedule, na.rm = TRUE))
 56 |     expect_equal(re$block_expected_length, block_expected_length)
 57 |   } else {
 58 |     # do we need accurate block_expected_length? as it's not actually used if not block indexed
 59 |     # block_expected_length <- prod(dim[seq_len(block_ndims)])
 60 |     block_schedule_start <- 1
 61 |     block_schedule_end <- prod(dim[seq_len(block_ndims)])
 62 |     expect_equal(re$block_schedule_start, block_schedule_start)
 63 |     expect_equal(re$block_schedule_end, block_schedule_end)
 64 |     expect_equal(re$block_expected_length, block_expected_length)
 65 |   }
 66 |   
 67 |   block_prod_dim <- c(1, cumprod(block_dimension))[seq_along(block_dimension)]
 68 |   block_length <- prod(block_dimension)
 69 |   
 70 |   # schedule blocks
 71 |   schedule_dimension_alt <- schedule_dimension
 72 |   schedule_dimension_alt[[length(schedule_dimension_alt)]] <- 1
 73 |   tmp <- valid_slices[-seq_len(block_ndims)]
 74 |   tmp[[length(tmp)]] <- 1
 75 |   schedule_index <- loc2idx3(tmp, schedule_dimension_alt)
 76 |   
 77 |   # checks
 78 |   expect_equal(re$dimension, dim)
 79 |   expect_equal(re$block_ndims, block_ndims)
 80 |   expect_equal(re$block_dimension, block_dimension)
 81 |   expect_equal(re$schedule_dimension, schedule_dimension)
 82 |   # re$partition_index[re$partition_index < -9e18] <- NA_real_
 83 |   expect_equal(as.numeric(re$partition_index), partition_index)
 84 |   expect_equal(re$partition_counts, partition_counts)
 85 |   expect_equal(re$schedule_counts_per_part, schedule_counts_per_part)
 86 |   expect_equal(re$block_prod_dim, block_prod_dim)
 87 |   expect_equal(re$block_indexed, block_indexed)
 88 |   expect_equal(re$block_length, block_length)
 89 |   expect_equal(re$schedule_index, schedule_index)
 90 | }
 91 | 
 92 | 
 93 | context("subset scheduler-normal case")
 94 | 
 95 | test_that("subset scheduler-normal case", {
 96 |   dim <- c(5,7,8,10)
 97 |   slice <- list(c(1,1,2),1,2:3,4:5)
 98 |   
 99 |   setLazyBlockSize(1)
100 |   check_schedule(slice, dim)
101 |   setLazyBlockSize(30)
102 |   check_schedule(slice, dim)
103 |   setLazyBlockSize(300)
104 |   check_schedule(slice, dim)
105 |   setLazyBlockSize(10000)
106 |   check_schedule(slice, dim)
107 |   
108 | })
109 | 
110 | context("subset scheduler-NA & 0 cases")
111 | test_that("subset scheduler-NA & 0 cases", {
112 |   dim <- c(5,7,8,10)
113 |   
114 |   slice <- lapply(dim, function(x){
115 |     x <- sample(x, x, replace = TRUE)
116 |     x[sample(length(x), 2)] <- NA
117 |     x[sample(length(x), 1)] <- 0
118 |     x
119 |   })
120 |   
121 |   setLazyBlockSize(1)
122 |   check_schedule(slice, dim)
123 |   setLazyBlockSize(30)
124 |   check_schedule(slice, dim)
125 |   setLazyBlockSize(300)
126 |   check_schedule(slice, dim)
127 |   setLazyBlockSize(10000)
128 |   check_schedule(slice, dim)
129 | })
130 | 
131 | 
132 | 


--------------------------------------------------------------------------------
/tests/testthat/test-generics.R:
--------------------------------------------------------------------------------
 1 | test_that("summary statistics", {
 2 |   x <-
 3 |     lazyarray::lazyarray(tempfile(),
 4 |                          dim = c(100, 100),
 5 |                          storage_format = 'double')
 6 |   x[, 1] <- 1:100
 7 |   x[, 2] <- NA
 8 |   x[, 3] <- c(0, rep(NA, 99))
 9 |   
10 |   
11 |   testthat::expect_true(is.na(min(x)))
12 |   testthat::expect_true(is.na(max(x)))
13 |   
14 |   testthat::expect_equivalent(min(x, na.rm=TRUE), 0)
15 |   testthat::expect_equivalent(max(x, na.rm=TRUE), 100)
16 |   testthat::expect_equivalent(range(x, na.rm=TRUE), c(0, 100))
17 |   testthat::expect_equivalent(range(x), c(NA, NA))
18 |   
19 |   testthat::expect_equivalent(mean(x, na.rm = TRUE), 50)
20 |   testthat::expect_equivalent(sum(x, na.rm = TRUE), 5050)
21 |   
22 |   # partition_apply(x, mean)
23 |   
24 | })
25 | 


--------------------------------------------------------------------------------
/tests/testthat/test-lazylm.R:
--------------------------------------------------------------------------------
 1 | if(interactive()){
 2 |   require(testthat)
 3 |   require(lazyarray)
 4 |   require(stats)
 5 | }
 6 | 
 7 | test_that("lazylm", {
 8 |   
 9 |   # use a baseenv to evaluate
10 |   env <- new.env(parent = asNamespace('lazyarray'))
11 |   with(env, {
12 |     lazylm( V1 ~ .-V2-1 + (V2 > 0), data = local({
13 |       arr <- array(rnorm(72), c(6,3,4))
14 |       arr[1,1,1] <- NA 
15 |       as.lazyarray(arr, type = 'file')
16 |     }), weights = runif(18), offset = rnorm(18) , na.action = 'na.exclude')
17 |   })
18 |   
19 |   
20 |   arr <- array(rnorm(72), c(6,3,4))
21 |   arr[1,1,1] <- NA        # Allow NA to be treated
22 |   offset = rnorm(18)    # offsets and weights are optional
23 |   weights = runif(18)
24 |   na_action <- 'na.omit'
25 | 
26 |   formula <- V1 ~ .-V2-1 + (V2 > 0)
27 | 
28 |   data <- as.lazyarray(arr, type = 'file')
29 |   
30 |   dim(arr) <- c(18, 4)
31 |   lm_data <- as.data.frame(arr)
32 |   
33 |   e <- eigen(crossprod(arr[complete.cases(arr), ]))
34 |   # if(!min(e$values > 1e-7)){
35 |   #   skip("Generated eigenvalue for lazylm is too small, just skip this time")
36 |   # }
37 |   
38 |   object <- lazylm(formula, data, weights = weights, offset = offset, na.action = na_action, fitted = TRUE)
39 | 
40 | 
41 |   # Compare to stats::lm
42 |   flm <- lm(formula, lm_data, weights = weights, offset = offset, na.action = na_action)
43 |   
44 |   assign('object', object, envir = globalenv())
45 |   assign('flm', flm, envir = globalenv())
46 | 
47 |   expect_lt(max(abs(coef(object) - coef(flm))), 1e-7)
48 |   expect_lt(max(abs(resid(object) - resid(flm))), 1e-7)
49 |   expect_lt(max(abs(fitted(object) - fitted(flm))), 1e-7)
50 |   
51 |   s1 <- summary(object)
52 |   s2 <- summary(flm)
53 |   
54 |   expect_lt(max(abs(s1$coefficients - s2$coefficients)), 1e-7)
55 |   
56 |   expect_lt(max(abs(s1$r.squared - s2$r.squared)), 1e-7)
57 |   expect_equivalent(s1$df, s2$df)
58 |   
59 | })
60 | 


--------------------------------------------------------------------------------
/tests/testthat/test-matmul.R:
--------------------------------------------------------------------------------
  1 | test_that("multiplication works", {
  2 |   on.exit({options('lazyarray.chunk_memory' = 80)
  3 |     lazyarray::set_lazy_threads(0)})
  4 |   
  5 |   lazyarray::set_lazy_threads(1)
  6 |   
  7 |   
  8 |   options('lazyarray.chunk_memory' = 0.00001)
  9 |   
 10 |   orig <- rnorm(1e4)
 11 |   
 12 |   orig1 <- orig; dim(orig1) <- c(10, 1000)
 13 |   
 14 |   weights <- rnorm(1000)
 15 |   
 16 |   # 0.001 
 17 |   system.time({
 18 |     xy0 <- orig1 %*% (t(orig1) * weights)
 19 |   })
 20 |   
 21 |   # x <- as.lazymatrix(orig1)
 22 |   # y <- as.lazymatrix(t(orig1))
 23 |   
 24 |   
 25 |   # # 0.130
 26 |   # system.time({
 27 |   #   xy <- lazy_matmul(x, y, weights =  weights, hybrid_limit = 0)
 28 |   # })
 29 |   # expect_equal(range(xy[] - xy0), c(0,0))
 30 |   # 
 31 |   # y <- t(as.lazymatrix(orig1))
 32 |   # 
 33 |   # # 0.188 
 34 |   # a <- system.time({
 35 |   #   xy <- lazy_matmul(x, y, weights =  weights, hybrid_limit = 0)
 36 |   # })
 37 |   # expect_equal(range(xy[] - xy0), c(0,0))
 38 |   # 
 39 |   # x <- t(as.lazymatrix(t(orig1)))
 40 |   # 
 41 |   # # 0.098 
 42 |   # system.time({
 43 |   #   xy <- lazy_matmul(x, y, weights =  weights, hybrid_limit = 0)
 44 |   # })
 45 |   # expect_equal(range(xy[] - xy0), c(0,0))
 46 |   
 47 |   y <- as.lazymatrix(t(orig1))
 48 |   
 49 |   # # 0.017 
 50 |   # system.time({
 51 |   #   xy <- lazy_matmul(x, y, weights =  weights, hybrid_limit = 0)
 52 |   # })
 53 |   # expect_equal(range(xy[] - xy0), c(0,0))
 54 |   
 55 |   x <- as.lazymatrix(t(orig1))
 56 |   
 57 |   # 0.002
 58 |   system.time({
 59 |     xy <- crossprod(x, weights = weights)
 60 |   })
 61 |   expect_equal(range(xy[] - xy0), c(0,0))
 62 |   
 63 |   # x <- t(as.lazymatrix(orig1))
 64 |   # 
 65 |   # # 0.162 
 66 |   # system.time({
 67 |   #   xy <- lazy_crossprod(x, weights = weights)
 68 |   # })
 69 |   # expect_equal(range(xy[] - xy0), c(0,0))
 70 |   # 
 71 |   # # bench::mark({
 72 |   # #   lazy_matmul(x, y)
 73 |   # # }, {
 74 |   # #   x[] %*% y[]
 75 |   # # }, check = F, iterations = 1)
 76 |   # # 
 77 | })
 78 | 
 79 | 
 80 | 
 81 | # test_that("multiplication works", {
 82 | #   skip_on_cran()
 83 | #   skip_on_travis()
 84 | #   skip_on_appveyor()
 85 | #   skip_on_bioc()
 86 | #   skip_on_ci()
 87 | #   skip_on_covr()
 88 | #   skip_if_not(dir.exists('~/Desktop/lazyarray_data/'))
 89 | # 
 90 | #   self <- lazyarray('~/Desktop/lazyarray_data/')
 91 | #   self <- as.lazymatrix(self)
 92 | # 
 93 | #   future::plan('multisession')
 94 | #   
 95 | #   # pryr::mem_used()
 96 | #   system.time({
 97 | #     a <- lazy_crossprod(self)
 98 | #   })
 99 | # 
100 | # })
101 | 


--------------------------------------------------------------------------------
/tests/testthat/test-subset.R:
--------------------------------------------------------------------------------
  1 | test_that("Subset lazyarray & lazymatrix", {
  2 |   a <- array(rnorm(80), c(2,4,2,5))
  3 |   x <- as.lazyarray(a, type = 'fstarray')
  4 |   
  5 |   expect_equal(x[], a)
  6 |   
  7 |   idx <- sample(length(x), length(x) * 10, replace = TRUE)
  8 |   expect_equal(x[idx], a[idx])
  9 |   
 10 |   # matrix
 11 |   x <- as.lazymatrix(a, type = 'fstarray')
 12 |   b <- x[]
 13 |   
 14 |   expect_equivalent(as.integer(dim(b)), c(16,5))
 15 |   
 16 |   expect_equal(x[], b)
 17 |   
 18 |   idx <- sample(length(x), length(x) * 10, replace = TRUE)
 19 |   expect_equal(x[idx], b[idx])
 20 |   
 21 |   expect_equivalent(x[,1], b[,1])
 22 |   expect_equivalent(x[,1,drop=FALSE], b[,1,drop=FALSE])
 23 |   
 24 |   # x <- t(x)  
 25 |   # b <- t(b)
 26 |   # 
 27 |   # expect_equal(dim(x), c(5,16))
 28 |   # 
 29 |   # expect_equal(x[], b)
 30 |   # 
 31 |   # idx <- sample(length(x), length(x) * 10, replace = TRUE)
 32 |   # expect_equal(x[idx], b[idx])
 33 |   # 
 34 |   # expect_equal(x[,1], b[,1])
 35 |   # expect_equal(x[,1,drop=FALSE], b[,1,drop=FALSE])
 36 |   
 37 | })
 38 | 
 39 | test_that("Subset filearray", {
 40 |   a <- array(rnorm(80), c(2,4,2,5))
 41 |   f <- tempfile()
 42 |   x <- FileArray$new(f, dim = dim(a), storage_format = 'double', read_only = FALSE)
 43 |   x[] <- a
 44 |   
 45 |   expect_equal(x[], a)
 46 |   
 47 |   i <- sample(4,replace = TRUE)
 48 |   i[sample(4,2)] <- NA
 49 |   expect_equal(x[,i,,i], a[,i,,i])
 50 |   
 51 |   expect_equal(x[,,,-i], a[,,,-i[!is.na(i)]])
 52 |   
 53 |   j <- sample(2,replace = TRUE)
 54 |   i <- sample(5,replace = TRUE)
 55 |   # TODO: R array drop=TRUE with 0 length doesn't actually drop the array
 56 |   
 57 |   i <- 1:5
 58 |   expect_equal(x[,j,,-i], a[,j,,-i[!is.na(i)]])
 59 |   # expect_equal(x[j,j,j,-i], a[j,j,j,-i[!is.na(i)]])
 60 |   
 61 |   expect_equal(x[,j,,-i,drop=FALSE], a[,j,,-i[!is.na(i)],drop=FALSE])
 62 |   expect_equal(x[j,j,j,-i,drop=FALSE], a[j,j,j,-i[!is.na(i)],drop=FALSE])
 63 |   
 64 |   
 65 |   x[,,,1] <- a[,,,2]
 66 |   a[,,,1] <- a[,,,2]
 67 |   
 68 |   expect_equal(x[], a)
 69 |   
 70 |   i <- sample(5,replace = TRUE)
 71 |   i[sample(5,2)] <- NA
 72 |   expect_equal(x[,,,i], a[,,,i])
 73 |   
 74 |   expect_equal(x[,,,-i], a[,,,-i[!is.na(i)]])
 75 |   
 76 |   j <- sample(2,replace = TRUE)
 77 |   i <- sample(5,replace = TRUE)
 78 |   expect_equal(x[,j,,-i], a[,j,,-i[!is.na(i)]])
 79 |   expect_equal(x[j,j,j,-i], a[j,j,j,-i[!is.na(i)]])
 80 |   
 81 |   expect_error(x[,,,6])
 82 |   
 83 |   expect_equal(x[,,,-6], a)
 84 |   
 85 | })
 86 | 
 87 | test_that("Subset fstarray", {
 88 |   a <- array(rnorm(80), c(2,4,2,5))
 89 |   f <- tempfile()
 90 |   x <- FstArray$new(f, dim = dim(a), storage_format = 'double', read_only = FALSE)
 91 |   x[] <- a
 92 |   
 93 |   expect_equal(x[], a)
 94 |   
 95 |   i <- sample(5,replace = TRUE)
 96 |   i[sample(5,2)] <- NA
 97 |   expect_equal(x[,,,i], a[,,,i])
 98 |   
 99 |   expect_equal(x[,,,-i], a[,,,-i[!is.na(i)]])
100 |   
101 |   j <- sample(2,replace = TRUE)
102 |   i <- sample(5,replace = TRUE)
103 |   expect_equal(x[,j,,-i], a[,j,,-i[!is.na(i)]])
104 |   
105 |   i <- 1:5
106 |   expect_equal(x[,j,,-i], a[,j,,-i[!is.na(i)]])
107 |   
108 |   x[,,,1] <- a[,,,2]
109 |   a[,,,1] <- a[,,,2]
110 |   
111 |   expect_equal(x[], a)
112 |   
113 |   i <- sample(5,replace = TRUE)
114 |   i[sample(5,2)] <- NA
115 |   expect_equal(x[,,,i], a[,,,i])
116 |   
117 |   expect_equal(x[,,,-i], a[,,,-i[!is.na(i)]])
118 |   
119 |   j <- sample(2,replace = TRUE)
120 |   expect_equal(x[,j,,-i], a[,j,,-i[!is.na(i)]])
121 |   expect_equal(x[j,j,j,-i], a[j,j,j,-i[!is.na(i)]])
122 |   
123 | })
124 | 


--------------------------------------------------------------------------------
/tests/testthat/test.setget.R:
--------------------------------------------------------------------------------
  1 | 
  2 | context("Getter/Setter mode = 2")
  3 | 
  4 | test_that("Getter/Setter mode = 2", {
  5 |   skip("mode = 2 is not supported anymore")
  6 |   # path <- tempfile()
  7 |   # suppressWarnings({
  8 |   #   self <- create_lazyarray(path, 'double', c(1,3,4), multipart = TRUE, multipart_mode = 2)
  9 |   # })
 10 |   # private <- self$.__enclos_env__$private
 11 |   # value <- array(1:4, c(1,2,2))
 12 |   # 
 13 |   # expect_true(self$can_write)
 14 |   # 
 15 |   # self$`@set_data`(value, 1.0, 1:2, 1:2)
 16 |   # 
 17 |   # expect_equal(self$`@get_data`(1, 2, 1), 2)
 18 |   # 
 19 |   # expect_equal(self$`@get_data`(1, 1:3, 1), c(1,2,NA))
 20 |   # 
 21 |   # expect_identical(self[drop = FALSE], array(c(1,2,NA, 3,4,NA, rep(NA, 6)), c(1,3,4)))
 22 |   # 
 23 |   # self[1, 1:2, c(3,10)] <- 1:4
 24 |   # 
 25 |   # # change to read_only
 26 |   # private$read_only <- TRUE
 27 |   # expect_false(self$can_write)
 28 |   # 
 29 |   # expect_identical(self[drop = FALSE], array(c(1,2,NA, 3,4,NA, 1,2,NA, rep(NA, 3)), c(1,3,4)))
 30 |   # 
 31 |   # expect_error({self[1, 1:2, c(3,10)] <- 1:4})
 32 |   # 
 33 |   # # clean up
 34 |   # self$remove_data()
 35 |   # 
 36 |   # expect_false(dir.exists(private$.dir))
 37 |   # 
 38 |   # # operations will result in error
 39 |   # expect_error(self[1,1,1])
 40 |   
 41 | })
 42 | 
 43 | 
 44 | 
 45 | 
 46 | context("Getter/Setter mode = 1")
 47 | 
 48 | test_that("Getter/Setter mode = 1", {
 49 |   
 50 |   self <- lazyarray(tempfile(), storage_format = 'double', dim = c(1,3,4), type = 'fstarray')#, multipart = TRUE, multipart_mode = 1)
 51 |   private <- self$.__enclos_env__$private
 52 |   value <- array(1:4, c(1,2,2))
 53 |   
 54 |   expect_true(self$can_write)
 55 |   
 56 |   subsetAssignFST(value, file = self$storage_path, listOrEnv = list(1.0, 1:2, 1:2), dim = self$dim, dtype = 14L)
 57 |   # self$`@set_data`(value, 1.0, 1:2, 1:2)
 58 |   
 59 |   expect_equal(subsetFST(self$storage_path, list(1,2,1), self$dim, 14L, drop=TRUE), 2)
 60 |   # expect_equal(self$`@get_data`(1, 2, 1), 2)
 61 |   
 62 |   expect_equal(subsetFST(self$storage_path, list(1, 1:3, 1), self$dim, 14L, drop=TRUE), c(1,2,NA))
 63 |   # expect_equal(self$`@get_data`(1, 1:3, 1), c(1,2,NA))
 64 |   
 65 |   expect_equivalent(self[drop = FALSE], array(c(1,2,NA, 3,4,NA, rep(NA, 6)), c(1,3,4)))
 66 |   
 67 |   expect_error({ self[1, 1:2, c(3,10)] <- 1:4 })
 68 |   self[1, 1:2, c(3,4)] <- 1:4
 69 |   
 70 |   # change to read_only
 71 |   private$.read_only <- TRUE
 72 |   expect_false(self$can_write)
 73 |   
 74 |   expect_equivalent(self[drop = FALSE], array(c(1,2,NA, 3,4,NA, 1,2,NA, 3,4,NA), c(1,3,4)))
 75 |   
 76 |   expect_error({self[1, 1:2, c(3,4)] <- 1:4})
 77 |   
 78 |   # clean up
 79 |   self$remove_data()
 80 |   
 81 |   expect_false(dir.exists(private$.path))
 82 |   
 83 |   # operations will result in error
 84 |   expect_error(self[1,1,1])
 85 |   
 86 | })
 87 | 
 88 | 
 89 | 
 90 | 
 91 | 
 92 | context("Getter/Setter no partition")
 93 | 
 94 | test_that("Getter/Setter no partition", {
 95 |   skip("No partition is not supported anymore")
 96 |   # suppressWarnings({
 97 |   #   self <- create_lazyarray(tempfile(), 'double', c(1,3,4), multipart = FALSE)
 98 |   # })
 99 |   # private <- self$.__enclos_env__$private
100 |   # value <- array(1:4, c(1,2,2))
101 |   # 
102 |   # expect_true(self$can_write)
103 |   # 
104 |   # self$`@set_data`(value, 1.0, 1:2, 1:2)
105 |   # 
106 |   # expect_equal(self$`@get_data`(1, 2, 1), 2)
107 |   # 
108 |   # expect_equal(self$`@get_data`(1, 1:3, 1), c(1,2,NA))
109 |   # 
110 |   # expect_identical(self[drop = FALSE], array(c(1,2,NA, 3,4,NA, rep(NA, 6)), c(1,3,4)))
111 |   # 
112 |   # expect_error({
113 |   #   self[1, 1:2, c(3,10)] <- 1:4
114 |   # })
115 |   # 
116 |   # # change to read_only
117 |   # private$read_only <- TRUE
118 |   # expect_false(self$can_write)
119 |   # 
120 |   # expect_identical(self[drop = FALSE], array(c(1,2,NA, 3,4,NA, rep(NA, 6)), c(1,3,4)))
121 |   # 
122 |   # expect_error({self[1, 1:2, 3] <- 1:2})
123 |   # 
124 |   # # clean up
125 |   # self$remove_data()
126 |   # 
127 |   # expect_false(dir.exists(private$.dir))
128 |   # 
129 |   # # operations will result in error
130 |   # expect_error(self[1,1,1])
131 |   
132 | })
133 | 
134 | 
135 | 
136 | 
137 | 
138 | 


--------------------------------------------------------------------------------
/vignettes/.gitignore:
--------------------------------------------------------------------------------
1 | *.html
2 | *.R
3 | 


--------------------------------------------------------------------------------
/vignettes/basic-usage-of-lazyarray.Rmd:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: "A Basic Usage of 'LazyArray'"
 3 | output: rmarkdown::html_vignette
 4 | author: Zhengjia Wang
 5 | vignette: >
 6 |   %\VignetteIndexEntry{A Basic Usage of 'LazyArray'}
 7 |   %\VignetteEngine{knitr::rmarkdown}
 8 |   %\VignetteEncoding{UTF-8}
 9 | ---
10 | 
11 | ```{r, include = FALSE}
12 | knitr::opts_chunk$set(
13 |   collapse = TRUE,
14 |   comment = "#>"
15 | )
16 | ```
17 | 
18 | ```{r setup}
19 | library(lazyarray)
20 | ```
21 | 
22 | ### Initialize
23 | 
24 | Create a blank array and assign R object
25 | 
26 | ```{r}
27 | # Sample data (~24 MB)
28 | x <- rnorm(3e6); dim(x) <- c(10, 100, 100, 30)
29 | 
30 | # Save array to a path
31 | path <- tempfile()
32 | arr <- lazyarray(path, dim = dim(x), storage_format = 'double')
33 | arr[] <- x
34 | ```
35 | 
36 | Load existing array
37 | 
38 | ```{r}
39 | # Load existing array
40 | arr <- lazyarray(path)
41 | ```
42 | 
43 | To protect array from further changes, make it read-only.
44 | 
45 | ```{r}
46 | arr$make_readonly()
47 | arr$can_write
48 | ```
49 | 
50 | To make a read-only array writable:
51 | 
52 | ```{r}
53 | arr$make_writable()
54 | arr$can_write
55 | ```
56 | 
57 | ### S3 methods
58 | 
59 | 1. Set dimension names
60 | 
61 | ```{r}
62 | arr$make_writable()
63 | dimnames(arr) <- list(
64 |   A = 1:10,
65 |   B = 1:100,
66 |   C = 1:100,
67 |   D = 1:30
68 | )
69 | ```
70 | 
71 | 2. Subset and subset assign
72 | 
73 | ```{r}
74 | # Subset/read array
75 | y1 <- arr[]              
76 | y2 <- arr[,,,3]          
77 | 
78 | # Write to slice of data, writing to slices along the 
79 | # last dimension is optimized
80 | arr[,,,1] <- seq_len(1e5)
81 | ```
82 | 
83 | 3. Subset by formula
84 | 
85 | ```{r}
86 | sub <- subset(arr, A ~ A <= 2, B ~ B == 10)
87 | dim(sub)
88 | ```
89 | 
90 | ### Remove Arrays
91 | 
92 | Data created via `lazyarray` does not remove automatically. You need to finalize array by yourself. This is because multiple lazy array instances might point to a same dataset. If one of the object is garbage collected, you might not want to remove the data on hard drive as this will invalidate the other instances. To manually remove data, use
93 | 
94 | ```{r}
95 | arr$remove_data()
96 | ```
97 | 


--------------------------------------------------------------------------------