├── DESCRIPTION ├── LICENSE ├── NAMESPACE ├── NEWS.md ├── R ├── BGData.R ├── FWD.R ├── GWAS.R ├── chunkedApply.R ├── findRelated.R ├── getG.R ├── preprocess.R ├── segments.R ├── summarize.R └── utils.R ├── README.md ├── inst ├── CITATION └── extdata │ ├── chr1.bed │ ├── chr1.bim │ ├── chr1.fam │ ├── chr1.raw │ ├── chr2.bed │ ├── chr2.bim │ ├── chr2.fam │ ├── chr2.raw │ ├── chr3.bed │ ├── chr3.bim │ ├── chr3.fam │ ├── chr3.raw │ └── pheno.txt ├── man ├── BGData-class.Rd ├── BGData-package.Rd ├── BGData.Rd ├── FWD.Rd ├── GWAS.Rd ├── as.BGData.Rd ├── chunkedApply.Rd ├── chunkedMap.Rd ├── file-backed-matrices.Rd ├── findRelated.Rd ├── geno-class.Rd ├── geno.Rd ├── getG.Rd ├── getG_symDMatrix.Rd ├── load.BGData.Rd ├── multi-level-parallelism.Rd ├── orderedMerge.Rd ├── preprocess.Rd ├── readRAW.Rd ├── segments.Rd └── summarize.Rd ├── src ├── .gitignore ├── Makevars ├── fitLSYS.c ├── fitLSYS.h ├── init.c ├── preprocess.c ├── preprocess.h ├── rayOLS.c ├── rayOLS.h ├── summarize.c └── summarize.h └── tests ├── testthat.R └── testthat ├── helper-utils.R ├── test-BGData.R ├── test-GWAS.R ├── test-chunkedApply.R ├── test-getG.R ├── test-preprocess-int.R ├── test-preprocess-real.R ├── test-summarize.R └── test-utils.R /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: BGData 2 | Version: 2.4.1 3 | License: MIT + file LICENSE 4 | Title: A Suite of Packages for Analysis of Big Genomic Data 5 | Description: An umbrella package providing a phenotype/genotype data structure 6 | and scalable and efficient computational methods for large genomic datasets 7 | in combination with several other packages: 'BEDMatrix', 'LinkedMatrix', 8 | and 'symDMatrix'. 
9 | Authors@R: c( 10 | person("Gustavo", "de los Campos", email = "gustavoc@msu.edu", role = c("aut")), 11 | person("Alexander", "Grueneberg", email = "cran@agrueneberg.info", role = c("aut", "cre")), 12 | person("Paulino", "Perez", email = "perpdgo@gmail.com", role = c("ctb")), 13 | person("Ana", "Vazquez", email = "avazquez@epi.msu.edu", role = c("ctb"))) 14 | URL: https://github.com/QuantGen/BGData 15 | BugReports: https://github.com/QuantGen/BGData/issues 16 | Depends: 17 | R (>= 3.0.2), 18 | BEDMatrix (>= 1.4.0), 19 | LinkedMatrix (>= 1.3.0), 20 | symDMatrix (>= 2.0.0) 21 | Imports: 22 | methods, 23 | parallel, 24 | crochet (>= 2.1.0), 25 | bigmemory, 26 | synchronicity, 27 | ff, 28 | bit 29 | Suggests: 30 | data.table (>= 1.9.6), 31 | lme4, 32 | SKAT, 33 | testthat 34 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | YEAR: 2014-2015 2 | COPYRIGHT HOLDER: Gustavo de los Campos -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | S3method(as.BGData, BEDMatrix) 2 | S3method(as.BGData, ColumnLinkedMatrix) 3 | S3method(as.BGData, RowLinkedMatrix) 4 | S3method(findRelated, matrix) 5 | S3method(findRelated, symDMatrix) 6 | export(BGData) 7 | export(GWAS) 8 | export(as.BGData) 9 | export(chunkedApply) 10 | export(chunkedMap) 11 | export(findRelated) 12 | export(FWD) 13 | export(geno) 14 | export("geno<-") 15 | export(getG) 16 | export(getG_symDMatrix) 17 | export(load.BGData) 18 | export(map) 19 | export("map<-") 20 | export(orderedMerge) 21 | export(pheno) 22 | export("pheno<-") 23 | export(preprocess) 24 | export(readRAW) 25 | export(readRAW_big.matrix) 26 | export(readRAW_matrix) 27 | export(segments) 28 | export(summarize) 29 | exportClasses(BGData) 30 | exportMethods(geno) 31 | exportMethods("geno<-") 32 | 
exportMethods(pheno) 33 | exportMethods("pheno<-") 34 | exportMethods(map) 35 | exportMethods("map<-") 36 | importFrom(BEDMatrix, BEDMatrix) 37 | importFrom(LinkedMatrix, ColumnLinkedMatrix, LinkedMatrix, nNodes) 38 | importFrom(symDMatrix, nBlocks, symDMatrix) 39 | import(methods) 40 | importFrom(parallel, mclapply) 41 | importFrom(crochet, convertIndex) 42 | importFrom(bigmemory, attach.big.matrix, big.matrix, filebacked.big.matrix) 43 | importFrom(synchronicity, boost.mutex, lock, unlock) 44 | importFrom(ff, as.ff, ff, vt) 45 | importFrom(bit, physical, "physical<-") 46 | importFrom(stats, coef, lsfit, ls.print, model.frame, model.matrix, na.pass, 47 | pnorm, sd, terms, update) 48 | importFrom(utils, read.table, type.convert) 49 | importClassesFrom(BEDMatrix, BEDMatrix) 50 | importClassesFrom(LinkedMatrix, LinkedMatrix) 51 | importClassesFrom(bigmemory, big.matrix) 52 | importClassesFrom(symDMatrix, symDMatrix) 53 | useDynLib(BGData, .registration = TRUE, .fixes = "C_") 54 | -------------------------------------------------------------------------------- /NEWS.md: -------------------------------------------------------------------------------- 1 | # BGData 2.4.1 2 | 3 | - Fixed minor native routine registration issue. 4 | - Drop defunct CI services. 5 | - Update link to paper. 6 | 7 | 8 | # BGData 2.4.0 9 | 10 | - Add `FWD()` function for performing forward regressions. 11 | 12 | 13 | # BGData 2.3.0 14 | 15 | - Add `segments()` function for finding non-overlapping segments based on a 16 | summary statistic. 17 | - `preprocess()`: Add `nCores` parameter. 18 | - `findRelated()`: Map indices to sample names for matrix inputs. 19 | - `getG()`: Add `impute` parameter. 20 | - `getG()`: Impute by mean instead of 0 if `center = FALSE`. 21 | - `getG()`: Use `preprocess()` internally for better performance. 22 | - `as.BGData()`: Read genetic distances in .bim file as double instead of 23 | integer. 
24 | 25 | 26 | # BGData 2.2.0 27 | 28 | - Follow [Bioconductor S4 practices][1]. 29 | - If you have used `new()` to create `BGData` instances, please use the 30 | `BGData()` constructor function instead. 31 | - If you have used `@` to access the slots of `BGData` instances, please use 32 | the `geno()`, `pheno()`, and `map()` accessors instead. 33 | - `BGData()`: 34 | - Do not create dimnames for `geno` as this object is likely shared. 35 | - Check if `geno` has row names before creating `pheno` stub. 36 | - Check if `geno` has column names before creating `map` stub. 37 | - Rename `IID` in `pheno` stub to `sample_id`. 38 | - Rename `mrk` in `map` stub to `variant_id`. 39 | - Change format of rownames for `pheno` stub to a sequence starting with 40 | `sample_` and rownames for `map` stub to a sequence starting with 41 | `variant_` if `geno` does not have dimnames. 42 | - `as.BGData()`: 43 | - Force column classes when loading .fam and .bim files. 44 | - Force `FID` and `IID` columns to be of type `character` when loading 45 | alternate phenotype files. 46 | - Do not make assumptions about the structure of dimnames of a BEDMatrix 47 | object if it is passed without .fam and .bim file unless they are `NULL`. 48 | - Add validity tests for `BGData` objects: 49 | - Check if number of rows of `geno` matches number of rows of `pheno`. 50 | - Check if number of columns of `geno` matches number of rows of `map`. 51 | - Warn if the row names of `pheno` do not match the row names of `geno`. 52 | - Warn if the row names of `map` do not match the column names of `geno`. 53 | - Add `preprocess()` function for fast centering, scaling, and imputation. 54 | - `GWAS()`: Return number of records used for each variant and allele 55 | frequencies in `rayOLS`. 56 | - Update citation instructions. 57 | - Use `inherits(., *)` instead of `class(.) == *` (R4 compat). 58 | 59 | 60 | # BGData 2.1.0 61 | 62 | - Add `chunkedMap()` function. 
63 | - Improve error handling in `chunkedMap()` and `chunkedApply()`. 64 | - `summarize()`: Improve performance. 65 | - `GWAS()`: Improve performance of `rayOLS` method. 66 | - `GWAS()`: Fix bug when computing p-values for methods other than rayOLS, 67 | lsfit, or SKAT when `i` is used to subset samples. 68 | - `GWAS()`: Fix wrong results in `lsfit` method when covariates with missing 69 | values are used. 70 | - `as.BGData()`: Fix bug loading .fam and .bim files when path contains the 71 | word `bed`. 72 | 73 | 74 | # BGData 2.0.0 75 | 76 | ## Breaking Changes 77 | 78 | - Rename `bufferSize` to `chunkSize`. 79 | - Remove `nTasks` parameter from `chunkedApply()` and methods based on it. 80 | - Remove `crossprods` function. 81 | 82 | ## Other Changes 83 | 84 | - Change chunking strategy to improve parallelism: instead of loading a subset 85 | of `chunkSize` in the main process, load a subset of `chunkSize` in each 86 | fork. That way `nTasks` is not necessary anymore and the same code can be 87 | used for one core and multiple cores. 88 | - Add `findRelated()` function for use with matrices and symDMatrix objects. 89 | - Add `orderedMerge()` function that allows for phenotypes to be easily merged 90 | into a BGData object. 91 | - Performance improvements in `getG()` function: use single shared memory 92 | matrix to collect results. 93 | - Performance improvements in `rayOLS` method in `GWAS()` function. 94 | - `getG_symDMatrix()`: Support version 2 of symDMatrix package. 95 | - `getG_symDMatrix()`: Add `chunkSize` parameter. 96 | - `getG_symDMatrix()`: Add `minVar` parameter. 97 | - `as.BGData()`: Use rownames of BEDMatrix object as rownames for pheno, and 98 | colnames of BEDMatrix object as rownames for map. 99 | - Include process ID in verbose output if `nCores` > 1. 100 | 101 | ## Bug Fixes 102 | 103 | - `getG_symDMatrix()`: Fix scaling error when `scale = FALSE`. 
104 | - `getG_symDMatrix()`: Compute block indices correctly for out-of-order, 105 | non-sequential indices. 106 | - `getG_symDMatrix()`: Do not include centers and scales in attributes anymore 107 | because the influence of `j` and `minVar` is difficult to retain. 108 | 109 | 110 | # BGData 1.0.0 111 | 112 | Initial release. 113 | 114 | [1]: https://bioconductor.org/help/course-materials/2017/Zurich/S4-classes-and-methods.html 115 |
--------------------------------------------------------------------------------
/R/FWD.R:
--------------------------------------------------------------------------------
# Forward regression.
#
# Starts from an intercept-only model for the mean-centered response and adds
# one column of X per step (chosen by addOne()) until `df` predictors have been
# entered.
#
# y:            numeric response; it is centered by its mean before fitting.
# X:            matrix of predictors. If it has no column names, the columns
#               are labelled "X1", "X2", ...
# df:           number of predictors to add on top of the intercept.
# tol, maxIter: passed on to the solver; `tol` is rescaled by the RSS of the
#               intercept-only model before use.
# centerImpute: if TRUE, X is centered and imputed with BGData::preprocess().
# verbose:      if TRUE, a message with the model size and AIC is emitted after
#               each step.
#
# Returns a list with
#   B:    coefficient matrix with one row per coefficient ("Int" first) and one
#         column per step (column 1 is the intercept-only model).
#   path: data.frame with one row per step: the variable entered, RSS, LogLik,
#         VARE, DF, AIC and BIC.
FWD <- function(y, X, df = 20, tol = 1e-7, maxIter = 1000, centerImpute = TRUE, verbose = TRUE) {
    y <- y - mean(y)
    if (centerImpute) {
        X <- BGData::preprocess(X, center = TRUE, impute = TRUE)
    }
    if (is.null(colnames(X))) {
        # seq_len() instead of 1:ncol(X) so that a zero-column X does not get
        # the bogus labels "X1" and "X0"
        colNames <- paste0("X", seq_len(ncol(X)))
    } else {
        colNames <- colnames(X)
    }

    # Prepend the intercept; it occupies one of the `df` columns of B
    X <- cbind(1, X)
    df <- df + 1
    colNames <- c("Int", colNames)

    # Only the cross-products are needed from here on
    C <- crossprod(X)
    rhs <- crossprod(X, y)
    n <- length(y)
    p <- ncol(X)

    active <- rep(FALSE, p)
    names(active) <- colNames
    B <- matrix(data = 0, nrow = p, ncol = df)
    rownames(B) <- colNames
    RSS <- rep(NA_real_, df)
    DF <- rep(NA_real_, df)
    VARE <- rep(NA_real_, df)
    LogLik <- rep(NA_real_, df)
    AIC <- rep(NA_real_, df)
    BIC <- rep(NA_real_, df)
    path <- rep(NA_character_, df)

    # Step 1: intercept-only model
    active[1] <- TRUE
    B[1, 1] <- mean(y)
    RSS[1] <- sum((y - B[1, 1])^2)
    DF[1] <- 1
    VARE[1] <- RSS[1] / (n - DF[1])
    LogLik[1] <- -(n / 2) * log(2 * pi * VARE[1]) - RSS[1] / (2 * VARE[1])
    # Penalize DF + 1 parameters, exactly as in the later steps and as BIC
    # does, so that the AIC values along the path are comparable
    AIC[1] <- -2 * LogLik[1] + 2 * (DF[1] + 1)
    BIC[1] <- -2 * LogLik[1] + log(n) * (DF[1] + 1)
    path[1] <- colNames[1]

    # Make the tolerance relative to the null-model RSS
    tol <- tol * RSS[1]

    # seq_len(df)[-1L] is empty when only the intercept was requested, whereas
    # 2:df would count down (2, 1) and index outside of B
    for (i in seq_len(df)[-1L]) {
        tmp <- addOne(
            C = C,
            rhs = rhs,
            active = active,
            b = B[, i - 1],
            RSS = RSS[i - 1],
            maxIter = maxIter,
            tol = tol
        )
        B[, i] <- tmp[["b"]]
        if (length(tmp[["newPred"]]) > 0) {
            active[tmp[["newPred"]]] <- TRUE
            path[i] <- colNames[tmp[["newPred"]]]
        } else {
            # No predictor was left to add
            path[i] <- NA
        }
        RSS[i] <- tmp[["RSS"]]
        DF[i] <- sum(active)
        VARE[i] <- RSS[i] / (n - DF[i])
        LogLik[i] <- -(n / 2) * log(2 * pi * VARE[i]) - RSS[i] / VARE[i] / 2
        AIC[i] <- -2 * LogLik[i] + 2 * (DF[i] + 1)
        BIC[i] <- -2 * LogLik[i] + log(n) * (DF[i] + 1)
        if (verbose) {
            message("  ", DF[i] - 1, " predictors, AIC=", round(AIC[i], 2))
        }
    }

    list(
        B = B,
        path = data.frame(
            variable = path,
            RSS = RSS,
            LogLik = LogLik,
            VARE = VARE,
            DF = DF,
            AIC = AIC,
            BIC = BIC
        )
    )
}

# Add the single best predictor to the current model.
#
# C, rhs:       cross-products X'X and X'y.
# active:       logical vector flagging the coefficients currently in the model.
# b, RSS:       coefficients and residual sum of squares of the current model.
# maxIter, tol: passed on to fitSYS().
#
# Returns a list with the updated coefficients `b`, the index of the predictor
# that was added (`newPred`, integer(0) if none was left) and the new `RSS`.
addOne <- function(C, rhs, active, b, RSS, maxIter, tol) {
    activeSet <- which(active)
    inactiveSet <- which(!active)
    nActive <- length(activeSet)
    nInactive <- length(inactiveSet)

    # Nothing left to add: hand back the current fit unchanged. Without this
    # guard the candidate loop below would run over 1:0, i.e. c(1, 0), and pass
    # an NA index to the solver.
    if (nInactive == 0L) {
        return(list(b = b, newPred = integer(0), RSS = RSS))
    }

    if (nActive > 1) {
        # Model is not null: fit every candidate and keep the one with the
        # smallest RSS
        RSSNew <- rep(NA_real_, nInactive)
        for (i in seq_len(nInactive)) {
            fm <- fitSYS(
                C = C,
                rhs = rhs,
                b = b,
                active = c(inactiveSet[i], activeSet),
                RSS = RSS,
                maxIter = maxIter,
                tol = tol
            )
            RSSNew[i] <- fm[["RSS"]]
        }
        k <- which.min(RSSNew)
        fm <- fitSYS(
            C = C,
            rhs = rhs,
            b = b,
            active = c(inactiveSet[k], activeSet),
            RSS = RSS,
            maxIter = maxIter,
            tol = tol
        )
        ans <- list(b = fm[["b"]], newPred = inactiveSet[k], RSS = fm[["RSS"]])
    } else {
        # Model is null: single-predictor OLS has a closed form, so pick the
        # predictor with the largest reduction in RSS directly
        bOLS <- rhs / diag(C)
        dRSS <- diag(C) * bOLS^2
        k <- which.max(dRSS)
        b[k] <- bOLS[k]
        RSS <- RSS - bOLS[k]^2 * C[k, k]
        ans <- list(b = b, newPred = k, RSS = RSS)
    }
    return(ans)
}

124 | fitSYS <- function(C, rhs, b, active, RSS, maxIter, tol) { 125 | active <- active - 1L # for the 0-based index 126 | ans <-
.Call(C_fitLSYS, C, rhs, b, active, RSS, maxIter, tol) 127 | return(list(b = ans[[1]], RSS = ans[[2]])) 128 | } 129 |
--------------------------------------------------------------------------------
/R/GWAS.R:
--------------------------------------------------------------------------------
# Association scan over the columns of geno(data).
#
# formula: model without the variant, e.g. y ~ 1 or y ~ factor(sex) + age; its
#          variables are looked up in pheno(data).
# data:    a BGData object.
# method:  one of "rayOLS", "lsfit", "lm", "lm.fit", "glm", "lmer", "SKAT".
# i, j:    samples (rows) and variants (columns) of geno(data) to use.
# chunkSize, nCores, verbose: passed on to the chunked helpers.
# ...:     passed on to the method-specific worker or fitting function.
#
# For every method except SKAT the result has one row per variant in `j`,
# named after the corresponding column of geno(data).
GWAS <- function(formula, data, method = "lsfit", i = seq_len(nrow(geno(data))), j = seq_len(ncol(geno(data))), chunkSize = 5000L, nCores = getOption("mc.cores", 2L), verbose = FALSE, ...) {

    if (!inherits(data, "BGData")) {
        stop("data must BGData")
    }

    implemented <- c("rayOLS", "lsfit", "lm", "lm.fit", "glm", "lmer", "SKAT")
    if (!method %in% implemented) {
        stop("Only rayOLS, lsfit, lm, lm.fit, glm, lmer, and SKAT have been implemented so far.")
    }

    # Normalize whatever index type was passed
    i <- convertIndex(geno(data), i, "i")
    j <- convertIndex(geno(data), j, "j")

    # Methods with a dedicated implementation return right away
    if (method == "rayOLS") {
        if (length(labels(terms(formula))) > 0L) {
            stop("method rayOLS can only be used with y~1 formula, if you want to add covariates pre-adjust your phenotype.")
        }
        return(GWAS.rayOLS(formula = formula, data = data, i = i, j = j, chunkSize = chunkSize, nCores = nCores, verbose = verbose, ...))
    }
    if (method == "lsfit") {
        return(GWAS.lsfit(formula = formula, data = data, i = i, j = j, chunkSize = chunkSize, nCores = nCores, verbose = verbose, ...))
    }
    if (method == "SKAT") {
        if (!requireNamespace("SKAT", quietly = TRUE)) {
            stop("SKAT needed for this function to work. Please install it.", call. = FALSE)
        }
        return(GWAS.SKAT(formula = formula, data = data, i = i, j = j, verbose = verbose, ...))
    }

    # Generic path: refit the model once per variant with the variant added as
    # covariate `z`, which update() places first among the predictors
    if (method == "lmer") {
        if (!requireNamespace("lme4", quietly = TRUE)) {
            stop("lme4 needed for this function to work. Please install it.", call. = FALSE)
        }
        FUN <- lme4::lmer
    } else {
        FUN <- match.fun(method)
    }
    GWAS.model <- update(formula, ".~z+.")
    perVariant <- function(col, ...) {
        df <- pheno(data)[i, , drop = FALSE]
        df[["z"]] <- col
        fm <- FUN(GWAS.model, data = df, ...)
        getCoefficients(fm)
    }
    coefs <- chunkedApply(X = geno(data), MARGIN = 2L, FUN = perVariant, i = i, j = j, chunkSize = chunkSize, nCores = nCores, verbose = verbose, ...)
    coefs <- t(coefs)
    rownames(coefs) <- colnames(geno(data))[j]
    coefs
}

# Worker for method = "rayOLS": runs the compiled rayOLS routine on each chunk
# of genotypes against the numeric response and stacks the per-chunk results.
GWAS.rayOLS <- function(formula, data, i = seq_len(nrow(geno(data))), j = seq_len(ncol(geno(data))), chunkSize = 5000L, nCores = getOption("mc.cores", 2L), verbose = FALSE, ...) {
    y <- as.numeric(pheno(data)[i, getResponse(formula)])
    chunks <- chunkedMap(X = geno(data), FUN = rayOLS, i = i, j = j, chunkSize = chunkSize, nCores = nCores, verbose = verbose, y = y, ...)
    stacked <- do.call(rbind, chunks)
    dimnames(stacked) <- list(
        colnames(geno(data))[j],
        c("Estimate", "Std.Err", "t-value", "Pr(>|t|)", "n", "allele_freq")
    )
    stacked
}

# Worker for method = "lsfit": binds each variant in front of the covariate
# design matrix and returns the first row of the coefficient table reported by
# ls.print(), i.e. the row belonging to the variant.
GWAS.lsfit <- function(formula, data, i = seq_len(nrow(geno(data))), j = seq_len(ncol(geno(data))), chunkSize = 5000L, nCores = getOption("mc.cores", 2L), verbose = FALSE, ...) {

    # model.frame() evaluates its subset argument in the environment of the
    # formula, so build the full frame first and subset the rows afterwards.
    # na.pass keeps rows with missing values in the frame.
    fullFrame <- model.frame(formula = formula, data = pheno(data), na.action = na.pass)
    model <- model.matrix(formula, fullFrame[i, , drop = FALSE])

    y <- pheno(data)[i, getResponse(formula)]

    fitVariant <- function(col, ...) {
        fm <- lsfit(x = cbind(col, model), y = y, intercept = FALSE)
        ls.print(fm, print.it = FALSE)[["coef.table"]][[1L]][1L, ]
    }
    coefs <- chunkedApply(X = geno(data), MARGIN = 2L, FUN = fitVariant, i = i, j = j, chunkSize = chunkSize, nCores = nCores, verbose = verbose, ...)
    coefs <- t(coefs)
    rownames(coefs) <- colnames(geno(data))[j]

    coefs
}

# Worker for method = "SKAT": one SKAT test per group of variants.
#
# formula: the formula for the GWAS model without including the markers, e.g.
#          y~1 or y~factor(sex)+age; all of its variables must be in
#          pheno(data).
# data:    a BGData object.
# groups:  a vector mapping markers into groups (can be integer, character or
#          factor).
#
# Returns a matrix with one row per unique group and the columns "nMrk"
# (number of variants in the group) and "p-value".
# NOTE(review): `j` is accepted but not used here; variants are selected by
# `groups` only - confirm that this is intended.
GWAS.SKAT <- function(formula, data, groups, i = seq_len(nrow(geno(data))), j = seq_len(ncol(geno(data))), verbose = FALSE, ...) {

    uniqueGroups <- unique(groups)
    nGroups <- length(uniqueGroups)

    OUT <- matrix(data = double(), nrow = nGroups, ncol = 2L)
    colnames(OUT) <- c("nMrk", "p-value")
    rownames(OUT) <- uniqueGroups

    # The null model does not depend on the variants, so fit it only once
    H0 <- SKAT::SKAT_Null_Model(formula, data = pheno(data)[i, , drop = FALSE], ...)

    for (group in seq_len(nGroups)) {
        inGroup <- groups == uniqueGroups[group]
        Z <- geno(data)[i, inGroup, drop = FALSE]
        fm <- SKAT::SKAT(Z = Z, obj = H0, ...)
        OUT[group, ] <- c(ncol(Z), fm[["p.value"]])
        if (verbose) {
            message("Group ", group, " of ", nGroups, " ...")
        }
    }

    OUT
}

# Thin wrapper around the compiled rayOLS routine.
rayOLS <- function(x, y) {
    .Call(C_rayOLS, x, y)
}

# Extract the coefficient row of the variant from a fitted model. The variant
# is the first predictor of the updated model, hence row 2 of the coefficient
# table (row 1 being the intercept).
getCoefficients <- function(x) {
    UseMethod("getCoefficients")
}

getCoefficients.lm <- function(x) {
    coefTable <- coef(summary(x))
    coefTable[2L, ]
}

getCoefficients.glm <- function(x) {
    coefTable <- coef(summary(x))
    coefTable[2L, ]
}

# The row taken from the lmer summary gets a p-value appended that is derived
# from its third entry via the normal distribution.
# NOTE(review): this is the upper-tail (one-sided) probability, whereas the lm
# and glm summaries report two-sided p-values - confirm that this is intended.
getCoefficients.lmerMod <- function(x) {
    variantRow <- coef(summary(x))[2L, ]
    upperTail <- 1L - pnorm(variantRow[3L])
    c(variantRow, c(upperTail))
}

# Name of the response of a two-sided formula, as character.
getResponse <- function(formula) {
    # The second element of the formula's parse tree is its left-hand side (see
    # https://cran.r-project.org/doc/manuals/r-release/R-lang.html#Language-objects)
    as.character(formula[[2L]])
}

134 | -------------------------------------------------------------------------------- /R/chunkedApply.R: -------------------------------------------------------------------------------- 1 | chunkedMap <- function(X, FUN, i = seq_len(nrow(X)), j = seq_len(ncol(X)), chunkBy = 2L, chunkSize = 5000L, nCores = getOption("mc.cores", 2L), verbose = FALSE, ...) { 2 | if (length(dim(X)) != 2L) { 3 | stop("X must be a matrix-like object") 4 | } 5 | i <- convertIndex(X, i, "i") 6 | j <- convertIndex(X, j, "j") 7 | dim <- c(length(i), length(j)) 8 | if (is.null(chunkSize)) { 9 | chunkSize <- dim[chunkBy] 10 | nChunks <- 1L 11 | } else { 12 | nChunks <- ceiling(dim[chunkBy] / chunkSize) 13 | } 14 | chunkApply <- function(curChunk, ...)
{ 15 | if (verbose) { 16 | if (nCores > 1) { 17 | message("Process ", Sys.getpid(), ": Chunk ", curChunk, " of ", nChunks, " ...") 18 | } else { 19 | message("Chunk ", curChunk, " of ", nChunks, " ...") 20 | } 21 | } 22 | range <- seq( 23 | ((curChunk - 1L) * chunkSize) + 1L, 24 | min(curChunk * chunkSize, dim[chunkBy]) 25 | ) 26 | if (chunkBy == 2L) { 27 | chunk <- X[i, j[range], drop = FALSE] 28 | } else { 29 | chunk <- X[i[range], j, drop = FALSE] 30 | } 31 | FUN(chunk, ...) 32 | } 33 | if (nCores == 1L) { 34 | res <- lapply(X = seq_len(nChunks), FUN = chunkApply, ...) 35 | } else { 36 | # Suppress warnings because of custom error handling 37 | res <- suppressWarnings(mclapply(X = seq_len(nChunks), FUN = chunkApply, ..., mc.cores = nCores)) 38 | errors <- which(vapply(res, inherits, TRUE, "try-error")) 39 | if (length(errors) > 0L) { 40 | # With mc.preschedule = TRUE (the default), if a job fails, the 41 | # remaining jobs will fail as well with the same error message. 42 | # Therefore, the number of errors does not tell how many errors 43 | # actually occurred and only the first error message is forwarded. 44 | errorMessage <- attr(res[[errors[1L]]], "condition")[["message"]] 45 | stop("in chunk ", errors[1L], " (only first error is shown)", ": ", errorMessage, call. = FALSE) 46 | } 47 | } 48 | return(res) 49 | } 50 |

# Apply FUN over the rows (MARGIN = 1) or columns (MARGIN = 2) of X, loading
# the data chunk by chunk through chunkedMap() and combining the per-chunk
# results with simplifyList(). Extra arguments in `...` are forwarded to
# chunkedMap() and from there to FUN.
chunkedApply <- function(X, MARGIN, FUN, i = seq_len(nrow(X)), j = seq_len(ncol(X)), chunkSize = 5000L, nCores = getOption("mc.cores", 2L), verbose = FALSE, ...) {
    res <- chunkedMap(X = X, FUN = function(chunk, ...) {
        apply2(X = chunk, MARGIN = MARGIN, FUN = FUN, ...)
    }, i = i, j = j, chunkBy = MARGIN, chunkSize = chunkSize, nCores = nCores, verbose = verbose, ...)
    simplifyList(res)
}

# A more memory-efficient version of apply.
#
# apply always makes a copy of the data.
#
# FUN is called on the first row/column to learn the shape of its result, the
# output is preallocated accordingly, and the remaining rows/columns are then
# processed one by one:
#   - list results        -> list of length d[MARGIN], named after the margin
#   - results of length>1 -> matrix with one column per row/column of X and
#                            one row per element of the result
#   - scalar results      -> vector of length d[MARGIN], named after the margin
apply2 <- function(X, MARGIN, FUN, ...) {
    d <- dim(X)
    nSlices <- d[MARGIN]
    # Extract the k-th row or column as a plain vector
    getSlice <- function(k) {
        if (MARGIN == 1L) {
            X[k, ]
        } else {
            X[, k]
        }
    }
    # Indices that are still to be processed after the first one
    remaining <- if (nSlices > 1L) seq(2L, nSlices) else integer(0)
    sample <- FUN(getSlice(1L), ...)
    if (is.table(sample)) {
        stop("tables are not supported.")
    } else if (is.list(sample)) {
        # List
        OUT <- vector(mode = "list", length = nSlices)
        names(OUT) <- dimnames(X)[[MARGIN]]
        OUT[[1L]] <- sample
        for (k in remaining) {
            OUT[[k]] <- FUN(getSlice(k), ...)
        }
    } else if (length(sample) > 1L) {
        # Matrix or atomic vector of length > 1
        OUT <- matrix(data = normalizeType(typeof(sample)), nrow = length(sample), ncol = nSlices)
        if (!is.matrix(sample) && !is.null(names(sample))) {
            # The elements of the result always run along the rows of OUT,
            # whatever MARGIN is, so their names are row names. Putting them on
            # the columns for MARGIN = 1 fails unless OUT happens to be square.
            dimnames(OUT) <- list(names(sample), NULL)
        }
        OUT[, 1L] <- sample
        for (k in remaining) {
            OUT[, k] <- FUN(getSlice(k), ...)
        }
    } else {
        # Atomic vector of length 1
        OUT <- vector(mode = typeof(sample), length = nSlices)
        names(OUT) <- dimnames(X)[[MARGIN]]
        OUT[1L] <- sample
        for (k in remaining) {
            OUT[k] <- FUN(getSlice(k), ...)
        }
    }
    return(OUT)
}

# Combine the per-chunk results of chunkedMap(): matrices are bound together
# column-wise (keeping the row names of the first chunk), anything else is
# flattened with unlist().
simplifyList <- function(x) {
    sample <- x[[1L]]
    if (is.matrix(sample)) {
        x <- matrix(data = unlist(x), nrow = nrow(sample), byrow = FALSE)
        rownames(x) <- rownames(sample)
    } else {
        x <- unlist(x)
    }
    return(x)
}

--------------------------------------------------------------------------------
/R/findRelated.R:
--------------------------------------------------------------------------------
# Find samples that are related to another sample by more than `cutoff`.
findRelated <- function(x, ...) {
    UseMethod("findRelated")
}

# Matrix method: only the strict upper triangle is inspected, so each pair is
# considered once and the diagonal is ignored. Returns the row names of the
# samples that appear as the row member of at least one pair above the cutoff.
findRelated.matrix <- function(x, cutoff = 0.03, ...) {
    x[lower.tri(x, diag = TRUE)] <- 0
    pairs <- which(x > cutoff, arr.ind = TRUE, useNames = FALSE)
    samples <- unique(pairs[, 1L])
    rownames(x)[samples]
}

# symDMatrix method: walks over the blocks on and above the diagonal, loading
# one block at a time, and returns the names of the samples that appear as the
# row member of at least one pair above the cutoff.
findRelated.symDMatrix <- function(x, cutoff = 0.03, verbose = FALSE, ...) {
    n <- nBlocks(x)
    pairs <- lapply(seq_len(n), function(i) {
        lapply(seq(i, n), function(j) {
            if (verbose) {
                message("Working on block ", i, " ", j)
            }
            block <- x[[i]][[j]][]
            # Remove lower triangle in blocks that contain the diagonal
            if (i == j) {
                block[lower.tri(block, diag = TRUE)] <- 0
            }
            pairs <- which(block > cutoff, arr.ind = TRUE, useNames = FALSE)
            # Remap local indices to sample names
            remap <- matrix(character(), nrow = nrow(pairs), ncol = ncol(pairs))
            remap[, 1L] <- rownames(block)[pairs[, 1L]]
            remap[, 2L] <- colnames(block)[pairs[, 2L]]
            return(remap)
        })
    })
    pairs <- do.call(rbind, lapply(pairs, function(x) do.call(rbind, x)))
    unique(pairs[, 1L])
}

--------------------------------------------------------------------------------
/R/getG.R:
--------------------------------------------------------------------------------
# Zero-pad the integer `x` to the number of digits of `total`.
padDigits <- function(x, total) {
    formatC(x, width = as.integer(log10(total) + 1L), format = "d", flag = "0")
}

# Compute the cross-product of (optionally centered, scaled and imputed)
# genotypes, G = XX', chunk by chunk over the columns of X.
#
# X:             matrix-like genotypes, samples in rows, variants in columns.
# center, scale: TRUE to compute per chunk, FALSE to skip, or a numeric vector
#                of precomputed values that is indexed with the column
#                positions in X.
# impute:        passed on to preprocess().
# scaleG:        if TRUE, divide G by the mean of its diagonal (or, when `i2`
#                is given, by the number of variants used).
# minVar:        variants whose scale is below this value are dropped.
# i, j:          samples and variants to use.
# i2:            optional second set of samples; if given, X[i, ] X[i2, ]' is
#                computed and centers and scales have to be precomputed.
# chunkSize, nCores, verbose: control chunking, forking and progress output.
#
# Returns an ordinary matrix with the sample names as dimnames.
getG <- function(X, center = TRUE, scale = TRUE, impute = TRUE, scaleG = TRUE, minVar = 1e-05, i = seq_len(nrow(X)), j = seq_len(ncol(X)), i2 = NULL, chunkSize = 5000L, nCores = getOption("mc.cores", 2L), verbose = FALSE) {

    # compute XY' rather than XX'
    hasY <- !is.null(i2)

    if (hasY) {
        # Statistics computed on X[i, ] would not apply to X[i2, ]
        if (is.logical(center) && center == TRUE) {
            stop("centers need to be precomputed.")
        }
        if (is.logical(scale) && scale == TRUE) {
            stop("scales need to be precomputed.")
        }
    }

    i <- convertIndex(X, i, "i")
    j <- convertIndex(X, j, "j")
    if (hasY) {
        i2 <- convertIndex(X, i2, "i")
    }

    nX <- nrow(X)
    pX <- ncol(X)

    if (min(i) < 1L || max(i) > nX) {
        stop("Index out of bounds")
    }
    if (min(j) < 1L || max(j) > pX) {
        stop("Index out of bounds")
    }
    if (hasY) {
        if (min(i2) < 1L || max(i2) > nX) {
            stop("Index out of bounds")
        }
    }

    n <- length(i)
    p <- length(j)
    if (hasY) {
        n2 <- length(i2)
    }

    if (is.null(chunkSize)) {
        chunkSize <- p
        nChunks <- 1L
    } else {
        nChunks <- ceiling(p / chunkSize)
    }

    # Shared-memory accumulator that all forks add their chunk result to
    if (hasY) {
        G <- big.matrix(nrow = n, ncol = n2, type = "double", init = 0.0, dimnames = list(rownames(X)[i], rownames(X)[i2]))
    } else {
        G <- big.matrix(nrow = n, ncol = n, type = "double", init = 0.0, dimnames = list(rownames(X)[i], rownames(X)[i]))
    }

    mutex <- boost.mutex()

    chunkApply <- function(curChunk) {

        if (verbose) {
            if (nCores > 1) {
                message("Process ", Sys.getpid(), ": Chunk ", curChunk, " of ", nChunks, " ...")
            } else {
                message("Chunk ", curChunk, " of ", nChunks, " ...")
            }
        }

        # subset
        range <- seq(
            ((curChunk - 1L) * chunkSize) + 1L,
            min(curChunk * chunkSize, p)
        )
        X1 <- X[i, j[range], drop = FALSE]
        if (hasY) {
            X2 <- X[i2, j[range], drop = FALSE]
        }

        # compute centers
        if (is.logical(center) && center == TRUE) {
            center.chunk <- colMeans(X1, na.rm = TRUE)
        } else if (is.numeric(center)) {
            center.chunk <- center[j[range]]
        } else {
            center.chunk <- FALSE
        }

        # compute scales
        if (is.logical(scale) && scale == TRUE) {
            scale.chunk <- apply(X = X1, MARGIN = 2L, FUN = sd, na.rm = TRUE)
        } else if (is.numeric(scale)) {
            scale.chunk <- scale[j[range]]
        } else {
            scale.chunk <- FALSE
        }

        # remove constant columns
        if (is.numeric(scale.chunk)) {
            removeCols <- which(scale.chunk < minVar)
            if (length(removeCols) > 0L) {
                # drop = FALSE keeps a matrix even if a single column is left;
                # otherwise ncol() below would be NULL
                X1 <- X1[, -removeCols, drop = FALSE]
                if (hasY) {
                    X2 <- X2[, -removeCols, drop = FALSE]
                }
                scale.chunk <- scale.chunk[-removeCols]
                # A logical FALSE must stay a scalar: FALSE[-1] is logical(0),
                # which preprocess() rejects
                if (is.numeric(center.chunk)) {
                    center.chunk <- center.chunk[-removeCols]
                }
            }
        }

        # Number of variants of this chunk that are actually used
        pChunk <- ncol(X1)

        # compute XX'
        if (pChunk > 0L) {

            # center, scale and impute without duplications
            # set nCores to 1 here because section is already parallelized
            X1 <- preprocess(X1, center = center.chunk, scale = scale.chunk, impute = impute, nCores = 1)
            if (hasY) {
                X2 <- preprocess(X2, center = center.chunk, scale = scale.chunk, impute = impute, nCores = 1)
            }

            if (hasY) {
                G_chunk <- tcrossprod(x = X1, y = X2)
            } else {
                G_chunk <- tcrossprod(X1)
            }

            # Serialize the read-modify-write on the shared matrix
            lock(mutex)
            G[] <- G[] + G_chunk
            unlock(mutex)

        }

        return(pChunk)

    }

    if (nCores == 1L) {
        res <- lapply(X = seq_len(nChunks), FUN = chunkApply)
    } else {
        res <- mclapply(X = seq_len(nChunks), FUN = chunkApply, mc.cores = nCores)
    }

    # Convert big.matrix to matrix
    G <- G[]

    if (scaleG) {
        if (hasY) {
            K <- do.call(sum, res)
        } else {
            # Use seq instead of diag to avoid copy as it does not increase ref count.
            # The end point is computed in double precision because n * n
            # overflows the integer range for large n.
            K <- mean(G[seq(from = 1, to = as.double(n) * n, by = n + 1)])
        }
        G[] <- G / K
    }

    return(G)

}

162 | 163 | getG_symDMatrix <- function(X, center = TRUE, scale = TRUE, impute = TRUE, scaleG = TRUE, minVar = 1e-05, blockSize = 5000L, folderOut = paste0("symDMatrix_", randomString()), vmode = "double", i = seq_len(nrow(X)), j = seq_len(ncol(X)), chunkSize = 5000L, nCores = getOption("mc.cores", 2L), verbose = FALSE) { 164 | 165 | i <- convertIndex(X, i, "i") 166 | j <- convertIndex(X, j, "j") 167 | 168 | nX <- nrow(X) 169 | pX <- ncol(X) 170 | 171 | if (min(i) < 1L || max(i) > nX) { 172 | stop("Index out of bounds") 173 | } 174 | if (min(j) < 1L || max(j) > pX) { 175 | stop("Index out of bounds") 176 | } 177 | 178 | n <- length(i) 179 | p <- length(j) 180 | 181 | if (is.null(chunkSize)) { 182 | chunkSize <- p 183 | nChunks <- 1L 184 | } else { 185 | nChunks <- ceiling(p / chunkSize) 186 | } 187 | 188 | if (is.logical(center) && center == TRUE) { 189 | if (verbose) { 190 | message("Computing centers ...") 191 | } 192 | center <- rep(0, pX) 193 | names(center) <- colnames(X) 194 | center[j] <- chunkedApply(X = X, MARGIN = 2L, FUN = mean, i = i, j = j, chunkSize = chunkSize, nCores = nCores, verbose = FALSE, na.rm = TRUE) 195 | } 196 | 197 | if (is.logical(scale) && scale == TRUE) { 198 | if (verbose) { 199 | message("Computing scales ...") 200 | } 201 | scale <- rep(1, pX) 202 | names(scale) <- colnames(X) 203 | scale[j] <- chunkedApply(X = X, MARGIN = 2L, FUN = sd, i = i, j = j, chunkSize = chunkSize, nCores = nCores, verbose = FALSE, na.rm = TRUE) 204 | } 205 | 206 | if (file.exists(folderOut)) { 207 | stop(folderOut, " already exists") 208 | } 209 | dir.create(folderOut) 210 | 211 | if (is.null(blockSize)) { 212 | blockSize <- n 213 | nBlocks <- 1L 214 | } else { 215 | nBlocks <- ceiling(n / blockSize) 216 | } 217 | 218 | blockIndices <- split(i, ceiling(seq_along(i) / blockSize)) 219 | args <- vector(mode = "list", length = nBlocks) 220 | counter <- 1L 221 | for (rowIndex in
1L:nBlocks) { 222 | rowArgs <- vector(mode = "list", length = nBlocks) 223 | for (colIndex in 1L:nBlocks) { 224 | if (verbose) { 225 | message("Block ", rowIndex, "-", colIndex, " ...") 226 | } 227 | if (colIndex >= rowIndex) { 228 | blockName <- paste0("data_", padDigits(rowIndex, nBlocks), "_", padDigits(colIndex, nBlocks), ".bin") 229 | block <- as.ff(getG(X, center = center, scale = scale, impute = impute, scaleG = FALSE, minVar = minVar, i = blockIndices[[rowIndex]], j = j, i2 = blockIndices[[colIndex]], chunkSize = chunkSize, nCores = nCores, verbose = FALSE), filename = paste0(folderOut, "/", blockName), vmode = vmode) 230 | # Change ff path to a relative one 231 | physical(block)[["filename"]] <- blockName 232 | rowArgs[[colIndex]] <- block 233 | counter <- counter + 1L 234 | } else { 235 | rowArgs[[colIndex]] <- vt(args[[colIndex]][[rowIndex]]) 236 | } 237 | } 238 | args[[rowIndex]] <- do.call(ColumnLinkedMatrix, rowArgs) 239 | } 240 | 241 | G <- do.call(symDMatrix, args) 242 | 243 | if (scaleG) { 244 | K <- mean(diag(G)) 245 | for (rowIndex in seq_len(nBlocks)) { 246 | for (colIndex in seq(rowIndex, nBlocks)) { 247 | G[[rowIndex]][[colIndex]][] <- G[[rowIndex]][[colIndex]][] / K 248 | } 249 | } 250 | } 251 | 252 | save(G, file = paste0(folderOut, "/G.RData")) 253 | 254 | return(G) 255 | 256 | } 257 | -------------------------------------------------------------------------------- /R/preprocess.R: -------------------------------------------------------------------------------- 1 | preprocess <- function(X, center = FALSE, scale = FALSE, impute = FALSE, nCores = getOption("mc.cores", 2L)) { 2 | if (!(is.numeric(X) && length(dim(X)) == 2)) { 3 | stop("'X' needs to be a numeric matrix") 4 | } 5 | if (!(is.logical(center) && length(center) == 1L) && !(is.numeric(center) && length(center) == ncol(X))) { 6 | stop("'center' needs to be either a logical vector of size 1 or a numeric vector of size 'ncol(X)'") 7 | } 8 | if (!(is.logical(scale) && length(scale) == 
# File: R/segments.R

# Group variants into segments around discoveries, chromosome by chromosome.
#
# Within each chromosome, variants with statistic <= threshold are treated as
# discoveries. Every discovery and every variant within +/- gap base pairs of
# it is flagged, and each run of consecutive flagged variants forms one
# segment.
#
# Arguments:
#   statistic  Numeric vector, one value per variant.
#   chr        Character or numeric vector of chromosome labels.
#   bp         Numeric vector of base-pair positions.
#   threshold  Number; variants at or below it are discoveries.
#   gap        Number; half-width (in base pairs) of the window around each
#              discovery.
#   trim       If TRUE, each segment is cut back to span from its first to its
#              last discovery.
#   verbose    If TRUE, a message is emitted per chromosome.
#   snpid      Optional vector indexed like `statistic`; if given, a
#              `leadSNP` column holds the entry of the variant with the
#              smallest statistic.
#
# Returns a data frame with one row per segment and the columns chr, start,
# end, length, bpStart, bpEnd, bpLength, minValue, minValuePos, minValueBp
# (and leadSNP). start, end and minValuePos are positions in the input
# vectors. Rows are ordered by first appearance of the chromosome in `chr`.
segments <- function(statistic, chr, bp, threshold, gap, trim = FALSE, verbose = FALSE, snpid = NULL) {
    if (length(unique(c(length(statistic), length(chr), length(bp)))) != 1) {
        stop("statistic, chr, and bp need to match in length")
    }
    if (!is.numeric(statistic)) {
        stop("'statistic' needs to be a numeric vector")
    }
    if (!(is.numeric(chr) || is.character(chr))) {
        stop("'chr' needs to be a either a character or numeric vector")
    }
    if (!is.numeric(bp)) {
        stop("'bp' needs to be a numeric vector")
    }
    if (!is.numeric(threshold)) {
        stop("'threshold' needs to a number")
    }
    if (!is.numeric(gap)) {
        stop("'gap' needs to a number")
    }
    uniqueChr <- unique(chr)
    out <- vector(mode = "list", length = length(uniqueChr))
    # Results are stored by position in uniqueChr, not by chromosome value:
    # a chromosome labelled 0 or a non-integer label is not a valid list
    # index
    for (chrIndex in seq_along(uniqueChr)) {
        curChr <- uniqueChr[chrIndex]
        if (verbose) {
            message("Working on chromosome ", curChr)
        }
        # Extract chromosome data
        chrFilter <- which(chr == curChr)
        statisticChr <- statistic[chrFilter]
        bpChr <- bp[chrFilter]
        # Determine variants below threshold
        discoverySet <- which(statisticChr <= threshold)
        # Set discoveries and all variants within +/- gap to 1, leave rest as 0
        signal <- rep(0, length(chrFilter))
        for (discovery in discoverySet) {
            signal[abs(bpChr - bpChr[discovery]) <= gap] <- 1
        }
        # Determine the runs in the 0/1 signal
        runs <- rle(signal)
        # Determine at what positions within the chromosome the runs start and
        # end while removing 0-runs
        runStart <- c(1, cumsum(runs[["lengths"]][-length(runs[["lengths"]])]) + 1)
        withinSegment <- runs[["values"]] == 1
        runStart <- runStart[withinSegment]
        runEnd <- runStart + runs[["lengths"]][withinSegment] - 1
        runLength <- runs[["lengths"]][withinSegment]
        # Determine value and position of smallest variant within segment, and
        # optionally trim segment (i.e., remove variants that are not internal
        # to the segment containing GWAS-significant variants)
        minValue <- vector(mode = "numeric", length = length(runStart))
        minValuePos <- vector(mode = "integer", length = length(runStart))
        for (curSeg in seq_along(runStart)) {
            segFilter <- seq(runStart[curSeg], runEnd[curSeg])
            statisticSeq <- statisticChr[segFilter]
            minValuePosSeg <- which.min(statisticSeq)
            minValue[curSeg] <- statisticSeq[minValuePosSeg]
            # Map the within-chromosome position back through chrFilter (as
            # done for start/end below); adding an offset to chrFilter[1] is
            # only right when the chromosome is contiguous in the input
            minValuePos[curSeg] <- chrFilter[segFilter[minValuePosSeg]]
            if (trim) {
                # Determine which variants in the segment passed the threshold
                significantVariants <- which(statisticSeq <= threshold)
                # Set start of run to first significant variant and end of run
                # to last significant variant
                runStart[curSeg] <- segFilter[significantVariants[1]]
                runEnd[curSeg] <- segFilter[significantVariants[length(significantVariants)]]
                runLength[curSeg] <- runEnd[curSeg] - runStart[curSeg] + 1
            }
        }
        # Determine at what base-pair positions the runs start and end
        bpStart <- bpChr[runStart]
        bpEnd <- bpChr[runEnd]
        bpLength <- bpEnd - bpStart + 1
        # Determine at what positions within x the runs start and end (more
        # useful information than chromosome by chromosome because it is easier
        # to extract)
        xStart <- chrFilter[runStart]
        xEnd <- chrFilter[runEnd]
        # Prepare chromosome summary (there might be no segments, so do not
        # rely on recycling)
        outChr <- data.frame(
            chr = rep(curChr, times = length(runStart)),
            start = xStart,
            end = xEnd,
            length = runLength,
            bpStart = bpStart,
            bpEnd = bpEnd,
            bpLength = bpLength,
            minValue = minValue,
            minValuePos = minValuePos,
            minValueBp = bp[minValuePos]
        )
        if (!is.null(snpid)) {
            outChr$leadSNP <- snpid[minValuePos]
        }
        out[[chrIndex]] <- outChr
    }
    # Combine chromosomes
    out <- do.call(rbind, out)
    rownames(out) <- NULL
    return(out)
}

# File: R/summarize.R

# Per-column summaries of X, computed chunk by chunk.
#
# Each chunk delivered by chunkedMap() is passed to the compiled C_summarize
# routine; the result is labelled with the chunk's column names as row names
# and with the column names "freq_na", "allele_freq" and "sd". The per-chunk
# results are stacked and returned as a data frame.
summarize <- function(X, i = seq_len(nrow(X)), j = seq_len(ncol(X)), chunkSize = 5000L, nCores = getOption("mc.cores", 2L), verbose = FALSE) {
    res <- chunkedMap(X = X, FUN = function(chunk) {
        summaries <- .Call(C_summarize, chunk)
        rownames(summaries) <- colnames(chunk)
        colnames(summaries) <- c("freq_na", "allele_freq", "sd")
        return(summaries)
    }, i = i, j = j, chunkSize = chunkSize, nCores = nCores, verbose = verbose)
    res <- do.call(rbind, res)
    as.data.frame(res)
}

# File: R/utils.R

# Count the lines of a text file.
#
# Arguments:
#   path    Path of the file.
#   header  If TRUE, one line is subtracted from the count.
#
# Returns the count as an integer.
getLineCount <- function(path, header) {
    con <- file(path, open = "r")
    # Close the connection even if reading fails
    on.exit(close(con), add = TRUE)
    n <- 0L
    # Read in batches; a batch of length 0 means the end of the file
    repeat {
        nRead <- length(readLines(con, n = 10000L))
        if (nRead == 0L) {
            break
        }
        n <- n + nRead
    }
    if (header) {
        n <- n - 1L
    }
    return(n)
}

# Read the first line of a file and split it into fields.
#
# Arguments:
#   path  Path of the file.
#   sep   Field separator handed to scan().
#
# Returns a character vector with the fields of the first line.
getFileHeader <- function(path, sep = "") {
    con <- file(path, open = "r")
    # Close the connection even if scan() fails
    on.exit(close(con), add = TRUE)
    header <- scan(con, nlines = 1L, what = character(), sep = sep, quiet = TRUE)
    return(header)
}
# Number of fields on the first line of a file (fields as split by
# getFileHeader() using `sep`).
getColumnCount <- function(path, sep = "") {
    length(getFileHeader(path, sep))
}

# Random string of five characters drawn with replacement from the digits
# 0-9 and the lower- and upper-case letters.
randomString <- function() {
    alphabet <- c(0L:9L, letters, LETTERS)
    drawn <- sample(alphabet, size = 5L, replace = TRUE)
    paste(drawn, collapse = "")
}

# Turn a type name into an empty vector of that type.
#
# A non-empty character vector is handed to vector(mode = ); if that works,
# the resulting vector is returned, otherwise a warning is raised and an empty
# character vector is returned. Any other value is returned unchanged.
normalizeType <- function(val) {
    looksLikeTypeName <- typeof(val) == "character" && length(val) > 0L
    if (!looksLikeTypeName) {
        # value doesn't contain type information and can be used as is
        return(val)
    }
    # Wrapped in a list so that a legitimate NULL result can be told apart
    # from a failed conversion
    attempt <- tryCatch(list(value = vector(mode = val)), error = function(e) NULL)
    if (is.null(attempt)) {
        warning("could no convert type, using character instead")
        return(character())
    }
    attempt[["value"]]
}

# Build a BGData object from the example files in the extdata folder of the
# installed BGData package: the three chromosome .bed files are opened with
# BEDMatrix(), linked by columns, and combined with pheno.txt as alternate
# phenotype file.
loadExample <- function() {
    path <- system.file("extdata", package = "BGData")
    message("Loading chromosomes as .bed files...")
    openChromosome <- function(chr) {
        suppressMessages(BEDMatrix(paste0(path, "/", chr)))
    }
    chromosomes <- lapply(c("chr1", "chr2", "chr3"), openChromosome)
    m <- do.call(ColumnLinkedMatrix, chromosomes)
    as.BGData(m, alternatePhenotypeFile = paste0(path, "/pheno.txt"))
}
studies (GWAS) or genomic relationship matrices (G matrices). It also contains a container class called `BGData` that holds genotypes, sample information, and variant information. 7 | 8 | Modern genomic datasets are big (large *n*), high-dimensional (large *p*), and multi-layered. The challenges that need to be addressed are memory requirements and computational demands. Our goal is to develop software that will enable researchers to carry out analyses with big genomic data within the R environment. 9 | 10 | We have identified several approaches to tackle those challenges within R: 11 | 12 | - File-backed matrices: The data is stored on the hard drive and users can read in smaller chunks when they are needed. 13 | - Linked arrays: For very large datasets a single file-backed array may not be enough or convenient. A linked array is an array whose content is distributed over multiple file-backed nodes. 14 | - Multiple dispatch: Methods are presented to users so that they can treat these arrays pretty much as if they were RAM arrays. 15 | - Multi-level parallelism: Exploit multi-core and multi-node computing. 16 | - Inputs: Users can create these arrays from standard formats (e.g., PLINK .bed). 17 | 18 | The BGData package is an umbrella package that comprises several packages: [BEDMatrix](https://CRAN.R-project.org/package=BEDMatrix), [LinkedMatrix](https://CRAN.R-project.org/package=LinkedMatrix), and [symDMatrix](https://CRAN.R-project.org/package=symDMatrix). 19 | 20 | 21 | Examples 22 | -------- 23 | 24 | ### Loading the package 25 | 26 | Load the BGData package: 27 | 28 | ```R 29 | library(BGData) 30 | ``` 31 | 32 | ### Inspecting the example dataset 33 | 34 | The `inst/extdata` folder contains example files that were generated from the 250k SNP and phenotype data in [Atwell et al. (2010)](https://doi.org/10.1038/nature08800). Only the first 300 SNPs of chromosomes 1, 2, and 3 were included to keep the size of the example dataset small enough for CRAN. 
[PLINK](https://www.cog-genomics.org/plink2) was used to convert the data to [.bed](https://www.cog-genomics.org/plink2/input#bed) and [.raw](https://www.cog-genomics.org/plink2/input#raw) files. `FT10` has been chosen as a phenotype and is provided as an [alternate phenotype file](https://www.cog-genomics.org/plink2/input#pheno). The file is intentionally shuffled to demonstrate that the additional phenotypes are put in the same order as the rest of the phenotypes. 35 | 36 | ```R 37 | path <- system.file("extdata", package = "BGData") 38 | list.files(path) 39 | #> [1] "chr1.bed" "chr1.bim" "chr1.fam" "chr1.raw" "chr2.bed" "chr2.bim" 40 | #> [7] "chr2.fam" "chr2.raw" "chr3.bed" "chr3.bim" "chr3.fam" "chr3.raw" 41 | #> [13] "pheno.txt" 42 | ``` 43 | 44 | ### Loading example dataset 45 | 46 | #### Loading individual PLINK .bed files 47 | 48 | Load the .bed file for chromosome 1 (chr1.bed) using the [BEDMatrix](https://CRAN.R-project.org/package=BEDMatrix) package: 49 | 50 | ```R 51 | chr1 <- BEDMatrix(paste0(path, "/chr1.bed")) 52 | #> Extracting number of individuals and rownames from .fam file... 53 | #> Extracting number of markers and colnames from .bim file... 54 | ``` 55 | 56 | `BEDMatrix` objects behave similarly to regular matrices: 57 | 58 | ```R 59 | dim(chr1) 60 | #> [1] 199 300 61 | rownames(chr1)[1:10] 62 | #> [1] "5837_5837" "6008_6008" "6009_6009" "6016_6016" "6040_6040" "6042_6042" 63 | #> [7] "6043_6043" "6046_6046" "6064_6064" "6074_6074" 64 | colnames(chr1)[1:10] 65 | #> [1] "snp1_T" "snp2_G" "snp3_A" "snp4_T" "snp5_G" "snp6_T" "snp7_C" 66 | #> [8] "snp8_C" "snp9_C" "snp10_G" 67 | chr1["6008_6008", "snp5_G"] 68 | #> [1] 0 69 | ``` 70 | 71 | #### Linking multiple BEDMatrix objects together 72 | 73 | Load the other two .bed files: 74 | 75 | ```R 76 | chr2 <- BEDMatrix(paste0(path, "/chr2.bed")) 77 | #> Extracting number of individuals and rownames from .fam file... 78 | #> Extracting number of markers and colnames from .bim file... 
79 | chr3 <- BEDMatrix(paste0(path, "/chr3.bed")) 80 | #> Extracting number of individuals and rownames from .fam file... 81 | #> Extracting number of markers and colnames from .bim file... 82 | ``` 83 | 84 | Combine the BEDMatrix objects by columns using the [LinkedMatrix](https://CRAN.R-project.org/package=LinkedMatrix) package to avoid the inconvenience of having three separate matrices: 85 | 86 | ```R 87 | wg <- ColumnLinkedMatrix(chr1, chr2, chr3) 88 | ``` 89 | 90 | Just like `BEDMatrix` objects, `LinkedMatrix` objects also behave similarly to regular matrices: 91 | 92 | ```R 93 | dim(wg) 94 | #> [1] 199 900 95 | rownames(wg)[1:10] 96 | #> [1] "5837_5837" "6008_6008" "6009_6009" "6016_6016" "6040_6040" "6042_6042" 97 | #> [7] "6043_6043" "6046_6046" "6064_6064" "6074_6074" 98 | colnames(wg)[1:10] 99 | #> [1] "snp1_T" "snp2_G" "snp3_A" "snp4_T" "snp5_G" "snp6_T" "snp7_C" 100 | #> [8] "snp8_C" "snp9_C" "snp10_G" 101 | wg["6008_6008", "snp5_G"] 102 | #> [1] 0 103 | ``` 104 | 105 | ### Creating a BGData object 106 | 107 | `BGData` objects can be created from individual `BEDMatrix` objects or a collection of `BEDMatrix` objects as a `LinkedMatrix` object using the `as.BGData()` function. This will read the .fam and .bim files that come with the .bed files. The `alternatePhenotypeFile` parameter points to the file that contains the `FT10` phenotype: 108 | 109 | ```R 110 | bg <- as.BGData(wg, alternatePhenotypeFile = paste0(path, "/pheno.txt")) 111 | #> Extracting phenotypes from .fam file, assuming that the .fam file of the first BEDMatrix instance is representative of all the other nodes... 112 | #> Extracting map from .bim files... 113 | #> Merging alternate phenotype file... 114 | ``` 115 | 116 | The `bg` object will use the `LinkedMatrix` object as genotypes, the .fam file augmented by the `FT10` phenotype as sample information, and the .bim file as variant information. 
117 | 118 | ```R 119 | str(bg) 120 | #> Formal class 'BGData' [package "BGData"] with 3 slots 121 | #> ..@ geno :Formal class 'ColumnLinkedMatrix' [package "LinkedMatrix"] with 1 slot 122 | #> .. .. ..@ .Data:List of 3 123 | #> .. .. .. ..$ :BEDMatrix: 199 x 300 [/home/agrueneberg/.pkgs/R/BGData/extdata/chr1.bed] 124 | #> .. .. .. ..$ :BEDMatrix: 199 x 300 [/home/agrueneberg/.pkgs/R/BGData/extdata/chr2.bed] 125 | #> .. .. .. ..$ :BEDMatrix: 199 x 300 [/home/agrueneberg/.pkgs/R/BGData/extdata/chr3.bed] 126 | #> ..@ pheno:'data.frame': 199 obs. of 7 variables: 127 | #> .. ..$ FID : int [1:199] 5837 6008 6009 6016 6040 6042 6043 6046 6064 6074 ... 128 | #> .. ..$ IID : int [1:199] 5837 6008 6009 6016 6040 6042 6043 6046 6064 6074 ... 129 | #> .. ..$ PAT : int [1:199] 0 0 0 0 0 0 0 0 0 0 ... 130 | #> .. ..$ MAT : int [1:199] 0 0 0 0 0 0 0 0 0 0 ... 131 | #> .. ..$ SEX : int [1:199] 0 0 0 0 0 0 0 0 0 0 ... 132 | #> .. ..$ PHENOTYPE: int [1:199] -9 -9 -9 -9 -9 -9 -9 -9 -9 -9 ... 133 | #> .. ..$ FT10 : num [1:199] 57 60 98 75 71 56 90 93 96 91 ... 134 | #> ..@ map :'data.frame': 900 obs. of 6 variables: 135 | #> .. ..$ chromosome : int [1:900] 1 1 1 1 1 1 1 1 1 1 ... 136 | #> .. ..$ snp_id : chr [1:900] "snp1" "snp2" "snp3" "snp4" ... 137 | #> .. ..$ genetic_distance : int [1:900] 0 0 0 0 0 0 0 0 0 0 ... 138 | #> .. ..$ base_pair_position: int [1:900] 657 3102 4648 4880 5975 6063 6449 6514 6603 6768 ... 139 | #> .. ..$ allele_1 : chr [1:900] "T" "G" "A" "T" ... 140 | #> .. ..$ allele_2 : chr [1:900] "C" "A" "C" "C" ... 141 | ``` 142 | 143 | ### Saving a BGData object 144 | 145 | A BGData object can be saved like any other R object using the `save` function: 146 | 147 | ```R 148 | save(bg, file = "BGData.RData") 149 | ``` 150 | 151 | ### Loading a BGData object 152 | 153 | The genotypes in a `BGData` object can be of various types, some of which need to be initialized in a particular way. 
The `load.BGData` function takes care of reloading a saved BGData object properly: 154 | 155 | ```R 156 | load.BGData("BGData.RData") 157 | #> Loaded objects: bg 158 | ``` 159 | 160 | ### Summarizing data 161 | 162 | Use `chunkedApply` to count missing values (among others): 163 | 164 | ```R 165 | countNAs <- chunkedApply(X = geno(bg), MARGIN = 2, FUN = function(x) sum(is.na(x))) 166 | ``` 167 | 168 | Use the `summarize` function to calculate minor allele frequencies and frequency of missing values: 169 | 170 | ```R 171 | summarize(geno(bg)) 172 | ``` 173 | 174 | ### Running GWASes with different regression methods 175 | 176 | A data structure for genomic data is useful when defining methods that act on both phenotype and genotype information. We have implemented a `GWAS` function that supports various regression methods. The formula takes phenotypes from the sample information of the `BGData` object and inserts one marker at a time. 177 | 178 | ```R 179 | gwas <- GWAS(formula = FT10 ~ 1, data = bg) 180 | ``` 181 | 182 | ### Generating the G Matrix 183 | 184 | ```R 185 | G <- getG(geno(bg)) 186 | ``` 187 | 188 | 189 | Installation 190 | ------------ 191 | 192 | Install the stable version from CRAN: 193 | 194 | ```R 195 | install.packages("BGData") 196 | ``` 197 | 198 | Alternatively, install the development version from GitHub: 199 | 200 | ```R 201 | # install.packages("remotes") 202 | remotes::install_github("QuantGen/BGData") 203 | ``` 204 | 205 | 206 | Documentation 207 | ------------- 208 | 209 | Further documentation can be found on [RDocumentation](https://www.rdocumentation.org/packages/BGData). 
210 | 211 | 212 | Contributing 213 | ------------ 214 | 215 | - Issue Tracker: https://github.com/QuantGen/BGData/issues 216 | - Source Code: https://github.com/QuantGen/BGData 217 | -------------------------------------------------------------------------------- /inst/CITATION: -------------------------------------------------------------------------------- 1 | bibentry( 2 | bibtype = "Article", 3 | textVersion = "Grueneberg, A., and G. de los Campos, 2019 BGData - A Suite of R Packages for Genomic Analysis with Big Data. G3: Genes, Genomes, Genetics 9(5): 1377-1383.", 4 | title = "BGData - A Suite of R Packages for Genomic Analysis with Big Data", 5 | author = c( 6 | person("Alexander", "Grueneberg"), 7 | person("Gustavo", "de los Campos") 8 | ), 9 | journal = "G3: Genes, Genomes, Genetics", 10 | year = "2019", 11 | volume = "9", 12 | number = "5", 13 | pages = "1377--1383", 14 | doi = "10.1534/g3.119.400018", 15 | url = "https://doi.org/10.1534/g3.119.400018" 16 | ) 17 | -------------------------------------------------------------------------------- /inst/extdata/chr1.bed: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QuantGen/BGData/fe7a57779d903f7056d5841482c9afdbeae5744b/inst/extdata/chr1.bed -------------------------------------------------------------------------------- /inst/extdata/chr1.bim: -------------------------------------------------------------------------------- 1 | 1 snp1 0 657 T C 2 | 1 snp2 0 3102 G A 3 | 1 snp3 0 4648 A C 4 | 1 snp4 0 4880 T C 5 | 1 snp5 0 5975 G T 6 | 1 snp6 0 6063 T C 7 | 1 snp7 0 6449 C T 8 | 1 snp8 0 6514 C T 9 | 1 snp9 0 6603 C T 10 | 1 snp10 0 6768 G A 11 | 1 snp11 0 7601 T C 12 | 1 snp12 0 8193 A G 13 | 1 snp13 0 8617 A G 14 | 1 snp14 0 10219 A T 15 | 1 snp15 0 10449 T A 16 | 1 snp16 0 10969 G A 17 | 1 snp17 0 11493 A G 18 | 1 snp18 0 11696 A C 19 | 1 snp19 0 12584 A C 20 | 1 snp20 0 12659 C A 21 | 1 snp21 0 13045 C T 22 | 1 snp22 0 14385 A T 23 | 1 
snp23 0 19819 T A 24 | 1 snp24 0 20892 C A 25 | 1 snp25 0 21043 T A 26 | 1 snp26 0 21128 T A 27 | 1 snp27 0 22522 G C 28 | 1 snp28 0 23838 T A 29 | 1 snp29 0 25315 G A 30 | 1 snp30 0 25365 T C 31 | 1 snp31 0 25773 T C 32 | 1 snp32 0 26288 T A 33 | 1 snp33 0 27265 C T 34 | 1 snp34 0 28948 T C 35 | 1 snp35 0 28978 A C 36 | 1 snp36 0 29291 A G 37 | 1 snp37 0 30529 A G 38 | 1 snp38 0 30683 T A 39 | 1 snp39 0 31515 G A 40 | 1 snp40 0 31926 A G 41 | 1 snp41 0 32210 T C 42 | 1 snp42 0 32807 A G 43 | 1 snp43 0 34125 A G 44 | 1 snp44 0 34599 C G 45 | 1 snp45 0 35856 C T 46 | 1 snp46 0 37072 G T 47 | 1 snp47 0 38946 C A 48 | 1 snp48 0 39751 T G 49 | 1 snp49 0 41178 G A 50 | 1 snp50 0 41427 A G 51 | 1 snp51 0 41887 G A 52 | 1 snp52 0 44567 C A 53 | 1 snp53 0 45075 T A 54 | 1 snp54 0 45105 C G 55 | 1 snp55 0 45580 A T 56 | 1 snp56 0 45683 T C 57 | 1 snp57 0 46373 C T 58 | 1 snp58 0 46499 A G 59 | 1 snp59 0 46912 A G 60 | 1 snp60 0 46935 C T 61 | 1 snp61 0 47577 G A 62 | 1 snp62 0 47692 G T 63 | 1 snp63 0 48118 A G 64 | 1 snp64 0 48181 T C 65 | 1 snp65 0 49080 C T 66 | 1 snp66 0 51392 A C 67 | 1 snp67 0 51706 A T 68 | 1 snp68 0 51878 C A 69 | 1 snp69 0 52202 C A 70 | 1 snp70 0 53183 T G 71 | 1 snp71 0 53729 T C 72 | 1 snp72 0 53901 A G 73 | 1 snp73 0 55684 C T 74 | 1 snp74 0 57136 T G 75 | 1 snp75 0 57686 C A 76 | 1 snp76 0 59637 A G 77 | 1 snp77 0 60083 G T 78 | 1 snp78 0 60772 C G 79 | 1 snp79 0 61122 T A 80 | 1 snp80 0 61266 T A 81 | 1 snp81 0 61405 T A 82 | 1 snp82 0 61661 T C 83 | 1 snp83 0 62259 A T 84 | 1 snp84 0 62935 G C 85 | 1 snp85 0 63084 T A 86 | 1 snp86 0 63645 C G 87 | 1 snp87 0 63759 G A 88 | 1 snp88 0 63915 G C 89 | 1 snp89 0 64149 G A 90 | 1 snp90 0 64651 G A 91 | 1 snp91 0 68340 T G 92 | 1 snp92 0 68880 A G 93 | 1 snp93 0 69311 C G 94 | 1 snp94 0 70933 C T 95 | 1 snp95 0 71326 G T 96 | 1 snp96 0 71348 T C 97 | 1 snp97 0 71868 A C 98 | 1 snp98 0 72138 A T 99 | 1 snp99 0 72756 G C 100 | 1 snp100 0 72894 T C 101 | 1 snp101 0 72924 A G 102 | 1 snp102 0 73047 T G 
103 | 1 snp103 0 73467 T G 104 | 1 snp104 0 73691 T C 105 | 1 snp105 0 73851 A G 106 | 1 snp106 0 73989 A G 107 | 1 snp107 0 74169 G C 108 | 1 snp108 0 74707 T C 109 | 1 snp109 0 75481 T C 110 | 1 snp110 0 75721 T C 111 | 1 snp111 0 75899 C T 112 | 1 snp112 0 76188 C G 113 | 1 snp113 0 76217 T G 114 | 1 snp114 0 76847 G A 115 | 1 snp115 0 76879 C T 116 | 1 snp116 0 76906 A G 117 | 1 snp117 0 77127 T C 118 | 1 snp118 0 77140 A G 119 | 1 snp119 0 77243 A G 120 | 1 snp120 0 77458 T A 121 | 1 snp121 0 78803 A T 122 | 1 snp122 0 78975 A T 123 | 1 snp123 0 79418 G C 124 | 1 snp124 0 80216 T C 125 | 1 snp125 0 80374 C A 126 | 1 snp126 0 80400 T G 127 | 1 snp127 0 80850 A T 128 | 1 snp128 0 81068 T A 129 | 1 snp129 0 81496 G C 130 | 1 snp130 0 81854 T C 131 | 1 snp131 0 81869 T C 132 | 1 snp132 0 82197 C T 133 | 1 snp133 0 82290 T C 134 | 1 snp134 0 83117 G C 135 | 1 snp135 0 83177 A C 136 | 1 snp136 0 83219 C T 137 | 1 snp137 0 84144 G A 138 | 1 snp138 0 84379 T A 139 | 1 snp139 0 84558 G C 140 | 1 snp140 0 85561 C A 141 | 1 snp141 0 85860 C G 142 | 1 snp142 0 86656 C T 143 | 1 snp143 0 87060 A G 144 | 1 snp144 0 87791 C G 145 | 1 snp145 0 87985 T C 146 | 1 snp146 0 88300 C T 147 | 1 snp147 0 88658 A G 148 | 1 snp148 0 89312 G T 149 | 1 snp149 0 90606 G T 150 | 1 snp150 0 92353 T C 151 | 1 snp151 0 92507 T A 152 | 1 snp152 0 92866 T C 153 | 1 snp153 0 93562 G T 154 | 1 snp154 0 93740 T C 155 | 1 snp155 0 93800 A G 156 | 1 snp156 0 95018 G C 157 | 1 snp157 0 95225 G A 158 | 1 snp158 0 95386 C A 159 | 1 snp159 0 95749 C G 160 | 1 snp160 0 95962 A T 161 | 1 snp161 0 96521 T C 162 | 1 snp162 0 96726 T C 163 | 1 snp163 0 96770 A C 164 | 1 snp164 0 97473 T C 165 | 1 snp165 0 98317 G T 166 | 1 snp166 0 98617 T C 167 | 1 snp167 0 99001 T C 168 | 1 snp168 0 99456 A G 169 | 1 snp169 0 99757 A G 170 | 1 snp170 0 99785 G A 171 | 1 snp171 0 99805 A C 172 | 1 snp172 0 100267 A T 173 | 1 snp173 0 100314 C T 174 | 1 snp174 0 100501 G A 175 | 1 snp175 0 100832 G C 176 | 1 snp176 0 103175 
G C 177 | 1 snp177 0 103453 T G 178 | 1 snp178 0 103654 G C 179 | 1 snp179 0 103749 T G 180 | 1 snp180 0 103963 T A 181 | 1 snp181 0 105029 A C 182 | 1 snp182 0 105282 G A 183 | 1 snp183 0 105894 C G 184 | 1 snp184 0 107585 T C 185 | 1 snp185 0 109472 T C 186 | 1 snp186 0 109627 T C 187 | 1 snp187 0 112907 A G 188 | 1 snp188 0 113076 A G 189 | 1 snp189 0 114187 T G 190 | 1 snp190 0 114648 T C 191 | 1 snp191 0 114737 C A 192 | 1 snp192 0 114915 C T 193 | 1 snp193 0 115320 G T 194 | 1 snp194 0 117199 C A 195 | 1 snp195 0 117379 T A 196 | 1 snp196 0 118192 C A 197 | 1 snp197 0 118441 A G 198 | 1 snp198 0 119003 T C 199 | 1 snp199 0 120328 G C 200 | 1 snp200 0 120376 G T 201 | 1 snp201 0 120456 G A 202 | 1 snp202 0 120787 T C 203 | 1 snp203 0 122005 G C 204 | 1 snp204 0 122503 G A 205 | 1 snp205 0 122809 A G 206 | 1 snp206 0 123313 C G 207 | 1 snp207 0 123924 A T 208 | 1 snp208 0 125155 G A 209 | 1 snp209 0 126251 A G 210 | 1 snp210 0 127093 T C 211 | 1 snp211 0 127891 T C 212 | 1 snp212 0 128116 A G 213 | 1 snp213 0 128366 A G 214 | 1 snp214 0 129025 T C 215 | 1 snp215 0 131281 T C 216 | 1 snp216 0 132276 G A 217 | 1 snp217 0 132773 A C 218 | 1 snp218 0 132898 A G 219 | 1 snp219 0 133144 T G 220 | 1 snp220 0 133186 C T 221 | 1 snp221 0 133607 C A 222 | 1 snp222 0 133626 C A 223 | 1 snp223 0 133701 G A 224 | 1 snp224 0 133794 G A 225 | 1 snp225 0 134063 C A 226 | 1 snp226 0 135519 G A 227 | 1 snp227 0 136188 C T 228 | 1 snp228 0 136211 T A 229 | 1 snp229 0 136399 T G 230 | 1 snp230 0 136467 T C 231 | 1 snp231 0 136773 A G 232 | 1 snp232 0 137104 C T 233 | 1 snp233 0 137134 A G 234 | 1 snp234 0 137545 A G 235 | 1 snp235 0 138060 C T 236 | 1 snp236 0 138335 A G 237 | 1 snp237 0 138627 T C 238 | 1 snp238 0 138875 C A 239 | 1 snp239 0 138989 A G 240 | 1 snp240 0 139393 A C 241 | 1 snp241 0 139832 C T 242 | 1 snp242 0 139991 G T 243 | 1 snp243 0 140005 G C 244 | 1 snp244 0 140671 T C 245 | 1 snp245 0 141411 G T 246 | 1 snp246 0 141556 A T 247 | 1 snp247 0 142940 G T 248 | 1 
snp248 0 143199 G C 249 | 1 snp249 0 143272 A T 250 | 1 snp250 0 143471 A C 251 | 1 snp251 0 143509 C G 252 | 1 snp252 0 143704 C G 253 | 1 snp253 0 144331 A C 254 | 1 snp254 0 144392 T G 255 | 1 snp255 0 144410 G T 256 | 1 snp256 0 144832 A G 257 | 1 snp257 0 144872 A G 258 | 1 snp258 0 145457 A C 259 | 1 snp259 0 146220 G T 260 | 1 snp260 0 147220 G T 261 | 1 snp261 0 147236 G T 262 | 1 snp262 0 148601 C G 263 | 1 snp263 0 148990 T A 264 | 1 snp264 0 150430 A T 265 | 1 snp265 0 150457 C A 266 | 1 snp266 0 150992 C T 267 | 1 snp267 0 151942 C A 268 | 1 snp268 0 153104 G T 269 | 1 snp269 0 153232 T A 270 | 1 snp270 0 155442 G T 271 | 1 snp271 0 155984 A C 272 | 1 snp272 0 156619 T C 273 | 1 snp273 0 157491 C T 274 | 1 snp274 0 157974 T C 275 | 1 snp275 0 158134 A G 276 | 1 snp276 0 158305 A G 277 | 1 snp277 0 158640 A G 278 | 1 snp278 0 158910 G C 279 | 1 snp279 0 163082 G A 280 | 1 snp280 0 163492 A G 281 | 1 snp281 0 163569 C A 282 | 1 snp282 0 163766 T C 283 | 1 snp283 0 164085 T C 284 | 1 snp284 0 164310 C G 285 | 1 snp285 0 164375 A T 286 | 1 snp286 0 164393 G C 287 | 1 snp287 0 164553 A G 288 | 1 snp288 0 164679 C T 289 | 1 snp289 0 164815 A T 290 | 1 snp290 0 164969 G T 291 | 1 snp291 0 165386 G C 292 | 1 snp292 0 165397 A G 293 | 1 snp293 0 165625 T C 294 | 1 snp294 0 165766 C T 295 | 1 snp295 0 165984 C T 296 | 1 snp296 0 166030 C G 297 | 1 snp297 0 166386 A G 298 | 1 snp298 0 166850 G T 299 | 1 snp299 0 167493 A G 300 | 1 snp300 0 167692 G T 301 | -------------------------------------------------------------------------------- /inst/extdata/chr1.fam: -------------------------------------------------------------------------------- 1 | 5837 5837 0 0 0 -9 2 | 6008 6008 0 0 0 -9 3 | 6009 6009 0 0 0 -9 4 | 6016 6016 0 0 0 -9 5 | 6040 6040 0 0 0 -9 6 | 6042 6042 0 0 0 -9 7 | 6043 6043 0 0 0 -9 8 | 6046 6046 0 0 0 -9 9 | 6064 6064 0 0 0 -9 10 | 6074 6074 0 0 0 -9 11 | 6243 6243 0 0 0 -9 12 | 6709 6709 0 0 0 -9 13 | 6897 6897 0 0 0 -9 14 | 6898 6898 0 0 0 -9 15 | 
6899 6899 0 0 0 -9 16 | 6900 6900 0 0 0 -9 17 | 6901 6901 0 0 0 -9 18 | 6903 6903 0 0 0 -9 19 | 6904 6904 0 0 0 -9 20 | 6905 6905 0 0 0 -9 21 | 6906 6906 0 0 0 -9 22 | 6907 6907 0 0 0 -9 23 | 6908 6908 0 0 0 -9 24 | 6909 6909 0 0 0 -9 25 | 6910 6910 0 0 0 -9 26 | 6911 6911 0 0 0 -9 27 | 6913 6913 0 0 0 -9 28 | 6914 6914 0 0 0 -9 29 | 6915 6915 0 0 0 -9 30 | 6916 6916 0 0 0 -9 31 | 6917 6917 0 0 0 -9 32 | 6918 6918 0 0 0 -9 33 | 6919 6919 0 0 0 -9 34 | 6920 6920 0 0 0 -9 35 | 6921 6921 0 0 0 -9 36 | 6922 6922 0 0 0 -9 37 | 6923 6923 0 0 0 -9 38 | 6924 6924 0 0 0 -9 39 | 6926 6926 0 0 0 -9 40 | 6927 6927 0 0 0 -9 41 | 6928 6928 0 0 0 -9 42 | 6929 6929 0 0 0 -9 43 | 6930 6930 0 0 0 -9 44 | 6931 6931 0 0 0 -9 45 | 6932 6932 0 0 0 -9 46 | 6933 6933 0 0 0 -9 47 | 6936 6936 0 0 0 -9 48 | 6937 6937 0 0 0 -9 49 | 6939 6939 0 0 0 -9 50 | 6940 6940 0 0 0 -9 51 | 6942 6942 0 0 0 -9 52 | 6943 6943 0 0 0 -9 53 | 6944 6944 0 0 0 -9 54 | 6945 6945 0 0 0 -9 55 | 6946 6946 0 0 0 -9 56 | 6951 6951 0 0 0 -9 57 | 6956 6956 0 0 0 -9 58 | 6957 6957 0 0 0 -9 59 | 6958 6958 0 0 0 -9 60 | 6959 6959 0 0 0 -9 61 | 6960 6960 0 0 0 -9 62 | 6961 6961 0 0 0 -9 63 | 6962 6962 0 0 0 -9 64 | 6963 6963 0 0 0 -9 65 | 6964 6964 0 0 0 -9 66 | 6965 6965 0 0 0 -9 67 | 6966 6966 0 0 0 -9 68 | 6967 6967 0 0 0 -9 69 | 6968 6968 0 0 0 -9 70 | 6969 6969 0 0 0 -9 71 | 6970 6970 0 0 0 -9 72 | 6971 6971 0 0 0 -9 73 | 6972 6972 0 0 0 -9 74 | 6973 6973 0 0 0 -9 75 | 6974 6974 0 0 0 -9 76 | 6975 6975 0 0 0 -9 77 | 6976 6976 0 0 0 -9 78 | 6977 6977 0 0 0 -9 79 | 6978 6978 0 0 0 -9 80 | 6979 6979 0 0 0 -9 81 | 6980 6980 0 0 0 -9 82 | 6981 6981 0 0 0 -9 83 | 6982 6982 0 0 0 -9 84 | 6983 6983 0 0 0 -9 85 | 6984 6984 0 0 0 -9 86 | 6985 6985 0 0 0 -9 87 | 6988 6988 0 0 0 -9 88 | 7000 7000 0 0 0 -9 89 | 7014 7014 0 0 0 -9 90 | 7033 7033 0 0 0 -9 91 | 7062 7062 0 0 0 -9 92 | 7064 7064 0 0 0 -9 93 | 7081 7081 0 0 0 -9 94 | 7094 7094 0 0 0 -9 95 | 7123 7123 0 0 0 -9 96 | 7147 7147 0 0 0 -9 97 | 7163 7163 0 0 0 -9 98 | 7231 
7231 0 0 0 -9 99 | 7255 7255 0 0 0 -9 100 | 7275 7275 0 0 0 -9 101 | 7282 7282 0 0 0 -9 102 | 7296 7296 0 0 0 -9 103 | 7306 7306 0 0 0 -9 104 | 7323 7323 0 0 0 -9 105 | 7346 7346 0 0 0 -9 106 | 7418 7418 0 0 0 -9 107 | 7424 7424 0 0 0 -9 108 | 7438 7438 0 0 0 -9 109 | 7460 7460 0 0 0 -9 110 | 7461 7461 0 0 0 -9 111 | 7477 7477 0 0 0 -9 112 | 7514 7514 0 0 0 -9 113 | 7515 7515 0 0 0 -9 114 | 7516 7516 0 0 0 -9 115 | 7517 7517 0 0 0 -9 116 | 7518 7518 0 0 0 -9 117 | 7519 7519 0 0 0 -9 118 | 7520 7520 0 0 0 -9 119 | 7521 7521 0 0 0 -9 120 | 7522 7522 0 0 0 -9 121 | 7523 7523 0 0 0 -9 122 | 7524 7524 0 0 0 -9 123 | 7525 7525 0 0 0 -9 124 | 7526 7526 0 0 0 -9 125 | 8213 8213 0 0 0 -9 126 | 8214 8214 0 0 0 -9 127 | 8215 8215 0 0 0 -9 128 | 8222 8222 0 0 0 -9 129 | 8230 8230 0 0 0 -9 130 | 8231 8231 0 0 0 -9 131 | 8233 8233 0 0 0 -9 132 | 8235 8235 0 0 0 -9 133 | 8236 8236 0 0 0 -9 134 | 8237 8237 0 0 0 -9 135 | 8239 8239 0 0 0 -9 136 | 8240 8240 0 0 0 -9 137 | 8241 8241 0 0 0 -9 138 | 8242 8242 0 0 0 -9 139 | 8243 8243 0 0 0 -9 140 | 8245 8245 0 0 0 -9 141 | 8247 8247 0 0 0 -9 142 | 8249 8249 0 0 0 -9 143 | 8254 8254 0 0 0 -9 144 | 8256 8256 0 0 0 -9 145 | 8258 8258 0 0 0 -9 146 | 8259 8259 0 0 0 -9 147 | 8264 8264 0 0 0 -9 148 | 8265 8265 0 0 0 -9 149 | 8266 8266 0 0 0 -9 150 | 8270 8270 0 0 0 -9 151 | 8271 8271 0 0 0 -9 152 | 8274 8274 0 0 0 -9 153 | 8275 8275 0 0 0 -9 154 | 8283 8283 0 0 0 -9 155 | 8284 8284 0 0 0 -9 156 | 8285 8285 0 0 0 -9 157 | 8290 8290 0 0 0 -9 158 | 8296 8296 0 0 0 -9 159 | 8297 8297 0 0 0 -9 160 | 8300 8300 0 0 0 -9 161 | 8306 8306 0 0 0 -9 162 | 8310 8310 0 0 0 -9 163 | 8311 8311 0 0 0 -9 164 | 8312 8312 0 0 0 -9 165 | 8313 8313 0 0 0 -9 166 | 8314 8314 0 0 0 -9 167 | 8323 8323 0 0 0 -9 168 | 8325 8325 0 0 0 -9 169 | 8326 8326 0 0 0 -9 170 | 8329 8329 0 0 0 -9 171 | 8334 8334 0 0 0 -9 172 | 8335 8335 0 0 0 -9 173 | 8337 8337 0 0 0 -9 174 | 8343 8343 0 0 0 -9 175 | 8351 8351 0 0 0 -9 176 | 8353 8353 0 0 0 -9 177 | 8354 8354 0 0 0 -9 178 | 8357 
8357 0 0 0 -9 179 | 8365 8365 0 0 0 -9 180 | 8366 8366 0 0 0 -9 181 | 8369 8369 0 0 0 -9 182 | 8374 8374 0 0 0 -9 183 | 8376 8376 0 0 0 -9 184 | 8378 8378 0 0 0 -9 185 | 8387 8387 0 0 0 -9 186 | 8388 8388 0 0 0 -9 187 | 8389 8389 0 0 0 -9 188 | 8395 8395 0 0 0 -9 189 | 8411 8411 0 0 0 -9 190 | 8412 8412 0 0 0 -9 191 | 8420 8420 0 0 0 -9 192 | 8422 8422 0 0 0 -9 193 | 8423 8423 0 0 0 -9 194 | 8424 8424 0 0 0 -9 195 | 8426 8426 0 0 0 -9 196 | 8430 8430 0 0 0 -9 197 | 9057 9057 0 0 0 -9 198 | 9058 9058 0 0 0 -9 199 | 100000 100000 0 0 0 -9 200 | -------------------------------------------------------------------------------- /inst/extdata/chr2.bed: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QuantGen/BGData/fe7a57779d903f7056d5841482c9afdbeae5744b/inst/extdata/chr2.bed -------------------------------------------------------------------------------- /inst/extdata/chr2.bim: -------------------------------------------------------------------------------- 1 | 2 snp52382 0 2651 T C 2 | 2 snp52383 0 2961 C T 3 | 2 snp52384 0 10035 T G 4 | 2 snp52385 0 29477 C T 5 | 2 snp52386 0 30398 A G 6 | 2 snp52387 0 30725 G C 7 | 2 snp52388 0 30947 A G 8 | 2 snp52389 0 31142 T C 9 | 2 snp52390 0 31187 A G 10 | 2 snp52391 0 36510 C T 11 | 2 snp52392 0 36822 G T 12 | 2 snp52393 0 37182 T C 13 | 2 snp52394 0 37890 A G 14 | 2 snp52395 0 40795 C T 15 | 2 snp52396 0 57230 G A 16 | 2 snp52397 0 66351 T C 17 | 2 snp52398 0 68457 G C 18 | 2 snp52399 0 69008 T A 19 | 2 snp52400 0 69432 T C 20 | 2 snp52401 0 69598 T G 21 | 2 snp52402 0 70249 G A 22 | 2 snp52403 0 71642 G T 23 | 2 snp52404 0 72058 G T 24 | 2 snp52405 0 72539 T G 25 | 2 snp52406 0 73074 C T 26 | 2 snp52407 0 73116 G T 27 | 2 snp52408 0 74088 T C 28 | 2 snp52409 0 74635 T C 29 | 2 snp52410 0 75978 C A 30 | 2 snp52411 0 76134 C T 31 | 2 snp52412 0 76310 C T 32 | 2 snp52413 0 77634 A G 33 | 2 snp52414 0 78295 T C 34 | 2 snp52415 0 78565 G A 35 | 2 snp52416 0 78958 
A G 36 | 2 snp52417 0 79128 A G 37 | 2 snp52418 0 79708 C A 38 | 2 snp52419 0 79962 A C 39 | 2 snp52420 0 80137 G T 40 | 2 snp52421 0 80228 A G 41 | 2 snp52422 0 80356 G A 42 | 2 snp52423 0 80495 A C 43 | 2 snp52424 0 80723 C T 44 | 2 snp52425 0 80789 G C 45 | 2 snp52426 0 80818 T G 46 | 2 snp52427 0 81127 C G 47 | 2 snp52428 0 81342 T C 48 | 2 snp52429 0 81563 T A 49 | 2 snp52430 0 81613 T C 50 | 2 snp52431 0 81750 T G 51 | 2 snp52432 0 81886 C T 52 | 2 snp52433 0 81914 T A 53 | 2 snp52434 0 82340 G A 54 | 2 snp52435 0 82580 C T 55 | 2 snp52436 0 82809 G A 56 | 2 snp52437 0 82934 T A 57 | 2 snp52438 0 83103 G A 58 | 2 snp52439 0 83234 T A 59 | 2 snp52440 0 83262 T C 60 | 2 snp52441 0 83533 A G 61 | 2 snp52442 0 83554 C T 62 | 2 snp52443 0 83795 T C 63 | 2 snp52444 0 84000 G A 64 | 2 snp52445 0 84037 A G 65 | 2 snp52446 0 84615 T C 66 | 2 snp52447 0 86107 A G 67 | 2 snp52448 0 88461 A C 68 | 2 snp52449 0 88866 C T 69 | 2 snp52450 0 91203 T C 70 | 2 snp52451 0 92516 G T 71 | 2 snp52452 0 92756 C A 72 | 2 snp52453 0 93291 G C 73 | 2 snp52454 0 93777 T C 74 | 2 snp52455 0 93818 C T 75 | 2 snp52456 0 94509 A G 76 | 2 snp52457 0 94621 C A 77 | 2 snp52458 0 95458 G T 78 | 2 snp52459 0 95862 G A 79 | 2 snp52460 0 97587 T C 80 | 2 snp52461 0 97966 A G 81 | 2 snp52462 0 98024 T C 82 | 2 snp52463 0 98731 T C 83 | 2 snp52464 0 98752 A G 84 | 2 snp52465 0 99935 T C 85 | 2 snp52466 0 100206 T C 86 | 2 snp52467 0 100348 G C 87 | 2 snp52468 0 100542 C A 88 | 2 snp52469 0 101022 C T 89 | 2 snp52470 0 102165 T G 90 | 2 snp52471 0 102672 A C 91 | 2 snp52472 0 102919 T G 92 | 2 snp52473 0 104217 A G 93 | 2 snp52474 0 104434 T C 94 | 2 snp52475 0 105664 T C 95 | 2 snp52476 0 106117 C A 96 | 2 snp52477 0 107505 C A 97 | 2 snp52478 0 108183 A C 98 | 2 snp52479 0 108597 A G 99 | 2 snp52480 0 108880 A G 100 | 2 snp52481 0 108975 C G 101 | 2 snp52482 0 110845 C T 102 | 2 snp52483 0 111267 T C 103 | 2 snp52484 0 112063 A T 104 | 2 snp52485 0 114175 A G 105 | 2 snp52486 0 114479 C T 106 | 2 
snp52487 0 114506 G C 107 | 2 snp52488 0 115564 T C 108 | 2 snp52489 0 116876 G C 109 | 2 snp52490 0 118895 C T 110 | 2 snp52491 0 119449 C T 111 | 2 snp52492 0 119519 A C 112 | 2 snp52493 0 120486 T G 113 | 2 snp52494 0 120672 T C 114 | 2 snp52495 0 120948 C T 115 | 2 snp52496 0 121071 G T 116 | 2 snp52497 0 121404 G A 117 | 2 snp52498 0 121590 A T 118 | 2 snp52499 0 123187 G A 119 | 2 snp52500 0 123262 T C 120 | 2 snp52501 0 123532 A G 121 | 2 snp52502 0 123653 C T 122 | 2 snp52503 0 124927 T C 123 | 2 snp52504 0 126555 C A 124 | 2 snp52505 0 128418 C G 125 | 2 snp52506 0 128543 G C 126 | 2 snp52507 0 128964 A G 127 | 2 snp52508 0 129109 G A 128 | 2 snp52509 0 129268 G A 129 | 2 snp52510 0 129448 T A 130 | 2 snp52511 0 131134 A G 131 | 2 snp52512 0 134343 C A 132 | 2 snp52513 0 134490 C A 133 | 2 snp52514 0 137159 A G 134 | 2 snp52515 0 137260 G A 135 | 2 snp52516 0 137433 A G 136 | 2 snp52517 0 138851 A G 137 | 2 snp52518 0 139479 C G 138 | 2 snp52519 0 142739 A G 139 | 2 snp52520 0 143193 T G 140 | 2 snp52521 0 144173 C T 141 | 2 snp52522 0 144219 T A 142 | 2 snp52523 0 144377 T A 143 | 2 snp52524 0 144953 T G 144 | 2 snp52525 0 145185 G T 145 | 2 snp52526 0 145226 G C 146 | 2 snp52527 0 145550 G T 147 | 2 snp52528 0 146013 A G 148 | 2 snp52529 0 146066 G C 149 | 2 snp52530 0 146345 G A 150 | 2 snp52531 0 146396 T C 151 | 2 snp52532 0 146895 A G 152 | 2 snp52533 0 147035 A G 153 | 2 snp52534 0 147264 G A 154 | 2 snp52535 0 147837 C A 155 | 2 snp52536 0 148060 T C 156 | 2 snp52537 0 148353 C T 157 | 2 snp52538 0 149064 A T 158 | 2 snp52539 0 149254 G C 159 | 2 snp52540 0 149398 A C 160 | 2 snp52541 0 149692 T G 161 | 2 snp52542 0 150627 T C 162 | 2 snp52543 0 151189 A G 163 | 2 snp52544 0 151273 T G 164 | 2 snp52545 0 151343 T A 165 | 2 snp52546 0 151415 A G 166 | 2 snp52547 0 151709 T G 167 | 2 snp52548 0 152431 T C 168 | 2 snp52549 0 152591 G A 169 | 2 snp52550 0 153545 C T 170 | 2 snp52551 0 153631 G A 171 | 2 snp52552 0 154223 A G 172 | 2 snp52553 0 154749 C 
T 173 | 2 snp52554 0 154775 T C 174 | 2 snp52555 0 154824 T A 175 | 2 snp52556 0 154920 T C 176 | 2 snp52557 0 154938 C T 177 | 2 snp52558 0 155224 T G 178 | 2 snp52559 0 155258 A G 179 | 2 snp52560 0 155327 T C 180 | 2 snp52561 0 156174 G A 181 | 2 snp52562 0 156225 A G 182 | 2 snp52563 0 156448 C T 183 | 2 snp52564 0 156860 C G 184 | 2 snp52565 0 157053 T G 185 | 2 snp52566 0 157749 C G 186 | 2 snp52567 0 158102 A C 187 | 2 snp52568 0 158765 G T 188 | 2 snp52569 0 158979 G A 189 | 2 snp52570 0 159184 G T 190 | 2 snp52571 0 159209 A G 191 | 2 snp52572 0 159309 T C 192 | 2 snp52573 0 159818 A C 193 | 2 snp52574 0 159872 C A 194 | 2 snp52575 0 160448 C T 195 | 2 snp52576 0 160925 A T 196 | 2 snp52577 0 161051 C T 197 | 2 snp52578 0 161382 T A 198 | 2 snp52579 0 161598 A G 199 | 2 snp52580 0 161820 G A 200 | 2 snp52581 0 162538 C T 201 | 2 snp52582 0 162725 T A 202 | 2 snp52583 0 163083 G C 203 | 2 snp52584 0 163111 C T 204 | 2 snp52585 0 163497 A G 205 | 2 snp52586 0 164150 C T 206 | 2 snp52587 0 164375 G A 207 | 2 snp52588 0 165036 C G 208 | 2 snp52589 0 166310 A C 209 | 2 snp52590 0 166668 T C 210 | 2 snp52591 0 167441 G C 211 | 2 snp52592 0 167647 A C 212 | 2 snp52593 0 167716 T C 213 | 2 snp52594 0 168816 A T 214 | 2 snp52595 0 169729 A G 215 | 2 snp52596 0 169989 T C 216 | 2 snp52597 0 170202 C T 217 | 2 snp52598 0 170351 C T 218 | 2 snp52599 0 172244 A G 219 | 2 snp52600 0 172482 A T 220 | 2 snp52601 0 172548 T A 221 | 2 snp52602 0 172584 T C 222 | 2 snp52603 0 172629 C T 223 | 2 snp52604 0 172813 T G 224 | 2 snp52605 0 172931 C A 225 | 2 snp52606 0 172980 G A 226 | 2 snp52607 0 173320 C A 227 | 2 snp52608 0 173442 G A 228 | 2 snp52609 0 173651 G A 229 | 2 snp52610 0 173823 C A 230 | 2 snp52611 0 173858 T G 231 | 2 snp52612 0 174115 G C 232 | 2 snp52613 0 174364 A T 233 | 2 snp52614 0 174885 G A 234 | 2 snp52615 0 174960 G A 235 | 2 snp52616 0 175289 A G 236 | 2 snp52617 0 175529 T C 237 | 2 snp52618 0 175874 G T 238 | 2 snp52619 0 176156 A T 239 | 2 snp52620 
0 176616 T C 240 | 2 snp52621 0 176666 C T 241 | 2 snp52622 0 178596 A G 242 | 2 snp52623 0 178849 A T 243 | 2 snp52624 0 180461 A T 244 | 2 snp52625 0 180480 G A 245 | 2 snp52626 0 180942 C A 246 | 2 snp52627 0 181030 G T 247 | 2 snp52628 0 182538 T G 248 | 2 snp52629 0 182818 G A 249 | 2 snp52630 0 182841 A G 250 | 2 snp52631 0 184364 C T 251 | 2 snp52632 0 184749 G C 252 | 2 snp52633 0 185018 A G 253 | 2 snp52634 0 186236 T G 254 | 2 snp52635 0 186947 C T 255 | 2 snp52636 0 187048 A T 256 | 2 snp52637 0 187084 C T 257 | 2 snp52638 0 187383 T G 258 | 2 snp52639 0 187747 C T 259 | 2 snp52640 0 189659 G A 260 | 2 snp52641 0 189683 C T 261 | 2 snp52642 0 190634 G A 262 | 2 snp52643 0 191136 C G 263 | 2 snp52644 0 191181 G A 264 | 2 snp52645 0 191716 A G 265 | 2 snp52646 0 191913 T G 266 | 2 snp52647 0 192676 T C 267 | 2 snp52648 0 193364 A G 268 | 2 snp52649 0 194546 A G 269 | 2 snp52650 0 198408 A G 270 | 2 snp52651 0 199672 C T 271 | 2 snp52652 0 202467 G A 272 | 2 snp52653 0 203117 A G 273 | 2 snp52654 0 203192 A G 274 | 2 snp52655 0 203286 G A 275 | 2 snp52656 0 204027 T G 276 | 2 snp52657 0 204325 A C 277 | 2 snp52658 0 204522 A C 278 | 2 snp52659 0 204596 T C 279 | 2 snp52660 0 204935 C A 280 | 2 snp52661 0 205036 G C 281 | 2 snp52662 0 205338 T C 282 | 2 snp52663 0 205446 G A 283 | 2 snp52664 0 205489 C T 284 | 2 snp52665 0 206117 C G 285 | 2 snp52666 0 207344 G C 286 | 2 snp52667 0 207433 A G 287 | 2 snp52668 0 210149 G C 288 | 2 snp52669 0 210357 A T 289 | 2 snp52670 0 212442 T C 290 | 2 snp52671 0 212787 C A 291 | 2 snp52672 0 212833 C A 292 | 2 snp52673 0 214360 A G 293 | 2 snp52674 0 214861 G C 294 | 2 snp52675 0 215794 A G 295 | 2 snp52676 0 215873 T C 296 | 2 snp52677 0 219015 G C 297 | 2 snp52678 0 220105 T C 298 | 2 snp52679 0 220163 T G 299 | 2 snp52680 0 220283 T C 300 | 2 snp52681 0 220523 G A 301 | -------------------------------------------------------------------------------- /inst/extdata/chr2.fam: 
-------------------------------------------------------------------------------- 1 | 5837 5837 0 0 0 -9 2 | 6008 6008 0 0 0 -9 3 | 6009 6009 0 0 0 -9 4 | 6016 6016 0 0 0 -9 5 | 6040 6040 0 0 0 -9 6 | 6042 6042 0 0 0 -9 7 | 6043 6043 0 0 0 -9 8 | 6046 6046 0 0 0 -9 9 | 6064 6064 0 0 0 -9 10 | 6074 6074 0 0 0 -9 11 | 6243 6243 0 0 0 -9 12 | 6709 6709 0 0 0 -9 13 | 6897 6897 0 0 0 -9 14 | 6898 6898 0 0 0 -9 15 | 6899 6899 0 0 0 -9 16 | 6900 6900 0 0 0 -9 17 | 6901 6901 0 0 0 -9 18 | 6903 6903 0 0 0 -9 19 | 6904 6904 0 0 0 -9 20 | 6905 6905 0 0 0 -9 21 | 6906 6906 0 0 0 -9 22 | 6907 6907 0 0 0 -9 23 | 6908 6908 0 0 0 -9 24 | 6909 6909 0 0 0 -9 25 | 6910 6910 0 0 0 -9 26 | 6911 6911 0 0 0 -9 27 | 6913 6913 0 0 0 -9 28 | 6914 6914 0 0 0 -9 29 | 6915 6915 0 0 0 -9 30 | 6916 6916 0 0 0 -9 31 | 6917 6917 0 0 0 -9 32 | 6918 6918 0 0 0 -9 33 | 6919 6919 0 0 0 -9 34 | 6920 6920 0 0 0 -9 35 | 6921 6921 0 0 0 -9 36 | 6922 6922 0 0 0 -9 37 | 6923 6923 0 0 0 -9 38 | 6924 6924 0 0 0 -9 39 | 6926 6926 0 0 0 -9 40 | 6927 6927 0 0 0 -9 41 | 6928 6928 0 0 0 -9 42 | 6929 6929 0 0 0 -9 43 | 6930 6930 0 0 0 -9 44 | 6931 6931 0 0 0 -9 45 | 6932 6932 0 0 0 -9 46 | 6933 6933 0 0 0 -9 47 | 6936 6936 0 0 0 -9 48 | 6937 6937 0 0 0 -9 49 | 6939 6939 0 0 0 -9 50 | 6940 6940 0 0 0 -9 51 | 6942 6942 0 0 0 -9 52 | 6943 6943 0 0 0 -9 53 | 6944 6944 0 0 0 -9 54 | 6945 6945 0 0 0 -9 55 | 6946 6946 0 0 0 -9 56 | 6951 6951 0 0 0 -9 57 | 6956 6956 0 0 0 -9 58 | 6957 6957 0 0 0 -9 59 | 6958 6958 0 0 0 -9 60 | 6959 6959 0 0 0 -9 61 | 6960 6960 0 0 0 -9 62 | 6961 6961 0 0 0 -9 63 | 6962 6962 0 0 0 -9 64 | 6963 6963 0 0 0 -9 65 | 6964 6964 0 0 0 -9 66 | 6965 6965 0 0 0 -9 67 | 6966 6966 0 0 0 -9 68 | 6967 6967 0 0 0 -9 69 | 6968 6968 0 0 0 -9 70 | 6969 6969 0 0 0 -9 71 | 6970 6970 0 0 0 -9 72 | 6971 6971 0 0 0 -9 73 | 6972 6972 0 0 0 -9 74 | 6973 6973 0 0 0 -9 75 | 6974 6974 0 0 0 -9 76 | 6975 6975 0 0 0 -9 77 | 6976 6976 0 0 0 -9 78 | 6977 6977 0 0 0 -9 79 | 6978 6978 0 0 0 -9 80 | 6979 6979 0 0 0 -9 81 | 
6980 6980 0 0 0 -9 82 | 6981 6981 0 0 0 -9 83 | 6982 6982 0 0 0 -9 84 | 6983 6983 0 0 0 -9 85 | 6984 6984 0 0 0 -9 86 | 6985 6985 0 0 0 -9 87 | 6988 6988 0 0 0 -9 88 | 7000 7000 0 0 0 -9 89 | 7014 7014 0 0 0 -9 90 | 7033 7033 0 0 0 -9 91 | 7062 7062 0 0 0 -9 92 | 7064 7064 0 0 0 -9 93 | 7081 7081 0 0 0 -9 94 | 7094 7094 0 0 0 -9 95 | 7123 7123 0 0 0 -9 96 | 7147 7147 0 0 0 -9 97 | 7163 7163 0 0 0 -9 98 | 7231 7231 0 0 0 -9 99 | 7255 7255 0 0 0 -9 100 | 7275 7275 0 0 0 -9 101 | 7282 7282 0 0 0 -9 102 | 7296 7296 0 0 0 -9 103 | 7306 7306 0 0 0 -9 104 | 7323 7323 0 0 0 -9 105 | 7346 7346 0 0 0 -9 106 | 7418 7418 0 0 0 -9 107 | 7424 7424 0 0 0 -9 108 | 7438 7438 0 0 0 -9 109 | 7460 7460 0 0 0 -9 110 | 7461 7461 0 0 0 -9 111 | 7477 7477 0 0 0 -9 112 | 7514 7514 0 0 0 -9 113 | 7515 7515 0 0 0 -9 114 | 7516 7516 0 0 0 -9 115 | 7517 7517 0 0 0 -9 116 | 7518 7518 0 0 0 -9 117 | 7519 7519 0 0 0 -9 118 | 7520 7520 0 0 0 -9 119 | 7521 7521 0 0 0 -9 120 | 7522 7522 0 0 0 -9 121 | 7523 7523 0 0 0 -9 122 | 7524 7524 0 0 0 -9 123 | 7525 7525 0 0 0 -9 124 | 7526 7526 0 0 0 -9 125 | 8213 8213 0 0 0 -9 126 | 8214 8214 0 0 0 -9 127 | 8215 8215 0 0 0 -9 128 | 8222 8222 0 0 0 -9 129 | 8230 8230 0 0 0 -9 130 | 8231 8231 0 0 0 -9 131 | 8233 8233 0 0 0 -9 132 | 8235 8235 0 0 0 -9 133 | 8236 8236 0 0 0 -9 134 | 8237 8237 0 0 0 -9 135 | 8239 8239 0 0 0 -9 136 | 8240 8240 0 0 0 -9 137 | 8241 8241 0 0 0 -9 138 | 8242 8242 0 0 0 -9 139 | 8243 8243 0 0 0 -9 140 | 8245 8245 0 0 0 -9 141 | 8247 8247 0 0 0 -9 142 | 8249 8249 0 0 0 -9 143 | 8254 8254 0 0 0 -9 144 | 8256 8256 0 0 0 -9 145 | 8258 8258 0 0 0 -9 146 | 8259 8259 0 0 0 -9 147 | 8264 8264 0 0 0 -9 148 | 8265 8265 0 0 0 -9 149 | 8266 8266 0 0 0 -9 150 | 8270 8270 0 0 0 -9 151 | 8271 8271 0 0 0 -9 152 | 8274 8274 0 0 0 -9 153 | 8275 8275 0 0 0 -9 154 | 8283 8283 0 0 0 -9 155 | 8284 8284 0 0 0 -9 156 | 8285 8285 0 0 0 -9 157 | 8290 8290 0 0 0 -9 158 | 8296 8296 0 0 0 -9 159 | 8297 8297 0 0 0 -9 160 | 8300 8300 0 0 0 -9 161 | 8306 8306 0 0 0 
-9 162 | 8310 8310 0 0 0 -9 163 | 8311 8311 0 0 0 -9 164 | 8312 8312 0 0 0 -9 165 | 8313 8313 0 0 0 -9 166 | 8314 8314 0 0 0 -9 167 | 8323 8323 0 0 0 -9 168 | 8325 8325 0 0 0 -9 169 | 8326 8326 0 0 0 -9 170 | 8329 8329 0 0 0 -9 171 | 8334 8334 0 0 0 -9 172 | 8335 8335 0 0 0 -9 173 | 8337 8337 0 0 0 -9 174 | 8343 8343 0 0 0 -9 175 | 8351 8351 0 0 0 -9 176 | 8353 8353 0 0 0 -9 177 | 8354 8354 0 0 0 -9 178 | 8357 8357 0 0 0 -9 179 | 8365 8365 0 0 0 -9 180 | 8366 8366 0 0 0 -9 181 | 8369 8369 0 0 0 -9 182 | 8374 8374 0 0 0 -9 183 | 8376 8376 0 0 0 -9 184 | 8378 8378 0 0 0 -9 185 | 8387 8387 0 0 0 -9 186 | 8388 8388 0 0 0 -9 187 | 8389 8389 0 0 0 -9 188 | 8395 8395 0 0 0 -9 189 | 8411 8411 0 0 0 -9 190 | 8412 8412 0 0 0 -9 191 | 8420 8420 0 0 0 -9 192 | 8422 8422 0 0 0 -9 193 | 8423 8423 0 0 0 -9 194 | 8424 8424 0 0 0 -9 195 | 8426 8426 0 0 0 -9 196 | 8430 8430 0 0 0 -9 197 | 9057 9057 0 0 0 -9 198 | 9058 9058 0 0 0 -9 199 | 100000 100000 0 0 0 -9 200 | -------------------------------------------------------------------------------- /inst/extdata/chr3.bed: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QuantGen/BGData/fe7a57779d903f7056d5841482c9afdbeae5744b/inst/extdata/chr3.bed -------------------------------------------------------------------------------- /inst/extdata/chr3.bim: -------------------------------------------------------------------------------- 1 | 3 snp81183 0 1394 T C 2 | 3 snp81184 0 4681 T G 3 | 3 snp81185 0 5444 A C 4 | 3 snp81186 0 5826 A T 5 | 3 snp81187 0 9268 A G 6 | 3 snp81188 0 9374 T C 7 | 3 snp81189 0 10352 T G 8 | 3 snp81190 0 14199 G C 9 | 3 snp81191 0 14851 A T 10 | 3 snp81192 0 16517 C T 11 | 3 snp81193 0 16871 T C 12 | 3 snp81194 0 22781 T A 13 | 3 snp81195 0 26145 A G 14 | 3 snp81196 0 26826 A C 15 | 3 snp81197 0 30382 A G 16 | 3 snp81198 0 30509 G T 17 | 3 snp81199 0 32608 T C 18 | 3 snp81200 0 34164 T G 19 | 3 snp81201 0 38218 A C 20 | 3 snp81202 0 38447 A G 21 | 3 
snp81203 0 41804 A C 22 | 3 snp81204 0 44099 A C 23 | 3 snp81205 0 44736 T G 24 | 3 snp81206 0 45059 G T 25 | 3 snp81207 0 45416 G A 26 | 3 snp81208 0 46607 T A 27 | 3 snp81209 0 46839 C T 28 | 3 snp81210 0 46968 A C 29 | 3 snp81211 0 46998 G A 30 | 3 snp81212 0 47215 C G 31 | 3 snp81213 0 47535 C G 32 | 3 snp81214 0 49398 G A 33 | 3 snp81215 0 49571 A C 34 | 3 snp81216 0 51068 C T 35 | 3 snp81217 0 53767 T C 36 | 3 snp81218 0 54885 C T 37 | 3 snp81219 0 55635 A T 38 | 3 snp81220 0 56047 G A 39 | 3 snp81221 0 56555 C G 40 | 3 snp81222 0 56807 C T 41 | 3 snp81223 0 57407 A G 42 | 3 snp81224 0 57780 C T 43 | 3 snp81225 0 58114 T C 44 | 3 snp81226 0 58346 G T 45 | 3 snp81227 0 59032 A G 46 | 3 snp81228 0 59497 T G 47 | 3 snp81229 0 61218 C T 48 | 3 snp81230 0 61477 G A 49 | 3 snp81231 0 61772 A T 50 | 3 snp81232 0 62096 C T 51 | 3 snp81233 0 62216 C G 52 | 3 snp81234 0 63393 C G 53 | 3 snp81235 0 63580 T C 54 | 3 snp81236 0 64824 T A 55 | 3 snp81237 0 65817 C A 56 | 3 snp81238 0 65996 C G 57 | 3 snp81239 0 66603 C G 58 | 3 snp81240 0 69268 G C 59 | 3 snp81241 0 70128 C T 60 | 3 snp81242 0 70545 G A 61 | 3 snp81243 0 71143 A C 62 | 3 snp81244 0 72096 T C 63 | 3 snp81245 0 73628 T C 64 | 3 snp81246 0 74169 C A 65 | 3 snp81247 0 74405 T A 66 | 3 snp81248 0 77175 C T 67 | 3 snp81249 0 77505 C T 68 | 3 snp81250 0 77597 C T 69 | 3 snp81251 0 78632 C T 70 | 3 snp81252 0 78916 G A 71 | 3 snp81253 0 79525 A G 72 | 3 snp81254 0 79573 G C 73 | 3 snp81255 0 80528 A T 74 | 3 snp81256 0 80622 A G 75 | 3 snp81257 0 80699 A G 76 | 3 snp81258 0 80725 A T 77 | 3 snp81259 0 80937 G T 78 | 3 snp81260 0 81124 C G 79 | 3 snp81261 0 81142 G A 80 | 3 snp81262 0 81258 G A 81 | 3 snp81263 0 81306 T C 82 | 3 snp81264 0 81339 G C 83 | 3 snp81265 0 81894 T C 84 | 3 snp81266 0 82272 A C 85 | 3 snp81267 0 82777 A G 86 | 3 snp81268 0 82916 T C 87 | 3 snp81269 0 83001 T C 88 | 3 snp81270 0 83198 G T 89 | 3 snp81271 0 83538 T G 90 | 3 snp81272 0 85135 C T 91 | 3 snp81273 0 85187 G A 92 | 3 snp81274 0 
85381 A G 93 | 3 snp81275 0 88508 T G 94 | 3 snp81276 0 90774 A T 95 | 3 snp81277 0 90815 A C 96 | 3 snp81278 0 91052 T G 97 | 3 snp81279 0 91325 T A 98 | 3 snp81280 0 91411 A C 99 | 3 snp81281 0 92117 T A 100 | 3 snp81282 0 92418 C G 101 | 3 snp81283 0 92744 A G 102 | 3 snp81284 0 92971 C G 103 | 3 snp81285 0 94431 A G 104 | 3 snp81286 0 94834 C A 105 | 3 snp81287 0 95420 A G 106 | 3 snp81288 0 95769 A T 107 | 3 snp81289 0 95948 G A 108 | 3 snp81290 0 96090 C A 109 | 3 snp81291 0 96385 T G 110 | 3 snp81292 0 97418 T C 111 | 3 snp81293 0 97909 G A 112 | 3 snp81294 0 98583 G A 113 | 3 snp81295 0 99152 A C 114 | 3 snp81296 0 99623 A C 115 | 3 snp81297 0 99836 C A 116 | 3 snp81298 0 99921 G T 117 | 3 snp81299 0 100086 C A 118 | 3 snp81300 0 100602 G A 119 | 3 snp81301 0 100840 G A 120 | 3 snp81302 0 101797 G A 121 | 3 snp81303 0 102093 T A 122 | 3 snp81304 0 102255 A G 123 | 3 snp81305 0 102969 A C 124 | 3 snp81306 0 103676 T C 125 | 3 snp81307 0 103720 T C 126 | 3 snp81308 0 103802 T C 127 | 3 snp81309 0 103823 C T 128 | 3 snp81310 0 103965 A T 129 | 3 snp81311 0 104091 G A 130 | 3 snp81312 0 104130 C A 131 | 3 snp81313 0 104166 A C 132 | 3 snp81314 0 104271 G C 133 | 3 snp81315 0 104405 C G 134 | 3 snp81316 0 104421 C T 135 | 3 snp81317 0 105058 T C 136 | 3 snp81318 0 105288 C T 137 | 3 snp81319 0 105819 C G 138 | 3 snp81320 0 105838 A G 139 | 3 snp81321 0 106060 G C 140 | 3 snp81322 0 107792 C G 141 | 3 snp81323 0 108430 T C 142 | 3 snp81324 0 110471 G A 143 | 3 snp81325 0 111647 C T 144 | 3 snp81326 0 112949 T C 145 | 3 snp81327 0 113450 T C 146 | 3 snp81328 0 114059 C G 147 | 3 snp81329 0 114367 G C 148 | 3 snp81330 0 114475 G A 149 | 3 snp81331 0 114753 A G 150 | 3 snp81332 0 117453 T A 151 | 3 snp81333 0 117708 A T 152 | 3 snp81334 0 117981 G T 153 | 3 snp81335 0 118015 G A 154 | 3 snp81336 0 119781 T A 155 | 3 snp81337 0 119974 T C 156 | 3 snp81338 0 120886 A G 157 | 3 snp81339 0 121259 T A 158 | 3 snp81340 0 121396 T C 159 | 3 snp81341 0 121642 A T 160 | 3 
snp81342 0 122731 C T 161 | 3 snp81343 0 123281 G A 162 | 3 snp81344 0 123330 G C 163 | 3 snp81345 0 123438 C A 164 | 3 snp81346 0 124382 G C 165 | 3 snp81347 0 126986 C G 166 | 3 snp81348 0 127080 T G 167 | 3 snp81349 0 128122 G T 168 | 3 snp81350 0 128362 A T 169 | 3 snp81351 0 128433 C T 170 | 3 snp81352 0 128538 A C 171 | 3 snp81353 0 128755 T C 172 | 3 snp81354 0 128807 A G 173 | 3 snp81355 0 128886 A C 174 | 3 snp81356 0 129912 T C 175 | 3 snp81357 0 130000 T G 176 | 3 snp81358 0 130145 A T 177 | 3 snp81359 0 131316 C A 178 | 3 snp81360 0 132645 G A 179 | 3 snp81361 0 132898 G A 180 | 3 snp81362 0 134351 A G 181 | 3 snp81363 0 136171 A G 182 | 3 snp81364 0 138201 G A 183 | 3 snp81365 0 143851 C T 184 | 3 snp81366 0 145429 A T 185 | 3 snp81367 0 145622 T C 186 | 3 snp81368 0 146508 G A 187 | 3 snp81369 0 149369 A G 188 | 3 snp81370 0 150938 G A 189 | 3 snp81371 0 154973 A G 190 | 3 snp81372 0 156664 A C 191 | 3 snp81373 0 161029 T G 192 | 3 snp81374 0 162240 A G 193 | 3 snp81375 0 162816 A G 194 | 3 snp81376 0 163128 G A 195 | 3 snp81377 0 164144 A G 196 | 3 snp81378 0 165230 A G 197 | 3 snp81379 0 165603 T C 198 | 3 snp81380 0 165875 A G 199 | 3 snp81381 0 166305 T G 200 | 3 snp81382 0 166491 C T 201 | 3 snp81383 0 166552 T C 202 | 3 snp81384 0 167243 G A 203 | 3 snp81385 0 167947 G A 204 | 3 snp81386 0 168154 C G 205 | 3 snp81387 0 168630 G C 206 | 3 snp81388 0 170811 C A 207 | 3 snp81389 0 175659 C T 208 | 3 snp81390 0 177200 A C 209 | 3 snp81391 0 178043 T C 210 | 3 snp81392 0 178071 C T 211 | 3 snp81393 0 178789 T C 212 | 3 snp81394 0 179977 T C 213 | 3 snp81395 0 180363 G A 214 | 3 snp81396 0 180526 T C 215 | 3 snp81397 0 181790 G A 216 | 3 snp81398 0 181838 A G 217 | 3 snp81399 0 184000 A G 218 | 3 snp81400 0 184954 T C 219 | 3 snp81401 0 185878 G C 220 | 3 snp81402 0 186033 A T 221 | 3 snp81403 0 186194 G A 222 | 3 snp81404 0 188039 A C 223 | 3 snp81405 0 188837 G C 224 | 3 snp81406 0 189030 T C 225 | 3 snp81407 0 189227 A G 226 | 3 snp81408 0 189479 G 
A 227 | 3 snp81409 0 189928 G A 228 | 3 snp81410 0 190615 T C 229 | 3 snp81411 0 193541 G A 230 | 3 snp81412 0 194175 A G 231 | 3 snp81413 0 195339 T C 232 | 3 snp81414 0 195886 C G 233 | 3 snp81415 0 198118 T A 234 | 3 snp81416 0 198298 C T 235 | 3 snp81417 0 198422 T G 236 | 3 snp81418 0 198638 T A 237 | 3 snp81419 0 198824 G A 238 | 3 snp81420 0 199212 T C 239 | 3 snp81421 0 199376 C A 240 | 3 snp81422 0 200230 G T 241 | 3 snp81423 0 201334 G T 242 | 3 snp81424 0 201857 T C 243 | 3 snp81425 0 202256 T A 244 | 3 snp81426 0 203271 C T 245 | 3 snp81427 0 203611 T A 246 | 3 snp81428 0 204358 C T 247 | 3 snp81429 0 205067 C A 248 | 3 snp81430 0 205339 A T 249 | 3 snp81431 0 205627 T A 250 | 3 snp81432 0 205691 A C 251 | 3 snp81433 0 206921 C A 252 | 3 snp81434 0 207101 C T 253 | 3 snp81435 0 207454 T C 254 | 3 snp81436 0 207898 G A 255 | 3 snp81437 0 208329 T C 256 | 3 snp81438 0 209073 A T 257 | 3 snp81439 0 209244 C G 258 | 3 snp81440 0 209307 T A 259 | 3 snp81441 0 209571 G T 260 | 3 snp81442 0 209717 A G 261 | 3 snp81443 0 210977 G A 262 | 3 snp81444 0 212444 C T 263 | 3 snp81445 0 212490 G T 264 | 3 snp81446 0 212719 T A 265 | 3 snp81447 0 213305 C A 266 | 3 snp81448 0 213740 G C 267 | 3 snp81449 0 213989 G C 268 | 3 snp81450 0 214212 A C 269 | 3 snp81451 0 214302 T C 270 | 3 snp81452 0 214523 T A 271 | 3 snp81453 0 214737 C A 272 | 3 snp81454 0 216126 G A 273 | 3 snp81455 0 216222 A C 274 | 3 snp81456 0 217405 A C 275 | 3 snp81457 0 217753 C T 276 | 3 snp81458 0 218241 G T 277 | 3 snp81459 0 219118 G T 278 | 3 snp81460 0 219451 C T 279 | 3 snp81461 0 220186 G C 280 | 3 snp81462 0 220342 C T 281 | 3 snp81463 0 220610 A G 282 | 3 snp81464 0 220761 T G 283 | 3 snp81465 0 221049 T A 284 | 3 snp81466 0 221262 G A 285 | 3 snp81467 0 221359 T C 286 | 3 snp81468 0 221593 T A 287 | 3 snp81469 0 221880 C T 288 | 3 snp81470 0 222522 A T 289 | 3 snp81471 0 222780 A G 290 | 3 snp81472 0 222802 C A 291 | 3 snp81473 0 223018 G A 292 | 3 snp81474 0 223088 T A 293 | 3 snp81475 
0 223271 C A 294 | 3 snp81476 0 223463 C T 295 | 3 snp81477 0 223495 C G 296 | 3 snp81478 0 223734 T C 297 | 3 snp81479 0 223794 C T 298 | 3 snp81480 0 223958 T C 299 | 3 snp81481 0 223989 C T 300 | 3 snp81482 0 224644 A C 301 | -------------------------------------------------------------------------------- /inst/extdata/chr3.fam: -------------------------------------------------------------------------------- 1 | 5837 5837 0 0 0 -9 2 | 6008 6008 0 0 0 -9 3 | 6009 6009 0 0 0 -9 4 | 6016 6016 0 0 0 -9 5 | 6040 6040 0 0 0 -9 6 | 6042 6042 0 0 0 -9 7 | 6043 6043 0 0 0 -9 8 | 6046 6046 0 0 0 -9 9 | 6064 6064 0 0 0 -9 10 | 6074 6074 0 0 0 -9 11 | 6243 6243 0 0 0 -9 12 | 6709 6709 0 0 0 -9 13 | 6897 6897 0 0 0 -9 14 | 6898 6898 0 0 0 -9 15 | 6899 6899 0 0 0 -9 16 | 6900 6900 0 0 0 -9 17 | 6901 6901 0 0 0 -9 18 | 6903 6903 0 0 0 -9 19 | 6904 6904 0 0 0 -9 20 | 6905 6905 0 0 0 -9 21 | 6906 6906 0 0 0 -9 22 | 6907 6907 0 0 0 -9 23 | 6908 6908 0 0 0 -9 24 | 6909 6909 0 0 0 -9 25 | 6910 6910 0 0 0 -9 26 | 6911 6911 0 0 0 -9 27 | 6913 6913 0 0 0 -9 28 | 6914 6914 0 0 0 -9 29 | 6915 6915 0 0 0 -9 30 | 6916 6916 0 0 0 -9 31 | 6917 6917 0 0 0 -9 32 | 6918 6918 0 0 0 -9 33 | 6919 6919 0 0 0 -9 34 | 6920 6920 0 0 0 -9 35 | 6921 6921 0 0 0 -9 36 | 6922 6922 0 0 0 -9 37 | 6923 6923 0 0 0 -9 38 | 6924 6924 0 0 0 -9 39 | 6926 6926 0 0 0 -9 40 | 6927 6927 0 0 0 -9 41 | 6928 6928 0 0 0 -9 42 | 6929 6929 0 0 0 -9 43 | 6930 6930 0 0 0 -9 44 | 6931 6931 0 0 0 -9 45 | 6932 6932 0 0 0 -9 46 | 6933 6933 0 0 0 -9 47 | 6936 6936 0 0 0 -9 48 | 6937 6937 0 0 0 -9 49 | 6939 6939 0 0 0 -9 50 | 6940 6940 0 0 0 -9 51 | 6942 6942 0 0 0 -9 52 | 6943 6943 0 0 0 -9 53 | 6944 6944 0 0 0 -9 54 | 6945 6945 0 0 0 -9 55 | 6946 6946 0 0 0 -9 56 | 6951 6951 0 0 0 -9 57 | 6956 6956 0 0 0 -9 58 | 6957 6957 0 0 0 -9 59 | 6958 6958 0 0 0 -9 60 | 6959 6959 0 0 0 -9 61 | 6960 6960 0 0 0 -9 62 | 6961 6961 0 0 0 -9 63 | 6962 6962 0 0 0 -9 64 | 6963 6963 0 0 0 -9 65 | 6964 6964 0 0 0 -9 66 | 6965 6965 0 0 0 -9 67 | 6966 
6966 0 0 0 -9 68 | 6967 6967 0 0 0 -9 69 | 6968 6968 0 0 0 -9 70 | 6969 6969 0 0 0 -9 71 | 6970 6970 0 0 0 -9 72 | 6971 6971 0 0 0 -9 73 | 6972 6972 0 0 0 -9 74 | 6973 6973 0 0 0 -9 75 | 6974 6974 0 0 0 -9 76 | 6975 6975 0 0 0 -9 77 | 6976 6976 0 0 0 -9 78 | 6977 6977 0 0 0 -9 79 | 6978 6978 0 0 0 -9 80 | 6979 6979 0 0 0 -9 81 | 6980 6980 0 0 0 -9 82 | 6981 6981 0 0 0 -9 83 | 6982 6982 0 0 0 -9 84 | 6983 6983 0 0 0 -9 85 | 6984 6984 0 0 0 -9 86 | 6985 6985 0 0 0 -9 87 | 6988 6988 0 0 0 -9 88 | 7000 7000 0 0 0 -9 89 | 7014 7014 0 0 0 -9 90 | 7033 7033 0 0 0 -9 91 | 7062 7062 0 0 0 -9 92 | 7064 7064 0 0 0 -9 93 | 7081 7081 0 0 0 -9 94 | 7094 7094 0 0 0 -9 95 | 7123 7123 0 0 0 -9 96 | 7147 7147 0 0 0 -9 97 | 7163 7163 0 0 0 -9 98 | 7231 7231 0 0 0 -9 99 | 7255 7255 0 0 0 -9 100 | 7275 7275 0 0 0 -9 101 | 7282 7282 0 0 0 -9 102 | 7296 7296 0 0 0 -9 103 | 7306 7306 0 0 0 -9 104 | 7323 7323 0 0 0 -9 105 | 7346 7346 0 0 0 -9 106 | 7418 7418 0 0 0 -9 107 | 7424 7424 0 0 0 -9 108 | 7438 7438 0 0 0 -9 109 | 7460 7460 0 0 0 -9 110 | 7461 7461 0 0 0 -9 111 | 7477 7477 0 0 0 -9 112 | 7514 7514 0 0 0 -9 113 | 7515 7515 0 0 0 -9 114 | 7516 7516 0 0 0 -9 115 | 7517 7517 0 0 0 -9 116 | 7518 7518 0 0 0 -9 117 | 7519 7519 0 0 0 -9 118 | 7520 7520 0 0 0 -9 119 | 7521 7521 0 0 0 -9 120 | 7522 7522 0 0 0 -9 121 | 7523 7523 0 0 0 -9 122 | 7524 7524 0 0 0 -9 123 | 7525 7525 0 0 0 -9 124 | 7526 7526 0 0 0 -9 125 | 8213 8213 0 0 0 -9 126 | 8214 8214 0 0 0 -9 127 | 8215 8215 0 0 0 -9 128 | 8222 8222 0 0 0 -9 129 | 8230 8230 0 0 0 -9 130 | 8231 8231 0 0 0 -9 131 | 8233 8233 0 0 0 -9 132 | 8235 8235 0 0 0 -9 133 | 8236 8236 0 0 0 -9 134 | 8237 8237 0 0 0 -9 135 | 8239 8239 0 0 0 -9 136 | 8240 8240 0 0 0 -9 137 | 8241 8241 0 0 0 -9 138 | 8242 8242 0 0 0 -9 139 | 8243 8243 0 0 0 -9 140 | 8245 8245 0 0 0 -9 141 | 8247 8247 0 0 0 -9 142 | 8249 8249 0 0 0 -9 143 | 8254 8254 0 0 0 -9 144 | 8256 8256 0 0 0 -9 145 | 8258 8258 0 0 0 -9 146 | 8259 8259 0 0 0 -9 147 | 8264 8264 0 0 0 -9 148 | 8265 8265 0 
0 0 -9 149 | 8266 8266 0 0 0 -9 150 | 8270 8270 0 0 0 -9 151 | 8271 8271 0 0 0 -9 152 | 8274 8274 0 0 0 -9 153 | 8275 8275 0 0 0 -9 154 | 8283 8283 0 0 0 -9 155 | 8284 8284 0 0 0 -9 156 | 8285 8285 0 0 0 -9 157 | 8290 8290 0 0 0 -9 158 | 8296 8296 0 0 0 -9 159 | 8297 8297 0 0 0 -9 160 | 8300 8300 0 0 0 -9 161 | 8306 8306 0 0 0 -9 162 | 8310 8310 0 0 0 -9 163 | 8311 8311 0 0 0 -9 164 | 8312 8312 0 0 0 -9 165 | 8313 8313 0 0 0 -9 166 | 8314 8314 0 0 0 -9 167 | 8323 8323 0 0 0 -9 168 | 8325 8325 0 0 0 -9 169 | 8326 8326 0 0 0 -9 170 | 8329 8329 0 0 0 -9 171 | 8334 8334 0 0 0 -9 172 | 8335 8335 0 0 0 -9 173 | 8337 8337 0 0 0 -9 174 | 8343 8343 0 0 0 -9 175 | 8351 8351 0 0 0 -9 176 | 8353 8353 0 0 0 -9 177 | 8354 8354 0 0 0 -9 178 | 8357 8357 0 0 0 -9 179 | 8365 8365 0 0 0 -9 180 | 8366 8366 0 0 0 -9 181 | 8369 8369 0 0 0 -9 182 | 8374 8374 0 0 0 -9 183 | 8376 8376 0 0 0 -9 184 | 8378 8378 0 0 0 -9 185 | 8387 8387 0 0 0 -9 186 | 8388 8388 0 0 0 -9 187 | 8389 8389 0 0 0 -9 188 | 8395 8395 0 0 0 -9 189 | 8411 8411 0 0 0 -9 190 | 8412 8412 0 0 0 -9 191 | 8420 8420 0 0 0 -9 192 | 8422 8422 0 0 0 -9 193 | 8423 8423 0 0 0 -9 194 | 8424 8424 0 0 0 -9 195 | 8426 8426 0 0 0 -9 196 | 8430 8430 0 0 0 -9 197 | 9057 9057 0 0 0 -9 198 | 9058 9058 0 0 0 -9 199 | 100000 100000 0 0 0 -9 200 | -------------------------------------------------------------------------------- /inst/extdata/pheno.txt: -------------------------------------------------------------------------------- 1 | FID IID FT10 2 | 6960 6960 47.0 3 | 7517 7517 107.0 4 | 6945 6945 55.0 5 | 6914 6914 73.0 6 | 6944 6944 49.0 7 | 6939 6939 49.0 8 | 6009 6009 98.0 9 | 7518 7518 103.0 10 | 6046 6046 93.0 11 | 8325 8325 49.0 12 | 7123 7123 59.5 13 | 6899 6899 54.0 14 | 8222 8222 90.0 15 | 6968 6968 71.0 16 | 7461 7461 61.0 17 | 8423 8423 70.0 18 | 6008 6008 60.0 19 | 8313 8313 49.0 20 | 8365 8365 51.0 21 | 9058 9058 101.0 22 | 8335 8335 104.0 23 | 8343 8343 62.0 24 | 8374 8374 59.0 25 | 6956 6956 69.0 26 | 6909 6909 51.0 27 | 
6042 6042 56.0 28 | 6064 6064 96.0 29 | 7525 7525 46.0 30 | 6961 6961 46.0 31 | 6974 6974 103.0 32 | 7306 7306 60.0 33 | 7255 7255 46.0 34 | 7418 7418 63.0 35 | 6928 6928 55.0 36 | 7094 7094 58.5 37 | 6900 6900 90.0 38 | 8387 8387 52.0 39 | 8247 8247 87.0 40 | 7062 7062 46.0 41 | 7282 7282 51.0 42 | 8422 8422 106.0 43 | 6985 6985 56.0 44 | 6915 6915 53.0 45 | 6958 6958 49.0 46 | 6016 6016 75.0 47 | 6929 6929 71.0 48 | 7520 7520 60.0 49 | 6933 6933 56.0 50 | 8424 8424 46.0 51 | 6926 6926 49.0 52 | 7346 7346 64.0 53 | 7296 7296 70.0 54 | 7522 7522 83.0 55 | 8369 8369 76.0 56 | 6932 6932 51.0 57 | 8237 8237 97.0 58 | 7000 7000 65.0 59 | 7477 7477 59.0 60 | 8378 8378 56.0 61 | 8329 8329 46.0 62 | 6917 6917 121.0 63 | 7460 7460 49.0 64 | 6920 6920 71.0 65 | 7081 7081 46.0 66 | 8270 8270 49.0 67 | 6980 6980 51.0 68 | 8271 8271 49.0 69 | 8230 8230 97.0 70 | 5837 5837 57.0 71 | 8310 8310 49.0 72 | 8215 8215 51.0 73 | 8258 8258 73.0 74 | 6898 6898 41.0 75 | 8411 8411 NA 76 | 6910 6910 49.0 77 | 7014 7014 92.0 78 | 6927 6927 51.0 79 | 6966 6966 53.0 80 | 6959 6959 51.0 81 | 8354 8354 70.0 82 | 8264 8264 46.0 83 | 6965 6965 102.0 84 | 6973 6973 53.0 85 | 6975 6975 51.0 86 | 8236 8236 91.0 87 | 8337 8337 70.0 88 | 8420 8420 56.0 89 | 8297 8297 73.0 90 | 8231 8231 91.0 91 | 8366 8366 NA 92 | 8351 8351 78.0 93 | 6943 6943 49.0 94 | 6972 6972 63.0 95 | 6942 6942 46.0 96 | 6901 6901 86.0 97 | 6936 6936 67.0 98 | 8389 8389 63.0 99 | 8395 8395 69.0 100 | 6918 6918 108.0 101 | 7033 7033 76.0 102 | 6976 6976 56.0 103 | 8239 8239 52.0 104 | 6040 6040 71.0 105 | 6919 6919 71.0 106 | 6981 6981 44.0 107 | 7516 7516 100.0 108 | 7147 7147 71.0 109 | 6969 6969 70.0 110 | 6921 6921 64.0 111 | 7524 7524 51.0 112 | 8241 8241 73.0 113 | 9057 9057 76.0 114 | 6979 6979 44.0 115 | 8326 8326 67.0 116 | 8412 8412 NA 117 | 8256 8256 61.0 118 | 7521 7521 60.0 119 | 6908 6908 49.0 120 | 8357 8357 NA 121 | 8296 8296 45.0 122 | 6946 6946 62.0 123 | 8242 8242 120.0 124 | 7231 7231 46.0 125 | 8284 8284 61.0 
126 | 6962 6962 52.0 127 | 8235 8235 60.0 128 | 8353 8353 41.0 129 | 8259 8259 73.0 130 | 6923 6923 44.0 131 | 6906 6906 43.0 132 | 6967 6967 44.0 133 | 8285 8285 70.0 134 | 6970 6970 48.0 135 | 8240 8240 93.0 136 | 7064 7064 79.0 137 | 8306 8306 96.0 138 | 7519 7519 76.0 139 | 8274 8274 74.0 140 | 8283 8283 71.0 141 | 6916 6916 63.0 142 | 6924 6924 49.0 143 | 7515 7515 49.0 144 | 6043 6043 90.0 145 | 7526 7526 53.0 146 | 8243 8243 66.0 147 | 8300 8300 61.0 148 | 7514 7514 58.0 149 | 6911 6911 46.0 150 | 100000 100000 58.0 151 | 8388 8388 60.0 152 | 8275 8275 68.0 153 | 6931 6931 46.0 154 | 7275 7275 46.0 155 | 6983 6983 71.0 156 | 7163 7163 57.0 157 | 7438 7438 75.0 158 | 6963 6963 60.0 159 | 8334 8334 64.0 160 | 6951 6951 68.0 161 | 8430 8430 NA 162 | 6930 6930 49.0 163 | 8214 8214 51.0 164 | 8290 8290 50.0 165 | 8426 8426 49.0 166 | 8323 8323 51.0 167 | 6897 6897 62.0 168 | 8249 8249 81.0 169 | 6922 6922 48.0 170 | 8376 8376 84.0 171 | 6709 6709 52.0 172 | 8213 8213 44.0 173 | 8254 8254 52.0 174 | 8311 8311 49.0 175 | 6977 6977 49.0 176 | 6957 6957 84.0 177 | 6978 6978 49.0 178 | 8245 8245 46.0 179 | 6913 6913 99.0 180 | 6971 6971 51.0 181 | 6964 6964 93.0 182 | 6074 6074 91.0 183 | 6905 6905 65.0 184 | 8233 8233 59.0 185 | 7323 7323 56.0 186 | 6982 6982 49.0 187 | 8312 8312 66.0 188 | 6937 6937 65.0 189 | 6984 6984 53.0 190 | 6243 6243 56.0 191 | 7424 7424 43.0 192 | 7523 7523 57.0 193 | 6988 6988 48.0 194 | 8266 8266 99.0 195 | 6903 6903 57.0 196 | 6907 6907 58.0 197 | 8314 8314 64.0 198 | 6904 6904 66.0 199 | 8265 8265 44.0 200 | 6940 6940 49.0 201 | -------------------------------------------------------------------------------- /man/BGData-class.Rd: -------------------------------------------------------------------------------- 1 | \docType{class} 2 | \name{BGData-class} 3 | \alias{BGData-class} 4 | \alias{geno,BGData-method} 5 | \alias{geno<-,BGData-method} 6 | \alias{pheno,BGData-method} 7 | \alias{pheno<-,BGData-method} 8 | \alias{map,BGData-method} 9 | 
\alias{map<-,BGData-method} 10 | \title{Container for Phenotype and Genotype Data} 11 | \description{ 12 | The BGData class is a container for genotypes, sample information, and 13 | variant information. The class is inspired by the \code{.bed/.fam/.bim} 14 | (binary) and \code{.ped/.fam/.map} (text) phenotype/genotype file formats 15 | of \href{https://www.cog-genomics.org/plink2}{PLINK}. It is used by several 16 | functions of this package such as \code{GWAS} for performing a Genome Wide 17 | Association Study or \code{getG} for calculating a genomic relationship 18 | matrix. 19 | } 20 | \details{ 21 | There are several ways to create an instance of this class: 22 | 23 | \itemize{ 24 | \item from arbitrary phenotype/genotype data using the \code{BGData} 25 | constructor function. 26 | \item from a .bed file using \code{as.BGData} and \code{BEDMatrix}. 27 | \item from a previously saved \code{BGData} object using 28 | \code{load.BGData}. 29 | \item from multiple files (even a mixture of different file types) 30 | using \code{LinkedMatrix}. 31 | \item from a .raw file (or a .ped-like file) using 32 | \code{readRAW}, \code{readRAW_matrix}, or 33 | \code{readRAW_big.matrix}. 34 | } 35 | 36 | A .ped file can be recoded to a .raw file in 37 | \href{https://www.cog-genomics.org/plink2}{PLINK} using \code{plink --file 38 | myfile --recodeA}, or converted to a .bed file using \code{plink --file 39 | myfile --make-bed}. Conversely, a .bed file can be transformed back to a 40 | .ped file using \code{plink --bfile myfile --recode} or to a .raw file 41 | using \code{plink --bfile myfile --recodeA} without losing information. 42 | } 43 | \section{Accessors}{ 44 | In the following code snippets, \code{x} is a BGData object. 45 | \describe{ 46 | \item{\code{geno(x)}, \code{geno(x) <- value}:}{ 47 | Get or set genotypes. 48 | } 49 | \item{\code{pheno(x)}, \code{pheno(x) <- value}:}{ 50 | Get or set sample information. 
51 | } 52 | \item{\code{map(x)}, \code{map(x) <- value}:}{ 53 | Get or set variant information. 54 | } 55 | } 56 | } 57 | \seealso{ 58 | \code{\link{BGData}}, \code{\link{as.BGData}}, \code{\link{load.BGData}}, 59 | \code{\link{readRAW}} to create \code{BGData} objects. 60 | 61 | \code{\link[LinkedMatrix]{LinkedMatrix-class}} and 62 | \code{\link[BEDMatrix]{BEDMatrix-class}} for more information on the above 63 | mentioned classes. 64 | } 65 | \examples{ 66 | X <- matrix(data = rnorm(100), nrow = 10, ncol = 10) 67 | Y <- data.frame(y = runif(10)) 68 | MAP <- data.frame(means = colMeans(X), freqNA = colMeans(is.na(X))) 69 | DATA <- BGData(geno = X, pheno = Y, map = MAP) 70 | 71 | dim(geno(DATA)) 72 | head(pheno(DATA)) 73 | head(map(DATA)) 74 | } 75 | -------------------------------------------------------------------------------- /man/BGData-package.Rd: -------------------------------------------------------------------------------- 1 | \docType{package} 2 | \name{BGData-package} 3 | \alias{BGData-package} 4 | \title{A Suite of Packages for Analysis of Big Genomic Data} 5 | \description{ 6 | Modern genomic datasets are big (large \emph{n}), high-dimensional (large 7 | \emph{p}), and multi-layered. The challenges that need to be addressed are 8 | memory requirements and computational demands. Our goal is to develop 9 | software that will enable researchers to carry out analyses with big 10 | genomic data within the R environment. 11 | } 12 | \details{ 13 | We have identified several approaches to tackle those challenges within R: 14 | 15 | \itemize{ 16 | \item File-backed matrices: The data is stored on the hard drive and 17 | users can read in smaller chunks when they are needed. 18 | \item Linked arrays: For very large datasets a single file-backed array 19 | may not be enough or convenient. A linked array is an array whose 20 | content is distributed over multiple file-backed nodes. 
21 | \item Multiple dispatch: Methods are presented to users so that they 22 | can treat these arrays pretty much as if they were RAM arrays. 23 | \item Multi-level parallelism: Exploit multi-core and multi-node 24 | computing. 25 | \item Inputs: Users can create these arrays from standard formats 26 | (e.g., PLINK .bed). 27 | } 28 | 29 | The \code{BGData} package is an umbrella package that comprises several 30 | packages: \code{BEDMatrix}, \code{LinkedMatrix}, and \code{symDMatrix}. It 31 | features scalable and efficient computational methods for large genomic 32 | datasets such as genome-wide association studies (GWAS) or genomic 33 | relationship matrices (G matrix). It also contains a container class called 34 | \code{BGData} that holds genotypes, sample information, and variant 35 | information. 36 | } 37 | \section{Example dataset}{ 38 | The \code{extdata} folder contains example files that were generated from 39 | the 250k SNP and phenotype data in 40 | \href{https://www.nature.com/articles/nature08800}{Atwell et al. (2010)}. 41 | Only the first 300 SNPs of chromosome 1, 2, and 3 were included to keep the 42 | size of the example dataset small. 43 | \href{https://www.cog-genomics.org/plink2}{PLINK} was used to convert the 44 | data to \href{https://www.cog-genomics.org/plink2/input#bed}{.bed} and 45 | \href{https://www.cog-genomics.org/plink2/input#raw}{.raw} files. 46 | \code{FT10} has been chosen as a phenotype and is provided as an 47 | \href{https://www.cog-genomics.org/plink2/input#pheno}{alternate phenotype 48 | file}. The file is intentionally shuffled to demonstrate that the 49 | additional phenotypes are put in the same order as the rest of the 50 | phenotypes. 51 | } 52 | \seealso{ 53 | \code{\link[BEDMatrix]{BEDMatrix-package}}, 54 | \code{\link[LinkedMatrix]{LinkedMatrix-package}}, and 55 | \code{\link[symDMatrix]{symDMatrix-package}} for an introduction to the 56 | respective packages. 
57 | 58 | \code{\link{file-backed-matrices}} for more information on file-backed 59 | matrices. \code{\link{multi-level-parallelism}} for more information on 60 | multi-level parallelism. 61 | } 62 | -------------------------------------------------------------------------------- /man/BGData.Rd: -------------------------------------------------------------------------------- 1 | \name{BGData} 2 | \alias{BGData} 3 | \title{Creates a New BGData Instance} 4 | \description{ 5 | This function constructs a new \code{BGData} object. 6 | } 7 | \usage{ 8 | BGData(geno, pheno = NULL, map = NULL) 9 | } 10 | \arguments{ 11 | \item{geno}{ 12 | A \code{geno} object that contains genotypes. 13 | } 14 | \item{pheno}{ 15 | A \code{data.frame} that contains sample information (including 16 | phenotypes). A stub that only contains a \code{sample_id} column 17 | populated with either the rownames of \code{geno} or a sequence 18 | starting with \code{sample_} will be generated if \code{NULL} 19 | } 20 | \item{map}{ 21 | A \code{data.frame} that contains variant information. A stub that only 22 | contains a \code{variant_id} column populated with either the colnames 23 | of \code{geno} or a sequence starting with \code{variant_} will be 24 | generated if \code{NULL} 25 | } 26 | } 27 | \seealso{ 28 | \code{\link{BGData-class}} and \code{\link{geno-class}} for more 29 | information on the above mentioned classes. 30 | } 31 | -------------------------------------------------------------------------------- /man/FWD.Rd: -------------------------------------------------------------------------------- 1 | \name{FWD} 2 | \alias{FWD} 3 | \title{Performs Forward Regressions} 4 | \description{ 5 | Performs forward regression of \code{y} on the columns of \code{X}. 6 | Predictors are added, one at a time, each time adding the one that produces 7 | the largest reduction in the residual sum of squares (RSS). The function 8 | returns estimates and summaries for the entire forward search. 
This 9 | function performs a search similar to that of \code{step(, 10 | direction='forward')}; however, \code{FWD()} is optimized for 11 | computational speed for linear models with very large sample sizes. To 12 | achieve fast computations, the software first computes the sufficient 13 | statistics X'X and X'y. At each step, the function first finds the 14 | predictor that produces the largest reduction in the sum of squares (this 15 | can be derived from X'X, X'y and the current solution of effects), and then 16 | updates the estimates of effects for the resulting model using Gauss-Seidel 17 | iterations performed on the linear system (X'X)b=X'y, iterating only over 18 | the elements of b that are active in the model. 19 | } 20 | \usage{ 21 | FWD(y, X, df = 20, tol = 1e-7, maxIter = 1000, centerImpute = TRUE, 22 | verbose = TRUE) 23 | } 24 | \arguments{ 25 | \item{y}{ 26 | The response vector (numeric nx1). 27 | } 28 | \item{X}{ 29 | An (nxp) numeric matrix. Columns are the features (aka predictors) 30 | considered in the forward search. The rows of \code{X} must be matched 31 | to the entries of \code{y}. 32 | } 33 | \item{df}{ 34 | Defines the maximum number of predictors to be included in the model. 35 | For a complete forward search, set \code{df = ncol(X)}. 36 | } 37 | \item{tol}{ 38 | A tolerance parameter to control when to stop the Gauss-Seidel 39 | algorithm. 40 | } 41 | \item{maxIter}{ 42 | The maximum number of iterations for the Gauss-Seidel algorithm (only 43 | used when the algorithm is not stopped by the tolerance parameter). 44 | } 45 | \item{centerImpute}{ 46 | Whether to center the columns of \code{X} and impute the missing values 47 | with the column means. 48 | } 49 | \item{verbose}{ 50 | Use \code{verbose = TRUE} to print summaries of the forward search. 
51 | } 52 | } 53 | \value{ 54 | A list with two entries: 55 | \itemize{ 56 | \item \code{B}: A p x (df + 1) matrix containing the estimated effects for each 57 | predictor (rows) at each step of the forward search (columns). 58 | \item \code{path}: A data frame providing the order in which variables 59 | were added to the model (\code{variable}) and statistics for each step 60 | of the forward search (\code{RSS}, \code{LogLik}, \code{VARE} (the 61 | residual variance), \code{DF}, \code{AIC}, and \code{BIC}). 62 | } 63 | } 64 | -------------------------------------------------------------------------------- /man/GWAS.Rd: -------------------------------------------------------------------------------- 1 | \name{GWAS} 2 | \alias{GWAS} 3 | \title{Performs Single Marker Regressions Using BGData Objects} 4 | \description{ 5 | Implements single marker regressions. The regression model includes all the 6 | covariates specified in the right-hand side of the \code{formula} plus one 7 | column of the genotypes at a time. The data for the association tests is 8 | obtained from a \code{BGData} object. 9 | } 10 | \usage{ 11 | GWAS(formula, data, method = "lsfit", i = seq_len(nrow(geno(data))), 12 | j = seq_len(ncol(geno(data))), chunkSize = 5000L, 13 | nCores = getOption("mc.cores", 2L), verbose = FALSE, ...) 14 | } 15 | \arguments{ 16 | \item{formula}{ 17 | The formula for the GWAS model without the variant, e.g. \code{y ~ 1} 18 | or \code{y ~ factor(sex) + age}. The variables included in the formula 19 | must be column names in the sample information of the \code{BGData} 20 | object. 21 | } 22 | \item{data}{ 23 | A \code{BGData} object. 24 | } 25 | \item{method}{ 26 | The regression method to be used. Currently, the following methods are 27 | implemented: \code{rayOLS} (see below), \code{lsfit}, \code{lm}, 28 | \code{lm.fit}, \code{glm}, \code{lmer}, and \code{SKAT}. Defaults to 29 | \code{lsfit}. 30 | } 31 | \item{i}{ 32 | Indicates which rows of the genotypes should be used. 
Can be integer, 33 | boolean, or character. By default, all rows are used. 34 | } 35 | \item{j}{ 36 | Indicates which columns of the genotypes should be used. Can be 37 | integer, boolean, or character. By default, all columns are used. 38 | } 39 | \item{chunkSize}{ 40 | The number of columns of the genotypes that are brought into physical 41 | memory for processing per core. If \code{NULL}, all elements in 42 | \code{j} are used. Defaults to 5000. 43 | } 44 | \item{nCores}{ 45 | The number of cores (passed to \code{mclapply}). Defaults to the 46 | \code{mc.cores} option, or 2 if that option is not set. 47 | } 48 | \item{verbose}{ 49 | Whether progress updates will be posted. Defaults to \code{FALSE}. 50 | } 51 | \item{...}{ 52 | Additional arguments for \code{chunkedApply} and the regression method. 53 | } 54 | } 55 | \details{ 56 | The \code{rayOLS} method is a regression through the origin that can only 57 | be used with a \code{y ~ 1} formula, i.e. it only allows for one 58 | quantitative response variable \code{y} and one variant at a time as an 59 | explanatory variable (the variant is not included in the formula, hence 60 | \code{1} is used as a dummy). If covariates are needed, consider 61 | preadjustment of \code{y}. Among the provided methods, it is by far the 62 | fastest. 63 | 64 | Some regression methods may require the data to not contain columns with 65 | variance 0 or too many missing values. We suggest running \code{summarize} 66 | to detect variants that do not clear the desired minor-allele frequency and 67 | rate of missing genotype calls, and filtering these variants out using the 68 | \code{j} parameter of the \code{GWAS} function (see example below). 69 | } 70 | \value{ 71 | The same matrix that would be returned by \code{coef(summary(model))}. 72 | } 73 | \seealso{ 74 | \code{\link{file-backed-matrices}} for more information on file-backed 75 | matrices. \code{\link{multi-level-parallelism}} for more information on 76 | multi-level parallelism. 
\code{\link{BGData-class}} for more information on 77 | the \code{BGData} class. \code{\link[stats]{lsfit}}, 78 | \code{\link[stats]{lm}}, \code{\link[stats]{lm.fit}}, 79 | \code{\link[stats]{glm}}, \code{\link[lme4]{lmer}}, and 80 | \code{\link[SKAT]{SKAT}} for more information on regression methods. 81 | } 82 | \examples{ 83 | # Restrict number of cores to 1 on Windows 84 | if (.Platform$OS.type == "windows") { 85 | options(mc.cores = 1) 86 | } 87 | 88 | # Load example data 89 | bg <- BGData:::loadExample() 90 | 91 | # Detect variants that do not pass MAF and missingness thresholds 92 | summaries <- summarize(geno(bg)) 93 | maf <- ifelse(summaries$allele_freq > 0.5, 1 - summaries$allele_freq, 94 | summaries$allele_freq) 95 | exclusions <- maf < 0.01 | summaries$freq_na > 0.05 96 | 97 | # Perform a single marker regression 98 | res1 <- GWAS(formula = FT10 ~ 1, data = bg, j = !exclusions) 99 | 100 | # Draw a Manhattan plot 101 | plot(-log10(res1[, 4])) 102 | 103 | # Use lm instead of lsfit (the default) 104 | res2 <- GWAS(formula = FT10 ~ 1, data = bg, method = "lm", j = !exclusions) 105 | 106 | # Use glm instead of lsfit (the default) 107 | y <- pheno(bg)$FT10 108 | pheno(bg)$FT10.01 <- y > quantile(y, 0.8, na.rm = TRUE) 109 | res3 <- GWAS(formula = FT10.01 ~ 1, data = bg, method = "glm", j = !exclusions) 110 | 111 | # Perform a single marker regression on the first 50 markers (useful for 112 | # distributed computing) 113 | res4 <- GWAS(formula = FT10 ~ 1, data = bg, j = 1:50) 114 | } 115 | -------------------------------------------------------------------------------- /man/as.BGData.Rd: -------------------------------------------------------------------------------- 1 | \name{as.BGData} 2 | \alias{as.BGData} 3 | \alias{as.BGData.BEDMatrix} 4 | \alias{as.BGData.ColumnLinkedMatrix} 5 | \alias{as.BGData.RowLinkedMatrix} 6 | \title{Convert Other Objects to BGData Objects} 7 | \description{ 8 | Converts other objects to \code{BGData} objects by loading supplementary 
9 | phenotypes and map files referenced by the object to be used for the sample 10 | information and variant information, respectively. 11 | 12 | Currently supported are \code{BEDMatrix} objects, plain or nested in 13 | \code{ColumnLinkedMatrix} objects. 14 | } 15 | \usage{ 16 | as.BGData(x, alternatePhenotypeFile = NULL, ...) 17 | 18 | \method{as.BGData}{BEDMatrix}(x, alternatePhenotypeFile = NULL, ...) 19 | 20 | \method{as.BGData}{ColumnLinkedMatrix}(x, alternatePhenotypeFile = NULL, 21 | ...) 22 | 23 | \method{as.BGData}{RowLinkedMatrix}(x, alternatePhenotypeFile = NULL, 24 | ...) 25 | } 26 | \arguments{ 27 | \item{x}{ 28 | An object. Currently supported are \code{BEDMatrix} objects, plain or 29 | nested in \code{ColumnLinkedMatrix} objects. 30 | } 31 | \item{alternatePhenotypeFile}{ 32 | Path to an 33 | \href{https://www.cog-genomics.org/plink2/input#pheno}{alternate 34 | phenotype file}. 35 | } 36 | \item{...}{ 37 | Additional arguments to the \code{read.table} or \code{fread} call (if 38 | the data.table package is installed) used to parse the alternate phenotype 39 | file. 40 | } 41 | } 42 | \details{ 43 | The .ped and .raw formats only allow for a single phenotype. If more 44 | phenotypes are required, it is possible to store them in an 45 | \href{https://www.cog-genomics.org/plink2/input#pheno}{alternate phenotype 46 | file}. The path to such a file can be provided with 47 | \code{alternatePhenotypeFile}; its contents will be merged with the existing sample 48 | information. The first and second columns of that file must contain family 49 | and within-family IDs, respectively. 50 | 51 | For \code{BEDMatrix} objects: If a .fam file (which corresponds to the 52 | first six columns of a .ped or .raw file) of the same name and in the same 53 | directory as the .bed file exists, the sample information will be populated 54 | with the data stored in that file. 
Otherwise a stub that only contains an 55 | \code{IID} column populated with the rownames of \code{geno(x)} will be 56 | generated. The same will happen for a .bim file for the variant 57 | information. 58 | 59 | For \code{ColumnLinkedMatrix} objects: See the case for \code{BEDMatrix} 60 | objects, but only the .fam file of the first node of the 61 | \code{LinkedMatrix} will be read and used for the sample information, and 62 | the .bim files of all nodes will be combined and used for the variant 63 | information. 64 | } 65 | \value{ 66 | A \code{BGData} object. 67 | } 68 | \seealso{ 69 | \code{\link[=readRAW]{readRAW()}} to convert text files to \code{BGData} 70 | objects. \code{\link{BGData-class}}, 71 | \code{\link[BEDMatrix]{BEDMatrix-class}}, 72 | \code{\link[LinkedMatrix]{ColumnLinkedMatrix-class}} for more information 73 | on the above mentioned classes. \code{\link[utils]{read.table}} and 74 | \code{\link[data.table]{fread}} to learn more about extra arguments that 75 | can be passed via \code{...}. 
76 | } 77 | \examples{ 78 | # Path to example data 79 | path <- system.file("extdata", package = "BGData") 80 | 81 | # Convert a single BEDMatrix object to a BGData object 82 | chr1 <- BEDMatrix::BEDMatrix(paste0(path, "/chr1.bed")) 83 | bg1 <- as.BGData(chr1) 84 | 85 | # Convert multiple BEDMatrix objects in a ColumnLinkedMatrix to a BGData object 86 | chr2 <- BEDMatrix::BEDMatrix(paste0(path, "/chr2.bed")) 87 | chr3 <- BEDMatrix::BEDMatrix(paste0(path, "/chr3.bed")) 88 | clm <- ColumnLinkedMatrix(chr1, chr2, chr3) 89 | bg2 <- as.BGData(clm) 90 | 91 | # Load additional (alternate) phenotypes 92 | bg3 <- as.BGData(clm, alternatePhenotypeFile = paste0(path, "/pheno.txt")) 93 | } 94 | -------------------------------------------------------------------------------- /man/chunkedApply.Rd: -------------------------------------------------------------------------------- 1 | \name{chunkedApply} 2 | \alias{chunkedApply} 3 | \title{Applies a Function on Each Row or Column of a File-Backed Matrix} 4 | \description{ 5 | Similar to \code{apply}, but designed for file-backed matrices. The 6 | function brings chunks of an object into physical memory by taking subsets, 7 | and applies a function on either the rows or the columns of the chunks 8 | using an optimized version of \code{apply}. If \code{nCores} is greater 9 | than 1, the function will be applied in parallel using \code{mclapply}. In 10 | that case the subsets of the object are taken on the slaves. 11 | } 12 | \usage{ 13 | chunkedApply(X, MARGIN, FUN, i = seq_len(nrow(X)), 14 | j = seq_len(ncol(X)), chunkSize = 5000L, 15 | nCores = getOption("mc.cores", 2L), verbose = FALSE, ...) 16 | } 17 | \arguments{ 18 | \item{X}{ 19 | A file-backed matrix, typically the genotypes of a \code{BGData} 20 | object. 21 | } 22 | \item{MARGIN}{ 23 | The subscripts which the function will be applied over. 1 indicates 24 | rows, 2 indicates columns. 25 | } 26 | \item{FUN}{ 27 | The function to be applied. 
28 | } 29 | \item{i}{ 30 | Indicates which rows of \code{X} should be used. Can be integer, 31 | boolean, or character. By default, all rows are used. 32 | } 33 | \item{j}{ 34 | Indicates which columns of \code{X} should be used. Can be integer, 35 | boolean, or character. By default, all columns are used. 36 | } 37 | \item{chunkSize}{ 38 | The number of rows or columns of \code{X} that are brought into 39 | physical memory for processing per core. If \code{NULL}, all elements 40 | in \code{i} or \code{j} are used. Defaults to 5000. 41 | } 42 | \item{nCores}{ 43 | The number of cores (passed to \code{mclapply}). Defaults to the 44 | \code{mc.cores} option, or 2 if that option is not set. 45 | } 46 | \item{verbose}{ 47 | Whether progress updates will be posted. Defaults to \code{FALSE}. 48 | } 49 | \item{...}{ 50 | Additional arguments to be passed to the \code{apply}-like function. 51 | } 52 | } 53 | \seealso{ 54 | \code{\link{file-backed-matrices}} for more information on file-backed 55 | matrices. \code{\link{multi-level-parallelism}} for more information on 56 | multi-level parallelism. \code{\link{BGData-class}} for more information on 57 | the \code{BGData} class. 58 | } 59 | \examples{ 60 | # Restrict number of cores to 1 on Windows 61 | if (.Platform$OS.type == "windows") { 62 | options(mc.cores = 1) 63 | } 64 | 65 | # Load example data 66 | bg <- BGData:::loadExample() 67 | 68 | # Compute standard deviation of columns 69 | chunkedApply(X = geno(bg), MARGIN = 2, FUN = sd) 70 | } 71 | -------------------------------------------------------------------------------- /man/chunkedMap.Rd: -------------------------------------------------------------------------------- 1 | \name{chunkedMap} 2 | \alias{chunkedMap} 3 | \title{Applies a Function on Each Chunk of a File-Backed Matrix} 4 | \description{ 5 | Similar to \code{lapply}, but designed for file-backed matrices. 
The 6 | function brings chunks of an object into physical memory by taking subsets, 7 | and applies a function on them. If \code{nCores} is greater than 1, the 8 | function will be applied in parallel using \code{mclapply}. In that case 9 | the subsets of the object are taken on the slaves. 10 | } 11 | \usage{ 12 | chunkedMap(X, FUN, i = seq_len(nrow(X)), j = seq_len(ncol(X)), 13 | chunkBy = 2L, chunkSize = 5000L, nCores = getOption("mc.cores", 14 | 2L), verbose = FALSE, ...) 15 | } 16 | \arguments{ 17 | \item{X}{ 18 | A file-backed matrix, typically the genotypes of a \code{BGData} 19 | object. 20 | } 21 | \item{FUN}{ 22 | The function to be applied on each chunk. 23 | } 24 | \item{i}{ 25 | Indicates which rows of \code{X} should be used. Can be integer, 26 | boolean, or character. By default, all rows are used. 27 | } 28 | \item{j}{ 29 | Indicates which columns of \code{X} should be used. Can be integer, 30 | boolean, or character. By default, all columns are used. 31 | } 32 | \item{chunkBy}{ 33 | Whether to extract chunks by rows (1) or by columns (2). Defaults to 34 | columns (2). 35 | } 36 | \item{chunkSize}{ 37 | The number of rows or columns of \code{X} that are brought into 38 | physical memory for processing per core. If \code{NULL}, all elements 39 | in \code{i} or \code{j} are used. Defaults to 5000. 40 | } 41 | \item{nCores}{ 42 | The number of cores (passed to \code{mclapply}). Defaults to the 43 | \code{mc.cores} option, or 2 if that option is not set. 44 | } 45 | \item{verbose}{ 46 | Whether progress updates will be posted. Defaults to \code{FALSE}. 47 | } 48 | \item{...}{ 49 | Additional arguments to be passed to the 50 | \code{apply}-like function. 51 | } 52 | } 53 | \seealso{ 54 | \code{\link{file-backed-matrices}} for more information on file-backed 55 | matrices. \code{\link{multi-level-parallelism}} for more information on 56 | multi-level parallelism. \code{\link{BGData-class}} for more information on 57 | the \code{BGData} class. 
58 | } 59 | \examples{ 60 | # Restrict number of cores to 1 on Windows 61 | if (.Platform$OS.type == "windows") { 62 | options(mc.cores = 1) 63 | } 64 | 65 | # Load example data 66 | bg <- BGData:::loadExample() 67 | 68 | # Compute column sums of each chunk 69 | chunkedMap(X = geno(bg), FUN = colSums) 70 | } 71 | -------------------------------------------------------------------------------- /man/file-backed-matrices.Rd: -------------------------------------------------------------------------------- 1 | \name{file-backed-matrices} 2 | \alias{file-backed-matrices} 3 | \title{File-Backed Matrices} 4 | \description{ 5 | Functions with the \code{chunkSize} parameter work best with file-backed 6 | matrices such as \code{BEDMatrix} objects. To avoid loading the whole, 7 | potentially very large matrix into memory, these functions will load chunks 8 | of the file-backed matrix into memory and perform the operations on one 9 | chunk at a time. The size of the chunks is determined by the 10 | \code{chunkSize} parameter. Care must be taken to not set \code{chunkSize} 11 | too high to avoid memory shortage, particularly when combined with parallel 12 | computing. 13 | } 14 | \seealso{ 15 | \code{\link[BEDMatrix]{BEDMatrix-class}} as an example of a file-backed 16 | matrix. 17 | } 18 | -------------------------------------------------------------------------------- /man/findRelated.Rd: -------------------------------------------------------------------------------- 1 | \name{findRelated} 2 | \alias{findRelated} 3 | \alias{findRelated.matrix} 4 | \alias{findRelated.symDMatrix} 5 | \title{Find related individuals in a relationship matrix} 6 | \description{ 7 | Find related individuals in a relationship matrix. 8 | } 9 | \usage{ 10 | findRelated(x, ...) 11 | 12 | \method{findRelated}{matrix}(x, cutoff = 0.03, ...) 13 | 14 | \method{findRelated}{symDMatrix}(x, cutoff = 0.03, verbose = FALSE, 15 | ...) 
16 | } 17 | \arguments{ 18 | \item{x}{ 19 | A matrix-like object with dimnames. 20 | } 21 | \item{...}{ 22 | Additional arguments for methods. 23 | } 24 | \item{cutoff}{ 25 | The cutoff between 0 and 1 for related individuals to be included in 26 | the output. Defaults to 0.03. 27 | } 28 | \item{verbose}{ 29 | Whether progress updates will be posted. Defaults to \code{FALSE}. 30 | } 31 | } 32 | \value{ 33 | A vector of names or indices of related individuals. 34 | } 35 | \section{Methods (by class)}{ 36 | \itemize{ 37 | \item \code{matrix}: Find related individuals in matrices. 38 | \item \code{symDMatrix}: Find related individuals in symDMatrix objects. 39 | } 40 | } 41 | \examples{ 42 | # Load example data 43 | bg <- BGData:::loadExample() 44 | 45 | G <- getG(geno(bg)) 46 | findRelated(G) 47 | } 48 | -------------------------------------------------------------------------------- /man/geno-class.Rd: -------------------------------------------------------------------------------- 1 | \docType{class} 2 | \name{geno-class} 3 | \alias{geno-class} 4 | \title{An Abstract S4 Class Union of Matrix-Like Types} 5 | \description{ 6 | \code{geno} is a class union of several matrix-like types, many of 7 | them suitable for very large datasets. 8 | 9 | Currently supported are \code{LinkedMatrix}, \code{BEDMatrix}, 10 | \code{big.matrix}, \code{ff_matrix}, and \code{matrix}. 11 | } 12 | \seealso{ 13 | \code{\link[LinkedMatrix]{LinkedMatrix-class}}, 14 | \code{\link[BEDMatrix]{BEDMatrix-class}}, 15 | \code{\link[bigmemory]{big.matrix-class}}, \code{\link[ff]{ff}}, and 16 | \code{\link[base]{matrix}} for more information on each matrix-like type. 17 | 18 | \code{\link{BGData-class}} for more information on the \code{BGData} class, 19 | in particular its \code{geno} accessor that accepts \code{geno} objects. 
20 | } 21 | -------------------------------------------------------------------------------- /man/geno.Rd: -------------------------------------------------------------------------------- 1 | \name{geno} 2 | \alias{geno} 3 | \alias{geno<-} 4 | \alias{pheno} 5 | \alias{pheno<-} 6 | \alias{map} 7 | \alias{map<-} 8 | \title{Getting/Setting Genotypes, Sample Information, and Variant Information} 9 | \description{ 10 | A set of generic functions for getting/setting the genotypes, sample 11 | information, and variant information. 12 | } 13 | \usage{ 14 | geno(x) 15 | geno(x) <- value 16 | 17 | pheno(x) 18 | pheno(x) <- value 19 | 20 | map(x) 21 | map(x) <- value 22 | } 23 | \arguments{ 24 | \item{x}{ 25 | The object from/on which to get/set genotypes, sample information, and 26 | variant information. Typically a \code{BGData} object. 27 | } 28 | \item{value}{ 29 | Typically a \code{geno} object for the \code{geno} setter. 30 | 31 | Typically a \code{data.frame} object for the \code{pheno} setter. 32 | 33 | Typically a \code{data.frame} object for the \code{map} setter. 34 | } 35 | } 36 | \seealso{ 37 | \itemize{ 38 | \item \code{\link{BGData-class}} 39 | \item \code{\link{geno-class}} 40 | } 41 | } 42 | \examples{ 43 | # Load example data 44 | bg <- BGData:::loadExample() 45 | 46 | # Access genotypes 47 | geno(bg) 48 | 49 | # Access sample information 50 | pheno(bg) 51 | 52 | # Access variant information 53 | map(bg) 54 | } 55 | \keyword{methods} 56 | -------------------------------------------------------------------------------- /man/getG.Rd: -------------------------------------------------------------------------------- 1 | \name{getG} 2 | \alias{getG} 3 | \title{Computes a Genomic Relationship Matrix} 4 | \description{ 5 | Computes a positive semi-definite symmetric genomic relation matrix G=XX' 6 | offering options for centering and scaling the columns of \code{X} 7 | beforehand. 
8 | } 9 | \usage{ 10 | getG(X, center = TRUE, scale = TRUE, impute = TRUE, scaleG = TRUE, 11 | minVar = 1e-05, i = seq_len(nrow(X)), j = seq_len(ncol(X)), i2 = NULL, 12 | chunkSize = 5000L, nCores = getOption("mc.cores", 2L), verbose = FALSE) 13 | } 14 | \arguments{ 15 | \item{X}{ 16 | A matrix-like object, typically the genotypes of a \code{BGData} 17 | object. 18 | } 19 | \item{center}{ 20 | Either a logical value or a numeric vector of length equal to the 21 | number of columns of \code{X}. Numeric vector required if \code{i2} is 22 | used. If \code{FALSE}, no centering is done. Defaults to \code{TRUE}. 23 | } 24 | \item{scale}{ 25 | Either a logical value or a numeric vector of length equal to the 26 | number of columns of \code{X}. Numeric vector required if \code{i2} is 27 | used. If \code{FALSE}, no scaling is done. Defaults to \code{TRUE}. 28 | } 29 | \item{impute}{ 30 | Indicates whether missing values should be imputed. Defaults to 31 | \code{TRUE}. 32 | } 33 | \item{scaleG}{ 34 | Whether XX' should be scaled. Defaults to \code{TRUE}. 35 | } 36 | \item{minVar}{ 37 | Columns with variance lower than this value will not be used in the 38 | computation (only if \code{scale} is not \code{FALSE}). 39 | } 40 | \item{i}{ 41 | Indicates which rows of \code{X} should be used. Can be integer, 42 | boolean, or character. By default, all rows are used. 43 | } 44 | \item{j}{ 45 | Indicates which columns of \code{X} should be used. Can be integer, 46 | boolean, or character. By default, all columns are used. 47 | } 48 | \item{i2}{ 49 | Indicates which rows should be used to compute a block of the genomic 50 | relationship matrix. Will compute XY' where X is determined by \code{i} 51 | and \code{j} and Y by \code{i2} and \code{j}. Can be integer, boolean, 52 | or character. If \code{NULL}, the whole genomic relationship matrix XX' 53 | is computed. Defaults to \code{NULL}. 
54 | } 55 | \item{chunkSize}{ 56 | The number of columns of \code{X} that are brought into physical memory 57 | for processing per core. If \code{NULL}, all columns of \code{X} are 58 | used. Defaults to 5000. 59 | } 60 | \item{nCores}{ 61 | The number of cores (passed to \code{mclapply}). Defaults to the 62 | \code{mc.cores} option, or 2 if that option is not set. 63 | } 64 | \item{verbose}{ 65 | Whether progress updates will be posted. Defaults to \code{FALSE}. 66 | } 67 | } 68 | \details{ 69 | If \code{center = FALSE}, \code{scale = FALSE} and \code{scaleG = FALSE}, 70 | \code{getG} produces the same outcome as \code{tcrossprod}. 71 | } 72 | \value{ 73 | A positive semi-definite symmetric numeric matrix. 74 | } 75 | \seealso{ 76 | \code{\link{file-backed-matrices}} for more information on file-backed 77 | matrices. \code{\link{multi-level-parallelism}} for more information on 78 | multi-level parallelism. \code{\link{BGData-class}} for more information on 79 | the \code{BGData} class. 80 | } 81 | \examples{ 82 | # Restrict number of cores to 1 on Windows 83 | if (.Platform$OS.type == "windows") { 84 | options(mc.cores = 1) 85 | } 86 | 87 | # Load example data 88 | bg <- BGData:::loadExample() 89 | 90 | # Compute a scaled genomic relationship matrix from centered and scaled 91 | # genotypes 92 | g1 <- getG(X = geno(bg)) 93 | 94 | # Disable scaling of G 95 | g2 <- getG(X = geno(bg), scaleG = FALSE) 96 | 97 | # Disable centering of genotypes 98 | g3 <- getG(X = geno(bg), center = FALSE) 99 | 100 | # Disable scaling of genotypes 101 | g4 <- getG(X = geno(bg), scale = FALSE) 102 | 103 | # Provide own scales 104 | scales <- chunkedApply(X = geno(bg), MARGIN = 2, FUN = sd) 105 | g4 <- getG(X = geno(bg), scale = scales) 106 | 107 | # Provide own centers 108 | centers <- chunkedApply(X = geno(bg), MARGIN = 2, FUN = mean) 109 | g5 <- getG(X = geno(bg), center = centers) 110 | 111 | # Only use the first 50 individuals (useful to account for population structure) 112 | g6 <- 
getG(X = geno(bg), i = 1:50) 113 | 114 | # Only use the first 100 markers (useful to ignore some markers) 115 | g7 <- getG(X = geno(bg), j = 1:100) 116 | 117 | # Compute unscaled G matrix by combining blocks of $XX_{i2}'$ where $X_{i2}$ is 118 | # a horizontal partition of X. This is useful for distributed computing as each 119 | # block can be computed in parallel. Centers and scales need to be precomputed. 120 | block1 <- getG(X = geno(bg), i2 = 1:100, center = centers, scale = scales) 121 | block2 <- getG(X = geno(bg), i2 = 101:199, center = centers, scale = scales) 122 | g8 <- cbind(block1, block2) 123 | 124 | # Compute unscaled G matrix by combining blocks of $X_{i}X_{i2}'$ where both 125 | # $X_{i}$ and $X_{i2}$ are horizontal partitions of X. Similarly to the example 126 | # above, this is useful for distributed computing, in particular to compute 127 | # very large G matrices. Centers and scales need to be precomputed. This 128 | # approach is similar to the one taken by the symDMatrix package, but the 129 | # symDMatrix package adds memory-mapped blocks, only stores the upper side of 130 | # the triangular matrix, and provides a type that allows for indexing as if the 131 | # full G matrix is in memory. 
132 | block11 <- getG(X = geno(bg), i = 1:100, i2 = 1:100, center = centers, scale = scales) 133 | block12 <- getG(X = geno(bg), i = 1:100, i2 = 101:199, center = centers, scale = scales) 134 | block21 <- getG(X = geno(bg), i = 101:199, i2 = 1:100, center = centers, scale = scales) 135 | block22 <- getG(X = geno(bg), i = 101:199, i2 = 101:199, center = centers, scale = scales) 136 | g9 <- rbind( 137 | cbind(block11, block12), 138 | cbind(block21, block22) 139 | ) 140 | } 141 | -------------------------------------------------------------------------------- /man/getG_symDMatrix.Rd: -------------------------------------------------------------------------------- 1 | \name{getG_symDMatrix} 2 | \alias{getG_symDMatrix} 3 | \title{Computes a Very Large Genomic Relationship Matrix} 4 | \description{ 5 | Computes a positive semi-definite symmetric genomic relation matrix G=XX' 6 | offering options for centering and scaling the columns of \code{X} 7 | beforehand. 8 | } 9 | \usage{ 10 | getG_symDMatrix(X, center = TRUE, scale = TRUE, impute = TRUE, scaleG = TRUE, 11 | minVar = 1e-05, blockSize = 5000L, 12 | folderOut = paste0("symDMatrix_", randomString()), vmode = "double", 13 | i = seq_len(nrow(X)), j = seq_len(ncol(X)), chunkSize = 5000L, 14 | nCores = getOption("mc.cores", 2L), verbose = FALSE) 15 | } 16 | \arguments{ 17 | \item{X}{ 18 | A matrix-like object, typically the genotypes of a \code{BGData} 19 | object. 20 | } 21 | \item{center}{ 22 | Either a logical value or a numeric vector of length equal to the 23 | number of columns of \code{X}. If \code{FALSE}, no centering is done. 24 | Defaults to \code{TRUE}. 25 | } 26 | \item{scale}{ 27 | Either a logical value or a numeric vector of length equal to the 28 | number of columns of \code{X}. If \code{FALSE}, no scaling is done. 29 | Defaults to \code{TRUE}. 30 | } 31 | \item{impute}{ 32 | Indicates whether missing values should be imputed. Defaults to 33 | \code{TRUE}. 
34 | } 35 | \item{scaleG}{ 36 | Whether XX' should be scaled. Defaults to \code{TRUE}. 37 | } 38 | \item{minVar}{ 39 | Columns with variance lower than this value will not be used in the 40 | computation (only if \code{scale} is not \code{FALSE}). 41 | } 42 | \item{blockSize}{ 43 | The number of rows and columns of each block. If \code{NULL}, a single 44 | block of the same length as \code{i} will be created. Defaults to 5000. 45 | } 46 | \item{folderOut}{ 47 | The path to the folder where to save the \code{symDMatrix} object. 48 | Defaults to a random string prefixed with "symDMatrix_". 49 | } 50 | \item{vmode}{ 51 | vmode of \code{ff} objects. 52 | } 53 | \item{i}{ 54 | Indicates which rows of \code{X} should be used. Can be integer, 55 | boolean, or character. By default, all rows are used. 56 | } 57 | \item{j}{ 58 | Indicates which columns of \code{X} should be used. Can be integer, 59 | boolean, or character. By default, all columns are used. 60 | } 61 | \item{chunkSize}{ 62 | The number of columns of \code{X} that are brought into physical memory 63 | for processing per core. If \code{NULL}, all columns of \code{X} are 64 | used. Defaults to 5000. 65 | } 66 | \item{nCores}{ 67 | The number of cores (passed to \code{mclapply}). Defaults to the 68 | \code{mc.cores} option, or 2 if that option is not set. 69 | } 70 | \item{verbose}{ 71 | Whether progress updates will be posted. Defaults to \code{FALSE}. 72 | } 73 | } 74 | \details{ 75 | Even very large genomic relationship matrices are supported by partitioning 76 | \code{X} into blocks and calling \code{getG} on these blocks. This function 77 | performs the block computations sequentially, which may be slow. In an HPC 78 | environment, performance can be improved by manually distributing these 79 | operations to different nodes. 80 | } 81 | \value{ 82 | A \code{symDMatrix} object. 83 | } 84 | \seealso{ 85 | \code{\link{multi-level-parallelism}} for more information on multi-level 86 | parallelism.
\code{\link[symDMatrix]{symDMatrix-class}} and 87 | \code{\link{BGData-class}} for more information on the \code{symDMatrix} and \code{BGData} classes. 88 | \code{\link{getG}} to learn more about the underlying method. 89 | } 90 | -------------------------------------------------------------------------------- /man/load.BGData.Rd: -------------------------------------------------------------------------------- 1 | \name{load.BGData} 2 | \alias{load.BGData} 3 | \title{Loads BGData (and Other) Objects from .RData Files} 4 | \description{ 5 | This function is similar to \code{load}, but also initializes the different 6 | types of objects that can be used as genotypes in a \code{BGData} object. 7 | 8 | Currently supported are \code{ff_matrix}, \code{big.matrix}, and 9 | \code{BEDMatrix} objects. If the object is of type \code{LinkedMatrix}, all 10 | nodes will be initialized with their appropriate method. 11 | } 12 | \usage{ 13 | load.BGData(file, envir = parent.frame()) 14 | } 15 | \arguments{ 16 | \item{file}{ 17 | The name of the .RData file to be loaded. 18 | } 19 | \item{envir}{ 20 | The environment where to load the data. 21 | } 22 | } 23 | \seealso{ 24 | \code{\link{BGData-class}}, \code{\link[ff]{ff}}, 25 | \code{\link[bigmemory]{big.matrix-class}}, 26 | \code{\link[BEDMatrix]{BEDMatrix-class}}, and 27 | \code{\link[LinkedMatrix]{LinkedMatrix-class}} for more information on the 28 | above mentioned classes. 29 | } 30 | -------------------------------------------------------------------------------- /man/multi-level-parallelism.Rd: -------------------------------------------------------------------------------- 1 | \name{multi-level-parallelism} 2 | \alias{multi-level-parallelism} 3 | \title{Multi-Level Parallelism} 4 | \description{ 5 | Functions with the \code{nCores}, \code{i}, and \code{j} parameters provide 6 | capabilities for both parallel and distributed computing. 7 | 8 | For parallel computing, \code{nCores} determines the number of cores the 9 | code is run on.
Memory usage can be an issue for higher values of 10 | \code{nCores} as R is not particularly memory-efficient. As a rule of 11 | thumb, at least around \code{(nCores * object_size(chunk)) + 12 | object_size(result)} MB of total memory will be needed for operations 13 | on file-backed matrices, not including potential copies of your data that 14 | might be created (for example \code{lsfit} runs \code{cbind(1, X)}). 15 | \code{i} and \code{j} can be used to include or exclude certain rows or 16 | columns. Internally, the \code{mclapply} function is used and therefore 17 | parallel computing will not work on Windows machines. 18 | 19 | For distributed computing, \code{i} and \code{j} determine the subset of 20 | the input matrix that the code runs on. In an HPC environment, this can be 21 | used not just to include or exclude certain rows or columns, but also to 22 | partition the task among many nodes rather than cores. Scheduler-specific 23 | code and code to aggregate the results need to be written by the user. It 24 | is recommended to set \code{nCores} to \code{1} as nodes are often cheaper 25 | than cores. 26 | } 27 | \seealso{ 28 | \code{\link[parallel]{mclapply}} to learn more about the function used to 29 | implement parallel computing. \code{\link[parallel]{detectCores}} to detect 30 | the number of available cores. 31 | } 32 | -------------------------------------------------------------------------------- /man/orderedMerge.Rd: -------------------------------------------------------------------------------- 1 | \name{orderedMerge} 2 | \alias{orderedMerge} 3 | \title{Merge Two Data Frames Keeping the Order of the First} 4 | \description{ 5 | This is a simplified version of \code{merge} useful for merging additional 6 | data into a \code{BGData} object while keeping the order of the data in the 7 | \code{BGData} object. 
8 | } 9 | \usage{ 10 | orderedMerge(x, y, by = c(1L, 2L)) 11 | } 12 | \arguments{ 13 | \item{x}{ 14 | Data frame 15 | } 16 | \item{y}{ 17 | Data frame 18 | } 19 | \item{by}{ 20 | Specifications of the columns used for merging. Defaults to the first 21 | two columns of the data frame, which traditionally has the family ID 22 | and the individual ID. 23 | } 24 | } 25 | \value{ 26 | Merged data frame 27 | } 28 | \seealso{ 29 | \code{\link{BGData-class}} for more information on the \code{BGData} class. 30 | } 31 | -------------------------------------------------------------------------------- /man/preprocess.Rd: -------------------------------------------------------------------------------- 1 | \name{preprocess} 2 | \alias{preprocess} 3 | \title{Center, scale, and impute data} 4 | \description{ 5 | A faster version of \code{\link[base]{scale}} with a similar interface that 6 | also allows for imputation. The main difference is that this version scales 7 | by the standard deviation regardless of whether centering is enabled or 8 | not. If centering is enabled, missing values are imputed by 0, otherwise by 9 | the mean of the column that contains the value. 10 | } 11 | \usage{ 12 | preprocess(X, center = FALSE, scale = FALSE, impute = FALSE, 13 | nCores = getOption("mc.cores", 2L)) 14 | } 15 | \arguments{ 16 | \item{X}{ 17 | A numeric matrix. 18 | } 19 | \item{center}{ 20 | Either a logical value or numeric vector of length equal to the number 21 | of columns of \code{X}. 22 | } 23 | \item{scale}{ 24 | Either a logical value or numeric vector of length equal to the number 25 | of columns of \code{X}. 26 | } 27 | \item{impute}{ 28 | Indicates whether missing values should be imputed. 29 | } 30 | \item{nCores}{ 31 | The number of cores (passed to \code{mclapply}). Defaults to the number 32 | of cores as detected by \code{detectCores}. 33 | } 34 | } 35 | \value{ 36 | The centered, scaled, and imputed matrix. 
37 | } 38 | \seealso{ 39 | \code{\link[base]{scale}}, which this function tries to improve upon. 40 | } 41 | \examples{ 42 | # Load example data 43 | bg <- BGData:::loadExample() 44 | 45 | # Center and scale genotypes 46 | W <- preprocess(as.matrix(geno(bg)), center = TRUE, scale = TRUE) 47 | } 48 | -------------------------------------------------------------------------------- /man/readRAW.Rd: -------------------------------------------------------------------------------- 1 | \name{readRAW} 2 | \alias{readRAW} 3 | \alias{readRAW_matrix} 4 | \alias{readRAW_big.matrix} 5 | \title{Creates a BGData Object From a .raw File or a .ped-Like File} 6 | \description{ 7 | Creates a \code{BGData} object from a .raw file (generated with 8 | \code{--recodeA} in \href{https://www.cog-genomics.org/plink2}{PLINK}). 9 | Other text-based file formats are supported as well by tweaking some of the 10 | parameters as long as the records of individuals are in rows, and 11 | phenotypes, covariates and markers are in columns. 12 | } 13 | \usage{ 14 | readRAW(fileIn, header = TRUE, dataType = integer(), n = NULL, 15 | p = NULL, sep = "", na.strings = "NA", nColSkip = 6L, 16 | idCol = c(1L, 2L), nNodes = NULL, linked.by = "rows", 17 | folderOut = paste0("BGData_", sub("\\\\.[[:alnum:]]+$", "", 18 | basename(fileIn))), outputType = "byte", dimorder = if (linked.by == 19 | "rows") 2L:1L else 1L:2L, verbose = FALSE) 20 | 21 | readRAW_matrix(fileIn, header = TRUE, dataType = integer(), n = NULL, 22 | p = NULL, sep = "", na.strings = "NA", nColSkip = 6L, 23 | idCol = c(1L, 2L), verbose = FALSE) 24 | 25 | readRAW_big.matrix(fileIn, header = TRUE, dataType = integer(), 26 | n = NULL, p = NULL, sep = "", na.strings = "NA", nColSkip = 6L, 27 | idCol = c(1L, 2L), folderOut = paste0("BGData_", 28 | sub("\\\\.[[:alnum:]]+$", "", basename(fileIn))), outputType = "char", 29 | verbose = FALSE) 30 | } 31 | \arguments{ 32 | \item{fileIn}{ 33 | The path to the plaintext file. 
34 | } 35 | \item{header}{ 36 | Whether \code{fileIn} contains a header. Defaults to \code{TRUE}. 37 | } 38 | \item{dataType}{ 39 | The coding type of genotypes in \code{fileIn}. Use \code{integer()} or 40 | \code{double()} for numeric coding. Alpha-numeric coding is currently 41 | not supported for \code{readRAW} and \code{readRAW_big.matrix}: use the 42 | \code{--recodeA} option of PLINK to convert the .ped file into a .raw 43 | file. Defaults to \code{integer()}. 44 | } 45 | \item{n}{ 46 | The number of individuals. Auto-detect if \code{NULL}. Defaults to 47 | \code{NULL}. 48 | } 49 | \item{p}{ 50 | The number of markers. Auto-detect if \code{NULL}. Defaults to 51 | \code{NULL}. 52 | } 53 | \item{sep}{ 54 | The field separator character. Values on each line of the file are 55 | separated by this character. If \code{sep = ""} (the default for 56 | \code{readRAW}), the separator is "white space", that is one or more 57 | spaces, tabs, newlines or carriage returns. 58 | } 59 | \item{na.strings}{ 60 | The character string used in the plaintext file to denote missing 61 | values. Defaults to \code{NA}. 62 | } 63 | \item{nColSkip}{ 64 | The number of columns to be skipped to reach the genotype information 65 | in the file. Defaults to \code{6}. 66 | } 67 | \item{idCol}{ 68 | The index of the ID column. If more than one index is given, both 69 | columns will be concatenated with "_". Defaults to \code{c(1, 2)}, i.e. 70 | a concatenation of the first two columns. 71 | } 72 | \item{nNodes}{ 73 | The number of nodes to create. Auto-detect if \code{NULL}. Defaults to 74 | \code{NULL}. 75 | } 76 | \item{linked.by}{ 77 | If \code{columns} a column-linked matrix (\code{ColumnLinkedMatrix}) is 78 | created, if \code{rows} a row-linked matrix (\code{RowLinkedMatrix}). 79 | Defaults to \code{rows}. 80 | } 81 | \item{folderOut}{ 82 | The path to the folder where to save the binary files.
Defaults to the 83 | name of the input file (\code{fileIn}) without extension prefixed with 84 | "BGData_". 85 | } 86 | \item{outputType}{ 87 | The \code{vmode} for \code{ff} and \code{type} for \code{big.matrix} 88 | objects. Defaults to \code{byte} for \code{ff} and \code{char} for 89 | \code{big.matrix} objects. 90 | } 91 | \item{dimorder}{ 92 | The physical layout of the underlying \code{ff} object of each node. 93 | } 94 | \item{verbose}{ 95 | Whether progress updates will be posted. Defaults to \code{FALSE}. 96 | } 97 | } 98 | \details{ 99 | The data included in the first couple of columns (up to \code{nColSkip}) is 100 | used to populate the sample information of a \code{BGData} object, and the 101 | remaining columns are used to fill the genotypes. If the first row contains 102 | a header (\code{header = TRUE}), data in this row is used to determine the 103 | column names for sample information and genotypes. 104 | 105 | The genotypes can take several forms, depending on the function that is 106 | called (\code{readRAW}, \code{readRAW_matrix}, or 107 | \code{readRAW_big.matrix}). The following sections illustrate each function 108 | in detail. 109 | } 110 | \section{readRAW}{ 111 | Genotypes are stored in a \code{LinkedMatrix} object where each node is an 112 | \code{ff} instance. Multiple \code{ff} files are used because the array 113 | size in \code{ff} is limited to the largest integer which can be 114 | represented on the system (\code{.Machine$integer.max}) and for genetic 115 | data this limitation is often exceeded. The \code{LinkedMatrix} package 116 | makes it possible to link several \code{ff} files together by columns or by 117 | rows and treat them similarly to a single matrix. By default a 118 | \code{RowLinkedMatrix} is used for the genotypes, but the user can 119 | modify this using the \code{linked.by} argument.
The number of nodes to 120 | generate is either specified by the user using the \code{nNodes} argument 121 | or determined internally so that each \code{ff} object has a number of 122 | cells that is smaller than \code{.Machine$integer.max / 1.2}. A folder (see 123 | \code{folderOut}) that contains the binary flat files (named 124 | \code{geno_*.bin}) and an external representation of the \code{BGData} 125 | object in \code{BGData.RData} is created. 126 | } 127 | \section{readRAW_matrix}{ 128 | Genotypes are stored in a regular \code{matrix} object. Therefore, this 129 | function will only work if the .raw file is small enough to fit into 130 | memory. 131 | } 132 | \section{readRAW_big.matrix}{ 133 | Genotypes are stored in a filebacked \code{big.matrix} object. A folder 134 | (see \code{folderOut}) that contains the binary flat file (named 135 | \code{BGData.bin}), a descriptor file (named \code{BGData.desc}), and an 136 | external representation of the \code{BGData} object in \code{BGData.RData} 137 | are created. 138 | } 139 | \section{Reloading a BGData object}{ 140 | To reload a \code{BGData} object, it is recommended to use the 141 | \code{load.BGData} function instead of the \code{load} function as 142 | \code{load} does not initialize \code{ff} objects or attach 143 | \code{big.matrix} objects. 144 | } 145 | \seealso{ 146 | \code{\link[=load.BGData]{load.BGData()}} to load a previously saved 147 | \code{BGData} object, \code{\link[=as.BGData]{as.BGData()}} to create 148 | \code{BGData} objects from non-text files (e.g. .bed files). 149 | \code{\link{BGData-class}}, 150 | \code{\link[LinkedMatrix]{ColumnLinkedMatrix-class}}, 151 | \code{\link[LinkedMatrix]{RowLinkedMatrix-class}}, 152 | \code{\link[bigmemory]{big.matrix-class}}, and \code{\link[ff]{ff}} for 153 | more information on the above mentioned classes. 
154 | } 155 | \examples{ 156 | # Path to example data 157 | path <- system.file("extdata", package = "BGData") 158 | 159 | # Convert RAW files of chromosome 1 to a BGData object 160 | bg <- readRAW(fileIn = paste0(path, "/chr1.raw")) 161 | 162 | unlink("BGData_chr1", recursive = TRUE) 163 | } 164 | -------------------------------------------------------------------------------- /man/segments.Rd: -------------------------------------------------------------------------------- 1 | \name{segments} 2 | \alias{segments} 3 | \title{Find non-overlapping segments based on a summary statistic} 4 | \description{ 5 | Given a summary statistic and a threshold, this function computes the 6 | number of non-overlapping segments, each defined as a discovery (i.e., 7 | \code{statistic[i] <= threshold}) +/- a gap, in the same units as \code{bp} 8 | (often base-pair position). 9 | } 10 | \usage{ 11 | segments(statistic, chr, bp, threshold, gap, trim = FALSE, verbose = FALSE) 12 | } 13 | \arguments{ 14 | \item{statistic}{ 15 | A statistic (e.g., BFDR or p-values). 16 | } 17 | \item{chr}{ 18 | A vector containing the chromosome for each value of \code{statistic}. 19 | } 20 | \item{bp}{ 21 | A vector containing the base-pair positions for each value of 22 | \code{statistic}. 23 | } 24 | \item{threshold}{ 25 | The threshold to determine 'significance' (e.g., \code{1e-5} for 26 | p-values). 27 | } 28 | \item{gap}{ 29 | 1/2 of the length of the desired segments. 30 | } 31 | \item{trim}{ 32 | Whether to collapse segments that were artificially inflated by 33 | \code{gap}. Defaults to \code{FALSE}. 34 | } 35 | \item{verbose}{ 36 | Whether progress updates will be posted. Defaults to \code{FALSE}. 37 | } 38 | } 39 | \value{ 40 | A data frame containing the following information: 41 | \item{chr}{ 42 | Chromosome 43 | } 44 | \item{start}{ 45 | Index where segment starts within \code{statistic}. 46 | } 47 | \item{end}{ 48 | Index where segment ends within \code{statistic}.
49 | } 50 | \item{length}{ 51 | Length of segment. 52 | } 53 | \item{bpStart}{ 54 | Base-pair position where segment starts. 55 | } 56 | \item{bpEnd}{ 57 | Base-pair position where segment ends. 58 | } 59 | \item{bpLength}{ 60 | Length of segment in base-pair positions. 61 | } 62 | \item{minValue}{ 63 | Smallest value of \code{statistic} within segment. 64 | } 65 | \item{minValuePos}{ 66 | Position of variant with the smallest value of \code{statistic} within 67 | segment. 68 | } 69 | } 70 | \examples{ 71 | library(BGData) 72 | 73 | # Load example data 74 | bg <- BGData:::loadExample() 75 | 76 | # Perform GWAS 77 | pValues <- GWAS( 78 | formula = FT10 ~ 1, 79 | data = bg, 80 | method = "rayOLS" 81 | ) 82 | 83 | # Determine segments within +/- 1MB from a significant variant 84 | segments <- segments( 85 | statistic = pValues[, 4], 86 | chr = map(bg)$chromosome, 87 | bp = map(bg)$base_pair_position, 88 | threshold = 1e-5, 89 | gap = 1e6, 90 | trim = FALSE, 91 | verbose = FALSE 92 | ) 93 | } 94 | -------------------------------------------------------------------------------- /man/summarize.Rd: -------------------------------------------------------------------------------- 1 | \name{summarize} 2 | \alias{summarize} 3 | \title{Generates Various Summary Statistics} 4 | \description{ 5 | Computes the frequency of missing values, the (minor) allele frequency, and 6 | standard deviation of each column of \code{X}. 7 | } 8 | \usage{ 9 | summarize(X, i = seq_len(nrow(X)), j = seq_len(ncol(X)), 10 | chunkSize = 5000L, nCores = getOption("mc.cores", 2L), 11 | verbose = FALSE) 12 | } 13 | \arguments{ 14 | \item{X}{ 15 | A matrix-like object, typically the genotypes of a \code{BGData} 16 | object. 17 | } 18 | \item{i}{ 19 | Indicates which rows of \code{X} should be used. Can be integer, 20 | boolean, or character. By default, all rows are used. 21 | } 22 | \item{j}{ 23 | Indicates which columns of \code{X} should be used. Can be integer, 24 | boolean, or character. 
By default, all columns are used. 25 | } 26 | \item{chunkSize}{ 27 | The number of columns of \code{X} that are brought into physical memory 28 | for processing per core. If \code{NULL}, all elements in \code{j} are 29 | used. Defaults to 5000. 30 | } 31 | \item{nCores}{ 32 | The number of cores (passed to \code{mclapply}). Defaults to the 33 | number of cores as detected by \code{detectCores}. 34 | } 35 | \item{verbose}{ 36 | Whether progress updates will be posted. Defaults to \code{FALSE}. 37 | } 38 | } 39 | \value{ 40 | A \code{data.frame} with three columns: \code{freq_na} for frequencies of 41 | missing values, \code{allele_freq} for allele frequencies of the counted 42 | allele, and \code{sd} for standard deviations. 43 | } 44 | \seealso{ 45 | \code{\link{file-backed-matrices}} for more information on file-backed 46 | matrices. \code{\link{multi-level-parallelism}} for more information on 47 | multi-level parallelism. \code{\link{BGData-class}} for more information on 48 | the \code{BGData} class. 
49 | } 50 | \examples{ 51 | # Restrict number of cores to 1 on Windows 52 | if (.Platform$OS.type == "windows") { 53 | options(mc.cores = 1) 54 | } 55 | 56 | # Load example data 57 | bg <- BGData:::loadExample() 58 | 59 | # Summarize the whole dataset 60 | sum1 <- summarize(X = geno(bg)) 61 | 62 | # Summarize the first 50 individuals 63 | sum2 <- summarize(X = geno(bg), i = 1:50) 64 | 65 | # Summarize the first 100 markers (useful for distributed computing) 66 | sum3 <- summarize(X = geno(bg), j = 1:100) 67 | 68 | # Summarize the first 50 individuals on the first 100 markers 69 | sum4 <- summarize(X = geno(bg), i = 1:50, j = 1:100) 70 | 71 | # Summarize by names 72 | sum5 <- summarize(X = geno(bg), j = c("snp81233_C", "snp81234_C", "snp81235_T")) 73 | 74 | # Convert to minor allele frequencies (useful if the counted alleles are not 75 | # the minor alleles) 76 | maf <- ifelse(sum1$allele_freq > 0.5, 1 - sum1$allele_freq, sum1$allele_freq) 77 | } 78 | -------------------------------------------------------------------------------- /src/.gitignore: -------------------------------------------------------------------------------- 1 | *.o 2 | *.so 3 | *.dll 4 | -------------------------------------------------------------------------------- /src/Makevars: -------------------------------------------------------------------------------- 1 | PKG_CFLAGS = $(SHLIB_OPENMP_CFLAGS) 2 | PKG_LIBS = $(SHLIB_OPENMP_CFLAGS) 3 | -------------------------------------------------------------------------------- /src/fitLSYS.c: -------------------------------------------------------------------------------- 1 | #include "fitLSYS.h" 2 | 3 | SEXP fitLSYS(SEXP C, SEXP rhs, SEXP b, SEXP active, SEXP RSS, SEXP maxIter, SEXP tolerance) { 4 | int p = Rf_ncols(C); 5 | R_xlen_t nActive = Rf_xlength(active); 6 | int nIter = Rf_asInteger(maxIter); 7 | double tol = Rf_asReal(tolerance); 8 | double *pC = REAL(C); 9 | double *prhs = REAL(rhs); 10 | b = PROTECT(Rf_duplicate(b)); 11 | double *pb =
REAL(b); 12 | int *pactive = INTEGER(active); 13 | double oldRSS = Rf_asReal(RSS); 14 | double newRSS = oldRSS; 15 | for (int iter = 0; iter < nIter; iter++) { 16 | oldRSS = newRSS; 17 | for (int j = 0; j < nActive; j++) { // loop over active predictors 18 | int k = pactive[j]; 19 | double Ckk = pC[k * (p + 1)]; 20 | double offset = 0; 21 | for (int m = 0; m < nActive; m++) { 22 | int n = pactive[m]; 23 | offset += pC[p * k + n] * pb[n]; 24 | } 25 | offset -= Ckk * pb[k]; 26 | double rhs_offset = prhs[k] - offset; 27 | double sol = rhs_offset / Ckk; 28 | newRSS += (pow(sol, 2) - pow(pb[k], 2)) * Ckk - 2 * (sol - pb[k]) * rhs_offset; 29 | pb[k] = sol; 30 | } 31 | if (((oldRSS - newRSS) / oldRSS) < tol) { 32 | break; 33 | } 34 | } 35 | // Creating a list to return results 36 | SEXP list = PROTECT(Rf_allocVector(VECSXP, 2)); 37 | SET_VECTOR_ELT(list, 0, b); 38 | SET_VECTOR_ELT(list, 1, Rf_ScalarReal(newRSS)); 39 | UNPROTECT(2); // b, list 40 | return list; 41 | } 42 | -------------------------------------------------------------------------------- /src/fitLSYS.h: -------------------------------------------------------------------------------- 1 | #define R_NO_REMAP 2 | 3 | #include 4 | 5 | SEXP fitLSYS(SEXP C, SEXP rhs, SEXP b, SEXP active, SEXP RSS, SEXP maxIter, SEXP tolerance); 6 | -------------------------------------------------------------------------------- /src/init.c: -------------------------------------------------------------------------------- 1 | #include "summarize.h" 2 | #include "rayOLS.h" 3 | #include "preprocess.h" 4 | #include "fitLSYS.h" 5 | 6 | #include 7 | 8 | static const R_CallMethodDef callMethods[] = { 9 | {"summarize", (DL_FUNC) &summarize, 1}, 10 | {"rayOLS", (DL_FUNC) &rayOLS, 2}, 11 | {"preprocess", (DL_FUNC) &preprocess, 5}, 12 | {"fitLSYS", (DL_FUNC) &fitLSYS, 7}, 13 | {NULL, NULL, 0} 14 | }; 15 | 16 | void R_init_BGData(DllInfo *dll) { 17 | R_registerRoutines(dll, NULL, callMethods, NULL, NULL); 18 | R_useDynamicSymbols(dll, FALSE); 
19 | R_forceSymbols(dll, TRUE); 20 | } 21 | -------------------------------------------------------------------------------- /src/preprocess.c: -------------------------------------------------------------------------------- 1 | #include "preprocess.h" 2 | 3 | #ifdef _OPENMP 4 | #include 5 | #endif 6 | #include 7 | 8 | void preprocess_int(int *in, int nrows, int ncols, double *out, int center, double *centers, int computeCenters, int scale, double *scales, int computeScales, int impute, int numCores) { 9 | #pragma omp parallel for schedule(static) default(none) shared(NA_INTEGER, NA_REAL, in, nrows, ncols, out, center, centers, computeCenters, scale, scales, computeScales, impute) num_threads(numCores) 10 | for (ptrdiff_t j = 0; j < ncols; j++) { 11 | double mean; 12 | if (computeCenters || computeScales || impute) { 13 | double sum = 0; 14 | double sumsq = 0; 15 | ptrdiff_t n = 0; 16 | for (ptrdiff_t i = 0; i < nrows; i++) { 17 | int *cin = in + j * nrows + i; 18 | if (*cin != NA_INTEGER) { 19 | sum += *cin; 20 | sumsq += *cin * *cin; 21 | n++; 22 | } 23 | } 24 | mean = sum / n; 25 | if (computeCenters) { 26 | centers[j] = mean; 27 | } 28 | if (computeScales) { 29 | scales[j] = sqrt((sumsq - (sum * sum) / n) / (n - 1)); 30 | } 31 | } 32 | for (ptrdiff_t i = 0; i < nrows; i++) { 33 | int *cin = in + j * nrows + i; 34 | double *cout = out + j * nrows + i; 35 | if (*cin == NA_INTEGER) { 36 | if (impute) { 37 | if (center) { 38 | *cout = 0; 39 | } else { 40 | *cout = mean; 41 | } 42 | } else { 43 | *cout = NA_REAL; 44 | } 45 | } else { 46 | *cout = *cin; 47 | if (center) { 48 | *cout -= centers[j]; 49 | } 50 | if (scale) { 51 | *cout /= scales[j]; 52 | } 53 | } 54 | } 55 | } 56 | } 57 | 58 | void preprocess_real(double *in, int nrows, int ncols, double *out, int center, double *centers, int computeCenters, int scale, double *scales, int computeScales, int impute, int numCores) { 59 | #pragma omp parallel for schedule(static) default(none) shared(NA_REAL, in, nrows, 
ncols, out, center, centers, computeCenters, scale, scales, computeScales, impute) num_threads(numCores) 60 | for (ptrdiff_t j = 0; j < ncols; j++) { 61 | double mean; 62 | if (computeCenters || computeScales || impute) { 63 | double sum = 0; 64 | double sumsq = 0; 65 | ptrdiff_t n = 0; 66 | for (ptrdiff_t i = 0; i < nrows; i++) { 67 | double *cin = in + j * nrows + i; 68 | if (!ISNAN(*cin)) { 69 | sum += *cin; 70 | sumsq += *cin * *cin; 71 | n++; 72 | } 73 | } 74 | mean = sum / n; 75 | if (computeCenters) { 76 | centers[j] = mean; 77 | } 78 | if (computeScales) { 79 | scales[j] = sqrt((sumsq - (sum * sum) / n) / (n - 1)); 80 | } 81 | } 82 | for (ptrdiff_t i = 0; i < nrows; i++) { 83 | double *cin = in + j * nrows + i; 84 | double *cout = out + j * nrows + i; 85 | *cout = *cin; 86 | if (ISNA(*cin)) { 87 | if (impute) { 88 | if (center) { 89 | *cout = 0; 90 | } else { 91 | *cout = mean; 92 | } 93 | } 94 | } else { 95 | if (center) { 96 | *cout -= centers[j]; 97 | } 98 | if (scale) { 99 | *cout /= scales[j]; 100 | } 101 | } 102 | } 103 | } 104 | } 105 | 106 | SEXP preprocess(SEXP sIn, SEXP sCenter, SEXP sScale, SEXP sImpute, SEXP sNumCores) { 107 | int nprotect = 0; 108 | R_xlen_t length = Rf_xlength(sIn); 109 | int nrows = Rf_nrows(sIn); 110 | int ncols = Rf_ncols(sIn); 111 | int center = 0; 112 | SEXP sCenters = NULL; 113 | double *centers = NULL; 114 | int computeCenters = 0; 115 | switch(TYPEOF(sCenter)) { 116 | case LGLSXP: 117 | center = Rf_asLogical(sCenter); 118 | if (center) { 119 | sCenters = PROTECT(Rf_allocVector(REALSXP, ncols)); 120 | nprotect++; 121 | centers = REAL(sCenters); 122 | computeCenters = 1; 123 | } 124 | break; 125 | case REALSXP: 126 | center = 1; 127 | sCenters = PROTECT(Rf_duplicate(sCenter)); 128 | nprotect++; 129 | centers = REAL(sCenters); 130 | break; 131 | } 132 | int scale = 0; 133 | SEXP sScales = NULL; 134 | double *scales = NULL; 135 | int computeScales = 0; 136 | switch(TYPEOF(sScale)) { 137 | case LGLSXP: 138 | scale = 
Rf_asLogical(sScale); 139 | if (scale) { 140 | sScales = PROTECT(Rf_allocVector(REALSXP, ncols)); 141 | nprotect++; 142 | scales = REAL(sScales); 143 | computeScales = 1; 144 | } 145 | break; 146 | case REALSXP: 147 | scale = 1; 148 | sScales = PROTECT(Rf_duplicate(sScale)); 149 | nprotect++; 150 | scales = REAL(sScales); 151 | break; 152 | } 153 | int impute = Rf_asLogical(sImpute); 154 | int numCores = Rf_asInteger(sNumCores); 155 | // Allocate output vector 156 | SEXP sOut = PROTECT(Rf_allocVector(REALSXP, length)); 157 | nprotect++; 158 | switch(TYPEOF(sIn)) { 159 | case REALSXP: 160 | preprocess_real( 161 | REAL(sIn), 162 | nrows, 163 | ncols, 164 | REAL(sOut), 165 | center, 166 | centers, 167 | computeCenters, 168 | scale, 169 | scales, 170 | computeScales, 171 | impute, 172 | numCores 173 | ); 174 | break; 175 | case INTSXP: 176 | preprocess_int( 177 | INTEGER(sIn), 178 | nrows, 179 | ncols, 180 | REAL(sOut), 181 | center, 182 | centers, 183 | computeCenters, 184 | scale, 185 | scales, 186 | computeScales, 187 | impute, 188 | numCores 189 | ); 190 | break; 191 | } 192 | // Handle attributes 193 | DUPLICATE_ATTRIB(sOut, sIn); 194 | if (center) { 195 | Rf_setAttrib(sOut, Rf_install("scaled:center"), sCenters); 196 | } 197 | if (scale) { 198 | Rf_setAttrib(sOut, Rf_install("scaled:scale"), sScales); 199 | } 200 | UNPROTECT(nprotect); 201 | return sOut; 202 | } 203 | -------------------------------------------------------------------------------- /src/preprocess.h: -------------------------------------------------------------------------------- 1 | #define R_NO_REMAP 2 | 3 | #include 4 | 5 | SEXP preprocess(SEXP sIn, SEXP sCenter, SEXP sScale, SEXP sImpute, SEXP sNumCores); 6 | -------------------------------------------------------------------------------- /src/rayOLS.c: -------------------------------------------------------------------------------- 1 | #include "rayOLS.h" 2 | 3 | #include 4 | 5 | SEXP rayOLS_real(SEXP X, SEXP y) { 6 | // Get dimensions of X 7 
| int X_nrow = Rf_nrows(X); 8 | int X_ncol = Rf_ncols(X); 9 | // Check if dimensions match 10 | R_xlen_t y_length = Rf_xlength(y); 11 | if (X_nrow != y_length) { 12 | Rf_error("The number of rows in X and the length of y need to match\n"); 13 | } 14 | // Allocate output matrix 15 | SEXP out = PROTECT(Rf_allocMatrix(REALSXP, X_ncol, 6)); 16 | // Get data pointers 17 | double *X_data = REAL(X); 18 | double *y_data = REAL(y); 19 | // Iterate over columns of X 20 | for (R_xlen_t col_idx = 0; col_idx < X_ncol; col_idx++) { 21 | // Compute number of non-missing values in both x and y (n), and 22 | // Compute sum of x (xt1) for centering x, and 23 | // Compute sum of y (yt1) for centering y, and 24 | // Compute sum of products of x and y (xty) for Cov(x, y), and 25 | // Compute sum of squares of x (xtx) for Var(x), and 26 | // Compute sum of squares of y (yty) for RSS 27 | R_xlen_t n = 0; 28 | double xt1 = 0; 29 | double yt1 = 0; 30 | double xty = 0; 31 | double xtx = 0; 32 | double yty = 0; 33 | for (R_xlen_t row_idx = 0; row_idx < X_nrow; row_idx++) { 34 | double x_val = X_data[row_idx + (col_idx * X_nrow)]; 35 | if (!(ISNA(x_val) || ISNA(y_data[row_idx]))) { 36 | n++; 37 | xt1 += x_val; 38 | yt1 += y_data[row_idx]; 39 | xty += x_val * y_data[row_idx]; 40 | xtx += x_val * x_val; 41 | yty += y_data[row_idx] * y_data[row_idx]; 42 | } 43 | } 44 | // Center xty, xtx, and yty 45 | xty -= (xt1 * yt1) / n; 46 | xtx -= (xt1 * xt1) / n; 47 | yty -= (yt1 * yt1) / n; 48 | // Compute beta_1 as Cov(x, y) / Var(x) 49 | // For centered data, beta_0 will be 0: mean(y) - beta_1 * mean(x) 50 | double beta_1 = xty / xtx; 51 | // Compute remaining statistics 52 | double rss = yty - (xtx * pow(beta_1, 2)); 53 | double se = sqrt((rss / (n - 2)) / xtx); 54 | double z_stat = beta_1 / se; 55 | double p_value = Rf_pt(fabs(z_stat), n - 2, 0, 0) * 2; 56 | double allele_freq = xt1 / n / 2; 57 | // Write results 58 | REAL(out)[col_idx] = beta_1; 59 | REAL(out)[col_idx + X_ncol] = se; 60 | 
REAL(out)[col_idx + (2 * X_ncol)] = z_stat; 61 | REAL(out)[col_idx + (3 * X_ncol)] = p_value; 62 | REAL(out)[col_idx + (4 * X_ncol)] = n; 63 | REAL(out)[col_idx + (5 * X_ncol)] = allele_freq; 64 | } 65 | UNPROTECT(1); 66 | return out; 67 | } 68 | 69 | SEXP rayOLS_integer(SEXP X, SEXP y) { 70 | // Get dimensions of X 71 | int X_nrow = Rf_nrows(X); 72 | int X_ncol = Rf_ncols(X); 73 | // Check if dimensions match 74 | R_xlen_t y_length = Rf_xlength(y); 75 | if (X_nrow != y_length) { 76 | Rf_error("The number of rows in X and the length of y need to match\n"); 77 | } 78 | // Allocate output matrix 79 | SEXP out = PROTECT(Rf_allocMatrix(REALSXP, X_ncol, 6)); 80 | // Get data pointers 81 | int *X_data = INTEGER(X); 82 | double *y_data = REAL(y); 83 | // Iterate over columns of X 84 | for (R_xlen_t col_idx = 0; col_idx < X_ncol; col_idx++) { 85 | // Compute number of non-missing values in both x and y (n), and 86 | // Compute sum of x (xt1) for centering x, and 87 | // Compute sum of y (yt1) for centering y, and 88 | // Compute sum of products of x and y (xty) for Cov(x, y), and 89 | // Compute sum of squares of x (xtx) for Var(x), and 90 | // Compute sum of squares of y (yty) for RSS 91 | R_xlen_t n = 0; 92 | double xt1 = 0; 93 | double yt1 = 0; 94 | double xty = 0; 95 | double xtx = 0; 96 | double yty = 0; 97 | for (R_xlen_t row_idx = 0; row_idx < X_nrow; row_idx++) { 98 | int x_val = X_data[row_idx + (col_idx * X_nrow)]; 99 | if (!(x_val == NA_INTEGER || ISNA(y_data[row_idx]))) { 100 | n++; 101 | xt1 += x_val; 102 | yt1 += y_data[row_idx]; 103 | xty += x_val * y_data[row_idx]; 104 | xtx += (double) x_val * x_val; // promote first: int * int overflows for |x_val| > 46340 105 | yty += y_data[row_idx] * y_data[row_idx]; 106 | } 107 | } 108 | // Center xty, xtx, and yty 109 | xty -= (xt1 * yt1) / n; 110 | xtx -= (xt1 * xt1) / n; 111 | yty -= (yt1 * yt1) / n; 112 | // Compute beta_1 as Cov(x, y) / Var(x) 113 | // For centered data, beta_0 will be 0: mean(y) - beta_1 * mean(x) 114 | double beta_1 = xty / xtx; 115 | // Compute remaining
statistics 116 | double rss = yty - (xtx * pow(beta_1, 2)); 117 | double se = sqrt((rss / (n - 2)) / xtx); 118 | double z_stat = beta_1 / se; 119 | double p_value = Rf_pt(fabs(z_stat), n - 2, 0, 0) * 2; 120 | double allele_freq = xt1 / n / 2; 121 | // Write results 122 | REAL(out)[col_idx] = beta_1; 123 | REAL(out)[col_idx + X_ncol] = se; 124 | REAL(out)[col_idx + (2 * X_ncol)] = z_stat; 125 | REAL(out)[col_idx + (3 * X_ncol)] = p_value; 126 | REAL(out)[col_idx + (4 * X_ncol)] = n; 127 | REAL(out)[col_idx + (5 * X_ncol)] = allele_freq; 128 | } 129 | UNPROTECT(1); 130 | return out; 131 | } 132 | 133 | SEXP rayOLS(SEXP X, SEXP y) { 134 | // Dispatch to real or integer function 135 | // TODO: Macro-based generics 136 | switch (TYPEOF(X)) { 137 | case REALSXP: 138 | return rayOLS_real(X, y); 139 | break; 140 | case INTSXP: 141 | return rayOLS_integer(X, y); 142 | break; 143 | default: 144 | Rf_error("x needs to be a numeric vector"); 145 | break; 146 | } 147 | } 148 | -------------------------------------------------------------------------------- /src/rayOLS.h: -------------------------------------------------------------------------------- 1 | #define R_NO_REMAP 2 | 3 | #include 4 | 5 | SEXP rayOLS(SEXP X, SEXP y); 6 | -------------------------------------------------------------------------------- /src/summarize.c: -------------------------------------------------------------------------------- 1 | #include "summarize.h" 2 | 3 | #include 4 | 5 | SEXP summarize_real(SEXP X) { 6 | // Get dimensions of X 7 | int nrow = Rf_nrows(X); 8 | int ncol = Rf_ncols(X); 9 | // Get data pointer 10 | double *X_data = REAL(X); 11 | // Allocate output matrix 12 | SEXP out = PROTECT(Rf_allocMatrix(REALSXP, ncol, 3)); 13 | // Iterate over columns of X 14 | int col_idx = 0; 15 | int row_idx = 0; 16 | for (col_idx = 0; col_idx < ncol; col_idx++) { 17 | // Compute number of non-missing values (n), and 18 | // Compute column sum (xt1), and 19 | // Compute column sum of squares (xtx) 20 | 
R_xlen_t n = 0; 21 | double xt1 = 0; 22 | double xtx = 0; 23 | for (row_idx = 0; row_idx < nrow; row_idx++) { 24 | double x_val = X_data[row_idx + ((R_xlen_t) col_idx * nrow)]; // widen before multiplying: int * int overflows past 2^31 - 1 elements 25 | if (!ISNA(x_val)) { 26 | n++; 27 | xt1 += x_val; 28 | xtx += x_val * x_val; 29 | } 30 | } 31 | double freq_na; 32 | double allele_freq; 33 | double sd; 34 | if (n) { 35 | // Center xtx 36 | xtx -= (xt1 * xt1) / n; 37 | // Compute summary statistics 38 | freq_na = (nrow - n) / (double) nrow; 39 | allele_freq = xt1 / n / 2; 40 | sd = sqrt(xtx / (n - 1)); 41 | } else { 42 | freq_na = 1; 43 | allele_freq = NA_REAL; 44 | sd = NA_REAL; 45 | } 46 | // Write results into output matrix 47 | REAL(out)[col_idx] = freq_na; 48 | REAL(out)[col_idx + ncol] = allele_freq; 49 | REAL(out)[col_idx + (2 * ncol)] = sd; 50 | } 51 | UNPROTECT(1); 52 | return out; 53 | } 54 | 55 | SEXP summarize_integer(SEXP X) { 56 | // Get dimensions of X 57 | int nrow = Rf_nrows(X); 58 | int ncol = Rf_ncols(X); 59 | // Get data pointer 60 | int *X_data = INTEGER(X); 61 | // Allocate output matrix 62 | SEXP out = PROTECT(Rf_allocMatrix(REALSXP, ncol, 3)); 63 | // Iterate over columns of X 64 | int col_idx = 0; 65 | int row_idx = 0; 66 | for (col_idx = 0; col_idx < ncol; col_idx++) { 67 | // Compute number of non-missing values (n), and 68 | // Compute column sum (xt1), and 69 | // Compute column sum of squares (xtx) 70 | R_xlen_t n = 0; 71 | double xt1 = 0; 72 | double xtx = 0; 73 | for (row_idx = 0; row_idx < nrow; row_idx++) { 74 | int x_val = X_data[row_idx + ((R_xlen_t) col_idx * nrow)]; // widen before multiplying: int * int overflows past 2^31 - 1 elements 75 | if (x_val != NA_INTEGER) { 76 | n++; 77 | xt1 += x_val; 78 | xtx += (double) x_val * x_val; // promote first: int * int overflows for |x_val| > 46340 79 | } 80 | } 81 | double freq_na; 82 | double allele_freq; 83 | double sd; 84 | if (n) { 85 | // Center xtx 86 | xtx -= (xt1 * xt1) / n; 87 | // Compute summary statistics 88 | freq_na = (nrow - n) / (double) nrow; 89 | allele_freq = xt1 / n / 2; 90 | sd = sqrt(xtx / (n - 1)); 91 | } else { 92 | freq_na = 1; 93 | allele_freq = NA_REAL; 94 | sd = NA_REAL; 95 | } 96 | //
Write results into output matrix 97 | REAL(out)[col_idx] = freq_na; 98 | REAL(out)[col_idx + ncol] = allele_freq; 99 | REAL(out)[col_idx + (2 * ncol)] = sd; 100 | } 101 | UNPROTECT(1); 102 | return out; 103 | } 104 | 105 | SEXP summarize(SEXP X) { 106 | // Dispatch to real or integer function 107 | // TODO: Macro-based generics 108 | switch (TYPEOF(X)) { 109 | case REALSXP: 110 | return summarize_real(X); 111 | break; 112 | case INTSXP: 113 | return summarize_integer(X); 114 | break; 115 | default: 116 | Rf_error("X needs to be a numeric matrix"); 117 | break; 118 | } 119 | } 120 | -------------------------------------------------------------------------------- /src/summarize.h: -------------------------------------------------------------------------------- 1 | #define R_NO_REMAP 2 | 3 | #include 4 | 5 | SEXP summarize(SEXP X); 6 | -------------------------------------------------------------------------------- /tests/testthat.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | library(BGData) 3 | 4 | test_check("BGData") 5 | -------------------------------------------------------------------------------- /tests/testthat/helper-utils.R: -------------------------------------------------------------------------------- 1 | library(parallel) 2 | 3 | testDir <- function() { 4 | paste0(tempdir(), "/BGData-", BGData:::randomString(), "/") 5 | } 6 | 7 | hasCores <- function(numCores) { 8 | # For CRAN 9 | if (Sys.getenv("_R_CHECK_LIMIT_CORES_") == TRUE || numCores > parallel::detectCores()) { 10 | skip("Not enough cores or number of cores capped for CRAN submission checks.") 11 | } 12 | # For WinBuilder 13 | if (.Platform$OS.type == "windows" && numCores > 1) { 14 | skip("mc.cores > 1 is not supported on Windows.") 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /tests/testthat/test-BGData.R: 
-------------------------------------------------------------------------------- 1 | context("BGData") 2 | 3 | # Create dummy path 4 | testPath <- paste0(tempdir(), "/BGData-", BGData:::randomString(), "/") 5 | dir.create(testPath) 6 | 7 | restoreGenotypes <- function() { 8 | set.seed(4711) 9 | data <- sample(c(1, 2, 3, 4), size = nRows * nCols, replace = TRUE) 10 | set.seed(NULL) 11 | genotypes <- matrix(data = data, nrow = nRows, ncol = nCols) 12 | rownames(genotypes) <- paste0("1_", seq_len(nRows)) 13 | colnames(genotypes) <- paste0("mrk_", seq_len(nCols)) 14 | return(genotypes) 15 | } 16 | 17 | # Create example .raw files 18 | pedPath <- paste0(testPath, "ped-", BGData:::randomString(), ".txt") 19 | nRows <- 3 20 | nCols <- 3 21 | phenotypes <- data.frame(FID = c("1", "1", "1"), IID = c("1", "2", "3"), 22 | PAT = c("NA", "NA", "NA"), MAT = c("NA", "NA", "NA"), SEX = c("NA", "NA", "NA"), 23 | PHENOTYPE = c("NA", "NA", "NA"), stringsAsFactors = FALSE) 24 | phenotypes[] <- lapply(phenotypes, type.convert, as.is = TRUE) 25 | rownames(phenotypes) <- paste0("1_", 1:3) 26 | genotypes <- restoreGenotypes() 27 | ped <- cbind(phenotypes, genotypes) 28 | outFile <- file(pedPath, "w") 29 | write.table(ped, file = outFile, quote = FALSE, row.names = FALSE) 30 | close(outFile) 31 | 32 | 33 | context("initialize") 34 | 35 | test_that("it requires at least geno", { 36 | expect_error(BGData()) 37 | }) 38 | 39 | test_that("it checks if pheno is a data.frame", { 40 | expect_error(BGData(geno = genotypes, pheno = rownames(genotypes))) 41 | }) 42 | 43 | test_that("it checks if map is a data.frame", { 44 | expect_error(BGData(geno = genotypes, map = colnames(genotypes))) 45 | }) 46 | 47 | test_that("it checks if the number of rows of geno match with the number of rows of pheno", { 48 | expect_error(BGData(geno = genotypes, pheno = phenotypes[-1, ])) 49 | }) 50 | 51 | test_that("it checks if the number of columns of geno match with the number of rows of map", { 52 | map <-
data.frame(mrk = colnames(genotypes)) 53 | expect_error(BGData(geno = genotypes, map = map[-1, , drop = FALSE])) # drop = FALSE keeps map a data.frame so the dimension check is what fails 54 | }) 55 | 56 | test_that("it checks if the rownames of geno are unique", { 57 | rownames(genotypes) <- c("1_1", "1_2", "1_2") 58 | expect_error(BGData(geno = genotypes)) 59 | genotypes <- restoreGenotypes() 60 | }) 61 | 62 | test_that("it checks if the colnames of geno are unique", { 63 | colnames(genotypes) <- c("mrk_1", "mrk_2", "mrk_2") 64 | expect_error(BGData(geno = genotypes)) 65 | genotypes <- restoreGenotypes() 66 | }) 67 | 68 | test_that("it warns if the row names of pheno do not match the row names of geno", { 69 | expect_warning(BGData(geno = genotypes, pheno = phenotypes[nrow(phenotypes):1, ])) 70 | }) 71 | 72 | test_that("it warns if the row names of map do not match the columns names of geno", { 73 | map <- data.frame(mrk = rev(colnames(genotypes))) 74 | expect_warning(BGData(geno = genotypes, map = map)) 75 | }) 76 | 77 | test_that("it generates a sequence as rownames for pheno if geno does not have rownames", { 78 | rownames(genotypes) <- NULL 79 | DATA <- BGData(geno = genotypes) 80 | expect_equal(rownames(pheno(DATA)), paste0("sample_", seq_len(nrow(pheno(DATA))))) 81 | genotypes <- restoreGenotypes() 82 | }) 83 | 84 | test_that("it generates a sequence as rownames for map if geno does not have colnames", { 85 | colnames(genotypes) <- NULL 86 | DATA <- BGData(geno = genotypes) 87 | expect_equal(rownames(map(DATA)), paste0("variant_", seq_len(nrow(map(DATA))))) 88 | genotypes <- restoreGenotypes() 89 | }) 90 | 91 | 92 | context("readRAW") 93 | 94 | test_that("it complains if folderOut already exists", { 95 | dirExistsPath <- paste0(testPath, "dirExists") 96 | dir.create(dirExistsPath, showWarnings = FALSE) 97 | expect_error(readRAW(fileIn = pedPath, n = nRows, folderOut = dirExistsPath)) 98 | }) 99 | 100 | 101 | test_that("it reads .raw files into BGData objects", { 102 | 103 | # With minimum number of parameters (with exception of folderOut) 104 |
BGData <- readRAW(fileIn = pedPath, folderOut = paste0(testPath, "test-", BGData:::randomString())) 105 | expect_equal(pheno(BGData), phenotypes) 106 | expect_equivalent(geno(BGData)[], genotypes) 107 | 108 | # With n 109 | BGData <- readRAW(fileIn = pedPath, n = nRows, folderOut = paste0(testPath, "test-", BGData:::randomString())) 110 | expect_equal(pheno(BGData), phenotypes) 111 | expect_equivalent(geno(BGData)[], genotypes) 112 | 113 | # With p 114 | BGData <- readRAW(fileIn = pedPath, p = nCols, folderOut = paste0(testPath, "test-", BGData:::randomString())) 115 | expect_equal(pheno(BGData), phenotypes) 116 | expect_equivalent(geno(BGData)[], genotypes) 117 | 118 | # With both n and p 119 | BGData <- readRAW(fileIn = pedPath, n = nRows, p = nCols, folderOut = paste0(testPath, "test-", BGData:::randomString())) 120 | expect_equal(pheno(BGData), phenotypes) 121 | expect_equivalent(geno(BGData)[], genotypes) 122 | 123 | # As integer 124 | class(genotypes) <- "integer" 125 | BGData <- readRAW(fileIn = pedPath, dataType = integer(), folderOut = paste0(testPath, "test-", BGData:::randomString())) 126 | expect_equivalent(geno(BGData)[], genotypes) 127 | BGData <- readRAW(fileIn = pedPath, dataType = "integer", folderOut = paste0(testPath, "test-", BGData:::randomString())) 128 | expect_equivalent(geno(BGData)[], genotypes) 129 | genotypes <- restoreGenotypes() 130 | 131 | # As double 132 | class(genotypes) <- "double" 133 | BGData <- readRAW(fileIn = pedPath, dataType = double(), folderOut = paste0(testPath, "test-", BGData:::randomString())) 134 | expect_equivalent(geno(BGData)[], genotypes) 135 | BGData <- readRAW(fileIn = pedPath, dataType = "double", folderOut = paste0(testPath, "test-", BGData:::randomString())) 136 | expect_equivalent(geno(BGData)[], genotypes) 137 | genotypes <- restoreGenotypes() 138 | 139 | # As character 140 | expect_error(readRAW(fileIn = pedPath, dataType = character(), folderOut = paste0(testPath, "test-", BGData:::randomString()))) 141 
| expect_error(readRAW(fileIn = pedPath, dataType = "character", folderOut = paste0(testPath, "test-", BGData:::randomString()))) 142 | 143 | }) 144 | 145 | 146 | context("readRAW_matrix") 147 | 148 | test_that("it reads a .raw file into a matrix object", { 149 | 150 | # With minimum number of parameters (with exception of folderOut) 151 | BGData <- readRAW_matrix(fileIn = pedPath) 152 | expect_equal(pheno(BGData), phenotypes) 153 | expect_equal(geno(BGData)[], genotypes) 154 | 155 | # With n 156 | BGData <- readRAW_matrix(fileIn = pedPath, n = nRows) 157 | expect_equal(pheno(BGData), phenotypes) 158 | expect_equal(geno(BGData)[], genotypes) 159 | 160 | # With p 161 | BGData <- readRAW_matrix(fileIn = pedPath, p = nCols) 162 | expect_equal(pheno(BGData), phenotypes) 163 | expect_equal(geno(BGData)[], genotypes) 164 | 165 | # With both n and p 166 | BGData <- readRAW_matrix(fileIn = pedPath, n = nRows, p = nCols) 167 | expect_equal(pheno(BGData), phenotypes) 168 | expect_equal(geno(BGData)[], genotypes) 169 | 170 | # As integer 171 | class(genotypes) <- "integer" 172 | BGData <- readRAW_matrix(fileIn = pedPath, dataType = integer()) 173 | expect_equal(geno(BGData)[], genotypes) 174 | BGData <- readRAW_matrix(fileIn = pedPath, dataType = "integer") 175 | expect_equal(geno(BGData)[], genotypes) 176 | genotypes <- restoreGenotypes() 177 | 178 | # As double 179 | class(genotypes) <- "double" 180 | BGData <- readRAW_matrix(fileIn = pedPath, dataType = double()) 181 | expect_equal(geno(BGData)[], genotypes) 182 | BGData <- readRAW_matrix(fileIn = pedPath, dataType = "double") 183 | expect_equal(geno(BGData)[], genotypes) 184 | genotypes <- restoreGenotypes() 185 | 186 | # As character 187 | class(genotypes) <- "character" 188 | BGData <- readRAW_matrix(fileIn = pedPath, dataType = character()) 189 | expect_equal(geno(BGData)[], genotypes) 190 | BGData <- readRAW_matrix(fileIn = pedPath, dataType = "character") 191 | expect_equal(geno(BGData)[], genotypes) 192 | genotypes 
<- restoreGenotypes() 193 | 194 | }) 195 | 196 | context("readRAW_big.matrix") 197 | 198 | test_that("it reads a .raw file into a big.matrix object", { 199 | 200 | # With minimum number of parameters (with exception of folderOut) 201 | BGData <- readRAW_big.matrix(fileIn = pedPath, folderOut = paste0(testPath, "test-", BGData:::randomString())) 202 | expect_equal(pheno(BGData), phenotypes) 203 | expect_equal(geno(BGData)[], genotypes) 204 | 205 | # With n 206 | BGData <- readRAW_big.matrix(fileIn = pedPath, n = nRows, folderOut = paste0(testPath, "test-", BGData:::randomString())) 207 | expect_equal(pheno(BGData), phenotypes) 208 | expect_equal(geno(BGData)[], genotypes) 209 | 210 | # With p 211 | BGData <- readRAW_big.matrix(fileIn = pedPath, p = nCols, folderOut = paste0(testPath, "test-", BGData:::randomString())) 212 | expect_equal(pheno(BGData), phenotypes) 213 | expect_equal(geno(BGData)[], genotypes) 214 | 215 | # With both n and p 216 | BGData <- readRAW_big.matrix(fileIn = pedPath, n = nRows, p = nCols, folderOut = paste0(testPath, "test-", BGData:::randomString())) 217 | expect_equal(pheno(BGData), phenotypes) 218 | expect_equal(geno(BGData)[], genotypes) 219 | 220 | # As integer 221 | class(genotypes) <- "integer" 222 | BGData <- readRAW_big.matrix(fileIn = pedPath, dataType = integer(), folderOut = paste0(testPath, "test-", BGData:::randomString())) 223 | expect_equal(geno(BGData)[], genotypes) 224 | BGData <- readRAW_big.matrix(fileIn = pedPath, dataType = "integer", folderOut = paste0(testPath, "test-", BGData:::randomString())) 225 | expect_equal(geno(BGData)[], genotypes) 226 | genotypes <- restoreGenotypes() 227 | 228 | # As double 229 | class(genotypes) <- "double" 230 | BGData <- readRAW_big.matrix(fileIn = pedPath, dataType = double(), folderOut = paste0(testPath, "test-", BGData:::randomString())) 231 | expect_equal(geno(BGData)[], genotypes) 232 | BGData <- readRAW_big.matrix(fileIn = pedPath, dataType = "double", folderOut = paste0(testPath, 
"test-", BGData:::randomString())) 233 | expect_equal(geno(BGData)[], genotypes) 234 | genotypes <- restoreGenotypes() 235 | 236 | # As character 237 | expect_error(readRAW(fileIn = pedPath, dataType = character(), folderOut = paste0(testPath, "test-", BGData:::randomString()))) 238 | expect_error(readRAW(fileIn = pedPath, dataType = "character", folderOut = paste0(testPath, "test-", BGData:::randomString()))) 239 | 240 | }) 241 | 242 | context("load.BGData") 243 | 244 | test_that("it loads BGData objects created by readRAW", { 245 | 246 | # Create dummy BGData object without returning data 247 | path <- paste0(testPath, "test-", BGData:::randomString()) 248 | readRAW(fileIn = pedPath, folderOut = path) 249 | expect_true(!("BGData" %in% ls())) 250 | 251 | # Append BGData.RData to path 252 | path <- paste0(path, "/", "BGData.RData") 253 | 254 | # Load BGData object and test if all nodes have been opened 255 | load.BGData(path) 256 | expect_true("BGData" %in% ls()) 257 | for (node in seq_len(LinkedMatrix::nNodes(geno(BGData)))) { 258 | expect_true(ff::is.open(geno(BGData)[[node]])) 259 | } 260 | expect_equal(dim(geno(BGData)), c(nRows, nCols)) 261 | 262 | }) 263 | 264 | test_that("it loads BGData objects created by readRAW_matrix", { 265 | 266 | # Create dummy BGData object 267 | path <- paste0(testPath, "test-", BGData:::randomString(), "/", "BGData.RData") 268 | dir.create(dirname(path)) 269 | BGData <- readRAW_matrix(fileIn = pedPath) 270 | save(BGData, file = path) 271 | rm(BGData) 272 | expect_true(!("BGData" %in% ls())) 273 | 274 | # Load BGData object 275 | load.BGData(path) 276 | expect_true("BGData" %in% ls()) 277 | expect_equal(dim(geno(BGData)), c(nRows, nCols)) 278 | 279 | }) 280 | 281 | test_that("it loads BGData objects created by readRAW_big.matrix", { 282 | 283 | # Create dummy BGData object 284 | path <- paste0(testPath, "test-", BGData:::randomString()) 285 | readRAW_big.matrix(fileIn = pedPath, dataType = integer(), folderOut = path) 286 | 
expect_true(!("BGData" %in% ls())) 287 | 288 | # Append BGData.RData to path 289 | path <- paste0(path, "/", "BGData.RData") 290 | 291 | # Load BGData object 292 | load.BGData(path) 293 | expect_true("BGData" %in% ls()) 294 | expect_equal(dim(geno(BGData)), c(nRows, nCols)) 295 | 296 | }) 297 | 298 | test_that("it loads BGData objects containing a BEDMatrix object", { 299 | 300 | # Create dummy objects 301 | bedMatrix <- BEDMatrix::BEDMatrix(system.file("extdata", "chr1.bed", package = "BGData")) 302 | bedDims <- dim(bedMatrix) 303 | bedDNames <- dimnames(bedMatrix) 304 | bedRow <- bedMatrix[1, ] 305 | BGData <- BGData(geno = bedMatrix) 306 | 307 | # Save BGData object 308 | path <- paste0(testPath, "test-", BGData:::randomString(), "/", "BGData.RData") 309 | dir.create(dirname(path)) 310 | save(BGData, file = path) 311 | rm(BGData) 312 | expect_true(!("BGData" %in% ls())) 313 | 314 | # Load BGData object 315 | load.BGData(path) 316 | expect_true("BGData" %in% ls()) 317 | expect_equal(dim(geno(BGData)), bedDims) 318 | expect_equal(dimnames(geno(BGData)), bedDNames) 319 | expect_equal(geno(BGData)[1, ], bedRow) 320 | 321 | }) 322 | 323 | context("as.BGData") 324 | 325 | test_that("it converts a regular BEDMatrix object to a BGData object", { 326 | bedMatrix <- BEDMatrix::BEDMatrix(system.file("extdata", "chr1.bed", package = "BGData")) 327 | bgData <- as.BGData(bedMatrix) 328 | expect_is(bgData, "BGData") 329 | expect_equal(dim(geno(bgData)), dim(bedMatrix)) 330 | expect_equal(nrow(pheno(bgData)), nrow(bedMatrix)) 331 | expect_equal(rownames(pheno(bgData)), rownames(bedMatrix)) 332 | expect_equal(nrow(map(bgData)), ncol(bedMatrix)) 333 | expect_equal(rownames(map(bgData)), colnames(bedMatrix)) 334 | }) 335 | 336 | test_that("it converts a BEDMatrix object created with the n parameter to a BGData object", { 337 | bedMatrix <- BEDMatrix::BEDMatrix(system.file("extdata", "chr1.bed", package = "BGData"), n = 199) 338 | bgData <- as.BGData(bedMatrix) 339 | 
expect_is(bgData, "BGData") 340 | expect_equal(dim(geno(bgData)), dim(bedMatrix)) 341 | expect_equal(nrow(pheno(bgData)), nrow(bedMatrix)) 342 | expect_equal(nrow(map(bgData)), ncol(bedMatrix)) 343 | expect_equal(rownames(map(bgData)), colnames(bedMatrix)) 344 | }) 345 | 346 | test_that("it converts a BEDMatrix object created with the p parameter to a BGData object", { 347 | bedMatrix <- BEDMatrix::BEDMatrix(system.file("extdata", "chr1.bed", package = "BGData"), p = 300) 348 | bgData <- as.BGData(bedMatrix) 349 | expect_is(bgData, "BGData") 350 | expect_equal(dim(geno(bgData)), dim(bedMatrix)) 351 | expect_equal(nrow(pheno(bgData)), nrow(bedMatrix)) 352 | expect_equal(rownames(pheno(bgData)), rownames(bedMatrix)) 353 | expect_equal(nrow(map(bgData)), ncol(bedMatrix)) 354 | }) 355 | 356 | test_that("it converts a BEDMatrix object created with the n and p parameters to a BGData object", { 357 | bedMatrix <- BEDMatrix::BEDMatrix(system.file("extdata", "chr1.bed", package = "BGData"), n = 199, p = 300) 358 | bgData <- as.BGData(bedMatrix) 359 | expect_is(bgData, "BGData") 360 | expect_equal(dim(geno(bgData)), dim(bedMatrix)) 361 | expect_equal(nrow(pheno(bgData)), nrow(bedMatrix)) 362 | expect_equal(nrow(map(bgData)), ncol(bedMatrix)) 363 | }) 364 | 365 | test_that("it throws an error if an alternate phenotype file does not exist when converting a BEDMatrix object to a BGData object", { 366 | bedMatrix <- BEDMatrix::BEDMatrix(system.file("extdata", "chr1.bed", package = "BGData")) 367 | expect_error(as.BGData(bedMatrix, alternatePhenotypeFile = "NOT_FOUND")) 368 | }) 369 | 370 | test_that("it reads an alternate phenotype file when converting a BEDMatrix object to a BGData object", { 371 | bedMatrix <- BEDMatrix::BEDMatrix(system.file("extdata", "chr1.bed", package = "BGData")) 372 | bgData <- as.BGData(bedMatrix, alternatePhenotypeFile = system.file("extdata", "pheno.txt", package = "BGData")) 373 | expect_is(bgData, "BGData") 374 | # Test if pheno has an extra column 
for the phenotype 375 | expect_equal(ncol(pheno(bgData)), 7) 376 | # Test merging and NA handling 377 | expect_equal(pheno(bgData)[1, 7], 57.0) 378 | expect_equal(nrow(pheno(bgData)), nrow(geno(bgData))) 379 | expect_true(all(is.na(pheno(bgData)[c(178, 180, 189, 190, 196), 7]))) 380 | # Test if rownames are retained 381 | expect_equal(rownames(pheno(bgData)), rownames(bedMatrix)) 382 | }) 383 | -------------------------------------------------------------------------------- /tests/testthat/test-GWAS.R: -------------------------------------------------------------------------------- 1 | context("GWAS") 2 | 3 | set.seed(1) 4 | 5 | nRows <- 15 6 | nCols <- 50 7 | percentNA <- 0.1 8 | 9 | lm_test <- function(X, y, covariates = NULL) { 10 | res <- apply(X, 2, function(x) { 11 | data <- data.frame( 12 | y = y, 13 | x = x 14 | ) 15 | if (!is.null(covariates)) { 16 | data <- cbind(data, covariates) 17 | } 18 | fm <- lm(y ~ ., data = data) 19 | coefficients(summary(fm))[2, ] 20 | }) 21 | res <- t(res) 22 | rownames(res) <- colnames(X) 23 | return(res) 24 | } 25 | 26 | lsfit_test <- function(X, y, covariates = NULL) { 27 | res <- apply(X, 2, function(x) { 28 | fm <- lsfit(x = cbind(x, covariates), y = y) 29 | ls.print(fm, print.it = FALSE)$coef.table[[1]][2, ] 30 | }) 31 | res <- t(res) 32 | rownames(res) <- colnames(X) 33 | return(res) 34 | } 35 | 36 | test_that("GWAS without covariates", { 37 | 38 | for (mode in c("integer", "double")) { 39 | 40 | X <- matrix(data = rnorm(nRows * nCols, sd = 100), nrow = nRows, ncol = nCols) 41 | X[sample(seq_along(X), size = ceiling(length(X) * percentNA))] <- NA 42 | storage.mode(X) <- mode 43 | 44 | y <- rnorm(nRows, sd = 100) 45 | y[sample(seq_along(y), size = ceiling(length(y) * percentNA))] <- NA 46 | 47 | lsfit_res <- suppressWarnings(lsfit_test(X, y)) 48 | lm_res <- lm_test(X, y) 49 | 50 | DATA <- BGData(geno = X, pheno = data.frame( 51 | y = y 52 | )) 53 | 54 | for (method in c("rayOLS", "lsfit", "lm")) { 55 | 56 | for (nCores in 
seq_len(2)) { 57 | 58 | hasCores(nCores) 59 | 60 | GWAS_res <- suppressWarnings(GWAS(formula = y ~ 1, data = DATA, method = method, nCores = nCores)) 61 | 62 | expect_equivalent(GWAS_res[, 1:4], lsfit_res) 63 | expect_equivalent(GWAS_res[, 1:4], lm_res) 64 | 65 | } 66 | 67 | } 68 | 69 | } 70 | 71 | }) 72 | 73 | test_that("GWAS with covariates", { 74 | 75 | for (mode in c("integer", "double")) { 76 | 77 | X <- matrix(data = rnorm(nRows * nCols, sd = 100), nrow = nRows, ncol = nCols) 78 | 79 | PCs <- svd(X, nu = 2, nv = 0)$u 80 | colnames(PCs) <- c("pc1", "pc2") 81 | PCs[sample(seq_along(PCs), size = ceiling(length(PCs) * percentNA))] <- NA 82 | 83 | X[sample(seq_along(X), size = ceiling(length(X) * percentNA))] <- NA 84 | storage.mode(X) <- mode 85 | 86 | y <- rnorm(nRows, sd = 100) 87 | y[sample(seq_along(y), size = ceiling(length(y) * percentNA))] <- NA 88 | 89 | lsfit_res <- suppressWarnings(lsfit_test(X, y, PCs)) 90 | lm_res <- lm_test(X, y, PCs) 91 | 92 | DATA <- BGData(geno = X, pheno = data.frame( 93 | y = y, 94 | pc1 = PCs[, 1], 95 | pc2 = PCs[, 2] 96 | )) 97 | 98 | for (method in c("lsfit", "lm")) { 99 | 100 | for (nCores in seq_len(2)) { 101 | 102 | hasCores(nCores) 103 | 104 | GWAS_res <- suppressWarnings(GWAS(formula = y ~ pc1 + pc2, data = DATA, method = method, nCores = nCores)) 105 | 106 | expect_equivalent(GWAS_res, lsfit_res) 107 | expect_equivalent(GWAS_res, lm_res) 108 | 109 | } 110 | 111 | } 112 | 113 | } 114 | 115 | }) 116 | -------------------------------------------------------------------------------- /tests/testthat/test-chunkedApply.R: -------------------------------------------------------------------------------- 1 | context("chunkedApply") 2 | 3 | set.seed(1) 4 | 5 | nRows <- 5 6 | nCols <- 10 7 | nNAs <- 5 8 | 9 | X <- matrix(data = rnorm(nRows * nCols, sd = 100), nrow = nRows, ncol = nCols) 10 | X[sample(1:length(X), size = nNAs)] <- NA 11 | 12 | test_that("chunkedMap", { 13 | 14 | for (nCores in seq_len(2)) { 15 | 16 | 
hasCores(nCores) 17 | 18 | for (chunkSize in c(5, 10)) { 19 | 20 | expect_equal(unlist(chunkedMap(X = X, FUN = rowSums, chunkBy = 1, chunkSize = chunkSize, nCores = nCores)), rowSums(X)) 21 | expect_equal(unlist(chunkedMap(X = X, FUN = colSums, chunkSize = chunkSize, nCores = nCores)), colSums(X)) 22 | 23 | expect_equal(unlist(chunkedMap(X = X, FUN = rowSums, chunkBy = 1, i = c(1, 3), chunkSize = chunkSize, nCores = nCores)), rowSums(X[c(1, 3), ])) 24 | expect_equal(unlist(chunkedMap(X = X, FUN = colSums, i = c(1, 3), chunkSize = chunkSize, nCores = nCores)), colSums(X[c(1, 3), ])) 25 | 26 | expect_equal(unlist(chunkedMap(X = X, FUN = rowSums, chunkBy = 1, j = c(1, 3, 5), chunkSize = chunkSize, nCores = nCores)), rowSums(X[, c(1, 3, 5)])) 27 | expect_equal(unlist(chunkedMap(X = X, FUN = colSums, j = c(1, 3, 5), chunkSize = chunkSize, nCores = nCores)), colSums(X[, c(1, 3, 5)])) 28 | 29 | expect_equal(unlist(chunkedMap(X = X, FUN = rowSums, chunkBy = 1, i = c(1, 3), j = c(1, 3, 5), chunkSize = chunkSize, nCores = nCores)), rowSums(X[c(1, 3), c(1, 3, 5)])) 30 | expect_equal(unlist(chunkedMap(X = X, FUN = colSums, i = c(1, 3), j = c(1, 3, 5), chunkSize = chunkSize, nCores = nCores)), colSums(X[c(1, 3), c(1, 3, 5)])) 31 | 32 | } 33 | } 34 | 35 | }) 36 | 37 | test_that("chunkedApply", { 38 | 39 | for (nCores in seq_len(2)) { 40 | 41 | hasCores(nCores) 42 | 43 | for (chunkSize in c(5, 10)) { 44 | 45 | expect_equal(chunkedApply(X = X, MARGIN = 1, FUN = sum, chunkSize = chunkSize, nCores = nCores), apply(X, 1, sum)) 46 | expect_equal(chunkedApply(X = X, MARGIN = 2, FUN = sum, chunkSize = chunkSize, nCores = nCores), apply(X, 2, sum)) 47 | 48 | expect_equal(chunkedApply(X = X, MARGIN = 1, FUN = sum, i = c(1, 3), chunkSize = chunkSize, nCores = nCores), apply(X[c(1, 3), ], 1, sum)) 49 | expect_equal(chunkedApply(X = X, MARGIN = 2, FUN = sum, i = c(1, 3), chunkSize = chunkSize, nCores = nCores), apply(X[c(1, 3), ], 2, sum)) 50 | 51 | expect_equal(chunkedApply(X = X, MARGIN = 
1, FUN = sum, j = c(1, 3, 5), chunkSize = chunkSize, nCores = nCores), apply(X[, c(1, 3, 5)], 1, sum)) 52 | expect_equal(chunkedApply(X = X, MARGIN = 2, FUN = sum, j = c(1, 3, 5), chunkSize = chunkSize, nCores = nCores), apply(X[, c(1, 3, 5)], 2, sum)) 53 | 54 | expect_equal(chunkedApply(X = X, MARGIN = 1, FUN = sum, i = c(1, 3), j = c(1, 3, 5), chunkSize = chunkSize, nCores = nCores), apply(X[c(1, 3), c(1, 3, 5)], 1, sum)) 55 | expect_equal(chunkedApply(X = X, MARGIN = 2, FUN = sum, i = c(1, 3), j = c(1, 3, 5), chunkSize = chunkSize, nCores = nCores), apply(X[c(1, 3), c(1, 3, 5)], 2, sum)) 56 | 57 | } 58 | 59 | } 60 | 61 | }) 62 | -------------------------------------------------------------------------------- /tests/testthat/test-getG.R: -------------------------------------------------------------------------------- 1 | context("getG") 2 | 3 | for (nCores in seq_len(2)) { 4 | 5 | test_that(paste("getGi", "on", nCores, "cores"), { 6 | 7 | hasCores(nCores) 8 | 9 | n <- 10 10 | p <- 100 11 | X <- matrix(data = rnorm(n * p), nrow = n, ncol = p) 12 | 13 | for (chunkSize in c(NULL, p, ceiling(p / 3))) { 14 | 15 | # both scalings 16 | G <- tcrossprod(scale(X)) 17 | G <- G / mean(diag(G)) 18 | G2 <- getG(X = X, scale = TRUE, scaleG = TRUE, chunkSize = chunkSize, nCores = nCores) 19 | expect_equivalent(G, G2) 20 | 21 | # without scaling to average diagonal = 1 (scaleG) 22 | G <- tcrossprod(scale(X)) 23 | G2 <- getG(X = X, scale = TRUE, scaleG = FALSE, chunkSize = chunkSize, nCores = nCores) 24 | expect_equivalent(G, G2) 25 | 26 | # without scaling columns, but scaling average diagonal = 1 (scaleG) 27 | G <- tcrossprod(scale(X, center = TRUE, scale = FALSE)) 28 | G <- G / mean(diag(G)) 29 | G2 <- getG(X = X, scale = FALSE, scaleG = TRUE, chunkSize = chunkSize, nCores = nCores) 30 | 31 | expect_equivalent(G, G2) 32 | 33 | # no scaling at all 34 | G <- tcrossprod(scale(X, center = TRUE, scale = FALSE)) 35 | G2 <- getG(X = X, scale = FALSE, scaleG = FALSE, chunkSize = 
chunkSize, nCores = nCores) 36 | expect_equivalent(G, G2) 37 | 38 | # neither scaling nor centering 39 | G <- tcrossprod(X) 40 | G2 <- getG(X = X, center = FALSE, scale = FALSE, scaleG = FALSE, chunkSize = chunkSize, nCores = nCores) 41 | expect_equivalent(G, G2) 42 | 43 | } 44 | 45 | X[sample(1:length(X), size = 20)] <- NA 46 | G <- getG(X, nCores = nCores) 47 | expect_true(!any(is.na(G))) 48 | 49 | }) 50 | 51 | test_that(paste("getGij", "on", nCores, "cores"), { 52 | 53 | hasCores(nCores) 54 | 55 | n <- 10 56 | p <- 100 57 | X <- matrix(data = rnorm(n * p), nrow = n, ncol = p) 58 | 59 | for (chunkSize in c(NULL, p, ceiling(p / 3))) { 60 | 61 | i <- sample(1:nrow(X), size = 3) 62 | i2 <- sample(1:nrow(X), size = 4) 63 | 64 | centers <- colMeans(X) 65 | scales <- apply(X, 2, sd) * sqrt((n - 1)/n) 66 | 67 | # all scalings 68 | G <- tcrossprod(scale(X)) 69 | G <- G / mean(diag(G)) 70 | G_12 <- getG(X = X, center = centers, scale = scales, scaleG = TRUE, i = i, i2 = i2, chunkSize = chunkSize, nCores = nCores) 71 | expect_equivalent(G[i, i2], G_12) 72 | 73 | G_12 <- getG(X = X, center = centers, scale = scales, scaleG = TRUE, i = i, i2 = i, chunkSize = chunkSize, nCores = nCores) 74 | expect_equivalent(G[i, i], G_12) 75 | 76 | # without scaling to average diagonal = 1 77 | G <- tcrossprod(scale(X) * sqrt(n/(n - 1))) 78 | G_12 <- getG(X = X, center = centers, scale = scales, scaleG = FALSE, i = i, i2 = i2, chunkSize = chunkSize, nCores = nCores) 79 | expect_equivalent(G[i, i2], G_12) 80 | 81 | G_12 <- getG(X = X, center = centers, scale = scales, scaleG = FALSE, i = i, i2 = i, chunkSize = chunkSize, nCores = nCores) 82 | expect_equivalent(G[i, i], G_12) 83 | 84 | # without scaling columns, but scaling average diagonal = 1 85 | scales <- rep(1, ncol(X)) 86 | 87 | G <- tcrossprod(scale(X, center = TRUE, scale = FALSE)) 88 | G <- G / ncol(X) 89 | G_12 <- getG(X = X, center = centers, scale = scales, scaleG = TRUE, i = i, i2 = i2, chunkSize = chunkSize, nCores = nCores) 90 
| expect_equivalent(G[i, i2], G_12) 91 | 92 | G_12 <- getG(X = X, center = centers, scale = scales, scaleG = TRUE, i = i, i2 = i, chunkSize = chunkSize, nCores = nCores) 93 | expect_equivalent(G[i, i], G_12) 94 | 95 | # no scaling at all 96 | G <- tcrossprod(scale(X, center = TRUE, scale = FALSE)) 97 | G_12 <- getG(X = X, center = centers, scale = scales, scaleG = FALSE, i = i, i2 = i2, chunkSize = chunkSize, nCores = nCores) 98 | expect_equivalent(G[i, i2], G_12) 99 | 100 | G_12 <- getG(X = X, center = centers, scale = scales, scaleG = FALSE, i = i, i2 = i, chunkSize = chunkSize, nCores = nCores) 101 | expect_equivalent(G[i, i], G_12) 102 | 103 | } 104 | }) 105 | 106 | test_that(paste("getG_symDMatrix", "on", nCores, "cores"), { 107 | 108 | hasCores(nCores) 109 | 110 | W <- matrix(data = rnorm(200), nrow = 10, ncol = 20) 111 | G1 <- tcrossprod(scale(W)) 112 | G1 <- G1 / mean(diag(G1)) 113 | 114 | G2 <- getG_symDMatrix(X = W, blockSize = ceiling(nrow(W) / 3), folderOut = testDir(), nCores = nCores) 115 | expect_equivalent(G2[], G1) # use equivalent to correct slight difference in NULL dimnames handling 116 | 117 | }) 118 | 119 | } 120 | -------------------------------------------------------------------------------- /tests/testthat/test-preprocess-int.R: -------------------------------------------------------------------------------- 1 | context("preprocess for integers") 2 | 3 | # Parameters 4 | n <- 250 5 | p <- 50 6 | length <- n * p 7 | nmiss <- 100 8 | 9 | # Data 10 | set.seed(4711) 11 | X <- sample(0:9, size = length, replace = TRUE) 12 | dim(X) <- c(n, p) 13 | missing <- sample(seq_len(length), size = nmiss) 14 | X[missing] <- NA 15 | 16 | centers <- colMeans(X, na.rm = TRUE) 17 | scales <- apply(X, 2, sd, na.rm = TRUE) 18 | 19 | # No operation 20 | expect_equal( 21 | scale(X, center = FALSE, scale = FALSE), 22 | preprocess(X, center = FALSE, scale = FALSE, impute = FALSE) 23 | ) 24 | 25 | # Tests without imputation 26 | 27 | # Compute centers and scales 28 
| expect_equal( 29 | scale(X, center = TRUE, scale = TRUE), 30 | preprocess(X, center = TRUE, scale = TRUE, impute = FALSE) 31 | ) 32 | expect_equal( 33 | scale(X, center = TRUE, scale = FALSE), 34 | preprocess(X, center = TRUE, scale = FALSE, impute = FALSE) 35 | ) 36 | expect_equal( 37 | scale(X, center = FALSE, scale = scales), # scale() uses root mean squares if 'center = FALSE' 38 | preprocess(X, center = FALSE, scale = TRUE, impute = FALSE) 39 | ) 40 | 41 | # Provide own centers and scales 42 | expect_equal( 43 | scale(X, center = centers, scale = scales), 44 | preprocess(X, center = centers, scale = scales, impute = FALSE) 45 | ) 46 | expect_equal( 47 | scale(X, center = centers, scale = FALSE), 48 | preprocess(X, center = centers, scale = FALSE, impute = FALSE) 49 | ) 50 | expect_equal( 51 | scale(X, center = FALSE, scale = scales), 52 | preprocess(X, center = FALSE, scale = scales, impute = FALSE) 53 | ) 54 | 55 | # Provide own centers, compute scales 56 | expect_equal( 57 | scale(X, center = centers, scale = TRUE), 58 | preprocess(X, center = centers, scale = TRUE, impute = FALSE) 59 | ) 60 | 61 | # Provide own scales, compute centers 62 | expect_equal( 63 | scale(X, center = TRUE, scale = scales), 64 | preprocess(X, center = TRUE, scale = scales, impute = FALSE) 65 | ) 66 | 67 | 68 | # Tests with imputation 69 | 70 | # center = TRUE and impute = TRUE means impute by 0 71 | expect_equal( 72 | { 73 | W <- scale(X, center = TRUE, scale = FALSE) 74 | W[missing] <- 0 75 | W 76 | }, 77 | preprocess(X, center = TRUE, scale = FALSE, impute = TRUE) 78 | ) 79 | 80 | # Given centers and impute = TRUE means impute by 0 81 | expect_equal( 82 | { 83 | W <- scale(X, center = centers, scale = FALSE) 84 | W[missing] <- 0 85 | W 86 | }, 87 | preprocess(X, center = centers, scale = FALSE, impute = TRUE) 88 | ) 89 | 90 | # center = FALSE and impute = TRUE means impute by mean 91 | expect_equal( 92 | { 93 | means <- rep(colMeans(X, na.rm = TRUE), each = n) 94 | W <- X 95 | 
W[missing] <- means[missing] 96 | W 97 | }, 98 | preprocess(X, center = FALSE, scale = FALSE, impute = TRUE) 99 | ) 100 | -------------------------------------------------------------------------------- /tests/testthat/test-preprocess-real.R: -------------------------------------------------------------------------------- 1 | context("preprocess for real numbers") 2 | 3 | # Parameters 4 | n <- 250 5 | p <- 50 6 | length <- n * p 7 | nmiss <- 100 8 | 9 | # Data 10 | set.seed(4711) 11 | X <- rnorm(length) 12 | dim(X) <- c(n, p) 13 | missing <- sample(seq_len(length), size = nmiss) 14 | X[missing] <- NA 15 | 16 | centers <- colMeans(X, na.rm = TRUE) 17 | scales <- apply(X, 2, sd, na.rm = TRUE) 18 | 19 | # No operation 20 | expect_equal( 21 | scale(X, center = FALSE, scale = FALSE), 22 | preprocess(X, center = FALSE, scale = FALSE, impute = FALSE) 23 | ) 24 | 25 | # Tests without imputation 26 | 27 | # Compute centers and scales 28 | expect_equal( 29 | scale(X, center = TRUE, scale = TRUE), 30 | preprocess(X, center = TRUE, scale = TRUE, impute = FALSE) 31 | ) 32 | expect_equal( 33 | scale(X, center = TRUE, scale = FALSE), 34 | preprocess(X, center = TRUE, scale = FALSE, impute = FALSE) 35 | ) 36 | expect_equal( 37 | scale(X, center = FALSE, scale = scales), # scale() uses root mean squares if 'center = FALSE' 38 | preprocess(X, center = FALSE, scale = TRUE, impute = FALSE) 39 | ) 40 | 41 | # Provide own centers and scales 42 | expect_equal( 43 | scale(X, center = centers, scale = scales), 44 | preprocess(X, center = centers, scale = scales, impute = FALSE) 45 | ) 46 | expect_equal( 47 | scale(X, center = centers, scale = FALSE), 48 | preprocess(X, center = centers, scale = FALSE, impute = FALSE) 49 | ) 50 | expect_equal( 51 | scale(X, center = FALSE, scale = scales), 52 | preprocess(X, center = FALSE, scale = scales, impute = FALSE) 53 | ) 54 | 55 | # Provide own centers, compute scales 56 | expect_equal( 57 | scale(X, center = centers, scale = TRUE), 58 | 
preprocess(X, center = centers, scale = TRUE, impute = FALSE) 59 | ) 60 | 61 | # Provide own scales, compute centers 62 | expect_equal( 63 | scale(X, center = TRUE, scale = scales), 64 | preprocess(X, center = TRUE, scale = scales, impute = FALSE) 65 | ) 66 | 67 | 68 | # Tests with imputation 69 | 70 | # center = TRUE and impute = TRUE means impute by 0 71 | expect_equal( 72 | { 73 | W <- scale(X, center = TRUE, scale = FALSE) 74 | W[missing] <- 0 75 | W 76 | }, 77 | preprocess(X, center = TRUE, scale = FALSE, impute = TRUE) 78 | ) 79 | 80 | # Given centers and impute = TRUE means impute by 0 81 | expect_equal( 82 | { 83 | W <- scale(X, center = centers, scale = FALSE) 84 | W[missing] <- 0 85 | W 86 | }, 87 | preprocess(X, center = centers, scale = FALSE, impute = TRUE) 88 | ) 89 | 90 | # center = FALSE and impute = TRUE means impute by mean 91 | expect_equal( 92 | { 93 | means <- rep(colMeans(X, na.rm = TRUE), each = n) 94 | W <- X 95 | W[missing] <- means[missing] 96 | W 97 | }, 98 | preprocess(X, center = FALSE, scale = FALSE, impute = TRUE) 99 | ) 100 | -------------------------------------------------------------------------------- /tests/testthat/test-summarize.R: -------------------------------------------------------------------------------- 1 | context("summarize") 2 | 3 | set.seed(1) 4 | 5 | nRows <- 5 6 | nCols <- 10 7 | percentNA <- 0.15 8 | 9 | summarize_test <- function(X) { 10 | res <- data.frame( 11 | freq_na = vector(mode = "double", length = ncol(X)), 12 | allele_freq = vector(mode = "double", length = ncol(X)), 13 | sd = vector(mode = "double", length = ncol(X)) 14 | ) 15 | for (col in seq_len(ncol(X))) { 16 | x <- X[, col] 17 | nMissing <- sum(is.na(x)) 18 | res$freq_na[col] <- nMissing / length(x) 19 | res$allele_freq[col] <- sum(x, na.rm = TRUE) / ((length(x) - nMissing) * 2) 20 | res$sd[col] <- sd(x, na.rm = TRUE) 21 | } 22 | return(res) 23 | } 24 | 25 | test_that("summarize", { 26 | 27 | for (mode in c("integer", "double")) { 28 | 29 | X <- 
matrix(data = rnorm(nRows * nCols, sd = 100), nrow = nRows, ncol = nCols) 30 | X[sample(seq_along(X), size = as.integer(length(X) * percentNA))] <- NA 31 | storage.mode(X) <- mode 32 | 33 | for (nCores in seq_len(2)) { 34 | 35 | hasCores(nCores) 36 | 37 | expect_equal( 38 | summarize(X, nCores = nCores), 39 | summarize_test(X) 40 | ) 41 | 42 | } 43 | 44 | } 45 | 46 | }) 47 | -------------------------------------------------------------------------------- /tests/testthat/test-utils.R: -------------------------------------------------------------------------------- 1 | context("utils") 2 | 3 | test_that("normalizeType", { 4 | 5 | expect_equal(typeof(BGData:::normalizeType("double")), "double") 6 | expect_equal(typeof(BGData:::normalizeType(double())), "double") 7 | expect_equal(typeof(BGData:::normalizeType("integer")), "integer") 8 | expect_equal(typeof(BGData:::normalizeType(integer())), "integer") 9 | expect_equal(typeof(BGData:::normalizeType("character")), "character") 10 | expect_equal(typeof(BGData:::normalizeType(character())), "character") 11 | expect_equal(typeof(BGData:::normalizeType("complex")), "complex") 12 | expect_equal(typeof(BGData:::normalizeType(complex())), "complex") 13 | expect_warning(BGData:::normalizeType("test")) 14 | expect_equal(suppressWarnings(typeof(BGData:::normalizeType("test"))), "character") 15 | expect_equal(typeof(BGData:::normalizeType(1)), "double") 16 | expect_equal(typeof(BGData:::normalizeType(1L)), "integer") 17 | 18 | }) 19 | --------------------------------------------------------------------------------