├── _pkgdown.yml ├── .github ├── .gitignore └── workflows │ └── pkgdown.yaml ├── docs ├── _config.yml ├── reference │ ├── Rplot001.png │ ├── Rplot002.png │ ├── smart_mva-1.png │ ├── smart_pca-1.png │ ├── smart_pca-2.png │ ├── smart_permanova-1.png │ ├── smart_permdisp-1.png │ ├── figures │ │ ├── README-example-1.png │ │ ├── README-pressure-1.png │ │ └── README-unnamed-chunk-2-1.png │ ├── read_packedancestrymap.html │ └── index.html ├── articles │ ├── figure │ │ ├── lazaridis_plot-1.png │ │ ├── pca_plot_mallard-1.png │ │ └── data_transformation_example_plot-1.png │ ├── aDNA_smartpca_analysis_files │ │ └── header-attrs-2.8 │ │ │ └── header-attrs.js │ ├── mallard_smartpca_analysis_files │ │ └── header-attrs-2.8 │ │ │ └── header-attrs.js │ ├── Converting_VCF_and_PLINK_formats_files │ │ └── header-attrs-2.8 │ │ │ └── header-attrs.js │ └── index.html ├── pkgdown.yml ├── link.svg ├── bootstrap-toc.css ├── docsearch.js ├── pkgdown.js ├── bootstrap-toc.js ├── LICENSE-text.html ├── 404.html ├── news │ └── index.html ├── authors.html ├── LICENSE.html ├── pkgdown.css └── docsearch.css ├── src ├── .gitignore ├── RcppExports.cpp └── cpp_read_packedancestrymap.cpp ├── vignettes ├── .gitignore ├── .DS_Store ├── figure │ ├── .DS_Store │ ├── lazaridis_plot-1.png │ ├── pca_plot_mallard-1.png │ └── data_transformation_example_plot-1.png ├── Converting_VCF_and_PLINK_formats.Rmd ├── Converting_VCF_and_PLINK_formats.Rmd.orig ├── aDNA_smartpca_analysis.Rmd ├── aDNA_smartpca_analysis.Rmd.orig └── mallard_smartpca_analysis.Rmd.orig ├── LICENSE ├── .DS_Store ├── inst ├── .DS_Store └── extdata │ └── mallard_snps_Kraus2013 ├── man ├── figures │ ├── README-example-1.png │ ├── README-pressure-1.png │ └── README-unnamed-chunk-2-1.png ├── read_packedancestrymap.Rd ├── smart_mva.Rd ├── smart_pca.Rd └── smart_permanova.Rd ├── .gitignore ├── NEWS.md ├── .Rbuildignore ├── R ├── smartsnp-package.R ├── RcppExports.R └── read_packedancestrymap.R ├── smartsnp.Rproj ├── NAMESPACE ├── data-raw └── dataSNP.R 
├── LICENSE.md ├── DESCRIPTION ├── cran-comments.md ├── README.Rmd └── README.md /_pkgdown.yml: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.github/.gitignore: -------------------------------------------------------------------------------- 1 | *.html 2 | -------------------------------------------------------------------------------- /docs/_config.yml: -------------------------------------------------------------------------------- 1 | theme: jekyll-theme-cayman -------------------------------------------------------------------------------- /src/.gitignore: -------------------------------------------------------------------------------- 1 | *.o 2 | *.so 3 | *.dll 4 | -------------------------------------------------------------------------------- /vignettes/.gitignore: -------------------------------------------------------------------------------- 1 | *.html 2 | *.R 3 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | YEAR: 2021 2 | COPYRIGHT HOLDER: Salvador Herrando-Pérez 3 | -------------------------------------------------------------------------------- /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nevrome/smartsnp/master/.DS_Store -------------------------------------------------------------------------------- /inst/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nevrome/smartsnp/master/inst/.DS_Store -------------------------------------------------------------------------------- /vignettes/.DS_Store: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/nevrome/smartsnp/master/vignettes/.DS_Store -------------------------------------------------------------------------------- /docs/reference/Rplot001.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nevrome/smartsnp/master/docs/reference/Rplot001.png -------------------------------------------------------------------------------- /docs/reference/Rplot002.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nevrome/smartsnp/master/docs/reference/Rplot002.png -------------------------------------------------------------------------------- /vignettes/figure/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nevrome/smartsnp/master/vignettes/figure/.DS_Store -------------------------------------------------------------------------------- /docs/reference/smart_mva-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nevrome/smartsnp/master/docs/reference/smart_mva-1.png -------------------------------------------------------------------------------- /docs/reference/smart_pca-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nevrome/smartsnp/master/docs/reference/smart_pca-1.png -------------------------------------------------------------------------------- /docs/reference/smart_pca-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nevrome/smartsnp/master/docs/reference/smart_pca-2.png -------------------------------------------------------------------------------- /man/figures/README-example-1.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/nevrome/smartsnp/master/man/figures/README-example-1.png -------------------------------------------------------------------------------- /man/figures/README-pressure-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nevrome/smartsnp/master/man/figures/README-pressure-1.png -------------------------------------------------------------------------------- /docs/reference/smart_permanova-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nevrome/smartsnp/master/docs/reference/smart_permanova-1.png -------------------------------------------------------------------------------- /docs/reference/smart_permdisp-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nevrome/smartsnp/master/docs/reference/smart_permdisp-1.png -------------------------------------------------------------------------------- /vignettes/figure/lazaridis_plot-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nevrome/smartsnp/master/vignettes/figure/lazaridis_plot-1.png -------------------------------------------------------------------------------- /vignettes/figure/pca_plot_mallard-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nevrome/smartsnp/master/vignettes/figure/pca_plot_mallard-1.png -------------------------------------------------------------------------------- /docs/articles/figure/lazaridis_plot-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nevrome/smartsnp/master/docs/articles/figure/lazaridis_plot-1.png -------------------------------------------------------------------------------- 
/man/figures/README-unnamed-chunk-2-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nevrome/smartsnp/master/man/figures/README-unnamed-chunk-2-1.png -------------------------------------------------------------------------------- /docs/articles/figure/pca_plot_mallard-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nevrome/smartsnp/master/docs/articles/figure/pca_plot_mallard-1.png -------------------------------------------------------------------------------- /docs/reference/figures/README-example-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nevrome/smartsnp/master/docs/reference/figures/README-example-1.png -------------------------------------------------------------------------------- /docs/reference/figures/README-pressure-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nevrome/smartsnp/master/docs/reference/figures/README-pressure-1.png -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .Rproj.user 2 | .Rhistory 3 | .RData 4 | .Ruserdata 5 | src/*.o 6 | src/*.so 7 | src/*.dll 8 | inst/doc 9 | 10 | /doc/ 11 | /Meta/ 12 | -------------------------------------------------------------------------------- /NEWS.md: -------------------------------------------------------------------------------- 1 | # smartsnp 1.1.0 2 | 3 | * Added a `NEWS.md` file to track changes to the package. 
4 | 5 | * First submission of package smartsnp (1 March 2020) 6 | -------------------------------------------------------------------------------- /docs/reference/figures/README-unnamed-chunk-2-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nevrome/smartsnp/master/docs/reference/figures/README-unnamed-chunk-2-1.png -------------------------------------------------------------------------------- /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^smartsnp\.Rproj$ 2 | ^\.Rproj\.user$ 3 | ^LICENSE\.md$ 4 | ^_pkgdown\.yml$ 5 | ^docs$ 6 | ^pkgdown$ 7 | ^\.github$ 8 | ^doc$ 9 | ^Meta$ 10 | -------------------------------------------------------------------------------- /vignettes/figure/data_transformation_example_plot-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nevrome/smartsnp/master/vignettes/figure/data_transformation_example_plot-1.png -------------------------------------------------------------------------------- /docs/articles/figure/data_transformation_example_plot-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nevrome/smartsnp/master/docs/articles/figure/data_transformation_example_plot-1.png -------------------------------------------------------------------------------- /R/smartsnp-package.R: -------------------------------------------------------------------------------- 1 | ## usethis namespace: start 2 | #' @useDynLib smartsnp, .registration = TRUE 3 | ## usethis namespace: end 4 | NULL 5 | ## usethis namespace: start 6 | #' @importFrom Rcpp sourceCpp 7 | ## usethis namespace: end 8 | NULL 9 | -------------------------------------------------------------------------------- /docs/pkgdown.yml: -------------------------------------------------------------------------------- 1 | pandoc: 
2.14.0.1 2 | pkgdown: 1.6.1 3 | pkgdown_sha: ~ 4 | articles: 5 | Converting_VCF_and_PLINK_formats: Converting_VCF_and_PLINK_formats.html 6 | aDNA_smartpca_analysis: aDNA_smartpca_analysis.html 7 | mallard_smartpca_analysis: mallard_smartpca_analysis.html 8 | last_built: 2021-08-06T16:52Z 9 | 10 | -------------------------------------------------------------------------------- /smartsnp.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: Default 4 | SaveWorkspace: Default 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: Sweave 13 | LaTeX: pdfLaTeX 14 | 15 | BuildType: Package 16 | PackageUseDevtools: Yes 17 | PackageInstallArgs: --no-multiarch --with-keep.source 18 | -------------------------------------------------------------------------------- /R/RcppExports.R: -------------------------------------------------------------------------------- 1 | # Generated by using Rcpp::compileAttributes() -> do not edit by hand 2 | # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 3 | 4 | cpp_read_packedancestrymap <- function(genofile, nsnp, nind, indvec, first, last, transpose = FALSE, verbose = FALSE) { 5 | .Call(`_smartsnp_cpp_read_packedancestrymap`, genofile, nsnp, nind, indvec, first, last, transpose, verbose) 6 | } 7 | 8 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | export(read_packedancestrymap) 4 | export(smart_mva) 5 | export(smart_pca) 6 | export(smart_permanova) 7 | export(smart_permdisp) 8 | importFrom(Rcpp,sourceCpp) 9 | importFrom(data.table,":=") 10 | importFrom(data.table,.N) 11 | importFrom(data.table,.SD) 12 | importFrom(foreach,"%do%") 13 | useDynLib(smartsnp, .registration = TRUE) 14 | 
// Pandoc 2.9 adds attributes on both header and div. We remove the former (to
// be compatible with the behavior of Pandoc < 2.8).
document.addEventListener('DOMContentLoaded', function(e) {
  // Matches the tag names H1..H6, case-insensitively.
  var headingTag = /^h[1-6]$/i;
  // First child of every div.section whose class attribute contains "level".
  var firstChildren = document.querySelectorAll("div.section[class*='level'] > :first-child");

  Array.prototype.forEach.call(firstChildren, function(node) {
    // Only header elements are stripped; any other first child is left as is.
    if (!headingTag.test(node.tagName)) return;

    // `attributes` is a live collection: removing index 0 repeatedly empties it.
    var attrs = node.attributes;
    while (attrs.length > 0) node.removeAttribute(attrs[0].name);
  });
});
# Simulate data
#
# Builds `dataSNP`, a matrix with snpN rows and sampleN columns filled with
# randomly drawn genotype codes, and writes it to a plain-text file without
# row or column names.

# Fix the RNG seed so the simulated dataset can be regenerated exactly
set.seed(2021)

# Data parameters
cellN <- 1e06            # total number of simulated genotype calls
sampleN <- 100           # number of columns of the matrix
snpN <- cellN / sampleN  # number of rows of the matrix

# Random sample of genotypes /0|1|2/ and missing values /9/
sampleSNP <- sample(x = c(0, 1, 2, 9), size = cellN, replace = TRUE, prob = c(0.33, 0.33, 0.33, 0.01))

# Build dataset
dataSNP <- matrix(sampleSNP, nrow = snpN, ncol = sampleN)
dim(dataSNP)

# Save data to extdata/ below the working directory.
# write.table() does not create missing directories, so make sure it exists.
# NOTE(review): the package tree ships inst/extdata -- confirm the intended
# working directory when running this script.
outFile <- file.path("extdata", "dataSNP")
dir.create(dirname(outFile), showWarnings = FALSE, recursive = TRUE)
write.table(dataSNP, file = outFile, col.names = FALSE, row.names = FALSE)

# Make data available to package
# library(usethis)
# usethis::use_data(dataSNP)
3 | document.addEventListener('DOMContentLoaded', function(e) { 4 | var hs = document.querySelectorAll("div.section[class*='level'] > :first-child"); 5 | var i, h, a; 6 | for (i = 0; i < hs.length; i++) { 7 | h = hs[i]; 8 | if (!/^h[1-6]$/i.test(h.tagName)) continue; // it should be a header h1-h6 9 | a = h.attributes; 10 | while (a.length > 0) h.removeAttribute(a[0].name); 11 | } 12 | }); 13 | -------------------------------------------------------------------------------- /man/read_packedancestrymap.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/read_packedancestrymap.R 3 | \name{read_packedancestrymap} 4 | \alias{read_packedancestrymap} 5 | \title{Read Files in PACKEDANCESTRYMAP format} 6 | \usage{ 7 | read_packedancestrymap(pref) 8 | } 9 | \arguments{ 10 | \item{pref}{The prefix of the file name that contains the genotype data (i.e., without the \code{*.geno}).} 11 | } 12 | \value{ 13 | Returns a list containing a single element: 14 | \itemize{ 15 | \item {\code{geno}} {Genotype data as R matrix.}\cr 16 | } 17 | } 18 | \description{ 19 | This function loads genotype data in \code{PACKEDANCESTRYMAP} format (binary or compressed). 
20 | } 21 | -------------------------------------------------------------------------------- /docs/link.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 5 | 8 | 12 | 13 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | # MIT License 2 | 3 | Copyright (c) 2021 Salvador Herrando-Pérez 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /.github/workflows/pkgdown.yaml: -------------------------------------------------------------------------------- 1 | on: 2 | push: 3 | branches: 4 | - main 5 | - master 6 | 7 | name: pkgdown 8 | 9 | jobs: 10 | pkgdown: 11 | runs-on: macOS-latest 12 | env: 13 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 14 | steps: 15 | - uses: actions/checkout@v2 16 | 17 | - uses: r-lib/actions/setup-r@v1 18 | 19 | - uses: r-lib/actions/setup-pandoc@v1 20 | 21 | - name: Query dependencies 22 | run: | 23 | install.packages('remotes') 24 | saveRDS(remotes::dev_package_deps(dependencies = TRUE), ".github/depends.Rds", version = 2) 25 | writeLines(sprintf("R-%i.%i", getRversion()$major, getRversion()$minor), ".github/R-version") 26 | shell: Rscript {0} 27 | 28 | - name: Restore R package cache 29 | uses: actions/cache@v2 30 | with: 31 | path: ${{ env.R_LIBS_USER }} 32 | key: ${{ runner.os }}-${{ hashFiles('.github/R-version') }}-1-${{ hashFiles('.github/depends.Rds') }} 33 | restore-keys: ${{ runner.os }}-${{ hashFiles('.github/R-version') }}-1- 34 | 35 | - name: Install dependencies 36 | run: | 37 | remotes::install_deps(dependencies = TRUE) 38 | install.packages("pkgdown", type = "binary") 39 | shell: Rscript {0} 40 | 41 | - name: Install package 42 | run: R CMD INSTALL . 
43 | 44 | - name: Deploy package 45 | run: | 46 | git config --local user.email "actions@github.com" 47 | git config --local user.name "GitHub Actions" 48 | Rscript -e 'pkgdown::deploy_to_branch(new_process = FALSE)' 49 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: smartsnp 2 | Type: Package 3 | Title: Fast Multivariate Analyses of Big Genomic Data 4 | Version: 1.1.0 5 | Authors@R: c( 6 | person("Salvador", family = "Herrando-Perez", email = "salherra@gmail.com", 7 | role = c("aut"), comment = c(ORCID = "0000-0001-6052-6854")), 8 | person("Ray", family = "Tobler", email = "tingalingx@gmail.com", 9 | role = c("ctb"), comment = c(ORCID = "0000-0002-4603-1473")), 10 | person("Christian", family = "Huber", email = "christian.domitian.huber@gmail.com", 11 | role = c("ctb", "cre"), comment = c(ORCID = "0000-0002-2267-2604"))) 12 | Maintainer: Christian Huber <christian.domitian.huber@gmail.com> 13 | Description: Fast computation of multivariate analyses of small (10s to 100s markers) to big (1000s to 100000s) genotype data. Runs Principal Component Analysis allowing for centering, z-score standardization and scaling for genetic drift, projection of ancient samples to modern genetic space and multivariate tests for differences in group location (Permutation-Based Multivariate Analysis of Variance) and dispersion (Permutation-Based Multivariate Analysis of Dispersion). 
14 | Language: en-GB 15 | License: MIT + file LICENSE 16 | URL: https://christianhuber.github.io/smartsnp/ 17 | BugReports: https://github.com/ChristianHuber/smartsnp/issues 18 | Depends: R (>= 3.6.0) 19 | VignetteBuilder: knitr 20 | Imports: 21 | bootSVD, 22 | data.table, 23 | foreach, 24 | Rfast, 25 | RSpectra, 26 | vegan, 27 | vroom, 28 | Rcpp, 29 | Suggests: 30 | knitr, 31 | rmarkdown 32 | RoxygenNote: 7.1.1 33 | Encoding: UTF-8 34 | LinkingTo: 35 | Rcpp, 36 | RcppArmadillo 37 | -------------------------------------------------------------------------------- /src/RcppExports.cpp: -------------------------------------------------------------------------------- 1 | // Generated by using Rcpp::compileAttributes() -> do not edit by hand 2 | // Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 3 | 4 | #include 5 | #include 6 | 7 | using namespace Rcpp; 8 | 9 | // cpp_read_packedancestrymap 10 | NumericMatrix cpp_read_packedancestrymap(String genofile, int nsnp, int nind, IntegerVector indvec, int first, int last, bool transpose, bool verbose); 11 | RcppExport SEXP _smartsnp_cpp_read_packedancestrymap(SEXP genofileSEXP, SEXP nsnpSEXP, SEXP nindSEXP, SEXP indvecSEXP, SEXP firstSEXP, SEXP lastSEXP, SEXP transposeSEXP, SEXP verboseSEXP) { 12 | BEGIN_RCPP 13 | Rcpp::RObject rcpp_result_gen; 14 | Rcpp::RNGScope rcpp_rngScope_gen; 15 | Rcpp::traits::input_parameter< String >::type genofile(genofileSEXP); 16 | Rcpp::traits::input_parameter< int >::type nsnp(nsnpSEXP); 17 | Rcpp::traits::input_parameter< int >::type nind(nindSEXP); 18 | Rcpp::traits::input_parameter< IntegerVector >::type indvec(indvecSEXP); 19 | Rcpp::traits::input_parameter< int >::type first(firstSEXP); 20 | Rcpp::traits::input_parameter< int >::type last(lastSEXP); 21 | Rcpp::traits::input_parameter< bool >::type transpose(transposeSEXP); 22 | Rcpp::traits::input_parameter< bool >::type verbose(verboseSEXP); 23 | rcpp_result_gen = Rcpp::wrap(cpp_read_packedancestrymap(genofile, nsnp, nind, indvec, 
first, last, transpose, verbose)); 24 | return rcpp_result_gen; 25 | END_RCPP 26 | } 27 | 28 | static const R_CallMethodDef CallEntries[] = { 29 | {"_smartsnp_cpp_read_packedancestrymap", (DL_FUNC) &_smartsnp_cpp_read_packedancestrymap, 8}, 30 | {NULL, NULL, 0} 31 | }; 32 | 33 | RcppExport void R_init_smartsnp(DllInfo *dll) { 34 | R_registerRoutines(dll, NULL, CallEntries, NULL, NULL); 35 | R_useDynamicSymbols(dll, FALSE); 36 | } 37 | -------------------------------------------------------------------------------- /R/read_packedancestrymap.R: -------------------------------------------------------------------------------- 1 | # smartsnp v.1 2 | # Coding start date = 26/01/2021 3 | # smartsnp::read_packedancestrymap by Robert Maier (rmaier@broadinstitute.org), modified by Christian D. Huber (christian.domitian.huber@gmail.com) 4 | 5 | # Loading genotype data that is in packedancestrymap format standing for binary or compressed data 6 | 7 | #' @name read_packedancestrymap 8 | #' 9 | #' @title Read Files in PACKEDANCESTRYMAP format 10 | #' 11 | #' @description This function loads genotype data in \code{PACKEDANCESTRYMAP} format (binary or compressed). 12 | #' 13 | #' @param pref The prefix of the file name that contains the genotype data (i.e., without the \code{*.geno}). 
#' @return Returns a list containing a single element:
#' \itemize{
#' \item {\code{geno}} {Genotype data as R matrix.}\cr
#' }
#'
#' @export
read_packedancestrymap = function (pref)
{
  pref <- normalizePath(pref, mustWork = FALSE)
  fl <- paste0(pref, ".geno")

  # Fail early with an explicit message instead of the generic
  # "cannot open the connection" error raised by file().
  if (!file.exists(fl)) {
    stop("Genotype file not found: ", fl, call. = FALSE)
  }

  # Read the leading header string. The connection is closed in `finally`
  # so it is released even when readBin() raises an error.
  conn <- file(fl, "rb")
  hdraw <- tryCatch(readBin(conn, "character", n = 1),
                    finally = close(conn))

  # Header fields are separated by one or more spaces; fields 2 and 3 are
  # used as the number of samples and the number of SNPs, respectively.
  hd <- if (length(hdraw) == 1) strsplit(hdraw, " +")[[1]] else character(0)
  nindall <- suppressWarnings(as.numeric(hd[2]))
  nsnpall <- suppressWarnings(as.numeric(hd[3]))
  if (length(hd) < 3 || is.na(nindall) || is.na(nsnpall) ||
      nindall < 0 || nsnpall < 0) {
    stop("Malformed header in ", fl,
         ": could not read the number of samples and SNPs", call. = FALSE)
  }

  message(basename(pref), ".geno has ", nindall,
          " samples and ", nsnpall, " SNPs.")
  message("Reading data for ", nindall, " samples and ",
          nsnpall, " SNPs")
  message("Expected size of genotype data: ",
          round((nsnpall * nindall * 8 + nsnpall * 112)/1e+06), " MB")

  # Every sample is flagged with 1 and the full SNP range [0, nsnpall] is
  # requested from the compiled reader.
  indvec <- rep(1, nindall)
  geno <- cpp_read_packedancestrymap(fl, nsnpall, nindall, indvec,
                                     first = 0, last = nsnpall, transpose = FALSE,
                                     verbose = FALSE)
  outlist <- list(geno = geno)
  outlist
}
##### smartsnp v.1
##### Coding end date = 08/02/2021
##### smartsnp::read_packedancestrymap spelling checked by Salvador Herrando-Perez (salherra@gmail.com)
13 | 14 | * We changed the title to "Fast Multivariate Analyses of Big Genomic Data" 15 | 16 | If there are references describing the methods in your package, please 17 | add these in the description field of your DESCRIPTION file in the form 18 | authors (year) <doi:...> 19 | authors (year) <arXiv:...> 20 | authors (year, ISBN:...) 21 | or if those are not available: <https:...> 22 | with no space after 'doi:', 'arXiv:', 'https:' and angle brackets for 23 | auto-linking. 24 | (If you want to add a title as well please put it in quotes: "Title") 25 | 26 | * There are no published references yet. 27 | 28 | You write information messages to the console that cannot be easily 29 | suppressed. 30 | It is more R like to generate objects that can be used to extract the 31 | information a user is interested in, and then print() that object. 32 | Instead of print()/cat() rather use message()/warning() or 33 | if(verbose)cat(..) (or maybe stop()) if you really have to write text to 34 | the console. 35 | (except for print, summary, interactive functions) 36 | 37 | * I have replaced *cat()* and *print()* calls in all functions with *message()*. I have also turned off any messages to Rcout for the Rcpp function *cpp_read_packedancestrymap()*. 38 | 39 | Please always make sure to reset to user's options(), working directory 40 | or par() after you changed it in examples and vignettes and demos. 41 | e.g.: 42 | oldpar <- par(mfrow = c(1,2)) 43 | ... 44 | par(oldpar) 45 | 46 | * Fixed. 47 | -------------------------------------------------------------------------------- /docs/bootstrap-toc.css: -------------------------------------------------------------------------------- 1 | /*! 
2 | * Bootstrap Table of Contents v0.4.1 (http://afeld.github.io/bootstrap-toc/) 3 | * Copyright 2015 Aidan Feldman 4 | * Licensed under MIT (https://github.com/afeld/bootstrap-toc/blob/gh-pages/LICENSE.md) */ 5 | 6 | /* modified from https://github.com/twbs/bootstrap/blob/94b4076dd2efba9af71f0b18d4ee4b163aa9e0dd/docs/assets/css/src/docs.css#L548-L601 */ 7 | 8 | /* All levels of nav */ 9 | nav[data-toggle='toc'] .nav > li > a { 10 | display: block; 11 | padding: 4px 20px; 12 | font-size: 13px; 13 | font-weight: 500; 14 | color: #767676; 15 | } 16 | nav[data-toggle='toc'] .nav > li > a:hover, 17 | nav[data-toggle='toc'] .nav > li > a:focus { 18 | padding-left: 19px; 19 | color: #563d7c; 20 | text-decoration: none; 21 | background-color: transparent; 22 | border-left: 1px solid #563d7c; 23 | } 24 | nav[data-toggle='toc'] .nav > .active > a, 25 | nav[data-toggle='toc'] .nav > .active:hover > a, 26 | nav[data-toggle='toc'] .nav > .active:focus > a { 27 | padding-left: 18px; 28 | font-weight: bold; 29 | color: #563d7c; 30 | background-color: transparent; 31 | border-left: 2px solid #563d7c; 32 | } 33 | 34 | /* Nav: second level (shown on .active) */ 35 | nav[data-toggle='toc'] .nav .nav { 36 | display: none; /* Hide by default, but at >768px, show it */ 37 | padding-bottom: 10px; 38 | } 39 | nav[data-toggle='toc'] .nav .nav > li > a { 40 | padding-top: 1px; 41 | padding-bottom: 1px; 42 | padding-left: 30px; 43 | font-size: 12px; 44 | font-weight: normal; 45 | } 46 | nav[data-toggle='toc'] .nav .nav > li > a:hover, 47 | nav[data-toggle='toc'] .nav .nav > li > a:focus { 48 | padding-left: 29px; 49 | } 50 | nav[data-toggle='toc'] .nav .nav > .active > a, 51 | nav[data-toggle='toc'] .nav .nav > .active:hover > a, 52 | nav[data-toggle='toc'] .nav .nav > .active:focus > a { 53 | padding-left: 28px; 54 | font-weight: 500; 55 | } 56 | 57 | /* from https://github.com/twbs/bootstrap/blob/e38f066d8c203c3e032da0ff23cd2d6098ee2dd6/docs/assets/css/src/docs.css#L631-L634 */ 58 | 
$(function() {

  // register a handler to move the focus to the search bar
  // upon pressing shift + "/" (i.e. "?")
  $(document).on('keydown', function(e) {
    if (e.shiftKey && e.keyCode == 191) {
      e.preventDefault();
      $("#search-input").focus();
    }
  });

  $(document).ready(function() {
    // do keyword highlighting
    /* modified from https://jsfiddle.net/julmot/bL6bb5oo/ */
    var mark = function() {

      var referrer = document.URL ;
      var paramKey = "q" ;

      if (referrer.indexOf("?") !== -1) {
        // Everything after the first "?", minus any "#anchor" suffix.
        var qs = referrer.substring(referrer.indexOf('?') + 1);
        var qs_noanchor = qs.split('#')[0];
        var qsa = qs_noanchor.split('&');
        var keyword = "";

        for (var i = 0; i < qsa.length; i++) {
          var currentParam = qsa[i].split('=');

          // Only well-formed "key=value" pairs are considered.
          if (currentParam.length !== 2) {
            continue;
          }

          if (currentParam[0] == paramKey) {
            try {
              // "+" is treated as an encoded space before percent-decoding.
              keyword = decodeURIComponent(currentParam[1].replace(/\+/g, "%20"));
            } catch (err) {
              // Malformed percent-escapes (e.g. a hand-edited URL): skip
              // highlighting rather than abort the ready handler.
              continue;
            }
          }
        }

        if (keyword !== "") {
          // Clear previous highlights first, then mark the new keyword.
          $(".contents").unmark({
            done: function() {
              $(".contents").mark(keyword);
            }
          });
        }
      }
    };

    mark();
  });
});

/* Search term highlighting ------------------------------*/

/**
 * Collect the distinct matched words of a search hit.
 *
 * Reads `matchedWords` from every entry of `hit._highlightResult.hierarchy`
 * and, when present, from `hit._highlightResult.content`.
 *
 * @param {Object} hit search hit carrying a `_highlightResult` object
 * @returns {Array} matched words with duplicates removed, in first-seen order
 */
function matchedWords(hit) {
  var words = [];

  var hierarchy = hit._highlightResult.hierarchy;
  // loop to fetch from lvl0, lvl1, etc.
  for (var idx in hierarchy) {
    words = words.concat(hierarchy[idx].matchedWords);
  }

  var content = hit._highlightResult.content;
  if (content) {
    words = words.concat(content.matchedWords);
  }

  // return unique words
  var words_uniq = [...new Set(words)];
  return words_uniq;
}

/**
 * Build the URL of a search hit with the matched words passed as `?q=`.
 *
 * The words are encoded with encodeURIComponent so that the page-load code
 * above, which decodes with decodeURIComponent, round-trips them: escape()
 * emits %uXXXX / Latin-1 sequences that decodeURIComponent rejects, and it
 * leaves "+" untouched, which the reader then turns into a space.
 *
 * @param {Object} hit search hit with `url`, and optionally `anchor` and
 *   `url_without_anchor`
 * @returns {string} hit URL with the query (and the anchor, if any) appended
 */
function updateHitURL(hit) {

  var words = matchedWords(hit);
  var query = '?q=' + encodeURIComponent(words.join(" "));
  var url = "";

  if (hit.anchor) {
    // The query string must precede the fragment identifier.
    url = hit.url_without_anchor + query + '#' + hit.anchor;
  } else {
    url = hit.url + query;
  }

  return url;
}
  • , and enclosing
  • if in dropdown 35 | if (pos >= 0) { 36 | var menu_anchor = $(links[pos]); 37 | menu_anchor.parent().addClass("active"); 38 | menu_anchor.closest("li.dropdown").addClass("active"); 39 | } 40 | }); 41 | 42 | function paths(pathname) { 43 | var pieces = pathname.split("/"); 44 | pieces.shift(); // always starts with / 45 | 46 | var end = pieces[pieces.length - 1]; 47 | if (end === "index.html" || end === "") 48 | pieces.pop(); 49 | return(pieces); 50 | } 51 | 52 | // Returns -1 if not found 53 | function prefix_length(needle, haystack) { 54 | if (needle.length > haystack.length) 55 | return(-1); 56 | 57 | // Special case for length-0 haystack, since for loop won't run 58 | if (haystack.length === 0) { 59 | return(needle.length === 0 ? 0 : -1); 60 | } 61 | 62 | for (var i = 0; i < haystack.length; i++) { 63 | if (needle[i] != haystack[i]) 64 | return(i); 65 | } 66 | 67 | return(haystack.length); 68 | } 69 | 70 | /* Clipboard --------------------------*/ 71 | 72 | function changeTooltipMessage(element, msg) { 73 | var tooltipOriginalTitle=element.getAttribute('data-original-title'); 74 | element.setAttribute('data-original-title', msg); 75 | $(element).tooltip('show'); 76 | element.setAttribute('data-original-title', tooltipOriginalTitle); 77 | } 78 | 79 | if(ClipboardJS.isSupported()) { 80 | $(document).ready(function() { 81 | var copyButton = ""; 82 | 83 | $(".examples, div.sourceCode").addClass("hasCopyButton"); 84 | 85 | // Insert copy buttons: 86 | $(copyButton).prependTo(".hasCopyButton"); 87 | 88 | // Initialize tooltips: 89 | $('.btn-copy-ex').tooltip({container: 'body'}); 90 | 91 | // Initialize clipboard: 92 | var clipboardBtnCopies = new ClipboardJS('[data-clipboard-copy]', { 93 | text: function(trigger) { 94 | return trigger.parentNode.textContent; 95 | } 96 | }); 97 | 98 | clipboardBtnCopies.on('success', function(e) { 99 | changeTooltipMessage(e.trigger, 'Copied!'); 100 | e.clearSelection(); 101 | }); 102 | 103 | clipboardBtnCopies.on('error', 
function() { 104 | changeTooltipMessage(e.trigger,'Press Ctrl+C or Command+C to copy'); 105 | }); 106 | }); 107 | } 108 | })(window.jQuery || window.$) 109 | -------------------------------------------------------------------------------- /src/cpp_read_packedancestrymap.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | using namespace Rcpp; 7 | 8 | // [[Rcpp::plugins(cpp11)]] 9 | // [[Rcpp::depends(RcppArmadillo)]] 10 | 11 | #define PACK_DENSITY 4 12 | 13 | /* 3 is 11 in binary, we need a 2 bit mask for each of the 4 positions */ 14 | #define MASK0 3 /* 3 << 2 * 0 */ 15 | #define MASK1 12 /* 3 << 2 * 1 */ 16 | #define MASK2 48 /* 3 << 2 * 2 */ 17 | #define MASK3 192 /* 3 << 2 * 3 */ 18 | 19 | 20 | // [[Rcpp::export]] 21 | NumericMatrix cpp_read_packedancestrymap(String genofile, int nsnp, int nind, IntegerVector indvec, 22 | int first, int last, bool transpose = false, bool verbose = false) { 23 | int val; 24 | long len, bytespersnp; 25 | int readsnps = last - first; 26 | 27 | std::ifstream in(genofile.get_cstring(), std::ios::in | std::ios::binary); 28 | 29 | if(!in) { 30 | Rcerr << "Error reading file " << genofile.get_cstring() << std::endl; 31 | throw std::runtime_error("io error"); 32 | } 33 | in.seekg(0, std::ifstream::end); 34 | // file size in bytes 35 | len = (long)in.tellg(); 36 | bytespersnp = len/(nsnp+1); 37 | 38 | // size of packed data, in bytes, per SNP 39 | //np = (long)ceil((double)nind / PACK_DENSITY); 40 | 41 | int nindused = 0; 42 | int* blockused = new int[bytespersnp]; 43 | for(int i = 0 ; i < bytespersnp; i++) { 44 | blockused[i] = 0; 45 | } 46 | for(int i = 0 ; i < nind; i++) { 47 | if(indvec[i] == 1) { 48 | nindused++; 49 | blockused[i/PACK_DENSITY] = 1; 50 | } 51 | } 52 | 53 | NumericMatrix geno(transpose?nindused:readsnps, transpose?readsnps:nindused); 54 | std::fill(geno.begin(), geno.end(), NA_REAL); 55 | 56 | // char* header = new 
char[bytespersnp]; 57 | // in.seekg(0, std::ifstream::beg); 58 | // in.read((char*)header, bytespersnp); 59 | // Rcout << "header " << header << std::endl; 60 | // Seek past (first+1) records of bytespersnp bytes each; the disabled code above reads the first record as a "header". 61 | in.seekg((first+1)*bytespersnp, std::ifstream::beg); 62 | char* tmp = new char[bytespersnp + 1]; 63 | tmp[bytespersnp] = '\0'; 64 | char tmpi; 65 | 66 | // Allocate more than the sample size since data must take up whole bytes 67 | char* tmp2 = new char[bytespersnp * PACK_DENSITY + 1]; 68 | tmp2[bytespersnp * PACK_DENSITY] = '\0'; 69 | // One iteration per SNP record: read bytespersnp raw bytes into tmp, then unpack PACK_DENSITY two-bit values per byte into tmp2. 70 | int k; 71 | for(int j = 0 ; j < readsnps; j++) { 72 | //for(unsigned int j = 0 ; j < 3; j++) { 73 | if(verbose && j % 1000 == 0) Rcout << "\r" << j/1000 << "k SNPs read..."; 74 | 75 | // read raw genotypes 76 | in.read((char*)tmp, sizeof(char) * bytespersnp); 77 | // NOTE(review): the stream state is not checked after in.read(), so a short or failed read would go unnoticed here - confirm inputs are validated upstream. 78 | for(int l = 0; l < bytespersnp; l++) { 79 | if(!blockused[l]) continue; 80 | // blockused[l] is nonzero only for bytes that hold at least one selected individual (set from indvec earlier in this function). 81 | tmpi = tmp[l]; 82 | k = PACK_DENSITY * l; 83 | 84 | /* geno is interpreted as a char, however a1 and a2 are bits for allele 1 and 85 | * allele 2. The final genotype is the sum of the alleles, except for 11 86 | * which denotes missing.
87 | */ 88 | tmp2[k] = (tmpi & MASK3) >> 6; 89 | tmp2[k+1] = (tmpi & MASK2) >> 4; 90 | tmp2[k+2] = (tmpi & MASK1) >> 2; 91 | tmp2[k+3] = (tmpi & MASK0); 92 | } 93 | 94 | int c = 0; 95 | if(!transpose) { 96 | for(int i = 0; i < nind; i++) { 97 | if(!indvec[i]) continue; 98 | val = (double)tmp2[i]; 99 | if(val != 3) geno(j, c) = val; 100 | c++; 101 | } 102 | } else { 103 | for(int i = 0; i < nind; i++) { 104 | if(!indvec[i]) continue; 105 | val = (double)tmp2[i]; 106 | if(val != 3) geno(c, j) = val; 107 | c++; 108 | } 109 | } 110 | } 111 | if(verbose) Rcout << std::endl; 112 | 113 | delete[] tmp; 114 | delete[] tmp2; 115 | delete[] blockused; 116 | in.close(); 117 | 118 | return geno; 119 | } 120 | 121 | 122 | -------------------------------------------------------------------------------- /docs/bootstrap-toc.js: -------------------------------------------------------------------------------- 1 | /*! 2 | * Bootstrap Table of Contents v0.4.1 (http://afeld.github.io/bootstrap-toc/) 3 | * Copyright 2015 Aidan Feldman 4 | * Licensed under MIT (https://github.com/afeld/bootstrap-toc/blob/gh-pages/LICENSE.md) */ 5 | (function() { 6 | 'use strict'; 7 | 8 | window.Toc = { 9 | helpers: { 10 | // return all matching elements in the set, or their descendants 11 | findOrFilter: function($el, selector) { 12 | // http://danielnouri.org/notes/2011/03/14/a-jquery-find-that-also-finds-the-root-element/ 13 | // http://stackoverflow.com/a/12731439/358804 14 | var $descendants = $el.find(selector); 15 | return $el.filter(selector).add($descendants).filter(':not([data-toc-skip])'); 16 | }, 17 | 18 | generateUniqueIdBase: function(el) { 19 | var text = $(el).text(); 20 | var anchor = text.trim().toLowerCase().replace(/[^A-Za-z0-9]+/g, '-'); 21 | return anchor || el.tagName.toLowerCase(); 22 | }, 23 | 24 | generateUniqueId: function(el) { 25 | var anchorBase = this.generateUniqueIdBase(el); 26 | for (var i = 0; ; i++) { 27 | var anchor = anchorBase; 28 | if (i > 0) { 29 | // add suffix 30 
| anchor += '-' + i; 31 | } 32 | // check if ID already exists 33 | if (!document.getElementById(anchor)) { 34 | return anchor; 35 | } 36 | } 37 | }, 38 | 39 | generateAnchor: function(el) { 40 | if (el.id) { 41 | return el.id; 42 | } else { 43 | var anchor = this.generateUniqueId(el); 44 | el.id = anchor; 45 | return anchor; 46 | } 47 | }, 48 | 49 | createNavList: function() { 50 | return $(''); 51 | }, 52 | 53 | createChildNavList: function($parent) { 54 | var $childList = this.createNavList(); 55 | $parent.append($childList); 56 | return $childList; 57 | }, 58 | 59 | generateNavEl: function(anchor, text) { 60 | var $a = $(''); 61 | $a.attr('href', '#' + anchor); 62 | $a.text(text); 63 | var $li = $('
  • '); 64 | $li.append($a); 65 | return $li; 66 | }, 67 | 68 | generateNavItem: function(headingEl) { 69 | var anchor = this.generateAnchor(headingEl); 70 | var $heading = $(headingEl); 71 | var text = $heading.data('toc-text') || $heading.text(); 72 | return this.generateNavEl(anchor, text); 73 | }, 74 | 75 | // Find the first heading level (`

    `, then `

    `, etc.) that has more than one element. Defaults to 1 (for `

    `). 76 | getTopLevel: function($scope) { 77 | for (var i = 1; i <= 6; i++) { 78 | var $headings = this.findOrFilter($scope, 'h' + i); 79 | if ($headings.length > 1) { 80 | return i; 81 | } 82 | } 83 | 84 | return 1; 85 | }, 86 | 87 | // returns the elements for the top level, and the next below it 88 | getHeadings: function($scope, topLevel) { 89 | var topSelector = 'h' + topLevel; 90 | 91 | var secondaryLevel = topLevel + 1; 92 | var secondarySelector = 'h' + secondaryLevel; 93 | 94 | return this.findOrFilter($scope, topSelector + ',' + secondarySelector); 95 | }, 96 | 97 | getNavLevel: function(el) { 98 | return parseInt(el.tagName.charAt(1), 10); 99 | }, 100 | 101 | populateNav: function($topContext, topLevel, $headings) { 102 | var $context = $topContext; 103 | var $prevNav; 104 | 105 | var helpers = this; 106 | $headings.each(function(i, el) { 107 | var $newNav = helpers.generateNavItem(el); 108 | var navLevel = helpers.getNavLevel(el); 109 | 110 | // determine the proper $context 111 | if (navLevel === topLevel) { 112 | // use top level 113 | $context = $topContext; 114 | } else if ($prevNav && $context === $topContext) { 115 | // create a new level of the tree and switch to it 116 | $context = helpers.createChildNavList($prevNav); 117 | } // else use the current $context 118 | 119 | $context.append($newNav); 120 | 121 | $prevNav = $newNav; 122 | }); 123 | }, 124 | 125 | parseOps: function(arg) { 126 | var opts; 127 | if (arg.jquery) { 128 | opts = { 129 | $nav: arg 130 | }; 131 | } else { 132 | opts = arg; 133 | } 134 | opts.$scope = opts.$scope || $(document.body); 135 | return opts; 136 | } 137 | }, 138 | 139 | // accepts a jQuery object, or an options object 140 | init: function(opts) { 141 | opts = this.helpers.parseOps(opts); 142 | 143 | // ensure that the data attribute is in place for styling 144 | opts.$nav.attr('data-toggle', 'toc'); 145 | 146 | var $topContext = this.helpers.createChildNavList(opts.$nav); 147 | var topLevel = 
this.helpers.getTopLevel(opts.$scope); 148 | var $headings = this.helpers.getHeadings(opts.$scope, topLevel); 149 | this.helpers.populateNav($topContext, topLevel, $headings); 150 | } 151 | }; 152 | 153 | $(function() { 154 | $('nav[data-toggle="toc"]').each(function(i, el) { 155 | var $nav = $(el); 156 | Toc.init($nav); 157 | }); 158 | }); 159 | })(); 160 | -------------------------------------------------------------------------------- /README.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | output: github_document 3 | --- 4 | 5 | 6 | 7 | ```{r, include = FALSE} 8 | knitr::opts_chunk$set( 9 | collapse = TRUE, 10 | comment = "#>", 11 | fig.path = "man/figures/README-", 12 | out.width = "100%" 13 | ) 14 | ``` 15 | 16 | # smartsnp 17 | 18 | 19 | 20 | 21 | ## Overview 22 | 23 | The package *smartsnp* runs fast and user-friendly computation of Principal Component Analysis (PCA) on single-nucleotide-polymorphism (SNP) data suitable for ancient, low-coverage and modern DNA. The package combines SNP scaling for genetic drift and projection of ancient samples onto a modern genetic PCA space (currently available only in Unix environment in the field-standard software EIGENSOFT) with permutation-based multivariate tests for population differences in genetic diversity (both location and dispersion). The package comprises three functions that run each analysis individually (*smart_pca*, *smart_permanova*, *smart_permdisp*), and a wrapper function (*smart_mva*) that runs any combination of the three standalone functions. 
24 | 25 | ## Installation 26 | 27 | You can install the released version of smartsnp from [CRAN](https://CRAN.R-project.org) with: 28 | 29 | ``` r 30 | install.packages("smartsnp") 31 | ``` 32 | 33 | ## Example 34 | 35 | This is an example of how to run PCA, PERMANOVA and PERMDISP controlling for genetic drift for the package's dataset *dataSNP* including 10000 simulated SNPs in 100 samples (80 = modern, 20 = ancient). 36 | 37 | ```{r example, message=FALSE} 38 | #1/ Load package and label samples 39 | library(smartsnp) 40 | # Path to example genotype matrix "dataSNP" 41 | pathToGenoFile = system.file("extdata", "dataSNP", package = "smartsnp") 42 | #assign 50 samples to each of two groups 43 | my_groups <- c(rep("A", 50), rep("B", 50)) 44 | #assign samples 1st to 10th per group to ancient 45 | my_ancient <- c(1:10, 51:60) 46 | 47 | #2/ Run PCA with truncated SVD (PCA 1 x PCA 2 axes) and assign results to object pcaR 48 | pcaR <- smart_pca(snp_data = pathToGenoFile, sample_group = my_groups, sample_project = my_ancient) 49 | #assign statistical results to objects pcaR_eigen, pcaR_load and pcaR_coord 50 | pcaR_eigen <- pcaR$pca.eigenvalues; dim(pcaR_eigen) # extract eigenvalues 51 | pcaR_load <- pcaR$pca.snp_loadings; dim(pcaR_load) # extract principal coefficients (SNP loadings) 52 | pcaR_coord <- pcaR$pca.sample_coordinates; dim(pcaR_coord) # extract principal components (sample position in PCA space) 53 | 54 | #3/ Run PERMANOVA test (group location in PCA1 x PCA2 space after excluding ancient samples) and assign results to object permanovaR 55 | permanovaR <- smart_permanova(snp_data = pathToGenoFile, sample_group = my_groups, target_space = "pca", sample_remove = my_ancient) 56 | #assign sample summary to object permP 57 | permP <- permanovaR$permanova.samples 58 | #show PERMANOVA table 59 | permanovaR$permanova.global_test 60 | 61 | #4/ Run PERMDISP test (group dispersion in PCA1 x PCA2 space after excluding ancient samples) and assign results to object 
permdispR 62 | permdispR <- smart_permdisp(snp_data = pathToGenoFile, sample_group = my_groups, sample_remove = my_ancient) 63 | #assign sample summary to object permD 64 | permD <-permdispR$permdisp.samples 65 | #show PERMDISP table 66 | permdispR$permdisp.global_test 67 | 68 | #5/ Run PCA, PERMANOVA and PERMDISP in one run and assign results to object mvaR 69 | mvaR <- smart_mva(snp_data = pathToGenoFile, sample_group = my_groups, sample_remove = my_ancient) 70 | # assign statistical results to objects mvaR_eigen, mvaR_load and mvaR_coord 71 | mvaR_eigen <- mvaR$pca$pca.eigenvalues # extract PCA eigenvalues 72 | mvaR_load <- mvaR$pca$pca.snp_loadings # extract principal coefficients (SNP loadings) 73 | mvaR_coord <- mvaR$pca$pca.sample_coordinates # extract PCA principal components (sample position in PCA space) 74 | #show PERMANOVA table 75 | mvaR$test$permanova.global_test 76 | #show PERMDISP table 77 | mvaR$test$permdisp.global_test # extract PERMDISP table 78 | #assign sample summary to object mvaS 79 | mvaS <- mvaR$test$test_samples 80 | 81 | #NOTE 1: Modify argument pc_axes to set the number of computed PCA axes (defaults: pc_axes = 2, program_svd = "RSpectra") 82 | #use program_svd = "bootSVD" for computing all PCA axes, where pc_axes has no effect on computations 83 | #NOTE 2: Missing values in dataset can only be coded as 9 (default: missing_value = 9) or NA (missing_value = NA) 84 | #SNPs with missing values are removed by default (missing_impute = "remove") 85 | #use missing_impute = "mean" for imputing missing values with SNP means 86 | #NOTE 3: arguments sample_remove and snp_remove remove any set of samples (by column number) and SNPs (by row number), respectively 87 | #defaults: sample_remove = FALSE, snp_remove = FALSE 88 | #NOTE 4: use argument sample_project to specify ancient samples by row number (default: sample_project = FALSE) 89 | #ancient samples are assumed to include missing values 90 | #if specified, ancient samples are always removed 
from PCA, PERMANOVA and PERMDISP computations 91 | #use argument pc_project to set the PCA space onto which ancient samples are projected (default: pc_project = c(1:2) for PCA 1 x PCA2 space) 92 | 93 | #6/ Plot PCA 1 x PCA 2 94 | #create colors for samples groups 95 | cols <- c("red", "blue") 96 | #create color vector (group A = red, group B = blue, ancient samples = black) 97 | my_groups[my_ancient] <- "ancient"; cols = c("red", "black", "blue") 98 | #plot 99 | plot(pcaR$pca.sample_coordinates[,c("PC1","PC2")], cex = 2, col = cols[as.factor(my_groups)], pch = 19, main = "genotype smartpca") 100 | legend("topleft", legend = levels(as.factor(my_groups)), cex = 1, pch = 19, col = cols, text.col = cols) 101 | ``` 102 | -------------------------------------------------------------------------------- /docs/LICENSE-text.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | License • smartsnp 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 55 | 56 | 57 | 58 | 59 | 60 | 61 |
    62 |
    63 | 123 | 124 | 125 | 126 |
    127 | 128 |
    129 |
    130 | 133 | 134 |
    YEAR: 2021
    135 | COPYRIGHT HOLDER: Salvador Herrando-Pérez
    136 | 
    137 | 138 |
    139 | 140 | 145 | 146 |
    147 | 148 | 149 | 150 |
    151 | 154 | 155 |
    156 |

    Site built with pkgdown 1.6.1.

    157 |
    158 | 159 |
    160 |
    161 | 162 | 163 | 164 | 165 | 166 | 167 | 168 | 169 | -------------------------------------------------------------------------------- /docs/404.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | Page not found (404) • smartsnp 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 55 | 56 | 57 | 58 | 59 | 60 | 61 |
    62 |
    63 | 123 | 124 | 125 | 126 |
    127 | 128 |
    129 |
    130 | 133 | 134 | Content not found. Please use links in the navbar. 135 | 136 |
    137 | 138 | 143 | 144 |
    145 | 146 | 147 | 148 |
    149 | 152 | 153 |
    154 |

    Site built with pkgdown 1.6.1.

    155 |
    156 | 157 |
    158 |
    159 | 160 | 161 | 162 | 163 | 164 | 165 | 166 | 167 | -------------------------------------------------------------------------------- /vignettes/Converting_VCF_and_PLINK_formats.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Using VCF and PLINK formatted files with smartsnp" 3 | output: 4 | rmarkdown::html_vignette: 5 | toc: true 6 | toc_depth: 2 7 | description: > 8 | This Vignette shows how you can use VCF and PLINK formatted variant files with smartsnp. 9 | vignette: > 10 | %\VignetteIndexEntry{Using VCF and PLINK formatted files with smartsnp} 11 | %\VignetteEngine{knitr::rmarkdown} 12 | %\VignetteEncoding{UTF-8} 13 | --- 14 | 15 | 16 | 17 | Package *smartsnp* is not a data-conversion tool. Inspired by the command-line tool SMARTPCA, *smartsnp* handles SNP datasets in text format and in SMARTPCA formats (uncompressed = EIGENSTRAT or compressed binary = PACKEDANCESTRYMAP) and a general genotype matrix format. However, both VCF (.vcf) and PLINK (.bed) formats are frequently used for storing genetic variation data. In this vignette we provide a quick and robust solution for how to transform these two formats into a general genotype matrix (i.e. where homozygous genotypes are coded as 0 or 2, and heterozygotes as 1) that can be used with the *smartsnp* package. 18 | 19 | The general strategy is to use the *plink2* software for transforming VCF or PLINK/bed files into a general (transposed) genotype matrix. It is "transposed" because PLINK and VCF files typically have samples in rows, whereas the general input file for *smartsnp* has samples in columns. 20 | 21 | We make heavy use of the *plink2* software, which is a comprehensive update to Shaun Purcell's original PLINK command-line program.
Binary downloads and an installation guide are available here: 22 | https://www.cog-genomics.org/plink2 23 | 24 | In the *plink2* manual, the file format that we transform our data into is called ".traw (variant-major additive component file)". See here for more information: 25 | https://www.cog-genomics.org/plink/1.9/formats#traw 26 | 27 | Note that the *.traw* format can be directly used with the latest development version of *smartsnp*; no further data transformation is necessary. 28 | 29 | ## Download a small example VCF 30 | 31 | The R package *sim1000G* contains a small VCF, an unfiltered region from the 1000 genomes Phase III sequencing data VCF, chromosome 4, CEU samples. We will first load the package and then use this file as an example dataset. 32 | 33 | 34 | ```r 35 | library(sim1000G) 36 | 37 | # First set the current working directory (cwd) to where you want to download the data to (note that the *Downloads* folder might not exist on your computer, e.g. if you have a Windows system): 38 | oldwd <- getwd() 39 | setwd("~/Downloads/") 40 | ``` 41 | 42 | 43 | ```r 44 | examples_dir <- system.file("examples", package = "sim1000G") 45 | vcf_file <- file.path(examples_dir, "region.vcf.gz") 46 | 47 | file.copy(from = vcf_file, to = "./") # Copy the file to the cwd 48 | ``` 49 | 50 | ``` 51 | ## [1] FALSE 52 | ``` 53 | 54 | ## VCF to PLINK (.bed) 55 | 56 | As a first step, we show how to transform a VCF file into a PLINK/bed format. Note that the VCF is gzipped, but *plink2* can directly use gzipped files. 57 | 58 | We will use the *system* function for calling *plink2*. This is effectively the same as running the quoted command on the command line. 59 | 60 | 61 | ```r 62 | system("plink --vcf region.vcf.gz --make-bed --out region") 63 | ``` 64 | 65 | The --out parameter defines the name of the output file (without any suffix); you can set it to any arbitrary string.
After running this command, you will see three files that make up the PLINK/bed format (region.bim, region.bed, region.fam). See the definition of the PLINK/bed (binary biallelic genotype table) file format for more information: 66 | https://www.cog-genomics.org/plink/1.9/formats#bed 67 | 68 | The *plink2* software offers a wide range of options for filtering and transforming the data that could be useful for your analysis. See the manual: 69 | https://www.cog-genomics.org/plink/1.9/ 70 | 71 | If you don't want to make use of any further *plink2* functionality, then you can also directly transform the VCF file into the .traw format. See section "VCF to raw genotype (.traw)" below. 72 | 73 | ## PLINK to raw genotype (.traw) 74 | 75 | Now we will use the *plink2* software to transform the .bed file into raw genotypes. Again, note that we will need a "transposed" version since *smartsnp* assumes that samples are in columns, not rows. 76 | 77 | 78 | ```r 79 | system("plink --bfile region --recode A-transpose --out region_genotypeMatrix") 80 | ``` 81 | 82 | Again, the --out parameter defines the name of the output file, without the suffix. After running this command, you will see a "region_genotypeMatrix.traw" file. This file can be directly used with smartsnp. 83 | 84 | ## VCF to raw genotype (.traw) 85 | 86 | We could have skipped the intermediate step of transforming the VCF into a PLINK format. The *plink2* software allows you to directly transform the VCF into the .traw format. 87 | 88 | 89 | ```r 90 | system("plink --vcf region.vcf.gz --recode A-transpose --out region_genotypeMatrix") 91 | ``` 92 | 93 | ## Running smartpca 94 | 95 | The VCF file just contained data from a single group (CEU). However, just to demonstrate that this file can be used with smartsnp we'll run a simple PCA analysis. Importantly, you will have to set the *missing_value* parameter to `NA`.
96 | 97 | 98 | ```r 99 | # To use .traw files, we will need to load the latest development version of smartsnp. 100 | install.packages("devtools") 101 | devtools::install_github("ChristianHuber/smartsnp") 102 | ``` 103 | 104 | 105 | ```r 106 | # Load the PLINK (.fam) file to get the number of samples 107 | numSamples = nrow(read.table("region.fam")) 108 | 109 | # There is just a single group in this data 110 | group_id <- rep(c("CEU"), length.out = numSamples) 111 | 112 | # Running smart_pca on the .traw file written by plink above (--out region_genotypeMatrix) 113 | sm.pca <- smart_pca(snp_data = "region_genotypeMatrix.traw", 114 | sample_group = group_id, 115 | missing_value = NA) 116 | 117 | # Here is a plot of the first two components: 118 | plot(sm.pca$pca.sample_coordinates[, c(3,4)]) 119 | ``` 120 | 121 | plot of chunk data_transformation_example_plot 122 | 123 | Voila! Now to go back to the old working directory: 124 | 125 | 126 | ```r 127 | setwd(oldwd) 128 | ``` 129 | 130 | 131 | -------------------------------------------------------------------------------- /vignettes/Converting_VCF_and_PLINK_formats.Rmd.orig: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Using VCF and PLINK formatted files with smartsnp" 3 | output: 4 | rmarkdown::html_vignette: 5 | toc: true 6 | toc_depth: 2 7 | description: > 8 | This Vignette shows how you can use VCF and PLINK formatted variant files with smartsnp. 9 | vignette: > 10 | %\VignetteIndexEntry{Using VCF and PLINK formatted files with smartsnp} 11 | %\VignetteEngine{knitr::rmarkdown} 12 | %\VignetteEncoding{UTF-8} 13 | --- 14 | 15 | ```{r setup, echo = FALSE, message = FALSE} 16 | knitr::opts_chunk$set(collapse = TRUE, comment = "#>") # collapse/comment are chunk options, so they are set via opts_chunk (opts_knit is for package options such as root.dir) 17 | knitr::opts_knit$set(root.dir = normalizePath("~/Dropbox/Salva_PCA/converting_VCF_to_PLINK_and_GENOTYPE")) 18 | options(tibble.print_min = 4L, tibble.print_max = 4L) 19 | library(smartsnp) 20 | library(sim1000G) 21 | set.seed(1014) 22 | 23 | ``` 24 | 25 | Package *smartsnp* is not a data-conversion tool.
Inspired by the command-line tool SMARTPCA, *smartsnp* handles SNP datasets in text format and in SMARTPCA formats (uncompressed = EIGENSTRAT or compressed binary = PACKEDANCESTRYMAP) and a general genotype matrix format. However, both VCF (.vcf) and PLINK (.bed) formats are frequently used for storing genetic variation data. In this vignette we provide a quick and robust solution for how to transform these two formats into a general genotype matrix (i.e. where homozygous genotypes are coded as 0 or 2, and heterozygotes as 1) that can be used with the *smartsnp* package. 26 | 27 | The general strategy is to use the *plink2* software for transforming VCF or PLINK/bed files into a general (transposed) genotype matrix. It is "transposed" because PLINK and VCF files typically have samples in rows, whereas the general input file for *smartsnp* has samples in columns. 28 | 29 | We make heavy use of the *plink2* software, which is a comprehensive update to Shaun Purcell's original PLINK command-line program. Binary downloads and an installation guide are available here: 30 | https://www.cog-genomics.org/plink2 31 | 32 | In the *plink2* manual, the file format that we transform our data into is called ".traw (variant-major additive component file)". See here for more information: 33 | https://www.cog-genomics.org/plink/1.9/formats#traw 34 | 35 | Note that the *.traw* format can be directly used with the latest development version of *smartsnp*; no further data transformation is necessary. 36 | 37 | ## Download a small example VCF 38 | 39 | The R package *sim1000G* contains a small VCF, an unfiltered region from the 1000 genomes Phase III sequencing data VCF, chromosome 4, CEU samples. We will first load the package and then use this file as an example dataset.
40 | 41 | ```{r, message=FALSE, error=FALSE, eval=FALSE} 42 | library(sim1000G) 43 | 44 | # First set the current working directory (cwd) to where you want to download the data to (note that the *Downloads* folder might not exist on your computer, e.g. if you have a Windows system): 45 | oldwd <- getwd() 46 | setwd("~/Downloads/") 47 | ``` 48 | 49 | ```{r, message=FALSE, error=FALSE} 50 | examples_dir <- system.file("examples", package = "sim1000G") 51 | vcf_file <- file.path(examples_dir, "region.vcf.gz") 52 | 53 | file.copy(from = vcf_file, to = "./") # Copy the file to the cwd 54 | ``` 55 | 56 | ## VCF to PLINK (.bed) 57 | 58 | As a first step, we show how to transform a VCF file into a PLINK/bed format. Note that the VCF is gzipped, but *plink2* can directly use gzipped files. 59 | 60 | We will use the *system* function for calling *plink2*. This is effectively the same as running the quoted command on the command line. 61 | 62 | ```{r} 63 | system("plink --vcf region.vcf.gz --make-bed --out region") 64 | ``` 65 | 66 | The --out parameter defines the name of the output file (without any suffix); you can set it to any arbitrary string. After running this command, you will see three files that make up the PLINK/bed format (region.bim, region.bed, region.fam). See the definition of the PLINK/bed (binary biallelic genotype table) file format for more information: 67 | https://www.cog-genomics.org/plink/1.9/formats#bed 68 | 69 | The *plink2* software offers a wide range of options for filtering and transforming the data that could be useful for your analysis. See the manual: 70 | https://www.cog-genomics.org/plink/1.9/ 71 | 72 | If you don't want to make use of any further *plink2* functionality, then you can also directly transform the VCF file into the .traw format. See section "VCF to raw genotype (.traw)" below.
73 | 74 | ## PLINK to raw genotype (.traw) 75 | 76 | Now we will use the *plink2* software to transform the .bed file into raw genotypes. Again, note that we will need a "transposed" version since *smartsnp* assumes that samples are in columns, not rows. 77 | 78 | ```{r} 79 | system("plink --bfile region --recode A-transpose --out region_genotypeMatrix") 80 | ``` 81 | 82 | Again, the --out parameter defines the name of the output file, without the suffix. After running this command, you will see a "region_genotypeMatrix.traw" file. This file can be directly used with smartsnp. 83 | 84 | ## VCF to raw genotype (.traw) 85 | 86 | We could have skipped the intermediate step of transforming the VCF into a PLINK format. The *plink2* software allows you to directly transform the VCF into the .traw format. 87 | 88 | ```{r} 89 | system("plink --vcf region.vcf.gz --recode A-transpose --out region_genotypeMatrix") 90 | ``` 91 | 92 | ## Running smartpca 93 | 94 | The VCF file just contained data from a single group (CEU). However, just to demonstrate that this file can be used with smartsnp we'll run a simple PCA analysis. Importantly, you will have to set the *missing_value* parameter to `NA`. 95 | 96 | ```{r, eval=FALSE} 97 | # To use .traw files, we will need to load the latest development version of smartsnp.
98 | install.packages("devtools") 99 | devtools::install_github("ChristianHuber/smartsnp") 100 | ``` 101 | 102 | ```{r data_transformation_example_plot, message=FALSE, fig.height = 7, fig.width = 7, fig.align = "center"} 103 | # Load the PLINK (.fam) file to get the number of samples 104 | numSamples = nrow(read.table("region.fam")) 105 | 106 | # There is just a single group in this data 107 | group_id <- rep(c("CEU"), length.out = numSamples) 108 | 109 | # Running smart_pca on the .traw file written by plink (--out region_genotypeMatrix) 110 | sm.pca <- smart_pca(snp_data = "region_genotypeMatrix.traw", 111 | sample_group = group_id, 112 | missing_value = NA) 113 | 114 | # Here is a plot of the first two components: 115 | plot(sm.pca$pca.sample_coordinates[, c("PC1", "PC2")]) 116 | ``` 117 | 118 | Voila! Now to go back to the old working directory: 119 | 120 | ```{r, eval=FALSE} 121 | setwd(oldwd) 122 | ``` 123 | 124 | 125 | -------------------------------------------------------------------------------- /inst/extdata/mallard_snps_Kraus2013: -------------------------------------------------------------------------------- 1 | ss263068950 2 | ss263068952 3 | ss263068953 4 | ss263068954 5 | ss263068955 6 | ss263068956 7 | ss263068957 8 | ss263068958 9 | ss263068959 10 | ss263068960 11 | ss263068961 12 | ss263068962 13 | ss263068963 14 | ss263068964 15 | ss263068965 16 | ss263068967 17 | ss263068968 18 | ss263068969 19 | ss263068970 20 | ss263068971 21 | ss263068972 22 | ss263068973 23 | ss263068974 24 | ss263068975 25 | ss263068976 26 | ss263068977 27 | ss263068978 28 | ss263068979 29 | ss263068980 30 | ss263068981 31 | ss263068982 32 | ss263068983 33 | ss263068984 34 | ss263068985 35 | ss263068986 36 | ss263068987 37 | ss263068989 38 | ss263068991 39 | ss263068992 40 | ss263068993 41 | ss263068994 42 | ss263068995 43 | ss263068996 44 | ss263068997 45 | ss263068998 46 | ss263068999 47 | ss263069000 48 | ss263069002 49 | ss263069004 50 | ss263069005 51 | ss263069006 52 | ss263069007 53 | ss263069008 54 | ss263069009 55 | ss263069010 56 | 
ss263069012 57 | ss263069013 58 | ss263069014 59 | ss263069015 60 | ss263069017 61 | ss263069018 62 | ss263069019 63 | ss263069020 64 | ss263069021 65 | ss263069022 66 | ss263069023 67 | ss263069024 68 | ss263069025 69 | ss263069026 70 | ss263069027 71 | ss263069028 72 | ss263069029 73 | ss263069030 74 | ss263069031 75 | ss263069032 76 | ss263069033 77 | ss263069034 78 | ss263069035 79 | ss263069036 80 | ss263069037 81 | ss263069038 82 | ss263069039 83 | ss263069040 84 | ss263069041 85 | ss263069042 86 | ss263069043 87 | ss263069044 88 | ss263069045 89 | ss263069046 90 | ss263069048 91 | ss263069049 92 | ss263069050 93 | ss263069051 94 | ss263069052 95 | ss263069053 96 | ss263069054 97 | ss263069055 98 | ss263069056 99 | ss263069057 100 | ss263069058 101 | ss263069059 102 | ss263069060 103 | ss263069061 104 | ss263069062 105 | ss263069063 106 | ss263069064 107 | ss263069065 108 | ss263069066 109 | ss263069067 110 | ss263069068 111 | ss263069069 112 | ss263069070 113 | ss263069071 114 | ss263069072 115 | ss263069073 116 | ss263069074 117 | ss263069075 118 | ss263069076 119 | ss263069077 120 | ss263069078 121 | ss263069079 122 | ss263069080 123 | ss263069081 124 | ss263069082 125 | ss263069083 126 | ss263069084 127 | ss263069085 128 | ss263069086 129 | ss263069087 130 | ss263069088 131 | ss263069089 132 | ss263069090 133 | ss263069091 134 | ss263069092 135 | ss263069093 136 | ss263069094 137 | ss263069095 138 | ss263069096 139 | ss263069097 140 | ss263069098 141 | ss263069099 142 | ss263069100 143 | ss263069101 144 | ss263069102 145 | ss263069103 146 | ss263069104 147 | ss263069105 148 | ss263069106 149 | ss263069108 150 | ss263069109 151 | ss263069110 152 | ss263069111 153 | ss263069112 154 | ss263069113 155 | ss263069114 156 | ss263069115 157 | ss263069116 158 | ss263069117 159 | ss263069118 160 | ss263069119 161 | ss263069120 162 | ss263069121 163 | ss263069122 164 | ss263069123 165 | ss263069124 166 | ss263069125 167 | ss263069126 168 | ss263069127 169 | 
ss263069128 170 | ss263069129 171 | ss263069130 172 | ss263069131 173 | ss263069132 174 | ss263069133 175 | ss263069134 176 | ss263069136 177 | ss263069137 178 | ss263069138 179 | ss263069139 180 | ss263069140 181 | ss263069141 182 | ss263069142 183 | ss263069143 184 | ss263069144 185 | ss263069145 186 | ss263069146 187 | ss263069147 188 | ss263069148 189 | ss263069149 190 | ss263069150 191 | ss263069151 192 | ss263069152 193 | ss263069153 194 | ss263069154 195 | ss263069155 196 | ss263069156 197 | ss263069157 198 | ss263069158 199 | ss263069159 200 | ss263069160 201 | ss263069162 202 | ss263069163 203 | ss263069164 204 | ss263069165 205 | ss263069166 206 | ss263069167 207 | ss263069168 208 | ss263069169 209 | ss263069170 210 | ss263069171 211 | ss263069172 212 | ss263069173 213 | ss263069174 214 | ss263069175 215 | ss263069176 216 | ss263069177 217 | ss263069178 218 | ss263069179 219 | ss263069180 220 | ss263069181 221 | ss263069182 222 | ss263069183 223 | ss263069184 224 | ss263069185 225 | ss263069186 226 | ss263069187 227 | ss263069188 228 | ss263069189 229 | ss263069190 230 | ss263069191 231 | ss263069192 232 | ss263069193 233 | ss263069195 234 | ss263069196 235 | ss263069198 236 | ss263069199 237 | ss263069200 238 | ss263069201 239 | ss263069202 240 | ss263069203 241 | ss263069204 242 | ss263069205 243 | ss263069206 244 | ss263069207 245 | ss263069208 246 | ss263069209 247 | ss263069210 248 | ss263069211 249 | ss263069212 250 | ss263069213 251 | ss263069214 252 | ss263069215 253 | ss263069217 254 | ss263069218 255 | ss263069219 256 | ss263069220 257 | ss263069221 258 | ss263069222 259 | ss263069223 260 | ss263069224 261 | ss263069225 262 | ss263069226 263 | ss263069227 264 | ss263069228 265 | ss263069229 266 | ss263069230 267 | ss263069231 268 | ss263069232 269 | ss263069233 270 | ss263069234 271 | ss263069235 272 | ss263069236 273 | ss263069237 274 | ss263069238 275 | ss263069239 276 | ss263069240 277 | ss263069241 278 | ss263069243 279 | ss263069244 280 | 
ss263069245 281 | ss263069246 282 | ss263069247 283 | ss263069248 284 | ss263069249 285 | ss263069250 286 | ss263069251 287 | ss263069252 288 | ss263069253 289 | ss263069254 290 | ss263069255 291 | ss263069256 292 | ss263069257 293 | ss263069258 294 | ss263069259 295 | ss263069261 296 | ss263069262 297 | ss263069263 298 | ss263069264 299 | ss263069265 300 | ss263069266 301 | ss263069267 302 | ss263069268 303 | ss263069269 304 | ss263069270 305 | ss263069271 306 | ss263069272 307 | ss263069273 308 | ss263069274 309 | ss263069275 310 | ss263069276 311 | ss263069277 312 | ss263069278 313 | ss263069279 314 | ss263069280 315 | ss263069281 316 | ss263069282 317 | ss263069283 318 | ss263069284 319 | ss263069286 320 | ss263069287 321 | ss263069288 322 | ss263069289 323 | ss263069290 324 | ss263069291 325 | ss263069292 326 | ss263069293 327 | ss263069294 328 | ss263069295 329 | ss263069296 330 | ss263069297 331 | ss263069298 332 | ss263069299 333 | ss263069300 334 | ss263069302 335 | ss263069303 336 | ss263069304 337 | ss263069305 338 | ss263069306 339 | ss263069307 340 | ss263069308 341 | ss263069309 342 | ss263069310 343 | ss263069311 344 | ss263069312 345 | ss263069313 346 | ss263069314 347 | ss263069315 348 | ss263069316 349 | ss263069317 350 | ss263069318 351 | ss263069319 352 | ss263069320 353 | ss263069321 354 | ss263069323 355 | ss263069324 356 | ss263069325 357 | ss263069326 358 | ss263069327 359 | ss263069328 360 | ss263069329 361 | ss263069330 362 | ss263069331 363 | ss263069332 364 | ss263069333 365 | -------------------------------------------------------------------------------- /docs/news/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | Changelog • smartsnp 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 55 | 56 | 57 | 58 
| 59 | 60 | 61 |
    62 |
    63 | 123 | 124 | 125 | 126 |
    127 | 128 |
    129 |
    130 | 134 | 135 |
    136 |

    137 | smartsnp 1.1.0 2021-03-04 138 |

    139 |
      140 |
    • Added a NEWS.md file to track changes to the package.

    • 141 |
    • First submission of package smartsnp (1 March 2020)

    • 142 |
    143 |
    144 |
    145 | 146 | 151 | 152 |
    153 | 154 | 155 |
    156 | 159 | 160 |
    161 |

    Site built with pkgdown 1.6.1.

    162 |
    163 | 164 |
    165 |
    166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | -------------------------------------------------------------------------------- /docs/authors.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | Authors • smartsnp 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 55 | 56 | 57 | 58 | 59 | 60 | 61 |
    62 |
    63 | 123 | 124 | 125 | 126 |
    127 | 128 |
    129 |
    130 | 133 | 134 |
      135 |
    • 136 |

      Salvador Herrando-Perez. Author. 137 |

      138 |
    • 139 |
    • 140 |

      Ray Tobler. Contributor. 141 |

      142 |
    • 143 |
    • 144 |

      Christian Huber. Contributor, maintainer. 145 |

      146 |
    • 147 |
    148 | 149 |
    150 | 151 |
    152 | 153 | 154 | 155 |
    156 | 159 | 160 |
    161 |

    Site built with pkgdown 1.6.1.

    162 |
    163 | 164 |
    165 |
    166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | -------------------------------------------------------------------------------- /docs/articles/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | Articles • smartsnp 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 55 | 56 | 57 | 58 | 59 | 60 | 61 |
    62 |
    63 | 123 | 124 | 125 | 126 |
    127 | 128 |
    129 |
    130 | 133 | 134 |
    135 |

    All vignettes

    136 |

    137 | 138 |
    139 |
    Using VCF and PLINK formatted files with smartsnp
    140 |

    This Vignette shows how you can use VCF and PLINK formatted variant files with smartsnp.

    141 |
    Projecting ancient samples
    142 |

    This Vignette provides an example of how to project ancient DNA onto modern data using the smartsnp package.

    143 |
    Example PCA, PERMANOVA and PERMDISP analysis
    144 |

    This Vignette provides an example analysis of genetic data using the smartsnp package.

    145 |
    146 |
    147 |
    148 |
    149 | 150 | 151 |
    152 | 155 | 156 |
    157 |

    Site built with pkgdown 1.6.1.

    158 |
    159 | 160 |
    161 |
    162 | 163 | 164 | 165 | 166 | 167 | 168 | 169 | 170 | -------------------------------------------------------------------------------- /docs/LICENSE.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | MIT License • smartsnp 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 55 | 56 | 57 | 58 | 59 | 60 | 61 |
    62 |
    63 | 123 | 124 | 125 | 126 |
    127 | 128 |
    129 |
    130 | 133 | 134 |
    135 | 136 |

    Copyright (c) 2021 Salvador Herrando-Pérez

    137 |

    Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

    138 |

    The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.

    139 |

    THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

    140 |
    141 | 142 |
    143 | 144 | 149 | 150 |
    151 | 152 | 153 | 154 |
    155 | 158 | 159 |
    160 |

    Site built with pkgdown 1.6.1.

    161 |
    162 | 163 |
    164 |
    165 | 166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | # smartsnp 5 | 6 | 7 | 8 | 9 | 10 | ## Overview 11 | 12 | The package *smartsnp* runs fast and user-friendly computation of 13 | Principal Component Analysis (PCA) on single-nucleotide-polymorphism 14 | (SNP) data suitable for ancient, low-coverage and modern DNA. The 15 | package combines SNP scaling for genetic drift and projection of ancient 16 | samples onto a modern genetic PCA space (currently available only in 17 | Unix environment in the field-standard software EIGENSOFT) with 18 | permutation-based multivariate tests for population differences in 19 | genetic diversity (both location and dispersion). The package comprises 20 | three functions that run each analysis individually (*smart\_pca*, 21 | *smart\_permanova*, *smart\_permdisp*), and a wrapper function 22 | (*smart\_mva*) that runs any combination of the three standalone 23 | functions. 24 | 25 | ## Installation 26 | 27 | Install the released version of smartsnp from 28 | [CRAN](https://CRAN.R-project.org) with: 29 | 30 | ``` r 31 | install.packages("smartsnp") 32 | ``` 33 | 34 | Install the latest development version from GitHub: 35 | 36 | ``` r 37 | if (!require("devtools")) { 38 | install.packages("devtools") 39 | } 40 | devtools::install_github("ChristianHuber/smartsnp") 41 | ``` 42 | 43 | ## Vignettes 44 | 45 | The vignettes demonstrate the different steps of a typical smartsnp analysis. 
46 | 47 | * [Example PCA, PERMANOVA and PERMDISP analysis](https://christianhuber.github.io/smartsnp/articles/mallard_smartpca_analysis.html) 48 | 49 | * [Projecting ancient samples](https://christianhuber.github.io/smartsnp/articles/aDNA_smartpca_analysis.html) 50 | 51 | * [Using VCF and PLINK formatted files with smartsnp](https://christianhuber.github.io/smartsnp/articles/Converting_VCF_and_PLINK_formats.html) 52 | 53 | All vignettes and function descriptions can be found at the *smartsnp* website: 54 | 55 | https://christianhuber.github.io/smartsnp 56 | 57 | ## Quick example 58 | 59 | This is an example of how to run PCA, PERMANOVA and PERMDISP controlling 60 | for genetic drift for the package’s dataset *dataSNP* including 10000 61 | simulated SNPs in 100 samples (80 = modern, 20 = ancient). 62 | 63 | ``` r 64 | #1/ Load package and label samples 65 | library(smartsnp) 66 | # Path to example genotype matrix "dataSNP" 67 | pathToGenoFile = system.file("extdata", "dataSNP", package = "smartsnp") 68 | #assign 50 samples to each of two groups 69 | my_groups <- c(rep("A", 50), rep("B", 50)) 70 | #assign samples 1st to 10th per group to ancient 71 | my_ancient <- c(1:10, 51:60) 72 | 73 | #2/ Run PCA with truncated SVD (PCA 1 x PCA 2 axes) and assign results to object pcaR 74 | pcaR <- smart_pca(snp_data = pathToGenoFile, sample_group = my_groups, sample_project = my_ancient) 75 | #assign statistical results to objects pcaR_eigen, pcaR_load and pcaR_coord 76 | pcaR_eigen <- pcaR$pca.eigenvalues; dim(pcaR_eigen) # extract eigenvalues 77 | #> [1] 3 2 78 | pcaR_load <- pcaR$pca.snp_loadings; dim(pcaR_load) # extract principal coefficients (SNP loadings) 79 | #> [1] 4532 2 80 | pcaR_coord <- pcaR$pca.sample_coordinates; dim(pcaR_coord) # extract principal components (sample position in PCA space) 81 | #> [1] 100 4 82 | 83 | #3/ Run PERMANOVA test (group location in PCA1 x PCA2 space after excluding ancient samples) and assign results to object permanovaR 84 | permanovaR 
<- smart_permanova(snp_data = pathToGenoFile, sample_group = my_groups, target_space = "pca", sample_remove = my_ancient) 85 | #assign sample summary to object permP 86 | permP <- permanovaR$permanova.samples 87 | #show PERMANOVA table 88 | permanovaR$permanova.global_test 89 | #> Df SumsOfSqs MeanSqs F.Model R2 Pr(>F) 90 | #> group 1 175.8 175.82 0.45613 0.00581 0.6459 91 | #> Residuals 78 30066.3 385.47 0.99419 92 | #> Total 79 30242.1 1.00000 93 | 94 | #4/ Run PERMDISP test (group dispersion in PCA1 x PCA2 space after excluding ancient samples) and assign results to object permdispR 95 | permdispR <- smart_permdisp(snp_data = pathToGenoFile, sample_group = my_groups, sample_remove = my_ancient) 96 | #assign sample summary to object permD 97 | permD <-permdispR$permdisp.samples 98 | #show PERMDISP table 99 | permdispR$permdisp.global_test 100 | #> Df Sum Sq Mean Sq F Pr(>F) 101 | #> Groups 1 0.07254468 0.07254468 0.1911168 0.6693 102 | #> Residuals 78 29.60747071 0.37958296 NA NA 103 | 104 | #5/ Run PCA, PERMANOVA and PERMDISP in one run and assign results to object mvaR 105 | mvaR <- smart_mva(snp_data = pathToGenoFile, sample_group = my_groups, sample_remove = my_ancient) 106 | # assign statistical results to objects mvaR_eigen, mvaR_load and mvaR_coord 107 | mvaR_eigen <- mvaR$pca$pca.eigenvalues # extract PCA eigenvalues 108 | mvaR_load <- mvaR$pca$pca.snp_loadings # extract principal coefficients (SNP loadings) 109 | mvaR_coord <- mvaR$pca$pca.sample_coordinates # extract PCA principal components (sample position in PCA space) 110 | #show PERMANOVA table 111 | mvaR$test$permanova.global_test 112 | #> Df SumsOfSqs MeanSqs F.Model R2 Pr(>F) 113 | #> group 1 11849 11849 0.97217 0.01231 0.9092 114 | #> Residuals 78 950644 12188 0.98769 115 | #> Total 79 962493 1.00000 116 | #show PERMDISP table 117 | mvaR$test$permdisp.global_test # extract PERMDISP table 118 | #> Df Sum Sq Mean Sq F Pr(>F) 119 | #> Groups 1 0.07254468 0.07254468 0.1911168 0.6661 120 | #> 
Residuals 78 29.60747071 0.37958296 NA NA 121 | #assign sample summary to object mvaS 122 | mvaS <- mvaR$test$test_samples 123 | 124 | #NOTE 1: Modify argument pc_axes to set the number of computed PCA axes (defaults: pc_axes = 2, program_svd = "RSpectra") 125 | #use program_svd = "bootSVD" for computing all PCA axes, where pc_axes has no effect on computations 126 | #NOTE 2: Missing values in dataset can only be coded as 9 (default: missing_value = 9) or NA (missing_value = NA) 127 | #SNPs with missing values are removed by default (missing_impute = "remove") 128 | #use missing_impute = "mean" for imputing missing values with SNP means 129 | #NOTE 3: arguments sample_remove and snp_remove remove any set of samples (by column number) and SNPs (by row number), respectively 130 | #defaults: sample_remove = FALSE, snp_remove = FALSE 131 | #NOTE 4: use argument sample_project to specify ancient samples by column number (default: sample_project = FALSE) 132 | #ancient samples are assumed to include missing values 133 | #if specified, ancient samples are always removed from PCA, PERMANOVA and PERMDISP computations 134 | #use argument pc_project to set the PCA space onto which ancient samples are projected (default: pc_project = c(1:2) for PCA 1 x PCA2 space) 135 | 136 | #6/ Plot PCA 1 x PCA 2 137 | #create colors for samples groups 138 | cols <- c("red", "blue") 139 | #create color vector (group A = red, group B = blue, ancient samples = black) 140 | my_groups[my_ancient] <- "ancient"; cols = c("red", "black", "blue") 141 | #plot 142 | plot(pcaR$pca.sample_coordinates[,c("PC1","PC2")], cex = 2, col = cols[as.factor(my_groups)], pch = 19, main = "genotype smartpca") 143 | legend("topleft", legend = levels(as.factor(my_groups)), cex = 1, pch = 19, col = cols, text.col = cols) 144 | ``` 145 | 146 | 147 | -------------------------------------------------------------------------------- /docs/reference/read_packedancestrymap.html: 
-------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | Read Files in PACKEDANCESTRYMAP format — read_packedancestrymap • smartsnp 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 56 | 57 | 58 | 59 | 60 | 61 | 62 |
    63 |
    64 | 124 | 125 | 126 | 127 |
    128 | 129 |
    130 |
    131 | 136 | 137 |
    138 |

    This function loads genotype data in PACKEDANCESTRYMAP format (binary or compressed).

    139 |
    140 | 141 |
    read_packedancestrymap(pref)
    142 | 143 |

    Arguments

    144 | 145 | 146 | 147 | 148 | 149 | 150 |
    pref

    The prefix of the file name that contains the genotype data (i.e., without the *.geno).

    151 | 152 |

    Value

    153 | 154 |

    Returns a list containing a single element:

      155 |
    • geno Genotype data as R matrix.

    • 156 |
    157 | 158 | 159 |
    160 | 165 |
    166 | 167 | 168 |
    169 | 172 | 173 |
    174 |

    Site built with pkgdown 1.6.1.

    175 |
    176 | 177 |
    178 |
    179 | 180 | 181 | 182 | 183 | 184 | 185 | 186 | 187 | -------------------------------------------------------------------------------- /docs/reference/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | Function reference • smartsnp 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 55 | 56 | 57 | 58 | 59 | 60 | 61 |
    62 |
    63 | 123 | 124 | 125 | 126 |
    127 | 128 |
    129 |
    130 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | 159 | 160 | 161 | 162 | 165 | 166 | 167 | 168 | 171 | 172 | 173 | 174 | 177 | 178 | 179 | 180 | 183 | 184 | 185 | 186 |
    145 |

    All functions

    146 |

    147 |
    157 |

    read_packedancestrymap()

    158 |

    Read Files in PACKEDANCESTRYMAP format

    163 |

    smart_mva

    164 |

    Smart Multivariate Analyses (wrapper of PCA, PERMANOVA and PERMDISP)

    169 |

    smart_pca

    170 |

    Smart Principal Component Analysis

    175 |

    smart_permanova

    176 |

    Smart Permutational Multivariate Analysis of Variance

    181 |

    smart_permdisp

    182 |

    Smart Permutational Multivariate Analysis of Dispersion

    187 |
    188 | 189 | 194 |
    195 | 196 | 197 |
    198 | 201 | 202 |
    203 |

    Site built with pkgdown 1.6.1.

    204 |
    205 | 206 |
    207 |
    208 | 209 | 210 | 211 | 212 | 213 | 214 | 215 | 216 | -------------------------------------------------------------------------------- /docs/pkgdown.css: -------------------------------------------------------------------------------- 1 | /* Sticky footer */ 2 | 3 | /** 4 | * Basic idea: https://philipwalton.github.io/solved-by-flexbox/demos/sticky-footer/ 5 | * Details: https://github.com/philipwalton/solved-by-flexbox/blob/master/assets/css/components/site.css 6 | * 7 | * .Site -> body > .container 8 | * .Site-content -> body > .container .row 9 | * .footer -> footer 10 | * 11 | * Key idea seems to be to ensure that .container and __all its parents__ 12 | * have height set to 100% 13 | * 14 | */ 15 | 16 | html, body { 17 | height: 100%; 18 | } 19 | 20 | body { 21 | position: relative; 22 | } 23 | 24 | body > .container { 25 | display: flex; 26 | height: 100%; 27 | flex-direction: column; 28 | } 29 | 30 | body > .container .row { 31 | flex: 1 0 auto; 32 | } 33 | 34 | footer { 35 | margin-top: 45px; 36 | padding: 35px 0 36px; 37 | border-top: 1px solid #e5e5e5; 38 | color: #666; 39 | display: flex; 40 | flex-shrink: 0; 41 | } 42 | footer p { 43 | margin-bottom: 0; 44 | } 45 | footer div { 46 | flex: 1; 47 | } 48 | footer .pkgdown { 49 | text-align: right; 50 | } 51 | footer p { 52 | margin-bottom: 0; 53 | } 54 | 55 | img.icon { 56 | float: right; 57 | } 58 | 59 | img { 60 | max-width: 100%; 61 | } 62 | 63 | /* Fix bug in bootstrap (only seen in firefox) */ 64 | summary { 65 | display: list-item; 66 | } 67 | 68 | /* Typographic tweaking ---------------------------------*/ 69 | 70 | .contents .page-header { 71 | margin-top: calc(-60px + 1em); 72 | } 73 | 74 | dd { 75 | margin-left: 3em; 76 | } 77 | 78 | /* Section anchors ---------------------------------*/ 79 | 80 | a.anchor { 81 | margin-left: -30px; 82 | display:inline-block; 83 | width: 30px; 84 | height: 30px; 85 | visibility: hidden; 86 | 87 | background-image: url(./link.svg); 88 | background-repeat: 
no-repeat; 89 | background-size: 20px 20px; 90 | background-position: center center; 91 | } 92 | 93 | .hasAnchor:hover a.anchor { 94 | visibility: visible; 95 | } 96 | 97 | @media (max-width: 767px) { 98 | .hasAnchor:hover a.anchor { 99 | visibility: hidden; 100 | } 101 | } 102 | 103 | 104 | /* Fixes for fixed navbar --------------------------*/ 105 | 106 | .contents h1, .contents h2, .contents h3, .contents h4 { 107 | padding-top: 60px; 108 | margin-top: -40px; 109 | } 110 | 111 | /* Navbar submenu --------------------------*/ 112 | 113 | .dropdown-submenu { 114 | position: relative; 115 | } 116 | 117 | .dropdown-submenu>.dropdown-menu { 118 | top: 0; 119 | left: 100%; 120 | margin-top: -6px; 121 | margin-left: -1px; 122 | border-radius: 0 6px 6px 6px; 123 | } 124 | 125 | .dropdown-submenu:hover>.dropdown-menu { 126 | display: block; 127 | } 128 | 129 | .dropdown-submenu>a:after { 130 | display: block; 131 | content: " "; 132 | float: right; 133 | width: 0; 134 | height: 0; 135 | border-color: transparent; 136 | border-style: solid; 137 | border-width: 5px 0 5px 5px; 138 | border-left-color: #cccccc; 139 | margin-top: 5px; 140 | margin-right: -10px; 141 | } 142 | 143 | .dropdown-submenu:hover>a:after { 144 | border-left-color: #ffffff; 145 | } 146 | 147 | .dropdown-submenu.pull-left { 148 | float: none; 149 | } 150 | 151 | .dropdown-submenu.pull-left>.dropdown-menu { 152 | left: -100%; 153 | margin-left: 10px; 154 | border-radius: 6px 0 6px 6px; 155 | } 156 | 157 | /* Sidebar --------------------------*/ 158 | 159 | #pkgdown-sidebar { 160 | margin-top: 30px; 161 | position: -webkit-sticky; 162 | position: sticky; 163 | top: 70px; 164 | } 165 | 166 | #pkgdown-sidebar h2 { 167 | font-size: 1.5em; 168 | margin-top: 1em; 169 | } 170 | 171 | #pkgdown-sidebar h2:first-child { 172 | margin-top: 0; 173 | } 174 | 175 | #pkgdown-sidebar .list-unstyled li { 176 | margin-bottom: 0.5em; 177 | } 178 | 179 | /* bootstrap-toc tweaks 
------------------------------------------------------*/ 180 | 181 | /* All levels of nav */ 182 | 183 | nav[data-toggle='toc'] .nav > li > a { 184 | padding: 4px 20px 4px 6px; 185 | font-size: 1.5rem; 186 | font-weight: 400; 187 | color: inherit; 188 | } 189 | 190 | nav[data-toggle='toc'] .nav > li > a:hover, 191 | nav[data-toggle='toc'] .nav > li > a:focus { 192 | padding-left: 5px; 193 | color: inherit; 194 | border-left: 1px solid #878787; 195 | } 196 | 197 | nav[data-toggle='toc'] .nav > .active > a, 198 | nav[data-toggle='toc'] .nav > .active:hover > a, 199 | nav[data-toggle='toc'] .nav > .active:focus > a { 200 | padding-left: 5px; 201 | font-size: 1.5rem; 202 | font-weight: 400; 203 | color: inherit; 204 | border-left: 2px solid #878787; 205 | } 206 | 207 | /* Nav: second level (shown on .active) */ 208 | 209 | nav[data-toggle='toc'] .nav .nav { 210 | display: none; /* Hide by default, but at >768px, show it */ 211 | padding-bottom: 10px; 212 | } 213 | 214 | nav[data-toggle='toc'] .nav .nav > li > a { 215 | padding-left: 16px; 216 | font-size: 1.35rem; 217 | } 218 | 219 | nav[data-toggle='toc'] .nav .nav > li > a:hover, 220 | nav[data-toggle='toc'] .nav .nav > li > a:focus { 221 | padding-left: 15px; 222 | } 223 | 224 | nav[data-toggle='toc'] .nav .nav > .active > a, 225 | nav[data-toggle='toc'] .nav .nav > .active:hover > a, 226 | nav[data-toggle='toc'] .nav .nav > .active:focus > a { 227 | padding-left: 15px; 228 | font-weight: 500; 229 | font-size: 1.35rem; 230 | } 231 | 232 | /* orcid ------------------------------------------------------------------- */ 233 | 234 | .orcid { 235 | font-size: 16px; 236 | color: #A6CE39; 237 | /* margins are required by official ORCID trademark and display guidelines */ 238 | margin-left:4px; 239 | margin-right:4px; 240 | vertical-align: middle; 241 | } 242 | 243 | /* Reference index & topics ----------------------------------------------- */ 244 | 245 | .ref-index th {font-weight: normal;} 246 | 247 | .ref-index td 
{vertical-align: top; min-width: 100px} 248 | .ref-index .icon {width: 40px;} 249 | .ref-index .alias {width: 40%;} 250 | .ref-index-icons .alias {width: calc(40% - 40px);} 251 | .ref-index .title {width: 60%;} 252 | 253 | .ref-arguments th {text-align: right; padding-right: 10px;} 254 | .ref-arguments th, .ref-arguments td {vertical-align: top; min-width: 100px} 255 | .ref-arguments .name {width: 20%;} 256 | .ref-arguments .desc {width: 80%;} 257 | 258 | /* Nice scrolling for wide elements --------------------------------------- */ 259 | 260 | table { 261 | display: block; 262 | overflow: auto; 263 | } 264 | 265 | /* Syntax highlighting ---------------------------------------------------- */ 266 | 267 | pre { 268 | word-wrap: normal; 269 | word-break: normal; 270 | border: 1px solid #eee; 271 | } 272 | 273 | pre, code { 274 | background-color: #f8f8f8; 275 | color: #333; 276 | } 277 | 278 | pre code { 279 | overflow: auto; 280 | word-wrap: normal; 281 | white-space: pre; 282 | } 283 | 284 | pre .img { 285 | margin: 5px 0; 286 | } 287 | 288 | pre .img img { 289 | background-color: #fff; 290 | display: block; 291 | height: auto; 292 | } 293 | 294 | code a, pre a { 295 | color: #375f84; 296 | } 297 | 298 | a.sourceLine:hover { 299 | text-decoration: none; 300 | } 301 | 302 | .fl {color: #1514b5;} 303 | .fu {color: #000000;} /* function */ 304 | .ch,.st {color: #036a07;} /* string */ 305 | .kw {color: #264D66;} /* keyword */ 306 | .co {color: #888888;} /* comment */ 307 | 308 | .message { color: black; font-weight: bolder;} 309 | .error { color: orange; font-weight: bolder;} 310 | .warning { color: #6A0366; font-weight: bolder;} 311 | 312 | /* Clipboard --------------------------*/ 313 | 314 | .hasCopyButton { 315 | position: relative; 316 | } 317 | 318 | .btn-copy-ex { 319 | position: absolute; 320 | right: 0; 321 | top: 0; 322 | visibility: hidden; 323 | } 324 | 325 | .hasCopyButton:hover button.btn-copy-ex { 326 | visibility: visible; 327 | } 328 | 329 | /* 
headroom.js ------------------------ */ 330 | 331 | .headroom { 332 | will-change: transform; 333 | transition: transform 200ms linear; 334 | } 335 | .headroom--pinned { 336 | transform: translateY(0%); 337 | } 338 | .headroom--unpinned { 339 | transform: translateY(-100%); 340 | } 341 | 342 | /* mark.js ----------------------------*/ 343 | 344 | mark { 345 | background-color: rgba(255, 255, 51, 0.5); 346 | border-bottom: 2px solid rgba(255, 153, 51, 0.3); 347 | padding: 1px; 348 | } 349 | 350 | /* vertical spacing after htmlwidgets */ 351 | .html-widget { 352 | margin-bottom: 10px; 353 | } 354 | 355 | /* fontawesome ------------------------ */ 356 | 357 | .fab { 358 | font-family: "Font Awesome 5 Brands" !important; 359 | } 360 | 361 | /* don't display links in code chunks when printing */ 362 | /* source: https://stackoverflow.com/a/10781533 */ 363 | @media print { 364 | code a:link:after, code a:visited:after { 365 | content: ""; 366 | } 367 | } 368 | -------------------------------------------------------------------------------- /vignettes/aDNA_smartpca_analysis.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Projecting ancient samples" 3 | output: 4 | rmarkdown::html_vignette: 5 | toc: true 6 | toc_depth: 2 7 | description: > 8 | This Vignette provides an example of how to project ancient DNA onto modern data using 9 | the smartsnp package. 10 | vignette: > 11 | %\VignetteIndexEntry{Projecting ancient samples} 12 | %\VignetteEngine{knitr::rmarkdown} 13 | %\VignetteEncoding{UTF-8} 14 | --- 15 | 16 | 17 | 18 | This Vignette provides an example of projecting ancient DNA onto modern data in a PCA analysis using the *smartsnp* package. We will use data from one of the first large-scale ancient DNA studies, Lazaridis et al. 2016: 19 | 20 | Lazaridis et al. "Genomic insights into the origin of farming in the ancient Near East", Nature volume 536, pages 419–424 (2016). 
21 | 22 | The data is available online but needs to be pre-processed. In particular, the aDNA data needs to be merged with modern data. All steps can be completed in R, but some need certain command-line software installed - they won't work on a Windows machine. 23 | 24 | If you are just interested in how to run *smart_pca* with ancient samples (all you need is an index vector with the column numbers of the ancient samples, called *SP* in the code below), feel free to go straight to the section "Running smartpca" below. 25 | 26 | ## Install package *smartsnp* 27 | 28 | Select one of two options. 29 | 30 | Install development version from GitHub: 31 | 32 | 33 | ```r 34 | install.packages("devtools") 35 | devtools::install_github("ChristianHuber/smartsnp") 36 | ``` 37 | 38 | Install release version from CRAN: 39 | 40 | 41 | ```r 42 | install.packages("smartsnp") 43 | ``` 44 | 45 | Load the package: 46 | 47 | 48 | ```r 49 | library(smartsnp) 50 | ``` 51 | 52 | ## Downloading the data 53 | 54 | First, set the working directory to a location where you want to download and process the files. In my case, I'm choosing the Downloads directory in my home folder. 55 | 56 | 57 | ```r 58 | oldwd <- getwd() 59 | setwd("~/Downloads/") 60 | ``` 61 | 62 | We will download the data provided here https://reich.hms.harvard.edu/datasets using a command-line tool called *wget*. Alternatively, you can download the file with a browser, using the link in the command below. 63 | Note that this is quite a large file, >200 MB! 64 | 65 | 66 | ```r 67 | system("wget https://reich.hms.harvard.edu/sites/reich.hms.harvard.edu/files/inline-files/NearEastPublic.tar.gz") 68 | ``` 69 | 70 | The downloaded data has to be unzipped. I will unzip it into a new folder called "data". 
71 | 72 | 73 | ```r 74 | system("mkdir data") # Make a new folder called "data" 75 | system("tar -xvf NearEastPublic.tar.gz -C ./data") # Unzip data into this folder 76 | system("rm NearEastPublic.tar.gz") # Remove the zip file 77 | ``` 78 | 79 | 80 | ## Select subsets of individuals with convertf 81 | 82 | The data is in a PACKEDANCESTRYMAP format. The ancient and modern data is in two different files, and we are only interested in the Western Eurasian subset of the modern samples. 83 | 84 | In the next step, we will extract the Western Eurasian samples from the full set of modern samples (i.e. keep only those samples). Then, we will merge the modern with the ancient data. 85 | 86 | The *convertf* and *mergeit* command-line tools of the Eigensoft package conveniently perform these two operations; see here for installation: 87 | https://github.com/DReichLab/EIG 88 | 89 | To run Eigensoft software within R using the *system* function, you might need to first explicitly tell R where the software can be found: 90 | 91 | 92 | ```r 93 | # Path to Eigensoft binaries (might be different on your computer): 94 | pathToEIGENSOFT = "~/repos/EIG/bin/" 95 | Sys.setenv(PATH = paste(Sys.getenv()["PATH"], paste0(":", pathToEIGENSOFT), sep="")) 96 | ``` 97 | 98 | We need to generate a parameter file for *convertf* that contains all the file names. We also need to generate a list of West Eurasian populations in a text file. 
99 | 100 | 101 | ```r 102 | # West Eurasian group names (written to WestEurasia.poplist.txt in the next chunk) 103 | westEurasian_pops <- c( 104 | "Abkhasian", "Adygei", "Albanian", "Armenian", "Assyrian", "Balkar", "Basque", "BedouinA", "BedouinB", "Belarusian", "Bulgarian", "Canary_Islander", 105 | "Chechen", "Croatian", "Cypriot", "Czech", "Druze", "English", "Estonian", "Finnish", "French", "Georgian", "German", "Greek", "Hungarian", "Icelandic", 106 | "Iranian", "Irish", "Irish_Ulster", "Italian_North", "Italian_South", "Jew_Ashkenazi", "Jew_Georgian", "Jew_Iranian", "Jew_Iraqi", "Jew_Libyan", "Jew_Moroccan", 107 | "Jew_Tunisian", "Jew_Turkish", "Jew_Yemenite", "Jordanian", "Kumyk", "Lebanese_Christian", "Lebanese", "Lebanese_Muslim", "Lezgin", "Lithuanian", "Maltese", 108 | "Mordovian", "North_Ossetian", "Norwegian", "Orcadian", "Palestinian", "Polish", "Romanian", "Russian", "Sardinian", "Saudi", "Scottish", "Shetlandic", "Sicilian", 109 | "Sorb", "Spanish_North", "Spanish", "Syrian", "Turkish", "Ukrainian" 110 | ) 111 | ``` 112 | 113 | 114 | ```r 115 | # Generating the parameter file for convertf: 116 | par.ANCESTRYMAP.FILTER <- c( 117 | "genotypename: ./data/HumanOriginsPublic2068.geno", 118 | "snpname: ./data/HumanOriginsPublic2068.snp", 119 | "indivname: ./data/HumanOriginsPublic2068.ind", 120 | "poplistname: ./WestEurasia.poplist.txt", 121 | "genotypeoutname: ./data/HumanOriginsPublic2068.WestEurasia.geno", 122 | "snpoutname: ./data/HumanOriginsPublic2068.WestEurasia.snp", 123 | "indivoutname: ./data/HumanOriginsPublic2068.WestEurasia.ind" 124 | ) 125 | writeLines(westEurasian_pops, con = "WestEurasia.poplist.txt") # Write the population list referenced by 'poplistname' above (one group name per line) 126 | writeLines(par.ANCESTRYMAP.FILTER, con = "par.ANCESTRYMAP.FILTER") 127 | 128 | # Now run convertf using the system command in R. This is equivalent to running the quoted command in a terminal: 129 | 130 | system("convertf -p par.ANCESTRYMAP.FILTER") 131 | ``` 132 | 133 | ## Merging ancient with modern data using mergeit 134 | 135 | Now we combine the ancient samples with the modern data using *mergeit*. 
136 | Again, we first need a parameter file and then we can run *mergeit* with the *system* function in R (or alternatively in the terminal). 137 | 138 | 139 | ```r 140 | params <- c( 141 | "geno1: ./data/HumanOriginsPublic2068.WestEurasia.geno", 142 | "snp1: ./data/HumanOriginsPublic2068.WestEurasia.snp", 143 | "ind1: ./data/HumanOriginsPublic2068.WestEurasia.ind", 144 | "geno2: ./data/AncientLazaridis2016.geno", 145 | "snp2: ./data/AncientLazaridis2016.snp", 146 | "ind2: ./data/AncientLazaridis2016.ind", 147 | "genooutfilename: ./data/AncientLazaridis2016_ModernWestEurasia.geno", 148 | "snpoutfilename: ./data/AncientLazaridis2016_ModernWestEurasia.snp", 149 | "indoutfilename: ./data/AncientLazaridis2016_ModernWestEurasia.ind" 150 | ) 151 | 152 | writeLines(params, con = "mergeit.params.txt") 153 | 154 | system("mergeit -p mergeit.params.txt") 155 | ``` 156 | 157 | ## Running smartpca 158 | 159 | We still need two additional vectors before we can run *smartsnp*: one that defines the ancient samples, and one that defines which samples we want to remove before running the PCA. 
160 | 161 | 162 | ```r 163 | # Group names of the ancient groups 164 | aDNA_inds <- c("Anatolia_ChL", "Anatolia_N", "Armenia_ChL", "Armenia_EBA", "Armenia_MLBA", "CHG", "EHG", "Europe_EN", "Europe_LNBA", "Europe_MNChL", "Iberia_BA", "Iran_ChL", "Iran_HotuIIIb", "Iran_LN", "Iran_N", "Levant_BA", "Levant_N", "Natufian", "SHG", "Steppe_EMBA", "Steppe_Eneolithic", "Steppe_IA", "Steppe_MLBA", "Switzerland_HG", "WHG") 165 | 166 | # Contains group names of all groups in merged data 167 | GR <- read.table("./data/AncientLazaridis2016_ModernWestEurasia.ind", header=F) 168 | 169 | # Vector defining ancient and modern groups 170 | SA <- ifelse(GR$V3 %in% westEurasian_pops, "modern", "ancient") 171 | 172 | # Samples to remove: 173 | sample.rem <- c("Mota", "Denisovan", "Chimp", "Mbuti.DG", "Altai", 174 | "Vi_merge", "Clovis", "Kennewick", "Chuvash", "Ust_Ishim", 175 | "AG2", "MA1", "MezE", "hg19ref", "Kostenki14") 176 | 177 | # Simple index vectors that determines which samples to remove and which to use for PCA ordination or projection: 178 | SR <- which(GR$V3 %in% sample.rem) # Column numbers of samples to remove 179 | SP <- which(SA == "ancient") # Column numbers of samples to project (i.e. aDNA) 180 | ``` 181 | 182 | Now we are finally ready to run smart_pca: 183 | 184 | 185 | ```r 186 | # Running smart_pca: 187 | sm.pca <- smart_pca(snp_data = "./data/AncientLazaridis2016_ModernWestEurasia.geno", 188 | sample_group = GR$V3, missing_value = 9, missing_impute = "mean", 189 | scaling = "drift", program_svd = "RSpectra", pc_axes = 2, 190 | sample_remove = SR, sample_project = SP, pc_project = 1:2) 191 | 192 | # To see more information on the different parameter options: 193 | ?smart_pca 194 | ``` 195 | 196 | ## Plotting the results using ggplot2 197 | 198 | We can have a look at the result using *ggplot2* (and *ggrepel* for labeling the groups). The *data.table* package is used to simplify some data operations. 
199 | For more info on these packages, see: 200 | 201 | https://ggplot2.tidyverse.org/ 202 | 203 | https://cran.r-project.org/web/packages/ggrepel/vignettes/ggrepel.html 204 | 205 | https://cran.r-project.org/web/packages/data.table/vignettes/datatable-intro.html 206 | 207 | 208 | ```r 209 | # This needs the R libraries ggplot2, ggrepel, and data.table. 210 | library(ggplot2) 211 | library(ggrepel) 212 | library(data.table) 213 | 214 | # Plotting with ggplot2 and labeling the groups with ggrepel: 215 | 216 | smart_mva.aDNA_WestEurasia.evec <- data.table(sm.pca$pca.sample_coordinates) 217 | smart_mva.aDNA_WestEurasia.evec[, c("PC1mean", "PC2mean") := .(mean(PC1), mean(PC2)), Group] 218 | smart_mva.aDNA_WestEurasia.evec[, Name := GR$V1] 219 | 220 | ggplot() + 221 | geom_point(data = smart_mva.aDNA_WestEurasia.evec[Class == "PCA"], aes(-PC1, PC2), col="grey", alpha=0.5) + 222 | geom_point(data = smart_mva.aDNA_WestEurasia.evec[Class == "Projected" ], aes(-PC1, PC2, fill=Group, shape=Group), size=3) + 223 | scale_shape_manual(values=rep(21:25, 100)) + 224 | geom_label_repel(data = smart_mva.aDNA_WestEurasia.evec[Class == "Projected",.SD[1], Group], aes(-PC1mean, PC2mean, label=Group, col=Group), alpha=0.7, segment.color="NA") + 225 | theme_bw() + theme(legend.position = "none") 226 | ``` 227 | 228 | plot of chunk lazaridis_plot 229 | 230 | Voila! Note that we have plotted the negative of PC1 (i.e. -PC1) here. The only reason for this is to make the plot have the same orientation as the original plot in Lazaridis et al. (2016), Fig. 1B. Importantly, changing the sign of any axis of a PCA does not change its interpretation, and different software can give different signs. 
See the excellent explanation here: 231 | 232 | https://stats.stackexchange.com/questions/88880/does-the-sign-of-scores-or-of-loadings-in-pca-or-fa-have-a-meaning-may-i-revers 233 | 234 | Finally, let's move back to the old working directory: 235 | 236 | 237 | ```r 238 | setwd(oldwd) 239 | ``` 240 | 241 | 242 | 243 | 244 | 245 | 246 | 247 | -------------------------------------------------------------------------------- /vignettes/aDNA_smartpca_analysis.Rmd.orig: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Projecting ancient samples" 3 | output: 4 | rmarkdown::html_vignette: 5 | toc: true 6 | toc_depth: 2 7 | description: > 8 | This Vignette provides an example of how to project ancient DNA onto modern data using 9 | the smartsnp package. 10 | vignette: > 11 | %\VignetteIndexEntry{Projecting ancient samples} 12 | %\VignetteEngine{knitr::rmarkdown} 13 | %\VignetteEncoding{UTF-8} 14 | --- 15 | 16 | ```{r setup, echo = FALSE, message = FALSE} 17 | knitr::opts_chunk$set(collapse = TRUE, comment = "#>") # collapse/comment are chunk options, so they must be set via opts_chunk (opts_knit ignores them) 18 | knitr::opts_knit$set(root.dir = normalizePath("~/Dropbox/Salva_PCA/TESTING/Lazaridis_2016")) # root.dir is a package option; NOTE(review): author-specific data location, adjust before re-knitting 19 | options(tibble.print_min = 4L, tibble.print_max = 4L) 20 | library(smartsnp) 21 | set.seed(1014) 22 | 23 | ``` 24 | 25 | This Vignette provides an example of projecting ancient DNA onto modern data in a PCA analysis using the *smartsnp* package. We will use data from one of the first large-scale ancient DNA studies, Lazaridis et al. 2016: 26 | 27 | Lazaridis et al. "Genomic insights into the origin of farming in the ancient Near East", Nature volume 536, pages 419–424 (2016). 28 | 29 | The data is available online but needs to be pre-processed. In particular, the aDNA data needs to be merged with modern data. All steps can be completed in R, but some need certain command-line software installed - they won't work on a Windows machine. 
30 | 31 | If you are just interested in how to run *smart_pca* with ancient samples (all you need is an index vector with the column numbers of the ancient samples, called *SP* in the code below), feel free to go straight to the section "Running smartpca" below. 32 | 33 | ## Install package *smartsnp* 34 | 35 | Select one of two options. 36 | 37 | Install development version from GitHub: 38 | 39 | ```{r, eval = FALSE} 40 | install.packages("devtools") 41 | devtools::install_github("ChristianHuber/smartsnp") 42 | ``` 43 | 44 | Install release version from CRAN: 45 | 46 | ```{r, eval = FALSE} 47 | install.packages("smartsnp") 48 | ``` 49 | 50 | Load the package: 51 | 52 | ```{r} 53 | library(smartsnp) 54 | ``` 55 | 56 | ## Downloading the data 57 | 58 | First, set the working directory to a location where you want to download and process the files. In my case, I'm choosing the Downloads directory in my home folder. 59 | 60 | ```{r, eval=FALSE} 61 | oldwd <- getwd() 62 | setwd("~/Downloads/") 63 | ``` 64 | 65 | We will download the data provided here https://reich.hms.harvard.edu/datasets using a command-line tool called *wget*. Alternatively, you can download the file with a browser, using the link in the command below. 66 | Note that this is quite a large file, >200 MB! 67 | 68 | ```{r, eval=FALSE} 69 | system("wget https://reich.hms.harvard.edu/sites/reich.hms.harvard.edu/files/inline-files/NearEastPublic.tar.gz") 70 | ``` 71 | 72 | The downloaded data has to be unzipped. I will unzip it into a new folder called "data". 73 | 74 | ```{r, eval=FALSE} 75 | system("mkdir data") # Make a new folder called "data" 76 | system("tar -xvf NearEastPublic.tar.gz -C ./data") # Unzip data into this folder 77 | system("rm NearEastPublic.tar.gz") # Remove the zip file 78 | ``` 79 | 80 | 81 | ## Select subsets of individuals with convertf 82 | 83 | The data is in a PACKEDANCESTRYMAP format. 
The ancient and modern data is in two different files, and we are only interested in the Western Eurasian subset of the modern samples. 84 | 85 | In the next step, we will extract the Western Eurasian samples from the full set of modern samples (i.e. keep only those samples). Then, we will merge the modern with the ancient data. 86 | 87 | The *convertf* and *mergeit* command-line tools of the Eigensoft package conveniently perform these two operations; see here for installation: 88 | https://github.com/DReichLab/EIG 89 | 90 | To run Eigensoft software within R using the *system* function, you might need to first explicitly tell R where the software can be found: 91 | 92 | ```{r} 93 | # Path to Eigensoft binaries (might be different on your computer): 94 | pathToEIGENSOFT = "~/repos/EIG/bin/" 95 | Sys.setenv(PATH = paste(Sys.getenv()["PATH"], paste0(":", pathToEIGENSOFT), sep="")) 96 | ``` 97 | 98 | We need to generate a parameter file for *convertf* that contains all the file names. We also need to generate a list of West Eurasian populations in a text file. 
99 | 100 | ```{r, eval=TRUE} 101 | # West Eurasian group names (written to WestEurasia.poplist.txt in the next chunk) 102 | westEurasian_pops <- c( 103 | "Abkhasian", "Adygei", "Albanian", "Armenian", "Assyrian", "Balkar", "Basque", "BedouinA", "BedouinB", "Belarusian", "Bulgarian", "Canary_Islander", 104 | "Chechen", "Croatian", "Cypriot", "Czech", "Druze", "English", "Estonian", "Finnish", "French", "Georgian", "German", "Greek", "Hungarian", "Icelandic", 105 | "Iranian", "Irish", "Irish_Ulster", "Italian_North", "Italian_South", "Jew_Ashkenazi", "Jew_Georgian", "Jew_Iranian", "Jew_Iraqi", "Jew_Libyan", "Jew_Moroccan", 106 | "Jew_Tunisian", "Jew_Turkish", "Jew_Yemenite", "Jordanian", "Kumyk", "Lebanese_Christian", "Lebanese", "Lebanese_Muslim", "Lezgin", "Lithuanian", "Maltese", 107 | "Mordovian", "North_Ossetian", "Norwegian", "Orcadian", "Palestinian", "Polish", "Romanian", "Russian", "Sardinian", "Saudi", "Scottish", "Shetlandic", "Sicilian", 108 | "Sorb", "Spanish_North", "Spanish", "Syrian", "Turkish", "Ukrainian" 109 | ) 110 | ``` 111 | 112 | ```{r, eval=FALSE} 113 | 114 | # Generating the parameter file for convertf: 115 | par.ANCESTRYMAP.FILTER <- c( 116 | "genotypename: ./data/HumanOriginsPublic2068.geno", 117 | "snpname: ./data/HumanOriginsPublic2068.snp", 118 | "indivname: ./data/HumanOriginsPublic2068.ind", 119 | "poplistname: ./WestEurasia.poplist.txt", 120 | "genotypeoutname: ./data/HumanOriginsPublic2068.WestEurasia.geno", 121 | "snpoutname: ./data/HumanOriginsPublic2068.WestEurasia.snp", 122 | "indivoutname: ./data/HumanOriginsPublic2068.WestEurasia.ind" 123 | ) 124 | writeLines(westEurasian_pops, con = "WestEurasia.poplist.txt") # Write the population list referenced by 'poplistname' above (one group name per line) 125 | writeLines(par.ANCESTRYMAP.FILTER, con = "par.ANCESTRYMAP.FILTER") 126 | 127 | # Now run convertf using the system command in R. This is equivalent to running the quoted command in a terminal: 128 | 129 | system("convertf -p par.ANCESTRYMAP.FILTER") 130 | ``` 131 | 132 | ## Merging ancient with modern data using mergeit 133 | 134 | Now we combine the ancient samples with the modern data using *mergeit*. 
135 | Again, we first need a parameter file and then we can run *mergeit* with the *system* function in R (or alternatively in the terminal). 136 | 137 | ```{r, eval = FALSE} 138 | params <- c( 139 | "geno1: ./data/HumanOriginsPublic2068.WestEurasia.geno", 140 | "snp1: ./data/HumanOriginsPublic2068.WestEurasia.snp", 141 | "ind1: ./data/HumanOriginsPublic2068.WestEurasia.ind", 142 | "geno2: ./data/AncientLazaridis2016.geno", 143 | "snp2: ./data/AncientLazaridis2016.snp", 144 | "ind2: ./data/AncientLazaridis2016.ind", 145 | "genooutfilename: ./data/AncientLazaridis2016_ModernWestEurasia.geno", 146 | "snpoutfilename: ./data/AncientLazaridis2016_ModernWestEurasia.snp", 147 | "indoutfilename: ./data/AncientLazaridis2016_ModernWestEurasia.ind" 148 | ) 149 | 150 | writeLines(params, con = "mergeit.params.txt") 151 | 152 | system("mergeit -p mergeit.params.txt") 153 | ``` 154 | 155 | ## Running smartpca 156 | 157 | We still need two additional vectors before we can run *smartsnp*: one that defines the ancient samples, and one that defines which samples we want to remove before running the PCA. 
158 | 159 | ```{r} 160 | # Group names of the ancient groups 161 | aDNA_inds <- c("Anatolia_ChL", "Anatolia_N", "Armenia_ChL", "Armenia_EBA", "Armenia_MLBA", "CHG", "EHG", "Europe_EN", "Europe_LNBA", "Europe_MNChL", "Iberia_BA", "Iran_ChL", "Iran_HotuIIIb", "Iran_LN", "Iran_N", "Levant_BA", "Levant_N", "Natufian", "SHG", "Steppe_EMBA", "Steppe_Eneolithic", "Steppe_IA", "Steppe_MLBA", "Switzerland_HG", "WHG") 162 | 163 | # Contains group names of all groups in merged data 164 | GR <- read.table("./data/AncientLazaridis2016_ModernWestEurasia.ind", header=F) 165 | 166 | # Vector defining ancient and modern groups 167 | SA <- ifelse(GR$V3 %in% westEurasian_pops, "modern", "ancient") 168 | 169 | # Samples to remove: 170 | sample.rem <- c("Mota", "Denisovan", "Chimp", "Mbuti.DG", "Altai", 171 | "Vi_merge", "Clovis", "Kennewick", "Chuvash", "Ust_Ishim", 172 | "AG2", "MA1", "MezE", "hg19ref", "Kostenki14") 173 | 174 | # Simple index vectors that determines which samples to remove and which to use for PCA ordination or projection: 175 | SR <- which(GR$V3 %in% sample.rem) # Column numbers of samples to remove 176 | SP <- which(SA == "ancient") # Column numbers of samples to project (i.e. aDNA) 177 | ``` 178 | 179 | Now we are finally ready to run smart_pca: 180 | 181 | ```{r run__smart_pca, message = FALSE} 182 | # Running smart_pca: 183 | sm.pca <- smart_pca(snp_data = "./data/AncientLazaridis2016_ModernWestEurasia.geno", 184 | sample_group = GR$V3, missing_value = 9, missing_impute = "mean", 185 | scaling = "drift", program_svd = "RSpectra", pc_axes = 2, 186 | sample_remove = SR, sample_project = SP, pc_project = 1:2) 187 | 188 | # To see more information on the different parameter options: 189 | ?smart_pca 190 | ``` 191 | 192 | ## Plotting the results using ggplot2 193 | 194 | We can have a look at the result using *ggplot2* (and *ggrepel* for labeling the groups). The *data.table* package is used to simplify some data operations. 
195 | For more info on these packages, see: 196 | 197 | https://ggplot2.tidyverse.org/ 198 | 199 | https://cran.r-project.org/web/packages/ggrepel/vignettes/ggrepel.html 200 | 201 | https://cran.r-project.org/web/packages/data.table/vignettes/datatable-intro.html 202 | 203 | ```{r lazaridis_plot, fig.height = 7, fig.width = 7, fig.align = "center", message=FALSE, error=FALSE} 204 | # This needs the R libraries ggplot2, ggrepel, and data.table. 205 | library(ggplot2) 206 | library(ggrepel) 207 | library(data.table) 208 | 209 | # Plotting with ggplot2 and labeling the groups with ggrepel: 210 | 211 | smart_mva.aDNA_WestEurasia.evec <- data.table(sm.pca$pca.sample_coordinates) 212 | smart_mva.aDNA_WestEurasia.evec[, c("PC1mean", "PC2mean") := .(mean(PC1), mean(PC2)), Group] 213 | smart_mva.aDNA_WestEurasia.evec[, Name := GR$V1] 214 | 215 | ggplot() + 216 | geom_point(data = smart_mva.aDNA_WestEurasia.evec[Class == "PCA"], aes(-PC1, PC2), col="grey", alpha=0.5) + 217 | geom_point(data = smart_mva.aDNA_WestEurasia.evec[Class == "Projected" ], aes(-PC1, PC2, fill=Group, shape=Group), size=3) + 218 | scale_shape_manual(values=rep(21:25, 100)) + 219 | geom_label_repel(data = smart_mva.aDNA_WestEurasia.evec[Class == "Projected",.SD[1], Group], aes(-PC1mean, PC2mean, label=Group, col=Group), alpha=0.7, segment.color="NA") + 220 | theme_bw() + theme(legend.position = "none") 221 | 222 | 223 | ``` 224 | 225 | Voila! Note that we have plotted the negative of PC1 (i.e. -PC1) here. The only reason for this is to make the plot have the same orientation as the original plot in Lazaridis et al. (2016), Fig. 1B. Importantly, changing the sign of any axis of a PCA does not change its interpretation, and different software can give different signs. 
See the excellent explanation here: 226 | 227 | https://stats.stackexchange.com/questions/88880/does-the-sign-of-scores-or-of-loadings-in-pca-or-fa-have-a-meaning-may-i-revers 228 | 229 | Finally, let's move back to the old working directory: 230 | 231 | ```{r, eval=FALSE} 232 | setwd(oldwd) 233 | ``` 234 | 235 | 236 | 237 | 238 | 239 | 240 | 241 | -------------------------------------------------------------------------------- /docs/docsearch.css: -------------------------------------------------------------------------------- 1 | /* Docsearch -------------------------------------------------------------- */ 2 | /* 3 | Source: https://github.com/algolia/docsearch/ 4 | License: MIT 5 | */ 6 | 7 | .algolia-autocomplete { 8 | display: block; 9 | -webkit-box-flex: 1; 10 | -ms-flex: 1; 11 | flex: 1 12 | } 13 | 14 | .algolia-autocomplete .ds-dropdown-menu { 15 | width: 100%; 16 | min-width: none; 17 | max-width: none; 18 | padding: .75rem 0; 19 | background-color: #fff; 20 | background-clip: padding-box; 21 | border: 1px solid rgba(0, 0, 0, .1); 22 | box-shadow: 0 .5rem 1rem rgba(0, 0, 0, .175); 23 | } 24 | 25 | @media (min-width:768px) { 26 | .algolia-autocomplete .ds-dropdown-menu { 27 | width: 175% 28 | } 29 | } 30 | 31 | .algolia-autocomplete .ds-dropdown-menu::before { 32 | display: none 33 | } 34 | 35 | .algolia-autocomplete .ds-dropdown-menu [class^=ds-dataset-] { 36 | padding: 0; 37 | background-color: rgb(255,255,255); 38 | border: 0; 39 | max-height: 80vh; 40 | } 41 | 42 | .algolia-autocomplete .ds-dropdown-menu .ds-suggestions { 43 | margin-top: 0 44 | } 45 | 46 | .algolia-autocomplete .algolia-docsearch-suggestion { 47 | padding: 0; 48 | overflow: visible 49 | } 50 | 51 | .algolia-autocomplete .algolia-docsearch-suggestion--category-header { 52 | padding: .125rem 1rem; 53 | margin-top: 0; 54 | font-size: 1.3em; 55 | font-weight: 500; 56 | color: #00008B; 57 | border-bottom: 0 58 | } 59 | 60 | .algolia-autocomplete .algolia-docsearch-suggestion--wrapper { 61 
| float: none; 62 | padding-top: 0 63 | } 64 | 65 | .algolia-autocomplete .algolia-docsearch-suggestion--subcategory-column { 66 | float: none; 67 | width: auto; 68 | padding: 0; 69 | text-align: left 70 | } 71 | 72 | .algolia-autocomplete .algolia-docsearch-suggestion--content { 73 | float: none; 74 | width: auto; 75 | padding: 0 76 | } 77 | 78 | .algolia-autocomplete .algolia-docsearch-suggestion--content::before { 79 | display: none 80 | } 81 | 82 | .algolia-autocomplete .ds-suggestion:not(:first-child) .algolia-docsearch-suggestion--category-header { 83 | padding-top: .75rem; 84 | margin-top: .75rem; 85 | border-top: 1px solid rgba(0, 0, 0, .1) 86 | } 87 | 88 | .algolia-autocomplete .ds-suggestion .algolia-docsearch-suggestion--subcategory-column { 89 | display: block; 90 | padding: .1rem 1rem; 91 | margin-bottom: 0.1; 92 | font-size: 1.0em; 93 | font-weight: 400 94 | /* display: none */ 95 | } 96 | 97 | .algolia-autocomplete .algolia-docsearch-suggestion--title { 98 | display: block; 99 | padding: .25rem 1rem; 100 | margin-bottom: 0; 101 | font-size: 0.9em; 102 | font-weight: 400 103 | } 104 | 105 | .algolia-autocomplete .algolia-docsearch-suggestion--text { 106 | padding: 0 1rem .5rem; 107 | margin-top: -.25rem; 108 | font-size: 0.8em; 109 | font-weight: 400; 110 | line-height: 1.25 111 | } 112 | 113 | .algolia-autocomplete .algolia-docsearch-footer { 114 | width: 110px; 115 | height: 20px; 116 | z-index: 3; 117 | margin-top: 10.66667px; 118 | float: right; 119 | font-size: 0; 120 | line-height: 0; 121 | } 122 | 123 | .algolia-autocomplete .algolia-docsearch-footer--logo { 124 | background-image: url("data:image/svg+xml;utf8,"); 125 | background-repeat: no-repeat; 126 | background-position: 50%; 127 | background-size: 100%; 128 | overflow: hidden; 129 | text-indent: -9000px; 130 | width: 100%; 131 | height: 100%; 132 | display: block; 133 | transform: translate(-8px); 134 | } 135 | 136 | .algolia-autocomplete .algolia-docsearch-suggestion--highlight { 137 | 
color: #FF8C00; 138 | background: rgba(232, 189, 54, 0.1) 139 | } 140 | 141 | 142 | .algolia-autocomplete .algolia-docsearch-suggestion--text .algolia-docsearch-suggestion--highlight { 143 | box-shadow: inset 0 -2px 0 0 rgba(105, 105, 105, .5) 144 | } 145 | 146 | .algolia-autocomplete .ds-suggestion.ds-cursor .algolia-docsearch-suggestion--content { 147 | background-color: rgba(192, 192, 192, .15) 148 | } 149 | -------------------------------------------------------------------------------- /vignettes/mallard_smartpca_analysis.Rmd.orig: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Example PCA, PERMANOVA and PERMDISP analysis" 3 | output: 4 | rmarkdown::html_vignette: 5 | toc: true 6 | toc_depth: 2 7 | description: > 8 | This Vignette provides an example analysis of genetic data using 9 | the smartsnp package. 10 | vignette: > 11 | %\VignetteIndexEntry{Example PCA, PERMANOVA and PERMDISP analysis} 12 | %\VignetteEngine{knitr::rmarkdown} 13 | %\VignetteEncoding{UTF-8} 14 | --- 15 | 16 | ```{r, echo = FALSE, message = FALSE} 17 | knitr::opts_chunk$set(collapse = T, comment = "#>") 18 | options(tibble.print_min = 4L, tibble.print_max = 4L) 19 | library(smartsnp) 20 | set.seed(1014) 21 | ``` 22 | 23 | This Vignette provides an example analysis of genetic data using the smartsnp package. 24 | 25 | ## Description of the data 26 | 27 | Multivariate analysis of mallard genotypes using the dataset published by Kraus et al. 2013. 28 | 29 | Paper = https://onlinelibrary.wiley.com/doi/10.1111/mec.12098 30 | 31 | Dataset = https://datadryad.org/stash/dataset/doi:10.5061/dryad.1bq39 32 | 33 | Population SEAP removed from dataset as its geographic background is unclear (Robert Kraus, pers. comm., 02/06/2021). 34 | 35 | Populations GBAB, GBFE and GBNM (British Isles) removed from dataset as these individuals might have mixed with captive/feral mallards (Robert Kraus, pers. comm., 03/06/2021). 
36 | 37 | Three datasets are available. They are part of the *smartsnp* package, so you don't need to download or process the data from Dryad: 38 | 39 | * Genotype data (mallard_genotype_Kraus2012.txt) = 364 SNPs (rows) x 695 individuals (columns), individuals comprise 55 populations and 10 flyways. Genotypes are 0, 1, 2 or (for missing values) 9 40 | * Group names (mallard_samples_Kraus2013.txt) = 695 rows x 3 columns, column 1 = flyway names, column 2 = population names, column 3 = individual names; individuals comprise 55 populations and 10 flyways 41 | * SNP names (mallard_snps_Kraus2013.txt) = 364 rows x 1 column 42 | 43 | The study supports panmixia in cosmopolitan bird species (see Kraus et al. 2013): 44 | 45 | "...Only Greenland is genetically differentiated from the remaining mallard 46 | population, and to a lesser extent, slight differentiation is observed between 47 | flyways in Europe and North America". 48 | 49 | "...There is a lack of clear population structure, suggesting that the world's 50 | mallards, perhaps with minor exceptions, form a single large, mainly 51 | interbreeding population". 
52 | 53 | 54 | ## Install package *smartsnp* (use one option) 55 | 56 | From GitHub: 57 | 58 | ```{r, eval = FALSE} 59 | install.packages("devtools") 60 | devtools::install_github("ChristianHuber/smartsnp") 61 | ``` 62 | 63 | From CRAN: 64 | 65 | ```{r, eval = FALSE} 66 | install.packages("smartsnp") 67 | ``` 68 | 69 | ## Load package 70 | 71 | ```{r} 72 | library(smartsnp) 73 | ``` 74 | 75 | ## Create group factor 76 | 77 | Load group file (flyway = categorical predictor in PERMANOVA AND PERMDISP tests): 78 | 79 | ```{r} 80 | pathToFile <- system.file("extdata", "mallard_samples_Kraus2013", package = "smartsnp") 81 | my_groups <- c(data.table::fread(pathToFile, header = FALSE))[[1]] 82 | length(my_groups) #number of individuals 83 | length(table(my_groups)) #number of flyways 84 | table(my_groups) #number of individuals per flyway 85 | ``` 86 | 87 | Number of populations (not needed for analysis hereafter): 88 | 89 | ```{r} 90 | my_pops <- c(data.table::fread(pathToFile, header = FALSE))[[2]] 91 | length(table(my_pops)) #number of populations 92 | table(my_pops) #number of individuals per population 93 | ``` 94 | 95 | Code per individual (not needed for analysis hereafter): 96 | 97 | ```{r} 98 | my_indv <- c(data.table::fread(pathToFile, header = FALSE))[[3]] 99 | ``` 100 | 101 | SNP names (not needed for analysis hereafter): 102 | 103 | ```{r} 104 | pathToFile <- system.file("extdata", "mallard_snps_Kraus2013", package = "smartsnp") 105 | my_snps <- c(data.table::fread(pathToFile, header = FALSE))[[1]] 106 | length(my_snps) # number of snps 107 | ``` 108 | 109 | ## Run *smart_pca* 110 | 111 | Run PCA with truncated SVD (PCA 1 x PCA 2 axes) and assign results to object pcaR (missing values imputed with means, SNPs scaled to control genetic drift): 112 | 113 | ```{r, message=FALSE} 114 | pathToFile <- system.file("extdata", "mallard_genotype_Kraus2012", package = "smartsnp") 115 | pcaR <- smart_pca(snp_data = pathToFile, sample_group = my_groups, missing_impute = 
"mean") 116 | ``` 117 | 118 | pcaR is a list with 3 elements: 119 | 120 | ```{r} 121 | class(pcaR) 122 | names(pcaR) 123 | str(pcaR) 124 | ``` 125 | 126 | Assign statistical results to objects pcaR_eigen, pcaR_load and pcaR_coord: 127 | 128 | ```{r} 129 | pcaR_eigen <- pcaR$pca.eigenvalues # extract eigenvalues (PCA1 and PC2 axes explain 3.5% variation in SNP variation across individuals) 130 | pcaR_load <- pcaR$pca.snp_loadings # extract principal coefficients (high SNP loadings indicate loci with stronger variation across individuals) 131 | pcaR_coord <- pcaR$pca.sample_coordinates # extract principal components (position of individuals in PCA space used to generate the ordination) 132 | ``` 133 | 134 | Plot PCA: 135 | 136 | ```{r pca_plot_mallard, fig.height = 7, fig.width = 7, fig.align = "center"} 137 | cols <- rainbow(length(table(my_groups))) 138 | plot(pcaR$pca.sample_coordinates[,c("PC1","PC2")], cex = 1.5, 139 | bg = cols[as.factor(my_groups)], pch = 21, col = "black", main = "mallard genotype smartpca") 140 | legend("topleft", legend = levels(as.factor(my_groups)), cex = 1, pch = 21, 141 | pt.cex = 1.0, col = "black", pt.bg = cols, text.col = cols) 142 | ``` 143 | 144 | Greenland individuals cluster in one of the corners of the ordination, supporting a distinct SNP composition relative to the remaining flyways. 145 | 146 | 147 | ## Run *smart_permanova* 148 | 149 | Run PERMANOVA test (group location in PCA1 x PCA2 space) and assign results to object permanovaR (missing values imputed with means, SNPs scaled to control genetic drift). 150 | Notice that pairwise tests increase computing time considerably as there are 45 pairwise comparisons to make for 10 flyways, each calculating a p value based on 10,000 permutations of the data. 
151 | 152 | ```{r, message=FALSE} 153 | pathToFile <- system.file("extdata", "mallard_genotype_Kraus2012", package = "smartsnp") 154 | permanovaR <- smart_permanova(snp_data = pathToFile, sample_group = my_groups, 155 | target_space = "pca", missing_impute = "mean", pairwise = "TRUE") 156 | ``` 157 | 158 | permanovaR is a list with 5 elements: 159 | 160 | ```{r} 161 | class(permanovaR) 162 | names(permanovaR) 163 | str(permanovaR) 164 | ``` 165 | 166 | Assign sample summary to object permP: 167 | 168 | ```{r} 169 | permP <- permanovaR$permanova.samples 170 | ``` 171 | 172 | Show PERMANOVA tables (global and pairwise): 173 | 174 | ```{r} 175 | permanovaR$permanova.global_test 176 | ``` 177 | 178 | For the mallard dataset, the p value is 1e-04. 179 | As with other frequentist tests, p values should be interpreted as the probability of the observed differences if the null hypothesis of no differences between groups is true. 180 | The lower the p value, the weaker the support for the null hypothesis. 181 | 182 | ```{r} 183 | head(permanovaR$permanova.pairwise_test) 184 | ``` 185 | 186 | The lowest p values (resulting from pairwise comparisons) consistently occur between the Greenland and the remaining flyways, supporting a unique SNP composition mostly in Greenland mallards. 187 | 188 | ## Run *smart_permdisp* 189 | 190 | Run PERMDISP test (group dispersion in PCA1 x PCA2 space) and assign results to object permdispR (missing values imputed with means, SNPs scaled to control genetic drift). Heteroscedasticity tests in combination with ANOVA tests tell whether the ANOVA F statistic is driven by mean and/or variance differences among groups in a univariate context. Location and dispersion (multivariate context) are analogous to mean and variance in a univariate context. As the number of individuals per flyway differs a great deal among flyways, PERMDISP is run to control for sample-size bias (samplesize_bias = TRUE). 
191 | 192 | ```{r, message=FALSE} 193 | pathToFile <- system.file("extdata", "mallard_genotype_Kraus2012", package = "smartsnp") 194 | permdispR <- smart_permdisp(snp_data = pathToFile, sample_group = my_groups, 195 | target_space = "pca", missing_impute = "mean", pairwise = "TRUE", samplesize_bias = TRUE) 196 | ``` 197 | 198 | permdispR is a list with 7 elements: 199 | 200 | ```{r} 201 | class(permdispR) 202 | names(permdispR) 203 | str(permdispR) 204 | ``` 205 | 206 | Assign sample summary to object permD, where column Sample_dispersion shows the dispersion of individuals relative to their flyway: 207 | 208 | ```{r} 209 | permD <- permdispR$permdisp.samples 210 | ``` 211 | 212 | Show PERMDISP tables (global and pairwise): 213 | 214 | ```{r} 215 | permdispR$permdisp.global_test 216 | ``` 217 | 218 | For the mallard dataset, the p value is 0.0073: 219 | 220 | ```{r} 221 | str(permdispR$permdisp.pairwise_test) 222 | ``` 223 | 224 | Most PERMDISP pairwise tests show relatively high p values (i.e., high probability of the observed differences in dispersion if the null hypothesis of no dispersion differences among groups is true), indicating that PERMANOVA tests mainly captured differences in location. The lowest p values for the PERMDISP pairwise tests among Eurasian flyways occur for the Europe North Western (ENW) flyway versus the other flyways as seen in the ordination plot (i.e., ENW individuals are widely spread over both the PCA1 and PCA2 axes). 225 | 226 | 227 | ## Run *smart_mva* 228 | 229 | Run PCA, and PERMANOVA and PERMDISP tests (group location and dispersion in PCA1 x PCA2 space), and assign results to object mvaR. No pairwise comparisons are applied (default: pairwise = "FALSE"), so computation will be relatively fast. This is a wrapper function running in one single job the three other functions of the package (smart_pca, smart_permanova, smart_permdisp). 
230 | 231 | ```{r, message=FALSE} 232 | pathToFile <- system.file("extdata", "mallard_genotype_Kraus2012", package = "smartsnp") 233 | mvaR <- smart_mva(snp_data = pathToFile, sample_group = my_groups, 234 | target_space = "pca", missing_impute = "mean", samplesize_bias = TRUE) 235 | ``` 236 | 237 | mvaR is a list with three elements (data, pca, test): 238 | 239 | ```{r} 240 | class(mvaR) 241 | names(mvaR) 242 | str(mvaR) 243 | ``` 244 | 245 | Element 1 = scaled dataset (none, covariance, correlation, drift) in a matrix and array (rows = SNPs, columns = samples): 246 | 247 | ```{r} 248 | class(mvaR$data) 249 | dim(mvaR$data) 250 | str(mvaR$data) 251 | ``` 252 | 253 | Element 2 = PCA results in a list: 254 | 255 | ```{r} 256 | class(mvaR$pca) 257 | names(mvaR$pca) 258 | str(mvaR$pca) 259 | ``` 260 | 261 | Show PCA results: 262 | 263 | ```{r} 264 | head(mvaR$pca$pca.eigenvalues) #extract eigenvalues 265 | head(mvaR$pca$pca.sample_coordinates) #extract coordinates of individuals in PCA1 x PCA2 space 266 | head(mvaR$pca$pca.snp_loadings) #extract SNP loadings 267 | ``` 268 | 269 | Element 3 = PERMANOVA and PERMDISP results in a list: 270 | 271 | ```{r} 272 | class(mvaR$test) 273 | names(mvaR$test) 274 | str(mvaR$test) 275 | ``` 276 | 277 | Multiple-testing correction applied: 278 | 279 | ```{r} 280 | mvaR$test$test.pairwise_correction 281 | ``` 282 | 283 | Number of permutations to estimate p value: 284 | 285 | ```{r} 286 | mvaR$test$test.permutation_number 287 | ``` 288 | 289 | Seed for random generator: 290 | 291 | ```{r} 292 | mvaR$test$test.permutation_seed 293 | ``` 294 | 295 | Summary of samples: 296 | 297 | ```{r} 298 | head(mvaR$test$test_samples) 299 | ``` 300 | 301 | Show PERMANOVA table: 302 | 303 | ```{r} 304 | mvaR$test$permanova.global_test #global test 305 | mvaR$test$permanova.pairwise_test #pairwise tests 306 | ``` 307 | 308 | Show PERMDISP table: 309 | 310 | ```{r} 311 | mvaR$test$permdisp.global_test #global test 312 | 
mvaR$test$permdisp.pairwise_test #pairwise tests 313 | ``` 314 | 315 | Sample-size correction applied: 316 | 317 | ```{r} 318 | mvaR$test$permdisp.bias 319 | ``` 320 | 321 | Location of flyways in ordination: 322 | 323 | ```{r} 324 | mvaR$test$permdisp.group_location 325 | ``` 326 | 327 | 328 | -------------------------------------------------------------------------------- /man/smart_mva.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/smart_mva.R 3 | \name{smart_mva} 4 | \alias{smart_mva} 5 | \title{Smart Multivariate Analyses (wrapper of PCA, PERMANOVA and PERMDISP)} 6 | \arguments{ 7 | \item{snp_data}{File name read from working directory. 8 | SNP = rows, samples = columns without row names or column headings. 9 | SNP values must be count data (no decimals allowed). 10 | File extension detected automatically whether text or \code{EIGENSTRAT}. 11 | See details.} 12 | 13 | \item{packed_data}{Logical value for \code{EIGENSTRAT}, irrelevant for text data. 14 | Default \code{packed_data = FALSE} assumes uncompressed \code{EIGENSTRAT}. 15 | \code{packed_data = TRUE} for compressed or binary \code{EIGENSTRAT} (\code{PACKEDANCESTRYMAP}).} 16 | 17 | \item{sample_group}{Character or numeric vector assigning samples to groups. 18 | Coerced to factor.} 19 | 20 | \item{sample_remove}{Logical \code{FALSE} or numeric vector indicating column numbers (samples) to be removed from computations. 21 | Default \code{sample_remove = FALSE} keeps all samples.} 22 | 23 | \item{snp_remove}{Logical \code{FALSE} or numeric vector indicating row numbers (SNPs) to be removed from computations. 24 | Default \code{snp_remove = FALSE} keeps all SNPs. 25 | See details.} 26 | 27 | \item{pca}{Logical indicating if PCA is computed. 28 | Default \code{TRUE}.} 29 | 30 | \item{permanova}{Logical indicating if PERMANOVA is computed. 
31 | Default \code{TRUE}} 32 | 33 | \item{permdisp}{Logical indicating if PERMDISP is computed. 34 | Default \code{TRUE}.} 35 | 36 | \item{missing_value}{Number \code{9} or string \code{NA} indicating missing value. 37 | Default \code{missing_value = 9} as in \code{EIGENSTRAT}. 38 | If no missing values present, no effect on computation.} 39 | 40 | \item{missing_impute}{String handling missing values. 41 | Default \code{missing_impute = "mean"} replaces missing values of each SNP by mean of non-missing values across samples. 42 | \code{missing_impute = "remove"} removes SNPs with at least one missing value. 43 | If no missing values present, no effect on computation.} 44 | 45 | \item{scaling}{String. Default \code{scaling = "drift"} scales SNPs to control for expected allele frequency dispersion caused by genetic drift (SMARTPCA). 46 | \code{scaling = "center"} for \code{centering} (covariance-based PCA). 47 | \code{scaling = "sd"} for \code{centered} SNPs divided by standard deviation (correlation-based PCA). 48 | \code{scaling = "none"} for no scaling. 49 | See details.} 50 | 51 | \item{program_svd}{String indicating R package computing single value decomposition (SVD). 52 | Default \code{program_svd = "Rspectra"} for \code{\link[RSpectra]{svds}}. 53 | \code{program_svd = "bootSVD"} for \code{\link[bootSVD]{fastSVD}}. 54 | See details.} 55 | 56 | \item{sample_project}{Numeric vector indicating column numbers (ancient samples) projected onto (modern) PCA space. 57 | Default \code{sample_project = FALSE} implements no projection. 58 | See details.} 59 | 60 | \item{pc_project}{Numeric vector indicating the ranks of the PCA axes ancient samples are projected onto. Default \code{pc_ancient = c(1, 2)} for PCA axes 1 and 2. 61 | If \code{program_svd = "RSpectra"}, \code{length(pc_ancient)} must be smaller than or equal to \code{pc_axes}. 
62 | No effect on computation, if no ancient samples present.} 63 | 64 | \item{sample_distance}{Type of inter-sample proximity computed (distance, similarity, dissimilarity). 65 | Default is \code{Euclidean distance}. 66 | See details.} 67 | 68 | \item{program_distance}{A string value indicating R package to estimate proximities between pairs of samples. 69 | Default \code{program_distance = "Rfast"} uses function \code{\link[Rfast]{Dist}}; \code{program_distance = "vegan"} uses \code{\link[vegan]{vegdist}}. 70 | See details.} 71 | 72 | \item{target_space}{String. 73 | Default \code{target_space = "multidimensional"} applies PERMANOVA and/or PERMDISP to sample-by-sample triangular matrix computed from variable-by-sample data, \code{pc_axes} has no effect on computation. \code{target_space = "pca"} applies PERMANOVA and/or PERMDISP to sample-by-sample data in PCA space, \code{pc_axes} determines number of PCA axes for testing.} 74 | 75 | \item{pc_axes}{Number of PCA axes computed always starting with PCA axis 1. 76 | Default \code{pc_axes = 2} computes PCA axes 1 and 2 if \code{target_space = "pca"}. 77 | No effect on computation if \code{target_space = "multidimensional"}.} 78 | 79 | \item{pairwise}{Logical. 80 | Default \code{pairwise = FALSE} computes global test. \code{pairwise = TRUE} computes global and pairwise tests.} 81 | 82 | \item{pairwise_method}{String specifying type of correction for multiple testing. 83 | Default \code{"holm"}.} 84 | 85 | \item{permutation_n}{Number of permutations resulting in PERMANOVA/PERMDISP test \emph{p value}. 86 | Default \code{9999}.} 87 | 88 | \item{permutation_seed}{Number fixing random generator of permutations. 89 | Default \code{1}.} 90 | 91 | \item{dispersion_type}{String indicating quantification of group dispersion whether relative to spatial \code{"median"} or \code{"centroid"} in PERMDISP. 92 | Default \code{"median"}.} 93 | 94 | \item{samplesize_bias}{Logical. 
\code{samplesize_bias = TRUE} for dispersion weighted by number of samples per group in PERMDISP. 95 | Default \code{samplesize_bias = FALSE} for no weighting.} 96 | } 97 | \value{ 98 | Returns a list containing the following elements: 99 | \itemize{ 100 | \item{pca.snp_loadings}{Dataframe of principal coefficients of SNPs. 101 | One set of coefficients per PCA axis computed.} 102 | \item{pca.eigenvalues}{Dataframe of eigenvalues, variance and cumulative variance explained. 103 | One eigenvalue per PCA axis computed.} 104 | \item{pca.sample_coordinates}{Dataframe showing PCA sample summary. Column \emph{Group} assigns samples to groups. Column \emph{Class} specifies if samples were "Removed" from PCA or "Projected" onto PCA space. 105 | Sequence of additional columns shows principal components (coordinates) of samples in PCA space (1 column per PCA axis computed, named PC1, PC2, ...).} 106 | \item{test_samples}{Dataframe showing test sample summary. 107 | Column \emph{Group} assigns samples to tested groups. 108 | Column \emph{Class} specifies if samples were used in, or removed from, testing (PERMANOVA and/or PERMDISP). 109 | Column \emph{Sample_dispersion} shows dispersion of individual samples relative to spatial \code{"median"} or \code{"centroid"} used in PERMDISP.} 110 | \item{permanova.global_test}{List showing PERMANOVA table with degrees of freedom, sum of squares, mean sum of squares, \emph{F} statistic, variance explained (\emph{R2}) and \emph{p} value.} 111 | \item{permanova.pairwise_test}{List showing PERMANOVA table with \emph{F} statistic, variance explained (\emph{R2}), \emph{p} value and corrected \emph{p} value per pair of groups.} 112 | \item{permdisp.global_test}{List showing PERMDISP table with degrees of freedom, sum of squares, mean sum of squares, \emph{F} statistic and \emph{p} value.} 113 | \item{permdisp.pairwise_test}{List showing PERMDISP table with \emph{F} statistic, \emph{p} value and corrected \emph{p} value per pair of groups. 
114 | Obtained only if \code{pairwise = TRUE}.} 115 | \item{permdisp.bias}{String indicating if PERMDISP dispersion corrected for number of samples per group.} 116 | \item{permdisp.group_location}{Dataframe showing coordinates of spatial \code{"median"} or \code{"centroid"} per group in PERMDISP.} 117 | \item{test.pairwise_correction}{String indicating type of correction for multiple testing in PERMANOVA and/or PERMDISP.} 118 | \item{test.permutation_number}{Number of permutations applied to obtain the distribution of \emph{F} statistic of PERMANOVA and/or PERMDISP.} 119 | \item{test.permutation_seed}{Number fixing random generator of permutations of PERMANOVA and/or PERMDISP for reproducibility of results.} 120 | } 121 | } 122 | \description{ 123 | Computes Principal Component Analysis (PCA) for variable x sample genotype data, such as Single Nucleotide Polymorphisms (SNP), in combination with Permutational Multivariate Analysis of Variance (PERMANOVA) and Permutational Multivariate Analysis of Dispersion (PERMDISP). 124 | A wrapper of functions \code{smart_pca}, \code{smart_permanova} and \code{smart_permdisp}. 125 | Genetic markers such as SNPs can be scaled by \code{centering}, z-scores and genetic drift-based dispersion. 126 | The latter follows the SMARTPCA implementation of Patterson, Price and Reich (2006). 127 | Optimized to run fast computation for big datasets. 
128 | } 129 | \details{ 130 | See details in other functions for conceptualization of PCA (\code{smart_pca}) (Hotelling 1933), SMARTPCA (Patterson, Price and Reich 2006), PERMANOVA (\code{smart_permanova}) (Anderson 2001) and PERMDISP (\code{smart_permdisp}) (Anderson 2006), types of scaling, ancient projection, and correction for multiple testing.\cr 131 | 132 | Users can compute any combination of the three analyses by assigning \code{TRUE} or \code{FALSE} to \code{pca} and/or \code{permanova} and/or \code{permdisp}.\cr 133 | 134 | PERMANOVA and PERMDISP exclude samples (columns) specified in either \code{sample_remove} or \code{sample_project}. 135 | Projected samples are not used for testing as their PCA coordinates are derived from, and therefore depend on, the coordinates of non-projected samples.\cr 136 | 137 | Data read from working directory with SNPs as rows and samples as columns. Two alternative formats: (1) text file of SNPs by samples (file extension and column separators recognized automatically) read using \code{\link[data.table]{fread}}; or (2) duet of \code{EIGENSTRAT} files (see \url{https://reich.hms.harvard.edu/software}) using \code{\link[vroom]{vroom_fwf}}, including a genotype file of SNPs by samples (\code{*.geno}), and a sample file (\code{*.ind}) containing three vectors assigning individual samples to unique user-predefined groups (populations), sexes (or other user-defined descriptor) and alphanumeric identifiers. 138 | For \code{EIGENSTRAT}, vector \code{sample_group} assigns samples to groups retrievable from column 3 of file \code{*.ind}. 139 | SNPs with zero variance removed prior to SVD to optimize computation time and avoid undefined values if \code{scaling = "sd"} or \code{"drift"}.\cr 140 | 141 | Users can select subsets of samples or SNPs by introducing a vector including column numbers for samples (\code{sample_remove}) and/or row numbers for SNPs (\code{snp_remove}) to be removed from computations. 
142 | Function stops if the final number of SNPs is 1 or 2. 143 | \code{EIGENSOFT} was conceived for the analysis of human genes and its SMARTPCA suite so accepts 22 (autosomal) chromosomes by default. 144 | If >22 chromosomes are provided and the internal parameter \code{numchrom} is not set to the target number chromosomes of interest, SMARTPCA automatically subsets chromosomes 1 to 22. 145 | In contrast, \code{smart_mva} accepts any number of autosomes with or without the sex chromosomes from an \code{EIGENSTRAT} file.\cr 146 | } 147 | \examples{ 148 | # Path to example genotype matrix "dataSNP" 149 | pathToGenoFile = system.file("extdata", "dataSNP", package = "smartsnp") 150 | 151 | # Assign 50 samples to each of two groups and colors 152 | my_groups <- as.factor(c(rep("A", 50), rep("B", 50))); cols = c("red", "blue") 153 | 154 | # Run PCA, PERMANOVA and PERMDISP 155 | mvaR <- smart_mva(snp_data = pathToGenoFile, sample_group = my_groups) 156 | mvaR$pca$pca.eigenvalues # extract PCA eigenvalues 157 | head(mvaR$pca$pca.snp_loadings) # extract principal coefficients (SNP loadings) 158 | head(mvaR$pca$pca.sample_coordinates) # extract PCA principal components (sample position in PCA space) 159 | 160 | # plot PCA 161 | plot(mvaR$pca$pca.sample_coordinates[,c("PC1","PC2")], cex = 2, 162 | pch = 19, col = cols[my_groups], main = "genotype smartpca") 163 | legend("topleft", legend = levels(my_groups), cex = 1, 164 | pch = 19, col = cols, text.col = cols) 165 | 166 | # Extract PERMANOVA table 167 | mvaR$test$permanova.global_test 168 | 169 | # Extract PERMDISP table 170 | mvaR$test$permdisp.global_test # extract PERMDISP table 171 | 172 | # Extract sample summary and dispersion of individual samples used in PERMDISP 173 | mvaR$test$test_samples 174 | 175 | } 176 | \seealso{ 177 | \code{\link{smart_pca}}, 178 | \code{\link{smart_permanova}}, 179 | \code{\link{smart_permdisp}} 180 | } 181 | -------------------------------------------------------------------------------- 
/man/smart_pca.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/smart_pca.R 3 | \name{smart_pca} 4 | \alias{smart_pca} 5 | \title{Smart Principal Component Analysis} 6 | \arguments{ 7 | \item{snp_data}{File name read from working directory. 8 | SNP = rows, samples = columns without row names or column headings. 9 | SNP values must be count data (no decimals allowed). File extension detected automatically whether text or \code{EIGENSTRAT}. 10 | See details.} 11 | 12 | \item{packed_data}{Logical value for \code{EIGENSTRAT}, irrelevant for text data. 13 | Default \code{packed_data = FALSE} assumes uncompressed \code{EIGENSTRAT}. 14 | \code{packed_data = TRUE} for compressed or binary \code{EIGENSTRAT} (\code{PACKENDANCESTRYMAP}).} 15 | 16 | \item{sample_group}{Character or numeric vector assigning samples to groups. 17 | Coerced to factor.} 18 | 19 | \item{sample_remove}{Logical \code{FALSE} or numeric vector indicating column numbers (samples) to be removed from computations. 20 | Default \code{sample_remove = FALSE} keeps all samples.} 21 | 22 | \item{snp_remove}{Logical \code{FALSE} or numeric vector indicating row numbers (SNPs) to be removed from computations. 23 | Default \code{snp_remove = FALSE} keeps all SNPs. 24 | See details.} 25 | 26 | \item{missing_value}{Number \code{9} or string \code{NA} indicating missing value. 27 | Default \code{missing_value = 9} as in \code{EIGENSTRAT}. 28 | If no missing values present, no effect on computation.} 29 | 30 | \item{missing_impute}{String handling missing values. 31 | Default \code{missing_impute = "mean"} replaces missing values of each SNP by mean of non-missing values across samples. 32 | \code{missing_impute = "remove"} removes SNPs with at least one missing value. 33 | If no missing values present, no effect on computation.} 34 | 35 | \item{scaling}{String. 
Default \code{scaling = "drift"} scales SNPs to control for expected allele frequency dispersion caused by genetic drift (SMARTPCA). 36 | \code{scaling = "center"} for \code{centering} (covariance-based PCA). 37 | \code{scaling = "sd"} for \code{centered} SNPs divided by standard deviation (correlation-based PCA). 38 | \code{scaling = "none"} for no scaling. 39 | See details.} 40 | 41 | \item{program_svd}{String indicating R package computing single value decomposition (SVD). 42 | Default \code{program_svd = "Rspectra"} for \code{\link[RSpectra]{svds}}. 43 | \code{program_svd = "bootSVD"} for \code{\link[bootSVD]{fastSVD}}. 44 | See details.} 45 | 46 | \item{pc_axes}{A numeric value. 47 | If \code{program_svd = "Rspectra"} this argument indicates number of PCA axes computed starting with PCA axis 1. 48 | Default \code{pc_axes = 2} computes PCA axes 1 and 2. 49 | No effect on computation if \code{program_svd = "bootSVD"} since all PCA axes are computed.} 50 | 51 | \item{sample_project}{Numeric vector indicating column numbers (ancient samples) projected onto (modern) PCA space. 52 | Default \code{sample_project = FALSE} indicates no samples will be used for projection. 53 | See details.} 54 | 55 | \item{pc_project}{Numeric vector indicating the ranks of the PCA axes ancient samples are projected onto. 56 | Default \code{pc_ancient = c(1, 2)} for PCA axes 1 and 2. If \code{program_svd = "RSpectra"}, \code{length(pc_ancient)} must be smaller than or equal to \code{pc_axes}. 57 | No effect on computation, if no ancient samples present.} 58 | } 59 | \value{ 60 | Returns a list containing the following elements: 61 | \itemize{ 62 | \item {\code{pca.snp_loadings}} {Dataframe of principal coefficients of SNPs. One set of coefficients per PCA axis computed.}\cr 63 | \item {\code{pca.eigenvalues}} {Dataframe of eigenvalues, variance and cumulative variance explained. 
One eigenvalue per PCA axis computed.}\cr 64 | \item {\code{pca.sample_coordinates}} {Dataframe showing PCA sample summary. 65 | Column \emph{Group} assigns samples to groups. 66 | Column \emph{Class} specifies if samples were "Removed" from PCA or "Projected" onto PCA space. 67 | Sequence of additional columns shows principal components (coordinates) of samples in PCA space (1 column per PCA axis computed, named PC1, PC2, ...).} 68 | } 69 | } 70 | \description{ 71 | Computes Principal Component Analysis (PCA) for variable x sample genotype data including covariance (\code{centered}), correlation (z-score) and SMARTPCA scaling, 72 | and implements projection of ancient samples onto modern PCA space. SMARTPCA scaling controls for genetic drift when variables are bi-allelic genetic markers 73 | such as single nucleotide polymorphisms (SNP) following Patterson, Price and Reich (2006). 74 | Optimized to run fast singular value decomposition for big datasets. 75 | } 76 | \details{ 77 | PCA is a rigid rotation of a Cartesian coordinate system (samples = points, axes = variables or SNPs) that maximizes the dispersion of points along a new system of axes (Pearson 1901; Hotelling 1933; Jolliffe 2002). 78 | In rotated space (ordination), axes are \code{principal axes} (PCA axes), \code{eigenvalues} measure variance explained, and \code{principal coefficients} measure importance of SNPs (eigenvectors), \code{principal components} are coordinates of samples (i.e., linear combinations of scaled variables weighted by eigenvectors). 79 | Principal coefficients are direction cosines between original and PCA axes (Legendre & Legendre 2012). PCA can be computed by \code{eigenanalysis} or, as implemented here, singular value decomposition (SVD). 
\cr 80 | 81 | SNPs can be scaled in four different ways prior to SVD: (1) no scaling; (2) covariance: SNPs \code{centered} such that \emph{M(i,j)} = \emph{C(i,j)} minus \emph{mean(j)}) where \emph{C(i,j)} is the number of variant alleles for SNP \emph{j} and sample \emph{i}, and \emph{M(i,j)} is the \code{centered} value of each data point; (3) correlation (z-scores): SNPs \code{centered} then divided by standard deviation \emph{sd(j)}, (4) SMARTPCA: SNPs \code{centered} then divided by \emph{sqrt(p(j)(1-p(j)))}, where \emph{p(j)} equals \emph{mean(j)} divided by \emph{2}, quantifies the underlying allele frequency (autosomal chromosomes) and conceptualizes that SNP frequency changes at rate proportional to \emph{sqrt(p(j)(1-p(j)))} per generation due to genetic drift (Patterson, Price and Reich 2006). 82 | SMARTPCA standardization results in all SNPs that comply with Hardy-Weinberg equilibrium having identical variance. 83 | SMARTPCA (Patterson, Price and Reich 2006) and \code{EIGENSTRAT} (Price, Patterson, Plenge, Weinblatt, Shadick and Reich 2006) are the computing suites of software \code{EIGENSOFT} (\url{https://reich.hms.harvard.edu/software}).\cr 84 | 85 | \code{\link[RSpectra]{svds}} runs single value decomposition much faster than \code{\link[bootSVD]{fastSVD}}. With \code{\link[RSpectra]{svds}}, \code{pc_axes} indicates number of eigenvalues and eigenvectors computed starting from PCA axis 1. \code{\link[bootSVD]{fastSVD}} computes all eigenvalues and eigenvectors. Eigenvalues calculated from singular values divided by number of samples minus 1. If number of samples equals number of SNPS, \code{\link[bootSVD]{fastSVD}} prints message alert that no computing efficiency is achieved for square matrices.\cr 86 | 87 | Ancient samples (with many missing values) can be projected onto modern PCA space derived from modern samples. 
88 | Following Nelson, Taylor and MacGregor (1996), the projected coordinates of a given ancient sample equal the slope coefficient of linear fit through the origin of (scaled) non-missing SNP values of that sample (response) versus principal coefficients of same SNPs in modern samples. 89 | Number of projected coordinates per ancient sample given by \code{length(pc_ancient)}. 90 | With \code{\link[RSpectra]{svds}}, \code{pc_axes} must be larger than or equal to \code{length(pc_ancient)}.\cr 91 | 92 | Data read from working directory with SNPs as rows and samples as columns. 93 | Two alternative formats: (1) text file of SNPs by samples (file extension and column separators recognized automatically) read using \code{\link[data.table]{fread}}; or (2) duet of \code{EIGENSTRAT} files (see \url{https://reich.hms.harvard.edu/software}) using \code{\link[vroom]{vroom_fwf}}, including a genotype file of SNPs by samples (\code{*.geno}), and a sample file (\code{*.ind}) containing three vectors assigning individual samples to unique user-predefined groups (populations), sexes (or other user-defined descriptor) and alphanumeric identifiers. 94 | For \code{EIGENSTRAT}, vector \code{sample_group} assigns samples to groups retrievable from column 3 of file \code{*.ind}. SNPs with zero variance removed prior to SVD to optimize computation time and avoid undefined values if \code{scaling = "sd"} or \code{"drift"}.\cr 95 | 96 | Users can select subsets of samples or SNPs by introducing a vector including column numbers for samples (\code{sample_remove}) and/or row numbers for SNPs (\code{snp_remove}) to be removed from computations. 97 | Function stops if the final number of SNPs is 1 or 2. 98 | \code{EIGENSOFT} was conceived for the analysis of human genes and its SMARTPCA suite therefore accepts 22 (autosomal) chromosomes by default. 
99 | If >22 chromosomes are provided and the internal parameter \code{numchrom} is not set to the target number chromosomes of interest, SMARTPCA automatically subsets chromosomes 1 to 22. 100 | In contrast, \code{smart_pca} accepts any number of autosomes with or without the sex chromosomes from an \code{EIGENSTRAT} file.\cr 101 | } 102 | \examples{ 103 | # Path to example genotype matrix "dataSNP" 104 | pathToGenoFile = system.file("extdata", "dataSNP", package = "smartsnp") 105 | 106 | # Example 1: modern samples 107 | #assign 50 samples to each of two groups and colors 108 | my_groups <- c(rep("A", 50), rep("B", 50)); cols = c("red", "blue") 109 | #run PCA with truncated SVD (PCA 1 x PCA 2) 110 | pcaR1 <- smart_pca(snp_data = pathToGenoFile, sample_group = my_groups) 111 | pcaR1$pca.eigenvalues # extract eigenvalues 112 | head(pcaR1$pca.snp_loadings) # extract principal coefficients (SNP loadings) 113 | head(pcaR1$pca.sample_coordinates) # extract principal components (sample position in PCA space) 114 | #plot PCA 115 | plot(pcaR1$pca.sample_coordinates[,c("PC1","PC2")], cex = 2, 116 | pch = 19, col = cols[as.factor(my_groups)], main = "genotype smartpca") 117 | legend("topleft", legend = levels(as.factor(my_groups)), cex =1, 118 | pch = 19, col = cols, text.col = cols) 119 | 120 | # Example 2: modern and ancient samples (ancient samples projected onto modern PCA space) 121 | #assign samples 1st to 10th per group to ancient 122 | my_ancient <- c(1:10, 51:60) 123 | #run PCA with truncated SVD (PCA 1 x PCA 2) 124 | pcaR2 <- smart_pca(snp_data = pathToGenoFile, sample_group = my_groups, sample_project = my_ancient) 125 | pcaR2$pca.eigenvalues # extract eigenvalues 126 | head(pcaR2$pca.snp_loadings) # extract principal coefficients (SNP loading) 127 | head(pcaR2$pca.sample_coordinates) # extract principal components (sample position in PCA space) 128 | #assign samples to groups (A, ancient, B) and colors 129 | my_groups[my_ancient] <- "ancient"; cols = c("red", 
"black", "blue") 130 | #plot PCA 131 | plot(pcaR2$pca.sample_coordinates[,c("PC1","PC2")], 132 | cex = 2, col = cols[as.factor(my_groups)], pch = 19, main = "genotype smartpca") 133 | legend("topleft", legend = levels(as.factor(my_groups)), cex = 1, 134 | pch = 19, col = cols, text.col = cols) 135 | 136 | } 137 | \references{ 138 | Hotelling, H. (1933) Analysis of a complex of statistical variables into principal components. Journal of Educational Psychology, 24, 417-441.\cr 139 | 140 | Jolliffe, I.T. (2002) Principal Component Analysis (Springer, New York, USA).\cr 141 | 142 | Legendre, P. & L. F. J. Legendre (2012). Numerical ecology. Developments in environmental modelling (Elsevier, Oxford, UK).\cr 143 | 144 | Nelson, P.R.C., P.A. Taylor, and J.F. MacGregor (1996) Missing data methods in PCA and PLS: score calculations with incomplete observations. Chemometrics and Intelligent Laboratory Systems, 35, 45-65.\cr 145 | 146 | Patterson, N.J., A. L. Price and D. Reich (2006) Population structure and eigenanalysis. PLoS Genetics, 2, e190.\cr 147 | 148 | Pearson, K. (1901) On lines and planes of closest fit to systems of points in space. Philosophical Magazine, 2, 559-572.\cr 149 | 150 | Price, A.L., N.J. Patterson, R.M. Plenge, M.E. Weinblatt, N.A. Shadick and David Reich (2006). Principal components analysis corrects for stratification in genome-wide association studies. Nature Genetics, 38, 904-909. 
151 | } 152 | \seealso{ 153 | \code{\link[bootSVD]{fastSVD}} (package \bold{bootSVD}), 154 | \code{\link[foreach]{foreach}} (package \bold{foreach}), 155 | \code{\link[data.table]{fread}} (package \bold{data.table}), 156 | \code{\link[Rfast]{rowVars}} (package \bold{Rfast}), 157 | \code{\link[RSpectra]{svds}} (package \bold{RSpectra}), 158 | \code{\link[vroom]{vroom_fwf}} (package \bold{vroom}) 159 | } 160 | -------------------------------------------------------------------------------- /man/smart_permanova.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/smart_permanova.R 3 | \name{smart_permanova} 4 | \alias{smart_permanova} 5 | \title{Smart Permutational Multivariate Analysis of Variance} 6 | \arguments{ 7 | \item{snp_data}{File name read from working directory. 8 | SNP = rows, samples = columns without row names or column headings. 9 | SNP values must be count data (no decimals allowed). 10 | File extension detected automatically whether text or \code{EIGENSTRAT}. 11 | See details.} 12 | 13 | \item{packed_data}{Logical value for \code{EIGENSTRAT}, irrelevant for text data. 14 | Default \code{packed_data = FALSE} assumes uncompressed \code{EIGENSTRAT}. 15 | \code{packed_data = TRUE} for compressed or binary \code{EIGENSTRAT} (\code{PACKENDANCESTRYMAP}).} 16 | 17 | \item{sample_group}{Character or numeric vector assigning samples to groups. Coerced to factor.} 18 | 19 | \item{sample_remove}{Logical \code{FALSE} or numeric vector indicating column numbers (samples) to be removed from computations. 20 | Default \code{sample_remove = FALSE} keeps all samples.} 21 | 22 | \item{snp_remove}{Logical \code{FALSE} or numeric vector indicating row numbers (SNPs) to be removed from computations. 23 | Default \code{snp_remove = FALSE} keeps all SNPs. 
See details.} 24 | 25 | \item{missing_value}{Number \code{9} or string \code{NA} indicating missing value. 26 | Default \code{missing_value = 9} as in \code{EIGENSTRAT}. 27 | If no missing values present, no effect on computation.} 28 | 29 | \item{missing_impute}{String handling missing values. 30 | Default \code{missing_impute = "mean"} replaces missing values of each SNP by mean of non-missing values across samples. 31 | \code{missing_impute = "remove"} removes SNPs with at least one missing value. 32 | If no missing values present, no effect on computation.} 33 | 34 | \item{scaling}{String. Default \code{scaling = "drift"} scales SNPs to control for expected allele frequency dispersion caused by genetic drift (SMARTPCA). 35 | \code{scaling = "center"} for \code{centering} (covariance-based PCA). 36 | \code{scaling = "sd"} for \code{centered} SNPs divided by standard deviation (correlation-based PCA). 37 | \code{scaling = "none"} for no scaling. 38 | See details.} 39 | 40 | \item{sample_distance}{Type of inter-sample proximity computed (distance, similarity, dissimilarity). 41 | Default is \code{Euclidean distance}. See details.} 42 | 43 | \item{program_distance}{A string value indicating R package to estimate proximities between pairs of samples. 44 | Default \code{program_distance = "Rfast"} uses function \code{\link[Rfast]{Dist}}; \code{program_distance = "vegan"} uses \code{\link[vegan]{vegdist}}. 45 | See details.} 46 | 47 | \item{target_space}{String. 48 | Default \code{target_space = "multidimensional"} applies PERMANOVA to sample-by-sample triangular matrix computed from variable-by-sample data, \code{pc_axes} has no effect on computation. 49 | \code{target_space = "pca"} applies PERMANOVA to sample-by-sample data in PCA space, \code{pc_axes} determines number of PCA axes for testing.} 50 | 51 | \item{pc_axes}{Number of PCA axes computed always starting with PCA axis 1. Default \code{pc_axes = 2} computes PCA axes 1 and 2 if \code{target_space = "pca"}. 
52 | No effect on computation if \code{target_space = "multidimensional"}.} 53 | 54 | \item{pairwise}{Logical. 55 | Default \code{pairwise = FALSE} computes global test. 56 | \code{pairwise = TRUE} computes global and pairwise tests.} 57 | 58 | \item{pairwise_method}{String specifying type of correction for multiple testing. 59 | Default \code{"holm"}. 60 | See details.} 61 | 62 | \item{permutation_n}{Number of permutations resulting in PERMANOVA test \emph{p value}. 63 | Default \code{9999}.} 64 | 65 | \item{permutation_seed}{Number fixing random generator of permutations. 66 | Default \code{1}.} 67 | } 68 | \value{ 69 | Returns a list containing the following elements: 70 | \itemize{ 71 | \item{permanova.samples}{Dataframe showing sample summary. 72 | Column \emph{Group} assigns samples to tested groups. 73 | Column \emph{Class} specifies if samples were used in, or removed from, testing.} 74 | \item{permanova.global_test}{List showing table with degrees of freedom, sum of squares, mean sum of squares, \emph{F} statistic, variance explained (\emph{R2}) and \emph{p} value.} 75 | \item{permanova.pairwise_test}{List showing table \emph{F} statistic, variance explained (\emph{R2}), \emph{p} value and corrected \emph{p} value per pair of groups. 76 | Obtained only if \code{pairwise = TRUE}.} 77 | \item{permanova.pairwise_correction}{String indicating type of correction for multiple testing.} 78 | \item{permanova.permutation_number}{Number of permutations applied to obtain the distribution of \emph{p value}.} 79 | \item{permanova.permutation_seed}{Number fixing random generator of permutations for reproducibility of results.} 80 | } 81 | } 82 | \description{ 83 | Computes Permutational Multivariate Analysis of Variance (PERMANOVA) for testing differences in group location using multivariate data. Variance partitioning computed on a sample-by-sample triangular matrix obtained from variable-by-sample data following Anderson (2001). 
84 | Calculates a range of inter-sample distances, similarities and dissimilarities. 85 | Includes control for genetic drift for bi-allelic genetic markers such as single nucleotide polymorphisms (SNP) following Patterson, Price and Reich (2006) that can be combined with SMART Principal Component Analysis (PCA). Optimized to run fast matrix building and permutations for big datasets in ecological, evolutionary and genomic research. 86 | } 87 | \details{ 88 | PERMANOVA is a form of linear modelling that partitions variation in a triangular matrix of inter-sample proximities obtained from variable-by-sample data. 89 | Uses permutations to estimate the probability of observed group differences in SNP composition given a null hypothesis of no differences between groups (Anderson 2001). 90 | Proximity between samples can be any type of distance, similarity or dissimilarity. 91 | Original acronym \code{NPMANOVA} (Non-Parametric MANOVA) replaced with PERMANOVA (Anderson 2004, 2017).\cr 92 | 93 | Univariate ANOVA captures differences in mean and variance referred to as location and dispersion in PERMANOVA's multivariate context (Anderson & Walsh 2013, Warton, Wright and Wang 2012). 94 | To attribute group differences to location (position of sample groups) and/or dispersion (spread of sample groups), PERMANOVA must be combined with PERMDISP as implemented through \code{smart_permdisp}.\cr 95 | 96 | Function \code{smart_permanova} uses \code{\link[vegan]{adonis}} to fit formula \code{snp_eucli ~ sample_group}, where \code{snp_eucli} is the sample-by-sample triangular matrix in Principal Coordinate Analysis (Gower 1966) space. 97 | Current version restricted to one-way designs (one categorical predictor) though PERMANOVA can handle >1 crossed and/or nested factors (Anderson 2001) and continuous predictors (McArdle & Anderson 2001). 
 98 | If >2 sample groups tested, \code{pairwise = TRUE} allows pairwise testing and correction for multiple testing by \code{holm (Holm)} [default], \code{hochberg (Hochberg)}, \code{hommel (Hommel)}, \code{bonferroni (Bonferroni)}, \code{BY (Benjamini-Yekutieli)}, \code{BH (Benjamini-Hochberg)} or \code{fdr (False Discovery Rate)}.\cr 99 | 100 | For big data, \code{\link[Rfast]{Dist}} builds sample-by-sample triangular matrix much faster than \code{\link[vegan]{vegdist}}. 101 | \code{\link[Rfast]{Dist}} computes proximities \code{euclidean}, \code{manhattan}, \code{canberra1}, \code{canberra2}, \code{minimum}, \code{maximum}, \code{minkowski}, \code{bhattacharyya}, \code{hellinger}, \code{kullback_leibler} and \code{jensen_shannon}. \code{\link[vegan]{vegdist}} computes \code{manhattan}, \code{euclidean}, \code{canberra}, \code{clark}, \code{bray}, \code{kulczynski}, \code{jaccard}, \code{gower}, \code{altGower}, \code{morisita}, \code{horn}, \code{mountford}, \code{raup}, \code{binomial}, \code{chao}, \code{cao} and \code{mahalanobis}. 102 | Euclidean distance required for SMARTPCA scaling.\cr 103 | 104 | \code{sample_remove} should include both samples removed from PCA and ancient samples projected onto PCA space (if any).\cr 105 | 106 | Data read from working directory with SNPs as rows and samples as columns. 107 | Two alternative formats: (1) text file of SNPs by samples (file extension and column separators recognized automatically) read using \code{\link[data.table]{fread}}; or (2) duet of \code{EIGENSTRAT} files (see \url{https://reich.hms.harvard.edu/software}) using \code{\link[vroom]{vroom_fwf}}, including a genotype file of SNPs by samples (\code{*.geno}), and a sample file (\code{*.ind}) containing three vectors assigning individual samples to unique user-predefined groups (populations), sexes (or other user-defined descriptor) and alphanumeric identifiers. 
 108 | For \code{EIGENSTRAT}, vector \code{sample_group} assigns samples to groups retrievable from column 3 of file \code{*.ind}. 109 | SNPs with zero variance removed prior to SVD to optimize computation time and avoid undefined values if \code{scaling = "sd"} or \code{"drift"}.\cr 110 | 111 | Users can select subsets of samples or SNPs by introducing a vector including column numbers for samples (\code{sample_remove}) and/or row numbers for SNPs (\code{snp_remove}) to be removed from computations. 112 | Function stops if the final number of SNPs is 1 or 2. 113 | \code{EIGENSOFT} was conceived for the analysis of human genes, so its SMARTPCA suite accepts 22 (autosomal) chromosomes by default. 114 | If >22 chromosomes are provided and the internal parameter \code{numchrom} is not set to the target number of chromosomes of interest, SMARTPCA automatically subsets chromosomes 1 to 22. 115 | In contrast, \code{smart_permanova} accepts any number of autosomes with or without the sex chromosomes from an \code{EIGENSTRAT} file.\cr 116 | } 117 | \examples{ 118 | # Path to example genotype matrix "dataSNP" 119 | pathToGenoFile = system.file("extdata", "dataSNP", package = "smartsnp") 120 | 121 | # Assign 50 samples to each of two groups 122 | my_groups <- as.factor(c(rep("A", 50), rep("B", 50))) 123 | 124 | # Run PERMANOVA 125 | permanovaR <- smart_permanova(snp_data = pathToGenoFile, sample_group = my_groups) 126 | 127 | # Extract summary table assigning samples to groups 128 | permanovaR$permanova.samples 129 | 130 | # Extract PERMANOVA table 131 | permanovaR$permanova.global_test 132 | 133 | # Plot means of squares per group 134 | #run pca with truncated SVD (PCA 1 x PCA 2) 135 | pcaR1 <- smart_pca(snp_data = pathToGenoFile, sample_group = my_groups) 136 | #compute Euclidean inter-sample distances in PCA space (triangular matrix) 137 | snp_eucli <- vegan::vegdist(pcaR1$pca.sample_coordinates[,c("PC1","PC2")], method = "euclidean") 138 | #run PERMANOVA 139 | permanova 
<- vegan::adonis(formula = snp_eucli ~ my_groups, permutations = 9999) 140 | #extract meanSqs (groups versus residuals) 141 | meanSqs <- as.matrix(t(permanova$aov.tab$MeanSqs[1:2])) 142 | colnames(meanSqs) <- c("Groups", "Residuals") 143 | #two horizontal plots 144 | oldpar <- par(mfrow = c(2,1), oma = c(0,5,0.1,0.1), lwd = 2) 145 | barplot(meanSqs, horiz = TRUE, main = "PERMANOVA mean of squares", 146 | cex.names = 2, cex.main = 2, col = c("grey40")) 147 | #run ANOSIM 148 | anosimD <- vegan::anosim(snp_eucli, my_groups, permutations = 999) 149 | #remove outputs for clean plotting 150 | #anosimD[2] <- ""; anosimD[5] <- "" 151 | par(mar = c(5, 0.1, 3.5, 0.1)) 152 | plot(anosimD, xlab = "", ylab = "distance/similarity ranks", 153 | main = "Inter-sample proximity ranks", cex.main =2, cex.axis = 2, 154 | col = c("cyan", "red", "blue")) 155 | par(oldpar) 156 | 157 | } 158 | \references{ 159 | Anderson, M. J. (2001) A new method for non-parametric multivariate analysis of variance. Austral Ecology, 26, 32-46.\cr 160 | Anderson, M. J. (2004). PERMANOVA_2factor: a FORTRAN computer program for permutational multivariate analysis of variance (for any two-factor ANOVA design) using permutation tests (Department of Statistics, University of Auckland, New Zealand).\cr 161 | Anderson, M. J. & D. C. I. Walsh (2013) PERMANOVA, ANOSIM, and the Mantel test in the face of heterogeneous dispersions: What null hypothesis are you testing? Ecological Monographs, 83, 557-574.\cr 162 | Gower, J. C. (1966) Some distance properties of latent root and vector methods used in multivariate analysis. Biometrika, 53, 325-338.\cr 163 | McArdle, B. H. & M. J. Anderson (2001) Fitting multivariate models to community data: a comment on distance-based redundancy analysis. Ecology, 82, 290-297.\cr 164 | Patterson, N., A. L. Price and D. Reich (2006) Population structure and eigenanalysis. PLoS Genetics, 2, e190.\cr 165 | Warton, D. I., S. T. Wright and Y. 
Wang (2012) Distance-based multivariate analyses confound location and dispersion effects. Methods in Ecology and Evolution, 3, 89-101. 166 | } 167 | \seealso{ 168 | \code{\link[vegan]{adonis}} (package \bold{vegan}), 169 | \code{\link[Rfast]{Dist}} (package \bold{Rfast}), 170 | \code{\link[data.table]{fread}} (package \bold{data.table}), 171 | \code{\link[vegan]{vegdist}} (package \bold{vegan}), 172 | \code{\link[vroom]{vroom_fwf}} (package \bold{vroom}) 173 | } 174 | --------------------------------------------------------------------------------