├── .Rbuildignore ├── .github ├── .gitignore └── workflows │ └── R-CMD-check.yaml ├── .gitignore ├── CHANGELOG ├── DESCRIPTION ├── NAMESPACE ├── NEWS.md ├── R ├── conStruct-package.R ├── data.R ├── format.data.R ├── model.comparison.R ├── plot.output.R ├── process.model.fit.R ├── run.conStruct.R ├── stanmodels.R └── zzz.R ├── README.md ├── configure ├── configure.win ├── cran-comments.md ├── data ├── conStruct.data.rda └── data.block.rda ├── inst ├── include │ └── stan_meta_header.hpp └── stan │ ├── multiK.stan │ ├── oneK.stan │ ├── space_multiK.stan │ └── space_oneK.stan ├── man ├── calculate.layer.contribution.Rd ├── compare.two.runs.Rd ├── conStruct-manual.pdf ├── conStruct-package.Rd ├── conStruct.Rd ├── conStruct.data.Rd ├── data.block.Rd ├── make.admix.pie.plot.Rd ├── make.all.the.plots.Rd ├── make.structure.plot.Rd ├── match.layers.x.runs.Rd ├── print.conStruct.results.Rd ├── print.data.block.Rd ├── print.freq.data.Rd ├── print.layer.params.Rd ├── structure2conStruct.Rd └── x.validation.Rd ├── src ├── Makevars ├── Makevars.win ├── RcppExports.cpp ├── stanExports_multiK.cc ├── stanExports_multiK.h ├── stanExports_oneK.cc ├── stanExports_oneK.h ├── stanExports_space_multiK.cc ├── stanExports_space_multiK.h ├── stanExports_space_oneK.cc └── stanExports_space_oneK.h ├── testing ├── runs │ ├── sim.dataset.Robj │ ├── test.mods.R │ └── testOne.R └── xval │ ├── Makefile │ ├── sim.dataset.Robj │ ├── test.xval.R │ └── test.xval2.R └── vignettes ├── format-data.Rmd ├── model-comparison.Rmd ├── run-conStruct.Rmd └── visualize-results.Rmd /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^CRAN-RELEASE$ 2 | build.conStruct.pkg.R 3 | build.conStruct.pkg.Rout 4 | .*\.tar\.gz$ 5 | sandbox/* 6 | testing/* 7 | cran-comments.md 8 | README.md 9 | nohup.out 10 | \.Rapp.history 11 | man/conStruct-manual.pdf 12 | src/init.o 13 | src/conStruct.so 14 | src/stan_files/.*\.o$ 15 | src/stan_files/.*\.hpp$ 16 | notes_for_next.release.txt 
17 | ^\.github$ 18 | ^CRAN-SUBMISSION$ 19 | -------------------------------------------------------------------------------- /.github/.gitignore: -------------------------------------------------------------------------------- 1 | *.html 2 | -------------------------------------------------------------------------------- /.github/workflows/R-CMD-check.yaml: -------------------------------------------------------------------------------- 1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples 2 | # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help 3 | on: 4 | push: 5 | branches: [main, master] 6 | pull_request: 7 | branches: [main, master] 8 | 9 | name: R-CMD-check 10 | 11 | jobs: 12 | R-CMD-check: 13 | runs-on: ${{ matrix.config.os }} 14 | 15 | name: ${{ matrix.config.os }} (${{ matrix.config.r }}) 16 | 17 | strategy: 18 | fail-fast: false 19 | matrix: 20 | config: 21 | - {os: macos-latest, r: 'release'} 22 | - {os: windows-latest, r: 'release'} 23 | - {os: ubuntu-latest, r: 'devel', http-user-agent: 'release'} 24 | - {os: ubuntu-latest, r: 'release'} 25 | - {os: ubuntu-latest, r: 'oldrel-1'} 26 | 27 | env: 28 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 29 | R_KEEP_PKG_SOURCE: yes 30 | 31 | steps: 32 | - uses: actions/checkout@v3 33 | 34 | - uses: r-lib/actions/setup-pandoc@v2 35 | 36 | - uses: r-lib/actions/setup-r@v2 37 | with: 38 | r-version: ${{ matrix.config.r }} 39 | http-user-agent: ${{ matrix.config.http-user-agent }} 40 | use-public-rspm: true 41 | 42 | - uses: r-lib/actions/setup-r-dependencies@v2 43 | with: 44 | extra-packages: any::rcmdcheck 45 | needs: check 46 | 47 | - uses: r-lib/actions/check-r-package@v2 48 | with: 49 | upload-snapshots: true 50 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | */.DS_Store 3 | *tar.gz 4 | sandbox/ 5 | 
build.conStruct.pkg.R 6 | */.Rapp.history 7 | .Rapp.history 8 | .RData 9 | *.Rout 10 | nohup.out 11 | testing/*/.Rapp.history 12 | testing/*/.RData 13 | testing/*/*.pdf 14 | testing/*/*conStruct.results.Robj 15 | testing/*/*data.block.Robj 16 | testing/*/*model.fit.Robj 17 | testing/*/*.out 18 | testing/*/*.Rout 19 | testing/xval/*.txt 20 | testing/xval/xvals*.Robj 21 | testing/xval/*.xvals.Robj 22 | testing/xval/*data.partitions.Robj 23 | testing/xval/*.xval.results.Robj 24 | src/*.so 25 | src/*.o 26 | src/stan_files/*.o 27 | src/stan_files/*.hpp 28 | testing/xval/*.log 29 | notes_for_next_release.txt 30 | testing/data_types 31 | CRAN-SUBMISSION -------------------------------------------------------------------------------- /CHANGELOG: -------------------------------------------------------------------------------- 1 | *********************** 2 | [UPCOMING] - XXXX-XX-XX 3 | *********************** 4 | 5 | Bug fixes: 6 | 7 | - Calculation of the allelic covariance matrix in calc.covariance was found to 8 | incorrectly use the sample covariance instead of the population covariance, 9 | which could lead to non-positive-definite covariance matrices in rare cases 10 | with small sample sizes. (@petrelharp, PR #34) 11 | 12 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: conStruct 2 | Version: 1.0.6 3 | Date: 2024-1-08 4 | Title: Models Spatially Continuous and Discrete Population Genetic Structure 5 | Description: A method for modeling genetic data as a combination of discrete 6 | layers, within each of which relatedness may decay continuously with geographic 7 | distance. This package contains code for running analyses (which are implemented 8 | in the modeling language 'rstan') and visualizing and interpreting output. See the 9 | paper for more details on the model and its utility. 
10 | Authors@R: person("Gideon", "Bradburd", email = "bradburd@umich.edu", role = c("aut", "cre")) 11 | License: GPL-3 12 | Encoding: UTF-8 13 | LazyData: true 14 | ByteCompile: true 15 | Depends: R (>= 3.4.0), Rcpp (>= 0.12.0), methods 16 | Imports: rstan (>= 2.26.0), rstantools (>= 1.5.0), caroline, gtools, foreach, parallel, doParallel 17 | LinkingTo: StanHeaders (>= 2.26.0), rstan (>= 2.26.0), BH (>= 1.66.0), Rcpp (>= 0.12.0), RcppEigen (>= 0.3.3.3.0), RcppParallel (>= 5.0.1) 18 | SystemRequirements: GNU make 19 | NeedsCompilation: yes 20 | RoxygenNote: 7.2.3 21 | Suggests: 22 | knitr, 23 | rmarkdown, 24 | maps 25 | VignetteBuilder: knitr 26 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | export(calculate.layer.contribution) 4 | export(compare.two.runs) 5 | export(conStruct) 6 | export(make.admix.pie.plot) 7 | export(make.all.the.plots) 8 | export(make.structure.plot) 9 | export(match.layers.x.runs) 10 | export(structure2conStruct) 11 | export(x.validation) 12 | import(Rcpp) 13 | import(methods) 14 | import(rstan) 15 | import(rstantools) 16 | importFrom(rstan,sampling) 17 | useDynLib(conStruct, .registration = TRUE) 18 | -------------------------------------------------------------------------------- /NEWS.md: -------------------------------------------------------------------------------- 1 | # conStruct 1.0.6 2 | ## Minor changes 3 | + Updated for compatibility with rstan 2.26 4 | 5 | # conStruct 1.0.5 6 | 7 | ## Minor changes 8 | + updated package to fully delegate the installation/compilation of stan models to rstantools. 9 | + added a github actions R CMD CHECK workflow to make sure that changes are compatible across all platforms. 
10 | 11 | # conStruct 1.0.4 12 | 13 | ## Bug fixes 14 | + updated data validation functions to be in compliance with new class inheritance coming in next R release 15 | 16 | 17 | # conStruct 1.0.3 18 | 19 | ## Major changes 20 | + added `...` to `conStruct` and `x.validation` so additional arguments can be passed to `rstan::sampling` 21 | 22 | ## Bug fixes 23 | + fixed aliasing due to inefficient deep copy in stan model 24 | + removed duplicated vignettes displayed on CRAN page 25 | 26 | # conStruct 1.0.2 27 | 28 | ## Bug fixes 29 | + updated Makevars and Makevars.win to be in compliance with CRAN policy 30 | 31 | # conStruct 1.0.1 32 | 33 | ## Major changes 34 | + following move to C++14 by Stan 35 | + `structure2conStruct` now works for multiple STRUCTURE file formats 36 | 37 | ## Bug fixes 38 | + users can now specify their own custom plotting colors in `make.all.the.plots` 39 | 40 | # conStruct 1.0.0 41 | 42 | ## Major changes 43 | + stan model blocks are now compiled at package installation instead of at a call to `conStruct` or `x.validation`. 44 | + `x.validation` is now parallelizable 45 | + new `model-comparison` vignette (see `vignette("model-comparison",package="conStruct")`) 46 | + alphaD parameter is now rescaled to reflect non-normalized geographic distances 47 | + compare.two.runs function added 48 | 49 | ## Bug fixes 50 | + Removed large files in git history on repo 51 | 52 | # conStruct 0.0.0.9000 53 | 54 | ## Beta release -------------------------------------------------------------------------------- /R/conStruct-package.R: -------------------------------------------------------------------------------- 1 | #' The 'conStruct' package. 2 | #' 3 | #' @description A method for modeling genetic data as a combination of discrete 4 | #' layers, within each of which relatedness may decay continuously with geographic 5 | #' distance. 
This package contains code for running analyses (which are implemented 6 | #' in the modeling language 'rstan') and visualizing and interpreting output. See the 7 | #' associated paper for more details on the model and its utility. 8 | #' 9 | #' @docType package 10 | #' @name conStruct-package 11 | #' @aliases conStruct-package 12 | #' @useDynLib conStruct, .registration = TRUE 13 | #' @import methods 14 | #' @import Rcpp 15 | #' @import rstantools 16 | #' @importFrom rstan sampling 17 | #' 18 | #' @references 19 | #' G.S. Bradburd, G.M. Coop, and P.L. Ralph (2018) . 20 | #' 21 | #' Stan Development Team (2018). RStan: the R interface to Stan. R package version 2.17.3. http://mc-stan.org 22 | NULL 23 | -------------------------------------------------------------------------------- /R/data.R: -------------------------------------------------------------------------------- 1 | #' Example dataset used in a \code{conStruct} analysis 2 | #' 3 | #' A simulated dataset containing the allele frequency 4 | #' and sampling coordinate data necessary to run a 5 | #' \code{conStruct} analysis. 6 | #' 7 | #' @format A list with two elements: 8 | #' \describe{ 9 | #' \item{allele.frequencies}{a matrix with one row for each of 10 | #' the 16 samples and one column for each of 10,000 loci, 11 | #' giving the frequency of the counted allele at each locus 12 | #' in each sample} 13 | #' \item{coords}{a matrix with one row for each of the 16 samples, 14 | #' in the same order as that of the allele frequency matrix, 15 | #' and two columns, the first giving the x-coordinate 16 | #' (or longitude), the second giving the y-coordinate (or latitude)} 17 | #' } 18 | # 19 | "conStruct.data" 20 | 21 | #' Example \code{data.block} generated by a \code{conStruct} analysis 22 | #' 23 | #' An example \code{data.block} object generated in a \code{conStruct} 24 | #' analysis from the raw data supplied by the user. 
This object is 25 | #' automatically saved and is used in several subsequent plotting functions. 26 | #' 27 | #' @format A list with 9 elements: 28 | #' \describe{ 29 | #' \item{\code{N}}{the number of samples included in the analysis} 30 | #' \item{\code{K}}{the number of clusters/layers included in the model} 31 | #' \item{\code{spatial}}{a boolean indicating whether the spatial 32 | #' model has been specified} 33 | #' \item{\code{L}}{the number of loci included in the analysis} 34 | #' \item{\code{coords}}{a matrix with one row for each of the \code{N} samples, 35 | #' in the same order as that of the \code{obsCov} matrix, 36 | #' and two columns, the first giving the x-coordinate 37 | #' (or longitude), the second giving the y-coordinate (or latitude)} 38 | #' \item{\code{obsCov}}{the sample allelic covariance matrix, 39 | #' in the same order as that of the \code{coords} matrix, 40 | #' with \code{N} rows and columns} 41 | #' \item{\code{geoDist}}{a matrix of pairwise geographic distances between 42 | #' samples, in the same order as that of the \code{obsCov}, 43 | #' with \code{N} rows and columns} 44 | #' \item{\code{sd.geoDist}}{the standard deviation of the raw geographic 45 | #' distance matrix, used for normalizing \code{geoDist} within the 46 | #' stan model} 47 | #' \item{\code{varMeanFreqs}}{the variance of the mean allele frequencies, 48 | #' averaged over choice of counted allele (passed to the model 49 | #' as a prior on the global covariance parameter)} 50 | #' } 51 | # 52 | "data.block" -------------------------------------------------------------------------------- /R/format.data.R: -------------------------------------------------------------------------------- 1 | #' Convert a dataset from STRUCTURE to conStruct format 2 | #' 3 | #' \code{structure2conStruct} converts a STRUCTURE dataset 4 | #' to conStruct format 5 | #' 6 | #' This function takes a population genetics dataset in 7 | #' STRUCTURE format and converts it to conStruct format. 
#' The STRUCTURE file can have one row per individual
#' and two columns per locus, or two rows per individual
#' and one column per locus. It can only contain bi-allelic SNPs.
#' Missing data is acceptable, but must be indicated with
#' a single value throughout the dataset.
#'
#' @param infile The name and path of the file in STRUCTURE format
#'        to be converted to \code{conStruct} format.
#' @param onerowperind Indicates whether the file format has
#'        one row per individual (\code{TRUE}) or two rows per
#'        individual (\code{FALSE}).
#' @param start.loci The index of the first column in the dataset
#'        that contains genotype data.
#' @param start.samples The index of the first row in the dataset
#'        that contains genotype data (e.g., after any headers).
#'        Default value is 1.
#' @param missing.datum The character or value used to denote
#'        missing data in the STRUCTURE dataset (often 0 or -9).
#' @param outfile The name and path of the file containing the
#'        \code{conStruct} formatted dataset to be generated
#'        by this function. The extension \code{.RData} is appended
#'        to this name, and the function stops if that file
#'        already exists.
#'
#' @details This function takes a STRUCTURE format data file and
#'        converts it to a \code{conStruct} format data file.
#'        This function can only be applied to diploid organisms.
#'        The STRUCTURE data file must be a plain text file.
#'        If there is extraneous text or column headers before the data
#'        starts, those extra lines should be deleted by hand or
#'        taken into account via the \code{start.samples} argument.
#'
#'        The STRUCTURE dataset can either be in the ONEROWPERIND=1
#'        file format, with one row per individual and two columns
#'        per locus, or the ONEROWPERIND=0 format, with two rows per
#'        individual and one column per locus. The first column of the
#'        STRUCTURE dataset should be individual names. There may be any number
#'        of other columns that contain non-genotype information before
#'        the first column that contains genotype data, but there can
#'        be no extraneous columns at the end of the dataset, after the
#'        genotype data.
#'
#'        The genotype data must be bi-allelic
#'        single nucleotide polymorphisms (SNPs). Applying this function
#'        to datasets with more than two alleles per locus may result in
#'        cryptic failure. For more details, see the \code{format-data}
#'        vignette.
#'
#' @return This function returns an allele frequency data matrix
#'        that can be used as the \code{freqs} argument in a conStruct
#'        analysis run using \code{\link{conStruct}}. It also saves
#'        this object as an .RData file so that it can be used in
#'        future analyses.
#'
#' @export
structure2conStruct <- function(infile,onerowperind,start.loci,start.samples=1,missing.datum,outfile){
    outfile <- paste0(outfile,".RData")
    # refuse to overwrite an existing output file
    if(file.exists(outfile)){
        stop("\noutfile already exists\n\n")
    }
    structure.data <- utils::read.table(infile,header=FALSE,skip=start.samples-1,stringsAsFactors=FALSE)
    sample.names <- get.sample.names(structure.data,onerowperind)
    # everything from start.loci to the last column is treated as genotype data
    genos <- structure.data[,start.loci:ncol(structure.data)]
    rm(structure.data)
    # one-row-per-individual data need two columns per locus;
    # two-row-per-individual data need two rows per individual
    if(onerowperind && ncol(genos) %% 2 != 0){
        stop("\nyou have mis-specified the genotype matrix\nplease check documentation\n\n")
    }
    if(!onerowperind && nrow(genos) %% 2 != 0){
        stop("\nyou have mis-specified the genotype matrix\nplease check documentation\n\n")
    }

    freqs <- get.freqs(genos,onerowperind,missing.datum)
    row.names(freqs) <- sample.names
    save(freqs,file=outfile)
    return(freqs)
}

# Take sample names from the first column of the STRUCTURE table; when each
# individual occupies two rows, keep only the name on every first row.
get.sample.names <- function(structure.data,onerowperind){
    sample.names <- structure.data[,1]
    if(!onerowperind){
        sample.names <- sample.names[seq(1,length(sample.names),by=2)]
    }
    return(sample.names)
}

# Choose, at random, which of the non-missing alleles observed in `genos`
# will be counted when computing frequencies. Stops if every value in `genos`
# equals `missing.datum`.
get.counted.allele <- function(genos,missing.datum){
    alleles <- unique(genos)
    if(all(alleles==missing.datum)){
        stop("\nyour dataset contains loci with all data missing. please remove and re-try.\n\n")
    }
    alleles <- alleles[!alleles==missing.datum]
    # index explicitly rather than calling sample(alleles,1): when a single
    # numeric allele (e.g. 3) is left, sample(3,1) would draw from 1:3 and
    # could return a value that is not an allele in the data
    counted <- alleles[sample.int(length(alleles),1)]
    return(counted)
}

# Dispatch to the frequency calculator matching the file layout and strip
# the column names from the result.
get.freqs <- function(genos,onerowperind,missing.datum){
    n.loci <- ifelse(onerowperind,ncol(genos)/2,ncol(genos))
    if(onerowperind){
        freqs <- get.freqs.onerowperind(genos,n.loci,missing.datum)
    } else {
        freqs <- get.freqs.tworowperind(genos,n.loci,missing.datum)
    }
    colnames(freqs) <- NULL
    return(freqs)
}

# Allele frequencies for the one-row-per-individual layout, in which locus l
# occupies columns 2l-1 and 2l of `genos`. A genotype is set to NA only when
# both allele copies equal `missing.datum`.
get.freqs.onerowperind <- function(genos,n.loci,missing.datum){
    # column indices of the first and second allele copy at each locus
    first.copy <- seq(1,2*n.loci,by=2)
    second.copy <- seq(2,2*n.loci,by=2)
    if(any(genos > 1)){
        # choose the counted allele per locus, pooling both of its columns;
        # choosing per column and indexing by locus would compare locus l
        # against an allele drawn from a different locus
        counted.alleles <- unlist(
                            lapply(1:n.loci,
                                function(l){
                                    get.counted.allele(c(genos[,first.copy[l]],
                                                         genos[,second.copy[l]]),
                                                       missing.datum)
                                }))
    } else {
        counted.alleles <- rep(1,n.loci)
    }
    freqs <- Reduce("cbind",
                lapply(1:n.loci,
                    function(l){
                        (genos[,first.copy[l]] == counted.alleles[l]) +
                        (genos[,second.copy[l]] == counted.alleles[l])
                    }))
    freqs <- freqs/2
    missing.data <- Reduce("cbind",
                        lapply(1:n.loci,
                            function(l){
                                (genos[,first.copy[l]] == missing.datum) +
                                (genos[,second.copy[l]] == missing.datum)
                            }))
    freqs[missing.data==2] <- NA
    return(freqs)
}

# Allele frequencies for the two-rows-per-individual layout, in which each
# column of `genos` is a locus and each consecutive pair of rows is one
# individual. A genotype is set to NA only when both allele copies equal
# `missing.datum`.
get.freqs.tworowperind <- function(genos,n.loci,missing.datum){
    if(any(genos > 1)){
        counted.alleles <- apply(genos,2,get.counted.allele,missing.datum)
    } else {
        counted.alleles <- rep(1,n.loci)
    }
    freqs <- Reduce("cbind",
                lapply(1:n.loci,
                    function(l){
                        (genos[seq(1,nrow(genos),by=2),l] == counted.alleles[l]) +
                        (genos[seq(2,nrow(genos),by=2),l] == counted.alleles[l])
                    }))
    freqs <- freqs/2
    missing.data <- Reduce("cbind",
                        lapply(1:n.loci,
                            function(l){
                                (genos[seq(1,nrow(genos),by=2),l] == missing.datum) +
                                (genos[seq(2,nrow(genos),by=2),l] == missing.datum)
                            }))
    freqs[missing.data==2] <- NA
    return(freqs)
}


--------------------------------------------------------------------------------
/R/process.model.fit.R:
--------------------------------------------------------------------------------
# Rescale geoDist by sd.geoDist (when one is stored in the data.block).
unstandardize.distances <- function(data.block){
    if(!is.null(data.block$sd.geoDist)){
        data.block$geoDist <- data.block$geoDist*data.block$sd.geoDist
    }
    return(data.block)
}

# Build one results object per chain, named "chain_1", "chain_2", ...
get.conStruct.results <- function(data.block,model.fit,n.chains){
    conStruct.results <- stats::setNames(
                            lapply(1:n.chains,
                                function(i){
                                    get.conStruct.chain.results(data.block,model.fit,i)
                                }),
                          paste0("chain_",1:n.chains))
    return(conStruct.results)
}

# Index of the iteration with the largest log posterior in chain `chain.no`.
get.MAP.iter <- function(model.fit,chain.no){
    lpd <- rstan::get_logposterior(model.fit)
    MAP.iter <- lapply(lpd,which.max)[[chain.no]]
    return(MAP.iter)
}

# Array (saved iterations x N x n.layers) of admixture proportions for one
# chain. It is initialised to 1 and only overwritten when the fitted model
# has a parameter whose name matches "w".
get.admix.props <- function(model.fit,chain.no,N,n.layers){
    admix.props <- array(1,dim=c(model.fit@sim$n_save[chain.no],N,n.layers))
    if(any(grepl("w",model.fit@model_pars))){
        for(k in 1:n.layers){
            admix.props[,,k] <- rstan::extract(model.fit,
                                    pars=unlist(lapply(1:N,function(j){sprintf("w[%s,%s]",j,k)})),
                                    permuted=FALSE,inc_warmup=TRUE)[,chain.no,]
        }
    }
    return(admix.props)
}

# Array (saved iterations x N x N) of the parCov[i,j] draws for one chain.
get.par.cov <- function(model.fit,chain.no,N){
    par.cov <- array(NA,dim=c(model.fit@sim$n_save[chain.no],N,N))
    for(i in 1:N){
        for(j in 1:N){
            my.par <- sprintf("parCov[%s,%s]",i,j)
            par.cov[,i,j] <- rstan::extract(model.fit,pars=my.par,inc_warmup=TRUE,permuted=FALSE)[,chain.no,]
        }
    }
    return(par.cov)
}

# Draws of the "nugget" parameter for one chain (`N` is accepted but unused).
get.nuggets <- function(model.fit,chain.no,N){
    nuggets <- rstan::extract(model.fit,pars="nugget",inc_warmup=TRUE,permuted=FALSE)[,chain.no,]
    return(nuggets)
}

# Draws of the "gamma" parameter for one chain.
get.gamma <- function(model.fit,chain.no){
    gamma <- rstan::extract(model.fit,pars="gamma",inc_warmup=TRUE,permuted=FALSE)[,chain.no,]
    return(gamma)
}

# Placeholder alpha parameters (all zero) for models without any.
get.null.alpha.params <- function(n.iter){
    alpha.params <- list("alpha0" = rep(0,n.iter),
                         "alphaD" = rep(0,n.iter),
                         "alpha2" = rep(0,n.iter))
    return(alpha.params)
}

# Draws of every model parameter whose name contains "alpha" for one chain
# and layer, as a list named by parameter. With more than one layer the
# parameters are extracted as "<name>[layer]"; with a single layer they are
# extracted by bare name. Models with no alpha parameters get zero-filled
# placeholders. alphaD is divided by sd.geoDist when the data.block has one.
get.alpha.params <- function(model.fit,data.block,chain.no,layer,n.layers){
    alpha.pars <- model.fit@model_pars[grepl("alpha",model.fit@model_pars)]
    if(length(alpha.pars) !=0 ){
        if(n.layers > 1){
            alpha.params <- stats::setNames(
                                lapply(1:length(alpha.pars),
                                    function(i){
                                        rstan::extract(model.fit,
                                            pars=paste0(alpha.pars[i],"[",layer,"]"),
                                            inc_warmup=TRUE,permuted=FALSE)[,chain.no,]
                                    }),alpha.pars)
        } else {
            alpha.params <- stats::setNames(
                                lapply(1:length(alpha.pars),
                                    function(i){
                                        rstan::extract(model.fit,
                                            pars=alpha.pars[i],
                                            inc_warmup=TRUE,permuted=FALSE)[,chain.no,]
                                    }),alpha.pars)
        }
    } else {
        alpha.params <- get.null.alpha.params(model.fit@sim$n_save[chain.no])
    }
    if(!is.null(data.block$sd.geoDist)){
        alpha.params$alphaD <- alpha.params$alphaD/data.block$sd.geoDist
    }
    return(alpha.params)
}

# Placeholder phi (all zero) for models without a phi parameter.
get.null.phi <- function(n.iter){
    phi <- rep(0,n.iter)
    return(phi)
}

# Draws of phi[layer] for one chain, or zeros when the model has no
# parameter whose name matches "phi".
get.layer.phi <- function(model.fit,chain.no,layer){
    has.phi <- any(grepl("phi",model.fit@model_pars))
    if(has.phi){
        phi <- rstan::extract(model.fit,
                    pars=paste0("phi","[",layer,"]"),
                    inc_warmup=TRUE,permuted=FALSE)[,chain.no,]
    } else {
        phi <- get.null.phi(model.fit@sim$n_save[chain.no])
    }
    return(phi)
}

# Return the function(layer.params,data.block) that computes a layer's
# covariance matrix for the model described by data.block$K and
# data.block$spatial. With K == 1 the phi term is left out.
get.cov.function <- function(data.block){
    if(data.block$K == 1){
        if(data.block$spatial){
            cov.func <- function(layer.params,data.block){
                return(layer.params$alpha0 *
                        exp(-(layer.params$alphaD*data.block$geoDist)^layer.params$alpha2))
            }
        }
        if(!data.block$spatial){
            cov.func <- function(layer.params,data.block){
                return(matrix(0,nrow=data.block$N,ncol=data.block$N))
            }
        }
    } else {
        if(data.block$spatial){
            cov.func <- function(layer.params,data.block){
                return(layer.params$alpha0 *
                        exp(-(layer.params$alphaD*data.block$geoDist)^layer.params$alpha2) +
                        layer.params$phi)
            }
        }
        if(!data.block$spatial){
            cov.func <- function(layer.params,data.block){
                return(matrix(layer.params$phi,nrow=data.block$N,ncol=data.block$N))
            }
        }
    }
    return(cov.func)
}

# List with one covariance matrix per iteration, each computed from the
# i-th draw of every element of `layer.params`.
get.layer.cov <- function(layer.params,data.block,n.iter){
    cov.function <- get.cov.function(data.block)
    layer.cov <- lapply(1:n.iter,
                    function(i){
                        cov.function(layer.params=
                                        lapply(layer.params,"[[",i),
                                     data.block)
                    })
    return(layer.cov)
}

# Collect the alpha parameters, phi and the per-iteration layer covariance
# for a single layer of a single chain.
get.layer.params <- function(model.fit,data.block,chain.no,layer,n.layers,n.iter){
    layer.params <- get.alpha.params(model.fit,data.block,chain.no,layer,n.layers)
    layer.params[["phi"]] <- get.layer.phi(model.fit,chain.no,layer)
    layer.cov <- get.layer.cov(layer.params,data.block,n.iter)
    layer.params <- c(layer.params,list("layer.cov"=layer.cov))
    return(layer.params)
}

# Layer parameters for every layer of one chain, named "layer_1", "layer_2",
# ..., returned as an object of class layer.params.
get.layer.params.list <- function(model.fit,data.block,chain.no,n.iter){
    layer.params <- stats::setNames(
                        lapply(1:data.block$K,
                            function(i){
                                get.layer.params(model.fit,data.block,chain.no,i,data.block$K,n.iter)
                            }),
                        paste("layer",1:data.block$K,sep="_"))
    layer.params <- make.layer.params.S3(layer.params)
    return(layer.params)
}

# Tag a list of per-layer parameters with the S3 class "layer.params".
make.layer.params.S3 <- function(layer.params){
    class(layer.params) <- "layer.params"
    return(layer.params)
}

#' An S3 print method for class layer.params
#'
#' @param x an object of class \code{layer.params}
#' @param ... accepted for compatibility with the \code{print} generic; currently unused
#' @return prints a top-level summary of the layer.params and
#'        invisibly returns \code{x}
#' @method print layer.params
print.layer.params <- function(x,...){
    # str() does the printing itself and returns NULL, so wrapping it in
    # print() would emit a stray "NULL" after the summary
    utils::str(x,max.level=1)
    invisible(x)
}

# Pick out the value of `param` at iteration `MAP.iter`.
#
# Supported inputs, with iterations along the first dimension where there is
# one: numeric (including integer) vectors, plain lists, matrices, 3-d arrays
# and objects of class layer.params. A 3-d array slice that collapses to a
# vector is returned as a one-column matrix. Any other input stops with an
# explicit error.
index.MAP <- function(param,MAP.iter){
    if(inherits(param,"layer.params")){
        return(index.MAP.layer.params.list(param,MAP.iter))
    }
    n.dims <- length(dim(param))
    if(inherits(param,"array") && n.dims == 3){
        MAP.param <- param[MAP.iter,,]
        if(is.null(dim(MAP.param))){
            MAP.param <- matrix(MAP.param,nrow=length(MAP.param),ncol=1)
        }
    } else if(inherits(param,"matrix") && n.dims == 2){
        MAP.param <- param[MAP.iter,]
    } else if(inherits(param,"list")){
        MAP.param <- param[[MAP.iter]]
    } else if(is.numeric(param) && n.dims == 0){
        # is.numeric() rather than inherits(param,"numeric"), so that
        # integer vectors are indexed too
        MAP.param <- param[MAP.iter]
    } else {
        stop("\nindex.MAP cannot index an object of class ",
             paste(class(param),collapse="/"),"\n\n")
    }
    return(MAP.param)
}

# Apply index.MAP to every parameter of a single layer.
index.MAP.layer.params <- function(layer.params,MAP.iter){
    MAP.layer.params <- lapply(layer.params,index.MAP,MAP.iter)
    return(MAP.layer.params)
}

# Apply index.MAP.layer.params to every layer; the result is a plain list.
index.MAP.layer.params.list <- function(layer.params.list,MAP.iter){
    MAP.layer.params.list <- lapply(layer.params.list,index.MAP.layer.params,MAP.iter)
    return(MAP.layer.params.list)
}

# Number of saved iterations recorded for chain `chain.no`.
get.n.iter <- function(model.fit,chain.no){
    n.iter <- model.fit@sim$n_save[chain.no]
    return(n.iter)
}

# Tag a results list with the S3 class "conStruct.results".
make.conStruct.results.S3 <- function(conStruct.results){
    class(conStruct.results) <- "conStruct.results"
    return(conStruct.results)
}

#' An S3 print method for class conStruct.results
#'
#' @param x an object of class \code{conStruct.results}
#' @param ... accepted for compatibility with the \code{print} generic; currently unused
#' @return prints a top-level summary of the conStruct.results and
#'        invisibly returns \code{x}
#' @method print conStruct.results
print.conStruct.results <- function(x,...){
    # see print.layer.params: avoid printing the NULL returned by str()
    utils::str(x,max.level=1)
    invisible(x)
}

# Assemble the results for one chain: the full "posterior" list and the
# "MAP" list holding each posterior element indexed at the iteration with
# the largest log posterior.
get.conStruct.chain.results <- function(data.block,model.fit,chain.no){
    n.iter <- get.n.iter(model.fit,chain.no)
    posterior <- list("n.iter" = n.iter,
                      "lpd" = rstan::get_logposterior(model.fit)[[chain.no]],
                      "nuggets" = get.nuggets(model.fit,chain.no,data.block$N),
                      "par.cov" = get.par.cov(model.fit,chain.no,data.block$N),
                      "gamma" = get.gamma(model.fit,chain.no),
                      "layer.params" = get.layer.params.list(model.fit,data.block,chain.no,n.iter),
                      "admix.proportions" = get.admix.props(model.fit,chain.no,data.block$N,data.block$K))
    MAP.iter <- get.MAP.iter(model.fit,chain.no)
    MAP <- lapply(posterior,function(X){index.MAP(X,MAP.iter)})
    # the first slot (n.iter) has no meaningful MAP value: reuse it to
    # record which iteration the MAP estimates were taken from
    names(MAP)[[1]] <- "index.iter"
    MAP[["index.iter"]] <- MAP.iter
    conStruct.results <- list("posterior" = posterior,"MAP" = MAP)
    conStruct.results <- make.conStruct.results.S3(conStruct.results)
    return(conStruct.results)
}
--------------------------------------------------------------------------------
/R/run.conStruct.R:
--------------------------------------------------------------------------------
#' Run a conStruct analysis.
#'
#' \code{conStruct} runs a conStruct analysis of genetic data.
#'
#' This function initiates an analysis that uses
#' geographic and genetic relationships between samples
#' to estimate sample membership (admixture proportions) across
#' a user-specified number of layers.
#'
#' @param spatial A logical indicating whether to perform a spatial analysis.
11 | #' Default is \code{TRUE}. 12 | #' @param K An \code{integer} that indicates the number of layers to be 13 | #' included in the analysis. 14 | #' @param freqs A \code{matrix} of allele frequencies with one column per 15 | #' locus and one row per sample. 16 | #' Missing data should be indicated with \code{NA}. 17 | #' @param geoDist A full \code{matrix} of geographic distance between samples. 18 | #' If \code{NULL}, user can only run the nonspatial model. 19 | #' @param coords A \code{matrix} giving the longitude and latitude 20 | #' (or X and Y coordinates) of the samples. 21 | #' @param prefix A character \code{vector} giving the prefix to be attached 22 | #' to all output files. 23 | #' @param n.chains An integer indicating the number of MCMC chains to be run 24 | #' in the analysis. Default is 1. 25 | #' @param n.iter An \code{integer} giving the number of iterations each MCMC 26 | #' chain is run. Default is 1e3. If the number of iterations 27 | #' is greater than 500, the MCMC is thinned so that the number 28 | #' of retained iterations is 500 (before burn-in). 29 | #' @param make.figs A \code{logical} value indicating whether to automatically 30 | #' make figures once the analysis is complete. Default is 31 | #' \code{TRUE}. 32 | #' @param save.files A \code{logical} value indicating whether to automatically 33 | #' save output and intermediate files once the analysis is 34 | #' complete. Default is \code{TRUE}. 35 | #' @param ... Further options to be passed to rstan::sampling (e.g., adapt_delta). 36 | #' 37 | #' @return This function returns a list with one entry for each chain run 38 | #' (specified with \code{n.chains}). The entry for each chain is named 39 | #' "chain_X" for the Xth chain. The components of the entries for each 40 | #' are detailed below: 41 | #' \itemize{ 42 | #' \item \code{posterior} gives parameter estimates over the posterior 43 | #' distribution of the MCMC. 
44 | #' \itemize{ 45 | #' \item \code{n.iter} number of MCMC iterations retained for 46 | #' analysis (half of the \code{n.iter} argument 47 | #' specified in the function call). 48 | #' \item \code{lpd} vector of log posterior density over the retained 49 | #' MCMC iterations. 50 | #' \item \code{nuggets} matrix of estimated nugget parameters with 51 | #' one row per MCMC iteration and one column per sample. 52 | #' \item \code{par.cov} array of estimated parametric covariance matrices, 53 | #' for which the first dimension is the number of MCMC iterations. 54 | #' \item \code{gamma} vector of estimated gamma parameter. 55 | #' \item \code{layer.params} list summarizing estimates of layer-specific 56 | #' parameters. There is one entry for each layer specified, and the 57 | #' entry for the kth layer is named "layer_k". 58 | #' \itemize{ 59 | #' \item \code{alpha0} vector of estimated alpha0 parameter in the 60 | #' kth layer. 61 | #' \item \code{alphaD} vector of estimated alphaD parameter in the 62 | #' kth layer. 63 | #' \item \code{alpha2} vector of estimated alpha2 parameter in the 64 | #' kth layer. 65 | #' \item \code{phi} vector of estimated phi parameter in the 66 | #' kth layer. 67 | #' \item \code{layer.cov} list of estimated layer-specific 68 | #' covariance matrices (one per MCMC iteration) in the kth layer. 69 | #' } 70 | #' \item \code{admix.proportions} array of estimated admixture proportions. 71 | #' The first dimension is the number of MCMC iterations, 72 | #' the second is the number of samples, 73 | #' and the third is the number of layers. 74 | #' } 75 | #' \item \code{MAP} gives point estimates of the parameters listed in the \code{posterior} 76 | #' list described above. Values are indexed at the MCMC iteration 77 | #' with the greatest posterior probability. 
78 | #' \itemize{ 79 | #' \item \code{index.iter} the iteration of the MCMC with the highest 80 | #' posterior probability, which is used to index all parameters 81 | #' included in the \code{MAP} list 82 | #' \item \code{lpd} the greatest value of the posterior probability 83 | #' \item \code{nuggets} point estimate of nugget parameters 84 | #' \item \code{par.cov} point estimate of parametric covariance 85 | #' \item \code{gamma} point estimate of gamma parameter 86 | #' \item \code{layer.params} point estimates of all layer-specific parameters 87 | #' \item \code{admix.proportions} point estimates of admixture proportions. 88 | #' } 89 | #' } 90 | #' 91 | #' @details This function acts as a wrapper around a STAN model block determined 92 | #' by the user-specified model (e.g., a spatial model with 3 layers, 93 | #' or a nonspatial model with 5 layers). 94 | #' User-specified data are checked for appropriate format and consistent dimensions, 95 | #' then formatted into a \code{data.block}, 96 | #' which is then passed to the STAN model block. 97 | #' Along with the \code{conStruct.results} output described above, 98 | #' several objects are saved during the course of a \code{conStruct} call 99 | #' (if \code{save.files=TRUE}). 100 | #' These are the \code{data.block}, which contains all data passed to the STAN model block, 101 | #' \code{model.fit}, which is unprocessed results of the STAN run in \code{stanfit} format, 102 | #' and the \code{conStruct.results}, which are saved in the course of the function call 103 | #' in addition to being returned. 104 | #' If \code{make.figs=TRUE}, running \code{conStruct} will also generate many output figures, 105 | #' which are detailed in the function \code{make.all.the.plots} in this package. 
#'
#' @examples
#' # load example dataset
#' data(conStruct.data)
#'
#' # run example spatial analysis with K=1
#' #
#' # for this example, make.figs and save.files
#' # are set to FALSE, but most users will want them
#' # set to TRUE
#' my.run <- conStruct(spatial = TRUE,
#'                     K = 1,
#'                     freqs = conStruct.data$allele.frequencies,
#'                     geoDist = conStruct.data$geoDist,
#'                     coords = conStruct.data$coords,
#'                     prefix = "test",
#'                     n.chains = 1,
#'                     n.iter = 1e3,
#'                     make.figs = FALSE,
#'                     save.files = FALSE)
#'
#' @import rstan
#' @export
conStruct <- function(spatial=TRUE,K,freqs,geoDist=NULL,coords,prefix="",n.chains=1,n.iter=1e3,make.figs=TRUE,save.files=TRUE,...){
    # snapshot the named arguments before any local variable is created,
    # so that the checks see exactly what the user supplied
    args <- as.list(environment())
    check.call(args)
    freq.data <- process.freq.data(freqs)
    data.block <- make.data.block(K,freq.data,coords,spatial,geoDist)
    if(save.files){
        # written before sampling; overwritten below once
        # unstandardize.distances() has been applied
        save(data.block,file=paste0(prefix,"_data.block.Robj"))
    }
    stan.model <- pick.stan.model(spatial,K)
    model.fit <- rstan::sampling(object = stanmodels[[stan.model]],
                                 refresh = min(floor(n.iter/10),500),
                                 data = data.block,
                                 iter = n.iter,
                                 chains = n.chains,
                                 # thin so that at most ~500 iterations are retained
                                 thin = max(1,floor(n.iter/500)),
                                 save_warmup = FALSE,
                                 ...)
    conStruct.results <- get.conStruct.results(data.block,model.fit,n.chains)
    data.block <- unstandardize.distances(data.block)
    if(save.files){
        save(data.block,file=paste0(prefix,"_data.block.Robj"))
        save(model.fit,file=paste(prefix,"model.fit.Robj",sep="_"))
        save(conStruct.results,file=paste(prefix,"conStruct.results.Robj",sep="_"))
    }
    if(make.figs){
        make.all.the.plots(conStruct.results,data.block,prefix,layer.colors=NULL)
    }
    return(conStruct.results)
}

# Stop unless the data.block list carries every element the checks below rely on.
validate.data.list <- function(data.block){
    if(!"spatial" %in% names(data.block)){
        stop("\nUser must specify a \"spatial\" option\n\n")
    }
    for(required in c("N","K","L","obsCov")){
        if(!required %in% names(data.block)){
            stop(sprintf("\nUser must specify a \"%s\"\n\n",required))
        }
    }
    return(invisible("list elements validated"))
}

# Stop unless N, nrow(obsCov) and (when present) nrow(geoDist) all agree.
validate.n.samples <- function(data.block){
    n.samples <- c(data.block$N,nrow(data.block$obsCov))
    if(!is.null(data.block$geoDist)){
        n.samples <- c(n.samples,nrow(data.block$geoDist))
    }
    if(length(unique(n.samples)) > 1){
        stop("\nthe number of samples is not consistent across entries in the data.block\n\n")
    }
    return(invisible("n.samples validated"))
}

# For a spatial model, stop unless a usable distance matrix is present.
validate.model <- function(data.block){
    if(data.block$spatial){
        if(is.null(data.block$geoDist)){
            stop("\nyou have specified a spatial model, but you have not specified a matrix of pairwise geographic distances\n\n")
        }
        # the NA test must come first: any(x < 0) is NA when x holds NAs
        # and no negatives, which would make the if() below fail
        if(any(is.na(data.block$geoDist))){
            stop("\nyou have specified an invalid distance matrix that contains non-numeric values\n\n")
        }
        if(any(data.block$geoDist < 0)){
            stop("\nyou have specified an invalid distance matrix that contains negative values\n\n")
        }
    }
    return(invisible("model validated"))
}

# Tag a list as an S3 object of class "data.block".
make.data.block.S3 <- function(data.block){
    class(data.block) <- "data.block"
    return(data.block)
}

#' An S3 print method for class data.block
#'
#' @param x an object of class \code{data.block}
#' @param ... further arguments, accepted for compatibility with the
#'            \code{print} generic
#' @return prints a top-level summary of the data.block and
#'         invisibly returns \code{x}
#' @method print data.block
print.data.block <- function(x,...){
    # str() does the printing and returns NULL; do not print that NULL
    utils::str(x,max.level=1)
    return(invisible(x))
}

# Run all data.block checks, report what was read, and return the
# list with class "data.block".
validate.data.block <- function(data.block){
    message("\nchecking data.block\n")
    validate.data.list(data.block)
    validate.n.samples(data.block)
    message(sprintf("\treading %s samples",data.block$N))
    message(sprintf("\treading %s loci",data.block$L))
    if(data.block$L <= data.block$N){
        stop("\nyour data must have a greater number of loci than there are samples\n")
    }
    message("\nchecking specified model\n")
    validate.model(data.block)
    if(data.block$spatial){
        message(sprintf("\nuser has specified a spatial model with %s layer(s)\n",data.block$K))
    } else {
        message(sprintf("\nuser has specified a purely discrete model with %s layer(s)\n",data.block$K))
    }
    data.block <- make.data.block.S3(data.block)
    return(data.block)
}

# Map (spatial, number of layers) to the name of the compiled Stan model:
# "oneK", "multiK", "space_oneK" or "space_multiK".
pick.stan.model <- function(spatial,n.layers){
    if(n.layers < 1){
        stop("\nthe number of layers must be at least 1\n")
    }
    name <- if(n.layers == 1) "oneK" else "multiK"
    if(spatial){
        name <- paste0("space_",name)
    }
    return(name)
}

# Tag a list as an S3 object of class "freq.data".
make.freq.data.list.S3 <- function(freq.data){
    class(freq.data) <- "freq.data"
    return(freq.data)
}

#' An S3 print method for class freq.data
#'
#' @param x an object of class \code{freq.data}
#' @param ... further arguments, accepted for compatibility with the
#'            \code{print} generic
#' @return prints a top-level summary of the freq.data and
#'         invisibly returns \code{x}
#' @method print freq.data
print.freq.data <- function(x,...){
    # str() does the printing and returns NULL; do not print that NULL
    utils::str(x,max.level=1)
    return(invisible(x))
}

# TRUE when all non-missing frequencies at a locus share a single value.
identify.invar.sites <- function(freqs){
    invar <- length(unique(freqs[which(!is.na(freqs))])) == 1
    return(invar)
}

# Remove invariant loci (columns) from the frequency matrix.
drop.invars <- function(freqs){
    invars <- apply(freqs,2,identify.invar.sites)
    # drop=FALSE keeps a matrix even if only one locus survives
    freqs <- freqs[,!invars,drop=FALSE]
    return(freqs)
}

# TRUE when a locus is missing (NA) in every sample.
identify.missing.sites <- function(freqs){
    missing <- all(is.na(freqs))
    return(missing)
}

# Remove loci (columns) that are missing in every sample.
drop.missing <- function(freqs){
    missing <- apply(freqs,2,identify.missing.sites)
    # drop=FALSE keeps a matrix even if only one locus survives
    freqs <- freqs[,!missing,drop=FALSE]
    return(freqs)
}

# Sample-by-sample allelic covariance computed from a samples x loci
# frequency matrix; the diagonal is fixed at 0.25.
calc.covariance <- function(freqs){
    x <- t(freqs)
    # per-sample mean frequency across loci, computed once
    mean.freqs <- colMeans(x,na.rm=TRUE)
    allelic.covariance <- (1 - 1/nrow(freqs)) * stats::cov(x,use="pairwise.complete.obs") -
                            (1/2) * outer(mean.freqs, 1-mean.freqs, "*") -
                            (1/2) * outer(1-mean.freqs, mean.freqs, "*") + 1/4
    diag(allelic.covariance) <- 0.25
    return(allelic.covariance)
}

# Stop unless every eigenvalue of the sample covariance is strictly positive.
pos.def.check <- function(obsCov){
    eigenvalues <- eigen(obsCov,only.values=TRUE)$values
    if(any(eigenvalues <= 0)){
        stop("\n\nThe sample covariance is not positive definite. Check to make sure that none of your samples are identical (after dropping missing data). If that does not fix the problem, try dropping the loci or samples with the most missing data.\n\n")
    }
    return("pos.def.check")
}

# Filter loci, compute and check the sample covariance, and return a
# "freq.data" object holding freqs, obsCov and n.loci.
process.freq.data <- function(freqs){
    freqs <- drop.invars(freqs)
    freqs <- drop.missing(freqs)
    n.loci <- ncol(freqs)
    obsCov <- calc.covariance(freqs)
    if(any(is.na(obsCov))){
        stop("\n\nAfter dropping invariant loci, one or more pairs of samples have no genotyped loci in common, so relatedness between them cannot be assessed.\n\n")
    }
    pos.def.check(obsCov)
    freq.data <- list("freqs" = freqs,
                      "obsCov" = obsCov,
                      "n.loci" = n.loci)
    freq.data <- make.freq.data.list.S3(freq.data)
    return(freq.data)
}

# Divide a distance matrix by the standard deviation of its upper
# triangle; returns both the scaled matrix and the scaling factor
# (both NULL when D is NULL).
standardize.distances <- function(D){
    if(!is.null(D)){
        stdev.D <- stats::sd(D[upper.tri(D)])
        std.D <- D/stdev.D
    } else {
        std.D <- NULL
        stdev.D <- NULL
    }
    sd.dist.list <- list("std.D" = std.D,
                         "stdev.D" = stdev.D)
    return(sd.dist.list)
}

# Assemble, validate and return the data.block list passed to the Stan model.
make.data.block <- function(K,freq.data,coords,spatial,geoDist=NULL){
    sd.dist.list <- standardize.distances(geoDist)
    data.block <- list("N" = nrow(coords),
                       "K" = K,
                       "spatial" = spatial,
                       "L" = freq.data$n.loci,
                       "coords" = coords,
                       "obsCov" = freq.data$obsCov,
                       "geoDist" = sd.dist.list$std.D,
                       "sd.geoDist" = sd.dist.list$stdev.D,
                       "varMeanFreqs" = mean(0.5*colMeans(freq.data$freqs-0.5,na.rm=TRUE)^2 + 0.5*colMeans(1-freq.data$freqs-0.5,na.rm=TRUE)^2))
    data.block <- validate.data.block(data.block)
    return(data.block)
}

# Run every argument check on the list of arguments given to conStruct().
check.call <- function(args){
    check.spatial.arg(args)
    check.K.arg(args)
    check.freqs.arg(args)
    check.geoDist.arg(args)
    check.coords.arg(args)
    return(invisible("args checked"))
}

# Stop unless "spatial" is a single value equal to TRUE or FALSE.
check.spatial.arg <- function(args){
    spatial <- args[["spatial"]]
    # isTRUE() makes NULL, NA and length > 1 input fail cleanly
    # instead of breaking the if() condition
    valid <- isTRUE(spatial == TRUE) || isTRUE(spatial == FALSE)
    if(!valid){
        stop("\nthe \"spatial\" argument must be either TRUE or FALSE\n")
    }
    return(invisible("spatial arg checked"))
}

# Stop unless "K" is a single, finite, whole number that is at least 1.
check.K.arg <- function(args){
    K <- args[["K"]]
    if(length(K) > 1){
        stop("\nyou have specified more than one value for the \"K\" argument\n")
    }
    if(!is.numeric(K) || length(K) == 0 || !is.finite(K)){
        stop("\nyou have specified a non-numeric value for the \"K\" argument\n")
    }
    if(K %% 1 != 0){
        stop("\nyou have specified a non-integer value for the \"K\" argument\n")
    }
    if(K < 1){
        stop("\nyou have specified a value less than 1 for the \"K\" argument\n")
    }
    return(invisible("K arg checked"))
}

# Stop unless "freqs" is a matrix whose non-missing values lie in [0,1].
check.freqs.arg <- function(args){
    freqs <- args[["freqs"]]
    if(!is.matrix(freqs)){
        stop("\nthe \"freqs\" argument must be of class \"matrix\"\n")
    }
    if(any(freqs > 1,na.rm=TRUE)){
        stop("\nall values of the \"freqs\" argument must be less than or equal to 1\n")
    }
    if(any(freqs < 0,na.rm=TRUE)){
        stop("\nall values of the \"freqs\" argument must be greater than or equal to 0\n")
    }
    return(invisible("freqs arg checked"))
}

# Stop unless "geoDist" is supplied when the model is spatial and, when
# supplied, is a square, symmetric, non-negative matrix without NAs.
check.geoDist.arg <- function(args){
    geoDist <- args[["geoDist"]]
    if(args[["spatial"]]){
        if(is.null(geoDist)){
            stop("\nif the \"spatial\" argument is TRUE, you must specify a \"geoDist\" argument\n")
        }
    }
    if(!is.null(geoDist)){
        if(!is.matrix(geoDist)){
            stop("\nthe \"geoDist\" argument must be of class \"matrix\"\n")
        }
        if(nrow(geoDist) != ncol(geoDist)){
            stop("\nyou have specified a \"geoDist\" argument with an unequal number of rows and columns\n")
        }
        # tested before the sign check, which would otherwise evaluate to NA
        if(any(is.na(geoDist))){
            stop("\nthe \"geoDist\" argument must not contain missing values\n")
        }
        if(any(geoDist < 0)){
            stop("\nall values of the \"geoDist\" argument must be greater than or equal to 0\n")
        }
        # dimnames are stripped so that mismatched row/column names do not
        # make an otherwise symmetric matrix fail the test
        if(!isSymmetric(unname(geoDist))){
            stop("\nyou must specify a symmetric matrix for the \"geoDist\" argument \n")
        }
    }
    return(invisible("geoDist arg checked"))
}

# Stop unless "coords" is a matrix with exactly two columns.
check.coords.arg <- function(args){
    coords <- args[["coords"]]
    if(!is.matrix(coords)){
        stop("\nthe \"coords\" argument must be of class \"matrix\"\n")
    }
    if(ncol(coords) != 2){
        stop("\nthe \"coords\" argument must be a matrix with two columns\n")
    }
    return(invisible("coords arg checked"))
}

--------------------------------------------------------------------------------
/R/stanmodels.R:
--------------------------------------------------------------------------------
# Generated by rstantools.  Do not edit by hand.

# names of stan models
stanmodels <- c("multiK", "oneK", "space_multiK", "space_oneK")

# load each stan module
Rcpp::loadModule("stan_fit4multiK_mod", what = TRUE)
Rcpp::loadModule("stan_fit4oneK_mod", what = TRUE)
Rcpp::loadModule("stan_fit4space_multiK_mod", what = TRUE)
Rcpp::loadModule("stan_fit4space_oneK_mod", what = TRUE)

# instantiate each stanmodel object
stanmodels <- sapply(stanmodels, function(model_name) {
  # create C++ code for stan model
  stan_file <- if(dir.exists("stan")) "stan" else file.path("inst", "stan")
  stan_file <- file.path(stan_file, paste0(model_name, ".stan"))
  stanfit <- rstan::stanc_builder(stan_file,
                                  allow_undefined = TRUE,
                                  obfuscate_model_name = FALSE)
  stanfit$model_cpp <- list(model_cppname = stanfit$model_name,
                            model_cppcode = stanfit$cppcode)
  # create stanmodel object
  methods::new(Class = "stanmodel",
               model_name = stanfit$model_name,
               model_code = stanfit$model_code,
               model_cpp = stanfit$model_cpp,
               mk_cppmodule = function(x) get(paste0("rstantools_model_", model_name)))
})

--------------------------------------------------------------------------------
/R/zzz.R:
-------------------------------------------------------------------------------- 1 | .onLoad <- function(libname, pkgname) { 2 | modules <- paste0("stan_fit4", names(stanmodels), "_mod") 3 | for (m in modules) loadModule(m, what = TRUE) 4 | } -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | ## conStruct (continuous Structure) ReadMe 3 | 4 | 5 | [![R-CMD-check](https://github.com/gbradburd/conStruct/actions/workflows/R-CMD-check.yaml/badge.svg)](https://github.com/gbradburd/conStruct/actions/workflows/R-CMD-check.yaml) 6 | 7 | 8 | 9 | This repo contains the code for the method **conStruct** - a statistical tool 10 | for modeling continuous and discrete population genetic structure. 11 | 12 | The manuscript, data files, and analysis scripts associated with the publication 13 | "Inferring Continuous and Discrete Population Genetic Structure Across Space," 14 | have been moved, and can be accessed at the links below: 15 | 16 | * [paper](https://doi.org/10.1534/genetics.118.301333) 17 | * [manuscript repo](https://github.com/gbradburd/conStruct-paper) 18 | * [data dryad](https://doi.org/10.5061/dryad.5qj7h09) 19 | 20 | ## Installation 21 | 22 | ### Latest release 23 | 24 | To install the most recent release of the **conStruct** R package: 25 | 26 | ```r 27 | install.packages("conStruct") 28 | ``` 29 | 30 | Upon installation, the **conStruct** models will be compiled, which may 31 | spit lots of text, and possibly some warnings, to your screen. This is 32 | totally normal, and you should only be concerned if you get errors 33 | and the installation fails. 
34 | 35 | 36 | ### Development version 37 | 38 | To install the development version from github: 39 | 40 | ```r 41 | library(devtools) 42 | install_github("gbradburd/conStruct",build_vignettes=TRUE) 43 | ``` 44 | 45 | Note that Windows users may have to download Rtools as a 46 | standalone executable before trying to install the **conStruct** R package. 47 | 48 | 49 | ## Getting Started 50 | 51 | A complete manual for all documented functions is available [here](https://github.com/gbradburd/conStruct/blob/master/man/conStruct-manual.pdf). 52 | 53 | In addition, there are four vignettes included in the package that walk through 54 | various steps in the analysis pipeline in detail. You can find them using: 55 | 56 | ```r 57 | # formatting data 58 | vignette(topic="format-data",package="conStruct") 59 | 60 | # how to run a conStruct analysis 61 | vignette(topic="run-conStruct",package="conStruct") 62 | 63 | # how to visualize the output of a conStruct model 64 | vignette(topic="visualize-results",package="conStruct") 65 | 66 | # how to compare and select between different conStruct models 67 | vignette(topic="model-comparison",package="conStruct") 68 | ``` 69 | 70 | There is also an example data file included in the package, which you can 71 | load using the command: 72 | 73 | ```r 74 | data(conStruct.data) 75 | ``` 76 | 77 | ## Contact 78 | 79 | After referring to the manual and vignettes, 80 | please direct all queries to bradburd (at) umich.edu, 81 | or post as issues on the git repo. -------------------------------------------------------------------------------- /configure: -------------------------------------------------------------------------------- 1 | #! /bin/sh 2 | 3 | # Generated by rstantools. Do not edit by hand. 
4 | 5 | "${R_HOME}/bin/Rscript" -e "rstantools::rstan_config()" 6 | -------------------------------------------------------------------------------- /configure.win: -------------------------------------------------------------------------------- 1 | #! /bin/sh 2 | 3 | # Generated by rstantools. Do not edit by hand. 4 | 5 | "${R_HOME}/bin${R_ARCH_BIN}/Rscript.exe" -e "rstantools::rstan_config()" 6 | -------------------------------------------------------------------------------- /cran-comments.md: -------------------------------------------------------------------------------- 1 | 2 | ## Test environments 3 | * local OS X install R 4.2.3 4 | * win-builder 5 | 6 | ## R CMD check results 7 | There were no ERRORs or WARNINGs 8 | 9 | There were 2 NOTES: 10 | 11 | * checking for GNU extensions in Makefiles ... NOTE 12 | GNU make is a SystemRequirements. 13 | 14 | Explanation: GNU make is required for packages that 15 | depend on rstan and which are developed using rstantools. 16 | The requirement is noted in the DESCRIPTION file. 17 | 18 | * checking installed package size ... NOTE 19 | installed size is 5.5Mb 20 | sub-directories of 1Mb or more: 21 | libs 4.6Mb 22 | 23 | Explanation: This package has a large installed library 24 | size because it uses the Stan MCMC library as a backend, 25 | and the C++ Stan models included in the package are 26 | compiled upon installation. 27 | 28 | 29 | ## Downstream dependencies 30 | 31 | * There are currently no downstream dependencies for this package.
--------------------------------------------------------------------------------
/data/conStruct.data.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gbradburd/conStruct/a41049e475be68fb267996045a47e968c737af92/data/conStruct.data.rda

--------------------------------------------------------------------------------
/data/data.block.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gbradburd/conStruct/a41049e475be68fb267996045a47e968c737af92/data/data.block.rda

--------------------------------------------------------------------------------
/inst/include/stan_meta_header.hpp:
--------------------------------------------------------------------------------
// Insert all #include statements here

--------------------------------------------------------------------------------
/inst/stan/multiK.stan:
--------------------------------------------------------------------------------
functions {
    // Admixed parametric covariance for the nonspatial multi-layer model:
    // sum over layers of (w_k w_k') * phi[k], plus gamma added to every
    // entry and the nuggets added on the diagonal.
    matrix admixed_covariance(int N, int K, matrix w_mat, vector nugget, vector phi, real gamma) {
        matrix[N,N] parCov;
        matrix[N,N] Nug_mat;
        parCov = rep_matrix(0,N,N);
        Nug_mat = diag_matrix(nugget);
        for(k in 1:K){
            parCov += tcrossprod(to_matrix(w_mat[,k])) * phi[k];
        }
        parCov += gamma + Nug_mat;
        return parCov;
    }
    // Stack the N per-sample admixture simplexes into an N x K matrix.
    matrix make_w_matrix(int N, int K, array[] vector w){
        matrix[N,K] w_mat;
        for(i in 1:N){
            w_mat[i] = to_row_vector(w[i]);
        }
        return w_mat;
    }
}
data {
    int K;                       // number of layers
    int N;                       // number of samples
    int L;                       // number of loci
    matrix[N,N] obsCov;          // observed projected covariance
    real varMeanFreqs;
}
transformed data {
    matrix[N,N] LobsCov;         // n.loci multiplied by the sample covariance
    vector[K] dirConPar;
    LobsCov = L * obsCov;
    dirConPar = rep_vector(0.1,K);
}
parameters {
    positive_ordered[K] phi;     // shared drift effect in layer k
    // NOTE(review): gamma is left unconstrained here; confirm whether it
    // should be declared with lower=0
    real gamma;                  // covariance between all layers
    // a variance, so it is declared non-negative; with the normal(0,1)
    // statement below this gives a half-normal prior
    vector<lower=0>[N] nugget;   // sample-specific variance (allele sampling error + sample-specific drift)
    array[N] simplex[K] w;       // every sample (N in total) has a K simplex (i.e. K layers)
}
transformed parameters {
    matrix[N,N] parCov;          // this specifies the parametric, admixed covariance matrix
    matrix[N,K] w_mat;
    w_mat = make_w_matrix(N,K,w);
    parCov = admixed_covariance(N, K, w_mat, nugget, phi, gamma);
}
model {
    nugget ~ normal(0,1);                       // prior on nugget
    phi ~ normal(0,1);
    gamma ~ normal(varMeanFreqs,0.5);
    for(i in 1:N) w[i] ~ dirichlet(dirConPar);  // prior on admixture proportions
    LobsCov ~ wishart(L,parCov);                // likelihood function
}

--------------------------------------------------------------------------------
/inst/stan/oneK.stan:
--------------------------------------------------------------------------------
functions {
    // Parametric covariance for the nonspatial single-layer model:
    // gamma in every entry plus the nuggets on the diagonal.
    matrix Cov(int N, vector nugget, real gamma) {
        matrix[N,N] parCov;
        matrix[N,N] Nug_mat;
        parCov = rep_matrix(gamma,N,N);
        Nug_mat = diag_matrix(nugget);
        parCov += Nug_mat;
        return parCov;
    }
}
data {
    int K;                       // number of layers
    int N;                       // number of samples
    int L;                       // number of loci
    matrix[N,N] obsCov;          // observed projected covariance
    real varMeanFreqs;           // variance in mean frequencies
}
transformed data {
    matrix[N,N] LobsCov;         // n.loci multiplied by the sample covariance
    LobsCov = L * obsCov;
}
parameters {
    // NOTE(review): gamma is left unconstrained here; confirm whether it
    // should be declared with lower=0
    real gamma;                  // covariance between all layers
    // a variance, so it is declared non-negative
    vector<lower=0>[N] nugget;   // sample-specific variance (allele sampling error + sample-specific drift)
}
transformed parameters {
    matrix[N,N] parCov;          // this specifies the parametric, admixed covariance matrix
    parCov = Cov(N, nugget, gamma);
}
model {
    nugget ~ normal(0,1);               // prior on nugget
    gamma ~ normal(varMeanFreqs,0.5);   // prior on gamma
    LobsCov ~ wishart(L,parCov);        // likelihood function
}

--------------------------------------------------------------------------------
/inst/stan/space_multiK.stan:
--------------------------------------------------------------------------------
functions {
    // Within-layer spatial covariance:
    // a0 * exp(-(aD * D[i,j])^a2) + phi, filled symmetrically.
    matrix spCov(int N, real a0, real aD, real a2, matrix D, real phi){
        matrix[N,N] cov;
        for(i in 1:N){
            for(j in i:N){
                cov[i,j] = a0 * exp( -(aD* D[i,j])^a2) + phi;
                cov[j,i] = cov[i,j];
            }
        }
        return cov;
    }
    // Admixed parametric covariance for the spatial multi-layer model:
    // sum over layers of (w_k w_k') .* spCov_k, plus gamma added to every
    // entry and the nuggets added on the diagonal.
    matrix admixed_covariance(int N, int K, vector alpha0, vector alphaD, vector alpha2, matrix geoDist, matrix w_mat, vector nugget, vector phi, real gamma) {
        matrix[N,N] parCov;
        matrix[N,N] Nug_mat;
        parCov = rep_matrix(0,N,N);
        Nug_mat = diag_matrix(nugget);
        for(k in 1:K){
            parCov += tcrossprod(to_matrix(w_mat[,k])) .* spCov(N,alpha0[k],alphaD[k],alpha2[k],geoDist,phi[k]);
        }
        parCov += gamma + Nug_mat;
        return parCov;
    }
    // Stack the N per-sample admixture simplexes into an N x K matrix.
    matrix make_w_matrix(int N, int K, array[] vector w){
        matrix[N,K] w_mat;
        for(i in 1:N){
            w_mat[i] = to_row_vector(w[i]);
        }
        return w_mat;
    }
}
data {
    int K;                       // number of layers
    int N;                       // number of samples
    int L;                       // number of loci
    matrix[N,N] obsCov;          // observed projected covariance
    matrix[N, N] geoDist;        // matrix of pairwise geographic distance
    real varMeanFreqs;
}
transformed data {
    matrix[N,N] LobsCov;         // n.loci multiplied by the sample covariance
    vector[K] dirConPar;
    LobsCov = L * obsCov;
    dirConPar = rep_vector(0.1,K);
}
parameters {
    // declared bounds must match the support implied by the priors and by
    // how each parameter is used in spCov()
    vector<lower=0>[K] alpha0;            // sill of the parametric covariance in layer k
    vector<lower=0>[K] alphaD;            // effect of geographic distance in the parametric covariance in layer k (base of a real power, so it must be non-negative)
    vector<lower=0, upper=2>[K] alpha2;   // exponential slope parameter in the parametric covariance in layer k (matches the uniform(0,2) prior)
    positive_ordered[K] phi;              // shared drift effect in layer k
    vector<lower=0>[N] nugget;            // sample-specific variance (allele sampling error + sample-specific drift)
    array[N] simplex[K] w;                // every sample (N in total) has a K simplex (i.e. K layers)
    // NOTE(review): gamma is left unconstrained here; confirm whether it
    // should be declared with lower=0
    real gamma;
}
transformed parameters {
    matrix[N,N] parCov;          // this specifies the parametric, admixed covariance matrix
    matrix[N,K] w_mat;
    w_mat = make_w_matrix(N,K,w);
    parCov = admixed_covariance(N, K, alpha0, alphaD, alpha2, geoDist, w_mat, nugget, phi, gamma);
}
model {
    alpha0 ~ normal(0,1);                       // prior on alpha0
    alphaD ~ normal(0,1);                       // prior on alphaD
    alpha2 ~ uniform(0,2);                      // prior on alpha2
    nugget ~ normal(0,1);                       // prior on nugget
    phi ~ normal(0,1);
    gamma ~ normal(varMeanFreqs,0.5);
    for(i in 1:N) w[i] ~ dirichlet(dirConPar);  // prior on admixture proportions
    LobsCov ~ wishart(L,parCov);                // likelihood function
}

--------------------------------------------------------------------------------
/inst/stan/space_oneK.stan:
--------------------------------------------------------------------------------
functions {
    // Parametric covariance for the spatial single-layer model:
    // a0 * exp(-(aD * D[i,j])^a2), plus gamma added to every entry and
    // the nuggets added on the diagonal.
    matrix spCov(int N, real a0, real aD, real a2, matrix D, vector nugget, real gamma) {
        matrix[N,N] parCov;
        matrix[N,N] Nug_mat;
        parCov = rep_matrix(0,N,N);
        Nug_mat = diag_matrix(nugget);
        for(i in 1:N){
            for(j in i:N){
                parCov[i,j] = a0 * exp( -(aD * D[i,j])^a2);
                parCov[j,i] = parCov[i,j];
            }
        }
        parCov += gamma + Nug_mat;
        return parCov;
    }
}
data {
    int K;                       // number of layers
    int N;                       // number of samples
    int L;                       // number of loci
    matrix[N,N] obsCov;          // observed projected covariance
    matrix[N, N] geoDist;        // matrix of pairwise geographic distance
    real varMeanFreqs;
}
transformed data {
    matrix[N,N] LobsCov;         // n.loci multiplied by the sample covariance
    LobsCov = L * obsCov;
}
parameters {
    // declared bounds must match the support implied by the priors and by
    // how each parameter is used in spCov()
    real<lower=0> alpha0;            // sill of the parametric covariance in layer k
    real<lower=0> alphaD;            // effect of geographic distance in the parametric covariance in layer k (base of a real power, so it must be non-negative)
    real<lower=0, upper=2> alpha2;   // exponential slope parameter in the parametric covariance in layer k (matches the uniform(0,2) prior)
    // NOTE(review): gamma is left unconstrained here; confirm whether it
    // should be declared with lower=0
    real gamma;                      // covariance between all layers
    vector<lower=0>[N] nugget;       // sample-specific variance (allele sampling error + sample-specific drift)
}
transformed parameters {
    matrix[N,N] parCov;          // this specifies the parametric, admixed covariance matrix
    parCov = spCov(N, alpha0, alphaD, alpha2, geoDist, nugget, gamma);
}
model {
    alpha0 ~ normal(0,1);               // prior on alpha0
    alphaD ~ normal(0,1);               // prior on alphaD
    alpha2 ~ uniform(0,2);              // prior on alpha2
    nugget ~ normal(0,1);               // prior on nugget
    gamma ~ normal(varMeanFreqs,0.5);   // prior on global covariance
    LobsCov ~ wishart(L,parCov);        // likelihood function
}

--------------------------------------------------------------------------------
/man/calculate.layer.contribution.Rd:
--------------------------------------------------------------------------------
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/model.comparison.R
\name{calculate.layer.contribution}
\alias{calculate.layer.contribution}
\title{Calculate layer contribution}
\usage{
calculate.layer.contribution(conStruct.results, data.block, layer.order = NULL)
}
\arguments{
\item{conStruct.results}{The list output by a
\code{conStruct} run for a given MCMC chain.}

\item{data.block}{A \code{data.block} list saved during a
\code{conStruct} run.}

\item{layer.order}{An optional \code{vector} giving the
order in which the layers of \code{conStruct.results} are
read.}
}
\value{
This function returns a \code{vector} giving the
relative contributions of the layers
in the analysis.
24 | } 25 | \description{ 26 | \code{calculate.layer.contribution} 27 | } 28 | \details{ 29 | This function takes the results of a \code{conStruct} 30 | analysis and calculates the relative contributions of 31 | each layer to total covariance. 32 | 33 | This function calculates the contribution of each layer to 34 | total covariance by multiplying the within-layer covariance 35 | in a given layer by the admixture proportions samples draw 36 | from that layer. The relative contribution of that layer 37 | is this absolute contribution divided by the sum of those of 38 | all other layers. 39 | A layer can have a large contribution if many samples draw 40 | large amounts of admixture from it, or if it has a very large 41 | within-layer covariance parameter (phi), or some combination 42 | of the two. Layer contribution can be useful for evaluating 43 | an appropriate level of model complexity for the data (e.g., 44 | choosing a value of \code{K} or comparing the spatial and 45 | nonspatial models). 
46 | } 47 | -------------------------------------------------------------------------------- /man/compare.two.runs.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/plot.output.R 3 | \name{compare.two.runs} 4 | \alias{compare.two.runs} 5 | \title{Compare two conStruct runs} 6 | \usage{ 7 | compare.two.runs( 8 | conStruct.results1, 9 | data.block1, 10 | conStruct.results2, 11 | data.block2, 12 | prefix, 13 | layer.colors = NULL 14 | ) 15 | } 16 | \arguments{ 17 | \item{conStruct.results1}{The list output by a 18 | \code{conStruct} run.} 19 | 20 | \item{data.block1}{A \code{data.block} list saved during a 21 | \code{conStruct} run.} 22 | 23 | \item{conStruct.results2}{The list output by a second 24 | \code{conStruct} run.} 25 | 26 | \item{data.block2}{A \code{data.block} list saved during a 27 | second \code{conStruct} run.} 28 | 29 | \item{prefix}{A character vector to be prepended to all figures.} 30 | 31 | \item{layer.colors}{A \code{vector} of colors to be used in 32 | plotting results for different layers. Users must 33 | specify one color per layer. If \code{NULL}, plots 34 | will use a pre-specified vector of colors.} 35 | } 36 | \value{ 37 | This function has only invisible return values. 38 | } 39 | \description{ 40 | \code{compare.two.runs} makes figures comparing the output 41 | from two conStruct analyses. 42 | } 43 | \details{ 44 | This function takes the outputs from two conStruct analyses and 45 | generates a number of plots for comparing results and 46 | diagnosing MCMC performance. 47 | 48 | This function produces a variety of plots that can be 49 | useful for comparing results from two \code{conStruct} analyses. 50 | The runs must have the same number of independent MCMC chains, 51 | but may have different values of \code{K}. The spatial and 52 | nonspatial models can be compared. 
If the runs were executed 53 | with different values of \code{K}, the run with the smaller 54 | value of \code{K} should be specified in the first set of 55 | arguments (\code{conStruct.results1} and \code{data.block1}). 56 | 57 | The plots made are by no means exhaustive, and users are 58 | encouraged to make further plots, or customize these plots as they 59 | see fit. For each plot, one file is generated for each MCMC chain 60 | in each analysis (specified with the \code{n.chains} argument in 61 | the function \code{conStruct}). For clarity, the layers in the second 62 | analysis are matched to those in the first using the function 63 | \code{match.layers.x.runs}. The plots generated (as .pdf files) are: 64 | \itemize{ 65 | \item Structure plot - STRUCTURE-style plot, where each sample 66 | is represented as a stacked bar plot, and the length of the 67 | bar plot segments of each color represents that sample's 68 | admixture proportion in that layer. Described further 69 | in the help page for \code{make.structure.plot}. 70 | \item Admixture pie plot - A map of samples in which each sample's 71 | location is denoted with a pie chart, and the proportion 72 | of a pie chart of each color represents that sample's 73 | admixture in each layer. Described further in the help 74 | page for \code{make.admix.pie.plot}. 75 | \item model.fit.CIs - A plot of the sample allelic covariance 76 | shown with the 95\% credible interval of the parametric 77 | covariance for each entry in the matrix. 78 | \item layer.covariances - A plot of the layer-specific 79 | covariances overlain onto the sample allelic covariance. 80 | \item Trace plots - Plots of parameter values over the MCMC. 81 | \itemize{ 82 | \item lpd - A plot of the log posterior probability over the MCMC. 83 | \item nuggets - A plot of estimates of the nugget parameters 84 | over the MCMC. 85 | \item gamma - A plot of estimates of the gamma parameter 86 | over the MCMC. 
87 | \item layer.cov.params - Plots of estimates of the 88 | layer-specific parameters over the MCMC. 89 | \item admix.props - A plot of estimates of the admixture proportions 90 | over the MCMC. 91 | } 92 | } 93 | } 94 | -------------------------------------------------------------------------------- /man/conStruct-manual.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gbradburd/conStruct/a41049e475be68fb267996045a47e968c737af92/man/conStruct-manual.pdf -------------------------------------------------------------------------------- /man/conStruct-package.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/conStruct-package.R 3 | \docType{package} 4 | \name{conStruct-package} 5 | \alias{conStruct-package} 6 | \title{The 'conStruct' package.} 7 | \description{ 8 | A method for modeling genetic data as a combination of discrete 9 | layers, within each of which relatedness may decay continuously with geographic 10 | distance. This package contains code for running analyses (which are implemented 11 | in the modeling language 'rstan') and visualizing and interpreting output. See the 12 | associated paper for more details on the model and its utility. 13 | } 14 | \references{ 15 | G.S. Bradburd, G.M. Coop, and P.L. Ralph (2018) <doi:10.1534/genetics.118.301333>. 16 | 17 | Stan Development Team (2018). RStan: the R interface to Stan. R package version 2.17.3. 
http://mc-stan.org 18 | } 19 | -------------------------------------------------------------------------------- /man/conStruct.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/run.conStruct.R 3 | \name{conStruct} 4 | \alias{conStruct} 5 | \title{Run a conStruct analysis.} 6 | \usage{ 7 | conStruct( 8 | spatial = TRUE, 9 | K, 10 | freqs, 11 | geoDist = NULL, 12 | coords, 13 | prefix = "", 14 | n.chains = 1, 15 | n.iter = 1000, 16 | make.figs = TRUE, 17 | save.files = TRUE, 18 | ... 19 | ) 20 | } 21 | \arguments{ 22 | \item{spatial}{A logical indicating whether to perform a spatial analysis. 23 | Default is \code{TRUE}.} 24 | 25 | \item{K}{An \code{integer} that indicates the number of layers to be 26 | included in the analysis.} 27 | 28 | \item{freqs}{A \code{matrix} of allele frequencies with one column per 29 | locus and one row per sample. 30 | Missing data should be indicated with \code{NA}.} 31 | 32 | \item{geoDist}{A full \code{matrix} of geographic distance between samples. 33 | If \code{NULL}, user can only run the nonspatial model.} 34 | 35 | \item{coords}{A \code{matrix} giving the longitude and latitude 36 | (or X and Y coordinates) of the samples.} 37 | 38 | \item{prefix}{A character \code{vector} giving the prefix to be attached 39 | to all output files.} 40 | 41 | \item{n.chains}{An integer indicating the number of MCMC chains to be run 42 | in the analysis. Default is 1.} 43 | 44 | \item{n.iter}{An \code{integer} giving the number of iterations each MCMC 45 | chain is run. Default is 1e3. If the number of iterations 46 | is greater than 500, the MCMC is thinned so that the number 47 | of retained iterations is 500 (before burn-in).} 48 | 49 | \item{make.figs}{A \code{logical} value indicating whether to automatically 50 | make figures once the analysis is complete. 
Default is 51 | \code{TRUE}.} 52 | 53 | \item{save.files}{A \code{logical} value indicating whether to automatically 54 | save output and intermediate files once the analysis is 55 | complete. Default is \code{TRUE}.} 56 | 57 | \item{...}{Further options to be passed to rstan::sampling (e.g., adapt_delta).} 58 | } 59 | \value{ 60 | This function returns a list with one entry for each chain run 61 | (specified with \code{n.chains}). The entry for each chain is named 62 | "chain_X" for the Xth chain. The components of the entries for each 63 | are detailed below: 64 | \itemize{ 65 | \item \code{posterior} gives parameter estimates over the posterior 66 | distribution of the MCMC. 67 | \itemize{ 68 | \item \code{n.iter} number of MCMC iterations retained for 69 | analysis (half of the \code{n.iter} argument 70 | specified in the function call). 71 | \item \code{lpd} vector of log posterior density over the retained 72 | MCMC iterations. 73 | \item \code{nuggets} matrix of estimated nugget parameters with 74 | one row per MCMC iteration and one column per sample. 75 | \item \code{par.cov} array of estimated parametric covariance matrices, 76 | for which the first dimension is the number of MCMC iterations. 77 | \item \code{gamma} vector of estimated gamma parameter. 78 | \item \code{layer.params} list summarizing estimates of layer-specific 79 | parameters. There is one entry for each layer specified, and the 80 | entry for the kth layer is named "Layer_k". 81 | \itemize{ 82 | \item \code{alpha0} vector of estimated alpha0 parameter in the 83 | kth layer. 84 | \item \code{alphaD} vector of estimated alphaD parameter in the 85 | kth layer. 86 | \item \code{alpha2} vector of estimated alpha2 parameter in the 87 | kth layer. 88 | \item \code{mu} vector of estimated mu parameter in the 89 | kth layer. 90 | \item \code{layer.cov} vector of estimated layer-specific 91 | covariance parameter in the kth layer. 
92 | } 93 | \item \code{admix.proportions} array of estimated admixture proportions. 94 | The first dimension is the number of MCMC iterations, 95 | the second is the number of samples, 96 | and the third is the number of layers. 97 | } 98 | \item \code{MAP} gives point estimates of the parameters listed in the \code{posterior} 99 | list described above. Values are indexed at the MCMC iteration 100 | with the greatest posterior probability. 101 | \itemize{ 102 | \item \code{index.iter} the iteration of the MCMC with the highest 103 | posterior probability, which is used to index all parameters 104 | included in the \code{MAP} list 105 | \item \code{lpd} the greatest value of the posterior probability 106 | \item \code{nuggets} point estimate of nugget parameters 107 | \item \code{par.cov} point estimate of parametric covariance 108 | \item \code{gamma} point estimate of gamma parameter 109 | \item \code{layer.params} point estimates of all layer-specific parameters 110 | \item \code{admix.proportions} point estimates of admixture proportions. 111 | } 112 | } 113 | } 114 | \description{ 115 | \code{conStruct} runs a conStruct analysis of genetic data. 116 | } 117 | \details{ 118 | This function initiates an analysis that uses 119 | geographic and genetic relationships between samples 120 | to estimate sample membership (admixture proportions) across 121 | a user-specified number of layers. 122 | 123 | This function acts as a wrapper around a STAN model block determined 124 | by the user-specified model (e.g., a spatial model with 3 layers, 125 | or a nonspatial model with 5 layers). 126 | User-specified data are checked for appropriate format and consistent dimensions, 127 | then formatted into a \code{data.block}, 128 | which is then passed to the STAN model block. 129 | Along with the \code{conStruct.results} output described above, 130 | several objects are saved during the course of a \code{conStruct} call 131 | (if \code{save.files=TRUE}). 
132 | These are the \code{data.block}, which contains all data passed to the STAN model block, 133 | \code{model.fit}, which is unprocessed results of the STAN run in \code{stanfit} format, 134 | and the \code{conStruct.results}, which are saved in the course of the function call 135 | in addition to being returned. 136 | If \code{make.figs=TRUE}, running \code{conStruct} will also generate many output figures, 137 | which are detailed in the function \code{make.all.the.plots} in this package. 138 | } 139 | \examples{ 140 | # load example dataset 141 | data(conStruct.data) 142 | 143 | # run example spatial analysis with K=1 144 | # 145 | # for this example, make.figs and save.files 146 | # are set to FALSE, but most users will want them 147 | # set to TRUE 148 | my.run <- conStruct(spatial = TRUE, 149 | K = 1, 150 | freqs = conStruct.data$allele.frequencies, 151 | geoDist = conStruct.data$geoDist, 152 | coords = conStruct.data$coords, 153 | prefix = "test", 154 | n.chains = 1, 155 | n.iter = 1e3, 156 | make.figs = FALSE, 157 | save.files = FALSE) 158 | 159 | } 160 | -------------------------------------------------------------------------------- /man/conStruct.data.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{conStruct.data} 5 | \alias{conStruct.data} 6 | \title{Example dataset used in a \code{conStruct} analysis} 7 | \format{ 8 | A list with two elements: 9 | \describe{ 10 | \item{allele.frequencies}{a matrix with one row for each of 11 | the 16 samples and one column for each of 10,000 loci, 12 | giving the frequency of the counted allele at each locus 13 | in each sample} 14 | \item{coords}{a matrix with one row for each of the 16 samples, 15 | in the same order as that of the allele frequency matrix, 16 | and two columns, the first giving the x-coordinate 17 | (or longitude), the second giving the 
y-coordinate (or latitude)} 18 | } 19 | } 20 | \usage{ 21 | conStruct.data 22 | } 23 | \description{ 24 | A simulated dataset containing the allele frequency 25 | and sampling coordinate data necessary to run a 26 | \code{conStruct} analysis. 27 | } 28 | \keyword{datasets} 29 | -------------------------------------------------------------------------------- /man/data.block.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{data.block} 5 | \alias{data.block} 6 | \title{Example \code{data.block} generated by a \code{conStruct} analysis} 7 | \format{ 8 | A list with 9 elements: 9 | \describe{ 10 | \item{\code{N}}{the number of samples included in the analysis} 11 | \item{\code{K}}{the number of clusters/layers included in the model} 12 | \item{\code{spatial}}{a boolean indicating whether the spatial 13 | model has been specified} 14 | \item{\code{L}}{the number of loci included in the analysis} 15 | \item{\code{coords}}{a matrix with one row for each of the \code{N} samples, 16 | in the same order as that of the \code{obsCov} matrix, 17 | and two columns, the first giving the x-coordinate 18 | (or longitude), the second giving the y-coordinate (or latitude)} 19 | \item{\code{obsCov}}{the sample allelic covariance matrix, 20 | in the same order as that of the \code{coords} matrix, 21 | with \code{N} rows and columns} 22 | \item{\code{geoDist}}{a matrix of pairwise geographic distance between samples, 23 | in the same order as that of the \code{obsCov} matrix, 24 | with \code{N} rows and columns} 25 | \item{\code{sd.geoDist}}{the standard deviation of the raw geographic 26 | distance matrix, used for normalizing \code{geoDist} within the 27 | stan model} 28 | \item{\code{varMeanFreqs}}{the variance of the mean allele frequencies, 29 | averaged over choice of counted allele (passed to the model 30 | as a prior on the global 
covariance parameter)} 31 | } 32 | } 33 | \usage{ 34 | data.block 35 | } 36 | \description{ 37 | An example \code{data.block} object generated in a \code{conStruct} 38 | analysis from the raw data supplied by the user. This object is 39 | automatically saved and is used in several subsequent plotting functions. 40 | } 41 | \keyword{datasets} 42 | -------------------------------------------------------------------------------- /man/make.admix.pie.plot.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/plot.output.R 3 | \name{make.admix.pie.plot} 4 | \alias{make.admix.pie.plot} 5 | \title{Make admixture pie plot} 6 | \usage{ 7 | make.admix.pie.plot( 8 | admix.proportions, 9 | coords, 10 | layer.colors = NULL, 11 | radii = 2.7, 12 | add = FALSE, 13 | x.lim = NULL, 14 | y.lim = NULL, 15 | mar = c(2, 2, 2, 2) 16 | ) 17 | } 18 | \arguments{ 19 | \item{admix.proportions}{A \code{matrix} of admixture proportions, 20 | with one row per sample and one column per layer.} 21 | 22 | \item{coords}{\code{matrix} of sample coordinates, with one row 23 | per sample and two columns giving (respectively) the X 24 | and Y plotting coordinates.} 25 | 26 | \item{layer.colors}{A \code{vector} of colors to be used in 27 | plotting results for different layers. Users must 28 | specify one color per layer. If \code{NULL}, the plot 29 | will use a pre-specified vector of colors.} 30 | 31 | \item{radii}{A \code{vector} of numeric values giving the radii to be 32 | used in plotting admixture pie plots. If the number of values 33 | specified is smaller than the number of samples, radii values 34 | will be recycled across samples. The default is 2.7.} 35 | 36 | \item{add}{A \code{logical} value indicating whether to add the pie plots 37 | to an existing plot. Default is \code{FALSE}.} 38 | 39 | \item{x.lim}{A \code{vector} giving the x limits of the plot. 
The default 40 | value is \code{NULL}, which indicates that the range of values 41 | given in the first column of \code{coords} should be used.} 42 | 43 | \item{y.lim}{A \code{vector} giving the y limits of the plot. The default 44 | value is \code{NULL}, which indicates that the range of values 45 | given in the second column of \code{coords} should be used.} 46 | 47 | \item{mar}{A \code{vector} giving the number of lines of margin specified 48 | for the four sides of the plotting window (passed to \code{par}). 49 | Default value, which is only used if \code{add=FALSE}, is 50 | \code{c(2,2,2,2)}.} 51 | } 52 | \value{ 53 | This function has only invisible return values. 54 | } 55 | \description{ 56 | \code{make.admix.pie.plot} makes a map of pie plots showing admixture 57 | proportions across layers. 58 | } 59 | \details{ 60 | This function takes the output from a conStruct analysis and 61 | makes a map of pie plots showing admixture proportions across layers, 62 | where each sample is represented as a pie chart, and the proportion of 63 | the pie of each color represents that sample's 64 | admixture proportion in that layer. 
65 | } 66 | \examples{ 67 | \dontshow{ 68 | admix.props <- matrix(c(0.086,0.000,0.500,0.505,0.099,0.052,0.024,0.007,0.800,0.000,0.216,0.744,0.917,0.199,0.469,0.000,0.783,0.298,0.329,0.446,0.000,0.000,0.637,0.903,0.000,0.000,0.000,0.012,0.021,0.000,0.000,0.089,0.000,0.554,0.002,0.000,0.000,0.095,0.020,0.001,0.001,0.011,0.000,0.200,0.000,0.060,0.053,0.082,0.036,0.013,0.000,0.062,0.169,0.137,0.029,0.001,0.000,0.178,0.079,0.000,0.999,1.000,0.988,0.979,0.975,1.000,0.744,0.984,0.435,0.998,0.914,1.000,0.405,0.475,0.900,0.947,0.965,0.993,0.000,1.000,0.725,0.203,0.000,0.765,0.518,1.000,0.154,0.533,0.534,0.525,0.999,1.000,0.185,0.018,1.000,0.001,0.000,0.000,0.000,0.025,0.000,0.167,0.016,0.012,0.000),ncol=3) 69 | coords <- matrix(c(-126.38,-125.23,-126.97,-128.54,-126.95,-121.71,-126.79,-123.38,-137.88,-125.82,-122.94,-130.73,-123.08,-122.84,-128.58,-124.82,-129.75,-122.25,-122.32,-129.10,-125.28,-123.98,-133.35,-131.74,-124.16,-146.35,-94.63,-149.02,-111.50,-126.67,-133.77,-118.63,-115.78,-113.42,-135.33,52.40,49.84,54.66,54.65,51.69,49.44,52.82,50.05,59.52,51.34,45.81,56.81,44.71,50.24,54.14,51.04,56.68,52.98,54.04,55.34,50.64,50.23,58.76,57.30,50.54,64.90,56.35,63.87,56.92,65.23,68.38,54.75,60.80,50.82,60.70),ncol=2) 70 | } 71 | # make admixture pie plot 72 | make.admix.pie.plot(admix.proportions = admix.props,coords = coords) 73 | 74 | } 75 | -------------------------------------------------------------------------------- /man/make.all.the.plots.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/plot.output.R 3 | \name{make.all.the.plots} 4 | \alias{make.all.the.plots} 5 | \title{Make output plots} 6 | \usage{ 7 | make.all.the.plots(conStruct.results, data.block, prefix, layer.colors = NULL) 8 | } 9 | \arguments{ 10 | \item{conStruct.results}{The list output by a 11 | \code{conStruct} run.} 12 | 13 | \item{data.block}{A \code{data.block} list saved 
during a 14 | \code{conStruct} run.} 15 | 16 | \item{prefix}{A character vector to be prepended to all figures.} 17 | 18 | \item{layer.colors}{A \code{vector} of colors to be used in 19 | plotting results for different layers. Users must 20 | specify one color per layer. If \code{NULL}, plots 21 | will use a pre-specified vector of colors.} 22 | } 23 | \value{ 24 | This function has only invisible return values. 25 | } 26 | \description{ 27 | \code{make.all.the.plots} makes figures from the output from a 28 | conStruct analysis. 29 | } 30 | \details{ 31 | This function takes the output from a conStruct analysis and 32 | generates a number of plots for visualizing results and 33 | diagnosing MCMC performance. 34 | 35 | This function produces a variety of plots that can be 36 | useful for visualizing results or diagnosing MCMC performance. 37 | The plots made are by no means exhaustive, and users are 38 | encouraged to make further plots, or customize these plots as they 39 | see fit. For each plot, one file is generated for each MCMC chain 40 | (specified with the \code{n.chains} argument in the function 41 | \code{conStruct}). The plots generated (as .pdf files) are: 42 | \itemize{ 43 | \item Structure plot - STRUCTURE-style plot, where each sample 44 | is represented as a stacked bar plot, and the length of the 45 | bar plot segments of each color represents that sample's 46 | admixture proportion in that layer. Described further 47 | in the help page for \code{make.structure.plot}. 48 | \item Admixture pie plot - A map of samples in which each sample's 49 | location is denoted with a pie chart, and the proportion 50 | of a pie chart of each color represents that sample's 51 | admixture in each layer. Described further in the help 52 | page for \code{make.admix.pie.plot}. 53 | \item model.fit.CIs - A plot of the sample allelic covariance 54 | shown with the 95\% credible interval of the parametric 55 | covariance for each entry in the matrix. 
56 | \item layer.covariances - A plot of the layer-specific 57 | covariances overlain onto the sample allelic covariance. 58 | \item Trace plots - Plots of parameter values over the MCMC. 59 | \itemize{ 60 | \item lpd - A plot of the log posterior probability over the MCMC. 61 | \item nuggets - A plot of estimates of the nugget parameters 62 | over the MCMC. 63 | \item gamma - A plot of estimates of the gamma parameter 64 | over the MCMC. 65 | \item layer.cov.params - Plots of estimates of the 66 | layer-specific parameters over the MCMC. 67 | \item admix.props - A plot of estimates of the admixture proportions 68 | over the MCMC. 69 | } 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /man/make.structure.plot.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/plot.output.R 3 | \name{make.structure.plot} 4 | \alias{make.structure.plot} 5 | \title{Make STRUCTURE output plot} 6 | \usage{ 7 | make.structure.plot( 8 | admix.proportions, 9 | mar = c(2, 4, 2, 2), 10 | sample.order = NULL, 11 | layer.order = NULL, 12 | sample.names = NULL, 13 | sort.by = NULL, 14 | layer.colors = NULL 15 | ) 16 | } 17 | \arguments{ 18 | \item{admix.proportions}{A \code{matrix} of admixture proportions, 19 | with one row per sample and one column per layer.} 20 | 21 | \item{mar}{A \code{vector} of plotting margins passed to \code{par}. 22 | Default is \code{c(2,4,2,2)}, which tends to look good.} 23 | 24 | \item{sample.order}{A \code{vector} giving the order in which sample 25 | admixture proportions are to be plotted, left to right. If 26 | \code{NULL}, samples are plotted in the order they occur in 27 | \code{admix.proportions}.} 28 | 29 | \item{layer.order}{A \code{vector} giving the order in which layers 30 | are plotted, bottom to top. 
If \code{NULL}, layers are plotted 31 | in the order they occur in \code{admix.proportions}.} 32 | 33 | \item{sample.names}{Vector of names to be plotted under each sample's 34 | admixture proportion bar plot. The index of a sample's name 35 | should be the same as the index of the sample's row in 36 | \code{admix.proportions}. If \code{NULL}, no names 37 | are printed.} 38 | 39 | \item{sort.by}{An \code{integer} giving the column index of the \code{admix.proportions} 40 | matrix to be used in determining sample plotting order. If specified, 41 | samples are plotted from left to right in increasing order of their 42 | membership in that layer. If \code{NULL}, samples are plotted 43 | in the order they occur in \code{admix.proportions}.} 44 | 45 | \item{layer.colors}{A \code{vector} of colors to be used in plotting 46 | results for different layers. Users must specify one 47 | color per layer. If \code{NULL}, the plot will use 48 | a pre-specified vector of colors.} 49 | } 50 | \value{ 51 | This function has only invisible return values. 52 | } 53 | \description{ 54 | \code{make.structure.plot} makes a STRUCTURE-style plot from the output from a 55 | conStruct analysis. 56 | } 57 | \details{ 58 | This function takes the output from a conStruct analysis and 59 | makes a STRUCTURE-style plot, where each sample 60 | is represented as a stacked bar plot, and the length of the 61 | bar plot segments of each color represent that sample's 62 | admixture proportion in that layer. 
63 | } 64 | \examples{ 65 | \dontshow{ 66 | admix.props <- matrix(c(0.086,0.000,0.500,0.505,0.099,0.052,0.024,0.007,0.800,0.000,0.216,0.744,0.917,0.199,0.469,0.000,0.783,0.298,0.329,0.446,0.000,0.000,0.637,0.903,0.000,0.000,0.000,0.012,0.021,0.000,0.000,0.089,0.000,0.554,0.002,0.000,0.000,0.095,0.020,0.001,0.001,0.011,0.000,0.200,0.000,0.060,0.053,0.082,0.036,0.013,0.000,0.062,0.169,0.137,0.029,0.001,0.000,0.178,0.079,0.000,0.999,1.000,0.988,0.979,0.975,1.000,0.744,0.984,0.435,0.998,0.914,1.000,0.405,0.475,0.900,0.947,0.965,0.993,0.000,1.000,0.725,0.203,0.000,0.765,0.518,1.000,0.154,0.533,0.534,0.525,0.999,1.000,0.185,0.018,1.000,0.001,0.000,0.000,0.000,0.025,0.000,0.167,0.016,0.012,0.000),ncol=3) 67 | } 68 | # make STRUCTURE-style plot 69 | make.structure.plot(admix.proportions = admix.props) 70 | 71 | # make STRUCTURE-style plot, sorted by membership in layer 1 72 | make.structure.plot(admix.proportions = admix.props,sort.by=1) 73 | 74 | } 75 | -------------------------------------------------------------------------------- /man/match.layers.x.runs.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/model.comparison.R 3 | \name{match.layers.x.runs} 4 | \alias{match.layers.x.runs} 5 | \title{Match layers up across independent conStruct runs} 6 | \usage{ 7 | match.layers.x.runs(admix.mat1, admix.mat2, admix.mat1.order = NULL) 8 | } 9 | \arguments{ 10 | \item{admix.mat1}{A \code{matrix} of estimated admixture proportions 11 | from the original \code{conStruct} analysis, with one row 12 | per sample and one column per layer.} 13 | 14 | \item{admix.mat2}{A \code{matrix} of estimated admixture proportions 15 | from a second \code{conStruct} analysis, with one row per 16 | sample and one column per layer, for which the 17 | layer order is desired. 
Must have a number 18 | of layers equal to or greater than that of \code{admix.mat1}.} 19 | 20 | \item{admix.mat1.order}{An optional \code{vector} giving the 21 | order in which the layers of \code{admix.mat1} are read.} 22 | } 23 | \value{ 24 | This function returns a \code{vector} giving the ordering 25 | of the layers in \code{admix.mat2} that maximizes 26 | similarity between \code{admix.mat1} and re-ordered 27 | \code{admix.mat2}. 28 | } 29 | \description{ 30 | \code{match.layers.x.runs} matches layers up across independent \code{conStruct} runs. 31 | } 32 | \details{ 33 | This function takes the results of two independent 34 | \code{conStruct} analyses and compares them to identify 35 | which layers in a new analysis correspond most closely 36 | to the layers from an original analysis. 37 | 38 | This function compares admixture proportions in layers across 39 | independent \code{conStruct} runs to 40 | identify the layers in \code{admix.mat2} that correspond most 41 | closely to those in \code{admix.mat1}. It then returns a vector 42 | giving an ordering of \code{admix.mat2} that matches up the order 43 | of the layers that correspond to each other. This can be useful 44 | for: 45 | \enumerate{ 46 | \item Dealing with "label switching" across independent runs 47 | with the same number of layers; 48 | \item Plotting results from independent runs with different 49 | numbers of layers using consistent colors 50 | (e.g., the "blue" layer shows up as blue even as 51 | \code{K} increases); 52 | \item Examining results for multimodality (i.e., multiple 53 | distinct solutions with qualitatively different patterns 54 | of membership across layers). 55 | } 56 | The \code{admix.mat1.order} argument can be useful when running 57 | this function to sync up plotting colors/order across the output 58 | of more than two \code{conStruct} runs. 
59 | } 60 | \examples{ 61 | \dontshow{ 62 | admix.props1 <- matrix(c(0.09,0.00,0.50,0.51,0.10,0.05,0.02,0.01,0.80,0.00,0.22,0.74,0.92,0.20,0.47,0.00,0.78,0.30,0.33,0.45,0.00,0.00,0.64,0.90,0.00,0.00,0.00,0.01,0.02,0.00,0.00,0.09,0.00,0.55,0.00,0.00,0.00,0.09,0.02,0.00,0.00,0.01,0.00,0.20,0.00,0.06,0.05,0.08,0.04,0.01,0.00,0.06,0.17,0.14,0.03,0.00,0.00,0.18,0.08,0.00,1.00,1.00,0.99,0.98,0.98,1.00,0.74,0.98,0.43,1.00,0.91,1.00,0.41,0.47,0.90,0.95,0.96,0.99,0.00,1.00,0.72,0.20,0.00,0.77,0.52,1.00,0.15,0.53,0.53,0.53,1.00,1.00,0.18,0.02,1.00,0.00,0.00,0.00,0.00,0.02,0.00,0.17,0.02,0.01,0.00),ncol=3) 63 | admix.props2 <- matrix(c(0.36,0.35,0.42,0.38,0.35,0.35,0.36,0.35,0.48,0.36,0.39,0.39,0.40,0.36,0.36,0.35,0.40,0.46,0.45,0.38,0.34,0.35,0.47,0.40,0.35,1.00,1.00,0.99,0.99,0.98,1.00,0.84,0.99,0.63,1.00,0.32,0.35,0.24,0.24,0.33,0.34,0.33,0.35,0.15,0.32,0.32,0.10,0.30,0.33,0.27,0.36,0.13,0.26,0.27,0.22,0.36,0.35,0.14,0.11,0.35,0.00,0.00,0.00,0.01,0.01,0.00,0.07,0.00,0.18,0.00,0.32,0.30,0.34,0.38,0.31,0.30,0.31,0.30,0.36,0.32,0.30,0.51,0.30,0.31,0.37,0.30,0.47,0.29,0.28,0.40,0.30,0.31,0.39,0.49,0.30,0.00,0.00,0.00,0.00,0.01,0.00,0.09,0.01,0.19,0.00),ncol=3) 64 | } 65 | # compare the estimated admixture proportions from 66 | # two different conStruct runs to determine which 67 | # layers in one run correspond to those in the other 68 | match.layers.x.runs(admix.props1,admix.props2) 69 | 70 | } 71 | -------------------------------------------------------------------------------- /man/print.conStruct.results.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/process.model.fit.R 3 | \name{print.conStruct.results} 4 | \alias{print.conStruct.results} 5 | \title{An S3 print method for class conStruct.results} 6 | \usage{ 7 | \method{print}{conStruct.results}(x, ...) 
8 | } 9 | \arguments{ 10 | \item{x}{an object of class \code{conStruct.results}} 11 | 12 | \item{...}{further options to be passed to \code{print}} 13 | } 14 | \value{ 15 | prints a top-level summary of the conStruct.results, returns nothing 16 | } 17 | \description{ 18 | An S3 print method for class conStruct.results 19 | } 20 | -------------------------------------------------------------------------------- /man/print.data.block.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/run.conStruct.R 3 | \name{print.data.block} 4 | \alias{print.data.block} 5 | \title{An S3 print method for class data.block} 6 | \usage{ 7 | \method{print}{data.block}(x, ...) 8 | } 9 | \arguments{ 10 | \item{x}{an object of class \code{data.block}} 11 | 12 | \item{...}{further options to be passed to \code{print}} 13 | } 14 | \value{ 15 | prints a top-level summary of the data.block, returns nothing 16 | } 17 | \description{ 18 | An S3 print method for class data.block 19 | } 20 | -------------------------------------------------------------------------------- /man/print.freq.data.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/run.conStruct.R 3 | \name{print.freq.data} 4 | \alias{print.freq.data} 5 | \title{An S3 print method for class freq.data} 6 | \usage{ 7 | \method{print}{freq.data}(x, ...) 
8 | } 9 | \arguments{ 10 | \item{x}{an object of class \code{freq.data}} 11 | 12 | \item{...}{further options to be passed to \code{print}} 13 | } 14 | \value{ 15 | prints a top-level summary of the freq.data, returns nothing 16 | } 17 | \description{ 18 | An S3 print method for class freq.data 19 | } 20 | -------------------------------------------------------------------------------- /man/print.layer.params.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/process.model.fit.R 3 | \name{print.layer.params} 4 | \alias{print.layer.params} 5 | \title{An S3 print method for class layer.params} 6 | \usage{ 7 | \method{print}{layer.params}(x, ...) 8 | } 9 | \arguments{ 10 | \item{x}{an object of class \code{layer.params}} 11 | 12 | \item{...}{further options to be passed to \code{print}} 13 | } 14 | \value{ 15 | prints a top-level summary of the layer.params, returns nothing 16 | } 17 | \description{ 18 | An S3 print method for class layer.params 19 | } 20 | -------------------------------------------------------------------------------- /man/structure2conStruct.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/format.data.R 3 | \name{structure2conStruct} 4 | \alias{structure2conStruct} 5 | \title{Convert a dataset from STRUCTURE to conStruct format} 6 | \usage{ 7 | structure2conStruct( 8 | infile, 9 | onerowperind, 10 | start.loci, 11 | start.samples = 1, 12 | missing.datum, 13 | outfile 14 | ) 15 | } 16 | \arguments{ 17 | \item{infile}{The name and path of the file in STRUCTURE format 18 | to be converted to \code{conStruct} format.} 19 | 20 | \item{onerowperind}{Indicates whether the file format has 21 | one row per individual (\code{TRUE}) or two rows per 22 | individual (\code{FALSE}).} 23 | 24 | \item{start.loci}{The index of 
the first column in the dataset 25 | that contains genotype data.} 26 | 27 | \item{start.samples}{The index of the first row in the dataset 28 | that contains genotype data (e.g., after any headers). 29 | Default value is 1.} 30 | 31 | \item{missing.datum}{The character or value used to denote 32 | missing data in the STRUCTURE dataset (often 0 or -9).} 33 | 34 | \item{outfile}{The name and path of the file containing the 35 | \code{conStruct} formatted dataset to be generated 36 | by this function.} 37 | } 38 | \value{ 39 | This function returns an allele frequency data matrix 40 | that can be used as the \code{freqs} argument in a conStruct 41 | analysis run using \code{\link{conStruct}}. It also saves 42 | this object as an .RData file so that it can be used in 43 | future analyses. 44 | } 45 | \description{ 46 | \code{structure2conStruct} converts a STRUCTURE dataset 47 | to conStruct format 48 | } 49 | \details{ 50 | This function takes a population genetics dataset in 51 | STRUCTURE format and converts it to conStruct format. 52 | The STRUCTURE file can have one row per individual 53 | and two columns per locus, or one column per locus and two rows 54 | per individual. It can only contain bi-allelic SNPs. 55 | Missing data is acceptable, but must be indicated with 56 | a single value throughout the dataset. 57 | 58 | This function takes a STRUCTURE format data file and 59 | converts it to a \code{conStruct} format data file. 60 | This function can only be applied to diploid organisms. 61 | The STRUCTURE data file must be a plain text file. 62 | If there is extraneous text or column headers before the data 63 | starts, those extra lines should be deleted by hand or 64 | taken into account via the \code{start.samples} argument. 65 | 66 | The STRUCTURE dataset can either be in the ONEROWPERIND=1 67 | file format, with one row per individual and two columns 68 | per locus, or the ONEROWPERIND=0 format, with two rows per 69 | individual and one column per locus.
The first column of the STRUCTURE 70 | dataset should be individual names. There may be any number 71 | of other columns that contain non-genotype information before 72 | the first column that contains genotype data, but there can 73 | be no extraneous columns at the end of the dataset, after the 74 | genotype data. 75 | 76 | The genotype data must be bi-allelic 77 | single nucleotide polymorphisms (SNPs). Applying this function 78 | to datasets with more than two alleles per locus may result in 79 | cryptic failure. For more details, see the \code{format-data} 80 | vignette. 81 | } 82 | -------------------------------------------------------------------------------- /man/x.validation.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/model.comparison.R 3 | \name{x.validation} 4 | \alias{x.validation} 5 | \title{Run a conStruct cross-validation analysis} 6 | \usage{ 7 | x.validation( 8 | train.prop = 0.9, 9 | n.reps, 10 | K, 11 | freqs = NULL, 12 | data.partitions = NULL, 13 | geoDist, 14 | coords, 15 | prefix, 16 | n.iter, 17 | make.figs = FALSE, 18 | save.files = FALSE, 19 | parallel = FALSE, 20 | n.nodes = NULL, 21 | ... 22 | ) 23 | } 24 | \arguments{ 25 | \item{train.prop}{A numeric value between 0 and 1 that gives 26 | the proportion of the data to be used in the 27 | training partition of the analysis. Default is 0.9.} 28 | 29 | \item{n.reps}{An \code{integer} giving the number of 30 | cross-validation replicates to be run.} 31 | 32 | \item{K}{A numeric \code{vector} giving the numbers of layers 33 | to be tested in each cross-validation replicate. 34 | E.g., \code{K=1:7}.} 35 | 36 | \item{freqs}{A \code{matrix} of allele frequencies with one column per 37 | locus and one row per sample.
38 | Missing data should be indicated with \code{NA}.} 39 | 40 | \item{data.partitions}{A list with one element for each desired 41 | cross-validation replicate. This argument can be specified 42 | instead of the \code{freqs} argument if the user wants to 43 | provide their own data partitions for model training and testing. 44 | See the model comparison vignette for details on what this 45 | should look like.} 46 | 47 | \item{geoDist}{A \code{matrix} of geographic distance between samples. 48 | If \code{NULL}, user can only run the nonspatial model.} 49 | 50 | \item{coords}{A \code{matrix} giving the longitude and latitude 51 | (or X and Y coordinates) of the samples.} 52 | 53 | \item{prefix}{A character \code{vector} giving the prefix to be attached 54 | to all output files.} 55 | 56 | \item{n.iter}{An \code{integer} giving the number of iterations each MCMC 57 | chain is run. Default is 1e3. If the number of iterations 58 | is greater than 500, the MCMC is thinned so that the number 59 | of retained iterations is 500 (before burn-in).} 60 | 61 | \item{make.figs}{A \code{logical} value indicating whether to automatically 62 | make figures during the course of the cross-validation analysis. 63 | Default is \code{FALSE}.} 64 | 65 | \item{save.files}{A \code{logical} value indicating whether to automatically 66 | save output and intermediate files once the analysis is 67 | complete. Default is \code{FALSE}.} 68 | 69 | \item{parallel}{A \code{logical} value indicating whether or not to run the 70 | different cross-validation replicates in parallel. Default is \code{FALSE}. 71 | For more details on how to set up runs in parallel, see the model 72 | comparison vignette.} 73 | 74 | \item{n.nodes}{Number of nodes to run parallel analyses on. Default is 75 | \code{NULL}. Ignored if \code{parallel} is \code{FALSE}. 
For more details 76 | on how to set up runs in parallel, see the model comparison vignette.} 77 | 78 | \item{...}{Further options to be passed to rstan::sampling (e.g., adapt_delta).} 79 | } 80 | \value{ 81 | This function returns (and also saves as a .Robj) a \code{list} 82 | containing the standardized results of the cross-validation analysis 83 | across replicates. For each replicate, the function returns 84 | a list with the following elements: 85 | \itemize{ 86 | \item \code{sp} - the mean of the standardized log likelihoods of the 87 | "testing" data partition of that replicate for the spatial model for 88 | each value of K specified in \code{K}. 89 | \item \code{nsp} - the mean of the standardized log likelihoods of the 90 | "testing" data partition of that replicate for the nonspatial model for 91 | each value of K specified in \code{K}. 92 | } 93 | In addition, this function saves two text files containing the standardized 94 | cross-validation results for the spatial and nonspatial models 95 | (prefix_sp_xval_results.txt and prefix_nsp_xval_results.txt, respectively). 96 | These values are written as matrices for user convenience; each column is 97 | a cross-validation replicate, and each row gives the result for a value of 98 | \code{K}. 99 | } 100 | \description{ 101 | \code{x.validation} runs a conStruct cross-validation analysis 102 | } 103 | \details{ 104 | This function initiates a cross-validation analysis that 105 | uses Monte Carlo cross-validation to determine the statistical 106 | support for models with different numbers of layers or 107 | with and without a spatial component. 108 | } 109 | -------------------------------------------------------------------------------- /src/Makevars: -------------------------------------------------------------------------------- 1 | # Generated by rstantools. Do not edit by hand.
2 | 3 | STANHEADERS_SRC = $(shell "$(R_HOME)/bin$(R_ARCH_BIN)/Rscript" -e "message()" -e "cat(system.file('include', 'src', package = 'StanHeaders', mustWork = TRUE))" -e "message()" | grep "StanHeaders") 4 | 5 | STANC_FLAGS = $(shell "$(R_HOME)/bin$(R_ARCH_BIN)/Rscript" -e "cat(ifelse(utils::packageVersion('rstan') >= 2.26, '-DUSE_STANC3',''))") 6 | PKG_CPPFLAGS = -I"../inst/include" -I"$(STANHEADERS_SRC)" -DBOOST_DISABLE_ASSERTS -DEIGEN_NO_DEBUG -DBOOST_MATH_OVERFLOW_ERROR_POLICY=errno_on_error $(STANC_FLAGS) 7 | PKG_CXXFLAGS = $(shell "$(R_HOME)/bin$(R_ARCH_BIN)/Rscript" -e "RcppParallel::CxxFlags()") $(shell "$(R_HOME)/bin$(R_ARCH_BIN)/Rscript" -e "StanHeaders:::CxxFlags()") 8 | PKG_LIBS = $(shell "$(R_HOME)/bin$(R_ARCH_BIN)/Rscript" -e "RcppParallel::RcppParallelLibs()") $(shell "$(R_HOME)/bin$(R_ARCH_BIN)/Rscript" -e "StanHeaders:::LdFlags()") 9 | 10 | CXX_STD = CXX14 11 | -------------------------------------------------------------------------------- /src/Makevars.win: -------------------------------------------------------------------------------- 1 | # Generated by rstantools. Do not edit by hand. 
2 | 3 | STANHEADERS_SRC = $(shell "$(R_HOME)/bin$(R_ARCH_BIN)/Rscript" -e "message()" -e "cat(system.file('include', 'src', package = 'StanHeaders', mustWork = TRUE))" -e "message()" | grep "StanHeaders") 4 | 5 | STANC_FLAGS = $(shell "$(R_HOME)/bin$(R_ARCH_BIN)/Rscript" -e "cat(ifelse(utils::packageVersion('rstan') >= 2.26, '-DUSE_STANC3',''))") 6 | PKG_CPPFLAGS = -I"../inst/include" -I"$(STANHEADERS_SRC)" -DBOOST_DISABLE_ASSERTS -DEIGEN_NO_DEBUG -DRCPP_PARALLEL_USE_TBB=1 $(STANC_FLAGS) 7 | PKG_CXXFLAGS = $(shell "$(R_HOME)/bin$(R_ARCH_BIN)/Rscript" -e "RcppParallel::CxxFlags()") $(shell "$(R_HOME)/bin$(R_ARCH_BIN)/Rscript" -e "StanHeaders:::CxxFlags()") 8 | PKG_LIBS = $(shell "$(R_HOME)/bin$(R_ARCH_BIN)/Rscript" -e "RcppParallel::RcppParallelLibs()") $(shell "$(R_HOME)/bin$(R_ARCH_BIN)/Rscript" -e "StanHeaders:::LdFlags()") 9 | 10 | CXX_STD = CXX14 11 | -------------------------------------------------------------------------------- /src/RcppExports.cpp: -------------------------------------------------------------------------------- 1 | // Generated by using Rcpp::compileAttributes() -> do not edit by hand 2 | // Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 3 | 4 | #include 5 | #include 6 | 7 | using namespace Rcpp; 8 | 9 | #ifdef RCPP_USE_GLOBAL_ROSTREAM 10 | Rcpp::Rostream& Rcpp::Rcout = Rcpp::Rcpp_cout_get(); 11 | Rcpp::Rostream& Rcpp::Rcerr = Rcpp::Rcpp_cerr_get(); 12 | #endif 13 | 14 | 15 | RcppExport SEXP _rcpp_module_boot_stan_fit4multiK_mod(); 16 | RcppExport SEXP _rcpp_module_boot_stan_fit4oneK_mod(); 17 | RcppExport SEXP _rcpp_module_boot_stan_fit4space_multiK_mod(); 18 | RcppExport SEXP _rcpp_module_boot_stan_fit4space_oneK_mod(); 19 | 20 | static const R_CallMethodDef CallEntries[] = { 21 | {"_rcpp_module_boot_stan_fit4multiK_mod", (DL_FUNC) &_rcpp_module_boot_stan_fit4multiK_mod, 0}, 22 | {"_rcpp_module_boot_stan_fit4oneK_mod", (DL_FUNC) &_rcpp_module_boot_stan_fit4oneK_mod, 0}, 23 | {"_rcpp_module_boot_stan_fit4space_multiK_mod", (DL_FUNC) 
&_rcpp_module_boot_stan_fit4space_multiK_mod, 0}, 24 | {"_rcpp_module_boot_stan_fit4space_oneK_mod", (DL_FUNC) &_rcpp_module_boot_stan_fit4space_oneK_mod, 0}, 25 | {NULL, NULL, 0} 26 | }; 27 | /* Registers the .Call routines listed in CallEntries for this DLL and turns off dynamic symbol lookup. */ 28 | RcppExport void R_init_conStruct(DllInfo *dll) { 29 | R_registerRoutines(dll, NULL, CallEntries, NULL, NULL); 30 | R_useDynamicSymbols(dll, FALSE); 31 | } 32 | -------------------------------------------------------------------------------- /src/stanExports_multiK.cc: -------------------------------------------------------------------------------- 1 | // Generated by rstantools. Do not edit by hand. 2 | 3 | #include 4 | using namespace Rcpp ; 5 | #include "stanExports_multiK.h" 6 | /* Rcpp module stan_fit4multiK_mod: registers a class named "rstantools_model_multiK" and exposes the rstan::stan_fit member functions listed below as its methods. */ 7 | RCPP_MODULE(stan_fit4multiK_mod) { 8 | 9 | 10 | class_ >("rstantools_model_multiK") 11 | 12 | .constructor() 13 | 14 | 15 | .method("call_sampler", &rstan::stan_fit ::call_sampler) 16 | .method("param_names", &rstan::stan_fit ::param_names) 17 | .method("param_names_oi", &rstan::stan_fit ::param_names_oi) 18 | .method("param_fnames_oi", &rstan::stan_fit ::param_fnames_oi) 19 | .method("param_dims", &rstan::stan_fit ::param_dims) 20 | .method("param_dims_oi", &rstan::stan_fit ::param_dims_oi) 21 | .method("update_param_oi", &rstan::stan_fit ::update_param_oi) 22 | .method("param_oi_tidx", &rstan::stan_fit ::param_oi_tidx) 23 | .method("grad_log_prob", &rstan::stan_fit ::grad_log_prob) 24 | .method("log_prob", &rstan::stan_fit ::log_prob) 25 | .method("unconstrain_pars", &rstan::stan_fit ::unconstrain_pars) 26 | .method("constrain_pars", &rstan::stan_fit ::constrain_pars) 27 | .method("num_pars_unconstrained", &rstan::stan_fit ::num_pars_unconstrained) 28 | .method("unconstrained_param_names", &rstan::stan_fit ::unconstrained_param_names) 29 | .method("constrained_param_names", &rstan::stan_fit ::constrained_param_names) 30 | .method("standalone_gqs", &rstan::stan_fit ::standalone_gqs) 31 | ; 32 | } 33 | --------------------------------------------------------------------------------
/src/stanExports_oneK.cc: -------------------------------------------------------------------------------- 1 | // Generated by rstantools. Do not edit by hand. 2 | 3 | #include 4 | using namespace Rcpp ; 5 | #include "stanExports_oneK.h" 6 | /* Rcpp module stan_fit4oneK_mod: registers a class named "rstantools_model_oneK" and exposes the rstan::stan_fit member functions listed below as its methods. */ 7 | RCPP_MODULE(stan_fit4oneK_mod) { 8 | 9 | 10 | class_ >("rstantools_model_oneK") 11 | 12 | .constructor() 13 | 14 | 15 | .method("call_sampler", &rstan::stan_fit ::call_sampler) 16 | .method("param_names", &rstan::stan_fit ::param_names) 17 | .method("param_names_oi", &rstan::stan_fit ::param_names_oi) 18 | .method("param_fnames_oi", &rstan::stan_fit ::param_fnames_oi) 19 | .method("param_dims", &rstan::stan_fit ::param_dims) 20 | .method("param_dims_oi", &rstan::stan_fit ::param_dims_oi) 21 | .method("update_param_oi", &rstan::stan_fit ::update_param_oi) 22 | .method("param_oi_tidx", &rstan::stan_fit ::param_oi_tidx) 23 | .method("grad_log_prob", &rstan::stan_fit ::grad_log_prob) 24 | .method("log_prob", &rstan::stan_fit ::log_prob) 25 | .method("unconstrain_pars", &rstan::stan_fit ::unconstrain_pars) 26 | .method("constrain_pars", &rstan::stan_fit ::constrain_pars) 27 | .method("num_pars_unconstrained", &rstan::stan_fit ::num_pars_unconstrained) 28 | .method("unconstrained_param_names", &rstan::stan_fit ::unconstrained_param_names) 29 | .method("constrained_param_names", &rstan::stan_fit ::constrained_param_names) 30 | .method("standalone_gqs", &rstan::stan_fit ::standalone_gqs) 31 | ; 32 | } 33 | -------------------------------------------------------------------------------- /src/stanExports_oneK.h: -------------------------------------------------------------------------------- 1 | // Generated by rstantools. Do not edit by hand.
2 | 3 | #ifndef MODELS_HPP 4 | #define MODELS_HPP 5 | #define STAN__SERVICES__COMMAND_HPP 6 | #include 7 | // Code generated by Stan version 2.21.0 8 | #include 9 | namespace model_oneK_namespace { 10 | using std::istream; 11 | using std::string; 12 | using std::stringstream; 13 | using std::vector; 14 | using stan::io::dump; 15 | using stan::math::lgamma; 16 | using stan::model::prob_grad; 17 | using namespace stan::math; 18 | static int current_statement_begin__; 19 | stan::io::program_reader prog_reader__() { 20 | stan::io::program_reader reader; 21 | reader.add_event(0, 0, "start", "model_oneK"); 22 | reader.add_event(36, 34, "end", "model_oneK"); 23 | return reader; 24 | } 25 | /* Cov: returns the N x N matrix rep_matrix(gamma, N, N) + diag_matrix(nugget), i.e. gamma in every entry plus nugget on the diagonal. */ template 26 | Eigen::Matrix::type, Eigen::Dynamic, Eigen::Dynamic> 27 | Cov(const int& N, 28 | const Eigen::Matrix& nugget, 29 | const T2__& gamma, std::ostream* pstream__) { 30 | typedef typename boost::math::tools::promote_args::type local_scalar_t__; 31 | typedef local_scalar_t__ fun_return_scalar_t__; 32 | const static bool propto__ = true; 33 | (void) propto__; 34 | local_scalar_t__ DUMMY_VAR__(std::numeric_limits::quiet_NaN()); 35 | (void) DUMMY_VAR__; // suppress unused var warning 36 | int current_statement_begin__ = -1; 37 | try { 38 | { 39 | current_statement_begin__ = 3; 40 | validate_non_negative_index("parCov", "N", N); 41 | validate_non_negative_index("parCov", "N", N); 42 | Eigen::Matrix parCov(N, N); 43 | stan::math::initialize(parCov, DUMMY_VAR__); 44 | stan::math::fill(parCov, DUMMY_VAR__); 45 | current_statement_begin__ = 4; 46 | validate_non_negative_index("Nug_mat", "N", N); 47 | validate_non_negative_index("Nug_mat", "N", N); 48 | Eigen::Matrix Nug_mat(N, N); 49 | stan::math::initialize(Nug_mat, DUMMY_VAR__); 50 | stan::math::fill(Nug_mat, DUMMY_VAR__); 51 | current_statement_begin__ = 5; 52 | stan::math::assign(parCov, rep_matrix(gamma, N, N)); 53 | current_statement_begin__ = 6; 54 | stan::math::assign(Nug_mat, diag_matrix(nugget)); 55 | current_statement_begin__
= 7; 56 | stan::math::assign(parCov, add(parCov, Nug_mat)); 57 | current_statement_begin__ = 8; 58 | return stan::math::promote_scalar(parCov); 59 | } 60 | } catch (const std::exception& e) { 61 | stan::lang::rethrow_located(e, current_statement_begin__, prog_reader__()); 62 | // Next line prevents compiler griping about no return 63 | throw std::runtime_error("*** IF YOU SEE THIS, PLEASE REPORT A BUG ***"); 64 | } 65 | } 66 | /* Functor wrapper whose operator() forwards its arguments to Cov(). */ struct Cov_functor__ { 67 | template 68 | Eigen::Matrix::type, Eigen::Dynamic, Eigen::Dynamic> 69 | operator()(const int& N, 70 | const Eigen::Matrix& nugget, 71 | const T2__& gamma, std::ostream* pstream__) const { 72 | return Cov(N, nugget, gamma, pstream__); 73 | } 74 | }; 75 | #include 76 | /* Stan model class for model_oneK; holds the data (K, N, L, obsCov, varMeanFreqs) and the transformed data LobsCov. */ class model_oneK 77 | : public stan::model::model_base_crtp { 78 | private: 79 | int K; 80 | int N; 81 | int L; 82 | matrix_d obsCov; 83 | double varMeanFreqs; 84 | matrix_d LobsCov; 85 | public: 86 | model_oneK(stan::io::var_context& context__, 87 | std::ostream* pstream__ = 0) 88 | : model_base_crtp(0) { 89 | ctor_body(context__, 0, pstream__); 90 | } 91 | model_oneK(stan::io::var_context& context__, 92 | unsigned int random_seed__, 93 | std::ostream* pstream__ = 0) 94 | : model_base_crtp(0) { 95 | ctor_body(context__, random_seed__, pstream__); 96 | } 97 | /* Shared constructor body: reads and checks the data (K >= 1, N >= 2, L >= N + 1, N x N obsCov, varMeanFreqs), sets LobsCov = L * obsCov, and counts 1 + N real parameters. */ void ctor_body(stan::io::var_context& context__, 98 | unsigned int random_seed__, 99 | std::ostream* pstream__) { 100 | typedef double local_scalar_t__; 101 | boost::ecuyer1988 base_rng__ = 102 | stan::services::util::create_rng(random_seed__, 0); 103 | (void) base_rng__; // suppress unused var warning 104 | current_statement_begin__ = -1; 105 | static const char* function__ = "model_oneK_namespace::model_oneK"; 106 | (void) function__; // dummy to suppress unused var warning 107 | size_t pos__; 108 | (void) pos__; // dummy to suppress unused var warning 109 | std::vector vals_i__; 110 | std::vector vals_r__; 111 | local_scalar_t__ DUMMY_VAR__(std::numeric_limits::quiet_NaN()); 112 | (void)
DUMMY_VAR__; // suppress unused var warning 113 | try { 114 | // initialize data block variables from context__ 115 | current_statement_begin__ = 12; 116 | context__.validate_dims("data initialization", "K", "int", context__.to_vec()); 117 | K = int(0); 118 | vals_i__ = context__.vals_i("K"); 119 | pos__ = 0; 120 | K = vals_i__[pos__++]; 121 | check_greater_or_equal(function__, "K", K, 1); 122 | current_statement_begin__ = 13; 123 | context__.validate_dims("data initialization", "N", "int", context__.to_vec()); 124 | N = int(0); 125 | vals_i__ = context__.vals_i("N"); 126 | pos__ = 0; 127 | N = vals_i__[pos__++]; 128 | check_greater_or_equal(function__, "N", N, 2); 129 | current_statement_begin__ = 14; 130 | context__.validate_dims("data initialization", "L", "int", context__.to_vec()); 131 | L = int(0); 132 | vals_i__ = context__.vals_i("L"); 133 | pos__ = 0; 134 | L = vals_i__[pos__++]; 135 | check_greater_or_equal(function__, "L", L, (N + 1)); 136 | current_statement_begin__ = 15; 137 | validate_non_negative_index("obsCov", "N", N); 138 | validate_non_negative_index("obsCov", "N", N); 139 | context__.validate_dims("data initialization", "obsCov", "matrix_d", context__.to_vec(N,N)); 140 | obsCov = Eigen::Matrix(N, N); 141 | vals_r__ = context__.vals_r("obsCov"); 142 | pos__ = 0; 143 | size_t obsCov_j_2_max__ = N; 144 | size_t obsCov_j_1_max__ = N; 145 | /* vals_r__ is consumed one column at a time: the column index j_2__ is the outer loop. */ for (size_t j_2__ = 0; j_2__ < obsCov_j_2_max__; ++j_2__) { 146 | for (size_t j_1__ = 0; j_1__ < obsCov_j_1_max__; ++j_1__) { 147 | obsCov(j_1__, j_2__) = vals_r__[pos__++]; 148 | } 149 | } 150 | current_statement_begin__ = 16; 151 | context__.validate_dims("data initialization", "varMeanFreqs", "double", context__.to_vec()); 152 | varMeanFreqs = double(0); 153 | vals_r__ = context__.vals_r("varMeanFreqs"); 154 | pos__ = 0; 155 | varMeanFreqs = vals_r__[pos__++]; 156 | // initialize transformed data variables 157 | current_statement_begin__ = 19; 158 | validate_non_negative_index("LobsCov", "N", N); 159 |
validate_non_negative_index("LobsCov", "N", N); 160 | LobsCov = Eigen::Matrix(N, N); 161 | stan::math::fill(LobsCov, DUMMY_VAR__); 162 | // execute transformed data statements 163 | current_statement_begin__ = 20; 164 | stan::math::assign(LobsCov, multiply(L, obsCov)); 165 | // validate transformed data 166 | // validate, set parameter ranges 167 | num_params_r__ = 0U; 168 | param_ranges_i__.clear(); 169 | current_statement_begin__ = 23; 170 | num_params_r__ += 1; 171 | current_statement_begin__ = 24; 172 | validate_non_negative_index("nugget", "N", N); 173 | num_params_r__ += N; 174 | } catch (const std::exception& e) { 175 | stan::lang::rethrow_located(e, current_statement_begin__, prog_reader__()); 176 | // Next line prevents compiler griping about no return 177 | throw std::runtime_error("*** IF YOU SEE THIS, PLEASE REPORT A BUG ***"); 178 | } 179 | } 180 | ~model_oneK() { } 181 | /* Reads initial values of gamma and nugget from context__ and writes them to params_r__ / params_i__ after the lower-bound-0 unconstraining transform. */ void transform_inits(const stan::io::var_context& context__, 182 | std::vector& params_i__, 183 | std::vector& params_r__, 184 | std::ostream* pstream__) const { 185 | typedef double local_scalar_t__; 186 | stan::io::writer writer__(params_r__, params_i__); 187 | size_t pos__; 188 | (void) pos__; // dummy call to suppress warning 189 | std::vector vals_r__; 190 | std::vector vals_i__; 191 | current_statement_begin__ = 23; 192 | if (!(context__.contains_r("gamma"))) 193 | stan::lang::rethrow_located(std::runtime_error(std::string("Variable gamma missing")), current_statement_begin__, prog_reader__()); 194 | vals_r__ = context__.vals_r("gamma"); 195 | pos__ = 0U; 196 | context__.validate_dims("parameter initialization", "gamma", "double", context__.to_vec()); 197 | double gamma(0); 198 | gamma = vals_r__[pos__++]; 199 | try { 200 | writer__.scalar_lb_unconstrain(0, gamma); 201 | } catch (const std::exception& e) { 202 | stan::lang::rethrow_located(std::runtime_error(std::string("Error transforming variable gamma: ") + e.what()), current_statement_begin__, prog_reader__()); 203 | } 204 |
current_statement_begin__ = 24; 205 | if (!(context__.contains_r("nugget"))) 206 | stan::lang::rethrow_located(std::runtime_error(std::string("Variable nugget missing")), current_statement_begin__, prog_reader__()); 207 | vals_r__ = context__.vals_r("nugget"); 208 | pos__ = 0U; 209 | validate_non_negative_index("nugget", "N", N); 210 | context__.validate_dims("parameter initialization", "nugget", "vector_d", context__.to_vec(N)); 211 | Eigen::Matrix nugget(N); 212 | size_t nugget_j_1_max__ = N; 213 | for (size_t j_1__ = 0; j_1__ < nugget_j_1_max__; ++j_1__) { 214 | nugget(j_1__) = vals_r__[pos__++]; 215 | } 216 | try { 217 | writer__.vector_lb_unconstrain(0, nugget); 218 | } catch (const std::exception& e) { 219 | stan::lang::rethrow_located(std::runtime_error(std::string("Error transforming variable nugget: ") + e.what()), current_statement_begin__, prog_reader__()); 220 | } 221 | params_r__ = writer__.data_r(); 222 | params_i__ = writer__.data_i(); 223 | } 224 | /* Eigen overload: runs the std::vector version above and copies its result into params_r. */ void transform_inits(const stan::io::var_context& context, 225 | Eigen::Matrix& params_r, 226 | std::ostream* pstream__) const { 227 | std::vector params_r_vec; 228 | std::vector params_i_vec; 229 | transform_inits(context, params_i_vec, params_r_vec, pstream__); 230 | params_r.resize(params_r_vec.size()); 231 | for (int i = 0; i < params_r.size(); ++i) 232 | params_r(i) = params_r_vec[i]; 233 | } 234 | /* Log density: reads gamma and nugget with lower bound 0 (lp__ is passed to the constrain calls only when jacobian__ is true), builds parCov with Cov(), then accumulates normal_log(nugget, 0, 1), normal_log(gamma, varMeanFreqs, 0.5) and wishart_log(LobsCov, L, parCov). */ template 235 | T__ log_prob(std::vector& params_r__, 236 | std::vector& params_i__, 237 | std::ostream* pstream__ = 0) const { 238 | typedef T__ local_scalar_t__; 239 | local_scalar_t__ DUMMY_VAR__(std::numeric_limits::quiet_NaN()); 240 | (void) DUMMY_VAR__; // dummy to suppress unused var warning 241 | T__ lp__(0.0); 242 | stan::math::accumulator lp_accum__; 243 | try { 244 | stan::io::reader in__(params_r__, params_i__); 245 | // model parameters 246 | current_statement_begin__ = 23; 247 | local_scalar_t__ gamma; 248 | (void) gamma; // dummy to suppress unused var warning 249 | if (jacobian__) 250 |
gamma = in__.scalar_lb_constrain(0, lp__); 251 | else 252 | gamma = in__.scalar_lb_constrain(0); 253 | current_statement_begin__ = 24; 254 | Eigen::Matrix nugget; 255 | (void) nugget; // dummy to suppress unused var warning 256 | if (jacobian__) 257 | nugget = in__.vector_lb_constrain(0, N, lp__); 258 | else 259 | nugget = in__.vector_lb_constrain(0, N); 260 | // transformed parameters 261 | current_statement_begin__ = 27; 262 | validate_non_negative_index("parCov", "N", N); 263 | validate_non_negative_index("parCov", "N", N); 264 | Eigen::Matrix parCov(N, N); 265 | stan::math::initialize(parCov, DUMMY_VAR__); 266 | stan::math::fill(parCov, DUMMY_VAR__); 267 | // transformed parameters block statements 268 | current_statement_begin__ = 28; 269 | stan::math::assign(parCov, Cov(N, nugget, gamma, pstream__)); 270 | // validate transformed parameters 271 | const char* function__ = "validate transformed params"; 272 | (void) function__; // dummy to suppress unused var warning 273 | current_statement_begin__ = 27; 274 | size_t parCov_j_1_max__ = N; 275 | size_t parCov_j_2_max__ = N; 276 | for (size_t j_1__ = 0; j_1__ < parCov_j_1_max__; ++j_1__) { 277 | for (size_t j_2__ = 0; j_2__ < parCov_j_2_max__; ++j_2__) { 278 | if (stan::math::is_uninitialized(parCov(j_1__, j_2__))) { 279 | std::stringstream msg__; 280 | msg__ << "Undefined transformed parameter: parCov" << "(" << j_1__ << ", " << j_2__ << ")"; 281 | stan::lang::rethrow_located(std::runtime_error(std::string("Error initializing variable parCov: ") + msg__.str()), current_statement_begin__, prog_reader__()); 282 | } 283 | } 284 | } 285 | // model body 286 | current_statement_begin__ = 31; 287 | lp_accum__.add(normal_log(nugget, 0, 1)); 288 | current_statement_begin__ = 32; 289 | lp_accum__.add(normal_log(gamma, varMeanFreqs, 0.5)); 290 | current_statement_begin__ = 33; 291 | lp_accum__.add(wishart_log(LobsCov, L, parCov)); 292 | } catch (const std::exception& e) { 293 | stan::lang::rethrow_located(e,
current_statement_begin__, prog_reader__()); 294 | // Next line prevents compiler griping about no return 295 | throw std::runtime_error("*** IF YOU SEE THIS, PLEASE REPORT A BUG ***"); 296 | } 297 | lp_accum__.add(lp__); 298 | return lp_accum__.sum(); 299 | } // log_prob() 300 | template 301 | T_ log_prob(Eigen::Matrix& params_r, 302 | std::ostream* pstream = 0) const { 303 | std::vector vec_params_r; 304 | vec_params_r.reserve(params_r.size()); 305 | for (int i = 0; i < params_r.size(); ++i) 306 | vec_params_r.push_back(params_r(i)); 307 | std::vector vec_params_i; 308 | return log_prob(vec_params_r, vec_params_i, pstream); 309 | } 310 | /* Names in output order: gamma, nugget, parCov. */ void get_param_names(std::vector& names__) const { 311 | names__.resize(0); 312 | names__.push_back("gamma"); 313 | names__.push_back("nugget"); 314 | names__.push_back("parCov"); 315 | } 316 | /* Dimensions in the same order as get_param_names: gamma is a scalar, nugget is [N], parCov is [N, N]. */ void get_dims(std::vector >& dimss__) const { 317 | dimss__.resize(0); 318 | std::vector dims__; 319 | dims__.resize(0); 320 | dimss__.push_back(dims__); 321 | dims__.resize(0); 322 | dims__.push_back(N); 323 | dimss__.push_back(dims__); 324 | dims__.resize(0); 325 | dims__.push_back(N); 326 | dims__.push_back(N); 327 | dimss__.push_back(dims__); 328 | } 329 | /* Writes constrained gamma and nugget to vars__, followed (when include_tparams__ is true) by parCov with the column index in the outer loop. */ template 330 | void write_array(RNG& base_rng__, 331 | std::vector& params_r__, 332 | std::vector& params_i__, 333 | std::vector& vars__, 334 | bool include_tparams__ = true, 335 | bool include_gqs__ = true, 336 | std::ostream* pstream__ = 0) const { 337 | typedef double local_scalar_t__; 338 | vars__.resize(0); 339 | stan::io::reader in__(params_r__, params_i__); 340 | static const char* function__ = "model_oneK_namespace::write_array"; 341 | (void) function__; // dummy to suppress unused var warning 342 | // read-transform, write parameters 343 | double gamma = in__.scalar_lb_constrain(0); 344 | vars__.push_back(gamma); 345 | Eigen::Matrix nugget = in__.vector_lb_constrain(0, N); 346 | size_t nugget_j_1_max__ = N; 347 | for (size_t j_1__ = 0; j_1__ < nugget_j_1_max__; ++j_1__) {
348 | vars__.push_back(nugget(j_1__)); 349 | } 350 | double lp__ = 0.0; 351 | (void) lp__; // dummy to suppress unused var warning 352 | stan::math::accumulator lp_accum__; 353 | local_scalar_t__ DUMMY_VAR__(std::numeric_limits::quiet_NaN()); 354 | (void) DUMMY_VAR__; // suppress unused var warning 355 | if (!include_tparams__ && !include_gqs__) return; 356 | try { 357 | // declare and define transformed parameters 358 | current_statement_begin__ = 27; 359 | validate_non_negative_index("parCov", "N", N); 360 | validate_non_negative_index("parCov", "N", N); 361 | Eigen::Matrix parCov(N, N); 362 | stan::math::initialize(parCov, DUMMY_VAR__); 363 | stan::math::fill(parCov, DUMMY_VAR__); 364 | // do transformed parameters statements 365 | current_statement_begin__ = 28; 366 | stan::math::assign(parCov, Cov(N, nugget, gamma, pstream__)); 367 | if (!include_gqs__ && !include_tparams__) return; 368 | // validate transformed parameters 369 | const char* function__ = "validate transformed params"; 370 | (void) function__; // dummy to suppress unused var warning 371 | // write transformed parameters 372 | if (include_tparams__) { 373 | size_t parCov_j_2_max__ = N; 374 | size_t parCov_j_1_max__ = N; 375 | for (size_t j_2__ = 0; j_2__ < parCov_j_2_max__; ++j_2__) { 376 | for (size_t j_1__ = 0; j_1__ < parCov_j_1_max__; ++j_1__) { 377 | vars__.push_back(parCov(j_1__, j_2__)); 378 | } 379 | } 380 | } 381 | if (!include_gqs__) return; 382 | } catch (const std::exception& e) { 383 | stan::lang::rethrow_located(e, current_statement_begin__, prog_reader__()); 384 | // Next line prevents compiler griping about no return 385 | throw std::runtime_error("*** IF YOU SEE THIS, PLEASE REPORT A BUG ***"); 386 | } 387 | } 388 | template 389 | void write_array(RNG& base_rng, 390 | Eigen::Matrix& params_r, 391 | Eigen::Matrix& vars, 392 | bool include_tparams = true, 393 | bool include_gqs = true, 394 | std::ostream* pstream = 0) const { 395 | std::vector params_r_vec(params_r.size()); 396 |
for (int i = 0; i < params_r.size(); ++i) 397 | params_r_vec[i] = params_r(i); 398 | std::vector vars_vec; 399 | std::vector params_i_vec; 400 | write_array(base_rng, params_r_vec, params_i_vec, vars_vec, include_tparams, include_gqs, pstream); 401 | vars.resize(vars_vec.size()); 402 | for (int i = 0; i < vars.size(); ++i) 403 | vars(i) = vars_vec[i]; 404 | } 405 | std::string model_name() const { 406 | return "model_oneK"; 407 | } 408 | /* Appends flat names: gamma, nugget.i, and (when include_tparams__ is true) parCov.i.j, using 1-based indices. */ void constrained_param_names(std::vector& param_names__, 409 | bool include_tparams__ = true, 410 | bool include_gqs__ = true) const { 411 | std::stringstream param_name_stream__; 412 | param_name_stream__.str(std::string()); 413 | param_name_stream__ << "gamma"; 414 | param_names__.push_back(param_name_stream__.str()); 415 | size_t nugget_j_1_max__ = N; 416 | for (size_t j_1__ = 0; j_1__ < nugget_j_1_max__; ++j_1__) { 417 | param_name_stream__.str(std::string()); 418 | param_name_stream__ << "nugget" << '.' << j_1__ + 1; 419 | param_names__.push_back(param_name_stream__.str()); 420 | } 421 | if (!include_gqs__ && !include_tparams__) return; 422 | if (include_tparams__) { 423 | size_t parCov_j_2_max__ = N; 424 | size_t parCov_j_1_max__ = N; 425 | for (size_t j_2__ = 0; j_2__ < parCov_j_2_max__; ++j_2__) { 426 | for (size_t j_1__ = 0; j_1__ < parCov_j_1_max__; ++j_1__) { 427 | param_name_stream__.str(std::string()); 428 | param_name_stream__ << "parCov" << '.' << j_1__ + 1 << '.'
<< j_2__ + 1; 429 | param_names__.push_back(param_name_stream__.str()); 430 | } 431 | } 432 | } 433 | if (!include_gqs__) return; 434 | } 435 | /* Appends the same flat names as constrained_param_names: gamma, nugget.i, and (when include_tparams__ is true) parCov.i.j. */ void unconstrained_param_names(std::vector& param_names__, 436 | bool include_tparams__ = true, 437 | bool include_gqs__ = true) const { 438 | std::stringstream param_name_stream__; 439 | param_name_stream__.str(std::string()); 440 | param_name_stream__ << "gamma"; 441 | param_names__.push_back(param_name_stream__.str()); 442 | size_t nugget_j_1_max__ = N; 443 | for (size_t j_1__ = 0; j_1__ < nugget_j_1_max__; ++j_1__) { 444 | param_name_stream__.str(std::string()); 445 | param_name_stream__ << "nugget" << '.' << j_1__ + 1; 446 | param_names__.push_back(param_name_stream__.str()); 447 | } 448 | if (!include_gqs__ && !include_tparams__) return; 449 | if (include_tparams__) { 450 | size_t parCov_j_2_max__ = N; 451 | size_t parCov_j_1_max__ = N; 452 | for (size_t j_2__ = 0; j_2__ < parCov_j_2_max__; ++j_2__) { 453 | for (size_t j_1__ = 0; j_1__ < parCov_j_1_max__; ++j_1__) { 454 | param_name_stream__.str(std::string()); 455 | param_name_stream__ << "parCov" << '.' << j_1__ + 1 << '.' << j_2__ + 1; 456 | param_names__.push_back(param_name_stream__.str()); 457 | } 458 | } 459 | } 460 | if (!include_gqs__) return; 461 | } 462 | }; // model 463 | } // namespace 464 | typedef model_oneK_namespace::model_oneK stan_model; 465 | #ifndef USING_R 466 | /* Factory compiled only when USING_R is not defined: heap-allocates a stan_model and returns it by reference. */ stan::model::model_base& new_model( 467 | stan::io::var_context& data_context, 468 | unsigned int seed, 469 | std::ostream* msg_stream) { 470 | stan_model* m = new stan_model(data_context, seed, msg_stream); 471 | return *m; 472 | } 473 | #endif 474 | #endif 475 | -------------------------------------------------------------------------------- /src/stanExports_space_multiK.cc: -------------------------------------------------------------------------------- 1 | // Generated by rstantools. Do not edit by hand.
2 | 3 | #include 4 | using namespace Rcpp ; 5 | #include "stanExports_space_multiK.h" 6 | 7 | RCPP_MODULE(stan_fit4space_multiK_mod) { 8 | 9 | 10 | class_ >("rstantools_model_space_multiK") 11 | 12 | .constructor() 13 | 14 | 15 | .method("call_sampler", &rstan::stan_fit ::call_sampler) 16 | .method("param_names", &rstan::stan_fit ::param_names) 17 | .method("param_names_oi", &rstan::stan_fit ::param_names_oi) 18 | .method("param_fnames_oi", &rstan::stan_fit ::param_fnames_oi) 19 | .method("param_dims", &rstan::stan_fit ::param_dims) 20 | .method("param_dims_oi", &rstan::stan_fit ::param_dims_oi) 21 | .method("update_param_oi", &rstan::stan_fit ::update_param_oi) 22 | .method("param_oi_tidx", &rstan::stan_fit ::param_oi_tidx) 23 | .method("grad_log_prob", &rstan::stan_fit ::grad_log_prob) 24 | .method("log_prob", &rstan::stan_fit ::log_prob) 25 | .method("unconstrain_pars", &rstan::stan_fit ::unconstrain_pars) 26 | .method("constrain_pars", &rstan::stan_fit ::constrain_pars) 27 | .method("num_pars_unconstrained", &rstan::stan_fit ::num_pars_unconstrained) 28 | .method("unconstrained_param_names", &rstan::stan_fit ::unconstrained_param_names) 29 | .method("constrained_param_names", &rstan::stan_fit ::constrained_param_names) 30 | .method("standalone_gqs", &rstan::stan_fit ::standalone_gqs) 31 | ; 32 | } 33 | -------------------------------------------------------------------------------- /src/stanExports_space_oneK.cc: -------------------------------------------------------------------------------- 1 | // Generated by rstantools. Do not edit by hand. 
2 | 3 | #include 4 | using namespace Rcpp ; 5 | #include "stanExports_space_oneK.h" 6 | 7 | RCPP_MODULE(stan_fit4space_oneK_mod) { 8 | 9 | 10 | class_ >("rstantools_model_space_oneK") 11 | 12 | .constructor() 13 | 14 | 15 | .method("call_sampler", &rstan::stan_fit ::call_sampler) 16 | .method("param_names", &rstan::stan_fit ::param_names) 17 | .method("param_names_oi", &rstan::stan_fit ::param_names_oi) 18 | .method("param_fnames_oi", &rstan::stan_fit ::param_fnames_oi) 19 | .method("param_dims", &rstan::stan_fit ::param_dims) 20 | .method("param_dims_oi", &rstan::stan_fit ::param_dims_oi) 21 | .method("update_param_oi", &rstan::stan_fit ::update_param_oi) 22 | .method("param_oi_tidx", &rstan::stan_fit ::param_oi_tidx) 23 | .method("grad_log_prob", &rstan::stan_fit ::grad_log_prob) 24 | .method("log_prob", &rstan::stan_fit ::log_prob) 25 | .method("unconstrain_pars", &rstan::stan_fit ::unconstrain_pars) 26 | .method("constrain_pars", &rstan::stan_fit ::constrain_pars) 27 | .method("num_pars_unconstrained", &rstan::stan_fit ::num_pars_unconstrained) 28 | .method("unconstrained_param_names", &rstan::stan_fit ::unconstrained_param_names) 29 | .method("constrained_param_names", &rstan::stan_fit ::constrained_param_names) 30 | .method("standalone_gqs", &rstan::stan_fit ::standalone_gqs) 31 | ; 32 | } 33 | -------------------------------------------------------------------------------- /testing/runs/sim.dataset.Robj: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gbradburd/conStruct/a41049e475be68fb267996045a47e968c737af92/testing/runs/sim.dataset.Robj -------------------------------------------------------------------------------- /testing/runs/test.mods.R: -------------------------------------------------------------------------------- 1 | library(conStruct) 2 | library(doParallel) 3 | library(foreach) 4 | load("sim.dataset.Robj") 5 | 6 | options(error=recover) 7 | args <- list("run1" = list("spatial" 
= FALSE, 8 | "geoDist" = fields::rdist(sim.dataset$coords), 9 | "K" = 1, 10 | "prefix" = "nsp1"), 11 | "run2" = list("spatial" = FALSE, 12 | "geoDist" = NULL, 13 | "K" = 1, 14 | "prefix" = "nsp1a"), 15 | "run3" = list("spatial" = TRUE, 16 | "geoDist" = fields::rdist(sim.dataset$coords), 17 | "K" = 1, 18 | "prefix" = "sp1"), 19 | "run4" = list("spatial" = TRUE, 20 | "geoDist" = fields::rdist(sim.dataset$coords), 21 | "K" = 3, 22 | "prefix" = "sp3"), 23 | "run5" = list("spatial" = FALSE, 24 | "geoDist" = fields::rdist(sim.dataset$coords), 25 | "K" = 3, 26 | "prefix" = "nsp3"), 27 | "run6" = list("spatial" = FALSE, 28 | "geoDist" = NULL, 29 | "K" = 3, 30 | "prefix" = "nsp3b") 31 | ) 32 | 33 | cl <- parallel::makeCluster(3,type="FORK") 34 | doParallel::registerDoParallel(cl) 35 | 36 | tmp <- foreach::foreach(i=1:length(args)) %dopar% { 37 | x <- args[[i]] ; 38 | conStruct::conStruct(spatial = x[["spatial"]], 39 | K = x[["K"]], 40 | freqs = sim.dataset$freq.data$freqs, 41 | geoDist = x[["geoDist"]], 42 | coords = sim.dataset$coords, 43 | prefix = x[["prefix"]]) 44 | } 45 | 46 | parallel::stopCluster(cl) 47 | -------------------------------------------------------------------------------- /testing/runs/testOne.R: -------------------------------------------------------------------------------- 1 | library(conStruct) 2 | 3 | load("sim.dataset.Robj") 4 | 5 | options(error=recover) 6 | test <- conStruct::conStruct(spatial = TRUE, 7 | K = 2, 8 | freqs = sim.dataset$freq.data$freqs, 9 | geoDist = fields::rdist(sim.dataset$coords), 10 | coords = sim.dataset$coords, 11 | prefix = "test1") 12 | 13 | test <- conStruct::conStruct(spatial = FALSE, 14 | K = 2, 15 | freqs = sim.dataset$freq.data$freqs, 16 | geoDist = fields::rdist(sim.dataset$coords), 17 | coords = sim.dataset$coords, 18 | prefix = "test2", 19 | n.iter=400) 20 | 21 | test <- conStruct::conStruct(spatial = FALSE, 22 | K = 2, 23 | freqs = sim.dataset$freq.data$freqs, 24 | geoDist = fields::rdist(sim.dataset$coords), 25 
| coords = sim.dataset$coords, 26 | prefix = "test3", 27 | n.iter=500) 28 | 29 | test <- conStruct::conStruct(spatial = FALSE, 30 | K = 2, 31 | freqs = sim.dataset$freq.data$freqs, 32 | geoDist = fields::rdist(sim.dataset$coords), 33 | coords = sim.dataset$coords, 34 | prefix = "test4", 35 | n.iter=510) 36 | 37 | test <- conStruct::conStruct(spatial = FALSE, 38 | K = 2, 39 | freqs = sim.dataset$freq.data$freqs, 40 | geoDist = fields::rdist(sim.dataset$coords), 41 | coords = sim.dataset$coords, 42 | prefix = "test5", 43 | n.iter=2e3) 44 | 45 | test <- conStruct::conStruct(spatial = FALSE, 46 | K = 2, 47 | freqs = sim.dataset$freq.data$freqs, 48 | geoDist = fields::rdist(sim.dataset$coords), 49 | coords = sim.dataset$coords, 50 | prefix = "test6", # unique prefix so this run does not overwrite the output of the "test5" run 51 | n.iter=2e3, 52 | control = setNames(list(0.9),"adapt_delta")) -------------------------------------------------------------------------------- /testing/xval/Makefile: -------------------------------------------------------------------------------- 1 | .PHONY : test clean 2 | 3 | test : clean 4 | Rscript test.xval.R > test.log 2>&1 5 | Rscript test.xval2.R > test2.log 2>&1 6 | 7 | clean : 8 | rm -f test1.xvals.Robj xval_test1.xval.data.partitions.Robj xval_test1.xval.results.Robj xval_test1_nsp_xval_results.txt xval_test1_sp_xval_results.txt test.log test2.log xval_test2.xval.data.partitions.Robj xval_test2.xval.results.Robj xval_test2_nsp_xval_results.txt xval_test2_sp_xval_results.txt xvals2.Robj .Rapp.history .RData -------------------------------------------------------------------------------- /testing/xval/sim.dataset.Robj: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gbradburd/conStruct/a41049e475be68fb267996045a47e968c737af92/testing/xval/sim.dataset.Robj -------------------------------------------------------------------------------- /testing/xval/test.xval.R: -------------------------------------------------------------------------------- 1 | 
library(conStruct) 2 | load("sim.dataset.Robj") 3 | 4 | options(error=recover) 5 | 6 | xvals <- x.validation(train.prop = 0.9, 7 | n.reps = 2, 8 | K = 1:2, 9 | freqs = sim.dataset$freqs, 10 | geoDist = fields::rdist(sim.dataset$coords), 11 | coords = sim.dataset$coords, 12 | prefix = "xval_test1", 13 | n.iter = 1e3, 14 | make.figs = FALSE, 15 | save.files = FALSE, 16 | parallel = FALSE, 17 | n.nodes = NULL) 18 | 19 | save(xvals,file="test1.xvals.Robj") -------------------------------------------------------------------------------- /testing/xval/test.xval2.R: -------------------------------------------------------------------------------- 1 | library(conStruct) 2 | load("sim.dataset.Robj") 3 | 4 | library(foreach) 5 | library(doParallel) 6 | cl <- makeCluster(2,type="FORK") 7 | registerDoParallel(cl) 8 | 9 | xvals <- x.validation(train.prop = 0.9, 10 | n.reps = 2, 11 | K = 1:2, 12 | freqs = sim.dataset$freqs, 13 | data.partitions = NULL, 14 | geoDist = fields::rdist(sim.dataset$coords), 15 | coords = sim.dataset$coords, 16 | prefix = "xval_test2", 17 | n.iter = 1e3, 18 | make.figs = FALSE, 19 | save.files = FALSE, 20 | parallel = TRUE, 21 | n.nodes = 2) 22 | 23 | save(xvals,file="xvals2.Robj") 24 | 25 | stopCluster(cl) -------------------------------------------------------------------------------- /vignettes/format-data.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "How to format data for a conStruct analysis" 3 | author: "Gideon Bradburd" 4 | date: '`r format(Sys.Date(), "%B %d, %Y")`' 5 | output: 6 | rmarkdown::html_vignette: 7 | toc: true 8 | vignette: > 9 | %\VignetteIndexEntry{format-data} 10 | %\VignetteEngine{knitr::rmarkdown} 11 | %\VignetteEncoding{UTF-8} 12 | --- 13 | ```{r, echo = FALSE} 14 | knitr::opts_chunk$set(collapse = TRUE, comment = "#>") 15 | ``` 16 | 17 | 18 | 19 | ## Format data 20 | This document describes the format of the data used in 21 | a `conStruct` analysis. 
22 | 23 | For information on how to run a `conStruct` analysis 24 | after you've formatted your data, see the companion 25 | vignette on [how to run conStruct](run-conStruct.html). 26 | 27 | Throughout the document, I'll be referring to the 28 | example dataset included with the package: 29 | 30 | ```{r} 31 | library(conStruct) 32 | data(conStruct.data) 33 | ``` 34 | 35 | ## conStruct data 36 | 37 | There are 3 data objects you need to run a `conStruct` analysis: 38 | 39 | 1. [allele frequency data] 40 | 41 | 2. [geographic sampling coordinates] 42 | 43 | 3. [geographic distance matrix] 44 | 45 | In the sections below, I walk through the specific format required for each. 46 | 47 | ### Allele frequency data 48 | 49 | You must specify a matrix of allele frequency data for your samples. 50 | (Make sure the data are of class `matrix`, and that it's not a `data.frame`.) 51 | I assume that the data consist of bi-allelic SNPs. 52 | At each locus, you pick an allele to count across all samples 53 | (it doesn't matter whether it's randomly chosen or 54 | whether it's always the major or minor allele). 55 | The frequency of the counted allele at a locus in a sample 56 | is the number of times the counted allele is observed at a locus 57 | divided by the number of chromosomes genotyped in that sample. 58 | A sample can consist of a single individual or multiple individuals 59 | lumped together. 60 | So, a successfully genotyped diploid individual heterozygous at a 61 | particular locus would have an allele frequency of 0.5. 62 | If the sample is a population of 13 haploids, of which 12 have the 63 | counted allele at a given locus, the frequency in that sample at that 64 | locus would be 12/13. 65 | 66 | The matrix of allele frequencies should have one row per sample and 67 | one column per locus. Missing data should be denoted with the value `NA`. 
68 | A small example allele frequency data matrix is shown below: 69 | 70 | | Sample | Locus1 | Locus2 | Locus3 | Locus4 | Locus5 | Locus6 | Locus7 | Locus8 | Locus9 | Locus10 | 71 | |:------|:----:|:----:|:----:|:----:|:----:|:----:|:----:|:----:|:----:|:----:| 72 | | Pop1 | 0 | 1 | NA | 0.8 | 0.7 | 0 | 0 | 0.6 | 0 | 1 | 73 | | Pop2 | 0 | 1 | 1 | 0.9 | 1 | 1 | 0.1 | 0.6 | 0 | 0.9 | 74 | | Pop3 | 0.2 | 0.75 | 0 | 1 | 1 | NA | 1 | 1 | 0.1 | 1 | 75 | | Pop4 | 0.1 | 0.9 | 1 | 1 | 0.8 | 1 | 0.2 | 0.7 | 0.1 | 0.3 | 76 | | Pop5 | 0 | 1 | 1 | 1 | 1 | 1 | 0.3 | 0.9 | 0.3 | NA | 77 | 78 | A full example allele frequency data matrix is included in the 79 | `conStruct.data` object included with the package. 80 | 81 | ```{r} 82 | # load the example data object 83 | data(conStruct.data) 84 | 85 | # look at the allele frequency data 86 | # for the first 5 populations and 10 loci 87 | conStruct.data$allele.frequencies[1:5,1:10] 88 | ``` 89 | 90 | ### Geographic sampling coordinates 91 | 92 | You must specify a matrix of geographic sampling coordinates, 93 | which will be used for plotting the results of the analysis. 94 | This should be a matrix with two columns that give the sample 95 | x-coordinates (longitude) and y-coordinates (latitude), 96 | respectively. The order of rows of the matrix should be the same 97 | as the order of the rows of the allele frequencies matrix. 98 | If you specify longitude and latitude, they should be in 99 | decimal degrees. 100 | 101 | A full example sampling coordinate data matrix is included 102 | in the `conStruct.data` object included with the package. 
103 | 104 | ```{r} 105 | # load the example data object 106 | data(conStruct.data) 107 | 108 | # look at the geographic sampling coordinates 109 | # for the first 5 populations 110 | conStruct.data$coords[1:5,] 111 | ``` 112 | 113 | ### Geographic distance matrix 114 | 115 | If you choose to run the spatial model implemented in `conStruct`, 116 | you must specify a matrix of pairwise geographic distance between 117 | all samples. If the coordinates of the samples are real locations 118 | on Earth (as opposed to simulated coordinates), I recommend 119 | calculating pairwise great-circle distance between sampling 120 | coordinates (using, e.g., `geosphere::distm` or `geosphere::distGeo`). 121 | 122 | The order of the samples in the geographic distance matrix should 123 | match both that of the geographic coordinates and that of the 124 | allele frequency data matrix, and all three matrices should have 125 | the same number of rows. 126 | 127 | The geographic distance matrix you specify should be the full 128 | matrix (that is, not the upper- or lower-triangles), with values 129 | of `0` on the diagonal entries. 130 | 131 | A full example geographic distance matrix between all samples 132 | is in the `conStruct.data` object included with the package. 133 | 134 | ```{r} 135 | # load the example data object 136 | data(conStruct.data) 137 | 138 | # look at pairwise geographic distance 139 | # between the first 5 populations 140 | conStruct.data$geoDist[1:5,1:5] 141 | ``` 142 | 143 | 144 | # Other formats to conStruct 145 | 146 | For convenience, I've written a function to convert population 147 | genetic data in STRUCTURE format to that used in `conStruct`. 148 | 149 | ## STRUCTURE to conStruct 150 | 151 | The program STRUCTURE is one of the most widely used 152 | methods for model-based clustering in population genetics. 
153 | Many existing programs, including plink (v1.9 and above) 154 | and PgdSpider, convert data from diverse formats (including 155 | .vcf files) into STRUCTURE format. In this section of the 156 | vignette, I walk through an example of converting a STRUCTURE 157 | format data file into a `conStruct` format data file. 158 | 159 | ### STRUCTURE data format 160 | More extensive documentation on STRUCTURE's data format 161 | can be found in [the STRUCTURE manual](https://web.stanford.edu/group/pritchardlab/structure_software/release_versions/v2.3.4/structure_doc.pdf). 162 | An example STRUCTURE-formatted dataset is shown below: 163 | 164 | | | | Loc1 | Loc1 | Loc2 | Loc2 | Loc3 | Loc3 | Loc4 | Loc4 | 165 | |:------:|:---:|:----:|:----:|:----:|:----:|:----:|:----:|:----:|:----:| 166 | | Ind1 | 1 | 1 | 1 | 2 | 2 | 1 | 2 | -9 | -9 | 167 | | Ind2 | 1 | 1 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 168 | | Ind3 | 1 | 1 | 1 | 2 | 2 | 2 | 2 | 2 | 2 | 169 | | Ind4 | 2 | -9 | -9 | 1 | 2 | 1 | 1 | 1 | 1 | 170 | | Ind5 | 2 | 2 | 1 | 2 | 2 | 1 | 1 | 1 | 2 | 171 | 172 | Example STRUCTURE format dataset, with one row per individual 173 | and two columns per locus. The first column gives sample names, the 174 | second refers to the sample locations, and the last 8 columns give 175 | genotype data for four loci. The numbers in the genotype data refer to 176 | the allele present at that locus: A1 = `1`, A2 = `2`, missing = `-9`. 177 | 178 | To convert a STRUCTURE format file to `conStruct` format, 179 | you can use the function `structure2conStruct`, included in the 180 | `conStruct` package. 181 | 182 | Below, I give an example of the usage of this function, assuming 183 | that the file containing the STRUCTURE format data is called 184 | "myStructureData.str", and that it's on the "desktop" directory 185 | on the computer. 
I also assume that the data are formatted as in 186 | the table above, with the genotype data starting at the 3rd column 187 | of the data matrix, and missing data denoted with a value of -9. 188 | 189 | **Note that the STRUCTURE-format data must be a text file 190 | and there can be no lines of text before the data table begins. 191 | If your file is in an Excel spreadsheet, it can be converted to 192 | a text file using Save As > File Format = Tab delimited Text 193 | (.txt). If there are lines of text at the top of the document 194 | before the data matrix begins, they must be deleted or specified 195 | via the `start.samples` argument. In addition, 196 | your data can only contain bi-allelic data. If you have loci with 197 | more than two alleles, they should not be included in the dataset. 198 | For more information on multi-allelic datasets, see the section on 199 | [Microsatellites](#microsatellites) below.** 200 | 201 | ```{r,eval=FALSE} 202 | conStruct.data <- structure2conStruct(infile = "~/Desktop/myStructureData.str", 203 | onerowperind = TRUE, 204 | start.loci = 3, 205 | start.samples = 1, 206 | missing.datum = -9, 207 | outfile = "~/Desktop/myConStructData") 208 | 209 | ``` 210 | 211 | An alternate STRUCTURE data format has two rows and one column per 212 | diploid genotype: 213 | 214 | | | | Loc1 | Loc2 | Loc3 | Loc4 | Loc5 | Loc6 | Loc7 | Loc8 | 215 | |:------:|:---:|:----:|:----:|:----:|:----:|:----:|:----:|:----:|:----:| 216 | | Ind1 | 1 | 1 | 1 | 2 | 2 | 1 | 2 | 0 | 1 | 217 | | Ind1 | 1 | 1 | 2 | 2 | 2 | 2 | 2 | 0 | 1 | 218 | | Ind2 | 1 | 0 | 1 | 2 | 2 | 2 | 2 | 2 | 2 | 219 | | Ind2 | 1 | 0 | 2 | 1 | 2 | 1 | 1 | 1 | 1 | 220 | | Ind3 | 2 | 2 | 1 | 2 | 1 | 1 | 1 | 1 | 2 | 221 | | Ind3 | 2 | 2 | 1 | 2 | 1 | 1 | 1 | 1 | 2 | 222 | | Ind4 | 2 | 2 | 0 | 2 | 2 | 2 | 1 | 0 | 2 | 223 | | Ind4 | 2 | 2 | 0 | 2 | 2 | 1 | 1 | 0 | 2 | 224 | 225 | Example STRUCTURE format dataset, with two rows per individual 226 | and one column per locus. 
The first column gives sample names, the 227 | second refers to the sample locations, and the last 8 columns give 228 | genotype data for 8 loci. The numbers in the genotype data refer to 229 | the allele present at that locus: A1 = `1`, A2 = `2`, missing = `0`. 230 | 231 | Data in this format can be converted to `conStruct` format using the 232 | command below: 233 | 234 | ```{r,eval=FALSE} 235 | conStruct.data <- structure2conStruct(infile = "~/Desktop/myStructureData.str", 236 | onerowperind = FALSE, 237 | start.loci = 3, 238 | start.samples = 1, 239 | missing.datum = 0, 240 | outfile = "~/Desktop/myConStructData") 241 | 242 | ``` 243 | 244 | Further documentation for this function is in its help page, 245 | which you can go to using the command `help(structure2conStruct)`. 246 | 247 | If you wish to group multiple individuals together into a single 248 | sample for analysis you can collapse rows of the `conStruct` format 249 | data file. For example, if you have 12 individuals from 4 250 | locations (3 individuals from each location), and you wish to 251 | analyze the data treating populations at a sampling location 252 | as the unit of analysis, you can do something like the 253 | following: 254 | 255 | ```{r,eval=FALSE} 256 | pop.data.matrix <- matrix(NA,nrow=4,ncol=ncol(conStruct.data)) 257 | for(i in 1:nrow(pop.data.matrix)){ 258 | pop.data.matrix[i,] <- colMeans( 259 | conStruct.data[ 260 | which(pop.index==i),, 261 | drop=FALSE 262 | ],na.rm=TRUE 263 | ) 264 | } 265 | ``` 266 | where `pop.index` is a vector that gives the population of origin 267 | for each of the individuals sampled. In the example above, with 268 | 12 individuals sampled from 4 locations (3 from each), 269 | `pop.index` would be `c(1,1,1,2,2,2,3,3,3,4,4,4)`. 270 | 271 | 272 | ## Microsatellites 273 | 274 | This method is designed to run on large datasets consisting of 275 | bi-allelic SNPs. 
If you have a microsatellite dataset and you 276 | wish to run `conStruct`, the first consideration is whether you 277 | have sufficient data. You should have more loci than samples 278 | in your data matrix (i.e., your data matrix should have more 279 | columns than rows). 280 | 281 | If that's the case, the second consideration is how to format your 282 | microsat data so that you can run conStruct. There are two standard 283 | ways of "SNP-ifying" a microsat dataset. 284 | 285 | The first is to lump all microsatellite alleles present at a locus 286 | into two categories: "major" and "other". The "major" allele is the 287 | allele that occurs most frequently at a particular locus; all other 288 | alleles are put in the "other" bin. You then can create a dataset 289 | in which you only report the frequency of the major allele, 290 | effectively reducing the number of alleles per locus to 2. 291 | This method has the disadvantage of throwing out data, but 292 | acknowledges the simplex relationships between alleles at a locus 293 | (the sum of the frequencies of all alleles at a locus must be 1). 294 | 295 | The second approach, introduced by Cavalli-Sforza, is to split out 296 | each allele at a locus into a separate pseudo-locus consisting of only 297 | that allele. That is, if you had 4 alleles present in the genotyped 298 | sample at a particular locus, at frequencies {0.4,0.3,0.1,0.2}, 299 | you would split those out into 4 separate columns in your data matrix 300 | (pseudo-loci), with frequencies in the sampled population of 301 | {0.4,0.3,0.1,0.2}. This approach has the advantage of not throwing 302 | data away, but does not acknowledge the inter-allele dependence 303 | structure in frequencies, and therefore introduces some 304 | pseudoreplication into the dataset. This pseudoreplication may make 305 | you overconfident in your results, as the credible intervals on 306 | parameter estimates may be artificially narrow. 
307 | 308 | I would recommend trying both approaches, and comparing the 309 | estimates of pairwise relatedness you get from each to those 310 | derived from the raw microsatellite data to see which best 311 | recovers the patterns of relatedness in the data. I also recommend 312 | running `conStruct` on datasets SNP-ified using both approaches, 313 | and comparing the results. -------------------------------------------------------------------------------- /vignettes/model-comparison.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "How to compare conStruct model runs" 3 | author: "Gideon Bradburd" 4 | date: '`r format(Sys.Date(), "%B %d, %Y")`' 5 | output: 6 | rmarkdown::html_vignette: 7 | toc: true 8 | vignette: > 9 | %\VignetteIndexEntry{model-comparison} 10 | %\VignetteEngine{knitr::rmarkdown} 11 | %\VignetteEncoding{UTF-8} 12 | --- 13 | ```{r, echo = FALSE} 14 | knitr::opts_chunk$set(collapse = TRUE, comment = "#>") 15 | ``` 16 | 17 | 18 | 19 | ## Model comparison 20 | 21 | This document describes how to do model comparison 22 | for conStruct analyses. It assumes that you are already 23 | familiar with the [companion vignette for running conStruct](run-conStruct.html). 24 | 25 | -------------------------------------------------------------------------- 26 | 27 | **Caveat user!** 28 | Although it may sometimes be necessary to 29 | simplify the presentation of the results of several analyses by 30 | only showing the output from a single "best" run, it is important 31 | to remember several things: 32 | 33 | 1. First, choice of best _K_ is always relative to the data at hand, 34 | and, as the amount of data increases, statistical support for larger 35 | _K_ will likely increase. With infinite data, the "best" value of _K_ 36 | would probably be the number of samples in the dataset. 37 | 38 | 2. 
Although we think that conStruct is less likely to falsely ascribe 39 | continuous patterns of genetic variation to discrete population clusters 40 | than other existing methods, that does not mean that the discrete groups 41 | identified by conStruct are biologically real. See "A tutorial on how not 42 | to over-interpret STRUCTURE and ADMIXTURE bar plots" (Lawson, van Dorp, 43 | and Falush 2018) for a more in-depth discussion of these issues. 44 | 45 | 3. Finally, as with all other statistical inference, output should be 46 | interpreted with care and a large grain of salt. We strongly recommend 47 | that users check whether individual runs seem to have performed well 48 | and whether results are consistent across independent runs. We also 49 | recommend that users compare output across runs with different values 50 | of _K_ to see which samples split out into their own layers in the 51 | different analyses. 52 | 53 | -------------------------------------------------------------------------- 54 | 55 | So, you've run two or more conStruct analyses and you want 56 | to compare them to see which might be the best model to 57 | describe the variation in your data. There are two methods in 58 | the conStruct package for doing model comparison: 59 | 60 | 1. Cross-validation 61 | 62 | 2. Calculating layer contributions 63 | 64 | Below, I describe both options and give examples for how to 65 | use their associated functions in the `conStruct` package 66 | and visualize the output they generate. 67 | 68 | Note that if you are interested in visually comparing two 69 | independent `conStruct` runs, you can use the function 70 | `compare.two.runs`, the documentation for which can be found with the 71 | command `help(compare.two.runs)`. This function is further described 72 | in the [companion vignette for visualizing results](visualize-results.html). 
73 | 74 | 75 | ## Cross-validation 76 | 77 | Cross-validation is a tool for testing how the results of an analysis 78 | will generalize to an independent dataset. 79 | 80 | ### How it works 81 | 82 | In general, the more parameters included in the model, the better the 83 | fit to the data. To determine an appropriate level of parameterization 84 | for a given dataset, we can use cross-validation. In `conStruct`, 85 | this works by fitting a model to a "training" subset of the data, then 86 | testing the fit to the remaining "testing" subset. If the parameter 87 | values estimated from the training data parameterize a model that 88 | describes the testing data well, the predictive accuracy of the model 89 | is good. If the model is overparameterized, it will fit the training data 90 | very well, but may not fit the testing data any better than (or even as 91 | well as) a less parameter-rich model. By fitting a given model to many 92 | training partitions and testing its fit to the accompanying testing 93 | partitions, we can get a mean predictive accuracy for each model. We can 94 | then compare predictive accuracies across models to determine which model 95 | has the best goodness-of-fit without overfitting. 96 | 97 | ### How to run a cross-validation analysis 98 | 99 | To run a cross-validation analysis in `conStruct`, you can use the 100 | `x.validation` function. 
101 | 102 | ```{r,eval=FALSE} 103 | # load the library 104 | library(conStruct) 105 | 106 | # load the example dataset 107 | data(conStruct.data) 108 | 109 | # to run a cross-validation analysis 110 | # you have to specify: 111 | # the numbers of layers you want to compare (K) 112 | # the allele frequency data (freqs) 113 | # the geographic distance matrix (geoDist) 114 | # the sampling coordinates (coords) 115 | 116 | my.xvals <- x.validation(train.prop = 0.9, 117 | n.reps = 8, 118 | K = 1:3, 119 | freqs = conStruct.data$allele.frequencies, 120 | data.partitions = NULL, 121 | geoDist = conStruct.data$geoDist, 122 | coords = conStruct.data$coords, 123 | prefix = "example", 124 | n.iter = 1e3, 125 | make.figs = TRUE, 126 | save.files = FALSE, 127 | parallel = FALSE, 128 | n.nodes = NULL) 129 | ``` 130 | 131 | In the example above, we ran a cross-validation analysis with 8 132 | cross-validation replicates, comparing the spatial and nonspatial 133 | models with _K_ = 1 through 3 for each replicate. Each training 134 | partition (one per replicate) was created by randomly subsampling 135 | 90% of the total number of loci. This function call will run a total 136 | of 24 `conStruct` analyses (_K_ = 1:3 for each of 8 replicates), 137 | each for 1,000 MCMC iterations (`n.iter` = 1000), which will 138 | generate a lot of output figures and files. To avoid these piling up, 139 | we can set the `make.figs` and `save.files` options to `FALSE`. 140 | However, as with all analyses, it's important to make 141 | sure these runs are mixing well, so we suggest checking the output 142 | figures to make sure they look good. 143 | 144 | The `x.validation` function returns a list containing the results of 145 | the cross-validation analysis, standardized within each replicate. 146 | The model with the best predictive accuracy within each replicate has 147 | a standardized score of 0. 
Smaller (i.e., more negative) values 148 | indicate worse model fit to the testing data in that replicate. 149 | 150 | For convenience, the function also writes a table of results to a 151 | text file for both the spatial model (`prefix_sp_xval_results.txt`) 152 | and the nonspatial model (`prefix_nsp_xval_results.txt`). Each 153 | column in the table gives the results for a single cross-validation 154 | replicate over evaluated values of _K_, and each row gives the 155 | results of a given value of _K_ across replicates. 156 | 157 | The arguments `parallel` and `n.nodes` can be used to 158 | parallelize the cross-validation analysis. These are described 159 | in further detail below in [Parallelization](#parallelization). 160 | The argument `data.partitions` allows the user to specify their 161 | own training/testing data partitions to be used across replicates. 162 | This option is described further below in 163 | [Specifying data partitions](#specifying-data-partitions). 164 | 165 | ### Visualizing results 166 | 167 | To visualize the output of a cross-validation analysis, you can 168 | use either the output list or the text files. Examples of both 169 | are given below. 
170 | 171 | ```{r, eval=FALSE} 172 | # read in results from text files 173 | 174 | sp.results <- as.matrix( 175 | read.table("example_sp_xval_results.txt", 176 | header = TRUE, 177 | stringsAsFactors = FALSE) 178 | ) 179 | nsp.results <- as.matrix( 180 | read.table("example_nsp_xval_results.txt", 181 | header = TRUE, 182 | stringsAsFactors = FALSE) 183 | ) 184 | 185 | # or, format results from the output list 186 | sp.results <- Reduce("cbind",lapply(my.xvals,function(x){unlist(x$sp)}),init=NULL) 187 | nsp.results <- Reduce("cbind",lapply(my.xvals,function(x){unlist(x$nsp)}),init=NULL) 188 | ``` 189 | 190 | ```{r,echo=FALSE} 191 | sp.results <- matrix(c(-1.201, 0.000, -1.819, -4.579, -5.730, 0.000, 0.000, -5.346, -1.114, -7.315, -8.853, 0.000, 0.000, -6.125, -3.602, 0.000, -11.155, -5.506, -3.650, 0.000, -2.909, 0.000, -4.799, -9.890),nrow=3,ncol=8) 192 | row.names(sp.results) <- paste0("K=",1:3) 193 | nsp.results <- matrix(c(-685.108, -416.726, -141.223, -684.230, -418.651, -148.589, -679.392, -404.326, -147.367, -682.996, -415.190, -147.767, -680.044, -411.200, -147.288, -677.238, -410.037, -149.066, -679.914, -404.820, -145.464, -672.501, -414.927, -145.073),nrow=3,ncol=8) 194 | row.names(nsp.results) <- paste0("K=",1:3) 195 | ``` 196 | 197 | The results look like this: 198 | ```{r,eval=TRUE,echo=FALSE} 199 | knitr::kable(sp.results,row.names=TRUE,col.names=paste0("rep",1:8),caption="Spatial cross-validation results") 200 | ``` 201 | 202 | A quick and dirty plot of the output is given below: 203 | 204 | ```{r, eval=TRUE, fig.width=8,fig.height=5} 205 | 206 | # first, get the 95% confidence intervals for the spatial and nonspatial 207 | # models over values of K (mean +/- 1.96 times the standard error, sd/sqrt(n)) 208 | 209 | sp.CIs <- apply(sp.results,1,function(x){mean(x) + c(-1.96,1.96) * sd(x)/sqrt(length(x))}) 210 | nsp.CIs <- apply(nsp.results,1,function(x){mean(x) + c(-1.96,1.96) * sd(x)/sqrt(length(x))}) 211 | 212 | # then, plot cross-validation results for K=1:3 with 8 replicates 213 
| 214 | par(mfrow=c(1,2)) 215 | plot(rowMeans(sp.results), 216 | pch=19,col="blue", 217 | ylab="predictive accuracy",xlab="values of K", 218 | ylim=range(sp.results,nsp.results), 219 | main="cross-validation results") 220 | points(rowMeans(nsp.results),col="green",pch=19) 221 | 222 | # finally, visualize results for the spatial model 223 | # separately with its confidence interval bars 224 | # 225 | # note that you could do the same with the nonspatial model, 226 | # but the confidence intervals don't really show up 227 | # because the differences between predictive accuracies 228 | # across values of K are so large. 229 | 230 | plot(rowMeans(sp.results), 231 | pch=19,col="blue", 232 | ylab="predictive accuracy",xlab="values of K", 233 | ylim=range(sp.CIs), 234 | main="spatial cross-validation results") 235 | segments(x0 = 1:nrow(sp.results), 236 | y0 = sp.CIs[1,], 237 | x1 = 1:nrow(sp.results), 238 | y1 = sp.CIs[2,], 239 | col = "blue",lwd=2) 240 | ``` 241 | 242 | 243 | ### Interpreting results 244 | 245 | The model with the highest mean predictive accuracy is the "best" model, 246 | but, as noted above, we caution against overinterpretation of these 247 | cross-validation results. If a significance test for the "best" number 248 | of layers is required, you can use a t-test comparing cross-validation 249 | scores across values of K, paired by replicate. E.g., 250 | `t.test(sp.results[2,],sp.results[1,],paired=TRUE,alternative="greater")`. 251 | 252 | I would interpret the results above as strong evidence that the spatial 253 | model is preferred over the nonspatial model over all tested values of _K_ 254 | (indicating that isolation by distance is probably a feature of the data). 255 | The cross-validation analyses also strongly support the conclusion that a 256 | single spatial layer (_K_ = 1) is sufficient to describe the variation in 257 | the data.
258 | 259 | A final caveat of this section is that, with sufficient data, it is possible 260 | to get strong statistical support for layers that contribute little to overall 261 | patterns of covariance. Therefore, it's good to interpret cross-validation 262 | results alongside calculated layer contributions (discussed further in 263 | [Layer Contributions](#layer-contributions)). 264 | 265 | ### Parallelization 266 | 267 | Because each cross-validation replicate consists of several analyses (one for 268 | each specified value of _K_), and because several cross-validation replicates 269 | are required for model comparison, a single call to `x.validation` can take a 270 | long time. To reduce computational burden, we have introduced an option for 271 | users to parallelize their analyses across replicates. The simplest way to 272 | parallelize is to use the `parallel` and `n.nodes` arguments of the 273 | `x.validation` function, which we illustrate using the same `x.validation` call 274 | given above in [How it works](#how-it-works): 275 | 276 | ```{r,eval=FALSE} 277 | 278 | # load the example dataset 279 | data(conStruct.data) 280 | 281 | # to run a cross-validation analysis 282 | # you have to specify: 283 | # the numbers of layers you want to compare (K) 284 | # the allele frequency data (freqs) 285 | # the geographic distance matrix (geoDist) 286 | # the sampling coordinates (coords) 287 | 288 | # in addition, here we run our analysis parallelized 289 | # across all replicates using 4 nodes 290 | 291 | my.xvals <- x.validation(train.prop = 0.9, 292 | n.reps = 8, 293 | K = 1:3, 294 | freqs = conStruct.data$allele.frequencies, 295 | data.partitions = NULL, 296 | geoDist = conStruct.data$geoDist, 297 | coords = conStruct.data$coords, 298 | prefix = "example", 299 | n.iter = 1e3, 300 | make.figs = TRUE, 301 | save.files = FALSE, 302 | parallel = TRUE, 303 | n.nodes = 4) 304 | 305 | ``` 306 | 307 | The example above should run ~4 times as fast as cross-validation with
the 308 | same number of replicates not run in parallel. At the end of the cross-validation 309 | analysis, the parallel workers generated at the beginning of the run will be 310 | terminated. 311 | 312 | 313 | To facilitate greater flexibility in parallelization, users can also specify 314 | their own parallelization scheme before running a cross-validation analysis, 315 | in which case they should simply set `parallel=TRUE` and make sure that `n.nodes` 316 | is equal to the number of nodes they've set up. If you've set up your own 317 | parallelization beforehand (as in the example that follows), `x.validation` will use 318 | that set-up rather than initializing one itself. E.g., 319 | 320 | ```{r,eval=FALSE} 321 | 322 | library(parallel) 323 | library(foreach) 324 | library(doParallel) 325 | 326 | cl <- makeCluster(4,type="FORK") 327 | registerDoParallel(cl) 328 | 329 | my.xvals <- x.validation(train.prop = 0.9, 330 | n.reps = 8, 331 | K = 1:3, 332 | freqs = conStruct.data$allele.frequencies, 333 | data.partitions = NULL, 334 | geoDist = conStruct.data$geoDist, 335 | coords = conStruct.data$coords, 336 | prefix = "example", 337 | n.iter = 1e3, 338 | make.figs = TRUE, 339 | save.files = FALSE, 340 | parallel = TRUE, 341 | n.nodes = 4) 342 | 343 | stopCluster(cl) 344 | 345 | ``` 346 | 347 | Note that if you have prespecified a parallelization scheme, you 348 | are responsible for ending the parallelization yourself, as shown 349 | above with the `stopCluster()` call. Linux and Mac users may wish 350 | to use `makeCluster(N,type="FORK")`, as it does better with memory 351 | usage. Windows users should use the default PSOCK cluster 352 | (e.g., `makeCluster(N,type="PSOCK")`). 353 | 354 | ## Layer contributions 355 | 356 | Layer contributions offer a second metric users can employ to compare models 357 | with different numbers of layers.
358 | 359 | ### How it works 360 | 361 | In a `conStruct` run, users are estimating a parametric covariance matrix to 362 | fit their sample allelic covariance. Each layer in the model contributes to 363 | that parametric covariance, and those contributions can be calculated and 364 | compared. If there is a layer that no samples draw appreciable admixture from, 365 | it will contribute almost nothing to overall covariance, and is therefore of 366 | little biological importance in the model. 367 | 368 | By comparing layer contributions across different `conStruct` analyses run 369 | with different values of _K_, users can identify the point at which layers 370 | included in the analysis contribute little to overall covariance, and pick 371 | a "best" value of _K_ below that point. 372 | 373 | ### How to calculate layer contributions 374 | 375 | Layer contributions are calculated from the output of a standard 376 | `conStruct` analysis using the function `calculate.layer.contribution`. 377 | 378 | ```{r,eval=FALSE} 379 | 380 | # Loop through output files generated by conStruct 381 | # runs with K=1 through 5 and calculate the 382 | # layer contributions for each layer in each run 383 | 384 | layer.contributions <- matrix(NA,nrow=5,ncol=5) 385 | 386 | # load the conStruct.results.Robj and data.block.Robj 387 | # files saved at the end of a conStruct run 388 | load("K1_sp_conStruct.results.Robj") 389 | load("K1_sp_data.block.Robj") 390 | 391 | # calculate layer contributions 392 | layer.contributions[,1] <- c(calculate.layer.contribution(conStruct.results[[1]],data.block),rep(0,4)) 393 | tmp <- conStruct.results[[1]]$MAP$admix.proportions 394 | 395 | for(i in 2:5){ 396 | # load the conStruct.results.Robj and data.block.Robj 397 | # files saved at the end of a conStruct run 398 | load(sprintf("K%s_sp_conStruct.results.Robj",i)) 399 | load(sprintf("K%s_sp_data.block.Robj",i)) 400 | 401 | # match layers up across runs to keep plotting colors consistent 402 | # for the 
same layers in different runs 403 | tmp.order <- match.layers.x.runs(tmp,conStruct.results[[1]]$MAP$admix.proportions) 404 | 405 | # calculate layer contributions 406 | layer.contributions[,i] <- c(calculate.layer.contribution(conStruct.results=conStruct.results[[1]], 407 | data.block=data.block, 408 | layer.order=tmp.order), 409 | rep(0,5-i)) 410 | tmp <- conStruct.results[[1]]$MAP$admix.proportions[,tmp.order] 411 | } 412 | ``` 413 | 414 | Note that, because layers can label switch across runs, the example 415 | above uses the `match.layers.x.runs` function to determine which 416 | layers correspond to each other across analyses run with different 417 | values of _K_. 418 | 419 | ### Visualizing results 420 | 421 | ```{r, echo=FALSE} 422 | layer.contributions <- matrix(c(1.000, 0.000, 0.000, 0.000, 0.000, 0.680, 0.320, 0.000, 0.000, 0.000, 0.682, 0.318, 0.000, 0.000, 0.000, 0.678, 0.322, 0.000, 0.000, 0.000, 0.684, 0.315, 0.000, 0.000, 0.000),nrow=5,ncol=5) 423 | row.names(layer.contributions) <- paste0("Layer_",1:5) 424 | ``` 425 | 426 | The table of layer contributions looks like this: 427 | ```{r, eval=TRUE,echo=FALSE} 428 | knitr::kable(layer.contributions,row.names=TRUE,col.names=paste0("K=",1:5),caption="Contributions for each layer for runs done with K=1 through 5") 429 | ``` 430 | 431 | Layer contributions can be easily plotted across values of _K_ using 432 | a stacked barplot: 433 | 434 | ```{r, eval=TRUE,fig.width=5,fig.height=5} 435 | barplot(layer.contributions, 436 | col=c("blue", "red", "goldenrod1", "forestgreen", "darkorchid1"), 437 | xlab="", 438 | ylab="layer contributions", 439 | names.arg=paste0("K=",1:5)) 440 | ``` 441 | 442 | In this case, the contributions of layers beyond _K_ = 2 are so small 443 | that they don't even show up on the barplot. 444 | 445 | ### Interpreting results 446 | 447 | If a layer in a given model contributes very little to overall covariance, 448 | it is unlikely to have much biological significance.
If you run `conStruct` 449 | analyses across values of _K_, and see that, after a certain value of _K_, 450 | no additional clusters contribute much to overall covariance, that may be 451 | a good indication that that value of _K_ (or at least, no larger value of 452 | _K_) is best for describing the variation in your data. For example, in 453 | the layer contributions plotted above in [Visualizing results](#visualizing-results-1), 454 | additional layers after _K_ = 2 have negligible layer contributions, so 455 | we might reasonably conclude that the best value of _K_ for describing our 456 | data is no greater than 2. 457 | 458 | Users can also set some threshold (e.g., 0.01) below which they count a layer's 459 | contribution as negligible, and, by setting this threshold _a priori_, can 460 | use layer contributions as a metric for model selection. 461 | 462 | ## Cross-validation vs. Layer contribution 463 | 464 | With sufficient data, a cross-validation analysis may indicate 465 | strong support for layers that each contribute very little to overall 466 | covariance. In such a case, the input from cross-validation and 467 | layer contributions are at odds, with the former arguing for the inclusion 468 | of more layers, and the latter arguing against. What to do with that situation? 469 | 470 | Well, the specifics will vary from dataset to dataset, but we encourage users 471 | to distinguish between statistical and biological significance, and not get too 472 | caught up in the first at the expense of the second. 473 | 474 | ## Advanced options 475 | 476 | Below, we include information on advanced topics that will not be of use 477 | for the average user. 478 | 479 | ### Specifying data partitions 480 | 481 | In many cases, there will be no genome assembly available for the focal species in a 482 | `conStruct` analysis, and the genotyped loci will have no known genomic location.
483 | When genomic positions are known, advanced users may wish to specify their own data 484 | partitions to maximize the efficacy of the cross-validation procedure. This is 485 | because the cross-validation results are most trustworthy when the testing data 486 | partition is independent from but still representative of the training data. 487 | Because coalescent histories tend to be shared by adjacent loci on the genome, 488 | if neighboring loci are split (one in the training dataset, the other in the 489 | testing dataset), the training/testing partitions might not be truly independent. 490 | In this case, the model parameterized by the training dataset will be fitting 491 | coalescent "noise" that's also present in the testing dataset, the most likely 492 | result of which is overfitting. Another concern is that different regions of the 493 | genome have different properties (e.g., centromeres vs. non-centromeric DNA), so 494 | to keep the training and testing partitions representative of each other, it's 495 | best to try to match by genomic properties. 496 | 497 | **Our recommendation**: If genomic position and LD information is available, we 498 | recommend divvying the genome up into blocks of length equal to twice the scale 499 | of LD, then randomly assigning 90% of those blocks to a training partition, and 500 | the remaining 10% to the testing partition for each replicate. 501 | 502 | To facilitate this type of custom data partitioning, users can specify their own 503 | data partitions for an `x.validation` analysis using the `data.partitions` argument. 504 | There is no function in the package for generating a custom data partitions 505 | object, as the details of the data format and specifics of the desired partitioning 506 | scheme will vary from user to user and genome to genome. Instead, we describe the 507 | structure of the `data.partitions` object in detail below so that users can create 508 | it for themselves.
509 | 510 | The `data.partitions` object must be a `list` of length `n.reps` as specified in 511 | the `x.validation` function (one partitioning scheme per cross-validation replicate). 512 | Each of the `n.reps` elements of the list must contain two elements, one named 513 | `training` and one named `testing`, which contain the training and testing data 514 | partitions, respectively. Each training and testing element of the list must contain 515 | three named elements: `data`, `n.loci`, and `varMeanFreqs`. 516 | 517 | The `data` element contains the allelic covariance matrix for that partition of the 518 | data; `n.loci` gives the number of loci in that partition; and `varMeanFreqs` gives 519 | the variance in mean allele frequencies across loci (averaged over choice of counted 520 | allele). 521 | 522 | Peeking under the hood of how conStruct creates this `data.partitions` object when none 523 | is specified, the relevant functions are: 524 | 525 | * conStruct:::make.data.partitions 526 | 527 | * conStruct:::xval.process.data 528 | 529 | + conStruct:::calc.covariance 530 | 531 | + conStruct:::get.var.mean.freqs 532 | 533 | Users attempting to specify their own `data.partitions` object are encouraged to use 534 | these functions as guides for what operations are being carried out to generate the 535 | data partitions list for a cross-validation analysis. 
The structure of an example 536 | `data.partitions` object with 3 partitioning schemes (for 3 cross-validation replicates) 537 | is shown below: 538 | 539 | ```{r,echo=FALSE} 540 | library(conStruct) 541 | data(conStruct.data) 542 | data.partitions <- conStruct:::make.data.partitions(3,conStruct.data$allele.frequencies,0.9) 543 | ``` 544 | 545 | ```{r,eval=TRUE} 546 | # In this dataset, there are 36 samples and 1e4 loci total, 547 | # and the data partitions are generated 548 | # with a 90% training 10% testing split 549 | 550 | str(data.partitions,max.level=3,give.attr=FALSE,vec.len=3) 551 | ``` 552 | -------------------------------------------------------------------------------- /vignettes/run-conStruct.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "How to run a conStruct analysis" 3 | author: "Gideon Bradburd" 4 | date: '`r format(Sys.Date(), "%B %d, %Y")`' 5 | output: 6 | rmarkdown::html_vignette: 7 | toc: true 8 | vignette: > 9 | %\VignetteIndexEntry{run-conStruct} 10 | %\VignetteEngine{knitr::rmarkdown} 11 | %\VignetteEncoding{UTF-8} 12 | --- 13 | ```{r, echo = FALSE} 14 | knitr::opts_chunk$set(collapse = TRUE, comment = "#>") 15 | ``` 16 | 17 | 18 | 19 | ## Run conStruct 20 | This document describes how to run a `conStruct` analysis. 21 | 22 | Throughout the document, I'll be referring to the 23 | example dataset included with the package: 24 | 25 | ```{r} 26 | library(conStruct) 27 | data(conStruct.data) 28 | ``` 29 | 30 | The format for the data you need to run a `conStruct` 31 | analysis is covered in a separate vignette in this 32 | package. You can view that vignette using the command: 33 | `vignette(package="conStruct",topic="format-data")`. 34 | If you've already run `conStruct` and you want more 35 | information on how to visualize the results, please see 36 | the companion vignette for [visualizing results](visualize-results.html). 
37 | If you've run several `conStruct` analyses and want to 38 | compare them, please see the companion vignette for 39 | [model comparison](model-comparison.html). 40 | 41 | ## Running a conStruct analysis 42 | 43 | The function you use to run a `conStruct` analysis 44 | is called, fittingly, `conStruct`. This vignette 45 | walks through the use of this function in detail, 46 | and should be used in concert with the documentation 47 | for the function, which can be viewed using the command: 48 | `help(conStruct)`. 49 | 50 | ### Spatial Model 51 | 52 | The default model in the conStruct package is 53 | the spatial model, which allows relatedness 54 | within a layer to decay as a function of the distance 55 | between samples drawing ancestry from that layer. 56 | 57 | Below, I show an example of how to run a `conStruct` 58 | analysis using the spatial model. 59 | 60 | ```{r,eval=FALSE} 61 | # load the example dataset 62 | data(conStruct.data) 63 | 64 | # run a conStruct analysis 65 | 66 | # you have to specify: 67 | # the number of layers (K) 68 | # the allele frequency data (freqs) 69 | # the geographic distance matrix (geoDist) 70 | # the sampling coordinates (coords) 71 | 72 | my.run <- conStruct(spatial = TRUE, 73 | K = 3, 74 | freqs = conStruct.data$allele.frequencies, 75 | geoDist = conStruct.data$geoDist, 76 | coords = conStruct.data$coords, 77 | prefix = "spK3") 78 | ``` 79 | 80 | The function call above runs `conStruct`'s spatial model 81 | using 3 discrete layers. All output files will have "spK3" 82 | prepended to their names. To vary the number of layers 83 | in the spatial model, you need only change the value of `K`. 84 | The example dataset `conStruct.data` is organized into an R list 85 | for convenience, but users can provide their data to the function 86 | any way they see fit, so long as each argument is properly formatted 87 | (e.g., `freqs` is a matrix, `prefix` is a character vector, etc.).
88 | 89 | ### Nonspatial Model 90 | 91 | You can also run a nonspatial model using the `conStruct` 92 | function, in which relatedness within each of the K clusters 93 | does not decay with distance. This model is analogous to 94 | the model implemented in STRUCTURE. 95 | 96 | Below, I show an example of how to run a `conStruct` 97 | analysis using the nonspatial model. 98 | 99 | ```{r,eval=FALSE} 100 | # load the example dataset 101 | data(conStruct.data) 102 | 103 | # run a conStruct analysis 104 | 105 | # you have to specify: 106 | # the number of layers (K) 107 | # the allele frequency data (freqs) 108 | # the sampling coordinates (coords) 109 | # 110 | # if you're running the nonspatial model, 111 | # you do not have to specify 112 | # the geographic distance matrix (geoDist) 113 | 114 | my.run <- conStruct(spatial = FALSE, 115 | K = 2, 116 | freqs = conStruct.data$allele.frequencies, 117 | geoDist = NULL, 118 | coords = conStruct.data$coords, 119 | prefix = "nspK2") 120 | ``` 121 | 122 | The function call above runs `conStruct`'s nonspatial model 123 | using 2 discrete layers. All output files will have "nspK2" 124 | prepended to their names. As with the spatial model, if you 125 | want to vary the number of layers, you change the value of `K`. 126 | 127 | ### Other function options 128 | 129 | The `conStruct` function has other arguments that 130 | have default values, for which you don't have to 131 | specify any values.
However, you may wish to alter 132 | these defaults, so we describe them below: 133 | 134 | The full function call for the spatial model with 3 layers is: 135 | 136 | ```{r,eval=FALSE} 137 | my.run <- conStruct(spatial = TRUE, 138 | K = 3, 139 | freqs = conStruct.data$allele.frequencies, 140 | geoDist = conStruct.data$geoDist, 141 | coords = conStruct.data$coords, 142 | prefix = "spK3", 143 | n.chains = 1, 144 | n.iter = 1000, 145 | make.figs = TRUE, 146 | save.files = TRUE) 147 | ``` 148 | 149 | The other options are `n.chains`, `n.iter`, `make.figs`, `save.files`; 150 | I describe each of them below: 151 | 152 | * `n.chains` - gives the number of independent MCMCs to be run for this model. 153 | The default is `1`, but you may wish to run multiple independent chains to 154 | make sure you get consistent results across them. 155 | 156 | * `n.iter` - gives the number of iterations per MCMC. The default is `1000`. 157 | If you have more genotyped samples, you generally need more iterations 158 | to describe the posterior probability surface well. There are no 159 | hard and fast rules on how many iterations you should run. 160 | I **strongly recommend** examining model output to assess convergence; 161 | if you don't see good convergence, you can run the analysis using a 162 | larger number of iterations. 163 | 164 | * `make.figs` - determines whether or not to automatically make figures 165 | describing the results. The default is `TRUE`. However, if you're running 166 | lots of independent analyses, or if you're running on a cluster with limited 167 | disk space, you may wish to set this option to `FALSE` and make the figures 168 | later on your own. 169 | 170 | * `save.files` - determines whether or not to automatically save all output 171 | files. The default is `TRUE`. 
However, again, there may be circumstances 172 | in which you don't want to automatically save these files, and instead want 173 | to capture the results of the analysis, which are the returned value of the 174 | `conStruct` function call. 175 | 176 | ## Model diagnosis 177 | 178 | As with any statistical model, it is important to assess the 179 | performance of the inference method. Below, I briefly walk 180 | through some of the important things to look out for when 181 | you run a `conStruct` analysis. 182 | 183 | ### MCMC diagnosis 184 | 185 | Although the Hamiltonian Monte Carlo algorithm implemented in STAN 186 | is quite robust, it's always a good idea to look at the results of 187 | the analysis to diagnose MCMC performance. If the chain is mixing 188 | well, the trace plots for the different parameters and the posterior 189 | probability will resemble a “fuzzy caterpillar,” as in panel (a) 190 | below. If the trace plots have not plateaued (as in panel (b)), 191 | it is an indication that the chain has not converged on the 192 | stationary distribution, and that it should be run longer. 193 | If the chain appears to be bouncing between two or more modes, 194 | as in panel (c) below, that may be an indication of a multi-modal 195 | likelihood surface, with multiple points in parameter space that 196 | have equal or similar posterior probability given the data. 
197 | 198 | 199 | ```{r,echo=FALSE,fig.width=7,fig.height=2.7} 200 | par(mfrow=c(1,3),mar=c(4,3,1.5,1)) 201 | plot(c(0,rnorm(500,1,0.2)),type='l', 202 | xlab="",yaxt='n',ylab="") 203 | mtext(side=2,text="parameter estimate",padj=-1) 204 | mtext(side=3,text="(a) looks good",padj=-0.1) 205 | plot(c(0,rnorm(500,c(log(seq(0,1,length.out=500))),0.2)),type='l', 206 | xlab="",yaxt='n',ylab="") 207 | mtext(side=1,text="mcmc iterations",padj=2.6) 208 | mtext(side=3,text="(b) hasn't converged",padj=-0.1) 209 | plot(c(0,rnorm(150,1,0.2),rnorm(200,3,0.2),rnorm(150,1,0.2)),type='l', 210 | xlab="",yaxt='n',ylab="") 211 | mtext(side=3,text="(c) multi-modal",padj=-0.1) 212 | ``` 213 | 214 | ### Independent runs 215 | 216 | Above, I highlight the importance of evaluating performance of 217 | individual MCMC runs, but it's also a good idea to run multiple, 218 | independent analyses and compare results across them. Ideally, 219 | multiple independent runs converge on the same stationary distribution, 220 | with similar parameter estimates and posterior probabilities. 221 | If different runs give very different results, you can check whether 222 | there's a mixing problem or a truly multi-modal posterior probability 223 | surface by comparing the values of the posterior probability across 224 | runs. If two runs have very different parameter estimates but their 225 | posterior probability distributions are indistinguishable, that's an 226 | indication of multi-modality. If multiple runs show different parameter 227 | estimates, but the posterior probabilities for a subset of the runs that 228 | show consistent results are higher than those of a different subset that 229 | gives conflicting results, that indicates that some of the runs are not 230 | mixing well. 231 | 232 | ### Missing data 233 | 234 | Missing data can affect the sample allelic covariance, and 235 | therefore the results of a `conStruct` analysis. 
This is 236 | especially the case when the distribution of missing data is 237 | biased - that is, when individuals of particular ancestry are 238 | more likely to be missing data at a locus. This pattern 239 | is expected when, for example, allelic dropout occurs in a 240 | RADseq dataset. 241 | 242 | In some empirical datasets with missing data that I used to 243 | test `conStruct`, I observed a phenomenon of "homogeneous 244 | minimum layer membership," (HMLM) in which all samples had troublingly 245 | similar admixture proportions in a particular cluster (see 246 | membership in the blue layer in the figure below). 247 | 248 | \ 249 | 250 | ```{r,echo=FALSE,fig.width=7,fig.height=3} 251 | w <- matrix(rnorm(40,sample(2:10,40,replace=TRUE),1), 252 | nrow=20,ncol=2) 253 | w <- w/rowSums(w) 254 | w <- cbind(pmax(rnorm(20,0.15,0.005),0),w) 255 | w <- w/rowSums(w) 256 | conStruct::make.structure.plot(w) 257 | ``` 258 | 259 | \ 260 | 261 | Users are advised to check the results of their analyses carefully 262 | for this HMLM behavior. If you encounter this issue, try reducing 263 | the amount of missing data in your dataset, either by dropping 264 | poorly genotyped samples or poorly genotyped loci (rows and columns 265 | of the allele frequency data matrix, respectively). 
-------------------------------------------------------------------------------- /vignettes/visualize-results.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "How to visualize the results of a conStruct analysis" 3 | author: "Gideon Bradburd" 4 | date: '`r format(Sys.Date(), "%B %d, %Y")`' 5 | output: 6 | rmarkdown::html_vignette: 7 | toc: true 8 | vignette: > 9 | %\VignetteIndexEntry{visualize-results} 10 | %\VignetteEngine{knitr::rmarkdown} 11 | %\VignetteEncoding{UTF-8} 12 | --- 13 | ```{r, echo = FALSE} 14 | knitr::opts_chunk$set(collapse = TRUE, comment = "#>") 15 | ``` 16 | 17 | 18 | 19 | ## Visualize results 20 | This document describes the use of the functions included in 21 | the conStruct package for visualizing analysis outputs. 22 | For more information on how to run a `conStruct` analysis, 23 | see the companion vignette for [running conStruct](run-conStruct.html). 24 | 25 | Throughout, this vignette will make use of the example data output objects 26 | generated by a `conStruct` run: 27 | 28 | ```{r} 29 | library(conStruct) 30 | data(data.block) 31 | ``` 32 | 33 | ## Make all the plots 34 | If the `make.figs` is set to `TRUE` in a `conStruct` run, 35 | the run will finish by calling the function `make.all.the.plots`. 
36 | As the name implies, this function makes all the relevant plots 37 | from a set of conStruct results: 38 | 39 | * STRUCTURE plot 40 | * Admixture pie plot 41 | * Model fit plot 42 | * Layer covariance functions plot 43 | * Trace plots for relevant MCMC quantities including: 44 | * the log posterior density 45 | * nugget parameters 46 | * gamma parameter 47 | * layer-specific parameters 48 | * admixture proportions 49 | 50 | More information is available in the documentation for the function, 51 | which you can view using the command: 52 | 53 | ```{r,eval=FALSE} 54 | help(make.all.the.plots) 55 | ``` 56 | 57 | If you deleted the output plots from an analysis, or if you 58 | set `make.figs` to `FALSE` to avoid making them in the first place, 59 | you can make them by calling the `make.all.the.plots` function. 60 | The arguments you have to specify are a `conStruct.results` output 61 | object and a `data.block` output object, both of which are automatically 62 | generated and saved when you execute a `conStruct` analysis. You must 63 | also specify a `prefix`, which will be prepended to all output pdf 64 | file names. If you choose, you can specify the colors you want each 65 | layer to be plotted in; if none are specified, the function will use 66 | its own internal vector of colors, which I think look nice but are 67 | otherwise arbitrary. 68 | 69 | An example call to `make.all.the.plots` using the example output 70 | data objects loaded above is shown below. 71 | 72 | ```{r,eval=FALSE} 73 | make.all.the.plots(conStruct.results = conStruct.results, 74 | data.block = data.block, 75 | prefix = "example", 76 | layer.colors = NULL) 77 | # generates a bunch of pdf figures 78 | ``` 79 | 80 | ## Visualizing estimated admixture proportions 81 | 82 | Generally, users are most interested in the estimated admixture 83 | proportions for each sample. These are commonly visualized using 84 | STRUCTURE plots and pie plots.
Functions for both are included in 85 | the package, and their use is detailed below. 86 | 87 | ### STRUCTURE plots 88 | 89 | Probably the most common method for visualizing admixture proportions 90 | is using a stacked bar plot (commonly called a STRUCTURE plot after 91 | the model-based clustering method `STRUCTURE`). 92 | 93 | Users can generate a STRUCTURE plot for their data using the command 94 | `make.structure.plot`, (see documentation at `help(make.structure.plot)`). 95 | This function takes as its principal argument the estimated admixture 96 | proportions and makes a STRUCTURE plot in the plotting window. An 97 | example is given below. 98 | 99 | ```{r,echo=FALSE} 100 | admix.props <- matrix( 101 | c(0.086, 0.000, 0.500, 0.505, 0.099, 0.052, 0.024, 0.007, 0.800, 0.000, 0.216, 0.744, 0.917, 102 | 0.199, 0.469, 0.000, 0.783, 0.298, 0.329, 0.446, 0.000, 0.000, 0.637, 0.903, 0.000, 0.000, 103 | 0.000, 0.012, 0.021, 0.000, 0.000, 0.089, 0.000, 0.554, 0.002, 0.000, 0.000, 0.095, 0.020, 104 | 0.001, 0.001, 0.011, 0.000, 0.200, 0.000, 0.060, 0.053, 0.082, 0.036, 0.013, 0.000, 0.062, 105 | 0.169, 0.137, 0.029, 0.001, 0.000, 0.178, 0.079, 0.000, 0.999, 1.000, 0.988, 0.979, 0.975, 106 | 1.000, 0.744, 0.984, 0.435, 0.998, 0.914, 1.000, 0.405, 0.475, 0.900, 0.947, 0.965, 0.993, 107 | 0.000, 1.000, 0.725, 0.203, 0.000, 0.765, 0.518, 1.000, 0.154, 0.533, 0.534, 0.525, 0.999, 108 | 1.000, 0.185, 0.018, 1.000, 0.001, 0.000, 0.000, 0.000, 0.025, 0.000, 0.167, 0.016, 0.012, 109 | 0.000),nrow=35,ncol=3) 110 | ``` 111 | 112 | First, we load the `conStruct.results` data output object 113 | and, for convenience, assign the _maximum a posteriori_ 114 | admixture parameter estimates to a variable with a 115 | shorter name: 116 | 117 | ```{r,eval=FALSE} 118 | load("my_conStruct.results.Robj") 119 | 120 | # assign the MAP admixture proportions from 121 | # the first MCMC chain to a variable 122 | # with a new name 123 | 124 | admix.props <- 
conStruct.results$chain_1$MAP$admix.proportions 125 | ``` 126 | 127 | Now we can visualize the results: 128 | 129 | ```{r, fig.width=8,fig.height=4} 130 | # make a STRUCTURE plot using the 131 | # maximum a posteriori (MAP) estimates 132 | # from the first chain of a conStruct run 133 | 134 | make.structure.plot(admix.proportions = admix.props) 135 | 136 | ``` 137 | 138 | #### Order STRUCTURE plots 139 | 140 | The function also includes a variety of options for tweaking the order of the 141 | plotted samples. 142 | 143 | ```{r, fig.width=8,fig.height=4} 144 | 145 | # order by membership in layer 1 146 | make.structure.plot(admix.proportions = admix.props, 147 | sort.by = 1) 148 | 149 | # re-order the stacking order of the layers 150 | make.structure.plot(admix.proportions = admix.props, 151 | layer.order = c(2,1,3), 152 | sort.by = 2) 153 | 154 | # provide a custom sample ordering 155 | # in this case by sample latitude 156 | make.structure.plot(admix.proportions = admix.props, 157 | sample.order = order(data.block$coords[,2])) 158 | 159 | # add sample names 160 | make.structure.plot(admix.proportions = admix.props, 161 | sample.names = row.names(data.block$coords), 162 | mar = c(4.5,4,2,2)) 163 | ``` 164 | 165 | 166 | ### ADMIXTURE pie plots 167 | 168 | It is often also useful to visualize estimated admixture 169 | proportions in a spatial context by plotting them on a 170 | map. The most common way to do this is to plot a pie plot 171 | at the sampling location of each sample, in which each 172 | modeled layer gets its own slice of the pie (`K` wedges), 173 | and the size of each slice in the pie is proportional to the 174 | sample's admixture proportion in that layer. 175 | 176 | Users can make an admixture pie plot with their own data 177 | using the command `make.admix.pie.plot` (see documentation 178 | at `help(make.admix.pie.plot)`).
This function takes as its 179 | principal arguments the estimated admixture proportions and 180 | the sample coordinates, then makes an admixture pie plot in 181 | the plotting window. An example is given below: 182 | 183 | ```{r,fig.width=6,fig.height=6} 184 | # make an admix pie plot using the 185 | # maximum a posteriori (MAP) estimates 186 | # from the first chain of a conStruct run 187 | make.admix.pie.plot(admix.proportions = admix.props, 188 | coords = data.block$coords) 189 | 190 | # increase pie chart size 191 | make.admix.pie.plot(admix.proportions = admix.props, 192 | coords = data.block$coords, 193 | radii = 4) 194 | 195 | # zoom in on a subsection of the map 196 | make.admix.pie.plot(admix.proportions = admix.props, 197 | coords = data.block$coords, 198 | x.lim = c(-130,-120), 199 | y.lim = c(49,56)) 200 | ``` 201 | 202 | #### Pie plot on a map 203 | 204 | Users can also add the pie plot directly to a map of their own 205 | creation using the `make.admix.pie.plot` function by setting the `add` 206 | argument to `TRUE`. E.g., 207 | 208 | ```{r,fig.width=6,fig.height=6} 209 | 210 | # add pie plot to an existing map 211 | 212 | # make the desired map 213 | maps::map(xlim = range(data.block$coords[,1]) + c(-5,5), ylim = range(data.block$coords[,2])+c(-2,2), col="gray") 214 | 215 | # add the admixture pie plot 216 | make.admix.pie.plot(admix.proportions = admix.props, 217 | coords = data.block$coords, 218 | add = TRUE) 219 | ``` 220 | 221 | ## Comparing two conStruct runs 222 | 223 | If you've run multiple `conStruct` analyses you may want to 224 | visually compare them. Although you could always just open up 225 | both sets of output pdfs, label-switching between independent 226 | runs can make visual comparisons difficult. Label-switching 227 | occurs when different models have the same, or very similar, estimated 228 | admixture proportions, but with a different permutation of 229 | layer labels (e.g., Layer 1 in run 1, and Layer 3 in run 2). 
230 | To enable easy comparison between a pair of `conStruct` runs, 231 | you can use the function `compare.two.runs`. 232 | 233 | To do so, you need to specify two sets of `conStruct.results` output R 234 | objects, as well as the `data.block` objects associated with each run. 235 | Independent runs with the same model can be compared, as can analyses 236 | run with different models (e.g., spatial vs. nonspatial) or 237 | different values of `K`. The only restriction is that if the user is 238 | comparing two models run with different values of `K`, the run with 239 | the smaller value should be specified first (`conStruct.results1`). 240 | Documentation for `compare.two.runs` can be found using the command 241 | `help(compare.two.runs)`. Example usage is shown below: 242 | 243 | ```{r, eval=FALSE} 244 | # load output files from a run with 245 | # the spatial model and K=4 246 | load("spK4.conStruct.results.Robj") 247 | load("spK4.data.block.Robj") 248 | 249 | # assign to new variable names 250 | spK4_cr <- conStruct.results 251 | spK4_db <- data.block 252 | 253 | # load output files from a run with 254 | # the spatial model and K=3 255 | load("spK3.conStruct.results.Robj") 256 | load("spK3.data.block.Robj") 257 | 258 | # assign to new variable names 259 | spK3_cr <- conStruct.results 260 | spK3_db <- data.block 261 | 262 | # compare the two runs 263 | compare.two.runs(conStruct.results1=spK3_cr, 264 | data.block1=spK3_db, 265 | conStruct.results2=spK4_cr, 266 | data.block2=spK4_db, 267 | prefix="spK3_vs_spK4") 268 | 269 | # generates a bunch of pdf figures 270 | ``` --------------------------------------------------------------------------------