├── .Rbuildignore
├── .github
    ├── .gitignore
    └── workflows
    │   └── R-CMD-check.yaml
├── .gitignore
├── CHANGELOG
├── DESCRIPTION
├── NAMESPACE
├── NEWS.md
├── R
    ├── conStruct-package.R
    ├── data.R
    ├── format.data.R
    ├── model.comparison.R
    ├── plot.output.R
    ├── process.model.fit.R
    ├── run.conStruct.R
    ├── stanmodels.R
    └── zzz.R
├── README.md
├── configure
├── configure.win
├── cran-comments.md
├── data
    ├── conStruct.data.rda
    └── data.block.rda
├── inst
    ├── include
    │   └── stan_meta_header.hpp
    └── stan
    │   ├── multiK.stan
    │   ├── oneK.stan
    │   ├── space_multiK.stan
    │   └── space_oneK.stan
├── man
    ├── calculate.layer.contribution.Rd
    ├── compare.two.runs.Rd
    ├── conStruct-manual.pdf
    ├── conStruct-package.Rd
    ├── conStruct.Rd
    ├── conStruct.data.Rd
    ├── data.block.Rd
    ├── make.admix.pie.plot.Rd
    ├── make.all.the.plots.Rd
    ├── make.structure.plot.Rd
    ├── match.layers.x.runs.Rd
    ├── print.conStruct.results.Rd
    ├── print.data.block.Rd
    ├── print.freq.data.Rd
    ├── print.layer.params.Rd
    ├── structure2conStruct.Rd
    └── x.validation.Rd
├── src
    ├── Makevars
    ├── Makevars.win
    ├── RcppExports.cpp
    ├── stanExports_multiK.cc
    ├── stanExports_multiK.h
    ├── stanExports_oneK.cc
    ├── stanExports_oneK.h
    ├── stanExports_space_multiK.cc
    ├── stanExports_space_multiK.h
    ├── stanExports_space_oneK.cc
    └── stanExports_space_oneK.h
├── testing
    ├── runs
    │   ├── sim.dataset.Robj
    │   ├── test.mods.R
    │   └── testOne.R
    └── xval
    │   ├── Makefile
    │   ├── sim.dataset.Robj
    │   ├── test.xval.R
    │   └── test.xval2.R
└── vignettes
    ├── format-data.Rmd
    ├── model-comparison.Rmd
    ├── run-conStruct.Rmd
    └── visualize-results.Rmd


/.Rbuildignore:
--------------------------------------------------------------------------------
 1 | ^CRAN-RELEASE$
 2 | build.conStruct.pkg.R
 3 | build.conStruct.pkg.Rout
 4 | .*\.tar\.gz$
 5 | sandbox/*
 6 | testing/*
 7 | cran-comments.md
 8 | README.md
 9 | nohup.out
10 | \.Rapp.history
11 | man/conStruct-manual.pdf
12 | src/init.o
13 | src/conStruct.so
14 | src/stan_files/.*\.o$
15 | src/stan_files/.*\.hpp$
16 | notes_for_next.release.txt
17 | ^\.github$
18 | ^CRAN-SUBMISSION$
19 | 


--------------------------------------------------------------------------------
/.github/.gitignore:
--------------------------------------------------------------------------------
1 | *.html
2 | 


--------------------------------------------------------------------------------
/.github/workflows/R-CMD-check.yaml:
--------------------------------------------------------------------------------
 1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples
 2 | # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
 3 | on:
 4 |   push:
 5 |     branches: [main, master]
 6 |   pull_request:
 7 |     branches: [main, master]
 8 | 
 9 | name: R-CMD-check
10 | 
11 | jobs:
12 |   R-CMD-check:
13 |     runs-on: ${{ matrix.config.os }}
14 | 
15 |     name: ${{ matrix.config.os }} (${{ matrix.config.r }})
16 | 
17 |     strategy:
18 |       fail-fast: false
19 |       matrix:
20 |         config:
21 |           - {os: macos-latest,   r: 'release'}
22 |           - {os: windows-latest, r: 'release'}
23 |           - {os: ubuntu-latest,   r: 'devel', http-user-agent: 'release'}
24 |           - {os: ubuntu-latest,   r: 'release'}
25 |           - {os: ubuntu-latest,   r: 'oldrel-1'}
26 | 
27 |     env:
28 |       GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
29 |       R_KEEP_PKG_SOURCE: yes
30 | 
31 |     steps:
32 |       - uses: actions/checkout@v3
33 | 
34 |       - uses: r-lib/actions/setup-pandoc@v2
35 | 
36 |       - uses: r-lib/actions/setup-r@v2
37 |         with:
38 |           r-version: ${{ matrix.config.r }}
39 |           http-user-agent: ${{ matrix.config.http-user-agent }}
40 |           use-public-rspm: true
41 | 
42 |       - uses: r-lib/actions/setup-r-dependencies@v2
43 |         with:
44 |           extra-packages: any::rcmdcheck
45 |           needs: check
46 | 
47 |       - uses: r-lib/actions/check-r-package@v2
48 |         with:
49 |           upload-snapshots: true
50 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | .DS_Store
 2 | */.DS_Store
 3 | *tar.gz
 4 | sandbox/
 5 | build.conStruct.pkg.R
 6 | */.Rapp.history
 7 | .Rapp.history
 8 | .RData
 9 | *.Rout
10 | nohup.out
11 | testing/*/.Rapp.history
12 | testing/*/.RData
13 | testing/*/*.pdf
14 | testing/*/*conStruct.results.Robj
15 | testing/*/*data.block.Robj
16 | testing/*/*model.fit.Robj
17 | testing/*/*.out
18 | testing/*/*.Rout
19 | testing/xval/*.txt
20 | testing/xval/xvals*.Robj
21 | testing/xval/*.xvals.Robj
22 | testing/xval/*data.partitions.Robj
23 | testing/xval/*.xval.results.Robj
24 | src/*.so
25 | src/*.o
26 | src/stan_files/*.o
27 | src/stan_files/*.hpp
28 | testing/xval/*.log
29 | notes_for_next_release.txt
30 | testing/data_types
31 | CRAN-SUBMISSION


--------------------------------------------------------------------------------
/CHANGELOG:
--------------------------------------------------------------------------------
 1 | ***********************
 2 | [UPCOMING] - XXXX-XX-XX
 3 | ***********************
 4 | 
 5 | Bug fixes:
 6 | 
 7 | - Calculation of the allelic covariance matrix in calc.covariance was found to
 8 |   incorrectly use the sample covariance instead of the population covariance,
 9 |   which could lead to non-positive-definite covariance matrices in rare cases
10 |   with small sample sizes. (@petrelharp, PR #34)
11 | 
12 | 


--------------------------------------------------------------------------------
/DESCRIPTION:
--------------------------------------------------------------------------------
 1 | Package: conStruct
 2 | Version: 1.0.6
 3 | Date: 2024-01-08
 4 | Title: Models Spatially Continuous and Discrete Population Genetic Structure
 5 | Description: A method for modeling genetic data as a combination of discrete
 6 |     layers, within each of which relatedness may decay continuously with geographic
 7 |     distance. This package contains code for running analyses (which are implemented
 8 |     in the modeling language 'rstan') and visualizing and interpreting output. See the
 9 |     paper for more details on the model and its utility.
10 | Authors@R: person("Gideon", "Bradburd", email = "bradburd@umich.edu", role = c("aut", "cre"))
11 | License: GPL-3
12 | Encoding: UTF-8
13 | LazyData: true
14 | ByteCompile: true
15 | Depends: R (>= 3.4.0), Rcpp (>= 0.12.0), methods
16 | Imports: rstan (>= 2.26.0), rstantools (>= 1.5.0), caroline, gtools, foreach, parallel, doParallel
17 | LinkingTo: StanHeaders (>= 2.26.0), rstan (>= 2.26.0), BH (>= 1.66.0), Rcpp (>= 0.12.0), RcppEigen (>= 0.3.3.3.0), RcppParallel (>= 5.0.1)
18 | SystemRequirements: GNU make
19 | NeedsCompilation: yes
20 | RoxygenNote: 7.2.3
21 | Suggests:
22 |     knitr,
23 |     rmarkdown,
24 |     maps
25 | VignetteBuilder: knitr
26 | 


--------------------------------------------------------------------------------
/NAMESPACE:
--------------------------------------------------------------------------------
 1 | # Generated by roxygen2: do not edit by hand
 2 | 
 3 | export(calculate.layer.contribution)
 4 | export(compare.two.runs)
 5 | export(conStruct)
 6 | export(make.admix.pie.plot)
 7 | export(make.all.the.plots)
 8 | export(make.structure.plot)
 9 | export(match.layers.x.runs)
10 | export(structure2conStruct)
11 | export(x.validation)
12 | import(Rcpp)
13 | import(methods)
14 | import(rstan)
15 | import(rstantools)
16 | importFrom(rstan,sampling)
17 | useDynLib(conStruct, .registration = TRUE)
18 | 


--------------------------------------------------------------------------------
/NEWS.md:
--------------------------------------------------------------------------------
 1 | # conStruct 1.0.6
 2 | ## Minor changes
 3 |  + Updated for compatibility with rstan 2.26
 4 | 
 5 | # conStruct 1.0.5
 6 | 
 7 | ## Minor changes
 8 |  + updated package to fully delegate the installation/compilation of stan models to rstantools.
 9 |  + added a github actions R CMD CHECK workflow to make sure that changes are compatible across all platforms.
10 | 
11 | # conStruct 1.0.4
12 | 
13 | ## Bug fixes
14 |  + updated data validation functions to be in compliance with new class inheritance coming in the next R release
15 | 
16 | 
17 | # conStruct 1.0.3
18 | 
19 | ## Major changes
20 |  + added `...` to `conStruct` and `x.validation` so additional arguments can be passed to `rstan::sampling`
21 | 
22 | ## Bug fixes
23 |  + fixed aliasing due to inefficient deep copy in stan model
24 |  + removed duplicated vignettes displayed on CRAN page
25 | 
26 | # conStruct 1.0.2
27 | 
28 | ## Bug fixes
29 |  + updated Makevars and Makevars.win to be in compliance with CRAN policy
30 | 
31 | # conStruct 1.0.1
32 | 
33 | ## Major changes
34 |  + following move to C++14 by Stan
35 |  + `structure2conStruct` now works for multiple STRUCTURE file formats
36 | 
37 | ## Bug fixes
38 |  + users can now specify their own custom plotting colors in `make.all.the.plots` 
39 | 
40 | # conStruct 1.0.0
41 | 
42 | ## Major changes
43 |  + stan model blocks are now compiled at package installation instead of at a call to `conStruct` or `x.validation`.
44 |  + `x.validation` is now parallelizable
45 |  + new `model-comparison` vignette (see `vignette("model-comparison",package="conStruct")`)
46 |  + alphaD parameter is now rescaled to reflect non-normalized geographic distances
47 |  + compare.two.runs function added
48 | 
49 | ## Bug fixes
50 |  + Removed large files in git history on repo
51 | 
52 | # conStruct 0.0.0.9000
53 | 
54 | ## Beta release


--------------------------------------------------------------------------------
/R/conStruct-package.R:
--------------------------------------------------------------------------------
 1 | #' The 'conStruct' package.
 2 | #' 
 3 | #' @description A method for modeling genetic data as a combination of discrete
 4 | #'    layers, within each of which relatedness may decay continuously with geographic
 5 | #'    distance. This package contains code for running analyses (which are implemented
 6 | #'    in the modeling language 'rstan') and visualizing and interpreting output. See the
 7 | #'    associated paper for more details on the model and its utility.
 8 | #' 
 9 | #' @docType package
10 | #' @name conStruct-package
11 | #' @aliases conStruct-package
12 | #' @useDynLib conStruct, .registration = TRUE
13 | #' @import methods
14 | #' @import Rcpp
15 | #' @import rstantools
16 | #' @importFrom rstan sampling
17 | #' 
18 | #' @references 
19 | #' G.S. Bradburd, G.M. Coop, and P.L. Ralph (2018) <doi: 10.1534/genetics.118.301333>.
20 | #'
21 | #' Stan Development Team (2018). RStan: the R interface to Stan. R package version 2.17.3. http://mc-stan.org
22 | NULL
23 | 


--------------------------------------------------------------------------------
/R/data.R:
--------------------------------------------------------------------------------
 1 | #' Example dataset used in a \code{conStruct} analysis
 2 | #' 
 3 | #' A simulated dataset containing the allele frequency 
 4 | #' and sampling coordinate data necessary to run a 
 5 | #' \code{conStruct} analysis.
 6 | #' 
 7 | #' @format A list with two elements:
 8 | #' \describe{
 9 | #'		\item{allele.frequencies}{a matrix with one row for each of 
10 | #'			the 16 samples and one column for each of 10,000 loci, 
11 | #'			giving the frequency of the counted allele at each locus 
12 | #'			in each sample}
13 | #'		\item{coords}{a matrix with one row for each of the 16 samples, 
14 | #'			in the same order as that of the allele frequency matrix, 
15 | #'			and two columns, the first giving the x-coordinate 
16 | #'			(or longitude), the second giving the y-coordinate (or latitude)}
17 | #' }
18 | #
19 | "conStruct.data"
20 | 
21 | #' Example \code{data.block} generated by a \code{conStruct} analysis
22 | #' 
23 | #' An example \code{data.block} object generated in a \code{conStruct} 
24 | #' analysis from the raw data supplied by the user. This object is 
25 | #' automatically saved and is used in several subsequent plotting functions.
26 | #' 
27 | #' @format A list with 9 elements:
28 | #' \describe{
29 | #'		\item{\code{N}}{the number of samples included in the analysis}
30 | #'		\item{\code{K}}{the number of clusters/layers included in the model}
31 | #'		\item{\code{spatial}}{a boolean indicating whether the spatial
32 | #'			model has been specified}
33 | #'		\item{\code{L}}{the number of loci included in the analysis}
34 | #'		\item{\code{coords}}{a matrix with one row for each of the \code{N} samples, 
35 | #'			in the same order as that of the \code{obsCov} matrix, 
36 | #'			and two columns, the first giving the x-coordinate 
37 | #'			(or longitude), the second giving the y-coordinate (or latitude)}
38 | #'		\item{\code{obsCov}}{the sample allelic covariance matrix, 
39 | #'			in the same order as that of the \code{coords} matrix, 
40 | #'			with \code{N} rows and columns}
41 | #'		\item{\code{geoDist}}{a matrix of pairwise geographic distance between 
42 | #'			samples, in the same order as that of the \code{obsCov} matrix, 
43 | #'			with \code{N} rows and columns}
44 | #'		\item{\code{sd.geoDist}}{the standard deviation of the raw geographic 
45 | #'			distance matrix, used for normalizing \code{geoDist} within the 
46 | #'			stan model}
47 | #'		\item{\code{varMeanFreqs}}{the variance of the mean allele frequencies, 
48 | #'			averaged over choice of counted allele (passed to the model 
49 | #'			as a prior on the global covariance parameter)}
50 | #' }
51 | #
52 | "data.block"


--------------------------------------------------------------------------------
/R/format.data.R:
--------------------------------------------------------------------------------
  1 | #' Convert a dataset from STRUCTURE to conStruct format
  2 | #'
  3 | #' \code{structure2conStruct} converts a STRUCTURE dataset 
  4 | #' to conStruct format
  5 | #' 
  6 | #' This function takes a population genetics dataset in 
  7 | #' STRUCTURE format and converts it to conStruct format. 
  8 | #' The STRUCTURE file can have one row per individual 
  9 | #' and two columns per locus, or two rows per individual 
 10 | #' and one column per locus. It can only contain bi-allelic SNPs.
 11 | #' Missing data is acceptable, but must be indicated with 
 12 | #' a single value throughout the dataset.
 13 | #' 
 14 | #' @param infile The name and path of the file in STRUCTURE format 
 15 | #' 			to be converted to \code{conStruct} format. 
 16 | #' @param onerowperind Indicates whether the file format has 
 17 | #'		one row per individual (\code{TRUE}) or two rows per 
 18 | #'		individual (\code{FALSE}).
 19 | #' @param start.loci The index of the first column in the dataset 
 20 | #'			that contains genotype data.
 21 | #' @param start.samples The index of the first row in the dataset 
 22 | #'			that contains genotype data (e.g., after any headers). 
 23 | #'			Default value is 1.
 24 | #' @param missing.datum The character or value used to denote 
 25 | #' 			missing data in the STRUCTURE dataset (often 0 or -9).
 26 | #' @param outfile The name and path of the file containing the 
 27 | #'			\code{conStruct} formatted dataset to be generated 
 28 | #' 			by this function.
 29 | #'
 30 | #' @details This function takes a STRUCTURE format data file and 
 31 | #'		converts it to a \code{conStruct} format data file.
 32 | #'		This function can only be applied to diploid organisms.
 33 | #'		The STRUCTURE data file must be a plain text file. 
 34 | #'		If there is extraneous text or column headers before the data 
 35 | #'		starts, those extra lines should be deleted by hand or 
 36 | #'		taken into account via the \code{start.samples} argument.
 37 | #'		
 38 | #' 		The STRUCTURE dataset can either be in the ONEROWPERIND=1 
 39 | #' 		file format, with one row per individual and two columns 
 40 | #' 		per locus, or the ONEROWPERIND=0 format, with two rows per 
 41 | #'		individual and one column per locus. The first column of the STRUCTURE 
 42 | #' 		dataset should be individual names. There may be any number 
 43 | #' 		of other columns that contain non-genotype information before 
 44 | #'		the first column that contains genotype data, but there can 
 45 | #' 		be no extraneous columns at the end of the dataset, after the 
 46 | #' 		genotype data.
 47 | #'		
 48 | #'		The genotype data must be bi-allelic 
 49 | #'		single nucleotide polymorphisms (SNPs). Applying this function 
 50 | #'		to datasets with more than two alleles per locus may result in 
 51 | #'		cryptic failure. For more details, see the \code{format-data} 
 52 | #'		vignette.
 53 | #'	
 54 | #'	@return This function returns an allele frequency data matrix 
 55 | #'		that can be used as the \code{freqs} argument in a conStruct 
 56 | #'		analysis run using \code{\link{conStruct}}.  It also saves 
 57 | #'		this object as an .RData file so that it can be used in 
 58 | #'		future analyses.
 59 | #'		
 60 | #' @export
 61 | structure2conStruct <- function(infile,onerowperind,start.loci,start.samples=1,missing.datum,outfile){
 62 | 	outfile <- paste0(outfile,".RData")
 63 | 	if(file.exists(outfile)){
 64 | 		stop("\noutfile already exists\n\n")
 65 | 	}
 66 | 	structure.data <- utils::read.table(infile,header=FALSE,skip=start.samples-1,stringsAsFactors=FALSE)
 67 | 	sample.names <- get.sample.names(structure.data,onerowperind)
 68 | 	genos <- structure.data[,start.loci:ncol(structure.data)]
 69 | 	rm(structure.data)
 70 | 	if(onerowperind & ncol(genos) %% 2 != 0){
 71 | 		stop("\nyou have mis-specified the genotype matrix\nplease check documentation\n\n")
 72 | 	}
 73 | 	if(!onerowperind & nrow(genos) %% 2 != 0){
 74 | 		stop("\nyou have mis-specified the genotype matrix\nplease check documentation\n\n")	
 75 | 	}
 76 | 
 77 | 	freqs <- get.freqs(genos,onerowperind,missing.datum)
 78 | 	row.names(freqs) <- sample.names
 79 | 	save(freqs,file=outfile)
 80 | 	return(freqs)
 81 | }
 82 | 
 83 | get.sample.names <- function(structure.data,onerowperind){
 84 | 	sample.names <- structure.data[,1]
 85 | 	if(!onerowperind){
 86 | 		sample.names <- sample.names[seq(1,length(sample.names),by=2)]
 87 | 	}
 88 | 	return(sample.names)
 89 | }
 90 | 
 91 | get.counted.allele <- function(genos,missing.datum){
 92 | 	alleles <- unique(genos)
 93 | 	if(all(alleles==missing.datum)){
 94 | 		stop("\nyour dataset contains loci with all data missing. please remove and re-try.\n\n")
 95 | 	}
 96 | 	alleles <- alleles[!alleles==missing.datum]
 97 | 	counted <- sample(alleles,1)
 98 | 	return(counted)
 99 | }
100 | 
# Dispatch to the frequency calculator that matches the STRUCTURE layout.
#	genos: table of genotype columns
#	onerowperind: TRUE for one row per individual (two columns per locus), 
#		FALSE for two rows per individual (one column per locus)
#	missing.datum: the code that marks missing data
# Returns the frequency matrix with its column names removed.
get.freqs <- function(genos,onerowperind,missing.datum){
	n.loci <- ifelse(onerowperind,ncol(genos)/2,ncol(genos))
	freq.calculator <- if(onerowperind){
		get.freqs.onerowperind
	} else {
		get.freqs.tworowperind
	}
	freqs <- freq.calculator(genos,n.loci,missing.datum)
	colnames(freqs) <- NULL
	return(freqs)
}
111 | 
# Allele frequencies for a STRUCTURE table with one row per individual, 
# in which locus l occupies columns 2l-1 and 2l.
#	genos: table of genotype columns (2*n.loci columns)
#	n.loci: number of loci
#	missing.datum: the code that marks missing data
# Returns a matrix with one row per individual and one column per locus holding 
# (number of copies of the counted allele)/2; entries where both copies are 
# missing are set to NA.
get.freqs.onerowperind <- function(genos,n.loci,missing.datum){
	# columns holding the first and the second allele copy of each locus
	first.cols <- seq(1,2*n.loci,by=2)
	second.cols <- seq(2,2*n.loci,by=2)
	if(any(genos > 1)){
		# pick one counted allele PER LOCUS, pooling both of its columns, so that 
		#	counted.alleles[l] always refers to locus l (picking one allele per 
		#	column and indexing by locus would pair locus l with column l)
		counted.alleles <- unlist(
								lapply(seq_len(n.loci),
									function(l){
										get.counted.allele(c(genos[,first.cols[l]],
															 genos[,second.cols[l]]),
														   missing.datum)
									}))
	} else {
		# no code above 1 anywhere: always count allele 1
		counted.alleles <- rep(1,n.loci)
	}
	# do.call(cbind,...) always yields a matrix (also for a single locus) and 
	#	binds all columns in one step
	freqs <- do.call(cbind,
				lapply(seq_len(n.loci),
							function(l){
								(genos[,first.cols[l]] == counted.alleles[l]) + 
								(genos[,second.cols[l]] == counted.alleles[l])
							}))
	freqs <- freqs/2
	# number of missing allele copies (0, 1 or 2) per individual and locus
	missing.data <- do.call(cbind,
						lapply(seq_len(n.loci),
							function(l){
								(genos[,first.cols[l]] == missing.datum) + 
								(genos[,second.cols[l]] == missing.datum)
							}))
	freqs[missing.data==2] <- NA
	return(freqs)
}
134 | 
# Allele frequencies for a STRUCTURE table with two rows per individual 
# (rows 2i-1 and 2i belong to individual i) and one column per locus.
#	genos: table of genotype columns (n.loci columns)
#	n.loci: number of loci
#	missing.datum: the code that marks missing data
# Returns a matrix with one row per individual and one column per locus holding 
# (number of copies of the counted allele)/2; entries where both copies are 
# missing are set to NA.
get.freqs.tworowperind <- function(genos,n.loci,missing.datum){
	# rows holding the first and the second allele copy of each individual
	first.rows <- seq(1,nrow(genos),by=2)
	second.rows <- seq(2,nrow(genos),by=2)
	if(any(genos > 1)){
		# one randomly chosen counted allele per column, i.e. per locus
		counted.alleles <- apply(genos,2,get.counted.allele,missing.datum)
	} else {
		# no code above 1 anywhere: always count allele 1
		counted.alleles <- rep(1,n.loci)
	}
	# do.call(cbind,...) always yields a matrix (also for a single locus) and 
	#	binds all columns in one step
	freqs <- do.call(cbind,
				lapply(seq_len(n.loci),
							function(l){
								(genos[first.rows,l] == counted.alleles[l]) + 
								(genos[second.rows,l] == counted.alleles[l])
							}))
	freqs <- freqs/2
	# number of missing allele copies (0, 1 or 2) per individual and locus
	missing.data <- do.call(cbind,
						lapply(seq_len(n.loci),
							function(l){
								(genos[first.rows,l] == missing.datum) + 
								(genos[second.rows,l] == missing.datum)
							}))
	freqs[missing.data==2] <- NA
	return(freqs)
}
157 | 
158 | 


--------------------------------------------------------------------------------
/R/process.model.fit.R:
--------------------------------------------------------------------------------
  1 | unstandardize.distances <- function(data.block){
  2 | 	if(!is.null(data.block$sd.geoDist)){
  3 | 		data.block$geoDist <- data.block$geoDist*data.block$sd.geoDist
  4 | 	}
  5 | 	return(data.block)
  6 | }
  7 | 
  8 | get.conStruct.results <- function(data.block,model.fit,n.chains){
  9 | 	conStruct.results <- stats::setNames(
 10 | 							lapply(1:n.chains,
 11 | 								function(i){
 12 | 									get.conStruct.chain.results(data.block,model.fit,i)
 13 | 								}),
 14 | 						  paste0("chain_",1:n.chains))
 15 | 	return(conStruct.results)
 16 | }
 17 | 
 18 | get.MAP.iter <- function(model.fit,chain.no){
 19 | 	lpd <- rstan::get_logposterior(model.fit)
 20 | 	MAP.iter <- lapply(lpd,which.max)[[chain.no]]
 21 | 	return(MAP.iter)
 22 | }
 23 | 
 24 | get.admix.props <- function(model.fit,chain.no,N,n.layers){
 25 | 	# recover()
 26 | 	admix.props <- array(1,dim=c(model.fit@sim$n_save[chain.no],N,n.layers))
 27 | 	if(any(grepl("w",model.fit@model_pars))){
 28 | 		for(k in 1:n.layers){
 29 | 			admix.props[,,k] <- rstan::extract(model.fit,
 30 | 											pars=unlist(lapply(1:N,function(j){sprintf("w[%s,%s]",j,k)})),
 31 | 											permuted=FALSE,inc_warmup=TRUE)[,chain.no,]
 32 | 		}
 33 | 	}
 34 | 	return(admix.props)
 35 | }
 36 | 
 37 | get.par.cov <- function(model.fit,chain.no,N){
 38 | 	par.cov <- array(NA,dim=c(model.fit@sim$n_save[chain.no],N,N))
 39 | 	for(i in 1:N){
 40 | 		for(j in 1:N){
 41 | 			my.par <- sprintf("parCov[%s,%s]",i,j)
 42 | 			par.cov[,i,j] <- rstan::extract(model.fit,pars=my.par,inc_warmup=TRUE,permuted=FALSE)[,chain.no,]
 43 | 		}
 44 | 	}
 45 | 	return(par.cov)
 46 | }
 47 | 
 48 | get.nuggets <- function(model.fit,chain.no,N){
 49 | 	nuggets <- rstan::extract(model.fit,pars="nugget",inc_warmup=TRUE,permuted=FALSE)[,chain.no,]
 50 | 	return(nuggets)
 51 | }
 52 | 
 53 | get.gamma <- function(model.fit,chain.no){
 54 | 	gamma <- rstan::extract(model.fit,pars="gamma",inc_warmup=TRUE,permuted=FALSE)[,chain.no,]
 55 | 	return(gamma)
 56 | }
 57 | 
 58 | get.null.alpha.params <- function(n.iter){
 59 | 	alpha.params <- list("alpha0" = rep(0,n.iter),
 60 | 						 "alphaD" = rep(0,n.iter),
 61 | 						 "alpha2" = rep(0,n.iter))
 62 | 	return(alpha.params)	
 63 | }
 64 | 
 65 | get.alpha.params <- function(model.fit,data.block,chain.no,layer,n.layers){
 66 | 	alpha.pars <- model.fit@model_pars[grepl("alpha",model.fit@model_pars)]
 67 | 	if(length(alpha.pars) !=0 ){
 68 | 		if(n.layers > 1){
 69 | 			alpha.params <- stats::setNames(
 70 | 									lapply(1:length(alpha.pars),
 71 | 											function(i){
 72 | 												rstan::extract(model.fit,
 73 | 														pars=paste0(alpha.pars[i],"[",layer,"]"),
 74 | 														inc_warmup=TRUE,permuted=FALSE)[,chain.no,]
 75 | 											}),alpha.pars)
 76 | 		} else {
 77 | 			alpha.params <- stats::setNames(
 78 | 									lapply(1:length(alpha.pars),
 79 | 											function(i){
 80 | 												rstan::extract(model.fit,
 81 | 														pars=alpha.pars[i],
 82 | 														inc_warmup=TRUE,permuted=FALSE)[,chain.no,]
 83 | 											}),alpha.pars)		
 84 | 		}
 85 | 	} else {
 86 | 		alpha.params <- get.null.alpha.params(model.fit@sim$n_save[chain.no])
 87 | 	}
 88 | 	if(!is.null(data.block$sd.geoDist)){
 89 | 		alpha.params$alphaD <- alpha.params$alphaD/data.block$sd.geoDist
 90 | 	}
 91 | 	return(alpha.params)
 92 | }
 93 | 
 94 | get.null.phi <- function(n.iter){
 95 | 	phi <- rep(0,n.iter)
 96 | 	return(phi)
 97 | }
 98 | 
 99 | get.layer.phi <- function(model.fit,chain.no,layer){
100 | 	has.phi <- any(grepl("phi",model.fit@model_pars))
101 | 	if(has.phi){
102 | 		phi <- rstan::extract(model.fit,
103 | 						pars=paste0("phi","[",layer,"]"),
104 | 						inc_warmup=TRUE,permuted=FALSE)[,chain.no,]
105 | 	} else {
106 | 		phi <- get.null.phi(model.fit@sim$n_save[chain.no])
107 | 	}
108 | 	return(phi)
109 | }
110 | 
# Return the function(layer.params,data.block) that computes a layer covariance 
# for the model described by data.block$K and data.block$spatial:
#	K == 1, spatial:     alpha0 * exp(-(alphaD*geoDist)^alpha2)
#	K == 1, nonspatial:  an N x N matrix of zeros
#	K  > 1, spatial:     alpha0 * exp(-(alphaD*geoDist)^alpha2) + phi
#	K  > 1, nonspatial:  an N x N matrix filled with phi
get.cov.function <- function(data.block){
	if(data.block$K == 1){
		if(data.block$spatial){
			return(function(layer.params,data.block){
				decay <- exp(-(layer.params$alphaD*data.block$geoDist)^layer.params$alpha2)
				return(layer.params$alpha0 * decay)
			})
		} else {
			return(function(layer.params,data.block){
				return(matrix(0,nrow=data.block$N,ncol=data.block$N))
			})
		}
	}
	if(data.block$spatial){
		return(function(layer.params,data.block){
			decay <- exp(-(layer.params$alphaD*data.block$geoDist)^layer.params$alpha2)
			return(layer.params$alpha0 * decay + layer.params$phi)
		})
	} else {
		return(function(layer.params,data.block){
			return(matrix(layer.params$phi,nrow=data.block$N,ncol=data.block$N))
		})
	}
}
140 | 
# Layer covariance for each of the n.iter iterations.
# For iteration i, the i-th entry of every element of layer.params is passed, 
# together with data.block, to the function chosen by get.cov.function().
# Returns a list of length n.iter.
get.layer.cov <- function(layer.params,data.block,n.iter){
	cov.function <- get.cov.function(data.block)
	cov.at.iter <- function(iter){
		iter.params <- lapply(layer.params,function(draws){draws[[iter]]})
		return(cov.function(layer.params=iter.params,data.block))
	}
	return(lapply(1:n.iter,cov.at.iter))
}
151 | 
# Assemble the parameters of one layer of chain chain.no: the alpha parameters, 
# "phi", and "layer.cov" (the per-iteration covariances computed from them).
get.layer.params <- function(model.fit,data.block,chain.no,layer,n.layers,n.iter){
	params <- get.alpha.params(model.fit,data.block,chain.no,layer,n.layers)
	params[["phi"]] <- get.layer.phi(model.fit,chain.no,layer)
	# the covariance is computed from the alpha parameters and phi only
	cov.per.iter <- get.layer.cov(params,data.block,n.iter)
	return(c(params,list("layer.cov"=cov.per.iter)))
}
160 | 
# Parameters of every layer (1..data.block$K) of chain chain.no, as a list 
# named "layer_1", "layer_2", ... and tagged with class "layer.params".
get.layer.params.list <- function(model.fit,data.block,chain.no,n.iter){
	layer.ids <- 1:data.block$K
	per.layer <- lapply(layer.ids,
						function(layer){
							get.layer.params(model.fit,data.block,chain.no,layer,data.block$K,n.iter)
						})
	names(per.layer) <- paste("layer",layer.ids,sep="_")
	return(make.layer.params.S3(per.layer))
}
171 | 
# Tag an object with the S3 class "layer.params" (replacing any existing class).
make.layer.params.S3 <- function(layer.params){
	return(structure(layer.params,class="layer.params"))
}
177 | 
#' An S3 print method for class layer.params
#' 
#' @param x an object of class \code{layer.params}
#' @param ... further arguments, accepted for compatibility with the 
#'		\code{print} generic; currently unused
#' @return prints a top-level summary of the layer.params, returns nothing
#' @method print layer.params
print.layer.params <- function(x,...){
	# str() writes the summary itself and returns NULL, so it must not be 
	#	wrapped in print(): that would echo a stray "NULL" after the summary
	utils::str(x,max.level=1)
	# NOTE(review): the NAMESPACE shown in this repo has no 
	#	S3method(print,layer.params) entry - confirm whether this method is 
	#	meant to be registered (e.g. via an @export tag)
	return(invisible(NULL))
}
187 | 
# Pull the entry belonging to iteration MAP.iter out of a container of draws.
#	param: one of 
#		- a numeric or integer vector       -> element MAP.iter
#		- a list                            -> element MAP.iter
#		- a 3-dimensional array             -> slice [MAP.iter,,], always 
#											   returned as a matrix
#		- a matrix                          -> row MAP.iter
#		- an object of class layer.params   -> handled layer by layer via 
#											   index.MAP.layer.params.list()
#	MAP.iter: index of the iteration to keep
# Stops with an explicit message for any other kind of object.
index.MAP <- function(param,MAP.iter){
	if(inherits(param,"numeric") || inherits(param,"integer")){
		MAP.param <- param[MAP.iter]
	}
	if(inherits(param,"list")){
		MAP.param <- param[[MAP.iter]]
	}
	if(inherits(param,"array") && length(dim(param)) == 3){
		MAP.param <- param[MAP.iter,,]
		# the slice collapses to a vector when a trailing dimension has 
		#	length 1; restore it as a one-column matrix
		if(is.null(dim(MAP.param))){
			MAP.param <- matrix(MAP.param,nrow=length(MAP.param),ncol=1)
		}
	}
	if(inherits(param,"matrix") && length(dim(param)) == 2){
		MAP.param <- param[MAP.iter,]
	}
	if(inherits(param,"layer.params")){
		MAP.param <- index.MAP.layer.params.list(param,MAP.iter)
	}
	# none of the branches above applied: fail with a message that names the 
	#	offending class instead of "object 'MAP.param' not found"
	if(!exists("MAP.param",inherits=FALSE)){
		stop(paste0("index.MAP cannot index an object of class \"",
					paste(class(param),collapse="/"),"\""))
	}
	return(MAP.param)
}
209 | 
# Apply index.MAP() to every element of a single layer's parameter list.
index.MAP.layer.params <- function(layer.params,MAP.iter){
	return(lapply(layer.params,
					function(param){
						index.MAP(param,MAP.iter)
					}))
}
214 | 
# Apply index.MAP.layer.params() to every layer of a list of layer parameters.
index.MAP.layer.params.list <- function(layer.params.list,MAP.iter){
	return(lapply(layer.params.list,
					function(one.layer){
						index.MAP.layer.params(one.layer,MAP.iter)
					}))
}
219 | 
# Entry chain.no of model.fit@sim$n_save.
get.n.iter <- function(model.fit,chain.no){
	saved.per.chain <- model.fit@sim$n_save
	return(saved.per.chain[chain.no])
}
224 | 
# Tag an object with the S3 class "conStruct.results" (replacing any existing class).
make.conStruct.results.S3 <- function(conStruct.results){
	return(structure(conStruct.results,class="conStruct.results"))
}
230 | 
#' An S3 print method for class conStruct.results
#' 
#' @param x an object of class \code{conStruct.results}
#' @param ... further arguments, accepted for compatibility with the 
#'		\code{print} generic; currently unused
#' @return prints a top-level summary of the conStruct.results, returns nothing
#' @method print conStruct.results
print.conStruct.results <- function(x,...){
	# str() writes the summary itself and returns NULL, so it must not be 
	#	wrapped in print(): that would echo a stray "NULL" after the summary
	utils::str(x,max.level=1)
	# NOTE(review): the NAMESPACE shown in this repo has no 
	#	S3method(print,conStruct.results) entry - confirm whether this method 
	#	is meant to be registered (e.g. via an @export tag)
	return(invisible(NULL))
}
240 | 
# Build the results object for one chain.
# "posterior" holds, for every saved iteration: n.iter, lpd, nuggets, par.cov, 
# gamma, layer.params and admix.proportions. "MAP" holds the same quantities 
# indexed at the iteration with the largest log posterior, except that its 
# first element is renamed "index.iter" and set to that iteration's index.
# The returned list has class "conStruct.results".
get.conStruct.chain.results <- function(data.block,model.fit,chain.no){
	n.iter <- get.n.iter(model.fit,chain.no)
	posterior <- list(
		"n.iter" = model.fit@sim$n_save[chain.no],
		"lpd" = rstan::get_logposterior(model.fit)[[chain.no]],
		"nuggets" = get.nuggets(model.fit,chain.no,data.block$N),
		"par.cov" = get.par.cov(model.fit,chain.no,data.block$N),
		"gamma" = get.gamma(model.fit,chain.no),
		"layer.params" = get.layer.params.list(model.fit,data.block,chain.no,n.iter),
		"admix.proportions" = get.admix.props(model.fit,chain.no,data.block$N,data.block$K)
	)
	MAP.iter <- get.MAP.iter(model.fit,chain.no)
	MAP <- lapply(posterior,index.MAP,MAP.iter)
	# the first slot (n.iter) is repurposed to record which iteration was picked
	names(MAP)[[1]] <- "index.iter"
	MAP[["index.iter"]] <- MAP.iter
	chain.results <- list("posterior" = posterior,"MAP" = MAP)
	return(make.conStruct.results.S3(chain.results))
}


--------------------------------------------------------------------------------
/R/run.conStruct.R:
--------------------------------------------------------------------------------
  1 | #' Run a conStruct analysis.
  2 | #'
  3 | #' \code{conStruct} runs a conStruct analysis of genetic data.
  4 | #'
  5 | #' This function initiates an analysis that uses  
  6 | #' geographic and genetic relationships between samples 
  7 | #' to estimate sample membership (admixture proportions) across 
  8 | #' a user-specified number of layers.
  9 | #'
 10 | #' @param spatial A logical indicating whether to perform a spatial analysis. 
 11 | #' 				  Default is \code{TRUE}. 
 12 | #' @param K An \code{integer} that indicates the number of layers to be 
 13 | #' 				  included in the analysis.
 14 | #' @param freqs A \code{matrix} of allele frequencies with one column per 
 15 | #'				locus and one row per sample.
 16 | #' 				Missing data should be indicated with \code{NA}.
 17 | #' @param geoDist A full \code{matrix} of geographic distance between samples. 
 18 | #'					If \code{NULL}, user can only run the nonspatial model.
 19 | #' @param coords A \code{matrix} giving the longitude and latitude 
 20 | #'					(or X and Y coordinates) of the samples.
 21 | #' @param prefix A character \code{vector} giving the prefix to be attached 
 22 | #'					 to all output files.
 23 | #' @param n.chains An integer indicating the number of MCMC chains to be run 
 24 | #'					in the analysis. Default is 1.
 25 | #' @param n.iter An \code{integer} giving the number of iterations each MCMC 
 26 | #'				 chain is run. Default is 1e3.  If the number of iterations 
 27 | #'				 is greater than 500, the MCMC is thinned so that the number 
 28 | #'				 of retained iterations is 500 (before burn-in).
 29 | #' @param make.figs A \code{logical} value indicating whether to automatically 
 30 | #'					make figures once the analysis is complete. Default is 
 31 | #'					\code{TRUE}.
 32 | #' @param save.files A \code{logical} value indicating whether to automatically 
 33 | #'						save output and intermediate files once the analysis is
 34 | #'						 complete. Default is \code{TRUE}.
 35 | #' @param ... Further options to be passed to rstan::sampling (e.g., adapt_delta).
 36 | #'
 37 | #' @return This function returns a list with one entry for each chain run 
 38 | #'			(specified with \code{n.chains}). The entry for each chain is named 
#'			"chain_X" for the Xth chain.  The components of the entry for each 
#'			chain are detailed below: 
 41 | #'			\itemize{
 42 | #'				\item \code{posterior} gives parameter estimates over the posterior 
 43 | #'						distribution of the MCMC.
 44 | #'					\itemize{
 45 | #'						\item \code{n.iter} number of MCMC iterations retained for 
 46 | #'								analysis (half of the \code{n.iter} argument 
 47 | #'								specified in the function call).
 48 | #'						\item \code{lpd} vector of log posterior density over the retained 
 49 | #'								MCMC iterations.
 50 | #'						\item \code{nuggets} matrix of estimated nugget parameters with 
 51 | #'								one row per MCMC iteration and one column per sample.
 52 | #'						\item \code{par.cov} array of estimated parametric covariance matrices, 
 53 | #'								for which the first dimension is the number of MCMC iterations.
 54 | #'						\item \code{gamma} vector of estimated gamma parameter.
 55 | #'						\item \code{layer.params} list summarizing estimates of layer-specific 
 56 | #'								parameters. There is one entry for each layer specified, and the 
 57 | #'								entry for the kth layer is named "Layer_k".
 58 | #'							\itemize{
 59 | #'								\item \code{alpha0} vector of estimated alpha0 parameter in the 
 60 | #'										kth layer.
 61 | #'								\item \code{alphaD} vector of estimated alphaD parameter in the 
 62 | #'										kth layer.
 63 | #'								\item \code{alpha2} vector of estimated alpha2 parameter in the 
 64 | #'										kth layer.
 65 | #'								\item \code{mu} vector of estimated mu parameter in the 
 66 | #'										kth layer.
 67 | #'								\item \code{layer.cov} vector of estimated layer-specific 
 68 | #'										covariance parameter in the kth layer.
 69 | #'							}
 70 | #'						\item \code{admix.proportions} array of estimated admixture proportions.
 71 | #'								The first dimension is the number of MCMC iterations, 
 72 | #'								the second is the number of samples, 
 73 | #' 								and the third is the number of layers.
 74 | #'					}
 75 | #'			\item \code{MAP} gives point estimates of the parameters listed in the \code{posterior}
 76 | #'								list described above. Values are indexed at the MCMC iteration 
 77 | #'								with the greatest posterior probability.
 78 | #'					\itemize{
 79 | #'						\item \code{index.iter} the iteration of the MCMC with the highest 
 80 | #'								posterior probability, which is used to index all parameters 
 81 | #'								included in the \code{MAP} list
 82 | #'						\item \code{lpd} the greatest value of the posterior probability
 83 | #'						\item \code{nuggets} point estimate of nugget parameters
 84 | #'						\item \code{par.cov} point estimate of parametric covariance
 85 | #'						\item \code{gamma} point estimate of gamma parameter
 86 | #'						\item \code{layer.params} point estimates of all layer-specific parameters 
 87 | #'						\item \code{admix.proportions} point estimates of admixture proportions.
 88 | #'					}
 89 | #'			}
 90 | #'
 91 | #' @details This function acts as a wrapper around a STAN model block determined 
 92 | #'			by the user-specified model (e.g., a spatial model with 3 layers, 
 93 | #'			or a nonspatial model with 5 layers).
 94 | #'			User-specified data are checked for appropriate format and consistent dimensions,
 95 | #'			then formatted into a \code{data.block},
 96 | #'			which is then passed to the STAN model block.
 97 | #'			Along with the \code{conStruct.results} output described above, 
 98 | #'			several objects are saved during the course of a \code{conStruct} call
 99 | #'			(if \code{save.files=TRUE}).
100 | #'			These are the \code{data.block}, which contains all data passed to the STAN model block,
101 | #'			\code{model.fit}, which is unprocessed results of the STAN run in \code{stanfit} format,
102 | #'			and the \code{conStruct.results}, which are saved in the course of the function call
103 | #'			in addition to being returned.
104 | #'			If \code{make.figs=TRUE}, running \code{conStruct} will also generate many output figures, 
105 | #'			which are detailed in the function \code{make.all.the.plots} in this package.
106 | #'
107 | #' @examples
108 | #' # load example dataset
109 | #' data(conStruct.data)
110 | #' 
111 | #' # run example spatial analysis with K=1
112 | #' 	#	
113 | #'	# for this example, make.figs and save.files
114 | #'	#	are set to FALSE, but most users will want them 
115 | #'	#	set to TRUE
116 | #' my.run <- conStruct(spatial = TRUE,
117 | #'			 			K = 1,
118 | #'			 			freqs = conStruct.data$allele.frequencies,
119 | #'			 			geoDist = conStruct.data$geoDist,
120 | #'			 			coords = conStruct.data$coords,
121 | #'			 			prefix = "test",
122 | #'			 			n.chains = 1,
123 | #'			 			n.iter = 1e3,
124 | #'			 			make.figs = FALSE,
125 | #'			 			save.files = FALSE)
126 | #'
127 | #' @import rstan
128 | #' @export
conStruct <- function(spatial=TRUE,K,freqs,geoDist=NULL,coords,prefix="",n.chains=1,n.iter=1e3,make.figs=TRUE,save.files=TRUE,...){
	# snapshot the arguments before any local variable exists, so that 
	# only the values supplied in the call are handed to the checks
	args <- as.list(environment())
	call.check <- check.call(args)
	# format the inputs into the data.block passed to the sampler
	freq.data <- process.freq.data(freqs)
	data.block <- make.data.block(K,freq.data,coords,spatial,geoDist)
	if(save.files){
		save(data.block,file=paste0(prefix,"_data.block.Robj"))
	}
	# sampler settings derived from the requested number of iterations
	refresh.every <- min(floor(n.iter/10),500)
	thin.by <- ifelse(n.iter/500 > 1,floor(n.iter/500),1)
	stan.model <- pick.stan.model(spatial,K)
	model.fit <- rstan::sampling(object = stanmodels[[stan.model]],
								 refresh = refresh.every,
								 data = data.block,
								 iter = n.iter,
								 chains = n.chains,
								 thin = thin.by,
								 save_warmup = FALSE,
								 ...)
	conStruct.results <- get.conStruct.results(data.block,model.fit,n.chains)
	# the data.block is re-saved below after its distances are unstandardized
	data.block <- unstandardize.distances(data.block)
	if(save.files){
		save(data.block,file=paste0(prefix,"_data.block.Robj"))
		save(model.fit,file=paste0(prefix,"_model.fit.Robj"))
		save(conStruct.results,file=paste0(prefix,"_conStruct.results.Robj"))
	}
	if(make.figs){
		make.all.the.plots(conStruct.results,data.block,prefix,layer.colors=NULL)
	}
	return(conStruct.results)
}
157 | 
# Confirms that the data.block carries every required named entry.
# Stops at the first missing entry (checked in the order listed below);
# otherwise invisibly returns a confirmation string.
validate.data.list <- function(data.block){
	# required entry name -> error raised when that entry is absent
	complaints <- c("spatial" = "\nUser must specify a \"spatial\" option\n\n",
					"N" = "\nUser must specify a \"N\"\n\n",
					"K" = "\nUser must specify a \"K\"\n\n",
					"L" = "\nUser must specify a \"L\"\n\n",
					"obsCov" = "\nUser must specify a \"obsCov\"\n\n")
	present <- names(data.block)
	for(entry in names(complaints)){
		if(!(entry %in% present)){
			stop(complaints[[entry]])
		}
	}
	invisible("list elements validated")
}
176 | 
# Checks that N, the rows of obsCov and (when present) the rows of geoDist
# all give the same number of samples; stops otherwise.
validate.n.samples <- function(data.block){
	# nrow() of a missing (NULL) geoDist is NULL, which c() silently drops
	counts <- c(data.block$N,
				nrow(data.block$obsCov),
				nrow(data.block$geoDist))
	disagree <- length(unique(counts)) > 1
	if(disagree){
		stop("\nthe number of samples is not consistent 
				across entries in the data.block\n\n")
	}
	invisible("n.samples validated")
}
189 | 
# Checks that the data.block is consistent with the requested model.
# For a spatial model the geographic distance matrix must be present, 
# free of missing values, and non-negative.
# Stops with an explanatory message on the first failed check; otherwise 
# invisibly returns a confirmation string.
validate.model <- function(data.block){
	if(data.block$spatial){
		if(is.null(data.block$geoDist)){
			stop("\nyou have specified a spatial model,
				  but you have not specified a matrix 
				  of pairwise geographic distances\n\n")
		}
		# missing values must be tested before the sign check: 
		# any(x < 0) evaluates to NA when x contains NAs and no negative 
		# values, which would make if() fail with an uninformative error
		if(any(is.na(data.block$geoDist))){
			stop("\nyou have specified an invalid 
				  distance matrix that contains 
				  non-numeric values\n\n")			
		}
		if(any(data.block$geoDist < 0)){
			stop("\nyou have specified an invalid 
				  distance matrix that contains 
				  negative values\n\n")
		}
	}
	return(invisible("model validated"))
}
210 | 
# Tags the supplied list with the S3 class "data.block" and returns it.
make.data.block.S3 <- function(data.block){
	tagged <- data.block
	class(tagged) <- "data.block"
	return(tagged)
}
216 | 
#' An S3 print method for class data.block
#' 
#' @param x an object of class \code{data.block}
#' @param ... further options, accepted for compatibility with the 
#'				\code{print} generic (currently unused)
#' @return prints a top-level summary of the data.block and 
#'				invisibly returns \code{x}
#' @method print data.block
print.data.block <- function(x,...){
	# str() writes the summary itself and returns NULL, so its value 
	# must not be handed to print() (that emitted a stray "NULL" line)
	utils::str(x,max.level=1)
	invisible(x)
}
226 | 
# Runs all data.block checks, reporting progress with message(), and 
# returns the data.block tagged with its S3 class.
# Stops if a required entry is missing, if sample counts disagree, if 
# the number of loci does not exceed the number of samples, or if the 
# requested model is inconsistent with the data.
validate.data.block <- function(data.block){
	message("\nchecking data.block\n")
	validate.data.list(data.block)
	validate.n.samples(data.block)
	n.samples <- data.block$N
	n.loci <- data.block$L
	message(sprintf("\treading %s samples",n.samples))
	message(sprintf("\treading %s loci",n.loci))
	if(n.loci <= n.samples){
		stop("\nyour data must have a greater number of loci than there are samples\n")
	}
	message("\nchecking specified model\n")
	validate.model(data.block)
	if(data.block$spatial){
		message(sprintf("\nuser has specified a spatial model with %s layer(s)\n",data.block$K))
	} else {
		message(sprintf("\nuser has specified a purely discrete model with %s layer(s)\n",data.block$K))
	}
	return(make.data.block.S3(data.block))
}
247 | 
# Returns the name of the Stan model to run.
#	spatial		logical; TRUE selects the "space_" variant of the model
#	n.layers	number of layers; 1 selects "oneK", more than 1 "multiK"
# Stops if n.layers is less than 1 (previously this case fell through 
# both branches and failed with an "object 'name' not found" error).
pick.stan.model <- function(spatial,n.layers){
	if(n.layers < 1){
		stop("\nthe number of layers must be at least 1\n")
	}
	if(n.layers == 1){
		name <- "oneK"
	} else {
		name <- "multiK"
	}
	if(spatial){
		name <- paste0("space_",name)
	}
	return(name)
}
261 | 
# Tags the supplied list with the S3 class "freq.data" and returns it.
make.freq.data.list.S3 <- function(freq.data){
	tagged <- freq.data
	class(tagged) <- "freq.data"
	return(tagged)
}
267 | 
#' An S3 print method for class freq.data
#' 
#' @param x an object of class \code{freq.data}
#' @param ... further options, accepted for compatibility with the 
#'				\code{print} generic (currently unused)
#' @return prints a top-level summary of the freq.data and 
#'				invisibly returns \code{x}
#' @method print freq.data
print.freq.data <- function(x,...){
	# str() writes the summary itself and returns NULL, so its value 
	# must not be handed to print() (that emitted a stray "NULL" line)
	utils::str(x,max.level=1)
	invisible(x)
}
277 | 
# Returns TRUE when the non-missing values in freqs (the frequencies 
# at one locus) all share a single value, and FALSE otherwise 
# (including when every value is missing).
identify.invar.sites <- function(freqs){
	observed <- freqs[!is.na(freqs)]
	invar <- length(unique(observed)) == 1
	return(invar)
}

# Removes the invariant columns (loci) from the frequency matrix.
drop.invars <- function(freqs){
	invars <- apply(freqs,2,identify.invar.sites)
	# drop=FALSE keeps the result a matrix even when only one locus is 
	# retained; without it the result collapses to a plain vector
	freqs <- freqs[,!invars,drop=FALSE]
	return(freqs)
}
288 | 
# Returns TRUE when every value in freqs (the frequencies at one locus) 
# is missing, and FALSE otherwise.
identify.missing.sites <- function(freqs){
	n.samples <- length(freqs)
	missing <- sum(is.na(freqs)) == n.samples
	return(missing)
}

# Removes the columns (loci) of the frequency matrix at which every 
# sample is missing.
drop.missing <- function(freqs){
	missing <- apply(freqs,2,identify.missing.sites)
	# drop=FALSE keeps the result a matrix even when only one locus is 
	# retained; without it the result collapses to a plain vector
	freqs <- freqs[,!missing,drop=FALSE]
	return(freqs)
}
303 | 
# Computes the sample-by-sample allelic covariance matrix from a 
# frequency matrix with one row per sample and one column per locus.
# Missing frequencies are handled pairwise; the diagonal of the result 
# is set to 0.25.
calc.covariance <- function(freqs){
	loci.by.samples <- t(freqs)
	# per-sample mean frequency across loci, ignoring missing data
	sample.means <- colMeans(loci.by.samples,na.rm=TRUE)
	scaled.cov <- (1 - 1/nrow(freqs)) * stats::cov(loci.by.samples,use="pairwise.complete.obs")
	allelic.covariance <- scaled.cov - 
							(1/2) * outer(sample.means, 1-sample.means, "*") -
							(1/2) * outer(1-sample.means, sample.means, "*") + 1/4
	diag(allelic.covariance) <- 0.25
	return(allelic.covariance)
}
312 | 
# Stops if any eigenvalue of the sample covariance is zero or negative;
# otherwise returns the string "pos.def.check".
pos.def.check <- function(obsCov){
	spectrum <- eigen(obsCov)
	has.nonpositive <- any(spectrum$values <= 0)
	if(has.nonpositive){
		stop("\n\nThe sample covariance is not positive definite. Check to make sure that none of your samples are identical (after dropping missing data). If that does not fix the problem, try dropping the loci or samples with the most missing data.\n\n")
	}
	"pos.def.check"
}
320 | 
# Prepares the allele frequency matrix for analysis: drops invariant 
# and all-missing loci, computes the sample covariance, and checks that 
# it is complete and positive definite.
# Returns a freq.data object holding the filtered frequencies, the 
# covariance and the number of retained loci.
process.freq.data <- function(freqs){
	kept <- drop.invars(freqs)
	kept <- drop.missing(kept)
	obsCov <- calc.covariance(kept)
	if(any(is.na(obsCov))){
		stop("\n\nAfter dropping invariant loci, one or more pairs of samples have no genotyped loci in common, so relatedness between them cannot be assessed.\n\n")
	}
	# called for its side effect: stops if obsCov is not positive definite
	pos.def <- pos.def.check(obsCov)
	freq.data <- make.freq.data.list.S3(list("freqs" = kept,
											 "obsCov" = obsCov,
											 "n.loci" = ncol(kept)))
	return(freq.data)
}
336 | 
# Scales a distance matrix by the standard deviation of its upper 
# triangle. Returns a list with the scaled matrix ("std.D") and the 
# standard deviation used ("stdev.D"); both are NULL when D is NULL.
standardize.distances <- function(D){
	if(is.null(D)){
		return(list("std.D" = NULL,
					"stdev.D" = NULL))
	}
	pairwise <- D[upper.tri(D)]
	scale.by <- stats::sd(pairwise)
	list("std.D" = D/scale.by,
		 "stdev.D" = scale.by)
}
349 | 
# Assembles the data.block list from the number of layers, the 
# processed frequency data, the sample coordinates, the model choice 
# and the (optional) geographic distances, then validates it.
# Distances are stored standardized, alongside the standard deviation 
# used to scale them.
make.data.block <- function(K,freq.data,coords,spatial,geoDist=NULL){
	scaled <- standardize.distances(geoDist)
	freqs <- freq.data$freqs
	var.mean.freqs <- mean(0.5*colMeans(freqs-0.5,na.rm=TRUE)^2 + 0.5*colMeans(1-freqs-0.5,na.rm=TRUE)^2)
	data.block <- list("N" = nrow(coords),
					   "K" = K,
					   "spatial" = spatial,
					   "L" = freq.data$n.loci,
					   "coords" = coords,
					   "obsCov" = freq.data$obsCov,
					   "geoDist" = scaled$std.D,
					   "sd.geoDist" = scaled$stdev.D,
					   "varMeanFreqs" = var.mean.freqs)
	return(validate.data.block(data.block))
}
364 | 
# Runs every per-argument check on the list of arguments captured from 
# a conStruct call. Each check stops on invalid input; if all pass, a 
# confirmation string is returned invisibly.
check.call <- function(args){
	check.spatial.arg(args)
	check.K.arg(args)
	check.freqs.arg(args)
	check.geoDist.arg(args)
	check.coords.arg(args)
	invisible("args checked")
}
373 | 
# Checks that the "spatial" entry of args is a single value equal to 
# TRUE or FALSE. Stops with an explanatory message otherwise.
# Missing (NA), NULL and multi-element values are rejected explicitly; 
# previously they made if() fail with an uninformative error.
check.spatial.arg <- function(args){
	spatial <- args[["spatial"]]
	is.valid <- is.atomic(spatial) && 
					length(spatial) == 1 && 
					!is.na(spatial) && 
					(spatial == TRUE || spatial == FALSE)
	if(!is.valid){
		stop("\nthe \"spatial\" argument must be either TRUE or FALSE\n")
	}
	return(invisible("spatial arg checked"))
}
380 | 
# Checks that the "K" entry of args is a single, non-missing, 
# whole-number value of at least 1. Stops with an explanatory message 
# on the first failed check.
check.K.arg <- function(args){
	K <- args[["K"]]
	if(length(K) > 1){
		stop("\nyou have specified more than one value for the \"K\" argument\n")
	} 
	if(!inherits(K,"numeric") && !inherits(K,"integer")){
		stop("\nyou have specified a non-numeric value for the \"K\" argument\n")
	}
	# zero-length or NA values would otherwise make the comparisons 
	# below fail inside if() with an uninformative error
	if(length(K) == 0 || is.na(K)){
		stop("\nyou have specified a missing value for the \"K\" argument\n")
	}
	if(K %% 1 != 0){
		stop("\nyou have specified a non-integer value for the \"K\" argument\n")
	}
	# no model is defined for fewer than one layer
	if(K < 1){
		stop("\nthe \"K\" argument must be at least 1\n")
	}
	return(invisible("K arg checked"))
}
393 | 
# Checks that the "freqs" entry of args is a matrix whose non-missing 
# values all lie between 0 and 1 (inclusive). Stops with an explanatory 
# message otherwise.
check.freqs.arg <- function(args){
	freqs <- args[["freqs"]]
	if(!inherits(freqs,"matrix")){
		stop("\nthe \"freqs\" argument must be of class \"matrix\"\n")
	}
	# the bounds are inclusive, and the messages now say so 
	# (they also previously contained a duplicated "the")
	if(any(freqs > 1,na.rm=TRUE)){
		stop("\nall values of the \"freqs\" argument must be less than or equal to 1\n")	
	}
	if(any(freqs < 0,na.rm=TRUE)){	
		stop("\nall values of the \"freqs\" argument must be greater than or equal to 0\n")
	}
	return(invisible("freqs arg checked"))
}
406 | 
# Checks the "geoDist" entry of args. It is required when the 
# "spatial" entry is TRUE; when supplied it must be a square, symmetric 
# matrix with no missing and no negative values. 
# Stops with an explanatory message on the first failed check.
check.geoDist.arg <- function(args){
	geoDist <- args[["geoDist"]]
	if(args[["spatial"]]){
		if(is.null(geoDist)){
			stop("\nif the \"spatial\" argument is TRUE, you must specify a \"geoDist\" argument\n")
		}
	}
	if(!is.null(geoDist)){
		if(!inherits(geoDist,"matrix")){
			stop("\nthe \"geoDist\" argument must be of class \"matrix\"\n")
		}
		if(length(unique(dim(geoDist))) > 1){
			stop("\nyou have specified a \"geoDist\" argument with an unequal number of rows and columns\n")	
		}
		# test for missing values before the sign check: any(x < 0) is NA 
		# when x holds NAs and no negative values, which would make if() 
		# fail with an uninformative error
		if(any(is.na(geoDist))){
			stop("\nthe \"geoDist\" argument must not contain missing (NA) values\n")
		}
		# zero distances are valid, so the bound is inclusive
		if(any(geoDist < 0)){
			stop("\nall values of the \"geoDist\" argument must be greater than or equal to 0\n")
		}
		# strip row/column names so that only the values are compared 
		# by the symmetry test
		tmp.geoDist <- unname(geoDist)
		if(!isSymmetric(tmp.geoDist)){	
			stop("\nyou must specify a symmetric matrix for the \"geoDist\" argument \n")
		}
	}
	return(invisible("geoDist arg checked"))
}
432 | 
# Checks that the "coords" entry of args is a matrix with no more than 
# two columns. Stops with an explanatory message otherwise.
check.coords.arg <- function(args){
	coords <- args[["coords"]]
	is.a.matrix <- inherits(coords,"matrix")
	if(!is.a.matrix){
		stop("\nthe \"coords\" argument must be of class \"matrix\"\n")
	}
	too.wide <- ncol(coords) > 2
	if(too.wide){
		stop("\nthe \"coords\" argument must be a matrix with two columns\n")
	}
	invisible("coords arg checked")
}
442 | 


--------------------------------------------------------------------------------
/R/stanmodels.R:
--------------------------------------------------------------------------------
 1 | # Generated by rstantools.  Do not edit by hand.
 2 | 
# names of stan models
# (this character vector is replaced further down by the result of 
# sapply(), whose elements are named after these model names)
stanmodels <- c("multiK", "oneK", "space_multiK", "space_oneK")

# load each stan module
Rcpp::loadModule("stan_fit4multiK_mod", what = TRUE)
Rcpp::loadModule("stan_fit4oneK_mod", what = TRUE)
Rcpp::loadModule("stan_fit4space_multiK_mod", what = TRUE)
Rcpp::loadModule("stan_fit4space_oneK_mod", what = TRUE)

# instantiate each stanmodel object
stanmodels <- sapply(stanmodels, function(model_name) {
  # create C++ code for stan model
  # the .stan file is read from "stan" when that directory exists 
  # relative to the working directory, and from "inst/stan" otherwise
  stan_file <- if(dir.exists("stan")) "stan" else file.path("inst", "stan")
  stan_file <- file.path(stan_file, paste0(model_name, ".stan"))
  stanfit <- rstan::stanc_builder(stan_file,
                                  allow_undefined = TRUE,
                                  obfuscate_model_name = FALSE)
  stanfit$model_cpp <- list(model_cppname = stanfit$model_name,
                            model_cppcode = stanfit$cppcode)
  # create stanmodel object
  # mk_cppmodule looks up the object named "rstantools_model_<model_name>" 
  # when it is called
  methods::new(Class = "stanmodel",
               model_name = stanfit$model_name,
               model_code = stanfit$model_code,
               model_cpp = stanfit$model_cpp,
               mk_cppmodule = function(x) get(paste0("rstantools_model_", model_name)))
})
29 | 


--------------------------------------------------------------------------------
/R/zzz.R:
--------------------------------------------------------------------------------
# Package load hook: loads one module per entry of stanmodels, 
# named "stan_fit4<model name>_mod".
.onLoad <- function(libname, pkgname) {
  module.names <- paste0("stan_fit4", names(stanmodels), "_mod")
  for (module.name in module.names) {
    loadModule(module.name, what = TRUE)
  }
}


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | 
 2 | ## conStruct (continuous Structure) ReadMe
 3 | 
 4 |   <!-- badges: start -->
 5 |   [![R-CMD-check](https://github.com/gbradburd/conStruct/actions/workflows/R-CMD-check.yaml/badge.svg)](https://github.com/gbradburd/conStruct/actions/workflows/R-CMD-check.yaml)
 6 |   <!-- badges: end -->
 7 | 
 8 | 
 9 | This repo contains the code for the method **conStruct** - a statistical tool
10 | for modeling continuous and discrete population genetic structure.
11 | 
12 | The manuscript, data files, and analysis scripts associated with the publication
13 | "Inferring Continuous and Discrete Population Genetic Structure Across Space,"
14 | have been moved, and can be accessed at the links below:
15 | 
16 |  * [paper](https://doi.org/10.1534/genetics.118.301333)
17 |  * [manuscript repo](https://github.com/gbradburd/conStruct-paper)
18 |  * [data dryad](https://doi.org/10.5061/dryad.5qj7h09)
19 | 
20 | ## Installation
21 | 
22 | ### Latest release
23 | 
24 | To install the most recent release of the **conStruct** R package:
25 | 
26 | ```r
27 | install.packages("conStruct")
28 | ```
29 | 
Upon installation, the **conStruct** models will be compiled, which may
print a lot of text, and possibly some warnings, to your screen. This is
totally normal, and you should only be concerned if you get errors
and the installation fails.
34 | 
35 | 
36 | ### Development version
37 | 
38 | To install the development version from github:
39 | 
40 | ```r
41 | 	library(devtools)
42 | 	install_github("gbradburd/conStruct",build_vignettes=TRUE)
43 | ```
44 | 
45 | Note that Windows users may have to download Rtools as a
46 | standalone executable before trying to install the **conStruct** R package.
47 | 
48 | 
49 | ## Getting Started
50 | 
51 | A complete manual for all documented functions is available [here](https://github.com/gbradburd/conStruct/blob/master/man/conStruct-manual.pdf).
52 | 
53 | In addition, there are four vignettes included in the package that walk through
54 | various steps in the analysis pipeline in detail. You can find them using:
55 | 
56 | ```r
57 | # formatting data
58 | 	vignette(topic="format-data",package="conStruct")
59 | 
60 | # how to run a conStruct analysis
61 | 	vignette(topic="run-conStruct",package="conStruct")
62 | 
63 | # how to visualize the output of a conStruct model
64 | 	vignette(topic="visualize-results",package="conStruct")
65 | 
66 | # how to compare and select between different conStruct models
67 | 	vignette(topic="model-comparison",package="conStruct")
68 | ```
69 | 
70 | There is also an example data file included in the package, which you can
71 | load using the command:
72 | 
73 | ```r
74 | 	data(conStruct.data)
75 | ```
76 | 
77 | ## Contact
78 | 
79 | After referring to the manual and vignettes,
80 | please direct all queries to bradburd (at) umich.edu,
81 | or post as issues on the git repo.


--------------------------------------------------------------------------------
/configure:
--------------------------------------------------------------------------------
1 | #! /bin/sh
2 | 
3 | # Generated by rstantools.  Do not edit by hand.
4 | 
5 | "${R_HOME}/bin/Rscript" -e "rstantools::rstan_config()"
6 | 


--------------------------------------------------------------------------------
/configure.win:
--------------------------------------------------------------------------------
1 | #! /bin/sh
2 | 
3 | # Generated by rstantools.  Do not edit by hand.
4 | 
5 | "${R_HOME}/bin${R_ARCH_BIN}/Rscript.exe" -e "rstantools::rstan_config()"
6 | 


--------------------------------------------------------------------------------
/cran-comments.md:
--------------------------------------------------------------------------------
 1 | 
 2 | ## Test environments
 3 | * local OS X install R 4.2.3
 4 | * win-builder
 5 | 
 6 | ## R CMD check results
 7 | There were no ERRORs or WARNINGs
 8 | 
 9 | There were 2 NOTES:
10 | 
11 | * checking for GNU extensions in Makefiles ... NOTE
12 |   GNU make is a SystemRequirements.
13 | 
14 | Explanation: GNU make is required for packages that 
15 | depend on rstan and which are developed using rstantools. 
16 | The requirement is noted in the DESCRIPTION file.
17 | 
18 | * checking installed package size ... NOTE
19 |   installed size is  5.5Mb
20 |   sub-directories of 1Mb or more:
21 |     libs   4.6Mb
22 | 
23 | Explanation: This package has a large installed library 
24 | size because it uses the Stan MCMC library as a backend, 
25 | and the C++ Stan models included in the package are 
26 | compiled upon installation.
27 | 
28 | 
## Downstream dependencies
30 | 
31 | * There are currently no downstream dependencies for this package.


--------------------------------------------------------------------------------
/data/conStruct.data.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gbradburd/conStruct/a41049e475be68fb267996045a47e968c737af92/data/conStruct.data.rda


--------------------------------------------------------------------------------
/data/data.block.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gbradburd/conStruct/a41049e475be68fb267996045a47e968c737af92/data/data.block.rda


--------------------------------------------------------------------------------
/inst/include/stan_meta_header.hpp:
--------------------------------------------------------------------------------
1 | // Insert all #include<foo.hpp> statements here
2 | 


--------------------------------------------------------------------------------
/inst/stan/multiK.stan:
--------------------------------------------------------------------------------
 1 | functions {
 2 | 	matrix admixed_covariance(int N, int K, matrix w_mat, vector nugget, vector phi, real gamma) {
 3 | 		matrix[N,N] parCov;
 4 | 		matrix[N,N] Nug_mat;
 5 | 		parCov = rep_matrix(0,N,N);
 6 | 		Nug_mat = diag_matrix(nugget);
 7 | 		for(k in 1:K){
 8 | 			parCov += tcrossprod(to_matrix(w_mat[,k])) * phi[k];
 9 | 		}
10 | 		parCov += gamma + Nug_mat;
11 | 		return parCov;	
12 | 	}
13 | 	matrix make_w_matrix(int N, int K, array[] vector w){
14 | 		matrix[N,K] w_mat;
15 | 		for(i in 1:N){
16 | 			w_mat[i] = to_row_vector(w[i]);
17 | 		}
18 | 		return w_mat;
19 | 	}
20 | }
21 | data {
22 | 	int<lower=1> K;		  				// number of layers
23 | 	int<lower=2> N; 	  				// number of samples
24 | 	int<lower=N+1> L;	    			// number of loci
25 | 	matrix[N,N] obsCov; 				// observed projected covariance
26 | 	real varMeanFreqs;
27 | }
28 | transformed data {
29 | 	matrix[N,N] LobsCov;				// n.loci multiplied by the sample covariance
30 | 	vector[K] dirConPar;
31 | 	LobsCov  = L * obsCov;
32 | 	dirConPar = rep_vector(0.1,K);
33 | }
34 | parameters {
35 | 	positive_ordered[K] phi;				// shared drift effect in layer k
36 | 	real<lower=0> gamma;				// covariance between all layers
37 |   	vector<lower=0>[N] nugget; 			// sample-specific variance (allele sampling error + sample-specific drift)
38 | 	array[N] simplex[K]    w;    				// every sample (N in total) has a K simplex (i.e. K layers)
39 | }
40 | transformed parameters {
41 | 	matrix[N,N] parCov;					// this specifies the parametric, admixed covariance matrix
42 | 	matrix[N,K] w_mat;
43 | 	w_mat = make_w_matrix(N,K,w);
44 | 	parCov = admixed_covariance(N, K, w_mat, nugget, phi, gamma);
45 | }
46 | model {
47 | 	nugget ~ normal(0,1);										// prior on nugget
48 | 	phi ~ normal(0,1);
49 | 	gamma ~ normal(varMeanFreqs,0.5);
50 | 	for(i in 1:N) w[i] ~ dirichlet(dirConPar);				    // prior on admixture proportions
51 | 	LobsCov ~ wishart(L,parCov);						// likelihood function
52 | }
53 | 


--------------------------------------------------------------------------------
/inst/stan/oneK.stan:
--------------------------------------------------------------------------------
 1 | functions {
 2 | 	matrix Cov(int N, vector nugget, real gamma) {
 3 | 		matrix[N,N] parCov;
 4 | 		matrix[N,N] Nug_mat;
 5 | 		parCov = rep_matrix(gamma,N,N);
 6 | 		Nug_mat = diag_matrix(nugget);
 7 | 		parCov += Nug_mat;
 8 | 		return parCov;	
 9 | 	}
10 | }
11 | data {
12 | 	int<lower=1> K;		  				// number of layers
13 | 	int<lower=2> N; 	  				// number of samples
14 | 	int<lower=N+1> L;	    			// number of loci
15 | 	matrix[N,N] obsCov; 				// observed projected covariance
16 | 	real varMeanFreqs;					// variance in mean frequencies
17 | }
18 | transformed data {
19 | 	matrix[N,N] LobsCov;				// n.loci multiplied by the sample covariance
20 | 	LobsCov  = L * obsCov;
21 | }
22 | parameters {
23 | 	real<lower=0> gamma;				// covariance between all layers
24 |   	vector<lower=0>[N] nugget; 			// sample-specific variance (allele sampling error + sample-specific drift)
25 | }
26 | transformed parameters {
27 | 	matrix[N,N] parCov;					// this specifies the parametric, admixed covariance matrix
28 | 	parCov = Cov(N, nugget, gamma);
29 | }
30 | model {
31 | 	nugget ~ normal(0,1);				// prior on nugget
32 | 	gamma ~ normal(varMeanFreqs,0.5);	// prior on gamma
33 | 	LobsCov ~ wishart(L,parCov);		// likelihood function
34 | }
35 | 


--------------------------------------------------------------------------------
/inst/stan/space_multiK.stan:
--------------------------------------------------------------------------------
 1 | functions {
 2 | 	matrix spCov(int N, real a0, real aD, real a2, matrix D, real phi){
 3 | 		matrix[N,N] cov;
 4 | 		for(i in 1:N){
 5 | 			for(j in i:N){
 6 | 				cov[i,j] = a0 * exp( -(aD* D[i,j])^a2) + phi;
 7 | 				cov[j,i] = cov[i,j];
 8 | 			}
 9 | 		}
10 | 		return cov;
11 | 	}
12 | 	matrix admixed_covariance(int N, int K, vector alpha0, vector alphaD, vector alpha2, matrix geoDist, matrix w_mat, vector nugget, vector phi, real gamma) {
13 | 		matrix[N,N] parCov;
14 | 		matrix[N,N] Nug_mat;
15 | 		parCov = rep_matrix(0,N,N);
16 | 		Nug_mat = diag_matrix(nugget);
17 | 		for(k in 1:K){
18 | 			parCov += tcrossprod(to_matrix(w_mat[,k])) .* spCov(N,alpha0[k],alphaD[k],alpha2[k],geoDist,phi[k]);
19 | 		}
20 | 		parCov += gamma + Nug_mat;
21 | 		return parCov;	
22 | 	}
23 | 	matrix make_w_matrix(int N, int K, array[] vector w){
24 | 		matrix[N,K] w_mat;
25 | 		for(i in 1:N){
26 | 			w_mat[i] = to_row_vector(w[i]);
27 | 		}
28 | 		return w_mat;
29 | 	}
30 | }
31 | data {
32 | 	int<lower=1> K;		  				// number of layers
33 | 	int<lower=2> N; 	  				// number of samples
34 | 	int<lower=N+1> L;	    			// number of loci
35 | 	matrix[N,N] obsCov; 				// observed projected covariance
36 | 	matrix[N, N] geoDist; 				// matrix of pairwise geographic distance
37 | 	real varMeanFreqs;
38 | }
39 | transformed data {
40 | 	matrix[N,N] LobsCov;				// n.loci multiplied by the sample covariance
41 | 	vector[K] dirConPar;
42 | 	LobsCov  = L * obsCov;
43 | 	dirConPar = rep_vector(0.1,K);
44 | }
45 | parameters {
46 | 	vector<lower=0>[K] alpha0;								// sill of the parametric covariance in layer k
47 | 	vector<lower=0>[K] alphaD;								// effect of geographic distance in the parametric covariance in layer k
48 | 	vector<lower=0, upper=2>[K]  alpha2;					// exponential slope parameter in the parametric covariance in layer k
49 | 	positive_ordered[K] phi;									// shared drift effect in layer k
50 |   	vector<lower=0>[N] nugget; 								// sample-specific variance (allele sampling error + sample-specific drift)
51 | 	array[N] simplex[K]    w;    									// every sample (N in total) has a K simplex (i.e. K layers)
52 | 	real<lower=0> gamma;
53 | }
54 | transformed parameters {
55 | 	matrix[N,N] parCov;					// this specifies the parametric, admixed covariance matrix
56 | 	matrix[N,K] w_mat;
57 | 	w_mat = make_w_matrix(N,K,w);
58 | 	parCov = admixed_covariance(N, K, alpha0, alphaD, alpha2, geoDist, w_mat, nugget, phi, gamma);
59 | }
60 | model {
61 | 	alpha0 ~ normal(0,1);										// prior on alpha0
62 | 	alphaD ~ normal(0,1);										// prior on alphaD
63 | 	alpha2 ~ uniform(0,2);										// prior on alpha2
64 | 	nugget ~ normal(0,1);										// prior on nugget
65 | 	phi ~ normal(0,1);											// prior on phi
66 | 	gamma ~ normal(varMeanFreqs,0.5);							// prior on global covariance, centered on varMeanFreqs
67 | 	for(i in 1:N) w[i] ~ dirichlet(dirConPar);		// prior on admixture proportions (same Dirichlet for every sample)
68 | 	LobsCov ~ wishart(L,parCov);					// likelihood function: Wishart with L degrees of freedom and scale matrix parCov
69 | }
70 | 


--------------------------------------------------------------------------------
/inst/stan/space_oneK.stan:
--------------------------------------------------------------------------------
 1 | functions {
 2 | 	matrix spCov(int N, real a0, real aD, real a2, matrix D, vector nugget, real gamma) {	// returns the N x N parametric covariance: a0 * exp(-(aD * D)^a2) + gamma everywhere, plus nugget on the diagonal
 3 | 		matrix[N,N] parCov;									// covariance matrix being built and returned
 4 | 		matrix[N,N] Nug_mat;								// diagonal matrix holding the nugget values
 5 | 		parCov = rep_matrix(0,N,N);
 6 | 		Nug_mat = diag_matrix(nugget);
 7 | 		for(i in 1:N){
 8 | 			for(j in i:N){									// fill the upper triangle (including the diagonal) and mirror each entry
 9 | 				parCov[i,j] = a0 * exp( -(aD * D[i,j])^a2);	// ^ binds tighter than unary minus, so this is exp(-((aD * D[i,j])^a2))
10 | 				parCov[j,i] = parCov[i,j];
11 | 			}
12 | 		}
13 | 		parCov += gamma + Nug_mat;							// gamma is added to every entry; nugget[i] only to entry [i,i]
14 | 		return parCov;	
15 | 	}
16 | }
17 | data {									// inputs passed to the model (the contents of the data.block)
18 | 	int<lower=1> K;		  				// number of layers (declared but not used elsewhere in this single-layer model)
19 | 	int<lower=2> N; 	  				// number of samples
20 | 	int<lower=N+1> L;	    			// number of loci (constrained to be at least N+1)
21 | 	matrix[N,N] obsCov; 				// observed projected covariance
22 | 	matrix[N, N] geoDist; 				// matrix of pairwise geographic distance 
23 | 	real varMeanFreqs;					// variance of the mean allele frequencies; used below as the mean of the prior on gamma
24 | }
25 | transformed data {
26 | 	matrix[N,N] LobsCov;				// n.loci multiplied by the sample covariance
27 | 	LobsCov  = L * obsCov;				// this scaled matrix is the quantity given the Wishart likelihood in the model block
28 | }
29 | parameters {
30 | 	real<lower=0> alpha0;								// sill of the parametric covariance (single layer)
31 | 	real<lower=0> alphaD;								// effect of geographic distance in the parametric covariance (single layer)
32 | 	real<lower=0, upper=2>  alpha2;					// exponential slope parameter in the parametric covariance (single layer)
33 | 	real<lower=0> gamma;				// global covariance, added to every entry of parCov
34 |   	vector<lower=0>[N] nugget; 								// sample-specific variance (allele sampling error + sample-specific drift)
35 | }
36 | transformed parameters {
37 | 	matrix[N,N] parCov;					// this specifies the parametric covariance matrix (single layer, so no admixture proportions enter)
38 | 	parCov = spCov(N, alpha0, alphaD, alpha2, geoDist, nugget, gamma);	// distance-decay covariance plus gamma, with nugget on the diagonal
39 | }
40 | model {
41 | 	alpha0 ~ normal(0,1);										// prior on alpha0
42 | 	alphaD ~ normal(0,1);										// prior on alphaD
43 | 	alpha2 ~ uniform(0,2);										// prior on alpha2
44 | 	nugget ~ normal(0,1);										// prior on nugget
45 | 	gamma ~ normal(varMeanFreqs,0.5);							// prior on global covariance, centered on varMeanFreqs
46 | 	LobsCov ~ wishart(L,parCov);								// likelihood function: Wishart with L degrees of freedom and scale matrix parCov
47 | }
48 | 


--------------------------------------------------------------------------------
/man/calculate.layer.contribution.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/model.comparison.R
 3 | \name{calculate.layer.contribution}
 4 | \alias{calculate.layer.contribution}
 5 | \title{Calculate layer contribution}
 6 | \usage{
 7 | calculate.layer.contribution(conStruct.results, data.block, layer.order = NULL)
 8 | }
 9 | \arguments{
10 | \item{conStruct.results}{The list output by a 
11 | \code{conStruct} run for a given MCMC chain.}
12 | 
13 | \item{data.block}{A \code{data.block} list saved during a 
14 | \code{conStruct} run.}
15 | 
16 | \item{layer.order}{An optional \code{vector} giving the
17 | order in which the layers of \code{conStruct.results} are 
18 |     read.}
19 | }
20 | \value{
21 | This function returns a \code{vector} giving the 
22 | 			relative contributions of the layers 
23 | 			in the analysis.
24 | }
25 | \description{
26 | \code{calculate.layer.contribution}
27 | }
28 | \details{
29 | This function takes the results of a \code{conStruct} 
30 | analysis and calculates the relative contributions of 
31 | each layer to total covariance.
32 | 
33 | This function calculates the contribution of each layer to
34 | 		total covariance by multiplying the within-layer covariance 
35 | 		in a given layer by the admixture proportions samples draw 
36 | 		from that layer. The relative contribution of that layer 
37 | 		is this absolute contribution divided by the sum of those of 
38 | 		all other layers. 
39 | 			A layer can have a large contribution if many samples draw 
40 | 		large amounts of admixture from it, or if it has a very large 
41 | 		within-layer covariance parameter (phi), or some combination 
42 | 		of the two. Layer contribution can be useful for evaluating 
43 | 		an appropriate level of model complexity for the data (e.g., 
44 | 		choosing a value of \code{K} or comparing the spatial and 
45 | 		nonspatial models).
46 | }
47 | 


--------------------------------------------------------------------------------
/man/compare.two.runs.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/plot.output.R
 3 | \name{compare.two.runs}
 4 | \alias{compare.two.runs}
 5 | \title{Compare two conStruct runs}
 6 | \usage{
 7 | compare.two.runs(
 8 |   conStruct.results1,
 9 |   data.block1,
10 |   conStruct.results2,
11 |   data.block2,
12 |   prefix,
13 |   layer.colors = NULL
14 | )
15 | }
16 | \arguments{
17 | \item{conStruct.results1}{The list output by a 
18 | \code{conStruct} run.}
19 | 
20 | \item{data.block1}{A \code{data.block} list saved during a 
21 | \code{conStruct} run.}
22 | 
23 | \item{conStruct.results2}{The list output by a second
24 | \code{conStruct} run.}
25 | 
26 | \item{data.block2}{A \code{data.block} list saved during a 
27 | second \code{conStruct} run.}
28 | 
29 | \item{prefix}{A character vector to be prepended to all figures.}
30 | 
31 | \item{layer.colors}{A \code{vector} of colors to be used in 
32 | plotting results for different layers. Users must 
33 | specify one color per layer.  If \code{NULL}, plots 
34 | will use a pre-specified vector of colors.}
35 | }
36 | \value{
37 | This function has only invisible return values.
38 | }
39 | \description{
40 | \code{compare.two.runs} makes figures comparing the output 
41 | 	from two conStruct analyses.
42 | }
43 | \details{
44 | This function takes the outputs from two conStruct analyses and 
45 | generates a number of plots for comparing results and 
46 | diagnosing MCMC performance.
47 | 
48 | This function produces a variety of plots that can be 
49 | useful for comparing results from two \code{conStruct} analyses.
50 |  The runs must have the same number of independent MCMC chains, 
51 | 	but may have different values of \code{K}. The spatial and 
52 | nonspatial models can be compared. If the runs were executed 
53 | with different values of \code{K}, the run with the smaller 
54 | value of \code{K} should be specified in the first set of 
55 | arguments (\code{conStruct.results1} and \code{data.block1}).
56 | 
57 |  The plots made are by no means exhaustive, and users are 
58 | 	encouraged to make further plots, or customize these plots as they 
59 | see fit.  For each plot, one file is generated for each MCMC chain 
60 | in each analysis (specified with the \code{n.chains} argument in 
61 | the function \code{conStruct}). For clarity, the layers in the second 
62 | 	run are matched to those in the first using the function 
63 | \code{match.layers.x.runs}. The plots generated (as .pdf files) are:
64 | \itemize{
65 | 	\item Structure plot - STRUCTURE-style plot, where each sample 
66 | 		is represented as a stacked bar plot, and the length of the 
67 | 		bar plot segments of each color represent that sample's 
68 | 		admixture proportion in that layer. Described further 
69 | 		in the help page for \code{make.structure.plot}.
70 | 	\item Admixture pie plot - A map of samples in which each sample's 
71 | 			location is denoted with a pie chart, and the proportion 
72 | 			of a pie chart of each color represents that sample's 
73 | 			admixture in each layer. Described further in the help 
74 | 			page for \code{make.admix.pie.plot}
75 | 	\item model.fit.CIs - A plot of the sample allelic covariance 
76 | 		shown with the 95\% credible interval of the parametric 
77 | 		covariance for each entry in the matrix.
78 | 	\item layer.covariances - A plot of the layer-specific 
79 | 			covariances overlaid onto the sample allelic covariance.
80 | 	\item Trace plots - Plots of parameter values over the MCMC.
81 | 	\itemize{
82 | 		\item lpd - A plot of the log posterior probability over the MCMC.
83 | 		\item nuggets - A plot of estimates of the nugget parameters 
84 | 			over the MCMC.
85 | 		\item gamma - A plot of estimates of the gamma parameter 
86 | 			over the MCMC.
87 | 		\item layer.cov.params - Plots of estimates of the 
88 | 			layer-specific parameters over the MCMC.
89 | 		\item admix.props - A plot of estimates of the admixture proportions 
90 | 			over the MCMC.
91 | 	}
92 | }
93 | }
94 | 


--------------------------------------------------------------------------------
/man/conStruct-manual.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gbradburd/conStruct/a41049e475be68fb267996045a47e968c737af92/man/conStruct-manual.pdf


--------------------------------------------------------------------------------
/man/conStruct-package.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/conStruct-package.R
 3 | \docType{package}
 4 | \name{conStruct-package}
 5 | \alias{conStruct-package}
 6 | \title{The 'conStruct' package.}
 7 | \description{
 8 | A method for modeling genetic data as a combination of discrete
 9 |    layers, within each of which relatedness may decay continuously with geographic
10 |    distance. This package contains code for running analyses (which are implemented
11 |    in the Stan modeling language via 'rstan') and visualizing and interpreting output. See the
12 |    associated paper for more details on the model and its utility.
13 | }
14 | \references{
15 | G.S. Bradburd, G.M. Coop, and P.L. Ralph (2018) <doi: 10.1534/genetics.118.301333>.
16 | 
17 | Stan Development Team (2018). RStan: the R interface to Stan. R package version 2.17.3. http://mc-stan.org
18 | }
19 | 


--------------------------------------------------------------------------------
/man/conStruct.Rd:
--------------------------------------------------------------------------------
  1 | % Generated by roxygen2: do not edit by hand
  2 | % Please edit documentation in R/run.conStruct.R
  3 | \name{conStruct}
  4 | \alias{conStruct}
  5 | \title{Run a conStruct analysis.}
  6 | \usage{
  7 | conStruct(
  8 |   spatial = TRUE,
  9 |   K,
 10 |   freqs,
 11 |   geoDist = NULL,
 12 |   coords,
 13 |   prefix = "",
 14 |   n.chains = 1,
 15 |   n.iter = 1000,
 16 |   make.figs = TRUE,
 17 |   save.files = TRUE,
 18 |   ...
 19 | )
 20 | }
 21 | \arguments{
 22 | \item{spatial}{A logical indicating whether to perform a spatial analysis. 
 23 | Default is \code{TRUE}.}
 24 | 
 25 | \item{K}{An \code{integer} that indicates the number of layers to be 
 26 | included in the analysis.}
 27 | 
 28 | \item{freqs}{A \code{matrix} of allele frequencies with one column per 
 29 | locus and one row per sample.
 30 |     Missing data should be indicated with \code{NA}.}
 31 | 
 32 | \item{geoDist}{A full \code{matrix} of geographic distance between samples. 
 33 | If \code{NULL}, user can only run the nonspatial model.}
 34 | 
 35 | \item{coords}{A \code{matrix} giving the longitude and latitude 
 36 | (or X and Y coordinates) of the samples.}
 37 | 
 38 | \item{prefix}{A character \code{vector} giving the prefix to be attached 
 39 | to all output files.}
 40 | 
 41 | \item{n.chains}{An integer indicating the number of MCMC chains to be run 
 42 | in the analysis. Default is 1.}
 43 | 
 44 | \item{n.iter}{An \code{integer} giving the number of iterations each MCMC 
 45 | chain is run. Default is 1e3.  If the number of iterations 
 46 | is greater than 500, the MCMC is thinned so that the number 
 47 | of retained iterations is 500 (before burn-in).}
 48 | 
 49 | \item{make.figs}{A \code{logical} value indicating whether to automatically 
 50 | make figures once the analysis is complete. Default is 
 51 | \code{TRUE}.}
 52 | 
 53 | \item{save.files}{A \code{logical} value indicating whether to automatically 
 54 | save output and intermediate files once the analysis is
 55 |  complete. Default is \code{TRUE}.}
 56 | 
 57 | \item{...}{Further options to be passed to rstan::sampling (e.g., adapt_delta).}
 58 | }
 59 | \value{
 60 | This function returns a list with one entry for each chain run 
 61 | 		(specified with \code{n.chains}). The entry for each chain is named 
62 | 		"chain_X" for the Xth chain.  The components of the entry for each 
63 | 		chain are detailed below: 
 64 | 		\itemize{
 65 | 			\item \code{posterior} gives parameter estimates over the posterior 
 66 | 					distribution of the MCMC.
 67 | 				\itemize{
 68 | 					\item \code{n.iter} number of MCMC iterations retained for 
 69 | 							analysis (half of the \code{n.iter} argument 
 70 | 							specified in the function call).
 71 | 					\item \code{lpd} vector of log posterior density over the retained 
 72 | 							MCMC iterations.
 73 | 					\item \code{nuggets} matrix of estimated nugget parameters with 
 74 | 							one row per MCMC iteration and one column per sample.
 75 | 					\item \code{par.cov} array of estimated parametric covariance matrices, 
 76 | 							for which the first dimension is the number of MCMC iterations.
 77 | 					\item \code{gamma} vector of estimated gamma parameter.
 78 | 					\item \code{layer.params} list summarizing estimates of layer-specific 
 79 | 							parameters. There is one entry for each layer specified, and the 
 80 | 							entry for the kth layer is named "Layer_k".
 81 | 						\itemize{
 82 | 							\item \code{alpha0} vector of estimated alpha0 parameter in the 
 83 | 									kth layer.
 84 | 							\item \code{alphaD} vector of estimated alphaD parameter in the 
 85 | 									kth layer.
 86 | 							\item \code{alpha2} vector of estimated alpha2 parameter in the 
 87 | 									kth layer.
 88 | 							\item \code{mu} vector of estimated mu parameter in the 
 89 | 									kth layer.
 90 | 							\item \code{layer.cov} vector of estimated layer-specific 
 91 | 									covariance parameter in the kth layer.
 92 | 						}
 93 | 					\item \code{admix.proportions} array of estimated admixture proportions.
 94 | 							The first dimension is the number of MCMC iterations, 
 95 | 							the second is the number of samples, 
 96 | 								and the third is the number of layers.
 97 | 				}
 98 | 		\item \code{MAP} gives point estimates of the parameters listed in the \code{posterior}
 99 | 							list described above. Values are indexed at the MCMC iteration 
100 | 							with the greatest posterior probability.
101 | 				\itemize{
102 | 					\item \code{index.iter} the iteration of the MCMC with the highest 
103 | 							posterior probability, which is used to index all parameters 
104 | 							included in the \code{MAP} list
105 | 					\item \code{lpd} the greatest value of the posterior probability
106 | 					\item \code{nuggets} point estimate of nugget parameters
107 | 					\item \code{par.cov} point estimate of parametric covariance
108 | 					\item \code{gamma} point estimate of gamma parameter
109 | 					\item \code{layer.params} point estimates of all layer-specific parameters 
110 | 					\item \code{admix.proportions} point estimates of admixture proportions.
111 | 				}
112 | 		}
113 | }
114 | \description{
115 | \code{conStruct} runs a conStruct analysis of genetic data.
116 | }
117 | \details{
118 | This function initiates an analysis that uses  
119 | geographic and genetic relationships between samples 
120 | to estimate sample membership (admixture proportions) across 
121 | a user-specified number of layers.
122 | 
123 | This function acts as a wrapper around a STAN model block determined 
124 | 		by the user-specified model (e.g., a spatial model with 3 layers, 
125 | 		or a nonspatial model with 5 layers).
126 | 		User-specified data are checked for appropriate format and consistent dimensions,
127 | 		then formatted into a \code{data.block},
128 | 		which is then passed to the STAN model block.
129 | 		Along with the \code{conStruct.results} output described above, 
130 | 		several objects are saved during the course of a \code{conStruct} call
131 | 		(if \code{save.files=TRUE}).
132 | 		These are the \code{data.block}, which contains all data passed to the STAN model block,
133 | 		\code{model.fit}, which is unprocessed results of the STAN run in \code{stanfit} format,
134 | 		and the \code{conStruct.results}, which are saved in the course of the function call
135 | 		in addition to being returned.
136 | 		If \code{make.figs=TRUE}, running \code{conStruct} will also generate many output figures, 
137 | 		which are detailed in the function \code{make.all.the.plots} in this package.
138 | }
139 | \examples{
140 | # load example dataset
141 | data(conStruct.data)
142 | 
143 | # run example spatial analysis with K=1
144 | 	#	
145 | # for this example, make.figs and save.files
146 | #	are set to FALSE, but most users will want them 
147 | #	set to TRUE
148 | my.run <- conStruct(spatial = TRUE,
149 | 		 			K = 1,
150 | 		 			freqs = conStruct.data$allele.frequencies,
151 | 		 			geoDist = conStruct.data$geoDist,
152 | 		 			coords = conStruct.data$coords,
153 | 		 			prefix = "test",
154 | 		 			n.chains = 1,
155 | 		 			n.iter = 1e3,
156 | 		 			make.figs = FALSE,
157 | 		 			save.files = FALSE)
158 | 
159 | }
160 | 


--------------------------------------------------------------------------------
/man/conStruct.data.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/data.R
 3 | \docType{data}
 4 | \name{conStruct.data}
 5 | \alias{conStruct.data}
 6 | \title{Example dataset used in a \code{conStruct} analysis}
 7 | \format{
 8 | A list with two elements:
 9 | \describe{
10 | 	\item{allele.frequencies}{a matrix with one row for each of 
11 | 		the 16 samples and one column for each of 10,000 loci, 
12 | 		giving the frequency of the counted allele at each locus 
13 | 		in each sample}
14 | 	\item{coords}{a matrix with one row for each of the 16 samples, 
15 | 		in the same order as that of the allele frequency matrix, 
16 | 		and two columns, the first giving the x-coordinate 
17 | 		(or longitude), the second giving the y-coordinate (or latitude)}
18 | }
19 | }
20 | \usage{
21 | conStruct.data
22 | }
23 | \description{
24 | A simulated dataset containing the allele frequency 
25 | and sampling coordinate data necessary to run a 
26 | \code{conStruct} analysis.
27 | }
28 | \keyword{datasets}
29 | 


--------------------------------------------------------------------------------
/man/data.block.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/data.R
 3 | \docType{data}
 4 | \name{data.block}
 5 | \alias{data.block}
 6 | \title{Example \code{data.block} generated by a \code{conStruct} analysis}
 7 | \format{
 8 | A list with 9 elements:
 9 | \describe{
10 | 	\item{\code{N}}{the number of samples included in the analysis}
11 | 	\item{\code{K}}{the number of clusters/layers included in the model}
12 | 	\item{\code{spatial}}{a boolean indicating whether the spatial
13 | 		model has been specified}
14 | 	\item{\code{L}}{the number of loci included in the analysis}
15 | 	\item{\code{coords}}{a matrix with one row for each of the \code{N} samples, 
16 | 		in the same order as that of the \code{obsCov} matrix, 
17 | 		and two columns, the first giving the x-coordinate 
18 | 		(or longitude), the second giving the y-coordinate (or latitude)}
19 | 	\item{\code{obsCov}}{the sample allelic covariance matrix, 
20 | 		in the same order as that of the \code{coords} matrix, 
21 | 		with \code{N} rows and columns}
22 | 	\item{\code{geoDist}}{a matrix of pairwise geographic distance between 
23 | 		samples, in the same order as that of the \code{obsCov} matrix, 
24 | 		with \code{N} rows and columns}
25 | 	\item{\code{sd.geoDist}}{the standard deviation of the raw geographic 
26 | 		distance matrix, used for normalizing \code{geoDist} within the 
27 | 		stan model}
28 | 	\item{\code{varMeanFreqs}}{the variance of the mean allele frequencies, 
29 | 		averaged over choice of counted allele (passed to the model 
30 | 		as a prior on the global covariance parameter)}
31 | }
32 | }
33 | \usage{
34 | data.block
35 | }
36 | \description{
37 | An example \code{data.block} object generated in a \code{conStruct} 
38 | analysis from the raw data supplied by the user. This object is 
39 | automatically saved and is used in several subsequent plotting functions.
40 | }
41 | \keyword{datasets}
42 | 


--------------------------------------------------------------------------------
/man/make.admix.pie.plot.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/plot.output.R
 3 | \name{make.admix.pie.plot}
 4 | \alias{make.admix.pie.plot}
 5 | \title{Make admixture pie plot}
 6 | \usage{
 7 | make.admix.pie.plot(
 8 |   admix.proportions,
 9 |   coords,
10 |   layer.colors = NULL,
11 |   radii = 2.7,
12 |   add = FALSE,
13 |   x.lim = NULL,
14 |   y.lim = NULL,
15 |   mar = c(2, 2, 2, 2)
16 | )
17 | }
18 | \arguments{
19 | \item{admix.proportions}{A \code{matrix} of admixture proportions, 
20 | with one row per sample and one column per layer.}
21 | 
22 | \item{coords}{\code{matrix} of sample coordinates, with one row 
23 | per sample and two columns giving (respectively) the X 
24 | and Y plotting coordinates.}
25 | 
26 | \item{layer.colors}{A \code{vector} of colors to be used in 
27 | plotting results for different layers. Users must 
28 | specify one color per layer.  If \code{NULL}, the plot 
29 | will use a pre-specified vector of colors.}
30 | 
31 | \item{radii}{A \code{vector} of numeric values giving the radii to be 
32 | used in plotting admixture pie plots. If the number of values 
33 | specified is smaller than the number of samples, radii values 
34 | will be recycled across samples. The default is 2.7.}
35 | 
36 | \item{add}{A \code{logical} value indicating whether to add the pie plots 
37 | to an existing plot.  Default is \code{FALSE}.}
38 | 
39 | \item{x.lim}{A \code{vector} giving the x limits of the plot. The default
40 | value is \code{NULL}, which indicates that the range of values 
41 | given in the first column of \code{coords} should be used.}
42 | 
43 | \item{y.lim}{A \code{vector} giving the y limits of the plot. The default
44 | value is \code{NULL}, which indicates that the range of values 
45 | given in the second column of \code{coords} should be used.}
46 | 
47 | \item{mar}{A \code{vector} giving the number of lines of margin specified 
48 | for the four sides of the plotting window (passed to \code{par}). 
49 | Default value, which is only used if \code{add=FALSE}, is 
50 | \code{c(2,2,2,2)}.}
51 | }
52 | \value{
53 | This function has only invisible return values.
54 | }
55 | \description{
56 | \code{make.admix.pie.plot} makes a map of pie plots showing admixture 
57 | proportions across layers.
58 | }
59 | \details{
60 | This function takes the output from a conStruct analysis and 
61 | makes a map of pie plots showing admixture proportions across layers, 
62 | where each sample is represented as a pie chart, and the proportion of 
63 | the pie of each color represents that sample's 
64 | admixture proportion in that layer.
65 | }
66 | \examples{
67 | \dontshow{
68 | 	admix.props <- matrix(c(0.086,0.000,0.500,0.505,0.099,0.052,0.024,0.007,0.800,0.000,0.216,0.744,0.917,0.199,0.469,0.000,0.783,0.298,0.329,0.446,0.000,0.000,0.637,0.903,0.000,0.000,0.000,0.012,0.021,0.000,0.000,0.089,0.000,0.554,0.002,0.000,0.000,0.095,0.020,0.001,0.001,0.011,0.000,0.200,0.000,0.060,0.053,0.082,0.036,0.013,0.000,0.062,0.169,0.137,0.029,0.001,0.000,0.178,0.079,0.000,0.999,1.000,0.988,0.979,0.975,1.000,0.744,0.984,0.435,0.998,0.914,1.000,0.405,0.475,0.900,0.947,0.965,0.993,0.000,1.000,0.725,0.203,0.000,0.765,0.518,1.000,0.154,0.533,0.534,0.525,0.999,1.000,0.185,0.018,1.000,0.001,0.000,0.000,0.000,0.025,0.000,0.167,0.016,0.012,0.000),ncol=3)
69 | 	coords <- matrix(c(-126.38,-125.23,-126.97,-128.54,-126.95,-121.71,-126.79,-123.38,-137.88,-125.82,-122.94,-130.73,-123.08,-122.84,-128.58,-124.82,-129.75,-122.25,-122.32,-129.10,-125.28,-123.98,-133.35,-131.74,-124.16,-146.35,-94.63,-149.02,-111.50,-126.67,-133.77,-118.63,-115.78,-113.42,-135.33,52.40,49.84,54.66,54.65,51.69,49.44,52.82,50.05,59.52,51.34,45.81,56.81,44.71,50.24,54.14,51.04,56.68,52.98,54.04,55.34,50.64,50.23,58.76,57.30,50.54,64.90,56.35,63.87,56.92,65.23,68.38,54.75,60.80,50.82,60.70),ncol=2)
70 | }	
71 | # make admixture pie plot
72 | make.admix.pie.plot(admix.proportions = admix.props,coords = coords)
73 | 
74 | }
75 | 


--------------------------------------------------------------------------------
/man/make.all.the.plots.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/plot.output.R
 3 | \name{make.all.the.plots}
 4 | \alias{make.all.the.plots}
 5 | \title{Make output plots}
 6 | \usage{
 7 | make.all.the.plots(conStruct.results, data.block, prefix, layer.colors = NULL)
 8 | }
 9 | \arguments{
10 | \item{conStruct.results}{The list output by a 
11 | \code{conStruct} run.}
12 | 
13 | \item{data.block}{A \code{data.block} list saved during a 
14 | \code{conStruct} run.}
15 | 
16 | \item{prefix}{A character vector to be prepended to all figures.}
17 | 
18 | \item{layer.colors}{A \code{vector} of colors to be used in 
19 | plotting results for different layers. Users must 
20 | specify one color per layer.  If \code{NULL}, plots 
21 | will use a pre-specified vector of colors.}
22 | }
23 | \value{
24 | This function has only invisible return values.
25 | }
26 | \description{
27 | \code{make.all.the.plots} makes figures from the output from a 
28 | 	conStruct analysis.
29 | }
30 | \details{
31 | This function takes the output from a conStruct analysis and 
32 | generates a number of plots for visualizing results and 
33 | diagnosing MCMC performance.
34 | 
35 | This function produces a variety of plots that can be 
36 | useful for visualizing results or diagnosing MCMC performance. 
37 |  The plots made are by no means exhaustive, and users are 
38 | 	encouraged to make further plots, or customize these plots as they 
39 | see fit.  For each plot, one file is generated for each MCMC chain 
40 | (specified with the \code{n.chains} argument in the function 
41 | \code{conStruct}). The plots generated (as .pdf files) are:
42 | \itemize{
43 | 	\item Structure plot - STRUCTURE-style plot, where each sample 
44 | 		is represented as a stacked bar plot, and the length of the 
45 | 		bar plot segments of each color represent that sample's 
46 | 		admixture proportion in that layer. Described further 
47 | 		in the help page for \code{make.structure.plot}.
48 | 	\item Admixture pie plot - A map of samples in which each sample's 
49 | 			location is denoted with a pie chart, and the proportion 
50 | 			of a pie chart of each color represents that sample's 
51 | 			admixture in each layer. Described further in the help 
52 | 			page for \code{make.admix.pie.plot}
53 | 	\item model.fit.CIs - A plot of the sample allelic covariance 
54 | 		shown with the 95\% credible interval of the parametric 
55 | 		covariance for each entry in the matrix.
56 | 	\item layer.covariances - A plot of the layer-specific 
57 | 			covariances overlaid onto the sample allelic covariance.
58 | 	\item Trace plots - Plots of parameter values over the MCMC.
59 | 	\itemize{
60 | 		\item lpd - A plot of the log posterior probability over the MCMC.
61 | 		\item nuggets - A plot of estimates of the nugget parameters 
62 | 			over the MCMC.
63 | 		\item gamma - A plot of estimates of the gamma parameter 
64 | 			over the MCMC.
65 | 		\item layer.cov.params - Plots of estimates of the 
66 | 			layer-specific parameters over the MCMC.
67 | 		\item admix.props - A plot of estimates of the admixture proportions 
68 | 			over the MCMC.
69 | 	}
70 | }
71 | }
72 | 


--------------------------------------------------------------------------------
/man/make.structure.plot.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/plot.output.R
 3 | \name{make.structure.plot}
 4 | \alias{make.structure.plot}
 5 | \title{Make STRUCTURE output plot}
 6 | \usage{
 7 | make.structure.plot(
 8 |   admix.proportions,
 9 |   mar = c(2, 4, 2, 2),
10 |   sample.order = NULL,
11 |   layer.order = NULL,
12 |   sample.names = NULL,
13 |   sort.by = NULL,
14 |   layer.colors = NULL
15 | )
16 | }
17 | \arguments{
18 | \item{admix.proportions}{A \code{matrix} of admixture proportions, 
19 | with one row per sample and one column per layer.}
20 | 
21 | \item{mar}{A \code{vector} of plotting margins passed to \code{par}.
22 | Default is \code{c(2,4,2,2)}, which tends to look good.}
23 | 
24 | \item{sample.order}{A \code{vector} giving the order in which sample 
25 | admixture proportions are to be plotted, left to right.  If 
26 | \code{NULL}, samples are plotted in the order they occur in 
27 | \code{admix.proportions}.}
28 | 
29 | \item{layer.order}{A \code{vector} giving the order in which layers 
30 | are plotted, bottom to top. If \code{NULL}, layers are plotted 
31 | in the order they occur in \code{admix.proportions}.}
32 | 
33 | \item{sample.names}{Vector of names to be plotted under each sample's 
34 | admixture proportion bar plot. The index of a sample's name 
35 | should be the same as the index of the sample's row in 
36 | \code{admix.proportions}. If \code{NULL}, no names 
37 | are printed.}
38 | 
39 | \item{sort.by}{An \code{integer} giving the column index of the \code{admix.proportions} 
40 | matrix to be used in determining sample plotting order.  If specified, 
41 | samples are plotted from left to right in increasing order of their 
42 | membership in that layer.  If \code{NULL}, samples are plotted 
43 | in the order they occur in \code{admix.proportions}.}
44 | 
45 | \item{layer.colors}{A \code{vector} of colors to be used in plotting 
46 | results for different layers. Users must specify one 
47 | color per layer.  If \code{NULL}, the plot will use 
48 | a pre-specified vector of colors.}
49 | }
50 | \value{
51 | This function has only invisible return values.
52 | }
53 | \description{
54 | \code{make.structure.plot} makes a STRUCTURE-style plot from the output from a 
55 | 	conStruct analysis.
56 | }
57 | \details{
58 | This function takes the output from a conStruct analysis and 
59 | makes a STRUCTURE-style plot, where each sample 
60 | is represented as a stacked bar plot, and the length of the 
61 | bar plot segments of each color represent that sample's 
62 | admixture proportion in that layer.
63 | }
64 | \examples{
65 | \dontshow{
66 | 	admix.props <- matrix(c(0.086,0.000,0.500,0.505,0.099,0.052,0.024,0.007,0.800,0.000,0.216,0.744,0.917,0.199,0.469,0.000,0.783,0.298,0.329,0.446,0.000,0.000,0.637,0.903,0.000,0.000,0.000,0.012,0.021,0.000,0.000,0.089,0.000,0.554,0.002,0.000,0.000,0.095,0.020,0.001,0.001,0.011,0.000,0.200,0.000,0.060,0.053,0.082,0.036,0.013,0.000,0.062,0.169,0.137,0.029,0.001,0.000,0.178,0.079,0.000,0.999,1.000,0.988,0.979,0.975,1.000,0.744,0.984,0.435,0.998,0.914,1.000,0.405,0.475,0.900,0.947,0.965,0.993,0.000,1.000,0.725,0.203,0.000,0.765,0.518,1.000,0.154,0.533,0.534,0.525,0.999,1.000,0.185,0.018,1.000,0.001,0.000,0.000,0.000,0.025,0.000,0.167,0.016,0.012,0.000),ncol=3)
67 | }	
68 | # make STRUCTURE-style plot
69 | 	make.structure.plot(admix.proportions = admix.props)
70 | 
71 | # make STRUCTURE-style plot, sorted by membership in layer 1
72 | make.structure.plot(admix.proportions = admix.props,sort.by=1) 
73 | 
74 | }
75 | 


--------------------------------------------------------------------------------
/man/match.layers.x.runs.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/model.comparison.R
 3 | \name{match.layers.x.runs}
 4 | \alias{match.layers.x.runs}
 5 | \title{Match layers up across independent conStruct runs}
 6 | \usage{
 7 | match.layers.x.runs(admix.mat1, admix.mat2, admix.mat1.order = NULL)
 8 | }
 9 | \arguments{
10 | \item{admix.mat1}{A \code{matrix} of estimated admixture proportions
11 | from the original \code{conStruct} analysis, with one row 
12 | per sample and one column per layer.}
13 | 
14 | \item{admix.mat2}{A \code{matrix} of estimated admixture proportions
15 | from a second \code{conStruct} analysis, with one row per 
16 | sample and one column per layer, for which the 
17 | layer order is desired. Must have at least as many 
18 | layers as \code{admix.mat1}.}
19 | 
20 | \item{admix.mat1.order}{An optional \code{vector} giving the
21 | order in which the layers of \code{admix.mat1} are read.}
22 | }
23 | \value{
24 | This function returns a \code{vector} giving the ordering 
25 | 			of the layers in \code{admix.mat2} that maximizes 
26 | 			similarity between \code{admix.mat1} and re-ordered 
27 | 		\code{admix.mat2}.
28 | }
29 | \description{
30 | \code{match.layers.x.runs} matches up layers across independent conStruct runs
31 | }
32 | \details{
33 | This function takes the results of two independent
34 | \code{conStruct} analyses and compares them to identify 
35 | which layers in a new analysis correspond most closely 
36 | to the layers from an original analysis.
37 | 
38 | This function compares admixture proportions in layers across 
39 | 		independent \code{conStruct} runs in order to 
40 | 			identify the layers in \code{admix.mat2} that correspond most 
41 | 		closely to those in \code{admix.mat1}. It then returns a vector 
42 | 			giving an ordering of \code{admix.mat2} that matches up the order
43 | 			of the layers that correspond to each other.  This can be useful 
44 | 			for:
45 | 		\enumerate{
46 | 			\item Dealing with "label switching" across independent runs 
47 | 				with the same number of layers; 
48 | 			\item Plotting results from independent runs with different 
49 | 				numbers of layers using consistent colors
50 | 					(e.g., the "blue" layer shows up as blue even as 
51 | 				\code{K} increases); 
52 | 				\item Examining results for multimodality (i.e., multiple 
53 | 				distinct solutions with qualitatively different patterns
54 | 				of membership across layers).
55 | 			}
56 | 			The \code{admix.mat1.order} argument can be useful when running 
57 | 			this function to sync up plotting colors/order across the output 
58 | 			of more than two \code{conStruct} runs.
59 | }
60 | \examples{
61 | \dontshow{
62 | 	admix.props1 <- matrix(c(0.09,0.00,0.50,0.51,0.10,0.05,0.02,0.01,0.80,0.00,0.22,0.74,0.92,0.20,0.47,0.00,0.78,0.30,0.33,0.45,0.00,0.00,0.64,0.90,0.00,0.00,0.00,0.01,0.02,0.00,0.00,0.09,0.00,0.55,0.00,0.00,0.00,0.09,0.02,0.00,0.00,0.01,0.00,0.20,0.00,0.06,0.05,0.08,0.04,0.01,0.00,0.06,0.17,0.14,0.03,0.00,0.00,0.18,0.08,0.00,1.00,1.00,0.99,0.98,0.98,1.00,0.74,0.98,0.43,1.00,0.91,1.00,0.41,0.47,0.90,0.95,0.96,0.99,0.00,1.00,0.72,0.20,0.00,0.77,0.52,1.00,0.15,0.53,0.53,0.53,1.00,1.00,0.18,0.02,1.00,0.00,0.00,0.00,0.00,0.02,0.00,0.17,0.02,0.01,0.00),ncol=3)
63 | 	admix.props2 <- matrix(c(0.36,0.35,0.42,0.38,0.35,0.35,0.36,0.35,0.48,0.36,0.39,0.39,0.40,0.36,0.36,0.35,0.40,0.46,0.45,0.38,0.34,0.35,0.47,0.40,0.35,1.00,1.00,0.99,0.99,0.98,1.00,0.84,0.99,0.63,1.00,0.32,0.35,0.24,0.24,0.33,0.34,0.33,0.35,0.15,0.32,0.32,0.10,0.30,0.33,0.27,0.36,0.13,0.26,0.27,0.22,0.36,0.35,0.14,0.11,0.35,0.00,0.00,0.00,0.01,0.01,0.00,0.07,0.00,0.18,0.00,0.32,0.30,0.34,0.38,0.31,0.30,0.31,0.30,0.36,0.32,0.30,0.51,0.30,0.31,0.37,0.30,0.47,0.29,0.28,0.40,0.30,0.31,0.39,0.49,0.30,0.00,0.00,0.00,0.00,0.01,0.00,0.09,0.01,0.19,0.00),ncol=3)
64 | }
65 | # compare the estimated admixture proportions from 
66 | # two different conStruct runs to determine which 
67 | # layers in one run correspond to those in the other
68 | match.layers.x.runs(admix.props1,admix.props2)
69 | 
70 | }
71 | 


--------------------------------------------------------------------------------
/man/print.conStruct.results.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/process.model.fit.R
 3 | \name{print.conStruct.results}
 4 | \alias{print.conStruct.results}
 5 | \title{An S3 print method for class conStruct.results}
 6 | \usage{
 7 | \method{print}{conStruct.results}(x, ...)
 8 | }
 9 | \arguments{
10 | \item{x}{an object of class \code{conStruct.results}}
11 | 
12 | \item{...}{further options to be passed to \code{print}}
13 | }
14 | \value{
15 | prints a top-level summary of the conStruct.results, returns nothing
16 | }
17 | \description{
18 | An S3 print method for class conStruct.results
19 | }
20 | 


--------------------------------------------------------------------------------
/man/print.data.block.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/run.conStruct.R
 3 | \name{print.data.block}
 4 | \alias{print.data.block}
 5 | \title{An S3 print method for class data.block}
 6 | \usage{
 7 | \method{print}{data.block}(x, ...)
 8 | }
 9 | \arguments{
10 | \item{x}{an object of class \code{data.block}}
11 | 
12 | \item{...}{further options to be passed to \code{print}}
13 | }
14 | \value{
15 | prints a top-level summary of the data.block, returns nothing
16 | }
17 | \description{
18 | An S3 print method for class data.block
19 | }
20 | 


--------------------------------------------------------------------------------
/man/print.freq.data.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/run.conStruct.R
 3 | \name{print.freq.data}
 4 | \alias{print.freq.data}
 5 | \title{An S3 print method for class freq.data}
 6 | \usage{
 7 | \method{print}{freq.data}(x, ...)
 8 | }
 9 | \arguments{
10 | \item{x}{an object of class \code{freq.data}}
11 | 
12 | \item{...}{further options to be passed to \code{print}}
13 | }
14 | \value{
15 | prints a top-level summary of the freq.data, returns nothing
16 | }
17 | \description{
18 | An S3 print method for class freq.data
19 | }
20 | 


--------------------------------------------------------------------------------
/man/print.layer.params.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/process.model.fit.R
 3 | \name{print.layer.params}
 4 | \alias{print.layer.params}
 5 | \title{An S3 print method for class layer.params}
 6 | \usage{
 7 | \method{print}{layer.params}(x, ...)
 8 | }
 9 | \arguments{
10 | \item{x}{an object of class \code{layer.params}}
11 | 
12 | \item{...}{further options to be passed to \code{print}}
13 | }
14 | \value{
15 | prints a top-level summary of the layer.params, returns nothing
16 | }
17 | \description{
18 | An S3 print method for class layer.params
19 | }
20 | 


--------------------------------------------------------------------------------
/man/structure2conStruct.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/format.data.R
 3 | \name{structure2conStruct}
 4 | \alias{structure2conStruct}
 5 | \title{Convert a dataset from STRUCTURE to conStruct format}
 6 | \usage{
 7 | structure2conStruct(
 8 |   infile,
 9 |   onerowperind,
10 |   start.loci,
11 |   start.samples = 1,
12 |   missing.datum,
13 |   outfile
14 | )
15 | }
16 | \arguments{
17 | \item{infile}{The name and path of the file in STRUCTURE format 
18 | to be converted to \code{conStruct} format.}
19 | 
20 | \item{onerowperind}{Indicates whether the file format has 
21 | one row per individual (\code{TRUE}) or two rows per 
22 | individual (\code{FALSE}).}
23 | 
24 | \item{start.loci}{The index of the first column in the dataset 
25 | that contains genotype data.}
26 | 
27 | \item{start.samples}{The index of the first row in the dataset 
28 | that contains genotype data (e.g., after any headers). 
29 | Default value is 1.}
30 | 
31 | \item{missing.datum}{The character or value used to denote 
32 | missing data in the STRUCTURE dataset (often 0 or -9).}
33 | 
34 | \item{outfile}{The name and path of the file containing the 
35 | \code{conStruct} formatted dataset to be generated 
36 |     by this function.}
37 | }
38 | \value{
39 | This function returns an allele frequency data matrix 
40 | 	that can be used as the \code{freqs} argument in a conStruct 
41 | 	analysis run using \code{\link{conStruct}}.  It also saves 
42 | 	this object as an .RData file so that it can be used in 
43 | 	future analyses.
44 | }
45 | \description{
46 | \code{structure2conStruct} converts a STRUCTURE dataset 
47 | to conStruct format
48 | }
49 | \details{
50 | This function takes a population genetics dataset in 
51 | STRUCTURE format and converts it to conStruct format. 
52 | The STRUCTURE file can have one row per individual 
53 | and two columns per locus, or two rows per individual 
54 | and one column per locus. It can only contain bi-allelic SNPs.
55 | Missing data is acceptable, but must be indicated with 
56 | a single value throughout the dataset.
57 | 
58 | This function takes a STRUCTURE format data file and 
59 | 	converts it to a \code{conStruct} format data file.
60 | 	This function can only be applied to diploid organisms.
61 | 	The STRUCTURE data file must be a plain text file. 
62 | 	If there is extraneous text or column headers before the data 
63 | 	starts, those extra lines should be deleted by hand or 
64 | 	taken into account via the \code{start.samples} argument.
65 | 	
66 | 		The STRUCTURE dataset can either be in the ONEROWPERIND=1 
67 | 		file format, with one row per individual and two columns 
68 | 		per locus, or the ONEROWPERIND=0 format, with two rows per 
69 | 	individual and one column per locus. The first column of the STRUCTURE 
70 | 		dataset should be individual names. There may be any number 
71 | 		of other columns that contain non-genotype information before 
72 | 	the first column that contains genotype data, but there can 
73 | 		be no extraneous columns at the end of the dataset, after the 
74 | 		genotype data.
75 | 	
76 | 	The genotype data must be bi-allelic 
77 | 	single nucleotide polymorphisms (SNPs). Applying this function 
78 | 	to datasets with more than two alleles per locus may result in 
79 | 	cryptic failure. For more details, see the \code{format-data} 
80 | 	vignette.
81 | }
82 | 


--------------------------------------------------------------------------------
/man/x.validation.Rd:
--------------------------------------------------------------------------------
  1 | % Generated by roxygen2: do not edit by hand
  2 | % Please edit documentation in R/model.comparison.R
  3 | \name{x.validation}
  4 | \alias{x.validation}
  5 | \title{Run a conStruct cross-validation analysis}
  6 | \usage{
  7 | x.validation(
  8 |   train.prop = 0.9,
  9 |   n.reps,
 10 |   K,
 11 |   freqs = NULL,
 12 |   data.partitions = NULL,
 13 |   geoDist,
 14 |   coords,
 15 |   prefix,
 16 |   n.iter,
 17 |   make.figs = FALSE,
 18 |   save.files = FALSE,
 19 |   parallel = FALSE,
 20 |   n.nodes = NULL,
 21 |   ...
 22 | )
 23 | }
 24 | \arguments{
 25 | \item{train.prop}{A numeric value between 0 and 1 that gives 
 26 | the proportion of the data to be used in the 
 27 | training partition of the analysis. Default is 0.9.}
 28 | 
 29 | \item{n.reps}{An \code{integer} giving the number of cross-
 30 | validation replicates to be run.}
 31 | 
 32 | \item{K}{A numeric \code{vector} giving the numbers of layers 
 33 | to be tested in each cross-validation replicate.
 34 | E.g., \code{K=1:7}.}
 35 | 
 36 | \item{freqs}{A \code{matrix} of allele frequencies with one column per 
 37 | locus and one row per sample.
 38 |     Missing data should be indicated with \code{NA}.}
 39 | 
 40 | \item{data.partitions}{A list with one element for each desired 
 41 | cross-validation replicate. This argument can be specified 
 42 | instead of the \code{freqs} argument if the user wants to 
 43 | provide their own data partitions for model training and testing.
 44 | See the model comparison vignette for details on what this 
 45 | should look like.}
 46 | 
 47 | \item{geoDist}{A \code{matrix} of geographic distance between samples. 
 48 | If \code{NULL}, user can only run the nonspatial model.}
 49 | 
 50 | \item{coords}{A \code{matrix} giving the longitude and latitude 
 51 | (or X and Y coordinates) of the samples.}
 52 | 
 53 | \item{prefix}{A character \code{vector} giving the prefix to be attached 
 54 | to all output files.}
 55 | 
 56 | \item{n.iter}{An \code{integer} giving the number of iterations each MCMC 
 57 | chain is run. Default is 1e3.  If the number of iterations 
 58 | is greater than 500, the MCMC is thinned so that the number 
 59 | of retained iterations is 500 (before burn-in).}
 60 | 
 61 | \item{make.figs}{A \code{logical} value indicating whether to automatically 
 62 | make figures during the course of the cross-validation analysis. 
 63 | Default is \code{FALSE}.}
 64 | 
 65 | \item{save.files}{A \code{logical} value indicating whether to automatically 
 66 | save output and intermediate files once the analysis is
 67 | complete. Default is \code{FALSE}.}
 68 | 
 69 | \item{parallel}{A \code{logical} value indicating whether or not to run the 
 70 | different cross-validation replicates in parallel. Default is \code{FALSE}.
 71 | For more details on how to set up runs in parallel, see the model 
 72 | comparison vignette.}
 73 | 
 74 | \item{n.nodes}{Number of nodes to run parallel analyses on. Default is 
 75 | \code{NULL}. Ignored if \code{parallel} is \code{FALSE}. For more details 
 76 | on how to set up runs in parallel, see the model comparison vignette.}
 77 | 
 78 | \item{...}{Further options to be passed to rstan::sampling (e.g., adapt_delta).}
 79 | }
 80 | \value{
 81 | This function returns (and also saves as a .Robj) a \code{list} 
 82 | 	containing the standardized results of the cross-validation analysis
 83 | 	across replicates.  For each replicate, the function returns 
 84 | 		a list with the following elements:
 85 | 	\itemize{
 86 | 		\item \code{sp} - the mean of the standardized log likelihoods of the 
 87 | 	"testing" data partition of that replicate for the spatial model for
 88 | 		each value of K specified in \code{K}.
 89 | 		\item \code{nsp} - the mean of the standardized log likelihoods of the 
 90 | 	"testing" data partition of that replicate for the nonspatial model for
 91 | 		each value of K specified in \code{K}.
 92 | }
 93 | In addition, this function saves two text files containing the standardized 
 94 | cross-validation results for the spatial and nonspatial results 
 95 | (prefix_sp_xval_results.txt and prefix_nsp_xval_results.txt, respectively).
 96 | These values are written as matrices for user convenience; each column is 
 97 | a cross-validation replicate, and each row gives the result for a value of 
 98 | \code{K}.
 99 | }
100 | \description{
101 | \code{x.validation} runs a conStruct cross-validation analysis
102 | }
103 | \details{
104 | This function initiates a cross-validation analysis that 
105 | uses Monte Carlo cross-validation to determine the statistical 
106 | support for models with different numbers of layers or 
107 | with and without a spatial component.
108 | }
109 | 


--------------------------------------------------------------------------------
/src/Makevars:
--------------------------------------------------------------------------------
 1 | # Generated by rstantools.  Do not edit by hand.
 2 | 
 3 | STANHEADERS_SRC = $(shell "$(R_HOME)/bin$(R_ARCH_BIN)/Rscript" -e "message()" -e "cat(system.file('include', 'src', package = 'StanHeaders', mustWork = TRUE))" -e "message()" | grep "StanHeaders")
 4 | 
 5 | STANC_FLAGS = $(shell "$(R_HOME)/bin$(R_ARCH_BIN)/Rscript" -e "cat(ifelse(utils::packageVersion('rstan') >= 2.26, '-DUSE_STANC3',''))")
 6 | PKG_CPPFLAGS = -I"../inst/include" -I"$(STANHEADERS_SRC)" -DBOOST_DISABLE_ASSERTS -DEIGEN_NO_DEBUG -DBOOST_MATH_OVERFLOW_ERROR_POLICY=errno_on_error $(STANC_FLAGS)
 7 | PKG_CXXFLAGS = $(shell "$(R_HOME)/bin$(R_ARCH_BIN)/Rscript" -e "RcppParallel::CxxFlags()") $(shell "$(R_HOME)/bin$(R_ARCH_BIN)/Rscript" -e "StanHeaders:::CxxFlags()")
 8 | PKG_LIBS = $(shell "$(R_HOME)/bin$(R_ARCH_BIN)/Rscript" -e "RcppParallel::RcppParallelLibs()") $(shell "$(R_HOME)/bin$(R_ARCH_BIN)/Rscript" -e "StanHeaders:::LdFlags()")
 9 | 
10 | CXX_STD = CXX14
11 | 


--------------------------------------------------------------------------------
/src/Makevars.win:
--------------------------------------------------------------------------------
 1 | # Generated by rstantools.  Do not edit by hand.
 2 | 
 3 | STANHEADERS_SRC = $(shell "$(R_HOME)/bin$(R_ARCH_BIN)/Rscript" -e "message()" -e "cat(system.file('include', 'src', package = 'StanHeaders', mustWork = TRUE))" -e "message()" | grep "StanHeaders")
 4 | 
 5 | STANC_FLAGS = $(shell "$(R_HOME)/bin$(R_ARCH_BIN)/Rscript" -e "cat(ifelse(utils::packageVersion('rstan') >= 2.26, '-DUSE_STANC3',''))")
 6 | PKG_CPPFLAGS = -I"../inst/include" -I"$(STANHEADERS_SRC)" -DBOOST_DISABLE_ASSERTS -DEIGEN_NO_DEBUG -DRCPP_PARALLEL_USE_TBB=1 $(STANC_FLAGS)
 7 | PKG_CXXFLAGS = $(shell "$(R_HOME)/bin$(R_ARCH_BIN)/Rscript" -e "RcppParallel::CxxFlags()") $(shell "$(R_HOME)/bin$(R_ARCH_BIN)/Rscript" -e "StanHeaders:::CxxFlags()")
 8 | PKG_LIBS = $(shell "$(R_HOME)/bin$(R_ARCH_BIN)/Rscript" -e "RcppParallel::RcppParallelLibs()") $(shell "$(R_HOME)/bin$(R_ARCH_BIN)/Rscript" -e "StanHeaders:::LdFlags()")
 9 | 
10 | CXX_STD = CXX14
11 | 


--------------------------------------------------------------------------------
/src/RcppExports.cpp:
--------------------------------------------------------------------------------
 1 | // Generated by using Rcpp::compileAttributes() -> do not edit by hand
 2 | // Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393
 3 | 
 4 | #include <RcppEigen.h>
 5 | #include <Rcpp.h>
 6 | 
 7 | using namespace Rcpp;
 8 | 
 9 | #ifdef RCPP_USE_GLOBAL_ROSTREAM
10 | Rcpp::Rostream<true>&  Rcpp::Rcout = Rcpp::Rcpp_cout_get();
11 | Rcpp::Rostream<false>& Rcpp::Rcerr = Rcpp::Rcpp_cerr_get();
12 | #endif
13 | 
14 | 
15 | RcppExport SEXP _rcpp_module_boot_stan_fit4multiK_mod();
16 | RcppExport SEXP _rcpp_module_boot_stan_fit4oneK_mod();
17 | RcppExport SEXP _rcpp_module_boot_stan_fit4space_multiK_mod();
18 | RcppExport SEXP _rcpp_module_boot_stan_fit4space_oneK_mod();
19 | 
20 | static const R_CallMethodDef CallEntries[] = {  // table passed to R_registerRoutines() in R_init_conStruct; one row per module boot function declared above
21 |     {"_rcpp_module_boot_stan_fit4multiK_mod", (DL_FUNC) &_rcpp_module_boot_stan_fit4multiK_mod, 0},  // row layout: name string, function pointer cast to DL_FUNC, then 0 (the boot functions are declared with no parameters)
22 |     {"_rcpp_module_boot_stan_fit4oneK_mod", (DL_FUNC) &_rcpp_module_boot_stan_fit4oneK_mod, 0},
23 |     {"_rcpp_module_boot_stan_fit4space_multiK_mod", (DL_FUNC) &_rcpp_module_boot_stan_fit4space_multiK_mod, 0},
24 |     {"_rcpp_module_boot_stan_fit4space_oneK_mod", (DL_FUNC) &_rcpp_module_boot_stan_fit4space_oneK_mod, 0},
25 |     {NULL, NULL, 0}  // all-NULL row closing the table
26 | };
27 | 
28 | RcppExport void R_init_conStruct(DllInfo *dll) {  // initialisation hook for the conStruct shared library; NOTE(review): presumably invoked by R at load time via the R_init_<pkg> naming convention - confirm
29 |     R_registerRoutines(dll, NULL, CallEntries, NULL, NULL);  // only the CallEntries table is supplied; the other three table arguments are NULL
30 |     R_useDynamicSymbols(dll, FALSE);  // called with FALSE for this DLL; NOTE(review): presumably restricts lookup to the registered routines - confirm
31 | }
32 | 


--------------------------------------------------------------------------------
/src/stanExports_multiK.cc:
--------------------------------------------------------------------------------
 1 | // Generated by rstantools.  Do not edit by hand.
 2 | 
 3 | #include <Rcpp.h>
 4 | using namespace Rcpp ;
 5 | #include "stanExports_multiK.h"
 6 | 
 7 | RCPP_MODULE(stan_fit4multiK_mod) {  // Rcpp module named stan_fit4multiK_mod: exposes rstan::stan_fit<stan_model, ecuyer1988> to R; stan_model is expected to come from stanExports_multiK.h included above
 8 | 
 9 | 
10 |     class_<rstan::stan_fit<stan_model, boost::random::ecuyer1988> >("rstantools_model_multiK")  // class is exposed under the name "rstantools_model_multiK"
11 | 
12 |     .constructor<SEXP,SEXP,SEXP>()  // a single exposed constructor taking three SEXP arguments
13 | 
14 | 
15 |     .method("call_sampler", &rstan::stan_fit<stan_model, boost::random::ecuyer1988> ::call_sampler)  // each .method() pairs an exposed name with the stan_fit member function of the same name
16 |     .method("param_names", &rstan::stan_fit<stan_model, boost::random::ecuyer1988> ::param_names)
17 |     .method("param_names_oi", &rstan::stan_fit<stan_model, boost::random::ecuyer1988> ::param_names_oi)
18 |     .method("param_fnames_oi", &rstan::stan_fit<stan_model, boost::random::ecuyer1988> ::param_fnames_oi)
19 |     .method("param_dims", &rstan::stan_fit<stan_model, boost::random::ecuyer1988> ::param_dims)
20 |     .method("param_dims_oi", &rstan::stan_fit<stan_model, boost::random::ecuyer1988> ::param_dims_oi)
21 |     .method("update_param_oi", &rstan::stan_fit<stan_model, boost::random::ecuyer1988> ::update_param_oi)
22 |     .method("param_oi_tidx", &rstan::stan_fit<stan_model, boost::random::ecuyer1988> ::param_oi_tidx)
23 |     .method("grad_log_prob", &rstan::stan_fit<stan_model, boost::random::ecuyer1988> ::grad_log_prob)
24 |     .method("log_prob", &rstan::stan_fit<stan_model, boost::random::ecuyer1988> ::log_prob)
25 |     .method("unconstrain_pars", &rstan::stan_fit<stan_model, boost::random::ecuyer1988> ::unconstrain_pars)
26 |     .method("constrain_pars", &rstan::stan_fit<stan_model, boost::random::ecuyer1988> ::constrain_pars)
27 |     .method("num_pars_unconstrained", &rstan::stan_fit<stan_model, boost::random::ecuyer1988> ::num_pars_unconstrained)
28 |     .method("unconstrained_param_names", &rstan::stan_fit<stan_model, boost::random::ecuyer1988> ::unconstrained_param_names)
29 |     .method("constrained_param_names", &rstan::stan_fit<stan_model, boost::random::ecuyer1988> ::constrained_param_names)
30 |     .method("standalone_gqs", &rstan::stan_fit<stan_model, boost::random::ecuyer1988> ::standalone_gqs)
31 |     ;  // ends the chained class_ expression started on line 10
32 | }
33 | 


--------------------------------------------------------------------------------
/src/stanExports_oneK.cc:
--------------------------------------------------------------------------------
 1 | // Generated by rstantools.  Do not edit by hand.
 2 | 
 3 | #include <Rcpp.h>
 4 | using namespace Rcpp ;
 5 | #include "stanExports_oneK.h"
 6 | 
 7 | RCPP_MODULE(stan_fit4oneK_mod) {  // Rcpp module named stan_fit4oneK_mod: exposes rstan::stan_fit<stan_model, ecuyer1988> to R; stan_model is expected to come from stanExports_oneK.h included above
 8 | 
 9 | 
10 |     class_<rstan::stan_fit<stan_model, boost::random::ecuyer1988> >("rstantools_model_oneK")  // class is exposed under the name "rstantools_model_oneK"
11 | 
12 |     .constructor<SEXP,SEXP,SEXP>()  // a single exposed constructor taking three SEXP arguments
13 | 
14 | 
15 |     .method("call_sampler", &rstan::stan_fit<stan_model, boost::random::ecuyer1988> ::call_sampler)  // each .method() pairs an exposed name with the stan_fit member function of the same name
16 |     .method("param_names", &rstan::stan_fit<stan_model, boost::random::ecuyer1988> ::param_names)
17 |     .method("param_names_oi", &rstan::stan_fit<stan_model, boost::random::ecuyer1988> ::param_names_oi)
18 |     .method("param_fnames_oi", &rstan::stan_fit<stan_model, boost::random::ecuyer1988> ::param_fnames_oi)
19 |     .method("param_dims", &rstan::stan_fit<stan_model, boost::random::ecuyer1988> ::param_dims)
20 |     .method("param_dims_oi", &rstan::stan_fit<stan_model, boost::random::ecuyer1988> ::param_dims_oi)
21 |     .method("update_param_oi", &rstan::stan_fit<stan_model, boost::random::ecuyer1988> ::update_param_oi)
22 |     .method("param_oi_tidx", &rstan::stan_fit<stan_model, boost::random::ecuyer1988> ::param_oi_tidx)
23 |     .method("grad_log_prob", &rstan::stan_fit<stan_model, boost::random::ecuyer1988> ::grad_log_prob)
24 |     .method("log_prob", &rstan::stan_fit<stan_model, boost::random::ecuyer1988> ::log_prob)
25 |     .method("unconstrain_pars", &rstan::stan_fit<stan_model, boost::random::ecuyer1988> ::unconstrain_pars)
26 |     .method("constrain_pars", &rstan::stan_fit<stan_model, boost::random::ecuyer1988> ::constrain_pars)
27 |     .method("num_pars_unconstrained", &rstan::stan_fit<stan_model, boost::random::ecuyer1988> ::num_pars_unconstrained)
28 |     .method("unconstrained_param_names", &rstan::stan_fit<stan_model, boost::random::ecuyer1988> ::unconstrained_param_names)
29 |     .method("constrained_param_names", &rstan::stan_fit<stan_model, boost::random::ecuyer1988> ::constrained_param_names)
30 |     .method("standalone_gqs", &rstan::stan_fit<stan_model, boost::random::ecuyer1988> ::standalone_gqs)
31 |     ;  // ends the chained class_ expression started on line 10
32 | }
33 | 


--------------------------------------------------------------------------------
/src/stanExports_oneK.h:
--------------------------------------------------------------------------------
  1 | // Generated by rstantools.  Do not edit by hand.
  2 | 
  3 | #ifndef MODELS_HPP
  4 | #define MODELS_HPP
  5 | #define STAN__SERVICES__COMMAND_HPP
  6 | #include <rstan/rstaninc.hpp>
  7 | // Code generated by Stan version 2.21.0
  8 | #include <stan/model/model_header.hpp>
  9 | namespace model_oneK_namespace {
 10 | using std::istream;
 11 | using std::string;
 12 | using std::stringstream;
 13 | using std::vector;
 14 | using stan::io::dump;
 15 | using stan::math::lgamma;
 16 | using stan::model::prob_grad;
 17 | using namespace stan::math;
 18 | static int current_statement_begin__;
 19 | stan::io::program_reader prog_reader__() {  // builds the program_reader that this file hands to stan::lang::rethrow_located() in its catch blocks
 20 |     stan::io::program_reader reader;
 21 |     reader.add_event(0, 0, "start", "model_oneK");  // "start" event for the program named "model_oneK"
 22 |     reader.add_event(36, 34, "end", "model_oneK");  // matching "end" event; NOTE(review): the two integers are presumably line positions in the Stan source - confirm
 23 |     return reader;
 24 | }
 25 | template <typename T1__, typename T2__>  // Cov(N, nugget, gamma): returns the N x N matrix rep_matrix(gamma, N, N) + diag_matrix(nugget)
 26 | Eigen::Matrix<typename boost::math::tools::promote_args<T1__, T2__>::type, Eigen::Dynamic, Eigen::Dynamic>
 27 | Cov(const int& N,
 28 |         const Eigen::Matrix<T1__, Eigen::Dynamic, 1>& nugget,
 29 |         const T2__& gamma, std::ostream* pstream__) {  // pstream__ is accepted but never referenced in this body
 30 |     typedef typename boost::math::tools::promote_args<T1__, T2__>::type local_scalar_t__;  // working scalar type: promotion of the nugget and gamma scalar types
 31 |     typedef local_scalar_t__ fun_return_scalar_t__;
 32 |     const static bool propto__ = true;
 33 |     (void) propto__;
 34 |         local_scalar_t__ DUMMY_VAR__(std::numeric_limits<double>::quiet_NaN());  // NaN placeholder used to fill the local matrices before they are assigned
 35 |         (void) DUMMY_VAR__;  // suppress unused var warning
 36 |     int current_statement_begin__ = -1;  // local marker; shadows the namespace-level static int of the same name
 37 |     try {
 38 |         {
 39 |         current_statement_begin__ = 3;
 40 |         validate_non_negative_index("parCov", "N", N);  // checked twice, once per dimension (both are N)
 41 |         validate_non_negative_index("parCov", "N", N);
 42 |         Eigen::Matrix<local_scalar_t__, Eigen::Dynamic, Eigen::Dynamic> parCov(N, N);
 43 |         stan::math::initialize(parCov, DUMMY_VAR__);
 44 |         stan::math::fill(parCov, DUMMY_VAR__);
 45 |         current_statement_begin__ = 4;
 46 |         validate_non_negative_index("Nug_mat", "N", N);
 47 |         validate_non_negative_index("Nug_mat", "N", N);
 48 |         Eigen::Matrix<local_scalar_t__, Eigen::Dynamic, Eigen::Dynamic> Nug_mat(N, N);
 49 |         stan::math::initialize(Nug_mat, DUMMY_VAR__);
 50 |         stan::math::fill(Nug_mat, DUMMY_VAR__);
 51 |         current_statement_begin__ = 5;
 52 |         stan::math::assign(parCov, rep_matrix(gamma, N, N));  // parCov := N x N matrix built from gamma by rep_matrix
 53 |         current_statement_begin__ = 6;
 54 |         stan::math::assign(Nug_mat, diag_matrix(nugget));  // Nug_mat := diag_matrix(nugget)
 55 |         current_statement_begin__ = 7;
 56 |         stan::math::assign(parCov, add(parCov, Nug_mat));  // parCov := parCov + Nug_mat
 57 |         current_statement_begin__ = 8;
 58 |         return stan::math::promote_scalar<fun_return_scalar_t__>(parCov);
 59 |         }
 60 |     } catch (const std::exception& e) {
 61 |         stan::lang::rethrow_located(e, current_statement_begin__, prog_reader__());  // forwards the exception together with the last statement marker and the program reader
 62 |         // Next line prevents compiler griping about no return
 63 |         throw std::runtime_error("*** IF YOU SEE THIS, PLEASE REPORT A BUG ***");
 64 |     }
 65 | }
 66 | struct Cov_functor__ {  // function object whose call operator forwards to the free function Cov() defined above
 67 |     template <typename T1__, typename T2__>
 68 |         Eigen::Matrix<typename boost::math::tools::promote_args<T1__, T2__>::type, Eigen::Dynamic, Eigen::Dynamic>
 69 |     operator()(const int& N,
 70 |         const Eigen::Matrix<T1__, Eigen::Dynamic, 1>& nugget,
 71 |         const T2__& gamma, std::ostream* pstream__) const {
 72 |         return Cov(N, nugget, gamma, pstream__);  // all four arguments are passed through unchanged
 73 |     }
 74 | };
 75 | #include <stan_meta_header.hpp>
 76 | class model_oneK
 77 |   : public stan::model::model_base_crtp<model_oneK> {
 78 | private:
 79 |         int K;
 80 |         int N;
 81 |         int L;
 82 |         matrix_d obsCov;
 83 |         double varMeanFreqs;
 84 |         matrix_d LobsCov;
 85 | public:
 86 |     model_oneK(stan::io::var_context& context__,  // constructor without a seed argument
 87 |         std::ostream* pstream__ = 0)
 88 |         : model_base_crtp(0) {  // base class is initialised with 0
 89 |         ctor_body(context__, 0, pstream__);  // delegates to ctor_body with random_seed__ fixed at 0
 90 |     }
 91 |     model_oneK(stan::io::var_context& context__,  // constructor taking an explicit random seed
 92 |         unsigned int random_seed__,
 93 |         std::ostream* pstream__ = 0)
 94 |         : model_base_crtp(0) {  // base class is initialised with 0
 95 |         ctor_body(context__, random_seed__, pstream__);  // delegates to ctor_body, forwarding the seed unchanged
 96 |     }
 97 |     void ctor_body(stan::io::var_context& context__,  // shared constructor logic: reads K, N, L, obsCov and varMeanFreqs from context__, computes LobsCov, and counts the real parameters
 98 |                    unsigned int random_seed__,
 99 |                    std::ostream* pstream__) {
100 |         typedef double local_scalar_t__;
101 |         boost::ecuyer1988 base_rng__ =
102 |           stan::services::util::create_rng(random_seed__, 0);  // the RNG is constructed but only void-cast below; nothing in this body draws from it
103 |         (void) base_rng__;  // suppress unused var warning
104 |         current_statement_begin__ = -1;
105 |         static const char* function__ = "model_oneK_namespace::model_oneK";
106 |         (void) function__;  // dummy to suppress unused var warning
107 |         size_t pos__;
108 |         (void) pos__;  // dummy to suppress unused var warning
109 |         std::vector<int> vals_i__;
110 |         std::vector<double> vals_r__;
111 |         local_scalar_t__ DUMMY_VAR__(std::numeric_limits<double>::quiet_NaN());
112 |         (void) DUMMY_VAR__;  // suppress unused var warning
113 |         try {
114 |             // initialize data block variables from context__
115 |             current_statement_begin__ = 12;  // statement markers set throughout are what the catch block passes to rethrow_located
116 |             context__.validate_dims("data initialization", "K", "int", context__.to_vec());
117 |             K = int(0);
118 |             vals_i__ = context__.vals_i("K");
119 |             pos__ = 0;
120 |             K = vals_i__[pos__++];
121 |             check_greater_or_equal(function__, "K", K, 1);  // K is checked against a lower bound of 1
122 |             current_statement_begin__ = 13;
123 |             context__.validate_dims("data initialization", "N", "int", context__.to_vec());
124 |             N = int(0);
125 |             vals_i__ = context__.vals_i("N");
126 |             pos__ = 0;
127 |             N = vals_i__[pos__++];
128 |             check_greater_or_equal(function__, "N", N, 2);  // N is checked against a lower bound of 2
129 |             current_statement_begin__ = 14;
130 |             context__.validate_dims("data initialization", "L", "int", context__.to_vec());
131 |             L = int(0);
132 |             vals_i__ = context__.vals_i("L");
133 |             pos__ = 0;
134 |             L = vals_i__[pos__++];
135 |             check_greater_or_equal(function__, "L", L, (N + 1));  // L is checked against a lower bound of N + 1
136 |             current_statement_begin__ = 15;
137 |             validate_non_negative_index("obsCov", "N", N);
138 |             validate_non_negative_index("obsCov", "N", N);
139 |             context__.validate_dims("data initialization", "obsCov", "matrix_d", context__.to_vec(N,N));
140 |             obsCov = Eigen::Matrix<double, Eigen::Dynamic, Eigen::Dynamic>(N, N);
141 |             vals_r__ = context__.vals_r("obsCov");
142 |             pos__ = 0;
143 |             size_t obsCov_j_2_max__ = N;
144 |             size_t obsCov_j_1_max__ = N;
145 |             for (size_t j_2__ = 0; j_2__ < obsCov_j_2_max__; ++j_2__) {  // outer loop is the column index, inner loop the row index: vals_r__ is consumed column by column
146 |                 for (size_t j_1__ = 0; j_1__ < obsCov_j_1_max__; ++j_1__) {
147 |                     obsCov(j_1__, j_2__) = vals_r__[pos__++];
148 |                 }
149 |             }
150 |             current_statement_begin__ = 16;
151 |             context__.validate_dims("data initialization", "varMeanFreqs", "double", context__.to_vec());
152 |             varMeanFreqs = double(0);
153 |             vals_r__ = context__.vals_r("varMeanFreqs");
154 |             pos__ = 0;
155 |             varMeanFreqs = vals_r__[pos__++];
156 |             // initialize transformed data variables
157 |             current_statement_begin__ = 19;
158 |             validate_non_negative_index("LobsCov", "N", N);
159 |             validate_non_negative_index("LobsCov", "N", N);
160 |             LobsCov = Eigen::Matrix<double, Eigen::Dynamic, Eigen::Dynamic>(N, N);
161 |             stan::math::fill(LobsCov, DUMMY_VAR__);
162 |             // execute transformed data statements
163 |             current_statement_begin__ = 20;
164 |             stan::math::assign(LobsCov, multiply(L, obsCov));  // LobsCov := L * obsCov (the integer L times the N x N matrix)
165 |             // validate transformed data
166 |             // validate, set parameter ranges
167 |             num_params_r__ = 0U;
168 |             param_ranges_i__.clear();
169 |             current_statement_begin__ = 23;
170 |             num_params_r__ += 1;  // one real parameter for statement 23
171 |             current_statement_begin__ = 24;
172 |             validate_non_negative_index("nugget", "N", N);
173 |             num_params_r__ += N;  // plus N reals for "nugget", giving N + 1 in total
174 |         } catch (const std::exception& e) {
175 |             stan::lang::rethrow_located(e, current_statement_begin__, prog_reader__());
176 |             // Next line prevents compiler griping about no return
177 |             throw std::runtime_error("*** IF YOU SEE THIS, PLEASE REPORT A BUG ***");
178 |         }
179 |     }
180 |     ~model_oneK() { }  // Destructor with an empty body: no explicit cleanup is performed here.
181 |     void transform_inits(const stan::io::var_context& context__,  // Reads initial values for "gamma" and "nugget" from context__ and stores their unconstrained forms.
182 |                          std::vector<int>& params_i__,  // out: overwritten with writer__.data_i() at the end
183 |                          std::vector<double>& params_r__,  // out: overwritten with writer__.data_r() at the end
184 |                          std::ostream* pstream__) const {  // pstream__ is not referenced in this body
185 |         typedef double local_scalar_t__;
186 |         stan::io::writer<double> writer__(params_r__, params_i__);
187 |         size_t pos__;
188 |         (void) pos__; // dummy call to suppress warning
189 |         std::vector<double> vals_r__;
190 |         std::vector<int> vals_i__;
191 |         current_statement_begin__ = 23;  // statement marker handed to rethrow_located() when an error is reported
192 |         if (!(context__.contains_r("gamma")))  // an initial value for gamma is mandatory
193 |             stan::lang::rethrow_located(std::runtime_error(std::string("Variable gamma missing")), current_statement_begin__, prog_reader__());
194 |         vals_r__ = context__.vals_r("gamma");
195 |         pos__ = 0U;
196 |         context__.validate_dims("parameter initialization", "gamma", "double", context__.to_vec());  // gamma is declared as a scalar (no dimensions)
197 |         double gamma(0);
198 |         gamma = vals_r__[pos__++];
199 |         try {
200 |             writer__.scalar_lb_unconstrain(0, gamma);  // written through the lower-bound transform with bound 0
201 |         } catch (const std::exception& e) {
202 |             stan::lang::rethrow_located(std::runtime_error(std::string("Error transforming variable gamma: ") + e.what()), current_statement_begin__, prog_reader__());
203 |         }
204 |         current_statement_begin__ = 24;
205 |         if (!(context__.contains_r("nugget")))  // an initial value for nugget is mandatory
206 |             stan::lang::rethrow_located(std::runtime_error(std::string("Variable nugget missing")), current_statement_begin__, prog_reader__());
207 |         vals_r__ = context__.vals_r("nugget");
208 |         pos__ = 0U;
209 |         validate_non_negative_index("nugget", "N", N);
210 |         context__.validate_dims("parameter initialization", "nugget", "vector_d", context__.to_vec(N));  // nugget must be a vector of length N
211 |         Eigen::Matrix<double, Eigen::Dynamic, 1> nugget(N);
212 |         size_t nugget_j_1_max__ = N;
213 |         for (size_t j_1__ = 0; j_1__ < nugget_j_1_max__; ++j_1__) {  // copy the N supplied values into the Eigen vector
214 |             nugget(j_1__) = vals_r__[pos__++];
215 |         }
216 |         try {
217 |             writer__.vector_lb_unconstrain(0, nugget);  // written through the lower-bound transform with bound 0
218 |         } catch (const std::exception& e) {
219 |             stan::lang::rethrow_located(std::runtime_error(std::string("Error transforming variable nugget: ") + e.what()), current_statement_begin__, prog_reader__());
220 |         }
221 |         params_r__ = writer__.data_r();  // hand the accumulated real values back to the caller
222 |         params_i__ = writer__.data_i();  // hand the accumulated integer values back to the caller
223 |     }
224 |     void transform_inits(const stan::io::var_context& context,  // Eigen-vector overload: delegates to the std::vector overload and copies the result.
225 |                          Eigen::Matrix<double, Eigen::Dynamic, 1>& params_r,  // out: resized and filled with the real parameter values
226 |                          std::ostream* pstream__) const {
227 |       std::vector<double> params_r_vec;
228 |       std::vector<int> params_i_vec;  // integer values are collected but not returned to the caller
229 |       transform_inits(context, params_i_vec, params_r_vec, pstream__);
230 |       params_r.resize(params_r_vec.size());
231 |       for (int i = 0; i < params_r.size(); ++i)  // element-wise copy from the std::vector into the Eigen vector
232 |         params_r(i) = params_r_vec[i];
233 |     }
234 |     template <bool propto__, bool jacobian__, typename T__>  // propto__ is forwarded to the *_log density calls; jacobian__ selects whether lp__ is passed to the constrain calls
235 |     T__ log_prob(std::vector<T__>& params_r__,  // Reads gamma and nugget from params_r__, builds parCov, and returns the accumulated log density.
236 |                  std::vector<int>& params_i__,
237 |                  std::ostream* pstream__ = 0) const {  // pstream__ is forwarded to Cov()
238 |         typedef T__ local_scalar_t__;
239 |         local_scalar_t__ DUMMY_VAR__(std::numeric_limits<double>::quiet_NaN());  // NaN placeholder used to pre-fill parCov
240 |         (void) DUMMY_VAR__;  // dummy to suppress unused var warning
241 |         T__ lp__(0.0);
242 |         stan::math::accumulator<T__> lp_accum__;
243 |         try {
244 |             stan::io::reader<local_scalar_t__> in__(params_r__, params_i__);
245 |             // model parameters
246 |             current_statement_begin__ = 23;  // statement marker handed to rethrow_located() in the catch block
247 |             local_scalar_t__ gamma;
248 |             (void) gamma;  // dummy to suppress unused var warning
249 |             if (jacobian__)  // with jacobian__ the constrain call also receives lp__
250 |                 gamma = in__.scalar_lb_constrain(0, lp__);
251 |             else
252 |                 gamma = in__.scalar_lb_constrain(0);
253 |             current_statement_begin__ = 24;
254 |             Eigen::Matrix<local_scalar_t__, Eigen::Dynamic, 1> nugget;
255 |             (void) nugget;  // dummy to suppress unused var warning
256 |             if (jacobian__)  // same pattern as gamma, for a length-N vector with lower bound 0
257 |                 nugget = in__.vector_lb_constrain(0, N, lp__);
258 |             else
259 |                 nugget = in__.vector_lb_constrain(0, N);
260 |             // transformed parameters
261 |             current_statement_begin__ = 27;
262 |             validate_non_negative_index("parCov", "N", N);
263 |             validate_non_negative_index("parCov", "N", N);
264 |             Eigen::Matrix<local_scalar_t__, Eigen::Dynamic, Eigen::Dynamic> parCov(N, N);
265 |             stan::math::initialize(parCov, DUMMY_VAR__);
266 |             stan::math::fill(parCov, DUMMY_VAR__);  // every element starts as the NaN placeholder
267 |             // transformed parameters block statements
268 |             current_statement_begin__ = 28;
269 |             stan::math::assign(parCov, Cov(N, nugget, gamma, pstream__));  // parCov is whatever Cov() returns for (N, nugget, gamma)
270 |             // validate transformed parameters
271 |             const char* function__ = "validate transformed params";
272 |             (void) function__;  // dummy to suppress unused var warning
273 |             current_statement_begin__ = 27;  // errors found below are reported against the parCov declaration
274 |             size_t parCov_j_1_max__ = N;
275 |             size_t parCov_j_2_max__ = N;
276 |             for (size_t j_1__ = 0; j_1__ < parCov_j_1_max__; ++j_1__) {  // scan all N x N entries of parCov
277 |                 for (size_t j_2__ = 0; j_2__ < parCov_j_2_max__; ++j_2__) {
278 |                     if (stan::math::is_uninitialized(parCov(j_1__, j_2__))) {  // report any entry flagged as uninitialized
279 |                         std::stringstream msg__;
280 |                         msg__ << "Undefined transformed parameter: parCov" << "(" << j_1__ << ", " << j_2__ << ")";
281 |                         stan::lang::rethrow_located(std::runtime_error(std::string("Error initializing variable parCov: ") + msg__.str()), current_statement_begin__, prog_reader__());
282 |                     }
283 |                 }
284 |             }
285 |             // model body
286 |             current_statement_begin__ = 31;
287 |             lp_accum__.add(normal_log<propto__>(nugget, 0, 1));  // nugget term: normal_log with arguments (0, 1)
288 |             current_statement_begin__ = 32;
289 |             lp_accum__.add(normal_log<propto__>(gamma, varMeanFreqs, 0.5));  // gamma term: normal_log with arguments (varMeanFreqs, 0.5)
290 |             current_statement_begin__ = 33;
291 |             lp_accum__.add(wishart_log<propto__>(LobsCov, L, parCov));  // data term: wishart_log of LobsCov with arguments (L, parCov)
292 |         } catch (const std::exception& e) {
293 |             stan::lang::rethrow_located(e, current_statement_begin__, prog_reader__());  // re-raise with the last recorded statement marker
294 |             // Next line prevents compiler griping about no return
295 |             throw std::runtime_error("*** IF YOU SEE THIS, PLEASE REPORT A BUG ***");
296 |         }
297 |         lp_accum__.add(lp__);  // fold in whatever the constrain calls added to lp__
298 |         return lp_accum__.sum();
299 |     } // log_prob()
300 |     template <bool propto, bool jacobian, typename T_>  // Eigen-vector overload of log_prob
301 |     T_ log_prob(Eigen::Matrix<T_,Eigen::Dynamic,1>& params_r,  // Copies params_r into a std::vector and delegates to the std::vector overload.
302 |                std::ostream* pstream = 0) const {
303 |       std::vector<T_> vec_params_r;
304 |       vec_params_r.reserve(params_r.size());  // reserve up front so the push_back loop does not reallocate
305 |       for (int i = 0; i < params_r.size(); ++i)
306 |         vec_params_r.push_back(params_r(i));
307 |       std::vector<int> vec_params_i;  // passed empty to the delegate
308 |       return log_prob<propto,jacobian,T_>(vec_params_r, vec_params_i, pstream);
309 |     }
310 |     void get_param_names(std::vector<std::string>& names__) const {  // Replaces names__ with the three variable names, in declaration order.
311 |         names__.resize(0);  // discard anything the caller passed in
312 |         names__.push_back("gamma");
313 |         names__.push_back("nugget");
314 |         names__.push_back("parCov");
315 |     }
316 |     void get_dims(std::vector<std::vector<size_t> >& dimss__) const {  // Replaces dimss__ with one dimension list per variable: {}, {N}, {N, N}.
317 |         dimss__.resize(0);  // discard anything the caller passed in
318 |         std::vector<size_t> dims__;
319 |         dims__.resize(0);
320 |         dimss__.push_back(dims__);  // first entry: empty dimension list
321 |         dims__.resize(0);
322 |         dims__.push_back(N);
323 |         dimss__.push_back(dims__);  // second entry: {N}
324 |         dims__.resize(0);
325 |         dims__.push_back(N);
326 |         dims__.push_back(N);
327 |         dimss__.push_back(dims__);  // third entry: {N, N}
328 |     }
329 |     template <typename RNG>  // RNG is the type of base_rng__, which this body never references
330 |     void write_array(RNG& base_rng__,  // Writes gamma, nugget and (optionally) parCov into vars__ from unconstrained params_r__.
331 |                      std::vector<double>& params_r__,
332 |                      std::vector<int>& params_i__,
333 |                      std::vector<double>& vars__,  // out: cleared, then appended to
334 |                      bool include_tparams__ = true,  // when true, the N*N parCov entries are appended
335 |                      bool include_gqs__ = true,  // only affects the early-return checks; nothing is written after them
336 |                      std::ostream* pstream__ = 0) const {  // forwarded to Cov()
337 |         typedef double local_scalar_t__;
338 |         vars__.resize(0);
339 |         stan::io::reader<local_scalar_t__> in__(params_r__, params_i__);
340 |         static const char* function__ = "model_oneK_namespace::write_array";
341 |         (void) function__;  // dummy to suppress unused var warning
342 |         // read-transform, write parameters
343 |         double gamma = in__.scalar_lb_constrain(0);
344 |         vars__.push_back(gamma);
345 |         Eigen::Matrix<double, Eigen::Dynamic, 1> nugget = in__.vector_lb_constrain(0, N);
346 |         size_t nugget_j_1_max__ = N;
347 |         for (size_t j_1__ = 0; j_1__ < nugget_j_1_max__; ++j_1__) {  // append the N nugget entries in index order
348 |             vars__.push_back(nugget(j_1__));
349 |         }
350 |         double lp__ = 0.0;
351 |         (void) lp__;  // dummy to suppress unused var warning
352 |         stan::math::accumulator<double> lp_accum__;
353 |         local_scalar_t__ DUMMY_VAR__(std::numeric_limits<double>::quiet_NaN());  // NaN placeholder used to pre-fill parCov
354 |         (void) DUMMY_VAR__;  // suppress unused var warning
355 |         if (!include_tparams__ && !include_gqs__) return;  // only the parameters were requested
356 |         try {
357 |             // declare and define transformed parameters
358 |             current_statement_begin__ = 27;  // statement marker handed to rethrow_located() in the catch block
359 |             validate_non_negative_index("parCov", "N", N);
360 |             validate_non_negative_index("parCov", "N", N);
361 |             Eigen::Matrix<double, Eigen::Dynamic, Eigen::Dynamic> parCov(N, N);
362 |             stan::math::initialize(parCov, DUMMY_VAR__);
363 |             stan::math::fill(parCov, DUMMY_VAR__);
364 |             // do transformed parameters statements
365 |             current_statement_begin__ = 28;
366 |             stan::math::assign(parCov, Cov(N, nugget, gamma, pstream__));  // parCov is whatever Cov() returns for (N, nugget, gamma)
367 |             if (!include_gqs__ && !include_tparams__) return;  // same condition as the early return before the try block
368 |             // validate transformed parameters
369 |             const char* function__ = "validate transformed params";  // shadows the static function__ declared above
370 |             (void) function__;  // dummy to suppress unused var warning
371 |             // write transformed parameters
372 |             if (include_tparams__) {
373 |                 size_t parCov_j_2_max__ = N;
374 |                 size_t parCov_j_1_max__ = N;
375 |                 for (size_t j_2__ = 0; j_2__ < parCov_j_2_max__; ++j_2__) {  // outer loop over the second index
376 |                     for (size_t j_1__ = 0; j_1__ < parCov_j_1_max__; ++j_1__) {  // inner loop over the first index, so it varies fastest
377 |                         vars__.push_back(parCov(j_1__, j_2__));
378 |                     }
379 |                 }
380 |             }
381 |             if (!include_gqs__) return;  // no statements follow in the try block either way
382 |         } catch (const std::exception& e) {
383 |             stan::lang::rethrow_located(e, current_statement_begin__, prog_reader__());
384 |             // Next line prevents compiler griping about no return
385 |             throw std::runtime_error("*** IF YOU SEE THIS, PLEASE REPORT A BUG ***");
386 |         }
387 |     }
388 |     template <typename RNG>  // Eigen-vector overload of write_array
389 |     void write_array(RNG& base_rng,  // Converts params_r to std::vector, delegates to the std::vector overload, and copies the output into vars.
390 |                      Eigen::Matrix<double,Eigen::Dynamic,1>& params_r,
391 |                      Eigen::Matrix<double,Eigen::Dynamic,1>& vars,  // out: resized to the number of values written
392 |                      bool include_tparams = true,
393 |                      bool include_gqs = true,
394 |                      std::ostream* pstream = 0) const {
395 |       std::vector<double> params_r_vec(params_r.size());
396 |       for (int i = 0; i < params_r.size(); ++i)  // element-wise copy into the std::vector
397 |         params_r_vec[i] = params_r(i);
398 |       std::vector<double> vars_vec;
399 |       std::vector<int> params_i_vec;  // passed empty to the delegate
400 |       write_array(base_rng, params_r_vec, params_i_vec, vars_vec, include_tparams, include_gqs, pstream);
401 |       vars.resize(vars_vec.size());
402 |       for (int i = 0; i < vars.size(); ++i)  // element-wise copy back into the Eigen vector
403 |         vars(i) = vars_vec[i];
404 |     }
405 |     std::string model_name() const {  // Returns the fixed identifier string of this model class.
406 |         return "model_oneK";
407 |     }
408 |     void constrained_param_names(std::vector<std::string>& param_names__,  // Appends flattened element names ("gamma", "nugget.i", "parCov.i.j") to param_names__ without clearing it first.
409 |                                  bool include_tparams__ = true,  // when true, the parCov element names are appended
410 |                                  bool include_gqs__ = true) const {  // only affects the early-return checks
411 |         std::stringstream param_name_stream__;
412 |         param_name_stream__.str(std::string());  // reset the stream before composing each name
413 |         param_name_stream__ << "gamma";
414 |         param_names__.push_back(param_name_stream__.str());
415 |         size_t nugget_j_1_max__ = N;
416 |         for (size_t j_1__ = 0; j_1__ < nugget_j_1_max__; ++j_1__) {
417 |             param_name_stream__.str(std::string());
418 |             param_name_stream__ << "nugget" << '.' << j_1__ + 1;  // indices in the names are 1-based
419 |             param_names__.push_back(param_name_stream__.str());
420 |         }
421 |         if (!include_gqs__ && !include_tparams__) return;
422 |         if (include_tparams__) {
423 |             size_t parCov_j_2_max__ = N;
424 |             size_t parCov_j_1_max__ = N;
425 |             for (size_t j_2__ = 0; j_2__ < parCov_j_2_max__; ++j_2__) {  // outer loop over the second index
426 |                 for (size_t j_1__ = 0; j_1__ < parCov_j_1_max__; ++j_1__) {  // inner loop over the first index, so it varies fastest
427 |                     param_name_stream__.str(std::string());
428 |                     param_name_stream__ << "parCov" << '.' << j_1__ + 1 << '.' << j_2__ + 1;
429 |                     param_names__.push_back(param_name_stream__.str());
430 |                 }
431 |             }
432 |         }
433 |         if (!include_gqs__) return;  // nothing follows either way
434 |     }
435 |     void unconstrained_param_names(std::vector<std::string>& param_names__,  // Appends flattened element names ("gamma", "nugget.i", "parCov.i.j") to param_names__ without clearing it first.
436 |                                    bool include_tparams__ = true,  // when true, the parCov element names are appended
437 |                                    bool include_gqs__ = true) const {  // only affects the early-return checks
438 |         std::stringstream param_name_stream__;
439 |         param_name_stream__.str(std::string());  // reset the stream before composing each name
440 |         param_name_stream__ << "gamma";
441 |         param_names__.push_back(param_name_stream__.str());
442 |         size_t nugget_j_1_max__ = N;
443 |         for (size_t j_1__ = 0; j_1__ < nugget_j_1_max__; ++j_1__) {
444 |             param_name_stream__.str(std::string());
445 |             param_name_stream__ << "nugget" << '.' << j_1__ + 1;  // indices in the names are 1-based
446 |             param_names__.push_back(param_name_stream__.str());
447 |         }
448 |         if (!include_gqs__ && !include_tparams__) return;
449 |         if (include_tparams__) {
450 |             size_t parCov_j_2_max__ = N;
451 |             size_t parCov_j_1_max__ = N;
452 |             for (size_t j_2__ = 0; j_2__ < parCov_j_2_max__; ++j_2__) {  // outer loop over the second index
453 |                 for (size_t j_1__ = 0; j_1__ < parCov_j_1_max__; ++j_1__) {  // inner loop over the first index, so it varies fastest
454 |                     param_name_stream__.str(std::string());
455 |                     param_name_stream__ << "parCov" << '.' << j_1__ + 1 << '.' << j_2__ + 1;
456 |                     param_names__.push_back(param_name_stream__.str());
457 |                 }
458 |             }
459 |         }
460 |         if (!include_gqs__) return;  // nothing follows either way
461 |     }
462 | }; // model
463 | }  // namespace
464 | typedef model_oneK_namespace::model_oneK stan_model;
465 | #ifndef USING_R
466 | stan::model::model_base& new_model(  // Factory compiled only when USING_R is not defined: builds a stan_model from the given data context.
467 |         stan::io::var_context& data_context,
468 |         unsigned int seed,
469 |         std::ostream* msg_stream) {
470 |   stan_model* m = new stan_model(data_context, seed, msg_stream);  // heap-allocated; no matching delete appears in this function
471 |   return *m;  // returned by reference to the base class
472 | }
473 | #endif
474 | #endif
475 | 


--------------------------------------------------------------------------------
/src/stanExports_space_multiK.cc:
--------------------------------------------------------------------------------
 1 | // Generated by rstantools.  Do not edit by hand.
 2 | 
 3 | #include <Rcpp.h>
 4 | using namespace Rcpp ;
 5 | #include "stanExports_space_multiK.h"
 6 | 
 7 | RCPP_MODULE(stan_fit4space_multiK_mod) {  // Registers rstan::stan_fit<stan_model, boost::random::ecuyer1988> as "rstantools_model_space_multiK" and exposes the methods listed below.
 8 | 
 9 | 
10 |     class_<rstan::stan_fit<stan_model, boost::random::ecuyer1988> >("rstantools_model_space_multiK")
11 | 
12 |     .constructor<SEXP,SEXP,SEXP>()  // the exposed constructor takes three SEXP arguments
13 | 
14 | 
15 |     .method("call_sampler", &rstan::stan_fit<stan_model, boost::random::ecuyer1988> ::call_sampler)
16 |     .method("param_names", &rstan::stan_fit<stan_model, boost::random::ecuyer1988> ::param_names)
17 |     .method("param_names_oi", &rstan::stan_fit<stan_model, boost::random::ecuyer1988> ::param_names_oi)
18 |     .method("param_fnames_oi", &rstan::stan_fit<stan_model, boost::random::ecuyer1988> ::param_fnames_oi)
19 |     .method("param_dims", &rstan::stan_fit<stan_model, boost::random::ecuyer1988> ::param_dims)
20 |     .method("param_dims_oi", &rstan::stan_fit<stan_model, boost::random::ecuyer1988> ::param_dims_oi)
21 |     .method("update_param_oi", &rstan::stan_fit<stan_model, boost::random::ecuyer1988> ::update_param_oi)
22 |     .method("param_oi_tidx", &rstan::stan_fit<stan_model, boost::random::ecuyer1988> ::param_oi_tidx)
23 |     .method("grad_log_prob", &rstan::stan_fit<stan_model, boost::random::ecuyer1988> ::grad_log_prob)
24 |     .method("log_prob", &rstan::stan_fit<stan_model, boost::random::ecuyer1988> ::log_prob)
25 |     .method("unconstrain_pars", &rstan::stan_fit<stan_model, boost::random::ecuyer1988> ::unconstrain_pars)
26 |     .method("constrain_pars", &rstan::stan_fit<stan_model, boost::random::ecuyer1988> ::constrain_pars)
27 |     .method("num_pars_unconstrained", &rstan::stan_fit<stan_model, boost::random::ecuyer1988> ::num_pars_unconstrained)
28 |     .method("unconstrained_param_names", &rstan::stan_fit<stan_model, boost::random::ecuyer1988> ::unconstrained_param_names)
29 |     .method("constrained_param_names", &rstan::stan_fit<stan_model, boost::random::ecuyer1988> ::constrained_param_names)
30 |     .method("standalone_gqs", &rstan::stan_fit<stan_model, boost::random::ecuyer1988> ::standalone_gqs)
31 |     ;
32 | }
33 | 


--------------------------------------------------------------------------------
/src/stanExports_space_oneK.cc:
--------------------------------------------------------------------------------
 1 | // Generated by rstantools.  Do not edit by hand.
 2 | 
 3 | #include <Rcpp.h>
 4 | using namespace Rcpp ;
 5 | #include "stanExports_space_oneK.h"
 6 | 
 7 | RCPP_MODULE(stan_fit4space_oneK_mod) {  // Registers rstan::stan_fit<stan_model, boost::random::ecuyer1988> as "rstantools_model_space_oneK" and exposes the methods listed below.
 8 | 
 9 | 
10 |     class_<rstan::stan_fit<stan_model, boost::random::ecuyer1988> >("rstantools_model_space_oneK")
11 | 
12 |     .constructor<SEXP,SEXP,SEXP>()  // the exposed constructor takes three SEXP arguments
13 | 
14 | 
15 |     .method("call_sampler", &rstan::stan_fit<stan_model, boost::random::ecuyer1988> ::call_sampler)
16 |     .method("param_names", &rstan::stan_fit<stan_model, boost::random::ecuyer1988> ::param_names)
17 |     .method("param_names_oi", &rstan::stan_fit<stan_model, boost::random::ecuyer1988> ::param_names_oi)
18 |     .method("param_fnames_oi", &rstan::stan_fit<stan_model, boost::random::ecuyer1988> ::param_fnames_oi)
19 |     .method("param_dims", &rstan::stan_fit<stan_model, boost::random::ecuyer1988> ::param_dims)
20 |     .method("param_dims_oi", &rstan::stan_fit<stan_model, boost::random::ecuyer1988> ::param_dims_oi)
21 |     .method("update_param_oi", &rstan::stan_fit<stan_model, boost::random::ecuyer1988> ::update_param_oi)
22 |     .method("param_oi_tidx", &rstan::stan_fit<stan_model, boost::random::ecuyer1988> ::param_oi_tidx)
23 |     .method("grad_log_prob", &rstan::stan_fit<stan_model, boost::random::ecuyer1988> ::grad_log_prob)
24 |     .method("log_prob", &rstan::stan_fit<stan_model, boost::random::ecuyer1988> ::log_prob)
25 |     .method("unconstrain_pars", &rstan::stan_fit<stan_model, boost::random::ecuyer1988> ::unconstrain_pars)
26 |     .method("constrain_pars", &rstan::stan_fit<stan_model, boost::random::ecuyer1988> ::constrain_pars)
27 |     .method("num_pars_unconstrained", &rstan::stan_fit<stan_model, boost::random::ecuyer1988> ::num_pars_unconstrained)
28 |     .method("unconstrained_param_names", &rstan::stan_fit<stan_model, boost::random::ecuyer1988> ::unconstrained_param_names)
29 |     .method("constrained_param_names", &rstan::stan_fit<stan_model, boost::random::ecuyer1988> ::constrained_param_names)
30 |     .method("standalone_gqs", &rstan::stan_fit<stan_model, boost::random::ecuyer1988> ::standalone_gqs)
31 |     ;
32 | }
33 | 


--------------------------------------------------------------------------------
/testing/runs/sim.dataset.Robj:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gbradburd/conStruct/a41049e475be68fb267996045a47e968c737af92/testing/runs/sim.dataset.Robj


--------------------------------------------------------------------------------
/testing/runs/test.mods.R:
--------------------------------------------------------------------------------
 1 | library(conStruct)
 2 | library(doParallel)
 3 | library(foreach)
 4 | load("sim.dataset.Robj")
 5 | 
 6 | options(error=recover)
 7 | args <- list("run1" = list("spatial" = FALSE,
 8 |                            "geoDist" = fields::rdist(sim.dataset$coords),
 9 |                            "K" = 1,
10 |                            "prefix" = "nsp1"),
11 |              "run2" = list("spatial" = FALSE,
12 |                            "geoDist" = NULL,
13 |                            "K" = 1,
14 |                            "prefix" = "nsp1a"),
15 |              "run3" = list("spatial" = TRUE,
16 |                            "geoDist" = fields::rdist(sim.dataset$coords),
17 |                            "K" = 1,
18 |                            "prefix" = "sp1"),
19 |              "run4" = list("spatial" = TRUE,
20 |                            "geoDist" = fields::rdist(sim.dataset$coords),
21 |                            "K" = 3,
22 |                            "prefix" = "sp3"),
23 |              "run5" = list("spatial" = FALSE,
24 |                            "geoDist" = fields::rdist(sim.dataset$coords),
25 |                            "K" = 3,
26 |                            "prefix" = "nsp3"),
27 |              "run6" = list("spatial" = FALSE,
28 |                            "geoDist" = NULL,
29 |                            "K" = 3,
30 |                            "prefix" = "nsp3b")
31 |         )
32 | # each entry of args is one conStruct run; the six runs are spread over three forked workers
33 | cl <- parallel::makeCluster(3,type="FORK")
34 | doParallel::registerDoParallel(cl)
35 | # tryCatch(..., finally=) shuts the cluster down even when one of the runs fails
36 | tmp <- tryCatch(foreach::foreach(i=seq_along(args)) %dopar% {
37 |                     x <- args[[i]]
38 |                     conStruct::conStruct(spatial = x[["spatial"]],
39 |                                          K = x[["K"]],
40 |                                          freqs = sim.dataset$freq.data$freqs,
41 |                                          geoDist = x[["geoDist"]],
42 |                                          coords = sim.dataset$coords,
43 |                                          prefix = x[["prefix"]])
44 |                 },
45 |                 finally = parallel::stopCluster(cl)
46 | )
47 | 


--------------------------------------------------------------------------------
/testing/runs/testOne.R:
--------------------------------------------------------------------------------
 1 | library(conStruct)
 2 | 
 3 | load("sim.dataset.Robj")
 4 | 
 5 | options(error=recover)
 6 | test <- conStruct::conStruct(spatial = TRUE,
 7 |                              K = 2,
 8 |                              freqs = sim.dataset$freq.data$freqs,
 9 |                              geoDist = fields::rdist(sim.dataset$coords),
10 |                              coords = sim.dataset$coords,
11 |                              prefix = "test1")
12 | 
13 | test <- conStruct::conStruct(spatial = FALSE,
14 |                              K = 2,
15 |                              freqs = sim.dataset$freq.data$freqs,
16 |                              geoDist = fields::rdist(sim.dataset$coords),
17 |                              coords = sim.dataset$coords,
18 |                              prefix = "test2",
19 |                              n.iter=400)
20 | 
21 | test <- conStruct::conStruct(spatial = FALSE,
22 |                              K = 2,
23 |                              freqs = sim.dataset$freq.data$freqs,
24 |                              geoDist = fields::rdist(sim.dataset$coords),
25 |                              coords = sim.dataset$coords,
26 |                              prefix = "test3",
27 |                              n.iter=500)
28 | 
29 | test <- conStruct::conStruct(spatial = FALSE,
30 |                              K = 2,
31 |                              freqs = sim.dataset$freq.data$freqs,
32 |                              geoDist = fields::rdist(sim.dataset$coords),
33 |                              coords = sim.dataset$coords,
34 |                              prefix = "test4",
35 |                              n.iter=510)
36 | 
37 | test <- conStruct::conStruct(spatial = FALSE,
38 |                              K = 2,
39 |                              freqs = sim.dataset$freq.data$freqs,
40 |                              geoDist = fields::rdist(sim.dataset$coords),
41 |                              coords = sim.dataset$coords,
42 |                              prefix = "test5",
43 |                              n.iter=2e3)
44 | # same settings as test5 plus adapt_delta; own prefix so the test5 output files are not overwritten
45 | test <- conStruct::conStruct(spatial = FALSE,
46 |                              K = 2,
47 |                              freqs = sim.dataset$freq.data$freqs,
48 |                              geoDist = fields::rdist(sim.dataset$coords),
49 |                              coords = sim.dataset$coords,
50 |                              prefix = "test6",
51 |                              n.iter=2e3,
52 |                              control = list(adapt_delta = 0.9))


--------------------------------------------------------------------------------
/testing/xval/Makefile:
--------------------------------------------------------------------------------
1 | .PHONY : test clean
2 | # Logs use POSIX "> file 2>&1": make runs recipes with /bin/sh, where bash's "&>" backgrounds the command instead.
3 | test : clean
4 | 	Rscript test.xval.R > test.log 2>&1
5 | 	Rscript test.xval2.R > test2.log 2>&1
6 | 
7 | clean :
8 | 	rm  -f test1.xvals.Robj xval_test1.xval.data.partitions.Robj xval_test1.xval.results.Robj xval_test1_nsp_xval_results.txt xval_test1_sp_xval_results.txt test.log test2.log xval_test2.xval.data.partitions.Robj xval_test2.xval.results.Robj xval_test2_nsp_xval_results.txt xval_test2_sp_xval_results.txt xvals2.Robj .Rapp.history .RData


--------------------------------------------------------------------------------
/testing/xval/sim.dataset.Robj:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gbradburd/conStruct/a41049e475be68fb267996045a47e968c737af92/testing/xval/sim.dataset.Robj


--------------------------------------------------------------------------------
/testing/xval/test.xval.R:
--------------------------------------------------------------------------------
 1 | library(conStruct)
 2 | load("sim.dataset.Robj")
 3 | 
 4 | options(error=recover)
 5 | # cross-validation over K = 1:2 with 2 replicates, run serially (parallel = FALSE)
 6 | xvals <- x.validation(
 7 |     train.prop = 0.9,
 8 |     n.reps = 2,
 9 |     K = 1:2,
10 |     freqs = sim.dataset$freqs,
11 |     geoDist = fields::rdist(sim.dataset$coords),
12 |     coords = sim.dataset$coords,
13 |     prefix = "xval_test1",
14 |     n.iter = 1e3,
15 |     make.figs = FALSE,
16 |     save.files = FALSE,
17 |     parallel = FALSE,
18 |     n.nodes = NULL)
19 | save(xvals,file="test1.xvals.Robj")


--------------------------------------------------------------------------------
/testing/xval/test.xval2.R:
--------------------------------------------------------------------------------
 1 | library(conStruct)
 2 | load("sim.dataset.Robj")
 3 | 
 4 | library(foreach)
 5 | library(doParallel)
 6 | cl <- makeCluster(2,type="FORK")
 7 | registerDoParallel(cl)
 8 | # tryCatch(..., finally=) stops the cluster whether or not x.validation succeeds
 9 | xvals <- tryCatch(x.validation(train.prop = 0.9,
10 |                                n.reps = 2,
11 |                                K = 1:2,
12 |                                freqs = sim.dataset$freqs,
13 |                                data.partitions = NULL,
14 |                                geoDist = fields::rdist(sim.dataset$coords),
15 |                                coords = sim.dataset$coords,
16 |                                prefix = "xval_test2",
17 |                                n.iter = 1e3,
18 |                                make.figs = FALSE,
19 |                                save.files = FALSE,
20 |                                parallel = TRUE,
21 |                                n.nodes = 2),
22 |                   finally = stopCluster(cl)
23 | )
24 | 
25 | save(xvals,file="xvals2.Robj")


--------------------------------------------------------------------------------
/vignettes/format-data.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "How to format data for a conStruct analysis"
  3 | author: "Gideon Bradburd"
  4 | date: '`r format(Sys.Date(), "%B %d, %Y")`'
  5 | output:
  6 |   rmarkdown::html_vignette:
  7 |     toc: true
  8 | vignette: >
  9 |   %\VignetteIndexEntry{format-data}
 10 |   %\VignetteEngine{knitr::rmarkdown}
 11 |   %\VignetteEncoding{UTF-8}
 12 | ---
 13 | ```{r, echo = FALSE}
 14 | knitr::opts_chunk$set(collapse = TRUE, comment = "#>")
 15 | ```
 16 | 
 17 | <!-- library(rmarkdown) ; render("format-data.Rmd",html_vignette(toc=TRUE))	-->
 18 | 
 19 | ## Format data
 20 | This document describes the format of the data used in 
 21 | a `conStruct` analysis.
 22 | 
 23 | For information on how to run a `conStruct` analysis 
 24 | after you've formatted your data, see the companion 
 25 | vignette on [how to run conStruct](run-conStruct.html).
 26 | 
 27 | Throughout the document, I'll be referring to the 
 28 | example dataset included with the package:
 29 | 
 30 | ```{r}
 31 | library(conStruct)
 32 | data(conStruct.data)
 33 | ```
 34 | 
 35 | ## conStruct data
 36 | 
 37 | There are 3 data objects you need to run a `conStruct` analysis:
 38 | 
 39 | 1. [allele frequency data]
 40 | 
 41 | 2. [geographic sampling coordinates]
 42 | 
 43 | 3. [geographic distance matrix]
 44 | 
 45 | In the sections below, I walk through the specific format required for each.
 46 | 
 47 | ### Allele frequency data
 48 | 
 49 | You must specify a matrix of allele frequency data for your samples.
 50 | (Make sure the data are of class `matrix`, not a `data.frame`.)
 51 | I assume that the data consist of bi-allelic SNPs.
 52 | At each locus, you pick an allele to count across all samples
 53 | (it doesn't matter whether it's randomly chosen or 
 54 | whether it's always the major or minor allele).
 55 | The frequency of the counted allele at a locus in a sample 
 56 | is the number of times the counted allele is observed at a locus 
 57 | divided by the number of chromosomes genotyped in that sample.
 58 | A sample can consist of a single individual or multiple individuals 
 59 | lumped together.
 60 | So, a successfully genotyped diploid individual heterozygous at a
 61 | particular locus would have an allele frequency of 0.5.
 62 | If the sample is a population of 13 haploids, of which 12 have the 
 63 | counted allele at a given locus, the frequency in that sample at that 
 64 | locus would be 12/13.
 65 | 
 66 | The matrix of allele frequencies should have one row per sample and 
 67 | one column per locus.  Missing data should be denoted with the value `NA`.
 68 | A small example allele frequency data matrix is shown below:
 69 | 
 70 | | Sample | Locus1 | Locus2 | Locus3 | Locus4 | Locus5 | Locus6 | Locus7 | Locus8 | Locus9 | Locus10 |
 71 | |:------|:----:|:----:|:----:|:----:|:----:|:----:|:----:|:----:|:----:|:----:|
 72 | |  Pop1  | 0   | 1    | NA | 0.8 | 0.7 | 0  | 0   | 0.6 | 0   | 1   | 
 73 | |  Pop2  | 0   | 1    | 1  | 0.9 | 1   | 1  | 0.1 | 0.6 | 0   | 0.9 |
 74 | |  Pop3  | 0.2 | 0.75 | 0  | 1   | 1   | NA | 1   | 1   | 0.1 | 1   |
 75 | |  Pop4  | 0.1 | 0.9  | 1  | 1   | 0.8 | 1  | 0.2 | 0.7 | 0.1 | 0.3 |
 76 | |  Pop5  | 0   | 1    | 1  | 1   | 1   | 1  | 0.3 | 0.9 | 0.3 | NA  |
 77 | 
 78 | A full example allele frequency data matrix is included in the 
 79 | `conStruct.data` object included with the package.
 80 | 
 81 | ```{r}
 82 | # load the example data object
 83 | data(conStruct.data)
 84 | 
 85 | # look at the allele frequency data 
 86 | #	for the first 5 populations and 10 loci
 87 | conStruct.data$allele.frequencies[1:5,1:10]
 88 | ```
 89 | 
 90 | ### Geographic sampling coordinates
 91 | 
 92 | You must specify a matrix of geographic sampling coordinates, 
 93 | which will be used for plotting the results of the analysis.
 94 | This should be a matrix with two columns that give the sample 
 95 | x-coordinates (longitude) and y-coordinates (latitude), 
 96 | respectively.  The order of rows of the matrix should be the same 
 97 | as the order of the rows of the allele frequencies matrix.
 98 | If you specify longitude and latitude, they should be in 
 99 | decimal degrees.
100 | 
101 | A full example sampling coordinate data matrix is included 
102 | in the `conStruct.data` object included with the package.
103 | 
104 | ```{r}
105 | # load the example data object
106 | data(conStruct.data)
107 | 
108 | # look at the geographic sampling coordinates 
109 | #	for the first 5 populations
110 | conStruct.data$coords[1:5,]
111 | ```
112 | 
113 | ### Geographic distance matrix 
114 | 
115 | If you choose to run the spatial model implemented in `conStruct`, 
116 | you must specify a matrix of pairwise geographic distance between 
117 | all samples.  If the coordinates of the samples are real locations 
118 | on Earth (as opposed to simulated coordinates), I recommend 
119 | calculating pairwise great-circle distance between sampling 
120 | coordinates (using, e.g., `geosphere::distm` or `geosphere::distGeo`).
121 | 
122 | The order of the samples in the geographic distance matrix should 
123 | match both that of the geographic coordinates and that of the 
124 | allele frequency data matrix, and all three matrices should have 
125 | the same number of rows.
126 | 
127 | The geographic distance matrix you specify should be the full 
128 | matrix (that is, not the upper- or lower-triangles), with values 
129 | of `0` on the diagonal entries.
130 | 
131 | A full example geographic distance matrix between all samples 
132 | is in the `conStruct.data` object included with the package.
133 | 
134 | ```{r}
135 | # load the example data object
136 | data(conStruct.data)
137 | 
138 | # look at pairwise geographic distance 
139 | #	between the first 5 populations
140 | conStruct.data$geoDist[1:5,1:5]
141 | ```
142 | 
143 | 
144 | # Other formats to conStruct
145 | 
146 | For convenience, I've written a function to convert population 
147 | genetic data in STRUCTURE format to that used in `conStruct`.
148 | 
149 | ## STRUCTURE to conStruct
150 | 
151 | The program STRUCTURE is one of the most widely used 
152 | methods for model-based clustering in population genetics. 
153 | Many existing programs, including plink (v1.9 and above) 
154 | and PgdSpider, convert data from diverse formats (including 
155 | .vcf files) into STRUCTURE format.  In this section of the 
156 | vignette, I walk through an example of converting a STRUCTURE 
157 | format data file into a `conStruct` format data file.
158 | 
159 | ### STRUCTURE data format
160 | More extensive documentation on STRUCTURE's data format 
161 | can be found in [the STRUCTURE manual](https://web.stanford.edu/group/pritchardlab/structure_software/release_versions/v2.3.4/structure_doc.pdf).
162 | An example STRUCTURE-formatted dataset is shown below:
163 | 
164 | |        |     | Loc1 | Loc1 | Loc2 | Loc2 | Loc3 | Loc3 | Loc4 | Loc4 |
165 | |:------:|:---:|:----:|:----:|:----:|:----:|:----:|:----:|:----:|:----:|
166 | |  Ind1  |  1  |  1   |  1   |  2   |  2   |   1  |  2   |  -9  |  -9  |
167 | |  Ind2  |  1  |  1   |  2   |  2   |  2   |   2  |  2   |   2  |   2  |
168 | |  Ind3  |  1  |  1   |  1   |  2   |  2   |   2  |  2   |   2  |   2  |
169 | |  Ind4  |  2  |  -9  |  -9  |  1   |  2   |   1  |  1   |   1  |   1  |
170 | |  Ind5  |  2  |  2   |  1   |  2   |  2   |   1  |  1   |   1  |   2  |
171 | 
172 | 	Example STRUCTURE format dataset, with one row per individual 
173 | 	and two columns per locus. The first column gives sample names, the 
174 | 	second refers to the sample locations, and the last 8 columns give 
175 | 	genotype data for four loci. The numbers in the genotype data refer to 
176 | 	the allele present at that locus: A1 = `1`, A2 = `2`, missing = `-9`.
177 | 
178 | To convert a STRUCTURE format file to `conStruct` format, 
179 | you can use the function `structure2conStruct`, included in the 
180 | `conStruct` package.
181 | 
182 | Below, I give an example of the usage of this function, assuming 
183 | that the file containing the STRUCTURE format data is called 
184 | "myStructureData.str", and that it's on the "desktop" directory 
185 | on the computer.  I also assume that the data are formatted as in 
186 | the table above, with the genotype data starting at the 3rd column 
187 | of the data matrix, and missing data denoted with a value of -9.
188 | 
189 | **Note that the STRUCTURE-format data must be a text file
190 | and there can be no lines of text before the data table begins.
191 | If your file is in an Excel spreadsheet, it can be converted to 
192 | a text file using Save As > File Format = Tab delimited Text 
193 | (.txt). If there are lines of text at the top of the document 
194 | before the data matrix begins, they must be deleted or specified 
195 | via the `start.samples` argument. In addition, 
196 | your data can only contain bi-allelic data. If you have loci with 
197 | more than two alleles, they should not be included in the dataset. 
198 | For more information on multi-allelic datasets, see the section on 
199 | [Microsatellites](#microsatellites) below.**
200 | 
201 | ```{r,eval=FALSE}
202 | conStruct.data <- structure2conStruct(infile = "~/Desktop/myStructureData.str",
203 | 									  onerowperind = TRUE,
204 | 									  start.loci = 3,
205 | 									  start.samples = 1,
206 | 				 					  missing.datum = -9,
207 | 									  outfile = "~/Desktop/myConStructData")
208 | 
209 | ```
210 | 
211 | An alternate STRUCTURE data format has two rows and one column per 
212 | diploid genotype:
213 | 
214 | |        |     | Loc1 | Loc2 | Loc3 | Loc4 | Loc5 | Loc6 | Loc7 | Loc8 |
215 | |:------:|:---:|:----:|:----:|:----:|:----:|:----:|:----:|:----:|:----:|
216 | |  Ind1  |  1  |  1   |  1   |  2   |  2   |   1  |  2   |   0  |   1  |
217 | |  Ind1  |  1  |  1   |  2   |  2   |  2   |   2  |  2   |   0  |   1  |
218 | |  Ind2  |  1  |  0   |  1   |  2   |  2   |   2  |  2   |   2  |   2  |
219 | |  Ind2  |  1  |  0   |  2   |  1   |  2   |   1  |  1   |   1  |   1  |
220 | |  Ind3  |  2  |  2   |  1   |  2   |  1   |   1  |  1   |   1  |   2  |
221 | |  Ind3  |  2  |  2   |  1   |  2   |  1   |   1  |  1   |   1  |   2  |
222 | |  Ind4  |  2  |  2   |  0   |  2   |  2   |   2  |  1   |   0  |   2  |
223 | |  Ind4  |  2  |  2   |  0   |  2   |  2   |   1  |  1   |   0  |   2  |
224 | 
225 | 	Example STRUCTURE format dataset, with two rows per individual 
226 | 	and one column per locus. The first column gives sample names, the 
227 | 	second refers to the sample locations, and the last 8 columns give 
228 | 	genotype data for 8 loci. The numbers in the genotype data refer to 
229 | 	the allele present at that locus: A1 = `1`, A2 = `2`, missing = `0`.
230 | 
231 | Data in this format can be converted to `conStruct` format using the 
232 | command below:
233 | 
234 | ```{r,eval=FALSE}
235 | conStruct.data <- structure2conStruct(infile = "~/Desktop/myStructureData.str",
236 | 									  onerowperind = FALSE,
237 | 									  start.loci = 3,
238 | 									  start.samples = 1,
239 | 				 					  missing.datum = 0,
240 | 									  outfile = "~/Desktop/myConStructData")
241 | 
242 | ```
243 | 
244 | Further documentation for this function is in its help page, 
245 | which you can go to using the command `help(structure2conStruct)`.
246 | 
247 | If you wish to group multiple individuals together into a single 
248 | sample for analysis you can collapse rows of the `conStruct` format 
249 | data file.  For example, if you have 12 individuals from 4 
250 | locations (3 individuals from each location), and you wish to 
251 | analyze the data treating populations at a sampling location 
252 | as the unit of analysis,  you can do something like the
253 | following:
254 | 
255 | ```{r,eval=FALSE}
256 | pop.data.matrix <- matrix(NA,nrow=length(unique(pop.index)),ncol=ncol(conStruct.data)) # one row per population (assumes pop.index labels are 1..n)
257 | for(i in 1:nrow(pop.data.matrix)){
258 | 	pop.data.matrix[i,] <- colMeans(	# per-locus mean frequency across the individuals in population i
259 | 								conStruct.data[
260 | 									which(pop.index==i),,
261 | 									drop=FALSE	# keep matrix shape even if a population has a single individual
262 | 								],na.rm=TRUE	# ignore individuals with missing genotypes at a locus
263 | 							)
264 | }
265 | ```
266 | where `pop.index` is a vector that gives the population of origin 
267 | for each of the individuals sampled.  In the example above, with 
268 | 12 individuals sampled from 4 locations (3 from each), 
269 | `pop.index` would be `c(1,1,1,2,2,2,3,3,3,4,4,4)`.
270 | 
271 | 
272 | ## Microsatellites
273 | 
274 | This method is designed to run on large datasets consisting of 
275 | bi-allelic SNPs.  If you have a microsatellite dataset and you 
276 | wish to run `conStruct`, the first consideration is whether you 
277 | have sufficient data.  You should have more loci than samples 
278 | in your data matrix (i.e., your data matrix should have more 
279 | columns than rows).
280 | 
281 | If that's the case, the second consideration is how to format your 
282 | microsat data so that you can run conStruct.  There are two standard 
283 | ways of "SNP-ifying" a microsat dataset.
284 | 
285 | The first is to lump all microsatellite alleles present at a locus 
286 | into two categories: "major" and "other".  The "major" allele is the 
287 | allele that occurs most frequently at a particular locus; all other 
288 | alleles are put in the "other" bin.  You then can create a dataset 
289 | in which you only report the frequency of the major allele, 
290 | effectively reducing the number of alleles per locus to 2. 
291 | This method has the disadvantage of throwing out data, but 
292 | acknowledges the simplex relationships between alleles at a locus 
293 | (the sum of the frequencies of all alleles at a locus must be 1).
294 | 
295 | The second approach, introduced by Cavalli-Sforza, is to split out 
296 | each allele at a locus into a separate pseudo-locus consisting of only 
297 | that allele.  That is, if you had 4 alleles present in the genotyped 
298 | sample at a particular locus, at frequencies {0.4,0.3,0.1,0.2}, 
299 | you would split those out into 4 separate columns in your data matrix 
300 | (pseudo-loci), with frequencies in the sampled population of 
301 | {0.4,0.3,0.1,0.2}.  This approach has the advantage of not throwing 
302 | data away, but does not acknowledge the inter-allele dependence 
303 | structure in frequencies, and therefore introduces some 
304 | pseudoreplication into the dataset. This pseudoreplication may make 
305 | you overconfident in your results, as the credible intervals on 
306 | parameter estimates may be artificially narrow.
307 | 
308 | I would recommend trying both approaches, and comparing the 
309 | estimates of pairwise relatedness you get from each to those 
310 | derived from the raw microsatellite data to see which best 
311 | recovers the patterns of relatedness in the data.  I also recommend 
312 | running `conStruct` on datasets SNP-ified using both approaches, 
313 | and comparing the results.


--------------------------------------------------------------------------------
/vignettes/model-comparison.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "How to compare conStruct model runs"
  3 | author: "Gideon Bradburd"
  4 | date: '`r format(Sys.Date(), "%B %d, %Y")`'
  5 | output:
  6 |   rmarkdown::html_vignette:
  7 |     toc: true
  8 | vignette: >
  9 |   %\VignetteIndexEntry{model-comparison}
 10 |   %\VignetteEngine{knitr::rmarkdown}
 11 |   %\VignetteEncoding{UTF-8}
 12 | ---
 13 | ```{r, echo = FALSE}
 14 | knitr::opts_chunk$set(collapse = TRUE, comment = "#>")
 15 | ```
 16 | 
 17 | <!-- library(rmarkdown) ; render("model-comparison.Rmd",html_vignette(toc=TRUE))	 -->
 18 | 
 19 | ## Model comparison
 20 | 
 21 | This document describes how to do model comparison 
 22 | for conStruct analyses. It assumes that you are already 
 23 | familiar with the [companion vignette for running conStruct](run-conStruct.html).
 24 | 
 25 | --------------------------------------------------------------------------
 26 | 
 27 | **Caveat user!**
 28 | Although it may sometimes be necessary to 
 29 | simplify the presentation of the results of several analyses by 
 30 | only showing the output from a single "best" run, it is important 
 31 | to remember several things:
 32 | 
 33 | 1. First, choice of best _K_ is always relative to the data at hand, 
 34 | and, as the amount of data increases, statistical support for larger 
 35 | _K_ will likely increase. With infinite data, the "best" value of _K_ 
 36 | would probably be the number of samples in the dataset.
 37 | 
 38 | 2. Although we think that conStruct is less likely to falsely ascribe 
 39 | continuous patterns of genetic variation to discrete population clusters 
 40 | than other existing methods, that does not mean that the discrete groups 
 41 | identified by conStruct are biologically real. See "A tutorial on how not 
 42 | to over-interpret STRUCTURE and ADMIXTURE bar plots" (Lawson, van Dorp, 
 43 | and Falush 2018) for a more in-depth discussion of these issues.
 44 | 
 45 | 3. Finally, as with all other statistical inference, output should be 
 46 | interpreted with care and a large grain of salt. We strongly recommend 
 47 | that users check whether individual runs seem to have performed well 
 48 | and whether results are consistent across independent runs. We also 
 49 | recommend that users compare output across runs with different values 
 50 | of _K_ to see which samples split out into their own layers in the 
 51 | different analyses.
 52 | 
 53 | --------------------------------------------------------------------------
 54 | 
 55 | So, you've run two or more conStruct analyses and you want 
 56 | to compare them to see which might be the best model to 
 57 | describe the variation in your data. There are two methods in 
 58 | the conStruct package for doing model comparison: 
 59 | 
 60 | 1. Cross-validation
 61 | 
 62 | 2. Calculating layer contributions
 63 | 
 64 | Below, I describe both options and give examples for how to 
 65 | use their associated functions in the `conStruct` package 
 66 | and visualize the output they generate.
 67 | 
 68 | Note that if you are interested in visually comparing two 
 69 | independent `conStruct` runs, you can use the function 
 70 | `compare.two.runs`, the documentation for which can be found with the  
 71 | command `help(compare.two.runs)`. This function is further described 
 72 | in the [companion vignette for visualizing results](visualize-results.html).
 73 | 
 74 | 
 75 | ## Cross-validation
 76 | 
 77 | Cross-validation is a tool for testing how the results of an analysis 
 78 | will generalize to an independent dataset.
 79 | 
 80 | ### How it works
 81 | 
 82 | In general, the more parameters included in the model, the better the 
 83 | fit to the data. To determine an appropriate level of parameterization 
 84 | for a given dataset, we can use cross-validation.  In `conStruct`, 
 85 | this works by fitting a model to a "training" subset of the data, then 
 86 | testing the fit to the remaining "testing" subset. If the parameter 
 87 | values estimated from the training data parameterize a model that 
 88 | describes the testing data well, the predictive accuracy of the model 
 89 | is good. If the model is overparameterized, it will fit the training data 
 90 | very well, but may not fit the testing data any better than (or even as 
 91 | well as) a less parameter-rich model. By fitting a given model to many 
 92 | training partitions and testing its fit to the accompanying testing 
 93 | partitions, we can get a mean predictive accuracy for each model. We can 
 94 | then compare predictive accuracies across models to determine which model 
 95 | has the best goodness-of-fit without overfitting.
 96 | 
 97 | ### How to run a cross-validation analysis
 98 | 
 99 | To run a cross-validation analysis in `conStruct`, you can use the 
100 | `x.validation` function.
101 | 
102 | ```{r,eval=FALSE}
103 | # load the library
104 | library(conStruct)
105 | 
106 | # load the example dataset
107 | data(conStruct.data)
108 | 
109 | # to run a cross-validation analysis
110 | # you have to specify:
111 | #		the numbers of layers you want to compare (K)
112 | #		the allele frequency data (freqs)
113 | #		the geographic distance matrix (geoDist)
114 | #		the sampling coordinates (coords)
115 | 
116 | my.xvals <- x.validation(train.prop = 0.9,
117 | 				  		 n.reps = 8,
118 | 				  		 K = 1:3,
119 | 				  		 freqs = conStruct.data$allele.frequencies,
120 | 				  		 data.partitions = NULL,
121 | 				  		 geoDist = conStruct.data$geoDist,
122 | 				  		 coords = conStruct.data$coords,
123 | 				  		 prefix = "example",
124 | 				  		 n.iter = 1e3,
125 | 				  		 make.figs = TRUE,
126 | 				  		 save.files = FALSE,
127 | 				  		 parallel = FALSE,
128 | 				  		 n.nodes = NULL)
129 | ```
130 | 
131 | In the example above, we ran a cross-validation analysis with 8 
132 | cross-validation replicates, comparing the spatial and nonspatial 
133 | models with _K_ = 1 through 3 for each replicate. Each training 
134 | partition (one per replicate) was created by randomly subsampling 
135 | 90% of the total number of loci. This function call will run a total 
136 | of 48 `conStruct` analyses (both models at _K_ = 1:3 for each of 8 replicates), 
137 | each for 1,000 MCMC iterations (`n.iter` = 1000), which will 
138 | generate a lot of output figures and files. To avoid these piling up, 
139 | we can set the `make.figs` and `save.files` options to `FALSE`. 
140 | However, as with all analyses, it's important to make 
141 | sure these runs are mixing well, so we suggest checking the output 
142 | figures to make sure they look good.
143 | 
144 | The `x.validation` function returns a list containing the results of 
145 | the cross-validation analysis, standardized within each replicate. 
146 | The model with the best predictive accuracy within each replicate has 
147 | a standardized score of 0. Smaller (i.e., more negative) values 
148 | indicate worse model fit to the testing data in that replicate.
149 | 
150 | For convenience, the function also writes a table of results to a 
151 | text file for both the spatial model (`prefix_sp_xval_results.txt`) 
152 | and the nonspatial model (`prefix_nsp_xval_results.txt`). Each 
153 | column in the table gives the results for a single cross-validation 
154 | replicate over evaluated values of _K_, and each row gives the 
155 | results of a given value of _K_ across replicates.
156 | 
157 | The arguments `parallel` and `n.nodes` can be used to 
158 | parallelize the cross-validation analysis. These are described 
159 | in further detail below in [Parallelization](#parallelization). 
160 | The argument `data.partitions` allows the user to specify their 
161 | own training/testing data partitions to be used across replicates. 
162 | This option is described further below in 
163 | [Specifying data partitions](#specifying-data-partitions).
164 | 
165 | ### Visualizing results
166 | 
167 | To visualize the output of a cross-validation analysis, you can 
168 | use either the output list or the text files. Examples of both 
169 | are given below.
170 | 
171 | ```{r, eval=FALSE}
172 | # read in results from text files
173 | 
174 | sp.results <- as.matrix(
175 | 				read.table("example_sp_xval_results.txt",
176 | 						   header = TRUE,
177 | 						   stringsAsFactors = FALSE)
178 | 			   )
179 | nsp.results <- as.matrix(
180 | 				read.table("example_nsp_xval_results.txt",
181 | 						   header = TRUE,
182 | 						   stringsAsFactors = FALSE)
183 | 			   )
184 | 
185 | # or, format results from the output list
186 | sp.results <- Reduce("cbind",lapply(my.xvals,function(x){unlist(x$sp)}),init=NULL)
187 | nsp.results <- Reduce("cbind",lapply(my.xvals,function(x){unlist(x$nsp)}),init=NULL)
188 | ```
189 | 
190 | ```{r,echo=FALSE}
191 | 	sp.results <- matrix(c(-1.201, 0.000, -1.819, -4.579, -5.730, 0.000, 0.000, -5.346, -1.114, -7.315, -8.853, 0.000, 0.000, -6.125, -3.602, 0.000, -11.155, -5.506, -3.650, 0.000, -2.909, 0.000, -4.799, -9.890),nrow=3,ncol=8)
192 | 	row.names(sp.results) <- paste0("K=",1:3)
193 | 	nsp.results <- matrix(c(-685.108, -416.726, -141.223, -684.230, -418.651, -148.589, -679.392, -404.326, -147.367, -682.996, -415.190, -147.767, -680.044, -411.200, -147.288, -677.238, -410.037, -149.066, -679.914, -404.820, -145.464, -672.501, -414.927, -145.073),nrow=3,ncol=8)
194 | 	row.names(nsp.results) <- paste0("K=",1:3)
195 | ```
196 | 
197 | The results look like this:
198 | ```{r,eval=TRUE,echo=FALSE}
199 | knitr::kable(sp.results,row.names=TRUE,col.names=paste0("rep",1:8),caption="Spatial cross-validation results")
200 | ```
201 | 
202 | A quick and dirty plot of the output is given below:
203 | 
204 | ```{r, eval=TRUE, fig.width=8,fig.height=5}
205 | 
206 | # first, get the 95% confidence intervals for the spatial and nonspatial
207 | #	models over values of K (mean +/- 1.96 times the standard error,
208 | #	where the standard error is sd/sqrt(number of replicates))
209 | sp.CIs <- apply(sp.results,1,function(x){mean(x) + c(-1.96,1.96) * sd(x)/sqrt(length(x))})
210 | nsp.CIs <- apply(nsp.results,1,function(x){mean(x) + c(-1.96,1.96) * sd(x)/sqrt(length(x))})
211 | 
212 | # then, plot cross-validation results for K=1:3 with 8 replicates
213 | 
214 | par(mfrow=c(1,2))
215 | plot(rowMeans(sp.results),
216 | 	 pch=19,col="blue",
217 | 	 ylab="predictive accuracy",xlab="values of K",
218 | 	 ylim=range(sp.results,nsp.results),
219 | 	 main="cross-validation results")
220 | 	points(rowMeans(nsp.results),col="green",pch=19)
221 | 
222 | # finally, visualize results for the spatial model
223 | #	separately with its confidence interval bars
224 | #
225 | # note that you could do the same with the nonspatial model, 
226 | #	but the confidence intervals don't really show up 
227 | #	because the differences between predictive accuracies
228 | #	across values of K are so large.
229 | 
230 | plot(rowMeans(sp.results),
231 | 	 pch=19,col="blue",
232 | 	 ylab="predictive accuracy",xlab="values of K",
233 | 	 ylim=range(sp.CIs),
234 | 	 main="spatial cross-validation results")
235 | segments(x0 = 1:nrow(sp.results),
236 | 		 y0 = sp.CIs[1,],
237 | 		 x1 = 1:nrow(sp.results),
238 | 		 y1 = sp.CIs[2,],
239 | 		 col = "blue",lwd=2)
240 | ```
241 | 
242 | 
243 | ### Interpreting results
244 | 
245 | The model with the highest mean predictive accuracy is the "best" model, 
246 | but, as noted above, we caution against overinterpretation of these 
247 | cross-validation results. If a significance test for the "best" number 
248 | of layers is required, you can use a t-test comparing cross-validation 
249 | scores across values of K, paired by replicate. E.g., 
250 | `t.test(sp.results[2,],sp.results[1,],paired=TRUE,alternative="greater")`.
251 | 
252 | I would interpret the results above as strong evidence that the spatial 
253 | model is preferred over the nonspatial model over all tested values of _K_ 
254 | (indicating that isolation by distance is probably a feature of the data).
255 | The cross-validation analyses also strongly support the conclusion that a 
256 | single spatial layer (_K_ = 1) is sufficient to describe the variation in 
257 | the data.
258 | 
259 | A final caveat of this section is that, with sufficient data, it is possible 
260 | to get strong statistical support for layers that contribute little to overall 
261 | patterns of covariance. Therefore, it's good to interpret cross-validation 
262 | results alongside calculated layer contributions (discussed further in 
263 | [Layer Contributions](#layer-contributions)).
264 | 
265 | ### Parallelization
266 | 
267 | Because each cross-validation replicate consists of several analyses (one for 
268 | each specified value of _K_), and because several cross-validation replicates 
269 | are required for model comparison, a single call to `x.validation` can take a 
270 | long time. To reduce computational burden, we have introduced an option for 
271 | users to parallelize their analyses across replicates. The simplest way to 
272 | parallelize is to use the `parallel` and `n.nodes` arguments of the 
273 | `x.validation` function, which we illustrate using the same `x.validation` call 
274 | given above in [How to run a cross-validation analysis](#how-to-run-a-cross-validation-analysis):
275 | 
276 | ```{r,eval=FALSE}
277 | 
278 | # load the example dataset
279 | data(conStruct.data)
280 | 
281 | # to run a cross-validation analysis
282 | # you have to specify:
283 | #		the numbers of layers you want to compare (K)
284 | #		the allele frequency data (freqs)
285 | #		the geographic distance matrix (geoDist)
286 | #		the sampling coordinates (coords)
287 | 
288 | # in addition, here we run our analysis parallelized 
289 | #	across all replicates using 4 nodes
290 | 
291 | my.xvals <- x.validation(train.prop = 0.9,
292 | 				  		 n.reps = 8,
293 | 				  		 K = 1:3,
294 | 				  		 freqs = conStruct.data$allele.frequencies,
295 | 				  		 data.partitions = NULL,
296 | 				  		 geoDist = conStruct.data$geoDist,
297 | 				  		 coords = conStruct.data$coords,
298 | 				  		 prefix = "example",
299 | 				  		 n.iter = 1e3,
300 | 				  		 make.figs = TRUE,
301 | 				  		 save.files = FALSE,
302 | 				  		 parallel = TRUE,
303 | 				  		 n.nodes = 4)
304 | 
305 | ```
306 | 
307 | The example above should run ~4 times as fast as cross-validation with the 
308 | same number of replicates not run in parallel. At the end of the cross-validation 
309 | analysis, the parallel workers generated at the beginning of the run will be 
310 | terminated.
311 | 
312 | 
313 | To facilitate greater flexibility in parallelization, users can also specify 
314 | their own parallelization scheme before running a cross-validation analysis, 
315 | in which case they should simply set `parallel=TRUE` and make sure that `n.nodes` 
316 | is equal to the number of nodes they've set up.  If you've set up your own 
317 | parallelization beforehand (as in the example that follows), `x.validation` will use 
318 | that set-up rather than initializing one itself. E.g., 
319 | 
320 | ```{r,eval=FALSE}
321 | 
322 | library(parallel)
323 | library(foreach)
324 | library(doParallel)
325 | 
326 | cl <- makeCluster(4,type="FORK")
327 | registerDoParallel(cl)
328 | 
329 | my.xvals <- x.validation(train.prop = 0.9,
330 | 				  		 n.reps = 8,
331 | 				  		 K = 1:3,
332 | 				  		 freqs = conStruct.data$allele.frequencies,
333 | 				  		 data.partitions = NULL,
334 | 				  		 geoDist = conStruct.data$geoDist,
335 | 				  		 coords = conStruct.data$coords,
336 | 				  		 prefix = "example",
337 | 				  		 n.iter = 1e3,
338 | 				  		 make.figs = TRUE,
339 | 				  		 save.files = FALSE,
340 | 				  		 parallel = TRUE,
341 | 				  		 n.nodes = 4)
342 | 
343 | stopCluster(cl)
344 | 
345 | ```
346 | 
347 | Note that if you have prespecified a parallelization scheme, you 
348 | are responsible for ending the parallelization yourself, as shown 
349 | above with the `stopCluster()` call. Linux and Mac users may wish to 
350 | use `makeCluster(N,type="FORK")`, as it does better with memory 
351 | usage. Windows users should use the default PSOCK cluster 
352 | (e.g., `makeCluster(N,type="PSOCK")`).
353 | 
354 | ## Layer contributions
355 | 
356 | Layer contributions offer a second metric users can employ to compare models 
357 | with different numbers of layers.
358 | 
359 | ### How it works
360 | 
361 | In a `conStruct` run, users are estimating a parametric covariance matrix to 
362 | fit their sample allelic covariance. Each layer in the model contributes to 
363 | that parametric covariance, and those contributions can be calculated and 
364 | compared. If there is a layer that no samples draw appreciable admixture from, 
365 | it will contribute almost nothing to overall covariance, and is therefore of 
366 | little biological importance in the model.
367 | 
368 | By comparing layer contributions across different `conStruct` analyses run 
369 | with different values of _K_, users can identify the point at which layers 
370 | included in the analysis contribute little to overall covariance, and pick 
371 | a "best" value of _K_ below that point.
372 | 
373 | ### How to calculate layer contributions
374 | 
375 | Layer contributions are calculated from the output of a standard
376 | `conStruct` analysis using the function `calculate.layer.contribution`.
377 | 
378 | ```{r,eval=FALSE}
379 | 
380 | # Loop through output files generated by conStruct 
381 | #	runs with K=1 through 5 and calculate the 
382 | #	layer contributions for each layer in each run	
383 | 
384 | layer.contributions <- matrix(NA,nrow=5,ncol=5)
385 | 
386 | # load the conStruct.results.Robj and data.block.Robj
387 | #	files saved at the end of a conStruct run
388 | load("K1_sp_conStruct.results.Robj")
389 | load("K1_sp_data.block.Robj")
390 | 
391 | # calculate layer contributions
392 | layer.contributions[,1] <- c(calculate.layer.contribution(conStruct.results[[1]],data.block),rep(0,4))
393 | tmp <- conStruct.results[[1]]$MAP$admix.proportions
394 | 
395 | for(i in 2:5){
396 | 	# load the conStruct.results.Robj and data.block.Robj
397 | 	#	files saved at the end of a conStruct run
398 | 	load(sprintf("K%s_sp_conStruct.results.Robj",i))
399 | 	load(sprintf("K%s_sp_data.block.Robj",i))
400 | 	
401 | 	# match layers up across runs to keep plotting colors consistent
402 | 	#	for the same layers in different runs
403 | 	tmp.order <- match.layers.x.runs(tmp,conStruct.results[[1]]$MAP$admix.proportions)	
404 | 
405 | 	# calculate layer contributions
406 | 	layer.contributions[,i] <- c(calculate.layer.contribution(conStruct.results=conStruct.results[[1]],
407 | 															 data.block=data.block,
408 | 															 layer.order=tmp.order),
409 | 									rep(0,5-i))
410 | 	tmp <- conStruct.results[[1]]$MAP$admix.proportions[,tmp.order]
411 | }
412 | ```
413 | 
414 | Note that, because layers can label switch across runs, the example 
415 | above uses the `match.layers.x.runs` function to determine which 
416 | layers correspond to each other across analyses run with different 
417 | values of _K_.
418 | 
419 | ### Visualizing results
420 | 
421 | ```{r, echo=FALSE}
422 | 	layer.contributions <- matrix(c(1.000, 0.000, 0.000, 0.000, 0.000, 0.680, 0.320, 0.000, 0.000, 0.000, 0.682, 0.318, 0.000, 0.000, 0.000, 0.678, 0.322, 0.000, 0.000, 0.000, 0.684, 0.315, 0.000, 0.000, 0.000),nrow=5,ncol=5)
423 | 	row.names(layer.contributions) <- paste0("Layer_",1:5)
424 | ```
425 | 
426 | The table of layer contributions looks like this:
427 | ```{r, eval=TRUE,echo=FALSE}
428 | knitr::kable(layer.contributions,row.names=TRUE,col.names=paste0("K=",1:5),caption="Contributions for each layer for runs done with K=1 through 5")
429 | ```
430 | 
431 | Layer contributions can be easily plotted across values of _K_ using
432 | a stacked barplot:
433 | 
434 | ```{r, eval=TRUE,fig.width=5,fig.height=5}
435 | barplot(height=layer.contributions,	# stacked bars: one bar per column (run), one segment per layer
436 | 		names.arg=paste0("K=",1:5),
437 | 		ylab="layer contributions",
438 | 		xlab="",
439 | 		col=c("blue", "red", "goldenrod1", "forestgreen", "darkorchid1"))	# one color per layer (row)
440 | ```
441 | 
442 | In this case, the contributions of layers beyond _K_ = 2 are so small 
443 | that they don't even show up on the barplot.
444 | 
445 | ### Interpreting results
446 | 
447 | If a layer in a given model contributes very little to overall covariance, 
448 | it is unlikely to have much biological significance. If you run `conStruct` 
449 | analyses across values of _K_, and see that, after a certain value of _K_,
450 | no additional clusters contribute much to overall covariance, that may be 
451 | a good indication that that value of _K_ (or at least, no larger value of 
452 | _K_) is best for describing the variation in your data. For example, in 
453 | the layer contributions plotted above in [Visualizing results](#visualizing-results-1),
454 | additional layers after _K_ = 2 have negligible layer contributions, so 
455 | we might reasonably conclude that the best value of _K_ for describing our 
456 | data is no greater than 2.
457 | 
458 | Users can also set some threshold (e.g., 0.01) below which they count a layer's 
459 | contribution as negligible, and, by setting this threshold _a priori_, can 
460 | use layer contributions as a metric for model selection.
461 | 
462 | ## Cross-validation vs. Layer contribution
463 | 
464 | With sufficient data, a cross-validation analysis may indicate 
465 | strong support for layers that each contribute very little to overall 
466 | covariance. In such a case, the input from cross-validation and 
467 | layer contributions are at odds, with the former arguing for the inclusion 
468 | of more layers, and the latter arguing against. What to do with that situation?
469 | 
470 | Well, the specifics will vary from dataset to dataset, but we encourage users 
471 | to distinguish between statistical and biological significance, and not get too 
472 | caught up in the first at the expense of the second.
473 | 
474 | ## Advanced options
475 | 
476 | Below, we include information on advanced topics that will not be of use 
477 | for the average user.
478 | 
479 | ### Specifying data partitions
480 | 
481 | In many cases, there will be no genome assembly available for the focal species in a 
482 | `conStruct` analysis, and the genotyped loci will have no known genomic location. 
483 | When genomic positions are known, advanced users may wish to specify their own data 
484 | partitions to maximize the efficacy of the cross-validation procedure. This is 
485 | because the cross-validation results are most trustworthy when the testing data 
486 | partition is independent from but still representative of the training data. 
487 | Because coalescent histories tend to be shared by adjacent loci on the genome, 
488 | if neighboring loci are split (one in the training dataset, the other in the 
489 | testing dataset), the training/testing partitions might not be truly independent. 
490 | In this case, the model parameterized by the training dataset will be fitting 
491 | coalescent "noise" that's also present in the testing dataset, the most likely 
492 | result of which is overfitting. Another concern is that different regions of the 
493 | genome have different properties (e.g., centromeres vs. non-centromeric DNA), so 
494 | to keep the training and testing partitions representative of each other, it's 
495 | best to try to match by genomic properties.
496 | 
497 | **Our recommendation**: If genomic position and LD information is available, we 
498 | recommend divvying the genome up into blocks of length equal to twice the scale 
499 | of LD, then randomly assigning 90% of those blocks to a training partition, and 
500 | the remaining 10% to the testing partition for each replicate.
501 | 
502 | To facilitate this type of custom data partitioning, users can specify their own 
503 | data partitions for a `x.validation` analysis using the `data.partitions` argument. 
504 | There is no function in the package for generating a custom data partitions 
505 | object, as the details of the data format and specifics of the desired partitioning 
506 | scheme will vary from user to user and genome to genome. Instead, we describe the 
507 | structure of the `data.partitions` object in detail below so that users can create 
508 | it for themselves.
509 | 
510 | The `data.partitions` object must be a `list` of length `n.reps` as specified in 
511 | the `x.validation` function (one partitioning scheme per cross-validation replicate). 
512 | Each of the `n.reps` elements of the list must contain two elements, one named 
513 | `training` and one named `testing`, which contain the training and testing data 
514 | partitions, respectively. Each training and testing element of the list must contain 
515 | three named elements: `data`, `n.loci`, and `varMeanFreqs`.
516 | 
517 | The `data` element contains the allelic covariance matrix for that partition of the 
518 | data; `n.loci` gives the number of loci in that partition; and `varMeanFreqs` gives 
519 | the variance in mean allele frequencies across loci (averaged over choice of counted 
520 | allele).
521 | 
522 | Peeking under the hood of how conStruct creates this `data.partitions` object when none 
523 | is specified, the relevant functions are:
524 | 
525 | * conStruct:::make.data.partitions
526 | 
527 |     * conStruct:::xval.process.data
528 | 
529 |         + conStruct:::calc.covariance
530 | 
531 |         + conStruct:::get.var.mean.freqs
532 | 
533 | Users attempting to specify their own `data.partitions` object are encouraged to use 
534 | these functions as guides for what operations are being carried out to generate the 
535 | data partitions list for a cross-validation analysis. The structure of an example 
536 | `data.partitions` object with 3 partitioning schemes (for 3 cross-validation replicates) 
537 | is shown below:
538 | 
539 | ```{r,echo=FALSE}
540 | library(conStruct)
541 | data(conStruct.data)	# example dataset included with the package
542 | data.partitions <- conStruct:::make.data.partitions(3,conStruct.data$allele.frequencies,0.9)	# unexported helper (:::); 3 partitioning schemes; 0.9 is presumably the training proportion — confirm against the helper's signature
543 | ```
544 | 
545 | ```{r,eval=TRUE}
546 | # In this dataset, there are 36 samples and 1e4 loci total, 
547 | #	and the data partitions are generated 
548 | #		with a 90% training 10% testing split
549 | 
550 | str(data.partitions,max.level=3,give.attr=FALSE,vec.len=3)
551 | ```
552 | 


--------------------------------------------------------------------------------
/vignettes/run-conStruct.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "How to run a conStruct analysis"
  3 | author: "Gideon Bradburd"
  4 | date: '`r format(Sys.Date(), "%B %d, %Y")`'
  5 | output:
  6 |   rmarkdown::html_vignette:
  7 |     toc: true
  8 | vignette: >
  9 |   %\VignetteIndexEntry{run-conStruct}
 10 |   %\VignetteEngine{knitr::rmarkdown}
 11 |   %\VignetteEncoding{UTF-8}
 12 | ---
 13 | ```{r, echo = FALSE}
 14 | knitr::opts_chunk$set(collapse = TRUE, comment = "#>")
 15 | ```
 16 | 
 17 | <!-- library(rmarkdown) ; render("run-conStruct.Rmd",html_vignette(toc=TRUE))	-->
 18 | 
 19 | ## Run conStruct 
 20 | This document describes how to run a `conStruct` analysis.
 21 | 
 22 | Throughout the document, I'll be referring to the 
 23 | example dataset included with the package:
 24 | 
 25 | ```{r}
 26 | library(conStruct)
 27 | data(conStruct.data)
 28 | ```
 29 | 
 30 | The format for the data you need to run a `conStruct` 
 31 | analysis is covered in a separate vignette in this 
 32 | package. You can view that vignette using the command: 
 33 | `vignette(package="conStruct",topic="format-data")`.
 34 | If you've already run `conStruct` and you want more 
 35 | information on how to visualize the results, please see 
 36 | the companion vignette for [visualizing results](visualize-results.html).
 37 | If you've run several `conStruct` analyses and want to 
 38 | compare them, please see the companion vignette for 
 39 | [model comparison](model-comparison.html).
 40 | 
 41 | ## Running a conStruct analysis
 42 | 
 43 | The function you use to run a `conStruct` analysis 
 44 | is called, fittingly, `conStruct`. This vignette 
 45 | walks through the use of this function in detail, 
 46 | and should be used in concert with the documentation 
 47 | for the function, which can be viewed using the command: 
 48 | `help(conStruct)`.
 49 | 
 50 | ### Spatial Model
 51 | 
 52 | The default model in the conStruct package is 
 53 | the spatial model, which allows relatedness 
 54 | within a layer to decay as a function of the distance 
 55 | between samples drawing ancestry from that layer.
 56 | 
 57 | Below, I show an example of how to run a `conStruct` 
 58 | analysis using the spatial model.
 59 | 
 60 | ```{r,eval=FALSE}
 61 | # load the example dataset
 62 | data(conStruct.data)
 63 | 
 64 | # run a conStruct analysis
 65 | 
 66 | #	you have to specify:
 67 | #		the number of layers (K)
 68 | #		the allele frequency data (freqs)
 69 | #		the geographic distance matrix (geoDist)
 70 | #		the sampling coordinates (coords)
 71 | 
 72 | my.run <- conStruct(spatial = TRUE, 
 73 | 	 	  			K = 3, 
 74 | 				  	freqs = conStruct.data$allele.frequencies,
 75 | 				  	geoDist = conStruct.data$geoDist, 
 76 | 				  	coords = conStruct.data$coords,
 77 | 				  	prefix = "spK3")
 78 | ```
 79 | 
 80 | The function call above runs `conStruct`'s spatial model 
 81 | using 3 discrete layers. All output files will have "spK3" 
 82 | prepended to their names. To vary the number of layers 
 83 | in the spatial model, you need only change the value of `K`.
 84 | The example dataset `conStruct.data` is organized into an R list 
 85 | for convenience, but users can provide their data to the function 
 86 | any way they see fit, so long as each argument is properly formatted 
 87 | (e.g., `freqs` is a matrix, `prefix` is a character vector, etc.).
 88 | 
 89 | ### Nonspatial Model
 90 | 
 91 | You can also run a nonspatial model using the `conStruct` 
 92 | function, in which relatedness within each of the K clusters 
 93 | does not decay with distance.  This model is analogous to 
 94 | the model implemented in STRUCTURE.
 95 | 
 96 | Below, I show an example of how to run a `conStruct` 
 97 | analysis using the nonspatial model.
 98 | 
 99 | ```{r,eval=FALSE}
100 | # load the example dataset
101 | data(conStruct.data)
102 | 
103 | # run a conStruct analysis
104 | 
105 | #	you have to specify:
106 | #		the number of layers (K)
107 | #		the allele frequency data (freqs)
108 | #		the sampling coordinates (coords)
109 | #
110 | #	if you're running the nonspatial model, 
111 | #		you do not have to specify 
112 | #		the geographic distance matrix (geoDist)
113 | 
114 | my.run <- conStruct(spatial = FALSE, 
115 | 				    K = 2, 
116 | 				    freqs = conStruct.data$allele.frequencies, 
117 | 				    geoDist = NULL, 
118 | 				    coords = conStruct.data$coords,
119 | 				    prefix = "nspK2")
120 | ```
121 | 
122 | The function call above runs `conStruct`'s nonspatial model 
123 | using 2 discrete layers. All output files will have "nspK2" 
124 | prepended to their names. As with the spatial model, if you 
125 | want to vary the number of layers, you change the value of `K`.
126 | 
127 | ### Other function options
128 | 
129 | The `conStruct` function has other arguments that 
130 | have default values, for which you don't have to 
131 | specify any values.  However, you may wish to alter 
132 | these defaults, so we describe them below:
133 | 
134 | The full function call for the spatial model with 3 layers is:
135 | 
136 | ```{r,eval=FALSE}
137 | my.run <- conStruct(spatial = TRUE, 
138 | 					K = 3, 
139 | 					freqs = conStruct.data$allele.frequencies, 
140 | 					geoDist = conStruct.data$geoDist, 
141 | 					coords = conStruct.data$coords, 
142 | 					prefix = "spK3", 
143 | 					n.chains = 1, 
144 | 					n.iter = 1000, 
145 | 					make.figs = TRUE, 
146 | 					save.files = TRUE)
147 | ```
148 | 
149 | The other options are `n.chains`, `n.iter`, `make.figs`, `save.files`; 
150 | I describe each of them below:
151 | 
152 | * `n.chains` - gives the number of independent MCMCs to be run for this model. 
153 | The default is `1`, but you may wish to run multiple independent chains to 
154 | make sure you get consistent results across them.
155 | 
156 | * `n.iter` - gives the number of iterations per MCMC. The default is `1000`.
157 | If you have more genotyped samples, you generally need more iterations 
158 | to describe the posterior probability surface well. There are no 
159 | hard and fast rules on how many iterations you should run. 
160 | I **strongly recommend** examining model output to assess convergence; 
161 | if you don't see good convergence, you can run the analysis using a 
162 | larger number of iterations.
163 | 
164 | * `make.figs` - determines whether or not to automatically make figures 
165 | describing the results. The default is `TRUE`. However, if you're running 
166 | lots of independent analyses, or if you're running on a cluster with limited 
167 | disk space, you may wish to set this option to `FALSE` and make the figures 
168 | later on your own.
169 | 
170 | * `save.files` - determines whether or not to automatically save all output 
171 | files.  The default is `TRUE`.  However, again, there may be circumstances 
172 | in which you don't want to automatically save these files, and instead want 
173 | to capture the results of the analysis, which are the returned value of the 
174 | `conStruct` function call.
175 | 
176 | ## Model diagnosis
177 | 
178 | As with any statistical model, it is important to assess the 
179 | performance of the inference method. Below, I briefly walk 
180 | through some of the important things to look out for when 
181 | you run a `conStruct` analysis.
182 | 
183 | ### MCMC diagnosis
184 | 
185 | Although the Hamiltonian Monte Carlo algorithm implemented in STAN 
186 | is quite robust, it's always a good idea to look at the results of 
187 | the analysis to diagnose MCMC performance. If the chain is mixing 
188 | well, the trace plots for the different parameters and the posterior 
189 | probability will resemble a “fuzzy caterpillar,” as in panel (a) 
190 | below. If the trace plots have not plateaued (as in panel (b)), 
191 | it is an indication that the chain has not converged on the 
192 | stationary distribution, and that it should be run longer. 
193 | If the chain appears to be bouncing between two or more modes, 
194 | as in panel (c) below, that may be an indication of a multi-modal 
195 | likelihood surface, with multiple points in parameter space that 
196 | have equal or similar posterior probability given the data. 
197 | 
198 | 
199 | ```{r,echo=FALSE,fig.width=7,fig.height=2.7}
200 | par(mfrow=c(1,3),mar=c(4,3,1.5,1))	# three panels side by side; no seed is set in this chunk, so the simulated traces differ on every build
201 | 	plot(c(0,rnorm(500,1,0.2)),type='l',	# (a) starts at 0, then draws scattered around a constant mean of 1
202 | 		xlab="",yaxt='n',ylab="")
203 | 		mtext(side=2,text="parameter estimate",padj=-1)	# shared y-axis label, drawn on the first panel only
204 | 		mtext(side=3,text="(a) looks good",padj=-0.1)
205 | 	plot(c(0,rnorm(500,c(log(seq(0,1,length.out=500))),0.2)),type='l',	# (b) mean follows log(seq(0,1)), so it keeps rising; NOTE(review): log(0) is -Inf for the first draw — confirm that non-finite point is intended
206 | 		xlab="",yaxt='n',ylab="")
207 | 		mtext(side=1,text="mcmc iterations",padj=2.6)	# shared x-axis label, drawn under the middle panel only
208 | 		mtext(side=3,text="(b) hasn't converged",padj=-0.1)
209 | 	plot(c(0,rnorm(150,1,0.2),rnorm(200,3,0.2),rnorm(150,1,0.2)),type='l',	# (c) mean switches 1 -> 3 -> 1 to mimic a chain moving between two modes
210 | 		xlab="",yaxt='n',ylab="")
211 | 		mtext(side=3,text="(c) multi-modal",padj=-0.1)
212 | ```
213 | 
214 | ### Independent runs
215 | 
216 | Above, I highlight the importance of evaluating performance of 
217 | individual MCMC runs, but it's also a good idea to run multiple, 
218 | independent analyses and compare results across them.  Ideally, 
219 | multiple independent runs converge on the same stationary distribution, 
220 | with similar parameter estimates and posterior probabilities.  
221 | If different runs give very different results, you can check whether 
222 | there's a mixing problem or a truly multi-modal posterior probability 
223 | surface by comparing the values of the posterior probability across 
224 | runs. If two runs have very different parameter estimates but their 
225 | posterior probability distributions are indistinguishable, that's an 
226 | indication of multi-modality. If multiple runs show different parameter 
227 | estimates, but the posterior probabilities for a subset of the runs that 
228 | show consistent results are higher than those of a different subset that 
229 | gives conflicting results, that indicates that some of the runs are not 
230 | mixing well.
231 | 
232 | ### Missing data
233 | 
234 | Missing data can affect the sample allelic covariance, and 
235 | therefore the results of a `conStruct` analysis. This is 
236 | especially the case when the distribution of missing data is 
237 | biased - that is, when individuals of particular ancestry are 
238 | more likely to be missing data at a locus. This pattern 
239 | is expected when, for example, allelic dropout occurs in a 
240 | RADseq dataset.
241 | 
242 | In some empirical datasets with missing data that I used to 
243 | test `conStruct`, I observed a phenomenon of "homogeneous 
244 | minimum layer membership," (HMLM) in which all samples had troublingly 
245 | similar admixture proportions in a particular cluster (see 
246 | membership in the blue layer in the figure below).
247 | 
248 | \
249 | 
250 | ```{r,echo=FALSE,fig.width=7,fig.height=3}
251 | w <- matrix(data=rnorm(n=40,mean=sample(x=2:10,size=40,replace=TRUE),sd=1),	# 20 samples x 2 layers of random weights
252 | 			nrow=20,ncol=2)
253 | w <- sweep(w,1,rowSums(w),"/")	# rescale each row to sum to 1
254 | w <- cbind(pmax(rnorm(n=20,mean=0.15,sd=0.005),0),w)	# prepend a nearly constant (~0.15) first layer to mimic HMLM
255 | w <- sweep(w,1,rowSums(w),"/")	# renormalize after adding the extra column
256 | conStruct::make.structure.plot(admix.proportions=w)
257 | ```
258 | 
259 | \
260 | 
261 | Users are advised to check the results of their analyses carefully 
262 | for this HMLM behavior. If you encounter this issue, try reducing 
263 | the amount of missing data in your dataset, either by dropping 
264 | poorly genotyped samples or poorly genotyped loci (rows and columns 
265 | of the allele frequency data matrix, respectively).


--------------------------------------------------------------------------------
/vignettes/visualize-results.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "How to visualize the results of a conStruct analysis"
  3 | author: "Gideon Bradburd"
  4 | date: '`r format(Sys.Date(), "%B %d, %Y")`'
  5 | output:
  6 |   rmarkdown::html_vignette:
  7 |     toc: true
  8 | vignette: >
  9 |   %\VignetteIndexEntry{visualize-results}
 10 |   %\VignetteEngine{knitr::rmarkdown}
 11 |   %\VignetteEncoding{UTF-8}
 12 | ---
 13 | ```{r, echo = FALSE}
 14 | knitr::opts_chunk$set(collapse = TRUE, comment = "#>")
 15 | ```
 16 | 
 17 | <!-- library(rmarkdown) ; render("visualize-results.Rmd",html_vignette(toc=TRUE))	-->
 18 | 
 19 | ## Visualize results
 20 | This document describes the use of the functions included in 
 21 | the conStruct package for visualizing analysis outputs.
 22 | For more information on how to run a `conStruct` analysis, 
 23 | see the companion vignette for [running conStruct](run-conStruct.html).
 24 | 
 25 | Throughout, this vignette will make use of the example data output objects 
 26 | generated by a `conStruct` run:
 27 | 
 28 | ```{r}
 29 | library(conStruct)
 30 | data(data.block)
 31 | ```
 32 | 
 33 | ## Make all the plots
 34 | If the `make.figs` argument is set to `TRUE` in a `conStruct` run, 
 35 | the run will finish by calling the function `make.all.the.plots`.
 36 | As the name implies, this function makes all the relevant plots 
 37 | from a set of conStruct results:
 38 | 
 39 | * STRUCTURE plot
 40 | * Admixture pie plot
 41 | * Model fit plot
 42 | * Layer covariance functions plot
 43 | * Trace plots for relevant MCMC quantities including:
 44 | 	* the log posterior density
 45 | 	* nugget parameters
 46 | 	* gamma parameter
 47 | 	* layer-specific parameters
 48 | 	* admixture proportions
 49 | 
 50 | More information is available in the documentation for the function, 
 51 | which you can view using the command:
 52 | 
 53 | ```{r,eval=FALSE}
 54 | help(make.all.the.plots)
 55 | ```
 56 | 
 57 | If you deleted the output plots from an analysis, or if you 
 58 | set `make.figs` to `FALSE` to avoid making them in the first place, 
 59 | you can make them by calling the `make.all.the.plots` function.
 60 | The arguments you have to specify are a `conStruct.results` output 
 61 | object and a `data.block` output object, both of which are automatically 
 62 | generated and saved when you execute a `conStruct` analysis. You must 
 63 | also specify a `prefix`, which will be prepended to all output pdf 
 64 | file names. If you choose, you can specify the colors you want each 
 65 | layer to be plotted in; if none are specified, the function will use 
 66 | its own internal vector of colors, which I think look nice but are 
 67 | otherwise arbitrary.
 68 | 
 69 | An example call to `make.all.the.plots` using the example output 
 70 | data objects loaded above is shown below.
 71 | 
 72 | ```{r,eval=FALSE}
 73 | make.all.the.plots(conStruct.results = conStruct.results,
 74 | 					data.block = data.block,
 75 | 					prefix = "example",
 76 | 					layer.colors = NULL)
 77 | # generates a bunch of pdf figures
 78 | ```
 79 | 
 80 | ## Visualizing estimated admixture proportions
 81 | 
 82 | Generally, users are most interested in the estimated admixture 
 83 | proportions for each sample. These are commonly visualized using 
 84 | STRUCTURE plots and pie plots. Functions for both are included in 
 85 | the package, and their use is detailed below.
 86 | 
 87 | ### STRUCTURE plots
 88 | 
 89 | Probably the most common method for visualizing admixture proportions 
 90 | is using a stacked bar plot (commonly called a STRUCTURE plot after 
 91 | the model-based clustering method `STRUCTURE`).
 92 | 
 93 | Users can generate a STRUCTURE plot for their data using the command 
 94 | `make.structure.plot`, (see documentation at `help(make.structure.plot)`). 
 95 | This function takes as its principal argument the estimated admixture 
 96 | proportions and makes a STRUCTURE plot in the plotting window. An 
 97 | example is given below.
 98 | 
 99 | ```{r,echo=FALSE}
100 | admix.props <- matrix(
101 | 				c(0.086, 0.000, 0.500, 0.505, 0.099, 0.052, 0.024, 0.007, 0.800, 0.000, 0.216, 0.744, 0.917,
102 | 				0.199, 0.469, 0.000, 0.783, 0.298, 0.329, 0.446, 0.000, 0.000, 0.637, 0.903, 0.000, 0.000,
103 | 				0.000, 0.012, 0.021, 0.000, 0.000, 0.089, 0.000, 0.554, 0.002, 0.000, 0.000, 0.095, 0.020,
104 | 				0.001, 0.001, 0.011, 0.000, 0.200, 0.000, 0.060, 0.053, 0.082, 0.036, 0.013, 0.000, 0.062,
105 | 				0.169, 0.137, 0.029, 0.001, 0.000, 0.178, 0.079, 0.000, 0.999, 1.000, 0.988, 0.979, 0.975,
106 | 				1.000, 0.744, 0.984, 0.435, 0.998, 0.914, 1.000, 0.405, 0.475, 0.900, 0.947, 0.965, 0.993,
107 | 				0.000, 1.000, 0.725, 0.203, 0.000, 0.765, 0.518, 1.000, 0.154, 0.533, 0.534, 0.525, 0.999,
108 | 				1.000, 0.185, 0.018, 1.000, 0.001, 0.000, 0.000, 0.000, 0.025, 0.000, 0.167, 0.016, 0.012,
109 | 				0.000),nrow=35,ncol=3)
110 | ```
111 | 
112 | First, we load the `conStruct.results` data output object 
113 | and, for convenience, assign the _maximum a posteriori_ 
114 | admixture parameter estimates to a variable with a 
115 | shorter name:
116 | 
117 | ```{r,eval=FALSE}
118 | load("my_conStruct.results.Robj")
119 | 
120 | # assign the MAP admixture proportions from 
121 | #	the first MCMC chain to a variable 
122 | #	with a new name
123 | 
124 | admix.props <- conStruct.results$chain_1$MAP$admix.proportions
125 | ```
126 | 
127 | Now we can visualize the results:
128 | 
129 | ```{r, fig.width=8,fig.height=4}
130 | # make a STRUCTURE plot using the 
131 | #	maximum a posteriori (MAP) estimates
132 | #	from the first chain of a conStruct run
133 | 
134 | make.structure.plot(admix.proportions = admix.props)
135 | 
136 | ```
137 | 
138 | #### Order STRUCTURE plots
139 | 
140 | The function also includes a variety of options for tweaking the order of the 
141 | plotted samples.
142 | 
143 | ```{r, fig.width=8,fig.height=4}
144 | 
145 | # order by membership in layer 1
146 | make.structure.plot(admix.proportions = admix.props,
147 | 					sort.by = 1)
148 | 
149 | # re-order the stacking order of the layers
150 | make.structure.plot(admix.proportions = admix.props,
151 | 					layer.order = c(2,1,3),
152 | 					sort.by = 2)
153 | 
154 | # provide a custom sample ordering
155 | #	in this case by sample latitude
156 | make.structure.plot(admix.proportions = admix.props,
157 | 					sample.order = order(data.block$coords[,2]))
158 | 
159 | # add sample names
160 | make.structure.plot(admix.proportions = admix.props,
161 | 					sample.names = row.names(data.block$coords),
162 | 					mar = c(4.5,4,2,2))
163 | ```
164 | 
165 | 
166 | ### ADMIXTURE pie plots
167 | 
168 | It is often also useful to visualize estimated admixture 
169 | proportions in a spatial context by plotting them on a 
170 | map. The most common way to do this is to plot a pie plot 
171 | at the sampling location of each sample, in which each 
172 | modeled layer gets its own slice of the pie (`K` wedges), 
173 | and the size of each slice in the pie is proportional to the 
174 | sample's admixture proportion in that layer.
175 | 
176 | Users can make an admixture pie plot with their own data 
177 | using the command `make.admix.pie.plot` (see documentation 
178 | at `help(make.admix.pie.plot)`). This function takes as its 
179 | principal arguments the estimated admixture proportions and 
180 | the sample coordinates, then makes an admixture pie plot in 
181 | the plotting window. An example is given below:
182 | 
183 | ```{r,fig.width=6,fig.height=6}
184 | # make an admix pie plot using the 
185 | #	maximum a posteriori (MAP) estimates
186 | #	from the first chain of a conStruct run
187 | 	make.admix.pie.plot(admix.proportions = admix.props,
188 | 						coords = data.block$coords)
189 | 
190 | # increase pie chart size
191 | 	make.admix.pie.plot(admix.proportions = admix.props,
192 | 						coords = data.block$coords,
193 | 						radii = 4)
194 | 						
195 | # zoom in on a subsection of the map
196 | 	make.admix.pie.plot(admix.proportions = admix.props,
197 | 						coords = data.block$coords,
198 | 						x.lim = c(-130,-120),
199 | 						y.lim = c(49,56))
200 | ```
201 | 
202 | #### Pie plot on a map
203 | 
204 | Users can also add the pie plot directly to a map of their own 
205 | creation using the `make.admix.pie.plot` by setting the `add` 
206 | argument to `TRUE`. E.g., 
207 | 
208 | ```{r,fig.width=6,fig.height=6}
209 | 
210 | # add pie plot to an existing map
211 | 
212 | # make the desired map
213 | 	maps::map(xlim = range(data.block$coords[,1]) + c(-5,5), ylim = range(data.block$coords[,2])+c(-2,2), col="gray")
214 | 
215 | # add the admixture pie plot
216 | 	make.admix.pie.plot(admix.proportions = admix.props,
217 | 						coords = data.block$coords,
218 | 						add = TRUE)
219 | ```
220 | 
221 | ## Comparing two conStruct runs
222 | 
223 | If you've run multiple `conStruct` analyses you may want to 
224 | visually compare them. Although you could always just open up 
225 | both sets of output pdfs, label-switching between independent 
226 | runs can make visual comparisons difficult. Label-switching occurs when 
227 | different models have the same, or very similar, estimated 
228 | admixture proportions, but with a different permutation of 
229 | layer labels (e.g., Layer 1 in run 1, and Layer 3 in run 2).
230 | To enable easy comparison between a pair of `conStruct` runs, 
231 | you can use the function `compare.two.runs`.
232 | 
233 | To do so, you need to specify two sets of `conStruct.results` output R 
234 | objects, as well as the `data.block` objects associated with each run. 
235 | Independent runs with the same model can be compared, as can analyses 
236 | run with different models (e.g., spatial vs. nonspatial) or 
237 | different values of `K`. The only restriction is that if the user is 
238 | comparing two models run with different values of `K`, the run with 
239 | the smaller value should be specified first (`conStruct.results1`).
240 | Documentation for `compare.two.runs` can be found using the command 
241 | `help(compare.two.runs)`. Example usage is shown below:
242 | 
243 | ```{r, eval=FALSE}
244 | # load output files from a run with 
245 | #	the spatial model and K=4 (run with prefix = "spK4")
246 | load("spK4_conStruct.results.Robj")
247 | load("spK4_data.block.Robj")
248 | 
249 | # copy to run-specific names, because the next load() overwrites them
250 | spK4_cr <- conStruct.results
251 | spK4_db <- data.block
252 | 
253 | # load output files from a run with 
254 | #	the spatial model and K=3 (run with prefix = "spK3")
255 | load("spK3_conStruct.results.Robj")
256 | load("spK3_data.block.Robj")
257 | 
258 | # copy to run-specific names
259 | spK3_cr <- conStruct.results
260 | spK3_db <- data.block
261 | 
262 | # compare the two runs, passing the run with the smaller K (here K=3) first
263 | compare.two.runs(conStruct.results1=spK3_cr,
264 | 				 data.block1=spK3_db,
265 | 				 conStruct.results2=spK4_cr,
266 | 				 data.block2=spK4_db,
267 | 				 prefix="spK3_vs_spK4")
268 | 
269 | # generates a bunch of pdf figures
270 | ```


--------------------------------------------------------------------------------