├── ExampleColourList.txt
├── CVErrorBoxplotPlotter.R
├── CompileData.sh
├── AdmixturePlotter.R
└── README.md


/ExampleColourList.txt:
--------------------------------------------------------------------------------
 1 | "#FF994E"
 2 | "#0099E5"
 3 | "#E4FE02"
 4 | "#FF9AE6"
 5 | "#339933"
 6 | "#850184"
 7 | "#FF004C"
 8 | "#00FF02"
 9 | "#0001FE"
10 | "#FF00FF"
11 | "#FFE698"
12 | "#B24D00"
13 | "#01FFFF"
14 | "#807E05"
15 | "#FF9899"
16 | "#008183"
17 | "#98BF25"
18 | "#7326E6"
19 | "#28BF99"
20 | "#808080"


--------------------------------------------------------------------------------
/CVErrorBoxplotPlotter.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | library(stringr)
 3 | 
 4 | args <- commandArgs(TRUE)
 5 | if (is.na(args[1]) || is.na(args[2])) { ## Both positional arguments (input, output) are required
 6 |   print("Usage: Rscript CVErrorBoxplotPlotter.R input output")
 7 |   quit(status=1)
 8 | }
 9 | input <- args[1]
10 | output <- paste0(sub(x=args[2], replacement="", pattern="\\.png$"),".png") ## Strip only a literal trailing '.png' (if given), then append it exactly once
11 | 
12 | ## Read the CV error table: one column per K (named in the header), one row per replicate
13 | CVs <- read.table(input, header=TRUE)
14 | ## read.table() prefixes numeric column names with 'X'; strip it to recover the K labels
15 | k_labels <- str_remove(names(CVs), "^X")
16 | ## box plot for CV errors of all replicates
17 | png(output, height=20, width=1.5*ncol(CVs), res=300, units="cm")
18 | par(cex.main=1.2, cex.axis=1, cex.lab=1)
19 | par(mar=c(5.1,4.6,4.1,2.1))
20 | boxplot(CVs, xlab="K", ylab="CV error", xaxt="n", main="")
21 | # mtext(3, text=input, line=2.2, cex=1.3, font=2)
22 | mtext(3, text="CV error for all replicates", line=1.5, cex=1.3, font=2)
23 | axis(side=1, at=seq_len(ncol(CVs)), labels=k_labels) ## One tick per column, labelled with that column's own K (also works for non-contiguous Ks)
24 | dev.off()


--------------------------------------------------------------------------------
/CompileData.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | ###################################################################
 4 | ## Compiling ADMIXTURE run CV errors and Q matrices for plotting ##
 5 | ###################################################################
 6 | 
 7 | fn0="/PATH/TO/ADMIXTURE/OUTPUT" # This should be a directory with one folder per K (which itself contains one folder per replicate and one Logs folder with the slurm logs).
 8 | cd "${fn0}" || exit 1 # Abort if the directory is missing; everything below uses paths relative to it.
 9 | mkdir -p "${fn0}/Plotting"
10 | 
11 | 
12 | ## Set ADMIXTURE IndFile (Eigenstrat), ADMIXTURE input .bed file and minimum and maximum K values.
13 | IndFile="/PATH/TO/YOUR/PLINK/DATA/Data.ind" # An Eigenstrat .ind version of the data you converted to plink
14 | bedFile="/PATH/TO/YOUR/PLINK/DATA/Data.pruned.bed" # The input .bed file you ran ADMIXTURE on.
15 | Kmin=2 #The minimum number of Ks you ran
16 | Kmax=17 #The maximum number of Ks you ran
17 | 
18 | 
19 | ## Compile CV Errors
20 | : >CVErrors.txt # Start from an empty file so a leftover from an aborted run cannot contribute stale columns.
21 | for i in $(seq "${Kmax}" -1 "${Kmin}"); do #seq doesn't like backwards counting on the clusters, so giving the increment of "-1" fixes the problem.
22 |   (echo "$i"; grep CV "$i"/Logs/* | cut -f 4 -d " ") | paste -d " " - CVErrors.txt > temp_CVErrors # Prepend this K as a new first column.
23 |   mv temp_CVErrors CVErrors.txt
24 | done
25 | sed 's/ $//' CVErrors.txt >temp_CVErrors # Drop the trailing space left by pasting onto the initially empty file.
26 | mv temp_CVErrors CVErrors.txt
27 | mv CVErrors.txt Plotting/CVErrors.txt
28 | 
29 | 
30 | ## Compile list of replicates with highest Likelihood
31 | for i in $(seq "${Kmin}" "${Kmax}"); do grep -H ^Logli "$i"/Logs/*.log | sort -nrk2 | head -n1; done >Plotting/best_runs.txt
32 | 
33 | 
34 | ## Compile Q matrices and add ind/pop labels
35 | runs=() # One Q-matrix path per K, kept in an array so that paths containing spaces survive.
36 | for i in $(seq "${Kmin}" "${Kmax}"); do
37 |   X=$(grep "K${i}_" Plotting/best_runs.txt | cut -d "K" -f2 | cut -d ":" -f1 )
38 |   K=$(echo "${X}" | cut -f1 -d "_"); Rep=$(echo "${X}" | cut -d "_" -f 2)
39 |   runs+=("$K/${Rep%.log}/$(basename "${bedFile}" .bed).$K.Q")
40 | done
41 | paste -d " " "${runs[@]}" >Plotting/temp_data.txt
42 | 
43 | 
44 | ## Create compiled Q matrix header ("K:component" for every component of every K).
45 | header=""
46 | for i in $(seq "${Kmin}" "${Kmax}"); do
47 |   for x in $(seq 1 "${i}"); do
48 |     header+="${i}:${x} "
49 |   done
50 | done
51 | 
52 | 
53 | ## Write the header without its trailing space.
54 | ## A single '>' write also discards any stale header left behind by an aborted run.
55 | echo "${header% }" >Plotting/temp_header.txt
56 | 
57 | 
58 | ## Create individual and Pop list
59 | echo "Ind Pop" >Plotting/temp_pop_labels.txt; awk '{print $1,$3}' "$IndFile" >>Plotting/temp_pop_labels.txt
60 | 
61 | 
62 | ## Put together header, data and Pop labels to create compiled data table
63 | cd "${fn0}/Plotting" || exit 1 # Never run the 'rm temp_*' below from the wrong directory.
64 | cat temp_header.txt temp_data.txt >temp_compound.data.txt
65 | paste -d " " temp_pop_labels.txt temp_compound.data.txt >compound.labelled.QperK.txt
66 | 
67 | ## Clean up
68 | rm temp_*
69 | 


--------------------------------------------------------------------------------
/AdmixturePlotter.R:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env Rscript
  2 | 
  3 | ## Define functions ----------------------------
  4 | 
  5 | ## Correlation matrix between the components of a K run and those of its K-1 run.
  6 | correlate_components <- function(k, k_min) {
  7 |   ## Columns 1-2 of raw_data are Ind/Pop; every K from k_min to k-1 adds K columns before this K.
  8 |   first_cur <- 3 + sum(seq_len(k - 1)) - sum(seq_len(k_min - 1))
  9 |   last_cur <- first_cur + k - 1
 10 |   last_prev <- first_cur - 1
 11 |   first_prev <- last_prev - (k - 2)
 12 |   ## Rows of the result are the K-1 components, columns are the components of this K.
 13 |   cor(raw_data[first_prev:last_prev], raw_data[first_cur:last_cur])
 14 | }
 15 | 
 16 | ## For each component of the K-1 run (in order), return the column name of its 
 17 | ##   best correlated component in the K run; any unmatched K component comes last.
 18 | fix_colours <- function(k, k_min) {
 19 |   ## If the K being processed is the minimum K in the data, keep columns 
 20 |   ##  unchanged (since there is nothing to compare to).
 21 |   if (k == k_min) {
 22 |     return(paste0(k, ":", seq_len(k)))
 23 |   }
 24 |   
 25 |   ## Correlation matrix: rows are the components of K-1, columns are the 
 26 |   ##   components of this K.
 27 |   cor_mat <- correlate_components(k, k_min)
 28 |   
 29 |   ## For each component of K-1 (row), find the most correlated component of 
 30 |   ##   this K (column).
 31 |   component_order <- c()
 32 |   for (x in seq_len(k - 1)) { ## Explicit 1..k-1: '1:k-1' parses as (1:k)-1 and would start at 0
 33 |     component_order <- append(component_order, which.max(cor_mat[x, ]))
 34 |   }
 35 |   
 36 |   ## If one component in the last K is the most correlated with two components 
 37 |   ##   on this K, find the next best correlated component, and if that is 
 38 |   ##   unique, assign that as the correct component.
 39 |   ## If it is not unique, repeat the process until a unique component is found.
 40 |   if (any(duplicated(component_order)) && 
 41 |       sum(duplicated(component_order)) == 1) {
 42 |     duplicate <- which(duplicated(component_order))
 43 |     condition <- TRUE
 44 |     top_correlates <- c()
 45 |     while (condition) {
 46 |       ## Mask every component already tried for the duplicated row, then take 
 47 |       ##   the next best one, until it no longer clashes with another row.
 48 |       top_correlates <- c(top_correlates, which.max(cor_mat[duplicate, ]))
 49 |       cor_mat[duplicate, top_correlates] <- NA
 50 |       component_order[duplicate] <- which.max(cor_mat[duplicate, ])
 51 |       if (!any(duplicated(component_order))) {condition <- FALSE}
 52 |     }
 53 |   } else if (sum(duplicated(component_order)) > 1) {
 54 |     stop (paste0("Correlation of components failed. Usually this is caused by high CV errors for some of the components you are trying to plot. 
 55 | Please consider limiting your input dataset to K=",k_min," to ",k-1,".
 56 | 
 57 | You can use this command to extract the suggested columns from the input file:
 58 |     cut -d ' ' -f 1-",sum(seq(k_min,k-1))+2,"
 59 |     "), call.=FALSE)
 60 |   }
 61 |   ## If a component hasn't been resolved yet, add it as the newest component.
 62 |   missing_component <- setdiff(seq_len(k), component_order)
 63 |   component_order <- append(component_order, missing_component)
 64 |   return(paste0(k,":", component_order))
 65 | }
 66 | 
 67 | pick_colour <- function(x) { ## Map a component index to its colour.
 68 |   colours[x] ## 'colours' is the colour vector built by the main script before this is called.
 69 | }
 70 | #### MAIN ####
 71 | 
 72 | ## Load libraries -----------------------------
 73 | library(optparse)
 74 | library(ggplot2)
 75 | library(dplyr, warn.conflicts = F)
 76 | library(tidyr)
 77 | library(stringr)
 78 | library(readr)
 79 | 
 80 | ## Parse arguments ----------------------------
 81 | parser <- OptionParser()
 82 | parser <- add_option(parser, c("-i", "--input"), type = 'character', 
 83 |                      action = "store", dest = "input", 
 84 |                      help = "The input data file. This file should contain all 
 85 |                      components per K per indiviual for all K values.")
 86 | parser <- add_option(parser, c("-c", "--colourList"), type = 'character',
 87 |                      action = "store", dest = "colourList", 
 88 |                      help = "A file of desired colours, in R compatible formats.
 89 |                      One colour per line.")
 90 | parser <- add_option(parser, c("-p", "--popOrder"), type = "character",
 91 |                      action = 'store', dest = 'popOrder', 
 92 |                      help = "A file containing one population per line in the 
 93 |                      desired order.")
 94 | parser <- add_option(parser, c("-o", "--outputPlot"), type = "character", 
 95 |                      action = 'store', default = "OutputPlot", dest = 'output', 
 96 |                      help = "The desired name of the output plot. 
 97 |                      [Default: '%default.pdf']")
 98 | parser <- add_option(parser, c("-r", "--remove"), type = "logical", 
 99 |                      action = 'store_true', default = FALSE, dest = 'remove', 
100 |                      help = "If an order list is provided, should populations not 
101 |                      in the list be removed from the output plot?
102 |                      Not recommended for final figures, but can help in cases 
103 |                      where you are trying to focus on a certain subset of your 
104 |                      populations.")
105 | 
106 | args <- parse_args(parser)
107 | ## If no input is given, script will exit and provide Usage information.
108 | if (is.null(args$input)) {
109 |   write("No input file given. Halting execution.", stderr())
110 |   print_help(parser)
111 |   quit(status = 1)
112 | }
113 | 
114 | ## Read cli options into variables.
115 | input <- args$input
116 | colour_file <- args$colourList
117 | ## Strip only a literal trailing '.pdf' (escaped dot, anchored) from the output name; it is re-added when saving.
118 | output <- sub(x = args$output, replacement = "", pattern = "\\.pdf$") 
119 | pop_order <- args$popOrder
120 | if (args$remove && is.null(args$popOrder)){
121 |   write("No population order specified. 'remove' option ignored.", stderr())
122 | }
123 | 
124 | ## Load data --------------------------------
125 | 
126 | ## Read the space-delimited input table (col_types = cols() lets readr guess the column types).
127 | raw_data <- read_delim(input, " ", col_types = cols())
128 | 
129 | ## Infer min and max K values from the 'K:component' names of the first (3rd) and last data columns.
130 | k_min <- as.numeric(str_split_fixed(names(raw_data[,3]),":",2)[1])
131 | k_max <- as.numeric(str_split_fixed(names(raw_data[,ncol(raw_data)]),":",2)[1])
132 | 
133 | ## Sort components of each K according to correlation with components of K-1. 
134 | ##   This needs to happen per K, otherwise the correlations will not match 
135 | ##   beyond the first pair of Ks.
136 | header <- names(raw_data) ## Take column names from original data
137 | 
138 | ## 'Ind' and 'Pop' should always be at the start of the reformatted data.
139 | refcols = c("Ind", "Pop") 
140 | 
141 | ## For each K in the data, use fix_colours to extract the vector of most correlated
142 | ##   column names for each Component in the K. Then sort the components of this 
143 | ##   K in the raw data.
144 | for (k in k_min:k_max) { 
145 |   refcols <- c(refcols, fix_colours(k,k_min)) ## Append this K's column names in their matched order
146 |   raw_data <- raw_data[, c(refcols, setdiff(names(raw_data), refcols))] ## Matched columns first, not-yet-processed Ks after them
147 | }
148 | 
149 | ## Finally, fix the column names so that inference of component numbers is correct (reordered columns are relabelled by position)
150 | names(raw_data) <- header 
151 | 
152 | ## Flatten data to long format: one row per individual per 'K:component' column
153 | long_data <- gather(raw_data, temp, value, 3:ncol(raw_data))
154 | ## Remove raw_data from memory to reduce memory footprint.
155 | rm(raw_data)
156 | ## Split K and Component name to separate numeric columns
157 | long_data <- long_data %>% 
158 |   separate(temp, c("K","Component"), sep = ":") %>%
159 |   mutate(K = as.numeric(K), Component = as.numeric(Component))
160 | 
161 | ## If no colour list is provided, use rainbow() to generate the required number 
162 | ##   of colours. Otherwise, read the colour definitions into a vector.
163 | if (is.null(colour_file)) {
164 |   colours <- rainbow(k_max)
165 | } else {
166 |   colours <- read_delim(colour_file, "\n", col_types = cols(), col_names = FALSE)
167 |   colours <- colours$X1
168 | }
169 | 
170 | ## Create colour column based on colour vector.
171 | ## Each component in each K run is given the colour of the same index as that 
172 | ##   component from the colours list.
173 | long_data <- long_data %>% 
174 |   mutate(clr = purrr::map(Component, pick_colour) %>% 
175 |            unlist)
176 | 
177 | ## Set order of Pops
178 | ## If no OrderList is provided, then the populations are sorted alphabetically
179 | if (!is.null(pop_order)) {
180 |   order <- read.delim(pop_order, header = FALSE, col.names = "Pops")
181 |   long_data$Pop_f <- factor(long_data$Pop, levels = order$Pops)
182 | } else {
183 |   long_data$Pop_f <- long_data$Pop
184 | }
185 | 
186 | ## Early testing dataset subset
187 | # temp_data <- filter(long_data, K == 2)
188 | 
189 | ## Create the named vector (dictionary) of colours needed for scale_fill_manual.
190 | ## Each colour is mapped to itself; one entry per distinct colour is enough.
191 | col <- unique(as.character(long_data$clr))
192 | names(col) <- col
193 | ## Only drop populations missing from the order list when -r/--remove was given (the flag defaults to FALSE, never NULL).
194 | if (args$remove && !is.null(pop_order)) {
195 |   long_data <- drop_na(long_data, 'Pop_f')
196 | }
197 | 
198 | ## Plot data --------------------------------------------
199 | 
200 | ## Plot the value of each component(y) per individual(x). 'clr' is also the 
201 | ##   categorical variable, which is ok since each category will be seen once per K.
202 | ggplot(long_data, aes(x = Ind , y = value, fill = clr)) +
203 |           geom_bar(stat = 'identity', width = 1) +
204 |   ## Colour bars by colour vector.
205 |   scale_fill_manual(values = col) +
206 |   ## X scale changed per Pop. 0 multiplicative change, and +1 additive.
207 |   ## Creates the white bars between groups.
208 |   scale_x_discrete(expand = c(0, 1)) +
209 |   ## Set Y axis label
210 |   ylab("K = ") +
211 |   theme_minimal() +
212 |   theme(legend.position = "none", ## No legend
213 |         text = element_text(family = "Helvetica"),
214 |         # axis.text.x = element_text(angle = 90, hjust = 1, size = 6),
215 |         ## Rotate and resize X axis strip text (Pop Names)
216 |         strip.text.x = element_text(angle = 90, hjust = 0, size = 4),
217 |         ## Rotate Y axis strip text (K value)
218 |         strip.text.y = element_text(angle = 180),
219 |         panel.spacing.x = unit(0, "lines"),
220 |         ## Set white space between K plots.
221 |         panel.spacing.y = unit(0.005, "lines"), 
222 |         ## Remove axis ticks, and axis text (ancestry proportion and sample names)
223 |         axis.ticks = element_blank(), 
224 |         axis.text.y = element_blank(),
225 |         axis.text.x = element_blank(),
226 |         ## Remove X axis title ("Ind") 
227 |         axis.title.x = element_blank(), 
228 |         ## Remove gridlines from plot
229 |         panel.grid = element_blank()) +
230 |   ## Facet the plot made so far by K (rows) and by Pop (columns).
231 |   ## The per-Pop panels are placed side by side to form each K row. 
232 |   ## The per-K rows are stacked below one another.
233 |   facet_grid(K~ Pop_f,
234 |              scales = "free_x",
235 |              space = "free",
236 |              ## Switch the Y-axis strip labels so they are plotted on the left (K=).
237 |              switch = "y")
238 | ## Saves the plot as a pdf with specified size (no 'plot' argument is passed, so ggsave() uses its default plot).
239 | ggsave(filename = paste0(output,".pdf"), 
240 |         limitsize = F,
241 |         width = 50, height = 20,
242 |         units = "cm")
243 | 
244 | ## Silently remove the Rplots.pdf file, if one was created.
245 | if (file.exists("Rplots.pdf") && output !=  "Rplots") {
246 |     invisible(file.remove("Rplots.pdf"))
247 | }
248 | 
249 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # AdmixturePlotter
  2 | A set of scripts to generate plots for ADMIXTURE runs, for multiple K values. 
  3 | 
  4 | # Dependencies
  5 |  * R (tested on version 3.5.2 "Eggshell Igloo")
  6 |    * optparse
  7 |    * ggplot2
  8 |    * dplyr
  9 |    * tidyr
 10 |    * stringr
 11 |    * readr
 12 | 
 13 | 
 14 | # CompileData.sh
 15 | This is an example script for compiling the data from multiple K runs and replicates into the input formats for `CVErrorBoxplotPlotter.R` and `AdmixturePlotter.R`.
 16 | 
 17 | `CompileData.sh` assumes a folder structure of `OUTPUT_FOLDER/K_Value/Replicate_Number/Result.Q` with a `Logs` folder within each `K_Value` folder, which contains the logfiles of all replicates from the admixture runs of that K. In turn these logfiles should be named `K<K_Value>_<Replicate_Number>.log` (e.g. `K2_1.log`). Given that structure, one should copy and edit the script to include their own paths to their bed format data, the Eigenstrat individual file of the dataset, and the range of K values admixture was run for. The script can then be run to produce the correct format of data.
 18 | 
 19 | Example directory structure within output folder, for a run of 5 replicates with K=2-4:
 20 | ```bash
 21 | /PATH/TO/MY/ADMIXTURE/OUTPUT/ $ ls -l *
 22 | 2:	## The K value for the admixture runs
 23 | total 0
 24 | drwxrwsr-x 2 user group 4.0K Sep  1  2018 1/	## Output for K=2 run replicate 1
 25 | drwxrwsr-x 2 user group 4.0K Sep  1  2018 2/	## Output for K=2 run replicate 2
 26 | drwxrwsr-x 2 user group 4.0K Sep  1  2018 3/	## Output for K=2 run replicate 3
 27 | drwxrwsr-x 2 user group 4.0K Sep  1  2018 4/	## Output for K=2 run replicate 4
 28 | drwxrwsr-x 2 user group 4.0K Sep  1  2018 5/	## Output for K=2 run replicate 5
 29 | drwxrwsr-x 2 user group 4.0K Sep  1  2018 Logs/	## The logfiles from all replicates with this K value go in here.
 30 | 
 31 | 3:	## The K value for the admixture runs
 32 | total 0
 33 | drwxrwsr-x 2 user group 4.0K Sep  1  2018 1/	## Output for K=3 run replicate 1
 34 | drwxrwsr-x 2 user group 4.0K Sep  1  2018 2/	## Output for K=3 run replicate 2
 35 | drwxrwsr-x 2 user group 4.0K Sep  1  2018 3/	## Output for K=3 run replicate 3
 36 | drwxrwsr-x 2 user group 4.0K Sep  1  2018 4/	## Output for K=3 run replicate 4
 37 | drwxrwsr-x 2 user group 4.0K Sep  1  2018 5/	## Output for K=3 run replicate 5
 38 | drwxrwsr-x 2 user group 4.0K Sep  1  2018 Logs/	## The logfiles from all replicates with this K value go in here.
 39 | 
 40 | 4:	## The K value for the admixture runs
 41 | total 0
 42 | drwxrwsr-x 2 user group 4.0K Sep  1  2018 1/	## Output for K=4 run replicate 1
 43 | drwxrwsr-x 2 user group 4.0K Sep  1  2018 2/	## Output for K=4 run replicate 2
 44 | drwxrwsr-x 2 user group 4.0K Sep  1  2018 3/	## Output for K=4 run replicate 3
 45 | drwxrwsr-x 2 user group 4.0K Sep  1  2018 4/	## Output for K=4 run replicate 4
 46 | drwxrwsr-x 2 user group 4.0K Sep  1  2018 5/	## Output for K=4 run replicate 5
 47 | drwxrwsr-x 2 user group 4.0K Sep  1  2018 Logs/	## The logfiles from all replicates with this K value go in here.
 48 | ```
 49 | 
 50 | And within each K result folder:
 51 | ```bash
 52 | /PATH/TO/MY/ADMIXTURE/OUTPUT/ $ cd 2
 53 | 
 54 | /PATH/TO/MY/ADMIXTURE/OUTPUT/2 $ ls -l *
 55 | ## These are the contents of each subfolder of the K2 runs.
 56 | 1:	## Output for K=2 run replicate 1
 57 | total 7.7M
 58 | -rw-rw-r-- 1 user group 3.8M Sep  1  2018 Admixture.Output.2.P
 59 | -rw-rw-r-- 1 user group  17K Sep  1  2018 Admixture.Output.2.Q
 60 | -rw-rw-r-- 1 user group  18K Sep  1  2018 Admixture.Output.2.Q_bias
 61 | -rw-rw-r-- 1 user group  17K Sep  1  2018 Admixture.Output.2.Q_se
 62 | 
 63 | 2:	## Output for K=2 run replicate 2
 64 | total 7.7M
 65 | -rw-rw-r-- 1 user group 3.8M Sep  1  2018 Admixture.Output.2.P
 66 | -rw-rw-r-- 1 user group  17K Sep  1  2018 Admixture.Output.2.Q
 67 | -rw-rw-r-- 1 user group  18K Sep  1  2018 Admixture.Output.2.Q_bias
 68 | -rw-rw-r-- 1 user group  17K Sep  1  2018 Admixture.Output.2.Q_se
 69 | 
 70 | 3:	## Output for K=2 run replicate 3
 71 | total 7.7M
 72 | -rw-rw-r-- 1 user group 3.8M Sep  1  2018 Admixture.Output.2.P
 73 | -rw-rw-r-- 1 user group  17K Sep  1  2018 Admixture.Output.2.Q
 74 | -rw-rw-r-- 1 user group  18K Sep  1  2018 Admixture.Output.2.Q_bias
 75 | -rw-rw-r-- 1 user group  17K Sep  1  2018 Admixture.Output.2.Q_se
 76 | 
 77 | 4:	## Output for K=2 run replicate 4
 78 | total 7.7M
 79 | -rw-rw-r-- 1 user group 3.8M Sep  1  2018 Admixture.Output.2.P
 80 | -rw-rw-r-- 1 user group  17K Sep  1  2018 Admixture.Output.2.Q
 81 | -rw-rw-r-- 1 user group  18K Sep  1  2018 Admixture.Output.2.Q_bias
 82 | -rw-rw-r-- 1 user group  17K Sep  1  2018 Admixture.Output.2.Q_se
 83 | 
 84 | 5:	## Output for K=2 run replicate 5
 85 | total 7.7M
 86 | -rw-rw-r-- 1 user group 3.8M Sep  1  2018 Admixture.Output.2.P
 87 | -rw-rw-r-- 1 user group  17K Sep  1  2018 Admixture.Output.2.Q
 88 | -rw-rw-r-- 1 user group  18K Sep  1  2018 Admixture.Output.2.Q_bias
 89 | -rw-rw-r-- 1 user group  17K Sep  1  2018 Admixture.Output.2.Q_se
 90 | 
 91 | Logs:	## The logfiles from all replicates with this K value go in here. 
 92 | total 320K
 93 | -rw-rw-r-- 1 user group 30K Sep  1  2018 K2_1.log	## Logfile of K=2 run, replicate 1.
 94 | -rw-rw-r-- 1 user group 30K Sep  1  2018 K2_2.log	## Logfile of K=2 run, replicate 2.
 95 | -rw-rw-r-- 1 user group 30K Sep  1  2018 K2_3.log	## Logfile of K=2 run, replicate 3.
 96 | -rw-rw-r-- 1 user group 30K Sep  1  2018 K2_4.log	## Logfile of K=2 run, replicate 4.
 97 | -rw-rw-r-- 1 user group 30K Sep  1  2018 K2_5.log	## Logfile of K=2 run, replicate 5.
 98 | ```
 99 | 
100 | If your folder structure does not follow this system, looking into the code is a good starting point for the commands that can be used to create the desired data.
101 | 
102 | # CVErrorBoxplotPlotter.R
103 | This is a script to plot the CV error for multiple replicates per K value in box-and-whisker format. The expected input is a space- or 
104 | tab-separated table with each column being a K value, and each row a replicate. A header is expected of the K value the replicates in 
105 | the column correspond to.
106 | 
107 | Example input for K=2 to 5 (space separated):
108 | ```
109 | 2 3 4 5
110 | 0.49993 0.47984 0.47426 0.47029
111 | 0.49993 0.47985 0.47427 0.47028
112 | 0.49992 0.47985 0.47430 0.47032
113 | 0.49993 0.47984 0.47424 0.47033
114 | 0.49994 0.47984 0.47428 0.47033
115 | ```
116 | 
117 | `CVErrorBoxplotPlotter.R` can then be run by specifying the input and output file names.
118 | ```bash
119 | CVErrorBoxplotPlotter.R CVErrors.input.txt CVErrorBoxPlot
120 | ```
121 | This will create a figure named **CVErrorBoxPlot.png**.
122 | 
123 | Running the plotter without any arguments will print a short usage message:
124 | ```
125 | [1] "Usage: Rscript CVErrorBoxplotPlotter.R input output"
126 | ```
127 | 
128 | # AdmixturePlotter.R
129 | This is a script to plot ADMIXTURE output for multiple K values. It uses a correlation matrix between different components across
130 | sequential K values to (attempt to) correctly assign consistent colours across Ks. The expected input file is a space separated compound 
131 | dataset of all the results per component per K, with labelling of individual and population name. Once again, a header is expected. Each 
132 | line corresponds to one individual. The first two columns correspond to the Individual ID and population respectively. The rest of the 
133 | columns correspond to components within each ADMIXTURE run to be plotted, with `2:1` corresponding to component 1 of the K=2 run, `2:2` the second component of that run, `3:1` the first component of the K=3 ADMIXTURE run, etc. 
134 | 
135 | Example input for K=2 to 5, for 5 individuals:
136 | ```
137 | Ind Pop 2:1 2:2 3:1 3:2 3:3 4:1 4:2 4:3 4:4 5:1 5:2 5:3 5:4 5:5
138 | Ind1 Pop1 0.942951 0.057049 0.012524 0.987466 0.000010 0.000010 0.315992 0.683988 0.000010 0.118408 0.881562 0.000010 0.000010 0.000010
139 | Ind2 Pop2 0.914518 0.085482 0.125482 0.006548 0.867970 0.864029 0.000010 0.014720 0.121241 0.000010 0.012874 0.864779 0.000436 0.121901
140 | Ind3 Pop2 0.927737 0.072263 0.107645 0.019653 0.872702 0.867123 0.000010 0.029861 0.103005 0.000010 0.020055 0.867397 0.009572 0.102967
141 | Ind4 Pop2 0.929991 0.070009 0.103765 0.011336 0.884900 0.880428 0.000010 0.019200 0.100363 0.000010 0.010325 0.880608 0.008083 0.100974
142 | Ind5 Pop3 0.933301 0.066699 0.098573 0.011919 0.889508 0.885705 0.000010 0.017836 0.096449 0.000010 0.016165 0.886247 0.001159 0.096418
143 | ```
144 | 
145 | `AdmixturePlotter.R` comes with a number of options. Usage information and helptext is shown when the script is provided with the `-h` option.
146 | ```
147 |   Usage: ./AdmixturePlotter.R [options]
148 | 
149 | 
150 | Options:
151 | 	-h, --help
152 | 		Show this help message and exit
153 | 
154 | 	-i INPUT, --input=INPUT
155 | 		The input data file. This file should contain all components per K per indiviual for all K values.
156 | 
157 | 	-c COLOURLIST, --colourList=COLOURLIST
158 | 		A file of desired colours, in R compatible formats. One colour per line.
159 | 
160 | 	-p POPORDER, --popOrder=POPORDER
161 | 		A file containing one population per line in the desired order.
162 | 
163 | 	-o OUTPUTPLOT, --outputPlot=OUTPUTPLOT
164 | 		The desired name of the output plot. [Default: 'OutputPlot.pdf']
165 | 
166 | 	-r, --remove
167 | 		If an order list is provided, should populations not in the list be removed from the output plot?
168 |                      Not recommended for final figures, but can help in cases where you are trying to focus on a certain subset of your populations.
169 | ```
170 | `AdmixturePlotter.R` can be run by specifying the input data alone. The output will always be in pdf format, and will be named according to the provided `-o/--outputPlot` option. If that option is not provided, the resulting figure will be named `OutputPlot.pdf`.
171 | 
172 | You can provide the script with a colour definitions file with the `-c/--colourList` option. An example colour definition file is provided at `ExampleColourList.txt` that defines a set of colours up to K=20 (Colour source: Haak et al. 2015, Figure S6.3). 
173 | 
174 | When no colour list is provided, the script will use the R function `rainbow()` to create an appropriate number of colours for assignment. Be warned that with large maximum K values, this may result in components being assigned colours that are visually similar to each other. 
175 | 
176 | It is possible to set the order of populations in the resulting plot by using a population order list. This should be a text file with all 
177 | plotted populations, one population per line. The path to this list should be provided using the `-p/--popOrder` option. Any populations whose order is not specified in the list, but are part of the dataset, will be plotted in population "NA" at the right end of the plot. This makes it easy to check if your population order list is missing any populations from your dataset.
178 | 
179 | In cases where you are trying to plot only a subset of your dataset, you can set a population order list that contains all the populations you wish to include in your plot (in the desired order) in addition to the `-r/--remove` option.
180 | 
181 | ```bash
182 | AdmixturePlotter.R -i SampleData.input.txt -c ExampleColourList.txt -o SamplePlot -p PopOrder.txt [-r]
183 | ```
184 | 
185 | The colours provided in the colour list are assigned to components by index, so it is possible to change a specific colour in the output by changing the definition of that colour in the colour list.
186 | 
187 | # Troubleshooting
188 | If you run into problems or get unexpected results, please contact the author of this script, or submit an issue via GitHub.
189 | 


--------------------------------------------------------------------------------