├── ExampleColourList.txt
├── CVErrorBoxplotPlotter.R
├── CompileData.sh
├── AdmixturePlotter.R
└── README.md

/ExampleColourList.txt:
--------------------------------------------------------------------------------
"#FF994E"
"#0099E5"
"#E4FE02"
"#FF9AE6"
"#339933"
"#850184"
"#FF004C"
"#00FF02"
"#0001FE"
"#FF00FF"
"#FFE698"
"#B24D00"
"#01FFFF"
"#807E05"
"#FF9899"
"#008183"
"#98BF25"
"#7326E6"
"#28BF99"
"#808080"
--------------------------------------------------------------------------------

/CVErrorBoxplotPlotter.R:
--------------------------------------------------------------------------------
#!/usr/bin/env Rscript
## Plot the cross-validation (CV) error of every ADMIXTURE replicate as one
## box per K value.
##   input : whitespace separated table with a header; one column per K, one
##           row per replicate.
##   output: name of the PNG file to write. A trailing '.png' is optional.
library(stringr)

args <- commandArgs(TRUE)
## Both positional arguments are required. Indexing past the end of the
## argument vector yields NA, which is what is tested here.
if (is.na(args[1]) || is.na(args[2])) {
  print("Usage: Rscript CVErrorBoxplotPlotter.R input output")
  quit(status = 1)
}
input <- args[1]
## Strip only a literal, trailing '.png' before appending the suffix. The
## pattern is escaped and anchored: an unanchored ".png" treats '.' as a
## wildcard and would also remove e.g. "_png" from the middle of a name.
output <- paste0(sub("\\.png$", "", args[2]), ".png")

## CV error distribution for the top K replicates
CVs <- read.table(input, header = TRUE)
## read.table() prefixes purely numeric column names with "X". Removing that
## prefix recovers the K value of every column.
k_labels <- str_remove(names(CVs), "^X")

## box plot for CV errors of all replicates
png(output, height = 20, width = 1.5 * ncol(CVs), res = 300, units = "cm")
par(cex.main = 1.2, cex.axis = 1, cex.lab = 1)
par(mar = c(5.1, 4.6, 4.1, 2.1))
boxplot(CVs, xlab = "K", ylab = "CV error", xaxt = "n", main = "")
# mtext(3, text=input, line=2.2, cex=1.3, font=2)
mtext(3, text = "CV error for all replicates", line = 1.5, cex = 1.3, font = 2)
## Label every box with the K taken from its own column header, so the labels
## stay correct even when the K values in the input are not consecutive.
axis(side = 1, at = seq_len(ncol(CVs)), labels = k_labels)
invisible(dev.off())
--------------------------------------------------------------------------------

/CompileData.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env bash

###################################################################
## Compiling ADMIXTURE run CV errors and Q matrices for plotting ##
###################################################################

fn0="/PATH/TO/ADMIXTURE/OUTPUT" # This should be a directory with one folder per K (which itself contains one folder per replicate and one Logs folder with the slurm logs).
# All relative paths below are resolved from the ADMIXTURE output directory.
cd ${fn0}
mkdir -p ${fn0}/Plotting


## Set ADMIXTURE IndFile (Eigenstrat), ADMIXTURE input .bed file and minimum and maximum K values.
IndFile="/PATH/TO/YOUR/PLINK/DATA/Data.ind" # An Eigenstrat .ind version of the data you converted to plink
bedFile="/PATH/TO/YOUR/PLINK/DATA/Data.pruned.bed" # The input .bed file you ran ADMIXTURE on.
Kmin=2 #The minimum number of Ks you ran
Kmax=17 #The maximum number of Ks you ran


## Compile CV Errors
# Each iteration builds one column (the K value as header, followed by the 4th
# space-separated field of every line containing "CV" in that K's log files)
# and pastes it to the LEFT of the table built so far. Counting down from Kmax
# to Kmin therefore leaves the columns in ascending K order.
touch CVErrors.txt
for i in $(seq ${Kmax} -1 ${Kmin}); do #seq doesn’t like backwards counting on the clusters, so giving the increment of "-1" fixes the problem.
    (echo $i; grep CV $i/Logs/* | cut -f 4 -d " ") | paste -d " " - CVErrors.txt > temp_CVErrors
    mv temp_CVErrors CVErrors.txt
done
# ${r% } removes a single trailing space from each line that is read.
# NOTE(review): the next line looks truncated. Text appears to be missing
# between `done` and `Plotting/best_runs.txt` (presumably the input/output
# redirections and the step that creates best_runs.txt). Restore it from the
# upstream script before running.
while read r; do echo ${r% } >>temp_CVErrors; done Plotting/best_runs.txt


## Compile Q matrices and add ind/pop labels
# For every K, take the matching entry from Plotting/best_runs.txt, split the
# text between the first "K" and the first ":" at "_" into the K value and the
# replicate, and collect the path of that replicate's .Q file. The paths are
# then pasted side by side into one table.
# NOTE(review): the layout of best_runs.txt is not visible here. The cut calls
# suggest entries of the form "...K<K>_<replicate>.log:..."; confirm.
unset runs
for i in $(seq ${Kmin} ${Kmax}); do
    X=$(grep "K${i}\_" Plotting/best_runs.txt | cut -d "K" -f2 | cut -d ":" -f1 )
    K=$(echo ${X} | cut -f1 -d "_"); Rep=$(echo ${X} | cut -d "_" -f 2)
    # ${Rep%.log} drops the ".log" suffix so that only the replicate folder name remains.
    runs+="$K/${Rep%.log}/$(basename ${bedFile} .bed).$K.Q "
done
paste -d " " ${runs} >Plotting/temp_data.txt


## Create compiled Q matrix header.
# Writes one "<K>:<component>" label per Q column, each followed by a space
# (e.g. "2:1 2:2 3:1 3:2 3:3 ..."), all on a single line.
for i in $(seq ${Kmin} ${Kmax}); do
    for x in $(seq 1 ${i}); do
        echo -n "${i}:${x}" >>Plotting/temp_header.txt
        echo -n " " >>Plotting/temp_header.txt
    done
done


## Remove trailing space from header.
echo "" >> Plotting/temp_header.txt
# NOTE(review): the next line looks truncated. Text appears to be missing
# between `done` and `Plotting/temp_pop_labels.txt` (presumably redirections
# and the creation of the "Ind Pop" label header). Restore it from the upstream
# script before running.
while read r; do echo ${r% } > Plotting/temp_header.txt; done Plotting/temp_pop_labels.txt; awk '{print $1,$3}' $IndFile >>Plotting/temp_pop_labels.txt


## Put together header, data and Pop labels to create compiled data table
cd ${fn0}/Plotting
cat temp_header.txt temp_data.txt >temp_compound.data.txt
paste -d " " temp_pop_labels.txt temp_compound.data.txt >compound.labelled.QperK.txt

## Clean up
rm temp_*
--------------------------------------------------------------------------------

/AdmixturePlotter.R:
--------------------------------------------------------------------------------
#!/usr/bin/env Rscript

## Define functions ----------------------------

## Correlate the components of run K with the components of run K-1.
##
## k     : the K value being processed. Must be larger than k_min.
## k_min : the smallest K present in the data.
## data  : the compiled table. Columns 1 and 2 hold the individual and
##         population labels, followed by the component columns of every K in
##         ascending K order. Defaults to the global 'raw_data', so existing
##         two-argument calls behave exactly as before.
##
## Returns a (k-1) x k correlation matrix with one row per component of K-1
## and one column per component of K.
correlate_components <- function(k, k_min, data = raw_data) {
  ## Number of component columns that belong to the Ks below this one:
  ## k_min + (k_min + 1) + ... + (k - 1).
  cols_before <- (k * (k - 1) - k_min * (k_min - 1)) / 2
  ## The first two columns are the labels, hence the offset of 2.
  start_this_k <- 2 + cols_before + 1
  end_this_k <- start_this_k + k - 1
  end_prev_k <- start_this_k - 1
  start_prev_k <- end_prev_k - (k - 2)
  cor(data[start_prev_k:end_prev_k], data[start_this_k:end_this_k])
}

## Work out the order in which the components of run K have to be arranged so
## that each one lines up with its best correlated component of run K-1.
##
## k     : the K value being processed.
## k_min : the smallest K present in the data.
## data  : see correlate_components(). Defaults to the global 'raw_data'.
##
## Returns a character vector of k column names ("<k>:<component>") in the
## matched order. The component of K that is not matched to any component of
## K-1 is placed last.
fix_colours <- function(k, k_min, data = raw_data) {
  ## If the K being processed is the minimum K in the data, keep columns
  ## unchanged (since there is nothing to compare to).
  if (k == k_min) {
    return(paste0(k, ":", seq_len(k)))
  }

  ## Index of the highest non-missing correlation in one row of the matrix.
  ## Fails with a readable message when no candidate is left, instead of the
  ## cryptic "replacement has length zero" a bare which.max() would lead to.
  best_match <- function(scores) {
    hit <- which.max(scores)
    if (length(hit) == 0) {
      stop(paste0("Correlation of components failed for K=", k,
                  ": no usable correlation left to match a component."),
           call. = FALSE)
    }
    unname(hit)
  }

  ## Calculate a correlation matrix for each component of this K to the
  ## components of K-1.
  cor_mat <- correlate_components(k, k_min, data)

  ## For each component of the last K (one row each), find the most correlated
  ## component of this K. seq_len() is used because `1:k-1` parses as
  ## `(1:k) - 1` and would start at index 0.
  component_order <- vapply(
    seq_len(k - 1),
    function(x) best_match(cor_mat[x, ]),
    integer(1)
  )

  ## If one component of this K is the best match for two components of the
  ## last K, give the second of them its next best match instead. Repeat until
  ## every component of the last K has a unique partner.
  n_duplicated <- sum(duplicated(component_order))
  if (n_duplicated == 1) {
    duplicate <- which(duplicated(component_order))
    while (any(duplicated(component_order))) {
      ## Rule out the current (clashing) choice, then take the best of what
      ## is left in that row.
      cor_mat[duplicate, component_order[duplicate]] <- NA
      component_order[duplicate] <- best_match(cor_mat[duplicate, ])
    }
  } else if (n_duplicated > 1) {
    stop(paste0("Correlation of components failed. Usually this is caused by high CV errors for some of the components you are trying to plot.
Please consider limiting your input dataset to K=", k_min, " to ", k - 1, ".

You can use this command to extract the suggested columns from the input file:
    cut -d ' ' -f 1-", sum(seq(k_min, k - 1)) + 2, "
"), call. = FALSE)
  }

  ## If a component hasn't been resolved yet, add it as the newest component.
  missing_component <- setdiff(seq_len(k), component_order)
  component_order <- c(component_order, missing_component)
  paste0(k, ":", component_order)
}

## Look up the colour(s) at index x in the global 'colours' vector.
pick_colour <- function(x) {
  colours[x]
}
#### MAIN ####

## Load libraries -----------------------------
library(optparse)
library(ggplot2)
library(dplyr, warn.conflicts = FALSE)
library(tidyr)
library(stringr)
library(readr)

## Parse arguments ----------------------------
parser <- OptionParser()
parser <- add_option(parser, c("-i", "--input"), type = "character",
                     action = "store", dest = "input",
                     help = "The input data file. This file should contain all
                     components per K per indiviual for all K values.")
parser <- add_option(parser, c("-c", "--colourList"), type = "character",
                     action = "store", dest = "colourList",
                     help = "A file of desired colours, in R compatible formats.
                     One colour per line.")
parser <- add_option(parser, c("-p", "--popOrder"), type = "character",
                     action = "store", dest = "popOrder",
                     help = "A file containing one population per line in the
                     desired order.")
parser <- add_option(parser, c("-o", "--outputPlot"), type = "character",
                     action = "store", default = "OutputPlot", dest = "output",
                     help = "The desired name of the output plot.
                     [Default: '%default.pdf']")
parser <- add_option(parser, c("-r", "--remove"), type = "logical",
                     action = "store_true", default = FALSE, dest = "remove",
                     help = "If an order list is provided, should populations not
                     in the list be removed from the output plot?
                     Not recommended for final figures, but can help in cases
                     where you are trying to focus on a certain subset of your
                     populations.")

args <- parse_args(parser)
## If no input is given, script will exit and provide Usage information.
if (is.null(args$input)) {
  write("No input file given. Halting execution.", stderr())
  print_help(parser)
  quit(status = 1)
}

## Read cli options into variables.
input <- args$input
colour_file <- args$colourList
## Output name will ignore '.pdf' suffix if provided by user.
## The pattern is escaped and anchored so that only a literal, trailing '.pdf'
## is removed. An unanchored ".pdf" treats '.' as a wildcard and would also
## strip e.g. "_pdf" from the middle of a name.
output <- sub("\\.pdf$", "", args$output)
pop_order <- args$popOrder
if (args$remove && is.null(args$popOrder)) {
  write("No population order specified. 'remove' option ignored.", stderr())
}

## Load data --------------------------------

## read data
raw_data <- read_delim(input, delim = " ", col_types = cols())

## The rest of the script addresses the label columns by name, so fail early
## and clearly when the header does not provide them.
if (!all(c("Ind", "Pop") %in% names(raw_data))) {
  stop("The input file needs a header containing the columns 'Ind' and 'Pop'.",
       call. = FALSE)
}

## Infer min and max K values from the "<K>:<component>" names of the first
## and the last component column.
k_min <- as.numeric(str_split_fixed(names(raw_data)[3], ":", 2)[1])
k_max <- as.numeric(str_split_fixed(names(raw_data)[ncol(raw_data)], ":", 2)[1])

## Sort components of each K according to correlation with components of K-1.
## This needs to happen per K, otherwise the correlations will not match
## beyond the first pair of Ks.
header <- names(raw_data) ## Take column names from original data

## 'Ind' and 'Pop' should always be at the start of the reformatted data.
refcols <- c("Ind", "Pop")

## For each K in the data, use fix_colours to extract the vector of most correlated
## column names for each Component in the K. Then sort the components of this
## K in the raw data.
for (k in k_min:k_max) {
  ## Append the column names of this K in their matched order ...
  refcols <- c(refcols, fix_colours(k, k_min))
  ## ... and move them directly behind the columns that are already sorted.
  ## The next K is then correlated against the sorted version of this K.
  raw_data <- raw_data[, c(refcols, setdiff(names(raw_data), refcols))]
}

## Finally, fix the column names so that inference of component numbers is correct
names(raw_data) <- header

## Flatten data to long format
long_data <- gather(raw_data, temp, value, 3:ncol(raw_data))
## Remove raw_data from memory to reduce memory footprint.
rm(raw_data)
## Split K and Component name to separate columns
long_data <- long_data %>%
  separate(temp, c("K", "Component"), sep = ":") %>%
  mutate(K = as.numeric(K), Component = as.numeric(Component))

## If no colour list is provided, use rainbow() to generate the required number
## of colours. Otherwise, read the colour definitions into a vector.
if (is.null(colour_file)) {
  colours <- rainbow(k_max)
} else {
  colours <- read_delim(colour_file, delim = "\n", col_types = cols(),
                        col_names = FALSE)
  colours <- colours$X1
}

## Components are coloured by index, so there must be at least one colour per
## component of the largest K. Without this check the surplus components would
## silently receive a missing colour.
if (length(colours) < k_max) {
  stop(paste0("The colour list defines ", length(colours),
              " colours, but K=", k_max, " needs ", k_max, "."),
       call. = FALSE)
}

## Create colour column based on colour vector.
## Each component in each K run is given the colour of the same index as that
## component from the colours list. Plain vector indexing does this for the
## whole column at once.
long_data <- long_data %>%
  mutate(clr = colours[Component])

## Set order of Pops
## If no OrderList is provided, then the populations are sorted alphabetically
if (!is.null(pop_order)) {
  pop_levels <- read.delim(pop_order, header = FALSE, col.names = "Pops",
                           stringsAsFactors = FALSE)
  ## Populations that are absent from the list become NA in Pop_f.
  long_data$Pop_f <- factor(long_data$Pop, levels = pop_levels$Pops)
} else {
  long_data$Pop_f <- long_data$Pop
}

## Early testing dataset subset
# temp_data <- filter(long_data, K == 2)

## Create the named vector (dictionary) of colours needed for scale_fill_manual.
## Each colour is mapped to itself.
## One dictionary entry per distinct colour is enough.
col <- unique(as.character(long_data$clr))
names(col) <- col

## Populations missing from the order list have an NA in Pop_f. They are only
## dropped when the user asked for it with -r/--remove; otherwise they stay in
## the plot under the "NA" population. (The option always has a value, so
## testing it with is.null() would drop them on every run.)
if (isTRUE(args$remove) && !is.null(pop_order)) {
  long_data <- drop_na(long_data, "Pop_f")
}

## Plot data --------------------------------------------

## Remember whether an Rplots.pdf already existed, so that the clean-up at the
## end only removes a file that was created as a by-product of this run.
rplots_preexisting <- file.exists("Rplots.pdf")

## Plot the value of each component(y) per individual(x). 'clr' is also the
## categorical variable, which is ok since each category will be seen once per K.
admixture_plot <- ggplot(long_data, aes(x = Ind, y = value, fill = clr)) +
  geom_bar(stat = "identity", width = 1) +
  ## Colour bars by colour vector.
  scale_fill_manual(values = col) +
  ## X scale changed per Pop. 0 multiplicative change, and +1 additive.
  ## Creates the white bars between groups.
  scale_x_discrete(expand = c(0, 1)) +
  ## Set Y axis label
  ylab("K = ") +
  theme_minimal() +
  theme(legend.position = "none", ## No legend
        text = element_text(family = "Helvetica"),
        # axis.text.x = element_text(angle = 90, hjust = 1, size = 6),
        ## Rotate and resize X axis strip text (Pop Names)
        strip.text.x = element_text(angle = 90, hjust = 0, size = 4),
        ## Rotate Y axis strip text (K value)
        strip.text.y = element_text(angle = 180),
        panel.spacing.x = unit(0, "lines"),
        ## Set white space between K plots.
        panel.spacing.y = unit(0.005, "lines"),
        ## Remove axis ticks, and axis text (ancestry proportion and sample names)
        axis.ticks = element_blank(),
        axis.text.y = element_blank(),
        axis.text.x = element_blank(),
        ## Remove X axis title ("Ind")
        axis.title.x = element_blank(),
        ## Remove gridlines from plot
        panel.grid = element_blank()) +
  ## Creates the plot made so far for each K and each Pop.
  ## The plots per POP are then plotted on top of one another to create each K plot.
  ## The per K plots are plotted below one another.
  facet_grid(K ~ Pop_f,
             scales = "free_x",
             space = "free",
             ## Switch the labels of the Y-axis so they are plotted to the left (K=).
             switch = "y")

## Saves the plot as a pdf with specified size. The plot object is passed
## explicitly instead of relying on the last plot that was displayed.
ggsave(filename = paste0(output, ".pdf"),
       plot = admixture_plot,
       limitsize = FALSE,
       width = 50, height = 20,
       units = "cm")

## Silently remove the Rplots.pdf file, if one was created.
if (!rplots_preexisting && file.exists("Rplots.pdf") && output != "Rplots") {
  invisible(file.remove("Rplots.pdf"))
}

--------------------------------------------------------------------------------

/README.md:
--------------------------------------------------------------------------------
# AdmixturePlotter
A set of scripts to generate plots for ADMIXTURE runs, for multiple K values.

# Dependencies
* R (tested on version 3.5.2 "Eggshell Igloo")
  * optparse
  * ggplot2
  * dplyr
  * tidyr
  * stringr
  * readr


# CompileData.sh
This is an example script for compiling the data from multiple K runs and replicates into the input formats for `CVErrorBoxplotPlotter.R` and `AdmixturePlotter.R`.

`CompileData.sh` assumes a folder structure of `OUTPUT_FOLDER/K_Value/Replicate_Number/Result.Q` with a `Logs` folder within each `K_Value` folder, which contains the logfile of all replicates from the admixture runs of that K. In turn these logfiles should be named `_.log`. Given that structure, one should copy and edit the script to include their own paths to their bed format data, the Eigenstrat individual file of the dataset, and the range of K values admixture was ran for. The script can then be ran to produce the correct format of data.
18 | 19 | Example directory structure within output folder, for a run of 5 replicates with K=2-4: 20 | ```bash 21 | /PATH/TO/MY/ADMIXTURE/OUTPUT/ $ ls -l * 22 | 2: ## The K value for the admixture runs 23 | total 0 24 | drwxrwsr-x 2 user group 4.0K Sep 1 2018 1/ ## Output for K=2 run replicate 1 25 | drwxrwsr-x 2 user group 4.0K Sep 1 2018 2/ ## Output for K=2 run replicate 2 26 | drwxrwsr-x 2 user group 4.0K Sep 1 2018 3/ ## Output for K=2 run replicate 3 27 | drwxrwsr-x 2 user group 4.0K Sep 1 2018 4/ ## Output for K=2 run replicate 4 28 | drwxrwsr-x 2 user group 4.0K Sep 1 2018 5/ ## Output for K=2 run replicate 5 29 | drwxrwsr-x 2 user group 4.0K Sep 1 2018 Logs/ ## The logfiles from all replicates with this K value go in here. 30 | 31 | 3: ## The K value for the admixture runs 32 | total 0 33 | drwxrwsr-x 2 user group 4.0K Sep 1 2018 1/ ## Output for K=3 run replicate 1 34 | drwxrwsr-x 2 user group 4.0K Sep 1 2018 2/ ## Output for K=3 run replicate 2 35 | drwxrwsr-x 2 user group 4.0K Sep 1 2018 3/ ## Output for K=3 run replicate 3 36 | drwxrwsr-x 2 user group 4.0K Sep 1 2018 4/ ## Output for K=3 run replicate 4 37 | drwxrwsr-x 2 user group 4.0K Sep 1 2018 5/ ## Output for K=3 run replicate 5 38 | drwxrwsr-x 2 user group 4.0K Sep 1 2018 Logs/ ## The logfiles from all replicates with this K value go in here. 39 | 40 | 4: ## The K value for the admixture runs 41 | total 0 42 | drwxrwsr-x 2 user group 4.0K Sep 1 2018 1/ ## Output for K=4 run replicate 1 43 | drwxrwsr-x 2 user group 4.0K Sep 1 2018 2/ ## Output for K=4 run replicate 2 44 | drwxrwsr-x 2 user group 4.0K Sep 1 2018 3/ ## Output for K=4 run replicate 3 45 | drwxrwsr-x 2 user group 4.0K Sep 1 2018 4/ ## Output for K=4 run replicate 4 46 | drwxrwsr-x 2 user group 4.0K Sep 1 2018 5/ ## Output for K=4 run replicate 5 47 | drwxrwsr-x 2 user group 4.0K Sep 1 2018 Logs/ ## The logfiles from all replicates with this K value go in here. 
48 | ``` 49 | 50 | And within each K result folder: 51 | ```bash 52 | /PATH/TO/MY/ADMIXTURE/OUTPUT/ $ cd 2 53 | 54 | /PATH/TO/MY/ADMIXTURE/OUTPUT/2 $ ls -l * 55 | ## These are the contents of each subfolder of the K2 runs. 56 | 1: ## Output for K=2 run replicate 1 57 | total 7.7M 58 | -rw-rw-r-- 1 user group 3.8M Sep 1 2018 Admixture.Output.2.P 59 | -rw-rw-r-- 1 user group 17K Sep 1 2018 Admixture.Output.2.Q 60 | -rw-rw-r-- 1 user group 18K Sep 1 2018 Admixture.Output.2.Q_bias 61 | -rw-rw-r-- 1 user group 17K Sep 1 2018 Admixture.Output.2.Q_se 62 | 63 | 2: ## Output for K=2 run replicate 2 64 | total 7.7M 65 | -rw-rw-r-- 1 user group 3.8M Sep 1 2018 Admixture.Output.2.P 66 | -rw-rw-r-- 1 user group 17K Sep 1 2018 Admixture.Output.2.Q 67 | -rw-rw-r-- 1 user group 18K Sep 1 2018 Admixture.Output.2.Q_bias 68 | -rw-rw-r-- 1 user group 17K Sep 1 2018 Admixture.Output.2.Q_se 69 | 70 | 3: ## Output for K=2 run replicate 3 71 | total 7.7M 72 | -rw-rw-r-- 1 user group 3.8M Sep 1 2018 Admixture.Output.2.P 73 | -rw-rw-r-- 1 user group 17K Sep 1 2018 Admixture.Output.2.Q 74 | -rw-rw-r-- 1 user group 18K Sep 1 2018 Admixture.Output.2.Q_bias 75 | -rw-rw-r-- 1 user group 17K Sep 1 2018 Admixture.Output.2.Q_se 76 | 77 | 4: ## Output for K=2 run replicate 4 78 | total 7.7M 79 | -rw-rw-r-- 1 user group 3.8M Sep 1 2018 Admixture.Output.2.P 80 | -rw-rw-r-- 1 user group 17K Sep 1 2018 Admixture.Output.2.Q 81 | -rw-rw-r-- 1 user group 18K Sep 1 2018 Admixture.Output.2.Q_bias 82 | -rw-rw-r-- 1 user group 17K Sep 1 2018 Admixture.Output.2.Q_se 83 | 84 | 5: ## Output for K=2 run replicate 5 85 | total 7.7M 86 | -rw-rw-r-- 1 user group 3.8M Sep 1 2018 Admixture.Output.2.P 87 | -rw-rw-r-- 1 user group 17K Sep 1 2018 Admixture.Output.2.Q 88 | -rw-rw-r-- 1 user group 18K Sep 1 2018 Admixture.Output.2.Q_bias 89 | -rw-rw-r-- 1 user group 17K Sep 1 2018 Admixture.Output.2.Q_se 90 | 91 | Logs: ## The logfiles from all replicates with this K value go in here. 
92 | total 320K 93 | -rw-rw-r-- 1 user group 30K Sep 1 2018 K2_1.log ## Logfile of K=2 run, replicate 1. 94 | -rw-rw-r-- 1 user group 30K Sep 1 2018 K2_2.log ## Logfile of K=2 run, replicate 2. 95 | -rw-rw-r-- 1 user group 30K Sep 1 2018 K2_3.log ## Logfile of K=2 run, replicate 3. 96 | -rw-rw-r-- 1 user group 30K Sep 1 2018 K2_4.log ## Logfile of K=2 run, replicate 4. 97 | -rw-rw-r-- 1 user group 30K Sep 1 2018 K2_5.log ## Logfile of K=2 run, replicate 5. 98 | ``` 99 | 100 | If your folder structure does not follow this system, looking into the code is a good starting point for the commands that can be used to create the desired data. 101 | 102 | # CVErrorBoxplotPlotter.R 103 | This is a script to plot the CV error for multiple replicates per K value in box-and-whisker format. The expected input is a space- or 104 | tab-separated table with each column being a K value, and each row a replicate. A header row is expected, giving the K value that the replicates in 105 | each column correspond to. 106 | 107 | Example input for K=2 to 5 (space separated): 108 | ``` 109 | 2 3 4 5 110 | 0.49993 0.47984 0.47426 0.47029 111 | 0.49993 0.47985 0.47427 0.47028 112 | 0.49992 0.47985 0.47430 0.47032 113 | 0.49993 0.47984 0.47424 0.47033 114 | 0.49994 0.47984 0.47428 0.47033 115 | ``` 116 | 117 | `CVErrorBoxplotPlotter.R` can then be run by specifying the input and output file names. 118 | ```bash 119 | CVErrorBoxplotPlotter.R CVErrors.input.txt CVErrorBoxPlot 120 | ``` 121 | This will create a figure named **CVErrorBoxPlot.png**. 122 | 123 | Running the plotter without any arguments will print a short usage message: 124 | ``` 125 | [1] "Usage: Rscript CVErrorBoxplotPlotter.R input output" 126 | ``` 127 | 128 | # AdmixturePlotter.R 129 | This is a script to plot ADMIXTURE output for multiple K values. It uses a correlation matrix between different components across 130 | sequential K values to (attempt to) correctly assign consistent colours across Ks. 
The expected input file is a space separated compound 131 | dataset of all the results per component per K, with labelling of individual and population name. Once again, a header is expected. Each 132 | line corresponds to one individual. The first two columns correspond to the Individual ID and population respectively. The rest of the 133 | columns correspond to components within each ADMIXTURE run to be plotted, with `2:1` corresponding to component 1 of the K=2 run, `2:2` the second component of that run, `3:1` the first component of the K=3 ADMIXTURE run, etc. 134 | 135 | Example input for K=2 to 5, for 5 individuals: 136 | ``` 137 | Ind Pop 2:1 2:2 3:1 3:2 3:3 4:1 4:2 4:3 4:4 5:1 5:2 5:3 5:4 5:5 138 | Ind1 Pop1 0.942951 0.057049 0.012524 0.987466 0.000010 0.000010 0.315992 0.683988 0.000010 0.118408 0.881562 0.000010 0.000010 0.000010 139 | Ind2 Pop2 0.914518 0.085482 0.125482 0.006548 0.867970 0.864029 0.000010 0.014720 0.121241 0.000010 0.012874 0.864779 0.000436 0.121901 140 | Ind3 Pop2 0.927737 0.072263 0.107645 0.019653 0.872702 0.867123 0.000010 0.029861 0.103005 0.000010 0.020055 0.867397 0.009572 0.102967 141 | Ind4 Pop2 0.929991 0.070009 0.103765 0.011336 0.884900 0.880428 0.000010 0.019200 0.100363 0.000010 0.010325 0.880608 0.008083 0.100974 142 | Ind5 Pop3 0.933301 0.066699 0.098573 0.011919 0.889508 0.885705 0.000010 0.017836 0.096449 0.000010 0.016165 0.886247 0.001159 0.096418 143 | ``` 144 | 145 | `AdmixturePlotter.R` comes with a number of options. Usage information and helptext is shown when the script is provided with the `-h` option. 146 | ``` 147 | Usage: ./AdmixturePlotter.R [options] 148 | 149 | 150 | Options: 151 | -h, --help 152 | Show this help message and exit 153 | 154 | -i INPUT, --input=INPUT 155 | The input data file. This file should contain all components per K per indiviual for all K values. 156 | 157 | -c COLOURLIST, --colourList=COLOURLIST 158 | A file of desired colours, in R compatible formats. One colour per line. 
159 | 160 | -p POPORDER, --popOrder=POPORDER 161 | A file containing one population per line in the desired order. 162 | 163 | -o OUTPUTPLOT, --outputPlot=OUTPUTPLOT 164 | The desired name of the output plot. [Default: 'OutputPlot.pdf'] 165 | 166 | -r, --remove 167 | If an order list is provided, should populations not in the list be removed from the output plot? 168 | Not recommended for final figures, but can help in cases where you are trying to focus on a certain subset of your populations. 169 | ``` 170 | `AdmixturePlotter.R` can be run by specifying the input data alone. The output will always be in pdf format, and will be named according to the provided `-o/--outputPlot` option. If that option is not provided, the resulting figure will be named `OutputPlot.pdf`. 171 | 172 | You can provide the script with a colour definitions file with the `-c/--colourList` option. An example colour definition file is provided at `ExampleColourList.txt` that defines a set of colours up to K=20 (Colour source: Haak et al. 2015, Figure S6.3). 173 | 174 | When no colour list is provided, the script will use the R function `rainbow()` to create an appropriate number of colours for assignment. Be warned that with large maximum K values, this may result in components being assigned colours that are visually similar to each other. 175 | 176 | It is possible to set the order of populations in the resulting plot by using a population order list. This should be a text file with all 177 | plotted populations, one population per line. The path to this list should be provided using the `-p/--popOrder` option. Any populations whose order is not specified in the list, but are part of the dataset, will be plotted in population "NA" at the right end of the plot. This makes it easy to check if your population order list is missing any populations from your dataset. 
178 | 179 | In cases where you are trying to plot only a subset of your dataset, you can set a population order list that contains all the populations you wish to include in your plot (in the desired order) in addition to the `-r/--remove` option. 180 | 181 | ```bash 182 | AdmixturePlotter.R -i SampleData.input.txt -c ExampleColourList.txt -o SamplePlot -p PopOrder.txt [-r] 183 | ``` 184 | 185 | The colours provided in the colour list are appointed to components by index, so it is possible to change a specific colour in the output by changing the colour definition of that specific colour in the colour list. 186 | 187 | # Troubleshooting 188 | If you run into problems or get unexpected results, please contact the author of this script, or submit an issue via GitHub. 189 | --------------------------------------------------------------------------------