├── Images ├── 1.admixture_barplot.png ├── 2.pie_charts_map.png └── 3.Admixture_bar_map.png ├── README.md ├── TJ_genind2structure_function.R ├── coordinates.csv ├── lobster_1278ind_79snps_40pop.RData └── pie_chart_admixture_map_tutorial.R /Images/1.admixture_barplot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tom-Jenkins/admixture_pie_chart_map_tutorial/b4c44c1bd3dbf546b425b90b603a5d795b3f5a9e/Images/1.admixture_barplot.png -------------------------------------------------------------------------------- /Images/2.pie_charts_map.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tom-Jenkins/admixture_pie_chart_map_tutorial/b4c44c1bd3dbf546b425b90b603a5d795b3f5a9e/Images/2.pie_charts_map.png -------------------------------------------------------------------------------- /Images/3.Admixture_bar_map.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tom-Jenkins/admixture_pie_chart_map_tutorial/b4c44c1bd3dbf546b425b90b603a5d795b3f5a9e/Images/3.Admixture_bar_map.png -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Visualising admixture on a map using R 2 | The scripts in this repository cover how to conduct STRUCTURE-like analyses in R using the R package LEA, how to visualise admixture proportions at both the individual and the population level, and how to plot admixture data as pie charts on a map. 
--------------------------------------------------------------------------------
/TJ_genind2structure_function.R:
--------------------------------------------------------------------------------
#======================================
# R function to export a genind object in STRUCTURE format
#
# Tom Jenkins t.l.jenkins@exeter.ac.uk
#
# July 2018
#
#======================================

# Arguments:
#   data:    genind object (a warning is raised for any other class)
#   file:    file name to write ("" writes to the console)
#   pops:    whether to include a column of numeric population IDs
#   markers: whether to include a first row holding the locus names
#   unix:    if TRUE, write through a binary connection so that lines end
#            in LF on every platform; otherwise the platform default is used
#
# Output layout (tab-separated, no quoting):
#   one row per allele copy, i.e. max(ploidy) rows per individual;
#   column 1 = individual label, optional column 2 = population ID,
#   then one column per locus.
#   Alleles are coded 1, 2, ... in the order of the allele columns of
#   data$tab for that locus; -9 codes for missing data.
#
# Function is flexible with regards to ploidy, although genotypes are
# considered to be unambiguous.
# Missing data must be recorded as NA.
#
# Value: NULL (invisibly); the function is called for its side effect.

# Example use:
# library(adegenet)
# ind = as.character(paste("ind_", seq(1:100), sep=""))
# pop = as.character(c(rep("pop1",25), rep("pop2",25), rep("pop3",25), rep("pop4",25)))
# loci = list(c("AA","AC","CC"), c("GG","GC","CC"), c("TT","TA","AA"), c("CC","CT","TT"))
# loci = sample(loci, 100, replace=T)
# loci = lapply(loci, sample, size=100, replace=TRUE)
# geno = as.data.frame(loci, col.names= .genlab("loc",100))
# data = df2genind(geno, ploidy=2, ind.names=ind, pop=pop, sep="")
# genind2structure(data, file="example_structure.str")

# Convert Windows text file to Unix text file in Linux
# awk '{ sub("\r$", ""); print }' winfile.txt > unixfile.txt


genind2structure = function(data, file="", pops=TRUE, markers=TRUE, unix=FALSE){

  ## Warn (rather than stop) if the input is not a genind object
  if(!inherits(data, "genind")){
    warning("Function was designed for genind objects.")
  }

  ## Make sure adegenet, miscTools and stringr are installed AND attached.
  ## A freshly installed package still has to be loaded before use.
  ## (stringr is not used below; it is kept so the set of attached packages
  ## is the same as before.)
  for(pkg in c("adegenet", "miscTools", "stringr")){
    if(!require(pkg, character.only=TRUE)){
      install.packages(pkg)
      library(pkg, character.only=TRUE)
    }
  }


  # ---------------- #
  #
  # Preamble
  #
  # ---------------- #

  ## Ploidy (the maximum across individuals sets the rows per individual)
  ploid = max(data$ploidy)

  ## Number of individuals
  ind = nInd(data)

  ## Locus names
  loci = locNames(data)

  ## Create dataframe containing individual labels
  ## Each label is repeated once per allele copy (ploidy)
  df = data.frame(ind = rep(indNames(data), each=ploid))


  # ---------------- #
  #
  # Population IDs
  #
  # ---------------- #

  if(pops){

    ## Fail early with a clear message instead of an obscure
    ## "replacement has 0 rows" error further down
    if(is.null(data$pop)){
      stop("pops=TRUE but the genind object has no population information; ",
           "use pops=FALSE or assign populations first.", call.=FALSE)
    }

    ## Numeric population codes, repeated like the individual labels
    df$Pop = rep(as.numeric(data$pop), each=ploid)
  }


  # ---------------- #
  #
  # Process genotypes
  #
  # ---------------- #

  ## Genotype matrix, pre-filled with the missing data code
  geno = matrix(-9, # -9 codes for missing data
                nrow=nrow(df),
                ncol=nLoc(data),
                dimnames=list(NULL, loci))

  tab.cols = colnames(data$tab)

  ## Loop through dataset to extract genotypes
  for(L in loci){

    ## Allele count columns of this locus are named "<locus>.<allele>".
    ## A literal prefix match avoids treating the locus name as a regex and
    ## drop=FALSE keeps a matrix even when the locus has a single allele.
    thesedata = data$tab[, startsWith(tab.cols, paste0(L, ".")), drop=FALSE]
    al = seq_len(ncol(thesedata)) # numbered alleles

    for(s in seq_len(ind)){
      counts = thesedata[s, ]

      ## Individuals with any NA at this locus keep the -9 code
      if(!anyNA(counts) && sum(counts) > 0){
        ncopies = sum(counts)
        if(ncopies > ploid){
          stop("Individual ", s, " has more allele copies at locus '", L,
               "' than the maximum ploidy.", call.=FALSE)
        }

        ## Rows of individual s are contiguous because labels were built
        ## with rep(..., each=ploid); only the first ncopies rows are
        ## filled when this individual has a lower ploidy than the maximum
        tabrows = (s - 1) * ploid + seq_len(ncopies)
        geno[tabrows, L] = rep(al, times=counts)
      }
    }
  }

  ## Add columns for genotypes
  df = cbind(df, geno)


  # ---------------- #
  #
  # Marker IDs
  #
  # ---------------- #

  if(markers){

    ## Add a row at the top containing loci names.
    ## Locus names come first and the row is padded with empty strings up
    ## to the number of columns actually present (one label column, plus
    ## the population column only when pops=TRUE).
    header = c(loci, rep("", ncol(df) - length(loci)))
    df = as.data.frame(insertRow(as.matrix(df), 1, header))

  }


  # ---------------- #
  #
  # Export file
  #
  # ---------------- #

  if(unix && is.character(file) && nzchar(file)){

    ## Export dataframe as unix text file: a binary connection prevents
    ## translation of "\n" into the platform line ending
    output_file = file(file, open="wb")
    on.exit(close(output_file), add=TRUE)
    write.table(df, file=output_file, sep="\t", quote=FALSE,
                row.names=FALSE, col.names=FALSE)

  } else {

    ## Export dataframe (or print to the console when file="")
    write.table(df, file=file, sep="\t", quote=FALSE,
                row.names=FALSE, col.names=FALSE)
  }

  invisible(NULL)
}


--------------------------------------------------------------------------------
/coordinates.csv:
--------------------------------------------------------------------------------
Code,Lat,Lon
Ale,40.84,25.87
Ber,60.65,4.77
Brd,54.08,-0.17
Cor,51.84,-8.26
Cro,52.94,1.31
Eye,55.88,-2.07
Flo,58.42,8.76
Gul,58.25,11.33
Heb,57.79,-7.25
Hel,54.18,7.9
Hoo,52.12,-6.92
Idr16,46.13,-1.250001
Idr17,46.13,-1.250001
Iom,54.12,-4.5
Ios,49.92,-6.33
Jer,49.16,-2.12
Kav,58.33,11.37
Kil,53.28,-9.77
Laz,41.44,12.62
Loo,50.35,-4.44
Lyn,52.93,-4.62
Lys,58.26,11.37
Mul,54.19,-10.15
Oos,51.61,3.7
Ork,59,-2.83
Pad,50.56,-4.98
Pem,51.81,-5.29
Sar13,41.26,9.2
Sar17,41.26,9.2
Sbs,50.82,-0.26
She,60.17,-1.4
Sin,59.08,11.12
Sky,38.82,24.53
Sul,59.09,-6.16
Tar,42.23,11.68
The,40.36,22.88
Tor,40.17,23.54
Tro,63.76,9.15
Ven,52.12,-10.35
Vig,42.49,-8.99

--------------------------------------------------------------------------------
/lobster_1278ind_79snps_40pop.RData:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Tom-Jenkins/admixture_pie_chart_map_tutorial/b4c44c1bd3dbf546b425b90b603a5d795b3f5a9e/lobster_1278ind_79snps_40pop.RData
--------------------------------------------------------------------------------
/pie_chart_admixture_map_tutorial.R:
--------------------------------------------------------------------------------
# --------------------------- #
#
# Tutorial:
# Admixture > Pie charts > Map
#
# Description:
# Tutorial on how to run STRUCTURE-like analyses in R, plot admixture proportions
# using pie charts, and visualise the pie charts on a map.
#
# Data:
# European lobster SNP genotypes generated by a Fluidigm EP1 system.
# Tutorial uses ten sites from the original study (Jenkins et al. 2019).
# Data can be downloaded from the link below:
# https://doi.org/10.5061/dryad.2v1kr38
# "lobster_1278ind_79snps_40pop.RData" in Miscellaneous_files.zip
#
# Notes before execution:
# 1. Make sure all required R packages are installed.
# 2. Set your working directory to the location of this R script.
#
# --------------------------- #

# Load packages
library(adegenet)
library(poppr)
library(LEA)
library(reshape2)
library(dplyr)
library(ggplot2)
library(rworldmap)
library(rworldxtra)
library(ggsn)
library(sf)
library(raster)
library(rgeos)
library(maps)
library(maptools)
library(grid)
library(miscTools)
library(stringr)
library(ggpubr)

# Import genotypes
load("lobster_1278ind_79snps_40pop.RData")

# Explore data
data_filt
nLoc(data_filt) # number of loci
nPop(data_filt) # number of sites
nInd(data_filt) # number of individuals
summary(data_filt$pop) # sample size

# Subset data to reduce computation time
subsites = sort(c("Vig","Ios","Mul","Cro","She","Idr17","Hel","Flo","Lys","Ber"))
data_filt = popsub(data_filt, sublist = subsites)
data_filt


# ----------------- #
#
# Admixture analysis using SNMF from LEA
#
# ----------------- #

# Export genotypes in STRUCTURE format
# (no population column and no marker row, so the file has exactly one
# extra column: the individual label)
source("TJ_genind2structure_function.R")
genind2structure(data_filt, file = "genotypes", pops = FALSE, markers = FALSE)

# Convert STRUCTURE file to .geno format
struct2geno("genotypes", ploidy = 2, FORMAT = 2, extra.column = 1)

# Run snmf algorithm
set.seed(123)
snmf1 = snmf("genotypes.geno",
             K = 1:10, # number of K ancestral populations to run
             repetitions = 10, # ten repetitions for each K
             entropy = TRUE, # calculate cross-entropy
             project = "new")

# Load snmf project
snmf1 = load.snmfProject("genotypes.snmfProject")

# Plot cross-entropy results to assess optimal number of K
# Smaller values of cross-entropy usually mean better runs
# A plateau usually represents the K that best fits the data
plot(snmf1, col = "blue", cex = 1.5, pch = 19)

# Extract the cross-entropy of all runs where K = 2
ce = cross.entropy(snmf1, K = 2)
ce

# Find the run with the lowest cross-entropy
lowest.ce = which.min(ce)
lowest.ce

# Extract Q-matrix for the best run
qmatrix = as.data.frame(Q(snmf1, K = 2, run = lowest.ce))
head(qmatrix)

# Label column names of qmatrix ("Cluster 1", "Cluster 2", ...)
ncol(qmatrix)
cluster_names = paste("Cluster", seq_len(ncol(qmatrix)))
cluster_names
colnames(qmatrix) = cluster_names
head(qmatrix)

# Add individual IDs
qmatrix$Ind = indNames(data_filt)

# Add site IDs
qmatrix$Site = data_filt$pop
head(qmatrix)

# Convert dataframe to long format
qlong = melt(qmatrix, id.vars=c("Ind","Site"))
head(qlong)

# Change order of sites by using the factor function
# site.order = c("Vig","Ios","Cor","Mul","She","Cro","Hel","Flo","Lys","Ber")
# qlong$Site_ord = factor(qlong$Site, levels = site.order)

# Adjust facet labels
# Labels are looked up by site code, so the result does not depend on the
# order in which the factor levels happen to be stored
levels(qlong$Site)
facet.labs = c(Ber = "Bergen", Cro = "Cromer", Flo = "Flodevigen",
               Hel = "Helgoland", Idr17 = "Île de Ré", Ios = "Isles of Scilly",
               Lys = "Lysekil", Mul = "Mullet Peninsula", She = "Shetland",
               Vig = "Vigo")
stopifnot(all(levels(qlong$Site) %in% names(facet.labs)))
levels(qlong$Site) = unname(facet.labs[levels(qlong$Site)])
levels(qlong$Site)

# Define colour palette
pal = colorRampPalette(c("green","blue"))
cols = pal(length(unique(qlong$variable)))

# Plot admixture barplot
admix.bar = ggplot(data=qlong, aes(x=Ind, y=value, fill=variable))+
  geom_bar(stat = "identity")+
  scale_y_continuous(expand = c(0,0))+
  facet_wrap(~Site, scales = "free", ncol = 2)+
  scale_fill_manual(values = cols)+
  ylab("Admixture proportion")+
  # xlab("Individual")+
  theme(axis.text.x = element_blank(),
        axis.ticks.x = element_blank(),
        axis.title.x = element_blank(),
        strip.text = element_text(colour="black", size=12),
        panel.grid = element_blank(),
        panel.background = element_blank(),
        legend.position = "top",
        legend.title = element_blank(),
        legend.text = element_text(size = 12))
admix.bar
# Name the plot explicitly rather than relying on the last plot created
ggsave("1.admixture_barplot.png", plot = admix.bar, width=6, height=10, dpi=300)


# ----------------- #
#
# Prepare pie charts
#
# ----------------- #

# Calculate mean admixture proportions for each site
head(qmatrix)
clusters = grep("Cluster", names(qmatrix)) # indexes of cluster columns
avg_admix = aggregate(qmatrix[, clusters], list(qmatrix$Site), mean)

# Order alphabetically by site
avg_admix = avg_admix[order(as.character(avg_admix$Group.1)), ]
avg_admix

# Convert dataframe from wide to long format
# (one row per site and cluster from here on)
avg_admix = melt(avg_admix, id.vars = "Group.1")
head(avg_admix)

# Define a function to plot pie charts using ggplot for each site
pie_charts = function(admix_df, site, cols){
  # admix_df = dataframe in long format of admixture proportions per site
  # site = string
  # cols = vector of colours of length(clusters)
  ggplot(data = subset(admix_df, Group.1 == site),
         aes(x = "", y = value, fill = variable))+
    geom_bar(width = 1, stat = "identity", colour = "black", show.legend = FALSE)+
    coord_polar(theta = "y")+
    scale_fill_manual(values = cols)+
    theme_void()
}

# Test function on one site
pie_charts(avg_admix, site = "Ber", cols = cols)

# Apply function to all sites; the list is named by site code
pies = lapply(subsites, function(s){
  pie_charts(admix_df = avg_admix, site = s, cols = cols)
})
names(pies) = subsites


# ----------------- #
#
# Prepare basemap
#
# ----------------- #

# Import csv file containing coordinates
coords = read.csv("coordinates.csv")

# Subset coordinates
coords = coords[coords$Code %in% subsites, ]

# Order alphabetically by site
coords = coords[order(coords$Code), ]
coords

# Check order matches coords order
# avg_admix is in long format (sites repeat once per cluster), so compare
# its distinct sites with the one-row-per-site coordinates and stop on a
# mismatch instead of printing a recycled comparison
stopifnot(identical(unique(as.character(avg_admix$Group.1)),
                    as.character(coords$Code)))

# Set map boundary (xmin, xmax, ymin, ymax)
boundary = extent(-15, 15, 40, 64)
boundary

# Get map outlines from rworldmap package
map.outline = getMap(resolution = "high")

# Crop to boundary and convert to dataframe
map.outline = crop(map.outline, y = boundary) %>% fortify()

# Plot basemap
basemap = ggplot()+
  geom_polygon(data=map.outline, aes(x=long, y=lat, group=group), fill="grey",
               colour="black", size=0.5)+
  coord_quickmap(expand=FALSE)+
  ggsn::north(map.outline, symbol = 10, scale = 0.06, location = "topleft")+
  ggsn::scalebar(data = map.outline, dist = 200, dist_unit = "km", height = 0.01,
                 transform = TRUE, model = "WGS84",
                 location = "bottomleft", anchor = c(x = -12.5, y = 45),
                 st.bottom = FALSE, st.size = 4, st.dist = 0.015)+
  xlab("Longitude")+
  ylab("Latitude")+
  theme(
    axis.text = element_text(colour="black", size=12),
    axis.title = element_text(colour="black", size=14),
    panel.background = element_rect(fill="lightsteelblue2"),
    panel.border = element_rect(fill = NA, colour = "black", size = 0.5),
    panel.grid.minor = element_line(colour="grey90", size=0.5),
    panel.grid.major = element_line(colour="grey90", size=0.5),
    legend.text = element_text(size=12),
    legend.title = element_blank(),
    legend.key.size = unit(0.7, "cm"),
    legend.position = "top"
  )
basemap


# ----------------- #
#
# Add pie charts to basemap
#
# ----------------- #

# Extract coordinates (longitude, latitude) for each site, named by site code
coord.list = lapply(subsites, function(s){
  site.row = coords[coords$Code == s, ]
  c(site.row$Lon, site.row$Lat)
})
names(coord.list) = subsites
coord.list

# Define pie chart sizes (half-width of each pie in map units)
radius = 1

# Convert ggplot pie charts to annotation_custom layers
# Pies and coordinates are both looked up by site code, so a pie can never
# be paired with another site's coordinates
pies.ac = lapply(subsites, function(s){
  lon = coord.list[[s]][[1]]
  lat = coord.list[[s]][[2]]
  annotation_custom(grob = ggplotGrob(pies[[s]]),
                    xmin = lon - radius,
                    xmax = lon + radius,
                    ymin = lat - radius,
                    ymax = lat + radius)
})

# Add layers to basemap
pie.map = basemap + pies.ac
pie.map
ggsave("2.pie_charts_map.png", plot = pie.map, width = 8, height = 10, dpi = 300)

# Combine ggplots
combined = ggarrange(admix.bar + theme(legend.position = "right") + labs(title = "Individual admixture proportions", tag = "A"),
                     pie.map + labs(title = "Mean admixture proportions for each site", tag = "B"))
combined
ggsave("3.Admixture_bar_map.png", plot = combined, width = 15, height = 10, dpi = 300)

--------------------------------------------------------------------------------