├── figures ├── spacetime_schema.png └── spacetime_schema.svg ├── .gitignore ├── README.md ├── generators.Rproj ├── pruning_ranges.txt ├── LICENSE ├── technical_helpers.R ├── runAdmixture.sh ├── spacetime_test.R └── admixpops_test.R /figures/spacetime_schema.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nevrome/paagen/master/figures/spacetime_schema.png -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | docs/_build/ 2 | .stack-work/ 3 | .ipynb_checkpoints/ 4 | spacetime_test_data/ 5 | admixpops_test_data/ 6 | .Rhistory 7 | .Rproj.user 8 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | This repository only stores some code to experiment with the genotype data generators in [xerxes](https://github.com/poseidon-framework/poseidon-analysis-hs). The generators were once implemented here, but later moved. 
2 | -------------------------------------------------------------------------------- /generators.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | ProjectId: c51eff51-55d6-4637-a8e3-74ba3dc7da1a 3 | 4 | RestoreWorkspace: Default 5 | SaveWorkspace: Default 6 | AlwaysSaveHistory: Default 7 | 8 | EnableCodeIndexing: Yes 9 | UseSpacesForTab: Yes 10 | NumSpacesForTab: 2 11 | Encoding: UTF-8 12 | 13 | RnwWeave: Sweave 14 | LaTeX: pdfLaTeX 15 | -------------------------------------------------------------------------------- /pruning_ranges.txt: -------------------------------------------------------------------------------- 1 | 6 20400000 41500000 Region1 2 | 8 7100000 12800000 Region2 3 | 11 48000000 56200000 Region3 4 | 1 48000000 52000000 Region4 5 | 2 86000000 100500000 Region5 6 | 2 134500000 138000000 Region6 7 | 2 183000000 190000000 Region7 8 | 3 47500000 50000000 Region8 9 | 3 83500000 87000000 Region9 10 | 3 89000000 97500000 Region10 11 | 5 44500000 50500000 Region11 12 | 5 98000000 100500000 Region12 13 | 5 129000000 132000000 Region13 14 | 5 135500000 138500000 Region14 15 | 6 57000000 64000000 Region15 16 | 6 140000000 142500000 Region16 17 | 7 55000000 66000000 Region17 18 | 8 8000000 12000000 Region18 19 | 8 43000000 50000000 Region19 20 | 10 37000000 43000000 Region20 21 | 11 87500000 90500000 Region21 22 | 12 33000000 40000000 Region22 23 | 12 109500000 112000000 Region23 24 | 20 32000000 34500000 Region24 25 | 8 112000000 115000000 Region25 26 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Clemens Schmid 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without 
limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /technical_helpers.R: -------------------------------------------------------------------------------- 1 | # read an .mds table (fixed width, first row skipped) with columns FID, IID, SOL, C1, C2 2 | read_mds <- function(x) { 3 | readr::read_fwf( 4 | file = x, 5 | col_positions = readr::fwf_empty( 6 | x, 7 | skip = 1, 8 | col_names = c("FID", "IID", "SOL", "C1", "C2"), 9 | n = 1000 10 | ), 11 | trim_ws = TRUE, 12 | col_types = "ccddd_", 13 | skip = 1 14 | ) 15 | } 16 | 17 | # run a shell command in an RStudio terminal, block until it exits, return the terminal buffer 18 | s <- function(x) { 19 | termId <- rstudioapi::terminalExecute(x) 20 | wait(termId) 21 | stdout_vec <- rstudioapi::terminalBuffer(termId) 22 | rstudioapi::terminalKill(termId) 23 | return(stdout_vec) 24 | } 25 | 26 | wait <- function(termId) { # poll once per second until the terminal reports an exit code 27 | while (is.null(rstudioapi::terminalExitCode(termId))) { 28 | Sys.sleep(1) 29 | } 30 | } 31 | 32 | # run a shell command via system(): e = TRUE appends "2>&1", o = FALSE sets ignore.stdout; captured lines are printed 33 | sb <- function(x, o = TRUE, e = TRUE) { 34 | redir <- if (e) { "2>&1" } else { "" } 35 | res <- system(paste(x, redir), intern = TRUE, ignore.stdout = !o) 36 | if (length(res) > 0) { cat(res, sep = '\n') } # was `> 1`, which silently dropped single-line output 37 | } 38 | 39 | # delete 
directory 40 | dd <- function(x) { unlink(x, recursive = T) } 41 | 42 | # delete directory and then create it again 43 | nd <- function(x) { 44 | unlink(x, recursive = T) 45 | dir.create(x) 46 | } 47 | -------------------------------------------------------------------------------- /runAdmixture.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #$ -S /bin/bash #defines bash as the shell for execution 4 | #$ -N admix #Name of the command that will be listed in the queue 5 | #$ -cwd #change to current directory 6 | #$ -j y #join error and standard output in one file, no error file will be written 7 | #$ -o ~/log #standard output file or directory (joined with error because of -j y) 8 | #$ -q archgen.q #queue 9 | #$ -pe smp 4 #needs X CPU cores 10 | #$ -l h_vmem=10G #request XGb of memory 11 | #$ -V # load personal profile 12 | #$ -t 1-5 # array job length (here: amount of Ks * amount of reps) 13 | #$ -tc 20 # number of concurrently running tasks in array 14 | 15 | ## Set ADMIXTURE run parameters. Output folder, input .bed file, minimum and maximum K values, and Number of CPUs per admixture run. 16 | fn0="/mnt/archgen/users/schmid/paagen/admixpops_test_data/admixture_test" ## The intended result directory. It will be created, with one subfolder per K (which itself contains one folder per replicate and one 'Logs' folder with the ADMIXTURE logfiles). 17 | bedFile="/mnt/archgen/users/schmid/paagen/admixpops_test_data/mbutihanfrench_merged/mbutihanfrench_merged.bed" ## The input .bed file to run ADMIXTURE on. 18 | Kmin='3' ## The minimum K value you want to run 19 | Kmax='3' ## The maximum K value you want to run 20 | Reps='5' ## The number of replicates to run for each K value. We normally use 5 replicates per K. 21 | NumCPUs='4' ## The number of CPUs you wish to use. Make sure this number matches the '-pe smp' slot request in the SGE header above. 
22 | 23 | date 24 | 25 | ## Create the bash arrays which we will iterate through with each array job. 26 | AllKs=($(seq ${Kmin} ${Kmax})) 27 | AllReps=($(seq 1 ${Reps})) 28 | 29 | ## Use the SGE_TASK_ID to iterate over the Ks and Reps in the correct manner (only iterate over AllKs once for every full iteration over AllReps) 30 | i=$((SGE_TASK_ID - 1)) 31 | CurrentK=${AllKs[`expr ${i} / ${#AllReps[@]}`]} 32 | CurrentRep=${AllReps[`expr ${i} % ${#AllReps[@]}`]} 33 | 34 | ## Make necessary output directories if needed. 35 | mkdir -p ${fn0}/ 36 | cd ${fn0} 37 | mkdir -p ${CurrentK}/Logs 38 | mkdir -p ${CurrentK}/${CurrentRep} 39 | 40 | ## Finally, run ADMIXTURE. 41 | cd ${CurrentK}/${CurrentRep} 42 | admixture --supervised -j${NumCPUs} -s ${RANDOM} --cv ${bedFile} ${CurrentK} 1>${fn0}/${CurrentK}/Logs/K${CurrentK}_${CurrentRep}.log 43 | 44 | date 45 | 46 | exit 0 47 | -------------------------------------------------------------------------------- /spacetime_test.R: -------------------------------------------------------------------------------- 1 | source("technical_helpers.R") 2 | 3 | library(magrittr) 4 | 5 | nd("spacetime_test_data") 6 | s('trident fetch -d spacetime_test_data -f "*2012_PattersonGenetics*"') 7 | 8 | janno_raw <- poseidonR::read_janno("spacetime_test_data/2012_PattersonGenetics") 9 | 10 | # filtering to samples with spatial coordinates (non-missing Latitude and Longitude) 11 | janno_filtered <- janno_raw %>% dplyr::filter( 12 | !is.na(Latitude) & !is.na(Longitude) 13 | ) 14 | 15 | # Nr_autosomal_SNPs: should be >= 20000 SNPs 16 | janno_QC <- janno_filtered %>% dplyr::filter( 17 | Nr_autosomal_SNPs >= 20000 18 | ) 19 | # Xcontam: if male, then should not be higher than 10% 20 | janno_QC <- janno_QC %>% dplyr::filter( 21 | is.na(Xcontam) | Genetic_Sex == "F" | (Genetic_Sex == "M" & Xcontam < 0.1) 22 | ) 23 | # Genetic_Sex: Individuals with unknown genetic sex should be removed 24 | janno_QC <- janno_QC %>% dplyr::filter(Genetic_Sex != "U") 25 | # Indicated as contaminated: Individuals which are 
indicated as potentially contaminated 26 | # in their ID should be removed 27 | janno_QC <- janno_QC %>% dplyr::filter( 28 | !grepl("cont|excluded|Ignore", x = Individual_ID, ignore.case = T) & 29 | !grepl("cont|excluded|Ignore", x = Group_Name, ignore.case = T) 30 | ) 31 | 32 | janno_final <- janno_QC 33 | 34 | save(janno_final, file = "spacetime_test_data/janno_final.RData") 35 | 36 | load("spacetime_test_data/janno_final.RData") 37 | 38 | # store ind list for poseidon extraction 39 | tibble::tibble( 40 | #pop = sapply(janno_filtered_final$Group_Name, function(x) { x[[1]] }), 41 | ind = paste0("<", sort(janno_final$Individual_ID), ">") 42 | ) %>% 43 | readr::write_delim( 44 | file = "spacetime_test_data/ind_list.txt", 45 | delim = " ", 46 | col_names = FALSE 47 | ) 48 | 49 | dd("spacetime_test_data/pat") 50 | s('trident forge --forgeFile spacetime_test_data/ind_list.txt -d spacetime_test_data/2012_PattersonGenetics -n pat -o spacetime_test_data/pat') 51 | 52 | #manual_pois <- tibble::tribble( 53 | # ~time, ~lat, ~lon, 54 | # 2000, 46, 3, 55 | # 2000, 46, 3, 56 | # 2000, 46, 3, 57 | # 2000, 55, 37 58 | #) 59 | 60 | world <- spData::world 61 | 62 | poi_grid <- world %>% 63 | dplyr::filter(continent != "Antarctica") %>% 64 | sf::st_make_grid(cellsize = 20, what = "centers") %>% 65 | sf::st_sf() %>% 66 | sf::st_intersection(world) %>% 67 | dplyr::mutate( 68 | lon = sf::st_coordinates(.)[,1], 69 | lat = sf::st_coordinates(.)[,2] 70 | ) %>% 71 | sf::st_drop_geometry() %>% 72 | dplyr::transmute( 73 | ind = paste0("poi", 1:(dplyr::n())), 74 | group = gsub(" ", "_", continent), 75 | time = 2000, 76 | lon = round(lon, 3), 77 | lat = round(lat, 3) 78 | ) 79 | 80 | poi_string <- purrr::pmap(poi_grid, function(ind, group, time, lat, lon) { 81 | paste0("[", ind, ":", group, "](", paste(time, lat, lon, sep = ","), ")") 82 | }) %>% paste(collapse = ";") 83 | 84 | poi_string 85 | 86 | library(ggplot2) 87 | 88 | janno_grouped <- janno_final %>% 89 | dplyr::group_by(Group_Name, 
Latitude, Longitude) %>% 90 | dplyr::summarise(n = dplyr::n(), .groups = "drop") 91 | 92 | ggplot() + 93 | geom_sf(data = world) + 94 | geom_point(data = janno_grouped, aes(x = Longitude, y = Latitude, size = n)) + 95 | ggrepel::geom_label_repel( 96 | data = janno_grouped, 97 | aes(x = Longitude, y = Latitude, label = Group_Name), 98 | color = "grey", size = 3, max.overlaps = 100 99 | ) + 100 | geom_text(data = poi_grid, aes(x = lon, y = lat, color = group, label = ind)) 101 | 102 | dd("spacetime_test_data/poi") 103 | s(paste0('paagen spacetime -d spacetime_test_data/pat -p "', poi_string, '" --neighbors 100 -o spacetime_test_data/poi --outFormat EIGENSTRAT')) 104 | 105 | dd("spacetime_test_data/merged") 106 | s('trident forge -d spacetime_test_data/pat -d spacetime_test_data/poi -f "*pat*,*spacetime_package*" -o spacetime_test_data/merged -n merged') 107 | 108 | # pruning 109 | # nd("spacetime_test_data/merged_pruned") 110 | # s('plink1.9 --bfile spacetime_test_data/merged/merged --exclude spacetime_test_data/myrange.txt --range --maf --make-bed --out spacetime_test_data/merged_pruned/merged.pruned') 111 | 112 | # generate general pairwise stats 113 | nd("spacetime_test_data/merge_distances") 114 | s('plink1.9 --bfile spacetime_test_data/merged/merged --genome --out spacetime_test_data/merge_distances/merged') 115 | 116 | # create mds table 117 | nd("spacetime_test_data/mds") 118 | s('plink1.9 --bfile spacetime_test_data/merged/merged --cluster --mds-plot 2 --read-genome spacetime_test_data/merge_distances/merged.genome --out spacetime_test_data/mds/mds') 119 | 120 | mds_raw <- readr::read_delim( 121 | "spacetime_test_data/mds/mds.mds", " ", trim_ws = T, 122 | col_types = "ccddd_" 123 | ) 124 | 125 | load("spacetime_test_data/janno_final.RData") 126 | 127 | input_spatpos <- janno_final %>% dplyr::transmute( 128 | ind = Individual_ID, 129 | group = sapply(Group_Name, function(x){x[1]}), 130 | time = 2000, 131 | lon = Longitude, 132 | lat = Latitude 133 | ) %>% 134 
| dplyr::left_join( 135 | mds_raw, by = c("ind" = "IID") 136 | ) 137 | 138 | input_spatpos_grouped <- input_spatpos %>% 139 | dplyr::group_by(FID) %>% 140 | dplyr::summarise( 141 | C1 = mean(C1), 142 | C2 = mean(C2) 143 | ) 144 | 145 | input_grid <- poi_grid %>% 146 | dplyr::left_join( 147 | mds_raw, by = c("ind" = "IID") 148 | ) 149 | 150 | head(input_spatpos_grouped) 151 | 152 | head(input_grid) 153 | 154 | ggplot() + 155 | ggpointgrid::geom_textgrid(data = input_grid, aes(x = C1, y = C2, color = group, label = ind), size = 3) + 156 | ggpointgrid::geom_textgrid(data = input_spatpos_grouped, aes(x = C1, y = C2, label = FID)) 157 | -------------------------------------------------------------------------------- /admixpops_test.R: -------------------------------------------------------------------------------- 1 | source("technical_helpers.R") 2 | library(ggplot2) 3 | 4 | # prepare data 5 | #nd("admixpops_test_data") 6 | #nd("admixpops_test_data/plots") 7 | 8 | # download 9 | s('trident fetch -d admixpops_test_data -f "*2012_PattersonGenetics-2.1.3*"') 10 | 11 | # pruning 12 | nd("admixpops_test_data/plink_patterson_pruned") 13 | s("~/software/plink --bfile admixpops_test_data/2012_PattersonGenetics-2.1.3/2012_PattersonGenetics --exclude pruning_ranges.txt --range --maf --make-bed --out admixpops_test_data/plink_patterson_pruned/pruned") 14 | 15 | s("trident init --snpSet Other -p admixpops_test_data/plink_patterson_pruned/pruned.bed -o admixpops_test_data/2012_PattersonGenetics_pruned -n 2012_PattersonGenetics_pruned") 16 | 17 | #### one large population test #### 18 | 19 | # run admixpops 20 | s('xerxes admixpops -d admixpops_test_data/2012_PattersonGenetics_pruned -a "[1:HanDom](Han=100);[2:HanDom](Han=100);[3:HanDom](Han=100);[4:HanDom](Han=100);[5:HanDom](Han=100)" -o admixpops_test_data/han') 21 | # here: tangent to test vcf and zipped output 22 | s('xerxes admixpops -d admixpops_test_data/2012_PattersonGenetics_pruned -a 
"[1c:HanDomChunk](Han=100);[2c:HanDomChunk](Han=100);[3c:HanDomChunk](Han=100);[4c:HanDomChunk](Han=100);[5c:HanDomChunk](Han=100)" -o admixpops_test_data/han_chunks5000 --inChunks --outFormat VCF --zip') 23 | 24 | # create data subset 25 | s('trident forge -d admixpops_test_data/2012_PattersonGenetics_pruned -d admixpops_test_data/han -d admixpops_test_data/han_chunks5000 -f "HanDom,Han,HanDomChunk" -n han_merged -o admixpops_test_data/han_merged') 26 | 27 | # mds 28 | nd("admixpops_test_data/han_mds") 29 | s('~/software/plink --bfile admixpops_test_data/han_merged/han_merged --genome --out admixpops_test_data/han_mds/pairwise_stats') 30 | s('~/software/plink --bfile admixpops_test_data/han_merged/han_merged --cluster --mds-plot 2 --read-genome admixpops_test_data/han_mds/pairwise_stats.genome --out admixpops_test_data/han_mds/mds') 31 | 32 | # plot 33 | mds_raw <- read_mds("admixpops_test_data/han_mds/mds.mds") 34 | 35 | p <- mds_raw |> 36 | ggplot() + 37 | geom_point(aes(x = C1, y = C2, colour = FID)) 38 | 39 | ggsave( 40 | "admixpops_test_data/plots/han_mds.jpeg", 41 | plot = p, 42 | device = "jpeg", 43 | width = 10, 44 | height = 6, 45 | scale = 0.8 46 | ) 47 | 48 | #### one small population test #### 49 | 50 | s('xerxes admixpops -d admixpops_test_data/2012_PattersonGenetics_pruned -a "[1:BantuSADom](BantuSA=100);[2:BantuSADom](BantuSA=100);[3:BantuSADom](BantuSA=100);[4:BantuSADom](BantuSA=100);[5:BantuSADom](BantuSA=100)" -o admixpops_test_data/BantuSA') 51 | s('xerxes admixpops -d admixpops_test_data/2012_PattersonGenetics_pruned -a "[1c:BantuSADomChunk](BantuSA=100);[2c:BantuSADomChunk](BantuSA=100);[3c:BantuSADomChunk](BantuSA=100);[4c:BantuSADomChunk](BantuSA=100);[5c:BantuSADomChunk](BantuSA=100)" -o admixpops_test_data/BantuSA_chunks5000 --inChunks') 52 | 53 | # create data subset 54 | s('trident forge -d admixpops_test_data/2012_PattersonGenetics_pruned -d admixpops_test_data/BantuSA -d admixpops_test_data/BantuSA_chunks5000 -f 
"BantuSA,BantuSADom,BantuSADomChunk" -n BantuSA_merged -o admixpops_test_data/BantuSA_merged') 55 | 56 | # mds 57 | nd("admixpops_test_data/BantuSA_mds") 58 | s('~/software/plink --bfile admixpops_test_data/BantuSA_merged/BantuSA_merged --genome --out admixpops_test_data/BantuSA_mds/pairwise_stats') 59 | s('~/software/plink --bfile admixpops_test_data/BantuSA_merged/BantuSA_merged --cluster --mds-plot 2 --read-genome admixpops_test_data/BantuSA_mds/pairwise_stats.genome --out admixpops_test_data/BantuSA_mds/mds') 60 | 61 | # plot 62 | mds_raw <- read_mds("admixpops_test_data/BantuSA_mds/mds.mds") 63 | 64 | p <- mds_raw |> 65 | ggplot() + 66 | geom_point(aes(x = C1, y = C2, colour = FID)) 67 | 68 | ggsave( 69 | "admixpops_test_data/plots/BantuSA_mds.jpeg", 70 | plot = p, 71 | device = "jpeg", 72 | width = 10, 73 | height = 6, 74 | scale = 0.8 75 | ) 76 | 77 | #### two populations test #### 78 | 79 | combinations <- partitions::compositions(n = 10, m = 2, include.zero = T) |> 80 | {\(x) x*10}() |> 81 | as.matrix() |> 82 | t() |> 83 | as.data.frame() |> 84 | dplyr::mutate( 85 | id = paste(V1, V2, sep = "|"), 86 | unit = dplyr::case_when( 87 | V1 > V2 ~ "HanDom", 88 | V2 > V1 ~ "FrenchDom", 89 | TRUE ~ "Center" 90 | ) 91 | ) 92 | 93 | combinations_chunks <- combinations |> {\(x) { 94 | x |> 95 | dplyr::mutate(unit = paste0(unit, "Chunk")) 96 | }}() 97 | 98 | ind_admixpops <- combinations |> {\(x) { 99 | purrr::pmap_chr( 100 | list(x$id, x$unit, x$V1, x$V2), 101 | \(a,b,c,d) { 102 | paste0("[",a,":",b,"]","(Han=",c,"+French=",d,")") 103 | } 104 | ) 105 | }}() |> 106 | {\(x) paste(x, collapse = ";")}() 107 | 108 | ind_admixpops_chunks <- combinations_chunks |> {\(x) { 109 | purrr::pmap_chr( 110 | list(x$id, x$unit, x$V1, x$V2), 111 | \(a,b,c,d) { 112 | paste0("[",a,"c:",b,"]","(Han=",c,"+French=",d,")") 113 | } 114 | ) 115 | }}() |> 116 | {\(x) paste(x, collapse = ";")}() 117 | 118 | # run admixpops 119 | s(paste0('xerxes admixpops -d 
admixpops_test_data/2012_PattersonGenetics_pruned -a \"', ind_admixpops, '\" -o admixpops_test_data/hanfrench')) 120 | s(paste0('xerxes admixpops -d admixpops_test_data/2012_PattersonGenetics_pruned -a \"', ind_admixpops_chunks, '\" -o admixpops_test_data/hanfrench_chunks5000 --inChunks')) 121 | 122 | # create data subset 123 | s('trident forge -d admixpops_test_data/2012_PattersonGenetics_pruned -d admixpops_test_data/hanfrench -d admixpops_test_data/hanfrench_chunks5000 -f "Han,French,HanDom,FrenchDom,Center,HanDomChunk,FrenchDomChunk,CenterChunk" -n hanfrench_merged -o admixpops_test_data/hanfrench_merged') 124 | 125 | # mds 126 | nd("admixpops_test_data/mds") 127 | s('~/software/plink --bfile admixpops_test_data/hanfrench_merged/hanfrench_merged --genome --out admixpops_test_data/mds/pairwise_stats') 128 | s('~/software/plink --bfile admixpops_test_data/hanfrench_merged/hanfrench_merged --cluster --mds-plot 2 --read-genome admixpops_test_data/mds/pairwise_stats.genome --out admixpops_test_data/mds/mds') 129 | 130 | # plot 131 | mds_raw <- read_mds("admixpops_test_data/mds/mds.mds") 132 | 133 | p <- mds_raw |> 134 | dplyr::mutate( 135 | label = ifelse(grepl("\\|", IID), IID, NA) 136 | ) |> 137 | ggplot(aes(x = C1, y = C2, colour = FID, label = label)) + 138 | geom_point() + 139 | ggrepel::geom_label_repel() 140 | 141 | ggsave( 142 | "admixpops_test_data/plots/mds_mds.jpeg", 143 | plot = p, 144 | device = "jpeg", 145 | width = 10, 146 | height = 6, 147 | scale = 0.8 148 | ) 149 | 150 | #### three populations test #### 151 | 152 | ind_admixpops2_table <- partitions::compositions(n = 10, m = 3, include.zero = T) |> 153 | {\(x) x*10}() |> 154 | as.matrix() |> 155 | t() |> 156 | as.data.frame() |> 157 | stats::setNames(c("Mbuti", "Han", "French")) |> 158 | dplyr::mutate( 159 | id = 1:dplyr::n(), 160 | unit = dplyr::case_when( 161 | Mbuti > Han & Mbuti > French ~ "MbutiDom", 162 | Han > Mbuti & Han > French ~ "HanDom", 163 | French > Mbuti & French > Han ~ 
"FrenchDom", 164 | TRUE ~ "Center" 165 | ) 166 | ) 167 | 168 | ind_admixpops2_table_chunks <- ind_admixpops2_table |> {\(x) { 169 | x |> 170 | dplyr::mutate(unit = paste0(unit, "Chunk")) 171 | }}() 172 | 173 | ind_admixpops2 <- ind_admixpops2_table |> {\(x) { 174 | purrr::pmap_chr( 175 | list(x$id, x$unit, x$Mbuti, x$Han, x$French), 176 | \(a,b,c,d,e) { 177 | paste0("[",a,":",b,"]","(Mbuti=",c,"+Han=",d,"+French=",e,")") 178 | } 179 | ) 180 | }}() |> 181 | {\(x) paste(x, collapse = ";")}() 182 | 183 | ind_admixpops2_chunks <- ind_admixpops2_table_chunks |> {\(x) { 184 | purrr::pmap_chr( 185 | list(x$id, x$unit, x$Mbuti, x$Han, x$French), 186 | \(a,b,c,d,e) { 187 | paste0("[",a,"c:",b,"]","(Mbuti=",c,"+Han=",d,"+French=",e,")") 188 | } 189 | ) 190 | }}() |> 191 | {\(x) paste(x, collapse = ";")}() 192 | 193 | # run admixpops 194 | s(paste0('xerxes admixpops -d admixpops_test_data/2012_PattersonGenetics_pruned -a \"', ind_admixpops2, '\" -o admixpops_test_data/mbutihanfrench')) 195 | s(paste0('xerxes admixpops -d admixpops_test_data/2012_PattersonGenetics_pruned -a \"', ind_admixpops2_chunks, '\" -o admixpops_test_data/mbutihanfrench_chunks5000 --inChunks')) 196 | 197 | # create data subset 198 | s('trident forge -d admixpops_test_data/2012_PattersonGenetics_pruned -d admixpops_test_data/mbutihanfrench -d admixpops_test_data/mbutihanfrench_chunks5000 -f "Mbuti,Han,French,MbutiDom,HanDom,FrenchDom,Center,MbutiDomChunk,HanDomChunk,FrenchDomChunk,CenterChunk" -n mbutihanfrench_merged -o admixpops_test_data/mbutihanfrench_merged') 199 | 200 | # mds 201 | nd("admixpops_test_data/mbutihanfrench_mds") 202 | s('~/software/plink --bfile admixpops_test_data/mbutihanfrench_merged/mbutihanfrench_merged --genome --out admixpops_test_data/mbutihanfrench_mds/pairwise_stats') 203 | s('~/software/plink --bfile admixpops_test_data/mbutihanfrench_merged/mbutihanfrench_merged --cluster --mds-plot 2 --read-genome admixpops_test_data/mbutihanfrench_mds/pairwise_stats.genome --out 
admixpops_test_data/mbutihanfrench_mds/mds') 204 | 205 | # plot 206 | mds_raw <- read_mds("admixpops_test_data/mbutihanfrench_mds/mds.mds") 207 | 208 | p <- mds_raw |> 209 | dplyr::mutate( 210 | method = dplyr::case_when( 211 | grepl("Chunk", FID) ~ "in Chunks", 212 | (!grepl("Dom", FID) & FID != "Center") ~ "input", 213 | TRUE ~ "per SNP" 214 | ) 215 | ) |> 216 | #dplyr::filter(method %in% c("per SNP", "input")) |> 217 | dplyr::filter(method %in% c("in Chunks", "input")) |> 218 | ggplot() + 219 | geom_point(aes(x = C1, y = C2, colour = FID)) 220 | 221 | ggsave( 222 | "admixpops_test_data/plots/mbutihanfrench_mds_chunks.jpeg", 223 | plot = p, 224 | device = "jpeg", 225 | width = 10, 226 | height = 6, 227 | scale = 0.8 228 | ) 229 | 230 | #### three pops per SNP: now with --marginalizeMissing #### 231 | 232 | # run admixpops 233 | s(paste0('xerxes admixpops -d admixpops_test_data/2012_PattersonGenetics_pruned -a \"', ind_admixpops2, '\" --marginalizeMissing -o admixpops_test_data/mbutihanfrench_mm')) 234 | 235 | # create data subset 236 | s('trident forge -d admixpops_test_data/2012_PattersonGenetics_pruned -d admixpops_test_data/mbutihanfrench_mm -f "MbutiDom,HanDom,FrenchDom,Center,Mbuti,Han,French" -n mbutihanfrench_mm_merged -o admixpops_test_data/mbutihanfrench_mm_merged') 237 | 238 | # mds 239 | nd("admixpops_test_data/mbutihanfrench_mm_mds") 240 | s('~/software/plink --bfile admixpops_test_data/mbutihanfrench_mm_merged/mbutihanfrench_mm_merged --genome --out admixpops_test_data/mbutihanfrench_mm_mds/pairwise_stats') 241 | s('~/software/plink --bfile admixpops_test_data/mbutihanfrench_mm_merged/mbutihanfrench_mm_merged --cluster --mds-plot 2 --read-genome admixpops_test_data/mbutihanfrench_mm_mds/pairwise_stats.genome --out admixpops_test_data/mbutihanfrench_mm_mds/mds') 242 | 243 | # plot 244 | mds_raw <- read_mds("admixpops_test_data/mbutihanfrench_mm_mds/mds.mds") 245 | 246 | p <- mds_raw |> 247 | ggplot() + 248 | geom_point(aes(x = C1, y = C2, colour = 
FID)) 249 | 250 | ggsave( 251 | "admixpops_test_data/plots/mbutihanfrench_mm_mds.jpeg", 252 | plot = p, 253 | device = "jpeg", 254 | width = 10, 255 | height = 6, 256 | scale = 0.8 257 | ) 258 | 259 | #### three pops: admixture analysis #### 260 | 261 | # create .pop file for supervised admixture 262 | fam <- readr::read_tsv( 263 | "admixpops_test_data/mbutihanfrench_merged/mbutihanfrench_merged.fam", 264 | col_names = FALSE 265 | ) 266 | 267 | writeLines( 268 | ifelse( 269 | fam$X1 %in% c("French", "Mbuti", "Han"), fam$X1, "-" 270 | ), 271 | "admixpops_test_data/mbutihanfrench_merged/mbutihanfrench_merged.pop" 272 | ) 273 | 274 | pw <- "..." 275 | u <- "..." 276 | h <- "daghead1.eva.mpg.de" 277 | 278 | # upload data to cluster 279 | eva.cluster::cluster_up( 280 | "~/agora/paagen/admixpops_test_data/mbutihanfrench_merged/" ~ 281 | "/mnt/archgen/users/schmid/paagen/admixpops_test_data/mbutihanfrench_merged/", 282 | user = u, host = h, pw = pw 283 | ) 284 | 285 | # run on cluster: qsub runAdmixture.sh 286 | 287 | eva.cluster::cluster_down( 288 | "/mnt/archgen/users/schmid/paagen/admixpops_test_data/admixture_test/" ~ 289 | "~/agora/paagen/admixpops_test_data/admixture_test/", 290 | user = u, host = h, pw = pw 291 | ) 292 | 293 | # read all results and bring them together (one table per .Q file, row-bound and then averaged per id) 294 | merged_admixture_results_wide <- list.files( 295 | "~/agora/paagen/admixpops_test_data/admixture_test/3", 296 | recursive = TRUE, 297 | pattern = "\\.Q$", 298 | full.names = TRUE 299 | ) |> 300 | (\(x) Map(\(y) { 301 | # NOTE(review): the row slice below keeps the last nrow(ind_admixpops2_table) rows of each .Q table - presumably the simulated individuals; verify against the .fam order 302 | raw_out <- readr::read_delim(y, col_names = FALSE, delim = " ") 303 | sorted_dims <- raw_out[raw_out |> colSums() |> sort() |> names()] |> stats::setNames(c("OMbuti", "OFrench", "OHan")) # columns ordered by ascending column sum, then labelled in that order 304 | sorted_dims |> 305 | (\(x) x[(nrow(x) - nrow(ind_admixpops2_table) + 1):nrow(x),])() |> 306 | dplyr::mutate(run = y) |> 307 | dplyr::bind_cols(ind_admixpops2_table) 308 | }, x))() |> 309 | dplyr::bind_rows() |> 310 | dplyr::group_by(id) |> 311 | 
dplyr::summarise( 312 | mean_OMbuti = mean(OMbuti), 313 | mean_OHan = mean(OHan), 314 | mean_OFrench = mean(OFrench), 315 | # sd is trivially small! 316 | IMbuti = dplyr::first(Mbuti), 317 | IHan = dplyr::first(Han), 318 | IFrench = dplyr::first(French), 319 | unit = dplyr::first(unit) 320 | ) 321 | 322 | # transform data to ggtern 323 | merged_admixture_results_long <- merged_admixture_results_wide |> 324 | tidyr::pivot_longer( 325 | tidyselect::starts_with(c("mean_", "I"), ignore.case = F) 326 | ) |> 327 | dplyr::mutate( 328 | type = dplyr::case_when( 329 | grepl("mean", name) ~ "out_paagen+admixture", 330 | TRUE ~ "in_theoretical" 331 | ), 332 | name = dplyr::case_when( 333 | grepl("Mbuti", name) ~ "Mbuti", 334 | grepl("Han", name) ~ "Han", 335 | grepl("French", name) ~ "French" 336 | ) 337 | ) |> 338 | tidyr::pivot_wider( 339 | id_cols = c(id, unit, type), 340 | names_from = name, 341 | values_from = value 342 | ) 343 | 344 | # plot 345 | library(ggtern) 346 | p <- ggtern() + 347 | geom_segment( 348 | data = merged_admixture_results_wide, 349 | aes(mean_OMbuti, mean_OHan, mean_OFrench, xend = IMbuti, yend = IHan, zend = IFrench), alpha = 0.5 350 | ) + 351 | geom_point( 352 | data = merged_admixture_results_long, 353 | aes(Mbuti, Han, French, color = unit, shape = type), 354 | size = 2 355 | ) + 356 | theme_nomask() + 357 | xlab("Mbuti") + 358 | ylab("Han") + 359 | zlab("French") 360 | 361 | ggsave( 362 | "admixpops_test_data/plots/mbutihanfrench_admix.jpeg", 363 | plot = p, 364 | device = "jpeg", 365 | width = 10, 366 | height = 6, 367 | scale = 0.8 368 | ) 369 | 370 | #### independent runs, but with other pops in MDS #### 371 | 372 | s(paste0('xerxes admixpops -d admixpops_test_data/2012_PattersonGenetics_pruned -a "[6:FrenchDom](French=100);[7:FrenchDom](French=100);[8:FrenchDom](French=100);[9:FrenchDom](French=100);[10:FrenchDom](French=100)" -o admixpops_test_data/french')) 373 | 374 | s(paste0('xerxes admixpops -d 
admixpops_test_data/2012_PattersonGenetics_pruned -a "[11:MbutiDom](Mbuti=100);[12:MbutiDom](Mbuti=100);[13:MbutiDom](Mbuti=100);[14:MbutiDom](Mbuti=100);[15:MbutiDom](Mbuti=100)" -o admixpops_test_data/mbuti')) 375 | 376 | # create data subset 377 | s('trident forge -d admixpops_test_data/2012_PattersonGenetics_pruned -p admixpops_test_data/han/han.bed -p admixpops_test_data/french/french.bed -p admixpops_test_data/mbuti/mbuti.bed -f "MbutiDom,FrenchDom,HanDom,Han,French,Mbuti" -n independenthanfrenchmbuti_merged -o admixpops_test_data/independenthanfrenchmbuti_merged') 378 | 379 | # mds 380 | nd("admixpops_test_data/independenthanfrenchmbuti_mds") 381 | s('~/software/plink --bfile admixpops_test_data/independenthanfrenchmbuti_merged/independenthanfrenchmbuti_merged --genome --out admixpops_test_data/independenthanfrenchmbuti_mds/pairwise_stats') 382 | s('~/software/plink --bfile admixpops_test_data/independenthanfrenchmbuti_merged/independenthanfrenchmbuti_merged --cluster --mds-plot 2 --read-genome admixpops_test_data/independenthanfrenchmbuti_mds/pairwise_stats.genome --out admixpops_test_data/independenthanfrenchmbuti_mds/mds') 383 | 384 | # plot 385 | mds_raw <- read_mds("admixpops_test_data/independenthanfrenchmbuti_mds/mds.mds") 386 | 387 | p <- mds_raw |> 388 | ggplot() + 389 | geom_point(aes(x = C1, y = C2, colour = FID)) 390 | 391 | ggsave( 392 | "admixpops_test_data/plots/independenthanfrenchmbuti_mds.jpeg", 393 | plot = p, 394 | device = "jpeg", 395 | width = 10, 396 | height = 6, 397 | scale = 0.8 398 | ) 399 | 400 | -------------------------------------------------------------------------------- /figures/spacetime_schema.svg: -------------------------------------------------------------------------------- 1 | 2 | 20 | 22 | 30 | 35 | 36 | 45 | 50 | 51 | 59 | 64 | 65 | 74 | 79 | 80 | 88 | 93 | 94 | 102 | 107 | 108 | 117 | 122 | 123 | 131 | 136 | 137 | 145 | 150 | 151 | 159 | 164 | 165 | 172 | 173 | 196 | 198 | 199 | 201 | image/svg+xml 202 | 204 | 205 
| 206 | 207 | 208 | 212 | 220 | 225 | 230 | 235 | 240 | 245 | 250 | 255 | 260 | 265 | 270 | 275 | 280 | 285 | 290 | 295 | 299 | 304 | time 316 | long 327 | lat 339 | POIs 350 | 356 | Pop 1 367 | 373 | 379 | 385 | 391 | 397 | Pop 2 408 | 414 | 420 | 426 | 432 | 438 | Pop 3 449 | 455 | 461 | 467 | ...01 0 010020990002......01 9 010110990012......92 0 910020219012......01 9 010920219002......01 0 010920999002......00 1 020020111000......90 0 020929111900......00 1 010920112099......00 1 920020119900......11 1 012020109109......12 1 112020109000......91 1 912021109100......92 1 911020109100......91 1 011020109000... 548 | dspacetime 562 | 569 | 0/HomAlt: sum wspacetime1/Het: sum wspacetime2/HomRef: sum wspacetime9/Missing: sum wspacetime 603 | wspacetime = 1 / dspacetime (rescaled between 0 and 1) 619 | 626 | 632 | 638 | 644 | 650 | 656 | 663 | 669 | 674 | 679 | ...91 1 912020109100......11 1 920021119109......90 1 012020109199......92 1 011920109100......91 0 011020101100... 720 | 725 | 732 | weighted randomsampling 748 | POIs 759 | 765 | 771 | 777 | 783 | 789 | paagen spacetime 800 | 801 | 802 | --------------------------------------------------------------------------------