├── img ├── MA1_affinities.png ├── f3_phylogeny.png ├── f4_phylogeny.png ├── Patterson_2012_ANEfig.png ├── Patterson_2012_table.png └── outgroupf3_phylogeny.png ├── .gitignore ├── f4_param.txt ├── adm_f3_popfile.txt ├── adm_f3_param.txt ├── outgroup_f3_param_Han.txt ├── outgroup_f3_param_MA1.txt ├── f4_popfile.txt ├── pca.AllEurasia.params.txt ├── pca.WestEurasia.params.txt ├── supp ├── tasks.sh ├── convertf_param.txt ├── WestEurasia.poplist.txt ├── AllEurasia.poplist.txt └── poplist.txt ├── outgroup_f3_popfile_Han.txt ├── outgroup_f3_popfile_MA1.txt ├── README.md ├── population_frequencies.txt ├── f3_outgroup_stats_Han.txt ├── f3_outgroup_stats_MA1.txt ├── 04_Rmd_plotting_pca.Rmd ├── 03_Rmd_smartpca.Rmd ├── pca.WestEurasia.eval ├── 01_bashnb_getting_started.ipynb ├── 03_bashnb_smartpca.ipynb ├── 05_Rmd_fstatistics.Rmd └── pca.AllEurasia.eval /img/MA1_affinities.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nevrome/popgen_course/master/img/MA1_affinities.png -------------------------------------------------------------------------------- /img/f3_phylogeny.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nevrome/popgen_course/master/img/f3_phylogeny.png -------------------------------------------------------------------------------- /img/f4_phylogeny.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nevrome/popgen_course/master/img/f4_phylogeny.png -------------------------------------------------------------------------------- /img/Patterson_2012_ANEfig.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nevrome/popgen_course/master/img/Patterson_2012_ANEfig.png -------------------------------------------------------------------------------- /img/Patterson_2012_table.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/nevrome/popgen_course/master/img/Patterson_2012_table.png -------------------------------------------------------------------------------- /img/outgroupf3_phylogeny.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nevrome/popgen_course/master/img/outgroupf3_phylogeny.png -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .ipynb_checkpoints/ 2 | 3 | # rendered documents 4 | *.html 5 | 6 | # data 7 | data/* 8 | 9 | # R 10 | .Rhistory 11 | -------------------------------------------------------------------------------- /f4_param.txt: -------------------------------------------------------------------------------- 1 | genotypename: /data/popgen_course/HumanOrigins_FennoScandian_small.geno 2 | snpname: /data/popgen_course/HumanOrigins_FennoScandian_small.snp 3 | indivname: /data/popgen_course/HumanOrigins_FennoScandian_small.ind 4 | popfilename: f4_popfile.txt 5 | f4mode: YES -------------------------------------------------------------------------------- /adm_f3_popfile.txt: -------------------------------------------------------------------------------- 1 | Nganasan French Finnish 2 | Nganasan Icelandic Finnish 3 | Nganasan Lithuanian Finnish 4 | Nganasan Norwegian Finnish 5 | BolshoyOleniOstrov French Finnish 6 | BolshoyOleniOstrov Icelandic Finnish 7 | BolshoyOleniOstrov Lithuanian Finnish 8 | BolshoyOleniOstrov Norwegian Finnish -------------------------------------------------------------------------------- /adm_f3_param.txt: -------------------------------------------------------------------------------- 1 | genotypename: /data/popgen_course/HumanOrigins_FennoScandian_small.geno 2 | snpname: /data/popgen_course/HumanOrigins_FennoScandian_small.snp 3 | 
indivname: /data/popgen_course/HumanOrigins_FennoScandian_small.ind 4 | popfilename: adm_f3_popfile.txt 5 | inbreed: YES -------------------------------------------------------------------------------- /outgroup_f3_param_Han.txt: -------------------------------------------------------------------------------- 1 | genotypename: /data/popgen_course/HumanOrigins_FennoScandian_small.geno 2 | snpname: /data/popgen_course/HumanOrigins_FennoScandian_small.snp 3 | indivname: /data/popgen_course/HumanOrigins_FennoScandian_small.ind 4 | popfilename: outgroup_f3_popfile_Han.txt -------------------------------------------------------------------------------- /outgroup_f3_param_MA1.txt: -------------------------------------------------------------------------------- 1 | genotypename: /data/popgen_course/HumanOrigins_FennoScandian_small.geno 2 | snpname: /data/popgen_course/HumanOrigins_FennoScandian_small.snp 3 | indivname: /data/popgen_course/HumanOrigins_FennoScandian_small.ind 4 | popfilename: outgroup_f3_popfile_MA1.txt -------------------------------------------------------------------------------- /f4_popfile.txt: -------------------------------------------------------------------------------- 1 | Mbuti Nganasan French Finnish 2 | Mbuti Nganasan Icelandic Finnish 3 | Mbuti Nganasan Lithuanian Finnish 4 | Mbuti Nganasan Norwegian Finnish 5 | Mbuti BolshoyOleniOstrov French Finnish 6 | Mbuti BolshoyOleniOstrov Icelandic Finnish 7 | Mbuti BolshoyOleniOstrov Lithuanian Finnish 8 | Mbuti BolshoyOleniOstrov Norwegian Finnish -------------------------------------------------------------------------------- /pca.AllEurasia.params.txt: -------------------------------------------------------------------------------- 1 | genotypename: /data/popgen_course/genotypes_small.geno 2 | snpname: /data/popgen_course/genotypes_small.snp 3 | indivname: /data/popgen_course/genotypes_small.ind 4 | evecoutname: pca.AllEurasia.evec 5 | evaloutname: pca.AllEurasia.eval 6 | poplistname: 
/data/popgen_course/AllEurasia.poplist.txt 7 | lsqproject: YES 8 | numoutevec: 4 9 | numthreads: 1 10 | -------------------------------------------------------------------------------- /pca.WestEurasia.params.txt: -------------------------------------------------------------------------------- 1 | genotypename: data/popgen_course/genotypes_small.geno 2 | snpname: data/popgen_course/genotypes_small.snp 3 | indivname: data/popgen_course/genotypes_small.ind 4 | evecoutname: pca.WestEurasia.evec 5 | evaloutname: pca.WestEurasia.eval 6 | poplistname: data/popgen_course/WestEurasia.poplist.txt 7 | lsqproject: YES 8 | numoutevec: 4 9 | numthreads: 1 10 | -------------------------------------------------------------------------------- /supp/tasks.sh: -------------------------------------------------------------------------------- 1 | OUT=~/Data/workshop_dataset_prep 2 | mkdir -p $OUT 3 | 4 | #Copy genotyping data from Thiseas: 5 | D=/projects1/AncientFinnish/DataFreeze20_07_17/results/calls/PublishedOnly.HO.1240K.Ancients+Saami 6 | scp sdag:$D.ind $OUT/HumanOrigins_FennoScandian.ind 7 | scp sdag:$D.geno $OUT/HumanOrigins_FennoScandian.geno 8 | scp sdag:$D.snp $OUT/HumanOrigins_FennoScandian.snp 9 | 10 | # Extract smaller dataset 11 | convertf -p convertf_param.txt 12 | 13 | -------------------------------------------------------------------------------- /supp/convertf_param.txt: -------------------------------------------------------------------------------- 1 | genotypename: /Users/schiffels/Data/workshop_dataset_prep/HumanOrigins_FennoScandian.geno 2 | snpname: /Users/schiffels/Data/workshop_dataset_prep/HumanOrigins_FennoScandian.snp 3 | indivname: /Users/schiffels/Data/workshop_dataset_prep/HumanOrigins_FennoScandian.ind 4 | outputformat: EIGENSTRAT 5 | genotypeoutname: /Users/schiffels/Data/workshop_dataset_prep/HumanOrigins_FennoScandian_small.geno 6 | snpoutname: /Users/schiffels/Data/workshop_dataset_prep/HumanOrigins_FennoScandian_small.snp 7 | indivoutname: 
/Users/schiffels/Data/workshop_dataset_prep/HumanOrigins_FennoScandian_small.ind 8 | poplistname: poplist.txt 9 | -------------------------------------------------------------------------------- /outgroup_f3_popfile_Han.txt: -------------------------------------------------------------------------------- 1 | Han Chuvash Mbuti 2 | Han Albanian Mbuti 3 | Han Armenian Mbuti 4 | Han Bulgarian Mbuti 5 | Han Czech Mbuti 6 | Han Druze Mbuti 7 | Han English Mbuti 8 | Han Estonian Mbuti 9 | Han Finnish Mbuti 10 | Han French Mbuti 11 | Han Georgian Mbuti 12 | Han Greek Mbuti 13 | Han Hungarian Mbuti 14 | Han Icelandic Mbuti 15 | Han Italian_North Mbuti 16 | Han Italian_South Mbuti 17 | Han Lithuanian Mbuti 18 | Han Maltese Mbuti 19 | Han Mordovian Mbuti 20 | Han Norwegian Mbuti 21 | Han Orcadian Mbuti 22 | Han Russian Mbuti 23 | Han Sardinian Mbuti 24 | Han Scottish Mbuti 25 | Han Sicilian Mbuti 26 | Han Spanish_North Mbuti 27 | Han Spanish Mbuti 28 | Han Ukrainian Mbuti 29 | Han Levanluhta Mbuti 30 | Han BolshoyOleniOstrov Mbuti 31 | Han ChalmnyVarre Mbuti 32 | Han Saami.DG Mbuti -------------------------------------------------------------------------------- /outgroup_f3_popfile_MA1.txt: -------------------------------------------------------------------------------- 1 | MA1_HG.SG Chuvash Mbuti 2 | MA1_HG.SG Albanian Mbuti 3 | MA1_HG.SG Armenian Mbuti 4 | MA1_HG.SG Bulgarian Mbuti 5 | MA1_HG.SG Czech Mbuti 6 | MA1_HG.SG Druze Mbuti 7 | MA1_HG.SG English Mbuti 8 | MA1_HG.SG Estonian Mbuti 9 | MA1_HG.SG Finnish Mbuti 10 | MA1_HG.SG French Mbuti 11 | MA1_HG.SG Georgian Mbuti 12 | MA1_HG.SG Greek Mbuti 13 | MA1_HG.SG Hungarian Mbuti 14 | MA1_HG.SG Icelandic Mbuti 15 | MA1_HG.SG Italian_North Mbuti 16 | MA1_HG.SG Italian_South Mbuti 17 | MA1_HG.SG Lithuanian Mbuti 18 | MA1_HG.SG Maltese Mbuti 19 | MA1_HG.SG Mordovian Mbuti 20 | MA1_HG.SG Norwegian Mbuti 21 | MA1_HG.SG Orcadian Mbuti 22 | MA1_HG.SG Russian Mbuti 23 | MA1_HG.SG Sardinian Mbuti 24 | MA1_HG.SG Scottish Mbuti 25 | 
MA1_HG.SG Sicilian Mbuti 26 | MA1_HG.SG Spanish_North Mbuti 27 | MA1_HG.SG Spanish Mbuti 28 | MA1_HG.SG Ukrainian Mbuti 29 | MA1_HG.SG Levanluhta Mbuti 30 | MA1_HG.SG BolshoyOleniOstrov Mbuti 31 | MA1_HG.SG ChalmnyVarre Mbuti 32 | MA1_HG.SG Saami.DG Mbuti -------------------------------------------------------------------------------- /supp/WestEurasia.poplist.txt: -------------------------------------------------------------------------------- 1 | Chuvash 2 | Abkhasian 3 | Adygei 4 | Albanian 5 | Armenian 6 | Assyrian 7 | Balkar 8 | Basque 9 | BedouinA 10 | BedouinB 11 | Belarusian 12 | Bulgarian 13 | Canary_Islander 14 | Chechen 15 | Croatian 16 | Cypriot 17 | Czech 18 | Druze 19 | English 20 | Estonian 21 | Finnish 22 | French 23 | Georgian 24 | German 25 | Greek 26 | Hungarian 27 | Icelandic 28 | Iranian 29 | Irish 30 | Irish_Ulster 31 | Italian_North 32 | Italian_South 33 | Jew_Ashkenazi 34 | Jew_Georgian 35 | Jew_Iranian 36 | Jew_Iraqi 37 | Jew_Libyan 38 | Jew_Moroccan 39 | Jew_Tunisian 40 | Jew_Turkish 41 | Jew_Yemenite 42 | Jordanian 43 | Kumyk 44 | Lebanese_Christian 45 | Lebanese 46 | Lebanese_Muslim 47 | Lezgin 48 | Lithuanian 49 | Maltese 50 | Mordovian 51 | North_Ossetian 52 | Norwegian 53 | Orcadian 54 | Palestinian 55 | Polish 56 | Romanian 57 | Russian 58 | Sardinian 59 | Saudi 60 | Scottish 61 | Shetlandic 62 | Sicilian 63 | Sorb 64 | Spanish_North 65 | Spanish 66 | Syrian 67 | Turkish 68 | Ukrainian 69 | -------------------------------------------------------------------------------- /supp/AllEurasia.poplist.txt: -------------------------------------------------------------------------------- 1 | Abkhasian 2 | Adygei 3 | Albanian 4 | Aleut 5 | Aleut_Tlingit 6 | Altaian 7 | Ami 8 | Armenian 9 | Assyrian 10 | Atayal 11 | Avar 12 | Azeri 13 | Balkar 14 | Basque 15 | BedouinA 16 | BedouinB 17 | Belarusian 18 | Borneo 19 | Bulgarian 20 | Buryat 21 | Cambodian 22 | Chechen 23 | Chukchi 24 | Chukchi1 25 | Chuvash 26 | Croatian 27 | Cypriot 28 | Czech 29 
| Dai 30 | Daur 31 | Dolgan 32 | Druze 33 | English 34 | Eskimo_ChaplinSireniki 35 | Eskimo_Naukan 36 | Estonian 37 | Even 38 | Finnish 39 | French 40 | Georgian 41 | German 42 | Greek 43 | Han 44 | Hezhen 45 | Hungarian 46 | Icelandic 47 | Iranian 48 | Italian_North 49 | Italian_South 50 | Itelmen 51 | Japanese 52 | Jew_Ashkenazi 53 | Jew_Georgian 54 | Jew_Iranian 55 | Jew_Iraqi 56 | Jew_Libyan 57 | Jew_Moroccan 58 | Jew_Tunisian 59 | Jew_Turkish 60 | Jew_Yemenite 61 | Jordanian 62 | Kalmyk 63 | Kinh 64 | Korean 65 | Koryak 66 | Kumyk 67 | Kurd 68 | Kyrgyz 69 | Lahu 70 | Lebanese 71 | Lezgin 72 | Lithuanian 73 | Maltese 74 | Mansi 75 | Miao 76 | Mongol 77 | Mongola 78 | Mordovian 79 | Naxi 80 | Nganasan 81 | Nogai 82 | North_Ossetian 83 | Norwegian 84 | Orcadian 85 | Oroqen 86 | Ossetian 87 | Palestinian 88 | Polish 89 | Russian 90 | Saami.DG 91 | Saami_WGA 92 | Sardinian 93 | Saudi 94 | Scottish 95 | Selkup 96 | Semende 97 | She 98 | Sherpa.DG 99 | Sicilian 100 | Spanish 101 | Spanish_North 102 | Syrian 103 | Tajik 104 | Thai 105 | Tibetan.DG 106 | Tu 107 | Tubalar 108 | Tujia 109 | Turkish 110 | Turkmen 111 | Tuvinian 112 | Ukrainian 113 | Ulchi 114 | Uygur 115 | Uzbek 116 | Xibo 117 | Yakut 118 | Yi 119 | Yukagir 120 | -------------------------------------------------------------------------------- /supp/poplist.txt: -------------------------------------------------------------------------------- 1 | Abkhasian 2 | Adygei 3 | Albanian 4 | Aleut 5 | Aleut_Tlingit 6 | Altaian 7 | Ami 8 | Armenian 9 | Assyrian 10 | Atayal 11 | Avar 12 | Azeri 13 | Balkar 14 | Basque 15 | BedouinA 16 | BedouinB 17 | Belarusian 18 | BolshoyOleniOstrov 19 | Borneo 20 | Bulgarian 21 | Buryat 22 | Cambodian 23 | Canary_Islander 24 | ChalmnyVarre 25 | Chechen 26 | Chukchi 27 | Chukchi1 28 | Chuvash 29 | Croatian 30 | Cypriot 31 | Czech 32 | Dai 33 | Daur 34 | Dolgan 35 | Druze 36 | English 37 | Eskimo_ChaplinSireniki 38 | Eskimo_Naukan 39 | Estonian 40 | Even 41 | Finnish 42 | French 43 
| Georgian 44 | German 45 | Greek 46 | Han 47 | Hezhen 48 | Hungarian 49 | Icelandic 50 | Iranian 51 | Irish 52 | Irish_Ulster 53 | Italian_North 54 | Italian_South 55 | Itelmen 56 | Japanese 57 | Jew_Ashkenazi 58 | Jew_Georgian 59 | Jew_Iranian 60 | Jew_Iraqi 61 | Jew_Libyan 62 | Jew_Moroccan 63 | Jew_Tunisian 64 | Jew_Turkish 65 | Jew_Yemenite 66 | Jordanian 67 | Kalmyk 68 | Kinh 69 | Korean 70 | Koryak 71 | Kumyk 72 | Kurd 73 | Kyrgyz 74 | Lahu 75 | Lebanese 76 | Lebanese_Christian 77 | Lebanese_Muslim 78 | Levanluhta 79 | Levanluhta_Outlier 80 | Lezgin 81 | LBK_EN 82 | Lithuanian 83 | Maltese 84 | Mansi 85 | MA1_HG.SG 86 | Mbuti 87 | Miao 88 | Mongol 89 | Mongola 90 | Mordovian 91 | Naxi 92 | Nganasan 93 | Nogai 94 | North_Ossetian 95 | Norwegian 96 | Orcadian 97 | Oroqen 98 | Ossetian 99 | Palestinian 100 | Polish 101 | Romanian 102 | Russian 103 | Saami.DG 104 | Saami_WGA 105 | Sardinian 106 | Saudi 107 | Scottish 108 | Selkup 109 | Semende 110 | She 111 | Sherpa.DG 112 | Shetlandic 113 | Sicilian 114 | Sorb 115 | Spanish 116 | Spanish_North 117 | Syrian 118 | Tajik 119 | Thai 120 | Tibetan.DG 121 | Tu 122 | Tubalar 123 | Tujia 124 | Turkish 125 | Turkmen 126 | Tuvinian 127 | Ukrainian 128 | Ulchi 129 | Uygur 130 | Uzbek 131 | WHG 132 | Xibo 133 | Yakut 134 | Yamnaya_Samara 135 | Yi 136 | Yukagir -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # popgen_course 2 | A course with Jupyter Notebooks for Computational Population Genetics 3 | 4 | by Stephan Schiffels 5 | 6 | *Disclaimer: This is still work in progress.* 7 | 8 | This repository contains several Jupyter Notebooks that I have used in the past for teaching various elements of population-genetic data analyses to students with no initial training in population genetics or Unix-based data analysis. 
It is definitely not yet fully self-contained and needs an experienced instructor to go through. 9 | 10 | Having said that, for someone reasonably experienced with Unix, it is possible to go through the chapters yourself. In that case, here are a few steps for setting up your environment to make these work: 11 | 12 | 1. Install [Jupyter](https://jupyter.org) notebooks with [Bash extension](https://github.com/takluyver/bash_kernel). You will also need [Eigensoft](https://github.com/DReichLab/EIG) and [ADMIXTOOLS](https://github.com/DReichLab/AdmixTools). 13 | 2. Clone this repository in your home directory running `git clone https://github.com/stschiff/popgen_course.git` 14 | 3. Download the genotype data needed for these exercises from [here](https://oc.gnz.mpg.de/owncloud/index.php/s/dT9KzFhLfunk3Tb). In my notebooks, I assume that this data has been downloaded into the directory `/data/popgen_course`. 15 | 16 | Having Jupyter installed, you can now simply open the Notebooks directly from within Jupyter, or you can access static versions of them here: 17 | 18 | 1. [Getting Started (Bash)](https://nbviewer.jupyter.org/github/stschiff/popgen_course/blob/master/01_bashnb_getting_started.ipynb) 19 | 2. [Getting Started (Python)](https://nbviewer.jupyter.org/github/stschiff/popgen_course/blob/master/02_pynb_getting_started.ipynb) 20 | 3. [Principal Components Analysis (Bash)](https://nbviewer.jupyter.org/github/stschiff/popgen_course/blob/master/03_bashnb_smartpca.ipynb) 21 | 4. [Principal Components Analysis (Python)](https://nbviewer.jupyter.org/github/stschiff/popgen_course/blob/master/04_pynb_plotting_pca.ipynb) 22 | 5.
[F Statistics (Python)](https://nbviewer.jupyter.org/github/stschiff/popgen_course/blob/master/05_pynb_fstatistics.ipynb) 23 | 24 | -------------------------------------------------------------------------------- /population_frequencies.txt: -------------------------------------------------------------------------------- 1 | 9 Abkhasian 2 | 16 Adygei 3 | 6 Albanian 4 | 7 Aleut 5 | 4 Aleut_Tlingit 6 | 7 Altaian 7 | 10 Ami 8 | 10 Armenian 9 | 9 Atayal 10 | 10 Balkar 11 | 29 Basque 12 | 25 BedouinA 13 | 19 BedouinB 14 | 10 Belarusian 15 | 6 BolshoyOleniOstrov 16 | 9 Borneo 17 | 10 Bulgarian 18 | 8 Cambodian 19 | 2 Canary_Islander 20 | 2 ChalmnyVarre 21 | 9 Chechen 22 | 20 Chukchi 23 | 3 Chukchi1 24 | 10 Chuvash 25 | 10 Croatian 26 | 8 Cypriot 27 | 10 Czech 28 | 10 Dai 29 | 9 Daur 30 | 4 Dolgan 31 | 39 Druze 32 | 10 English 33 | 10 Estonian 34 | 9 Even 35 | 8 Finnish 36 | 32 French 37 | 10 Georgian 38 | 20 Greek 39 | 43 Han 40 | 8 Hezhen 41 | 20 Hungarian 42 | 12 Icelandic 43 | 8 Iranian 44 | 20 Italian_North 45 | 1 Italian_South 46 | 6 Itelmen 47 | 29 Japanese 48 | 7 Jew_Ashkenazi 49 | 7 Jew_Georgian 50 | 9 Jew_Iranian 51 | 6 Jew_Iraqi 52 | 9 Jew_Libyan 53 | 6 Jew_Moroccan 54 | 7 Jew_Tunisian 55 | 8 Jew_Turkish 56 | 8 Jew_Yemenite 57 | 1 JK2065 58 | 9 Jordanian 59 | 10 Kalmyk 60 | 8 Kinh 61 | 6 Korean 62 | 9 Koryak 63 | 8 Kumyk 64 | 9 Kyrgyz 65 | 8 Lahu 66 | 14 LBK_EN 67 | 8 Lebanese 68 | 2 Levanluhta 69 | 9 Lezgin 70 | 10 Lithuanian 71 | 8 Maltese 72 | 8 Mansi 73 | 10 Miao 74 | 6 Mongola 75 | 10 Mordovian 76 | 9 Naxi 77 | 11 Nganasan 78 | 9 Nogai 79 | 11 Norwegian 80 | 13 Orcadian 81 | 9 Oroqen 82 | 10 Ossetian 83 | 38 Palestinian 84 | 22 Russian 85 | 2 Saami.DG 86 | 1 Saami_WGA 87 | 27 Sardinian 88 | 8 Saudi 89 | 4 Scottish 90 | 10 Selkup 91 | 10 Semende 92 | 10 She 93 | 2 Sherpa.DG 94 | 11 Sicilian 95 | 53 Spanish 96 | 5 Spanish_North 97 | 8 Syrian 98 | 8 Tajik 99 | 10 Thai 100 | 2 Tibetan.DG 101 | 10 Tu 102 | 22 Tubalar 103 | 10 Tujia 104 | 50 Turkish 105 | 7 
Turkmen 106 | 10 Tuvinian 107 | 9 Ukrainian 108 | 25 Ulchi 109 | 10 Uygur 110 | 10 Uzbek 111 | 3 WHG 112 | 7 Xibo 113 | 20 Yakut 114 | 9 Yamnaya_Samara 115 | 10 Yi 116 | 19 Yukagir 117 | -------------------------------------------------------------------------------- /f3_outgroup_stats_Han.txt: -------------------------------------------------------------------------------- 1 | result: Han Chuvash Mbuti 0.233652 0.002072 112.782 502678 2 | result: Han Albanian Mbuti 0.215629 0.002029 106.291 501734 3 | result: Han Armenian Mbuti 0.213724 0.001963 108.882 504370 4 | result: Han Bulgarian Mbuti 0.216193 0.001979 109.266 504310 5 | result: Han Czech Mbuti 0.218060 0.002002 108.939 504089 6 | result: Han Druze Mbuti 0.209551 0.001919 109.205 510853 7 | result: Han English Mbuti 0.216959 0.001973 109.954 504161 8 | result: Han Estonian Mbuti 0.220730 0.002019 109.332 503503 9 | result: Han Finnish Mbuti 0.223447 0.002044 109.345 502217 10 | result: Han French Mbuti 0.216623 0.001969 110.012 509613 11 | result: Han Georgian Mbuti 0.214295 0.001935 110.721 503598 12 | result: Han Greek Mbuti 0.215203 0.001984 108.465 507475 13 | result: Han Hungarian Mbuti 0.217894 0.001999 109.004 507409 14 | result: Han Icelandic Mbuti 0.218683 0.002015 108.553 504655 15 | result: Han Italian_North Mbuti 0.215332 0.001978 108.854 507589 16 | result: Han Italian_South Mbuti 0.211787 0.002271 93.265 492400 17 | result: Han Lithuanian Mbuti 0.219615 0.002032 108.098 503681 18 | result: Han Maltese Mbuti 0.210359 0.001956 107.542 503985 19 | result: Han Mordovian Mbuti 0.223469 0.002008 111.296 503441 20 | result: Han Norwegian Mbuti 0.218873 0.002023 108.197 504621 21 | result: Han Orcadian Mbuti 0.217773 0.002014 108.115 504993 22 | result: Han Russian Mbuti 0.223993 0.001995 112.274 506525 23 | result: Han Sardinian Mbuti 0.213230 0.001980 107.711 508413 24 | result: Han Scottish Mbuti 0.218489 0.002039 107.145 499784 25 | result: Han Sicilian Mbuti 0.212272 0.001975 107.486 505477 26 | 
result: Han Spanish_North Mbuti 0.215885 0.002029 106.383 500853 27 | result: Han Spanish Mbuti 0.213869 0.001975 108.297 513648 28 | result: Han Ukrainian Mbuti 0.218716 0.002007 108.950 503981 29 | result: Han Levanluhta Mbuti 0.236252 0.002383 99.123 263049 30 | result: Han BolshoyOleniOstrov Mbuti 0.247814 0.002177 113.849 457102 31 | result: Han ChalmnyVarre Mbuti 0.233499 0.002304 101.345 366220 32 | result: Han Saami.DG Mbuti 0.236198 0.002274 103.852 489038 -------------------------------------------------------------------------------- /f3_outgroup_stats_MA1.txt: -------------------------------------------------------------------------------- 1 | result: MA1_HG.SG Chuvash Mbuti 0.243818 0.002349 103.781 350484 2 | result: MA1_HG.SG Albanian Mbuti 0.236494 0.002296 103.008 344332 3 | result: MA1_HG.SG Armenian Mbuti 0.231399 0.002264 102.229 349612 4 | result: MA1_HG.SG Bulgarian Mbuti 0.237498 0.002281 104.103 349800 5 | result: MA1_HG.SG Czech Mbuti 0.243224 0.002328 104.457 349553 6 | result: MA1_HG.SG Druze Mbuti 0.226740 0.002197 103.193 359004 7 | result: MA1_HG.SG English Mbuti 0.243135 0.002317 104.941 349321 8 | result: MA1_HG.SG Estonian Mbuti 0.247065 0.002362 104.619 348861 9 | result: MA1_HG.SG Finnish Mbuti 0.245684 0.002379 103.266 347208 10 | result: MA1_HG.SG French Mbuti 0.240235 0.002269 105.886 357842 11 | result: MA1_HG.SG Georgian Mbuti 0.232645 0.002253 103.243 349082 12 | result: MA1_HG.SG Greek Mbuti 0.236566 0.002280 103.757 355261 13 | result: MA1_HG.SG Hungarian Mbuti 0.241720 0.002313 104.483 355340 14 | result: MA1_HG.SG Icelandic Mbuti 0.244488 0.002386 102.481 350287 15 | result: MA1_HG.SG Italian_North Mbuti 0.236407 0.002273 104.002 354999 16 | result: MA1_HG.SG Italian_South Mbuti 0.230839 0.002767 83.427 321217 17 | result: MA1_HG.SG Lithuanian Mbuti 0.246864 0.002403 102.718 348656 18 | result: MA1_HG.SG Maltese Mbuti 0.230200 0.002259 101.903 347725 19 | result: MA1_HG.SG Mordovian Mbuti 0.245284 0.002346 104.571 350058 
20 | result: MA1_HG.SG Norwegian Mbuti 0.243930 0.002301 106.031 350182 21 | result: MA1_HG.SG Orcadian Mbuti 0.243614 0.002320 105.008 351053 22 | result: MA1_HG.SG Russian Mbuti 0.245212 0.002298 106.698 355953 23 | result: MA1_HG.SG Sardinian Mbuti 0.231967 0.002264 102.449 355548 24 | result: MA1_HG.SG Scottish Mbuti 0.244598 0.002434 100.512 339441 25 | result: MA1_HG.SG Sicilian Mbuti 0.231141 0.002260 102.297 351028 26 | result: MA1_HG.SG Spanish_North Mbuti 0.238479 0.002426 98.319 341661 27 | result: MA1_HG.SG Spanish Mbuti 0.235386 0.002257 104.293 361951 28 | result: MA1_HG.SG Ukrainian Mbuti 0.243551 0.002345 103.881 348948 29 | result: MA1_HG.SG Levanluhta Mbuti 0.247640 0.003030 81.728 174148 30 | result: MA1_HG.SG BolshoyOleniOstrov Mbuti 0.256041 0.002624 97.561 305851 31 | result: MA1_HG.SG ChalmnyVarre Mbuti 0.249619 0.002862 87.212 239594 32 | result: MA1_HG.SG Saami.DG Mbuti 0.251530 0.002622 95.922 326072 33 | -------------------------------------------------------------------------------- /04_Rmd_plotting_pca.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Principal Component Plots" 3 | output: html_document 4 | editor_options: 5 | chunk_output_type: console 6 | --- 7 | 8 | ```{r, echo=FALSE} 9 | knitr::opts_chunk$set(message = FALSE) 10 | ``` 11 | 12 | ```{r} 13 | library(magrittr) 14 | ``` 15 | 16 | For this chapter, you will need the PCA results that we ran in the last chapter. I have actually included the output files of my runs into this repository, so you can just use them if something didn't work in the previous chapter. 17 | 18 | For making plots in R, one of the most popular libraries around is ggplot2.
You can load it via: 19 | 20 | ```{r} 21 | library(ggplot2) 22 | ``` 23 | 24 | Let's have a look at the main results file from smartpca: 25 | 26 | ```{r, warning=F} 27 | pcaDat <- readr::read_delim("pca.WestEurasia.evec", " ", trim_ws = T) 28 | pcaDat2 <- readr::read_delim("pca.AllEurasia.evec", " ", trim_ws = T) 29 | ``` 30 | 31 | The first row contains the eigenvalues for the first 4 principal components (PCs), and all further rows contain the PC coordinates for each individual. The first column contains the name of each individual, the last column the population. To load this dataset with R, we use the readr package. To load data using readr, we used the read_delim() function. We can now change the column headers: 32 | 33 | ```{r} 34 | colnames(pcaDat) <- colnames(pcaDat2) <- c("Name", "PC1", "PC2", "PC3", "PC4", "Group") 35 | ``` 36 | 37 | Looking at the data, we find that it is a tibble (a better data.frame), with each individual on one row, and the columns denoting the first 4 principal components. The last column contains the population for each individual: 38 | 39 | ```{r} 40 | pcaDat 41 | ``` 42 | 43 | We can quickly plot the first two PCs for all individuals: 44 | 45 | ```{r} 46 | pcaDat %>% 47 | ggplot() + 48 | geom_point(aes(x = PC1, y = PC2)) 49 | ``` 50 | 51 | which is not very helpful, because we can't see where each population falls. We can highlight a few populations to get a bit more of a feeling: 52 | 53 | ```{r} 54 | ggplot() + 55 | geom_point( 56 | data = pcaDat %>% dplyr::filter(!(Group %in% c("Finnish", "Sardinian", "Armenian", "BedouinB"))), 57 | aes(x = PC1, y = PC2) 58 | ) + 59 | geom_point( 60 | data = pcaDat %>% dplyr::filter(Group %in% c("Finnish", "Sardinian", "Armenian", "BedouinB")), 61 | aes(x = PC1, y = PC2, color = Group) 62 | ) 63 | ``` 64 | 65 | ## Showing all populations 66 | 67 | OK, but how do we systematically show all the populations?
There are too many of those to separate them all by different colors, or by different symbols, so we need to combine colours and symbols and use all the combinations of them to show all the populations. 68 | 69 | ```{r} 70 | populations <- readr::read_csv("data/popgen_course/WestEurasia.poplist.txt", col_names = F)$X1 71 | ``` 72 | 73 | ```{r, fig.height=10} 74 | pcaDat %>% 75 | dplyr::filter(Group %in% populations) %>% 76 | ggplot() + 77 | geom_point(aes( 78 | x = PC1, y = PC2, 79 | color = Group, shape = Group 80 | )) + 81 | scale_shape_manual(values = rep(0:18, len = 57)) + 82 | theme(legend.position = "bottom") 83 | ``` 84 | 85 | ## Adding ancient populations 86 | 87 | Of course, until now we haven't yet included any of the actual ancient test individuals that we want to analyse. 88 | 89 | We add the following ancient populations to this plot: 90 | 91 | - Levanluhta (two individuals from Finland from the first millennium AD) 92 | - BolshoyOleniOstrov (a group of 3500 year old individuals from Northern Russia). 93 | - WHG (short for Western Hunter-Gatherers, about 8000 years ago) 94 | - LBK_EN (short for Linearbandkeramik Early Neolithic, from about 6,000 years ago) 95 | - Yamnaya_Samara, a late Neolithic population from the Russian Steppe, about 4,800 years ago. 96 | 97 | The first two populations are from a publication on ancient Fennoscandian genomes ([Lamnidis et al. 2018](https://www.nature.com/articles/s41467-018-07483-5)), and are instructive to understand what PCA can be used for. The latter three populations are from two famous publications ([Lazaridis et al. 2014](https://www.nature.com/articles/nature13673) and [Haak et al. 2015](https://www.nature.com/articles/nature14317)). It can be shown that modern European genetic diversity is formed by a mix of three ancestries represented by these ancient groups. To highlight these ancient populations, we plot them in black and using different symbols.
While we're at it, we should also add the population called "Saami.DG": 98 | 99 | ```{r, fig.height=10} 100 | ancient_populations <- c("Levanluhta", "BolshoyOleniOstrov", "WHG", "LBK_EN", "Yamnaya_Samara", "Saami.DG") 101 | 102 | ggplot() + 103 | geom_point( 104 | data = pcaDat %>% dplyr::filter(Group %in% populations), 105 | mapping = aes( 106 | x = PC1, y = PC2, 107 | color = Group, shape = Group 108 | ) 109 | ) + 110 | geom_point( 111 | data = pcaDat %>% dplyr::filter(Group %in% ancient_populations), 112 | mapping = aes( 113 | x = PC1, y = PC2 114 | ), 115 | color = "black", shape = 15 116 | ) + 117 | ggrepel::geom_label_repel( 118 | data = pcaDat %>% dplyr::filter(Group %in% ancient_populations) %>% 119 | dplyr::group_by(Group) %>% 120 | dplyr::summarise(PC1 = mean(PC1), PC2 = mean(PC2)), 121 | mapping = aes( 122 | x = PC1, y = PC2, label = Group 123 | ) 124 | ) + 125 | scale_shape_manual(values = rep(0:14, len = 57)) + 126 | theme(legend.position = "bottom") 127 | ``` 128 | 129 | OK, so what are we looking at? This is quite a rich plot, of course, and we won't discuss all the details here. I just want to highlight two things. First, you can see that most present-day Europeans are scattered in a relatively tight space in the center of a triangle spanned by the WHG on the lower left, LBK_EN on the lower right (seen from European points) and by Yamnaya_Samara (top). Indeed, a widely-accepted model for present-day Europeans assumes these three ancient source populations for all Europeans ([Lazaridis et al. 2014](https://www.nature.com/articles/nature13673) and [Haak et al. 2015](https://www.nature.com/articles/nature14317)). 130 | 131 | The second thing that is noteworthy here is that present-day people from Northeastern Europe, such as Finns, Saami and other Uralic speaking populations are "dragged" towards the ancient samples from Bolshoy Oleni Ostrov.
Indeed, a recent model published by us assumes that "Siberian" genetic ancestry entered Europe around 4000 years ago as a kind of fourth genetic component on top of the three other components discussed above, and is nowadays found in most Uralic speakers in Europe, including Finns, Saami and Estonians. 132 | 133 | ## East-Eurasian PCA 134 | 135 | We can make a similar plot using the all-Eurasian PCA that we have run: 136 | 137 | ```{r} 138 | populations <- readr::read_csv("data/popgen_course/AllEurasia.poplist.txt", col_names = F)$X1 139 | ``` 140 | 141 | ```{r, fig.height=12} 142 | ggplot() + 143 | geom_point( 144 | data = pcaDat2 %>% dplyr::filter(Group %in% populations), 145 | mapping = aes( 146 | x = PC1, y = PC2, 147 | color = Group, shape = Group 148 | ) 149 | ) + 150 | geom_point( 151 | data = pcaDat2 %>% dplyr::filter(Group %in% ancient_populations), 152 | mapping = aes( 153 | x = PC1, y = PC2 154 | ), 155 | color = "black", shape = 15 156 | ) + 157 | ggrepel::geom_label_repel( 158 | data = pcaDat2 %>% dplyr::filter(Group %in% ancient_populations) %>% 159 | dplyr::group_by(Group) %>% 160 | dplyr::summarise(PC1 = mean(PC1), PC2 = mean(PC2)), 161 | mapping = aes( 162 | x = PC1, y = PC2, label = Group 163 | ) 164 | ) + 165 | scale_shape_manual(values = rep(0:14, len = 108)) + 166 | theme(legend.position = "bottom") 167 | ``` 168 | 169 | This PCA looks quite different. Here, we have all Western-Eurasian groups squished together on the left side of the plot, and on the right we have East-Asian populations. The plot roughly reflects Geography, with Northern East-Asian people such as the Nganasan on the top-right, and Southern East-Asian people like the Taiwanese Ami on the lower right. Here we can now see that the ancient samples from Russia and Finland, as well as present-day Uralic populations are actually distributed between East and West, contrary to most other Europeans.
This confirms that these groups in Europe have quite a distinctive East-Asian genetic ancestry, and we found that it is best represented by the Nganasan ([Lamnidis et al. 2018](https://www.nature.com/articles/s41467-018-07483-5)). 170 | -------------------------------------------------------------------------------- /03_Rmd_smartpca.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Principal Components Analysis (PCA)" 3 | output: html_document 4 | editor_options: 5 | chunk_output_type: console 6 | --- 7 | 8 | ```{r, echo=FALSE} 9 | knitr::opts_chunk$set(message = FALSE) 10 | ``` 11 | 12 | ```{r} 13 | library(magrittr) 14 | ``` 15 | 16 | Principal components analysis (PCA) is one of the most useful techniques to visualise genetic diversity in a dataset. The methodology is not restricted to genetic data, but in general allows breaking down high-dimensional datasets to two or more dimensions for visualisation in a two-dimensional space. 17 | 18 | ## Genotype Data 19 | 20 | This lesson is also our first contact with the genotype data used in this and most of the following lessons. The dataset that we will work with contains 1,340 individuals, each represented by 593,124 single nucleotide polymorphisms (SNPs). Those SNPs have exactly two different alleles, and each individual has one of four possible values at each genotype: homozygous reference, heterozygous, homozygous alternative, or missing. Those four values are encoded 2, 1, 0 and 9 respectively. 21 | 22 | The data is laid out as a matrix, with columns indicating individuals, and rows indicating SNPs. The data itself comes in the so-called \"EIGENSTRAT\" format, which is defined in the [Eigensoft package](https://github.com/DReichLab/EIG) used by many tools used in this workshop. In this format, a genotype dataset consists of three files, usually with the following file endings: 23 | 24 | * `*.snp`: The file containing the SNP positions. 
It consists of six columns: SNP-name, chromosome, genetic positions, physical position, reference allele, alternative allele. 25 | * `*.ind`: The file containing the names of the individuals. It consists of three columns: Individual Name, Sex (encoded as M(ale), F(emale), or U(nknown)), and population name. 26 | * `*.geno`: The file containing the genotype matrix, with individuals laid out from left to right, and SNP positions laid out from top to bottom. 27 | 28 | In the following, we will explore the files using R in this Rmarkdown document. 29 | 30 | The data that we want to analyse is stored at `data/popgen_course`. Let's list the contents of that directory: 31 | 32 | ```{r} 33 | list.files("data/popgen_course/") 34 | ``` 35 | 36 | Let's explore those files a bit. Here are the first 20 individuals: 37 | 38 | ```{r} 39 | individuals <- readr::read_delim( 40 | "data/popgen_course/genotypes_small.ind", 41 | delim = " ", 42 | trim_ws = T, 43 | col_names = c( 44 | "name", 45 | "sex", 46 | "population" 47 | ) 48 | ) 49 | 50 | individuals %>% head(20) 51 | ``` 52 | 53 | And here the first 20 SNP rows: 54 | 55 | ```{r} 56 | snps <- readr::read_delim( 57 | "data/popgen_course/genotypes_small.snp", 58 | delim = " ", 59 | trim_ws = T, 60 | col_names = c( 61 | "SNP_name", 62 | "chromosome", 63 | "genetic_position", 64 | "physical_position", 65 | "reference_allele", 66 | "alternative_allele" 67 | ) 68 | ) 69 | ``` 70 | 71 | And here are the first 20 genotypes of the first 50 individuals: 72 | 73 | ```{r} 74 | geno <- readr::read_lines( 75 | "data/popgen_course/genotypes_small.geno", 76 | n_max = 20 77 | ) 78 | 79 | geno %>% substr(1, 50) 80 | ``` 81 | 82 | Counting how many individuals and SNPs there are: 83 | 84 | ```{r} 85 | nrow(individuals) 86 | nrow(snps) 87 | ``` 88 | 89 | And now we check that the first row of the `*.geno` file indeed contains the same number of columns: 90 | 91 | ```{r} 92 | nchar(geno[1]) 93 | ``` 94 | 95 | Now counting the number of rows in the 
`*.geno`-file (this takes a few seconds, as the file is several hundred MB large): 96 | 97 | ```{r} 98 | R.utils::countLines("data/popgen_course/genotypes_small.geno") %>% as.integer() 99 | ``` 100 | 101 | Great, the number of rows and columns agrees with the numbers indicated in the `*.ind` and `*.snp` files! Now we're counting how many different populations there are. Let's first see the first 10 populations in the sorted list, alongside the number of individuals in each group: 102 | 103 | ```{r} 104 | individuals %>% 105 | dplyr::group_by(population) %>% 106 | dplyr::count() 107 | ``` 108 | 109 | ## How PCA works 110 | 111 | To understand how PCA works, consider a single individual and its representation by its 593,124 markers. Formally, each individual is a point in a 593,124-dimensional space, where each dimension can take only the three possible genotypes indicated above, or have missing data. To visualise this high-dimensional dataset, we would like to project it down to two dimensions. But as there are many ways to project the shadow of a three-dimensional object on a two-dimensional plane, there are many (and even more) ways to project a 593,124-dimensional cloud of points to two dimensions. What PCA does is figuring out the \"best\" way to do this projection in order to visualise the major components of variance in the data. 112 | 113 | For actually running the analysis, we use a program called `smartPCA` from the [Eigensoft package](https://github.com/DReichLab/EIG). Like many other tools from this and related packages, `smartPCA` reads in a parameter file which specifies its input and output files and options. 
In our case, we want the parameter file to have the following content: 114 | 115 | ``` 116 | genotypename: data/popgen_course/genotypes_small.geno 117 | snpname: data/popgen_course/genotypes_small.snp 118 | indivname: data/popgen_course/genotypes_small.ind 119 | evecoutname: pca.WestEurasia.evec 120 | evaloutname: pca.WestEurasia.eval 121 | poplistname: data/popgen_course/WestEurasia.poplist.txt 122 | lsqproject: YES 123 | numoutevec: 4 124 | numthreads: 1 125 | ``` 126 | 127 | Here, the first three parameters specify the input genotype files. The next two rows specify two output file names, typically with ending `*.evec` and `*.eval`. The parameter line beginning with `poplistname` contains a file with a list of populations used for calculating the principal components (see below). The option `lsqproject` is important for applications including ancient DNA with lots of missing data, which I will not elaborate on. For the purpose of this workshop, you should use `lsqproject: YES`. The next option `numoutevec` specifies the number of principal components that we compute, the last option `numthreads` the number of CPUs to use for this run. We use just one since we're working together on the same computer, so cannot afford everyone running on lots of CPUs. 128 | 129 | ## Population lists vs. Projection 130 | 131 | The parameter named `poplistname` is a very crucial one. It specifies the populations whose individuals are used to calculate the principal components. Why not just all of them you ask? For two reasons: First, there are simply too many of them and we don't want to use all of them, since the computation would take too long. More importantly, however, we generally try to avoid using ancient samples to compute principal components, to avoid specific ancient-DNA related artefacts affecting the computation. Finally, the list of populations to use for PCA should be informed by your question. 
If you're investigating African population structure, it makes no sense to put Asian or European individuals in your population list, since then the main axes of genetic differentiation would not be inside of Africa, but between Africans and Non-Africans. 132 | 133 | So what happens to individuals that are not in populations listed in the population list? Well, fortunately, they are not just ignored, but \"projected\". This means that after the principal components have been computed, *all* individuals (not just the ones in the list) are projected onto these principal components. That way, we can visualise ancient populations in the context of modern genetic variation. While that may sound a bit problematic at first (some variation in ancient populations is not represented well by modern populations), it nevertheless turns out to be one of the most useful tools for this purpose. The advantage of preventing ancient-DNA artefacts and batch effects from affecting the visualisation outweighs the disadvantage of missing some private genetic variation components in the ancient populations themselves. Of course, that argument breaks down once the analysed populations become too ancient and detached from modern genetic variation. But for our purposes it will work just fine. 134 | 135 | For this workshop, I prepared two population lists: 136 | 137 | ``` 138 | data/popgen_course/WestEurasia.poplist.txt 139 | data/popgen_course/AllEurasia.poplist.txt 140 | ``` 141 | 142 | As you can tell from the names of the files, they specify two sets of modern populations representing West Eurasia or all of Europe and Asia, respectively. 143 | 144 | I recommend looking through both of the population lists and googling some population names that you don't recognise to get a feeling for the ethnic groups represented here. 145 | 146 | ## Running `smartPCA` 147 | 148 | Now go ahead and open a new text file using your Jupyter Browser; you can name it anything you like. 
For the sake of a concrete name, let's call it `pca.WestEurasia.params.txt`. Text files in Jupyter are opened in a text editor, so you can then simply copy-paste the above lines into the new file. 149 | 150 | ```{r} 151 | readr::write_lines(c( 152 | "genotypename: data/popgen_course/genotypes_small.geno", 153 | "snpname: data/popgen_course/genotypes_small.snp", 154 | "indivname: data/popgen_course/genotypes_small.ind", 155 | "evecoutname: pca.WestEurasia.evec", 156 | "evaloutname: pca.WestEurasia.eval", 157 | "poplistname: data/popgen_course/WestEurasia.poplist.txt", 158 | "lsqproject: YES", 159 | "numoutevec: 4", 160 | "numthreads: 1" 161 | ), 162 | path = "pca.WestEurasia.params.txt" 163 | ) 164 | ``` 165 | 166 | Let's see whether it worked, by printing out the contents of that file into your notebook: 167 | 168 | ```{r} 169 | readr::read_lines( 170 | "pca.WestEurasia.params.txt" 171 | ) 172 | ``` 173 | 174 | Great, so that's our parameter file for running `smartPCA`. 175 | 176 | **Note:** We specified two output files in our parameter file, here called `pca.WestEurasia.evec` and `pca.WestEurasia.eval`. You can actually put any names you want in there. But beware of relative vs. absolute paths. File names starting with `/` are considered \"absolute\", that is, taken to go from the root of the file system. In contrast, filenames not starting with `/` are considered \"relative\" to the current working directory. If you forgot which directory you're in, run `getwd()` in R (or `pwd` in a terminal). 177 | 178 | **Note:** The option `poplistname` is a crucial one. Here you need to specify which populations are used to compute the eigenvectors of the principal components analysis. In our case, I have prepared two population list files: `data/popgen_course/WestEurasia.poplist.txt` and `data/popgen_course/AllEurasia.poplist.txt`. Pick one of the two to carry on. 179 | 180 | Good, now we can run `smartPCA`. To do that, it's more convenient to use the terminal than an Rmarkdown file. 
So open a terminal and run 181 | 182 | ``` 183 | smartpca -p pca.WestEurasia.params.txt 184 | ``` 185 | 186 | This will typically run for about 30 minutes and output lots of logging output to the screen. 187 | 188 | In a similar manner we can prepare a parameter file for the AllEurasia population list. This is how it should look: 189 | 190 | ``` 191 | genotypename: data/popgen_course/genotypes_small.geno 192 | snpname: data/popgen_course/genotypes_small.snp 193 | indivname: data/popgen_course/genotypes_small.ind 194 | evecoutname: pca.AllEurasia.evec 195 | evaloutname: pca.AllEurasia.eval 196 | poplistname: data/popgen_course/AllEurasia.poplist.txt 197 | lsqproject: YES 198 | numoutevec: 4 199 | numthreads: 1 200 | ``` 201 | 202 | And similar to the command above, we can run pca on the AllEurasia population list via: 203 | 204 | ``` 205 | smartpca -p pca.AllEurasia.params.txt 206 | ``` 207 | 208 | which will run slightly longer than the first one because there are more populations. 209 | -------------------------------------------------------------------------------- /pca.WestEurasia.eval: -------------------------------------------------------------------------------- 1 | 6.288558 2 | 3.094820 3 | 2.692703 4 | 2.010356 5 | 1.742554 6 | 1.689588 7 | 1.626518 8 | 1.616861 9 | 1.590569 10 | 1.589779 11 | 1.580970 12 | 1.570292 13 | 1.562215 14 | 1.556105 15 | 1.550321 16 | 1.543074 17 | 1.540315 18 | 1.534597 19 | 1.524939 20 | 1.524176 21 | 1.520778 22 | 1.511707 23 | 1.505727 24 | 1.501742 25 | 1.494597 26 | 1.489205 27 | 1.484356 28 | 1.477644 29 | 1.469235 30 | 1.466139 31 | 1.464872 32 | 1.453282 33 | 1.439786 34 | 1.436309 35 | 1.424679 36 | 1.422132 37 | 1.416346 38 | 1.411205 39 | 1.407204 40 | 1.402511 41 | 1.399908 42 | 1.397946 43 | 1.395569 44 | 1.390360 45 | 1.385084 46 | 1.378586 47 | 1.377115 48 | 1.374029 49 | 1.370217 50 | 1.362597 51 | 1.360695 52 | 1.359427 53 | 1.354377 54 | 1.350525 55 | 1.344736 56 | 1.343233 57 | 1.339959 58 | 1.338966 59 | 
1.335077 60 | 1.330144 61 | 1.323053 62 | 1.320174 63 | 1.317857 64 | 1.315216 65 | 1.310151 66 | 1.308080 67 | 1.305296 68 | 1.302001 69 | 1.300860 70 | 1.289421 71 | 1.287915 72 | 1.285110 73 | 1.279333 74 | 1.275902 75 | 1.271351 76 | 1.269675 77 | 1.267124 78 | 1.264680 79 | 1.262351 80 | 1.259080 81 | 1.257861 82 | 1.255196 83 | 1.252261 84 | 1.250030 85 | 1.247348 86 | 1.245184 87 | 1.242175 88 | 1.239379 89 | 1.238133 90 | 1.236569 91 | 1.232437 92 | 1.230241 93 | 1.228187 94 | 1.225265 95 | 1.223943 96 | 1.222205 97 | 1.219820 98 | 1.217824 99 | 1.215598 100 | 1.213455 101 | 1.209457 102 | 1.208494 103 | 1.205137 104 | 1.203316 105 | 1.201609 106 | 1.200145 107 | 1.196033 108 | 1.192887 109 | 1.192256 110 | 1.191376 111 | 1.188856 112 | 1.187536 113 | 1.185889 114 | 1.184822 115 | 1.181227 116 | 1.178798 117 | 1.175043 118 | 1.173146 119 | 1.171438 120 | 1.168925 121 | 1.166938 122 | 1.165445 123 | 1.164524 124 | 1.163296 125 | 1.161620 126 | 1.160886 127 | 1.159673 128 | 1.158730 129 | 1.155633 130 | 1.153882 131 | 1.151985 132 | 1.151777 133 | 1.149913 134 | 1.149450 135 | 1.147317 136 | 1.142969 137 | 1.142523 138 | 1.141568 139 | 1.138510 140 | 1.136462 141 | 1.134942 142 | 1.134773 143 | 1.133566 144 | 1.132394 145 | 1.130553 146 | 1.128126 147 | 1.127649 148 | 1.126379 149 | 1.125571 150 | 1.123615 151 | 1.121786 152 | 1.119842 153 | 1.119581 154 | 1.118005 155 | 1.116301 156 | 1.115939 157 | 1.114271 158 | 1.113468 159 | 1.111258 160 | 1.110785 161 | 1.109414 162 | 1.108017 163 | 1.105467 164 | 1.104533 165 | 1.103341 166 | 1.102456 167 | 1.101475 168 | 1.099997 169 | 1.098662 170 | 1.098401 171 | 1.095494 172 | 1.094875 173 | 1.094238 174 | 1.093169 175 | 1.091754 176 | 1.090332 177 | 1.089938 178 | 1.089230 179 | 1.087557 180 | 1.087194 181 | 1.086377 182 | 1.084882 183 | 1.084031 184 | 1.083202 185 | 1.082490 186 | 1.081516 187 | 1.078116 188 | 1.076987 189 | 1.075117 190 | 1.074305 191 | 1.074135 192 | 1.072921 193 | 1.071640 194 | 1.071192 195 | 
1.069853 196 | 1.069550 197 | 1.067037 198 | 1.066036 199 | 1.065113 200 | 1.063452 201 | 1.062919 202 | 1.061595 203 | 1.060710 204 | 1.060346 205 | 1.059358 206 | 1.059034 207 | 1.056998 208 | 1.056422 209 | 1.055168 210 | 1.054756 211 | 1.054162 212 | 1.052285 213 | 1.051447 214 | 1.050835 215 | 1.050287 216 | 1.048713 217 | 1.047416 218 | 1.046505 219 | 1.046476 220 | 1.045557 221 | 1.044029 222 | 1.042965 223 | 1.042316 224 | 1.041413 225 | 1.040081 226 | 1.038227 227 | 1.037778 228 | 1.036910 229 | 1.035252 230 | 1.034431 231 | 1.034115 232 | 1.032947 233 | 1.031785 234 | 1.031263 235 | 1.030600 236 | 1.030253 237 | 1.028534 238 | 1.027096 239 | 1.026720 240 | 1.025726 241 | 1.025525 242 | 1.025322 243 | 1.024046 244 | 1.023472 245 | 1.022728 246 | 1.021405 247 | 1.021292 248 | 1.020123 249 | 1.019659 250 | 1.018497 251 | 1.017623 252 | 1.016996 253 | 1.016558 254 | 1.015365 255 | 1.014977 256 | 1.014861 257 | 1.013228 258 | 1.012071 259 | 1.011203 260 | 1.010988 261 | 1.010408 262 | 1.009380 263 | 1.007719 264 | 1.006532 265 | 1.006188 266 | 1.005178 267 | 1.004976 268 | 1.004504 269 | 1.003641 270 | 1.003191 271 | 1.002226 272 | 1.001685 273 | 1.000463 274 | 1.000337 275 | 0.999162 276 | 0.999093 277 | 0.998314 278 | 0.996493 279 | 0.996077 280 | 0.995453 281 | 0.994081 282 | 0.993268 283 | 0.992901 284 | 0.992295 285 | 0.991612 286 | 0.990978 287 | 0.989752 288 | 0.989216 289 | 0.988180 290 | 0.987540 291 | 0.986679 292 | 0.986364 293 | 0.986057 294 | 0.984414 295 | 0.983226 296 | 0.982901 297 | 0.981817 298 | 0.981467 299 | 0.980461 300 | 0.980450 301 | 0.978894 302 | 0.978277 303 | 0.977965 304 | 0.976690 305 | 0.976150 306 | 0.975217 307 | 0.974853 308 | 0.974546 309 | 0.973920 310 | 0.972813 311 | 0.972662 312 | 0.971645 313 | 0.970437 314 | 0.970208 315 | 0.969803 316 | 0.968331 317 | 0.967328 318 | 0.967162 319 | 0.966423 320 | 0.965545 321 | 0.965118 322 | 0.964917 323 | 0.964216 324 | 0.964092 325 | 0.963881 326 | 0.962568 327 | 0.962315 328 | 
0.961592 329 | 0.961002 330 | 0.959286 331 | 0.958323 332 | 0.957938 333 | 0.957303 334 | 0.956762 335 | 0.955824 336 | 0.955388 337 | 0.954092 338 | 0.953812 339 | 0.952941 340 | 0.952400 341 | 0.951429 342 | 0.951262 343 | 0.950638 344 | 0.949361 345 | 0.949126 346 | 0.948465 347 | 0.948398 348 | 0.947911 349 | 0.946306 350 | 0.945961 351 | 0.945585 352 | 0.945162 353 | 0.944826 354 | 0.944243 355 | 0.942899 356 | 0.941973 357 | 0.941903 358 | 0.941103 359 | 0.940600 360 | 0.940065 361 | 0.939547 362 | 0.938725 363 | 0.938459 364 | 0.937967 365 | 0.937326 366 | 0.936784 367 | 0.935674 368 | 0.934585 369 | 0.934092 370 | 0.933774 371 | 0.932926 372 | 0.932327 373 | 0.932100 374 | 0.931712 375 | 0.931148 376 | 0.930232 377 | 0.929202 378 | 0.928810 379 | 0.928522 380 | 0.927521 381 | 0.927328 382 | 0.926623 383 | 0.926042 384 | 0.925250 385 | 0.924349 386 | 0.923473 387 | 0.923142 388 | 0.922366 389 | 0.921817 390 | 0.920906 391 | 0.920704 392 | 0.920235 393 | 0.919048 394 | 0.918843 395 | 0.917704 396 | 0.917045 397 | 0.916487 398 | 0.916337 399 | 0.916258 400 | 0.914903 401 | 0.914464 402 | 0.914041 403 | 0.913881 404 | 0.913094 405 | 0.912843 406 | 0.911970 407 | 0.911049 408 | 0.910202 409 | 0.909671 410 | 0.909440 411 | 0.908489 412 | 0.908079 413 | 0.907741 414 | 0.907548 415 | 0.906737 416 | 0.906195 417 | 0.905644 418 | 0.905111 419 | 0.904308 420 | 0.904040 421 | 0.903366 422 | 0.903223 423 | 0.902615 424 | 0.901622 425 | 0.901467 426 | 0.901179 427 | 0.900098 428 | 0.899927 429 | 0.899792 430 | 0.898742 431 | 0.897845 432 | 0.897437 433 | 0.896148 434 | 0.896008 435 | 0.895563 436 | 0.895410 437 | 0.894941 438 | 0.894237 439 | 0.893422 440 | 0.892932 441 | 0.891691 442 | 0.891632 443 | 0.891039 444 | 0.890625 445 | 0.889614 446 | 0.888892 447 | 0.888194 448 | 0.887807 449 | 0.887436 450 | 0.887371 451 | 0.886883 452 | 0.886195 453 | 0.885164 454 | 0.884265 455 | 0.884077 456 | 0.883718 457 | 0.883042 458 | 0.882538 459 | 0.882156 460 | 0.881516 461 | 
0.881069 462 | 0.880161 463 | 0.879962 464 | 0.879528 465 | 0.878992 466 | 0.878825 467 | 0.878309 468 | 0.877843 469 | 0.877039 470 | 0.876641 471 | 0.876319 472 | 0.875079 473 | 0.874339 474 | 0.874230 475 | 0.873874 476 | 0.872857 477 | 0.872205 478 | 0.871989 479 | 0.871395 480 | 0.871152 481 | 0.870044 482 | 0.869753 483 | 0.869425 484 | 0.868858 485 | 0.867088 486 | 0.866709 487 | 0.866390 488 | 0.865938 489 | 0.865212 490 | 0.864425 491 | 0.864098 492 | 0.863748 493 | 0.863379 494 | 0.862923 495 | 0.862344 496 | 0.861429 497 | 0.860924 498 | 0.860355 499 | 0.859607 500 | 0.859477 501 | 0.858994 502 | 0.858649 503 | 0.857258 504 | 0.857078 505 | 0.856810 506 | 0.856177 507 | 0.855982 508 | 0.855824 509 | 0.855144 510 | 0.854475 511 | 0.853987 512 | 0.853647 513 | 0.852613 514 | 0.851769 515 | 0.851541 516 | 0.851137 517 | 0.850515 518 | 0.850446 519 | 0.849870 520 | 0.849277 521 | 0.848720 522 | 0.848360 523 | 0.847259 524 | 0.847193 525 | 0.846994 526 | 0.845861 527 | 0.845364 528 | 0.845043 529 | 0.844380 530 | 0.843360 531 | 0.842825 532 | 0.842303 533 | 0.841749 534 | 0.840919 535 | 0.840577 536 | 0.840009 537 | 0.839850 538 | 0.839015 539 | 0.838237 540 | 0.837920 541 | 0.837112 542 | 0.836878 543 | 0.836296 544 | 0.835706 545 | 0.835160 546 | 0.834576 547 | 0.834100 548 | 0.833720 549 | 0.832415 550 | 0.832231 551 | 0.832171 552 | 0.831552 553 | 0.831134 554 | 0.830803 555 | 0.829452 556 | 0.829250 557 | 0.828797 558 | 0.828511 559 | 0.828037 560 | 0.827513 561 | 0.827326 562 | 0.826752 563 | 0.826086 564 | 0.825650 565 | 0.824973 566 | 0.824409 567 | 0.823716 568 | 0.823206 569 | 0.822073 570 | 0.821757 571 | 0.821173 572 | 0.820873 573 | 0.820026 574 | 0.819839 575 | 0.818535 576 | 0.817973 577 | 0.817111 578 | 0.816751 579 | 0.816403 580 | 0.815699 581 | 0.815324 582 | 0.815057 583 | 0.814641 584 | 0.813534 585 | 0.812941 586 | 0.812452 587 | 0.811794 588 | 0.811542 589 | 0.810991 590 | 0.810724 591 | 0.809677 592 | 0.809547 593 | 0.808589 594 | 
0.808439 595 | 0.807674 596 | 0.806734 597 | 0.806092 598 | 0.805974 599 | 0.805148 600 | 0.804800 601 | 0.804496 602 | 0.803970 603 | 0.803488 604 | 0.802755 605 | 0.802544 606 | 0.801909 607 | 0.800832 608 | 0.800408 609 | 0.799845 610 | 0.799108 611 | 0.798765 612 | 0.797802 613 | 0.797240 614 | 0.796978 615 | 0.796476 616 | 0.796009 617 | 0.795472 618 | 0.795240 619 | 0.794519 620 | 0.794289 621 | 0.793201 622 | 0.792818 623 | 0.792018 624 | 0.791806 625 | 0.791236 626 | 0.790462 627 | 0.789405 628 | 0.789034 629 | 0.788772 630 | 0.788432 631 | 0.787917 632 | 0.787389 633 | 0.786096 634 | 0.785929 635 | 0.785572 636 | 0.785046 637 | 0.784438 638 | 0.784134 639 | 0.783391 640 | 0.783121 641 | 0.782414 642 | 0.781586 643 | 0.780389 644 | 0.780236 645 | 0.779400 646 | 0.778745 647 | 0.778569 648 | 0.777887 649 | 0.776693 650 | 0.776351 651 | 0.776070 652 | 0.774920 653 | 0.774418 654 | 0.774176 655 | 0.773834 656 | 0.773016 657 | 0.771908 658 | 0.771459 659 | 0.770713 660 | 0.769870 661 | 0.769206 662 | 0.768963 663 | 0.767738 664 | 0.767465 665 | 0.766510 666 | 0.765908 667 | 0.765482 668 | 0.765061 669 | 0.764236 670 | 0.764026 671 | 0.763117 672 | 0.761653 673 | 0.761508 674 | 0.761167 675 | 0.760186 676 | 0.759873 677 | 0.759109 678 | 0.757924 679 | 0.757252 680 | 0.756537 681 | 0.756182 682 | 0.755559 683 | 0.754790 684 | 0.753408 685 | 0.752626 686 | 0.751722 687 | 0.751316 688 | 0.750610 689 | 0.750227 690 | 0.749308 691 | 0.748432 692 | 0.747693 693 | 0.747260 694 | 0.746210 695 | 0.744726 696 | 0.744155 697 | 0.743165 698 | 0.742642 699 | 0.742262 700 | 0.740150 701 | 0.739781 702 | 0.738910 703 | 0.738168 704 | 0.737248 705 | 0.736519 706 | 0.735534 707 | 0.735123 708 | 0.734035 709 | 0.733200 710 | 0.731250 711 | 0.729438 712 | 0.727373 713 | 0.727101 714 | 0.726889 715 | 0.724111 716 | 0.721821 717 | 0.718911 718 | 0.694921 719 | 0.685380 720 | -0.000000 721 | -------------------------------------------------------------------------------- 
/01_bashnb_getting_started.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Introduction to Jupyter" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "## Basic Usage" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "When you first access Jupyter, you will get a file browser view of your home directory on the server. In the beginning, your home directory will be empty, and will be populated with notebooks and files throughout this workshop.\n", 22 | "\n", 23 | "To create a new text file, click on New (in the upper right corner) and then Text File, which opens a text editor within your browser. You can now add content into the file, or edit existing content and save. The filename can be changed by clicking into the Filename on top. You can now go back to your file browser window and update using the button with the two arrows in the upper right corner, and you should see your text file saved in your home directory.\n", 24 | "\n", 25 | "You can also use Jupyter to open a Terminal within the browser: Click on New and then Terminal, which will open a terminal window in a separate browser tab. You can enter Unix Bash commands to change directories, view files or execute programs (as we will learn below).\n", 26 | "\n", 27 | "Finally, you can create new Folders by clicking on New and then Folder. To rename the new folder, click on the checkbox beside the new folder, and click the Rename button that appears on top. To change into the new folder, click on it. To move back, click on the parent folder appearing on top of the file browser." 28 | ] 29 | }, 30 | { 31 | "cell_type": "markdown", 32 | "metadata": {}, 33 | "source": [ 34 | "***Exercise:*** Create a new folder called hello, and a text file within that folder using Jupyter. 
Name that text file hello.txt and fill it with arbitrary content, such as `Hello, World!`. Then open a terminal and output the contents of the new text file by typing `cat hello/hello.txt` followed by ENTER." 35 | ] 36 | }, 37 | { 38 | "cell_type": "markdown", 39 | "metadata": {}, 40 | "source": [ 41 | "***Note:*** While the Jupyter terminal and Jupyter Text Files are different ways to interact with the server, both access the same file system. So files created with the Text Editor are saved in your home directory, and can be accessed via the terminal, and vice versa: Files created via the Terminal can be accessed via the Text Editor, by simply clicking on them in the Jupyter File Browser." 42 | ] 43 | }, 44 | { 45 | "cell_type": "markdown", 46 | "metadata": {}, 47 | "source": [ 48 | "## Notebooks" 49 | ] 50 | }, 51 | { 52 | "cell_type": "markdown", 53 | "metadata": {}, 54 | "source": [ 55 | "Notebooks can be loaded for different underlying kernels: bash, python and R. Notebooks are useful to document interactive data analysis. They combine code cells with markdown cells. A markdown cell can contain text, math or headings. " 56 | ] 57 | }, 58 | { 59 | "cell_type": "markdown", 60 | "metadata": {}, 61 | "source": [ 62 | "You can create new bash notebooks using the \"New\" Dropdown list in the Jupyter File Browser and then selecting \"Bash\". Notebooks open if you click on them." 63 | ] 64 | }, 65 | { 66 | "cell_type": "markdown", 67 | "metadata": {}, 68 | "source": [ 69 | "In Jupyter notebooks, you work with *Cells*. You can create new cells, or insert them above or below existing cells using the menu items in the `Insert` menu. Use the dropdown list in the command bar in Jupyter to change the type of the cell. The two main types we're going to use are `Markdown` and `Code`. Markdown cells are useful for documenting stuff, Code cells for running code. Markdown cells can be edited by double-clicking into them. Render them by pressing Shift-Enter." 
70 | ] 71 | }, 72 | { 73 | "cell_type": "markdown", 74 | "metadata": {}, 75 | "source": [ 76 | "Code cells are used to enter and execute code. Let's look at some examples." 77 | ] 78 | }, 79 | { 80 | "cell_type": "markdown", 81 | "metadata": {}, 82 | "source": [ 83 | "We can first check which directory we are in, using the `pwd` (=Print Working Directory) command:" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": 2, 89 | "metadata": {}, 90 | "outputs": [ 91 | { 92 | "name": "stdout", 93 | "output_type": "stream", 94 | "text": [ 95 | "/home/stephan/popgen_course\n" 96 | ] 97 | } 98 | ], 99 | "source": [ 100 | "pwd" 101 | ] 102 | }, 103 | { 104 | "cell_type": "markdown", 105 | "metadata": {}, 106 | "source": [ 107 | "OK, so we're in the `popgen_course` subfolder within our home folder `/home/stephan`. We can list the contents of that folder:" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": 5, 113 | "metadata": {}, 114 | "outputs": [ 115 | { 116 | "name": "stdout", 117 | "output_type": "stream", 118 | "text": [ 119 | "01_bashnb_getting_started.ipynb pca.AllEurasia.params.txt\n", 120 | "02_pynb_getting_started.ipynb\t pca.WestEurasia.eval\n", 121 | "03_bashnb_smartpca.ipynb\t pca.WestEurasia.evec\n", 122 | "04_pynb_plotting_pca.ipynb\t pca.WestEurasia.params.txt\n", 123 | "pca.AllEurasia.eval\t\t population_frequencies.txt\n", 124 | "pca.AllEurasia.evec\t\t README.md\n" 125 | ] 126 | } 127 | ], 128 | "source": [ 129 | "ls" 130 | ] 131 | }, 132 | { 133 | "cell_type": "markdown", 134 | "metadata": {}, 135 | "source": [ 136 | "We can now create a new directory:" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": 6, 142 | "metadata": {}, 143 | "outputs": [], 144 | "source": [ 145 | "mkdir testDir" 146 | ] 147 | }, 148 | { 149 | "cell_type": "markdown", 150 | "metadata": {}, 151 | "source": [ 152 | "and change into that directory:" 153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | 
"execution_count": 7, 158 | "metadata": {}, 159 | "outputs": [], 160 | "source": [ 161 | "cd testDir" 162 | ] 163 | }, 164 | { 165 | "cell_type": "markdown", 166 | "metadata": {}, 167 | "source": [ 168 | "and confirm that we are now in the new dir:" 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": 8, 174 | "metadata": {}, 175 | "outputs": [ 176 | { 177 | "name": "stdout", 178 | "output_type": "stream", 179 | "text": [ 180 | "/home/stephan/popgen_course/testDir\n" 181 | ] 182 | } 183 | ], 184 | "source": [ 185 | "pwd" 186 | ] 187 | }, 188 | { 189 | "cell_type": "markdown", 190 | "metadata": {}, 191 | "source": [ 192 | "OK, let's go back and delete the subfolder again:" 193 | ] 194 | }, 195 | { 196 | "cell_type": "code", 197 | "execution_count": 9, 198 | "metadata": {}, 199 | "outputs": [], 200 | "source": [ 201 | "cd ..\n", 202 | "rm -r testDir" 203 | ] 204 | }, 205 | { 206 | "cell_type": "markdown", 207 | "metadata": {}, 208 | "source": [ 209 | "Here is a simple example of how to use ``echo``:" 210 | ] 211 | }, 212 | { 213 | "cell_type": "code", 214 | "execution_count": 10, 215 | "metadata": {}, 216 | "outputs": [ 217 | { 218 | "name": "stdout", 219 | "output_type": "stream", 220 | "text": [ 221 | "Hello, how are you?\n" 222 | ] 223 | } 224 | ], 225 | "source": [ 226 | "echo \"Hello, how are you?\"" 227 | ] 228 | }, 229 | { 230 | "cell_type": "markdown", 231 | "metadata": {}, 232 | "source": [ 233 | "OK, so let's try some more useful things with ``grep``, which can be used to filter large text files by searching for patterns, in this case just the occurrence of the word \"French\":" 234 | ] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": 11, 239 | "metadata": {}, 240 | "outputs": [ 241 | { 242 | "name": "stdout", 243 | "output_type": "stream", 244 | "text": [ 245 | " HGDP00511 M French\n", 246 | " HGDP00512 M French\n", 247 | " HGDP00513 F French\n", 248 | " HGDP00514 F French\n", 249 | " HGDP00515 M French\n", 250 | 
" HGDP00516 F French\n", 251 | " HGDP00517 F French\n", 252 | " HGDP00518 M French\n", 253 | " HGDP00519 M French\n", 254 | " HGDP00522 M French\n", 255 | " HGDP00523 F French\n", 256 | " HGDP00524 F French\n", 257 | " HGDP00525 M French\n", 258 | " HGDP00526 F French\n", 259 | " HGDP00527 F French\n", 260 | " HGDP00528 M French\n", 261 | " HGDP00529 F French\n", 262 | " HGDP00531 F French\n", 263 | " HGDP00533 M French\n", 264 | " HGDP00534 F French\n", 265 | " HGDP00535 F French\n", 266 | " HGDP00536 F French\n", 267 | " HGDP00537 F French\n", 268 | " HGDP00538 M French\n", 269 | " HGDP00539 F French\n", 270 | " SouthFrench3326 M French\n", 271 | " SouthFrench3947 M French\n", 272 | " SouthFrench1323 M French\n", 273 | " SouthFrench3951 M French\n", 274 | " SouthFrench3068 M French\n", 275 | " SouthFrench1112 M French\n", 276 | " SouthFrench4018 M French\n" 277 | ] 278 | } 279 | ], 280 | "source": [ 281 | "grep French /data/popgen_course/genotypes_small.ind" 282 | ] 283 | }, 284 | { 285 | "cell_type": "markdown", 286 | "metadata": {}, 287 | "source": [ 288 | "Alright, so that lists all French individuals. Now let's count them, by simply passing the flag `-c`:" 289 | ] 290 | }, 291 | { 292 | "cell_type": "code", 293 | "execution_count": 12, 294 | "metadata": {}, 295 | "outputs": [ 296 | { 297 | "name": "stdout", 298 | "output_type": "stream", 299 | "text": [ 300 | "32\n" 301 | ] 302 | } 303 | ], 304 | "source": [ 305 | "grep -c French /data/popgen_course/genotypes_small.ind" 306 | ] 307 | }, 308 | { 309 | "cell_type": "markdown", 310 | "metadata": {}, 311 | "source": [ 312 | "***Note:*** We so far have seen the `pwd`, `mkdir`, `cd`, `rm`, `ls` and `grep` commands. If you want to find out more about those, just google them, they are among the most popular and widely used commands/programs in Unix." 
313 | ] 314 | }, 315 | { 316 | "cell_type": "markdown", 317 | "metadata": {}, 318 | "source": [ 319 | "In Python3 notebooks you can plot things: Create a new python3 notebook, and run this boilerplate code in the first cell:\n", 320 | "\n", 321 | " %matplotlib inline\n", 322 | " import matplotlib.pyplot as plt\n", 323 | "\n", 324 | "Then plot something, opening a second cell:\n", 325 | "\n", 326 | "***Exercise:*** Create a simple plot using `plt.plot([1, 2, 3], [5, 2, 6])`\n" 327 | ] 328 | }, 329 | { 330 | "cell_type": "markdown", 331 | "metadata": {}, 332 | "source": [ 333 | "# Bash Pipes" 334 | ] 335 | }, 336 | { 337 | "cell_type": "markdown", 338 | "metadata": {}, 339 | "source": [ 340 | "OK. So this first Notebook operates on Bash, which is more or less the lingua franca of Linux operating systems. Everything you do on command lines uses bash. One of the most useful techniques in bash scripting or bash commands are Unix pipes. To illustrate them, consider the following." 341 | ] 342 | }, 343 | { 344 | "cell_type": "markdown", 345 | "metadata": {}, 346 | "source": [ 347 | "Let's look at the structure of our ``ind`` file:" 348 | ] 349 | }, 350 | { 351 | "cell_type": "code", 352 | "execution_count": 4, 353 | "metadata": {}, 354 | "outputs": [ 355 | { 356 | "name": "stdout", 357 | "output_type": "stream", 358 | "text": [ 359 | " Yuk_009 M Yukagir\n", 360 | " Yuk_025 F Yukagir\n", 361 | " Yuk_022 F Yukagir\n", 362 | " Yuk_020 F Yukagir\n", 363 | " MC_40 M Chukchi\n", 364 | " Yuk_024 F Yukagir\n", 365 | " Yuk_023 F Yukagir\n", 366 | " MC_16 M Chukchi\n", 367 | " MC_15 F Chukchi\n", 368 | " MC_18 M Chukchi\n" 369 | ] 370 | } 371 | ], 372 | "source": [ 373 | "head /data/popgen_course/genotypes_small.ind" 374 | ] 375 | }, 376 | { 377 | "cell_type": "markdown", 378 | "metadata": {}, 379 | "source": [ 380 | "***Note:*** The `head` command just lists the top 10 rows of a file." 
381 | ] 382 | }, 383 | { 384 | "cell_type": "markdown", 385 | "metadata": {}, 386 | "source": [ 387 | "Let's filter out the population column:" 388 | ] 389 | }, 390 | { 391 | "cell_type": "code", 392 | "execution_count": 5, 393 | "metadata": {}, 394 | "outputs": [ 395 | { 396 | "name": "stdout", 397 | "output_type": "stream", 398 | "text": [ 399 | "Yukagir\n", 400 | "Yukagir\n", 401 | "Yukagir\n", 402 | "Yukagir\n", 403 | "Chukchi\n", 404 | "Yukagir\n", 405 | "Yukagir\n", 406 | "Chukchi\n", 407 | "Chukchi\n", 408 | "Chukchi\n" 409 | ] 410 | } 411 | ], 412 | "source": [ 413 | "head /data/popgen_course/genotypes_small.ind | awk '{print $3}'" 414 | ] 415 | }, 416 | { 417 | "cell_type": "markdown", 418 | "metadata": {}, 419 | "source": [ 420 | "***Note:*** The `awk` program is one of the most powerful programs for text-file processing in the Unix-world. It is actually a full-fledged programming language itself. Here we only use it in one of its simplest form. The program `{print $3}` simply says \"For every line of the input file, print out the third field\"." 421 | ] 422 | }, 423 | { 424 | "cell_type": "markdown", 425 | "metadata": {}, 426 | "source": [ 427 | "***Note:*** The pipe symbol `|` tells Unix to redirect the output of the program to its left into the program to its right as standard input. 
" 428 | ] 429 | }, 430 | { 431 | "cell_type": "markdown", 432 | "metadata": {}, 433 | "source": [ 434 | "Let's sort the output (notice we now use ``cat`` instead of ``head``, but use ``head`` in the end:" 435 | ] 436 | }, 437 | { 438 | "cell_type": "code", 439 | "execution_count": 8, 440 | "metadata": {}, 441 | "outputs": [ 442 | { 443 | "name": "stdout", 444 | "output_type": "stream", 445 | "text": [ 446 | "Abkhasian\n", 447 | "Abkhasian\n", 448 | "Abkhasian\n", 449 | "Abkhasian\n", 450 | "Abkhasian\n", 451 | "Abkhasian\n", 452 | "Abkhasian\n", 453 | "Abkhasian\n", 454 | "Abkhasian\n", 455 | "Adygei\n", 456 | "sort: Schreiben fehlgeschlagen: Standardausgabe: Datenübergabe unterbrochen (broken pipe)\n", 457 | "sort: Schreibfehler\n" 458 | ] 459 | } 460 | ], 461 | "source": [ 462 | "cat /data/popgen_course/genotypes_small.ind | awk '{print $3}' | sort | head" 463 | ] 464 | }, 465 | { 466 | "cell_type": "markdown", 467 | "metadata": {}, 468 | "source": [ 469 | "OK, so there are some error messages in the end because ``head`` ungracefully discards the rest of the data, but that's OK." 
470 | ] 471 | }, 472 | { 473 | "cell_type": "markdown", 474 | "metadata": {}, 475 | "source": [ 476 | "Now let's use ``uniq`` to get rid of population name duplicates:" 477 | ] 478 | }, 479 | { 480 | "cell_type": "code", 481 | "execution_count": 9, 482 | "metadata": {}, 483 | "outputs": [ 484 | { 485 | "name": "stdout", 486 | "output_type": "stream", 487 | "text": [ 488 | "Abkhasian\n", 489 | "Adygei\n", 490 | "Albanian\n", 491 | "Aleut\n", 492 | "Aleut_Tlingit\n", 493 | "Altaian\n", 494 | "Ami\n", 495 | "Armenian\n", 496 | "Atayal\n", 497 | "Balkar\n" 498 | ] 499 | } 500 | ], 501 | "source": [ 502 | "cat /data/popgen_course/genotypes_small.ind | awk '{print $3}' | sort | uniq | head" 503 | ] 504 | }, 505 | { 506 | "cell_type": "markdown", 507 | "metadata": {}, 508 | "source": [ 509 | "And now let's count:" 510 | ] 511 | }, 512 | { 513 | "cell_type": "code", 514 | "execution_count": 10, 515 | "metadata": {}, 516 | "outputs": [ 517 | { 518 | "name": "stdout", 519 | "output_type": "stream", 520 | "text": [ 521 | "116\n" 522 | ] 523 | } 524 | ], 525 | "source": [ 526 | "cat /data/popgen_course/genotypes_small.ind | awk '{print $3}' | sort | uniq | wc -l" 527 | ] 528 | }, 529 | { 530 | "cell_type": "markdown", 531 | "metadata": {}, 532 | "source": [ 533 | "OK, so there are 116 populations in the dataset. And how many individuals?" 534 | ] 535 | }, 536 | { 537 | "cell_type": "code", 538 | "execution_count": 11, 539 | "metadata": {}, 540 | "outputs": [ 541 | { 542 | "name": "stdout", 543 | "output_type": "stream", 544 | "text": [ 545 | "1340 /data/popgen_course/genotypes_small.ind\n" 546 | ] 547 | } 548 | ], 549 | "source": [ 550 | "wc -l /data/popgen_course/genotypes_small.ind" 551 | ] 552 | }, 553 | { 554 | "cell_type": "markdown", 555 | "metadata": {}, 556 | "source": [ 557 | "So 1340 individuals on 116 populations, so a bit more than 10 per population on average. Good to know!" 
558 | ] 559 | }, 560 | { 561 | "cell_type": "markdown", 562 | "metadata": {}, 563 | "source": [ 564 | "***Note:*** we learned some new Unix commands: `awk`, `cat`, `head`, `sort`, `uniq` and `wc`." 565 | ] 566 | }, 567 | { 568 | "cell_type": "markdown", 569 | "metadata": {}, 570 | "source": [ 571 | "As a final step, let's modify our pipeline to output not just the unique populations, but also the number of individuals per population. Fortunately this is extremely easy, since the flag `-c` to the `uniq` command already does the job:" 572 | ] 573 | }, 574 | { 575 | "cell_type": "code", 576 | "execution_count": 20, 577 | "metadata": {}, 578 | "outputs": [ 579 | { 580 | "name": "stdout", 581 | "output_type": "stream", 582 | "text": [ 583 | " 9 Abkhasian\n", 584 | " 16 Adygei\n", 585 | " 6 Albanian\n", 586 | " 7 Aleut\n", 587 | " 4 Aleut_Tlingit\n", 588 | " 7 Altaian\n", 589 | " 10 Ami\n", 590 | " 10 Armenian\n", 591 | " 9 Atayal\n", 592 | " 10 Balkar\n" 593 | ] 594 | } 595 | ], 596 | "source": [ 597 | "cat /data/popgen_course/genotypes_small.ind | awk '{print $3}' | sort | uniq -c | head" 598 | ] 599 | }, 600 | { 601 | "cell_type": "markdown", 602 | "metadata": {}, 603 | "source": [ 604 | "Nice. Let's put that list into a file that we can then import for plotting later." 605 | ] 606 | }, 607 | { 608 | "cell_type": "code", 609 | "execution_count": 21, 610 | "metadata": {}, 611 | "outputs": [], 612 | "source": [ 613 | "cat /data/popgen_course/genotypes_small.ind | awk '{print $3}' | sort | uniq -c > population_frequencies.txt" 614 | ] 615 | }, 616 | { 617 | "cell_type": "markdown", 618 | "metadata": {}, 619 | "source": [ 620 | "OK, we have created a new file called `population_frequencies.txt` in our current directory. We have used the bash redirection symbol `>` for writing outputs from a command or pipeline into a file. The file should now contain the population number data. 
We can check this by running:" 621 | ] 622 | }, 623 | { 624 | "cell_type": "code", 625 | "execution_count": 22, 626 | "metadata": {}, 627 | "outputs": [ 628 | { 629 | "name": "stdout", 630 | "output_type": "stream", 631 | "text": [ 632 | " 9 Abkhasian\n", 633 | " 16 Adygei\n", 634 | " 6 Albanian\n", 635 | " 7 Aleut\n", 636 | " 4 Aleut_Tlingit\n", 637 | " 7 Altaian\n", 638 | " 10 Ami\n", 639 | " 10 Armenian\n", 640 | " 9 Atayal\n", 641 | " 10 Balkar\n" 642 | ] 643 | } 644 | ], 645 | "source": [ 646 | "head population_frequencies.txt" 647 | ] 648 | }, 649 | { 650 | "cell_type": "markdown", 651 | "metadata": {}, 652 | "source": [ 653 | "OK, it seems to have worked. If you want to look at the file in a more interactive way, go back to your Jupyter File Browser and click on the file, which you should now see within your working directory. The file should open in a text editor that you can use to scroll around." 654 | ] 655 | }, 656 | { 657 | "cell_type": "markdown", 658 | "metadata": {}, 659 | "source": [ 660 | "OK, now that we have a file to plot, let's try it out using a new python3 notebook. See the next notebook, called `02_pynb_getting_started` in this series." 
661 | ] 662 | } 663 | ], 664 | "metadata": { 665 | "kernelspec": { 666 | "display_name": "Bash", 667 | "language": "bash", 668 | "name": "bash" 669 | }, 670 | "language_info": { 671 | "codemirror_mode": "shell", 672 | "file_extension": ".sh", 673 | "mimetype": "text/x-sh", 674 | "name": "bash" 675 | } 676 | }, 677 | "nbformat": 4, 678 | "nbformat_minor": 2 679 | } 680 | -------------------------------------------------------------------------------- /03_bashnb_smartpca.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Principal Components Analysis (PCA)" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "Principal components analysis (PCA) is one of the most useful techniques to visualise genetic diversity in a dataset. The methodology is not restricted to genetic data, but in general allows breaking down high-dimensional datasets to two or more dimensions for visualisation in a two-dimensional space." 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "## Genotype Data" 22 | ] 23 | }, 24 | { 25 | "cell_type": "markdown", 26 | "metadata": {}, 27 | "source": [ 28 | "This lesson is also our first contact with the genotype data used in this and most of the following lessons. The dataset that we will work with contains 1,340 individuals, each represented by 593,124 single nucleotide polymorphisms (SNPs). Those SNPs have exactly two different alleles, and each individual has one of four possible values at each genotype: homozygous reference, heterozygous, homozygous alternative, or missing. Those four values are encoded 2, 1, 0 and 9 respectively. \n", 29 | "\n", 30 | "The data is laid out as a matrix, with columns indicating individuals, and rows indicating SNPs. 
The data itself comes in the so-called \"EIGENSTRAT\" format, which is defined in the [Eigensoft package](https://github.com/DReichLab/EIG) used by many tools used in this workshop. In this format, a genotype dataset consists of three files, usually with the following file endings:\n", 31 | "\n", 32 | "* `*.snp`: The file containing the SNP positions. It consists of six columns: SNP-name, chromosome, genetic positions, physical position, reference allele, alternative allele.\n", 33 | "* `*.ind`: The file containing the names of the individuals. It consists of three columns: Individual Name, Sex (encoded as M(ale), F(emale), or U(nknown)), and population name.\n", 34 | "* `*.geno`: The file containing the genotype matrix, with individuals laid out from left to right, and SNP positions laid out from top to bottom.\n", 35 | " \n", 36 | "In the following, we will explore the files using bash in this notebook." 37 | ] 38 | }, 39 | { 40 | "cell_type": "markdown", 41 | "metadata": {}, 42 | "source": [ 43 | "The data that we want to analyse is stored at `/data/popgen_course`. Let's list the contents of that directory:" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": 1, 49 | "metadata": {}, 50 | "outputs": [ 51 | { 52 | "name": "stdout", 53 | "output_type": "stream", 54 | "text": [ 55 | "AllEurasia.poplist.txt\tgenotypes_small.ind WestEurasia.poplist.txt\n", 56 | "genotypes_small.geno\tgenotypes_small.snp\n" 57 | ] 58 | } 59 | ], 60 | "source": [ 61 | "ls /data/popgen_course" 62 | ] 63 | }, 64 | { 65 | "cell_type": "markdown", 66 | "metadata": {}, 67 | "source": [ 68 | "Let's explore those files a bit. 
Here are the first 20 individuals:" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": 2, 74 | "metadata": {}, 75 | "outputs": [ 76 | { 77 | "name": "stdout", 78 | "output_type": "stream", 79 | "text": [ 80 | " Yuk_009 M Yukagir\n", 81 | " Yuk_025 F Yukagir\n", 82 | " Yuk_022 F Yukagir\n", 83 | " Yuk_020 F Yukagir\n", 84 | " MC_40 M Chukchi\n", 85 | " Yuk_024 F Yukagir\n", 86 | " Yuk_023 F Yukagir\n", 87 | " MC_16 M Chukchi\n", 88 | " MC_15 F Chukchi\n", 89 | " MC_18 M Chukchi\n", 90 | " Yuk_004 M Yukagir\n", 91 | " MC_08 F Chukchi\n", 92 | " Nov_005 M Nganasan\n", 93 | " MC_25 F Chukchi\n", 94 | " Yuk_019 F Yukagir\n", 95 | " Yuk_011 M Yukagir\n", 96 | " Sesk_47 M Chukchi1\n", 97 | " MC_17 M Chukchi\n", 98 | " Yuk_021 M Yukagir\n", 99 | " MC_06 F Chukchi\n" 100 | ] 101 | } 102 | ], 103 | "source": [ 104 | "head -20 /data/popgen_course/genotypes_small.ind" 105 | ] 106 | }, 107 | { 108 | "cell_type": "markdown", 109 | "metadata": {}, 110 | "source": [ 111 | "And here the first 20 SNP rows:" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": 3, 117 | "metadata": {}, 118 | "outputs": [ 119 | { 120 | "name": "stdout", 121 | "output_type": "stream", 122 | "text": [ 123 | " 1_752566 1 0.020130 752566 G A\n", 124 | " 1_842013 1 0.022518 842013 T G\n", 125 | " 1_891021 1 0.024116 891021 G A\n", 126 | " 1_903426 1 0.024457 903426 C T\n", 127 | " 1_949654 1 0.025727 949654 A G\n", 128 | " 1_1018704 1 0.026288 1018704 A G\n", 129 | " 1_1045331 1 0.026665 1045331 G A\n", 130 | " 1_1048955 1 0.026674 1048955 A G\n", 131 | " 1_1061166 1 0.026711 1061166 T C\n", 132 | " 1_1108637 1 0.028311 1108637 G A\n", 133 | " 1_1120431 1 0.028916 1120431 G A\n", 134 | " 1_1156131 1 0.029335 1156131 T C\n", 135 | " 1_1157547 1 0.029356 1157547 T C\n", 136 | " 1_1158277 1 0.029367 1158277 G A\n", 137 | " 1_1161780 1 0.029391 1161780 C T\n", 138 | " 1_1170587 1 0.029450 1170587 C T\n", 139 | " 1_1205155 1 0.029735 1205155 A C\n", 140 | " 1_1211292 
1 0.029785 1211292 C T\n", 141 | " 1_1235792 1 0.030045 1235792 C T\n", 142 | " 1_1254255 1 0.030111 1254255 G A\n" 143 | ] 144 | } 145 | ], 146 | "source": [ 147 | "head -20 /data/popgen_course/genotypes_small.snp" 148 | ] 149 | }, 150 | { 151 | "cell_type": "markdown", 152 | "metadata": {}, 153 | "source": [ 154 | "And here are the first 20 genotypes of the first 50 individuals:" 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": 3, 160 | "metadata": {}, 161 | "outputs": [ 162 | { 163 | "name": "stdout", 164 | "output_type": "stream", 165 | "text": [ 166 | "01011012111022101020212001000102000000110010002000\n", 167 | "20121210122100111221001112022012221211022221211210\n", 168 | "11001120011100210010011110000112000001111000011100\n", 169 | "00001122102221212211211002022212221221121122112021\n", 170 | "00000000000000000000000000001000000000000000001000\n", 171 | "10121002211022011011211101201100000100120020102001\n", 172 | "22222222222222222222222222222222222222222222222222\n", 173 | "22112220022120221020012122222122122222101222121212\n", 174 | "22112220022120221020012122020122122122101222121211\n", 175 | "22222222221022222022222222222222222222222222112222\n", 176 | "22122222121222222222222222222212222222222222202211\n", 177 | "11011000010000010010000002220100212000012021101011\n", 178 | "12211212212222112212222221212212222122222222222222\n", 179 | "12211212212222112212222221212212222122222222222222\n", 180 | "12211212212222112212222221212212222122222222222222\n", 181 | "22222222222222222222222222222222222222222222222222\n", 182 | "22222222222222222222222222222222222222222222222222\n", 183 | "10111111021001110011002001222210222112112220212122\n", 184 | "22222222222222222222222222222222222222222222222222\n", 185 | "21221212121022212022222222222222211222122221922222\n" 186 | ] 187 | } 188 | ], 189 | "source": [ 190 | "head -20 /data/popgen_course/genotypes_small.geno | cut -c1-50" 191 | ] 192 | }, 193 | { 194 | "cell_type": "markdown", 195 | 
"metadata": {}, 196 | "source": [ 197 | "Counting how many individuals and SNPs there are:" 198 | ] 199 | }, 200 | { 201 | "cell_type": "code", 202 | "execution_count": 4, 203 | "metadata": {}, 204 | "outputs": [ 205 | { 206 | "name": "stdout", 207 | "output_type": "stream", 208 | "text": [ 209 | "1340 /data/popgen_course/genotypes_small.ind\n", 210 | "593124 /data/popgen_course/genotypes_small.snp\n" 211 | ] 212 | } 213 | ], 214 | "source": [ 215 | "wc -l /data/popgen_course/genotypes_small.ind\n", 216 | "wc -l /data/popgen_course/genotypes_small.snp" 217 | ] 218 | }, 219 | { 220 | "cell_type": "markdown", 221 | "metadata": {}, 222 | "source": [ 223 | "And now we check that the first row of the `*.geno` file indeed contains the same number of columns:" 224 | ] 225 | }, 226 | { 227 | "cell_type": "code", 228 | "execution_count": 6, 229 | "metadata": {}, 230 | "outputs": [ 231 | { 232 | "name": "stdout", 233 | "output_type": "stream", 234 | "text": [ 235 | "1341\n" 236 | ] 237 | } 238 | ], 239 | "source": [ 240 | "head -1 /data/popgen_course/genotypes_small.geno | wc -c" 241 | ] 242 | }, 243 | { 244 | "cell_type": "markdown", 245 | "metadata": {}, 246 | "source": [ 247 | "which is one more, including the newline character at the end of the line. 
Now counting the number of rows in the `*.geno`-file (this takes a few seconds, as the file is several hundred MB large):" 248 | ] 249 | }, 250 | { 251 | "cell_type": "code", 252 | "execution_count": 7, 253 | "metadata": {}, 254 | "outputs": [ 255 | { 256 | "name": "stdout", 257 | "output_type": "stream", 258 | "text": [ 259 | "593124 /data/popgen_course/genotypes_small.geno\n" 260 | ] 261 | } 262 | ], 263 | "source": [ 264 | "wc -l /data/popgen_course/genotypes_small.geno" 265 | ] 266 | }, 267 | { 268 | "cell_type": "markdown", 269 | "metadata": {}, 270 | "source": [ 271 | "Great, the number of rows and columns agrees with the numbers indicated in the `*.ind` and `*.snp` files!\n", 272 | "Now we're counting how many different populations there are. Let's first see the first 20 populations in the sorted list, alongside the number of individuals in each group:" 273 | ] 274 | }, 275 | { 276 | "cell_type": "code", 277 | "execution_count": 5, 278 | "metadata": {}, 279 | "outputs": [ 280 | { 281 | "name": "stdout", 282 | "output_type": "stream", 283 | "text": [ 284 | " 9 Abkhasian\n", 285 | " 16 Adygei\n", 286 | " 6 Albanian\n", 287 | " 7 Aleut\n", 288 | " 4 Aleut_Tlingit\n", 289 | " 7 Altaian\n", 290 | " 10 Ami\n", 291 | " 10 Armenian\n", 292 | " 9 Atayal\n", 293 | " 10 Balkar\n", 294 | " 29 Basque\n", 295 | " 25 BedouinA\n", 296 | " 19 BedouinB\n", 297 | " 10 Belarusian\n", 298 | " 6 BolshoyOleniOstrov\n", 299 | " 9 Borneo\n", 300 | " 10 Bulgarian\n", 301 | " 8 Cambodian\n", 302 | " 2 Canary_Islander\n", 303 | " 2 ChalmnyVarre\n" 304 | ] 305 | } 306 | ], 307 | "source": [ 308 | "awk '{print $3}' /data/popgen_course/genotypes_small.ind | sort | uniq -c | head -20" 309 | ] 310 | }, 311 | { 312 | "cell_type": "markdown", 313 | "metadata": {}, 314 | "source": [ 315 | "## How PCA works\n", 316 | "\n", 317 | "To understand how PCA works, consider a single individual and its representation by its 593,124 markers. 
Formally, each individual is a point in a 593,124-dimensional space, where each dimension\n", 318 | "can take only the three possible genotypes indicated above, or have missing data. To visualise this high-dimensional dataset, we would like to project it down to two dimensions. But as there are many ways to project the shadow of a three-dimensional object onto a two-dimensional plane, there are many (and even more) ways to project a 593,124-dimensional cloud of points to two dimensions. What PCA does is figuring out the \"best\" way to do this projection in order to visualise the major components of variance in the data.\n" 319 | ] 320 | }, 321 | { 322 | "cell_type": "markdown", 323 | "metadata": {}, 324 | "source": [ 325 | "## Parameter files\n", 326 | "For actually running the analysis, we use a software called `smartPCA` from the [Eigensoft package](https://github.com/DReichLab/EIG). Like many other tools from this and related packages, `smartPCA` reads in a parameter file which specifies its input and output files and options. In our case, we want the parameter file to have the following content:\n", 327 | "\n", 328 | " genotypename: /data/popgen_course/genotypes_small.geno\n", 329 | " snpname: /data/popgen_course/genotypes_small.snp\n", 330 | " indivname: /data/popgen_course/genotypes_small.ind\n", 331 | " evecoutname: pca.WestEurasia.evec\n", 332 | " evaloutname: pca.WestEurasia.eval\n", 333 | " poplistname: /data/popgen_course/WestEurasia.poplist.txt\n", 334 | " lsqproject: YES\n", 335 | " numoutevec: 4\n", 336 | " numthreads: 1" 337 | ] 338 | }, 339 | { 340 | "cell_type": "markdown", 341 | "metadata": {}, 342 | "source": [ 343 | "Here, the first three parameters specify the input genotype files. The next two rows specify two output file names, typically with ending `*.evec` and `*.eval`. The parameter line beginning with `poplistname` contains a file with a list of populations used for calculating the principal components (see below). 
The option `lsqproject` is important for applications including ancient DNA with lots of missing data, which I will not elaborate on. For the purpose of this workshop, you should use `lsqproject: YES`. The next option `numoutevec` specifies the number of principal components that we compute, the last option `numthreads` the number of CPUs to use for this run. We use just one since we're working together on the same computer, so cannot afford everyone running on lots of CPUs." 344 | ] 345 | }, 346 | { 347 | "cell_type": "markdown", 348 | "metadata": {}, 349 | "source": [ 350 | "## Population lists vs. Projection\n", 351 | "\n", 352 | "The parameter named `poplistname` is a very crucial one. It specifies the populations whose individuals are used to calculate the principal components. Why not just all of them, you ask? For several reasons: First, there are simply too many of them and we don't want to use all of them, since the computation would take too long. More importantly, however, we generally try to avoid using ancient samples to compute principal components, to avoid specific ancient-DNA related artefacts affecting the computation. Finally, the list of populations to use for PCA should be informed by your question. If you're investigating African population structure, it makes no sense to put Asian or European individuals in your population list, since then the main axes of genetic differentiation would not be inside of Africa, but between Africans and Non-Africans.\n", 353 | "\n", 354 | "So what happens to individuals that are not in populations listed in the population list? Well, fortunately, they are not just ignored, but \"projected\". This means that after the principal components have been computed, *all* individuals (not just the ones in the list) are projected onto these principal components. That way, we can visualise ancient populations in the context of modern genetic variation. 
That may sound a bit problematic at first (some variation in ancient populations is not represented well by modern populations), but it turns out to be nevertheless one of the most useful tools for this purpose. The advantage of preventing ancient-DNA artefacts and batch effects from affecting the visualisation outweighs the disadvantage of missing some private genetic variation components in the ancient populations themselves. Of course, that argument breaks down once the analysed populations become too ancient and detached from modern genetic variation. But for our purposes it will work just fine.\n", 355 | "\n", 356 | "For this workshop, I prepared two population lists:\n", 357 | "\n", 358 | " /data/popgen_course/WestEurasia.poplist.txt\n", 359 | " /data/popgen_course/AllEurasia.poplist.txt\n", 360 | "\n", 361 | "As you can tell from the names of the files, they specify two sets of modern populations representing West Eurasia or all of Europe and Asia, respectively.\n", 362 | "\n", 363 | "I recommend looking through both of the population lists and googling some population names that you don't recognise to get a feeling for the ethnic groups represented here." 364 | ] 365 | }, 366 | { 367 | "cell_type": "markdown", 368 | "metadata": {}, 369 | "source": [ 370 | "## Running `smartPCA`" 371 | ] 372 | }, 373 | { 374 | "cell_type": "markdown", 375 | "metadata": {}, 376 | "source": [ 377 | "Now go ahead and open a new text file using your Jupyter Browser, you can name it anything you like. For the sake of a concrete name, let's call it `pca.WestEurasia.params.txt`. Text files in Jupyter are opened in a text editor, so you can then simply copy-paste the above lines into the new file." 
378 | ] 379 | }, 380 | { 381 | "cell_type": "markdown", 382 | "metadata": {}, 383 | "source": [ 384 | "Let's see whether it worked, by printing out the contents of that file into your notebook:" 385 | ] 386 | }, 387 | { 388 | "cell_type": "code", 389 | "execution_count": 8, 390 | "metadata": {}, 391 | "outputs": [ 392 | { 393 | "name": "stdout", 394 | "output_type": "stream", 395 | "text": [ 396 | "genotypename: /data/popgen_course/genotypes_small.geno\n", 397 | "snpname: /data/popgen_course/genotypes_small.snp\n", 398 | "indivname: /data/popgen_course/genotypes_small.ind\n", 399 | "evecoutname: pca.WestEurasia.evec\n", 400 | "evaloutname: pca.WestEurasia.eval\n", 401 | "poplistname: /data/popgen_course/WestEurasia.poplist.txt\n", 402 | "lsqproject: YES\n", 403 | "numoutevec: 4\n", 404 | "numthreads: 1\n" 405 | ] 406 | } 407 | ], 408 | "source": [ 409 | "cat pca.WestEurasia.params.txt" 410 | ] 411 | }, 412 | { 413 | "cell_type": "markdown", 414 | "metadata": {}, 415 | "source": [ 416 | "Great, so that's our parameter file for running `smartPCA`.\n", 417 | "\n", 418 | "***Note:*** that we specified two output files in our parameter file, here called `pca.WestEurasia.evec` and `pca.WestEurasia.eval`. You can actually put any names you want in there. But beware of relative vs. absolute paths. File names starting with `/` are considered \"absolute\", that is, taken to go from the root of the file system. In contrast, filenames not starting with `/` are considered \"relative\" to the current working directory. If you forgot which directory you're in, run `pwd`.\n", 419 | "\n", 420 | "***Note:*** The option `poplistname` is a crucial one. Here you need to specify which populations are used to compute the eigenvectors of the principal components analysis. In our case, I have prepared two population list files: `/data/popgen_course/WestEurasia.poplist.txt` and `/data/popgen_course/AllEurasia.poplist.txt`. Pick one of the two to carry on." 
421 | ] 422 | }, 423 | { 424 | "cell_type": "markdown", 425 | "metadata": {}, 426 | "source": [ 427 | "Good, now we can run `smartPCA`. To do that, it's more convenient to use the terminal than a Notebook. So open a terminal and run\n", 428 | "\n", 429 | " smartpca -p pca.WestEurasia.params.txt" 430 | ] 431 | }, 432 | { 433 | "cell_type": "markdown", 434 | "metadata": {}, 435 | "source": [ 436 | "This will typically run for about 30 minutes and output lots of logging output to the screen." 437 | ] 438 | }, 439 | { 440 | "cell_type": "markdown", 441 | "metadata": {}, 442 | "source": [ 443 | "In a similar manner we can prepare a parameter file for the AllEurasia population list. This is how it should look:" 444 | ] 445 | }, 446 | { 447 | "cell_type": "code", 448 | "execution_count": 11, 449 | "metadata": {}, 450 | "outputs": [ 451 | { 452 | "name": "stdout", 453 | "output_type": "stream", 454 | "text": [ 455 | "genotypename: /data/popgen_course/genotypes_small.geno\n", 456 | "snpname: /data/popgen_course/genotypes_small.snp\n", 457 | "indivname: /data/popgen_course/genotypes_small.ind\n", 458 | "evecoutname: pca.AllEurasia.evec\n", 459 | "evaloutname: pca.AllEurasia.eval\n", 460 | "poplistname: /data/popgen_course/AllEurasia.poplist.txt\n", 461 | "lsqproject: YES\n", 462 | "numoutevec: 4\n", 463 | "numthreads: 1\n" 464 | ] 465 | } 466 | ], 467 | "source": [ 468 | "cat pca.AllEurasia.params.txt" 469 | ] 470 | }, 471 | { 472 | "cell_type": "markdown", 473 | "metadata": {}, 474 | "source": [ 475 | "And similar to the command above, we can run pca on the AllEurasia population list via:\n", 476 | "\n", 477 | " smartpca -p pca.AllEurasia.params.txt" 478 | ] 479 | }, 480 | { 481 | "cell_type": "markdown", 482 | "metadata": {}, 483 | "source": [ 484 | "which will run slightly longer than the first one because there are more populations " 485 | ] 486 | } 487 | ], 488 | "metadata": { 489 | "kernelspec": { 490 | "display_name": "Bash", 491 | "language": "bash", 492 | "name": 
"bash" 493 | }, 494 | "language_info": { 495 | "codemirror_mode": "shell", 496 | "file_extension": ".sh", 497 | "mimetype": "text/x-sh", 498 | "name": "bash" 499 | } 500 | }, 501 | "nbformat": 4, 502 | "nbformat_minor": 2 503 | } 504 | -------------------------------------------------------------------------------- /05_Rmd_fstatistics.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "F Statistics" 3 | output: html_document 4 | editor_options: 5 | chunk_output_type: console 6 | --- 7 | 8 | ```{r, echo=FALSE} 9 | knitr::opts_chunk$set(message = FALSE) 10 | ``` 11 | 12 | ```{r} 13 | library(magrittr) 14 | ``` 15 | 16 | ## F3 Statistics 17 | 18 | F3 statistics are a useful analytical tool to understand population relationships. F3 statistics, just as F4 and F2 statistics measure allele frequency correlations between populations and were introduced by Nick Patterson in his [Patterson 2012](http://www.genetics.org/content/early/2012/09/06/genetics.112.145037). 19 | 20 | F3 statistics are used for two purposes: i) as a test whether a target population (C) is admixed between two source populations (A and B), and ii) to measure shared drift between two test populations (A and B) from an outgroup (C). 21 | 22 | F3 statistics are in both cases defined as the product of allele frequency differences between population C to A and B, respectively: 23 | 24 | $$F3(A,B;C)=\langle(c−a)(c−b)\rangle$$ 25 | 26 | Here, $\langle\cdot\rangle$ denotes the average over all genotyped sites, and a, b and c 27 | denote the allele frequency for a given site in the three populations A, B and C. 
28 | 29 | ## Admixture F3 Statistics 30 | 31 | It can be shown that if this statistic is negative, it provides unambiguous proof that population C is admixed between populations A and B, as in the following phylogeny (taken from Figure 1 from [Patterson 2012](http://www.genetics.org/content/early/2012/09/06/genetics.112.145037)): 32 | 33 | F3-phylogeny 34 | 35 | Intuitively, an F3 statistic becomes negative if the allele frequency of the target population C is on average intermediate between the allele frequencies of A and B. Consider as an extreme example a genomic site where a=0, b=1 and c=0.5. Then we have (c−a)(c−b)=−0.25, which is negative. So if the entire statistic is negative, it suggests that in many positions, the allele frequency c is indeed intermediate, suggesting admixture between the two sources. 36 | 37 | **Note:** If an F3 statistic is *not* negative, it does *not* prove that there is no admixture! 38 | 39 | We will use this statistic to test if Finnish are admixed between East and West, using different Eastern and Western sources. In the West, we use French, Icelandic, Lithuanian and Norwegian as sources, and in the East we use Nganasan and one of the ancient individuals analysed in this workshop, *Bolshoy Oleni Ostrov*, 3500 year old individuals from the Northern Russian Kola-peninsula. 40 | 41 | We use the software `qp3Pop` from [AdmixTools](https://github.com/DReichLab/AdmixTools), which, similar to `smartpca`, takes a parameter file: 42 | 43 | ``` 44 | genotypename: input genotype file (in eigenstrat format) 45 | snpname: input snp file (in eigenstrat format) 46 | indivname: input indiv file (in eigenstrat format) 47 | popfilename: a file containing rows with three populations on each line A, B and C. 48 | inbreed: YES 49 | ``` 50 | 51 | Here, the last option is necessary if we are analysing pseudo-diploid ancient data (which is the case here). 
52 | 53 | To prepare the `popfilename`, create a new text file with the following content: 54 | 55 | ``` 56 | Nganasan French Finnish 57 | Nganasan Icelandic Finnish 58 | Nganasan Lithuanian Finnish 59 | Nganasan Norwegian Finnish 60 | BolshoyOleniOstrov French Finnish 61 | BolshoyOleniOstrov Icelandic Finnish 62 | BolshoyOleniOstrov Lithuanian Finnish 63 | BolshoyOleniOstrov Norwegian Finnish 64 | ``` 65 | 66 | **Exercise:** Prepare the parameter file with the input data as in the PCA session (see Principal Components Analysis (PCA)) and then run `qp3Pop -p PARAMETER_FILE`, where `PARAMETER_FILE` should be replaced by your parameter file name. As genotype data, use the files called `/data/popgen_course/HumanOrigins_FennoScandian_small.*`. 67 | 68 | The results are in the output that you can view in the Notebook. The crucial bit should look like this: 69 | 70 | ``` 71 | Source 1 Source 2 Target f_3 std. err Z SNPs 72 | result: Nganasan French Finnish -0.004539 0.000510 -8.894 442567 73 | result: Nganasan Icelandic Finnish -0.005297 0.000563 -9.404 427954 74 | result: Nganasan Lithuanian Finnish -0.005062 0.000590 -8.574 426231 75 | result: Nganasan Norwegian Finnish -0.004744 0.000569 -8.332 428161 76 | result: BolshoyOleniOstrov French Finnish -0.002814 0.000444 -6.341 402958 77 | result: BolshoyOleniOstrov Icelandic Finnish -0.002590 0.000486 -5.323 386418 78 | result: BolshoyOleniOstrov Lithuanian Finnish -0.001523 0.000536 -2.840 384134 79 | result: BolshoyOleniOstrov Norwegian Finnish -0.001553 0.000502 -3.092 386203 80 | ``` 81 | 82 | This output shows, as the first three columns, the three populations A, B (sources) and C (target). These are followed by the f3 statistic, which is negative in all cases tested here, its standard error, a Z score and the number of SNPs involved in the statistic. 83 | 84 | The Z score is key: It gives the deviation of the f3 statistic from zero in units of the standard error. 
As a general rule, a Z score of -3 or lower (i.e. at least three standard errors below zero) suggests a significant rejection of the Null hypothesis that the statistic is not negative. In this case, the statistics are negative throughout, and all but one (BolshoyOleniOstrov/Lithuanian, with Z = -2.840) pass this threshold, showing that Finnish have ancestral admixture of East and West Eurasian ancestry. Note that the statistic does not tell us when this admixture happened! 85 | 86 | ## F4 Statistics 87 | 88 | A different way to test for admixture is by “F4 statistics” (or “D statistics”, which are very similar), also introduced in [Patterson 2012](http://www.genetics.org/content/early/2012/09/06/genetics.112.145037). 89 | 90 | F4 statistics are also defined in terms of correlations of allele frequency differences, similarly to F3 statistics (see above), but involving four different populations, not just three. Specifically, we define 91 | 92 | $$F4(A,B;C,D)=\langle(a-b)(c-d)\rangle.$$ 93 | 94 | To understand the statistic, consider the following tree: 95 | 96 | F4-phylogeny 97 | 98 | In this tree, without any additional admixture, the allele frequency difference between A and B should be completely independent from the allele frequency difference between C and D. In that case, F4(A, B; C, D) should be zero, or at least not statistically different from zero. However, if there was gene flow from C or D into A or B, the statistic should be different from zero. Specifically, if the statistic is significantly negative, it implies gene flow between either C and B, or D and A. If it is significantly positive, it implies gene flow between A and C, or B and D. 99 | 100 | The way this statistic is often used is to put a divergent outgroup as population A, for which we know for sure that there was no admixture into either C or D. With this setup, we can then test for gene flow between B and D (if the statistic is positive), or B and C (if it is negative). 101 | 102 | Here, we can use this statistic to test for East Asian admixture in Finns, similarly to the test using Admixture F3 statistics above. 
We will use the `qpDstat` program from [AdmixTools](https://github.com/DReichLab/AdmixTools) for that. We need to again prepare a population list file, this time with four populations (A, B, C, D). I suggest you open a new file and fill it with: 103 | 104 | ``` 105 | Mbuti Nganasan French Finnish 106 | Mbuti Nganasan Icelandic Finnish 107 | Mbuti Nganasan Lithuanian Finnish 108 | Mbuti Nganasan Norwegian Finnish 109 | Mbuti BolshoyOleniOstrov French Finnish 110 | Mbuti BolshoyOleniOstrov Icelandic Finnish 111 | Mbuti BolshoyOleniOstrov Lithuanian Finnish 112 | Mbuti BolshoyOleniOstrov Norwegian Finnish 113 | ``` 114 | 115 | You can then use this file again in a parameter file, similar to the one prepared for `qp3Pop` above: 116 | 117 | ``` 118 | genotypename: input genotype file (in eigenstrat format) 119 | snpname: input snp file (in eigenstrat format) 120 | indivname: input indiv file (in eigenstrat format) 121 | popfilename: a file containing rows with four populations on each line A, B, C and D. 122 | f4mode: YES 123 | ``` 124 | 125 | Note that you cannot give the “inbreed” option here. 126 | 127 | **Exercise:** Prepare the parameter file as suggested above and then run `qpDstat -p PARAMETER_FILE`, where `PARAMETER_FILE` should be replaced by your parameter file name. This will take 5-6 minutes. 
128 | 129 | The results should be (skipping some header lines): 130 | 131 | ``` 132 | result: Mbuti Nganasan French Finnish 0.002363 19.016 29254 27852 593124 133 | result: Mbuti Nganasan Icelandic Finnish 0.001721 11.926 28915 27894 593124 134 | result: Mbuti Nganasan Lithuanian Finnish 0.001368 9.664 28745 27933 593124 135 | result: Mbuti Nganasan Norwegian Finnish 0.001685 11.663 28933 27934 593124 136 | result: Mbuti BolshoyOleniOstrov French Finnish 0.001962 16.737 27249 26175 547486 137 | result: Mbuti BolshoyOleniOstrov Icelandic Finnish 0.001084 7.776 26876 26282 547486 138 | result: Mbuti BolshoyOleniOstrov Lithuanian Finnish 0.000554 3.942 26683 26380 547486 139 | result: Mbuti BolshoyOleniOstrov Norwegian Finnish 0.000952 6.707 26873 26351 547486 140 | ``` 141 | 142 | Here, the key columns are columns 2, 3, 4 and 5, denoting A, B, C and D, and columns 6 and 7, which denote the F4 statistic and the Z score, the latter measuring the significance of the deviation from zero. 143 | 144 | As you can see, in all cases, the Z score is positive and larger than 3, indicating a significant deviation from zero, and implying gene flow between Nganasan and Finnish, and BolshoyOleniOstrov and Finnish, when compared to French, Icelandic, Lithuanian or Norwegian. 145 | 146 | ## Outgroup-F3-Statistics 147 | 148 | Outgroup F3 statistics are a special way of using F3 statistics. The definition is the same as for Admixture F3 statistics, but instead of a target C and two source populations A and B, one now gives an outgroup C and two test populations A and B. 149 | 150 | To get an intuition for this statistic, consider the following tree: 151 | 152 | Outgroup-F3-phylogeny 153 | 154 | In this scenario, the statistic F3(A, B; C) measures the branch length from C to the common ancestor of A and B, coloured red. So this statistic is simply a measure of how closely two populations A and B are related to each other, as measured from a distant outgroup. 
It is thus a similarity measure: The higher the statistic, the more genetically similar A and B are to one another. 155 | 156 | We can use this statistic to measure, for example, the genetic affinity to East Asia, by computing the statistic F3(Han, X; Mbuti), where Mbuti is a distant African population and acts as outgroup here, Han denotes Han Chinese, and X denotes various European populations that we want to test. 157 | 158 | You need to start, again, by preparing a list of population triples to be measured. I suggest the following list: 159 | 160 | ``` 161 | Han Chuvash Mbuti 162 | Han Albanian Mbuti 163 | Han Armenian Mbuti 164 | Han Bulgarian Mbuti 165 | Han Czech Mbuti 166 | Han Druze Mbuti 167 | Han English Mbuti 168 | Han Estonian Mbuti 169 | Han Finnish Mbuti 170 | Han French Mbuti 171 | Han Georgian Mbuti 172 | Han Greek Mbuti 173 | Han Hungarian Mbuti 174 | Han Icelandic Mbuti 175 | Han Italian_North Mbuti 176 | Han Italian_South Mbuti 177 | Han Lithuanian Mbuti 178 | Han Maltese Mbuti 179 | Han Mordovian Mbuti 180 | Han Norwegian Mbuti 181 | Han Orcadian Mbuti 182 | Han Russian Mbuti 183 | Han Sardinian Mbuti 184 | Han Scottish Mbuti 185 | Han Sicilian Mbuti 186 | Han Spanish_North Mbuti 187 | Han Spanish Mbuti 188 | Han Ukrainian Mbuti 189 | Han Levanluhta Mbuti 190 | Han BolshoyOleniOstrov Mbuti 191 | Han ChalmnyVarre Mbuti 192 | Han Saami.DG Mbuti 193 | ``` 194 | 195 | which cycles through many populations from Europe, including the ancient individuals from Chalmny Varre, Bolshoy Oleni Ostrov and Levänluhta. 196 | 197 | **Exercise:** Copy this list into a file, and prepare a parameter file for running `qp3Pop`, similar to the parameter file for admixture F3 statistics above, and run `qp3Pop` with that parameter file as above. Note that here you don't need the line beginning with `inbreed`. This will take up to 10 minutes. 
198 | 199 | You should find this (skipping header lines from the output): 200 | 201 | ``` 202 | Source 1 Source 2 Target f_3 std. err Z SNPs 203 | result: Han Chuvash Mbuti 0.233652 0.002072 112.782 502678 204 | result: Han Albanian Mbuti 0.215629 0.002029 106.291 501734 205 | result: Han Armenian Mbuti 0.213724 0.001963 108.882 504370 206 | result: Han Bulgarian Mbuti 0.216193 0.001979 109.266 504310 207 | result: Han Czech Mbuti 0.218060 0.002002 108.939 504089 208 | result: Han Druze Mbuti 0.209551 0.001919 109.205 510853 209 | result: Han English Mbuti 0.216959 0.001973 109.954 504161 210 | result: Han Estonian Mbuti 0.220730 0.002019 109.332 503503 211 | result: Han Finnish Mbuti 0.223447 0.002044 109.345 502217 212 | result: Han French Mbuti 0.216623 0.001969 110.012 509613 213 | result: Han Georgian Mbuti 0.214295 0.001935 110.721 503598 214 | result: Han Greek Mbuti 0.215203 0.001984 108.465 507475 215 | result: Han Hungarian Mbuti 0.217894 0.001999 109.004 507409 216 | result: Han Icelandic Mbuti 0.218683 0.002015 108.553 504655 217 | result: Han Italian_North Mbuti 0.215332 0.001978 108.854 507589 218 | result: Han Italian_South Mbuti 0.211787 0.002271 93.265 492400 219 | result: Han Lithuanian Mbuti 0.219615 0.002032 108.098 503681 220 | result: Han Maltese Mbuti 0.210359 0.001956 107.542 503985 221 | result: Han Mordovian Mbuti 0.223469 0.002008 111.296 503441 222 | result: Han Norwegian Mbuti 0.218873 0.002023 108.197 504621 223 | result: Han Orcadian Mbuti 0.217773 0.002014 108.115 504993 224 | result: Han Russian Mbuti 0.223993 0.001995 112.274 506525 225 | result: Han Sardinian Mbuti 0.213230 0.001980 107.711 508413 226 | result: Han Scottish Mbuti 0.218489 0.002039 107.145 499784 227 | result: Han Sicilian Mbuti 0.212272 0.001975 107.486 505477 228 | result: Han Spanish_North Mbuti 0.215885 0.002029 106.383 500853 229 | result: Han Spanish Mbuti 0.213869 0.001975 108.297 513648 230 | result: Han Ukrainian Mbuti 0.218716 0.002007 108.950 503981 231 
| result: Han Levanluhta Mbuti 0.236252 0.002383 99.123 263049 232 | result: Han BolshoyOleniOstrov Mbuti 0.247814 0.002177 113.849 457102 233 | result: Han ChalmnyVarre Mbuti 0.233499 0.002304 101.345 366220 234 | result: Han Saami.DG Mbuti 0.236198 0.002274 103.852 489038 235 | ``` 236 | 237 | Now it’s time to plot these results using R. Copy the results (all lines from the output beginning with “result:”) into a text file named "f3_outgroup_stats_Han.txt", and load it into an R tibble using: 238 | 239 | ```{r} 240 | d <- readr::read_delim( 241 | "f3_outgroup_stats_Han.txt", 242 | delim = " ", 243 | trim_ws = T, 244 | col_names = c("dummy", "A", "B", "C", "F3", "StdErr", "Z", "SNPS") 245 | ) 246 | ``` 247 | 248 | We can check that it worked: 249 | 250 | ```{r} 251 | d 252 | ``` 253 | 254 | ```{r} 255 | library(ggplot2) 256 | 257 | d %>% 258 | ggplot() + 259 | geom_errorbarh( 260 | aes( 261 | xmin = F3 - StdErr, 262 | xmax = F3 + StdErr, 263 | y = forcats::fct_reorder(B, F3) 264 | ) 265 | ) + 266 | geom_point( 267 | aes(x = F3, y = forcats::fct_reorder(B, F3)) 268 | ) + 269 | xlab("F3(Han, Test; Mbuti)") 270 | ``` 271 | 272 | As expected, the ancient samples and modern Saami are the ones with the highest allele sharing with present-day East Asians (as represented by Han) compared to many other Europeans. 273 | 274 | ## Outgroup F3 Statistics Scatter plot 275 | 276 | The above plot shows an intriguing cline of differential relatedness to Han in many Europeans. For example, would you have guessed that Icelanders are closer to Han than Armenians are to Han? This is very surprising, and it shows that European ancestry has a complex relationship to East Asians. To understand this better, you can read [Patterson 2012](http://www.genetics.org/content/early/2012/09/06/genetics.112.145037), which makes some intriguing observations. Patterson and colleagues use Admixture F3 statistics and apply them to many populations world-wide. 
They summarise some population triples with the most negative F3 statistics in the following table: 277 | 278 | Patterson 2012 table 279 | 280 | There are many interesting results here, but one of the most striking is the finding of F3(Sardinian, Karitiana; French), which is highly significantly negative. This statistic implies that French are admixed between Sardinians and Karitiana, a Native American population from Brazil. How is that possible? We can of course rule out any recent Native American backflow into Europe. 281 | 282 | Patterson and colleagues explained this finding by hypothesising an ancient admixture event, from a Siberian population that contributed to both Europeans and to Native Americans. They termed that population the “Ancient North Eurasians (ANE)”. The following admixture graph was suggested: 283 | 284 | Patterson 2012 ANE graph 285 | 286 | As you can see, the idea is that modern Central Europeans, such as French, are admixed between Southern Europeans (Sardinians) and ANE. The Ancient North Eurasians are a classic example for a “Ghost” population, a population which does not exist anymore in unmixed form, and from which we have no direct individual representative. 287 | 288 | Amazingly, two years after the publication of [Patterson 2012](http://www.genetics.org/content/early/2012/09/06/genetics.112.145037), the ANE ghost population was actually found: [Raghavan et al.](https://www.nature.com/articles/nature12736) published, in 2014, a paper called “Upper Palaeolithic Siberian genome reveals dual ancestry of Native Americans”. A 24,000-year-old boy (called MA1) from the site of “Mal’ta” in Siberia was shown to have close genetic affinity with both Europeans and in particular Native Americans, just as proposed in [Patterson 2012](http://www.genetics.org/content/early/2012/09/06/genetics.112.145037). 
289 | 290 | The affinities are summarised nicely in this figure from [Raghavan et al.](https://www.nature.com/articles/nature12736): 291 | 292 | MA1 Affinities 293 | 294 | OK, so we now know that ancestry related to Native Americans contributed to European countries. Could that possibly explain the affinity of our ancient samples and Saami to Han Chinese in some way? To test this, we will run the same Outgroup F3 statistics as above, but this time not with Han but with MA1 as test population. Specifically, we run the following population triples in `qp3Pop`: 295 | 296 | ``` 297 | MA1_HG.SG Chuvash Mbuti 298 | MA1_HG.SG Albanian Mbuti 299 | MA1_HG.SG Armenian Mbuti 300 | MA1_HG.SG Bulgarian Mbuti 301 | MA1_HG.SG Czech Mbuti 302 | MA1_HG.SG Druze Mbuti 303 | MA1_HG.SG English Mbuti 304 | MA1_HG.SG Estonian Mbuti 305 | MA1_HG.SG Finnish Mbuti 306 | MA1_HG.SG French Mbuti 307 | MA1_HG.SG Georgian Mbuti 308 | MA1_HG.SG Greek Mbuti 309 | MA1_HG.SG Hungarian Mbuti 310 | MA1_HG.SG Icelandic Mbuti 311 | MA1_HG.SG Italian_North Mbuti 312 | MA1_HG.SG Italian_South Mbuti 313 | MA1_HG.SG Lithuanian Mbuti 314 | MA1_HG.SG Maltese Mbuti 315 | MA1_HG.SG Mordovian Mbuti 316 | MA1_HG.SG Norwegian Mbuti 317 | MA1_HG.SG Orcadian Mbuti 318 | MA1_HG.SG Russian Mbuti 319 | MA1_HG.SG Sardinian Mbuti 320 | MA1_HG.SG Scottish Mbuti 321 | MA1_HG.SG Sicilian Mbuti 322 | MA1_HG.SG Spanish_North Mbuti 323 | MA1_HG.SG Spanish Mbuti 324 | MA1_HG.SG Ukrainian Mbuti 325 | MA1_HG.SG Levanluhta Mbuti 326 | MA1_HG.SG BolshoyOleniOstrov Mbuti 327 | MA1_HG.SG ChalmnyVarre Mbuti 328 | MA1_HG.SG Saami.DG Mbuti 329 | ``` 330 | 331 | here, `MA1_HG.SG` is the somewhat cryptic population name for the MA1-individual. 332 | 333 | **Exercise:** Follow the same protocol as above: Copy the list into a file, prepare a parameter file for `qp3Pop` with that population triple list, and run `qp3Pop`. 
Copy the results (all lines beginning with “result:”) into a file named "f3_outgroup_stats_MA1.txt". 334 | 335 | To test in what way the relationship to Han Chinese is correlated with the relationship with MA1, we will now plot the two statistics against each other in a scatter plot. We first have to merge the two outgroup-F3 datasets together. Here is the code including loading (assuming that the two F3 dataframes are called `outgroupf3dat_Han` and `outgroupf3dat_MA1`): 336 | 337 | ```{r} 338 | outgroupf3dat_Han <- readr::read_delim( 339 | "f3_outgroup_stats_Han.txt", 340 | delim = " ", 341 | trim_ws = T, 342 | col_names = c("dummy", "A", "B", "C", "F3", "stderr", "Z", "nSNPs") 343 | ) 344 | 345 | outgroupf3dat_MA1 <- readr::read_delim( 346 | "f3_outgroup_stats_MA1.txt", 347 | delim = " ", 348 | trim_ws = T, 349 | col_names = c("dummy", "A", "B", "C", "F3", "stderr", "Z", "nSNPs") 350 | ) 351 | 352 | outgroupf3dat_merged <- dplyr::full_join( 353 | outgroupf3dat_Han, 354 | outgroupf3dat_MA1, 355 | by = "B", 356 | suffix = c("_Han", "_MA1") 357 | ) 358 | ``` 359 | 360 | Again, we check that everything worked: 361 | 362 | ```{r} 363 | outgroupf3dat_merged 364 | ``` 365 | 366 | Now we can make a scatter plot: 367 | 368 | ```{r} 369 | outgroupf3dat_merged %>% 370 | ggplot() + 371 | geom_point( 372 | aes( 373 | x = F3_Han, 374 | y = F3_MA1 375 | ) 376 | ) + 377 | xlab("F3(Test, Han; Mbuti)") + 378 | ylab("F3(Test, MA1; Mbuti)") 379 | ``` 380 | 381 | This isn’t very useful, however, as we cannot see which point is which population. 
We use the `geom_label_repel` function from ggrepel to add text labels to each point: 382 | 383 | ```{r} 384 | outgroupf3dat_merged %>% 385 | ggplot() + 386 | geom_point( 387 | aes( 388 | x = F3_Han, 389 | y = F3_MA1 390 | ) 391 | ) + 392 | xlab("F3(Test, Han; Mbuti)") + 393 | ylab("F3(Test, MA1; Mbuti)") + 394 | ggrepel::geom_label_repel( 395 | aes( 396 | x = F3_Han, 397 | y = F3_MA1, 398 | label = B 399 | ) 400 | ) 401 | ``` 402 | 403 | The result shows that indeed the affinity to East Asians in the bulk of European countries can be explained by MA1-related ancestry. Most European countries have a linear relationship between their affinity to Han and their affinity to MA1. However, this is not true for our ancient samples from Fennoscandia and for modern Saami and Chuvash, who have extra affinity to Han not explained by MA1 ([Lazaridis et al. 2014](https://www.nature.com/articles/nature13673)). 404 | 405 | Now, why there is a connection between MA1 and Han is not trivial to explain. The most probable explanation involves "Basal Eurasian" ancestry, which happens to be anti-correlated to MA1-ancestry in Europe, and which drives those populations with high "Basal Eurasian" ancestry further away from Han. See [Lazaridis et al. 2014](https://www.nature.com/articles/nature13673) for more details. 
406 | -------------------------------------------------------------------------------- /pca.AllEurasia.eval: -------------------------------------------------------------------------------- 1 | 71.715980 2 | 9.646096 3 | 6.317298 4 | 3.871764 5 | 3.075333 6 | 2.646867 7 | 2.454284 8 | 2.236205 9 | 2.006447 10 | 2.000151 11 | 1.949834 12 | 1.946900 13 | 1.913285 14 | 1.909476 15 | 1.886648 16 | 1.873806 17 | 1.865762 18 | 1.844984 19 | 1.826839 20 | 1.818292 21 | 1.813557 22 | 1.806923 23 | 1.794645 24 | 1.787562 25 | 1.764851 26 | 1.756468 27 | 1.741944 28 | 1.735706 29 | 1.725703 30 | 1.721535 31 | 1.718770 32 | 1.712666 33 | 1.699156 34 | 1.689564 35 | 1.685036 36 | 1.677270 37 | 1.672807 38 | 1.666883 39 | 1.660374 40 | 1.653085 41 | 1.650461 42 | 1.643042 43 | 1.636804 44 | 1.634764 45 | 1.628574 46 | 1.623305 47 | 1.615182 48 | 1.601129 49 | 1.595264 50 | 1.583416 51 | 1.576798 52 | 1.574901 53 | 1.566180 54 | 1.559824 55 | 1.556275 56 | 1.549500 57 | 1.542913 58 | 1.540315 59 | 1.531804 60 | 1.528939 61 | 1.521507 62 | 1.515997 63 | 1.512632 64 | 1.512015 65 | 1.507849 66 | 1.499872 67 | 1.496241 68 | 1.493827 69 | 1.486508 70 | 1.483965 71 | 1.478250 72 | 1.469645 73 | 1.467586 74 | 1.458038 75 | 1.452991 76 | 1.449844 77 | 1.449182 78 | 1.446437 79 | 1.438036 80 | 1.433108 81 | 1.431148 82 | 1.428462 83 | 1.421567 84 | 1.417396 85 | 1.416161 86 | 1.411774 87 | 1.410701 88 | 1.406160 89 | 1.405373 90 | 1.399438 91 | 1.393987 92 | 1.388933 93 | 1.384344 94 | 1.381818 95 | 1.379500 96 | 1.374324 97 | 1.367127 98 | 1.364338 99 | 1.354982 100 | 1.351974 101 | 1.350672 102 | 1.348083 103 | 1.345424 104 | 1.341461 105 | 1.337633 106 | 1.333327 107 | 1.330888 108 | 1.328745 109 | 1.325143 110 | 1.321715 111 | 1.318676 112 | 1.318241 113 | 1.314366 114 | 1.311146 115 | 1.308522 116 | 1.306540 117 | 1.304946 118 | 1.301508 119 | 1.297752 120 | 1.297505 121 | 1.295093 122 | 1.291474 123 | 1.289045 124 | 1.286803 125 | 1.284147 126 | 1.282275 127 | 1.280667 128 | 
1.279286 129 | 1.276059 130 | 1.275587 131 | 1.273570 132 | 1.272469 133 | 1.269639 134 | 1.266366 135 | 1.263176 136 | 1.260103 137 | 1.256532 138 | 1.254596 139 | 1.252190 140 | 1.250768 141 | 1.249041 142 | 1.244996 143 | 1.244038 144 | 1.242172 145 | 1.238261 146 | 1.236299 147 | 1.234178 148 | 1.231280 149 | 1.229033 150 | 1.228112 151 | 1.226033 152 | 1.221697 153 | 1.219402 154 | 1.218948 155 | 1.217277 156 | 1.215639 157 | 1.213122 158 | 1.208787 159 | 1.207875 160 | 1.205892 161 | 1.204942 162 | 1.203346 163 | 1.200594 164 | 1.198044 165 | 1.195350 166 | 1.193223 167 | 1.191716 168 | 1.190070 169 | 1.189331 170 | 1.187069 171 | 1.185844 172 | 1.183531 173 | 1.181102 174 | 1.180088 175 | 1.177938 176 | 1.175316 177 | 1.174158 178 | 1.171976 179 | 1.170660 180 | 1.167873 181 | 1.165652 182 | 1.164344 183 | 1.163405 184 | 1.159586 185 | 1.158722 186 | 1.158274 187 | 1.156118 188 | 1.154081 189 | 1.152420 190 | 1.151642 191 | 1.149088 192 | 1.147936 193 | 1.146993 194 | 1.144313 195 | 1.141345 196 | 1.137728 197 | 1.137247 198 | 1.136279 199 | 1.135313 200 | 1.133580 201 | 1.131784 202 | 1.131469 203 | 1.129439 204 | 1.127615 205 | 1.126348 206 | 1.125415 207 | 1.124355 208 | 1.121443 209 | 1.120501 210 | 1.119133 211 | 1.118446 212 | 1.117300 213 | 1.116887 214 | 1.115978 215 | 1.113715 216 | 1.112566 217 | 1.112342 218 | 1.109144 219 | 1.107941 220 | 1.106964 221 | 1.105361 222 | 1.105044 223 | 1.102753 224 | 1.101930 225 | 1.100408 226 | 1.099652 227 | 1.098429 228 | 1.098332 229 | 1.098243 230 | 1.094135 231 | 1.093516 232 | 1.092382 233 | 1.091670 234 | 1.090078 235 | 1.089586 236 | 1.088270 237 | 1.086303 238 | 1.085263 239 | 1.084290 240 | 1.083358 241 | 1.082818 242 | 1.082273 243 | 1.080266 244 | 1.079481 245 | 1.077849 246 | 1.076985 247 | 1.076192 248 | 1.076136 249 | 1.073774 250 | 1.072358 251 | 1.072124 252 | 1.071058 253 | 1.069525 254 | 1.069366 255 | 1.067774 256 | 1.067285 257 | 1.065857 258 | 1.064099 259 | 1.063845 260 | 1.062725 261 | 
1.061943 262 | 1.060416 263 | 1.060043 264 | 1.059428 265 | 1.058306 266 | 1.057950 267 | 1.057505 268 | 1.057060 269 | 1.054535 270 | 1.053238 271 | 1.053102 272 | 1.052623 273 | 1.051572 274 | 1.050980 275 | 1.050070 276 | 1.049240 277 | 1.047290 278 | 1.046165 279 | 1.045326 280 | 1.044794 281 | 1.043890 282 | 1.043100 283 | 1.042825 284 | 1.041543 285 | 1.040521 286 | 1.038824 287 | 1.038218 288 | 1.037671 289 | 1.036877 290 | 1.036013 291 | 1.035027 292 | 1.034440 293 | 1.033807 294 | 1.032238 295 | 1.031766 296 | 1.030900 297 | 1.029723 298 | 1.029454 299 | 1.029267 300 | 1.028585 301 | 1.027856 302 | 1.027158 303 | 1.026376 304 | 1.025794 305 | 1.024513 306 | 1.024265 307 | 1.022640 308 | 1.022056 309 | 1.022035 310 | 1.021338 311 | 1.020752 312 | 1.019220 313 | 1.018966 314 | 1.018401 315 | 1.016882 316 | 1.016381 317 | 1.016260 318 | 1.015723 319 | 1.015156 320 | 1.013542 321 | 1.013257 322 | 1.012991 323 | 1.011783 324 | 1.011264 325 | 1.010738 326 | 1.009866 327 | 1.009583 328 | 1.008919 329 | 1.007864 330 | 1.007229 331 | 1.006901 332 | 1.005933 333 | 1.005583 334 | 1.004380 335 | 1.003731 336 | 1.003455 337 | 1.002697 338 | 1.002511 339 | 1.001233 340 | 1.000980 341 | 1.000107 342 | 0.999920 343 | 0.999383 344 | 0.998479 345 | 0.997897 346 | 0.997478 347 | 0.997201 348 | 0.995749 349 | 0.995228 350 | 0.994264 351 | 0.993564 352 | 0.993059 353 | 0.992377 354 | 0.991550 355 | 0.991430 356 | 0.990713 357 | 0.990020 358 | 0.989282 359 | 0.989015 360 | 0.988423 361 | 0.988142 362 | 0.987656 363 | 0.986261 364 | 0.985854 365 | 0.985454 366 | 0.985131 367 | 0.984279 368 | 0.983098 369 | 0.982697 370 | 0.982245 371 | 0.981888 372 | 0.981737 373 | 0.981172 374 | 0.980173 375 | 0.979647 376 | 0.979371 377 | 0.978758 378 | 0.978358 379 | 0.977391 380 | 0.976937 381 | 0.976641 382 | 0.976026 383 | 0.975142 384 | 0.974387 385 | 0.973590 386 | 0.973458 387 | 0.973009 388 | 0.972056 389 | 0.971135 390 | 0.970599 391 | 0.970517 392 | 0.969697 393 | 0.969303 394 | 
0.968879 395 | 0.968092 396 | 0.967964 397 | 0.967065 398 | 0.966825 399 | 0.966743 400 | 0.965838 401 | 0.965401 402 | 0.964752 403 | 0.963642 404 | 0.963346 405 | 0.962434 406 | 0.962165 407 | 0.961905 408 | 0.961024 409 | 0.960495 410 | 0.959737 411 | 0.959140 412 | 0.959096 413 | 0.958226 414 | 0.957956 415 | 0.957269 416 | 0.956886 417 | 0.956086 418 | 0.955981 419 | 0.955657 420 | 0.955189 421 | 0.954771 422 | 0.953468 423 | 0.953362 424 | 0.953062 425 | 0.952075 426 | 0.951706 427 | 0.951235 428 | 0.950837 429 | 0.950302 430 | 0.949604 431 | 0.949190 432 | 0.948684 433 | 0.948069 434 | 0.947813 435 | 0.947164 436 | 0.946304 437 | 0.945771 438 | 0.945406 439 | 0.944962 440 | 0.944757 441 | 0.944197 442 | 0.943876 443 | 0.942923 444 | 0.942592 445 | 0.942162 446 | 0.941549 447 | 0.941221 448 | 0.940900 449 | 0.940533 450 | 0.939559 451 | 0.939265 452 | 0.939117 453 | 0.938712 454 | 0.938331 455 | 0.938069 456 | 0.937496 457 | 0.936374 458 | 0.936015 459 | 0.935724 460 | 0.935130 461 | 0.934824 462 | 0.934040 463 | 0.933420 464 | 0.933205 465 | 0.932738 466 | 0.932193 467 | 0.931963 468 | 0.931454 469 | 0.931035 470 | 0.930492 471 | 0.929848 472 | 0.929349 473 | 0.929121 474 | 0.928145 475 | 0.927946 476 | 0.927775 477 | 0.927228 478 | 0.926476 479 | 0.925830 480 | 0.924999 481 | 0.924882 482 | 0.924624 483 | 0.924254 484 | 0.923437 485 | 0.922936 486 | 0.922757 487 | 0.922369 488 | 0.921947 489 | 0.921621 490 | 0.920983 491 | 0.920648 492 | 0.920081 493 | 0.919799 494 | 0.919478 495 | 0.919088 496 | 0.918109 497 | 0.917490 498 | 0.917307 499 | 0.916769 500 | 0.916590 501 | 0.915881 502 | 0.915463 503 | 0.915134 504 | 0.914584 505 | 0.914211 506 | 0.913969 507 | 0.913261 508 | 0.913220 509 | 0.912676 510 | 0.912265 511 | 0.911897 512 | 0.911728 513 | 0.911222 514 | 0.910678 515 | 0.910456 516 | 0.910011 517 | 0.909571 518 | 0.909092 519 | 0.908675 520 | 0.908044 521 | 0.907784 522 | 0.907319 523 | 0.907030 524 | 0.906959 525 | 0.906055 526 | 0.905717 527 | 
0.905481 528 | 0.905318 529 | 0.904515 530 | 0.904088 531 | 0.903689 532 | 0.902584 533 | 0.902340 534 | 0.902202 535 | 0.901933 536 | 0.901337 537 | 0.900931 538 | 0.900531 539 | 0.899700 540 | 0.899510 541 | 0.899448 542 | 0.898828 543 | 0.898442 544 | 0.897904 545 | 0.897538 546 | 0.896594 547 | 0.896224 548 | 0.896043 549 | 0.895678 550 | 0.895322 551 | 0.894900 552 | 0.894500 553 | 0.894205 554 | 0.893792 555 | 0.893110 556 | 0.892631 557 | 0.892244 558 | 0.891665 559 | 0.891467 560 | 0.890949 561 | 0.890557 562 | 0.890284 563 | 0.890120 564 | 0.889878 565 | 0.888517 566 | 0.888418 567 | 0.888256 568 | 0.887652 569 | 0.887366 570 | 0.886798 571 | 0.886249 572 | 0.885959 573 | 0.885612 574 | 0.885271 575 | 0.885018 576 | 0.884410 577 | 0.884006 578 | 0.883420 579 | 0.882979 580 | 0.882828 581 | 0.882098 582 | 0.881565 583 | 0.881310 584 | 0.881150 585 | 0.880599 586 | 0.880347 587 | 0.879740 588 | 0.879552 589 | 0.878985 590 | 0.878718 591 | 0.878483 592 | 0.878217 593 | 0.877969 594 | 0.877166 595 | 0.876720 596 | 0.876588 597 | 0.875999 598 | 0.875724 599 | 0.875312 600 | 0.875060 601 | 0.874626 602 | 0.874394 603 | 0.873977 604 | 0.873148 605 | 0.872821 606 | 0.872459 607 | 0.872096 608 | 0.871806 609 | 0.871684 610 | 0.871358 611 | 0.871084 612 | 0.870736 613 | 0.870239 614 | 0.869744 615 | 0.869448 616 | 0.868877 617 | 0.868834 618 | 0.868298 619 | 0.867668 620 | 0.867114 621 | 0.866990 622 | 0.866831 623 | 0.866211 624 | 0.865688 625 | 0.865635 626 | 0.864710 627 | 0.864149 628 | 0.864141 629 | 0.863711 630 | 0.862948 631 | 0.862400 632 | 0.862224 633 | 0.861785 634 | 0.861445 635 | 0.861259 636 | 0.861043 637 | 0.860382 638 | 0.860117 639 | 0.859707 640 | 0.859216 641 | 0.859209 642 | 0.858204 643 | 0.858035 644 | 0.857629 645 | 0.857090 646 | 0.857034 647 | 0.856552 648 | 0.855977 649 | 0.855882 650 | 0.855628 651 | 0.855513 652 | 0.854912 653 | 0.854641 654 | 0.854036 655 | 0.853970 656 | 0.853373 657 | 0.852897 658 | 0.852067 659 | 0.852048 660 | 
0.851803 661 | 0.851459 662 | 0.851016 663 | 0.850728 664 | 0.850291 665 | 0.849942 666 | 0.849572 667 | 0.849281 668 | 0.848894 669 | 0.848685 670 | 0.848422 671 | 0.848015 672 | 0.847328 673 | 0.847273 674 | 0.846640 675 | 0.846226 676 | 0.845960 677 | 0.845485 678 | 0.845159 679 | 0.844546 680 | 0.844416 681 | 0.844113 682 | 0.843630 683 | 0.843193 684 | 0.842665 685 | 0.842379 686 | 0.842040 687 | 0.841749 688 | 0.841546 689 | 0.841290 690 | 0.841188 691 | 0.840506 692 | 0.839818 693 | 0.839536 694 | 0.839376 695 | 0.838980 696 | 0.838796 697 | 0.838279 698 | 0.837635 699 | 0.837285 700 | 0.836838 701 | 0.836294 702 | 0.836187 703 | 0.835985 704 | 0.835624 705 | 0.835082 706 | 0.834812 707 | 0.834301 708 | 0.834018 709 | 0.833686 710 | 0.833486 711 | 0.833046 712 | 0.832747 713 | 0.832353 714 | 0.832011 715 | 0.831617 716 | 0.831215 717 | 0.830883 718 | 0.830429 719 | 0.829964 720 | 0.829774 721 | 0.829540 722 | 0.829070 723 | 0.828846 724 | 0.828117 725 | 0.827983 726 | 0.827625 727 | 0.827316 728 | 0.827115 729 | 0.826908 730 | 0.826476 731 | 0.825891 732 | 0.825584 733 | 0.825149 734 | 0.825076 735 | 0.824591 736 | 0.824412 737 | 0.823907 738 | 0.823624 739 | 0.823109 740 | 0.823052 741 | 0.822477 742 | 0.822333 743 | 0.821695 744 | 0.821324 745 | 0.820815 746 | 0.820577 747 | 0.820041 748 | 0.819847 749 | 0.819615 750 | 0.819072 751 | 0.818881 752 | 0.818542 753 | 0.818240 754 | 0.818033 755 | 0.817741 756 | 0.817351 757 | 0.816811 758 | 0.816287 759 | 0.815814 760 | 0.815423 761 | 0.815192 762 | 0.815034 763 | 0.814883 764 | 0.814052 765 | 0.813897 766 | 0.813726 767 | 0.813660 768 | 0.812896 769 | 0.812774 770 | 0.812149 771 | 0.811883 772 | 0.811682 773 | 0.811341 774 | 0.811214 775 | 0.811013 776 | 0.810373 777 | 0.810169 778 | 0.809624 779 | 0.809076 780 | 0.808794 781 | 0.808444 782 | 0.808326 783 | 0.808179 784 | 0.807618 785 | 0.807567 786 | 0.807352 787 | 0.806826 788 | 0.806653 789 | 0.806221 790 | 0.805727 791 | 0.805221 792 | 0.804998 793 | 
0.804585 794 | 0.804224 795 | 0.803660 796 | 0.803305 797 | 0.803221 798 | 0.802845 799 | 0.802669 800 | 0.802409 801 | 0.801995 802 | 0.801480 803 | 0.801126 804 | 0.800777 805 | 0.800579 806 | 0.800107 807 | 0.799610 808 | 0.799097 809 | 0.798955 810 | 0.798845 811 | 0.798418 812 | 0.797948 813 | 0.797613 814 | 0.797464 815 | 0.796897 816 | 0.796723 817 | 0.796541 818 | 0.795860 819 | 0.795637 820 | 0.795418 821 | 0.795167 822 | 0.794763 823 | 0.794421 824 | 0.793827 825 | 0.793678 826 | 0.793548 827 | 0.793303 828 | 0.792505 829 | 0.792223 830 | 0.791879 831 | 0.791164 832 | 0.790971 833 | 0.790681 834 | 0.790180 835 | 0.789786 836 | 0.789691 837 | 0.789369 838 | 0.788991 839 | 0.788721 840 | 0.788559 841 | 0.788323 842 | 0.788091 843 | 0.787413 844 | 0.786945 845 | 0.786669 846 | 0.786279 847 | 0.786021 848 | 0.785453 849 | 0.785168 850 | 0.784955 851 | 0.784383 852 | 0.784065 853 | 0.783717 854 | 0.783495 855 | 0.783116 856 | 0.782517 857 | 0.782418 858 | 0.781996 859 | 0.781478 860 | 0.781150 861 | 0.780929 862 | 0.780612 863 | 0.780346 864 | 0.779740 865 | 0.779687 866 | 0.779626 867 | 0.779090 868 | 0.778778 869 | 0.778558 870 | 0.778293 871 | 0.778082 872 | 0.777478 873 | 0.777164 874 | 0.777004 875 | 0.776450 876 | 0.776249 877 | 0.776016 878 | 0.775638 879 | 0.775471 880 | 0.775117 881 | 0.774738 882 | 0.774340 883 | 0.773849 884 | 0.773749 885 | 0.773193 886 | 0.772833 887 | 0.772437 888 | 0.772363 889 | 0.771980 890 | 0.771546 891 | 0.770945 892 | 0.770807 893 | 0.770712 894 | 0.770284 895 | 0.769755 896 | 0.769364 897 | 0.768872 898 | 0.768608 899 | 0.768006 900 | 0.767707 901 | 0.767287 902 | 0.766956 903 | 0.766804 904 | 0.766640 905 | 0.766513 906 | 0.765853 907 | 0.765604 908 | 0.765247 909 | 0.765033 910 | 0.764525 911 | 0.763868 912 | 0.763589 913 | 0.763303 914 | 0.763255 915 | 0.762772 916 | 0.762657 917 | 0.762382 918 | 0.761943 919 | 0.761652 920 | 0.761166 921 | 0.760886 922 | 0.760642 923 | 0.760246 924 | 0.759796 925 | 0.759547 926 | 
0.759167 927 | 0.758572 928 | 0.758437 929 | 0.758402 930 | 0.757537 931 | 0.757399 932 | 0.757261 933 | 0.757044 934 | 0.756354 935 | 0.756024 936 | 0.755860 937 | 0.755357 938 | 0.755136 939 | 0.754750 940 | 0.754214 941 | 0.754005 942 | 0.753724 943 | 0.752996 944 | 0.752836 945 | 0.752400 946 | 0.752306 947 | 0.751759 948 | 0.751661 949 | 0.751330 950 | 0.751168 951 | 0.751020 952 | 0.750659 953 | 0.750007 954 | 0.749689 955 | 0.749495 956 | 0.749119 957 | 0.748759 958 | 0.748478 959 | 0.748065 960 | 0.747625 961 | 0.747449 962 | 0.746687 963 | 0.746264 964 | 0.746058 965 | 0.745328 966 | 0.744984 967 | 0.744437 968 | 0.744369 969 | 0.744013 970 | 0.743688 971 | 0.743510 972 | 0.743082 973 | 0.742683 974 | 0.742467 975 | 0.742365 976 | 0.742243 977 | 0.741437 978 | 0.741378 979 | 0.740992 980 | 0.740443 981 | 0.740272 982 | 0.739879 983 | 0.739771 984 | 0.739407 985 | 0.739154 986 | 0.738702 987 | 0.738091 988 | 0.737694 989 | 0.737644 990 | 0.737240 991 | 0.736978 992 | 0.736598 993 | 0.736027 994 | 0.735746 995 | 0.735229 996 | 0.734727 997 | 0.734338 998 | 0.734315 999 | 0.734027 1000 | 0.733939 1001 | 0.733623 1002 | 0.733333 1003 | 0.732575 1004 | 0.732473 1005 | 0.732394 1006 | 0.732105 1007 | 0.731576 1008 | 0.731172 1009 | 0.731164 1010 | 0.730483 1011 | 0.730308 1012 | 0.729689 1013 | 0.729551 1014 | 0.729166 1015 | 0.728784 1016 | 0.728480 1017 | 0.728378 1018 | 0.728030 1019 | 0.727821 1020 | 0.727293 1021 | 0.726755 1022 | 0.726355 1023 | 0.726085 1024 | 0.725702 1025 | 0.725261 1026 | 0.724964 1027 | 0.724439 1028 | 0.724318 1029 | 0.723856 1030 | 0.723428 1031 | 0.722977 1032 | 0.722882 1033 | 0.722623 1034 | 0.722001 1035 | 0.721677 1036 | 0.721337 1037 | 0.720897 1038 | 0.720502 1039 | 0.720374 1040 | 0.719693 1041 | 0.719497 1042 | 0.719073 1043 | 0.718917 1044 | 0.718117 1045 | 0.717986 1046 | 0.717598 1047 | 0.716888 1048 | 0.716489 1049 | 0.716429 1050 | 0.716233 1051 | 0.715416 1052 | 0.714722 1053 | 0.714685 1054 | 0.714282 1055 | 0.714057 
1056 | 0.713964 1057 | 0.713437 1058 | 0.713171 1059 | 0.712531 1060 | 0.712118 1061 | 0.711659 1062 | 0.711530 1063 | 0.711407 1064 | 0.711102 1065 | 0.710655 1066 | 0.710381 1067 | 0.709936 1068 | 0.709708 1069 | 0.709649 1070 | 0.708671 1071 | 0.708125 1072 | 0.707835 1073 | 0.707500 1074 | 0.707042 1075 | 0.706892 1076 | 0.706112 1077 | 0.706009 1078 | 0.705823 1079 | 0.705104 1080 | 0.704860 1081 | 0.704239 1082 | 0.703932 1083 | 0.703477 1084 | 0.703391 1085 | 0.702766 1086 | 0.702445 1087 | 0.702274 1088 | 0.701940 1089 | 0.701665 1090 | 0.700902 1091 | 0.700590 1092 | 0.700421 1093 | 0.700069 1094 | 0.699486 1095 | 0.699260 1096 | 0.698845 1097 | 0.698525 1098 | 0.698164 1099 | 0.697589 1100 | 0.697331 1101 | 0.697240 1102 | 0.696655 1103 | 0.695899 1104 | 0.695641 1105 | 0.695463 1106 | 0.695160 1107 | 0.695045 1108 | 0.693824 1109 | 0.693280 1110 | 0.693161 1111 | 0.692488 1112 | 0.692260 1113 | 0.691847 1114 | 0.691383 1115 | 0.691129 1116 | 0.690650 1117 | 0.690521 1118 | 0.690001 1119 | 0.689281 1120 | 0.689164 1121 | 0.688493 1122 | 0.688327 1123 | 0.687731 1124 | 0.687431 1125 | 0.686862 1126 | 0.686461 1127 | 0.686314 1128 | 0.685825 1129 | 0.685344 1130 | 0.684978 1131 | 0.684476 1132 | 0.684136 1133 | 0.683983 1134 | 0.683262 1135 | 0.683166 1136 | 0.682451 1137 | 0.682255 1138 | 0.681984 1139 | 0.681503 1140 | 0.680910 1141 | 0.680809 1142 | 0.679644 1143 | 0.679039 1144 | 0.678826 1145 | 0.678661 1146 | 0.678207 1147 | 0.677428 1148 | 0.677068 1149 | 0.676767 1150 | 0.675457 1151 | 0.675406 1152 | 0.675308 1153 | 0.674391 1154 | 0.674118 1155 | 0.673968 1156 | 0.673589 1157 | 0.672906 1158 | 0.672320 1159 | 0.671887 1160 | 0.671262 1161 | 0.670848 1162 | 0.670659 1163 | 0.670100 1164 | 0.669492 1165 | 0.668869 1166 | 0.668393 1167 | 0.667479 1168 | 0.667404 1169 | 0.666943 1170 | 0.666272 1171 | 0.665854 1172 | 0.665310 1173 | 0.665164 1174 | 0.664333 1175 | 0.663281 1176 | 0.662910 1177 | 0.662375 1178 | 0.661634 1179 | 0.661110 1180 | 0.660643 
1181 | 0.660077 1182 | 0.659467 1183 | 0.658888 1184 | 0.658330 1185 | 0.658068 1186 | 0.657553 1187 | 0.657307 1188 | 0.656757 1189 | 0.655750 1190 | 0.654891 1191 | 0.654242 1192 | 0.653675 1193 | 0.653497 1194 | 0.653385 1195 | 0.652308 1196 | 0.651295 1197 | 0.650471 1198 | 0.649780 1199 | 0.649056 1200 | 0.648642 1201 | 0.647491 1202 | 0.647090 1203 | 0.646993 1204 | 0.645525 1205 | 0.644970 1206 | 0.644049 1207 | 0.642704 1208 | 0.642170 1209 | 0.641427 1210 | 0.640678 1211 | 0.639895 1212 | 0.639148 1213 | 0.638247 1214 | 0.636725 1215 | 0.636166 1216 | 0.635749 1217 | 0.633287 1218 | 0.631706 1219 | 0.631330 1220 | 0.631145 1221 | 0.630361 1222 | 0.629449 1223 | 0.627495 1224 | 0.626398 1225 | 0.624864 1226 | 0.623195 1227 | 0.622484 1228 | 0.620204 1229 | 0.619257 1230 | 0.618031 1231 | 0.616679 1232 | 0.614482 1233 | 0.612315 1234 | 0.609651 1235 | 0.606166 1236 | 0.605721 1237 | 0.601163 1238 | 0.600258 1239 | 0.598812 1240 | 0.597943 1241 | 0.595561 1242 | 0.594310 1243 | 0.591526 1244 | 0.583090 1245 | 0.581623 1246 | 0.580843 1247 | 0.577479 1248 | 0.575503 1249 | 0.572902 1250 | 0.571719 1251 | 0.564517 1252 | 0.558561 1253 | 0.556191 1254 | 0.549372 1255 | 0.540657 1256 | 0.515586 1257 | 0.508704 1258 | -0.000000 1259 | --------------------------------------------------------------------------------