├── img
├── MA1_affinities.png
├── f3_phylogeny.png
├── f4_phylogeny.png
├── Patterson_2012_ANEfig.png
├── Patterson_2012_table.png
└── outgroupf3_phylogeny.png
├── .gitignore
├── f4_param.txt
├── adm_f3_popfile.txt
├── adm_f3_param.txt
├── outgroup_f3_param_Han.txt
├── outgroup_f3_param_MA1.txt
├── f4_popfile.txt
├── pca.AllEurasia.params.txt
├── pca.WestEurasia.params.txt
├── supp
├── tasks.sh
├── convertf_param.txt
├── WestEurasia.poplist.txt
├── AllEurasia.poplist.txt
└── poplist.txt
├── outgroup_f3_popfile_Han.txt
├── outgroup_f3_popfile_MA1.txt
├── README.md
├── population_frequencies.txt
├── f3_outgroup_stats_Han.txt
├── f3_outgroup_stats_MA1.txt
├── 04_Rmd_plotting_pca.Rmd
├── 03_Rmd_smartpca.Rmd
├── pca.WestEurasia.eval
├── 01_bashnb_getting_started.ipynb
├── 03_bashnb_smartpca.ipynb
├── 05_Rmd_fstatistics.Rmd
└── pca.AllEurasia.eval
/img/MA1_affinities.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nevrome/popgen_course/master/img/MA1_affinities.png
--------------------------------------------------------------------------------
/img/f3_phylogeny.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nevrome/popgen_course/master/img/f3_phylogeny.png
--------------------------------------------------------------------------------
/img/f4_phylogeny.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nevrome/popgen_course/master/img/f4_phylogeny.png
--------------------------------------------------------------------------------
/img/Patterson_2012_ANEfig.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nevrome/popgen_course/master/img/Patterson_2012_ANEfig.png
--------------------------------------------------------------------------------
/img/Patterson_2012_table.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nevrome/popgen_course/master/img/Patterson_2012_table.png
--------------------------------------------------------------------------------
/img/outgroupf3_phylogeny.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nevrome/popgen_course/master/img/outgroupf3_phylogeny.png
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .ipynb_checkpoints/
2 |
3 | # rendered documents
4 | *.html
5 |
6 | # data
7 | data/*
8 |
9 | # R
10 | .Rhistory
11 |
--------------------------------------------------------------------------------
/f4_param.txt:
--------------------------------------------------------------------------------
1 | genotypename: /data/popgen_course/HumanOrigins_FennoScandian_small.geno
2 | snpname: /data/popgen_course/HumanOrigins_FennoScandian_small.snp
3 | indivname: /data/popgen_course/HumanOrigins_FennoScandian_small.ind
4 | popfilename: f4_popfile.txt
5 | f4mode: YES
--------------------------------------------------------------------------------
/adm_f3_popfile.txt:
--------------------------------------------------------------------------------
1 | Nganasan French Finnish
2 | Nganasan Icelandic Finnish
3 | Nganasan Lithuanian Finnish
4 | Nganasan Norwegian Finnish
5 | BolshoyOleniOstrov French Finnish
6 | BolshoyOleniOstrov Icelandic Finnish
7 | BolshoyOleniOstrov Lithuanian Finnish
8 | BolshoyOleniOstrov Norwegian Finnish
--------------------------------------------------------------------------------
/adm_f3_param.txt:
--------------------------------------------------------------------------------
1 | genotypename: /data/popgen_course/HumanOrigins_FennoScandian_small.geno
2 | snpname: /data/popgen_course/HumanOrigins_FennoScandian_small.snp
3 | indivname: /data/popgen_course/HumanOrigins_FennoScandian_small.ind
4 | popfilename: adm_f3_popfile.txt
5 | inbreed: YES
--------------------------------------------------------------------------------
/outgroup_f3_param_Han.txt:
--------------------------------------------------------------------------------
1 | genotypename: /data/popgen_course/HumanOrigins_FennoScandian_small.geno
2 | snpname: /data/popgen_course/HumanOrigins_FennoScandian_small.snp
3 | indivname: /data/popgen_course/HumanOrigins_FennoScandian_small.ind
4 | popfilename: outgroup_f3_popfile_Han.txt
--------------------------------------------------------------------------------
/outgroup_f3_param_MA1.txt:
--------------------------------------------------------------------------------
1 | genotypename: /data/popgen_course/HumanOrigins_FennoScandian_small.geno
2 | snpname: /data/popgen_course/HumanOrigins_FennoScandian_small.snp
3 | indivname: /data/popgen_course/HumanOrigins_FennoScandian_small.ind
4 | popfilename: outgroup_f3_popfile_MA1.txt
--------------------------------------------------------------------------------
/f4_popfile.txt:
--------------------------------------------------------------------------------
1 | Mbuti Nganasan French Finnish
2 | Mbuti Nganasan Icelandic Finnish
3 | Mbuti Nganasan Lithuanian Finnish
4 | Mbuti Nganasan Norwegian Finnish
5 | Mbuti BolshoyOleniOstrov French Finnish
6 | Mbuti BolshoyOleniOstrov Icelandic Finnish
7 | Mbuti BolshoyOleniOstrov Lithuanian Finnish
8 | Mbuti BolshoyOleniOstrov Norwegian Finnish
--------------------------------------------------------------------------------
/pca.AllEurasia.params.txt:
--------------------------------------------------------------------------------
1 | genotypename: /data/popgen_course/genotypes_small.geno
2 | snpname: /data/popgen_course/genotypes_small.snp
3 | indivname: /data/popgen_course/genotypes_small.ind
4 | evecoutname: pca.AllEurasia.evec
5 | evaloutname: pca.AllEurasia.eval
6 | poplistname: /data/popgen_course/AllEurasia.poplist.txt
7 | lsqproject: YES
8 | numoutevec: 4
9 | numthreads: 1
10 |
--------------------------------------------------------------------------------
/pca.WestEurasia.params.txt:
--------------------------------------------------------------------------------
1 | genotypename: data/popgen_course/genotypes_small.geno
2 | snpname: data/popgen_course/genotypes_small.snp
3 | indivname: data/popgen_course/genotypes_small.ind
4 | evecoutname: pca.WestEurasia.evec
5 | evaloutname: pca.WestEurasia.eval
6 | poplistname: data/popgen_course/WestEurasia.poplist.txt
7 | lsqproject: YES
8 | numoutevec: 4
9 | numthreads: 1
10 |
--------------------------------------------------------------------------------
/supp/tasks.sh:
--------------------------------------------------------------------------------
1 | OUT=~/Data/workshop_dataset_prep # local staging directory; the tilde is expanded at assignment time
2 | mkdir -p "$OUT"
3 |
4 | # Copy genotyping data from Thiseas (.ind/.geno/.snp files fetched via scp from host "sdag"):
5 | D=/projects1/AncientFinnish/DataFreeze20_07_17/results/calls/PublishedOnly.HO.1240K.Ancients+Saami
6 | scp "sdag:${D}.ind" "$OUT/HumanOrigins_FennoScandian.ind"
7 | scp "sdag:${D}.geno" "$OUT/HumanOrigins_FennoScandian.geno"
8 | scp "sdag:${D}.snp" "$OUT/HumanOrigins_FennoScandian.snp"
9 |
10 | # Extract smaller dataset (convertf_param.txt is looked up relative to the current directory)
11 | convertf -p convertf_param.txt
12 |
13 |
--------------------------------------------------------------------------------
/supp/convertf_param.txt:
--------------------------------------------------------------------------------
1 | genotypename: /Users/schiffels/Data/workshop_dataset_prep/HumanOrigins_FennoScandian.geno
2 | snpname: /Users/schiffels/Data/workshop_dataset_prep/HumanOrigins_FennoScandian.snp
3 | indivname: /Users/schiffels/Data/workshop_dataset_prep/HumanOrigins_FennoScandian.ind
4 | outputformat: EIGENSTRAT
5 | genotypeoutname: /Users/schiffels/Data/workshop_dataset_prep/HumanOrigins_FennoScandian_small.geno
6 | snpoutname: /Users/schiffels/Data/workshop_dataset_prep/HumanOrigins_FennoScandian_small.snp
7 | indivoutname: /Users/schiffels/Data/workshop_dataset_prep/HumanOrigins_FennoScandian_small.ind
8 | poplistname: poplist.txt
9 |
--------------------------------------------------------------------------------
/outgroup_f3_popfile_Han.txt:
--------------------------------------------------------------------------------
1 | Han Chuvash Mbuti
2 | Han Albanian Mbuti
3 | Han Armenian Mbuti
4 | Han Bulgarian Mbuti
5 | Han Czech Mbuti
6 | Han Druze Mbuti
7 | Han English Mbuti
8 | Han Estonian Mbuti
9 | Han Finnish Mbuti
10 | Han French Mbuti
11 | Han Georgian Mbuti
12 | Han Greek Mbuti
13 | Han Hungarian Mbuti
14 | Han Icelandic Mbuti
15 | Han Italian_North Mbuti
16 | Han Italian_South Mbuti
17 | Han Lithuanian Mbuti
18 | Han Maltese Mbuti
19 | Han Mordovian Mbuti
20 | Han Norwegian Mbuti
21 | Han Orcadian Mbuti
22 | Han Russian Mbuti
23 | Han Sardinian Mbuti
24 | Han Scottish Mbuti
25 | Han Sicilian Mbuti
26 | Han Spanish_North Mbuti
27 | Han Spanish Mbuti
28 | Han Ukrainian Mbuti
29 | Han Levanluhta Mbuti
30 | Han BolshoyOleniOstrov Mbuti
31 | Han ChalmnyVarre Mbuti
32 | Han Saami.DG Mbuti
--------------------------------------------------------------------------------
/outgroup_f3_popfile_MA1.txt:
--------------------------------------------------------------------------------
1 | MA1_HG.SG Chuvash Mbuti
2 | MA1_HG.SG Albanian Mbuti
3 | MA1_HG.SG Armenian Mbuti
4 | MA1_HG.SG Bulgarian Mbuti
5 | MA1_HG.SG Czech Mbuti
6 | MA1_HG.SG Druze Mbuti
7 | MA1_HG.SG English Mbuti
8 | MA1_HG.SG Estonian Mbuti
9 | MA1_HG.SG Finnish Mbuti
10 | MA1_HG.SG French Mbuti
11 | MA1_HG.SG Georgian Mbuti
12 | MA1_HG.SG Greek Mbuti
13 | MA1_HG.SG Hungarian Mbuti
14 | MA1_HG.SG Icelandic Mbuti
15 | MA1_HG.SG Italian_North Mbuti
16 | MA1_HG.SG Italian_South Mbuti
17 | MA1_HG.SG Lithuanian Mbuti
18 | MA1_HG.SG Maltese Mbuti
19 | MA1_HG.SG Mordovian Mbuti
20 | MA1_HG.SG Norwegian Mbuti
21 | MA1_HG.SG Orcadian Mbuti
22 | MA1_HG.SG Russian Mbuti
23 | MA1_HG.SG Sardinian Mbuti
24 | MA1_HG.SG Scottish Mbuti
25 | MA1_HG.SG Sicilian Mbuti
26 | MA1_HG.SG Spanish_North Mbuti
27 | MA1_HG.SG Spanish Mbuti
28 | MA1_HG.SG Ukrainian Mbuti
29 | MA1_HG.SG Levanluhta Mbuti
30 | MA1_HG.SG BolshoyOleniOstrov Mbuti
31 | MA1_HG.SG ChalmnyVarre Mbuti
32 | MA1_HG.SG Saami.DG Mbuti
--------------------------------------------------------------------------------
/supp/WestEurasia.poplist.txt:
--------------------------------------------------------------------------------
1 | Chuvash
2 | Abkhasian
3 | Adygei
4 | Albanian
5 | Armenian
6 | Assyrian
7 | Balkar
8 | Basque
9 | BedouinA
10 | BedouinB
11 | Belarusian
12 | Bulgarian
13 | Canary_Islander
14 | Chechen
15 | Croatian
16 | Cypriot
17 | Czech
18 | Druze
19 | English
20 | Estonian
21 | Finnish
22 | French
23 | Georgian
24 | German
25 | Greek
26 | Hungarian
27 | Icelandic
28 | Iranian
29 | Irish
30 | Irish_Ulster
31 | Italian_North
32 | Italian_South
33 | Jew_Ashkenazi
34 | Jew_Georgian
35 | Jew_Iranian
36 | Jew_Iraqi
37 | Jew_Libyan
38 | Jew_Moroccan
39 | Jew_Tunisian
40 | Jew_Turkish
41 | Jew_Yemenite
42 | Jordanian
43 | Kumyk
44 | Lebanese_Christian
45 | Lebanese
46 | Lebanese_Muslim
47 | Lezgin
48 | Lithuanian
49 | Maltese
50 | Mordovian
51 | North_Ossetian
52 | Norwegian
53 | Orcadian
54 | Palestinian
55 | Polish
56 | Romanian
57 | Russian
58 | Sardinian
59 | Saudi
60 | Scottish
61 | Shetlandic
62 | Sicilian
63 | Sorb
64 | Spanish_North
65 | Spanish
66 | Syrian
67 | Turkish
68 | Ukrainian
69 |
--------------------------------------------------------------------------------
/supp/AllEurasia.poplist.txt:
--------------------------------------------------------------------------------
1 | Abkhasian
2 | Adygei
3 | Albanian
4 | Aleut
5 | Aleut_Tlingit
6 | Altaian
7 | Ami
8 | Armenian
9 | Assyrian
10 | Atayal
11 | Avar
12 | Azeri
13 | Balkar
14 | Basque
15 | BedouinA
16 | BedouinB
17 | Belarusian
18 | Borneo
19 | Bulgarian
20 | Buryat
21 | Cambodian
22 | Chechen
23 | Chukchi
24 | Chukchi1
25 | Chuvash
26 | Croatian
27 | Cypriot
28 | Czech
29 | Dai
30 | Daur
31 | Dolgan
32 | Druze
33 | English
34 | Eskimo_ChaplinSireniki
35 | Eskimo_Naukan
36 | Estonian
37 | Even
38 | Finnish
39 | French
40 | Georgian
41 | German
42 | Greek
43 | Han
44 | Hezhen
45 | Hungarian
46 | Icelandic
47 | Iranian
48 | Italian_North
49 | Italian_South
50 | Itelmen
51 | Japanese
52 | Jew_Ashkenazi
53 | Jew_Georgian
54 | Jew_Iranian
55 | Jew_Iraqi
56 | Jew_Libyan
57 | Jew_Moroccan
58 | Jew_Tunisian
59 | Jew_Turkish
60 | Jew_Yemenite
61 | Jordanian
62 | Kalmyk
63 | Kinh
64 | Korean
65 | Koryak
66 | Kumyk
67 | Kurd
68 | Kyrgyz
69 | Lahu
70 | Lebanese
71 | Lezgin
72 | Lithuanian
73 | Maltese
74 | Mansi
75 | Miao
76 | Mongol
77 | Mongola
78 | Mordovian
79 | Naxi
80 | Nganasan
81 | Nogai
82 | North_Ossetian
83 | Norwegian
84 | Orcadian
85 | Oroqen
86 | Ossetian
87 | Palestinian
88 | Polish
89 | Russian
90 | Saami.DG
91 | Saami_WGA
92 | Sardinian
93 | Saudi
94 | Scottish
95 | Selkup
96 | Semende
97 | She
98 | Sherpa.DG
99 | Sicilian
100 | Spanish
101 | Spanish_North
102 | Syrian
103 | Tajik
104 | Thai
105 | Tibetan.DG
106 | Tu
107 | Tubalar
108 | Tujia
109 | Turkish
110 | Turkmen
111 | Tuvinian
112 | Ukrainian
113 | Ulchi
114 | Uygur
115 | Uzbek
116 | Xibo
117 | Yakut
118 | Yi
119 | Yukagir
120 |
--------------------------------------------------------------------------------
/supp/poplist.txt:
--------------------------------------------------------------------------------
1 | Abkhasian
2 | Adygei
3 | Albanian
4 | Aleut
5 | Aleut_Tlingit
6 | Altaian
7 | Ami
8 | Armenian
9 | Assyrian
10 | Atayal
11 | Avar
12 | Azeri
13 | Balkar
14 | Basque
15 | BedouinA
16 | BedouinB
17 | Belarusian
18 | BolshoyOleniOstrov
19 | Borneo
20 | Bulgarian
21 | Buryat
22 | Cambodian
23 | Canary_Islander
24 | ChalmnyVarre
25 | Chechen
26 | Chukchi
27 | Chukchi1
28 | Chuvash
29 | Croatian
30 | Cypriot
31 | Czech
32 | Dai
33 | Daur
34 | Dolgan
35 | Druze
36 | English
37 | Eskimo_ChaplinSireniki
38 | Eskimo_Naukan
39 | Estonian
40 | Even
41 | Finnish
42 | French
43 | Georgian
44 | German
45 | Greek
46 | Han
47 | Hezhen
48 | Hungarian
49 | Icelandic
50 | Iranian
51 | Irish
52 | Irish_Ulster
53 | Italian_North
54 | Italian_South
55 | Itelmen
56 | Japanese
57 | Jew_Ashkenazi
58 | Jew_Georgian
59 | Jew_Iranian
60 | Jew_Iraqi
61 | Jew_Libyan
62 | Jew_Moroccan
63 | Jew_Tunisian
64 | Jew_Turkish
65 | Jew_Yemenite
66 | Jordanian
67 | Kalmyk
68 | Kinh
69 | Korean
70 | Koryak
71 | Kumyk
72 | Kurd
73 | Kyrgyz
74 | Lahu
75 | Lebanese
76 | Lebanese_Christian
77 | Lebanese_Muslim
78 | Levanluhta
79 | Levanluhta_Outlier
80 | Lezgin
81 | LBK_EN
82 | Lithuanian
83 | Maltese
84 | Mansi
85 | MA1_HG.SG
86 | Mbuti
87 | Miao
88 | Mongol
89 | Mongola
90 | Mordovian
91 | Naxi
92 | Nganasan
93 | Nogai
94 | North_Ossetian
95 | Norwegian
96 | Orcadian
97 | Oroqen
98 | Ossetian
99 | Palestinian
100 | Polish
101 | Romanian
102 | Russian
103 | Saami.DG
104 | Saami_WGA
105 | Sardinian
106 | Saudi
107 | Scottish
108 | Selkup
109 | Semende
110 | She
111 | Sherpa.DG
112 | Shetlandic
113 | Sicilian
114 | Sorb
115 | Spanish
116 | Spanish_North
117 | Syrian
118 | Tajik
119 | Thai
120 | Tibetan.DG
121 | Tu
122 | Tubalar
123 | Tujia
124 | Turkish
125 | Turkmen
126 | Tuvinian
127 | Ukrainian
128 | Ulchi
129 | Uygur
130 | Uzbek
131 | WHG
132 | Xibo
133 | Yakut
134 | Yamnaya_Samara
135 | Yi
136 | Yukagir
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # popgen_course
2 | A course with Jupyter Notebooks for Computational Population Genetics
3 |
4 | by Stephan Schiffels
5 |
6 | *Disclaimer: This is still work in progress.*
7 |
8 | This repository contains several Jupyter Notebooks that I have used in the past for teaching various elements of population-genetic data analyses to students with no initial training in population genetics or Unix-based data analysis. It is definitely not yet fully self-contained and needs an experienced instructor to go through.
9 |
10 | Having said that, for someone reasonably experienced with Unix, it is possible to go through the chapters yourself. In that case, here are a few steps for setting up your environment to make these work:
11 |
12 | 1. Install [Jupyter](https://jupyter.org) notebooks with [Bash extension](https://github.com/takluyver/bash_kernel). You will also need [Eigensoft](https://github.com/DReichLab/EIG) and [ADMIXTOOLS](https://github.com/DReichLab/AdmixTools).
13 | 2. Clone this repository in your home directory running `git clone https://github.com/stschiff/popgen_course.git`
14 | 3. Download the genotype data needed for these exercises from [here](https://oc.gnz.mpg.de/owncloud/index.php/s/dT9KzFhLfunk3Tb). In my notebooks, I assume that this data has been downloaded into the directory `/data/popgen_course`.
15 |
16 | Having Jupyter installed, you can now simply open the Notebooks directly from within Jupyter, or you can access static versions of them here:
17 |
18 | 1. [Getting Started (Bash)](https://nbviewer.jupyter.org/github/stschiff/popgen_course/blob/master/01_bashnb_getting_started.ipynb)
19 | 2. [Getting Started (Python)](https://nbviewer.jupyter.org/github/stschiff/popgen_course/blob/master/02_pynb_getting_started.ipynb)
20 | 3. [Principal Components Analysis (Bash)](https://nbviewer.jupyter.org/github/stschiff/popgen_course/blob/master/03_bashnb_smartpca.ipynb)
21 | 4. [Principal Components Analysis (Python)](https://nbviewer.jupyter.org/github/stschiff/popgen_course/blob/master/04_pynb_plotting_pca.ipynb)
22 | 5. [F Statistics (Python)](https://nbviewer.jupyter.org/github/stschiff/popgen_course/blob/master/05_pynb_fstatistics.ipynb)
23 |
24 |
--------------------------------------------------------------------------------
/population_frequencies.txt:
--------------------------------------------------------------------------------
1 | 9 Abkhasian
2 | 16 Adygei
3 | 6 Albanian
4 | 7 Aleut
5 | 4 Aleut_Tlingit
6 | 7 Altaian
7 | 10 Ami
8 | 10 Armenian
9 | 9 Atayal
10 | 10 Balkar
11 | 29 Basque
12 | 25 BedouinA
13 | 19 BedouinB
14 | 10 Belarusian
15 | 6 BolshoyOleniOstrov
16 | 9 Borneo
17 | 10 Bulgarian
18 | 8 Cambodian
19 | 2 Canary_Islander
20 | 2 ChalmnyVarre
21 | 9 Chechen
22 | 20 Chukchi
23 | 3 Chukchi1
24 | 10 Chuvash
25 | 10 Croatian
26 | 8 Cypriot
27 | 10 Czech
28 | 10 Dai
29 | 9 Daur
30 | 4 Dolgan
31 | 39 Druze
32 | 10 English
33 | 10 Estonian
34 | 9 Even
35 | 8 Finnish
36 | 32 French
37 | 10 Georgian
38 | 20 Greek
39 | 43 Han
40 | 8 Hezhen
41 | 20 Hungarian
42 | 12 Icelandic
43 | 8 Iranian
44 | 20 Italian_North
45 | 1 Italian_South
46 | 6 Itelmen
47 | 29 Japanese
48 | 7 Jew_Ashkenazi
49 | 7 Jew_Georgian
50 | 9 Jew_Iranian
51 | 6 Jew_Iraqi
52 | 9 Jew_Libyan
53 | 6 Jew_Moroccan
54 | 7 Jew_Tunisian
55 | 8 Jew_Turkish
56 | 8 Jew_Yemenite
57 | 1 JK2065
58 | 9 Jordanian
59 | 10 Kalmyk
60 | 8 Kinh
61 | 6 Korean
62 | 9 Koryak
63 | 8 Kumyk
64 | 9 Kyrgyz
65 | 8 Lahu
66 | 14 LBK_EN
67 | 8 Lebanese
68 | 2 Levanluhta
69 | 9 Lezgin
70 | 10 Lithuanian
71 | 8 Maltese
72 | 8 Mansi
73 | 10 Miao
74 | 6 Mongola
75 | 10 Mordovian
76 | 9 Naxi
77 | 11 Nganasan
78 | 9 Nogai
79 | 11 Norwegian
80 | 13 Orcadian
81 | 9 Oroqen
82 | 10 Ossetian
83 | 38 Palestinian
84 | 22 Russian
85 | 2 Saami.DG
86 | 1 Saami_WGA
87 | 27 Sardinian
88 | 8 Saudi
89 | 4 Scottish
90 | 10 Selkup
91 | 10 Semende
92 | 10 She
93 | 2 Sherpa.DG
94 | 11 Sicilian
95 | 53 Spanish
96 | 5 Spanish_North
97 | 8 Syrian
98 | 8 Tajik
99 | 10 Thai
100 | 2 Tibetan.DG
101 | 10 Tu
102 | 22 Tubalar
103 | 10 Tujia
104 | 50 Turkish
105 | 7 Turkmen
106 | 10 Tuvinian
107 | 9 Ukrainian
108 | 25 Ulchi
109 | 10 Uygur
110 | 10 Uzbek
111 | 3 WHG
112 | 7 Xibo
113 | 20 Yakut
114 | 9 Yamnaya_Samara
115 | 10 Yi
116 | 19 Yukagir
117 |
--------------------------------------------------------------------------------
/f3_outgroup_stats_Han.txt:
--------------------------------------------------------------------------------
1 | result: Han Chuvash Mbuti 0.233652 0.002072 112.782 502678
2 | result: Han Albanian Mbuti 0.215629 0.002029 106.291 501734
3 | result: Han Armenian Mbuti 0.213724 0.001963 108.882 504370
4 | result: Han Bulgarian Mbuti 0.216193 0.001979 109.266 504310
5 | result: Han Czech Mbuti 0.218060 0.002002 108.939 504089
6 | result: Han Druze Mbuti 0.209551 0.001919 109.205 510853
7 | result: Han English Mbuti 0.216959 0.001973 109.954 504161
8 | result: Han Estonian Mbuti 0.220730 0.002019 109.332 503503
9 | result: Han Finnish Mbuti 0.223447 0.002044 109.345 502217
10 | result: Han French Mbuti 0.216623 0.001969 110.012 509613
11 | result: Han Georgian Mbuti 0.214295 0.001935 110.721 503598
12 | result: Han Greek Mbuti 0.215203 0.001984 108.465 507475
13 | result: Han Hungarian Mbuti 0.217894 0.001999 109.004 507409
14 | result: Han Icelandic Mbuti 0.218683 0.002015 108.553 504655
15 | result: Han Italian_North Mbuti 0.215332 0.001978 108.854 507589
16 | result: Han Italian_South Mbuti 0.211787 0.002271 93.265 492400
17 | result: Han Lithuanian Mbuti 0.219615 0.002032 108.098 503681
18 | result: Han Maltese Mbuti 0.210359 0.001956 107.542 503985
19 | result: Han Mordovian Mbuti 0.223469 0.002008 111.296 503441
20 | result: Han Norwegian Mbuti 0.218873 0.002023 108.197 504621
21 | result: Han Orcadian Mbuti 0.217773 0.002014 108.115 504993
22 | result: Han Russian Mbuti 0.223993 0.001995 112.274 506525
23 | result: Han Sardinian Mbuti 0.213230 0.001980 107.711 508413
24 | result: Han Scottish Mbuti 0.218489 0.002039 107.145 499784
25 | result: Han Sicilian Mbuti 0.212272 0.001975 107.486 505477
26 | result: Han Spanish_North Mbuti 0.215885 0.002029 106.383 500853
27 | result: Han Spanish Mbuti 0.213869 0.001975 108.297 513648
28 | result: Han Ukrainian Mbuti 0.218716 0.002007 108.950 503981
29 | result: Han Levanluhta Mbuti 0.236252 0.002383 99.123 263049
30 | result: Han BolshoyOleniOstrov Mbuti 0.247814 0.002177 113.849 457102
31 | result: Han ChalmnyVarre Mbuti 0.233499 0.002304 101.345 366220
32 | result: Han Saami.DG Mbuti 0.236198 0.002274 103.852 489038
--------------------------------------------------------------------------------
/f3_outgroup_stats_MA1.txt:
--------------------------------------------------------------------------------
1 | result: MA1_HG.SG Chuvash Mbuti 0.243818 0.002349 103.781 350484
2 | result: MA1_HG.SG Albanian Mbuti 0.236494 0.002296 103.008 344332
3 | result: MA1_HG.SG Armenian Mbuti 0.231399 0.002264 102.229 349612
4 | result: MA1_HG.SG Bulgarian Mbuti 0.237498 0.002281 104.103 349800
5 | result: MA1_HG.SG Czech Mbuti 0.243224 0.002328 104.457 349553
6 | result: MA1_HG.SG Druze Mbuti 0.226740 0.002197 103.193 359004
7 | result: MA1_HG.SG English Mbuti 0.243135 0.002317 104.941 349321
8 | result: MA1_HG.SG Estonian Mbuti 0.247065 0.002362 104.619 348861
9 | result: MA1_HG.SG Finnish Mbuti 0.245684 0.002379 103.266 347208
10 | result: MA1_HG.SG French Mbuti 0.240235 0.002269 105.886 357842
11 | result: MA1_HG.SG Georgian Mbuti 0.232645 0.002253 103.243 349082
12 | result: MA1_HG.SG Greek Mbuti 0.236566 0.002280 103.757 355261
13 | result: MA1_HG.SG Hungarian Mbuti 0.241720 0.002313 104.483 355340
14 | result: MA1_HG.SG Icelandic Mbuti 0.244488 0.002386 102.481 350287
15 | result: MA1_HG.SG Italian_North Mbuti 0.236407 0.002273 104.002 354999
16 | result: MA1_HG.SG Italian_South Mbuti 0.230839 0.002767 83.427 321217
17 | result: MA1_HG.SG Lithuanian Mbuti 0.246864 0.002403 102.718 348656
18 | result: MA1_HG.SG Maltese Mbuti 0.230200 0.002259 101.903 347725
19 | result: MA1_HG.SG Mordovian Mbuti 0.245284 0.002346 104.571 350058
20 | result: MA1_HG.SG Norwegian Mbuti 0.243930 0.002301 106.031 350182
21 | result: MA1_HG.SG Orcadian Mbuti 0.243614 0.002320 105.008 351053
22 | result: MA1_HG.SG Russian Mbuti 0.245212 0.002298 106.698 355953
23 | result: MA1_HG.SG Sardinian Mbuti 0.231967 0.002264 102.449 355548
24 | result: MA1_HG.SG Scottish Mbuti 0.244598 0.002434 100.512 339441
25 | result: MA1_HG.SG Sicilian Mbuti 0.231141 0.002260 102.297 351028
26 | result: MA1_HG.SG Spanish_North Mbuti 0.238479 0.002426 98.319 341661
27 | result: MA1_HG.SG Spanish Mbuti 0.235386 0.002257 104.293 361951
28 | result: MA1_HG.SG Ukrainian Mbuti 0.243551 0.002345 103.881 348948
29 | result: MA1_HG.SG Levanluhta Mbuti 0.247640 0.003030 81.728 174148
30 | result: MA1_HG.SG BolshoyOleniOstrov Mbuti 0.256041 0.002624 97.561 305851
31 | result: MA1_HG.SG ChalmnyVarre Mbuti 0.249619 0.002862 87.212 239594
32 | result: MA1_HG.SG Saami.DG Mbuti 0.251530 0.002622 95.922 326072
33 |
--------------------------------------------------------------------------------
/04_Rmd_plotting_pca.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Principal Component Plots"
3 | output: html_document
4 | editor_options:
5 | chunk_output_type: console
6 | ---
7 |
8 | ```{r, echo=FALSE}
9 | knitr::opts_chunk$set(message = FALSE)
10 | ```
11 |
12 | ```{r}
13 | library(magrittr)
14 | ```
15 |
16 | For this chapter, you will need the PCA results that we ran in the last chapter. I have actually included the output files of my runs into this repository, so you can just use them if something didn't work in the previous chapter.
17 |
18 | For making plots in R, one of the most popular libraries around is ggplot2. You can load it via:
19 |
20 | ```{r}
21 | library(ggplot2)
22 | ```
23 |
24 | Let's have a look at the main results file from smartpca:
25 |
26 | ```{r, warning=F}
27 | pcaDat <- readr::read_delim("pca.WestEurasia.evec", delim = " ", trim_ws = TRUE) # smartpca eigenvector output, West-Eurasian run
28 | pcaDat2 <- readr::read_delim("pca.AllEurasia.evec", delim = " ", trim_ws = TRUE) # smartpca eigenvector output, all-Eurasian run
29 | ```
30 |
31 | The first row contains the eigenvalues for the first 4 principal components (PCs), and all further rows contain the PC coordinates for each individual. The first column contains the name of each individual, the last column the population. To load this dataset with R, we used the read_delim() function from the readr package. We can now change the column headers:
32 |
33 | ```{r}
34 | colnames(pcaDat) <- colnames(pcaDat2) <- c("Name", "PC1", "PC2", "PC3", "PC4", "Group")
35 | ```
36 |
37 | Looking at the data, we find that it is a tibble (a better data.frame), with each individual on one row, and the columns denoting the first 4 principal components. The last column contains the population for each individual:
38 |
39 | ```{r}
40 | pcaDat
41 | ```
42 |
43 | We can quickly plot the first two PCs for all individuals:
44 |
45 | ```{r}
46 | pcaDat %>%
47 | ggplot() +
48 | geom_point(aes(x = PC1, y = PC2))
49 | ```
50 |
51 | which is not very helpful, because we can't see where each population falls. We can highlight a few populations to get a bit more of a feeling:
52 |
53 | ```{r}
54 | # Populations to pick out in colour; all other individuals get the default point colour.
55 | highlighted <- c("Finnish", "Sardinian", "Armenian", "BedouinB")
56 | is_highlighted <- pcaDat$Group %in% highlighted
57 | ggplot() +
58 | geom_point(data = pcaDat[!is_highlighted, ], aes(x = PC1, y = PC2)) +
59 | geom_point(
60 | data = pcaDat[is_highlighted, ],
61 | aes(x = PC1, y = PC2, color = Group)
62 | )
63 | ```
64 |
65 | ## Showing all populations
66 |
67 | OK, but how do we systematically show all the populations? There are too many of those to separate them all by different colors, or by different symbols, so we need to combine colours and symbols and use all the combinations of them to show all the populations.
68 |
69 | ```{r}
70 | populations <- readr::read_csv("data/popgen_course/WestEurasia.poplist.txt", col_names = FALSE)$X1 # one population name per line; X1 is readr's default name for the single column
71 | ```
72 |
73 | ```{r, fig.height=10}
74 | pcaDat %>% # one point per individual, restricted to the populations named in the poplist
75 | dplyr::filter(Group %in% populations) %>%
76 | ggplot() +
77 | geom_point(aes(
78 | x = PC1, y = PC2,
79 | color = Group, shape = Group
80 | )) +
81 | scale_shape_manual(values = rep(0:18, length.out = length(populations))) + # cycle shapes 0-18; sized from the poplist so the scale can never run short of values
82 | theme(legend.position = "bottom")
83 | ```
84 |
85 | ## Adding ancient populations
86 |
87 | Of course, until now we haven't yet included any of the actual ancient test individuals that we want to analyse.
88 |
89 | We add the following ancient populations to this plot:
90 |
91 | - Levanluhta (two individuals from Finland from the first millennium AD)
92 | - BolshoyOleniOstrov (a group of 3500 year old individuals from Northern Russia).
93 | - WHG (short for Western Hunter-Gatherers, about 8000 years ago)
94 | - LBK_EN (short for Linearbandkeramik Early Neolithic, from about 6,000 years ago)
95 | - Yamnaya_Samara, a late Neolithic population from the Russian Steppe, about 4,800 years ago.
96 |
97 | The first two populations are from a publication on ancient Fennoscandian genomes ([Lamnidis et al. 2018](https://www.nature.com/articles/s41467-018-07483-5)), and are instructive to understand what PCA can be used for. The latter three populations are from two famous publications ([Lazaridis et al. 2014](https://www.nature.com/articles/nature13673) and [Haak et al. 2015](https://www.nature.com/articles/nature14317)). It can be shown that modern European genetic diversity is formed by a mix of three ancestries represented by these ancient groups. To highlight these ancient populations, we plot them in black and using different symbols. While we're at it, we should also add the population called "Saami.DG":
98 |
99 | ```{r, fig.height=10}
100 | ancient_populations <- c("Levanluhta", "BolshoyOleniOstrov", "WHG", "LBK_EN", "Yamnaya_Samara", "Saami.DG") # groups overlaid in black and labelled
101 |
102 | ggplot() +
103 | geom_point( # layer 1: populations from the poplist, colour and shape per group
104 | data = pcaDat %>% dplyr::filter(Group %in% populations),
105 | mapping = aes(
106 | x = PC1, y = PC2,
107 | color = Group, shape = Group
108 | )
109 | ) +
110 | geom_point( # layer 2: the overlaid groups, fixed black colour and shape 15
111 | data = pcaDat %>% dplyr::filter(Group %in% ancient_populations),
112 | mapping = aes(
113 | x = PC1, y = PC2
114 | ),
115 | color = "black", shape = 15
116 | ) +
117 | ggrepel::geom_label_repel( # layer 3: one label per overlaid group, placed at the group's mean PC1/PC2
118 | data = pcaDat %>% dplyr::filter(Group %in% ancient_populations) %>%
119 | dplyr::group_by(Group) %>%
120 | dplyr::summarise(PC1 = mean(PC1), PC2 = mean(PC2)),
121 | mapping = aes(
122 | x = PC1, y = PC2, label = Group
123 | )
124 | ) +
125 | scale_shape_manual(values = rep(0:14, length.out = length(populations))) + # cycle shapes 0-14 (15 is used by layer 2); sized from the poplist instead of a hard-coded count
126 | theme(legend.position = "bottom")
127 | ```
128 |
129 | OK, so what are we looking at? This is quite a rich plot, of course, and we won't discuss all the details here. I just want to highlight two things. First, you can see that most present-day Europeans are scattered in a relatively tight space in the center of a triangle spanned by the WHG on the lower left, LBK_EN on the lower right (seen from European points) and by Yamnaya_Samara (top). Indeed, a widely-accepted model for present-day Europeans assumes these three ancient source populations for all Europeans ([Lazaridis et al. 2014](https://www.nature.com/articles/nature13673) and [Haak et al. 2015](https://www.nature.com/articles/nature14317)).
130 |
131 | The second thing that is noteworthy here is that present-day people from Northeastern Europe, such as Finns, Saami and other Uralic-speaking populations are "dragged" towards the ancient samples from Bolshoy Oleni Ostrov. Indeed, a recent model published by us assumes that "Siberian" genetic ancestry entered Europe around 4000 years ago as a kind of fourth genetic component on top of the three other components discussed above, and is nowadays found in most Uralic speakers in Europe, including Finns, Saami and Estonians.
132 |
133 | ## East-Eurasian PCA
134 |
135 | We can make a similar plot using the all-Eurasian PCA that we have run:
136 |
137 | ```{r}
138 | populations <- readr::read_csv("data/popgen_course/AllEurasia.poplist.txt", col_names = FALSE)$X1 # replaces the West-Eurasian list; one population name per line
139 | ```
140 |
141 | ```{r, fig.height=12}
142 | ggplot() +
143 | geom_point( # layer 1: populations from the poplist, colour and shape per group
144 | data = pcaDat2 %>% dplyr::filter(Group %in% populations),
145 | mapping = aes(
146 | x = PC1, y = PC2,
147 | color = Group, shape = Group
148 | )
149 | ) +
150 | geom_point( # layer 2: the overlaid groups, fixed black colour and shape 15
151 | data = pcaDat2 %>% dplyr::filter(Group %in% ancient_populations),
152 | mapping = aes(
153 | x = PC1, y = PC2
154 | ),
155 | color = "black", shape = 15
156 | ) +
157 | ggrepel::geom_label_repel( # layer 3: one label per overlaid group, placed at the group's mean PC1/PC2
158 | data = pcaDat2 %>% dplyr::filter(Group %in% ancient_populations) %>%
159 | dplyr::group_by(Group) %>%
160 | dplyr::summarise(PC1 = mean(PC1), PC2 = mean(PC2)),
161 | mapping = aes(
162 | x = PC1, y = PC2, label = Group
163 | )
164 | ) +
165 | scale_shape_manual(values = rep(0:14, length.out = length(populations))) + # cycle shapes 0-14 (15 is used by layer 2); sized from the poplist instead of a hard-coded count
166 | theme(legend.position = "bottom")
167 | ```
168 |
169 | This PCA looks quite different. Here, we have all Western-Eurasian groups squished together on the left side of the plot, and on the right we have East-Asian populations. The plot roughly reflects geography, with Northern East-Asian people such as the Nganasan on the top-right, and Southern East-Asian people like the Taiwanese Ami on the lower right. Here we can now see that the ancient samples from Russia and Finland, as well as present-day Uralic populations are actually distributed between East and West, contrary to most other Europeans. This confirms that these groups in Europe have quite a distinctive East-Asian genetic ancestry, and we found that it is best represented by the Nganasan ([Lamnidis et al. 2018](https://www.nature.com/articles/s41467-018-07483-5)).
170 |
--------------------------------------------------------------------------------
/03_Rmd_smartpca.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Principal Components Analysis (PCA)"
3 | output: html_document
4 | editor_options:
5 | chunk_output_type: console
6 | ---
7 |
8 | ```{r, echo=FALSE}
9 | knitr::opts_chunk$set(message = FALSE)  # document-wide chunk default: leave messages out of the rendered output
10 | ```
11 |
12 | ```{r}
13 | library(magrittr)
14 | ```
15 |
16 | Principal components analysis (PCA) is one of the most useful techniques to visualise genetic diversity in a dataset. The methodology is not restricted to genetic data, but in general allows breaking down high-dimensional datasets to two or more dimensions for visualisation in a two-dimensional space.
17 |
18 | ## Genotype Data
19 |
20 | This lesson is also our first contact with the genotype data used in this and most of the following lessons. The dataset that we will work with contains 1,340 individuals, each represented by 593,124 single nucleotide polymorphisms (SNPs). Those SNPs have exactly two different alleles, and each individual has one of four possible values at each genotype: homozygous reference, heterozygous, homozygous alternative, or missing. Those four values are encoded 2, 1, 0 and 9 respectively.
21 |
22 | The data is laid out as a matrix, with columns indicating individuals, and rows indicating SNPs. The data itself comes in the so-called "EIGENSTRAT" format, which is defined in the [Eigensoft package](https://github.com/DReichLab/EIG) and is used by many of the tools in this workshop. In this format, a genotype dataset consists of three files, usually with the following file endings:
23 |
24 | * `*.snp`: The file containing the SNP positions. It consists of six columns: SNP-name, chromosome, genetic position, physical position, reference allele, alternative allele.
25 | * `*.ind`: The file containing the names of the individuals. It consists of three columns: Individual Name, Sex (encoded as M(ale), F(emale), or U(nknown)), and population name.
26 | * `*.geno`: The file containing the genotype matrix, with individuals laid out from left to right, and SNP positions laid out from top to bottom.
27 |
28 | In the following, we will explore the files using R in this Rmarkdown document.
29 |
30 | The data that we want to analyse is stored at `data/popgen_course`. Let's list the contents of that directory:
31 |
32 | ```{r}
33 | list.files("data/popgen_course/")  # names of the files in the course data directory
34 | ```
35 |
36 | Let's explore those files a bit. Here are the first 20 individuals:
37 |
38 | ```{r}
39 | # Read the EIGENSTRAT *.ind file. The three column names are supplied here,
40 | # so the first line of the file is treated as data rather than as a header.
41 | # NOTE(review): if lines are padded with several spaces the columns may come
42 | # out shifted; readr::read_table() could then be the safer reader - confirm.
43 | individuals <- readr::read_delim(
44 |   "data/popgen_course/genotypes_small.ind",
45 |   delim = " ",
46 |   trim_ws = TRUE, # spelled out: `T` is an ordinary variable that can be reassigned
47 |   col_names = c("name", "sex", "population")
48 | )
49 |
50 | individuals %>% head(20)
51 | ```
52 |
53 | And here the first 20 SNP rows:
54 |
55 | ```{r}
56 | snps <- readr::read_delim(
57 |   "data/popgen_course/genotypes_small.snp",
58 |   delim = " ",
59 |   trim_ws = TRUE,
60 |   col_names = c(
61 |     "SNP_name", "chromosome", "genetic_position",
62 |     "physical_position", "reference_allele", "alternative_allele"
63 |   )
64 | )
65 |
66 | # Print the first 20 SNP rows announced in the text above; the full table
67 | # stays in `snps` for the row count further down.
68 | snps %>% head(20)
69 | ```
70 |
71 | And here are the first 20 genotypes of the first 50 individuals:
72 |
73 | ```{r}
74 | # Load only the first 20 lines of the genotype file.
75 | geno <- readr::read_lines("data/popgen_course/genotypes_small.geno", n_max = 20)
76 |
77 | # Show characters 1 to 50 of each of those lines;
78 | # `geno` itself stays untruncated for the later chunks.
79 | substr(geno, 1, 50)
80 | ```
81 |
82 | Counting how many individuals and SNPs there are:
83 |
84 | ```{r}
85 | nrow(individuals)  # number of rows in the *.ind table read above
86 | nrow(snps)  # number of rows in the *.snp table read above
87 | ```
88 |
89 | And now we check that the first row of the `*.geno` file indeed contains the same number of columns:
90 |
91 | ```{r}
92 | nchar(geno[1])  # number of characters in the first line read from the *.geno file
93 | ```
94 |
95 | Now counting the number of rows in the `*.geno`-file (this takes a few seconds, as the file is several hundred MB large):
96 |
97 | ```{r}
98 | as.integer(R.utils::countLines("data/popgen_course/genotypes_small.geno"))  # line count of the *.geno file as a plain integer
99 | ```
100 |
101 | Great, the number of rows and columns agrees with the numbers indicated in the `*.ind` and `*.snp` file! Now we're counting how many different populations there are. Let's first see the first 10 populations in the sorted list, alongside the number of individuals in each group:
102 |
103 | ```{r}
104 | # Individuals per population label: group the table by `population`,
105 | # then count the rows in each group (the result stays grouped).
106 | dplyr::count(dplyr::group_by(individuals, population))
107 | ```
108 |
109 | ## How PCA works
110 |
111 | To understand how PCA works, consider a single individual and its representation by its 593,124 markers. Formally, each individual is a point in a 593,124-dimensional space, where each dimension can take only the three possible genotypes indicated above, or have missing data. To visualise this high-dimensional dataset, we would like to project it down to two dimensions. But just as there are many ways to project the shadow of a three-dimensional object onto a two-dimensional plane, there are many (and even more) ways to project a 593,124-dimensional cloud of points to two dimensions. What PCA does is figuring out the "best" way to do this projection in order to visualise the major components of variance in the data.
112 |
113 | For actually running the analysis, we use a software called `smartPCA` from the [Eigensoft package](https://github.com/DReichLab/EIG). As many other tools from this and related packages, `smartPCA` reads in a parameter file which specifies its input and output files and options. In our case, we want the parameter file to have the following content:
114 |
115 | ```
116 | genotypename: data/popgen_course/genotypes_small.geno
117 | snpname: data/popgen_course/genotypes_small.snp
118 | indivname: data/popgen_course/genotypes_small.ind
119 | evecoutname: pca.WestEurasia.evec
120 | evaloutname: pca.WestEurasia.eval
121 | poplistname: data/popgen_course/WestEurasia.poplist.txt
122 | lsqproject: YES
123 | numoutevec: 4
124 | numthreads: 1
125 | ```
126 |
127 | Here, the first three parameters specify the input genotype files. The next two rows specify two output file names, typically with ending `*.evec` and `*.eval`. The parameter line beginning with `poplistname` contains a file with a list of populations used for calculating the principal components (see below). The option `lsqproject` is important for applications including ancient DNA with lots of missing data, which I will not elaborate on. For the purpose of this workshop, you should use `lsqproject: YES`. The next option `numoutevec` specifies the number of principal components that we compute, the last option `numthreads` the number of CPUs to use for this run. We use just one since we're working together on the same computer, so cannot afford everyone running on lots of CPUs.
128 |
129 | ## Population lists vs. Projection
130 |
131 | The parameter named `poplistname` is a very crucial one. It specifies the populations whose individuals are used to calculate the principal components. Why not just all of them, you ask? For several reasons: First, there are simply too many of them and we don't want to use all of them, since the computation would take too long. More importantly, however, we generally try to avoid using ancient samples to compute principal components, to avoid specific ancient-DNA related artefacts affecting the computation. Finally, the list of populations to use for PCA should be informed by your question. If you're investigating African population structure, it makes no sense to put Asian or European individuals in your population list, since then the main axes of genetic differentiation would not be inside of Africa, but between Africans and Non-Africans.
132 |
133 | So what happens to individuals that are not in populations listed in the population list? Well, fortunately, they are not just ignored, but "projected". This means that after the principal components have been computed, *all* individuals (not just the ones in the list) are projected onto these principal components. That way, we can visualise ancient populations in the context of modern genetic variation. While that may sound a bit problematic at first (some variation in ancient populations is not represented well by modern populations), it nevertheless turns out to be one of the most useful tools for this purpose. The advantage of keeping ancient-DNA artefacts and batch effects from affecting the visualisation outweighs the disadvantage of missing some private genetic variation components in the ancient populations themselves. Of course, that argument breaks down once the analysed populations become too ancient and detached from modern genetic variation. But for our purposes it will work just fine.
134 |
135 | For this workshop, I prepared two population lists:
136 |
137 | ```
138 | data/popgen_course/WestEurasia.poplist.txt
139 | data/popgen_course/AllEurasia.poplist.txt
140 | ```
141 |
142 | As you can tell from the names of the files, they specify two sets of modern populations representing West Eurasia or all of Europe and Asia, respectively.
143 |
144 | I recommend looking through both of the population lists and googling some population names that you don't recognise to get a feeling for the ethnic groups represented here.
145 |
146 | ## Running `smartPCA`
147 |
148 | Now go ahead and open a new text file using your Jupyter Browser, you can name it anything you like. For the sake of a concrete name, let's call it `pca.WestEurasia.params.txt`. Text files in Jupyter are opened in a text editor, so you can then simply copy-paste the above lines into the new file.
149 |
150 | ```{r}
151 | # The output file is passed by position: that argument is named `file` in
152 | # current readr and `path` (now deprecated) in older releases.
153 | readr::write_lines(c(
154 |   "genotypename: data/popgen_course/genotypes_small.geno",
155 |   "snpname: data/popgen_course/genotypes_small.snp",
156 |   "indivname: data/popgen_course/genotypes_small.ind",
157 |   "evecoutname: pca.WestEurasia.evec",
158 |   "evaloutname: pca.WestEurasia.eval",
159 |   "poplistname: data/popgen_course/WestEurasia.poplist.txt",
160 |   "lsqproject: YES",
161 |   "numoutevec: 4",
162 |   "numthreads: 1"
163 | ), "pca.WestEurasia.params.txt")
164 | ```
165 |
166 | Let's see whether it worked, by printing out the contents of that file into your notebook:
167 |
168 | ```{r}
169 | # Read the parameter file back so that its lines are printed by this chunk.
170 | "pca.WestEurasia.params.txt" %>%
171 |   readr::read_lines()
172 | ```
173 |
174 | Great, so that's our parameter file for running `smartPCA`.
175 |
176 | **Note:** We specified two output files in our parameter file, here called `pca.WestEurasia.evec` and `pca.WestEurasia.eval`. You can actually put any names you want in there. But beware of relative vs. absolute paths. File names starting with `/` are considered "absolute", that is, taken to go from the root of the file system. In contrast, filenames not starting with `/` are considered "relative" to the current working directory. If you forgot which directory you're in, run `pwd`.
177 |
178 | **Note:** The option `poplistname` is a crucial one. Here you need to specify which populations are used to compute the eigenvectors of the principal components analysis. In our case, I have prepared two population list files: `data/popgen_course/WestEurasia.poplist.txt` and `data/popgen_course/AllEurasia.poplist.txt`. Pick one of the two to carry on.
179 |
180 | Good, now we can run `smartPCA`. To do that, it's more convenient to use the terminal than a Rmarkdown file. So open a terminal and run
181 |
182 | ```
183 | smartpca -p pca.WestEurasia.params.txt
184 | ```
185 |
186 | This will typically run for about 30 minutes and output lots of logging output to the screen.
187 |
188 | In a similar manner we can prepare a parameter file for the AllEurasia population list. This is how it should look:
189 |
190 | ```
191 | genotypename: data/popgen_course/genotypes_small.geno
192 | snpname: data/popgen_course/genotypes_small.snp
193 | indivname: data/popgen_course/genotypes_small.ind
194 | evecoutname: pca.AllEurasia.evec
195 | evaloutname: pca.AllEurasia.eval
196 | poplistname: data/popgen_course/AllEurasia.poplist.txt
197 | lsqproject: YES
198 | numoutevec: 4
199 | numthreads: 1
200 | ```
201 |
202 | And similar to the command above, we can run pca on the AllEurasia population list via:
203 |
204 | ```
205 | smartpca -p pca.AllEurasia.params.txt
206 | ```
207 |
208 | which will run slightly longer than the first one because there are more populations.
209 |
--------------------------------------------------------------------------------
/pca.WestEurasia.eval:
--------------------------------------------------------------------------------
1 | 6.288558
2 | 3.094820
3 | 2.692703
4 | 2.010356
5 | 1.742554
6 | 1.689588
7 | 1.626518
8 | 1.616861
9 | 1.590569
10 | 1.589779
11 | 1.580970
12 | 1.570292
13 | 1.562215
14 | 1.556105
15 | 1.550321
16 | 1.543074
17 | 1.540315
18 | 1.534597
19 | 1.524939
20 | 1.524176
21 | 1.520778
22 | 1.511707
23 | 1.505727
24 | 1.501742
25 | 1.494597
26 | 1.489205
27 | 1.484356
28 | 1.477644
29 | 1.469235
30 | 1.466139
31 | 1.464872
32 | 1.453282
33 | 1.439786
34 | 1.436309
35 | 1.424679
36 | 1.422132
37 | 1.416346
38 | 1.411205
39 | 1.407204
40 | 1.402511
41 | 1.399908
42 | 1.397946
43 | 1.395569
44 | 1.390360
45 | 1.385084
46 | 1.378586
47 | 1.377115
48 | 1.374029
49 | 1.370217
50 | 1.362597
51 | 1.360695
52 | 1.359427
53 | 1.354377
54 | 1.350525
55 | 1.344736
56 | 1.343233
57 | 1.339959
58 | 1.338966
59 | 1.335077
60 | 1.330144
61 | 1.323053
62 | 1.320174
63 | 1.317857
64 | 1.315216
65 | 1.310151
66 | 1.308080
67 | 1.305296
68 | 1.302001
69 | 1.300860
70 | 1.289421
71 | 1.287915
72 | 1.285110
73 | 1.279333
74 | 1.275902
75 | 1.271351
76 | 1.269675
77 | 1.267124
78 | 1.264680
79 | 1.262351
80 | 1.259080
81 | 1.257861
82 | 1.255196
83 | 1.252261
84 | 1.250030
85 | 1.247348
86 | 1.245184
87 | 1.242175
88 | 1.239379
89 | 1.238133
90 | 1.236569
91 | 1.232437
92 | 1.230241
93 | 1.228187
94 | 1.225265
95 | 1.223943
96 | 1.222205
97 | 1.219820
98 | 1.217824
99 | 1.215598
100 | 1.213455
101 | 1.209457
102 | 1.208494
103 | 1.205137
104 | 1.203316
105 | 1.201609
106 | 1.200145
107 | 1.196033
108 | 1.192887
109 | 1.192256
110 | 1.191376
111 | 1.188856
112 | 1.187536
113 | 1.185889
114 | 1.184822
115 | 1.181227
116 | 1.178798
117 | 1.175043
118 | 1.173146
119 | 1.171438
120 | 1.168925
121 | 1.166938
122 | 1.165445
123 | 1.164524
124 | 1.163296
125 | 1.161620
126 | 1.160886
127 | 1.159673
128 | 1.158730
129 | 1.155633
130 | 1.153882
131 | 1.151985
132 | 1.151777
133 | 1.149913
134 | 1.149450
135 | 1.147317
136 | 1.142969
137 | 1.142523
138 | 1.141568
139 | 1.138510
140 | 1.136462
141 | 1.134942
142 | 1.134773
143 | 1.133566
144 | 1.132394
145 | 1.130553
146 | 1.128126
147 | 1.127649
148 | 1.126379
149 | 1.125571
150 | 1.123615
151 | 1.121786
152 | 1.119842
153 | 1.119581
154 | 1.118005
155 | 1.116301
156 | 1.115939
157 | 1.114271
158 | 1.113468
159 | 1.111258
160 | 1.110785
161 | 1.109414
162 | 1.108017
163 | 1.105467
164 | 1.104533
165 | 1.103341
166 | 1.102456
167 | 1.101475
168 | 1.099997
169 | 1.098662
170 | 1.098401
171 | 1.095494
172 | 1.094875
173 | 1.094238
174 | 1.093169
175 | 1.091754
176 | 1.090332
177 | 1.089938
178 | 1.089230
179 | 1.087557
180 | 1.087194
181 | 1.086377
182 | 1.084882
183 | 1.084031
184 | 1.083202
185 | 1.082490
186 | 1.081516
187 | 1.078116
188 | 1.076987
189 | 1.075117
190 | 1.074305
191 | 1.074135
192 | 1.072921
193 | 1.071640
194 | 1.071192
195 | 1.069853
196 | 1.069550
197 | 1.067037
198 | 1.066036
199 | 1.065113
200 | 1.063452
201 | 1.062919
202 | 1.061595
203 | 1.060710
204 | 1.060346
205 | 1.059358
206 | 1.059034
207 | 1.056998
208 | 1.056422
209 | 1.055168
210 | 1.054756
211 | 1.054162
212 | 1.052285
213 | 1.051447
214 | 1.050835
215 | 1.050287
216 | 1.048713
217 | 1.047416
218 | 1.046505
219 | 1.046476
220 | 1.045557
221 | 1.044029
222 | 1.042965
223 | 1.042316
224 | 1.041413
225 | 1.040081
226 | 1.038227
227 | 1.037778
228 | 1.036910
229 | 1.035252
230 | 1.034431
231 | 1.034115
232 | 1.032947
233 | 1.031785
234 | 1.031263
235 | 1.030600
236 | 1.030253
237 | 1.028534
238 | 1.027096
239 | 1.026720
240 | 1.025726
241 | 1.025525
242 | 1.025322
243 | 1.024046
244 | 1.023472
245 | 1.022728
246 | 1.021405
247 | 1.021292
248 | 1.020123
249 | 1.019659
250 | 1.018497
251 | 1.017623
252 | 1.016996
253 | 1.016558
254 | 1.015365
255 | 1.014977
256 | 1.014861
257 | 1.013228
258 | 1.012071
259 | 1.011203
260 | 1.010988
261 | 1.010408
262 | 1.009380
263 | 1.007719
264 | 1.006532
265 | 1.006188
266 | 1.005178
267 | 1.004976
268 | 1.004504
269 | 1.003641
270 | 1.003191
271 | 1.002226
272 | 1.001685
273 | 1.000463
274 | 1.000337
275 | 0.999162
276 | 0.999093
277 | 0.998314
278 | 0.996493
279 | 0.996077
280 | 0.995453
281 | 0.994081
282 | 0.993268
283 | 0.992901
284 | 0.992295
285 | 0.991612
286 | 0.990978
287 | 0.989752
288 | 0.989216
289 | 0.988180
290 | 0.987540
291 | 0.986679
292 | 0.986364
293 | 0.986057
294 | 0.984414
295 | 0.983226
296 | 0.982901
297 | 0.981817
298 | 0.981467
299 | 0.980461
300 | 0.980450
301 | 0.978894
302 | 0.978277
303 | 0.977965
304 | 0.976690
305 | 0.976150
306 | 0.975217
307 | 0.974853
308 | 0.974546
309 | 0.973920
310 | 0.972813
311 | 0.972662
312 | 0.971645
313 | 0.970437
314 | 0.970208
315 | 0.969803
316 | 0.968331
317 | 0.967328
318 | 0.967162
319 | 0.966423
320 | 0.965545
321 | 0.965118
322 | 0.964917
323 | 0.964216
324 | 0.964092
325 | 0.963881
326 | 0.962568
327 | 0.962315
328 | 0.961592
329 | 0.961002
330 | 0.959286
331 | 0.958323
332 | 0.957938
333 | 0.957303
334 | 0.956762
335 | 0.955824
336 | 0.955388
337 | 0.954092
338 | 0.953812
339 | 0.952941
340 | 0.952400
341 | 0.951429
342 | 0.951262
343 | 0.950638
344 | 0.949361
345 | 0.949126
346 | 0.948465
347 | 0.948398
348 | 0.947911
349 | 0.946306
350 | 0.945961
351 | 0.945585
352 | 0.945162
353 | 0.944826
354 | 0.944243
355 | 0.942899
356 | 0.941973
357 | 0.941903
358 | 0.941103
359 | 0.940600
360 | 0.940065
361 | 0.939547
362 | 0.938725
363 | 0.938459
364 | 0.937967
365 | 0.937326
366 | 0.936784
367 | 0.935674
368 | 0.934585
369 | 0.934092
370 | 0.933774
371 | 0.932926
372 | 0.932327
373 | 0.932100
374 | 0.931712
375 | 0.931148
376 | 0.930232
377 | 0.929202
378 | 0.928810
379 | 0.928522
380 | 0.927521
381 | 0.927328
382 | 0.926623
383 | 0.926042
384 | 0.925250
385 | 0.924349
386 | 0.923473
387 | 0.923142
388 | 0.922366
389 | 0.921817
390 | 0.920906
391 | 0.920704
392 | 0.920235
393 | 0.919048
394 | 0.918843
395 | 0.917704
396 | 0.917045
397 | 0.916487
398 | 0.916337
399 | 0.916258
400 | 0.914903
401 | 0.914464
402 | 0.914041
403 | 0.913881
404 | 0.913094
405 | 0.912843
406 | 0.911970
407 | 0.911049
408 | 0.910202
409 | 0.909671
410 | 0.909440
411 | 0.908489
412 | 0.908079
413 | 0.907741
414 | 0.907548
415 | 0.906737
416 | 0.906195
417 | 0.905644
418 | 0.905111
419 | 0.904308
420 | 0.904040
421 | 0.903366
422 | 0.903223
423 | 0.902615
424 | 0.901622
425 | 0.901467
426 | 0.901179
427 | 0.900098
428 | 0.899927
429 | 0.899792
430 | 0.898742
431 | 0.897845
432 | 0.897437
433 | 0.896148
434 | 0.896008
435 | 0.895563
436 | 0.895410
437 | 0.894941
438 | 0.894237
439 | 0.893422
440 | 0.892932
441 | 0.891691
442 | 0.891632
443 | 0.891039
444 | 0.890625
445 | 0.889614
446 | 0.888892
447 | 0.888194
448 | 0.887807
449 | 0.887436
450 | 0.887371
451 | 0.886883
452 | 0.886195
453 | 0.885164
454 | 0.884265
455 | 0.884077
456 | 0.883718
457 | 0.883042
458 | 0.882538
459 | 0.882156
460 | 0.881516
461 | 0.881069
462 | 0.880161
463 | 0.879962
464 | 0.879528
465 | 0.878992
466 | 0.878825
467 | 0.878309
468 | 0.877843
469 | 0.877039
470 | 0.876641
471 | 0.876319
472 | 0.875079
473 | 0.874339
474 | 0.874230
475 | 0.873874
476 | 0.872857
477 | 0.872205
478 | 0.871989
479 | 0.871395
480 | 0.871152
481 | 0.870044
482 | 0.869753
483 | 0.869425
484 | 0.868858
485 | 0.867088
486 | 0.866709
487 | 0.866390
488 | 0.865938
489 | 0.865212
490 | 0.864425
491 | 0.864098
492 | 0.863748
493 | 0.863379
494 | 0.862923
495 | 0.862344
496 | 0.861429
497 | 0.860924
498 | 0.860355
499 | 0.859607
500 | 0.859477
501 | 0.858994
502 | 0.858649
503 | 0.857258
504 | 0.857078
505 | 0.856810
506 | 0.856177
507 | 0.855982
508 | 0.855824
509 | 0.855144
510 | 0.854475
511 | 0.853987
512 | 0.853647
513 | 0.852613
514 | 0.851769
515 | 0.851541
516 | 0.851137
517 | 0.850515
518 | 0.850446
519 | 0.849870
520 | 0.849277
521 | 0.848720
522 | 0.848360
523 | 0.847259
524 | 0.847193
525 | 0.846994
526 | 0.845861
527 | 0.845364
528 | 0.845043
529 | 0.844380
530 | 0.843360
531 | 0.842825
532 | 0.842303
533 | 0.841749
534 | 0.840919
535 | 0.840577
536 | 0.840009
537 | 0.839850
538 | 0.839015
539 | 0.838237
540 | 0.837920
541 | 0.837112
542 | 0.836878
543 | 0.836296
544 | 0.835706
545 | 0.835160
546 | 0.834576
547 | 0.834100
548 | 0.833720
549 | 0.832415
550 | 0.832231
551 | 0.832171
552 | 0.831552
553 | 0.831134
554 | 0.830803
555 | 0.829452
556 | 0.829250
557 | 0.828797
558 | 0.828511
559 | 0.828037
560 | 0.827513
561 | 0.827326
562 | 0.826752
563 | 0.826086
564 | 0.825650
565 | 0.824973
566 | 0.824409
567 | 0.823716
568 | 0.823206
569 | 0.822073
570 | 0.821757
571 | 0.821173
572 | 0.820873
573 | 0.820026
574 | 0.819839
575 | 0.818535
576 | 0.817973
577 | 0.817111
578 | 0.816751
579 | 0.816403
580 | 0.815699
581 | 0.815324
582 | 0.815057
583 | 0.814641
584 | 0.813534
585 | 0.812941
586 | 0.812452
587 | 0.811794
588 | 0.811542
589 | 0.810991
590 | 0.810724
591 | 0.809677
592 | 0.809547
593 | 0.808589
594 | 0.808439
595 | 0.807674
596 | 0.806734
597 | 0.806092
598 | 0.805974
599 | 0.805148
600 | 0.804800
601 | 0.804496
602 | 0.803970
603 | 0.803488
604 | 0.802755
605 | 0.802544
606 | 0.801909
607 | 0.800832
608 | 0.800408
609 | 0.799845
610 | 0.799108
611 | 0.798765
612 | 0.797802
613 | 0.797240
614 | 0.796978
615 | 0.796476
616 | 0.796009
617 | 0.795472
618 | 0.795240
619 | 0.794519
620 | 0.794289
621 | 0.793201
622 | 0.792818
623 | 0.792018
624 | 0.791806
625 | 0.791236
626 | 0.790462
627 | 0.789405
628 | 0.789034
629 | 0.788772
630 | 0.788432
631 | 0.787917
632 | 0.787389
633 | 0.786096
634 | 0.785929
635 | 0.785572
636 | 0.785046
637 | 0.784438
638 | 0.784134
639 | 0.783391
640 | 0.783121
641 | 0.782414
642 | 0.781586
643 | 0.780389
644 | 0.780236
645 | 0.779400
646 | 0.778745
647 | 0.778569
648 | 0.777887
649 | 0.776693
650 | 0.776351
651 | 0.776070
652 | 0.774920
653 | 0.774418
654 | 0.774176
655 | 0.773834
656 | 0.773016
657 | 0.771908
658 | 0.771459
659 | 0.770713
660 | 0.769870
661 | 0.769206
662 | 0.768963
663 | 0.767738
664 | 0.767465
665 | 0.766510
666 | 0.765908
667 | 0.765482
668 | 0.765061
669 | 0.764236
670 | 0.764026
671 | 0.763117
672 | 0.761653
673 | 0.761508
674 | 0.761167
675 | 0.760186
676 | 0.759873
677 | 0.759109
678 | 0.757924
679 | 0.757252
680 | 0.756537
681 | 0.756182
682 | 0.755559
683 | 0.754790
684 | 0.753408
685 | 0.752626
686 | 0.751722
687 | 0.751316
688 | 0.750610
689 | 0.750227
690 | 0.749308
691 | 0.748432
692 | 0.747693
693 | 0.747260
694 | 0.746210
695 | 0.744726
696 | 0.744155
697 | 0.743165
698 | 0.742642
699 | 0.742262
700 | 0.740150
701 | 0.739781
702 | 0.738910
703 | 0.738168
704 | 0.737248
705 | 0.736519
706 | 0.735534
707 | 0.735123
708 | 0.734035
709 | 0.733200
710 | 0.731250
711 | 0.729438
712 | 0.727373
713 | 0.727101
714 | 0.726889
715 | 0.724111
716 | 0.721821
717 | 0.718911
718 | 0.694921
719 | 0.685380
720 | -0.000000
721 |
--------------------------------------------------------------------------------
/01_bashnb_getting_started.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Introduction to Jupyter"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "## Basic Usage"
15 | ]
16 | },
17 | {
18 | "cell_type": "markdown",
19 | "metadata": {},
20 | "source": [
21 | "When you first access Jupyter, you will get a file browser view of your home directory on the server. In the beginning, your home directory will be empty, and will be populated with notebooks and files throughout this workshop.\n",
22 | "\n",
23 | "To create a new text file, click on New (in the upper right corner) and then Text File, which opens a text editor within your browser. You can now add content into the file, or edit existing content and save. The filename can be changed by clicking into the Filename on top. You can now go back to your file browser window and update using the button with the two arrows in the upper right corner, and you should see your text file saved in your home directory.\n",
24 | "\n",
25 | "You can also use Jupyter to open a Terminal within the browser: Click on New and then Terminal, which will open a terminal window in a separate browser tab. You can enter Unix Bash commands to change directories, view files or execute programs (as we will learn below).\n",
26 | "\n",
27 | "Finally, you can create new Folders by clicking on New and then Folder. To rename the new folder, click on the checkbox beside the new folder, and click the Rename button on top, which appeared. To change into the new folder, click on it. To move back, click on the parent folder appearing on top of the file browser."
28 | ]
29 | },
30 | {
31 | "cell_type": "markdown",
32 | "metadata": {},
33 | "source": [
34 | "***Exercise:*** Create a new folder called hello, and a text file within that folder using Jupyter. Name that text file hello.txt and fill it with arbitrary content, such as `Hello, World!`. Then open a terminal and output the contents of the new text file typing `cat hello/hello.txt` followed by ENTER."
35 | ]
36 | },
37 | {
38 | "cell_type": "markdown",
39 | "metadata": {},
40 | "source": [
41 | "***Note:*** While the Jupyter terminal and Jupyter Text Files are different ways to interact with the server, both access the same file system. So files created with the Text Editor are saved in your home directory, and can be accessed via the terminal, and vice versa: Files created via the Terminal can be accessed via the Text Editor, by simply clicking on them in the Jupyter File Browser."
42 | ]
43 | },
44 | {
45 | "cell_type": "markdown",
46 | "metadata": {},
47 | "source": [
48 | "## Notebooks"
49 | ]
50 | },
51 | {
52 | "cell_type": "markdown",
53 | "metadata": {},
54 | "source": [
55 | "Notebooks can be loaded for different underlying kernels: bash, python and R. Notebooks are useful to document interactive data analysis. A notebook combines code cells with markdown cells. A markdown cell can contain text, math or headings. "
56 | ]
57 | },
58 | {
59 | "cell_type": "markdown",
60 | "metadata": {},
61 | "source": [
62 | "You can create new bash notebooks using the \"New\" Dropdown list in the Jupyter File Browser and then selecting \"Bash\". Notebooks open if you click on them."
63 | ]
64 | },
65 | {
66 | "cell_type": "markdown",
67 | "metadata": {},
68 | "source": [
69 | "In Jupyter notebooks, you work with *Cells*. You can create new cells, or insert them above or below existing cells using the menu items in the `Insert` menu. Use the dropdown list in the command bar in Jupyter to change the type of the cell. The two main types we're going to use are `Markdown` and `Code`. Markdown cells are useful for documenting stuff, Code cells for running code. Markdown cells can be edited by double-clicking into them. Render them by pressing Shift-Enter."
70 | ]
71 | },
72 | {
73 | "cell_type": "markdown",
74 | "metadata": {},
75 | "source": [
76 | "Code cells are used to enter and execute code. Let's look at some examples."
77 | ]
78 | },
79 | {
80 | "cell_type": "markdown",
81 | "metadata": {},
82 | "source": [
83 | "We can first check which directory we are in, using the `pwd` (=Print Working Directory) command:"
84 | ]
85 | },
86 | {
87 | "cell_type": "code",
88 | "execution_count": 2,
89 | "metadata": {},
90 | "outputs": [
91 | {
92 | "name": "stdout",
93 | "output_type": "stream",
94 | "text": [
95 | "/home/stephan/popgen_course\n"
96 | ]
97 | }
98 | ],
99 | "source": [
100 | "pwd"
101 | ]
102 | },
103 | {
104 | "cell_type": "markdown",
105 | "metadata": {},
106 | "source": [
107 | "OK, so we're in the `popgen_course` subfolder within our home folder `/home/stephan`. We can list the contents of that folder:"
108 | ]
109 | },
110 | {
111 | "cell_type": "code",
112 | "execution_count": 5,
113 | "metadata": {},
114 | "outputs": [
115 | {
116 | "name": "stdout",
117 | "output_type": "stream",
118 | "text": [
119 | "01_bashnb_getting_started.ipynb pca.AllEurasia.params.txt\n",
120 | "02_pynb_getting_started.ipynb\t pca.WestEurasia.eval\n",
121 | "03_bashnb_smartpca.ipynb\t pca.WestEurasia.evec\n",
122 | "04_pynb_plotting_pca.ipynb\t pca.WestEurasia.params.txt\n",
123 | "pca.AllEurasia.eval\t\t population_frequencies.txt\n",
124 | "pca.AllEurasia.evec\t\t README.md\n"
125 | ]
126 | }
127 | ],
128 | "source": [
129 | "ls"
130 | ]
131 | },
132 | {
133 | "cell_type": "markdown",
134 | "metadata": {},
135 | "source": [
136 | "We can now create a new directory:"
137 | ]
138 | },
139 | {
140 | "cell_type": "code",
141 | "execution_count": 6,
142 | "metadata": {},
143 | "outputs": [],
144 | "source": [
145 | "mkdir testDir"
146 | ]
147 | },
148 | {
149 | "cell_type": "markdown",
150 | "metadata": {},
151 | "source": [
152 | "and change into that directory:"
153 | ]
154 | },
155 | {
156 | "cell_type": "code",
157 | "execution_count": 7,
158 | "metadata": {},
159 | "outputs": [],
160 | "source": [
161 | "cd testDir"
162 | ]
163 | },
164 | {
165 | "cell_type": "markdown",
166 | "metadata": {},
167 | "source": [
168 | "and confirm that we are now in the new dir:"
169 | ]
170 | },
171 | {
172 | "cell_type": "code",
173 | "execution_count": 8,
174 | "metadata": {},
175 | "outputs": [
176 | {
177 | "name": "stdout",
178 | "output_type": "stream",
179 | "text": [
180 | "/home/stephan/popgen_course/testDir\n"
181 | ]
182 | }
183 | ],
184 | "source": [
185 | "pwd"
186 | ]
187 | },
188 | {
189 | "cell_type": "markdown",
190 | "metadata": {},
191 | "source": [
192 | "OK, let's go back and delete the subfolder again:"
193 | ]
194 | },
195 | {
196 | "cell_type": "code",
197 | "execution_count": 9,
198 | "metadata": {},
199 | "outputs": [],
200 | "source": [
201 | "cd ..\n",
202 | "rm -r testDir"
203 | ]
204 | },
205 | {
206 | "cell_type": "markdown",
207 | "metadata": {},
208 | "source": [
209 | "Here is a simple example of how to use ``echo``:"
210 | ]
211 | },
212 | {
213 | "cell_type": "code",
214 | "execution_count": 10,
215 | "metadata": {},
216 | "outputs": [
217 | {
218 | "name": "stdout",
219 | "output_type": "stream",
220 | "text": [
221 | "Hello, how are you?\n"
222 | ]
223 | }
224 | ],
225 | "source": [
226 | "echo \"Hello, how are you?\""
227 | ]
228 | },
229 | {
230 | "cell_type": "markdown",
231 | "metadata": {},
232 | "source": [
233 | "OK, so let's try some more useful things with ``grep``, which can be used to filter large text files by searching for patterns, in this case just the occurrence of the word \"French\":"
234 | ]
235 | },
236 | {
237 | "cell_type": "code",
238 | "execution_count": 11,
239 | "metadata": {},
240 | "outputs": [
241 | {
242 | "name": "stdout",
243 | "output_type": "stream",
244 | "text": [
245 | " HGDP00511 M French\n",
246 | " HGDP00512 M French\n",
247 | " HGDP00513 F French\n",
248 | " HGDP00514 F French\n",
249 | " HGDP00515 M French\n",
250 | " HGDP00516 F French\n",
251 | " HGDP00517 F French\n",
252 | " HGDP00518 M French\n",
253 | " HGDP00519 M French\n",
254 | " HGDP00522 M French\n",
255 | " HGDP00523 F French\n",
256 | " HGDP00524 F French\n",
257 | " HGDP00525 M French\n",
258 | " HGDP00526 F French\n",
259 | " HGDP00527 F French\n",
260 | " HGDP00528 M French\n",
261 | " HGDP00529 F French\n",
262 | " HGDP00531 F French\n",
263 | " HGDP00533 M French\n",
264 | " HGDP00534 F French\n",
265 | " HGDP00535 F French\n",
266 | " HGDP00536 F French\n",
267 | " HGDP00537 F French\n",
268 | " HGDP00538 M French\n",
269 | " HGDP00539 F French\n",
270 | " SouthFrench3326 M French\n",
271 | " SouthFrench3947 M French\n",
272 | " SouthFrench1323 M French\n",
273 | " SouthFrench3951 M French\n",
274 | " SouthFrench3068 M French\n",
275 | " SouthFrench1112 M French\n",
276 | " SouthFrench4018 M French\n"
277 | ]
278 | }
279 | ],
280 | "source": [
281 | "grep French /data/popgen_course/genotypes_small.ind"
282 | ]
283 | },
284 | {
285 | "cell_type": "markdown",
286 | "metadata": {},
287 | "source": [
288 | "Alright, so that lists all French individuals. Now let's count them, by simply passing the flag `-c`:"
289 | ]
290 | },
291 | {
292 | "cell_type": "code",
293 | "execution_count": 12,
294 | "metadata": {},
295 | "outputs": [
296 | {
297 | "name": "stdout",
298 | "output_type": "stream",
299 | "text": [
300 | "32\n"
301 | ]
302 | }
303 | ],
304 | "source": [
305 | "grep -c French /data/popgen_course/genotypes_small.ind"
306 | ]
307 | },
308 | {
309 | "cell_type": "markdown",
310 | "metadata": {},
311 | "source": [
312 | "***Note:*** So far we have seen the `pwd`, `mkdir`, `cd`, `rm`, `ls`, `echo` and `grep` commands. If you want to find out more about them, just google them; they are among the most popular and widely used commands/programs in Unix."
313 | ]
314 | },
315 | {
316 | "cell_type": "markdown",
317 | "metadata": {},
318 | "source": [
319 | "In Python3 notebooks you can plot things: Create a new python3 notebook, and run this boilerplate code in the first cell:\n",
320 | "\n",
321 | " %matplotlib inline\n",
322 | " import matplotlib.pyplot as plt\n",
323 | "\n",
324 | "Then plot something, opening a second cell:\n",
325 | "\n",
326 | "***Exercise:*** Create a simple plot using `plt.plot([1, 2, 3], [5, 2, 6])`\n"
327 | ]
328 | },
329 | {
330 | "cell_type": "markdown",
331 | "metadata": {},
332 | "source": [
333 | "# Bash Pipes"
334 | ]
335 | },
336 | {
337 | "cell_type": "markdown",
338 | "metadata": {},
339 | "source": [
340 | "OK. So this first Notebook operates on Bash, which is more or less the lingua franca of Linux operating systems. Everything you do on the command line uses bash. One of the most useful techniques in bash scripting is the use of Unix pipes. To illustrate them, consider the following."
341 | ]
342 | },
343 | {
344 | "cell_type": "markdown",
345 | "metadata": {},
346 | "source": [
347 | "Let's look at the structure of our ``ind`` file:"
348 | ]
349 | },
350 | {
351 | "cell_type": "code",
352 | "execution_count": 4,
353 | "metadata": {},
354 | "outputs": [
355 | {
356 | "name": "stdout",
357 | "output_type": "stream",
358 | "text": [
359 | " Yuk_009 M Yukagir\n",
360 | " Yuk_025 F Yukagir\n",
361 | " Yuk_022 F Yukagir\n",
362 | " Yuk_020 F Yukagir\n",
363 | " MC_40 M Chukchi\n",
364 | " Yuk_024 F Yukagir\n",
365 | " Yuk_023 F Yukagir\n",
366 | " MC_16 M Chukchi\n",
367 | " MC_15 F Chukchi\n",
368 | " MC_18 M Chukchi\n"
369 | ]
370 | }
371 | ],
372 | "source": [
373 | "head /data/popgen_course/genotypes_small.ind"
374 | ]
375 | },
376 | {
377 | "cell_type": "markdown",
378 | "metadata": {},
379 | "source": [
380 | "***Note:*** The `head` command just lists the top 10 rows of a file."
381 | ]
382 | },
383 | {
384 | "cell_type": "markdown",
385 | "metadata": {},
386 | "source": [
387 | "Let's extract the population column:"
388 | ]
389 | },
390 | {
391 | "cell_type": "code",
392 | "execution_count": 5,
393 | "metadata": {},
394 | "outputs": [
395 | {
396 | "name": "stdout",
397 | "output_type": "stream",
398 | "text": [
399 | "Yukagir\n",
400 | "Yukagir\n",
401 | "Yukagir\n",
402 | "Yukagir\n",
403 | "Chukchi\n",
404 | "Yukagir\n",
405 | "Yukagir\n",
406 | "Chukchi\n",
407 | "Chukchi\n",
408 | "Chukchi\n"
409 | ]
410 | }
411 | ],
412 | "source": [
413 | "head /data/popgen_course/genotypes_small.ind | awk '{print $3}'"
414 | ]
415 | },
416 | {
417 | "cell_type": "markdown",
418 | "metadata": {},
419 | "source": [
420 | "***Note:*** The `awk` program is one of the most powerful programs for text-file processing in the Unix-world. It is actually a full-fledged programming language itself. Here we only use it in one of its simplest forms. The program `{print $3}` simply says \"For every line of the input file, print out the third field\"."
421 | ]
422 | },
423 | {
424 | "cell_type": "markdown",
425 | "metadata": {},
426 | "source": [
427 | "***Note:*** The pipe symbol `|` tells Unix to redirect the output of the program to its left into the program to its right as standard input. "
428 | ]
429 | },
430 | {
431 | "cell_type": "markdown",
432 | "metadata": {},
433 | "source": [
434 | "Let's sort the output (notice we now use ``cat`` instead of ``head`` at the start, but use ``head`` at the end):"
435 | ]
436 | },
437 | {
438 | "cell_type": "code",
439 | "execution_count": 8,
440 | "metadata": {},
441 | "outputs": [
442 | {
443 | "name": "stdout",
444 | "output_type": "stream",
445 | "text": [
446 | "Abkhasian\n",
447 | "Abkhasian\n",
448 | "Abkhasian\n",
449 | "Abkhasian\n",
450 | "Abkhasian\n",
451 | "Abkhasian\n",
452 | "Abkhasian\n",
453 | "Abkhasian\n",
454 | "Abkhasian\n",
455 | "Adygei\n",
456 | "sort: Schreiben fehlgeschlagen: Standardausgabe: Datenübergabe unterbrochen (broken pipe)\n",
457 | "sort: Schreibfehler\n"
458 | ]
459 | }
460 | ],
461 | "source": [
462 | "cat /data/popgen_course/genotypes_small.ind | awk '{print $3}' | sort | head"
463 | ]
464 | },
465 | {
466 | "cell_type": "markdown",
467 | "metadata": {},
468 | "source": [
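469 | "OK, so there are some error messages at the end (here in German): ``head`` stops reading after ten lines and closes the pipe while ``sort`` is still writing its output, which ``sort`` reports as a \"broken pipe\". That's harmless and can be ignored."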
470 | ]
471 | },
472 | {
473 | "cell_type": "markdown",
474 | "metadata": {},
475 | "source": [
476 | "Now let's use ``uniq`` to get rid of population name duplicates:"
477 | ]
478 | },
479 | {
480 | "cell_type": "code",
481 | "execution_count": 9,
482 | "metadata": {},
483 | "outputs": [
484 | {
485 | "name": "stdout",
486 | "output_type": "stream",
487 | "text": [
488 | "Abkhasian\n",
489 | "Adygei\n",
490 | "Albanian\n",
491 | "Aleut\n",
492 | "Aleut_Tlingit\n",
493 | "Altaian\n",
494 | "Ami\n",
495 | "Armenian\n",
496 | "Atayal\n",
497 | "Balkar\n"
498 | ]
499 | }
500 | ],
501 | "source": [
502 | "cat /data/popgen_course/genotypes_small.ind | awk '{print $3}' | sort | uniq | head"
503 | ]
504 | },
505 | {
506 | "cell_type": "markdown",
507 | "metadata": {},
508 | "source": [
509 | "And now let's count:"
510 | ]
511 | },
512 | {
513 | "cell_type": "code",
514 | "execution_count": 10,
515 | "metadata": {},
516 | "outputs": [
517 | {
518 | "name": "stdout",
519 | "output_type": "stream",
520 | "text": [
521 | "116\n"
522 | ]
523 | }
524 | ],
525 | "source": [
526 | "cat /data/popgen_course/genotypes_small.ind | awk '{print $3}' | sort | uniq | wc -l"
527 | ]
528 | },
529 | {
530 | "cell_type": "markdown",
531 | "metadata": {},
532 | "source": [
533 | "OK, so there are 116 populations in the dataset. And how many individuals?"
534 | ]
535 | },
536 | {
537 | "cell_type": "code",
538 | "execution_count": 11,
539 | "metadata": {},
540 | "outputs": [
541 | {
542 | "name": "stdout",
543 | "output_type": "stream",
544 | "text": [
545 | "1340 /data/popgen_course/genotypes_small.ind\n"
546 | ]
547 | }
548 | ],
549 | "source": [
550 | "wc -l /data/popgen_course/genotypes_small.ind"
551 | ]
552 | },
553 | {
554 | "cell_type": "markdown",
555 | "metadata": {},
556 | "source": [
557 | "So 1340 individuals in 116 populations, i.e. a bit more than 11 per population on average. Good to know!"
558 | ]
559 | },
560 | {
561 | "cell_type": "markdown",
562 | "metadata": {},
563 | "source": [
564 | "***Note:*** we learned some new Unix commands: `awk`, `cat`, `head`, `sort`, `uniq` and `wc`."
565 | ]
566 | },
567 | {
568 | "cell_type": "markdown",
569 | "metadata": {},
570 | "source": [
571 | "As a final step, let's modify our pipeline to output not just the unique populations, but also the number of individuals per population. Fortunately this is extremely easy, since the flag `-c` to the `uniq` command already does the job:"
572 | ]
573 | },
574 | {
575 | "cell_type": "code",
576 | "execution_count": 20,
577 | "metadata": {},
578 | "outputs": [
579 | {
580 | "name": "stdout",
581 | "output_type": "stream",
582 | "text": [
583 | " 9 Abkhasian\n",
584 | " 16 Adygei\n",
585 | " 6 Albanian\n",
586 | " 7 Aleut\n",
587 | " 4 Aleut_Tlingit\n",
588 | " 7 Altaian\n",
589 | " 10 Ami\n",
590 | " 10 Armenian\n",
591 | " 9 Atayal\n",
592 | " 10 Balkar\n"
593 | ]
594 | }
595 | ],
596 | "source": [
597 | "cat /data/popgen_course/genotypes_small.ind | awk '{print $3}' | sort | uniq -c | head"
598 | ]
599 | },
600 | {
601 | "cell_type": "markdown",
602 | "metadata": {},
603 | "source": [
604 | "Nice. Let's put that list into a file that we can then import for plotting later."
605 | ]
606 | },
607 | {
608 | "cell_type": "code",
609 | "execution_count": 21,
610 | "metadata": {},
611 | "outputs": [],
612 | "source": [
613 | "cat /data/popgen_course/genotypes_small.ind | awk '{print $3}' | sort | uniq -c > population_frequencies.txt"
614 | ]
615 | },
616 | {
617 | "cell_type": "markdown",
618 | "metadata": {},
619 | "source": [
620 | "OK, we have created a new file called `population_frequencies.txt` in our current directory. We have used the bash redirection symbol `>` for writing outputs from a command or pipeline into a file. The file should now contain the per-population counts. We can check this by running:"
621 | ]
622 | },
623 | {
624 | "cell_type": "code",
625 | "execution_count": 22,
626 | "metadata": {},
627 | "outputs": [
628 | {
629 | "name": "stdout",
630 | "output_type": "stream",
631 | "text": [
632 | " 9 Abkhasian\n",
633 | " 16 Adygei\n",
634 | " 6 Albanian\n",
635 | " 7 Aleut\n",
636 | " 4 Aleut_Tlingit\n",
637 | " 7 Altaian\n",
638 | " 10 Ami\n",
639 | " 10 Armenian\n",
640 | " 9 Atayal\n",
641 | " 10 Balkar\n"
642 | ]
643 | }
644 | ],
645 | "source": [
646 | "head population_frequencies.txt"
647 | ]
648 | },
649 | {
650 | "cell_type": "markdown",
651 | "metadata": {},
652 | "source": [
653 | "OK, it seems to have worked. If you want to look at the file in a more interactive way, go back to your Jupyter File Browser and click on the file, which you should now see within your working directory. The file should open in a text editor that you can use to scroll around."
654 | ]
655 | },
656 | {
657 | "cell_type": "markdown",
658 | "metadata": {},
659 | "source": [
660 | "OK, now that we have a file to plot, let's try it out using a new python3 notebook. See the next notebook, called `02_pynb_getting_started` in this series."
661 | ]
662 | }
663 | ],
664 | "metadata": {
665 | "kernelspec": {
666 | "display_name": "Bash",
667 | "language": "bash",
668 | "name": "bash"
669 | },
670 | "language_info": {
671 | "codemirror_mode": "shell",
672 | "file_extension": ".sh",
673 | "mimetype": "text/x-sh",
674 | "name": "bash"
675 | }
676 | },
677 | "nbformat": 4,
678 | "nbformat_minor": 2
679 | }
680 |
--------------------------------------------------------------------------------
/03_bashnb_smartpca.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Principal Components Analysis (PCA)"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "Principal components analysis (PCA) is one of the most useful techniques to visualise genetic diversity in a dataset. The methodology is not restricted to genetic data, but in general allows breaking down high-dimensional datasets to two or more dimensions for visualisation in a two-dimensional space."
15 | ]
16 | },
17 | {
18 | "cell_type": "markdown",
19 | "metadata": {},
20 | "source": [
21 | "## Genotype Data"
22 | ]
23 | },
24 | {
25 | "cell_type": "markdown",
26 | "metadata": {},
27 | "source": [
28 | "This lesson is also our first contact with the genotype data used in this and most of the following lessons. The dataset that we will work with contains 1,340 individuals, each represented by 593,124 single nucleotide polymorphisms (SNPs). Those SNPs have exactly two different alleles, and each individual has one of four possible values at each SNP: homozygous reference, heterozygous, homozygous alternative, or missing. Those four values are encoded as 2, 1, 0 and 9, respectively. \n",
29 | "\n",
30 | "The data is laid out as a matrix, with columns indicating individuals, and rows indicating SNPs. The data itself comes in the so-called \"EIGENSTRAT\" format, which is defined in the [Eigensoft package](https://github.com/DReichLab/EIG) and used by many of the tools in this workshop. In this format, a genotype dataset consists of three files, usually with the following file endings:\n",
31 | "\n",
32 | "* `*.snp`: The file containing the SNP positions. It consists of six columns: SNP-name, chromosome, genetic position, physical position, reference allele, alternative allele.\n",
33 | "* `*.ind`: The file containing the names of the individuals. It consists of three columns: Individual Name, Sex (encoded as M(ale), F(emale), or U(nknown)), and population name.\n",
34 | "* `*.geno`: The file containing the genotype matrix, with individuals laid out from left to right, and SNP positions laid out from top to bottom.\n",
35 | " \n",
36 | "In the following, we will explore the files using bash in this notebook."
37 | ]
38 | },
39 | {
40 | "cell_type": "markdown",
41 | "metadata": {},
42 | "source": [
43 | "The data that we want to analyse is stored at `/data/popgen_course`. Let's list the contents of that directory:"
44 | ]
45 | },
46 | {
47 | "cell_type": "code",
48 | "execution_count": 1,
49 | "metadata": {},
50 | "outputs": [
51 | {
52 | "name": "stdout",
53 | "output_type": "stream",
54 | "text": [
55 | "AllEurasia.poplist.txt\tgenotypes_small.ind WestEurasia.poplist.txt\n",
56 | "genotypes_small.geno\tgenotypes_small.snp\n"
57 | ]
58 | }
59 | ],
60 | "source": [
61 | "ls /data/popgen_course"
62 | ]
63 | },
64 | {
65 | "cell_type": "markdown",
66 | "metadata": {},
67 | "source": [
68 | "Let's explore those files a bit. Here are the first 20 individuals:"
69 | ]
70 | },
71 | {
72 | "cell_type": "code",
73 | "execution_count": 2,
74 | "metadata": {},
75 | "outputs": [
76 | {
77 | "name": "stdout",
78 | "output_type": "stream",
79 | "text": [
80 | " Yuk_009 M Yukagir\n",
81 | " Yuk_025 F Yukagir\n",
82 | " Yuk_022 F Yukagir\n",
83 | " Yuk_020 F Yukagir\n",
84 | " MC_40 M Chukchi\n",
85 | " Yuk_024 F Yukagir\n",
86 | " Yuk_023 F Yukagir\n",
87 | " MC_16 M Chukchi\n",
88 | " MC_15 F Chukchi\n",
89 | " MC_18 M Chukchi\n",
90 | " Yuk_004 M Yukagir\n",
91 | " MC_08 F Chukchi\n",
92 | " Nov_005 M Nganasan\n",
93 | " MC_25 F Chukchi\n",
94 | " Yuk_019 F Yukagir\n",
95 | " Yuk_011 M Yukagir\n",
96 | " Sesk_47 M Chukchi1\n",
97 | " MC_17 M Chukchi\n",
98 | " Yuk_021 M Yukagir\n",
99 | " MC_06 F Chukchi\n"
100 | ]
101 | }
102 | ],
103 | "source": [
104 | "head -20 /data/popgen_course/genotypes_small.ind"
105 | ]
106 | },
107 | {
108 | "cell_type": "markdown",
109 | "metadata": {},
110 | "source": [
111 | "And here the first 20 SNP rows:"
112 | ]
113 | },
114 | {
115 | "cell_type": "code",
116 | "execution_count": 3,
117 | "metadata": {},
118 | "outputs": [
119 | {
120 | "name": "stdout",
121 | "output_type": "stream",
122 | "text": [
123 | " 1_752566 1 0.020130 752566 G A\n",
124 | " 1_842013 1 0.022518 842013 T G\n",
125 | " 1_891021 1 0.024116 891021 G A\n",
126 | " 1_903426 1 0.024457 903426 C T\n",
127 | " 1_949654 1 0.025727 949654 A G\n",
128 | " 1_1018704 1 0.026288 1018704 A G\n",
129 | " 1_1045331 1 0.026665 1045331 G A\n",
130 | " 1_1048955 1 0.026674 1048955 A G\n",
131 | " 1_1061166 1 0.026711 1061166 T C\n",
132 | " 1_1108637 1 0.028311 1108637 G A\n",
133 | " 1_1120431 1 0.028916 1120431 G A\n",
134 | " 1_1156131 1 0.029335 1156131 T C\n",
135 | " 1_1157547 1 0.029356 1157547 T C\n",
136 | " 1_1158277 1 0.029367 1158277 G A\n",
137 | " 1_1161780 1 0.029391 1161780 C T\n",
138 | " 1_1170587 1 0.029450 1170587 C T\n",
139 | " 1_1205155 1 0.029735 1205155 A C\n",
140 | " 1_1211292 1 0.029785 1211292 C T\n",
141 | " 1_1235792 1 0.030045 1235792 C T\n",
142 | " 1_1254255 1 0.030111 1254255 G A\n"
143 | ]
144 | }
145 | ],
146 | "source": [
147 | "head -20 /data/popgen_course/genotypes_small.snp"
148 | ]
149 | },
150 | {
151 | "cell_type": "markdown",
152 | "metadata": {},
153 | "source": [
154 | "And here are the genotypes of the first 50 individuals at the first 20 SNPs:"
155 | ]
156 | },
157 | {
158 | "cell_type": "code",
159 | "execution_count": 3,
160 | "metadata": {},
161 | "outputs": [
162 | {
163 | "name": "stdout",
164 | "output_type": "stream",
165 | "text": [
166 | "01011012111022101020212001000102000000110010002000\n",
167 | "20121210122100111221001112022012221211022221211210\n",
168 | "11001120011100210010011110000112000001111000011100\n",
169 | "00001122102221212211211002022212221221121122112021\n",
170 | "00000000000000000000000000001000000000000000001000\n",
171 | "10121002211022011011211101201100000100120020102001\n",
172 | "22222222222222222222222222222222222222222222222222\n",
173 | "22112220022120221020012122222122122222101222121212\n",
174 | "22112220022120221020012122020122122122101222121211\n",
175 | "22222222221022222022222222222222222222222222112222\n",
176 | "22122222121222222222222222222212222222222222202211\n",
177 | "11011000010000010010000002220100212000012021101011\n",
178 | "12211212212222112212222221212212222122222222222222\n",
179 | "12211212212222112212222221212212222122222222222222\n",
180 | "12211212212222112212222221212212222122222222222222\n",
181 | "22222222222222222222222222222222222222222222222222\n",
182 | "22222222222222222222222222222222222222222222222222\n",
183 | "10111111021001110011002001222210222112112220212122\n",
184 | "22222222222222222222222222222222222222222222222222\n",
185 | "21221212121022212022222222222222211222122221922222\n"
186 | ]
187 | }
188 | ],
189 | "source": [
190 | "head -20 /data/popgen_course/genotypes_small.geno | cut -c1-50"
191 | ]
192 | },
193 | {
194 | "cell_type": "markdown",
195 | "metadata": {},
196 | "source": [
197 | "Counting how many individuals and SNPs there are:"
198 | ]
199 | },
200 | {
201 | "cell_type": "code",
202 | "execution_count": 4,
203 | "metadata": {},
204 | "outputs": [
205 | {
206 | "name": "stdout",
207 | "output_type": "stream",
208 | "text": [
209 | "1340 /data/popgen_course/genotypes_small.ind\n",
210 | "593124 /data/popgen_course/genotypes_small.snp\n"
211 | ]
212 | }
213 | ],
214 | "source": [
215 | "wc -l /data/popgen_course/genotypes_small.ind\n",
216 | "wc -l /data/popgen_course/genotypes_small.snp"
217 | ]
218 | },
219 | {
220 | "cell_type": "markdown",
221 | "metadata": {},
222 | "source": [
223 | "And now we check that the first row of the `*.geno` file indeed contains one column (i.e. one character) per individual:"
224 | ]
225 | },
226 | {
227 | "cell_type": "code",
228 | "execution_count": 6,
229 | "metadata": {},
230 | "outputs": [
231 | {
232 | "name": "stdout",
233 | "output_type": "stream",
234 | "text": [
235 | "1341\n"
236 | ]
237 | }
238 | ],
239 | "source": [
240 | "head -1 /data/popgen_course/genotypes_small.geno | wc -c"
241 | ]
242 | },
243 | {
244 | "cell_type": "markdown",
245 | "metadata": {},
246 | "source": [
247 | "which is one more, including the newline character at the end of the line. Now counting the number of rows in the `*.geno`-file (this takes a few seconds, as the file is several hundred MB large):"
248 | ]
249 | },
250 | {
251 | "cell_type": "code",
252 | "execution_count": 7,
253 | "metadata": {},
254 | "outputs": [
255 | {
256 | "name": "stdout",
257 | "output_type": "stream",
258 | "text": [
259 | "593124 /data/popgen_course/genotypes_small.geno\n"
260 | ]
261 | }
262 | ],
263 | "source": [
264 | "wc -l /data/popgen_course/genotypes_small.geno"
265 | ]
266 | },
267 | {
268 | "cell_type": "markdown",
269 | "metadata": {},
270 | "source": [
271 | "Great, the number of rows and columns agrees with the numbers indicated in the `*.ind` and `*.snp` file!\n",
272 | "Now we're counting how many different populations there are. Let's first see the first 20 populations in the sorted list, alongside the number of individuals in each group:"
273 | ]
274 | },
275 | {
276 | "cell_type": "code",
277 | "execution_count": 5,
278 | "metadata": {},
279 | "outputs": [
280 | {
281 | "name": "stdout",
282 | "output_type": "stream",
283 | "text": [
284 | " 9 Abkhasian\n",
285 | " 16 Adygei\n",
286 | " 6 Albanian\n",
287 | " 7 Aleut\n",
288 | " 4 Aleut_Tlingit\n",
289 | " 7 Altaian\n",
290 | " 10 Ami\n",
291 | " 10 Armenian\n",
292 | " 9 Atayal\n",
293 | " 10 Balkar\n",
294 | " 29 Basque\n",
295 | " 25 BedouinA\n",
296 | " 19 BedouinB\n",
297 | " 10 Belarusian\n",
298 | " 6 BolshoyOleniOstrov\n",
299 | " 9 Borneo\n",
300 | " 10 Bulgarian\n",
301 | " 8 Cambodian\n",
302 | " 2 Canary_Islander\n",
303 | " 2 ChalmnyVarre\n"
304 | ]
305 | }
306 | ],
307 | "source": [
308 | "awk '{print $3}' /data/popgen_course/genotypes_small.ind | sort | uniq -c | head -20"
309 | ]
310 | },
311 | {
312 | "cell_type": "markdown",
313 | "metadata": {},
314 | "source": [
315 | "## How PCA works\n",
316 | "\n",
317 | "To understand how PCA works, consider a single individual and its representation by its 593,124 markers. Formally, each individual is a point in a 593,124-dimensional space, where each dimension\n",
318 | "can take only the three possible genotypes indicated above, or have missing data. To visualise this high-dimensional dataset, we would like to project it down to two dimensions. But as there are many ways to project the shadow of a three-dimensional object on a two-dimensional plane, there are many (and even more) ways to project a 593,124-dimensional cloud of points to two dimensions. What PCA does is figuring out the \"best\" way to do this projection in order to visualise the major components of variance in the data.\n"
319 | ]
320 | },
321 | {
322 | "cell_type": "markdown",
323 | "metadata": {},
324 | "source": [
325 | "## Parameter files\n",
326 | "For actually running the analysis, we use a software called `smartPCA` from the [Eigensoft package](https://github.com/DReichLab/EIG). As many other tools from this and related packages, `smartPCA` reads in a parameter file which specifies its input and output files and options. In our case, we want the parameter file to have the following content:\n",
327 | "\n",
328 | " genotypename: /data/popgen_course/genotypes_small.geno\n",
329 | " snpname: /data/popgen_course/genotypes_small.snp\n",
330 | " indivname: /data/popgen_course/genotypes_small.ind\n",
331 | " evecoutname: pca.WestEurasia.evec\n",
332 | " evaloutname: pca.WestEurasia.eval\n",
333 | " poplistname: /data/popgen_course/WestEurasia.poplist.txt\n",
334 | " lsqproject: YES\n",
335 | " numoutevec: 4\n",
336 | " numthreads: 1"
337 | ]
338 | },
339 | {
340 | "cell_type": "markdown",
341 | "metadata": {},
342 | "source": [
343 | "Here, the first three parameters specify the input genotype files. The next two rows specify two output file names, typically with ending `*.evec` and `*.eval`. The parameter line beginning with `poplistname` contains a file with a list of populations used for calculating the principal components (see below). The option `lsqproject` is important for applications including ancient DNA with lots of missing data, which I will not elaborate on. For the purpose of this workshop, you should use `lsqproject: YES`. The next option `numoutevec` specifies the number of principal components that we compute, the last option `numthreads` the number of CPUs to use for this run. We use just one since we're working together on the same computer, so cannot afford everyone running on lots of CPUs."
344 | ]
345 | },
346 | {
347 | "cell_type": "markdown",
348 | "metadata": {},
349 | "source": [
350 | "## Population lists vs. Projection\n",
351 | "\n",
352 | "The parameter named `poplistname` is a crucial one. It specifies the populations whose individuals are used to calculate the principal components. Why not just all of them, you ask? For several reasons: First, there are simply too many of them and we don't want to use all of them, since the computation would take too long. More importantly, however, we generally try to avoid using ancient samples to compute principal components, to avoid specific ancient-DNA related artefacts affecting the computation. Finally, the list of populations to use for PCA should be informed by your question. If you're investigating African population structure, it makes no sense to put Asian or European individuals in your population list, since then the main axes of genetic differentiation would not be inside of Africa, but between Africans and Non-Africans.\n",
353 | "\n",
354 | "So what happens to individuals that are not in populations listed in the population list? Well, fortunately, they are not just ignored, but \"projected\". This means that after the principal components have been computed, *all* individuals (not just the ones in the list) are projected onto these principal components. That way, we can visualise ancient populations in the context of modern genetic variation. That may sound a bit problematic at first (some variation in ancient populations is not represented well by modern populations), but it turns out to be nevertheless one of the most useful tools for this purpose. The advantage of preventing ancient-DNA artefacts and batch effects from affecting the visualisation outweighs the disadvantage of missing some private genetic variation components in the ancient populations themselves. Of course, that argument breaks down once the analysed populations become too ancient and detached from modern genetic variation. But for our purposes it will work just fine.\n",
355 | "\n",
356 | "For this workshop, I prepared two population lists:\n",
357 | "\n",
358 | " /data/popgen_course/WestEurasia.poplist.txt\n",
359 | " /data/popgen_course/AllEurasia.poplist.txt\n",
360 | "\n",
361 | "As you can tell from the names of the files, they specify two sets of modern populations representing West Eurasia or all of Europe and Asia, respectively.\n",
362 | "\n",
363 | "I recommend looking through both of the population lists and googling some population names that you don't recognise to get a feeling for the ethnic groups represented here."
364 | ]
365 | },
366 | {
367 | "cell_type": "markdown",
368 | "metadata": {},
369 | "source": [
370 | "## Running `smartPCA`"
371 | ]
372 | },
373 | {
374 | "cell_type": "markdown",
375 | "metadata": {},
376 | "source": [
377 | "Now go ahead and open a new text file using your Jupyter Browser, you can name it anything you like. For the sake of a concrete name, let's call it `pca.WestEurasia.params.txt`. Text files in Jupyter are opened in a text editor, so you can then simply copy-paste the above lines into the new file."
378 | ]
379 | },
380 | {
381 | "cell_type": "markdown",
382 | "metadata": {},
383 | "source": [
384 | "Let's see whether it worked, by printing out the contents of that file into your notebook:"
385 | ]
386 | },
387 | {
388 | "cell_type": "code",
389 | "execution_count": 8,
390 | "metadata": {},
391 | "outputs": [
392 | {
393 | "name": "stdout",
394 | "output_type": "stream",
395 | "text": [
396 | "genotypename: /data/popgen_course/genotypes_small.geno\n",
397 | "snpname: /data/popgen_course/genotypes_small.snp\n",
398 | "indivname: /data/popgen_course/genotypes_small.ind\n",
399 | "evecoutname: pca.WestEurasia.evec\n",
400 | "evaloutname: pca.WestEurasia.eval\n",
401 | "poplistname: /data/popgen_course/WestEurasia.poplist.txt\n",
402 | "lsqproject: YES\n",
403 | "numoutevec: 4\n",
404 | "numthreads: 1\n"
405 | ]
406 | }
407 | ],
408 | "source": [
409 | "cat pca.WestEurasia.params.txt"
410 | ]
411 | },
412 | {
413 | "cell_type": "markdown",
414 | "metadata": {},
415 | "source": [
416 | "Great, so that's our parameter file for running `smartPCA`.\n",
417 | "\n",
418 | "***Note:*** We specified two output files in our parameter file, here called `pca.WestEurasia.evec` and `pca.WestEurasia.eval`. You can actually put any names you want in there. But beware of relative vs. absolute paths. File names starting with `/` are considered \"absolute\", that is, taken to go from the root of the file system. In contrast, filenames not starting with `/` are considered \"relative\" to the current working directory. If you forgot which directory you're in, run `pwd`.\n",
419 | "\n",
420 | "***Note:*** The option `poplistname` is a crucial one. Here you need to specify which populations are used to compute the eigenvectors of the principal components analysis. In our case, I have prepared two population list files: `/data/popgen_course/WestEurasia.poplist.txt` and `/data/popgen_course/AllEurasia.poplist.txt`. Pick one of the two to carry on."
421 | ]
422 | },
423 | {
424 | "cell_type": "markdown",
425 | "metadata": {},
426 | "source": [
427 | "Good, now we can run `smartPCA`. To do that, it's more convenient to use the terminal than a Notebook. So open a terminal and run\n",
428 | "\n",
429 | " smartpca -p pca.WestEurasia.params.txt"
430 | ]
431 | },
432 | {
433 | "cell_type": "markdown",
434 | "metadata": {},
435 | "source": [
436 | "This will typically run for about 30 minutes and output lots of logging output to the screen."
437 | ]
438 | },
439 | {
440 | "cell_type": "markdown",
441 | "metadata": {},
442 | "source": [
443 | "In a similar manner we can prepare a parameter file for the AllEurasia population list. This is how it should look:"
444 | ]
445 | },
446 | {
447 | "cell_type": "code",
448 | "execution_count": 11,
449 | "metadata": {},
450 | "outputs": [
451 | {
452 | "name": "stdout",
453 | "output_type": "stream",
454 | "text": [
455 | "genotypename: /data/popgen_course/genotypes_small.geno\n",
456 | "snpname: /data/popgen_course/genotypes_small.snp\n",
457 | "indivname: /data/popgen_course/genotypes_small.ind\n",
458 | "evecoutname: pca.AllEurasia.evec\n",
459 | "evaloutname: pca.AllEurasia.eval\n",
460 | "poplistname: /data/popgen_course/AllEurasia.poplist.txt\n",
461 | "lsqproject: YES\n",
462 | "numoutevec: 4\n",
463 | "numthreads: 1\n"
464 | ]
465 | }
466 | ],
467 | "source": [
468 | "cat pca.AllEurasia.params.txt"
469 | ]
470 | },
471 | {
472 | "cell_type": "markdown",
473 | "metadata": {},
474 | "source": [
475 | "And similar to the command above, we can run pca on the AllEurasia population list via:\n",
476 | "\n",
477 | " smartpca -p pca.AllEurasia.params.txt"
478 | ]
479 | },
480 | {
481 | "cell_type": "markdown",
482 | "metadata": {},
483 | "source": [
484 | "which will run slightly longer than the first one because there are more populations in the AllEurasia list."
485 | ]
486 | }
487 | ],
488 | "metadata": {
489 | "kernelspec": {
490 | "display_name": "Bash",
491 | "language": "bash",
492 | "name": "bash"
493 | },
494 | "language_info": {
495 | "codemirror_mode": "shell",
496 | "file_extension": ".sh",
497 | "mimetype": "text/x-sh",
498 | "name": "bash"
499 | }
500 | },
501 | "nbformat": 4,
502 | "nbformat_minor": 2
503 | }
504 |
--------------------------------------------------------------------------------
/05_Rmd_fstatistics.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "F Statistics"
3 | output: html_document
4 | editor_options:
5 | chunk_output_type: console
6 | ---
7 |
8 | ```{r, echo=FALSE}
9 | knitr::opts_chunk$set(message = FALSE) # make message = FALSE the default option for all chunks
10 | ```
11 |
12 | ```{r}
13 | library(magrittr)
14 | ```
15 |
16 | ## F3 Statistics
17 |
18 | F3 statistics are a useful analytical tool to understand population relationships. F3 statistics, just like F4 and F2 statistics, measure allele frequency correlations between populations and were introduced by Nick Patterson and colleagues in [Patterson 2012](http://www.genetics.org/content/early/2012/09/06/genetics.112.145037).
19 |
20 | F3 statistics are used for two purposes: i) as a test whether a target population (C) is admixed between two source populations (A and B), and ii) to measure shared drift between two test populations (A and B) from an outgroup (C).
21 |
22 | F3 statistics are in both cases defined as the average product of the allele frequency differences between population C and A, and between population C and B:
23 |
24 | $$F3(A,B;C)=\langle(c-a)(c-b)\rangle$$
25 |
26 | Here, $\langle\cdot\rangle$ denotes the average over all genotyped sites, and a, b and c
27 | denote the allele frequency for a given site in the three populations A, B and C.
28 |
29 | ## Admixture F3 Statistics
30 |
31 | It can be shown that if this statistic is negative, it provides unambiguous proof that population C is admixed between populations A and B, as in the following phylogeny (taken from Figure 1 of [Patterson 2012](http://www.genetics.org/content/early/2012/09/06/genetics.112.145037)):
32 |
33 | ![Admixture phylogeny from Figure 1 of Patterson 2012](img/f3_phylogeny.png)
34 |
35 | Intuitively, an F3 statistic becomes negative if the allele frequency of the target population C is on average intermediate between the allele frequencies of A and B. Consider as an extreme example a genomic site where a=0, b=1 and c=0.5. Then we have (c−a)(c−b)=−0.25, which is negative. So if the overall statistic is negative, it suggests that in many positions, the allele frequency c is indeed intermediate, suggesting admixture between the two sources.
36 |
37 | **Note:** If an F3 statistic is *not* negative, it does *not* prove that there is no admixture!
38 |
39 | We will use this statistic to test if Finnish are admixed between East and West, using different Eastern and Western sources. In the West, we use French, Icelandic, Lithuanian and Norwegian as sources, and in the East we use Nganasan and one of the ancient groups analysed in this workshop, *Bolshoy Oleni Ostrov*, a group of 3500-year-old individuals from the Kola Peninsula in northern Russia.
40 |
41 | We use the software `qp3Pop` from [AdmixTools](https://github.com/DReichLab/AdmixTools), which, similar to `smartpca`, takes a parameter file:
42 |
43 | ```
44 | genotypename: input genotype file (in eigenstrat format)
45 | snpname: input snp file (in eigenstrat format)
46 | indivname: input indiv file (in eigenstrat format)
47 | popfilename: a file containing rows with three populations on each line A, B and C.
48 | inbreed: YES
49 | ```
50 |
51 | Here, the last option is necessary if we are analysing pseudo-diploid ancient data (which is the case here).
52 |
53 | To prepare the `popfilename`, create a new text file with the following content:
54 |
55 | ```
56 | Nganasan French Finnish
57 | Nganasan Icelandic Finnish
58 | Nganasan Lithuanian Finnish
59 | Nganasan Norwegian Finnish
60 | BolshoyOleniOstrov French Finnish
61 | BolshoyOleniOstrov Icelandic Finnish
62 | BolshoyOleniOstrov Lithuanian Finnish
63 | BolshoyOleniOstrov Norwegian Finnish
64 | ```
65 |
66 | **Exercise:** Prepare the parameter file with the input data as in the PCA session (see Principal Components Analysis (PCA)) and then run `qp3Pop -p PARAMETER_FILE`, where `PARAMETER_FILE` should be replaced by your parameter file name. As genotype data, use the files called `/data/popgen_course/HumanOrigins_FennoScandian_small.*`.
67 |
68 | The results are in the output that you can view in the Notebook. The crucial bit should look like this:
69 |
70 | ```
71 | Source 1 Source 2 Target f_3 std. err Z SNPs
72 | result: Nganasan French Finnish -0.004539 0.000510 -8.894 442567
73 | result: Nganasan Icelandic Finnish -0.005297 0.000563 -9.404 427954
74 | result: Nganasan Lithuanian Finnish -0.005062 0.000590 -8.574 426231
75 | result: Nganasan Norwegian Finnish -0.004744 0.000569 -8.332 428161
76 | result: BolshoyOleniOstrov French Finnish -0.002814 0.000444 -6.341 402958
77 | result: BolshoyOleniOstrov Icelandic Finnish -0.002590 0.000486 -5.323 386418
78 | result: BolshoyOleniOstrov Lithuanian Finnish -0.001523 0.000536 -2.840 384134
79 | result: BolshoyOleniOstrov Norwegian Finnish -0.001553 0.000502 -3.092 386203
80 | ```
81 |
82 | The first three columns of this output show the three populations A, B (sources) and C (target). These are followed by the f3 statistic, which is negative in all cases tested here, its standard error, a Z score and the number of SNPs that went into the statistic.
83 |
84 | The Z score is key: It gives the deviation of the f3 statistic from zero in units of the standard error. As a general rule, a Z score of -3 or lower suggests a significant rejection of the null hypothesis that the statistic is not negative. In this case, all but one of the statistics pass this threshold (the pair BolshoyOleniOstrov/Lithuanian falls just short of it, with Z = -2.84), showing that Finnish have ancestral admixture of East and West Eurasian ancestry. Note that the statistic does not tell us when this admixture happened!
85 |
86 | ## F4 Statistics
87 |
88 | A different way to test for admixture is by “F4 statistics” (or “D statistics” which is very similar), also introduced in [Patterson 2012](http://www.genetics.org/content/early/2012/09/06/genetics.112.145037).
89 |
90 | F4 statistics are also defined in terms of correlations of allele frequency differences, similarly to F3 statistics (see above), but involving four different populations, not just three. Specifically we define
91 |
92 | $$F4(A,B;C,D)=\langle(a-b)(c-d)\rangle.$$
93 |
94 | To understand the statistics, consider the following tree:
95 |
96 | ![Four-population tree used to illustrate F4 statistics](img/f4_phylogeny.png)
97 |
98 | In this tree, without any additional admixture, the allele frequency difference between A and B should be completely independent from the allele frequency difference between C and D. In that case, F4(A, B; C, D) should be zero, or at least not statistically different from zero. However, if there was gene flow from C or D into A or B, the statistic should be different from zero. Specifically, if the statistic is significantly negative, it implies gene flow between either C and B, or D and A. If it is significantly positive, it implies gene flow between A and C, or B and D.
99 |
100 | The way this statistic is often used, is to put a divergent outgroup as population A, for which we know for sure that there was no admixture into either C or D. With this setup, we can then test for gene flow between B and D (if the statistic is positive), or B and C (if it is negative).
101 |
102 | Here, we can use this statistic to test for East Asian admixture in Finns, similarly to the test using Admixture F3 statistics above. We will use the `qpDstat` program from [AdmixTools](https://github.com/DReichLab/AdmixTools) for that. We need to again prepare a population list file, this time with four populations (A, B, C, D). I suggest you open a new file and fill it with:
103 |
104 | ```
105 | Mbuti Nganasan French Finnish
106 | Mbuti Nganasan Icelandic Finnish
107 | Mbuti Nganasan Lithuanian Finnish
108 | Mbuti Nganasan Norwegian Finnish
109 | Mbuti BolshoyOleniOstrov French Finnish
110 | Mbuti BolshoyOleniOstrov Icelandic Finnish
111 | Mbuti BolshoyOleniOstrov Lithuanian Finnish
112 | Mbuti BolshoyOleniOstrov Norwegian Finnish
113 | ```
114 |
115 | You can then use this file again in a parameter file, similar to the one prepared for `qp3Pop` above:
116 |
117 | ```
118 | genotypename: input genotype file (in eigenstrat format)
119 | snpname: input snp file (in eigenstrat format)
120 | indivname: input indiv file (in eigenstrat format)
121 | popfilename: a file containing rows with four populations on each line A, B, C and D.
122 | f4mode: YES
123 | ```
124 |
125 | Note that you cannot give the “inbreed” option here.
126 |
127 | **Exercise:** Prepare the parameter file as suggested above and then run `qpDstat -p PARAMETER_FILE`, where `PARAMETER_FILE` should be replaced by your parameter file name. This will take 5-6 minutes.
128 |
129 | The results should be (skipping some header lines):
130 |
131 | ```
132 | result: Mbuti Nganasan French Finnish 0.002363 19.016 29254 27852 593124
133 | result: Mbuti Nganasan Icelandic Finnish 0.001721 11.926 28915 27894 593124
134 | result: Mbuti Nganasan Lithuanian Finnish 0.001368 9.664 28745 27933 593124
135 | result: Mbuti Nganasan Norwegian Finnish 0.001685 11.663 28933 27934 593124
136 | result: Mbuti BolshoyOleniOstrov French Finnish 0.001962 16.737 27249 26175 547486
137 | result: Mbuti BolshoyOleniOstrov Icelandic Finnish 0.001084 7.776 26876 26282 547486
138 | result: Mbuti BolshoyOleniOstrov Lithuanian Finnish 0.000554 3.942 26683 26380 547486
139 | result: Mbuti BolshoyOleniOstrov Norwegian Finnish 0.000952 6.707 26873 26351 547486
140 | ```
141 |
142 | Here, the key columns are columns 2, 3, 4 and 5, denoting A, B, C and D, and columns 6 and 7, which denote the F4 statistic and the Z score, measuring significance in difference from zero.
143 |
144 | As you can see, in all cases, the Z score is positive and larger than 3, indicating a significant deviation from zero, and implying gene flow between Nganasan and Finnish, and BolshoyOleniOstrov and Finnish, when compared to French, Icelandic, Lithuanian or Norwegian.
145 |
146 | ## Outgroup-F3-Statistics
147 |
148 | Outgroup F3 statistics are a special way of using F3 statistics. The definition is the same as for Admixture F3 statistics, but instead of a target C and two source populations A and B, one now gives an outgroup C and two test populations A and B.
149 |
150 | To get an intuition for this statistic, consider the following tree:
151 |
152 | ![Tree illustrating the outgroup F3 statistic](img/outgroupf3_phylogeny.png)
153 |
154 | In this scenario, the statistic F3(A, B; C) measures the branch length from C to the common ancestor of A and B, coloured red. So this statistic is simply a measure of how closely two populations A and B are related with each other, as measured from a distant outgroup. It is thus a similarity measure: The higher the statistic, the more genetically similar A and B are to one another.
155 |
156 | We can use this statistic to measure, for example, the genetic affinity to East Asia, by performing the statistic F3(Han, X; Mbuti), where Mbuti is a distant African population and acts as outgroup here, Han denotes Han Chinese, and X denotes various European populations that we want to test.
157 |
158 | You need to start, again, by preparing a list of population triples to be measured. I suggest the following list:
159 |
160 | ```
161 | Han Chuvash Mbuti
162 | Han Albanian Mbuti
163 | Han Armenian Mbuti
164 | Han Bulgarian Mbuti
165 | Han Czech Mbuti
166 | Han Druze Mbuti
167 | Han English Mbuti
168 | Han Estonian Mbuti
169 | Han Finnish Mbuti
170 | Han French Mbuti
171 | Han Georgian Mbuti
172 | Han Greek Mbuti
173 | Han Hungarian Mbuti
174 | Han Icelandic Mbuti
175 | Han Italian_North Mbuti
176 | Han Italian_South Mbuti
177 | Han Lithuanian Mbuti
178 | Han Maltese Mbuti
179 | Han Mordovian Mbuti
180 | Han Norwegian Mbuti
181 | Han Orcadian Mbuti
182 | Han Russian Mbuti
183 | Han Sardinian Mbuti
184 | Han Scottish Mbuti
185 | Han Sicilian Mbuti
186 | Han Spanish_North Mbuti
187 | Han Spanish Mbuti
188 | Han Ukrainian Mbuti
189 | Han Levanluhta Mbuti
190 | Han BolshoyOleniOstrov Mbuti
191 | Han ChalmnyVarre Mbuti
192 | Han Saami.DG Mbuti
193 | ```
194 |
195 | which cycles through many populations from Europe, including the ancient individuals from Chalmny Varre, Bolshoy Oleni Ostrov and Levänluhta.
196 |
197 | **Exercise:** Copy this list into a file, and prepare a parameter file for running `qp3Pop`, similar to the parameter file for admixture F3 statistics above, and run `qp3Pop` with that parameter file as above. Note that here you don't need the line beginning with `inbreed`. This will take up to 10 minutes.
198 |
199 | You should find this (skipping header lines from the output):
200 |
201 | ```
202 | Source 1 Source 2 Target f_3 std. err Z SNPs
203 | result: Han Chuvash Mbuti 0.233652 0.002072 112.782 502678
204 | result: Han Albanian Mbuti 0.215629 0.002029 106.291 501734
205 | result: Han Armenian Mbuti 0.213724 0.001963 108.882 504370
206 | result: Han Bulgarian Mbuti 0.216193 0.001979 109.266 504310
207 | result: Han Czech Mbuti 0.218060 0.002002 108.939 504089
208 | result: Han Druze Mbuti 0.209551 0.001919 109.205 510853
209 | result: Han English Mbuti 0.216959 0.001973 109.954 504161
210 | result: Han Estonian Mbuti 0.220730 0.002019 109.332 503503
211 | result: Han Finnish Mbuti 0.223447 0.002044 109.345 502217
212 | result: Han French Mbuti 0.216623 0.001969 110.012 509613
213 | result: Han Georgian Mbuti 0.214295 0.001935 110.721 503598
214 | result: Han Greek Mbuti 0.215203 0.001984 108.465 507475
215 | result: Han Hungarian Mbuti 0.217894 0.001999 109.004 507409
216 | result: Han Icelandic Mbuti 0.218683 0.002015 108.553 504655
217 | result: Han Italian_North Mbuti 0.215332 0.001978 108.854 507589
218 | result: Han Italian_South Mbuti 0.211787 0.002271 93.265 492400
219 | result: Han Lithuanian Mbuti 0.219615 0.002032 108.098 503681
220 | result: Han Maltese Mbuti 0.210359 0.001956 107.542 503985
221 | result: Han Mordovian Mbuti 0.223469 0.002008 111.296 503441
222 | result: Han Norwegian Mbuti 0.218873 0.002023 108.197 504621
223 | result: Han Orcadian Mbuti 0.217773 0.002014 108.115 504993
224 | result: Han Russian Mbuti 0.223993 0.001995 112.274 506525
225 | result: Han Sardinian Mbuti 0.213230 0.001980 107.711 508413
226 | result: Han Scottish Mbuti 0.218489 0.002039 107.145 499784
227 | result: Han Sicilian Mbuti 0.212272 0.001975 107.486 505477
228 | result: Han Spanish_North Mbuti 0.215885 0.002029 106.383 500853
229 | result: Han Spanish Mbuti 0.213869 0.001975 108.297 513648
230 | result: Han Ukrainian Mbuti 0.218716 0.002007 108.950 503981
231 | result: Han Levanluhta Mbuti 0.236252 0.002383 99.123 263049
232 | result: Han BolshoyOleniOstrov Mbuti 0.247814 0.002177 113.849 457102
233 | result: Han ChalmnyVarre Mbuti 0.233499 0.002304 101.345 366220
234 | result: Han Saami.DG Mbuti 0.236198 0.002274 103.852 489038
235 | ```
236 |
237 | Now it’s time to plot these results using R. Copy the results (all lines from the output beginning with “result:”) into a text file named "f3_outgroup_stats_Han.txt", and load it into an R tibble using:
238 |
239 | ```{r}
240 | # Parse the saved qp3Pop "result:" lines for Han; the leading "result:" token goes to `dummy`.
241 | d <- readr::read_delim(
242 |   "f3_outgroup_stats_Han.txt",
243 |   delim = " ", trim_ws = TRUE,  # TRUE, not T: T is an ordinary variable that can be reassigned
244 |   col_names = c("dummy", "A", "B", "C", "F3", "StdErr", "Z", "SNPS")
245 | )
246 | ```
247 |
248 | We can check that it worked:
249 |
250 | ```{r}
251 | d # top-level expression: shows the loaded table so the parsed columns can be checked
252 | ```
253 |
254 | ```{r}
255 | # Dot-and-whisker plot of F3(Han, X; Mbuti): one row per test population X
256 | # (column B), rows ordered by F3, whiskers spanning F3 -/+ one standard error.
257 | library(ggplot2)
258 |
259 | ggplot(data = d) +
260 |   geom_errorbarh(
261 |     mapping = aes(
262 |       y = forcats::fct_reorder(B, F3),
263 |       xmin = F3 - StdErr, xmax = F3 + StdErr
264 |     )
265 |   ) +
266 |   geom_point(
267 |     mapping = aes(y = forcats::fct_reorder(B, F3), x = F3)
268 |   ) +
269 |   labs(x = "F3(Han, Test; Mbuti)")
270 | ```
271 |
272 | As expected, the ancient samples and modern Saami are the ones with the highest allele sharing with present-day East Asians (as represented by Han) compared to many other Europeans.
273 |
274 | ## Outgroup F3 Statistics Scatter plot
275 |
276 | The above plot shows an intriguing cline of differential relatedness to Han in many Europeans. For example, would you have guessed that Icelandics are closer to Han than Armenians are to Han? This is very surprising, and it shows that European ancestry has a complex relationship to East Asians. To understand this better, you can read [Patterson 2012](http://www.genetics.org/content/early/2012/09/06/genetics.112.145037), who makes some intriguing observations. Patterson and colleagues use Admixture F3 statistics and apply it to many populations world-wide. They summarise some population triples with the most negative F3 statistics in the following table:
277 |
278 | ![Table of the most negative F3 statistics from Patterson 2012](img/Patterson_2012_table.png)
279 |
280 | There are many interesting results here, but one of the most striking ones is the finding of F3(Sardinian, Karitiana; French), which is highly significantly negative. This statistic implies that French are admixed between Sardinians and Karitiana, a Native American population from Brazil. How is that possible? We can of course rule out any recent Native American backflow into Europe.
281 |
282 | Patterson and colleagues explained this finding by hypothesising an ancient admixture event, from a Siberian population that contributed to both Europeans and to Native Americans. They termed that population the “Ancient North Eurasians (ANE)”. The following admixture graph was suggested:
283 |
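284 | ![Admixture graph with Ancient North Eurasians (ANE) from Patterson 2012](img/Patterson_2012_ANEfig.png)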
285 |
286 | As you can see, the idea is that modern Central Europeans, such as French, are admixed between Southern Europeans (Sardinians) and ANE. The Ancient North Eurasians are a classic example for a “Ghost” population, a population which does not exist anymore in unmixed form, and from which we have no direct individual representative.
287 |
288 | Amazingly, two years after the publication of [Patterson 2012](http://www.genetics.org/content/early/2012/09/06/genetics.112.145037), the ANE ghost population was actually found: in 2014, [Raghavan et al.](https://www.nature.com/articles/nature12736) published a paper called “Upper Palaeolithic Siberian genome reveals dual ancestry of Native Americans”. A 24,000 year old boy (called MA1) from the site of “Mal’ta” in Siberia was shown to have close genetic affinity with both Europeans and in particular Native Americans, just as proposed in [Patterson 2012](http://www.genetics.org/content/early/2012/09/06/genetics.112.145037).
289 |
290 | The affinities are summarised nicely in this figure from [Raghavan et al.](https://www.nature.com/articles/nature12736):
291 |
292 | ![Genetic affinities of MA1 from Raghavan et al.](img/MA1_affinities.png)
293 |
294 | OK, so we now know that ancestry related to Native Americans contributed to European countries. Could that possibly explain the affinity of our ancient samples and Saami to Han Chinese in some way? To test this, we will run the same Outgroup F3 statistics as above, but this time not with Han but with MA1 as test population. Specifically, we run the following population triples in `qp3Pop`:
295 |
296 | ```
297 | MA1_HG.SG Chuvash Mbuti
298 | MA1_HG.SG Albanian Mbuti
299 | MA1_HG.SG Armenian Mbuti
300 | MA1_HG.SG Bulgarian Mbuti
301 | MA1_HG.SG Czech Mbuti
302 | MA1_HG.SG Druze Mbuti
303 | MA1_HG.SG English Mbuti
304 | MA1_HG.SG Estonian Mbuti
305 | MA1_HG.SG Finnish Mbuti
306 | MA1_HG.SG French Mbuti
307 | MA1_HG.SG Georgian Mbuti
308 | MA1_HG.SG Greek Mbuti
309 | MA1_HG.SG Hungarian Mbuti
310 | MA1_HG.SG Icelandic Mbuti
311 | MA1_HG.SG Italian_North Mbuti
312 | MA1_HG.SG Italian_South Mbuti
313 | MA1_HG.SG Lithuanian Mbuti
314 | MA1_HG.SG Maltese Mbuti
315 | MA1_HG.SG Mordovian Mbuti
316 | MA1_HG.SG Norwegian Mbuti
317 | MA1_HG.SG Orcadian Mbuti
318 | MA1_HG.SG Russian Mbuti
319 | MA1_HG.SG Sardinian Mbuti
320 | MA1_HG.SG Scottish Mbuti
321 | MA1_HG.SG Sicilian Mbuti
322 | MA1_HG.SG Spanish_North Mbuti
323 | MA1_HG.SG Spanish Mbuti
324 | MA1_HG.SG Ukrainian Mbuti
325 | MA1_HG.SG Levanluhta Mbuti
326 | MA1_HG.SG BolshoyOleniOstrov Mbuti
327 | MA1_HG.SG ChalmnyVarre Mbuti
328 | MA1_HG.SG Saami.DG Mbuti
329 | ```
330 |
331 | Here, `MA1_HG.SG` is the somewhat cryptic population name for the MA1 individual.
332 |
333 | **Exercise:** Follow the same protocol as above: Copy the list into a file, prepare a parameter file for `qp3Pop` with that population triple list, and run `qp3Pop`. Copy the results (all lines beginning with “result:”) into a file named "f3_outgroup_stats_MA1.txt".
334 |
335 | To test in what way the relationship to Han Chinese is correlated with the relationship with MA1, we will now plot the two statistics against each other in a scatter plot. We first have to merge the two outgroup-F3 datasets together. Here is the code including loading (assuming that the two F3 dataframes are called `outgroupf3dat_Han` and `outgroupf3dat_MA1`):
336 |
337 | ```{r}
338 | # Parse the saved qp3Pop "result:" lines for Han; the leading "result:" token goes to `dummy`.
339 | outgroupf3dat_Han <- readr::read_delim(
340 |   "f3_outgroup_stats_Han.txt",
341 |   delim = " ", trim_ws = TRUE,  # TRUE, not T: T is an ordinary variable that can be reassigned
342 |   col_names = c("dummy", "A", "B", "C", "F3", "stderr", "Z", "nSNPs")
343 | )
344 |
345 | # Parse the saved qp3Pop "result:" lines for MA1; the leading "result:" token goes to `dummy`.
346 | outgroupf3dat_MA1 <- readr::read_delim(
347 |   "f3_outgroup_stats_MA1.txt",
348 |   delim = " ", trim_ws = TRUE,  # TRUE, not T: T is an ordinary variable that can be reassigned
349 |   col_names = c("dummy", "A", "B", "C", "F3", "stderr", "Z", "nSNPs")
350 | )
351 |
352 | # Full join of the Han and MA1 tables on column B; other shared columns get the suffixes.
353 | outgroupf3dat_merged <- dplyr::full_join(
354 |   x = outgroupf3dat_Han, y = outgroupf3dat_MA1,
355 |   suffix = c("_Han", "_MA1"),
356 |   by = "B"
357 | )
358 | ```
359 |
360 | Again, we check that everything worked:
361 |
362 | ```{r}
363 | outgroupf3dat_merged # top-level expression: shows the joined table so the merge can be checked
364 | ```
365 |
366 | Now we can make a scatter plot:
367 |
368 | ```{r}
369 | # Scatter plot of the merged table: F3_Han on the x axis against F3_MA1 on the
370 | # y axis, one point per row, with explicit axis titles.
371 | ggplot(data = outgroupf3dat_merged) +
372 |   geom_point(
373 |     mapping = aes(x = F3_Han, y = F3_MA1)
374 |   ) +
375 |   labs(
376 |     x = "F3(Test, Han; Mbuti)",
377 |     y = "F3(Test, MA1; Mbuti)"
378 |   )
379 | ```
380 |
381 | This isn’t very useful, however, as we cannot see which point is which population. We use the `geom_label_repel` function from ggrepel to add text labels to each point:
382 |
383 | ```{r}
384 | # Scatter plot of F3_Han against F3_MA1 from the merged table; x and y are mapped
385 | # once at plot level, and ggrepel adds the value of column B as a label per point.
386 | ggplot(
387 |   data = outgroupf3dat_merged,
388 |   mapping = aes(
389 |     x = F3_Han,
390 |     y = F3_MA1
391 |   )
392 | ) +
393 |   geom_point() +
394 |   labs(
395 |     x = "F3(Test, Han; Mbuti)",
396 |     y = "F3(Test, MA1; Mbuti)"
397 |   ) +
398 |   ggrepel::geom_label_repel(
399 |     mapping = aes(label = B)
400 |   )
401 | ```
402 |
403 | The result shows that indeed the affinity to East Asians in the bulk of European countries can be explained by MA1-related ancestry. Most European countries have a linear relationship between their affinity to Han and their affinity to MA1. However, this is not true for our ancient samples from Fennoscandia and for modern Saami and Chuvash, who have extra affinity to Han not explained by MA1 ([Lazaridis et al. 2014](https://www.nature.com/articles/nature13673)).
404 |
405 | Now, why there is a connection between MA1 and Han is not trivial to explain. The most probable explanation involves "Basal Eurasian" ancestry, which happens to be anti-correlated to MA1-ancestry in Europe, and which drives those populations with high "Basal Eurasian" ancestry further away from Han. See [Lazaridis et al. 2014](https://www.nature.com/articles/nature13673) for more details.
406 |
--------------------------------------------------------------------------------
/pca.AllEurasia.eval:
--------------------------------------------------------------------------------
1 | 71.715980
2 | 9.646096
3 | 6.317298
4 | 3.871764
5 | 3.075333
6 | 2.646867
7 | 2.454284
8 | 2.236205
9 | 2.006447
10 | 2.000151
11 | 1.949834
12 | 1.946900
13 | 1.913285
14 | 1.909476
15 | 1.886648
16 | 1.873806
17 | 1.865762
18 | 1.844984
19 | 1.826839
20 | 1.818292
21 | 1.813557
22 | 1.806923
23 | 1.794645
24 | 1.787562
25 | 1.764851
26 | 1.756468
27 | 1.741944
28 | 1.735706
29 | 1.725703
30 | 1.721535
31 | 1.718770
32 | 1.712666
33 | 1.699156
34 | 1.689564
35 | 1.685036
36 | 1.677270
37 | 1.672807
38 | 1.666883
39 | 1.660374
40 | 1.653085
41 | 1.650461
42 | 1.643042
43 | 1.636804
44 | 1.634764
45 | 1.628574
46 | 1.623305
47 | 1.615182
48 | 1.601129
49 | 1.595264
50 | 1.583416
51 | 1.576798
52 | 1.574901
53 | 1.566180
54 | 1.559824
55 | 1.556275
56 | 1.549500
57 | 1.542913
58 | 1.540315
59 | 1.531804
60 | 1.528939
61 | 1.521507
62 | 1.515997
63 | 1.512632
64 | 1.512015
65 | 1.507849
66 | 1.499872
67 | 1.496241
68 | 1.493827
69 | 1.486508
70 | 1.483965
71 | 1.478250
72 | 1.469645
73 | 1.467586
74 | 1.458038
75 | 1.452991
76 | 1.449844
77 | 1.449182
78 | 1.446437
79 | 1.438036
80 | 1.433108
81 | 1.431148
82 | 1.428462
83 | 1.421567
84 | 1.417396
85 | 1.416161
86 | 1.411774
87 | 1.410701
88 | 1.406160
89 | 1.405373
90 | 1.399438
91 | 1.393987
92 | 1.388933
93 | 1.384344
94 | 1.381818
95 | 1.379500
96 | 1.374324
97 | 1.367127
98 | 1.364338
99 | 1.354982
100 | 1.351974
101 | 1.350672
102 | 1.348083
103 | 1.345424
104 | 1.341461
105 | 1.337633
106 | 1.333327
107 | 1.330888
108 | 1.328745
109 | 1.325143
110 | 1.321715
111 | 1.318676
112 | 1.318241
113 | 1.314366
114 | 1.311146
115 | 1.308522
116 | 1.306540
117 | 1.304946
118 | 1.301508
119 | 1.297752
120 | 1.297505
121 | 1.295093
122 | 1.291474
123 | 1.289045
124 | 1.286803
125 | 1.284147
126 | 1.282275
127 | 1.280667
128 | 1.279286
129 | 1.276059
130 | 1.275587
131 | 1.273570
132 | 1.272469
133 | 1.269639
134 | 1.266366
135 | 1.263176
136 | 1.260103
137 | 1.256532
138 | 1.254596
139 | 1.252190
140 | 1.250768
141 | 1.249041
142 | 1.244996
143 | 1.244038
144 | 1.242172
145 | 1.238261
146 | 1.236299
147 | 1.234178
148 | 1.231280
149 | 1.229033
150 | 1.228112
151 | 1.226033
152 | 1.221697
153 | 1.219402
154 | 1.218948
155 | 1.217277
156 | 1.215639
157 | 1.213122
158 | 1.208787
159 | 1.207875
160 | 1.205892
161 | 1.204942
162 | 1.203346
163 | 1.200594
164 | 1.198044
165 | 1.195350
166 | 1.193223
167 | 1.191716
168 | 1.190070
169 | 1.189331
170 | 1.187069
171 | 1.185844
172 | 1.183531
173 | 1.181102
174 | 1.180088
175 | 1.177938
176 | 1.175316
177 | 1.174158
178 | 1.171976
179 | 1.170660
180 | 1.167873
181 | 1.165652
182 | 1.164344
183 | 1.163405
184 | 1.159586
185 | 1.158722
186 | 1.158274
187 | 1.156118
188 | 1.154081
189 | 1.152420
190 | 1.151642
191 | 1.149088
192 | 1.147936
193 | 1.146993
194 | 1.144313
195 | 1.141345
196 | 1.137728
197 | 1.137247
198 | 1.136279
199 | 1.135313
200 | 1.133580
201 | 1.131784
202 | 1.131469
203 | 1.129439
204 | 1.127615
205 | 1.126348
206 | 1.125415
207 | 1.124355
208 | 1.121443
209 | 1.120501
210 | 1.119133
211 | 1.118446
212 | 1.117300
213 | 1.116887
214 | 1.115978
215 | 1.113715
216 | 1.112566
217 | 1.112342
218 | 1.109144
219 | 1.107941
220 | 1.106964
221 | 1.105361
222 | 1.105044
223 | 1.102753
224 | 1.101930
225 | 1.100408
226 | 1.099652
227 | 1.098429
228 | 1.098332
229 | 1.098243
230 | 1.094135
231 | 1.093516
232 | 1.092382
233 | 1.091670
234 | 1.090078
235 | 1.089586
236 | 1.088270
237 | 1.086303
238 | 1.085263
239 | 1.084290
240 | 1.083358
241 | 1.082818
242 | 1.082273
243 | 1.080266
244 | 1.079481
245 | 1.077849
246 | 1.076985
247 | 1.076192
248 | 1.076136
249 | 1.073774
250 | 1.072358
251 | 1.072124
252 | 1.071058
253 | 1.069525
254 | 1.069366
255 | 1.067774
256 | 1.067285
257 | 1.065857
258 | 1.064099
259 | 1.063845
260 | 1.062725
261 | 1.061943
262 | 1.060416
263 | 1.060043
264 | 1.059428
265 | 1.058306
266 | 1.057950
267 | 1.057505
268 | 1.057060
269 | 1.054535
270 | 1.053238
271 | 1.053102
272 | 1.052623
273 | 1.051572
274 | 1.050980
275 | 1.050070
276 | 1.049240
277 | 1.047290
278 | 1.046165
279 | 1.045326
280 | 1.044794
281 | 1.043890
282 | 1.043100
283 | 1.042825
284 | 1.041543
285 | 1.040521
286 | 1.038824
287 | 1.038218
288 | 1.037671
289 | 1.036877
290 | 1.036013
291 | 1.035027
292 | 1.034440
293 | 1.033807
294 | 1.032238
295 | 1.031766
296 | 1.030900
297 | 1.029723
298 | 1.029454
299 | 1.029267
300 | 1.028585
301 | 1.027856
302 | 1.027158
303 | 1.026376
304 | 1.025794
305 | 1.024513
306 | 1.024265
307 | 1.022640
308 | 1.022056
309 | 1.022035
310 | 1.021338
311 | 1.020752
312 | 1.019220
313 | 1.018966
314 | 1.018401
315 | 1.016882
316 | 1.016381
317 | 1.016260
318 | 1.015723
319 | 1.015156
320 | 1.013542
321 | 1.013257
322 | 1.012991
323 | 1.011783
324 | 1.011264
325 | 1.010738
326 | 1.009866
327 | 1.009583
328 | 1.008919
329 | 1.007864
330 | 1.007229
331 | 1.006901
332 | 1.005933
333 | 1.005583
334 | 1.004380
335 | 1.003731
336 | 1.003455
337 | 1.002697
338 | 1.002511
339 | 1.001233
340 | 1.000980
341 | 1.000107
342 | 0.999920
343 | 0.999383
344 | 0.998479
345 | 0.997897
346 | 0.997478
347 | 0.997201
348 | 0.995749
349 | 0.995228
350 | 0.994264
351 | 0.993564
352 | 0.993059
353 | 0.992377
354 | 0.991550
355 | 0.991430
356 | 0.990713
357 | 0.990020
358 | 0.989282
359 | 0.989015
360 | 0.988423
361 | 0.988142
362 | 0.987656
363 | 0.986261
364 | 0.985854
365 | 0.985454
366 | 0.985131
367 | 0.984279
368 | 0.983098
369 | 0.982697
370 | 0.982245
371 | 0.981888
372 | 0.981737
373 | 0.981172
374 | 0.980173
375 | 0.979647
376 | 0.979371
377 | 0.978758
378 | 0.978358
379 | 0.977391
380 | 0.976937
381 | 0.976641
382 | 0.976026
383 | 0.975142
384 | 0.974387
385 | 0.973590
386 | 0.973458
387 | 0.973009
388 | 0.972056
389 | 0.971135
390 | 0.970599
391 | 0.970517
392 | 0.969697
393 | 0.969303
394 | 0.968879
395 | 0.968092
396 | 0.967964
397 | 0.967065
398 | 0.966825
399 | 0.966743
400 | 0.965838
401 | 0.965401
402 | 0.964752
403 | 0.963642
404 | 0.963346
405 | 0.962434
406 | 0.962165
407 | 0.961905
408 | 0.961024
409 | 0.960495
410 | 0.959737
411 | 0.959140
412 | 0.959096
413 | 0.958226
414 | 0.957956
415 | 0.957269
416 | 0.956886
417 | 0.956086
418 | 0.955981
419 | 0.955657
420 | 0.955189
421 | 0.954771
422 | 0.953468
423 | 0.953362
424 | 0.953062
425 | 0.952075
426 | 0.951706
427 | 0.951235
428 | 0.950837
429 | 0.950302
430 | 0.949604
431 | 0.949190
432 | 0.948684
433 | 0.948069
434 | 0.947813
435 | 0.947164
436 | 0.946304
437 | 0.945771
438 | 0.945406
439 | 0.944962
440 | 0.944757
441 | 0.944197
442 | 0.943876
443 | 0.942923
444 | 0.942592
445 | 0.942162
446 | 0.941549
447 | 0.941221
448 | 0.940900
449 | 0.940533
450 | 0.939559
451 | 0.939265
452 | 0.939117
453 | 0.938712
454 | 0.938331
455 | 0.938069
456 | 0.937496
457 | 0.936374
458 | 0.936015
459 | 0.935724
460 | 0.935130
461 | 0.934824
462 | 0.934040
463 | 0.933420
464 | 0.933205
465 | 0.932738
466 | 0.932193
467 | 0.931963
468 | 0.931454
469 | 0.931035
470 | 0.930492
471 | 0.929848
472 | 0.929349
473 | 0.929121
474 | 0.928145
475 | 0.927946
476 | 0.927775
477 | 0.927228
478 | 0.926476
479 | 0.925830
480 | 0.924999
481 | 0.924882
482 | 0.924624
483 | 0.924254
484 | 0.923437
485 | 0.922936
486 | 0.922757
487 | 0.922369
488 | 0.921947
489 | 0.921621
490 | 0.920983
491 | 0.920648
492 | 0.920081
493 | 0.919799
494 | 0.919478
495 | 0.919088
496 | 0.918109
497 | 0.917490
498 | 0.917307
499 | 0.916769
500 | 0.916590
501 | 0.915881
502 | 0.915463
503 | 0.915134
504 | 0.914584
505 | 0.914211
506 | 0.913969
507 | 0.913261
508 | 0.913220
509 | 0.912676
510 | 0.912265
511 | 0.911897
512 | 0.911728
513 | 0.911222
514 | 0.910678
515 | 0.910456
516 | 0.910011
517 | 0.909571
518 | 0.909092
519 | 0.908675
520 | 0.908044
521 | 0.907784
522 | 0.907319
523 | 0.907030
524 | 0.906959
525 | 0.906055
526 | 0.905717
527 | 0.905481
528 | 0.905318
529 | 0.904515
530 | 0.904088
531 | 0.903689
532 | 0.902584
533 | 0.902340
534 | 0.902202
535 | 0.901933
536 | 0.901337
537 | 0.900931
538 | 0.900531
539 | 0.899700
540 | 0.899510
541 | 0.899448
542 | 0.898828
543 | 0.898442
544 | 0.897904
545 | 0.897538
546 | 0.896594
547 | 0.896224
548 | 0.896043
549 | 0.895678
550 | 0.895322
551 | 0.894900
552 | 0.894500
553 | 0.894205
554 | 0.893792
555 | 0.893110
556 | 0.892631
557 | 0.892244
558 | 0.891665
559 | 0.891467
560 | 0.890949
561 | 0.890557
562 | 0.890284
563 | 0.890120
564 | 0.889878
565 | 0.888517
566 | 0.888418
567 | 0.888256
568 | 0.887652
569 | 0.887366
570 | 0.886798
571 | 0.886249
572 | 0.885959
573 | 0.885612
574 | 0.885271
575 | 0.885018
576 | 0.884410
577 | 0.884006
578 | 0.883420
579 | 0.882979
580 | 0.882828
581 | 0.882098
582 | 0.881565
583 | 0.881310
584 | 0.881150
585 | 0.880599
586 | 0.880347
587 | 0.879740
588 | 0.879552
589 | 0.878985
590 | 0.878718
591 | 0.878483
592 | 0.878217
593 | 0.877969
594 | 0.877166
595 | 0.876720
596 | 0.876588
597 | 0.875999
598 | 0.875724
599 | 0.875312
600 | 0.875060
601 | 0.874626
602 | 0.874394
603 | 0.873977
604 | 0.873148
605 | 0.872821
606 | 0.872459
607 | 0.872096
608 | 0.871806
609 | 0.871684
610 | 0.871358
611 | 0.871084
612 | 0.870736
613 | 0.870239
614 | 0.869744
615 | 0.869448
616 | 0.868877
617 | 0.868834
618 | 0.868298
619 | 0.867668
620 | 0.867114
621 | 0.866990
622 | 0.866831
623 | 0.866211
624 | 0.865688
625 | 0.865635
626 | 0.864710
627 | 0.864149
628 | 0.864141
629 | 0.863711
630 | 0.862948
631 | 0.862400
632 | 0.862224
633 | 0.861785
634 | 0.861445
635 | 0.861259
636 | 0.861043
637 | 0.860382
638 | 0.860117
639 | 0.859707
640 | 0.859216
641 | 0.859209
642 | 0.858204
643 | 0.858035
644 | 0.857629
645 | 0.857090
646 | 0.857034
647 | 0.856552
648 | 0.855977
649 | 0.855882
650 | 0.855628
651 | 0.855513
652 | 0.854912
653 | 0.854641
654 | 0.854036
655 | 0.853970
656 | 0.853373
657 | 0.852897
658 | 0.852067
659 | 0.852048
660 | 0.851803
661 | 0.851459
662 | 0.851016
663 | 0.850728
664 | 0.850291
665 | 0.849942
666 | 0.849572
667 | 0.849281
668 | 0.848894
669 | 0.848685
670 | 0.848422
671 | 0.848015
672 | 0.847328
673 | 0.847273
674 | 0.846640
675 | 0.846226
676 | 0.845960
677 | 0.845485
678 | 0.845159
679 | 0.844546
680 | 0.844416
681 | 0.844113
682 | 0.843630
683 | 0.843193
684 | 0.842665
685 | 0.842379
686 | 0.842040
687 | 0.841749
688 | 0.841546
689 | 0.841290
690 | 0.841188
691 | 0.840506
692 | 0.839818
693 | 0.839536
694 | 0.839376
695 | 0.838980
696 | 0.838796
697 | 0.838279
698 | 0.837635
699 | 0.837285
700 | 0.836838
701 | 0.836294
702 | 0.836187
703 | 0.835985
704 | 0.835624
705 | 0.835082
706 | 0.834812
707 | 0.834301
708 | 0.834018
709 | 0.833686
710 | 0.833486
711 | 0.833046
712 | 0.832747
713 | 0.832353
714 | 0.832011
715 | 0.831617
716 | 0.831215
717 | 0.830883
718 | 0.830429
719 | 0.829964
720 | 0.829774
721 | 0.829540
722 | 0.829070
723 | 0.828846
724 | 0.828117
725 | 0.827983
726 | 0.827625
727 | 0.827316
728 | 0.827115
729 | 0.826908
730 | 0.826476
731 | 0.825891
732 | 0.825584
733 | 0.825149
734 | 0.825076
735 | 0.824591
736 | 0.824412
737 | 0.823907
738 | 0.823624
739 | 0.823109
740 | 0.823052
741 | 0.822477
742 | 0.822333
743 | 0.821695
744 | 0.821324
745 | 0.820815
746 | 0.820577
747 | 0.820041
748 | 0.819847
749 | 0.819615
750 | 0.819072
751 | 0.818881
752 | 0.818542
753 | 0.818240
754 | 0.818033
755 | 0.817741
756 | 0.817351
757 | 0.816811
758 | 0.816287
759 | 0.815814
760 | 0.815423
761 | 0.815192
762 | 0.815034
763 | 0.814883
764 | 0.814052
765 | 0.813897
766 | 0.813726
767 | 0.813660
768 | 0.812896
769 | 0.812774
770 | 0.812149
771 | 0.811883
772 | 0.811682
773 | 0.811341
774 | 0.811214
775 | 0.811013
776 | 0.810373
777 | 0.810169
778 | 0.809624
779 | 0.809076
780 | 0.808794
781 | 0.808444
782 | 0.808326
783 | 0.808179
784 | 0.807618
785 | 0.807567
786 | 0.807352
787 | 0.806826
788 | 0.806653
789 | 0.806221
790 | 0.805727
791 | 0.805221
792 | 0.804998
793 | 0.804585
794 | 0.804224
795 | 0.803660
796 | 0.803305
797 | 0.803221
798 | 0.802845
799 | 0.802669
800 | 0.802409
801 | 0.801995
802 | 0.801480
803 | 0.801126
804 | 0.800777
805 | 0.800579
806 | 0.800107
807 | 0.799610
808 | 0.799097
809 | 0.798955
810 | 0.798845
811 | 0.798418
812 | 0.797948
813 | 0.797613
814 | 0.797464
815 | 0.796897
816 | 0.796723
817 | 0.796541
818 | 0.795860
819 | 0.795637
820 | 0.795418
821 | 0.795167
822 | 0.794763
823 | 0.794421
824 | 0.793827
825 | 0.793678
826 | 0.793548
827 | 0.793303
828 | 0.792505
829 | 0.792223
830 | 0.791879
831 | 0.791164
832 | 0.790971
833 | 0.790681
834 | 0.790180
835 | 0.789786
836 | 0.789691
837 | 0.789369
838 | 0.788991
839 | 0.788721
840 | 0.788559
841 | 0.788323
842 | 0.788091
843 | 0.787413
844 | 0.786945
845 | 0.786669
846 | 0.786279
847 | 0.786021
848 | 0.785453
849 | 0.785168
850 | 0.784955
851 | 0.784383
852 | 0.784065
853 | 0.783717
854 | 0.783495
855 | 0.783116
856 | 0.782517
857 | 0.782418
858 | 0.781996
859 | 0.781478
860 | 0.781150
861 | 0.780929
862 | 0.780612
863 | 0.780346
864 | 0.779740
865 | 0.779687
866 | 0.779626
867 | 0.779090
868 | 0.778778
869 | 0.778558
870 | 0.778293
871 | 0.778082
872 | 0.777478
873 | 0.777164
874 | 0.777004
875 | 0.776450
876 | 0.776249
877 | 0.776016
878 | 0.775638
879 | 0.775471
880 | 0.775117
881 | 0.774738
882 | 0.774340
883 | 0.773849
884 | 0.773749
885 | 0.773193
886 | 0.772833
887 | 0.772437
888 | 0.772363
889 | 0.771980
890 | 0.771546
891 | 0.770945
892 | 0.770807
893 | 0.770712
894 | 0.770284
895 | 0.769755
896 | 0.769364
897 | 0.768872
898 | 0.768608
899 | 0.768006
900 | 0.767707
901 | 0.767287
902 | 0.766956
903 | 0.766804
904 | 0.766640
905 | 0.766513
906 | 0.765853
907 | 0.765604
908 | 0.765247
909 | 0.765033
910 | 0.764525
911 | 0.763868
912 | 0.763589
913 | 0.763303
914 | 0.763255
915 | 0.762772
916 | 0.762657
917 | 0.762382
918 | 0.761943
919 | 0.761652
920 | 0.761166
921 | 0.760886
922 | 0.760642
923 | 0.760246
924 | 0.759796
925 | 0.759547
926 | 0.759167
927 | 0.758572
928 | 0.758437
929 | 0.758402
930 | 0.757537
931 | 0.757399
932 | 0.757261
933 | 0.757044
934 | 0.756354
935 | 0.756024
936 | 0.755860
937 | 0.755357
938 | 0.755136
939 | 0.754750
940 | 0.754214
941 | 0.754005
942 | 0.753724
943 | 0.752996
944 | 0.752836
945 | 0.752400
946 | 0.752306
947 | 0.751759
948 | 0.751661
949 | 0.751330
950 | 0.751168
951 | 0.751020
952 | 0.750659
953 | 0.750007
954 | 0.749689
955 | 0.749495
956 | 0.749119
957 | 0.748759
958 | 0.748478
959 | 0.748065
960 | 0.747625
961 | 0.747449
962 | 0.746687
963 | 0.746264
964 | 0.746058
965 | 0.745328
966 | 0.744984
967 | 0.744437
968 | 0.744369
969 | 0.744013
970 | 0.743688
971 | 0.743510
972 | 0.743082
973 | 0.742683
974 | 0.742467
975 | 0.742365
976 | 0.742243
977 | 0.741437
978 | 0.741378
979 | 0.740992
980 | 0.740443
981 | 0.740272
982 | 0.739879
983 | 0.739771
984 | 0.739407
985 | 0.739154
986 | 0.738702
987 | 0.738091
988 | 0.737694
989 | 0.737644
990 | 0.737240
991 | 0.736978
992 | 0.736598
993 | 0.736027
994 | 0.735746
995 | 0.735229
996 | 0.734727
997 | 0.734338
998 | 0.734315
999 | 0.734027
1000 | 0.733939
1001 | 0.733623
1002 | 0.733333
1003 | 0.732575
1004 | 0.732473
1005 | 0.732394
1006 | 0.732105
1007 | 0.731576
1008 | 0.731172
1009 | 0.731164
1010 | 0.730483
1011 | 0.730308
1012 | 0.729689
1013 | 0.729551
1014 | 0.729166
1015 | 0.728784
1016 | 0.728480
1017 | 0.728378
1018 | 0.728030
1019 | 0.727821
1020 | 0.727293
1021 | 0.726755
1022 | 0.726355
1023 | 0.726085
1024 | 0.725702
1025 | 0.725261
1026 | 0.724964
1027 | 0.724439
1028 | 0.724318
1029 | 0.723856
1030 | 0.723428
1031 | 0.722977
1032 | 0.722882
1033 | 0.722623
1034 | 0.722001
1035 | 0.721677
1036 | 0.721337
1037 | 0.720897
1038 | 0.720502
1039 | 0.720374
1040 | 0.719693
1041 | 0.719497
1042 | 0.719073
1043 | 0.718917
1044 | 0.718117
1045 | 0.717986
1046 | 0.717598
1047 | 0.716888
1048 | 0.716489
1049 | 0.716429
1050 | 0.716233
1051 | 0.715416
1052 | 0.714722
1053 | 0.714685
1054 | 0.714282
1055 | 0.714057
1056 | 0.713964
1057 | 0.713437
1058 | 0.713171
1059 | 0.712531
1060 | 0.712118
1061 | 0.711659
1062 | 0.711530
1063 | 0.711407
1064 | 0.711102
1065 | 0.710655
1066 | 0.710381
1067 | 0.709936
1068 | 0.709708
1069 | 0.709649
1070 | 0.708671
1071 | 0.708125
1072 | 0.707835
1073 | 0.707500
1074 | 0.707042
1075 | 0.706892
1076 | 0.706112
1077 | 0.706009
1078 | 0.705823
1079 | 0.705104
1080 | 0.704860
1081 | 0.704239
1082 | 0.703932
1083 | 0.703477
1084 | 0.703391
1085 | 0.702766
1086 | 0.702445
1087 | 0.702274
1088 | 0.701940
1089 | 0.701665
1090 | 0.700902
1091 | 0.700590
1092 | 0.700421
1093 | 0.700069
1094 | 0.699486
1095 | 0.699260
1096 | 0.698845
1097 | 0.698525
1098 | 0.698164
1099 | 0.697589
1100 | 0.697331
1101 | 0.697240
1102 | 0.696655
1103 | 0.695899
1104 | 0.695641
1105 | 0.695463
1106 | 0.695160
1107 | 0.695045
1108 | 0.693824
1109 | 0.693280
1110 | 0.693161
1111 | 0.692488
1112 | 0.692260
1113 | 0.691847
1114 | 0.691383
1115 | 0.691129
1116 | 0.690650
1117 | 0.690521
1118 | 0.690001
1119 | 0.689281
1120 | 0.689164
1121 | 0.688493
1122 | 0.688327
1123 | 0.687731
1124 | 0.687431
1125 | 0.686862
1126 | 0.686461
1127 | 0.686314
1128 | 0.685825
1129 | 0.685344
1130 | 0.684978
1131 | 0.684476
1132 | 0.684136
1133 | 0.683983
1134 | 0.683262
1135 | 0.683166
1136 | 0.682451
1137 | 0.682255
1138 | 0.681984
1139 | 0.681503
1140 | 0.680910
1141 | 0.680809
1142 | 0.679644
1143 | 0.679039
1144 | 0.678826
1145 | 0.678661
1146 | 0.678207
1147 | 0.677428
1148 | 0.677068
1149 | 0.676767
1150 | 0.675457
1151 | 0.675406
1152 | 0.675308
1153 | 0.674391
1154 | 0.674118
1155 | 0.673968
1156 | 0.673589
1157 | 0.672906
1158 | 0.672320
1159 | 0.671887
1160 | 0.671262
1161 | 0.670848
1162 | 0.670659
1163 | 0.670100
1164 | 0.669492
1165 | 0.668869
1166 | 0.668393
1167 | 0.667479
1168 | 0.667404
1169 | 0.666943
1170 | 0.666272
1171 | 0.665854
1172 | 0.665310
1173 | 0.665164
1174 | 0.664333
1175 | 0.663281
1176 | 0.662910
1177 | 0.662375
1178 | 0.661634
1179 | 0.661110
1180 | 0.660643
1181 | 0.660077
1182 | 0.659467
1183 | 0.658888
1184 | 0.658330
1185 | 0.658068
1186 | 0.657553
1187 | 0.657307
1188 | 0.656757
1189 | 0.655750
1190 | 0.654891
1191 | 0.654242
1192 | 0.653675
1193 | 0.653497
1194 | 0.653385
1195 | 0.652308
1196 | 0.651295
1197 | 0.650471
1198 | 0.649780
1199 | 0.649056
1200 | 0.648642
1201 | 0.647491
1202 | 0.647090
1203 | 0.646993
1204 | 0.645525
1205 | 0.644970
1206 | 0.644049
1207 | 0.642704
1208 | 0.642170
1209 | 0.641427
1210 | 0.640678
1211 | 0.639895
1212 | 0.639148
1213 | 0.638247
1214 | 0.636725
1215 | 0.636166
1216 | 0.635749
1217 | 0.633287
1218 | 0.631706
1219 | 0.631330
1220 | 0.631145
1221 | 0.630361
1222 | 0.629449
1223 | 0.627495
1224 | 0.626398
1225 | 0.624864
1226 | 0.623195
1227 | 0.622484
1228 | 0.620204
1229 | 0.619257
1230 | 0.618031
1231 | 0.616679
1232 | 0.614482
1233 | 0.612315
1234 | 0.609651
1235 | 0.606166
1236 | 0.605721
1237 | 0.601163
1238 | 0.600258
1239 | 0.598812
1240 | 0.597943
1241 | 0.595561
1242 | 0.594310
1243 | 0.591526
1244 | 0.583090
1245 | 0.581623
1246 | 0.580843
1247 | 0.577479
1248 | 0.575503
1249 | 0.572902
1250 | 0.571719
1251 | 0.564517
1252 | 0.558561
1253 | 0.556191
1254 | 0.549372
1255 | 0.540657
1256 | 0.515586
1257 | 0.508704
1258 | -0.000000
1259 |
--------------------------------------------------------------------------------