├── NGS.pdf ├── CGA_MuseumGenomics_Singhal.pdf ├── Massoko_smartpca.par ├── convert.par ├── summarize_ld.py ├── vcf_to_phy.py ├── vcf_to_geno.py └── pop_gen_tutorial.rst /NGS.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/singhal/popgen_tutorial/HEAD/NGS.pdf -------------------------------------------------------------------------------- /CGA_MuseumGenomics_Singhal.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/singhal/popgen_tutorial/HEAD/CGA_MuseumGenomics_Singhal.pdf -------------------------------------------------------------------------------- /Massoko_smartpca.par: -------------------------------------------------------------------------------- 1 | genotypename: Massoko_Dryad_VCF_final_subset_noIndels_maf05_thinned1K.geno 2 | snpname: Massoko_Dryad_VCF_final_subset_noIndels_maf05_thinned1K.snp 3 | indivname: Massoko_Dryad_VCF_final_subset_noIndels_maf05_thinned1K.ind 4 | evecoutname: Massoko_Dryad_VCF_final_subset_noIndels_maf05_thinned1K.evec 5 | evaloutname: Massoko_Dryad_VCF_final_subset_noIndels_maf05_thinned1K.eval 6 | -------------------------------------------------------------------------------- /convert.par: -------------------------------------------------------------------------------- 1 | genotypename: Massoko_Dryad_VCF_final_subset_noIndels_maf05_thinned1K.geno 2 | snpname: Massoko_Dryad_VCF_final_subset_noIndels_maf05_thinned1K.snp 3 | indivname: Massoko_Dryad_VCF_final_subset_noIndels_maf05_thinned1K.ind 4 | outputformat: PED 5 | genotypeoutname: Massoko_Dryad_VCF_final_subset_noIndels_maf05_thinned1K.ped 6 | snpoutname: Massoko_Dryad_VCF_final_subset_noIndels_maf05_thinned1K.map 7 | indivoutname: Massoko_Dryad_VCF_final_subset_noIndels_maf05_thinned1K.pedind 8 | -------------------------------------------------------------------------------- /summarize_ld.py: 
-------------------------------------------------------------------------------- 1 | import re 2 | import gzip 3 | import os 4 | import argparse 5 | 6 | parser = argparse.ArgumentParser( 7 | description="Summarize LD file.", 8 | formatter_class=argparse.ArgumentDefaultsHelpFormatter 9 | ) 10 | 11 | parser.add_argument('--infile', type=str, default=None, 12 | help='VCFtools LD file to convert') 13 | 14 | parser.add_argument('--win', type=int, default = 1, 15 | help='The window across which to summarize LD patterns') 16 | 17 | args = parser.parse_args() 18 | 19 | f = open(args.infile, 'r') 20 | out = args.infile + '_summary_window%s' % args.win 21 | o = open(out, 'w') 22 | 23 | r2 = {} 24 | header = f.next() 25 | for l in f: 26 | d = re.split('\s+', l.rstrip()) 27 | dist = int(d[2]) - int(d[1]) 28 | dist = int(dist / float(args.win)) * args.win 29 | 30 | if dist not in r2: 31 | r2[dist] = {'val': 0, 'num': 0} 32 | r2[dist]['val'] += float(d[4]) 33 | r2[dist]['num'] += 1 34 | f.close() 35 | 36 | o.write('distance\tnum_comparisons\tr2\n') 37 | for dist in sorted(r2.keys()): 38 | avg = r2[dist]['val'] / float(r2[dist]['num']) 39 | o.write('%s\t%s\t%.4f\n' % (dist, r2[dist]['num'], avg)) 40 | o.close() 41 | 42 | -------------------------------------------------------------------------------- /vcf_to_phy.py: -------------------------------------------------------------------------------- 1 | import re 2 | import gzip 3 | import os 4 | import argparse 5 | 6 | parser = argparse.ArgumentParser( 7 | description="Turn VCF into phy.", 8 | formatter_class=argparse.ArgumentDefaultsHelpFormatter 9 | ) 10 | 11 | parser.add_argument('--infile', type=str, default=None, 12 | help='Gzipped infile to convert') 13 | 14 | parser.add_argument('--thin', type=int, default = 1, 15 | help='Further thinning') 16 | 17 | args = parser.parse_args() 18 | 19 | # set up the input and output files 20 | infile = args.infile 21 | f = gzip.open(infile, 'r') 22 | out_fa = re.sub('.vcf.gz', '_thin%s.phy' % 
args.thin, infile) 23 | o = open(out_fa, 'w') 24 | 25 | seq = {} 26 | 27 | for ix, l in enumerate(f): 28 | l = l.decode('ascii') 29 | if re.search('CHROM', l): 30 | d = re.split('\t', l.rstrip()) 31 | inds = d[9:] 32 | for ind in inds: 33 | seq[ind] = '' 34 | elif not re.search('^#', l): 35 | # thin it further! 36 | if ix % args.thin == 0: 37 | d = re.split('\t', l.rstrip()) 38 | 39 | snps = {} 40 | alleles = [d[3]] + re.split(',', d[4]) 41 | for ix, a in enumerate(alleles): 42 | snps[str(ix)] = a 43 | snps['.'] = 'N' 44 | 45 | genos = [re.search('^(\S)', x).group(1) for x in d[9:]] 46 | 47 | for ind, geno in zip(inds, genos): 48 | seq[ind] += snps[geno] 49 | f.close() 50 | 51 | o.write('%s %s\n' % (len(seq), len(seq[list(seq.keys())[0]]))) 52 | for ind, s in seq.items(): 53 | o.write('%s %s\n' % (ind, s)) 54 | o.close() 55 | -------------------------------------------------------------------------------- /vcf_to_geno.py: -------------------------------------------------------------------------------- 1 | import re 2 | import gzip 3 | import os 4 | 5 | dir = '/home/ubuntu/' 6 | 7 | in_vcf = os.path.join(dir, 'Massoko_Dryad_VCF_final_subset_noIndels_maf05_thinned1K.vcf.gz') 8 | f = gzip.open(in_vcf, 'r') 9 | 10 | out_g = os.path.join(dir, 'Massoko_Dryad_VCF_final_subset_noIndels_maf05_thinned1K.geno') 11 | out_s = os.path.join(dir, 'Massoko_Dryad_VCF_final_subset_noIndels_maf05_thinned1K.snp') 12 | out_i = os.path.join(dir, 'Massoko_Dryad_VCF_final_subset_noIndels_maf05_thinned1K.ind') 13 | o1 = open(out_g, 'w') 14 | # total hack the snp data because human data 15 | o2 = open(out_s, 'w') 16 | o3 = open(out_i, 'w') 17 | 18 | alleles = { '0|0': '0', '0|1': '1', '1|1': '2', 19 | '0/0': '0', '0/1': '1', '1/1': '2', '1|0': '1'} 20 | 21 | gen_length = 850000000 22 | max_len = 0 23 | cur_len = 0 24 | cur_chr = 'NA' 25 | 26 | snp = 1 27 | snps = {} 28 | for l in f: 29 | l = l.decode('ascii') 30 | if re.search('CHROM', l): 31 | d = re.split('\t', l.rstrip()) 32 | inds = 
d[9:] 33 | for ind in inds: 34 | group = re.sub('_[^_]+$', '', ind) 35 | o3.write('%s\tU\t%s\n' % (ind, group)) 36 | o3.close() 37 | elif not re.search('^#', l): 38 | d = re.split('\t', l.rstrip()) 39 | 40 | if d[0] != cur_chr: 41 | cur_chr = d[0] 42 | cur_len += max_len 43 | 44 | max_len = int(d[1]) 45 | cur_chr = d[0] 46 | 47 | genos = [re.search('^(\S\S\S)', x).group(1) for x in d[9:]] 48 | if len(set(genos)) > 1: 49 | genos = [alleles[x] if x in alleles else '9' for x in genos] 50 | o1.write('%s\n' % ''.join(genos)) 51 | o2.write('rs%s\t1\t%.3f\t%s\n' % 52 | (snp, (int(d[1]) + cur_len) / float(gen_length), int(d[1]) + cur_len)) 53 | snp += 1 54 | f.close() 55 | o1.close() 56 | -------------------------------------------------------------------------------- /pop_gen_tutorial.rst: -------------------------------------------------------------------------------- 1 | ============================ 2 | Population Genetics Tutorial 3 | ============================ 4 | 5 | ========= 6 | Exercise 7 | ========= 8 | Before you guys got here 9 | ~~~~~~~~~~~~~~~~~~~~~~~~ 10 | 11 | Started with data from: "Genomic islands of speciation separate cichlid ecomorphs in an East African crater lake", Malinsky et al 2015. 
12 | 13 | Downloaded VCF from http://datadryad.org/resource/doi:10.5061/dryad.770mc 14 | - http://datadryad.org/bitstream/handle/10255/dryad.101389/Massoko_Dryad_VCF_final.vcf.gz 15 | - These data had been filtered for quality 16 | - And only variable sites had been retained 17 | - And phased using the program `BEAGLE`, which relies on linkage disequilibrium to phase haplotypes 18 | 19 | Made the VCF smaller so we could analyze it in this lifetime: 36 individuals and no indels.:: 20 | 21 | vcftools --gzvcf Massoko_Dryad_VCF_final.vcf.gz --keep inds_to_keep.txt --stdout --recode --recode-INFO-all --remove-indels | gzip -c > Massoko_Dryad_VCF_final_subset_noIndels.vcf.gz 22 | 23 | Made the VCF smaller still to remove low frequency sites and then local linkage disequilibrium. We will use these files for many of our analyses.:: 24 | 25 | vcftools --gzvcf Massoko_Dryad_VCF_final_subset_noIndels.vcf.gz --maf 0.05 --max-maf 0.95 --stdout --recode --recode-INFO-all | gzip -c > Massoko_Dryad_VCF_final_subset_noIndels_maf05.vcf.gz 26 | vcftools --gzvcf Massoko_Dryad_VCF_final_subset_noIndels_maf05.vcf.gz --thin 1000 --stdout --recode --recode-INFO-all | gzip -c > Massoko_Dryad_VCF_final_subset_noIndels_maf05_thinned1K.vcf.gz 27 | 28 | Used the thinned VCF to make input files for phylogenetic inference and population structure analyses.:: 29 | 30 | python vcf_to_phy.py --infile Massoko_Dryad_VCF_final_subset_noIndels_maf05_thinned1K.vcf.gz --thin 5 31 | python vcf_to_geno.py 32 | EIG-6.1.3/bin/convertf -p convert.par 33 | cat Massoko_Dryad_VCF_final_subset_noIndels_maf05_thinned1K.map | awk -F'\\\s+' '{print $1,$2, $3,$4}' > map 34 | mv map Massoko_Dryad_VCF_final_subset_noIndels_maf05_thinned1K.map 35 | plink --file Massoko_Dryad_VCF_final_subset_noIndels_maf05_thinned1K --recode 36 | mv plink.ped Massoko_Dryad_VCF_final_subset_noIndels_maf05_thinned1K.ped 37 | mv plink.map Massoko_Dryad_VCF_final_subset_noIndels_maf05_thinned1K.map 38 | 39 | 40 | Now you start 41 | 
~~~~~~~~~~~~~ 42 | 43 | Install the following software.:: 44 | 45 | R 46 | raxml 47 | vcftools 48 | eigensoft 49 | admixture 50 | 51 | You can download the smaller data set and ancillary files from here.:: 52 | 53 | wget https://www.dropbox.com/s/ra4yqix0jfe1fgn/tutorial_files.tar.gz 54 | tar -xzvf tutorial_files.tar.gz 55 | cd tutorial_files 56 | 57 | Calculate nucleotide diversity (pi). Use `VCFtools` to figure out how to calculate it. We want to calculate it for 'benthic' and 'littoral' morphs separately.:: 58 | 59 | vcftools --gzvcf Massoko_Dryad_VCF_final_subset_noIndels.vcf.gz --keep littoral.txt --window-pi 100000 --out littoral_pi 60 | vcftools --gzvcf Massoko_Dryad_VCF_final_subset_noIndels.vcf.gz --keep benthic.txt --window-pi 100000 --out benthic_pi 61 | 62 | Calculate linkage disequilibrium.:: 63 | 64 | vcftools --gzvcf Massoko_Dryad_VCF_final_subset_noIndels.vcf.gz --keep littoral.txt --ld-window-bp 500000 --chr scaffold_0 --hap-r2 --out littoral_scaffold_0_ld --min-r2 0.001 65 | vcftools --gzvcf Massoko_Dryad_VCF_final_subset_noIndels.vcf.gz --keep benthic.txt --ld-window-bp 500000 --chr scaffold_0 --hap-r2 --out benthic_scaffold_0_ld --min-r2 0.001 66 | 67 | Summarize linkage disequilibrium data files so that they are smaller and easier to plot.:: 68 | 69 | python summarize_ld.py --infile littoral_scaffold_0_ld.hap.ld --win 10 70 | python summarize_ld.py --infile benthic_scaffold_0_ld.hap.ld --win 10 71 | 72 | Calculate Fst between the benthic and littoral forms (the output prefix below says "limnetic", but the two populations compared are the benthic and littoral morphs).:: 73 | 74 | vcftools --gzvcf Massoko_Dryad_VCF_final_subset_noIndels.vcf.gz --weir-fst-pop littoral.txt --weir-fst-pop benthic.txt --fst-window-size 100000 --out benthic_limnetic_fst 75 | 76 | Make a phylogeny.:: 77 | 78 | raxmlHPC -m GTRGAMMA -n Massoko -s Massoko_Dryad_VCF_final_subset_noIndels_maf05_thinned1K_thin5.phy -p 123 -o A_calliptera_Chitimba,A_calliptera_Bua,A_calliptera_Chizumulu 79 | 80 | Run `ADMIXTURE` for up to 6 populations.:: 81 | 82 | admixture
Massoko_Dryad_VCF_final_subset_noIndels_maf05_thinned1K.ped 1 83 | admixture Massoko_Dryad_VCF_final_subset_noIndels_maf05_thinned1K.ped 2 84 | ... 85 | 86 | Run `smartpca`.:: 87 | 88 | smartpca -p Massoko_smartpca.par > Massoko_smartpca.out 89 | 90 | Now that we have all the different pieces, let's start to plot the data and see what we find. Put all the results into one folder and download them locally so that we can plot and visualize them using `R`. 91 | 92 | Just to be sure, here are all the files you should have. Should things be taking too long, you can borrow my results that I generated earlier: https://www.dropbox.com/s/czrru76ku2kqwt2/results.tar.gz?dl=0 93 | 94 | - benthic_limnetic_fst.windowed.weir.fst 95 | - benthic_pi.windowed.pi 96 | - benthic_scaffold_0_ld.hap.ld_summary_window10 97 | - littoral_pi.windowed.pi 98 | - littoral_scaffold_0_ld.hap.ld_summary_window10 99 | - Massoko_Dryad_VCF_final_subset_noIndels_maf05_thinned1K.1.P 100 | - Massoko_Dryad_VCF_final_subset_noIndels_maf05_thinned1K.1.Q 101 | - Massoko_Dryad_VCF_final_subset_noIndels_maf05_thinned1K.2.P 102 | - Massoko_Dryad_VCF_final_subset_noIndels_maf05_thinned1K.2.Q 103 | - Massoko_Dryad_VCF_final_subset_noIndels_maf05_thinned1K.3.P 104 | - Massoko_Dryad_VCF_final_subset_noIndels_maf05_thinned1K.3.Q 105 | - Massoko_Dryad_VCF_final_subset_noIndels_maf05_thinned1K.4.P 106 | - Massoko_Dryad_VCF_final_subset_noIndels_maf05_thinned1K.4.Q 107 | - Massoko_Dryad_VCF_final_subset_noIndels_maf05_thinned1K.5.P 108 | - Massoko_Dryad_VCF_final_subset_noIndels_maf05_thinned1K.5.Q 109 | - Massoko_Dryad_VCF_final_subset_noIndels_maf05_thinned1K.6.P 110 | - Massoko_Dryad_VCF_final_subset_noIndels_maf05_thinned1K.6.Q 111 | - Massoko_Dryad_VCF_final_subset_noIndels_maf05_thinned1K.eval 112 | - Massoko_Dryad_VCF_final_subset_noIndels_maf05_thinned1K.evec 113 | - RAxML_bestTree.Massoko 114 | 115 | We have the following data types. 116 | 117 | #. Genetic diversity. 118 | #. Genetic differentiation. 
(Fst) 119 | #. Decay of linkage disequilibrium. 120 | #. A tree. 121 | #. PCA results. 122 | #. `ADMIXTURE` population clustering results. 123 | 124 | We will be using `R` to plot all these results. I will get you started thinking about some of these. I would recommend setting your working directory to be the directory that has all your results. For example,:: 125 | 126 | setwd("/Users/sonal/Desktop/results/") 127 | 128 | Note that this is generally considered bad programming practice for scripts that will be publicly shared, but it is convenient when doing exploratory data analysis. 129 | 130 | Genetic diversity 131 | ~~~~~~~~~~~~~~~~~ 132 | To load the genetic diversity results,:: 133 | 134 | b = read.table("benthic_pi.windowed.pi", header=T) 135 | l = read.table("littoral_pi.windowed.pi", header=T) 136 | 137 | Look at how the data is structured and summarize it quickly,:: 138 | 139 | head(b) 140 | summary(b) 141 | 142 | To answer some of the questions below, it might be useful to combine across both data-frames:: 143 | 144 | x = merge(b, l, by=c("CHROM", "BIN_START", "BIN_END")) 145 | 146 | To answer some of the questions below, it might be useful to combine across both data-frames in another way:: 147 | 148 | all = data.frame(c(b$PI, l$PI), c(rep("benthic", nrow(b)), c(rep("littoral", nrow(l))))) 149 | names(all) = c("PI", "MORPH") 150 | 151 | You might want to also explore the following functions to answer the questions:: 152 | 153 | cor.test() 154 | boxplot() 155 | aov() 156 | # if you store the results of aov() in a variable and then run summary() on the variable, you get more info 157 | 158 | 159 | Some questions: 160 | 161 | #. What are the minimum, maximum, and mean levels of genetic diversity in each morph? 162 | #. Is genetic diversity between the two morphs significantly different? 163 | #. Why might genetic diversity be higher in one morph than another? How could you test this? 164 | #. How correlated is genetic diversity between the two morphs?
165 | #. Why would genetic diversity be correlated between the two morphs? 166 | 167 | Genetic differentiation 168 | ~~~~~~~~~~~~~~~~~~~~~~~ 169 | To load the genetic differentiation results,:: 170 | 171 | fst = read.table("benthic_limnetic_fst.windowed.weir.fst", header=T) 172 | 173 | To select rows that have certain values,:: 174 | 175 | x = fst[fst$CHROM == 'scaffold_0', ] 176 | x = fst[fst$WEIGHTED_FST >= 0.1, ] 177 | 178 | You might want to explore the functions:: 179 | 180 | dim() 181 | nrow() 182 | 183 | Which allow you to quickly figure out how big these dataframes are. 184 | 185 | Some questions: 186 | 187 | #. What is the mean Fst between these two morphs? 188 | #. Is there a correlation between the number of variants in a window and Fst? If so, it would suggest we need to be cautious of these results. 189 | #. In this paper, the authors emphasize the importance of genomic regions that are highly differentiated. How many 100 kb windows are differentiated above Fst >0.1? Fst >0.2? Fst >0.3? 190 | #. Plot Fst along BIN_START on scaffold_15. The authors originally identified 10 peaks (see Fig. 3D) that are highly differentiated. How many do you identify? Why might our results be different? 191 | #. How might you determine if windows with high Fst are significant? 192 | 193 | Decay of Linkage Disequilibrium 194 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 195 | You can read in the tables for linkage disequilibrium just like you did for nucleotide diversity. 196 | 197 | Having done that, we can now plot the data. Plot distance on the x-axis and :math:`r^2` on the y-axis (a measure of linkage disequilibrium that looks at the correlation coefficient between pairs of loci -- higher values means that two loci "travel" together more than you would expect under random assortment). 198 | 199 | Try plotting both morphs at once. You will want to use the `points()` function. 200 | 201 | Some questions: 202 | 203 | #. Do the two morphs have different decay patterns? 204 | #. 
A key aspect of linkage disequilibrium is how quickly it decays. At what physical distance is the level of linkage disequilibrium halved? You can estimate this visually or using R. 205 | #. These points are very very noisy. How might you do this exercise again to reduce some of this noise? If you have time, try it! 206 | 207 | Plot the phylogeny 208 | ~~~~~~~~~~~~~~~~~~ 209 | To plot the phylogeny, you will need to install the library ape.:: 210 | 211 | install.packages("ape") 212 | library(ape) 213 | 214 | Then, you can read in and plot the tree.:: 215 | 216 | t = read.tree("RAxML_bestTree.Massoko") 217 | # makes the tree easier to visualize by ladderizing it 218 | t = ladderize(t) 219 | plot(t) 220 | 221 | Some questions: 222 | 223 | #. What do you think is going on with the "small" morph? 224 | #. Looking at this tree, would you say that the "littoral" and "benthic" morphs are differentiated? Why or why not? 225 | #. Before we use this tree for any formal analysis, what else might you want to check about the tree? 226 | 227 | Plot the PCA 228 | ~~~~~~~~~~~~ 229 | To read in the PCA data:: 230 | 231 | d = read.table("Massoko_Dryad_VCF_final_subset_noIndels_maf05_thinned1K.evec") 232 | 233 | Note that the eval file contains the eigenvalues, which we would need to calculate the proportion of variance explained by each PCA axis. 234 | 235 | Look at the data file using `head()` -- how is it structured? What does each column mean? 236 | 237 | You can plot it by:: 238 | 239 | plot(d$V2, d$V3, col=as.factor(d$V12), pch=16) 240 | 241 | This isn't such an informative plot. Why? How would you subset the data to make it more informative? Hint: look at column V12.:: 242 | 243 | s = d[d$V12 %in% c("Massoko_benthic", "Massoko_littoral", "Massoko_small"),] 244 | 245 | This still isn't as informative as it could be. It likely would have been much more informative if we removed the outgroups before doing the PCA. That said, are these morphs differentiated? How do these results compare to what we saw with the phylogeny?
Why might these results be different? 246 | 247 | ADMIXTURE results 248 | ~~~~~~~~~~~~~~~~~ 249 | To read in the `ADMIXTURE` results:: 250 | 251 | d1 = read.table("Massoko_Dryad_VCF_final_subset_noIndels_maf05_thinned1K.1.Q") 252 | d2 = read.table("Massoko_Dryad_VCF_final_subset_noIndels_maf05_thinned1K.2.Q") 253 | d3 = read.table("Massoko_Dryad_VCF_final_subset_noIndels_maf05_thinned1K.3.Q") 254 | d4 = read.table("Massoko_Dryad_VCF_final_subset_noIndels_maf05_thinned1K.4.Q") 255 | d5 = read.table("Massoko_Dryad_VCF_final_subset_noIndels_maf05_thinned1K.5.Q") 256 | d6 = read.table("Massoko_Dryad_VCF_final_subset_noIndels_maf05_thinned1K.6.Q") 257 | 258 | To plot the results:: 259 | 260 | par(mfrow=c(6,1), mar=c(1,4,1,1)) 261 | barplot(t(as.matrix(d1)), col=rainbow(1), border=NA) 262 | barplot(t(as.matrix(d2)), col=rainbow(2), border=NA) 263 | barplot(t(as.matrix(d3)), col=rainbow(3), border=NA) 264 | barplot(t(as.matrix(d4)), col=rainbow(4), border=NA) 265 | barplot(t(as.matrix(d5)), col=rainbow(5), border=NA) 266 | par(mar=c(3,4,1,1)) 267 | x = barplot(t(as.matrix(d6)), col=rainbow(6), border=NA) 268 | inds = c(rep('A_cal', 3), rep('Ita', 3), rep('B', 10), rep('L', 10), rep('S', 10)) 269 | mtext(inds, 1, at=x, las=2) 270 | 271 | What's going on here? Based on all the results you have seen from the phylogeny, the PCA, and this, how would you characterize the differentiation between these morphs? 
272 | 273 | ========= 274 | Resources 275 | ========= 276 | Population Genetics Books 277 | ~~~~~~~~~~~~~~~~~~~~~~~~~ 278 | - Coop's Class Notes: http://cooplab.github.io/popgen-notes/ 279 | - Felsenstein's Book: http://evolution.genetics.washington.edu/pgbook/pgbook.html 280 | - Gillespie's *Population Genetics: A Concise Guide* 281 | - Hartl and Clark's *Principles of Population Genetics* 282 | - Nielsen and Slatkin's *An Introduction to Population Genetics* 283 | - Wakeley's *Coalescent Theory* 284 | - Yang's *Computational Molecular Evolution* 285 | 286 | Great set of tutorials 287 | ~~~~~~~~~~~~~~~~~~~~~~ 288 | - http://evomics.org/learning/population-and-speciation-genomics/ 289 | - http://grunwaldlab.github.io/Population_Genetics_in_R/Preface.html 290 | 291 | Papers on population genomics 292 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 293 | - *A framework for variation discovery and genotyping using next-generation DNA sequencing data*, DePristo et al 2010; 10.1038/ng.806 294 | - *Genome sequencing and population genomics in non-model organisms*, Ellegren 2014; 10.1016/j.tree.2013.09.008 295 | - *Genotype and SNP calling from next-generation sequencing data*, Nielsen et al 2011; 10.1038/nrg2986 296 | - *Methods and models for unravelling human evolutionary history*, Schraiber and Akey 2015; 10.1038/nrg4005 297 | - *Population Genomics: Whole-Genome Analysis of Polymorphism and Divergence in Drosophila simulans*, Begun et al 2007; 10.1371/journal.pbio.0050310 298 | - *The power and promise of population genomics: from genotyping to genome typing*, Luikart et al 2003; 10.1038/nrg1226 299 | 300 | Software & Programs for working with data 301 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 302 | - http://pngu.mgh.harvard.edu/~purcell/plink/index.shtml; great for quality filtering and simple parsing of variants 303 | - https://github.com/thibautjombart/adegenet/wiki; R package that can parse variant data 304 | - https://vcftools.github.io/index.html; can generate many useful 
statistics from VCF files 305 | - https://cran.r-project.org/web/packages/PopGenome/index.html; R package that calculates statistics from VCFs, note not very transparent in how it handles missing data 306 | - http://vcf.iobio.io/; allows quick visualization of VCFs 307 | - http://popgen.dk/wiki/index.php/ANGSD; ideal for low coverage data 308 | 309 | Learn Python 310 | ~~~~~~~~~~~~ 311 | - https://github.com/singhal/python_workshop/blob/master/Python.Md 312 | - http://learnpythonthehardway.org/ 313 | - https://www.coursera.org/course/pythonlearn 314 | - http://rosalind.info/problems/locations/ 315 | 316 | Learn R 317 | ~~~~~~~ 318 | - http://tryr.codeschool.com/ 319 | - https://www.coursera.org/learn/r-programming 320 | - https://www.edx.org/course/introduction-r-data-science-microsoft-dat204x-1 321 | - http://swirlstats.com/students.html 322 | - http://r4ds.had.co.nz/ 323 | 324 | Learn Shell / Unix 325 | ~~~~~~~~~~~~~~~~~~ 326 | - https://www.codecademy.com/learn/learn-the-command-line 327 | - http://korflab.ucdavis.edu/unix_and_Perl/ 328 | - http://www.learnshell.org/ 329 | 330 | Learn Perl 331 | ~~~~~~~~~~ 332 | - http://korflab.ucdavis.edu/unix_and_Perl/ 333 | --------------------------------------------------------------------------------