├── .gitignore ├── .gitmodules ├── LICENSE ├── README.md ├── corescripts ├── generate_rsb.R ├── haps_indel_and_maf_filter.R ├── make_plots.R ├── multicore_iHH.R └── multicore_iHH.R.old ├── defaults_30-9-14.cfg ├── defaults_8-12-14.cfg ├── defaults_nesi.cfg ├── docker ├── Dockerfile └── README.md ├── docs ├── .gitignore ├── Makefile ├── jss2.bst ├── pictures │ ├── CEUFay.png │ ├── CEUFay_old.png │ ├── CEUYRI2.png │ ├── CEUihs.png │ ├── CEUtajimas.png │ ├── RSBCEUYRI.png │ ├── WeirCEUYRI.png │ ├── YRIFay.png │ ├── YRIFay_old.png │ ├── YRIihs.png │ ├── YRItajimas.png │ ├── bifurcationCEU.png │ └── hapmapCEUYRI.png ├── selection_pipeline.bib ├── selection_pipeline.pdf └── selection_pipeline.tex ├── extrascripts ├── .Rhistory ├── check_haps.sh ├── extract_samples_from_haps.py ├── fay_and_wus.py ├── haps_remove_indels.R ├── haps_to_tped.py ├── kaks.py ├── merge_haps.py ├── pipeline_test.cfg ├── selection_pipeline.sl ├── single_pop_vcf_process.sh └── voight_filters.py ├── galaxy ├── multi_population.xml └── selection_pipeline.xml ├── install.log ├── install.sh ├── referencefiles ├── ancestral_ref │ └── ANCESTRAL.rst ├── genetic_maps │ └── SHAPEIT.rst ├── human_ref │ └── human_files.txt └── impute_ref │ └── IMPUTE_FILES.rst ├── selection_pipeline ├── .gitignore ├── __init__.py ├── _version.py ├── aa_annotate.py ├── environment.py ├── haps_filters.py ├── haps_interpolate.py ├── haps_to_hapmap.py ├── multipipeline.py ├── run_pipeline.py ├── selectionTools.egg-info │ ├── PKG-INFO │ ├── SOURCES.txt │ ├── dependency_links.txt │ ├── entry_points.txt │ ├── not-zip-safe │ └── top_level.txt ├── selection_pipeline.py ├── standard_run.py ├── standard_run_utilities.py └── tests │ ├── CEU_test.ids │ ├── CEU_test.vcf │ ├── __init__.py │ ├── ancestor.fa │ ├── ancestor.fa.flat │ ├── ancestor.fa.gdx │ ├── defaults.cfg │ ├── filter.haps │ ├── one_line.haps │ ├── test_selection_pipeline.py │ └── triallelic_haps.haps ├── setup.py └── src ├── .gitignore ├── PopGenome_2.0.7.tar.gz ├── R_dependencies.R ├── beagle.jar ├── defaults.cfg ├── getopt_1.20.0.tar.gz ├── impute_v2.3.1_MacOSX_Intel.tgz ├── impute_v2.3.1_x86_64_static.tgz ├── multicore_0.1-7.tar.gz ├── plink-1.07-mac-intel.zip ├── plink-1.07-x86_64.zip ├── qctool_v1.4-linux-x86_64.tgz ├── qctool_v1.4-osx.tgz ├── qctool_v1.4-scientific-linux-x86_64.tgz ├── rehh_1.11.tar.gz ├── tabix.tar.bz2 ├── variscan-2.0.3.tar.gz ├── vcflib.zip ├── vcftools.tar.gz └── zlib-1.2.8.tar.gz /.gitignore: -------------------------------------------------------------------------------- 1 | selectionTools.egg-info/* 2 | tmp/* 3 | */*haps 4 | */*sample 5 | src/*/* 6 | bin/* 7 | lactase/* 8 | lib/perl5/* 9 | referencefiles/*/* 10 | include/* 11 | lib/* 12 | !referencefiles/*/*.rst 13 | share/* 14 | dist/* 15 | build/* 16 | cores/* 17 | MerrimanSelection*/* 18 | /defaults.cfg 19 | .DS_Store 20 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "pyfasta"] 2 | path = pyfasta 3 | url = https://github.com/brentp/pyfasta.git 4 | [submodule "PyVCF"] 5 | path = PyVCF 6 | url = https://github.com/jamescasbon/PyVCF.git 7 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2013 James Boocock and Murray Cadzow 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 
of 6 | this software and associated documentation files (the "Software"), to deal in 7 | the Software without restriction, including without limitation the rights to 8 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 9 | the Software, and to permit persons to whom the Software is furnished to do so, 10 | subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 17 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 18 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 19 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 20 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 21 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | This project has not been updated or maintained for a long time and has now been archived. 2 | 3 | 4 | # CRITICAL BUGFIX 18NOV15 5 | https://github.com/smilefreak/selectionTools/issues/16 6 | A bug in aa_annotate.py has now been fixed **only in master (selectionTools1.1) and selectionTools1.1-dev branches or version 1.1.1+** 7 | 8 | 9 | **selectionTools1.1 was merged onto the master branch on 12JUN15** 10 | 11 | 12 | The original and minor updated versions of 1.0 can be found under releases. 13 | 14 | Citation 15 | ======== 16 | 17 | [![Join the chat at https://gitter.im/smilefreak/selectionTools](https://badges.gitter.im/Join%20Chat.svg)](https://gitter.im/smilefreak/selectionTools?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge) 18 | 19 | 20 | Cadzow, Murray, et al. "A bioinformatics workflow for detecting signatures of selection in genomic data." Frontiers in Genetics 5 (2014). 21 | 22 | selectionTools 1.1 23 | ========================= 24 | A pipeline to take a VCF file through to selection analysis. 25 | 26 | The branch selectionTools1.1-dev is being used for further development. 27 | 28 | Software Prerequisites 29 | --------------------- 30 | 31 | The selection pipeline was developed on a 64-bit Ubuntu 13.04 system and has been tested on 64-bit CentOS and Ubuntu 13.10 installations. The pipeline should work on any 64-bit Linux system, as well as OS X (without Fay and Wu's H). 32 | 33 | * Python >= 2.6 34 | * Bourne-again Shell (Bash) 35 | * Perl >= 5 36 | * R >= 3.0.0 37 | * GNU Autotools 38 | * GCC 39 | * Git 40 | * Java >= 1.7 (for beagle) 41 | 42 | Python Dependencies 43 | 44 | * python-setuptools 45 | * python-numpy 46 | * python-scipy 47 | 48 | If you are using Python < 2.7, the Python package argparse will also need to be installed. 49 | Installation 50 | ------------ 51 | 52 | After cloning, or downloading and extracting, run `./install.sh` in the root directory to install the pipeline and all required dependencies. 53 | 54 | By default the pipeline executables are added to $HOME/.local/bin; you should add this directory to your executable path.
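For example, with a bash shell you could append a line like the following to your `~/.bashrc` (an illustrative snippet; adjust it to your own shell and setup):

    export PATH="$HOME/.local/bin:$PATH"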
55 | 56 | __Docker__: in the docker directory there is a Dockerfile and instructions to create a Docker image. Also try this option if the normal installation method doesn't work. 57 | 58 | Config File 59 | ----------- 60 | 61 | Each run of the pipeline requires the specification of settings in a config file. A default config file is generated after installation in the selectionTools directory, 62 | named defaults.cfg. Detailed information on what the settings do and how to change them is available in the PDF manual in the docs/ directory. 63 | 64 | Single Population 65 | ----------------- 66 | 67 | To run the selection pipeline on a single population: 68 | 69 | selection_pipeline -c <chromosome> -i <input VCF> --population <population name> \ 70 | --config-file <config file> --cores <number of cores>
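For example, a run on chromosome 22 for a population labelled CEU might look like the following (the file names here are purely illustrative):

    selection_pipeline -c 22 -i chr22_CEU.vcf --population CEU \
        --config-file defaults.cfg --cores 4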
'help', 'h', 0, "logical", 5 | 'plot_type','p', 1, "character", 6 | 'data_file', 'd', 1, "character", 7 | 'plot_output_name', 'n', 1, 'character' 8 | ), byrow=T, ncol=4) 9 | 10 | opt = getopt(spec) 11 | 12 | if (!is.null(opt$help)){ 13 | cat(getopt(spec,usage=TRUE)); 14 | q(status=1); 15 | } 16 | 17 | if (opt$plot_type == "taj"){ 18 | tajimaD=read.table(file=opt$data_file, header=TRUE) 19 | png(opt$plot_output_name) 20 | plot(tajimaD[,4] ~ tajimaD[,2],pch='.',cex=2,xlab="Chromosome position (bp)", ylab="D statistic") 21 | dev.off() 22 | }else if( opt$plot_type == "ihs"){ 23 | ihs = read.table(file=opt$data_file) 24 | png(opt$plot_output_name) 25 | plot(ihs[,4] ~ ihs[,2],,pch='.',cex=2,ylab=expression("-" * log[10] * "[" ~ "1-2|" * Phi[scriptstyle(italic(iHS))] * "-0.5|" ~ "]"), 26 | xlab="Chromosome Position BP") 27 | dev.off() 28 | }else if(opt$plot_type == "fay"){ 29 | fay = read.table(file=opt$data_file,comment.char="#") 30 | png(opt$plot_output_name) 31 | plot(CEUFay[,15] ~ CEUFay[,1],xlab='Chromosome position (bp)',ylab="H Statistic")) 32 | dev.off() 33 | }else if(opt$plot_type == "rsb"){ 34 | 35 | }else if(opt$plot_type == "fst_weir"){ 36 | 37 | }else if(opt$plot_type == "fst_hap"){ 38 | 39 | } 40 | 41 | -------------------------------------------------------------------------------- /corescripts/multicore_iHH.R: -------------------------------------------------------------------------------- 1 | #!/bin/env Rscript 2 | # Murray Cadzow and James Boocock 3 | # July 2013 updated March 2015 4 | # University of Otago 5 | # 6 | 7 | require(getopt) 8 | 9 | require(rehh) 10 | require(parallel) #package for run in parallel in R. 11 | #script to split chromosome into x sized segments to compute iHH on 12 | args<-commandArgs(TRUE) 13 | missing_code='.' 14 | spec = matrix(c( 15 | 'help', 'h', 0, "logical", 16 | 'input', 'i', 1, "character", #haps file 17 | 'chr', 'c', 1, "integer", 18 | 'window', 'w', 1, "integer", #window size in bp 19 | 'overlap', 'o', 1, "integer", #overlap between windows in bp 20 | 'cores', 'r', 1, "integer", # cores to use in parallel 21 | 'working_dir', 'd', 1, "character", 22 | 'offset', 's' , 1, "integer", 23 | "maf" , 'm' , 1, "integer", # allele frequency threshold below which to exclude snp from calculations 24 | "pop" , 'p', 1, "character", 25 | "ihs" , 'I', 0, "logical", #calculate ihs 26 | "big_gap", 'b', 1, "integer", # size of gap in bp to stop calculating ihh (200,000 suggested, voight et al 2006) 27 | "small_gap", 'S', 1, "integer", #size of gap to start applying gap penalty (20,000 suggested) 28 | "small_gap_penalty", 'P', 1, "integer", # gap penalty to apply (20,000 suggested) 29 | "haplo_hh", "H", 0, "logical", #do bifurcation diagram pre-calculation 30 | "missing_code", "M", 1, "character", 31 | "physical_map_haps", "g" , 1, "character" #file with physical positions if haps file has genetic positions 32 | ), byrow=T, ncol=4) 33 | opt = getopt(spec) 34 | if (!is.null(opt$help)){ 35 | cat(getopt(spec,usage=TRUE)); 36 | q(status=1); 37 | } 38 | #default sizes of each region that should be used 39 | #window=10000000 40 | #overlap = 2000000 41 | 42 | #read in haps file from shapeit 43 | pop1=as.character(opt$pop) 44 | hapsPop=read.table(opt$input,stringsAsFactors=F,header=F) #haplotype file, ideally coded ancestral and derived 45 | hapsPop=hapsPop[nchar(as.character(hapsPop[,4]))==1 & nchar(as.character(hapsPop[,5]))==1, ] #remove indels 46 | chr=as.numeric(opt$chr) 47 | window=as.numeric(opt$window) 48 | overlap=as.numeric(opt$overlap) 49 | 
49 | cores=as.numeric(opt$cores) 50 | working_dir=as.character(opt$working_dir) 51 | offset=as.numeric(opt$offset) #how many windows to offset the start by. default should be 1 52 | maf=as.numeric(opt$maf) #filter of derived allele freq (MAF) for ihs calculation 53 | if(!is.null(opt$missing_code)){ 54 | missing_code = opt$missing_code 55 | } 56 | 57 | if (!is.null(opt$physical_map_haps)){ #physical_map_haps option is for use after the interpolation script of the pipeline 58 | map_positions=read.table(opt$physical_map_haps, header=F) 59 | map_positions=as.numeric(map_positions[,2]) 60 | }else{ 61 | map_positions=hapsPop[,3] 62 | } 63 | 64 | #haplo_hh option was so bifurcation diagrams could be used 65 | # uncomment d= ... to create the haplo_hh object 66 | # not practical for genome-wide data 67 | # not sure if this chunk should remain, but it is still needed for the next chunk to work 68 | if(!is.null(opt$haplo_hh)){ 69 | bin = hapsPop 70 | hapsPop.only=bin[,6:ncol(bin)] 71 | allelesPop=bin[,4:5] 72 | hapsPop.only[hapsPop.only == "1"] = 2 73 | hapsPop.only[hapsPop.only == "0"] = 1 74 | hapsPop.only[hapsPop.only == missing_code] = 0 75 | t.hapsPop=t(hapsPop.only) 76 | ##Construct the ind file 77 | ind=matrix(ncol=5,nrow=nrow(bin)) 78 | ind[,1] = as.character(bin[,2]) 79 | ind[,2] = chr 80 | ind[,3] = bin[,3] 81 | ind[,4] = 1 82 | ind[,5] = 2 83 | indPop1=ind 84 | # little hack to rename duplicated rsIDs that may pop up due to imputation 85 | indPop1[duplicated(indPop1[,1]),1] = paste(indPop1[duplicated(indPop1[,1]),2],indPop1[duplicated(indPop1[,1]),3],sep=":") 86 | 87 | write.table(indPop1,file=paste(pop1,"chr", chr,"wd",working_dir,".map",sep="_"),col.names=F,row.names=F) 88 | write.table(t.hapsPop,file=paste(pop1,"chr", chr,"wd",working_dir,".haps",sep="_"),col.names=F) 89 | #d = data2haplohh(hap_file=paste(pop1,"chr", chr,"wd",working_dir,".haps",sep="_"),map_file=paste(pop1,"chr", chr,"wd",working_dir,".map",sep="_"),min_maf=maf) 90 | #save(d, file=paste(pop1,"chr", chr,"wd",working_dir,".RData",sep="_")) 91 | #rm(ind,indPop1,bin,t.hapsPop, hapsPop.only) 92 | } 93 | 94 | 95 | #column 3 is base position 96 | setwd(working_dir) 97 | 98 | ### 99 | ### Main processing starts here: 100 | ### 101 | ## goal is to break the chromosome into overlapping windows, 102 | ## process each window in parallel, then join them back together 103 | ## at the end 104 | 105 | # first window number in file 106 | offset=ceiling(map_positions[1]/(window-overlap)) 107 | 108 | # i is the index of the current window, starting at the first window present 109 | i=ceiling(map_positions[1]/(window-overlap)) 110 | 111 | # create overlapping windows 112 | # for each window need to create both a haps file and a map file 113 | # physical map = pos in bp 114 | # genetic map is in centimorgans 115 | # i used as window index 116 | 117 | while((i-1) * (window - overlap) <= map_positions[length(map_positions)]){ 118 | if(i == 1){ 119 | if (!is.null(opt$physical_map_haps)){ 120 | genetic_pos = map_positions[map_positions < (window *i)] 121 | } 122 | bin = hapsPop[map_positions < (window * i),] 123 | } else { 124 | if (!is.null(opt$physical_map_haps)){ 125 | genetic_pos = map_positions[map_positions > (window - overlap) * (i-1) & map_positions < (window - overlap)* i + overlap] 126 | } 127 | bin = hapsPop[ map_positions > (window - overlap) * (i-1) & map_positions < (window - overlap)* i + overlap , ] 128 | } 129 | if(length(bin[,3]) > 0){ 130 | hapsPop.only=bin[,6:ncol(bin)] 131 | allelesPop=bin[,4:5]
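# recode the haplotypes into the coding expected by rehh: 0 = missing, 1 = ancestral allele, 2 = derived allele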
132 | hapsPop.only[hapsPop.only == "1"] = 2 133 | hapsPop.only[hapsPop.only == "0"] = 1 134 | hapsPop.only[hapsPop.only == missing_code] = 0 135 | t.hapsPop=t(hapsPop.only) 136 | ##Construct the ind file 137 | ind=matrix(ncol=5,nrow=nrow(bin)) 138 | ind[,1] = as.character(bin[,2]) 139 | ind[,2] = chr 140 | ind[,3] = bin[,3] 141 | ind[,4] = 1 142 | ind[,5] = 2 143 | indPop1=ind 144 | indPop1[duplicated(indPop1[,1]),1] = paste(indPop1[duplicated(indPop1[,1]),2],indPop1[duplicated(indPop1[,1]),3],sep=":") 145 | #write out this window's files 146 | if (!is.null(opt$physical_map_haps)){ 147 | write.table(genetic_pos,file=paste('gene_',pop1,'.map',i,sep=''),col.names=F,row.names=F) 148 | } 149 | write.table(indPop1,file=paste("ind_",pop1,".test",i,sep=""),col.names=F,row.names=F) 150 | write.table(t.hapsPop,file=paste("t_",pop1,".haps",i, sep=""),col.names=F) 151 | } 152 | i = i + 1 153 | } 154 | #rm(ind,indPop1,bin,t.hapsPop, hapsPop.only) 155 | 156 | #the indices of the window files created 157 | fileNumber = offset:i 158 | if (!is.null(opt$physical_map_haps)){ 159 | genetic_map_file =paste0('gene_',pop1,'.map') 160 | } 161 | 162 | map_file=paste("ind_",pop1,".test",sep="") 163 | hap_file=paste("t_",pop1,".haps", sep="") 164 | #print(fileNumber) 165 | 166 | # set up a list of all the files that need to have ihh calculated 167 | flag = 0; 168 | para = list(); 169 | new_file_number = 0 170 | for( i in fileNumber){ 171 | if(file.exists(paste(hap_file,i,sep=""))){ 172 | if (!is.null(opt$physical_map_haps)){ 173 | p = c(paste(hap_file,i,sep=""), paste(map_file,i,sep=""),paste(genetic_map_file,i,sep='')) 174 | }else{ 175 | p = c(paste(hap_file,i,sep=""), paste(map_file,i,sep=""),-1) 176 | } 177 | new_file_number = new_file_number + 1 178 | if(flag==0){ 179 | para = list(p) 180 | }else{ 181 | para = c(para,list(p)) 182 | } 183 | flag = 1; 184 | } 185 | } 186 | actualfileNumber = offset:(offset+new_file_number-1) 187 | 188 | # function to perform the ihh calculation 189 | # ideally this will be updated at some stage 190 | my_scan_hh = function(x){ 191 | if(length(read.table(x[1])) != 0){ 192 | d = data2haplohh(hap_file=x[1],map_file=x[2],min_maf=maf) 193 | if(!is.null(opt$physical_map_haps)){ 194 | physical_positions = read.table(x[3],header=F) 195 | physical_positions = as.numeric(as.character(physical_positions[,1])) 196 | res = scan_hh(d,big_gap=opt$big_gap,small_gap=opt$small_gap,small_gap_penalty=opt$small_gap_penalty,physical_positions=physical_positions) 197 | }else{ 198 | res = scan_hh(d,big_gap=opt$big_gap,small_gap=opt$small_gap,small_gap_penalty=opt$small_gap_penalty) 199 | } 200 | write.table(res,paste(x[1],".iHH",sep="")) 201 | 202 | } 203 | return(NULL) 204 | } 205 | 206 | # calculate ihh in parallel 207 | neutral_res = mclapply(para,my_scan_hh,mc.cores=cores) 208 | 209 | # re-read in all the results files - this is to avoid index problems due to missing windows 210 | index = 0 211 | neutral_res=list() 212 | actualFiles = c() 213 | for ( j in fileNumber){ 214 | index = index + 1 215 | if(file.exists(paste(hap_file,j,'.iHH',sep=''))){ 216 | neutral_res[[j]] = read.table(paste(hap_file,j,'.iHH',sep='')) 217 | actualFiles = cbind(actualFiles, j) 218 | } else{ 219 | print(paste(hap_file,j,'.iHH does not exist! Continuing without',sep="")) 220 | } 221 | } 222 | #save(neutral_res,file="neutral_res.RData") 223 | save.image(file="working_data.RData")
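# Stitching rule used below (numbers are illustrative): window n keeps only
# positions in [(window-overlap)*(n-1) + overlap/2, (window-overlap)*n + overlap/2),
# so each shared overlap is split evenly between adjacent windows; e.g. with
# window=10000000 and overlap=2000000, window 2 contributes [9000000, 17000000).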
Continuing without',sep="")) 220 | } 221 | } 222 | #save(neutral_res,file="neutral_res.RData") 223 | save.image(file="working_data.RData") 224 | 225 | #combine all the windows into a single chromosome again 226 | results=data.frame() 227 | results2=data.frame() 228 | if(!is.null(opt$physical_map_haps)){ 229 | print(fileNumber) 230 | for (n in actualFiles){ 231 | if(file.exists(paste0('gene_',pop1,'.map',n))){ 232 | temp_physical_map= as.numeric(read.table(paste0('gene_',pop1,'.map',n),header=F)[,1]) 233 | if(n == min(actualFiles)){ # from start to first half of overlaped region (first chunk) 234 | results = neutral_res[[n]][temp_physical_map <= ((window -overlap)* (n) + (1/2 * overlap)) ,] #take correct window when window1 != file1 235 | } else { 236 | if(n == max(actualFiles)){ #take second half of overlap at start and go until the end (final chunk) 237 | results = rbind(results, neutral_res[[n]][ ((window-overlap)* (n-1) + 1/2*overlap) <= temp_physical_map ,]) 238 | } else { #start =take second half of overlap, end = take first half (middle regions) 239 | results = rbind(results,neutral_res[[n]][ ((window-overlap)* (n-1) + 1/2*overlap) <= temp_physical_map & temp_physical_map < ((window -overlap)* (n) + (1/2 * overlap)), ]) 240 | } 241 | } 242 | }else{ 243 | print(paste("File: gene_",pop1,'.map',n," DOES NOT EXIST, skipping region.", sep="")) 244 | } 245 | } 246 | #names(map)= c("name", "name2", "gen_pos", "a1", "a2","phys_pos") 247 | results$name = rownames(results[,]) 248 | ##### replace genetic positions with physical positions 249 | if (!is.null(opt$physical_map_haps)){ 250 | m= read.table(opt$physical_map_haps, header=F) 251 | m[,3]=chr 252 | m[duplicated(m[,1]),1] = paste(m[duplicated(m[,1]),3],m[duplicated(m[,1]),2],sep=":") 253 | results$name = rownames(results[,]) 254 | z = merge(results, m, by.x="name", by.y="V1", sort=FALSE) 255 | results = z[,c("CHR","V2", "FREQ_a","IHHa","IHHd", "IES")] 256 | names(results) = c("CHR","POSITION", "FREQ_a","IHHa","IHHd", "IES") 257 | rownames(results) = z$name 258 | rm(z) 259 | } 260 | write.table(results,paste(pop1,"chr", chr,"wd",working_dir,".ihh",sep="_")) 261 | if (!is.null(opt$ihs)){ 262 | ihs =ihh2ihs(results) 263 | write.table(ihs$res.ihs,paste(pop1,"chr", chr,"wd",working_dir,".ihs",sep="_")) 264 | } 265 | }else{ 266 | for (n in actualFiles){ 267 | if(n == min(actualFiles)){ # from start to first half of overlaped region (first chunk) 268 | results = neutral_res[[n]][neutral_res[[n]][,2] <= ((window -overlap)* (n) + (1/2 * overlap)) ,] #correct window 269 | } else { 270 | if(n == max(actualFiles)){ #take second half of overlap at start and go until the end (final chunk) 271 | results = rbind(results, neutral_res[[n]][ ((window-overlap)* (n-1) + 1/2*overlap) <= neutral_res[[n]][,2] ,]) 272 | } else { #start =take second half of overlap, end = take first half (middle regions) 273 | results = rbind(results, neutral_res[[n]][ ((window-overlap)* (n-1) + 1/2*overlap) <= neutral_res[[n]][,2] & neutral_res[[n]][,2] < ((window -overlap)* (n) + (1/2 * overlap)),]) 274 | } 275 | } 276 | } 277 | write.table(results,paste(pop1,"chr", chr,"wd",working_dir,".ihh",sep="_")) 278 | if (!is.null(opt$ihs)){ 279 | ihs =ihh2ihs(results) 280 | write.table(ihs$res.ihs,paste(pop1,"chr", chr,"wd",working_dir,".ihs",sep="_")) 281 | } 282 | } 283 | 284 | -------------------------------------------------------------------------------- /corescripts/multicore_iHH.R.old: -------------------------------------------------------------------------------- 1 | #!/bin/env 
Rscript 2 | # Murray Cadzow and James Boocock 3 | # July 2013 updated March 2015 4 | # University of Otago 5 | # 6 | 7 | require(getopt) 8 | 9 | require(rehh) 10 | require(parallel) #package for run in parallel in R. 11 | #script to split chromosome into x sized segments to compute iHH on 12 | args<-commandArgs(TRUE) 13 | missing_code='.' 14 | spec = matrix(c( 15 | 'help', 'h', 0, "logical", 16 | 'input', 'i', 1, "character", 17 | 'chr', 'c', 1, "integer", 18 | 'window', 'w', 1, "integer", 19 | 'overlap', 'o', 1, "integer", 20 | 'cores', 'r', 1, "integer", 21 | 'working_dir', 'd', 1, "character", 22 | 'offset', 's' , 1, "integer", 23 | "maf" , 'm' , 1, "integer", 24 | "pop" , 'p', 1, "character", 25 | "ihs" , 'I', 0, "logical", 26 | "big_gap", 'b', 1, "integer", 27 | "small_gap", 'S', 1, "integer", 28 | "small_gap_penalty", 'P', 1, "integer", 29 | "haplo_hh", "H", 0, "logical", 30 | "missing_code", "M", 1, "character", 31 | "physical_map_haps", "g" , 1, "character" 32 | ), byrow=T, ncol=4) 33 | opt = getopt(spec) 34 | if (!is.null(opt$help)){ 35 | cat(getopt(spec,usage=TRUE)); 36 | q(status=1); 37 | } 38 | #read in haps file from shapeit 39 | pop1=as.character(opt$pop) 40 | #print("*") 41 | hapsPop=read.table(opt$input,stringsAsFactors=F,header=F) 42 | hapsPop=hapsPop[nchar(as.character(hapsPop[,4]))==1 & nchar(as.character(hapsPop[,5]))==1, ] #remove indels 43 | chr=as.numeric(opt$chr) 44 | window=as.numeric(opt$window) 45 | overlap=as.numeric(opt$overlap) 46 | cores=as.numeric(opt$cores) 47 | working_dir=as.character(opt$working_dir) 48 | offset=as.numeric(opt$offset) 49 | maf=as.numeric(opt$maf) 50 | if(!is.null(opt$missing_code)){ 51 | missing_code = opt$missing_code 52 | } 53 | 54 | if (!is.null(opt$physical_map_haps)){ 55 | map_positions=read.table(opt$physical_map_haps, header=F) 56 | map_positions=as.numeric(map_positions[,2]) 57 | }else{ 58 | map_positions=hapsPop[,3] 59 | } 60 | #size of each region 61 | #window=500000 62 | #overlap = 100000 63 | #haps file 64 | #hapsPop=read.table("CEU.haps") 65 | if(!is.null(opt$haplo_hh)){ 66 | bin = hapsPop 67 | hapsPop.only=bin[,6:ncol(bin)] 68 | allelesPop=bin[,4:5] 69 | hapsPop.only[hapsPop.only == "1"] = 2 70 | hapsPop.only[hapsPop.only == "0"] = 1 71 | hapsPop.only[hapsPop.only == missing_code] = 0 72 | t.hapsPop=t(hapsPop.only) 73 | ##Construct the ind file 74 | ind=matrix(ncol=5,nrow=nrow(bin)) 75 | ind[,1] = as.character(bin[,2]) 76 | ind[,2] = chr 77 | ind[,3] = bin[,3] 78 | ind[,4] = 1 79 | ind[,5] = 2 80 | indPop1=ind 81 | indPop1[duplicated(indPop1[,1]),1] = paste(indPop1[duplicated(indPop1[,1]),2],indPop1[duplicated(indPop1[,1]),3],sep=":") 82 | 83 | write.table(indPop1,file=paste(pop1,"chr", chr,"wd",working_dir,".map",sep="_"),col.names=F,row.names=F) 84 | write.table(t.hapsPop,file=paste(pop1,"chr", chr,"wd",working_dir,".haps",sep="_"),col.names=F) 85 | #d = data2haplohh(hap_file=paste(pop1,"chr", chr,"wd",working_dir,".haps",sep="_"),map_file=paste(pop1,"chr", chr,"wd",working_dir,".map",sep="_"),min_maf=maf) 86 | #save(d, file=paste(pop1,"chr", chr,"wd",working_dir,".RData",sep="_")) 87 | } 88 | #print("Why are you not working") 89 | #want to create overlapping bins 90 | #column 3 is base position 91 | setwd(working_dir) 92 | 93 | #calculate offset from file - 94 | # first position in file 95 | offset=ceiling(map_positions[1]/(window-overlap)) 96 | #pseudo code 97 | i=ceiling(map_positions[1]/(window-overlap)) 98 | 99 | while((i-1) * (window - overlap) <= map_positions[length(map_positions)]){ 100 | if(i == 1){ 101 | if 
(!is.null(opt$physical_map_haps)){ 102 | genetic_pos = map_positions[map_positions < (window *i)] 103 | } 104 | bin = hapsPop[map_positions < (window * i),] 105 | } else { 106 | if (!is.null(opt$physical_map_haps)){ 107 | genetic_pos = map_positions[map_positions > (window - overlap) * (i-1) & map_positions < (window - overlap)* i + overlap] 108 | } 109 | bin = hapsPop[ map_positions > (window - overlap) * (i-1) & map_positions < (window - overlap)* i + overlap , ] 110 | } 111 | if(length(bin[,3]) > 0){ 112 | hapsPop.only=bin[,6:ncol(bin)] 113 | allelesPop=bin[,4:5] 114 | hapsPop.only[hapsPop.only == "1"] = 2 115 | hapsPop.only[hapsPop.only == "0"] = 1 116 | hapsPop.only[hapsPop.only == missing_code] = 0 117 | t.hapsPop=t(hapsPop.only) 118 | ##Construct the ind file 119 | ind=matrix(ncol=5,nrow=nrow(bin)) 120 | ind[,1] = as.character(bin[,2]) 121 | ind[,2] = chr 122 | ind[,3] = bin[,3] 123 | ind[,4] = 1 124 | ind[,5] = 2 125 | indPop1=ind 126 | indPop1[duplicated(indPop1[,1]),1] = paste(indPop1[duplicated(indPop1[,1]),2],indPop1[duplicated(indPop1[,1]),3],sep=":") 127 | #write out entire chromosome 128 | if (!is.null(opt$physical_map_haps)){ 129 | write.table(genetic_pos,file=paste('gene_',pop1,'.map',i,sep=''),col.names=F,row.names=F) 130 | } 131 | write.table(indPop1,file=paste("ind_",pop1,".test",i,sep=""),col.names=F,row.names=F) 132 | write.table(t.hapsPop,file=paste("t_",pop1,".haps",i, sep=""),col.names=F) 133 | } 134 | i = i + 1 135 | } 136 | fileNumber = offset:i 137 | if (!is.null(opt$physical_map_haps)){ 138 | genetic_map_file =paste0('gene_',pop1,'.map') 139 | } 140 | map_file=paste("ind_",pop1,".test",sep="") 141 | hap_file=paste("t_",pop1,".haps", sep="") 142 | #print(fileNumber) 143 | flag = 0; 144 | para = list(); 145 | new_file_number = 0 146 | for( i in fileNumber){ 147 | if(file.exists(paste(hap_file,i,sep=""))){ 148 | if (!is.null(opt$physical_map_haps)){ 149 | p = c(paste(hap_file,i,sep=""), paste(map_file,i,sep=""),paste(genetic_map_file,i,sep='')) 150 | }else{ 151 | p = c(paste(hap_file,i,sep=""), paste(map_file,i,sep=""),-1) 152 | } 153 | new_file_number = new_file_number + 1 154 | if(flag==0){ 155 | para = list(p) 156 | }else{ 157 | para = c(para,list(p)) 158 | } 159 | flag = 1; 160 | } 161 | } 162 | actualfileNumber = offset:(offset+new_file_number-1) 163 | 164 | my_scan_hh = function(x){ 165 | if(length(read.table(x[1]) != 0)){ 166 | d = data2haplohh(hap_file=x[1],map_file=x[2],min_maf=maf) 167 | if(!is.null(opt$physical_map_haps)){ 168 | physical_positions = read.table(x[3],header=F) 169 | physical_positions = as.numeric(physical_positions[,1]) 170 | res = scan_hh(d,big_gap=opt$big_gap,small_gap=opt$small_gap,small_gap_penalty=opt$small_gap_penalty,physical_positions=physical_positions) 171 | }else{ 172 | res = scan_hh(d,big_gap=opt$big_gap,small_gap=opt$small_gap,small_gap_penalty=opt$small_gap_penalty) 173 | } 174 | write.table(res,paste(x[1],".iHH",sep="")) 175 | 176 | } 177 | return(NULL) 178 | } 179 | 180 | neutral_res = mclapply(para,my_scan_hh,mc.cores=cores) 181 | index = 0 182 | 183 | for ( j in fileNumber){ 184 | index = index + 1 185 | if(file.exists(paste(hap_file,j,'.iHH',sep=''))){ 186 | neutral_res[[j]] = read.table(paste(hap_file,j,'.iHH',sep='')) 187 | } else{ 188 | print(paste(hap_file,j,'.iHH does not exist! 
Continuing without',sep="")) 189 | } 190 | 191 | } 192 | #save(neutral_res,file="neutral_res.RData") 193 | save.image(file="working_data.RData") 194 | 195 | results=data.frame() 196 | if(!is.null(opt$physical_map_haps)){ 197 | print(fileNumber) 198 | for (n in fileNumber){ 199 | i=n-(offset-1) 200 | if(file.exists(paste0('gene_',pop1,'.map',n))){ 201 | temp_physical_map= as.numeric(read.table(paste0('gene_',pop1,'.map',n),header=F)[,1]) 202 | if(n == 1){ # from start to first half of overlaped region (first chunk) 203 | results = neutral_res[[i]][temp_physical_map <= ((n+offset-1) * window - 1/2 *overlap) ,] #correct window 204 | } else { 205 | if(n == max(fileNumber)){ #take second half of overlap at start and go until the end (final chunk) 206 | a= results 207 | b = neutral_res[[i]][ ((window-overlap)* (n-1) + 1/2*overlap) <= temp_physical_map ,] 208 | results = rbind(a,b) 209 | } else { #start =take second half of overlap, end = take first half (middle regions) 210 | a = results 211 | b = neutral_res[[i]][ ((window-overlap)* (n-1) + 1/2*overlap) <= temp_physical_map & temp_physical_map < ((window -overlap)* (n) + (1/2 * overlap)), ] 212 | results = rbind(a,b ) 213 | } 214 | } 215 | }else{ 216 | print(paste("File: gene_",pop1,'.map',n," DOES NOT EXIST, skipping region.", sep="")) 217 | } 218 | } 219 | ##### replace genetic positions with physical positions 220 | if (!is.null(opt$physical_map_haps)){ 221 | results[,2] = map_positions 222 | } 223 | write.table(results,paste(pop1,"chr", chr,"wd",working_dir,".ihh",sep="_")) 224 | if (!is.null(opt$ihs)){ 225 | ihs =ihh2ihs(results) 226 | write.table(ihs$res.ihs,paste(pop1,"chr", chr,"wd",working_dir,".ihs",sep="_")) 227 | } 228 | }else{ 229 | for (n in fileNumber){ 230 | i=n-(offset-1) 231 | if(n == 1){ # from start to first half of overlaped region (first chunk) 232 | results = neutral_res[[i]][neutral_res[[i]][,2] <= ((n+offset-1) * window - 1/2 *overlap) ,] #correct window 233 | } else { 234 | if(n == max(fileNumber)){ #take second half of overlap at start and go until the end (final chunk) 235 | a= results 236 | b = neutral_res[[i]][ ((window-overlap)* (n-1) + 1/2*overlap) <= neutral_res[[i]][,2] ,] 237 | results = rbind(a,b) 238 | } else { #start =take second half of overlap, end = take first half (middle regions) 239 | a = results 240 | b = neutral_res[[i]][ ((window-overlap)* (n-1) + 1/2*overlap) <= neutral_res[[i]][,2] & neutral_res[[i]][,2] < ((window -overlap)* (n) + (1/2 * overlap)), ] 241 | results = rbind(a,b ) 242 | } 243 | } 244 | } 245 | write.table(results,paste(pop1,"chr", chr,"wd",working_dir,".ihh",sep="_")) 246 | if (!is.null(opt$ihs)){ 247 | ihs =ihh2ihs(results) 248 | write.table(ihs$res.ihs,paste(pop1,"chr", chr,"wd",working_dir,".ihs",sep="_")) 249 | } 250 | } 251 | 252 | -------------------------------------------------------------------------------- /defaults_30-9-14.cfg: -------------------------------------------------------------------------------- 1 | # 2 | # Defaults config file for VCF process 3 | # 4 | # If the executables are on your path 5 | # just the executable name is required. 6 | # 7 | # ? 
is the wildcard flag for the prefix options 8 | 9 | 10 | 11 | [system] 12 | cores_avaliable = 1 13 | # Library settings: do not change; the library folders are appended to the path when running the program 14 | [environment] 15 | LD_LIBRARY_PATH=/Volumes/BiochemXsan/staff_users/murraycadzow/Murray/SelectionPipeline/selectionTools/lib 16 | PERL5LIB=/Volumes/BiochemXsan/staff_users/murraycadzow/Murray/SelectionPipeline/selectionTools/lib/perl5 17 | [selection_pipeline] 18 | selection_pipeline_executable = selection_pipeline 19 | [vcftools] 20 | vcf_tools_executable = /Volumes/BiochemXsan/staff_users/murraycadzow/Murray/SelectionPipeline/selectionTools/bin/vcftools 21 | vcf_subset_executable =/Volumes/BiochemXsan/staff_users/murraycadzow/Murray/SelectionPipeline/selectionTools/bin/vcf-subset 22 | vcf_merge_executable = /Volumes/BiochemXsan/staff_users/murraycadzow/Murray/SelectionPipeline/selectionTools/bin/vcf-merge 23 | vcf_concat_executable = /Volumes/BiochemXsan/staff_users/murraycadzow/Murray/SelectionPipeline/selectionTools/bin/vcf-concat 24 | extra_args= 25 | [genetic_map] 26 | genetic_map_dir= /Volumes/BiochemXsan/staff_users/murraycadzow/Murray/SelectionPipeline/selectionTools/referencefiles/impute_ref/1000GP_Phase3 27 | genetic_map_prefix=genetic_map_chr?_combined_b37.txt 28 | [shapeit] 29 | shapeit_executable= /Volumes/BiochemXsan/staff_users/murraycadzow/Murray/SelectionPipeline/selectionTools/bin/shapeit 30 | extra_args = 31 | [impute2] 32 | impute_executable = /Volumes/BiochemXsan/staff_users/murraycadzow/Murray/SelectionPipeline/selectionTools/bin/impute2 33 | impute_map_dir= /Volumes/BiochemXsan/staff_users/murraycadzow/Murray/SelectionPipeline/selectionTools/referencefiles/impute_ref/1000GP_Phase3 34 | impute_reference_dir= /Volumes/BiochemXsan/staff_users/murraycadzow/Murray/SelectionPipeline/selectionTools/referencefiles/impute_ref/1000GP_Phase3 35 | impute_map_prefix=genetic_map_chr?_combined_b37.txt 36 | impute_reference_prefix=1000GP_Phase3_chr?
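# for example, when the pipeline is run with -c 22 the prefix above resolves to 1000GP_Phase3_chr22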
37 | extra_args = 38 | [plink] 39 | plink_executable =/Volumes/BiochemXsan/staff_users/murraycadzow/Murray/SelectionPipeline/selectionTools/bin/plink 40 | extra_args = 41 | [Rscript] 42 | rscript_executable = Rscript 43 | indel_filter = /Volumes/BiochemXsan/staff_users/murraycadzow/Murray/SelectionPipeline/selectionTools/corescripts/haps_indel_and_maf_filter.R 44 | generate_rsb = /Volumes/BiochemXsan/staff_users/murraycadzow/Murray/SelectionPipeline/selectionTools/corescripts/generate_rsb.R 45 | extra_args= 46 | [haps_scripts] 47 | haps_to_hapmap_script= haps_to_hapmap 48 | haps_filter_script = haps_filters 49 | haps_interpolate_script = haps_interpolate 50 | [ancestral_allele] 51 | split_by_chromosome = True 52 | # not used unless split_by_chromosome is set to False 53 | ancestral_fasta_header_regex = 54 | # not used unless split_by_chromosome is set to False 55 | ancestral_fasta_file = 56 | ancestral_allele_script= ancestral_annotation 57 | 58 | ancestral_fasta_dir=/Volumes/BiochemXsan/staff_users/murraycadzow/Murray/SelectionPipeline/selectionTools/referencefiles/ancestral_ref/human_ancestor_GRCh37_e59 59 | ancestral_prefix=human_ancestor_?.fa 60 | [qctool] 61 | qctool_executable=/Volumes/BiochemXsan/staff_users/murraycadzow/Murray/SelectionPipeline/selectionTools/bin/qctool 62 | 63 | [multicore_ihh] 64 | multicore_ihh = /Volumes/BiochemXsan/staff_users/murraycadzow/Murray/SelectionPipeline/selectionTools/corescripts/multicore_iHH.R 65 | [variscan] 66 | variscan_executable = /Volumes/BiochemXsan/staff_users/murraycadzow/Murray/SelectionPipeline/selectionTools/bin/variscan 67 | [java] 68 | java_executable = /usr/bin/java 69 | [beagle] 70 | beagle_jar = /Volumes/BiochemXsan/staff_users/murraycadzow/Murray/SelectionPipeline/selectionTools/bin/beagle.jar 71 | vm_size = 4g 72 | 73 | 74 | -------------------------------------------------------------------------------- /defaults_8-12-14.cfg: -------------------------------------------------------------------------------- 1 | # 2 | # Defaults config file for VCF process 3 | # 4 | # If the executables are on your path 5 | # just the executable name is required. 6 | # 7 | # ? 
is the wildcard flag for the prefix options 8 | 9 | 10 | 11 | [system] 12 | cores_avaliable = 1 13 | # Library settings: do not change; the library folders are appended to the path when running the program 14 | [environment] 15 | LD_LIBRARY_PATH=/Volumes/BiochemXsan/staff_users/murraycadzow/Murray/SelectionPipeline/selectionTools/lib 16 | PERL5LIB=/Volumes/BiochemXsan/staff_users/murraycadzow/Murray/SelectionPipeline/selectionTools/lib/perl5 17 | [selection_pipeline] 18 | selection_pipeline_executable = selection_pipeline 19 | [vcftools] 20 | vcf_tools_executable = /Volumes/BiochemXsan/staff_users/murraycadzow/Murray/SelectionPipeline/selectionTools/bin/vcftools 21 | vcf_subset_executable =/Volumes/BiochemXsan/staff_users/murraycadzow/Murray/SelectionPipeline/selectionTools/bin/vcf-subset 22 | vcf_merge_executable = /Volumes/BiochemXsan/staff_users/murraycadzow/Murray/SelectionPipeline/selectionTools/bin/vcf-merge 23 | vcf_concat_executable = /Volumes/BiochemXsan/staff_users/murraycadzow/Murray/SelectionPipeline/selectionTools/bin/vcf-concat 24 | extra_args= 25 | [genetic_map] 26 | genetic_map_dir= /Volumes/BiochemXsan/staff_users/murraycadzow/Murray/SelectionPipeline/selectionTools/referencefiles/impute_ref/1000GP_Phase3 27 | genetic_map_prefix=genetic_map_chr?_combined_b37.txt 28 | [shapeit] 29 | shapeit_executable= /Volumes/BiochemXsan/staff_users/murraycadzow/Murray/SelectionPipeline/selectionTools/bin/shapeit 30 | extra_args = 31 | [impute2] 32 | impute_executable = /Volumes/BiochemXsan/staff_users/murraycadzow/Murray/SelectionPipeline/selectionTools/bin/impute2 33 | impute_map_dir= /Volumes/BiochemXsan/staff_users/murraycadzow/Murray/SelectionPipeline/selectionTools/referencefiles/impute_ref/1000GP_Phase3 34 | impute_reference_dir= /Volumes/BiochemXsan/staff_users/murraycadzow/Murray/SelectionPipeline/selectionTools/referencefiles/impute_ref/1000GP_Phase3 35 | impute_map_prefix=genetic_map_chr?_combined_b37.txt 36 | impute_reference_prefix=1000GP_Phase3_chr?
37 | extra_args = 38 | [plink] 39 | plink_executable =/Volumes/BiochemXsan/staff_users/murraycadzow/Murray/SelectionPipeline/selectionTools/bin/plink 40 | extra_args = 41 | [Rscript] 42 | rscript_executable = Rscript 43 | indel_filter = /Volumes/BiochemXsan/staff_users/murraycadzow/Murray/SelectionPipeline/selectionTools/corescripts/haps_indel_and_maf_filter.R 44 | generate_rsb = /Volumes/BiochemXsan/staff_users/murraycadzow/Murray/SelectionPipeline/selectionTools/corescripts/generate_rsb.R 45 | extra_args= 46 | [haps_scripts] 47 | haps_to_hapmap_script= haps_to_hapmap 48 | haps_filter_script = haps_filters 49 | haps_interpolate_script = haps_interpolate 50 | [ancestral_allele] 51 | split_by_chromosome = True 52 | # not used unless split_by_chromosome is set to False 53 | ancestral_fasta_header_regex = 54 | # not used unless split_by_chromosome is set to False 55 | ancestral_fasta_file = 56 | ancestral_allele_script= ancestral_annotation 57 | 58 | ancestral_fasta_dir=/Volumes/BiochemXsan/staff_users/murraycadzow/Murray/SelectionPipeline/selectionTools/referencefiles/ancestral_ref/homo_sapiens_ancestor_GRCh37_e65/ 59 | ancestral_prefix=homo_sapiens_ancestor_?.fa 60 | [qctool] 61 | qctool_executable=/Volumes/BiochemXsan/staff_users/murraycadzow/Murray/SelectionPipeline/selectionTools/bin/qctool 62 | 63 | [multicore_ihh] 64 | multicore_ihh = /Volumes/BiochemXsan/staff_users/murraycadzow/Murray/SelectionPipeline/selectionTools/corescripts/multicore_iHH.R 65 | [variscan] 66 | variscan_executable = /Volumes/BiochemXsan/staff_users/murraycadzow/Murray/SelectionPipeline/selectionTools/bin/variscan 67 | [java] 68 | java_executable = /usr/bin/java 69 | [beagle] 70 | beagle_jar = /Volumes/BiochemXsan/staff_users/murraycadzow/Murray/SelectionPipeline/selectionTools/bin/beagle.jar 71 | vm_size = 4g 72 | 73 | [vcflib] 74 | vcflib_vcfsnps = /Volumes/BiochemXsan/staff_users/murraycadzow/Murray/SelectionPipeline/selectionTools/bin/vcfsnps 75 | -------------------------------------------------------------------------------- /defaults_nesi.cfg: -------------------------------------------------------------------------------- 1 | # 2 | # Defaults config file for VCF process 3 | # 4 | # If the executables are on your path 5 | # just the executable name is required. 6 | # 7 | # ? 
is the wildcard flag for the prefix options 8 | 9 | 10 | 11 | [system] 12 | cores_avaliable = 10 13 | # Library settings: do not change; the library folders are appended to the path when running the program 14 | [environment] 15 | LD_LIBRARY_PATH=/home/murray.cadzow/uoo00008/MerrimanSelectionPipeline/lib 16 | PERL5LIB=/home/murray.cadzow/uoo00008/MerrimanSelectionPipeline/lib/perl5 17 | [selection_pipeline] 18 | selection_pipeline_executable = ~/.local/bin/selection_pipeline 19 | [vcftools] 20 | vcf_tools_executable = /home/murray.cadzow/uoo00008/MerrimanSelectionPipeline/bin/vcftools 21 | vcf_subset_executable = /home/murray.cadzow/uoo00008/MerrimanSelectionPipeline/bin/vcf-subset 22 | vcf_merge_executable = /home/murray.cadzow/uoo00008/MerrimanSelectionPipeline/bin/vcf-merge 23 | vcf_concat_executable = /home/murray.cadzow/uoo00008/MerrimanSelectionPipeline/bin/vcf-concat 24 | extra_args= 25 | [genetic_map] 26 | genetic_map_dir= /home/murray.cadzow/uoo00008/MerrimanSelectionPipeline/referencefiles/genetic_maps 27 | genetic_map_prefix=genetic_map_chr?_combined_b37.txt 28 | [shapeit] 29 | shapeit_executable= /home/murray.cadzow/uoo00008/MerrimanSelectionPipeline/bin/shapeit 30 | extra_args = 31 | [impute2] 32 | impute_executable = /home/murray.cadzow/uoo00008/MerrimanSelectionPipeline/bin/impute2 33 | impute_map_dir= /home/murray.cadzow/uoo00008/MerrimanSelectionPipeline/referencefiles/impute_ref 34 | impute_reference_dir= /home/murray.cadzow/uoo00008/MerrimanSelectionPipeline/referencefiles/impute_ref/ALL_1000G_phase1integrated_v3_impute 35 | impute_map_prefix=genetic_map_chr?_combined_b37.txt 36 | impute_reference_prefix=ALL_1000G_phase1integrated_v3_chr?_impute 37 | extra_args = 38 | [plink] 39 | plink_executable =/home/murray.cadzow/uoo00008/MerrimanSelectionPipeline/bin/plink 40 | extra_args = 41 | [Rscript] 42 | rscript_executable = Rscript 43 | indel_filter = /home/murray.cadzow/uoo00008/MerrimanSelectionPipeline/corescripts/haps_indel_and_maf_filter.R 44 | generate_rsb = /home/murray.cadzow/uoo00008/MerrimanSelectionPipeline/corescripts/generate_rsb.R 45 | extra_args= 46 | [haps_scripts] 47 | haps_to_hapmap_script= /home/murray.cadzow/.local/bin/haps_to_hapmap 48 | haps_filter_script = /home/murray.cadzow/.local/bin/haps_filters 49 | haps_interpolate_script = /home/murray.cadzow/.local/bin/haps_interpolate 50 | [ancestral_allele] 51 | split_by_chromosome = True 52 | # not used unless split_by_chromosome is set to False 53 | ancestral_fasta_header_regex = 54 | # not used unless split_by_chromosome is set to False 55 | ancestral_fasta_file = 56 | ancestral_allele_script= /home/murray.cadzow/.local/bin/ancestral_annotation 57 | 58 | ancestral_fasta_dir=/home/murray.cadzow/uoo00008/MerrimanSelectionPipeline/referencefiles/ancestral_ref 59 | ancestral_prefix=human_ancestor_?.fa 60 | 61 | [qctool] 62 | qctool_executable=/home/murray.cadzow/uoo00008/MerrimanSelectionPipeline/bin/qctool 63 | 64 | [multicore_ihh] 65 | multicore_ihh = /home/murray.cadzow/uoo00008/MerrimanSelectionPipeline/corescripts/multicore_iHH.R 66 | [variscan] 67 | variscan_executable = /home/murray.cadzow/uoo00008/MerrimanSelectionPipeline/bin/variscan 68 | [java] 69 | java_executable = /usr/bin/java 70 | [beagle] 71 | beagle_jar = /home/murray.cadzow/uoo00008/MerrimanSelectionPipeline/bin/beagle.jar 72 | vm_size = 4g 73 | 74 | 75 | -------------------------------------------------------------------------------- /docker/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM 
ubuntu:14.04 2 | MAINTAINER murray.cadzow@otago.ac.nz 3 | RUN apt-get update && apt-get install -y \ 4 | python-setuptools \ 5 | python-numpy \ 6 | python-scipy \ 7 | git \ 8 | wget 9 | 10 | RUN apt-get install -y --no-install-recommends \ 11 | r-base r-base-dev r-recommended littler 12 | RUN ln -s /usr/share/doc/littler/examples/install.r \ 13 | /usr/local/bin/install.r 14 | 15 | RUN git clone https://github.com/smilefreak/selectionTools.git \ 16 | && cd selectionTools \ 17 | && ./install.sh 18 | RUN /usr/local/bin/selection_pipeline -h 19 | 20 | 21 | -------------------------------------------------------------------------------- /docker/README.md: -------------------------------------------------------------------------------- 1 | Ubuntu 14.04 2 | ============= 3 | 4 | To create a docker container with selectionTools inside it 5 | (currently needs sudo access): 6 | first install Docker; 7 | see https://docs.docker.com 8 | 9 | 10 | Build the docker image: 11 | selectionTools/docker is the path to the directory containing the Dockerfile 12 | $ sudo docker build -t selectiondocker/selectiontools selectionTools/docker/ 13 | 14 | This should build a docker image based on Ubuntu 14.04 and install all prerequisites and the selection pipeline itself. 15 | 16 | 17 | Login: 18 | $ sudo docker run -t -i selectiondocker/selectiontools /bin/bash 19 | 20 | To side-load your data in, run: 21 | $ sudo docker run -t -v <path to your data>:/data -i selectiondocker/selectiontools /bin/bash 22 | 23 | Your data will be found in /data 24 | 25 | 26 | Test the installation 27 | (logged into the docker image): 28 | $ selection_pipeline -h 29 | 30 | The selection pipeline directory will be located at /selectionTools 31 | 32 | If that all succeeds you can continue the setup process from section 2.5 of the manual, setting up reference files. 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | -------------------------------------------------------------------------------- /docs/.gitignore: -------------------------------------------------------------------------------- 1 | selection_pipeline.aux 2 | selection_pipeline.bbl 3 | selection_pipeline.blg 4 | selection_pipeline.dvi 5 | selection_pipeline.idx 6 | selection_pipeline.toc 7 | selection_pipeline.out 8 | selection_pipeline.log 9 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for building the manual 2 | # 3 | # author James Boocock 4 | 5 | PDFLATEX=pdflatex 6 | LATEX_OPTS=-shell-escape 7 | BIBTEXT=bibtex 8 | INPUT_FILES=selection_pipeline.tex selection_pipeline.bib 9 | 10 | all: $(INPUT_FILES) 11 | $(PDFLATEX) $(LATEX_OPTS) selection_pipeline.tex 12 | $(BIBTEXT) selection_pipeline 13 | $(PDFLATEX) $(LATEX_OPTS) selection_pipeline.tex 14 | $(PDFLATEX) $(LATEX_OPTS) selection_pipeline.tex 15 | rm -f selection_pipeline.aux 16 | rm -f selection_pipeline.bbl 17 | rm -f selection_pipeline.blg 18 | rm -f selection_pipeline.dvi 19 | rm -f selection_pipeline.idx 20 | rm -f selection_pipeline.toc 21 | rm -f selection_pipeline.out 22 | rm -f selection_pipeline.log 23 | clean: 24 | rm -rf selection_pipeline.pdf 25 | -------------------------------------------------------------------------------- /docs/pictures/CEUFay.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MerrimanLab/selectionTools/761ace7cf43639b890c5071c200761ff8a6fb05f/docs/pictures/CEUFay.png 
-------------------------------------------------------------------------------- /docs/pictures/CEUFay_old.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MerrimanLab/selectionTools/761ace7cf43639b890c5071c200761ff8a6fb05f/docs/pictures/CEUFay_old.png -------------------------------------------------------------------------------- /docs/pictures/CEUYRI2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MerrimanLab/selectionTools/761ace7cf43639b890c5071c200761ff8a6fb05f/docs/pictures/CEUYRI2.png -------------------------------------------------------------------------------- /docs/pictures/CEUihs.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MerrimanLab/selectionTools/761ace7cf43639b890c5071c200761ff8a6fb05f/docs/pictures/CEUihs.png -------------------------------------------------------------------------------- /docs/pictures/CEUtajimas.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MerrimanLab/selectionTools/761ace7cf43639b890c5071c200761ff8a6fb05f/docs/pictures/CEUtajimas.png -------------------------------------------------------------------------------- /docs/pictures/RSBCEUYRI.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MerrimanLab/selectionTools/761ace7cf43639b890c5071c200761ff8a6fb05f/docs/pictures/RSBCEUYRI.png -------------------------------------------------------------------------------- /docs/pictures/WeirCEUYRI.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MerrimanLab/selectionTools/761ace7cf43639b890c5071c200761ff8a6fb05f/docs/pictures/WeirCEUYRI.png -------------------------------------------------------------------------------- /docs/pictures/YRIFay.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MerrimanLab/selectionTools/761ace7cf43639b890c5071c200761ff8a6fb05f/docs/pictures/YRIFay.png -------------------------------------------------------------------------------- /docs/pictures/YRIFay_old.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MerrimanLab/selectionTools/761ace7cf43639b890c5071c200761ff8a6fb05f/docs/pictures/YRIFay_old.png -------------------------------------------------------------------------------- /docs/pictures/YRIihs.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MerrimanLab/selectionTools/761ace7cf43639b890c5071c200761ff8a6fb05f/docs/pictures/YRIihs.png -------------------------------------------------------------------------------- /docs/pictures/YRItajimas.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MerrimanLab/selectionTools/761ace7cf43639b890c5071c200761ff8a6fb05f/docs/pictures/YRItajimas.png -------------------------------------------------------------------------------- /docs/pictures/bifurcationCEU.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MerrimanLab/selectionTools/761ace7cf43639b890c5071c200761ff8a6fb05f/docs/pictures/bifurcationCEU.png 
-------------------------------------------------------------------------------- /docs/pictures/hapmapCEUYRI.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MerrimanLab/selectionTools/761ace7cf43639b890c5071c200761ff8a6fb05f/docs/pictures/hapmapCEUYRI.png -------------------------------------------------------------------------------- /docs/selection_pipeline.bib: -------------------------------------------------------------------------------- 1 | % This file was created with JabRef 2.10b2. 2 | % Encoding: ISO8859_1 3 | 4 | 5 | @Article{lactase2004, 6 | Title = {{{G}enetic signatures of strong recent positive selection at the lactase gene}}, 7 | Author = {Bersaglieri, T. and Sabeti, P. C. and Patterson, N. and Vanderploeg, T. and Schaffner, S. F. and Drake, J. A. and Rhodes, M. and Reich, D. E. and Hirschhorn, J. N. }, 8 | Journal = {Am. J. Hum. Genet.}, 9 | Year = {2004}, 10 | 11 | Month = {Jun}, 12 | Number = {6}, 13 | Pages = {1111--1120}, 14 | Volume = {74} 15 | } 16 | 17 | @Article{Browning:2011ic, 18 | Title = {{Haplotype phasing: existing methods and new developments.}}, 19 | Author = {Browning, Sharon R and Browning, Brian L}, 20 | Journal = {Nature Reviews Genetics}, 21 | Year = {2011}, 22 | 23 | Month = oct, 24 | Number = {10}, 25 | Pages = {703--714}, 26 | Volume = {12} 27 | } 28 | 29 | @Article{Browning:2007ge, 30 | Title = {{Rapid and accurate haplotype phasing and missing-data inference for whole-genome association studies by use of localized haplotype clustering}}, 31 | Author = {Browning, Sharon R and Browning, Brian L}, 32 | Journal = {American journal of human genetics}, 33 | Year = {2007}, 34 | 35 | Month = nov, 36 | Number = {5}, 37 | Pages = {1084--1097}, 38 | Volume = {81} 39 | } 40 | 41 | @Article{Danecek:2011gz, 42 | Title = {{The variant call format and VCFtools.}}, 43 | Author = {Danecek, Petr and Auton, Adam and Abecasis, Goncalo and Albers, Cornelis A and Banks, Eric and DePristo, Mark A and Handsaker, Robert E and Lunter, Gerton and Marth, Gabor T and Sherry, Stephen T and McVean, Gilean and Durbin, Richard and {1000 Genomes Project Analysis Group}}, 44 | Journal = {Bioinformatics (Oxford, England)}, 45 | Year = {2011}, 46 | 47 | Month = aug, 48 | Number = {15}, 49 | Pages = {2156--2158}, 50 | Volume = {27} 51 | } 52 | 53 | @Article{Delaneau:2013hi, 54 | Title = {{Improved whole-chromosome phasing for disease and population genetic studies.}}, 55 | Author = {Delaneau, Olivier and Zagury, Jean-Fran{\c c}ois and Marchini, Jonathan}, 56 | Journal = {Nature Methods}, 57 | Year = {2013}, 58 | 59 | Month = jan, 60 | Number = {1}, 61 | Pages = {5--6}, 62 | Volume = {10} 63 | } 64 | 65 | @Article{fayandwush, 66 | Title = {{{H}itchhiking under positive {D}arwinian selection}}, 67 | Author = {Fay, J. C. and Wu, C. 
I.}, 68 | Journal = {Genetics}, 69 | Year = {2000}, 70 | 71 | Month = {Jul}, 72 | Number = {3}, 73 | Pages = {1405--1413}, 74 | Volume = {155} 75 | } 76 | 77 | @Article{Flicek:2012vg, 78 | Title = {{Ensembl 2012}}, 79 | Author = {Flicek, Paul and Amode, M Ridwan and Barrell, Daniel and Beal, Kathryn and Brent, Simon and Carvalho-Silva, Denise and Clapham, Peter and Coates, Guy and Fairley, Susan and Fitzgerald, Stephen and Gil, Laurent and Gordon, Leo and Hendrix, Maurice and Hourlier, Thibaut and Johnson, Nathan and K{\"a}h{\"a}ri, Andreas K and Keefe, Damian and Keenan, Stephen and Kinsella, Rhoda and Komorowska, Monika and Koscielny, Gautier and Kulesha, Eugene and Larsson, Pontus and Longden, Ian and McLaren, William and Muffato, Matthieu and Overduin, Bert and Pignatelli, Miguel and Pritchard, Bethan and Riat, Harpreet Singh and Ritchie, Graham R S and Ruffier, Magali and Schuster, Michael and Sobral, Daniel and Tang, Y Amy and Taylor, Kieron and Trevanion, Stephen and Vandrovcova, Jana and White, Simon and Wilson, Mark and Wilder, Steven P and Aken, Bronwen L and Birney, Ewan and Cunningham, Fiona and Dunham, Ian and Durbin, Richard and Fern{\'a}ndez-Su{\'a}rez, Xos{\'e} M and Harrow, Jennifer and Herrero, Javier and Hubbard, Tim J P and Parker, Anne and Proctor, Glenn and Spudich, Giulietta and Vogel, Jan and Yates, Andy and Zadissa, Amonida and Searle, Stephen M J}, 80 | Journal = {Nucleic acids Research}, 81 | Year = {2012} 82 | } 83 | 84 | @Article{Gautier:2012et, 85 | Title = {{rehh: an R package to detect footprints of selection in genome-wide SNP data from haplotype structure.}}, 86 | Author = {Gautier, Mathieu and Vitalis, Renaud}, 87 | Journal = {Bioinformatics (Oxford, England)}, 88 | Year = {2012}, 89 | 90 | Month = apr, 91 | Number = {8}, 92 | Pages = {1176--1177}, 93 | Volume = {28} 94 | } 95 | 96 | @Article{Hacia:1999fr, 97 | Title = {{Determination of ancestral alleles for human single-nucleotide polymorphisms using high-density oligonucleotide arrays.}}, 98 | Author = {Hacia, J G and Fan, J B and Ryder, O and Jin, L and Edgemon, K and Ghandour, G and Mayer, R A and Sun, B and Hsie, L and Robbins, C M and Brody, L C and Wang, D and Lander, E S and Lipshutz, R and Fodor, S P and Collins, F S}, 99 | Journal = {Nature Genetics}, 100 | Year = {1999}, 101 | 102 | Month = jun, 103 | Number = {2}, 104 | Pages = {164--167}, 105 | Volume = {22} 106 | } 107 | 108 | @Article{Holsinger:2009he, 109 | Title = {{Genetics in geographically structured populations: defining, estimating and interpreting F(ST).}}, 110 | Author = {Holsinger, Kent E and Weir, Bruce S}, 111 | Journal = {Nature Reviews Genetics}, 112 | Year = {2009}, 113 | 114 | Month = sep, 115 | Number = {9}, 116 | Pages = {639--650}, 117 | Volume = {10} 118 | } 119 | 120 | @Article{impute22009, 121 | Title = {{{A} flexible and accurate genotype imputation method for the next generation of genome-wide association studies}}, 122 | Author = {Howie, B. N. and Donnelly, P. and Marchini, J. 
}, 123 | Journal = {PLoS Genet.}, 124 | Year = {2009}, 125 | 126 | Month = {Jun}, 127 | Number = {6}, 128 | Pages = {e1000529}, 129 | Volume = {5} 130 | } 131 | 132 | @Article{InternationalHapMapConsortium:2005cu, 133 | Title = {{A haplotype map of the human genome.}}, 134 | Author = {{International HapMap Consortium}}, 135 | Journal = {Nature}, 136 | Year = {2005}, 137 | 138 | Month = oct, 139 | Number = {7063}, 140 | Pages = {1299--1320}, 141 | Volume = {437} 142 | } 143 | 144 | @Article{InternationalHapMapConsortium:2003gs, 145 | Title = {{The International HapMap Project.}}, 146 | Author = {{International HapMap Consortium}}, 147 | Journal = {Nature}, 148 | Year = {2003}, 149 | 150 | Month = dec, 151 | Number = {6968}, 152 | Pages = {789--796}, 153 | Volume = {426} 154 | } 155 | 156 | @Article{Lappalainen:2010iz, 157 | Title = {{Genomic landscape of positive natural selection in Northern European populations.}}, 158 | Author = {Lappalainen, Tuuli and Salmela, Elina and Andersen, Peter M and Dahlman-Wright, Karin and Sistonen, Pertti and Savontaus, Marja-Liisa and Schreiber, Stefan and Lahermo, P{\"a}ivi and Kere, Juha}, 159 | Journal = {European journal of human genetics : EJHG}, 160 | Year = {2010}, 161 | 162 | Month = apr, 163 | Number = {4}, 164 | Pages = {471--478}, 165 | Volume = {18} 166 | } 167 | 168 | @Article{Li:2011bi, 169 | Title = {{Tabix: fast retrieval of sequence features from generic TAB-delimited files.}}, 170 | Author = {Li, Heng}, 171 | Journal = {Bioinformatics (Oxford, England)}, 172 | Year = {2011}, 173 | 174 | Month = mar, 175 | Number = {5}, 176 | Pages = {718--719}, 177 | Volume = {27} 178 | } 179 | 180 | @Article{Paten:2008bp, 181 | Title = {{Genome-wide nucleotide-level mammalian ancestor reconstruction.}}, 182 | Author = {Paten, Benedict and Herrero, Javier and Fitzgerald, Stephen and Beal, Kathryn and Flicek, Paul and Holmes, Ian and Birney, Ewan}, 183 | Journal = {Genome Research}, 184 | Year = {2008}, 185 | 186 | Month = nov, 187 | Number = {11}, 188 | Pages = {1829--1843}, 189 | Volume = {18} 190 | } 191 | 192 | @Article{Purcell:2007dg, 193 | Title = {{PLINK: a tool set for whole-genome association and population-based linkage analyses.}}, 194 | Author = {Purcell, Shaun and Neale, Benjamin and Todd-Brown, Kathe and Thomas, Lori and Ferreira, Manuel A R and Bender, David and Maller, Julian and Sklar, Pamela and de Bakker, Paul I W and Daly, Mark J and Sham, Pak C}, 195 | Journal = {American journal of human genetics}, 196 | Year = {2007}, 197 | 198 | Month = sep, 199 | Number = {3}, 200 | Pages = {559--575}, 201 | Volume = {81} 202 | } 203 | 204 | @Article{RCoreTeam:wf, 205 | Title = {{R: A language and environment for statistical computing}}, 206 | Author = {{R Core Team}} 207 | } 208 | 209 | @Article{Sabeti:2002ge, 210 | Title = {{Detecting recent positive selection in the human genome from haplotype structure.}}, 211 | Author = {Sabeti, Pardis C and Reich, David E and Higgins, John M and Levine, Haninah Z P and Richter, Daniel J and Schaffner, Stephen F and Gabriel, Stacey B and Platko, Jill V and Patterson, Nick J and McDonald, Gavin J and Ackerman, Hans C and Campbell, Sarah J and Altshuler, David and Cooper, Richard and Kwiatkowski, Dominic and Ward, Ryk and Lander, Eric S}, 212 | Journal = {Nature}, 213 | Year = {2002}, 214 | 215 | Month = oct, 216 | Number = {6909}, 217 | Pages = {832--837}, 218 | Volume = {419} 219 | } 220 | 221 | @Article{Sabeti:2006ha, 222 | Title = {{Positive natural selection in the human lineage.}}, 223 | Author = {Sabeti, P 
C and Schaffner, S F and Fry, B and Lohmueller, J and Varilly, P and Shamovsky, O and Palma, A and Mikkelsen, T S and Altshuler, D and Lander, E S}, 224 | Journal = {Science (New York, NY)}, 225 | Year = {2006}, 226 | 227 | Month = jun, 228 | Number = {5780}, 229 | Pages = {1614--1620}, 230 | Volume = {312} 231 | } 232 | 233 | @Article{Sabeti:2007hg, 234 | Title = {{Genome-wide detection and characterization of positive selection in human populations.}}, 235 | Author = {Sabeti, Pardis C and Varilly, Patrick and Fry, Ben and Lohmueller, Jason and Hostetter, Elizabeth and Cotsapas, Chris and Xie, Xiaohui and Byrne, Elizabeth H and McCarroll, Steven A and Gaudet, Rachelle and Schaffner, Stephen F and Lander, Eric S and {International HapMap Consortium} and Frazer, Kelly A and Ballinger, Dennis G and Cox, David R and Hinds, David A and Stuve, Laura L and Gibbs, Richard A and Belmont, John W and Boudreau, Andrew and Hardenbol, Paul and Leal, Suzanne M and Pasternak, Shiran and Wheeler, David A and Willis, Thomas D and Yu, Fuli and Yang, Huanming and Zeng, Changqing and Gao, Yang and Hu, Haoran and Hu, Weitao and Li, Chaohua and Lin, Wei and Liu, Siqi and Pan, Hao and Tang, Xiaoli and Wang, Jian and Wang, Wei and Yu, Jun and Zhang, Bo and Zhang, Qingrun and Zhao, Hongbin and Zhao, Hui and Zhou, Jun and Gabriel, Stacey B and Barry, Rachel and Blumenstiel, Brendan and Camargo, Amy and Defelice, Matthew and Faggart, Maura and Goyette, Mary and Gupta, Supriya and Moore, Jamie and Nguyen, Huy and Onofrio, Robert C and Parkin, Melissa and Roy, Jessica and Stahl, Erich and Winchester, Ellen and Ziaugra, Liuda and Altshuler, David and Shen, Yan and Yao, Zhijian and Huang, Wei and Chu, Xun and He, Yungang and Jin, Li and Liu, Yangfan and Shen, Yayun and Sun, Weiwei and Wang, Haifeng and Wang, Yi and Wang, Ying and Xiong, Xiaoyan and Xu, Liang and Waye, Mary M Y and Tsui, Stephen K W and Xue, Hong and Wong, J Tze-Fei and Galver, Luana M and Fan, Jian-Bing and Gunderson, Kevin and Murray, Sarah S and Oliphant, Arnold R and Chee, Mark S and Montpetit, Alexandre and Chagnon, Fanny and Ferretti, Vincent and Leboeuf, Martin and Olivier, Jean-Fran{\c c}ois and Phillips, Michael S and Roumy, St{\'e}phanie and Sall{\'e}e, Cl{\'e}mentine and Verner, Andrei and Hudson, Thomas J and Kwok, Pui-Yan and Cai, Dongmei and Koboldt, Daniel C and Miller, Raymond D and Pawlikowska, Ludmila and Taillon-Miller, Patricia and Xiao, Ming and Tsui, Lap-Chee and Mak, William and Song, You Qiang and Tam, Paul K H and Nakamura, Yusuke and Kawaguchi, Takahisa and Kitamoto, Takuya and Morizono, Takashi and Nagashima, Atsushi and Ohnishi, Yozo and Sekine, Akihiro and Tanaka, Toshihiro and Tsunoda, Tatsuhiko and Deloukas, Panos and Bird, Christine P and Delgado, Marcos and Dermitzakis, Emmanouil T and Gwilliam, Rhian and Hunt, Sarah and Morrison, Jonathan and Powell, Don and Stranger, Barbara E and Whittaker, Pamela and Bentley, David R and Daly, Mark J and de Bakker, Paul I W and Barrett, Jeff and Chretien, Yves R and Maller, Julian and McCarroll, Steve and Patterson, Nick and Pe'er, Itsik and Price, Alkes and Purcell, Shaun and Richter, Daniel J and Sabeti, Pardis and Saxena, Richa and Schaffner, Stephen F and Sham, Pak C and Varilly, Patrick and Altshuler, David and Stein, Lincoln D and Krishnan, Lalitha and Smith, Albert Vernon and Tello-Ruiz, Marcela K and Thorisson, Gudmundur A and Chakravarti, Aravinda and Chen, Peter E and Cutler, David J and Kashuk, Carl S and Lin, Shin and Abecasis, Gon{\c c}alo R and Guan, Weihua and Li, Yun 
and Munro, Heather M and Qin, Zhaohui Steve and Thomas, Daryl J and McVean, Gilean and Auton, Adam and Bottolo, Leonardo and Cardin, Niall and Eyheramendy, Susana and Freeman, Colin and Marchini, Jonathan and Myers, Simon and Spencer, Chris and Stephens, Matthew and Donnelly, Peter and Cardon, Lon R and Clarke, Geraldine and Evans, David M and Morris, Andrew P and Weir, Bruce S and Tsunoda, Tatsuhiko and Johnson, Todd A and Mullikin, James C and Sherry, Stephen T and Feolo, Michael and Skol, Andrew and Zhang, Houcan and Zeng, Changqing and Zhao, Hui and Matsuda, Ichiro and Fukushima, Yoshimitsu and Macer, Darryl R and Suda, Eiko and Rotimi, Charles N and Adebamowo, Clement A and Ajayi, Ike and Aniagwu, Toyin and Marshall, Patricia A and Nkwodimmah, Chibuzor and Royal, Charmaine D M and Leppert, Mark F and Dixon, Missy and Peiffer, Andy and Qiu, Renzong and Kent, Alastair and Kato, Kazuto and Niikawa, Norio and Adewole, Isaac F and Knoppers, Bartha M and Foster, Morris W and Clayton, Ellen Wright and Watkin, Jessica and Gibbs, Richard A and Belmont, John W and Muzny, Donna and Nazareth, Lynne and Sodergren, Erica and Weinstock, George M and Wheeler, David A and Yakub, Imtaz and Gabriel, Stacey B and Onofrio, Robert C and Richter, Daniel J and Ziaugra, Liuda and Birren, Bruce W and Daly, Mark J and Altshuler, David and Wilson, Richard K and Fulton, Lucinda L and Rogers, Jane and Burton, John and Carter, Nigel P and Clee, Christopher M and Griffiths, Mark and Jones, Matthew C and McLay, Kirsten and Plumb, Robert W and Ross, Mark T and Sims, Sarah K and Willey, David L and Chen, Zhu and Han, Hua and Kang, Le and Godbout, Martin and Wallenburg, John...}, 236 | Journal = {Nature}, 237 | Year = {2007}, 238 | 239 | Month = oct, 240 | Number = {7164}, 241 | Pages = {913--918}, 242 | Volume = {449} 243 | } 244 | 245 | @Article{Tajima:1989un, 246 | Title = {{Statistical method for testing the neutral mutation hypothesis by DNA polymorphism.}}, 247 | Author = {Tajima, F}, 248 | Journal = {Genetics}, 249 | Year = {1989}, 250 | 251 | Month = nov, 252 | Number = {3}, 253 | Pages = {585--595}, 254 | Volume = {123} 255 | } 256 | 257 | @Article{Tang:2007cx, 258 | Title = {{A new approach for using genome scans to detect recent positive selection in the human genome.}}, 259 | Author = {Tang, Kun and Thornton, Kevin R and Stoneking, Mark}, 260 | Journal = {PLoS biology}, 261 | Year = {2007}, 262 | 263 | Month = jul, 264 | Number = {7}, 265 | Pages = {e171}, 266 | Volume = {5} 267 | } 268 | 269 | @Article{variscan2005, 270 | Title = {{{V}ari{S}can: {A}nalysis of evolutionary patterns from large-scale {D}{N}{A} sequence polymorphism data}}, 271 | Author = {Vilella, A. J. and Blanco-Garcia, A. and Hutter, S. 
and Rozas, J.}, 272 | Journal = {Bioinformatics}, 273 | Year = {2005}, 274 | 275 | Month = {Jun}, 276 | Number = {11}, 277 | Pages = {2791--2793}, 278 | Volume = {21} 279 | } 280 | 281 | @Article{Voight:2006go, 282 | Title = {{A map of recent positive selection in the human genome.}}, 283 | Author = {Voight, Benjamin F and Kudaravalli, Sridhar and Wen, Xiaoquan and Pritchard, Jonathan K}, 284 | Journal = {PLoS biology}, 285 | Year = {2006}, 286 | 287 | Month = mar, 288 | Number = {3}, 289 | Pages = {e72}, 290 | Volume = {4} 291 | } 292 | 293 | @Article{Weir:1984vn, 294 | Title = {{Estimating F-statistics for the analysis of population structure}}, 295 | Author = {Weir, B S and Cockerham, C C}, 296 | Journal = {Evolution}, 297 | Year = {1984} 298 | } 299 | 300 | @Article{Williams:2012hd, 301 | Title = {{Phasing of many thousands of genotyped samples.}}, 302 | Author = {Williams, Amy L and Patterson, Nick and Glessner, Joseph and Hakonarson, Hakon and Reich, David}, 303 | Journal = {American journal of human genetics}, 304 | Year = {2012}, 305 | 306 | Month = aug, 307 | Number = {2}, 308 | Pages = {238--251}, 309 | Volume = {91} 310 | } 311 | @article{Nievergelt:2004ci, 312 | author = {Nievergelt, Caroline M and Smith, Douglas W and Kohlenberg, J Bradley and Schork, Nicholas J}, 313 | title = {{Large-scale integration of human genetic and physical maps.}}, 314 | journal = {Genome Research}, 315 | year = {2004}, 316 | volume = {14}, 317 | number = {6}, 318 | pages = {1199--1205}, 319 | month = jun 320 | } 321 | 322 | 323 | -------------------------------------------------------------------------------- /docs/selection_pipeline.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MerrimanLab/selectionTools/761ace7cf43639b890c5071c200761ff8a6fb05f/docs/selection_pipeline.pdf -------------------------------------------------------------------------------- /extrascripts/.Rhistory: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MerrimanLab/selectionTools/761ace7cf43639b890c5071c200761ff8a6fb05f/extrascripts/.Rhistory -------------------------------------------------------------------------------- /extrascripts/check_haps.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # James Boocock 4 | # July 2013 5 | # University of Otago 6 | # 7 | NUMBER_PER_LINE=`awk '{print NF}' $1 | uniq | wc -l | cut -d " " -f 1` 8 | REPEAT_POSITIONS=`cat $1 | cut -d " " -f 3 | uniq -d` 9 | if [ "${NUMBER_PER_LINE}" == 1 ] && [ "${REPEAT_POSITIONS}" == "" ]; then 10 | echo "Correct Data File" 11 | else 12 | echo "wc per line" 13 | echo ${NUMBER_PER_LINE} 14 | echo "repeat positions in haps" 15 | echo ${REPEAT_POSITIONS} 16 | fi 17 | -------------------------------------------------------------------------------- /extrascripts/extract_samples_from_haps.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import os 3 | import sys 4 | 5 | from optparse import OptionParser 6 | 7 | def index_ids(sample_file,id,basename): 8 | ids = [] 9 | comma_ids = id.split(',') 10 | samples = [] 11 | samples_line = [] 12 | new_samples = open(basename+'.sample','w') 13 | with open(sample_file,'r') as f: 14 | i = 0 15 | for line in f: 16 | if (i >= 2): 17 | samples.append(line.split()[0]) 18 | samples_line.append(line) 19 | else: 20 | new_samples.write(line) 21 | i = i + 1 22 | for id in
comma_ids: 23 | ids.append(samples.index(id)) 24 | ids.sort() 25 | for id in ids: 26 | new_samples.write(samples_line[id]) 27 | new_samples.close() 28 | return ids 29 | 30 | def haps_keep_samples(haps_file,indexed_ids,basename): 31 | new_haps = open(basename+'.haps','w') 32 | with (open(haps_file,'r')) as f: 33 | for line in f: 34 | lineSplit = line.split() 35 | new_haps.write(' '.join((lineSplit[:5]))+' ') 36 | i = 0 37 | i_person = 0 38 | person = indexed_ids[i_person] 39 | for item in lineSplit[5:]: 40 | if((person == i/2) and (i % 2 == 0)): 41 | new_haps.write(item+' ') 42 | elif((person <= i/2)): 43 | new_haps.write(item+' ') 44 | i_person = i_person + 1 45 | if(i_person == len(indexed_ids)): 46 | break 47 | person = indexed_ids[i_person] 48 | i = i + 1 49 | new_haps.write('\n') 50 | new_haps.close() 51 | 52 | def main(): 53 | parser=OptionParser() 54 | parser.add_option('-i','--haps',dest='haps',help="Haplotype File (.haps)") 55 | parser.add_option('-s','--sample',dest="sample",help="Sample File (.sample)") 56 | parser.add_option('-c','--keep',dest="keep" ,help="Comma seperated list of IDs to keep") 57 | parser.add_option('-o','--basename',dest="basename",help="Output haps file") 58 | (options,args) = parser.parse_args() 59 | indexed_ids = index_ids(options.sample,options.keep,options.basename) 60 | haps_keep_samples(options.haps,indexed_ids,options.basename) 61 | 62 | if __name__=="__main__":main() 63 | -------------------------------------------------------------------------------- /extrascripts/fay_and_wus.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from selection_pipeline import * 3 | import logging 4 | 5 | logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.INFO) 6 | parser = argparse.ArgumentParser(description='Generate fay and wus data using variscan') 7 | parser.add_argument('--fay-window-width',dest='fayandWuWindowWidth',help="Sliding window width for Fay and Wu's") 8 | parser.add_argument('--fay-window-jump',dest='fayandWuWindowJump',help="Window JJump for Fay and Wus(if fay-window-width = fay-window-jump non-overlapping windows are used") 9 | parser.add_argument('-i',dest="haps_file",help="Haps input file") 10 | parser.add_argument('-s',dest="sample_input_file",help="Sample input file") 11 | parser.add_argument('--config-file',dest='config_file') 12 | parser.add_argument('-o',dest="output_prefix",help="Prefix for output files") 13 | parser.add_argument('-c',dest="chromosome",help="Chromosome to perform fay and wus on") 14 | args=parser.parse_args() 15 | if(args.fayandWuWindowWidth is None): 16 | args.fayandWuWindowWidth = str(5000) 17 | if(args.fayandWuWindowJump is None): 18 | args.fayandWuWindowJump = str(5000); 19 | if(args.config_file is None): 20 | args.config_file = 'defaults.cfg' 21 | if(args.output_prefix is None): 22 | args.output_prefix = 'FAY' 23 | 24 | # Uses the pipeline to run just fay and wus. 
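# A hypothetical invocation of this wrapper (the file names below are
# illustrative, not part of the repository; the options are those defined
# by the argparse block above) might look like:
#
#   python fay_and_wus.py -i CEU_chr2.haps -s CEU_chr2.sample -c 2 \
#       --config-file defaults.cfg -o FAY \
#       --fay-window-width 5000 --fay-window-jump 5000
#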
25 | config = parse_config(args) 26 | s=StandardRun(args,config,full_run=False) 27 | haps = s.run_aa_annotate_haps(args.haps_file) 28 | new_sample_file = s.fix_sample_file(args.sample_input_file) 29 | haps2 = s.prepare_haps_for_variscan(haps,new_sample_file) 30 | fayandwus= s.variscan_fayandwus(haps2) 31 | logging.info("Fay and Wus Done") 32 | logging.info(fayandwus) 33 | logging.info("Done :)") 34 | 35 | 36 | -------------------------------------------------------------------------------- /extrascripts/haps_remove_indels.R: -------------------------------------------------------------------------------- 1 | # 2 | # Murray Cadzow 3 | # July 2013 4 | # University of Otago 5 | 6 | args<-commandArgs(TRUE) 7 | # read in haps file from shapeit 8 | hapsPop=read.table(file=args[1]) 9 | write.table(hapsPop, file=paste(args[1], ".mod", sep=""), quote=FALSE, row.names=FALSE, col.names=FALSE) 10 | -------------------------------------------------------------------------------- /extrascripts/haps_to_tped.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # Script for converting from haps to tped 4 | # 5 | # haps to tped. 6 | # 7 | import os 8 | import logging 9 | import shutil 10 | from optparse import OptionParser 11 | 12 | logging.basicConfig(format="%(asctime)s %(message)s") 13 | 14 | log = logging.getLogger(__name__) 15 | log.setLevel(level=logging.INFO) 16 | 17 | def make_allelic_lambda(ref,alt): 18 | return lambda x: ref if 0 == int(x) else alt 19 | 20 | def hap_to_tped(basename,output_basename,chromosome): 21 | log.debug("Started converting from HAPS to TPED") 22 | tped_out = open(output_basename + '.tped','w') 23 | # Copy the first six columns of the sample file to the tfam 24 | # default centimorgan value. 25 | i = 0 26 | tfam_out = open(output_basename + '.tfam','w') 27 | with open(basename + '.sample','r') as f: 28 | for line in f: 29 | if i > 1: 30 | newline=line.split() 31 | newline=newline[0:6] 32 | tfam_out.write(' '.join(newline) +'\n') 33 | i = i + 1 34 | tfam_out.close() 35 | centimorgans = '0' 36 | with open(basename + '.haps','r') as f: 37 | log.debug("Reading HAPS file") 38 | for line in f: 39 | split_line = line.split() 40 | rsid = split_line[0] 41 | pos = split_line[2] 42 | ref = split_line[3] 43 | alt = split_line[4] 44 | allele_lambda = make_allelic_lambda(ref,alt) 45 | alleles = map(allele_lambda,split_line[5:]) 46 | tped_out.write(chromosome + ' ' + rsid + ' ' + centimorgans + ' ' + pos + ' ' +' '.join(alleles) + '\n') 47 | tped_out.close() 48 | log.debug("Finished converting from HAPS to TPED") 49 | 50 | def main(): 51 | parser= OptionParser() 52 | parser.add_option("-i",'--file',dest="haps_files",help="Base name of haps / sample pair") 53 | parser.add_option('-v','--verbose',action="store_true",dest="verbose",help="Verbosity of the logging outputs") 54 | parser.add_option('-o','--output',dest="output",help="Output file basename (existing .tped/.tfam files with this basename are overwritten)") 55 | parser.add_option('-c','--chromosome',dest="chromosome",help="Chromosome haps file originates from") 56 | (options,args) = parser.parse_args() 57 | if(options.verbose == None): 58 | log.setLevel(logging.ERROR) 59 | else: 60 | log.setLevel(logging.DEBUG) 61 | assert (options.haps_files) is not None, "A haps file base name is required" 62 | assert options.output is not None, "Output base name needs to be specified" 63 | hap_to_tped(options.haps_files,options.output,options.chromosome) 64 | 65 | if __name__=="__main__":main() 66 |
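A minimal sketch of the recoding hap_to_tped performs, run on a made-up record (the rsid, position, chromosome and alleles below are invented for illustration): each .haps row is "rsid rsid pos REF ALT" followed by two 0/1 alleles per sample, and the matching TPED row is "chrom rsid cM pos" followed by the same alleles recoded to bases.

# Toy haps record -> TPED row (all values invented for illustration)
haps_line = "rs123 rs123 10583 A G 0 1 1 0"
fields = haps_line.split()
rsid, pos, ref, alt = fields[0], fields[2], fields[3], fields[4]
bases = [ref if g == "0" else alt for g in fields[5:]]
print(" ".join(["2", rsid, "0", pos] + bases))  # -> 2 rs123 0 10583 A G G A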
-------------------------------------------------------------------------------- /extrascripts/kaks.py: -------------------------------------------------------------------------------- 1 | import vcf 2 | import sys 3 | 4 | # HOW TO RUN: 5 | # use snpEff to annoate VCF file 6 | # SLOW: 7 | # python kaks.py annotated_vcf.vcf > kaks.txt 8 | # FASTER: 9 | # grep '^#\|missense_variant\|synonymous_variant' annotated_vcf.vcf > mis_syn.txt 10 | # python kaks.py mis_syn.txt > kaks.txt 11 | 12 | file = sys.argv[1] 13 | 14 | vcf_reader = vcf.Reader(open(file, 'r')) 15 | gene_name_index = vcf_reader.infos['ANN'][3].split("|").index(" Gene_Name ") 16 | geneid_index =vcf_reader.infos['ANN'][3].split("|").index(" Gene_ID ") 17 | annotation_index = vcf_reader.infos['ANN'][3].split("|").index(" Annotation ") 18 | feature_id_index = vcf_reader.infos['ANN'][3].split("|").index(" Feature_ID ") 19 | 20 | #key = geneid, value = [ka, ks, gene_name] 21 | genes_ka_ks = {} 22 | 23 | def add_gene(record): 24 | curr_gene = str(record.INFO['ANN']).split('|')[geneid_index] 25 | for r in record.INFO['ANN']: 26 | gene = str(r).split('|')[geneid_index] 27 | #print gene 28 | #print str(r).split('|')[annotation_index] 29 | if gene in genes_ka_ks: 30 | if str(r).split('|')[annotation_index] == "synonymous_variant" : 31 | genes_ka_ks[gene] = [genes_ka_ks[gene][0], genes_ka_ks[gene][1] + record.num_het + 2 * record.num_hom_alt, 32 | genes_ka_ks[gene][2]] 33 | 34 | if str(r).split('|')[annotation_index] == "missense_variant" : 35 | genes_ka_ks[gene] = [genes_ka_ks[gene][0] + record.num_het + 2 * record.num_hom_alt, genes_ka_ks[gene][1], 36 | genes_ka_ks[gene][2]] 37 | 38 | else: 39 | if str(r).split('|')[annotation_index] == "synonymous_variant" : 40 | genes_ka_ks[gene] = [0, record.num_het + 2 * record.num_hom_alt, 41 | str(r).split('|')[gene_name_index]] 42 | 43 | if str(r).split('|')[annotation_index] == "missense_variant" : 44 | genes_ka_ks[gene] = [record.num_het + 2 * record.num_hom_alt, 0, 45 | str(r).split('|')[gene_name_index]] 46 | 47 | 48 | print "GeneID\tGeneName\tka\tks\tka_div_ks_plus1" 49 | for record in vcf_reader: 50 | add_gene(record) 51 | 52 | for gene in genes_ka_ks: 53 | print( gene +"\t"+ genes_ka_ks[gene][2] +"\t"+ str(genes_ka_ks[gene][0]) 54 | +"\t"+ str(genes_ka_ks[gene][1]) +"\t"+ str(genes_ka_ks[gene][0]/float(genes_ka_ks[gene][1] +1)) ) 55 | 56 | 57 | 58 | 59 | -------------------------------------------------------------------------------- /extrascripts/merge_haps.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # Script for concatenating haps files. 4 | # 5 | # @author James Boocock 6 | # @date 09/08/2013 7 | # 8 | 9 | from optparse import OptionParser 10 | import logging 11 | 12 | logging.basicConfig(format="%(asctime)s %(message)s") 13 | log = logging.getLogger(__name__) 14 | log.setLevel(level=logging.INFO) 15 | 16 | #If there are any repeat ids between the two columns use the first one# 17 | # 18 | # Haps have no header line. 19 | # file looks like. 20 | # RSID RSID POS REF ALT HAPLOTYPES.... 
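# An illustrative (made-up) record with two diploid samples would be:
#
#   rs123 rs123 10583 A G 0 1 1 0
#
# merge_haps below walks both haps files in position order: records at the
# same position are joined (the second file's sample columns, everything
# after column 5, are appended), while a position present in only one file
# is skipped.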
21 | 22 | def merge_haps(one_haps,two_haps,output_file): 23 | merged_haps = open(output_file + '.haps','w') 24 | merged_sample = open(output_file + '.sample','w') 25 | with open(one_haps + '.sample') as one_f: 26 | with open(two_haps + '.sample') as two_f: 27 | for line in one_f: 28 | merged_sample.write(line) 29 | i = 0 30 | for line in two_f: 31 | if ( i >= 2): 32 | merged_sample.write(line) 33 | i = i + 1 34 | log.debug("Finished creating merged sample file for " + one_haps +" " + two_haps) 35 | with open(one_haps+'.haps','r') as one_f: 36 | with open(two_haps+'.haps','r') as two_f: 37 | one_first_line=one_f.readline() 38 | two_first_line=two_f.readline() 39 | while(one_first_line != '' and two_first_line != ''): 40 | one_split = one_first_line.strip().split() 41 | two_split = two_first_line.strip().split() 42 | pos_one = one_split[2] 43 | pos_two = two_split[2] 44 | if(int(pos_one) == int(pos_two)): 45 | merged_haps.write(' '.join(one_split) + ' ' + ' '.join(two_split[5:]) + '\n') 46 | one_first_line=one_f.readline() 47 | two_first_line=two_f.readline() 48 | elif (int(pos_one) > int(pos_two)): 49 | two_first_line=two_f.readline() 50 | else: 51 | one_first_line=one_f.readline() 52 | log.debug("Finished merging haps files for " + one_haps+ " " + two_haps) 53 | merged_haps.close() 54 | return output_file 55 | 56 | 57 | 58 | def main(): 59 | parser = OptionParser() 60 | parser.add_option("-i",'--file',dest="haps_files",action="append",help="Base names of haps / sample pairs; at least two files are required") 61 | parser.add_option('-v','--verbose',action="store_true",dest="verbose",help="Verbosity of the logging outputs") 62 | parser.add_option('-d','--delete-missing-snps',action="store_true",dest="remove",help="Remove the snps that do not occur in both files") 63 | parser.add_option('-o','--output',dest="output",help="Output file basename (existing files with this basename are overwritten, so clean up the directory after running)") 64 | (options,args) = parser.parse_args() 65 | if(options.verbose == None): 66 | log.setLevel(logging.ERROR) 67 | else: 68 | log.setLevel(logging.DEBUG) 69 | assert len(options.haps_files) > 1, "At least two haps file base names are required" 70 | assert options.output is not None, "Output base name needs to be specified" 71 | haps_files = options.haps_files 72 | first=haps_files[0] 73 | log.debug(haps_files) 74 | for i in range(1,len(haps_files)): 75 | second=haps_files[i] 76 | first = merge_haps(first,second,options.output) 77 | 78 | 79 | if __name__=="__main__":main() 80 | 81 | -------------------------------------------------------------------------------- /extrascripts/pipeline_test.cfg: -------------------------------------------------------------------------------- 1 | # 2 | # Defaults config file for VCF process 3 | # 4 | # If the executables are on your path 5 | # just the executable name is required. 6 | # 7 | # ?
is the willcard flag for the prefix options 8 | 9 | 10 | 11 | [system] 12 | cores_avaliable = 8 13 | # Library settings do not change, the library folder are appended to the path when runnig the program# 14 | [environment] 15 | LD_LIBRARY_PATH=/home/murray.cadzow/uoo00008/MerrimanSelectionPipeline/lib 16 | PERL5LIB=/home/murray.cadzow/uoo00008/MerrimanSelectionPipeline/lib/perl5 17 | [selection_pipeline] 18 | selection_pipeline_executable = /home/murray.cadzow/.local/bin/selection_pipeline 19 | [vcftools] 20 | vcf_tools_executable = /home/murray.cadzow/uoo00008/MerrimanSelectionPipeline/bin/vcftools 21 | vcf_subset_executable = /home/murray.cadzow/uoo00008/MerrimanSelectionPipeline/bin/vcf-subset 22 | vcf_merge_executable = /home/murray.cadzow/uoo00008/MerrimanSelectionPipeline/bin/vcf-merge 23 | vcf_concat_executable = /home/murray.cadzow/uoo00008/MerrimanSelectionPipeline/bin/vcf-concat 24 | extra_args= 25 | [genetic_map] 26 | genetic_map_dir= /home/murray.cadzow/uoo00008/MerrimanSelectionPipeline/referencefiles/genetic_maps 27 | genetic_map_prefix=genetic_map_chr?_combined_b37.txt 28 | [shapeit] 29 | shapeit_executable= /home/murray.cadzow/uoo00008/MerrimanSelectionPipeline/bin/shapeit 30 | extra_args = 31 | [impute2] 32 | impute_executable = /home/murray.cadzow/uoo00008/MerrimanSelectionPipeline/bin/impute2 33 | impute_map_dir= /home/murray.cadzow/uoo00008/MerrimanSelectionPipeline/referencefiles/impute_ref 34 | impute_reference_dir= /home/murray.cadzow/uoo00008/MerrimanSelectionPipeline/referencefiles/impute_ref 35 | impute_map_prefix=genetic_map_chr?_combined_b37.txt 36 | impute_reference_prefix=ALL_1000G_phase1integrated_v3_chr?_impute 37 | extra_args = 38 | [plink] 39 | plink_executable =/home/murray.cadzow/uoo00008/MerrimanSelectionPipeline/bin/plink 40 | extra_args = 41 | [Rscript] 42 | rscript_executable = Rscript 43 | indel_filter = /home/murray.cadzow/uoo00008/MerrimanSelectionPipeline/corescripts/haps_indel_and_maf_filter.R 44 | generate_rsb = /home/murray.cadzow/uoo00008/MerrimanSelectionPipeline/corescripts/generate_rsb.R 45 | extra_args= 46 | [haps_scripts] 47 | haps_to_hapmap_script= haps_to_hapmap 48 | haps_filter_script = haps_filters 49 | haps_interpolate_script = haps_interpolate 50 | [ancestral_allele] 51 | split_by_chromosome = True 52 | # not used unless split_by_chromosome is set to False 53 | ancestral_fasta_header_regex = 54 | # not used unless split_by_chromosome is set to False 55 | ancestral_fasta_file = 56 | ancestral_allele_script= ancestral_annotation 57 | 58 | ancestral_fasta_dir=/home/murray.cadzow/uoo00008/MerrimanSelectionPipeline/referencefiles/ancestral_ref 59 | ancestral_prefix=human_ancestor_?.fa 60 | [qctool] 61 | qctool_executable=/home/murray.cadzow/uoo00008/MerrimanSelectionPipeline/bin/qctool 62 | 63 | [multicore_ihh] 64 | multicore_ihh = /home/murray.cadzow/uoo00008/MerrimanSelectionPipeline/corescripts/multicore_iHH.R 65 | [variscan] 66 | variscan_executable = /home/murray.cadzow/uoo00008/MerrimanSelectionPipeline/bin/variscan 67 | [java] 68 | java_executable = /usr/bin/java 69 | [beagle] 70 | beagle_jar = /home/murray.cadzow/uoo00008/MerrimanSelectionPipeline/bin/beagle.jar 71 | vm_size = 4g 72 | 73 | 74 | -------------------------------------------------------------------------------- /extrascripts/selection_pipeline.sl: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH -J OpenMP_JOB 3 | #SBATCH -A # Project Account 4 | #SBATCH --time=23:00:00 # Walltime 5 | #SBATCH 
--mem-per-cpu=8196 # memory/cpu (in MB) 6 | #SBATCH --cpus-per-task=8 # 8 OpenMP Threads 7 | #SBATCH -C sb # sb=Sandybridge,wm=Westmere 8 | 9 | 10 | 11 | 12 | module load Python/2.7.4 13 | module load R/3.0.3-goolf-1.5.14 14 | 15 | srun python ~/.local/bin/multipop_selection_pipeline -p CEU_ids.txt -p YRI_ids.txt -i CEU_YRI_lactase.vcf --config-file ~/SelectionPipelineTestData/pipeline_test.cfg --a "--phased-vcf" -c 2 16 | -------------------------------------------------------------------------------- /extrascripts/single_pop_vcf_process.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Murray Cadzow 4 | # July 2013 5 | # University of Otago 6 | 7 | #$1 = pop name 8 | #$2 = vcf file of population 9 | #$3 = chr 10 | 11 | shapeit_dir="${HOME}/Murray/src/shapeit.v2.r644.linux.x86_64" 12 | 13 | 14 | vcftools --gzvcf $2 --plink --out ${1}_chr${3} 15 | plink --noweb --file ${1}_chr${3} --geno 0.99 --out ${1}_chr${3}_missing_removed --recode 16 | shapeit.v2.r644.linux.x86_64 --input-ped ${1}_chr${3}_missing_removed.ped ${1}_chr${3}_missing_removed.map -M "${shapeit_dir}/genetic_maps_b37/genetic_map_chr${3}_combined_b37.txt" --output-max ${1}_chr${3}.phased --thread 10 17 | -------------------------------------------------------------------------------- /extrascripts/voight_filters.py: -------------------------------------------------------------------------------- 1 | # 2 | # @description:Penalizes iHS 3 | # 4 | # 5 | # @author James Boocock 6 | # 7 | 8 | -------------------------------------------------------------------------------- /galaxy/multi_population.xml: -------------------------------------------------------------------------------- 1 | 2 | Multipopulation Selection Pipeline 3 | 4 | multi_population 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | -------------------------------------------------------------------------------- /galaxy/selection_pipeline.xml: -------------------------------------------------------------------------------- 1 | 2 | Generate selection signatures from a single population VCF 3 | 4 | selection_pipeline -l $selection_log -i $in_vcf 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 30 | 31 | 32 | 33 | 34 | 35 | <\inputs> 36 | 37 | 38 | 39 | 40 | 41 | 42 | <\command> 43 | 44 | To use the selection pipeline with galaxy assumes you have setup up your default config file correctly 45 | 46 | 47 | 48 | 49 | 50 | 51 | -------------------------------------------------------------------------------- /install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Install all the programs required for the install of the program. 4 | # 5 | 6 | ORIG_DIR=`pwd` 7 | 8 | # Argument to build each function 9 | # $1 program name 10 | # $2 folder 11 | change_folder(){ 12 | cd $1 13 | } 14 | 15 | check_success(){ 16 | "$@" 17 | status=$? 
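# "$@" re-runs the full command line handed to check_success, and $? holds
# that command's exit status, so the install aborts on the first failure.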
18 | if [ $status -ne 0 ]; then 19 | echo "error with $1" 20 | exit 1 21 | fi 22 | return $status 23 | } 24 | 25 | orig_dir(){ 26 | cd $ORIG_DIR 27 | } 28 | 29 | mkdir -p bin 30 | mkdir -p lib/perl5 31 | echo "Installing Dependencies" 32 | echo "Install Zlib" 33 | tar xzf src/zlib-1.2.8.tar.gz 34 | prefix_zlib=${ORIG_DIR} 35 | change_folder zlib-1.2.8 36 | echo $PWD 37 | check_success ./configure --prefix=${ORIG_DIR} 38 | check_success make install 39 | orig_dir 40 | rm -Rf zlib-1.2.8 41 | 42 | echo "Installing VCF tools" 43 | tar xzf src/vcftools.tar.gz 44 | LIB_VAR="-lz -L${ORIG_DIR}/lib -I${ORIG_DIR}/include" 45 | change_folder vcftools_0.1.11 46 | check_success make LIB="${LIB_VAR}" 47 | orig_dir 48 | cp vcftools_0.1.11/bin/* bin/ 49 | cp vcftools_0.1.11/perl/*pm lib/perl5/ 50 | rm -Rf vcftools_0.1.11 51 | 52 | 53 | echo "Installing VCFlib" 54 | unzip src/vcflib.zip 55 | cp vcflib-master/bin/vcfsnps bin/ 56 | chmod 755 bin/vcfsnps 57 | rm -Rf vcflib-master 58 | 59 | 60 | 61 | 62 | echo "Installing QCTool" 63 | if [ `uname` = "Darwin" ]; then 64 | tar xzf src/qctool_v1.4-osx.tgz 65 | mv qctool_v1.4-osx/qctool bin/ 66 | rm -Rf qctool_v1.4-osx 67 | else 68 | tar xzf src/qctool_v1.4-linux-x86_64.tgz 69 | mv qctool_v1.4-linux-x86_64/qctool bin/ 70 | rm -Rf qctool_v1.4-linux-x86_64 71 | fi 72 | 73 | echo "Installing Shapeit" 74 | if [ `uname` = "Darwin" ]; then 75 | wget https://mathgen.stats.ox.ac.uk/genetics_software/shapeit/shapeit.v2.r790.MacOSX.tgz 76 | tar xzf shapeit.v2.r790.MacOSX.tgz 77 | rm shapeit.v2.r790.MacOSX.tgz 78 | mv shapeit bin/ 79 | rm -Rf shapeit.v2.r790.MacOSX.tgz 80 | else 81 | echo `uname` 82 | wget http://mathgen.stats.ox.ac.uk/genetics_software/shapeit/old_versions/shapeit.v2.r790.Ubuntu_12.04.4.static.tar.gz 83 | #wget https://mathgen.stats.ox.ac.uk/genetics_software/shapeit/shapeit.v2.r790.Ubuntu_12.04.4.static.tar.gz 84 | tar xzf shapeit.v2.r790.Ubuntu_12.04.4.static.tar.gz 85 | rm shapeit.v2.r790.Ubuntu_12.04.4.static.tar.gz 86 | mv shapeit bin/ 87 | rm -Rf shapeit.v2.r790.Ubuntu_12.04.4.static.tar.gz 88 | fi 89 | rm -Rf example 90 | rm -f LICENCE 91 | echo "Installing PLINK" 92 | if [ `uname` = "Darwin" ]; then 93 | unzip src/plink-1.07-mac-intel.zip 94 | cp plink-1.07-mac-intel/plink bin/ 95 | rm -Rf plink-1.07-mac-intel 96 | else 97 | unzip src/plink-1.07-x86_64.zip 98 | cp plink-1.07-x86_64/plink bin/ 99 | rm -Rf plink-1.07-x86_64 100 | fi 101 | echo "Installing Impute2" 102 | if [ `uname` = "Darwin" ]; then 103 | tar xzf src/impute_v2.3.1_MacOSX_Intel.tgz 104 | mv impute_v2.3.1_MacOSX_Intel/impute2 bin/ 105 | rm -Rf impute_v2.3.1_MacOSX_Intel 106 | else 107 | tar xzf src/impute_v2.3.1_x86_64_static.tgz 108 | mv impute_v2.3.1_x86_64_static/impute2 bin/ 109 | rm -Rf impute_v2.3.1_x86_64_static/ 110 | fi 111 | chmod 755 bin/impute2 112 | echo "Installing Tabix" 113 | tar -xjf src/tabix.tar.bz2 114 | change_folder tabix-0.2.6 115 | check_success make LIBPATH="${LIB_VAR}" 116 | orig_dir 117 | cp tabix-0.2.6/bgzip bin/ 118 | cp tabix-0.2.6/tabix bin/ 119 | rm -Rf tabix-0.2.6 120 | echo "Installing Variscan" 121 | if [ `uname` = "Darwin" ]; then 122 | echo "Cannot install on OSX" 123 | else 124 | tar -xzf src/variscan-2.0.3.tar.gz 125 | (cd variscan-2.0.3/src/ && rm *o) 126 | change_folder variscan-2.0.3 127 | check_success bash autogen.sh && check_success make 128 | orig_dir 129 | mv variscan-2.0.3/src/variscan bin/ 130 | rm -Rf variscan-2.0.3 131 | fi 132 | echo "Installing Beagle" 133 | cp src/beagle.jar bin/ 134 | echo "Installing getopt" 135 | check_success Rscript
src/R_dependencies.R 'getopt' 136 | echo "Installing old rehh" 137 | check_success Rscript src/R_dependencies.R 'rehh' 138 | echo "Install rehh" 139 | check_success R CMD INSTALL src/rehh_1.11.tar.gz 140 | echo "Updating submodules" 141 | git submodule init 142 | git submodule update 143 | echo "Generating Default Config File" 144 | # Because PWD contains slashes (/) need to use # as substitution 145 | sed 's#!SELECT_PIPELINE!#'"${PWD}"'#g' src/defaults.cfg > defaults.cfg 146 | 147 | 148 | if [[ $EUID -eq 0 ]]; then 149 | echo "Installing PyFasta" 150 | change_folder pyfasta 151 | check_success python setup.py install 152 | orig_dir 153 | echo "Installing PyVCF" 154 | change_folder PyVCF 155 | check_success python setup.py install 156 | orig_dir 157 | echo "Installing selection_pipeline" 158 | check_success python setup.py install 159 | else 160 | echo "Installing PyFasta" 161 | change_folder pyfasta 162 | check_success python setup.py install --user 163 | orig_dir 164 | change_folder PyVCF 165 | echo "Install PyVCF" 166 | check_success python setup.py install --user 167 | orig_dir 168 | echo "Installing selection_pipeline" 169 | check_success python setup.py install --user 170 | fi 171 | -------------------------------------------------------------------------------- /referencefiles/ancestral_ref/ANCESTRAL.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MerrimanLab/selectionTools/761ace7cf43639b890c5071c200761ff8a6fb05f/referencefiles/ancestral_ref/ANCESTRAL.rst -------------------------------------------------------------------------------- /referencefiles/genetic_maps/SHAPEIT.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MerrimanLab/selectionTools/761ace7cf43639b890c5071c200761ff8a6fb05f/referencefiles/genetic_maps/SHAPEIT.rst -------------------------------------------------------------------------------- /referencefiles/human_ref/human_files.txt: -------------------------------------------------------------------------------- 1 | Human reference fasta file stored here 2 | -------------------------------------------------------------------------------- /referencefiles/impute_ref/IMPUTE_FILES.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MerrimanLab/selectionTools/761ace7cf43639b890c5071c200761ff8a6fb05f/referencefiles/impute_ref/IMPUTE_FILES.rst -------------------------------------------------------------------------------- /selection_pipeline/.gitignore: -------------------------------------------------------------------------------- 1 | __py_cache__/ 2 | *.pyc 3 | -------------------------------------------------------------------------------- /selection_pipeline/__init__.py: -------------------------------------------------------------------------------- 1 | # Merriman selection pipeline library 2 | # 3 | # Author James Boocock and Murray Cadzow 4 | # 5 | """ 6 | A selection pipeline for running 7 | various natural selection tools 8 | on next generation sequencing data 9 | 10 | """ 11 | 12 | import aa_annotate 13 | import standard_run 14 | import standard_run_utilities 15 | import environment 16 | import selection_pipeline 17 | import multipipeline 18 | import run_pipeline 19 | import haps_to_hapmap 20 | import haps_filters 21 | import _version 22 | -------------------------------------------------------------------------------- /selection_pipeline/_version.py: 
-------------------------------------------------------------------------------- 1 | __version__ = "1.1.1" 2 | -------------------------------------------------------------------------------- /selection_pipeline/aa_annotate.py: -------------------------------------------------------------------------------- 1 | # 2 | # James Boocock and Murray Cadzow 3 | # July 2013 / Sept 2014 4 | # University of Otago 5 | # 6 | import re 7 | import vcf 8 | from optparse import OptionParser 9 | from pyfasta import Fasta 10 | # 11 | # Command Line Arguments 12 | # 13 | # --haps phased haps 14 | # --aa ancestral allele annotation 15 | # --chr Chromosome 16 | # --output Output file 17 | # --format |High|Low| 18 | # optional reference chromosome fasta can be used also 19 | # 20 | # 10000 genomesfasta Ancestral allele file can be downloaded from 21 | # ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/phase1/analysis_results/supporting/ancestral_alignments/human_ancestor_GRCh37_e59.tar.bz2 22 | # 23 | # Takes a FASTA File Containing the Ancestral Alleles. 24 | # Using the same format as the 1000 genomes ancestral alleles file. 25 | # ATCG high confidence call 26 | # actg low confidence call 27 | # N : failure 28 | # - : the exant species contains a insertion at this position. 29 | # . : no coverage at this alignment. 30 | # annotate haps file. 31 | #Annotate a phased_vcf file that is a file from 1000 genomes 32 | #with the ancestral alleles 33 | # Could potentially take a population list of 34 | #ids also but for now just creates it from the 35 | # vcf file it is given 36 | 37 | 38 | def aa_seq(options): 39 | """ Gets the ancestral sequence from a Fasta file 40 | 41 | """ 42 | f = Fasta(options.ancestralfasta) 43 | keyz = (f.keys()) 44 | match = '' 45 | if (options.single_chromosome): 46 | # Single chromosome fasta should only have one sequence. 47 | # that sequence should be the sequence of interest. 48 | keyz = list(keyz) 49 | key = keyz[0] 50 | else: 51 | get_chromosome_from_header = options.header 52 | get_chromosome_from_header = \ 53 | get_chromosome_from_header.replace('?', options.chromosome) 54 | for key in keyz: 55 | if(re.match(get_chromosome_from_header, key) is not None): 56 | match = key 57 | if(match is ''): 58 | raise Exception("No match possible is something wrong with the" 59 | " regex specified to the program as" 60 | "--header-regex") 61 | aaSeq = f[key] 62 | return(aaSeq) 63 | 64 | 65 | def write_sample_file(options, vcf_reader): 66 | if(options.sample_file is not None): 67 | sample_file = open(options.sample_file, 'w') 68 | sample_header = ("ID_1 ID_2 missing father mother sex plink_pheno" 69 | "\n0 0 0 D D D B\n") 70 | sample_file.write(sample_header) 71 | for sample in vcf_reader.samples: 72 | sample_file.write(sample + ' ' + sample + ' 0 0 0 0 -9 ' + '\n') 73 | sample_file.close() 74 | 75 | 76 | def get_haps_line(options, record): 77 | if(record.ID is not None): 78 | line = (record.ID + ' ' + record.ID + ' ' + str(record.POS) + 79 | ' ' + str(record.REF) + ' ' + str(record.ALT[0])) 80 | else: 81 | id = options.chromosome + ":" + str(record.POS) 82 | line = (id + ' ' + id + ' ' + str(record.POS) + ' ' + 83 | str(record.REF) + ' ' + str(record.ALT[0])) 84 | 85 | for samples in record.samples: 86 | gt = samples['GT'] 87 | # Need to skip any snps that have any missing phase data to 88 | # increase certainty of our results. 
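# (A phased genotype in a VCF looks like '0|1'; an unphased '0/1' call
# falls through to the else branch below and is written out as missing.)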
89 | # If every snp will indeed be phased 90 | 91 | if(gt is not None and '|' in gt): 92 | gtSplit = gt.split('|') 93 | if(gtSplit[0] == '.'): 94 | gtSplit[0] = options.missing_data_code 95 | if(gtSplit[1] == '.'): 96 | gtSplit[1] = options.missing_data_code 97 | line = line + ' ' + gtSplit[0] + ' ' + gtSplit[1] 98 | else: 99 | line = line + ' '+options.missing_data_code + ' '+ options.missing_data_code 100 | return line 101 | 102 | 103 | def write_hap_line(options,output_line,output=None): 104 | """ Writes a haps line out 105 | 106 | Writes the line to the output file, or to stdout when no output file was given. 107 | """ 108 | if(output_line is not None): 109 | if (options.output is not None): 110 | output.write(output_line + "\n") 111 | else: 112 | print(output_line) 113 | 114 | def close_files(options,output=None): 115 | """ Close output haps file. 116 | 117 | """ 118 | if(options.output is not None): 119 | output.close() 120 | 121 | 122 | def vcf_to_haps(options): 123 | """ Converts a VCF file to haps format 124 | 125 | """ 126 | if(options.output is not None): 127 | output = open(options.output, 'w') 128 | else: 129 | output = None 130 | vcf_reader = vcf.Reader(filename=options.vcf_file) 131 | write_sample_file(options, vcf_reader) 132 | for record in vcf_reader: 133 | write_hap_line(options, get_haps_line(options, record), output) 134 | close_files(options, output) 135 | 136 | 137 | def annotate_vcf(options): 138 | if(options.output is not None): 139 | output = open(options.output, 'w') 140 | if(options.output_af is not None): 141 | output_af = open(options.output_af, 'a') 142 | else: 143 | output = None 144 | vcf_reader = vcf.Reader(filename=options.vcf_file) 145 | write_sample_file(options, vcf_reader) 146 | aaSeq = aa_seq(options) 147 | for record in vcf_reader: 148 | line = get_haps_line(options, record) 149 | if(line is not None): 150 | output_line = aa_check(aaSeq[record.POS-1], record.REF, 151 | str(record.ALT[0]), options.format, line) 152 | if(options.output_af is not None and output_line is not None): 153 | output_af.write(str(record.POS) + "\t" + str(record.REF) + "\t" + str(record.ALT[0]) + "\t" + str(aaSeq[record.POS-1]) + "\t" + allele_freq(line.split()[5:],output_line.split()[5:]) + "\n") 154 | write_hap_line(options, output_line, output) 155 | close_files(options, output) 156 | if(options.output_af is not None): 157 | output_af.close() 158 | 159 | 160 | def aa_check(realAA, ref, alt, format, line): 161 | if(re.match('[ACTGactg]', realAA)): 162 | if(realAA.islower() and format == "upper"): 163 | return None 164 | else: 165 | if(realAA.upper() == ref.upper()): 166 | return line.strip() 167 | elif(realAA.upper() == alt.upper()): 168 | newLine = line.split() 169 | newLine[3] = alt 170 | newLine[4] = ref 171 | for i in range(5, len(newLine)): 172 | if((newLine[i]) == "1"): 173 | newLine[i] = '0' 174 | elif((newLine[i]) == "0"): 175 | newLine[i] = '1' 176 | # Do nothing otherwise, leave it as missing 177 | else: 178 | newLine = line.split() 179 | newLine[3] = realAA 180 | newLine[4] = ref 181 | for i in range(5, len(newLine)): 182 | newLine[i] = '1' 183 | return ' '.join(newLine) 184 | else: 185 | return None 186 | 187 | def allele_freq(ref, ances): 188 | ref = ' '.join(ref) 189 | ances = ' '.join(ances) 190 | p1 = ref.count('0') 191 | q1 = ref.count('1') 192 | p2 = ances.count('0') 193 | q2 = ances.count('1') 194 | maf = q1 / float(p1 + q1) 195 | daf = q2 / float(p2 + q2) 196 | # print(str(p1) +"\t" + str(q1) + "\t" +str(p2) + "\t" +str(q2)) 197 | return str(maf) + "\t" + str(daf) 198 | 199 | 200 | def annotate_haps(options): 201 | aaSeq = aa_seq(options) 202 | output = None 203 |
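# aa_check (defined above) polarises each record against the ancestral base:
# a line passes through unchanged when the ancestral allele matches REF, has
# REF/ALT swapped and every 0/1 genotype flipped when it matches ALT, becomes
# all-derived ('1') when it matches neither allele, and is dropped entirely
# when the ancestral call is low confidence (lowercase) under --format upper.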
if(options.output is not None): 204 | output = open(options.output, 'w') 205 | if(options.output_af is not None): 206 | output_af = open(options.output_af, 'a') 207 | with open(options.haps, 'r') as haps: 208 | for line in haps: 209 | lineSplit = line.split() 210 | pos = int(lineSplit[2]) 211 | ref = lineSplit[3] 212 | alt = lineSplit[4] 213 | tempSeq = aaSeq[pos-1] 214 | outputLine = aa_check(tempSeq, ref, alt, options.format, line) 215 | if(outputLine is not None): 216 | if(options.output is not None): 217 | output.write(outputLine + "\n") 218 | if(options.output_af is not None): 219 | output_af.write(str(pos) + "\t" + ref + "\t" + alt + "\t" + tempSeq + "\t" + allele_freq(line.split()[5:],outputLine.split()[5:]) + "\n") 220 | else: 221 | print(outputLine) 222 | if(options.output is not None): 223 | output.close() 224 | if(options.output_af is not None): 225 | output_af.close() 226 | 227 | 228 | def main(): 229 | parser = OptionParser() 230 | parser.add_option('-i', '--haps', dest='haps', 231 | help="Haplotype File (.haps)") 232 | parser.add_option('-a', '--aa', dest='ancestralfasta', 233 | help="Ancestral Allele Fasta file") 234 | parser.add_option('--ref-fasta', action='store_true', 235 | dest='ref_fasta', 236 | help=('Use reference fasta which does not split' 237 | 'by chromosome')) 238 | parser.add_option('-c', '--chr', dest="chromosome", help="Chromosome") 239 | parser.add_option('-o', '--output', dest="output", 240 | help="Output File (optional)") 241 | parser.add_option('-f', '--format', dest="format", 242 | help=("Format use upper case or upper " 243 | "and lower case bases")) 244 | parser.add_option('-v', '--phased-vcf', dest="vcf_file", 245 | help="Phased VCF file (.vcf)") 246 | parser.add_option('-s', '--sample-file', dest="sample_file", 247 | help="Output sample_file") 248 | parser.add_option('--header-regex', dest="header", 249 | help=("To determine which chromosome to extract " 250 | "is a regex with a ? for the chromosome number")) 251 | parser.add_option('--single-chromosome', action='store_true', 252 | dest='single_chromosome') 253 | parser.add_option('--no-annotation', action="store_true", 254 | dest="no_annotation", 255 | help=("No annotation of VCF file just" 256 | " convert to haps")) 257 | parser.add_option('--missing-code',dest='missing_data_code', 258 | help='Missing code for output file') 259 | parser.add_option('--af',dest='output_af',help="filename for file with minor and derived allele frequencies") 260 | (options, args) = parser.parse_args() 261 | if(options.missing_data_code is None): 262 | options.missing_data_code = '.' 263 | if(options.format is None): 264 | options.format = 'lower' 265 | # Will annotate the haps file with exactly what is required 266 | # More options could be added later covering a wider range of file types 267 | # andy maybe different input ancestral alleles. 268 | assert options.haps is not None or options.vcf_file is not None,\ 269 | "Haps or VCF input file required to run ancestral annotation." 
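# Worked example for allele_freq above, using invented genotype columns:
# REF-coded alleles '0 1 1 0' give MAF = 2/4 = 0.5, and the ancestral-
# polarised alleles '0 0 1 0' give DAF = 1/4 = 0.25. (Note the 'MAF'
# column is the ALT-allele frequency as computed, not folded to the
# minor allele.)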
270 | if(options.output_af is None): 271 | options.output_af = options.output.split('.haps')[0] + ".af" 272 | if(options.output_af is not None): 273 | f = open(options.output_af, 'w') 274 | f.write("Pos\tRef\tAlt\tAnc\tMAF\tDAF\n") 275 | f.close() 276 | if(options.haps is not None): 277 | annotate_haps(options) 278 | elif(options.vcf_file is not None): 279 | if(options.no_annotation is None): 280 | annotate_vcf(options) 281 | else: 282 | vcf_to_haps(options) 283 | if(options.single_chromosome is None): 284 | options.single_chromosome = False 285 | assert options.header is None, \ 286 | "Option header_regex required if the fasta file is"\ 287 | "split by chromosome" 288 | 289 | if __name__ == "__main__": 290 | main() 291 | -------------------------------------------------------------------------------- /selection_pipeline/environment.py: -------------------------------------------------------------------------------- 1 | # 2 | # Extend environment extend the environment variables 3 | # so that the program runs on any linux machine. 4 | # 5 | # 6 | # 7 | import os 8 | 9 | def set_environment(environment_variables): 10 | for environ, value in environment_variables.items(): 11 | environ = environ.upper() 12 | if(environ in os.environ): 13 | os.environ[environ] += (":" + value) 14 | else: 15 | os.environ[environ] = value 16 | 17 | 18 | 19 | -------------------------------------------------------------------------------- /selection_pipeline/haps_filters.py: -------------------------------------------------------------------------------- 1 | # Implement's haps filters for HWE MISSINGNESS 2 | # and MAF 3 | 4 | import argparse 5 | try: 6 | from scipy import stats 7 | except ImportError as inst: 8 | print ("Could not import scipy install to use haps filters") 9 | raise inst 10 | 11 | 12 | def hardy_weinberg_asymptotic(obs_het, obs_a , obs_b): 13 | obs_het = float(obs_het) 14 | obs_a = float(obs_a) 15 | obs_b = float(obs_b) 16 | sample_size = obs_het + obs_a + obs_b 17 | p = (((2 * obs_a) + obs_het) / ( 2 * (sample_size))) 18 | q = 1 - p 19 | exp_a = p * p * sample_size 20 | exp_b = q * q * sample_size 21 | exp_ab = 2 * p * q * sample_size 22 | 23 | # get chiSquare values 24 | if(exp_a == 0): 25 | chi_a = 0 26 | else: 27 | chi_a = ((obs_a - exp_a) * 2.0) / exp_a 28 | if(exp_b == 0): 29 | chi_b = 0 30 | else: 31 | chi_b = ((obs_b - exp_b) * 2.0) / exp_b 32 | if(exp_ab == 0): 33 | chi_ab = 0 34 | else: 35 | chi_ab = ((obs_het - exp_ab) * 2.0 ) / exp_ab 36 | chi_sq_total = chi_a + chi_b + chi_ab 37 | return stats.chisqprob(chi_sq_total, 1) 38 | 39 | def hardy_weinberg_exact(obs_het, obs_a, obs_b): 40 | raise Exception("Not Implemented") 41 | 42 | def filter_haps_file(args): 43 | with open(args.haps,'r') as input_haps: 44 | with open(args.output,'w') as output_haps: 45 | for snp_data in input_haps: 46 | line = snp_data.split()[5:] 47 | total = float(len(line)) 48 | if(((line.count('?')/total)) > args.missing): 49 | continue 50 | question_marks = line.count('?') 51 | # TriAllellic Message 52 | if( (question_marks + line.count('0') + line.count('1'))!= int(total)): 53 | continue 54 | p = line.count('0') 55 | q = line.count('1') 56 | major = p 57 | minor = q 58 | if(q > major): 59 | major = q 60 | minor = p 61 | total = float(p + q) 62 | if(minor/total < args.maf): 63 | continue 64 | zipa = line[0::2] 65 | zipb = line[1::2] 66 | countAA = 0 67 | countAB = 0 68 | countBB = 0 69 | for a ,b in zip(zipa , zipb): 70 | if ( a == '0' and b == '0'): 71 | countAA += 1 72 | elif(a == '1' and b == '1'): 73 | countBB 
+= 1 74 | elif(a == '1' and b == '0'): 75 | countAB += 1 76 | elif(a == '0' and b == '1'): 77 | countAB += 1 78 | countAA = float(countAA) 79 | countAB = float(countAB) 80 | countBB = float(countBB) 81 | if(args.chi_square): 82 | hwe_pvalue = \ 83 | hardy_weinberg_asymptotic(countAB, countAA, countBB) 84 | else: 85 | hwe_pvalue = \ 86 | hardy_weinberg_exact(countAB,countAA,countBB) 87 | if(hwe_pvalue <= args.hwe): 88 | continue 89 | output_haps.write(snp_data) 90 | 91 | 92 | 93 | 94 | 95 | 96 | def main(): 97 | parser = argparse.ArgumentParser(description='Preform filtering on a haps file') 98 | parser.add_argument('--hwe',dest='hwe') 99 | parser.add_argument('--haps',dest='haps') 100 | parser.add_argument('--output',dest='output') 101 | parser.add_argument('--maf',dest='maf') 102 | parser.add_argument('--missing',dest='missing') 103 | parser.add_argument('--chi-sq',action='store_true', 104 | dest='chi_square',default=False, 105 | help="Use a chi-square test instead of an exact test") 106 | args = parser.parse_args() 107 | if(args.hwe is None): 108 | args.hwe = 0.0 109 | else: 110 | args.hwe = float(args.hwe) 111 | if(args.maf is None): 112 | args.maf = 0.0 113 | else: 114 | args.maf = float(args.maf) 115 | if(args.missing is None): 116 | args.missing = 1 117 | else: 118 | args.missing = float(args.missing) 119 | assert args.haps is not None, \ 120 | "Haps file is required to run haps filters" 121 | assert args.output is not None, \ 122 | "Output file name is required to haps filter" 123 | filter_haps_file(args) 124 | 125 | 126 | -------------------------------------------------------------------------------- /selection_pipeline/haps_interpolate.py: -------------------------------------------------------------------------------- 1 | """ 2 | Annotates a haps file by replacing all the SNPS with the genetic map positions 3 | Assumes the genetic map is in the shapeit format. 
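Two map layouts are recognised (the rows below are illustrative, not taken
from a real map): a shapeit map is space separated with three columns,
'position recomb_rate cM' (e.g. '72765 0.1244 0.0000'), while a plink map
is tab separated with a leading chromosome column, 'chr pos rate cM'.
SNPs whose physical positions fall between map points are assigned a
genetic position by linear interpolation.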
4 | """ 5 | 6 | import argparse 7 | from collections import OrderedDict 8 | import tempfile 9 | import decimal 10 | from decimal import * 11 | 12 | def get_genetic_map_format(genetic_map): 13 | """ Method to get the format of the genetic map files 14 | used to work out whether we are dealing with shapeit or 15 | plink geneticmap 16 | """ 17 | with open(genetic_map) as gmap: 18 | gmap_line = gmap.readline() 19 | # If it's 1 it must be a shapeit genetic map I think 20 | # Because a plink genetic map is tab seperated 21 | if(len(gmap_line.split('\t'))==1): 22 | return "shapeit" 23 | else: 24 | return "plink" 25 | 26 | def plink_to_shapeit_gmap(genetic_map_output,new_genetic_map): 27 | """ Convert a plink genetic map to shapeit, as this normalises our 28 | genetic map for use further in the pipeline, 29 | 30 | e.g Enables shapeit to use a plink geneticmap 31 | User might prefer beagle and not use shapeit but 32 | we use a plink genetic map for rehh 33 | """ 34 | with open(genetic_map_output) as gmap: 35 | for i, line in enumerate(gmap): 36 | gmap_line = line.split('\t') 37 | if ( i == 0 ): 38 | continue 39 | else: 40 | position=int(gmap_line[1]) 41 | recomb_rate=float(gmap_line[2]) 42 | centi_morgans=float(gmap_line[3]) 43 | new_genetic_map.write(str(position) + ' ' + str(recomb_rate) + ' ' + str(centi_morgans) + '\n') 44 | 45 | return new_genetic_map 46 | 47 | def get_shapeit_genetic_map(genetic_map,temp_genetic_map): 48 | """ Returns either the original file name 49 | if the file is already in shapeit format 50 | """ 51 | file_format=get_genetic_map_format(genetic_map) 52 | if not isinstance(temp_genetic_map, file): 53 | temp_genetic_map_file = open(temp_genetic_map,'w') 54 | else: 55 | temp_genetic_map_file = temp_genetic_map 56 | if(file_format =='shapeit'): 57 | with open(genetic_map) as gmap: 58 | for i, line in enumerate(gmap): 59 | if (i == 0): 60 | continue 61 | else: 62 | temp_genetic_map_file.write(line) 63 | else: 64 | plink_to_shapeit_gmap(genetic_map,temp_genetic_map_file) 65 | return(temp_genetic_map) 66 | 67 | def load_genetic_map(genetic_map): 68 | gmap_pos = OrderedDict() 69 | genetic_map.seek(0) 70 | for i, line in enumerate(genetic_map): 71 | #shape it format is line seperated 72 | shapeit_line = line.split() 73 | gmap_pos[float(shapeit_line[0])]=Decimal(shapeit_line[2]) 74 | return gmap_pos 75 | 76 | def interpolate(start_position,end_position,x): 77 | start0 = Decimal(str(start_position[0])) 78 | end0 = Decimal(str(end_position[0])) 79 | slope = (end_position[1] - start_position[1])/(end0 - start0) 80 | intercept=start_position[1] 81 | interp = intercept + ((x-start0) * slope) 82 | return interp 83 | 84 | def replace_positions(haps,output,gmap_dict,physical_out): 85 | interpolate_list = [] 86 | out = open(output,'w') 87 | phys_out = None 88 | if (physical_out is not None): 89 | phys_out = open(physical_out,'w') 90 | with open(haps) as f: 91 | gmap_dict = gmap_dict.items() 92 | haps_line = f.readline() 93 | dictionary_index = 1 94 | # Requires the gmap_dictionary is atleast 95 | start_position=[0,Decimal("0.0")] 96 | end_position=gmap_dict[0] 97 | while(haps_line and dictionary_index < len(gmap_dict)): 98 | temp_line = haps_line.split() 99 | temp_pos = int(temp_line[2]) 100 | if(temp_pos >= start_position[0] and temp_pos <= end_position[0]): 101 | t_inter = interpolate(start_position,end_position,temp_pos) 102 | temp_line[2] = str(t_inter) 103 | temp_inter = ' '.join(temp_line) 104 | out.write(temp_inter + '\n') 105 | if (phys_out != None): 106 | 
106 |                     phys_out.write(temp_line[0] + " " + str(temp_pos) + '\n')  # include the marker name with the physical position
107 |                 haps_line = f.readline()
108 |             else:
109 |                 start_position = end_position
110 |                 end_position = gmap_dict[dictionary_index]
111 |                 dictionary_index += 1
112 |     out.close()
113 |     return interpolate_list
114 | 
115 | def main():
116 |     parser = argparse.ArgumentParser(description="Annotate a haps file with genetic map positions using linear interpolation")
117 |     parser.add_argument('--haps', dest='haps', help="Input haps file")
118 |     parser.add_argument('--output', dest='output', help="Output haps file")
119 |     parser.add_argument('--genetic-map', dest="gmap", help="Genetic map (shapeit or plink format)")
120 |     parser.add_argument('--physical-position-output', dest='physical_positions', help="Optional file to write the original physical positions to")
121 |     args = parser.parse_args()
122 |     assert args.haps is not None, \
123 |         "Haps file needs to be specified"
124 |     assert args.output is not None, \
125 |         "Output file needs to be specified"
126 |     assert args.gmap is not None, \
127 |         "Genetic map needs to be specified"
128 |     haps = args.haps
129 |     output = args.output
130 |     genetic_map = args.gmap
131 |     physical_out = args.physical_positions
132 |     temp_file = tempfile.TemporaryFile()
133 |     genetic_map = get_shapeit_genetic_map(genetic_map, temp_file)
134 |     gmap_dict = load_genetic_map(genetic_map)
135 |     replace_positions(haps, output, gmap_dict, physical_out=physical_out)
136 | 
137 | if __name__ == "__main__":
138 |     main()
139 | 
--------------------------------------------------------------------------------
/selection_pipeline/haps_to_hapmap.py:
--------------------------------------------------------------------------------
1 | import re
2 | 
3 | from optparse import OptionParser
4 | from pyfasta import Fasta
5 | #
6 | # $1 haps format to be converted to hapmap format
7 | # $2 sample format file
8 | #
9 | 
10 | 
11 | # 11 columns of precursor
12 | 
13 | # regex for determining we have a valid SNP #
14 | 
15 | 
16 | def aa_seq(options):
17 |     f = Fasta(options.ancestralfasta)
18 |     keyz = (f.keys())
19 |     match = ''
20 |     if (options.single_chromosome):
21 |         # A single-chromosome fasta should only have one sequence;
22 |         # that sequence should be the sequence of interest.
23 |         keyz = list(keyz)
24 |         match = keyz[0]
25 |     else:
26 |         get_chromosome_from_header = options.header
27 |         get_chromosome_from_header = get_chromosome_from_header.replace('?', options.chromosome)
28 |         for key in keyz:
29 |             if(re.match(get_chromosome_from_header, key) is not None):
30 |                 match = key
31 |         if(match == ''):
32 |             raise Exception("No match possible; something may be wrong with "
33 |                             "the regex specified to the program as "
34 |                             "--header-regex")
35 |     aaSeq = f[match]
36 |     return(aaSeq)
37 | 
38 | 
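# Example (matching the repository's own test fasta in
# selection_pipeline/tests/ancestor.fa, whose header is >ANCESTOR_2_FA):
# calling the script with -c 2 --header-regex 'ANCESTOR_?_FA' makes aa_seq
# substitute the chromosome into the regex and select the 'ANCESTOR_2_FA'
# sequence from the fasta.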
39 | def main():
40 |     header = ("rs# alleles chrom pos strand assembly# center protLSID "
41 |               "assayLSID panelLSID QCcode")
42 |     parser = OptionParser()
43 |     parser.add_option('-i', dest="haps_file", help="Haps Input File")
44 |     parser.add_option('-s', dest="sample_file", help="Sample Input File")
45 |     parser.add_option('-c', dest="chromosome", help="Chromosome")
46 |     parser.add_option('-o', dest="output_file_name", help="Output File name")
47 |     parser.add_option('-a', dest="ancestralfasta", help="Outgroup fasta file")
48 |     parser.add_option('--id', dest="ancestral_individual_id",
49 |                       help="Name of the ancestral Individual")
50 |     parser.add_option('--header-regex', dest='header',
51 |                       help=("Regex used to determine which chromosome to "
52 |                             "extract; use ? as a placeholder for the chromosome number"))
53 |     parser.add_option('--single-chromosome', action="store_true",
54 |                       dest="single_chromosome")
55 |     (options, args) = parser.parse_args()
56 |     options.chromosome = str(options.chromosome)
57 |     if(options.single_chromosome is None):
58 |         options.single_chromosome = False
59 |         assert options.header is not None, \
60 |             "Option --header-regex is required if the fasta file is " \
61 |             "split by chromosome"
62 |     # Set default ancestral ID
63 |     if (options.ancestral_individual_id is None):
64 |         options.ancestral_individual_id = 'ANCESTOR'
65 |     sample_ids = []
66 |     output = open(options.output_file_name, 'w')
67 |     failed_snps = open('failed_snps.txt', 'w')
68 |     aaSeq = aa_seq(options)
69 |     with open(options.sample_file, 'r') as f:
70 |         for i, line in enumerate(f):
71 |             if(i > 1):
72 |                 line = line.split()
73 |                 sample_ids.append(line[1])
74 |     # Construct the header line.
75 |     sample_ids.append(options.ancestral_individual_id)
76 |     header = header + ' ' + ' '.join(sample_ids) + '\n'
77 |     output.write(header)
78 |     with open(options.haps_file, 'r') as f:
79 |         for line in f:
80 |             output_line = ''
81 |             line = line.split()
82 |             rsid = line[1]
83 |             pos = line[2]
84 |             ancestral_allele = aaSeq[int(pos)-1]
85 |             if not (re.match('[ACTGactg]', ancestral_allele)):
86 |                 failed_snps.write(rsid + ' ' + pos + '\n')
87 |             else:
88 |                 a1 = line[3]
89 |                 a2 = line[4]
90 |                 ancestral_genotypes = ancestral_allele.upper() + \
91 |                     ancestral_allele.upper()
92 |                 def check_alleles(x):
93 |                     try:
94 |                         x = int(x)
95 |                         if(x == 0):
96 |                             return a1
97 |                         else:
98 |                             return a2
99 |                     except ValueError:
100 |                         return "0"
101 |                 change_alleles = map(check_alleles, line[5:])
102 |                 change_alleles = list(change_alleles)
103 |                 zipa = change_alleles[0::2]
104 |                 zipb = change_alleles[1::2]
105 |                 change_alleles = zip(zipa, zipb)
106 |                 change_alleles = [''.join(row) for row in change_alleles]
107 |                 output_line = rsid + ' ' + a1 + '/' + a2 + \
108 |                     ' ' + options.chromosome + ' ' + pos
109 |                 output_line = output_line + ' + -9 -9 -9 -9 -9 -9 ' + \
110 |                     ' '.join(change_alleles) + ' ' + ancestral_genotypes
111 |                 output.write(output_line + '\n')
112 |     output.close()
113 |     failed_snps.close()
114 | 
115 | if __name__ == "__main__":
116 |     main()
117 | 
--------------------------------------------------------------------------------
/selection_pipeline/multipipeline.py:
--------------------------------------------------------------------------------
1 | # Multipopulation script: calls the selection
2 | 
3 | # pipeline for each population that we need
4 | # to process, then runs a script to produce each of the cross population
5 | # statistics once
6 | # it has all finished.
7 | # institution: University of Otago
8 | # author: James Boocock
9 | #
10 | #
11 | # requires that the selection pipeline
12 | # is installed.
13 | #
14 | 
15 | try:
16 |     from collections import OrderedDict
17 | except ImportError:
18 |     from ordereddict import OrderedDict
19 | import math
20 | import sys
21 | import os
22 | from optparse import OptionParser
23 | import ConfigParser
24 | import logging
25 | from .environment import set_environment
26 | from .standard_run_utilities import *
27 | logger = logging.getLogger(__name__)
28 | 
29 | SUBPROCESS_FAILED_EXIT = 10
30 | CANNOT_FIND_EXECUTABLE = 20
31 | CANNOT_FIND_CONFIG = 30
32 | 
33 | # generate RSB after we have calculated ihs
34 | 
35 | 
36 | def rsb(config, options, populations):
37 |     """ Runs RSB script
38 | 
39 |     Reads config options and populations and generates
40 |     rsb statistics for each population pair.
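    For example (illustrative population names and chromosome), the command
    assembled below for a CEU/YRI pair resembles:

        Rscript generate_rsb.R --pop1 CEU --pop1file CEU/results/CEUchr22.ihh \
            --pop2 YRI --pop2file YRI/results/YRIchr22.ihh --chr 22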
41 |     """
42 |     rscript = config['Rscript']['rscript_executable']
43 |     generate_rsb = config['Rscript']['generate_rsb']
44 |     directory = 'rsb'
45 |     if not os.path.exists(directory):
46 |         os.mkdir(directory)
47 |     pops = list(populations.keys())
48 |     orig_dir = os.getcwd()
49 |     os.chdir(directory)
50 |     for i in range(0, len(pops)-1):
51 |         cmd = []
52 |         pop1 = pops[i]
53 |         cmd.append(rscript)
54 |         pop1_ihh_file = os.path.join(orig_dir, pop1, 'results',
55 |                                      pop1 + 'chr' + options.chromosome +
56 |                                      '.ihh')
57 |         cmd.extend([generate_rsb, '--pop1', pop1, '--pop1file', pop1_ihh_file])
58 |         for j in range(i+1, len(pops)):
59 |             tmp_cmd = []
60 |             tmp_cmd.extend(cmd)
61 |             pop2 = pops[j]
62 |             pop2_ihh_file = os.path.join(orig_dir, pop2, 'results', pop2 +
63 |                                          'chr' + options.chromosome + '.ihh')
64 |             tmp_cmd.extend(['--pop2', pop2, '--pop2file', pop2_ihh_file])
65 |             tmp_cmd.extend(['--chr', options.chromosome])
66 |             run_subprocess(tmp_cmd, 'rsb_generation')
67 |     os.chdir(orig_dir)
68 | 
69 | 
70 | def get_populations(populations):
71 |     """ Returns the populations and ids
72 | 
73 |     Imports data from the input population files (first line: the population
74 |     name; remaining lines: sample ids) and creates a dictionary of the populations.
75 |     """
76 |     pops = {}
77 |     for pop in populations:
78 |         with open(pop, 'r') as f:
79 |             for i, line in enumerate(f):
80 |                 line = line.strip()
81 |                 if (i == 0):
82 |                     pop_name = line
83 |                     pops[pop_name] = []
84 |                 else:
85 |                     pops[pop_name].append(line)
86 |     return pops
87 | 
88 | 
89 | def parse_config(options):
90 |     """ Parse config file
91 | 
92 |     Read the config file and save the results in a dictionary
93 |     """
94 |     config = ConfigParser.ConfigParser()
95 |     config.read(options.config_file)
96 |     config_parsed = {}
97 |     logger.debug(config.sections())
98 |     for section in config.sections():
99 |         logger.debug(section)
100 |         opts = config.options(section)
101 |         config_parsed[section] = {}
102 |         for op in opts:
103 |             logger.debug(op)
104 |             try:
105 |                 config_parsed[section][op] = config.get(section, op)
106 |             except Exception:
107 |                 logger.info("exception on {0}".format(op))
108 |                 config_parsed[section][op] = None
109 |     return config_parsed
110 | 
111 | 
112 | def check_executables_and_scripts_exist(options, config):
113 |     """ Check the executables actually exist where specified.
114 | 
115 |     Uses the config file to determine whether the executables
116 |     are at the locations expected.
117 |     """
118 |     if(which(config['vcftools']['vcf_subset_executable'],
119 |              'vcf-subset') is None):
120 |         return False
121 |     if(which(config['selection_pipeline']['selection_pipeline_executable'],
122 |              'selection_pipeline') is None):
123 |         return False
124 |     return True
125 | 
126 | 
127 | def subset_vcf(vcf_input, config, populations):
128 |     """ Run subset VCF to break the VCF file into populations
129 | 
130 |     Uses the VCF input file and the population dictionary
131 |     to run vcf-subset, parallelising the work across
132 |     the configured number of cores.
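    Worked example (illustrative numbers): with 2 populations and 8 cores
    available, threads_per_job = ceil(8 / 2.0) = 4, so a VCF with 1000
    non-header lines is split at lines [250, 500, 750, 1000]; vcf-subset
    then runs per population on each chunk before vcf-concat rejoins the
    per-population pieces.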
133 |     """
134 |     vcf_outputs = []
135 |     vcf_dict = {}
136 |     no_pops = len(populations)
137 |     threads = int(config['system']['cores_avaliable'])
138 |     threads_per_job = int(math.ceil(threads / float(no_pops)))
139 |     line_count = get_vcf_line_count(vcf_input)
140 |     split_length = line_count // threads_per_job
141 |     split_positions = [split_length * i for i in range(1, threads_per_job+1)]
142 |     remainder_length = line_count % threads_per_job
143 |     split_positions[len(split_positions) - 1] += remainder_length
144 |     vcf_inputs = split_vcf(vcf_input, split_positions)
145 |     cmds = []
146 |     stdouts = []
147 |     for i, vcf in enumerate(vcf_inputs):
148 |         for key, value in populations.items():
149 |             cmd = []
150 |             output_file = key + str(i) + '.vcf'
151 |             try:
152 |                 vcf_dict[key].append(output_file)
153 |             except KeyError:
154 |                 vcf_dict[key] = [output_file]
155 |             comma_list_ids = ','.join(value)
156 |             vcf_subset_executable = config['vcftools']['vcf_subset_executable']
157 |             cmd.append(vcf_subset_executable)
158 |             cmd.extend(['-f', '-c', comma_list_ids, vcf])
159 |             stdouts.append(output_file)
160 |             cmds.append(list(cmd))
161 |     queue_jobs(cmds, 'vcf-subset',
162 |                config['system']['cores_avaliable'], stdouts=stdouts)
163 |     cmds = []
164 |     for key, value in vcf_dict.items():
165 |         # generate the commands for vcf-concat for each output file generated
166 |         cmd = []
167 |         output_file = key + '.vcf'
168 |         # Append to vcf_outputs
169 |         vcf_outputs.append(output_file)
170 |         if(len(value) == 1):
171 |             os.rename(value[0], output_file)
172 |         else:
173 |             vcf_concat_executable = config['vcftools']['vcf_concat_executable']
174 |             cmd.append(vcf_concat_executable)
175 |             cmd.extend(value)
176 |             cmds.append(list(cmd))
177 |     if(len(cmds) != 0):
178 |         queue_jobs(cmds, 'vcf-concat', config['system']['cores_avaliable'],
179 |                    stdouts=vcf_outputs)
180 |     # call the queue jobs to run vcf-subset
181 |     # return the population concatenated vcf file
182 |     return vcf_outputs
183 | 
184 | 
185 | def run_selection_pipeline(output_vcfs, options, populations, config):
186 |     """ Runs the selection_pipeline script for each population
187 | 
188 |     Uses the population dictionary and the output vcfs from the subset
189 |     process to run the selection pipeline on each population.
190 |     """
191 |     cores = config['system']['cores_avaliable']
192 |     parallelise_populations = False
193 |     # Arbitrary cut-off for parallelising over populations:
194 |     # 4 at the moment; this could be calculated given the amount
195 |     # of parallelisation needed in each run.
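    # Illustrative numbers (hypothetical config): with cores_avaliable = 16
    # and 4 populations, the populations run in parallel and each run is
    # handed 16 // 4 = 4 cores through the --cores argument appended below.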
196 |     if(len(populations) >= 4 and int(cores) >= 4):
197 |         parallelise_populations = True
198 |         cores_per_run = str(int(cores) // len(populations))
199 |     else:
200 |         cores_per_run = cores
201 |     orig_dir = os.getcwd()
202 |     if(options.extra_args is not None):
203 |         extra_args = options.extra_args
204 |     else:
205 |         extra_args = ''
206 |     if options.cores is not None:
207 |         extra_args += ' --cores ' + cores_per_run
208 |     # Run the selection pipeline for a single run job #
209 |     selection_pipeline_executable = \
210 |         config['selection_pipeline']['selection_pipeline_executable']
211 |     cmds = []
212 |     # rsb needs the iHH output of the iHS step, so if iHS has been
213 |     # disabled we cannot perform the rsb calculation
214 |     if options.extra_args is not None:
215 |         if '--no-ihs' in options.extra_args:
216 |             options.no_rsb = True
217 |     if parallelise_populations:
218 |         folder_names = []
219 |     for vcf, population_name in zip(sorted(output_vcfs), sorted(populations)):
220 |         cmd = []
221 |         cmd.append(selection_pipeline_executable)
222 |         cmd.extend(['-c', options.chromosome, '-i', os.path.abspath(vcf),
223 |                     '--population', population_name,
224 |                     '--config-file', os.path.abspath(options.config_file)])
225 |         cmd.extend(extra_args.split())
226 |         cmds.append(cmd)
227 |         directory = population_name
228 |         if not os.path.exists(directory):
229 |             os.mkdir(directory)
230 |         if parallelise_populations:
231 |             folder_names.append(directory)
232 |         else:
233 |             # Create directory for each sub population to run in
234 |             os.chdir(directory)
235 |             run_subprocess(cmd, 'selection_pipeline')
236 |             os.chdir(orig_dir)
237 |     if parallelise_populations:
238 |         queue_jobs(cmds, 'selection_pipeline',
239 |                    cores, folder_names=folder_names)
240 | 
241 | 
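# The per-population command assembled in run_selection_pipeline expands to
# something like the following (illustrative paths and population name):
#
#   selection_pipeline -c 22 -i /data/CEU.vcf --population CEU \
#       --config-file /data/defaults.cfg --cores 4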
242 | def fst_vcf(input_vcf, config, options, populations):
243 |     """ Generates FST statistics for every pair of populations
244 | 
245 |     Uses the population dictionary to generate weir-fst statistics
246 |     using vcftools.
247 |     """
248 |     vcf_tools = config['vcftools']['vcf_tools_executable']
249 |     directory = 'fst'
250 |     if not os.path.exists(directory):
251 |         os.mkdir(directory)
252 |     pops = list(populations.keys())
253 |     orig_dir = os.getcwd()
254 |     os.chdir(directory)
255 |     for i in range(0, len(pops)-1):
256 |         p = pops[i]
257 |         cmd = []
258 |         cmd_hapmap = []
259 |         cmd.append(vcf_tools)
260 |         first_pop_name = open('first_pop.tmp', 'w')
261 |         first_pop_name.write('\n'.join(populations[p]))
262 |         first_pop_name.close()
263 |         cmd.extend(['--fst-window-size', options.fst_window_size,
264 |                     '--fst-window-step', options.fst_window_step,
265 |                     '--vcf', input_vcf])
266 |         cmd_hapmap.extend(cmd)
267 |         cmd.extend(['--weir-fst-pop', 'first_pop.tmp'])
268 |         cmd_hapmap.extend(['--hapmap-fst-pop', 'first_pop.tmp'])
269 |         for j in range(i+1, len(pops)):
270 |             s = pops[j]
271 |             tmp_cmd_hapmap = []
272 |             tmp_cmd = []
273 |             tmp_cmd_hapmap.extend(cmd_hapmap)
274 |             tmp_cmd.extend(cmd)
275 |             tmp_cmd.extend(['--weir-fst-pop', 'second_pop.tmp'])
276 |             tmp_cmd_hapmap.extend(['--hapmap-fst-pop', 'second_pop.tmp'])
277 |             second_pop_name = open('second_pop.tmp', 'w')
278 |             second_pop_name.write('\n'.join(populations[s]))
279 |             second_pop_name.close()
280 |             run_subprocess(tmp_cmd, 'fst_calculation_weir')
281 |             run_subprocess(tmp_cmd_hapmap, 'fst_calculation_hapmap')
282 |             os.rename('out.windowed.weir.fst',
283 |                       options.chromosome + p + s + '.weir.fst')
284 |             os.rename('out.windowed.hapmap.fst',
285 |                       options.chromosome + p + s + '.hapmap.fst')
286 |             os.remove('second_pop.tmp')
287 |         os.remove('first_pop.tmp')
288 |         os.remove('out.log')
289 |     os.chdir(orig_dir)
290 | 
291 | 
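# Each vcftools invocation in fst_vcf expands to something like the following
# (illustrative population pair and the default window settings):
#
#   vcftools --fst-window-size 1000 --fst-window-step 1000 --vcf input.vcf \
#       --weir-fst-pop first_pop.tmp --weir-fst-pop second_pop.tmp
#
# after which out.windowed.weir.fst is renamed to e.g. 22CEUYRI.weir.fst.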
292 | def main():
293 |     """ Main function for multi_population
294 | 
295 |     Reads config and options and runs the multi_population
296 |     pipeline
297 |     """
298 |     parser = OptionParser()
299 |     parser.add_option('-p', '--population', action='append',
300 |                       dest="populations", help='population_files')
301 |     parser.add_option('-a', '--arguments-selection-pipelines',
302 |                       dest="extra_args", help=('Arguments to the selection '
303 |                                                'pipeline script'))
304 |     parser.add_option('-l', '--log-file', dest="log_file", help="Log file")
305 |     parser.add_option('-i', '--vcf-input-file', dest="vcf_input",
306 |                       help="VCF Input File")
307 |     parser.add_option('-c', '--chromosome', dest="chromosome",
308 |                       help=("Chromosome label; it doesn't have to "
309 |                             "correspond to the real chromosome but is required "
310 |                             "to determine what output files to make"))
311 |     parser.add_option('--config-file', dest='config_file',
312 |                       help='Configuration File')
313 |     parser.add_option('--fst-window-size', dest="fst_window_size",
314 |                       help="FST window size (kb)")
315 |     parser.add_option('--fst-window-step', dest="fst_window_step",
316 |                       help="FST window step size (kb)")
317 |     parser.add_option('--no-clean-up', dest="no_clean_up",
318 |                       action="store_true",
319 |                       help="Do not clean up intermediate datafiles")
320 |     parser.add_option('--cores', dest="cores", help=("Overrides number of "
321 |                       "cores available as provided in the config file"))
322 |     parser.add_option('--no-rsb', dest="no_rsb", action="store_true",
323 |                       help="Do not calculate RSB")
324 |     (options, args) = parser.parse_args()
325 |     print(options.extra_args)
326 |     assert options.vcf_input is not None, \
327 |         "no VCF file has been specified as input"
328 |     assert os.path.isfile(options.vcf_input), \
329 |         "Cannot locate vcf file at path = {0}".format(options.vcf_input)
330 |     assert options.chromosome is not None, \
331 |         "no chromosome has been specified to the script"
332 |     assert options.populations is not None and \
333 |         len(options.populations) >= 2, \
334 |         "At least two population files are required"
335 |     if options.config_file is None:
336 |         options.config_file = 'defaults.cfg'
337 |         if not(os.path.isfile(options.config_file)):
338 |             raise Exception("Cannot find config file")
339 |     elif not(os.path.isfile(options.config_file)):
340 |         raise Exception("Cannot find config file")
341 |     config = parse_config(options)
342 |     if options.log_file is None:
343 |         options.log_file = 'multi_population.log'
344 |     logging.basicConfig(format='%(asctime)s %(message)s',
345 |                         filename=options.log_file, filemode='w',
346 |                         level=logging.INFO)
347 |     if not (check_executables_and_scripts_exist(options, config)):
348 |         sys.exit(CANNOT_FIND_EXECUTABLE)
349 |     if options.no_clean_up is None:
350 |         options.no_clean_up = False
351 |     if options.fst_window_step is None:
352 |         options.fst_window_step = str(1000)
353 |     else:
354 |         options.fst_window_step = str(
355 |             float(options.fst_window_step) * 1e3)
356 |     if options.fst_window_size is None:
357 |         options.fst_window_size = str(1000)
358 |     else:
359 |         options.fst_window_size = str(
360 |             float(options.fst_window_size) * 1e3)
361 |     if options.no_rsb is None:
362 |         options.no_rsb = False
363 |     if options.cores is not None:
364 |         config['system']['cores_avaliable'] = options.cores
365 |     set_environment(config['environment'])
366 |     options.vcf_input = os.path.abspath(options.vcf_input)
367 |     populations = get_populations(options.populations)
368 |     populations = OrderedDict(sorted(populations.items(), key=lambda t: t[0]))
369 |     fst_vcf(options.vcf_input, config, options, populations)
370 |     output_vcfs = subset_vcf(options.vcf_input, config, populations)
371 |     run_selection_pipeline(output_vcfs, options, populations, config)
372 |     # TODO move FST to here on filtered dataset
373 |     if not (options.no_rsb):
374 |         rsb(config, options, populations)
375 |     if not os.path.exists('logs'):
376 |         os.mkdir('logs')
377 |     os.rename(options.log_file, 'logs/' + options.log_file)
378 |     if not options.no_clean_up:
379 |         keep = [os.path.basename(options.vcf_input), os.path.basename(options.config_file)]
380 |         keep.extend(options.populations)
381 |         clean_folder('.', keep=keep)
382 |     logger.info("Multi_population Complete")
383 |     logger.info("Goodbye :)")
384 |     print("Multi-population selection pipeline completed successfully! :)")
385 | if __name__ == "__main__":
386 |     main()
387 | 
--------------------------------------------------------------------------------
/selection_pipeline/run_pipeline.py:
--------------------------------------------------------------------------------
1 | import os
2 | import fnmatch
3 | import logging
4 | from .haps_interpolate import get_shapeit_genetic_map
5 | logger = logging.getLogger(__name__)
6 | SUBPROCESS_FAILED_EXIT = 10
7 | 
8 | 
9 | class CommandTemplate(object):
10 |     """ Represents a full selection pipeline run.
11 | 
12 |     Creates command templates for each of the tools
13 |     in the selection pipeline. Any new system that
14 |     uses the selection pipeline should inherit from Command
15 |     Template. Any commands you want to use on a new system
16 |     should call the parent class first to set up the template
17 |     to build on for the specific architecture,
18 |     e.g. standard linux box, Load Leveler or another
19 |     cluster interface.
20 |     """
21 | 
22 |     def __init__(self, options, config):
23 |         """ Initialises the class variables self.config and self.options.
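        An illustrative sketch (ClusterRun and scheduler_submit are
        hypothetical, not shipped with the pipeline) of how a
        scheduler-specific subclass might build on these templates:

            class ClusterRun(CommandTemplate):
                def run_vcf_to_plink(self):
                    # take the generic command and wrap it for the scheduler
                    cmd, prefix = CommandTemplate.run_vcf_to_plink(self)
                    return (['scheduler_submit'] + cmd, prefix)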
24 | 25 | """ 26 | self.config = config 27 | self.options = options 28 | 29 | def run_vcf_to_plink(self): 30 | """ Template for vcf to plink conversion 31 | 32 | Uses vcf tools to convert a VCF file 33 | to ped/map plink format. 34 | """ 35 | cmd = [] 36 | prefix = self.options.population + self.options.chromosome 37 | vcf_tools = self.config['vcftools']['vcf_tools_executable'] 38 | cmd.append(vcf_tools) 39 | if(self.options.vcf_gz): 40 | cmd.append('--gzvcf') 41 | else: 42 | cmd.append('--vcf') 43 | cmd.extend([self.options.vcf_input, '--plink', '--out', 44 | prefix, '--remove-indels', '--max-alleles',str(2)]) 45 | cmd.extend(self.config['vcftools']['extra_args'].split()) 46 | return (cmd, prefix) 47 | 48 | 49 | ### OLD VCFTOOLS METHOD ### 50 | def run_remove_indels_from_vcf(self): 51 | """ Template for running remove indels from vcf 52 | 53 | """ 54 | cmd = [] 55 | output_name = \ 56 | os.path.basename( self.options.vcf_input.split('.vcf')[0]) 57 | vcftools = self.config['vcftools']['vcf_tools_executable'] 58 | cmd.append(vcftools) 59 | cmd.extend(['--vcf', self.options.vcf_input, '--remove-indels', 60 | '--out', output_name, '--recode','--max-alleles',str(2)]) 61 | return(cmd, output_name + '.recode.vcf') 62 | ### NEW METHOD - NEEDS WORK also change standard_run.py 63 | # def run_remove_indels_from_vcf(self): 64 | # """ Template for running remove indels from vcf 65 | # 66 | # """ 67 | # cmd = [] 68 | # output_name = \ 69 | # os.path.basename( self.options.vcf_input.split('.vcf')[0]) 70 | # vcfsnps = self.config['vcflib']['vcflib_vcfsnps'] 71 | # cmd.append(vcfsnps) 72 | # return(cmd, output_name + '.recode.vcf') 73 | 74 | def run_plink_filter(self, ped, map): 75 | """ Template for running the plink filter 76 | 77 | Uses PLINK to filter the ped/map file 78 | filters HWE, MAF and missing Genotype 79 | information. 80 | """ 81 | cmd = [] 82 | prefix = ped.split('.')[0] 83 | plink = self.config['plink']['plink_executable'] 84 | cmd.append(plink) 85 | # add standard plink commands # 86 | 87 | cmd.extend(['--noweb', '--file', prefix, '--geno', 88 | str(self.options.remove_missing), '--hwe', 89 | str(self.options.hwe), '--maf', str(self.options.maf), 90 | '--recode', '--out', prefix]) 91 | cmd.extend(self.config['plink']['extra_args'].split()) 92 | return(cmd, prefix) 93 | 94 | 95 | def run_shape_it(self, ped, map): 96 | """ Template for running shapeit 97 | 98 | Sets up the default command for running shapeit 99 | for phasing genotypes. 
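        The assembled command resembles the following (illustrative file
        names, with the genetic map prefix from the default config):

            shapeit --input-ped CEU22.ped CEU22.map \
                -M genetic_map_chr22_combined_b37.txt --output-max CEU22.phased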
100 | 
101 |         Reads the config file to find the location of the
102 |         genetic_map
103 |         """
104 |         genetic_map = None  # so the assert below fails cleanly if no map matches
105 |         prefix = self.options.population + \
106 |             self.options.chromosome + '.phased'
107 |         for file in os.listdir(self.config['genetic_map']['genetic_map_dir']):
108 |             if fnmatch.fnmatch(
109 |                     file, self.config['genetic_map']['genetic_map_prefix'].replace(
110 |                         '?', self.options.chromosome)):
111 |                 genetic_map = file
112 |         assert genetic_map is not None, \
113 |             "Cannot find genetic map for chromosome {0}, check config".format(self.options.chromosome)
114 |         genetic_map = os.path.join(self.config['genetic_map']['genetic_map_dir'], genetic_map)
115 |         new_genetic_map = prefix + '_temp_genetic_map.txt'
116 |         # return the genetic map for use in shapeit
117 |         genetic_map = get_shapeit_genetic_map(genetic_map, new_genetic_map)
118 |         shapeit = self.config['shapeit']['shapeit_executable']
119 |         cmd = [shapeit]
120 |         cmd.extend(['--input-ped', ped, map, '-M', genetic_map, '--output-max', prefix])
121 |         cmd.extend(self.config['shapeit']['extra_args'].split())
122 |         return(cmd, prefix)
123 | 
124 |     def indel_filter(self, haps):
125 |         """ Return a template for running the indel filter
126 | 
127 |         Sets up the default command for running the Rscript
128 |         indel_filter.
129 |         """
130 |         cmd = []
131 |         output_name = self.options.population + \
132 |             self.options.chromosome + '_indel_filter.haps'
133 |         rscript = self.config['Rscript']['rscript_executable']
134 |         indel_filter = self.config['Rscript']['indel_filter']
135 |         cmd.append(rscript)
136 |         cmd.append(indel_filter)
137 |         cmd.extend([haps, str(self.options.maf), output_name])
138 |         return(cmd, output_name)
139 | 
140 |     def run_impute2(self, haps):
141 |         """ Return a template for running impute2
142 | 
143 |         Sets up a command for impute2; searches the paths
144 |         specified in the config file for the genetic_map
145 |         and the known haps and legend files.
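        The assembled command resembles the following (illustrative
        chromosome, with file names following the prefixes in the
        default config):

            impute2 -m genetic_map_chr22_combined_b37.txt \
                -h ALL_1000G_phase1integrated_v3_chr22_impute.hap \
                -l ALL_1000G_phase1integrated_v3_chr22_impute.legend \
                -known_haps_g CEU22.haps -phase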
146 |         """
147 |         prefix = self.options.population + self.options.chromosome + \
148 |             '_impute2'
149 |         impute2 = self.config['impute2']['impute_executable']
150 |         genetic_map = ''
151 |         for file in os.listdir(self.config['genetic_map']['genetic_map_dir']):
152 |             if fnmatch.fnmatch(file, (
153 |                     self.config['genetic_map']['genetic_map_prefix'].replace(
154 |                         '?', self.options.chromosome))):
155 |                 genetic_map = os.path.join(
156 |                     self.config['genetic_map']['genetic_map_dir'], file)
157 |         new_genetic_map = prefix + '_temp_genetic_map.txt'
158 |         # return the genetic map for use in impute2, only needed if people
159 |         # use plink genetic maps
160 |         assert os.path.isfile(genetic_map), \
161 |             "Genetic map cannot be found for chromosome {0}".format(self.options.chromosome)
162 |         genetic_map = get_shapeit_genetic_map(genetic_map, new_genetic_map)
163 |         legend_file = ''
164 |         for file in os.listdir(self.config['impute2']['impute_reference_dir']):
165 |             if fnmatch.fnmatch(file, (
166 |                     self.config['impute2']['impute_reference_prefix'].replace(
167 |                         '?', self.options.chromosome) + '.legend')):
168 |                 legend_file = os.path.join(
169 |                     self.config['impute2']['impute_reference_dir'], file)
170 |         hap_file = ''
171 |         for file in os.listdir(self.config['impute2']['impute_reference_dir']):
172 |             if fnmatch.fnmatch(file, (
173 |                     self.config['impute2']['impute_reference_prefix'].replace(
174 |                         '?', self.options.chromosome) + '.haplotypes')):
175 |                 hap_file = os.path.join(
176 |                     self.config['impute2']['impute_reference_dir'], file)
177 |             elif fnmatch.fnmatch(file, (
178 |                     self.config['impute2']['impute_reference_prefix'].replace(
179 |                         '?', self.options.chromosome) + '.hap')):
180 |                 hap_file = os.path.join(
181 |                     self.config['impute2']['impute_reference_dir'], file)
182 |         # create the command template
183 |         assert os.path.isfile(hap_file), \
184 |             "Hap file cannot be found for chromosome {0}".format(self.options.chromosome)
185 |         assert os.path.isfile(legend_file), \
186 |             "Legend file cannot be found for chromosome {0}".format(self.options.chromosome)
187 |         cmd_template = []
188 |         cmd_template.append(impute2)
189 |         cmd_template.extend(['-m', genetic_map, '-h', hap_file, '-l',
190 |                              legend_file, '-known_haps_g', haps, '-phase'])
191 |         return (cmd_template, prefix)
192 | 
193 |     def get_ancestral_fasta(self):
194 |         """ Get the ancestral fasta file for the pipeline
195 | 
196 |         Reads the config file and gets the ancestral fasta file
197 |         to be used for ancestral annotation.
198 |         """
199 |         if(self.config['ancestral_allele']['split_by_chromosome'].lower() != 'true'):  # config values are strings
200 |             ancestral_fasta = \
201 |                 self.config['ancestral_allele']['ancestral_fasta_file']
202 |             regex = \
203 |                 self.config['ancestral_allele']['ancestral_fasta_header_regex']
204 |         else:
205 |             for file in os.listdir(
206 |                     self.config['ancestral_allele']['ancestral_fasta_dir']):
207 |                 if fnmatch.fnmatch(
208 |                         file,
209 |                         self.config['ancestral_allele']['ancestral_prefix'].
210 |                         replace('?', self.options.chromosome)):
211 |                     ancestral_fasta = os.path.join(
212 |                         self.config['ancestral_allele']['ancestral_fasta_dir'],
213 |                         file)
214 |             regex = None
215 |         return (ancestral_fasta, regex)
216 | 
217 |     def run_aa_annotate_haps(self, in_file, vcf=False):
218 |         """ Return the template for running ancestral annotation
219 | 
220 |         Runs the ancestral annotation python script to convert
221 |         the haps file to derived / ancestral alleles
222 |         """
223 |         cmd = []
224 |         output_haps = self.options.population.split('.haps')[0] + \
225 |             '_aachanged.haps'
226 |         if(vcf):
227 |             output_sample = self.options.population.split('.haps')[0] + \
228 |                 '_aachanged.sample'
229 |         aa_annotate = \
230 |             self.config['ancestral_allele']['ancestral_allele_script']
231 |         cmd.append(aa_annotate)
232 |         (ancestral_fasta, regex) = self.get_ancestral_fasta()
233 |         cmd.extend(['-c', self.options.chromosome, '-o',
234 |                     output_haps, '-a', ancestral_fasta])
235 |         if(regex is not None):
236 |             cmd.extend(['--header-regex', regex])
237 |         else:
238 |             cmd.extend(['--single-chromosome'])
239 |         if(vcf):
240 |             cmd.extend(['-v', in_file, '-s', output_sample])
241 |             return(cmd, output_haps, output_sample)
242 |         else:
243 |             cmd.extend(['-i', in_file])
244 |             return(cmd, output_haps)
245 | 
246 |     def interpolate_haps(self, haps):
247 |         output_haps = self.options.population.split('.haps')[0] + \
248 |             '_genetic_dist.haps'
249 |         output_physical = self.options.population.split('.haps')[0] + \
250 |             '_genetic_dist.pos'
251 |         genetic_map = None
252 |         for file in os.listdir(self.config['genetic_map']['genetic_map_dir']):
253 |             if fnmatch.fnmatch(
254 |                     file, self.config['genetic_map']['genetic_map_prefix'].replace(
255 |                         '?', self.options.chromosome)):
256 |                 genetic_map = file
257 |         cmd = []
258 |         assert genetic_map is not None, \
259 |             "Cannot find genetic map for chromosome {0}, check config".format(self.options.chromosome)
260 |         genetic_map = os.path.join(self.config['genetic_map']['genetic_map_dir'], genetic_map)
261 |         interpolate_script = \
262 |             self.config['haps_scripts']['haps_interpolate_script']
263 |         cmd.append(interpolate_script)
264 |         cmd.extend(['--haps', haps, '--output', output_haps, '--genetic-map',
265 |                     genetic_map])
266 |         cmd.extend(['--physical-position-output', output_physical])
267 |         return(cmd, output_haps, output_physical)
268 | 
269 |     def run_multi_coreihh(self, haps, haps_physical):
270 |         """ Return the template for running multicore iHH
271 | 
272 |         """
273 |         cmd = []
274 |         output_name = self.options.population + 'chr' + \
275 |             self.options.chromosome + '.ihh'
276 |         rscript = self.config['Rscript']['rscript_executable']
277 |         multicore_ihh = self.config['multicore_ihh']['multicore_ihh']
278 |         window = self.options.multi_window_size
279 |         overlap = self.options.ehh_overlap
280 |         population = self.options.population
281 |         cmd.append(rscript)
282 |         cmd.extend([multicore_ihh, '-p', population, '-i',
283 |                     haps, '-c', str(self.options.chromosome),
284 |                     '--window', str(window), '--overlap', str(overlap),
285 |                     '--maf', self.options.daf])
286 |         cmd.extend(['--big_gap', self.options.big_gap, '--small_gap',
287 |                     self.options.small_gap, '--small_gap_penalty',
288 |                     self.options.small_gap_penalty, '--haplo_hh'])
289 |         if(haps_physical is not None):
290 |             cmd.extend(['--physical_map_haps', haps_physical])
291 |         return (cmd, output_name)
292 | 
293 |     def fix_sample_file(self, sample_file):
294 |         """ Return the template for running fix sample file
295 | 
296 |         Command just cuts the extra columns from the file
297
to conform with the standard sample file input 298 | """ 299 | cmd = [] 300 | cmd.extend(['cut', '-d', ' ', '-f', '1-6', sample_file]) 301 | sample_file = sample_file.split('.sample')[0] + '_fixed.sample' 302 | return(cmd, sample_file) 303 | 304 | def haps_to_vcf(self, haps, new_sample_file): 305 | """ Return the template for running haps to vcf 306 | 307 | """ 308 | cmd = [] 309 | output_name = self.options.population + \ 310 | self.options.chromosome + '.vcf' 311 | qctool_executable = self.config['qctool']['qctool_executable'] 312 | cmd.append(qctool_executable) 313 | cmd.extend(['-filetype', 'shapeit_haplotypes', '-g', 314 | haps, '-s', new_sample_file, '-og', output_name]) 315 | return (cmd, output_name) 316 | 317 | def vcf_to_haps(self, vcf): 318 | """ Return the template for running vcf to haps ( no annotation ) 319 | 320 | """ 321 | cmd = [] 322 | haps = self.options.population + \ 323 | self.options.chromosome + 'vcf_to_haps' + '.haps' 324 | sample = self.options.population + \ 325 | self.options.chromosome + 'vcf_to_haps' + '.sample' 326 | aa_annotate = \ 327 | self.config['ancestral_allele']['ancestral_allele_script'] 328 | cmd.append(aa_annotate) 329 | cmd.extend(['-c', self.options.chromosome, '-v', vcf, '-s', sample, 330 | '-o', haps, '--no-annotation']) 331 | return(cmd, haps, sample) 332 | 333 | def fix_vcf_qctool(self, vcf): 334 | """ Return the template for running fix vcf qctool 335 | 336 | """ 337 | cmd = [] 338 | output_name = vcf.split('.vcf')[0] + '_fixed.vcf' 339 | cmd.extend(['sed', 's/^NA/{0}/g'.format(self.options.chromosome), vcf]) 340 | return(cmd, output_name) 341 | 342 | def vcf_to_tajimas_d(self, vcf): 343 | """ Return the template for running vcf to tajima's D 344 | 345 | """ 346 | cmd = [] 347 | output_name = 'out.Tajima.D' 348 | vcftools_executable = self.config['vcftools']['vcf_tools_executable'] 349 | cmd.append(vcftools_executable) 350 | cmd.extend(['--TajimaD', self.options.tajimas_d, '--vcf', vcf]) 351 | return(cmd, output_name) 352 | 353 | def haps_filter(self, haps): 354 | """ Return the template for running haps filter 355 | 356 | """ 357 | cmd = [] 358 | output_name = self.options.population + self.options.chromosome + \ 359 | '_filtered' + '.haps' 360 | haps_filter_script = self.config['haps_scripts']['haps_filter_script'] 361 | cmd.append(haps_filter_script) 362 | cmd.extend(['--maf', self.options.maf, '--hwe', self.options.hwe, 363 | '--chi-sq', '--missing', self.options.remove_missing, 364 | '--output', output_name, '--haps', haps]) 365 | return(cmd, output_name) 366 | 367 | def prepare_haps_for_variscan(self, haps, sample): 368 | """ Return the template for running haps to variscan 369 | 370 | """ 371 | cmd = [] 372 | output_name = self.options.population + self.options.chromosome + \ 373 | '.hapmap' 374 | haps_executable = self.config['haps_scripts']['haps_to_hapmap_script'] 375 | (ancestral_fasta, regex) = self.get_ancestral_fasta() 376 | cmd.append(haps_executable) 377 | if(regex is not None): 378 | cmd.extend(['--header-regex', regex]) 379 | else: 380 | cmd.extend(['--single-chromosome']) 381 | cmd.extend(['-i', haps, '-s', sample, '-o', output_name, '--id', 382 | 'ANCESTOR', '-a', ancestral_fasta, '-c', 383 | self.options.chromosome]) 384 | return(cmd, output_name) 385 | 386 | 387 | def variscan_fayandwus(self, hap2): 388 | """ Return the template for running variscan fay and wus 389 | 390 | """ 391 | cmd = [] 392 | v_config_name = 'variscan.conf' 393 | output_name = self.options.population + self.options.chromosome + \ 394 | '.faw' 395 
| variscan_config = open(v_config_name, 'w') 396 | variscan_executable = self.config['variscan']['variscan_executable'] 397 | cmd.append(variscan_executable) 398 | cmd.extend([hap2, v_config_name]) 399 | # generate default self.config file for variscan 400 | config_string = 'RefPos = 0 \n' 401 | config_string += 'RefSeq = 1 \n' 402 | config_string += 'BlockDataFile = none \n' 403 | config_string += 'SeqChoice = all \n' 404 | config_string += 'OutGroup = last \n' 405 | config_string += 'RunMode = 22 \n' 406 | config_string += 'IndivNames = \n' 407 | config_string += 'UseMuts = 1 \n' 408 | config_string += 'CompleteDeletion = 0 \n' 409 | config_string += 'FixNum = 0 \n' 410 | config_string += 'NumNuc = 4 \n' 411 | config_string += 'SlidingWindow = 1 \n' 412 | config_string += 'WidthSW = {0} \n'.format( 413 | self.options.fayandWuWindowWidth) 414 | config_string += 'JumpSW = {0} \n'.format( 415 | self.options.fayandWuWindowJump) 416 | config_string += 'WindowType = 0 \n' 417 | config_string += 'UseLDSinglets = 0 \n' 418 | variscan_config.write(config_string) 419 | variscan_config.close() 420 | return(cmd, output_name, v_config_name) 421 | 422 | def beagle_phasing(self, vcf): 423 | cmd = [] 424 | java_executable = self.config['java']['java_executable'] 425 | beagle_memory = self.config['beagle']['vm_size'] 426 | beagle_jar = self.config['beagle']['beagle_jar'] 427 | out_prefix = self.options.population + self.options.chromosome + \ 428 | '.beagle' 429 | cmd.append(java_executable) 430 | cmd.append('-Xmx' + beagle_memory) 431 | cmd.extend(['-jar',beagle_jar]) 432 | cmd.extend(['gtgl={0}'.format(vcf),'out={0}'.format(out_prefix)]) 433 | output_name=out_prefix + '.vcf.gz' 434 | return(cmd,output_name) 435 | -------------------------------------------------------------------------------- /selection_pipeline/selectionTools.egg-info/PKG-INFO: -------------------------------------------------------------------------------- 1 | Metadata-Version: 1.0 2 | Name: selectionTools 3 | Version: 1.0 4 | Summary: Selection Pipeline for VCF Data 5 | Home-page: github.com/smilefreak/MerrimanSelectionPipeline 6 | Author: James Boocock 7 | Author-email: smilefreak@gmx.com 8 | License: MIT 9 | Description: UNKNOWN 10 | Keywords: iHS ehh selection evolution 11 | Platform: UNKNOWN 12 | -------------------------------------------------------------------------------- /selection_pipeline/selectionTools.egg-info/SOURCES.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MerrimanLab/selectionTools/761ace7cf43639b890c5071c200761ff8a6fb05f/selection_pipeline/selectionTools.egg-info/SOURCES.txt -------------------------------------------------------------------------------- /selection_pipeline/selectionTools.egg-info/dependency_links.txt: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /selection_pipeline/selectionTools.egg-info/entry_points.txt: -------------------------------------------------------------------------------- 1 | [console_scripts] 2 | selection_pipeline = selection_pipeline.selection_pipeline:main 3 | ancestral_annotation = selection_pipeline.aa_annotate:main 4 | haps_to_hapmap = selection_pipeline.haps_to_hapmap:main 5 | haps_interpolate = selection_pipeline.haps_interpolate:main 6 | multipop_selection_pipeline = selection_pipeline.multipipeline:main 7 | haps_filters = selection_pipeline.haps_filters:main 8 | 9 | 
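# The entry points above become console scripts on installation; an
# illustrative session (hypothetical inputs) after running ./install.sh:
#
#   selection_pipeline -c 22 -i CEU.vcf --population CEU --config-file defaults.cfg
#   multipop_selection_pipeline -i ALL.vcf -c 22 -p CEU.txt -p YRI.txt \
#       --config-file defaults.cfg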
--------------------------------------------------------------------------------
/selection_pipeline/selectionTools.egg-info/not-zip-safe:
--------------------------------------------------------------------------------
1 | 
2 | 
--------------------------------------------------------------------------------
/selection_pipeline/selectionTools.egg-info/top_level.txt:
--------------------------------------------------------------------------------
1 | selection_pipeline
2 | 
--------------------------------------------------------------------------------
/selection_pipeline/selection_pipeline.py:
--------------------------------------------------------------------------------
1 | # Python script for running the single-population process of our pipeline
2 | #
3 | # Murray Cadzow
4 | # July 2013
5 | # University Of Otago
6 | #
7 | # James Boocock
8 | # July 2013
9 | # University Of Otago
10 | #
11 | from optparse import OptionParser
12 | import ConfigParser
13 | import logging
14 | import os
15 | import sys
16 | from .standard_run import StandardRun
17 | from .environment import set_environment
18 | from ._version import __version__
19 | logger = logging.getLogger(__name__)
20 | SUBPROCESS_FAILED_EXIT = 10
21 | 
22 | 
23 | def parse_config(options):
24 |     """ Parse config file
25 | 
26 |     Reads a config and parses the
27 |     arguments into a dictionary.
28 |     """
29 |     config = ConfigParser.ConfigParser()
30 |     config.read(options.config_file)
31 |     config_parsed = {}
32 |     logger.debug(config.sections())
33 |     for section in config.sections():
34 |         logger.debug(section)
35 |         opts = config.options(section)
36 |         config_parsed[section] = {}
37 |         for op in opts:
38 |             logger.debug(op)
39 |             try:
40 |                 config_parsed[section][op] = config.get(section, op)
41 |             except Exception:
42 |                 logger.info("exception on {0}".format(op))
43 |                 config_parsed[section][op] = None
44 |     return config_parsed
45 | 
46 | 
47 | def parse_arguments():
48 |     """ Parse the command line arguments
49 | 
50 |     Read the arguments and set sensible
51 |     default values for the program
52 |     """
53 |     parser = OptionParser()
54 |     parser.add_option('-v', '--debug',
55 |                       action="store_true", dest='debug',
56 |                       help="Print debug messages")
57 |     parser.add_option('-q', '--silent', action="store_false",
58 |                       dest='verbose', help="Run Silently")
59 |     parser.add_option('-i', '--vcf',
60 |                       dest='vcf_input', help="VCF input file")
61 |     parser.add_option('-c', '--chromosome',
62 |                       dest='chromosome', help="Chromosome")
63 |     parser.add_option('-l', '--log-file', dest='log_file',
64 |                       help="Log file for the pipeline process")
65 |     parser.add_option('--maf', dest='maf',
66 |                       help='Minor allele-frequency filter')
67 |     parser.add_option('--hwe', dest='hwe',
68 |                       help="Hardy-Weinberg Equilibrium filter proportion")
69 |     parser.add_option('--remove-missing', dest="remove_missing",
70 |                       help="Remove missing genotypes")
71 |     parser.add_option('--config-file', dest="config_file",
72 |                       help="Config file")
73 |     parser.add_option('--phased-vcf', action="store_true",
74 |                       dest="phased_vcf", help="Phased vcf file")
75 |     parser.add_option('--population', dest="population",
76 |                       help="Population Code")
77 |     parser.add_option('--imputation', action="store_true",
78 |                       dest="imputation", help="Imputation")
79 |     parser.add_option('--full-process', action="store_true",
80 |                       dest="full_process", help="Run Entire Process")
81 |     parser.add_option('--gzvcf', action="store_true",
82 |                       dest="vcf_gz", help="VCF input is in GZ file (optional)")
83 |     parser.add_option('--TajimaD', dest='tajimas_d',
84 |                       help="Output Tajima's D statistic in bins of size (bp)")
85 |     parser.add_option('--fay-Window-Width', dest='fayandWuWindowWidth',
86 |                       help="Sliding window width for Fay and Wu's H (kb)")
87 |     parser.add_option('--fay-Window-Jump', dest="fayandWuWindowJump",
88 |                       help=("Window jump for Fay and Wu's H (if fay-Window-Width"
89 |                             " = fay-Window-Jump, non-overlapping windows "
90 |                             "are used) (kb)"))
91 |     parser.add_option('--no-clean-up', dest="no_clean_up", action="store_true",
92 |                       help="Do not clean up intermediate datafiles")
93 |     parser.add_option('--impute-split-size', dest='impute_split_size',
94 |                       help="impute2 split size (Mb)")
95 |     parser.add_option('--ehh-window-size', dest="multi_window_size",
96 |                       help="Multicore window size (Mb)")
97 |     parser.add_option('--ehh-overlap', dest="ehh_overlap",
98 |                       help="EHH window overlap (Mb)")
99 |     parser.add_option('--daf', dest='daf',
100 |                       help="Derived Allele Frequency filter proportion")
101 |     parser.add_option('--big-gap', dest="big_gap",
102 |                       help=("Gap size for not calculating iHH if "
103 |                             "core SNP spans this gap (kb)"))
104 |     parser.add_option('--small-gap', dest='small_gap',
105 |                       help=("Gap size for applying a penalty to "
106 |                             "the area calculated by iHH (kb)"))
107 |     parser.add_option('--small-gap-penalty', dest="small_gap_penalty",
108 |                       help=("Penalty multiplier for integration steps "
109 |                             "in iHH; see manual for formula, usually the "
110 |                             "same as small-gap"))
111 |     parser.add_option('--cores', dest='cores',
112 |                       help="Override cores available setting")
113 |     parser.add_option('--no-ihs', dest='no_ihs', action="store_true",
114 |                       help='Disable iHS and iHH calculation')
115 |     parser.add_option('--haps', dest='haps',
116 |                       help="Shapeit haps file")
117 |     parser.add_option('--sample', dest='sample',
118 |                       help='Corresponding sample file to accompany haps')
119 |     parser.add_option('--beagle', dest='beagle', action='store_true',
120 |                       help="Use beagle to phase")
121 |     parser.add_option('--no-gmap', dest="no_genetic_map", action="store_true",
122 |                       help="Do not use a genetic map for the analysis")
123 |     parser.add_option('--physical-ihs', dest="physical_ihs", help="Use physical map for calculating iHS", action="store_true")
124 |     parser.add_option("--no-plots", dest="no_plots", action="store_true",
125 |                       help="Do not create rudimentary plots")
126 |     parser.add_option('--version', dest="ver", action="store_true",
127 |                       help="Print version info")
128 |     (options, args) = parser.parse_args()
129 |     if(options.verbose is not None):
130 |         if(options.debug):
131 |             logger.setLevel(logging.DEBUG)
132 |         else:
133 |             logger.setLevel(logging.ERROR)
134 |     if(options.ver is True):
135 |         print "Version: {0}".format(__version__)
136 |         sys.exit(1)
137 | 
138 |     # Obligatory arguments
139 |     assert (options.vcf_input is not None or (options.haps is not None and options.sample is not None)), \
140 |         "No VCF or haps/sample file has been specified as input"
141 |     assert options.chromosome is not None, \
142 |         "No chromosome has been specified to the script"
143 |     assert options.population is not None, \
144 |         "Population code has not been specified."
145 |     assert options.config_file is not None, \
146 |         "Config file has not been specified."
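    # Example invocation (illustrative file and population names); filter
    # options that are left unset fall back to the defaults assigned below
    # (maf 0.01, hwe 0.0001, remove-missing 0.99):
    #
    #   selection_pipeline -c 22 -i CEU22.vcf --population CEU \
    #       --config-file defaults.cfg --maf 0.05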
147 |     if(options.haps and options.sample):
148 |         assert os.path.isfile(options.haps), \
149 |             "Cannot locate haps file path = {0}".format(options.haps)
150 |         assert os.path.isfile(options.sample), \
151 |             "Cannot locate sample file path = {0}".format(options.sample)
152 |     elif(options.vcf_input):
153 |         assert os.path.isfile(options.vcf_input), \
154 |             "Cannot locate vcf input file path = {0}".format(options.vcf_input)
155 |     if(options.fayandWuWindowJump is None):
156 |         options.fayandWuWindowJump = str(5000)
157 |     else:
158 |         options.fayandWuWindowJump = str(
159 |             int(float(options.fayandWuWindowJump) * 1e3))
160 |     if(options.fayandWuWindowWidth is None):
161 |         options.fayandWuWindowWidth = str(5000)
162 |     else:
163 |         options.fayandWuWindowWidth = str(
164 |             int(float(options.fayandWuWindowWidth) * 1e3))
165 |     if(options.no_clean_up is None):
166 |         options.no_clean_up = False
167 |     if(options.tajimas_d is None):
168 |         options.tajimas_d = str(5000)
169 |     else:
170 |         options.tajimas_d = str(
171 |             int(float(options.tajimas_d) * 1e3))
172 |     if(options.imputation is None):
173 |         options.imputation = False
174 |     if(options.hwe is None):
175 |         options.hwe = str(0.0001)
176 |     if(options.maf is None):
177 |         options.maf = str(0.01)
178 |     if(options.daf is None):
179 |         options.daf = str(0.00)
180 |     if(options.remove_missing is None):
181 |         options.remove_missing = str(0.99)
182 |     if (options.phased_vcf is None):
183 |         options.phased_vcf = False
184 |     if (options.full_process is None):
185 |         options.full_process = False
186 |     if (options.vcf_gz is None):
187 |         options.vcf_gz = False
188 |     if (options.no_ihs is None):
189 |         options.no_ihs = False
190 |     if(options.log_file is None):
191 |         options.log_file = options.population + \
192 |             options.chromosome + "_selection_pipeline.log"
193 |     if (options.impute_split_size is None):
194 |         options.impute_split_size = str(5000000)
195 |     else:
196 |         options.impute_split_size = str(
197 |             int(float(options.impute_split_size) * 1e6))
198 |     if (options.multi_window_size is None):
199 |         options.multi_window_size = str(int(5*1e6))
200 |     else:
201 |         options.multi_window_size = str(
202 |             int(float(options.multi_window_size) * 1e6))
203 |     if (options.ehh_overlap is None):
204 |         options.ehh_overlap = str(int(2*1e6))
205 |     else:
206 |         options.ehh_overlap = str(
207 |             int(float(options.ehh_overlap) * 1e6))
208 |     if (options.big_gap is None):
209 |         options.big_gap = str(0)
210 |     else:
211 |         options.big_gap = str(
212 |             int(float(options.big_gap) * 1e3))
213 |     if (options.small_gap is None):
214 |         options.small_gap = str(0)
215 |     else:
216 |         options.small_gap = str(
217 |             int(float(options.small_gap) * 1e3))
218 |     if (options.small_gap_penalty is None):
219 |         options.small_gap_penalty = str(0)
220 |     else:
221 |         options.small_gap_penalty = str(
222 |             int(float(options.small_gap_penalty) * 1e3))
223 |     if (options.no_genetic_map):
224 |         # Must set beagle to true because shapeit will not
225 |         # work without a genetic map
226 |         options.beagle = True
227 |     if (options.no_plots is None):
228 |         options.no_plots = False
229 |     if (options.physical_ihs is None):
230 |         options.physical_ihs = False
231 |     return options
232 | 
233 | 
234 | def main():
235 |     """ The main function
236 | 
237 |     Runs the selection pipeline.
238 |     """
239 |     options = parse_arguments()
240 |     config = parse_config(options)
241 |     set_environment(config['environment'])
242 |     if options.cores is not None:
243 |         config['system']['cores_avaliable'] = options.cores
244 |     logging.basicConfig(format='%(asctime)s %(message)s',
245 |                         filename=options.log_file, filemode='w',
246 |                         level=logging.INFO)
247 |     s = StandardRun(options, config=config)
248 |     s.run_pipeline()
249 |     print("Selection Pipeline Completed Successfully! :)")
250 | 
251 | if __name__ == "__main__":
252 |     main()
253 | 
--------------------------------------------------------------------------------
/selection_pipeline/standard_run_utilities.py:
--------------------------------------------------------------------------------
1 | import os
2 | import subprocess
3 | import sys
4 | import logging
5 | import re
6 | import gzip
7 | import tempfile
8 | import signal
9 | from time import sleep
10 | # queue for threads
11 | # regex for hash at start of line
12 | 
13 | try:
14 |     import Queue as Queue
15 | except ImportError:
16 |     import queue as Queue
17 | from threading import Thread
18 | logger = logging.getLogger(__name__)
19 | SUBPROCESS_FAILED_EXIT = 10
20 | MISSING_EXECUTABLE_ERROR = 5
21 | STOP = False
22 | 
23 | # Returns split VCF files that can be used
24 | # by the vcf-subset function to take advantage of the cores available.
25 | #
26 | #
27 | # split_positions looks like an array:
28 | # 0, 100, 200 etc. The line ranges, not
29 | # including the header, are separated into
30 | # separate files which the function returns.
31 | #
32 | # Potential edge cases exist for very small vcf files; these should not
33 | # occur in practice.
34 | #
35 | 
36 | 
37 | 
38 | def split_vcf(input_file, split_positions):
39 |     """ Split a vcf file by input_positions
40 | 
41 |     """
42 |     header = ''
43 |     output_vcfs = []
44 |     file_id = 0
45 |     line_count = 1
46 |     # get split file positions 0 and 1
47 |     # for a 1 core setup these will be
48 |     # the start and the end of the file
49 |     # and so the file will not change
50 |     i = 0
51 |     pos1 = split_positions[i]
52 |     output_vcf = open(os.path.basename(input_file)+str(file_id), 'w')
53 |     output_vcfs.append(os.path.basename(input_file)+str(file_id))
54 |     with open(input_file, 'r') as vcf:
55 |         for line in vcf:
56 |             if re.match("^#", line) is not None:
57 |                 header += line
58 |             else:
59 |                 output_vcf.write(header)
60 |                 output_vcf.write(line)
61 |                 break
62 |         for line in vcf:
63 |             if(line_count < pos1):
64 |                 output_vcf.write(line)
65 |             else:
66 |                 i = i + 1
67 |                 pos1 = split_positions[i]
68 |                 file_id += 1
69 |                 out_name = os.path.basename(input_file) + str(file_id)
70 |                 output_vcfs.append(out_name)
71 |                 output_vcf = open(out_name, 'w')
72 |                 output_vcf.write(header)
73 |                 output_vcf.write(line)
74 |             line_count += 1
75 |     return(output_vcfs)
76 | 
77 | 
78 | def get_vcf_line_count(input_file):
79 |     """ Return the line count of a vcf file
80 | 
81 |     """
82 |     with open(input_file, 'r') as vcf:
83 |         line_count = 0
84 |         for line in vcf:
85 |             if re.match("^#", line) is not None:
86 |                 line_count = 1
87 |             else:
88 |                 break
89 |         for line in vcf:
90 |             line_count += 1
91 |     return(line_count)
92 | 
93 | 
94 | def __is_script__(fpath):
95 |     """ Return true if the path is a file
96 | 
97 |     """
98 |     return os.path.isfile(fpath)
99 | 
100 | 
101 | def __is_exe__(fpath):
102 |     """ Return true if the path is a file and the executable bit is set
103 | 
104 |     """
105 |     return os.path.isfile(fpath) and os.access(fpath, os.X_OK)
106 | 
107 | 
108 | def which(program, program_name):
109 |     """ Checks whether the file
exists on the path or the system path 110 | 111 | """ 112 | fpath, fname = os.path.split(program) 113 | if fpath: 114 | if __is_exe__(program): 115 | return program 116 | elif (__is_script__(program)): 117 | return program 118 | else: 119 | for path in os.environ["PATH"].split(os.pathsep): 120 | path = path.strip('"') 121 | exe_file = os.path.join(path, program) 122 | if __is_exe__(exe_file): 123 | return exe_file 124 | logger.error(program_name + " path = " + fpath + 125 | " not locatable path or in the directory specified \ 126 | in your config file ") 127 | return None 128 | 129 | 130 | def run_subprocess( 131 | command, tool, stdout=None, 132 | stderr=None, stdoutlog=False, 133 | working_dir=None,with_queue=False, stdin=None): 134 | """ Runs a command on the system shell and forks a new process 135 | 136 | also creates a file for stderr and stdout if needed 137 | to avoid deadlock. 138 | """ 139 | # Very dirty hack 140 | logger.info(tool + ' command = ' + ' '.join(command)) 141 | if (working_dir is None): 142 | working_dir = '.' 143 | if(tool == 'selection_pipeline'): 144 | stderr = working_dir+'/selection_stderr.tmp' 145 | stdout = working_dir+ '/selection_stdout.tmp' 146 | if(stderr is None): 147 | stderr = 'stderr.tmp' 148 | standard_err = open(stderr, 'w') 149 | else: 150 | standard_err = open(stderr, 'w') 151 | if(stdin is None): 152 | standard_in = None 153 | else: 154 | standard_in = open(working_dir + "/" + stdin, 'r') 155 | try: 156 | if(stdout is None): 157 | standard_out = open('stdout.tmp', 'w') 158 | exit_code = subprocess.Popen( 159 | command, stdout=standard_out, stderr=standard_err,cwd=working_dir, stdin=standard_in) 160 | else: 161 | # find out what kind of exception to try here 162 | if(hasattr(stdout, 'read')): 163 | exit_code = subprocess.Popen( 164 | command, stdout=stdout, stderr=standard_err,cwd=working_dir, stdin=standard_in) 165 | else: 166 | stdout = open(stdout, 'w') 167 | exit_code = subprocess.Popen( 168 | command, stdout=stdout, stderr=standard_err,cwd=working_dir, stdin=standard_in) 169 | standard_out = stdout 170 | except: 171 | logger.error(tool + " failed to run " + ' '.join(command)) 172 | standard_err = open(stderr, 'r') 173 | while True: 174 | line = standard_err.readline() 175 | if not line: 176 | break 177 | logger.info(tool + " STDERR: " + line.strip()) 178 | standard_err.close() 179 | sys.exit(SUBPROCESS_FAILED_EXIT) 180 | try: 181 | while(exit_code.poll() is None): 182 | sleep(0.2) 183 | if(STOP == True): 184 | exit_code.send_signal(signal.SIGINT) 185 | if (with_queue) : 186 | return 187 | else: 188 | sys.exit(SUBPROCESS_FAILED_EXIT) 189 | except (KeyboardInterrupt, SystemExit): 190 | exit_code.send_signal(signal.SIGINT) 191 | global STOP 192 | STOP = True 193 | if( with_queue) : 194 | return 195 | else: 196 | sys.exit(SUBPROCESS_FAILED_EXIT) 197 | standard_err.close() 198 | standard_out.close() 199 | standard_err = open(stderr, 'r') 200 | if(exit_code.returncode != 0): 201 | logger.error(tool + " failed to run " + ' '.join(command)) 202 | while True: 203 | line = standard_err.readline() 204 | if not line: 205 | break 206 | logger.info(tool + " STDERR: " + line.strip()) 207 | sys.exit(SUBPROCESS_FAILED_EXIT) 208 | stdout_log = False 209 | if(stdout is None): 210 | standard_out = open('stdout.tmp', 'r') 211 | stdout_log = True 212 | elif(stdoutlog): 213 | if(hasattr(stdout, 'write')): 214 | standard_out = open(stdout.name, 'r') 215 | else: 216 | standard_out = open(stdout, 'r') 217 | stdout_log = True 218 | if(stdout_log): 219 | while True: 220 
| line = standard_out.readline()
221 |             if not line:
222 |                 break
223 |             logger.info(tool + " STDOUT: " + line.strip())
224 |         standard_out.close()
225 |     while True:
226 |         line = standard_err.readline()
227 |         if not line:
228 |             break
229 |         logger.info(tool + " STDERR: " + line.strip())
230 |     logger.info("Finished tool " + tool)
231 |     logger.debug("command = " + ' '.join(command))
232 |     standard_err.close()
233 |     standard_out.close()
234 |     # Remove stdout if it either was not specified
235 |     # or logging of stdout was requested.
236 |     if(stdout is None or stdout == 'selection_stdout.tmp'):
237 |         os.remove('stdout.tmp')
238 |     elif(stdoutlog):
239 |         os.remove(standard_out.name)
240 |     os.remove(stderr)
241 | 
242 | 
243 | def __queue_worker__(q, tool_name):
244 |     while True:
245 |         queue_item = q.get()
246 |         try:
247 |             cmd = queue_item[0]
248 |             stdout = queue_item[1]
249 |             stdoutlog = queue_item[2]
250 |             stderr = queue_item[3]
251 |             folder_names = queue_item[4]
252 |         except IndexError:
253 |             cmd = queue_item[0]
254 |             stdout = queue_item[1]
255 |             stdoutlog = False
256 |             stderr = None
257 |             folder_names = '.'
258 |         try:
259 |             run_subprocess(
260 |                 cmd, tool_name, stdout=stdout,
261 |                 stdoutlog=stdoutlog, stderr=stderr, working_dir=folder_names, with_queue=True)
262 |         except SystemExit:
263 |             global STOP
264 |             STOP = True
265 |             logger.error(tool_name + ": Failed to run in thread")
266 |         q.task_done()
267 | 
268 | def queue_jobs(commands, tool_name, threads, stdouts=None, folder_names=None):
269 |     """ Creates a queue for running jobs
270 | 
271 |     Uses a synchronised queue to spawn jobs equal
272 |     to the number of cores specified by the user.
273 |     The method blocks until all tasks are complete
274 |     """
275 |     q = Queue.Queue()
276 |     thread_L = []
277 |     for i in range(int(threads)):
278 |         t = Thread(target=__queue_worker__, args=[q, tool_name])
279 |         t.daemon = True
280 |         thread_L.append(t)
281 |         t.start()
282 |     for i, cmd in enumerate(commands):
283 |         stderr = 'stderr' + str(i) + '.tmp'
284 |         if(folder_names is None):
285 |             folder_name = '.'
286 |         else:
287 |             folder_name = folder_names[i]
288 |         if (stdouts is not None):
289 |             q.put([cmd, stdouts[i], False, stderr, folder_name])
290 |         else:
291 |             stdout = 'stdout' + str(i) + '.tmp'
292 |             q.put([cmd, stdout, True, stderr, folder_name])
293 |     q.join()
294 |     if STOP:
295 |         sys.exit(SUBPROCESS_FAILED_EXIT)
296 | 
297 | # clean_folder expects a list containing
298 | # files to keep from that folder;
299 | # only required if the user
300 | # runs the analysis from their root directory
301 | 
302 | 
303 | def clean_folder(folder, keep=None):
304 |     """ Cleans the working directory
305 | 
306 |     Takes as a parameter a list of the files
307 |     to not delete.
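    Example (illustrative file names): clean_folder('.', keep=['input.vcf',
    'defaults.cfg']) unlinks every regular file in the current directory
    except the two named files.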

# Cleans a folder, keeping only a supplied list of files.
# Only required when the user runs the analysis from a
# directory that contains files unrelated to the run.


def clean_folder(folder, keep=None):
    """ Cleans the working directory.

        Takes as a parameter a list of the files
        to not delete.
    """
    for the_file in os.listdir(folder):
        the_file = os.path.basename(the_file)
        file_path = os.path.join(folder, the_file)
        if keep is not None:
            if file_path in [os.path.join(folder, x) for x in keep]:
                continue
        try:
            if os.path.isfile(file_path):
                os.unlink(file_path)
        except OSError as e:
            logger.error(e)


def gunzip_file(input_file, output_file=None):
    """ Gunzips the target file and returns the output file name. """
    if output_file is None:
        output_file = input_file.split(".gz")[0]
    # Read and write in binary mode so the copy is byte-exact.
    with open(output_file, 'wb') as out:
        with gzip.open(input_file, 'rb') as gz:
            for line in gz:
                out.write(line)
    return output_file
--------------------------------------------------------------------------------
/selection_pipeline/tests/CEU_test.ids:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MerrimanLab/selectionTools/761ace7cf43639b890c5071c200761ff8a6fb05f/selection_pipeline/tests/CEU_test.ids
--------------------------------------------------------------------------------
/selection_pipeline/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MerrimanLab/selectionTools/761ace7cf43639b890c5071c200761ff8a6fb05f/selection_pipeline/tests/__init__.py
--------------------------------------------------------------------------------
/selection_pipeline/tests/ancestor.fa:
--------------------------------------------------------------------------------
>ANCESTOR_2_FA
GCCG
--------------------------------------------------------------------------------
/selection_pipeline/tests/ancestor.fa.flat:
--------------------------------------------------------------------------------
GCCG
--------------------------------------------------------------------------------
/selection_pipeline/tests/ancestor.fa.gdx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MerrimanLab/selectionTools/761ace7cf43639b890c5071c200761ff8a6fb05f/selection_pipeline/tests/ancestor.fa.gdx
--------------------------------------------------------------------------------
/selection_pipeline/tests/defaults.cfg:
--------------------------------------------------------------------------------
#
# Defaults config file for VCF process
#
# If the executables are on your path,
# just the executable name is required.
#
# ?
is the willcard flag for the prefix options 8 | 9 | 10 | 11 | [system] 12 | cores_avaliable = 1 13 | # Library settings do not change, the library folder are appended to the path when runnig the program# 14 | [environment] 15 | LD_LIBRARY_PATH=/Users/smilefreak/Programming/Selection/selectionTools/lib 16 | PERL5LIB=/Users/smilefreak/Programming/Selection/selectionTools/lib/perl5 17 | [selection_pipeline] 18 | selection_pipeline_executable = selection_pipeline 19 | [vcftools] 20 | vcf_tools_executable = /Users/smilefreak/Programming/Selection/selectionTools/bin/vcftools 21 | vcf_subset_executable = /Users/smilefreak/Programming/Selection/selectionTools/bin/vcf-subset 22 | vcf_merge_executable = /Users/smilefreak/Programming/Selection/selectionTools/bin/vcf-merge 23 | vcf_concat_executable = /Users/smilefreak/Programming/Selection/selectionTools/bin/vcf-concat 24 | extra_args= 25 | [genetic_map] 26 | genetic_map_dir= /Users/smilefreak/Programming/Selection/selectionTools/referencefiles/genetic_maps 27 | genetic_map_prefix=genetic_map_chr?_combined_b37.txt 28 | [shapeit] 29 | shapeit_executable= /Users/smilefreak/Programming/Selection/selectionTools/bin/shapeit 30 | extra_args = 31 | [impute2] 32 | impute_executable = /Users/smilefreak/Programming/Selection/selectionTools/bin/impute2 33 | impute_map_dir= /Users/smilefreak/Programming/Selection/selectionTools/referencefiles/impute_ref 34 | impute_reference_dir= /Users/smilefreak/Programming/Selection/selectionTools/referencefiles/impute_ref 35 | impute_map_prefix=genetic_map_chr?_combined_b37.txt 36 | impute_reference_prefix=ALL_1000G_phase1integrated_v3_chr?_impute 37 | extra_args = 38 | [plink] 39 | plink_executable =/Users/smilefreak/Programming/Selection/selectionTools/bin/plink 40 | extra_args = 41 | [Rscript] 42 | rscript_executable = Rscript 43 | indel_filter = /Users/smilefreak/Programming/Selection/selectionTools/corescripts/haps_indel_and_maf_filter.R 44 | generate_rsb = /Users/smilefreak/Programming/Selection/selectionTools/corescripts/generate_rsb.R 45 | extra_args= 46 | [haps_scripts] 47 | haps_to_hapmap_script= haps_to_hapmap 48 | haps_filter_script = haps_filters 49 | haps_interpolate_script = haps_interpolate 50 | [ancestral_allele] 51 | split_by_chromosome = True 52 | # not used unless split_by_chromosome is set to False 53 | ancestral_fasta_header_regex = 54 | # not used unless split_by_chromosome is set to False 55 | ancestral_fasta_file = 56 | ancestral_allele_script= ancestral_annotation 57 | 58 | ancestral_fasta_dir=/Users/smilefreak/Programming/Selection/selectionTools/referencefiles/ancestral_ref/ 59 | ancestral_prefix=human_ancestor_?.fa 60 | [qctool] 61 | qctool_executable=/Users/smilefreak/Programming/Selection/selectionTools/bin/qctool 62 | 63 | [multicore_ihh] 64 | multicore_ihh = /Users/smilefreak/Programming/Selection/selectionTools/corescripts/multicore_iHH.R 65 | [variscan] 66 | variscan_executable = /Users/smilefreak/Programming/Selection/selectionTools/bin/variscan 67 | [java] 68 | java_executable = /usr/bin/java 69 | [beagle] 70 | beagle_jar = /Users/smilefreak/Programming/Selection/selectionTools/bin/beagle.jar 71 | vm_size = 4g 72 | 73 | 74 | -------------------------------------------------------------------------------- /selection_pipeline/tests/filter.haps: -------------------------------------------------------------------------------- 1 | --- rs147096179 130000004 C T 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ? ? ? ? ? ? ? ? ? ? 2 | 2 rs1251176 130000040 G C 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 1 0 3 | --- rs138462475 130000109 G T 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 4 | 2 rs4662641 130000272 G A 1 1 0 1 0 1 0 0 0 1 1 1 1 0 1 1 0 1 0 1 0 0 0 0 0 1 0 1 0 0 1 1 0 1 1 0 0 1 1 1 1 1 0 1 1 1 0 1 0 1 0 0 0 1 1 1 0 1 0 0 0 0 0 1 0 1 1 1 0 1 0 1 0 0 0 1 0 0 0 1 1 1 0 1 0 1 0 1 0 1 0 0 0 1 0 1 1 1 1 1 0 1 0 0 1 1 0 0 0 1 1 1 0 1 0 0 0 1 0 1 0 1 0 0 0 1 1 1 1 1 0 0 0 1 0 1 1 0 0 0 0 1 1 1 1 1 0 0 0 1 1 1 1 1 0 1 1 1 0 1 0 0 1 1 0 0 0 0 0 0 5 | -------------------------------------------------------------------------------- /selection_pipeline/tests/one_line.haps: -------------------------------------------------------------------------------- 1 | --- rs147096179 130000004 C T 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ? ? ? ? ? ? ? ? ? ? 
2 |
--------------------------------------------------------------------------------
/selection_pipeline/tests/test_selection_pipeline.py:
--------------------------------------------------------------------------------
from selection_pipeline.haps_filters import (
    hardy_weinberg_asymptotic, filter_haps_file, stats)
import doctest
import unittest
import selection_pipeline
import os
from selection_pipeline.selection_pipeline import parse_config
from selection_pipeline.run_pipeline import CommandTemplate
from selection_pipeline.aa_annotate import aa_seq, write_sample_file, \
    get_haps_line, aa_check

import vcf

suite = doctest.DocTestSuite(selection_pipeline)


def get_file(fname):
    return os.path.join(os.path.dirname(__file__), fname)


class Args:
    """ Represents command line arguments. """


class TestHapsFilter(unittest.TestCase):

    def test_hardy_exact(self):
        # Exercises the same asymptotic routine as the test below;
        # only sanity-check that a valid probability comes back.
        p_value = hardy_weinberg_asymptotic(138, 1469, 5)
        assert 0.0 <= p_value <= 1.0

    def test_hardy_asymptotic(self):
        p_value = hardy_weinberg_asymptotic(138, 1469, 5)
        # Compare with a tolerance rather than exact float equality.
        self.assertAlmostEqual(p_value, 0.34263933319103679)

    def test_filter_hap_file(self):
        args = Args()
        args.haps = get_file('filter.haps')
        args.output = get_file('output.haps')
        args.maf = 0.05
        args.missing = 0.05
        args.hwe = 0.05
        args.chi_square = True
        filter_haps_file(args)
        with open(args.output) as f:
            lines = sum(1 for line in f)
        assert lines == 1
        with open(args.output) as f:
            line = f.readline()
        line = line.split()
        assert line[0] == '2'
        assert line[1] == 'rs4662641'
        assert line[2] == '130000272'
        os.remove(args.output)

    def test_remove_triallelic(self):
        args = Args()
        args.haps = get_file('triallelic_haps.haps')
        args.output = get_file('output.haps')
        args.maf = 0.05
        args.missing = 0.80
        args.hwe = 0.05
        args.chi_square = True
        filter_haps_file(args)
        with open(args.output) as f:
            lines = sum(1 for line in f)
        assert lines == 1
        os.remove(args.output)


class TestRunPipeline(unittest.TestCase):

    def setUp(self):
        self.options = Args()
        self.options.config_file = get_file('defaults.cfg')
        self.config = parse_config(self.options)
        self.options.output_prefix = 'CEU'
        self.options.chromosome = '5'
        self.options.vcf_input = 'testcase.vcf'
        self.options.population = 'CEU'
        self.template = CommandTemplate(self.options, self.config)

    #def test_run_impute2(self):
    #    (cmd_template, prefix) = self.template.run_impute2('test.haps')
    #    assert prefix == \
    #        self.options.output_prefix + self.options.chromosome +\
    #        '_impute2'
    #    assert cmd_template[0] == '/home/smilefreak/selectionTools/bin/impute2'
    #    assert len(cmd_template) == 10

    def test_remove_indels_vcf(self):
        (cmd, output_name) = self.template.run_remove_indels_from_vcf()
        assert output_name == 'testcase.recode.vcf'
        assert len(cmd) == 7


class TestAncestralAnnotation(unittest.TestCase):

    def __verify_sequence__(self, aaSeq):
        assert ''.join(aaSeq) == 'GCCG'

    def test_aa_seq_single_chromosome(self):
        options = Args()
        options.ancestralfasta = get_file('ancestor.fa')
        options.single_chromosome = True
        aaSeq = aa_seq(options)
        self.__verify_sequence__(aaSeq)

    def test_aa_seq_header_regex(self):
        options = Args()
        options.ancestralfasta = get_file('ancestor.fa')
        options.header = 'ANCESTOR_?_FA'
        options.chromosome = '2'
        options.single_chromosome = False
        aaSeq = aa_seq(options)
        self.__verify_sequence__(aaSeq)

    def test_write_sample_file(self):
        options = Args()
        options.sample_file = get_file('test_sample.sample')
        vcf_reader = vcf.Reader(filename=get_file('CEU_test.vcf'))
        write_sample_file(options, vcf_reader)
        sample_names = []
        with open(options.sample_file) as f:
            # The first two rows of a .sample file are headers.
            for i, sample in enumerate(f):
                if i < 2:
                    continue
                sample_names.append(sample.split()[0])
        for s1, s2 in zip(sorted(sample_names), sorted(vcf_reader.samples)):
            assert s1 == s2
        os.remove(options.sample_file)

    def test_get_haps_line(self):
        options = Args()
        vcf_reader = vcf.Reader(filename=get_file("CEU_test.vcf"))
        record = next(vcf_reader)
        line = get_haps_line(options, record)
        line = line.split()
        assert line[0] == "rs147096179"
        assert line[2] == "130000004"
        assert line[3] == 'C'
        assert line[4] == 'T'

    def test_aa_check(self):
        realAA = 'G'
        ref = 'C'
        alt = 'T'
        format = "lower"
        line = '2 rs1000 1 C T 0 1'
        # Ancestral allele matches neither ref nor alt.
        new_line = aa_check(realAA, ref, alt, format, line)
        for item in new_line.split()[5:]:
            assert item == '1'
        # Ancestral allele == ref: the line is unchanged.
        realAA = 'C'
        new_line = aa_check(realAA, ref, alt, format, line)
        assert new_line == '2 rs1000 1 C T 0 1'
        # Ancestral allele == alt: ref/alt and the alleles are swapped.
        realAA = 'T'
        new_line = aa_check(realAA, ref, alt, format, line)
        assert new_line == '2 rs1000 1 T C 1 0'


suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestHapsFilter))
suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestRunPipeline))
suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestAncestralAnnotation))
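
# A minimal sketch for running the assembled suite directly; the runner
# choice here is illustrative, while `python setup.py test` instead uses
# the test_suite entry declared in setup.py.
if __name__ == '__main__':
    unittest.TextTestRunner(verbosity=2).run(suite)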
--------------------------------------------------------------------------------
/selection_pipeline/tests/triallelic_haps.haps:
--------------------------------------------------------------------------------
--- rs1980000 100 C T 0 0 ? 2 2 2
--- rs1981000 101 C T 0 0 ? 1 1 1
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
from setuptools import setup
from selection_pipeline._version import __version__

setup(
    name="selectionTools",
    version=__version__,
    packages=['selection_pipeline', 'selection_pipeline.tests'],
    test_suite='selection_pipeline.tests.test_selection_pipeline',
    author="James Boocock",
    author_email="james.boocock@otago.ac.nz",
    description="Selection Pipeline for VCF Data",
    license="MIT",
    keywords="iHS ehh selection evolution",
    zip_safe=False,
    entry_points={
        'console_scripts': [
            'ancestral_annotation = selection_pipeline.aa_annotate:main',
            'selection_pipeline = selection_pipeline.selection_pipeline:main',
            'multipop_selection_pipeline = selection_pipeline.multipipeline:main',
            'haps_to_hapmap = selection_pipeline.haps_to_hapmap:main',
            'haps_filters = selection_pipeline.haps_filters:main',
            'haps_interpolate = selection_pipeline.haps_interpolate:main'
        ]
    },
    url="https://github.com/smilefreak/MerrimanSelectionPipeline",
    use_2to3=True,
    include_package_data=True,
    package_data={
        '': ['*.haps', '*.cfg', '*.vcf', '*.sample', '*.ped',
             '*.map', '*.fa', '*.ids']
    }
)
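Each console_scripts entry in setup.py above maps an executable name onto a module-level main function, so after installation running the haps_filters executable is, in effect, equivalent to the following (a sketch of the wrapper setuptools generates, not pipeline source):

    import sys
    from selection_pipeline.haps_filters import main
    sys.exit(main())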
--------------------------------------------------------------------------------
/src/.gitignore:
--------------------------------------------------------------------------------
rehh
--------------------------------------------------------------------------------
/src/PopGenome_2.0.7.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MerrimanLab/selectionTools/761ace7cf43639b890c5071c200761ff8a6fb05f/src/PopGenome_2.0.7.tar.gz
--------------------------------------------------------------------------------
/src/R_dependencies.R:
--------------------------------------------------------------------------------
# Installs the R package named by the first command-line argument.
args = commandArgs(trailingOnly=TRUE)
package = args[1]
print(package)
install.packages(package, repos='http://cran.stat.auckland.ac.nz')
--------------------------------------------------------------------------------
/src/beagle.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MerrimanLab/selectionTools/761ace7cf43639b890c5071c200761ff8a6fb05f/src/beagle.jar
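In the defaults.cfg below, ? marks the wildcard in the prefix options (genetic_map_prefix, impute_map_prefix, ancestral_prefix and so on). A plausible substitution step, assuming the pipeline swaps in the chromosome number (the helper name here is hypothetical):

    def resolve_prefix(prefix, chromosome):
        # e.g. 'genetic_map_chr?_combined_b37.txt' -> 'genetic_map_chr22_combined_b37.txt'
        return prefix.replace('?', str(chromosome))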
--------------------------------------------------------------------------------
/src/defaults.cfg:
--------------------------------------------------------------------------------
#
# Defaults config file for VCF process
#
# If the executables are on your path,
# just the executable name is required.
#
# ? is the wildcard flag for the prefix options



[system]
cores_avaliable = 1
# Library settings: do not change. The library folders are appended to the path when running the program.
[environment]
LD_LIBRARY_PATH=!SELECT_PIPELINE!/lib
PERL5LIB=!SELECT_PIPELINE!/lib/perl5
[selection_pipeline]
selection_pipeline_executable = selection_pipeline
[vcftools]
vcf_tools_executable = !SELECT_PIPELINE!/bin/vcftools
vcf_subset_executable = !SELECT_PIPELINE!/bin/vcf-subset
vcf_merge_executable = !SELECT_PIPELINE!/bin/vcf-merge
vcf_concat_executable = !SELECT_PIPELINE!/bin/vcf-concat
extra_args=
[genetic_map]
genetic_map_dir= !SELECT_PIPELINE!/referencefiles/genetic_maps
genetic_map_prefix=genetic_map_chr?_combined_b37.txt
[shapeit]
shapeit_executable= !SELECT_PIPELINE!/bin/shapeit
extra_args =
[impute2]
impute_executable = !SELECT_PIPELINE!/bin/impute2
impute_map_dir= !SELECT_PIPELINE!/referencefiles/impute_ref
impute_reference_dir= !SELECT_PIPELINE!/referencefiles/impute_ref
impute_map_prefix=genetic_map_chr?_combined_b37.txt
impute_reference_prefix=ALL_1000G_phase1integrated_v3_chr?_impute
extra_args =
[plink]
plink_executable =!SELECT_PIPELINE!/bin/plink
extra_args =
[Rscript]
rscript_executable = Rscript
indel_filter = !SELECT_PIPELINE!/corescripts/haps_indel_and_maf_filter.R
generate_rsb = !SELECT_PIPELINE!/corescripts/generate_rsb.R
extra_args=
[haps_scripts]
haps_to_hapmap_script= haps_to_hapmap
haps_filter_script = haps_filters
haps_interpolate_script = haps_interpolate
[ancestral_allele]
split_by_chromosome = True
# not used unless split_by_chromosome is set to False
ancestral_fasta_header_regex =
# not used unless split_by_chromosome is set to False
ancestral_fasta_file =
ancestral_allele_script= ancestral_annotation

ancestral_fasta_dir=!SELECT_PIPELINE!/referencefiles/ancestral_ref/
ancestral_prefix=human_ancestor_?.fa
[qctool]
qctool_executable=!SELECT_PIPELINE!/bin/qctool

[multicore_ihh]
multicore_ihh = !SELECT_PIPELINE!/corescripts/multicore_iHH.R
[variscan]
variscan_executable = !SELECT_PIPELINE!/bin/variscan
[java]
java_executable = /usr/bin/java
[beagle]
beagle_jar = !SELECT_PIPELINE!/bin/beagle.jar
vm_size = 4g

[vcflib]
vcflib_vcfsnps = !SELECT_PIPELINE!/bin/vcfsnps
--------------------------------------------------------------------------------
/src/getopt_1.20.0.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MerrimanLab/selectionTools/761ace7cf43639b890c5071c200761ff8a6fb05f/src/getopt_1.20.0.tar.gz
--------------------------------------------------------------------------------
/src/impute_v2.3.1_MacOSX_Intel.tgz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MerrimanLab/selectionTools/761ace7cf43639b890c5071c200761ff8a6fb05f/src/impute_v2.3.1_MacOSX_Intel.tgz
--------------------------------------------------------------------------------
/src/impute_v2.3.1_x86_64_static.tgz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MerrimanLab/selectionTools/761ace7cf43639b890c5071c200761ff8a6fb05f/src/impute_v2.3.1_x86_64_static.tgz
--------------------------------------------------------------------------------
/src/multicore_0.1-7.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MerrimanLab/selectionTools/761ace7cf43639b890c5071c200761ff8a6fb05f/src/multicore_0.1-7.tar.gz
--------------------------------------------------------------------------------
/src/plink-1.07-mac-intel.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MerrimanLab/selectionTools/761ace7cf43639b890c5071c200761ff8a6fb05f/src/plink-1.07-mac-intel.zip
--------------------------------------------------------------------------------
/src/plink-1.07-x86_64.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MerrimanLab/selectionTools/761ace7cf43639b890c5071c200761ff8a6fb05f/src/plink-1.07-x86_64.zip
--------------------------------------------------------------------------------
/src/qctool_v1.4-linux-x86_64.tgz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MerrimanLab/selectionTools/761ace7cf43639b890c5071c200761ff8a6fb05f/src/qctool_v1.4-linux-x86_64.tgz
--------------------------------------------------------------------------------
/src/qctool_v1.4-osx.tgz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MerrimanLab/selectionTools/761ace7cf43639b890c5071c200761ff8a6fb05f/src/qctool_v1.4-osx.tgz
--------------------------------------------------------------------------------
/src/qctool_v1.4-scientific-linux-x86_64.tgz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MerrimanLab/selectionTools/761ace7cf43639b890c5071c200761ff8a6fb05f/src/qctool_v1.4-scientific-linux-x86_64.tgz
--------------------------------------------------------------------------------
/src/rehh_1.11.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MerrimanLab/selectionTools/761ace7cf43639b890c5071c200761ff8a6fb05f/src/rehh_1.11.tar.gz
--------------------------------------------------------------------------------
/src/tabix.tar.bz2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MerrimanLab/selectionTools/761ace7cf43639b890c5071c200761ff8a6fb05f/src/tabix.tar.bz2
--------------------------------------------------------------------------------
/src/variscan-2.0.3.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MerrimanLab/selectionTools/761ace7cf43639b890c5071c200761ff8a6fb05f/src/variscan-2.0.3.tar.gz
--------------------------------------------------------------------------------
/src/vcflib.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MerrimanLab/selectionTools/761ace7cf43639b890c5071c200761ff8a6fb05f/src/vcflib.zip
--------------------------------------------------------------------------------
/src/vcftools.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MerrimanLab/selectionTools/761ace7cf43639b890c5071c200761ff8a6fb05f/src/vcftools.tar.gz
--------------------------------------------------------------------------------
/src/zlib-1.2.8.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MerrimanLab/selectionTools/761ace7cf43639b890c5071c200761ff8a6fb05f/src/zlib-1.2.8.tar.gz
--------------------------------------------------------------------------------