├── AddChromosomeNumber.sh
├── DropDuplicatedPositions.sh
├── DropDuplicatedSNPs.sh
├── ExtractAncestryOutliers.sh
├── FilterByInfoAll.sh
├── GenewiseManhattanPlotinR.R
├── Get_Covariates.R
├── ID_Build.py
├── IdHets.R
├── IndividualIBD.R
├── Iterative_Missingness.sh
├── LICENSE
├── MakeChunks.sh
├── MakeKeepIDs.sh
├── Make_glist.sh
├── ManhattanPlotinR.R
├── ManhattanPlotinRBumblebee.R
├── Manhattan_Plot_For_DTP.R
├── Master_imputation_script_posterior_sampled_haps.sh
├── MergeImputedChunks.sh
├── Modified_submit_impute2_jobs_to_cluster.R
├── PC-VS-OUTCOME_IN_R_FULL.R
├── PC-VS-OUTCOME_IN_R_SHORT.R
├── PC_Plot_1KG.R
├── PC_Plot_1KG_Greyed.R
├── PlotPCs.R
├── Prototype_imputation_job_posterior_sampled_haps.sh
├── QQPlot_For_DTP.R
├── QQPlotinR.R
├── QQPlotinR_Alternate.R
├── README.md
├── Relabel_rs.sh
├── ReplaceDots.sh
├── highLDregions4bim_b37.awk
├── highLDregions4bim_b38.awk
├── manhattan_DOG_TRY.R
├── manhattan_v2.R
├── manhattan_v2_bumblebee.R
└── qq_plot_v7.R

--------------------------------------------------------------------------------
/AddChromosomeNumber.sh:
--------------------------------------------------------------------------------
# Prefix each row of the per-chromosome IMPUTE2 output with its chromosome
# number, replacing the original first field.
results=$1
for i in {1..22}
do
    # FIX: the original wrote "awk -v i=$i results=$results '...'", which makes
    # awk treat "results=$results" as the program text and the quoted program
    # as a filename.  The awk variable was never used, so it is dropped.
    awk -v i=$i '{s=""; for (j=2; j <= NF; j++) s=s $j " "; print i, s}' $results/Chr$i.impute2 > New_Chromosome$i.impute2
done
# NOTE(review): only the X-chromosome file skips its first row (NR==1) —
# presumably a header line unique to the X output; confirm against the data.
awk 'NR==1 {print; next}{ s = ""; for (j = 2; j <= NF; j++) s = s $j " "; print "X", s }' $results/ChrX.impute2 > New_ChromosomeX.impute2

--------------------------------------------------------------------------------
/DropDuplicatedPositions.sh:
--------------------------------------------------------------------------------
source Config.conf

# List variant IDs that occur more than once after the rs-only renaming.
# FIX: the original used a Unicode en-dash ("uniq –d"), which uniq would treat
# as a filename; it must be the ASCII option -d to print duplicated lines.
awk '{print $2}' $root.post_imputation_final_rs_only.bim | \
sort | \
uniq -d > More_Duplicates_Removed

# Drop the duplicated variants from the binary fileset in place.
./plink2 \
--bfile $root.post_imputation_final \
--exclude More_Duplicates_Removed \
--make-bed \
--out $root.post_imputation_final

# Strip the ":pos" suffix from rs IDs, then mirror .bed/.fam alongside the
# rewritten .bim so the *_rs_only fileset is complete.
awk 'BEGIN {OFS = "\t"} $2 ~ /^rs/{gsub(":.*", "", $2) }1' $root.post_imputation_final.bim > $root.post_imputation_final_rs_only.bim
cp $root.post_imputation_final.bed $root.post_imputation_final_rs_only.bed
cp $root.post_imputation_final.fam $root.post_imputation_final_rs_only.fam

--------------------------------------------------------------------------------
/DropDuplicatedSNPs.sh:
--------------------------------------------------------------------------------
source Config.conf

# Annotate every variant with a chr:pos key, list the keys that occur more
# than once, and emit the variant IDs sitting at those duplicated positions.
awk '{print $0, $1":"$4}' $root.post_imputation_updated.bim > $root.post_imputation_updated_positions
awk '{print $1":"$4}' $root.post_imputation_updated.bim | sort | uniq -d > $root.post_imputation_updated_duplicated_positions
grep -w -f $root.post_imputation_updated_duplicated_positions $root.post_imputation_updated_positions | awk '{print $2}' > $root.post_imputation_updated_duplicated_IDs

--------------------------------------------------------------------------------
/ExtractAncestryOutliers.sh:
--------------------------------------------------------------------------------
source Config.conf

# Pull the sample IDs smartpca flagged as REMOVED outliers; the FID:IID key is
# split back into two whitespace-separated fields for PLINK.
awk '/REMOVED/ {print $3}' $root.pop_strat_outliers_smartpca.log | sed 's/:/ /g' > $root.pop_strat_outliers.outliers

--------------------------------------------------------------------------------
/FilterByInfoAll.sh:
--------------------------------------------------------------------------------
source Config.conf

# Keep variants with imputation INFO >= 0.8 (column 5 of the info file),
# then subset each chromosome's genotype file to those variant IDs.
gunzip $root.whole_genome.impute2_info.gz
awk '$5 >= 0.8' $root.whole_genome.impute2_info > $root.whole_genome_filtered.impute2_info
gzip $root.whole_genome.impute2_info
for i in {1..22}
do
    # First pass collects passing IDs ($2); second pass keeps matching rows.
    # NOTE(review): "Chromsome" is a typo in the output names, kept as-is in
    # case downstream steps expect these exact filenames.
    awk 'FNR==NR { a[$2]; next } $2 in a' $root.whole_genome_filtered.impute2_info New_Chromosome$i.impute2 > Filtered_Chromsome$i.impute2
done
awk 'FNR==NR { a[$2]; next } $2 in a' $root.whole_genome_filtered.impute2_info New_ChromosomeX.impute2 > Filtered_ChromsomeX.impute2
-------------------------------------------------------------------------------- /GenewiseManhattanPlotinR.R: -------------------------------------------------------------------------------- 1 | source("manhattan_v2_bumblebee.R") 2 | args <- commandArgs(TRUE) 3 | 4 | data <- args[1] 5 | chr <- args[2] 6 | bp <- args[3] 7 | p <- args[4] 8 | out <- args[5] 9 | gws <- as.numeric(args[6]) 10 | 11 | gwas1 <- read.table(data,head=T) 12 | data_to_plot <- data.frame(CHR=gwas1[,chr], BP=gwas1[,bp], P=gwas1[,p]) 13 | 14 | grey_zone <- gws*0.0001 / 0.00000005 15 | 16 | pdf(out,width=8,height=6) 17 | manhattan(data_to_plot, GWthresh=-log10(gws), GreyZoneThresh=-log10(grey_zone), DrawGWline=TRUE) 18 | dev.off() 19 | -------------------------------------------------------------------------------- /Get_Covariates.R: -------------------------------------------------------------------------------- 1 | args <- commandArgs(TRUE) 2 | root <- args[1] 3 | covar<- args[2] 4 | PCAEVEC<-read.table(paste(root,".dataname_pop_strat_includes.pca.evec",sep=""),head=T) 5 | colnames(PCAEVEC)<-c("IID","FID","PC1", "PC2", "PC3", "PC4", "PC5", "PC6", "PC7", "PC8", "PC9", "PC10", "PC11", "PC12", "PC13", "PC14", "PC15", "PC16", "PC17", "PC18", "PC19", "PC20", "PC21", "PC22", "PC23", "PC24", "PC25", "PC26", "PC27", "PC28", "PC29", "PC30", "PC31", "PC32", "PC33", "PC34", "PC35", "PC36", "PC37", "PC38", "PC39", "PC40", "PC41", "PC42", "PC43", "PC44", "PC45", "PC46", "PC47", "PC48", "PC49", "PC50", "PC51", "PC52", "PC53", "PC54", "PC55", "PC56", "PC57", "PC58", "PC59", "PC60", "PC61", "PC62", "PC63", "PC64", "PC65", "PC66", "PC67", "PC68", "PC69", "PC70", "PC71", "PC72", "PC73", "PC74", "PC75", "PC76", "PC77", "PC78", "PC79", "PC80", "PC81", "PC82", "PC83", "PC84", "PC85", "PC86", "PC87", "PC88", "PC89", "PC90", "PC91", "PC92", "PC93", "PC94", "PC95", "PC96", "PC97", "PC98", "PC99", "PC100") 6 | COVARIATES<-read.table(covar,head=T) 7 | COVAR_WITH_PCs<-merge(PCAEVEC, COVARIATES) 8 | 
write.table(COVAR_WITH_PCs, file=paste(root,".covariates_file.txt",sep=""), quote=F, row.names=F, col.names=T) 9 | -------------------------------------------------------------------------------- /ID_Build.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python -tt 2 | 3 | ##Basic build identifier 4 | ##Run as ID_Build.py PLINK_CHROMSOME_6_FILE.bim 5 | 6 | import sys 7 | import numpy as np 8 | import pandas as pd 9 | 10 | # Define a main() function 11 | def main(): 12 | if len(sys.argv) >= 2: 13 | with open(sys.argv[1]) as bim_6_file: 14 | Ref_hg18_dict = {'24979336' : 'C', 15 | '32537182' : 'T', 16 | '32647698' : 'A', 17 | '24871357' : 'X', 18 | '32429204' : 'X', 19 | '32539620' : 'X', 20 | '24871129' : 'X', 21 | '32461427' : 'X', 22 | '32571843' : 'X', 23 | '168465326' : 'T', 24 | '134105573' : 'T', 25 | '116083117' : 'T', 26 | '17839694' : 'T', 27 | '11943869' : 'G', 28 | '105624138' : 'T', 29 | '130013753' : 'G', 30 | '168776041' : 'T', 31 | '168722477' : 'X', 32 | '134063880' : 'X', 33 | '115976424' : 'X', 34 | '17731715' : 'X', 35 | '11834883' : 'X', 36 | '105517445' : 'X', 37 | '129972060' : 'X', 38 | '169034116' : 'X', 39 | '168321797' : 'X', 40 | '133742742' : 'X', 41 | '115655260' : 'X', 42 | '17731484' : 'X', 43 | '11834650' : 'X', 44 | '105069570' : 'X', 45 | '129650915' : 'X', 46 | '168633436' : 'X'} 47 | Ref_hg19_dict = {'24979336' : 'X', 48 | '32537182' : 'X', 49 | '32647698' : 'A', 50 | '24871357' : 'C', 51 | '32429204' : 'T', 52 | '32539620' : 'A', 53 | '24871129' : 'X', 54 | '32461427' : 'X', 55 | '32571843' : 'X', 56 | '168465326' : 'X', 57 | '134105573' : 'X', 58 | '116083117' : 'A', 59 | '17839694' : 'X', 60 | '11943869' : 'X', 61 | '105624138' : 'X', 62 | '130013753' : 'X', 63 | '168776041' : 'X', 64 | '168722477' : 'T', 65 | '134063880' : 'T', 66 | '115976424' : 'T', 67 | '17731715' : 'T', 68 | '11834883' : 'G', 69 | '105517445' : 'T', 70 | '129972060' : 'G', 71 | '169034116' : 'T', 72 | 
'168321797' : 'X', 73 | '133742742' : 'X', 74 | '115655260' : 'X', 75 | '17731484' : 'X', 76 | '11834650' : 'X', 77 | '105069570' : 'X', 78 | '129650915' : 'X', 79 | '168633436' : 'X'} 80 | Ref_hg38_dict = {'24979336' : 'X', 81 | '32537182' : 'X', 82 | '32647698' : 'X', 83 | '24871357' : 'X', 84 | '32429204' : 'X', 85 | '32539620' : 'X', 86 | '24871129' : 'C', 87 | '32461427' : 'T', 88 | '32571843' : 'A', 89 | '168465326' : 'X', 90 | '134105573' : 'X', 91 | '116083117' : 'X', 92 | '17839694' : 'X', 93 | '11943869' : 'X', 94 | '105624138' : 'X', 95 | '130013753' : 'X', 96 | '168776041' : 'X', 97 | '168722477' : 'X', 98 | '134063880' : 'X', 99 | '115976424' : 'X', 100 | '17731715' : 'X', 101 | '11834883' : 'X', 102 | '105517445' : 'X', 103 | '129972060' : 'X', 104 | '169034116' : 'X', 105 | '168321797' : 'T', 106 | '133742742' : 'T', 107 | '115655260' : 'T', 108 | '17731484' : 'T', 109 | '11834650' : 'G', 110 | '105069570' : 'T', 111 | '129650915' : 'G', 112 | '168633436' : 'T'} 113 | bim_6_df = pd.read_table(bim_6_file, delim_whitespace=True, header=None, prefix='V', index_col=0, usecols=[3,4]) 114 | if np.sum(bim_6_df.index.isin([24979336])) == 1: 115 | bim_6_dict = {'24979336' : bim_6_df.loc[24979336,'V4'].upper()} 116 | else: 117 | bim_6_dict = {'24979336' : 'X'} 118 | if np.sum(bim_6_df.index.isin([32537182])) == 1: 119 | bim_6_dict['32537182'] = bim_6_df.loc[32537182,'V4'].upper() 120 | else: 121 | bim_6_dict['32537182'] = 'X' 122 | if np.sum(bim_6_df.index.isin([32647698])) == 1: 123 | bim_6_dict['32647698'] = bim_6_df.loc[32647698, 'V4'].upper() 124 | else: 125 | bim_6_dict['32647698'] = 'X' 126 | if np.sum(bim_6_df.index.isin([24871357])) == 1: 127 | bim_6_dict['24871357'] = bim_6_df.loc[24871357, 'V4'].upper() 128 | else: 129 | bim_6_dict['24871357'] = 'X' 130 | if np.sum(bim_6_df.index.isin([32429204])) == 1: 131 | bim_6_dict['32429204'] = bim_6_df.loc[32429204, 'V4'].upper() 132 | else: 133 | bim_6_dict['32429204'] = 'X' 134 | if 
np.sum(bim_6_df.index.isin([32539620])) == 1: 135 | bim_6_dict['32539620'] = bim_6_df.loc[32539620, 'V4'].upper() 136 | else: 137 | bim_6_dict['32539620'] = 'X' 138 | if np.sum(bim_6_df.index.isin([24871129])) == 1: 139 | bim_6_dict['24871129'] = bim_6_df.loc[24871129, 'V4'].upper() 140 | else: 141 | bim_6_dict['24871129'] = 'X' 142 | if np.sum(bim_6_df.index.isin([32461427])) == 1: 143 | bim_6_dict['32461427'] = bim_6_df.loc[32461427, 'V4'].upper() 144 | else: 145 | bim_6_dict['32461427'] = 'X' 146 | if np.sum(bim_6_df.index.isin([32571843])) == 1: 147 | bim_6_dict['32571843'] = bim_6_df.loc[32571843, 'V4'].upper() 148 | else: 149 | bim_6_dict['32571843'] = 'X' 150 | if np.sum(bim_6_df.index.isin([168465326])) == 1: 151 | bim_6_dict['168465326'] = bim_6_df.loc[168465326, 'V4'].upper() 152 | else: 153 | bim_6_dict['168465326'] = 'X' 154 | if np.sum(bim_6_df.index.isin([134105573])) == 1: 155 | bim_6_dict['134105573'] = bim_6_df.loc[134105573, 'V4'].upper() 156 | else: 157 | bim_6_dict['134105573'] = 'X' 158 | if np.sum(bim_6_df.index.isin([116083117])) == 1: 159 | bim_6_dict['116083117'] = bim_6_df.loc[116083117, 'V4'].upper() 160 | else: 161 | bim_6_dict['116083117'] = 'X' 162 | if np.sum(bim_6_df.index.isin([17839694])) == 1: 163 | bim_6_dict['17839694'] = bim_6_df.loc[17839694, 'V4'].upper() 164 | else: 165 | bim_6_dict['17839694'] = 'X' 166 | if np.sum(bim_6_df.index.isin([11943869])) == 1: 167 | bim_6_dict['11943869'] = bim_6_df.loc[11943869, 'V4'].upper() 168 | else: 169 | bim_6_dict['11943869'] = 'X' 170 | if np.sum(bim_6_df.index.isin([105624138])) == 1: 171 | bim_6_dict['105624138'] = bim_6_df.loc[105624138, 'V4'].upper() 172 | else: 173 | bim_6_dict['105624138'] = 'X' 174 | if np.sum(bim_6_df.index.isin([130013753])) == 1: 175 | bim_6_dict['130013753'] = bim_6_df.loc[130013753, 'V4'].upper() 176 | else: 177 | bim_6_dict['130013753'] = 'X' 178 | if np.sum(bim_6_df.index.isin([168776041])) == 1: 179 | bim_6_dict['168776041'] = bim_6_df.loc[168776041, 
'V4'].upper() 180 | else: 181 | bim_6_dict['168776041'] = 'X' 182 | if np.sum(bim_6_df.index.isin([168722477])) == 1: 183 | bim_6_dict['168722477'] = bim_6_df.loc[168722477, 'V4'].upper() 184 | else: 185 | bim_6_dict['168722477'] = 'X' 186 | if np.sum(bim_6_df.index.isin([134063880])) == 1: 187 | bim_6_dict['134063880'] = bim_6_df.loc[134063880, 'V4'].upper() 188 | else: 189 | bim_6_dict['134063880'] = 'X' 190 | if np.sum(bim_6_df.index.isin([115976424])) == 1: 191 | bim_6_dict['115976424'] = bim_6_df.loc[115976424, 'V4'].upper() 192 | else: 193 | bim_6_dict['115976424'] = 'X' 194 | if np.sum(bim_6_df.index.isin([17731715])) == 1: 195 | bim_6_dict['17731715'] = bim_6_df.loc[17731715, 'V4'].upper() 196 | else: 197 | bim_6_dict['17731715'] = 'X' 198 | if np.sum(bim_6_df.index.isin([11834883])) == 1: 199 | bim_6_dict['11834883'] = bim_6_df.loc[11834883, 'V4'].upper() 200 | else: 201 | bim_6_dict['11834883'] = 'X' 202 | if np.sum(bim_6_df.index.isin([105517445])) == 1: 203 | bim_6_dict['105517445'] = bim_6_df.loc[105517445, 'V4'].upper() 204 | else: 205 | bim_6_dict['105517445'] = 'X' 206 | if np.sum(bim_6_df.index.isin([129972060])) == 1: 207 | bim_6_dict['129972060'] = bim_6_df.loc[129972060, 'V4'].upper() 208 | else: 209 | bim_6_dict['129972060'] = 'X' 210 | if np.sum(bim_6_df.index.isin([169034116])) == 1: 211 | bim_6_dict['169034116'] = bim_6_df.loc[169034116, 'V4'].upper() 212 | else: 213 | bim_6_dict['169034116'] = 'X' 214 | if np.sum(bim_6_df.index.isin([168321797])) == 1: 215 | bim_6_dict['168321797'] = bim_6_df.loc[168321797, 'V4'].upper() 216 | else: 217 | bim_6_dict['168321797'] = 'X' 218 | if np.sum(bim_6_df.index.isin([133742742])) == 1: 219 | bim_6_dict['133742742'] = bim_6_df.loc[133742742, 'V4'].upper() 220 | else: 221 | bim_6_dict['133742742'] = 'X' 222 | if np.sum(bim_6_df.index.isin([115655260])) == 1: 223 | bim_6_dict['115655260'] = bim_6_df.loc[115655260, 'V4'].upper() 224 | else: 225 | bim_6_dict['115655260'] = 'X' 226 | if 
np.sum(bim_6_df.index.isin([17731484])) == 1: 227 | bim_6_dict['17731484'] = bim_6_df.loc[17731484, 'V4'].upper() 228 | else: 229 | bim_6_dict['17731484'] = 'X' 230 | if np.sum(bim_6_df.index.isin([11834650])) == 1: 231 | bim_6_dict['11834650'] = bim_6_df.loc[11834650, 'V4'].upper() 232 | else: 233 | bim_6_dict['11834650'] = 'X' 234 | if np.sum(bim_6_df.index.isin([105069570])) == 1: 235 | bim_6_dict['105069570'] = bim_6_df.loc[105069570, 'V4'].upper() 236 | else: 237 | bim_6_dict['105069570'] = 'X' 238 | if np.sum(bim_6_df.index.isin([129650915])) == 1: 239 | bim_6_dict['129650915'] = bim_6_df.loc[129650915, 'V4'].upper() 240 | else: 241 | bim_6_dict['129650915'] = 'X' 242 | if np.sum(bim_6_df.index.isin([168633436])) == 1: 243 | bim_6_dict['168633436'] = bim_6_df.loc[168633436, 'V4'].upper() 244 | else: 245 | bim_6_dict['168633436'] = 'X' 246 | hg_18_score = 0 247 | hg_19_score = 0 248 | hg_38_score = 0 249 | Test = ''.join('{}'.format(val) for key, val in sorted(bim_6_dict.items())) 250 | hg_18_ref = ''.join('{}'.format(val) for key, val in sorted(Ref_hg18_dict.items())) 251 | hg_19_ref = ''.join('{}'.format(val) for key, val in sorted(Ref_hg19_dict.items())) 252 | hg_38_ref = ''.join('{}'.format(val) for key, val in sorted(Ref_hg38_dict.items())) 253 | for i in range(len(Test)): 254 | if Test[i] == 'X': 255 | continue 256 | if Test[i] != 'X': 257 | if hg_18_ref[i] == Test[i]: 258 | hg_18_score += 1 259 | if hg_19_ref[i] == Test[i]: 260 | hg_19_score += 1 261 | if hg_38_ref[i] == Test[i]: 262 | hg_38_score += 1 263 | hg_18_length = 0 264 | hg_19_length = 0 265 | hg_38_length = 0 266 | for i in range(len(hg_18_ref)): 267 | if hg_18_ref[i] == 'X': 268 | continue 269 | if hg_18_ref[i] != 'X': 270 | hg_18_length += 1 271 | for i in range(len(hg_19_ref)): 272 | if hg_19_ref[i] == 'X': 273 | continue 274 | if hg_19_ref[i] != 'X': 275 | hg_19_length += 1 276 | for i in range(len(hg_38_ref)): 277 | if hg_38_ref[i] == 'X': 278 | continue 279 | if hg_38_ref[i] != 'X': 280 
| hg_38_length += 1 281 | print '\n' 282 | if hg_18_score > hg_19_score: 283 | if hg_18_score > hg_38_score: 284 | print 'Probable build is hg18' 285 | if hg_18_score < hg_38_score: 286 | print 'Probable build is hg38' 287 | if hg_18_score == hg_38_score: 288 | print 'Cannot determine build' 289 | if hg_18_score < hg_19_score: 290 | if hg_19_score > hg_38_score: 291 | print 'Probable build is hg19' 292 | if hg_19_score < hg_38_score: 293 | print 'Probable build is hg38' 294 | if hg_19_score == hg_38_score: 295 | print 'Cannot determine build' 296 | if hg_18_score == hg_19_score: 297 | if hg_18_score < hg_38_score: 298 | print 'Probable build is hg38' 299 | if hg_18_score >= hg_38_score: 300 | print 'Cannot determine build' 301 | print '\nhg18 reference:', hg_18_ref 302 | print 'Test:', Test 303 | print 'Match:', hg_18_score, 'out of', hg_18_length 304 | print '\nhg19 reference:', hg_19_ref 305 | print 'Test:', Test 306 | print 'Match:', hg_19_score, 'out of', hg_19_length 307 | print '\nhg38 reference:', hg_38_ref 308 | print 'Test:', Test 309 | print 'Match:', hg_38_score, 'out of', hg_38_length 310 | else: 311 | print 'Give me a PLINK .bim file please. Chromosome 6, not pruned for preference' 312 | 313 | # This is the standard boilerplate that calls the main() function. 
314 | if __name__ == '__main__': 315 | main() 316 | -------------------------------------------------------------------------------- /IdHets.R: -------------------------------------------------------------------------------- 1 | data_dir <- getwd() 2 | setwd(data_dir) 3 | args <- commandArgs(TRUE) 4 | root <- args[1] 5 | # default 6 | ibc_file=paste(root,".ibc",sep="") 7 | sdcut=3 # number of sds at which to impose cut offs 8 | # get args 9 | t=commandArgs() 10 | if (charmatch("-args",t,nomatch=-1)>=0) args = t[((1:length(t))[t=="-args"]+1):length(t)] else args="" 11 | if (charmatch("ibc_file=",args,nomatch=-1)>=0) ibc_file = strsplit(args[charmatch("ibc_file=",args)],split="=")[[1]][2] 12 | if (charmatch("sdcut=",args,nomatch=-1)>=0) sdcut = strsplit(args[charmatch("sdcut=",args)],split="=")[[1]][2] 13 | ## 14 | d <- read.table(ibc_file,head=T); 15 | het_outliers_3sd <- abs(scale(d$Fhat2))>3 16 | write.table(d[het_outliers_3sd,],file=paste(root,".LD_het_outliers.txt",sep=""), sep="\t",quote=F,row.names=F); 17 | write.table(d[het_outliers_3sd,c(1,2)],file=paste(root,".LD_het_outliers_sample_exclude",sep=""), sep="\t",quote=F,row.names=F,col.names=F); 18 | -------------------------------------------------------------------------------- /IndividualIBD.R: -------------------------------------------------------------------------------- 1 | args <- commandArgs(TRUE) 2 | root <- args[1] 3 | sigma <- as.numeric(args[2]) 4 | U<-read.table(paste(root,".IBD.genome",sep=""),head=T) # read in table 5 | V_ONE<-with(U, data.frame(FID1, IID1, PI_HAT)) # get variables of interest - reference individual 6 | V_TWO<-with(U, data.frame(FID2, IID2, PI_HAT)) # get variables of interest - test individual 7 | names(V_TWO)<-c("FID1", "IID1", "PI_HAT") 8 | V<-as.data.frame(rbind(V_ONE, V_TWO)) 9 | names(V)<-c("FID1","IID1","PI_HAT") 10 | W<- aggregate(V$PI_HAT,FUN=mean,by=list(V$FID1, V$IID1)) #calculate average pi hat 11 | names(W)<-c("FID","IID","MEAN_PI_HAT") #rename columns 12 | 
X<-mean(W$MEAN_PI_HAT) #calculate mean of average pi hats 13 | Y<-sd(W$MEAN_PI_HAT) #calculate standard deviation of average pi hats 14 | Z<-X+(sigma*Y) # calculate threshold, here 6 SDs from mean 15 | sink(paste(root,".IBD_INDIV.txt",sep="")) 16 | W #print average pi hats 17 | sink(paste(root,".IBD_INDIV_outliers.txt",sep="")) 18 | subset(W,W$MEAN_PI_HAT>=Z)[,1:2] #print outliers 19 | -------------------------------------------------------------------------------- /Iterative_Missingness.sh: -------------------------------------------------------------------------------- 1 | source ./Config.conf 2 | 3 | aspercent=$(echo $1 " / 100" | bc -l) 4 | genomind_1=$(echo "1-"$aspercent | bc -l) 5 | 6 | $plink \ 7 | --bfile $root.common \ 8 | --geno $genomind_1 \ 9 | --make-bed \ 10 | --out $root.common_SNP$1 11 | 12 | #Remove samples with completeness < 90% 13 | 14 | $plink \ 15 | --bfile $root.common_SNP$1 \ 16 | --mind $genomind_1 \ 17 | --make-bed \ 18 | --out $root.common_sample$1.SNP$1 19 | 20 | newstep=$(($1+$3)) 21 | 22 | for i in $(seq $newstep $3 $2) 23 | 24 | do 25 | 26 | aspercent=$(echo $i " / 100" | bc -l) 27 | genomind=$(echo "1-"$aspercent | bc -l) 28 | prefix=$(($i-$3)) 29 | 30 | $plink \ 31 | --bfile $root.common_sample$prefix.SNP$prefix \ 32 | --geno $genomind \ 33 | --make-bed \ 34 | --out $root.common_sample$prefix.SNP$i 35 | 36 | $plink \ 37 | --bfile $root.common_sample$prefix.SNP$i \ 38 | --mind $genomind \ 39 | --make-bed \ 40 | --out $root.common_sample$i.SNP$i 41 | 42 | done 43 | 44 | $plink \ 45 | --bfile $root.common_sample$2.SNP$2 \ 46 | --make-bed \ 47 | --out $root.filtered 48 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Joni Coleman 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files 
(the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/MakeChunks.sh:
--------------------------------------------------------------------------------
source Config.conf

# Bin variants into 5Mb chunks per chromosome, then write per-chromosome
# chunk-boundary files for the imputation jobs.
# NOTE(review): the first awk program is truncated in this dump (the body of
# the "for (j=0; j ..." loop is missing); reproduced as found — recover the
# complete line from the original repository before running.
awk -v const=5000000 -v max=51 '{a[$1,int($4/const)]++; b[$1]} END{for (i in b) {for (j=0; j Chunks
awk '$4 != "" {print $2, $3 > "analysis_chunks_5Mb_chr"$1".txt"} ' < Chunks
awk '$4 != "" {print > "Chunks_chr"$1".txt"}' < Chunks

--------------------------------------------------------------------------------
/MakeKeepIDs.sh:
--------------------------------------------------------------------------------
# Collect sample IDs from each HapMap population's chr22 genotype header.
# FIX: the original looped over "{CEU, CHB, JPT, YRI}" — brace expansion with
# spaces is taken literally, so $pop was "{CEU,", "CHB," etc. — and used
# "{pop}" (literal) instead of "${pop}" inside the filenames.
for pop in CEU CHB JPT YRI
do
    # Header row -> one ID per line; drop the first five non-ID fields.
    head -1 genotypes_chr22_${pop}_r28_nr.b36_fwd.txt | sed 's/ /\n/g' | sort | sed '1,5d' > keepIDs${pop}
done
cat keepIDs* > keepids.txt

--------------------------------------------------------------------------------
/Make_glist.sh:
--------------------------------------------------------------------------------
# Build a gene list (chrom, min-start, max-end, gene) from $1, keeping
# chromosomes 1-22 and X and dropping *_g pseudo-entries; write to $2.
awk '{
if (!($4 in min)) {
min[$4]=$2; max[$4]=$3; chrom[$4]=$1
} else {
if ($2 < min[$4]) min[$4]=$2
if ($3 > max[$4]) max[$4]=$3
}
}
END {
for (name2 in min)
print chrom[name2], min[name2], max[name2], name2}' $1 | \
sort -k 1 -n | \
sed '1d' | \
awk '$1 >= 1 && $1 <= 22 || $1 =="X" {print $0}' | \
grep -v _g > $2

--------------------------------------------------------------------------------
/ManhattanPlotinR.R:
--------------------------------------------------------------------------------
# Standard Manhattan plot of the post-imputation association results.
source("manhattan_v2.R")
args <- commandArgs(TRUE)
root <- args[1]
gwas1 <- read.table(paste(root, ".post_imputation_final_analysis_FOR_MP", sep=""), head=T)
data_to_plot <- data.frame(CHR=gwas1$CHR, BP=gwas1$BP, P=gwas1$P)
pdf(paste(root, ".post_imputation_final_analysis_MP.pdf", sep=""), width=8, height=6)
manhattan(data_to_plot, GWthresh=-log10(5e-8), GreyZoneThresh=-log10(1e-4), DrawGWline=TRUE)
dev.off()

--------------------------------------------------------------------------------
/ManhattanPlotinRBumblebee.R:
--------------------------------------------------------------------------------
# As ManhattanPlotinR.R but with the "bumblebee" colour scheme.
source("manhattan_v2_bumblebee.R")
args <- commandArgs(TRUE)
root <- args[1]
gwas1 <- read.table(paste(root, ".post_imputation_final_analysis_FOR_MP", sep=""), head=T)
data_to_plot <- data.frame(CHR=gwas1$CHR, BP=gwas1$BP, P=gwas1$P)
pdf(paste(root, ".post_imputation_final_analysis_MP.pdf", sep=""), width=8, height=6)
manhattan(data_to_plot, GWthresh=-log10(5e-8), GreyZoneThresh=-log10(1e-4), DrawGWline=TRUE)
dev.off()

--------------------------------------------------------------------------------
/Manhattan_Plot_For_DTP.R:
--------------------------------------------------------------------------------
# Manhattan plot of an arbitrary results file given on the command line.
source("manhattan_v2.R")
args <- commandArgs(TRUE)
root <- args[1]
gwas1 <- read.table(root, head=T)
data_to_plot <- data.frame(CHR=gwas1$CHR, BP=gwas1$BP, P=gwas1$P)
pdf(paste(root, ".MP.pdf", sep=""), width=8, height=6)
manhattan(data_to_plot, GWthresh=-log10(5e-8), GreyZoneThresh=-log10(1e-4), DrawGWline=TRUE)
dev.off()

--------------------------------------------------------------------------------
/Master_imputation_script_posterior_sampled_haps.sh:
--------------------------------------------------------------------------------
#!/bin/bash
#$ -cwd

# Generate one qsub command per imputation chunk for every autosome.
for chrom in {1..22}
do
impute2_examples/modified_submit_impute2_jobs_to_cluster.R chr=${chrom} post.avg.impute.run=TRUE
done
sed -i '{s/ qsub/\n\qsub/g}' impute2_examples/qsublist.sh
sed -i '1d' impute2_examples/qsublist.sh

# 45 jobs per batch; batches are chained so the cluster is not flooded.
split -l 45 impute2_examples/qsublist.sh impute2_examples/qsub

# FIX: SGE qsub options take a single dash (-N, -hold_jid); the original
# "--N" / "--hold_jid" would be rejected by qsub.
sed -i '0,/qsub/{s/qsub/qsub -N job1/}' impute2_examples/qsubaa
sed -i '0,/qsub/{s/qsub/qsub -N job2/}' impute2_examples/qsubab
sed -i '0,/qsub/{s/qsub/qsub -N job3/}' impute2_examples/qsubac
sed -i '0,/qsub/{s/qsub/qsub -N job4/}' impute2_examples/qsubad
sed -i '0,/qsub/{s/qsub/qsub -N job5/}' impute2_examples/qsubae
sed -i '0,/qsub/{s/qsub/qsub -N job6/}' impute2_examples/qsubaf ##etc., for as many jobs as are needed

sed -i '{s/qsub/qsub -hold_jid job1/g}' impute2_examples/qsubab
sed -i '{s/qsub/qsub -hold_jid job2/g}' impute2_examples/qsubac
sed -i '{s/qsub/qsub -hold_jid job3/g}' impute2_examples/qsubad
sed -i '{s/qsub/qsub -hold_jid job4/g}' impute2_examples/qsubae
sed -i '{s/qsub/qsub -hold_jid job5/g}' impute2_examples/qsubaf ##etc. for as many jobs as are needed

sh impute2_examples/qsubaa
sh impute2_examples/qsubab
sh impute2_examples/qsubac
sh impute2_examples/qsubad
sh impute2_examples/qsubae
sh impute2_examples/qsubaf

--------------------------------------------------------------------------------
/MergeImputedChunks.sh:
--------------------------------------------------------------------------------
# Concatenate per-chunk IMPUTE2 outputs into one file per chromosome.
# NOTE(review): "mv gwas_data_chr$i* Chr$i" assumes Chr$i already exists in the
# working directory (only results-directory/Chr$i is created here), and the
# chr1*/chr2* globs will also match chr10-19/chr20-22 unless the filenames
# have a delimiter after the chromosome number — verify before running.
for i in {1..22}
do
mkdir results-directory/Chr$i
mv gwas_data_chr$i* Chr$i
cat Chr$i/*.impute2 > Chr$i/Chr$i.impute2
cat Chr$i/*.impute2_info > Chr$i/Chr$i.impute2_info
mv Chr$i/Chr$i.impute2* results-directory/
done
mkdir results-directory/ChrX
mv gwas_data_chrX* ChrX
cat ChrX/*.impute2 > ChrX/ChrX.impute2
cat ChrX/*.impute2_info > ChrX/ChrX.impute2_info
mv ChrX/ChrX.impute2* results-directory/

--------------------------------------------------------------------------------
/Modified_submit_impute2_jobs_to_cluster.R:
--------------------------------------------------------------------------------
#!/usr/bin/Rscript --vanilla

root.dir <- "."
data.dir <- paste(root.dir, "Data/", sep="")

# default settings; can change on command line
chr <- 10
chunk.size <- 5 # chunk size in Mb

# exactly one of the following must be set to TRUE on the command line
phasing.run <- FALSE            # is this a phasing run?
best.guess.impute.run <- FALSE  # imputation using best-guess haplotypes?
post.avg.impute.run <- FALSE    # imputation using posterior-sampled haplotypes?

## process command-line arguments of the form key=value
args <- strsplit(commandArgs(TRUE), split='=')
keys <- vector("character")

if (length(args) > 0) {
  for (i in 1:length(args)) {
    key <- args[[i]][1]
    value <- args[[i]][2]
    keys <- c(keys, key)

    if (exists(key)) {
      # replace default value of key with input value
      assign(key, value)
    }
    else {
      cat("\n")
      stop(paste("Unrecognized option [", key, "].\n\n", sep=""))
    }
  }
}

# housekeeping: command-line values arrive as strings
phasing.run <- as.logical(phasing.run)
best.guess.impute.run <- as.logical(best.guess.impute.run)
post.avg.impute.run <- as.logical(post.avg.impute.run)

# exit the script if it is not clear what type of IMPUTE2 job we want to run
stopifnot(phasing.run + best.guess.impute.run + post.avg.impute.run == 1)


# read in file with chunk boundary definitions
chunk.file <- paste(data.dir, "analysis_chunks_", chunk.size, "Mb_chr", chr, ".txt", sep="")
chunks <- read.table(chunk.file, head=T, as.is=T)

# append one qsub command per analysis chunk on this chromosome to qsublist.sh
sink(paste(root.dir, "qsublist.sh", sep=""), append=T)
for (i in 1:nrow(chunks)) {
  system.call <- paste(" qsub ",
                       ifelse(phasing.run, "./prototype_phasing_job.sh ", ""),
                       ifelse(best.guess.impute.run, "./prototype_imputation_job_best_guess_haps.sh ", ""),
                       ifelse(post.avg.impute.run, "/root/to/impute2_examples/prototype_imputation_job_posterior_sampled_haps.sh ", ""),
                       chr, " ", chunks[i,1], " ", chunks[i,2],
                       sep="")
  cat(system.call)
}
# FIX: the sink was never closed in the original, leaving output buffered
# until process exit.
sink()

--------------------------------------------------------------------------------
/PC-VS-OUTCOME_IN_R_FULL.R:
--------------------------------------------------------------------------------
# Full lm() summaries of the phenotype against cumulatively added PCs (1..100).
args <- commandArgs(TRUE)
root <- args[1]
pheno <- args[2]
PCAEVEC <- read.table(paste(root, ".pca.evec", sep=""), head=T)
# IDs, 100 PCs, then the smartpca phenotype column (names generated, not typed).
colnames(PCAEVEC) <- c("FID", "IID", paste0("PC", 1:100), "Pheno")
PHENOTYPE <- read.table(pheno, head=T)
PCAPHENO <- merge(PCAEVEC, PHENOTYPE)
sink(paste(root, ".PC_Output_Associations_FULL.txt", sep=""))
for (i in 1:100) {
  # columns 3..(i+2) are PC1..PCi; column 104 is the merged phenotype
  DATA <- as.data.frame(PCAPHENO[, c(3:(i+2))])
  print(summary(lm(PCAPHENO[, 104] ~ ., data=DATA)))
}
sink()

--------------------------------------------------------------------------------
/PC-VS-OUTCOME_IN_R_SHORT.R:
--------------------------------------------------------------------------------
# One-line-per-PC summary: p-value of the newly added PC and its incremental
# r-squared over the model with one fewer PC.
args <- commandArgs(TRUE)
root <- args[1]
pheno <- args[2]
PCAEVEC <- read.table(paste(root, ".pca.evec", sep=""), head=T)
colnames(PCAEVEC) <- c("FID", "IID", paste0("PC", 1:100), "Pheno")
PHENOTYPE <- read.table(pheno, head=T)
PCAPHENO <- merge(PCAEVEC, PHENOTYPE)
sink(paste(root, ".PC_Output_Associations_SHORT.txt", sep=""))
writeLines(c(" PC P R-squared"))
options(scipen=999)
# PERF: each model is now fitted once per iteration (the original refitted the
# same lm up to three times); prev_r2 carries the (i-1)-PC model's r-squared.
DATA <- as.data.frame(PCAPHENO[, 3])
fit <- summary(lm(PCAPHENO[, 104] ~ DATA[, 1], data=DATA))
print(c(1, fit$coefficients[2, 4], fit$r.squared))
prev_r2 <- fit$r.squared
for (i in 2:100) {
  DATA <- as.data.frame(PCAPHENO[, c(3:(i+2))])
  fit <- summary(lm(PCAPHENO[, 104] ~ ., data=DATA))
  print(c(i, fit$coefficients[(i+1), 4], fit$r.squared - prev_r2))
  prev_r2 <- fit$r.squared
}
sink()

--------------------------------------------------------------------------------
/PC_Plot_1KG.R:
--------------------------------------------------------------------------------
# Pairwise scatter plots of the first five 1KG-projected PCs, coloured by
# population, one page per pair.
args <- commandArgs(TRUE)
root <- args[1]
PCAEVEC <- read.table(paste(root, ".1kg.LD_pop_strat.pca.evec_RENAMED", sep=""), head=T)
colnames(PCAEVEC) <- c("ID","PC1","PC2","PC3","PC4","PC5","PC6","PC7","PC8","PC9","PC10","Pop")
library(ggplot2)
pdf(paste(root, "1kg.LD_pop_strat_PCA.pdf", sep=""))
# print() is required for ggplot objects when the script is source()d;
# top-level autoprint only happens under Rscript.
print(with(PCAEVEC, qplot(PC1, PC2, colour=Pop)))
print(with(PCAEVEC, qplot(PC1, PC3, colour=Pop)))
print(with(PCAEVEC, qplot(PC1, PC4, colour=Pop)))
print(with(PCAEVEC, qplot(PC1, PC5, colour=Pop)))
print(with(PCAEVEC, qplot(PC2, PC3, colour=Pop)))
print(with(PCAEVEC, qplot(PC2, PC4, colour=Pop)))
print(with(PCAEVEC, qplot(PC2, PC5, colour=Pop)))
print(with(PCAEVEC, qplot(PC3, PC4, colour=Pop)))
print(with(PCAEVEC, qplot(PC3, PC5, colour=Pop)))
print(with(PCAEVEC, qplot(PC4, PC5, colour=Pop)))
dev.off()
-------------------------------------------------------------------------------- /PC_Plot_1KG_Greyed.R: -------------------------------------------------------------------------------- 1 | ###Author: JRIC 2 | ###Date: 2018-07-13 3 | ###Purpose: Plot user data projected on 1KG PCs, greying out 1KG individuals 4 | 5 | ##Load packages 6 | library(ggplot2) 7 | library(colorspace) 8 | 9 | ##Initialise command line arguments 10 | args <- commandArgs(TRUE) 11 | 12 | ##Load data 13 | root <- args[1] 14 | PCAEVEC<-read.table(paste(root,".1kg.LD_pop_strat.pca.evec_RENAMED",sep=""), head=T) 15 | 16 | ##Rename columns 17 | colnames(PCAEVEC) <- c("ID","PC1","PC2","PC3","PC4","PC5","PC6","PC7","PC8","PC9","PC10","Pop") 18 | 19 | ##Define colour palette 20 | ThousandGenomesPalette<-heat_hcl(length(unique(PCAEVEC$Pop)), h = c(300, 75), c. = c(35, 95), l = c(15, 90), power = c(0.8, 1.2), fixup = TRUE, gamma = NULL, alpha = 1) 21 | names(ThousandGenomesPalette)<-unique(PCAEVEC$Pop) 22 | 23 | ThousandGenomesPops<-c("LWK","MXL","PUR","TSI","YRI","ASW","CEU","CHB","CHS","CLM","FIN","GBR","IBS","JPT") 24 | 25 | ThousandGenomesPalette[names(ThousandGenomesPalette) %in% ThousandGenomesPops] <- "#CCCCCC" 26 | ThousandGenomesPalette[!names(ThousandGenomesPalette) %in% ThousandGenomesPops] <- heat_hcl(length(unique(PCAEVEC$Pop)) - 14, h = c(300, 75), c. 
= c(35, 95), l = c(15, 90), power = c(0.8, 1.2), fixup = TRUE, gamma = NULL, alpha = 1) 27 | 28 | ##Print pairwise comparisons of PC1-5 to pdf 29 | pdf(paste(root,"1kg.LD_pop_strat_PCA.pdf",sep="")) 30 | with(PCAEVEC, qplot(PC1,PC2,colour=Pop) + scale_colour_manual(values = ThousandGenomesPalette)) 31 | with(PCAEVEC, qplot(PC1,PC3,colour=Pop) + scale_colour_manual(values = ThousandGenomesPalette)) 32 | with(PCAEVEC, qplot(PC1,PC4,colour=Pop) + scale_colour_manual(values = ThousandGenomesPalette)) 33 | with(PCAEVEC, qplot(PC1,PC5,colour=Pop) + scale_colour_manual(values = ThousandGenomesPalette)) 34 | with(PCAEVEC, qplot(PC2,PC3,colour=Pop) + scale_colour_manual(values = ThousandGenomesPalette)) 35 | with(PCAEVEC, qplot(PC2,PC4,colour=Pop) + scale_colour_manual(values = ThousandGenomesPalette)) 36 | with(PCAEVEC, qplot(PC2,PC5,colour=Pop) + scale_colour_manual(values = ThousandGenomesPalette)) 37 | with(PCAEVEC, qplot(PC3,PC4,colour=Pop) + scale_colour_manual(values = ThousandGenomesPalette)) 38 | with(PCAEVEC, qplot(PC3,PC5,colour=Pop) + scale_colour_manual(values = ThousandGenomesPalette)) 39 | with(PCAEVEC, qplot(PC4,PC5,colour=Pop) + scale_colour_manual(values = ThousandGenomesPalette)) 40 | dev.off() 41 | -------------------------------------------------------------------------------- /PlotPCs.R: -------------------------------------------------------------------------------- 1 | args <- commandArgs(TRUE) 2 | root <- args[1] 3 | pcx <- as.numeric(args[2]) 4 | pcy <- as.numeric(args[3]) 5 | 6 | PCAEVEC<-read.table(paste(root,".pca.evec",sep=""),head=T) 7 | colnames(PCAEVEC)<-c("FID","IID","PC1", "PC2", "PC3", "PC4", "PC5", "PC6", "PC7", "PC8", "PC9", "PC10", "PC11", "PC12", "PC13", "PC14", "PC15", "PC16", "PC17", "PC18", "PC19", "PC20", "PC21", "PC22", "PC23", "PC24", "PC25", "PC26", "PC27", "PC28", "PC29", "PC30", "PC31", "PC32", "PC33", "PC34", "PC35", "PC36", "PC37", "PC38", "PC39", "PC40", "PC41", "PC42", "PC43", "PC44", "PC45", "PC46", "PC47", "PC48", 
"PC49", "PC50", "PC51", "PC52", "PC53", "PC54", "PC55", "PC56", "PC57", "PC58", "PC59", "PC60", "PC61", "PC62", "PC63", "PC64", "PC65", "PC66", "PC67", "PC68", "PC69", "PC70", "PC71", "PC72", "PC73", "PC74", "PC75", "PC76", "PC77", "PC78", "PC79", "PC80", "PC81", "PC82", "PC83", "PC84", "PC85", "PC86", "PC87", "PC88", "PC89", "PC90", "PC91", "PC92", "PC93", "PC94", "PC95", "PC96", "PC97", "PC98", "PC99", "PC100", "Pheno") 8 | library(ggplot2) 9 | pcx<-2+pcx 10 | pcy<-2+pcy 11 | pdf(paste(root,"_PC",(pcx-2),"_PC",(pcy-2),".pdf",sep="")) 12 | qplot(PCAEVEC[,pcx],PCAEVEC[,pcy], data=PCAEVEC, color=Pheno) + xlab(paste("PC",(pcx-2),sep="")) + ylab(paste("PC",(pcy-2),sep="")) 13 | dev.off() 14 | -------------------------------------------------------------------------------- /Prototype_imputation_job_posterior_sampled_haps.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #$-S /bin/sh 3 | 4 | CHR=$1 5 | CHUNK_START=`printf "%.0f" $2` 6 | CHUNK_END=`printf "%.0f" $3` 7 | 8 | # directories 9 | ROOT_DIR="./" 10 | DATA_DIR=${ROOT_DIR}downloaded_references/ 11 | RESULTS_DIR=${ROOT_DIR}results_directory/ 12 | 13 | # executable 14 | IMPUTE2_EXEC=bin/impute2 15 | 16 | # parameters 17 | NE=20000 18 | iter=30 19 | burnin=10 20 | k=80 21 | k_hap=500 22 | 23 | # reference data files 24 | GENMAP_FILE=${DATA_DIR}genetic_map_chr${CHR}_combined_b37.txt 25 | HAPS_FILE=${DATA_DIR} ALL.chr${CHR}.integrated_phase1_v3.20101123.snps_indels_svs.genotypes.nomono.haplotypes.gz 26 | LEGEND_FILE=${DATA_DIR} ALL.chr${CHR}.integrated_phase1_v3.20101123.snps_indels_svs.genotypes.nomono.legend.gz 27 | STRAND_FILE=${DATA_DIR}dataname_{CHR}.strand 28 | 29 | # GWAS data files 30 | GWAS_GTYPE_FILE=${DATA_DIR}Chr${CHR}.gen 31 | 32 | # main output file 33 | OUTPUT_FILE=${RESULTS_DIR}gwas_data_chr${CHR}.pos${CHUNK_START}-${CHUNK_END}.posterior_sampled_haps_imputation.impute2 34 | 35 | ## impute genotypes from posterior--sampled GWAS haplotypes 36 | 
$IMPUTE2_EXEC \ 37 | -m $GENMAP_FILE \ 38 | -g $GWAS_GTYPE_FILE \ 39 | -strand_g $STRAND_FILE \ 40 | -h $HAPS_FILE \ 41 | -l $LEGEND_FILE \ 42 | -Ne $NE \ 43 | -iter $iter \ 44 | -burnin $burnin \ 45 | -k $k \ 46 | -k_hap $k_hap \ 47 | -int $CHUNK_START $CHUNK_END \ 48 | -allow_large_regions \ 49 | -o $OUTPUT_FILE 50 | -------------------------------------------------------------------------------- /QQPlot_For_DTP.R: -------------------------------------------------------------------------------- 1 | source("qq_plot_v7.R") 2 | args <- commandArgs(TRUE) 3 | root <- args[1] 4 | gwas1<-read.table(paste(root, sep=""),head=T) 5 | x1<-gwas1$P 6 | pdf(paste(root,".QQ.pdf",sep=""),width=8,height=6) 7 | qq.plot(x1, alpha=0.05, datatype="pvalue", scaletype="pvalue", df=1, plot.concentration.band=TRUE, one.sided=FALSE,frac=0.1, print=F, xat=NULL, yat=NULL, main=NULL, xlab=NULL, ylab=NULL, pch="x", cex=0.5, col="black") 8 | text(x=12,y=4,paste("lambda--median=", format((median(qchisq(p=1-x1,df=1)))/0.4549,digits=3),sep="")) 9 | dev.off() 10 | 11 | ###This version of the QQ plot wrapper also writes values of lambda median on the plot, using p--values for all SNPs plotted.### 12 | -------------------------------------------------------------------------------- /QQPlotinR.R: -------------------------------------------------------------------------------- 1 | source("qq_plot_v7.R") 2 | args <- commandArgs(TRUE) 3 | root <- args[1] 4 | gwas1<-read.table(paste(root,".post_imputation_final_analysis_p",sep=""), head=T) 5 | x1<-gwas1$P 6 | pdf(paste(root,".post_imputation_final_analysis_QQ.pdf",sep=""),width=8,height=6) 7 | qq.plot(x1, alpha=0.05, datatype="pvalue", scaletype="pvalue", df=1, plot.concentration.band=TRUE, one.sided=FALSE,frac=0.1, print=F, xat=NULL, yat=NULL, main=NULL, xlab=NULL, ylab=NULL, pch="x", cex=0.5, col="black") 8 | text(x=12,y=4,paste("lambda--median=", format((median(qchisq(p=1-x1,df=1)))/0.4549,digits=3),sep="")) 9 | dev.off() 10 | 11 | ###This version of 
the QQ plot wrapper also writes values of lambda median on the plot, using p--values for all SNPs plotted.### 12 | -------------------------------------------------------------------------------- /QQPlotinR_Alternate.R: -------------------------------------------------------------------------------- 1 | library(data.table) 2 | source("~/Desktop/gwas_scripts/qq_plot_v7.R") 3 | gwas1<-fread("daner_PGC_BIP32b_mds7a_0416a_INFO3_AF1",data.table=F) 4 | x1<-gwas1$P 5 | png("daner_PGC_BIP32b_mds7a_0416a_INFO3_AF1.png",width=2400,height=3000, res=300) 6 | qq.plot(x1, alpha=0.05, datatype="pvalue", scaletype="pvalue", df=1, plot.concentration.band=TRUE, one.sided=FALSE,frac=1, print=F, xat=NULL, yat=NULL, main="QQ Plot", xlab=NULL, ylab=NULL, pch="x", cex=1, col="black", cex.lab=1.5, cex.main=1.5) 7 | text(x=20,y=4,paste(expression(lambda[median]),"=", format((median(qchisq(p=1-x1,df=1)))/0.4549,digits=3),sep=" "), cex=1.5) 8 | dev.off() 9 | 10 | # gwas2<-fread("daner_PGC_BIP32b_mds7a_0416a_INFO6_AF1",data.table=F) 11 | # x2<-gwas2$P 12 | # png("daner_PGC_BIP32b_mds7a_0416a_INFO6_AF1.png",width=4800,height=6000, res=300) 13 | # qq.plot(x2, alpha=0.05, datatype="pvalue", scaletype="pvalue", df=1, plot.concentration.band=TRUE, one.sided=FALSE,frac=0.1, print=F, xat=NULL, yat=NULL, main=NULL, xlab=NULL, ylab=NULL, pch="x", cex=0.5, col="black") 14 | # text(x=12,y=4,paste("lambda-median=", format((median(qchisq(p=1-x1,df=1)))/0.4549,digits=3),sep="")) 15 | # dev.off() 16 | 17 | ###This version of the QQ plot wrapper also writes values of lambda median on the plot, using p--values for all SNPs plotted.### 18 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |  2 | 3 | # gwas_scripts 4 | ###### GWAS codebook (Coleman et al, 2015, Briefings in Functional Genomics), version 1.0.0 5 | ##### Please address questions, comments and improvements to [my google 
group](https://groups.google.com/forum/#!forum/gwas-questions) 6 | ##### If you use the scripts and advice herein, please consider citing our paper, the full text of which is available on the publisher's website: 7 | ##### Quality control, imputation and analysis of genome-wide genotyping data from the Illumina HumanCoreExome microarray. Jonathan R. I. Coleman; Jack Euesden; Hamel Patel; Amos A. Folarin; Stephen Newhouse; Gerome Breen. Briefings in Functional Genomics 2016; [doi:10.1093/bfgp/elv037](http://bfg.oxfordjournals.org/content/15/4/298) 8 | 9 | 10 | The scripts in this repo are referenced in the publication referenced above, which provides a straight-forward guide to the quality control, imputation and analysis of genome-wide genotype data. Scripts can be tested using the toy PLINK dataset kindly provided by Shaun Purcell on the PLINK 1.07 website: [example.zip](https://zzz.bwh.harvard.edu/plink/dist/example.zip). 11 | 12 | 13 | This pipeline is designed to provide a useful resource for using genome-wide data from low-coverage arrays and smaller projects. As projects grow larger and more complex, it may be valuable to consult software creators' websites to seek more sophisticated analysis methods. A brief list of these is provided at the end of this document - I will gladly consider suggested inclusions. 14 | 15 | For the quality control, imputation and analysis of large scale genome-wide genotype data, it is highly recommended to look at Ricopili, the pipeline of the Psychiatric Genomics Consortium, which is currently being deposited in [this repo](https://github.com/Nealelab/ricopili) and is documented [here](https://sites.google.com/a/broadinstitute.org/ricopili/). All credit for Ricopili goes to its creators. 
16 | 17 | Within this protocol, the following software is used: 18 | 19 | • [PLINK](http://zzz.bwh.harvard.edu/plink/) / [PLINK2](https://www.cog-genomics.org/plink2) 20 | 21 | • [R](http://www.r-project.org/) 22 | 23 | • [EIGENSOFT](http://www.hsph.harvard.edu/alkes-price/software/) 24 | 25 | • [IMPUTE](https://mathgen.stats.ox.ac.uk/impute/impute_v2.html) 26 | 27 | 28 | The protocol runs in a UNIX environment, and makes use of some of the basic software of the UNIX operating system. It should run on a Mac, but not in Windows. An exception to this is the GCTA MLMA GWAS analyses described at the end of the protocol - such analyses are only implemented in the Linux version of GCTA. Most sections of this protocol are designed to be usable simply by pasting into the command line – variables are set when each command is run, and should be straight-forward to modify. 29 | 30 | # Procedure 31 | 32 | ##### Recalling and rare-variant calling 33 | 34 | Not covered by this protocol, see [this protocol](https://confluence.brc.iop.kcl.ac.uk:8493/display/PUB/Production+Version%3A+Illumina+Exome+Chip+SOP+v1.4), which presents best-practice for recalling the raw genotype data using Illumina GenomeStudio, and https://github.com/KHP-Informatics/chip_gt, which implements and compares the results of [ZCall](https://github.com/jigold/zCall) and [Opticall](https://www.sanger.ac.uk/resources/software/opticall/). [This Nature Protocols paper](http://www.ncbi.nlm.nih.gov/pmc/articles/PMC4441213/) is also very good. 35 | 36 | ##### Reformat of data from the rare caller pipeline 37 | 38 | The Human Core Exome array contains some SNPs called "SNP…" In order to make ZCall run effectively, it is necessary to change the name of these SNPs, e.g. 
to "xxx…" This can be done using the UNIX program sed 39 | 40 | ```{sed} 41 | sed 's/SNP/xxx/g' < rootname.report > rootname.updated.report 42 | ``` 43 | 44 | Following the implementation of the rare caller pipeline, it is recommended to review the concordance between ZCall and Opticall − concordance is expected to be high (>99%). 45 | 46 | ##### Define names and locations of important files and software: 47 | 48 | ```{UNIX} 49 | printf "root=/path/to/rootname 50 | pheno=/path/to/external_pheno.phe 51 | covar=/path/to/covariates.cov 52 | genders=/path/to/external_genders.txt 53 | names=/path/to/external_individual_names.txt 54 | keeps=/path/to/samples_to_keep.txt 55 | excludes=/path/to/samples_to_exclude.txt 56 | insnps=/path/to/SNPs_to_keep.txt 57 | outsnps=/path/to/SNPs_to_exclude.txt 58 | plink=/path/to/plink2 59 | R=/path/to/R" > Config.conf 60 | ``` 61 | 62 | File formats are the [PLINK file formats](http://pngu.mgh.harvard.edu/~purcell/plink/data.shtml). 63 | 64 | "rootname" is the prefix of the PLINK binary files obtained from the Exome-chip pipeline (i.e. the .bed file from the ZCall branch has the name "rootname_filt_Zcall_UA.bed"), and "/path/to/" is the location of these files on the computer. 65 | 66 | NB: not all of these files may be relevant to your study. 67 | 68 | ##### Review the PLINK binary (.bed, .bim, .fam) files from Exome-chip pipeline 69 | 70 | _Check individuals_ 71 | 72 | ```{UNIX} 73 | less $root.fam 74 | ``` 75 | 76 | _Check SNPs_ 77 | 78 | ```{UNIX} 79 | less $root.bim 80 | ``` 81 | 82 | ##### Update files 83 | 84 | Phenotypes, individual names, genders, or SNP alleles may be lost in preparatory steps. These can be updated using external files. 
85 | 86 | _Update phenotype_ 87 | 88 | ```{PLINK} 89 | $plink \ 90 | --bfile $root \ 91 | --pheno $pheno \ 92 | --make-bed \ 93 | --out $root.updated_pheno 94 | ``` 95 | 96 | _Update genders_ 97 | 98 | ```{PLINK} 99 | $plink \ 100 | --bfile $root \ 101 | --update-sex $genders \ 102 | --make-bed \ 103 | --out $root.updated_genders 104 | ``` 105 | 106 | _Update sample names_ 107 | 108 | ```{PLINK} 109 | $plink \ 110 | --bfile $root \ 111 | --update-ids $names \ 112 | --make-bed \ 113 | --out $root.updated_names 114 | ``` 115 | 116 | _Select individuals for analysis_ 117 | 118 | ```{PLINK} 119 | $plink \ 120 | --bfile $root \ 121 | --keep $keeps \ 122 | --make-bed \ 123 | --out $root.kept_names 124 | ``` 125 | 126 | Or: 127 | 128 | ```{PLINK} 129 | $plink \ 130 | --bfile $root \ 131 | --remove $excludes \ 132 | --make-bed \ 133 | --out $root.kept_names 134 | ``` 135 | 136 | _Select SNPs for analysis_ 137 | 138 | ```{PLINK} 139 | $plink \ 140 | --bfile $root \ 141 | --extract $insnps \ 142 | --make-bed \ 143 | --out $root.kept_samples 144 | ``` 145 | 146 | Or: 147 | 148 | ```{PLINK} 149 | $plink \ 150 | --bfile $root \ 151 | --exclude $outsnps \ 152 | --make-bed \ 153 | --out $root.kept_samples 154 | ``` 155 | 156 | ##### Filter for common SNPs 157 | 158 | ```{PLINK} 159 | $plink \ 160 | --bfile $root \ 161 | --maf 0.01 \ 162 | --make-bed \ 163 | --out $root.common 164 | ``` 165 | 166 | This assumes no updates were made, otherwise modify the --bfile command to point to that file (e.g. $root.updated_names). 167 | 168 | 169 | ##### Filter for call rate iteratively 170 | 171 | ```{bash} 172 | sh ./Iterative_Missingness.sh [begin] [final] [steps] 173 | ``` 174 | _Removes SNPs then samples at increasingly high cut-offs. E.g. 
To remove at 90% to 99%, in steps of 1%:_ 175 | 176 | ```{bash} 177 | sh ./Iterative_Missingness.sh 90 99 1 178 | ``` 179 | 180 | 181 | ##### Review call rates to ensure all missing SNPs and individuals have been dropped 182 | 183 | _Generate files for individual call rates and variant call rates._ 184 | 185 | ```{PLINK} 186 | $plink \ 187 | --bfile $root.filtered \ 188 | --missing \ 189 | --out $root.filtered_missing 190 | ``` 191 | 192 | _Examine the lowest call rates for variants:_ 193 | 194 | ```{UNIX} 195 | sort -k 5 -gr $root.filtered_missing.lmiss | head 196 | ``` 197 | 198 | Check no variants above threshold remain in column 5 (proportion missing). 199 | 200 | _Examine the lowest call rates for individuals:_ 201 | 202 | ```{UNIX} 203 | sort -k 6 -gr $root.filtered_missing.imiss | head 204 | ``` 205 | Check no individuals above threshold remain in column 6 (proportion missing). 206 | 207 | ##### Assess SNPs for deviation from Hardy-Weinberg Equilibrium 208 | 209 | _--hardy calculates HWE test p-values:_ 210 | 211 | ```{PLINK} 212 | $plink \ 213 | --bfile $root.filtered \ 214 | --hardy \ 215 | --out $root.hw_p_values 216 | ``` 217 | 218 | _--hwe removes deviant SNPs past a given threshold, 1x10^-5 below:_ 219 | 220 | ```{PLINK} 221 | $plink \ 222 | --bfile $root.filtered \ 223 | --hwe 0.00001 \ 224 | --make-bed \ 225 | --out $root.hw_dropped 226 | ``` 227 | NB: in case-control datasets, the default behaviour of hwe is to work on controls only 228 | 229 | ##### Prune data file for linkage disequilibrium 230 | 231 | _Using a window of 1500 variants and a shift of 150 variants between windows, with an r2 cut-off of 0.2:_ 232 | 233 | ```{PLINK} 234 | $plink \ 235 | --bfile $root.hw_dropped \ 236 | --indep-pairwise 1500 150 0.2 \ 237 | --out $root.LD_one 238 | ``` 239 | 240 | _Extract pruned-in SNPs_ 241 | 242 | ```{PLINK} 243 | $plink \ 244 | --bfile $root.hw_dropped \ 245 | --extract $root.LD_one.prune.in \ 246 | --make-bed \ 247 | --out $root.LD_two 248 | ```
249 | 250 | _Exclude high-LD and non-autosomal regions from the pruned file (see [Mike Weale's website](https://sites.google.com/site/mikeweale))_ 251 | 252 | ```{AWK} 253 | awk -f highLDregions4bim_b37.awk $root.LD_two.bim > highLDexcludes 254 | ``` 255 | ```{AWK} 256 | awk '($1 < 1) || ($1 > 22) {print $2}' $root.LD_two.bim > autosomeexcludes 257 | ``` 258 | ```{bash} 259 | cat highLDexcludes autosomeexcludes > highLD_and_autosomal_excludes 260 | ``` 261 | ```{PLINK} 262 | $plink \ 263 | --bfile $root.LD_two \ 264 | --exclude highLD_and_autosomal_excludes \ 265 | --make-bed \ 266 | --out $root.LD_three 267 | ``` 268 | 269 | ##### Add phenotype to differentiate groups 270 | 271 | _E.g. Add site of collection ("Site") from an external phenotype file:_ 272 | 273 | ```{PLINK} 274 | $plink \ 275 | --bfile $root.LD_three \ 276 | --pheno $pheno \ 277 | --pheno-name Site \ 278 | --make-bed \ 279 | --out $root.LD_four 280 | ``` 281 | 282 | ##### Compare genotypic and phenotypic gender 283 | 284 | _Ensure there is a separate XY region for the pseudoautosomal region on X:_ 285 | 286 | Most chips have the pseudoautosomal region mapped separately already. 287 | Requires entry of genome build, below this is hg37 ("b37"). 288 | 289 | ```{PLINK} 290 | $plink \ 291 | --bfile $root.LD_two \ 292 | --split-x b37 \ 293 | --make-bed \ 294 | --out $root.LD_split 295 | ``` 296 | 297 | _Compare phenotypic gender to X chromosome heterogeneity and Y chromosome SNP count:_ 298 | 299 | ```{PLINK} 300 | $plink \ 301 | --bfile $root.LD_split \ 302 | --check-sex ycount 0.2 0.8 0 1 \ 303 | --out $root.sex_check 304 | ``` 305 | 306 | IDs identified as discordant (not the phenotypic gender) or for which F is between 0.2 and 0.8 (not assigned a gender by PLINK), should be reviewed with the collection site where possible. This command also takes into account the number of Y chromosome SNPs present, to counteract the unreliable nature of the F statistic in assigning female gender.
The number of Y SNPs with calls in females can be set as part of ycount (above females have a maximum of 0, and males a maximum of 1), and will depend on the recalling method used and sample size. An additional check can be made by assessing whole-genome heterogeneity for all samples (see below) at this point – discordant gender may be the result of unusual heterogeneity 307 | 308 | _Remove discordant IDs that cannot be resolved:_ 309 | 310 | This command assumes a PLINK-format file of IDs for discordant individuals called "discordant_individuals.txt". 311 | 312 | ```{PLINK} 313 | $plink \ 314 | --bfile $root.LD_four \ 315 | --remove discordant_individuals.txt \ 316 | --make-bed \ 317 | --out $root.LD_five 318 | 319 | $plink \ 320 | --bfile $root.hw_dropped \ 321 | --remove discordant_individuals.txt \ 322 | --make-bed \ 323 | --out $root.sexcheck_cleaned 324 | ``` 325 | 326 | ##### Pairwise identical-by-descent (IBD) check 327 | 328 | ```{PLINK} 329 | $plink \ 330 | --bfile $root.LD_five \ 331 | --genome \ 332 | --make-bed \ 333 | --out $root.IBD 334 | ``` 335 | 336 | _Remove one sample from each pair with pi-hat (% IBD) above threshold (0.1875 below):_ 337 | 338 | ```{AWK} 339 | awk '$10 >= 0.1875 {print $1, $2}' $root.IBD.genome > $root.IBD_outliers.txt 340 | ``` 341 | 342 | ```{PLINK} 343 | $plink \ 344 | --bfile $root.IBD \ 345 | --remove $root.IBD_outliers.txt \ 346 | --make-bed \ 347 | --out $root.no_close_relatives 348 | ``` 349 | 350 | _Calculate average IBD per individual using R, output outliers (defined as more than ***sigma*** standard deviations above the mean, as provided by the user):_ 351 | 352 | ```{R} 353 | $R --file=IndividualIBD.R --args $root [sigma] 354 | ``` 355 | 356 | Exclude outliers from both LD-stripped and all SNP binary files 357 | 358 | ```{PLINK} 359 | $plink \ 360 | --bfile $root.LD_five \ 361 | --remove $root.IBD_INDIV_outliers.txt \ 362 | --make-bed \ 363 | --out $root.LD_IBD 364 | 365 | $plink \ 366 | --bfile 
$root.sexcheck_cleaned \ 367 | --remove $root.IBD_INDIV_outliers.txt \ 368 | --make-bed \ 369 | --out $root.IBD_cleaned 370 | ``` 371 | 372 | ##### Population stratification by principal component analysis in EIGENSOFT 373 | 374 | Consult [https://sites.google.com/site/mikeweale/software/eigensoftplus]. 375 | 376 | ___Run EIGENSOFT using LD-pruned binary___ 377 | 378 | _Convert files to EIGENSOFT format using CONVERTF_ 379 | 380 | Requires par file to convert from packedped format to eigenstrat format 381 | 382 | ```{UNIX} 383 | convertf -p <(printf "genotypename: "$root".LD_IBD.bed 384 | snpname: "$root".LD_IBD.bim 385 | indivname: "$root".LD_IBD.fam 386 | outputformat: EIGENSTRAT 387 | genotypeoutname: "$root".pop_strat.eigenstratgeno 388 | snpoutname: "$root".pop_strat.snp 389 | indivoutname: "$root".pop_strat.ind") 390 | ``` 391 | 392 | _Run SmartPCA, removing no outliers_ 393 | 394 | Produces 100 PCs 395 | 396 | ```{perl} 397 | smartpca.perl \ 398 | -i $root.pop_strat.eigenstratgeno \ 399 | -a $root.pop_strat.snp \ 400 | -b $root.pop_strat.ind \ 401 | -o $root.pop_strat.pca \ 402 | -p $root.pop_strat.plot \ 403 | -e $root.pop_strat.eval \ 404 | -l $root.pop_strat_smartpca.log \ 405 | -m 0 \ 406 | -t 100 \ 407 | -k 100 \ 408 | -s 6 409 | ``` 410 | 411 | Note that the order of the inputs is important. 412 | 413 | Inputs explained: 414 | 415 | -i is the genotype file 416 | -a is the SNP names 417 | -b is the individual names 418 | -o is the output eigenvectors ( $root.pop_strat.pca.evec) 419 | -p plots the output file. This is only activated if gnuplot is installed, but is a necessary inclusion for smartpca to run. If gnuplot is not installed, this does not affect the running of smartpca. If gnuplot is installed, this produces a plot of the first component on the second. 420 | -e is the output eigenvalues 421 | -l is the log, including a list of individuals defined as outliers. 422 | -m sets the number of outlier removal iterations. 
This is initially set to 0, so no outliers are removed. 423 | -t sets the number of components from which outliers should be removed. If -m is 0, this value has no effect. 424 | -k is the number of components to be output 425 | -s defines the minimum number of standard deviations from the mean of each component an individual must be to be counted as an outlier. 426 | 427 | _Minor edit to allow import into R_ 428 | 429 | Remove leading tab and split ID into two columns. 430 | 431 | ```{sed} 432 | sed -i -e 's/^[ \t]*//' -e 's/:/ /g' $root.pop_strat.pca.evec 433 | ``` 434 | 435 | _Calculate association between PCs and outcome measure in R_ 436 | 437 | Both scripts require the same IDs to be in $root.pca.evec and $pheno, and look at 100 PCs by default. 438 | 439 | *Short version (outputs the variance explained by each component and its significance when added to a model including the previous components):* 440 | 441 | ```{R} 442 | $R --file=PC-VS-OUTCOME_IN_R_SHORT.R --args $root.pop_strat $pheno 443 | ``` 444 | 445 | *Long version (outputs the full results of the linear model, adding each component in turn):* 446 | 447 | ```{R} 448 | $R --file= PC-VS-OUTCOME_IN_R_FULL.R --args $root.pop_strat $pheno 449 | ``` 450 | 451 | _Run SmartPCA again to remove outliers_ 452 | 453 | Run as above ("Run SmartPCA, removing no outliers"), but change $root suffix to pop_strat_outliers. 
454 | Set –m 5 and –t x (where ***x*** is the number of PCs significantly associated with the outcome measure) 455 | 456 | ```{perl} 457 | smartpca.perl \ 458 | -i $root.pop_strat.eigenstratgeno \ 459 | -a $root.pop_strat.snp \ 460 | -b $root.pop_strat.ind \ 461 | -o $root.pop_strat_outliers.pca \ 462 | -p $root.pop_strat_outliers.plot \ 463 | -e $root.pop_strat_outliers.eval \ 464 | -l $root.pop_strat_outliers_smartpca.log \ 465 | -m 5 \ 466 | -t x \ 467 | -k 100 \ 468 | -s 6 469 | ``` 470 | 471 | _Plot principal components in R_ 472 | 473 | Plot before and after outlier exclusion to allow visual inspection of which samples are dropped 474 | 475 | Plot first component against second in R and colour by phenotype - this requires [ggplot2](http://ggplot2.org/) to be installed. 476 | 477 | ```{sed} 478 | sed -i -e 's/^[ \t]*//' -e 's/:/ /g' $root.pop_strat_outliers.pca.evec 479 | ``` 480 | 481 | ```{R} 482 | $R --file=PlotPCs.R --args $root.pop_strat 1 2 483 | 484 | $R --file=PlotPCs.R --args $root.pop_strat_outliers 1 2 485 | ``` 486 | 487 | This script can be modified to plot any of the first 100 components against each other by changing 1 and 2 above. The design of the plot is extremely modifiable - see [http://docs.ggplot2.org/current/]. 
488 | 489 | _Extract outliers_ 490 | 491 | ```{bash} 492 | sh ./ExtractAncestryOutliers.sh 493 | ``` 494 | 495 | ```{PLINK} 496 | $plink \ 497 | --bfile $root.LD_IBD \ 498 | --remove $root.pop_strat_outliers.outliers \ 499 | --make-bed \ 500 | --out $root.LD_pop_strat 501 | 502 | $plink \ 503 | --bfile $root.IBD_cleaned \ 504 | --remove $root.pop_strat_outliers.outliers \ 505 | --make-bed \ 506 | --out $root.pop_strat 507 | ``` 508 | 509 | _Re-run to assess which components to include as covariates in the final analysis_ 510 | 511 | Run ConvertF: 512 | 513 | ```{perl} 514 | convertf -p <(printf "genotypename: $root.LD_pop_strat.bed 515 | snpname: $root.LD_pop_strat.bim 516 | indivname: $root.LD_pop_strat.fam 517 | outputformat: EIGENSTRAT 518 | genotypeoutname: $root.PCS_for_covariates.eigenstratgeno 519 | snpoutname: $root.PCS_for_covariates.snp 520 | indivoutname: $root.PCS_for_covariates.ind") 521 | ``` 522 | 523 | Run SmartPCA: 524 | 525 | ```{perl} 526 | smartpca.perl \ 527 | -i $root.PCS_for_covariates.eigenstratgeno \ 528 | -a $root.PCS_for_covariates.snp \ 529 | -b $root.PCS_for_covariates.ind \ 530 | -o $root.PCS_for_covariates.pca \ 531 | -p $root.PCS_for_covariates.plot \ 532 | -e $root.PCS_for_covariates.eval \ 533 | -l $root.PCS_for_covariates_smartpca.log \ 534 | -m 0 \ 535 | -t 100 \ 536 | -k 100 \ 537 | -s 6 \ 538 | ``` 539 | 540 | Calculate association (short version): 541 | 542 | ```{R} 543 | $R --file=PC-VS-OUTCOME_IN_R_SHORT.R --args $root.PCS_for_covariates 544 | ``` 545 | 546 | Include components significantly associated with outcome as covariates in the final analysis, or add PCs in turn until inflation falls to an accepted level (lambda ≈ 1). 547 | 548 | ##### Optional (but useful): plot individuals on components drawn from the HapMap reference populations to assess likely ancestry groupings. 
549 | 550 | Details of this procedure can be found at [Timothee Flutre's OpenWetWare](http://openwetware.org/wiki/User:Timothee_Flutre/Notebook/Postdoc/2012/01/22). 551 | 552 | Note that the http://hapmap.ncbi.nlm.nih.gov/downloads/genotypes/2010-08_phaseII+III/forward/ domain referenced by Dr Flutre has since been retired. The HapMap samples are available at [ftp://ftp.ncbi.nlm.nih.gov/hapmap/genotypes/2010-08_phaseII+III/forward/](ftp://ftp.ncbi.nlm.nih.gov/hapmap/genotypes/2010-08_phaseII+III/forward/). 553 | 554 | _Manually extract HapMap and own cohort individual names_ 555 | 556 | ```{bash} 557 | sh ./MakeKeepIDs.sh 558 | ``` 559 | 560 | _Use keepids.txt at this section:_ 561 | 562 | for pop in {CEU,CHB,JPT,YRI}; do echo ${pop}; \ 563 | hapmap2impute.py -i genotypes_CHR_${pop}_r28_nr.b36_fwd.txt.gz -n keepids.txt -o genotypes_hapmap_r28_b37_${pop}.impute.gz -b snps_hapmap_r28_nr_b37.bed.gz -s list_snps_redundant.txt; done 564 | zcat genotypes_hapmap_r28_b37_CEU.impute.gz | wc -l 565 | 3907899 566 | zcat genotypes_hapmap_r28_b37_CHB.impute.gz | wc -l 567 | 3933013 568 | zcat genotypes_hapmap_r28_b37_JPT.impute.gz | wc -l 569 | 3931282 570 | zcat genotypes_hapmap_r28_b37_YRI.impute.gz | wc -l 571 | 3862842 572 | 573 | More populations now exist than those listed in Flutre’s script; these can be obtained in the same manner. 574 | 575 | ##### Alternative - Use 1000 Genomes Phase 1 data to achieve the same 576 | 577 | Much the same process can be used to assess sample ethnicity by projecting on PCs from the 1000 Genomes samples. 578 | 579 | From $root.IBD_cleaned: 580 | 581 | _Obtain 1KG Phase 1 data from PLINK2 website_ 582 | 583 | **WARNING: FILE > 1GB** 584 | 585 | ```{bash} 586 | wget https://www.dropbox.com/s/k9ptc4kep9hmvz5/1kg_phase1_all.tar.gz?dl=1 587 | ``` 588 | 589 | Note code below creates numerical phenotypes for the 1KG populations.
**CHANGE THESE IF THEY WILL OVERLAP WITH YOUR PHENOTYPE DATA!** 590 | 591 | _Obtain 1KG Population info from 1KG_ 592 | 593 | ```{bash} 594 | wget ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/working/20130606_sample_info/20130606_g1k.ped 595 | 596 | grep -f <(awk '{print $2}' 1kg_phase1_all.fam) <(awk 'NR > 1 {print 0, $2, $7}' 20130606_g1k.ped) > 1KG_Phenos.txt 597 | 598 | sed -i -e 's/ASW/3/g' -e 's/CEU/4/g' -e 's/CHB/5/g' -e 's/CHS/6/g' -e 's/CLM/7/g' -e 's/FIN/8/g' -e 's/GBR/10/g' -e 's/IBS/11/g' -e 's/JPT/12/g' -e 's/LWK/13/g' -e 's/MXL/14/g' -e 's/PUR/15/g' -e 's/TSI/16/g' -e 's/YRI/17/g' 1KG_Phenos.txt 599 | ``` 600 | 601 | Provided your data has sufficient common variants (as with most microarrays), you can be fairly brutal with selecting variants for ancestry estimation. Ultimately, good estimation can be achieved with ~20K variants (see [Price et al, 2006](http://www.ncbi.nlm.nih.gov/pubmed/16862161)). 602 | 603 | _Limit files to SNPs with rs IDs_ 604 | 605 | ```{bash} 606 | fgrep rs $root.IBD_cleaned.bim > $root.IBD_cleaned.rsids.txt 607 | ``` 608 | 609 | _Get rs ID variant names_ 610 | 611 | ```{bash} 612 | awk '{print $2}' $root.IBD_cleaned.rsids.txt > $root.IBD_cleaned.rsid_names.txt 613 | ``` 614 | 615 | _Extract rs IDs from root_ 616 | 617 | ```{PLINK} 618 | $plink \ 619 | --bfile $root.IBD_cleaned \ 620 | --extract $root.IBD_cleaned.rsid_names.txt \ 621 | --chr 1-22 \ 622 | --make-bed \ 623 | --out $root.IBD_cleaned.rsids.autosomal 624 | ``` 625 | 626 | _Extract rs IDs from 1KG (and add phenotypes)_ 627 | 628 | ```{PLINK} 629 | $plink \ 630 | --bfile 1kg_phase1_all \ 631 | --extract $root.IBD_cleaned.rsid_names.txt \ 632 | --pheno 1KG_Phenos.txt \ 633 | --make-bed \ 634 | --out 1kg_phase1_all.rsids.autosomal 635 | ``` 636 | 637 | _Obtain SNPs present in both files_ 638 | 639 | ```{bash} 640 | awk '{print $2}' 1kg_phase1_all.rsids.autosomal.bim > 1kg_phase1_all.rsids_names.txt 641 | ``` 642 | 643 | _Extract 1KG SNPs from root_ 644 | 645 | 
```{PLINK} 646 | $plink \ 647 | --bfile $root.IBD_cleaned.rsids.autosomal \ 648 | --extract 1kg_phase1_all.rsids_names.txt \ 649 | --make-bed \ 650 | --out $root.IBD_cleaned.intersection 651 | ``` 652 | 653 | _Dry run bmerge to identify SNPs PLINK will fail on_ 654 | 655 | ```{PLINK} 656 | $plink \ 657 | --bfile $root.IBD_cleaned.intersection \ 658 | --bmerge 1kg_phase1_all.rsids.autosomal \ 659 | --merge-mode 6 \ 660 | --out $root.1KG.IBD_cleaned_failures 661 | ``` 662 | 663 | _Add variants with multiple positions to missnp_ 664 | 665 | ```{bash} 666 | fgrep \'rs $root.1KG.IBD_cleaned_failures.log |\ 667 | awk '{print $7}' |\ 668 | sed -e "s/'//g" -e "s/.//g" > $root.1KG.IBD_cleaned_failures.multiple.positions.txt 669 | 670 | cat $root.1KG.IBD_cleaned_failures.missnp $root.1KG.IBD_cleaned_failures.multiple.positions.txt > $root.1KG.IBD_cleaned_failures.multiple.positions.missnp 671 | ``` 672 | 673 | _Exclude mismatched SNPs and variants with multiple positions_ 674 | 675 | ```{PLINK} 676 | $plink \ 677 | --bfile $root.IBD_cleaned.intersection \ 678 | --exclude $root.1KG.IBD_cleaned_failures.multiple.positions.missnp \ 679 | --make-bed \ 680 | --out $root.IBD_cleaned.intersection_for_merge 681 | ``` 682 | 683 | _Merge root and 1KG_ 684 | 685 | ```{PLINK} 686 | $plink \ 687 | --bfile $root.IBD_cleaned.intersection_for_merge \ 688 | --bmerge 1kg_phase1_all.rsids.autosomal \ 689 | --out $root.1kg.pop_strat 690 | ``` 691 | 692 | _Filter missing variants, rare variants and HWE_ 693 | 694 | ```{PLINK} 695 | $plink \ 696 | --bfile $root.1kg.pop_strat \ 697 | --geno 0.01 \ 698 | --maf 0.01 \ 699 | --hwe 0.0001 \ 700 | --make-bed \ 701 | --out $root.1kg.pop_strat.for_prune 702 | ``` 703 | 704 | _LD Pruning_ 705 | 706 | ```{PLINK} 707 | $plink \ 708 | --bfile $root.1kg.pop_strat.for_prune \ 709 | --indep-pairwise 1500 150 0.2 \ 710 | --out $root.1kg.pop_strat.prune 711 | ``` 712 | ```{PLINK} 713 | $plink \ 714 | --bfile $root.1kg.pop_strat.for_prune \ 715 | --extract 
$root.1kg.pop_strat.prune.prune.in \ 716 | --make-bed \ 717 | --out $root.1kg.LD_pop_strat 718 | ``` 719 | 720 | _Run convertf to make EIGENSTRAT file_ 721 | 722 | ```{perl} 723 | convertf -p <(printf "genotypename: $root.1kg.LD_pop_strat.bed 724 | snpname: $root.1kg.LD_pop_strat.bim 725 | indivname: $root.1kg.LD_pop_strat.fam 726 | outputformat: EIGENSTRAT 727 | genotypeoutname: $root.1kg.LD_pop_strat.eigenstratgeno 728 | snpoutname: $root.1kg.LD_pop_strat.snp 729 | indivoutname: $root.1kg.LD_pop_strat.ind") 730 | ``` 731 | 732 | _Generate poplist for projection_ 733 | 734 | ```{bash} 735 | awk '{print $3}' 1KG_Phenos.txt | sort | uniq > $root.1kg.LD_poplist.txt 736 | ``` 737 | 738 | _Run Smartpca, projecting on 1KG samples only_ 739 | 740 | ```{perl} 741 | smartpca.perl \ 742 | -i $root.1kg.LD_pop_strat.eigenstratgeno \ 743 | -a $root.1kg.LD_pop_strat.snp \ 744 | -b $root.1kg.LD_pop_strat.ind \ 745 | -o $root.1kg.LD_pop_strat.pca \ 746 | -p $root.1kg.LD_pop_strat.plot \ 747 | -e $root.1kg.LD_pop_strat.eigenvalues \ 748 | -l $root.1kg.LD_pop_strat.log \ 749 | -w $root.1kg.LD_poplist.txt \ 750 | -m 0 751 | ``` 752 | 753 | Note that the command below relabels the phenotype column as xCHANGE, where x is the phenotype, and then relabels the 1KG populations with their names for graphing. 
**Modify the sed command to allow your samples to be labelled usefully!** 754 | 755 | _Modify $root.1kg.LD_pop_strat.pca.evec for R_ 756 | 757 | ```{bash} 758 | awk 'NR > 1 {print $1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12"CHANGE"}' $root.1kg.LD_pop_strat.pca.evec > $root.1kg.LD_pop_strat.pca.evec_RENAMED 759 | 760 | sed -i -e 's/13CHANGE/LWK/g' -e 's/14CHANGE/MXL/g' -e 's/15CHANGE/PUR/g' -e 's/16CHANGE/TSI/g' -e 's/17CHANGE/YRI/g' -e 's/3CHANGE/ASW/g' -e 's/4CHANGE/CEU/g' -e 's/5CHANGE/CHB/g' -e 's/6CHANGE/CHS/g' -e 's/7CHANGE/CLM/g' -e 's/8CHANGE/FIN/g' -e 's/10CHANGE/GBR/g' -e 's/11CHANGE/IBS/g' -e 's/12CHANGE/JPT/g' $root.1kg.LD_pop_strat.pca.evec_RENAMED 761 | ``` 762 | 763 | _Plot PCs_ 764 | 765 | ```{bash} 766 | Rscript PC_Plot_1KG.R $root 767 | ``` 768 | 769 | ##### Heterozygosity Test 770 | 771 | _Test for unusual patterns of genome-wide heterozygosity in LD-pruned data_ 772 | 773 | ```{PLINK} 774 | $plink \ 775 | --bfile $root.LD_pop_strat \ 776 | --ibc \ 777 | --out $root.het 778 | ``` 779 | 780 | _Exclude samples identified as outliers_ 781 | 782 | ```{R} 783 | R --file=IdHets.R --args $root.het 784 | ``` 785 | 786 | ```{PLINK} 787 | $plink \ 788 | --bfile $root.LD_pop_strat \ 789 | --remove $root.het.LD_het_outliers_sample_exclude \ 790 | --make-bed \ 791 | --out $root.LD_het_cleaned 792 | 793 | $plink \ 794 | --bfile $root.pop_strat \ 795 | --remove $root.het.LD_het_outliers_sample_exclude \ 796 | --make-bed \ 797 | --out $root.het_cleaned 798 | ``` 799 | 800 | ##### Imputation 801 | 802 | ___THIS CODE SHOULD BE CONSIDERED ARCHIVAL - IT IS RECOMMENDED THAT YOU NOW PERFORM IMPUTATION BY SUBMITTING YOUR DATA TO THE [TOPMED](https://imputation.biodatacatalyst.nhlbi.nih.gov/#!), [MICHIGAN](https://imputationserver.sph.umich.edu/index.html) OR [SANGER](https://imputation.sanger.ac.uk/) IMPUTATION SERVERS, WHICH ARE FASTER AND USE MORE UP-TO-DATE REFERENCE PANELS.___ 803 | 804 | *Consult 
[http://genome.sph.umich.edu/wiki/IMPUTE2:_1000_Genomes_Imputation_Cookbook] and [https://mathgen.stats.ox.ac.uk/impute/prephasing_and_imputation_with_impute2.tgz]* 805 | 806 | *Download reference files from [http://mathgen.stats.ox.ac.uk/impute/impute_v2.html]* 807 | 808 | *Copy impute2_examples folder from [https://mathgen.stats.ox.ac.uk/impute/prephasing_and_imputation_with_impute2.tgz] to work folder* 809 | 810 | _Download relevant strand file from [http://www.well.ox.ac.uk/~wrayner/strand/] and split by chromosome_ 811 | 812 | ```{AWK} 813 | awk '{print $3, $5 > "$root."$2".strand"}' HumanCoreExome-12v1-0_B-b37.strand 814 | ``` 815 | 816 | _Convert PLINK binary to GEN files (IMPUTE2 input)_ 817 | 818 | ```{PLINK} 819 | $plink \ 820 | --bfile $root.het_cleaned \ 821 | --recode oxford \ 822 | --out $root.for_impute 823 | ``` 824 | 825 | _Split whole--genome .gen into chromosome .gen_ 826 | 827 | ```{AWK} 828 | awk '{print > "Chr"$1".gen"}' $root.for_impute.gen 829 | ``` 830 | 831 | _Check split has proceeded correctly – total line number of all chromosome .gen files should total $root.for_impute.gen_ 832 | 833 | ```{bash} 834 | wc -l *.gen 835 | ``` 836 | 837 | _Generate chunk files for each chromosome_ 838 | 839 | ```{bash} 840 | sh ./MakeChunks.sh 841 | ``` 842 | 843 | This makes two sets of files: 844 | 845 | Chunks_chr[1-23].txt 846 | 847 | 30000001 3.5e+07 875 848 | 35000001 4e+07 500 849 | 40000001 4.5e+07 85 850 | 45000001 5e+07 424 851 | 50000001 5.5e+07 693 852 | 853 | These files list the base positions of the edges of each chromosome chunk and the number of SNPs in each chunk. 854 | Consult this file and merge chunks with few SNPs (e.g. less than 100) with neighbouring chunks. 
855 | 856 | analysis_chunks_5Mb_chr[1-23].txt 857 | 858 | 30000001 3.5e+07 859 | 35000001 4e+07 860 | 40000001 5e+07 861 | 50000001 5.5e+07 862 | 863 | These files are the input for IMPUTE2 864 | 865 | *Modify the submit_impute2_jobs_to_cluster.R script (from impute2_examples) to accept chunk files without headers* 866 | 867 | ```{bash} 868 | From: 869 | # read in file with chunk boundary definitions 870 | chunk.file <- paste(data.dir,"analysis_chunks_",chunk.size,"Mb_chr",chr,".txt", sep="") 871 | chunks <- read.table(chunk.file, head=T, as.is=T) 872 | 873 | To: 874 | 875 | # read in file with chunk boundary definitions 876 | chunk.file <- paste(data.dir,"analysis_chunks_",chunk.size,"Mb_chr",chr,".txt", sep="") 877 | chunks <- read.table(chunk.file, head=F, as.is=T). 878 | ``` 879 | *Modify scripts in impute2_examples folder (prototype_imputation_job_posterior_sampled_haps.sh master_imputation_script_posterior_sampled_haps.sh and submit_impute_jobs_to_cluster.R) to fit personal needs.* 880 | 881 | Likely to need to limit number of jobs submitted to remain within local SunGridEngine rules – liaise with local system administrator to establish local best practice. 
Also likely to need to amend script to allow 5Mb jobs to run - for example, to increase virtual memory allowance to 15Gb, add the following to the header of the prototype_imputation_job_posterior_sampled_haps.sh script: 882 | 883 | ```bash 884 | #$ -l h_vmem=15G 885 | ``` 886 | 887 | *Submit jobs* 888 | 889 | **NB – this runs over 600 jobs on your cluster if not controlled!** 890 | 891 | ```{bash} 892 | sh ./master_imputation_script_posterior_sampled_haps.sh 893 | ``` 894 | 895 | _Adapt scripts for imputing X chromosome (running the different X map and legend files), and run_ 896 | 897 | Consult [http://mathgen.stats.ox.ac.uk/impute/impute_v2.html] 898 | 899 | *Merge imputed chunks together (.impute2 and .impute2_info) to form a file for each chromosome* 900 | 901 | ```{bash} 902 | sh ./MergeImputedChunks.sh 903 | ``` 904 | 905 | _Add chromosome number to each SNP in each chromosome.impute2 file_ 906 | 907 | ```{bash} 908 | sh ./AddChromosomeNumber.sh 909 | ``` 910 | 911 | _Merge by-chromosome info files to form a file for the whole genome_ 912 | 913 | ```{bash} 914 | cat results-directory/*.impute2_info > path/to/results-directory/$root.whole_genome.impute2_info 915 | ``` 916 | 917 | For December 2013 release of reference data (Phase1 Integrated), there are several aspects that require clean-up - these do not appear to apply to the Phase 3 release. Steps marked in bold are required for the Phase1 Integrated release, but may not be needed for Phase3. 918 | 919 | __Exomic variants are named "." 
It is necessary to make these unique (as chr:position)__ 920 | 921 | ```{bash} 922 | sh ./ReplaceDots.sh 923 | ``` 924 | 925 | _Filter imputed data (.impute2 files) by info metric_ 926 | 927 | ```{bash} 928 | sh ./FilterByInfoAll.sh [threshold] 929 | ``` 930 | 931 | _Merge filtered by-chromosome .impute2 files to make a single whole-genome file_ 932 | 933 | ```{bash} 934 | cat results-directory/*_New_filtered.impute2 > \ 935 | /results-directory/$root.whole_genome_filtered.impute2 936 | ``` 937 | 938 | __Remove duplicate SNPs from .impute2 file__ 939 | 940 | ```{AWK} 941 | awk '{print $2}' $root.whole_genome_filtered.impute2 | \ 942 | sort | uniq -c | awk '$1 !=1 {print $0}' > Duplicates 943 | 944 | awk '{print $2}' $root.whole_genome_filtered.impute2 | sort | uniq -d > Duplicates_cleaned 945 | ``` 946 | 947 | These produce two files called Duplicates and Duplicates_cleaned that list the duplicated SNPs in the file with and without the number of instances respectively 948 | 949 | ```{bash} 950 | grep -vwF -f Duplicates_cleaned $root.whole_genome_filtered.impute2 > Temp1 951 | ``` 952 | 953 | Removes all lines with an instance of a duplicated rs# from $root.whole_genome_filtered.impute2 and outputs to Temp1: 954 | 955 | ```{AWK} 956 | awk '{print $2}' Temp1 | sort | uniq -d > DuplicatesRemoved 957 | ``` 958 | 959 | Repeats the check for duplicates – this file should now be empty; check with 960 | 961 | ```{bash} 962 | less DuplicatesRemoved 963 | ``` 964 | 965 | Compare file lengths; the length of Temp1 should be the length of $root.whole_genome_filtered.impute2 minus the number of duplicated SNPs removed 966 | 967 | ```{bash} 968 | wc -l Temp1 $root.whole_genome_filtered.impute2 969 | mv Temp1 $root.whole_genome_filtered_cleaned.impute2 970 | ``` 971 | 972 | *Convert IMPUTE2 to hard-called PLINK format* 973 | 974 | ```{PLINK} 975 | $plink \ 976 | --gen $root.whole_genome_filtered_cleaned.impute2 \ 977 | --sample $root.for_impute.sample \ 978 | --hard-call-threshold 
0.8 \ 979 | --make-bed \ 980 | --out $root.post_imputation 981 | ``` 982 | 983 | ***NB: if SNP does not pass threshold, it is set as missing!*** 984 | 985 | At this point, it is recommended to gzip all IMPUTE2 files - note that this will be a large job. 986 | 987 | ```{bash} 988 | gzip *impute2* 989 | ``` 990 | 991 | ##### Post-imputation quality control 992 | 993 | *Remove rare SNPs depending on sample size and dataset characteristics* 994 | 995 | ```{PLINK} 996 | $plink \ 997 | --bfile $root.post_imputation \ 998 | --maf 0.01 \ 999 | --make-bed \ 1000 | --out $root.post_imputation_common 1001 | ``` 1002 | 1003 | *Remove missing SNPs, including those set as missing above* 1004 | 1005 | ```{PLINK} 1006 | $plink \ 1007 | --bfile $root.post_imputation_common \ 1008 | --geno 0.02 \ 1009 | --make-bed \ 1010 | --out $root.post_imputation_updated 1011 | ``` 1012 | 1013 | _Drop duplicated variants from imputation_ 1014 | 1015 | ```{bash} 1016 | sh ./DropDuplicatedSNPs.sh 1017 | ``` 1018 | ```{PLINK} 1019 | $plink \ 1020 | --bfile $root.post_imputation_updated \ 1021 | --exclude $root.post_imputation_updated_duplicated_IDs \ 1022 | --make-bed \ 1023 | --out $root.post_imputation_final 1024 | ``` 1025 | 1026 | *Convert imputed rs IDs back to rs… format* 1027 | 1028 | ```{bash} 1029 | sh ./Relabel_rs.sh 1030 | ``` 1031 | 1032 | Some rs IDs are imperfectly mapped, resulting in duplications with imputed IDs, so remove these accidental duplicates. 
1033 | 1034 | ```{bash} 1035 | sh ./DropDuplicatedPositions.sh 1036 | ``` 1037 | 1038 | ##### Association testing in PLINK/PLINK2 1039 | 1040 | *Generate covariates file, merging $covar and $root.dataname_pop_strat_includes.pca.evec (output from SMARTPCA) files* 1041 | 1042 | ```{R} 1043 | $R --file=Get_Covariates.R --args $root $covar 1044 | ``` 1045 | 1046 | Relabels header and adds additional covariates (.pca.evec contains all PCs included in the SmartPCA analysis) 1047 | Script assumes a covariate file with the same column names for IDs (FID and IID), but no shared column names with the .pca.evec file (which is assumed to contain 100 PCs). 1048 | 1049 | 1050 | *Run association against phenotype* 1051 | 1052 | Phenotype here assumed to be in $pheno as the only phenotype (otherwise use --mpheno [column number]) and called "Outcome". 1053 | 1054 | ```{PLINK} 1055 | $plink \ 1056 | --bfile $root.post_imputation_final \ 1057 | --logistic/--linear (depending whether phenotype of interest is dichotomous or continuous) \ 1058 | --pheno $pheno \ 1059 | --pheno-name Outcome \ 1060 | --covar $covar \ 1061 | --covar-number 1-10 \ 1062 | --hide-covar \ 1063 | --parameters 1-11 \ 1064 | --out $root.post_imputation_conc_analysis 1065 | ``` 1066 | 1067 | Consider coding of phenotype – may require the use of --1 as an option if coding is in 0,1 format (rather than 1,2 format) 1068 | 1069 | --covar-number indicates which covariates to include. --covar-name can also be used for this 1070 | --hide-covar hides results of association tests between phenotype and covariates 1071 | --parameters specifies models to include in the analysis (see www.cog-genomics.org/plink2) 1072 | 1. Allelic dosage additive effect (or homozygous minor dummy variable) 1073 | 2. Dominance deviation, if present 1074 | 3. --condition{-list} covariate(s), if present 1075 | 4. --covar covariate(s), if present 1076 | 5. Genotype x non-sex covariate 'interaction' terms, if present 1077 | 6. Sex, if present 1078 | 7. 
Sex-genotype interaction(s), if present 1079 | 1080 | _Investigate further any SNP that is highly associated with the phenotype, and exclude from analysis if justified_ 1081 | 1082 | Run BLAT, [available on the UCSC Genome Browser](https://genome.ucsc.edu/cgi-bin/hgBlat?command=start) on the probe sequence (available from the array manifest) for all highly associated genotyped SNPs as a test of how well mapped/unique the sequence is, particularly with regards to similarity to sequences on the sex chromosomes. Discard any associated SNP that does not map uniquely. 1083 | 1084 | All association details here assume an additive model – see PLINK website to implement other models (but see [Knight and Lewis, 2012](http://www.ncbi.nlm.nih.gov/pubmed/22383645) for discussion of statistical issues of performing tests using multiple models). More association tests are available in PLINK and PLINK2. 1085 | 1086 | ##### Using GCTA for Genomic-relatedness-matrix Restricted Maximum Likelihood (GREML) and Mixed Linear Model Association (MLMA) 1087 | 1088 | _Make GRM_ 1089 | 1090 | Thresholds below: MAF 1%, IBD 0.025 1091 | 1092 | ```{GCTA} 1093 | ./gcta \ 1094 | --bfile $root.post_imputation_final \ 1095 | --autosome \ 1096 | --maf 0.01 \ 1097 | --grm-cutoff 0.025 \ 1098 | --make-grm \ 1099 | --out $root.post_imputation_final_grm 1100 | ``` 1101 | 1102 | GRM is created here from imputed data - see text for discussion of the benefits of this. 
1103 | 1104 | 1105 | _Generate principal components_ 1106 | 1107 | ```{GCTA} 1108 | ./gcta \ 1109 | --grm $root.post_imputation_final_grm \ 1110 | --pca \ 1111 | --out $root.post_imputation_final_pca 1112 | ``` 1113 | 1114 | _Univariate GREML, including principal components as continuous covariates_ 1115 | 1116 | ```{GCTA} 1117 | ./gcta \ 1118 | --grm $root.post_imputation_final_grm \ 1119 | --pheno $pheno \ 1120 | --covar $covar \ 1121 | --qcovar $root.post_imputation_final_pca \ 1122 | --reml \ 1123 | --out $root.post_imputation_final_greml 1124 | ``` 1125 | 1126 | The number of principal components generated can be varied to assess the effect of their inclusion - if components are included as covariates for population stratification in GWAS, it is suggested to include the same number in GREML. 1127 | 1128 | This script assumes the covariates file contains only discrete covariates – if there are continuous covariates in the covariates file, these should be removed from the $covar file and added to the $root.post imputation_final_pca file. 
1129 | 1130 | _Run MLMA-LOCO for autosomes_ 1131 | 1132 | ```{GCTA} 1133 | ./gcta \ 1134 | --bfile $root.post_imputation_final \ 1135 | --pheno $pheno \ 1136 | --covar $covar \ 1137 | --qcovar $root.post_imputation_final_pca \ 1138 | --mlma-loco \ 1139 | --out $root.post_imputation_final_mlma_analysis 1140 | ``` 1141 | 1142 | 1143 | _Run MLMA for X chromosome_ 1144 | 1145 | ```{PLINK} 1146 | ./plink \ 1147 | --bfile $root.post_imputation_final \ 1148 | --chr X \ 1149 | --make-bed \ 1150 | --out $root.post_imputation_final_X 1151 | ``` 1152 | ```{GCTA} 1153 | ./gcta \ 1154 | --grm $root.post_imputation_final_grm \ 1155 | --bfile $root.post_imputation_final_X \ 1156 | --pheno $pheno \ 1157 | --covar $covar \ 1158 | --qcovar $root.post_imputation_final_pca \ 1159 | --mlma \ 1160 | --out $root.post_imputation_final_mlma_analysis_X 1161 | ``` 1162 | 1163 | _Merge results files together_ 1164 | 1165 | ```{bash} 1166 | sed -i '1d' $root.post_imputation_final_mlma_analysis_X.mlma 1167 | cat $root.post_imputation_final_mlma_analysis.mlmaloco $root.post_imputation_final_mlma_analysis_X.mlma > $root.post_imputation_final_mlma_analysis_combined.mlmaloco 1168 | ``` 1169 | 1170 | ##### SNP Clumping to identify independent hits 1171 | 1172 | _Limit associations to lowest p-value in each region of linkage disequilibrium_ 1173 | 1174 | ```{PLINK} 1175 | $plink \ 1176 | --bfile $root.post_imputation_final \ 1177 | --clump $root.post_imputation_final_analysis.assoc.logistic \ 1178 | --clump-p1 1 \ 1179 | --clump-p2 1 \ 1180 | --clump-r2 0.25 \ 1181 | --clump-kb 250 \ 1182 | --out $root.post_imputation_final_analysis_clumped 1183 | ``` 1184 | 1185 | --clump-p1 is the p-value threshold below which to consider SNPs for inclusion as the reported SNP from the clump 1186 | --clump-p2 is the p-value threshold below which to consider SNPs for inclusion in the clump 1187 | --clump-r2 is the LD R2 threshold above which SNPs must be to be included in the same clump 1188 | --clump-kb is the 
maximum distance a clump SNP can be from the reported SNP 1189 | 1190 | The options given here will generate clumps of all SNPs in LD (above R2 = 0.25), with a maximum size of 500kb, considering all SNPs regardless of p-value 1191 | 1192 | ##### Annotation of Results 1193 | 1194 | *Download all RefSeq genes from [UCSC](https://genome.ucsc.edu/)* 1195 | 1196 | Go to [Table Browser](https://genome.ucsc.edu/cgi-bin/hgTables) 1197 | 1198 | 1. Pick Group: Genes and Gene Prediction Tracks 1199 | 2. Pick Track: RefSeq Genes 1200 | 3. Pick Table: refGene 1201 | 4. Pick Region: genome 1202 | 5. Pick Output Format: Selected fields… 1203 | 6. Click Get Output 1204 | 7. Tick Chrom, cdsStart, cdsEnd and name2 1205 | 8. Click GetOutput 1206 | 1207 | Transfer output to file GeneList.txt 1208 | 1209 | _Slight reformat of gene list, then make glist_hg19_ 1210 | 1211 | ```{bash} 1212 | sed -i 's/#chrom/Chrom/g' GeneList.txt 1213 | sed -i 's/chr//g' GeneList.txt 1214 | sh ./Make_glist.sh GeneList.txt glist_hg19 1215 | ``` 1216 | 1217 | _Annotation in PLINK/PLINK2_ 1218 | 1219 | Annotates variants with genes within 250kb 1220 | 1221 | ```{PLINK} 1222 | $plink \ 1223 | --annotate $root.post_imputation_final_analysis_clumped.clumped \ 1224 | ranges=glist_hg19 \ 1225 | --border 250 \ 1226 | --out $root.post_imputation_final_analysis_annotated 1227 | ``` 1228 | 1229 | 1230 | Alternatively, export results to a web tool such as [http://jjwanglab.org/gwasrap] 1231 | 1232 | ##### Plot Manhattan and QQ plots 1233 | 1234 | _Select top million hits for Manhattan plot_ 1235 | 1236 | ```{bash} 1237 | head -1000001 $root.post_imputation_final_analysis.assoc.logistic > $root.post_imputation_final_analysis_for_MP 1238 | ``` 1239 | 1240 | _Run Manhattan plot and QQ plot scripts in R_ 1241 | 1242 | ```{R} 1243 | $R --file=ManhattanPlotinR.R --args $root 1244 | $R --file=QQPlotinR.R --args $root 1245 | ``` 1246 | 1247 | QQ plot currently plots top 10% of the data - this can be altered by changing the 
"frac" option 1248 | Both of these plots can be output in different graphic file formats (.jpeg, .tiff, .png) - please refer to the [R documentation](http://stat.ethz.ch/R-manual/R-devel/library/grDevices/html/00Index.html) 1249 | 1250 | # Files in this GitHub repo 1251 | 1252 | 1253 | #### README.md: This file! 1254 | 1255 | 1256 | ## Quality Control 1257 | 1258 | 1259 | #### Iterative_Missingness.sh: Remove SNPs missing in more than 10% of samples, then samples missing more than 10% of SNPs, then repeat for 5% and 1%. 1260 | 1261 | **Usage: Iterative_Missingness.sh** 1262 | ```{bash} 1263 | Iterative_Missingness.sh 1264 | ``` 1265 | 1266 | 1267 | #### highLDregions4bim_b37.awk: Awk script to remove regions of high-LD from LD-pruned files. Original script from M.Weale, adapted using Ensembl. 1268 | 1269 | **Usage: highLDregions4bim_b37.awk** 1270 | ```{awk} 1271 | awk -f highLDregions4bim_b37.awk input.file > output.file 1272 | ``` 1273 | 1274 | 1275 | #### IndividualIBD.R: In R, calculate and print average identity-by-descent relatedness, and print outliers at > 6 SD from mean 1276 | 1277 | **Usage: IndividualIBD.R** 1278 | ```{R} 1279 | R --file=IndividualIBD.R 1280 | ``` 1281 | 1282 | 1283 | #### parfile.par: Provide parameters to the Convertf programme from the EIGENSOFT suite, to convert files from PLINK format to EIGENSOFT format. 
1284 | 1285 | **Usage: parfile.par** 1286 | ```{bash} 1287 | convertf -p parfile.par 1288 | ``` 1289 | 1290 | 1291 | #### PC-VS-OUTCOME_IN_R_FULL.R: Regress 100 PCs step-wise on outcome, print full results of each regression to file 1292 | 1293 | **Usage: PC-VS-OUTCOME_IN_R_FULL.R** 1294 | ```{R} 1295 | R --file=PC-VS-OUTCOME_IN_R_FULL.R 1296 | ``` 1297 | 1298 | 1299 | #### PC-VS-OUTCOME_IN_R_SHORT.R: Regress 100 PCs step-wise on outcome, print variance explained for each principal component when added to model, and the p-value for this variance explained, to file 1300 | 1301 | **Usage: PC-VS-OUTCOME_IN_R_SHORT.R** 1302 | ```{R} 1303 | R --file=PC-VS-OUTCOME_IN_R_SHORT.R 1304 | ``` 1305 | 1306 | 1307 | #### PlotPCs.R: Use qplot option from ggplot2 to plot samples on first two principal components from PCA 1308 | 1309 | **Usage: PlotPCs.R** 1310 | ```{R} 1311 | R --file=PlotPCs.R 1312 | ``` 1313 | 1314 | 1315 | #### ExtractAncestryOutliers.sh: Pull out outlying samples from PCA (as defined by EIGENSOFT) for removal from PLINK binary. 1316 | 1317 | **Usage: ExtractAncestryOutliers.sh** 1318 | ```{bash} 1319 | ExtractAncestryOutliers.sh 1320 | ``` 1321 | 1322 | 1323 | #### MakeKeepIDs.sh: Make keepids.txt file for use with T.Flutre's HapMap3 OpenWetWare cookbook 1324 | 1325 | **Usage: MakeKeepIDs.sh** 1326 | ```{bash} 1327 | MakeKeepIDs.sh 1328 | ``` 1329 | 1330 | 1331 | #### IdHets.R: Identify samples with unusual genome-wide heterozygosity (> or < 3SD from mean). Credit: Amos Folarin. 1332 | 1333 | **Usage: IdHets.R** 1334 | ```{R} 1335 | R --file=IdHets.R 1336 | ``` 1337 | 1338 | 1339 | ## Imputation in Impute2 1340 | 1341 | 1342 | #### MakeChunks.sh: Generate two sets of 5Mb chunk files for imputation: Chunks_chr... prints a list of chunks for that chromosome, with SNP number; analysis_chunks... prints the input chunk file for Impute2 for that chromosome. 
1343 | 1344 | **Usage: MakeChunks.sh** 1345 | ```{bash} 1346 | MakeChunks.sh 1347 | ``` 1348 | 1349 | 1350 | #### Master_imputation_script_posterior_sampled_haps.sh: Script to control submission of posterior-sampling imputation jobs to a SGE-based cluster. Modified from scripts provided with Impute2 1351 | #### Modified_submit_impute2_jobs_to_cluster.R: R script for submitting Impute2 jobs to a SGE-controlled cluster. Modified from scripts provided with Impute2. 1352 | #### Prototype_imputation_job_posterior_sampled_haps.sh: Posterior-sampling imputation job script for Impute2. Modified from scripts provided with Impute2 1353 | 1354 | **Usage: Master_imputation_script_posterior_sampled_haps.sh** 1355 | ```{bash} 1356 | Master_imputation_script_posterior_sampled_haps.sh 1357 | ``` 1358 | This runs multiple instances of **Prototype_imputation_job_posterior_sampled_haps.sh** 1359 | ```{bash} 1360 | Prototype_imputation_job_posterior_sampled_haps.sh 1361 | ``` 1362 | 1363 | 1364 | ## Post-imputation quality control 1365 | 1366 | 1367 | #### MergeImputedChunks.sh: Script to move imputed 5Mb chunks to chromosome-specific folders, and merge them into by-chromosome .impute2 and .impute2_info files. 1368 | 1369 | **Usage: MergeImputedChunks.sh** 1370 | ```{bash} 1371 | MergeImputedChunks.sh 1372 | ``` 1373 | 1374 | 1375 | #### AddChromosomeNumber.sh: Post-imputation clean-up script to add chromosome number to by-chromosome .impute2 files 1376 | 1377 | **Usage: AddChromosomeNumber.sh** 1378 | ```{bash} 1379 | AddChromosomeNumber.sh /path/to/results_directory 1380 | ``` 1381 | 1382 | 1383 | #### ReplaceDots.sh: Post-imputation clean-up file for Phase1_Integrated 1KG reference. Converts exon variants called "." 
to "chr/position" 1384 | 1385 | **Usage: ReplaceDots.sh** 1386 | ```{bash} 1387 | ReplaceDots.sh 1388 | ``` 1389 | 1390 | 1391 | #### FilterByInfoAll.sh: Post-imputation QC script to filter whole-genome impute2_info file by info score, and then keep only these variants from the by-chromosome impute2_info files 1392 | 1393 | **Usage: FilterByInfoAll.sh** 1394 | ```{bash} 1395 | FilterByInfoAll.sh 1396 | ``` 1397 | 1398 | 1399 | #### DropDuplicatedSNPs.sh: Post-imputation QC script to remove duplicated positions in the post-imputation file (which are usually indels or multiallelic variants) 1400 | 1401 | **Usage: DropDuplicatedSNPs.sh** 1402 | ```{bash} 1403 | DropDuplicatedSNPs.sh 1404 | ``` 1405 | 1406 | 1407 | #### Relabel_rs.sh: Relabel imputed SNPs with an rs id with the rs id only 1408 | 1409 | **Usage: Relabel_rs.sh** 1410 | ```{bash} 1411 | Relabel_rs.sh 1412 | ``` 1413 | 1414 | 1415 | #### DropDuplicatedPositions.sh: Some rs IDs are imperfectly mapped, resulting in duplications with imputed IDs, so remove these accidental genotyped duplicates. 1416 | 1417 | 1418 | **Usage: DropDuplicatedPositions.sh** 1419 | ```{bash} 1420 | DropDuplicatedPositions.sh 1421 | ``` 1422 | 1423 | 1424 | ## Association testing 1425 | 1426 | 1427 | #### Get_Covariates.R: Script to create covariate file for association analyses in PLINK from the principal components from PCA and an external covariates file. 
1428 | 1429 | **Usage: Get_Covariates.sh** 1430 | ```{R} 1431 | R --file=Get_Covariates.R 1432 | ``` 1433 | 1434 | 1435 | ## Post-GWAS 1436 | 1437 | 1438 | #### Make_glist.sh: Script to make a glist-file from the GeneList file downloaded from UCSC Table Browser, for use in annotation in PLINK 1439 | 1440 | **Usage: Make_glist.sh** 1441 | ```{bash} 1442 | Make_glist.sh 1443 | ``` 1444 | 1445 | 1446 | #### ManhattanPlotinR.R: Wrapper script for running Mike Weale's manhattan_v2.R script to generate Manhattan plots from association result 1447 | #### manhattan_v2.R: Mike Weale's Manhattan plot script 1448 | 1449 | **Usage: ManhattanPlotinR.R** 1450 | ```{R} 1451 | R --file=ManhattanPlotinR.R 1452 | ``` 1453 | 1454 | #### QQPlotinR.R: Wrapper script to generate QQ plots from association results, using Mike Weale's qq_plot_v7.R 1455 | #### qq_plot_v7.R: Mike Weale's QQ-plot script 1456 | 1457 | **Usage: QQPlotinR.R** 1458 | ```{R} 1459 | R --file=QQPlotinR.R 1460 | ``` 1461 | 1462 | #### ID_Build.py: Python script for quickly identifying the genome build of an unknown PLINK binary. 1463 | 1464 | **Usage: ID_Build.py** 1465 | ```{bash} 1466 | $plink \ 1467 | --bfile $root \ 1468 | --chr 6 \ 1469 | --make-bed \ 1470 | --out $root.chr6 1471 | 1472 | python ID_Build.py $root.chr6.bim 1473 | ``` 1474 | 1475 | # Valuable web resources 1476 | 1477 | [Genotype recalling pipeline](http://confluence.brc.iop.kcl.ac.uk:8090/x/4AAm) 1478 | 1479 | [Rare variant recalling pipeline](http://core.brc.iop.kcl.ac.uk/2013/04/08/exome-chip-rare-caller-pipeline/) 1480 | 1481 | [Alternative recalling and quality control pipeline](http://www.ncbi.nlm.nih.gov/pmc/articles/PMC4441213/) 1482 | 1483 | [PLINK 1.07](http://pngu.mgh.harvard.edu/~purcell/plink/) 1484 | 1485 | [PLINK 1.9](https://www.cog-genomics.org/plink2/) 1486 | 1487 | [R](http://www.r-project.org/) 1488 | 1489 | [EIGENSOFT, EAGLE, HAPLOSNP, LDScore... 
JUST SO MUCH GOOD STUFF](http://www.hsph.harvard.edu/alkes-price/software/) 1490 | 1491 | [IMPUTE2](https://mathgen.stats.ox.ac.uk/impute/impute_v2.html) 1492 | 1493 | [IMPUTE2 Cookbook](http://genome.sph.umich.edu/wiki/IMPUTE2:_1000_Genomes_Imputation_Cookbook) 1494 | 1495 | [William Rayner's strand files for microarrays](http://www.well.ox.ac.uk/~wrayner/strand/) 1496 | 1497 | [Mike Weale's Page](https://sites.google.com/site/mikeweale) 1498 | 1499 | [Tim Flutre's OpenWetWare script for HapMap3 PC plot](http://openwetware.org/wiki/User:Timothee_Flutre/Notebook/Postdoc/2012/01/22) 1500 | 1501 | [GWASRAP - Post-GWAS annotation](http://jjwanglab.org/gwasrap) 1502 | 1503 | [PRSice - Polygenic Risk Scoring software](http://prsice.info) 1504 | 1505 | [GCTA - Software suite for mixed linear modelling and heritability analyses](http://cnsgenomics.com/software/gcta/) 1506 | 1507 | # Acknowledgements 1508 | 1509 | Thank you to the following people for their advice and input on the contents of this cookbook: 1510 | * Gerome Breen 1511 | * Steve Newhouse 1512 | * Amos Folarin 1513 | * Richard Dobson 1514 | * Cass Johnston 1515 | * Hamel Patel 1516 | * Jack Euesden 1517 | * Jemma Walker 1518 | * Niamh Mullins 1519 | * Cathryn Lewis 1520 | * Paul O'Reilly 1521 | * Mike Weale 1522 | 1523 | In addition, thank you to the authors of the resources listed above, and the genetics community for their comments. 
1524 | -------------------------------------------------------------------------------- /Relabel_rs.sh: -------------------------------------------------------------------------------- 1 | source Config.conf 2 | 3 | awk 'BEGIN {OFS = "\t"} $2 ~ /^rs/{gsub(":.*", "", $2) }1' $root.post_imputation_final.bim > $root.post_imputation_final_rs_only.bim 4 | cp $root.post_imputation_final.bed $root.post_imputation_final_rs_only.bed 5 | cp $root.post_imputation_final.fam $root.post_imputation_final_rs_only.fam 6 | -------------------------------------------------------------------------------- /ReplaceDots.sh: -------------------------------------------------------------------------------- 1 | source Config.conf 2 | 3 | for i in {1..22} 4 | do 5 | awk '$2=="." {$2= $1 ":" $3} {print}' < New_Chromosome$i.impute2 > Temp1 6 | mv Temp1 New_Chromosome$i.impute2 7 | 8 | done 9 | 10 | awk -v i=$i '$2=="." {$2= $1 ":" $3} {print}' < $root.whole_genome.impute2_info > Temp2 11 | mv Temp2 $root.whole_genome.impute2_info 12 | -------------------------------------------------------------------------------- /highLDregions4bim_b37.awk: -------------------------------------------------------------------------------- 1 | ($1 == 1) && ($4 >= 48287981) && ($4 <= 52287979) {print $2} 2 | ($1 == 2) && ($4 >= 86088343) && ($4 <= 101041482) {print $2} 3 | ($1 == 2) && ($4 >= 134666269) && ($4 <= 138166268) {print $2} 4 | ($1 == 2) && ($4 >= 183174495) && ($4 <= 190174494) {print $2} 5 | ($1 == 3) && ($4 >= 47524997) && ($4 <= 50024996) {print $2} 6 | ($1 == 3) && ($4 >= 83417311) && ($4 <= 86917310) {print $2} 7 | ($1 == 3) && ($4 >= 88917311) && ($4 <= 96017310) {print $2} 8 | ($1 == 5) && ($4 >= 44464244) && ($4 <= 50464243) {print $2} 9 | ($1 == 5) && ($4 >= 97972101) && ($4 <= 100472101) {print $2} 10 | ($1 == 5) && ($4 >= 128972102) && ($4 <= 131972101) {print $2} 11 | ($1 == 5) && ($4 >= 135472102) && ($4 <= 138472101) {print $2} 12 | ($1 == 6) && ($4 >= 25392022) && ($4 <= 33392022) {print 
$2} 13 | ($1 == 6) && ($4 >= 56892042) && ($4 <= 63942041) {print $2} 14 | ($1 == 6) && ($4 >= 139958308) && ($4 <= 142458307) {print $2} 15 | ($1 == 7) && ($4 >= 55225792) && ($4 <= 66555850) {print $2} 16 | ($1 == 8) && ($4 >= 7962591) && ($4 <= 11962591) {print $2} 17 | ($1 == 8) && ($4 >= 42880844) && ($4 <= 49837447) {print $2} 18 | ($1 == 8) && ($4 >= 111930825) && ($4 <= 114930824) {print $2} 19 | ($1 == 10) && ($4 >= 36959995) && ($4 <= 43679994) {print $2} 20 | ($1 == 11) && ($4 >= 46043425) && ($4 <= 57243424) {print $2} 21 | ($1 == 11) && ($4 >= 87860353) && ($4 <= 90860352) {print $2} 22 | ($1 == 12) && ($4 >= 33108734) && ($4 <= 41713733) {print $2} 23 | ($1 == 12) && ($4 >= 111037281) && ($4 <= 113537280) {print $2} 24 | ($1 == 20) && ($4 >= 32536340) && ($4 <= 35066586) {print $2} 25 | -------------------------------------------------------------------------------- /highLDregions4bim_b38.awk: -------------------------------------------------------------------------------- 1 | ($1 == 1) && ($4 >= 47822309) && ($4 <= 51822307) {print $2} 2 | ($1 == 2) && ($4 >= 85861220) && ($4 <= 100425020) {print $2} 3 | ($1 == 2) && ($4 >= 133908698) && ($4 <= 137408698) {print $2} 4 | ($1 == 2) && ($4 >= 182309768) && ($4 <= 189309768) {print $2} 5 | ($1 == 3) && ($4 >= 47483507) && ($4 <= 49987563) {print $2} 6 | ($1 == 3) && ($4 >= 83368160) && ($4 <= 86868160) {print $2} 7 | ($1 == 3) && ($4 >= 88868161) && ($4 <= 96298466) {print $2} 8 | ($1 == 5) && ($4 >= 44464142) && ($4 <= 51168409) {print $2} 9 | ($1 == 5) && ($4 >= 98636397) && ($4 <= 101136397) {print $2} 10 | ($1 == 5) && ($4 >= 129636409) && ($4 <= 132636409) {print $2} 11 | ($1 == 5) && ($4 >= 136136413) && ($4 <= 139136412) {print $2} 12 | ($1 == 6) && ($4 >= 25391794) && ($4 <= 33424245) {print $2} 13 | ($1 == 6) && ($4 >= 57027244) && ($4 <= 63232136) {print $2} 14 | ($1 == 6) && ($4 >= 139637171) && ($4 <= 142137170) {print $2} 15 | ($1 == 7) && ($4 >= 55158099) && ($4 <= 67090863) {print $2} 16 | 
($1 == 8) && ($4 >= 8105069) && ($4 <= 12105082) {print $2} 17 | ($1 == 8) && ($4 >= 43025701) && ($4 <= 48924888) {print $2} 18 | ($1 == 8) && ($4 >= 110918596) && ($4 <= 113918595) {print $2} 19 | ($1 == 10) && ($4 >= 36671067) && ($4 <= 43184546) {print $2} 20 | ($1 == 11) && ($4 >= 46021874) && ($4 <= 57475951) {print $2} 21 | ($1 == 11) && ($4 >= 88127185) && ($4 <= 91127184) {print $2} 22 | ($1 == 12) && ($4 >= 32955800) && ($4 <= 41319931) {print $2} 23 | ($1 == 12) && ($4 >= 110599476) && ($4 <= 113099475) {print $2} 24 | ($1 == 20) && ($4 >= 33948534) && ($4 <= 36438183) {print $2} 25 | -------------------------------------------------------------------------------- /manhattan_DOG_TRY.R: -------------------------------------------------------------------------------- 1 | #Generic Manhattan plot function for PLINK-formatted data (chr X,Y,XY and MT are represented as 23,24,25,26) 2 | #Wrapper function written by Mike Weale and Richard Gunning. Internal "wgplot" function written by Matt Settles. 3 | #Version 2 (12 Mar 2013) 4 | #Arguments: 5 | #x Data frame to be plotted. x$CHR contains chromosome (numeric). x$BP contains SNP position (numeric). x$P contains association p-value (numeric) 6 | #GWthresh Numeric. Indicates where "genomewide significance" threshold should be drawn 7 | #GreyZoneThresh Numeric. Indicates a sub-genomewide-sig "grey zone" where SNPs are shown with a larger point size 8 | #DrawGWline Boolean. If TRUE, then a red line at the "genomewide significance" threshold is plotted 9 | #cutoff Numeric. 
Any p-vlaues less than cutoff are forced equal to cutoff 10 | #Example: 11 | #source("manhattan_v2.R") 12 | #d = read.table("myplinkresults.logistic", header=TRUE, as.is=TRUE) 13 | #X=data.frame(CHR=d$CHR, BP=d$BP, P=d$P) 14 | #manhattan( X, DrawGWline=FALSE ) 15 | 16 | manhattan <- function( x, GWthresh=-log10(5e-8), GreyZoneThresh=-log10(1e-4), DrawGWline=TRUE, cutoff=0 ) 17 | { 18 | 19 | x$P[ x$PGreyZoneThresh)&(ptmpGreyZoneThresh)&(ptmpGWthresh], ptmp[ptmp>GWthresh], pch=20, col=color[i]) 68 | } 69 | 70 | #drawthreshold 71 | if (DrawGWline) { 72 | abline(h=(GWthresh), col="red") 73 | } 74 | } 75 | 76 | 77 | 78 | #From http://bioinfo-mite.crb.wsu.edu/Rcode/wgplot.R 79 | #See also https://stat.ethz.ch/pipermail/r-help/2008-November/180812.html 80 | ############################################################################### 81 | ### 82 | ### Whole Genome Significance plot 83 | ### Matt Settles 84 | ### Bioinformatics Core 85 | ### Washington State University, Pullman, WA 86 | ### 87 | ### Created July 7, 2008 88 | ### 89 | ### July 8, 2008 - fixed color goof 90 | ############################################################################### 91 | ############## 92 | ### things to add 93 | ### marker name on plot for significant markers 94 | ############## 95 | 96 | ### THERE ARE ERRORS IN GAPS MHTPLOT, SO THIS IS A FIX 97 | ## data a data frame with three columns representing chromosome, position and p values logged or unlogged 98 | ## logscale a flag to indicate if p value are to be log-transformed, FALSE means already logtransformed 99 | ## base the base of the logarithm, when logscale =TRUE 100 | ## cutoffs the cutt-offs where horizontal line(s) are drawn 101 | ## color the color for different chromosome(s), and random if unspecified 102 | ## labels labels for the x-axis, length = number of chromosomes 103 | ## xlabel label to be placed on the X axis 104 | ## ylabel lable to be placed on the Y axis 105 | ## ... 
other options in compatible with the R plot function 106 | 107 | ## USAGE 108 | # source("http://bioinfo-mite.crb.wsu.edu/Rcode/wgplot.R") 109 | ## fake example with Affy500k data 110 | # affy <-c(40220, 41400, 33801, 32334, 32056, 31470, 25835, 27457, 22864, 28501, 26273, 111 | # 24954, 19188, 15721, 14356, 15309, 11281, 14881, 6399, 12400, 7125, 6207) 112 | # CM <- cumsum(affy) 113 | # n.markers <- sum(affy) 114 | # n.chr <- length(affy) 115 | # test <- data.frame(chr=rep(1:n.chr,affy),pos=1:n.markers,p=runif(n.markers)) 116 | # png("wgplot.png",units="in",width=8,height=5,res=300) 117 | # par(las="2",cex=0.6,pch=21,bg="white") 118 | # wgplot(test,cutoffs = c(1,3, 5, 7, 9),color=palette()[2:5],labels=as.character(1:22)) 119 | # title("Whole Genome Associaton Plot of Significance for Chromosomes 1 to 22") 120 | # dev.off() 121 | ## 122 | "wgplot" <- 123 | function (data, 124 | logscale = TRUE, 125 | base = 10, 126 | cutoffs = c(3, 5, 7, 9), 127 | siglines = NULL, 128 | sigcolors = "red", 129 | color = sample(colors(), 26), 130 | chrom = as.character(c(1:39)), 131 | startbp = NULL, 132 | endbp = NULL, 133 | labels = as.character(c(1:22,"X","Y","XY","MT")), 134 | xlabel = "Chromosome", 135 | ylabel = "-Log10(p-value)", ...) 
136 | { 137 | if (any(is.na(data))) 138 | data <- data[-unique(which(is.na(data))%%nrow(data)),] 139 | keep <- which(data[,1] %in% chrom) 140 | data <- data[keep,] 141 | if (!is.null(startbp) & !is.null(endbp) & length(chrom) == 1){ 142 | keep <- which(data[,2] >= startbp & data[,2] <= endbp) 143 | data <- data[keep,] 144 | } 145 | 146 | 147 | chr <- data[, 1] 148 | pos <- data[, 2] 149 | p <- data[, 3] 150 | 151 | ### remove any NAs 152 | which(is.na(data[,2])) 153 | #chr <- replace(chr,which(chr == "X"),"100") 154 | #chr <- replace(chr,which(chr == "Y"),"101") 155 | #chr <- replace(chr,which(chr == "XY"),"102") 156 | #chr <- replace(chr,which(chr == "MT"),"103") 157 | 158 | ord <- order(as.numeric(chr),as.numeric(pos)) 159 | chr <- chr[ord] 160 | pos <- pos[ord] 161 | p <- p[ord] 162 | 163 | lens.chr <- as.vector(table(as.numeric(chr))) 164 | CM <- cumsum(lens.chr) 165 | n.markers <- sum(lens.chr) 166 | n.chr <- length(lens.chr) 167 | id <- 1:n.chr 168 | color <- rep(color,ceiling(n.chr/length(color))) 169 | if (logscale) 170 | p <- -log(p,base) 171 | if ( any(diff(pos) < 0) ) { 172 | cpos <- cumsum(c(0,pos[which(!duplicated(chr))-1])) 173 | pos <- pos + rep(cpos,lens.chr) 174 | 175 | mids <- cpos + diff(c(cpos,max(pos)))/2 176 | } 177 | 178 | par(xaxt = "n", yaxt = "n") 179 | plot(c(pos,pos[1]), c(9,p), type = "n", xlab = xlabel, ylab = ylabel, axes = FALSE, ...) 180 | for (i in 1:n.chr) { 181 | u <- CM[i] 182 | l <- CM[i] - lens.chr[i] + 1 183 | cat("Plotting points ", l, "-", u, "\n") 184 | points(pos[l:u], p[l:u], col = color[i], ...) 
185 | } 186 | par(xaxt = "s", yaxt = "s") 187 | axis(1, at = c(0, pos[round(CM)],max(pos)),FALSE) 188 | text(mids, par("usr")[3] - 0.5, srt = 0, pos=2,cex=0.5,offset= -0.2, 189 | labels = labels[1:n.chr], xpd = TRUE) 190 | #axis(side=1, at = pos[round(CM-lens.chr/2)],tick=FALSE, labels= labels[1:n.chr]) 191 | #abline(h = cutoffs) 192 | axis(side=2, at = cutoffs ) 193 | if (!is.null(siglines)) 194 | abline(h = -log(siglines,base),col=sigcolors) 195 | 196 | #mtext(eval(expression(cutoffs)), 2, at = cutoffs) 197 | 198 | } 199 | 200 | 201 | 202 | -------------------------------------------------------------------------------- /manhattan_v2.R: -------------------------------------------------------------------------------- 1 | #Generic Manhattan plot function for PLINK-formatted data (chr X,Y,XY and MT are represented as 23,24,25,26) 2 | #Wrapper function written by Mike Weale and Richard Gunning. Internal "wgplot" function written by Matt Settles. 3 | #Version 2 (12 Mar 2013) 4 | #Arguments: 5 | #x Data frame to be plotted. x$CHR contains chromosome (numeric). x$BP contains SNP position (numeric). x$P contains association p-value (numeric) 6 | #GWthresh Numeric. Indicates where "genomewide significance" threshold should be drawn 7 | #GreyZoneThresh Numeric. Indicates a sub-genomewide-sig "grey zone" where SNPs are shown with a larger point size 8 | #DrawGWline Boolean. If TRUE, then a red line at the "genomewide significance" threshold is plotted 9 | #cutoff Numeric. 
Any p-vlaues less than cutoff are forced equal to cutoff 10 | #Example: 11 | #source("manhattan_v2.R") 12 | #d = read.table("myplinkresults.logistic", header=TRUE, as.is=TRUE) 13 | #X=data.frame(CHR=d$CHR, BP=d$BP, P=d$P) 14 | #manhattan( X, DrawGWline=FALSE ) 15 | 16 | manhattan <- function( x, GWthresh=-log10(5e-8), GreyZoneThresh=-log10(1e-4), DrawGWline=TRUE, cutoff=0 ) 17 | { 18 | 19 | x$P[ x$PGreyZoneThresh)&(ptmpGreyZoneThresh)&(ptmpGWthresh], ptmp[ptmp>GWthresh], pch=20, col=color[i]) 68 | } 69 | 70 | #drawthreshold 71 | if (DrawGWline) { 72 | abline(h=(GWthresh), col="red") 73 | } 74 | } 75 | 76 | 77 | 78 | #From http://bioinfo-mite.crb.wsu.edu/Rcode/wgplot.R 79 | #See also https://stat.ethz.ch/pipermail/r-help/2008-November/180812.html 80 | ############################################################################### 81 | ### 82 | ### Whole Genome Significance plot 83 | ### Matt Settles 84 | ### Bioinformatics Core 85 | ### Washington State University, Pullman, WA 86 | ### 87 | ### Created July 7, 2008 88 | ### 89 | ### July 8, 2008 - fixed color goof 90 | ############################################################################### 91 | ############## 92 | ### things to add 93 | ### marker name on plot for significant markers 94 | ############## 95 | 96 | ### THERE ARE ERRORS IN GAPS MHTPLOT, SO THIS IS A FIX 97 | ## data a data frame with three columns representing chromosome, position and p values logged or unlogged 98 | ## logscale a flag to indicate if p value are to be log-transformed, FALSE means already logtransformed 99 | ## base the base of the logarithm, when logscale =TRUE 100 | ## cutoffs the cutt-offs where horizontal line(s) are drawn 101 | ## color the color for different chromosome(s), and random if unspecified 102 | ## labels labels for the x-axis, length = number of chromosomes 103 | ## xlabel label to be placed on the X axis 104 | ## ylabel lable to be placed on the Y axis 105 | ## ... 
other options in compatible with the R plot function 106 | 107 | ## USAGE 108 | # source("http://bioinfo-mite.crb.wsu.edu/Rcode/wgplot.R") 109 | ## fake example with Affy500k data 110 | # affy <-c(40220, 41400, 33801, 32334, 32056, 31470, 25835, 27457, 22864, 28501, 26273, 111 | # 24954, 19188, 15721, 14356, 15309, 11281, 14881, 6399, 12400, 7125, 6207) 112 | # CM <- cumsum(affy) 113 | # n.markers <- sum(affy) 114 | # n.chr <- length(affy) 115 | # test <- data.frame(chr=rep(1:n.chr,affy),pos=1:n.markers,p=runif(n.markers)) 116 | # png("wgplot.png",units="in",width=8,height=5,res=300) 117 | # par(las="2",cex=0.6,pch=21,bg="white") 118 | # wgplot(test,cutoffs = c(1,3, 5, 7, 9),color=palette()[2:5],labels=as.character(1:22)) 119 | # title("Whole Genome Associaton Plot of Significance for Chromosomes 1 to 22") 120 | # dev.off() 121 | ## 122 | "wgplot" <- 123 | function (data, 124 | logscale = TRUE, 125 | base = 10, 126 | cutoffs = c(3, 5, 7, 9), 127 | siglines = NULL, 128 | sigcolors = "red", 129 | color = sample(colors(), 26), 130 | chrom = as.character(c(1:22,"X","Y","XY","MT")), 131 | startbp = NULL, 132 | endbp = NULL, 133 | labels = as.character(c(1:22,"X","Y","XY","MT")), 134 | xlabel = "Chromosome", 135 | ylabel = expression(log[10]*" p-value"), ...) 
136 | { 137 | if (any(is.na(data))) 138 | data <- data[-unique(which(is.na(data))%%nrow(data)),] 139 | keep <- which(data[,1] %in% chrom) 140 | data <- data[keep,] 141 | if (!is.null(startbp) & !is.null(endbp) & length(chrom) == 1){ 142 | keep <- which(data[,2] >= startbp & data[,2] <= endbp) 143 | data <- data[keep,] 144 | } 145 | 146 | 147 | chr <- data[, 1] 148 | pos <- data[, 2] 149 | p <- data[, 3] 150 | 151 | ### remove any NAs 152 | which(is.na(data[,2])) 153 | chr <- replace(chr,which(chr == "X"),"100") 154 | chr <- replace(chr,which(chr == "Y"),"101") 155 | chr <- replace(chr,which(chr == "XY"),"102") 156 | chr <- replace(chr,which(chr == "MT"),"103") 157 | 158 | ord <- order(as.numeric(chr),as.numeric(pos)) 159 | chr <- chr[ord] 160 | pos <- pos[ord] 161 | p <- p[ord] 162 | 163 | lens.chr <- as.vector(table(as.numeric(chr))) 164 | CM <- cumsum(lens.chr) 165 | n.markers <- sum(lens.chr) 166 | n.chr <- length(lens.chr) 167 | id <- 1:n.chr 168 | color <- rep(color,ceiling(n.chr/length(color))) 169 | if (logscale) 170 | p <- -log(p,base) 171 | if ( any(diff(pos) < 0) ) { 172 | cpos <- cumsum(c(0,pos[which(!duplicated(chr))-1])) 173 | pos <- pos + rep(cpos,lens.chr) 174 | 175 | mids <- cpos + diff(c(cpos,max(pos)))/2 176 | } 177 | 178 | par(xaxt = "n", yaxt = "n") 179 | plot(c(pos,pos[1]), c(9,p), type = "n", xlab = xlabel, ylab = ylabel, axes = FALSE, ...) 180 | for (i in 1:n.chr) { 181 | u <- CM[i] 182 | l <- CM[i] - lens.chr[i] + 1 183 | cat("Plotting points ", l, "-", u, "\n") 184 | points(pos[l:u], p[l:u], col = color[i], ...) 
185 | } 186 | par(xaxt = "s", yaxt = "s") 187 | axis(1, at = c(0, pos[round(CM)],max(pos)),FALSE) 188 | text(mids, par("usr")[3] - 0.5, srt = 0, pos=2,cex=0.5,offset= -0.2, 189 | labels = labels[1:n.chr], xpd = TRUE) 190 | #axis(side=1, at = pos[round(CM-lens.chr/2)],tick=FALSE, labels= labels[1:n.chr]) 191 | #abline(h = cutoffs) 192 | axis(side=2, at = cutoffs ) 193 | if (!is.null(siglines)) 194 | abline(h = -log(siglines,base),col=sigcolors) 195 | 196 | #mtext(eval(expression(cutoffs)), 2, at = cutoffs) 197 | 198 | } 199 | 200 | 201 | 202 | -------------------------------------------------------------------------------- /manhattan_v2_bumblebee.R: -------------------------------------------------------------------------------- 1 | #Generic Manhattan plot function for PLINK-formatted data (chr X,Y,XY and MT are represented as 23,24,25,26) 2 | #Wrapper function written by Mike Weale and Richard Gunning. Internal "wgplot" function written by Matt Settles. 3 | #Version 2 (12 Mar 2013) 4 | #Arguments: 5 | #x Data frame to be plotted. x$CHR contains chromosome (numeric). x$BP contains SNP position (numeric). x$P contains association p-value (numeric) 6 | #GWthresh Numeric. Indicates where "genomewide significance" threshold should be drawn 7 | #GreyZoneThresh Numeric. Indicates a sub-genomewide-sig "grey zone" where SNPs are shown with a larger point size 8 | #DrawGWline Boolean. If TRUE, then a red line at the "genomewide significance" threshold is plotted 9 | #cutoff Numeric. 
Any p-vlaues less than cutoff are forced equal to cutoff 10 | #Example: 11 | #source("manhattan_v2.R") 12 | #d = read.table("myplinkresults.logistic", header=TRUE, as.is=TRUE) 13 | #X=data.frame(CHR=d$CHR, BP=d$BP, P=d$P) 14 | #manhattan( X, DrawGWline=FALSE ) 15 | 16 | manhattan <- function( x, GWthresh=-log10(5e-8), GreyZoneThresh=-log10(1e-4), DrawGWline=TRUE, cutoff=0 ) 17 | { 18 | 19 | x$P[ x$PGreyZoneThresh)&(ptmpGreyZoneThresh)&(ptmpGWthresh], ptmp[ptmp>GWthresh], pch=20, col=color[i]) 70 | } 71 | 72 | #drawthreshold 73 | if (DrawGWline) { 74 | abline(h=(GWthresh), col="red") 75 | } 76 | } 77 | 78 | 79 | 80 | #From http://bioinfo-mite.crb.wsu.edu/Rcode/wgplot.R 81 | #See also https://stat.ethz.ch/pipermail/r-help/2008-November/180812.html 82 | ############################################################################### 83 | ### 84 | ### Whole Genome Significance plot 85 | ### Matt Settles 86 | ### Bioinformatics Core 87 | ### Washington State University, Pullman, WA 88 | ### 89 | ### Created July 7, 2008 90 | ### 91 | ### July 8, 2008 - fixed color goof 92 | ############################################################################### 93 | ############## 94 | ### things to add 95 | ### marker name on plot for significant markers 96 | ############## 97 | 98 | ### THERE ARE ERRORS IN GAPS MHTPLOT, SO THIS IS A FIX 99 | ## data a data frame with three columns representing chromosome, position and p values logged or unlogged 100 | ## logscale a flag to indicate if p value are to be log-transformed, FALSE means already logtransformed 101 | ## base the base of the logarithm, when logscale =TRUE 102 | ## cutoffs the cutt-offs where horizontal line(s) are drawn 103 | ## color the color for different chromosome(s), and random if unspecified 104 | ## labels labels for the x-axis, length = number of chromosomes 105 | ## xlabel label to be placed on the X axis 106 | ## ylabel lable to be placed on the Y axis 107 | ## ... 
other options in compatible with the R plot function 108 | 109 | ## USAGE 110 | # source("http://bioinfo-mite.crb.wsu.edu/Rcode/wgplot.R") 111 | ## fake example with Affy500k data 112 | # affy <-c(40220, 41400, 33801, 32334, 32056, 31470, 25835, 27457, 22864, 28501, 26273, 113 | # 24954, 19188, 15721, 14356, 15309, 11281, 14881, 6399, 12400, 7125, 6207) 114 | # CM <- cumsum(affy) 115 | # n.markers <- sum(affy) 116 | # n.chr <- length(affy) 117 | # test <- data.frame(chr=rep(1:n.chr,affy),pos=1:n.markers,p=runif(n.markers)) 118 | # png("wgplot.png",units="in",width=8,height=5,res=300) 119 | # par(las="2",cex=0.6,pch=21,bg="white") 120 | # wgplot(test,cutoffs = c(1,3, 5, 7, 9),color=palette()[2:5],labels=as.character(1:22)) 121 | # title("Whole Genome Associaton Plot of Significance for Chromosomes 1 to 22") 122 | # dev.off() 123 | ## 124 | "wgplot" <- 125 | function (data, 126 | logscale = TRUE, 127 | base = 10, 128 | cutoffs = c(3, 5, 7, 9), 129 | siglines = NULL, 130 | sigcolors = "red", 131 | color = sample(colors(), 26), 132 | chrom = as.character(c(1:22,"X","Y","XY","MT")), 133 | startbp = NULL, 134 | endbp = NULL, 135 | labels = as.character(c(1:22,"X","Y","XY","MT")), 136 | xlabel = "Chromosome", 137 | ylabel = expression(log[10]*" p-value"), ...) 
138 | { 139 | if (any(is.na(data))) 140 | data <- data[-unique(which(is.na(data))%%nrow(data)),] 141 | keep <- which(data[,1] %in% chrom) 142 | data <- data[keep,] 143 | if (!is.null(startbp) & !is.null(endbp) & length(chrom) == 1){ 144 | keep <- which(data[,2] >= startbp & data[,2] <= endbp) 145 | data <- data[keep,] 146 | } 147 | 148 | 149 | chr <- data[, 1] 150 | pos <- data[, 2] 151 | p <- data[, 3] 152 | 153 | ### remove any NAs 154 | which(is.na(data[,2])) 155 | chr <- replace(chr,which(chr == "X"),"100") 156 | chr <- replace(chr,which(chr == "Y"),"101") 157 | chr <- replace(chr,which(chr == "XY"),"102") 158 | chr <- replace(chr,which(chr == "MT"),"103") 159 | 160 | ord <- order(as.numeric(chr),as.numeric(pos)) 161 | chr <- chr[ord] 162 | pos <- pos[ord] 163 | p <- p[ord] 164 | 165 | lens.chr <- as.vector(table(as.numeric(chr))) 166 | CM <- cumsum(lens.chr) 167 | n.markers <- sum(lens.chr) 168 | n.chr <- length(lens.chr) 169 | id <- 1:n.chr 170 | color <- rep(color,ceiling(n.chr/length(color))) 171 | if (logscale) 172 | p <- -log(p,base) 173 | if ( any(diff(pos) < 0) ) { 174 | cpos <- cumsum(c(0,pos[which(!duplicated(chr))-1])) 175 | pos <- pos + rep(cpos,lens.chr) 176 | 177 | mids <- cpos + diff(c(cpos,max(pos)))/2 178 | } 179 | 180 | par(xaxt = "n", yaxt = "n") 181 | plot(c(pos,pos[1]), c(9,p), type = "n", xlab = xlabel, ylab = ylabel, axes = FALSE, ...) 182 | for (i in 1:n.chr) { 183 | u <- CM[i] 184 | l <- CM[i] - lens.chr[i] + 1 185 | cat("Plotting points ", l, "-", u, "\n") 186 | points(pos[l:u], p[l:u], col = color[i], ...) 
187 | } 188 | par(xaxt = "s", yaxt = "s") 189 | axis(1, at = c(0, pos[round(CM)],max(pos)),FALSE) 190 | text(mids, par("usr")[3] - 0.5, srt = 0, pos=2,cex=0.5,offset= -0.2, 191 | labels = labels[1:n.chr], xpd = TRUE) 192 | #axis(side=1, at = pos[round(CM-lens.chr/2)],tick=FALSE, labels= labels[1:n.chr]) 193 | #abline(h = cutoffs) 194 | axis(side=2, at = cutoffs ) 195 | if (!is.null(siglines)) 196 | abline(h = -log(siglines,base),col=sigcolors) 197 | 198 | #mtext(eval(expression(cutoffs)), 2, at = cutoffs) 199 | 200 | } 201 | 202 | 203 | 204 | -------------------------------------------------------------------------------- /qq_plot_v7.R: -------------------------------------------------------------------------------- 1 | #Generic QQ plot function with concentration bands, for either P-values or chi-sq values 2 | #Function written by Mike Weale and Tom Price, King's College London. 3 | #Version 7 (23 Feb 2013) (a) fixes plots issues if p=0 values in the dataset; (b) allows "iplot" vector giving indices of sorted quantiles (O vector - largest values first) to be plotted) 4 | #Concentration bands are plotted using the pointwise method of Quesenberry & Hale (1980) J. Statist. Comput. Simul. 11:41-53 5 | #The method proceeds from noting that the kth order statistic from a sample of n i.i.d. U(0,1) statistics has a Beta(k,n+1-k) distribution. 6 | #Arguments: 7 | #x the data vector to be plotted 8 | #alpha the alpha level for the concentration band (if plotted) 9 | #datatype "pvalue" (default) indicates x contains p-values. "chisq" indicates x contains chi-square values. "stdnorm" indicates x contains z values. 10 | #scaletype "pvalue" (default) indicates x- and y-axis scale to be in -log10(p-value) units. "quantile" indicates x- and y-axis scale to be in quantile units (=chisq units for pvalues). Note if datatype="stdnorm" then scaletype is forced ="quantile" 11 | #df degrees of freedom for chi-square scale used in Q-Q plot. 
Default=1 (as this is the most common test type) 12 | #plot.concentration.band Flag to indicate whether concentration band is to be plotted. Default=TRUE. 13 | #one.sided Flab to indicate if one-sided (upper) or two-sided concentration band required. Default=FALSE 14 | #frac=1 Fraction of total data to be plotted. E.g. set frac=0.1 to plot only the top 10% of data points 15 | #iplot If set, a vector of indices for which ordered quantiles (O vector - largest values first) to be plotted. 16 | # e.g. set iplot=c( (1:1e4), sort(sample((1e4:length(x)),1e4)) ) to select all of 1st 10k values + random set of remaining 10k values 17 | # Note if iplot is set, then frac is forced=1 18 | #print If set, a dataframe of O and E values are returned. Default=FALSE 19 | #xat If set, a vector seting x tick positions. For p-values, sets 10^x positions 20 | #yat If set, a vector seting y tick positions. For p-values, sets 10^y positions 21 | # ... other graphical parameters to be passed to plot function 22 | #Returns (if print==TRUE): 23 | #Dataframe with two columns: $O=sorted observed values, $E=sorted expected values 24 | #e.g. 25 | #p = pnorm(c(rnorm(1e4),rnorm(10)-5)) #mixture of 'null' and 'hit' p-values 26 | #qq.plot(p) 27 | # 28 | qq.plot <- function( x, alpha=0.05, datatype="pvalue", scaletype="pvalue", df=1, plot.concentration.band=TRUE, one.sided=FALSE, frac=1, iplot=NULL, print=FALSE, xat=NULL, yat=NULL, main=NULL, xlab=NULL, ylab=NULL, pch="x", cex=0.5, col="black", ... ) 29 | { 30 | pname <- paste(deparse(substitute(x), 500), collapse="\n") #Name of vector passed as "x" to be used in plot title etc. 
31 | if (!is.null(iplot)) frac=1 #Forces frac=1 if "iplot" used to chose points to plot 32 | #Some validity checks on x 33 | if (!is.numeric(x)) 34 | stop("'x' must be numeric") 35 | nmissing = sum(is.na(x)) 36 | x <- x[ !is.na(x) ] #To deal with missing data values (these don't get plotted) 37 | if ((datatype=="pvalue")&((min(x)<0)|(max(x)>1))) 38 | stop("'x' must be >=0 and <=1") 39 | if ((datatype=="chisq")&(min(x)<0)) 40 | stop("'x' must be >=0") 41 | nzero = sum(x==0) 42 | #Some warnings on missing values (and, if pvalues, on x=0) 43 | if (nmissing>0) 44 | warning(nmissing, " missing values (excluded)") 45 | if ((nzero>0)&(datatype=="pvalue")) { 46 | warning(nzero, " zero values (plotted with same value as lowest non-zero p-value)") 47 | x[x==0] <- min(x[x>0]) 48 | } 49 | if (datatype=="stdnorm") {df=0; scaletype="ordinal"} 50 | n <- length(x) 51 | starti = floor((n-1)*(1-frac)) +1 #i for the first sorted datapoint to be plotted. 52 | lena = n-starti+1 #Number of datapoints to be plotted 53 | if (!is.null(iplot)) a2=iplot else a2=(1:lena) #indices to be plotted 54 | b <- n+1-a2 #indices used in determining concentration band 55 | #Find E and O under relevant inv. 
chisq transformation 56 | if ((df==2)&(datatype!="stdnorm")) { #short-cut for df=2 (chisq or pval data): use -2log-transformed expected U(0,1) order statistics (high values first) 57 | E <- -2*log(a2/(n+1)) 58 | if (datatype=="pvalue") O <- -2*log(sort(x)[a2]) #Note obs data no need to transform if already chisq or z value (high values first) 59 | } else { 60 | if (datatype=="stdnorm") E <- qnorm(a2/(n+1),lower.tail=FALSE) #invnorm-transformed expected U(0,1) order statistics (put high scores first) 61 | if (datatype!="stdnorm") E <- qchisq(a2/(n+1),df=df,lower.tail=FALSE) #invchisq-transformed expected U(0,1) order statistics (put high scores first) 62 | if (datatype=="pvalue") O <- qchisq(sort(x)[a2],df=df,lower.tail=FALSE) #Take lowest pvalues, transform to chisq (highest/most interesting values first) 63 | } 64 | if (datatype!="pvalue") O <- sort(x, decreasing=TRUE)[a2] #Sort x (chisq or norm), highest (most interesting) values first 65 | #Derive "pretty" tick places for log10 p-value scale, if necessary 66 | #Note that by this stage, O/E will either contain chisq-scale or normal-scale values, and both are sorted 67 | if (scaletype=="pvalue") { #Note scaletype forced="quantile" for stdnorm data, so here all data is on chisq scale 68 | if (!is.null(xat)) x4Lx=xat else x4Lx = pretty( -log10( pchisq(c(E[1],E[length(E)]),df=df,lower.tail=FALSE) ) ) 69 | if (!is.null(yat)) y4Ly=yat else y4Ly = pretty( -log10( pchisq(c(O[1],O[length(O)]),df=df,lower.tail=FALSE) ) ) 70 | xnums = qchisq(10^-x4Lx,df=df,lower.tail=FALSE) #Get same locations on actual chisq scale 71 | ynums = qchisq(10^-y4Ly,df=df,lower.tail=FALSE) #Get same locations on actual chisq scale 72 | Lx <- parse( text=paste("10^-",x4Lx,sep="") ) 73 | Ly <- parse( text=paste("10^-",y4Ly,sep="") ) 74 | } else { #"Else" covers both chisq and stdnorm-scaled data 75 | if (!is.null(xat)) xnums=xat else xnums=pretty(c(E[1],E[length(E)])) 76 | if (!is.null(yat)) ynums=yat else ynums=pretty(c(O[1],O[length(O)])) 77 | Lx 
<- parse( text=as.character(xnums) ) 78 | Ly <- parse( text=as.character(ynums) ) 79 | } 80 | #Do Q-Q plot 81 | if (is.null(main)) { 82 | if (datatype=="stdnorm") main=paste("Q-Q plot (on stdnorm) of " ,pname, sep="") 83 | else main=paste("Q-Q plot (on chisq[",df,"]) of " ,pname, sep="") 84 | } 85 | if (is.null(xlab)) { 86 | if (scaletype=="pvalue") xlab="Expected p-value" 87 | else xlab="Expected quantile" 88 | } 89 | if (is.null(ylab)) { 90 | if (scaletype=="pvalue") ylab="Observed p-value" 91 | else ylab="Observed quantile" 92 | } 93 | plot( c(E[1],E[length(E)]), c(O[1],O[length(O)]), main = main, xlab = xlab, ylab = ylab, type = "n", xaxt = "n", yaxt = "n", ... ) #Just plots the outside box 94 | axis(1, at=xnums, labels=Lx ) 95 | axis(2, at=ynums, labels=Ly ) 96 | if (plot.concentration.band==TRUE) { #Note that conc band won't draw if x has too many datapoints 97 | if (one.sided==FALSE) { 98 | upper <- qbeta( 1-alpha/2, a2, b ) #Exp. upper CL for 'a'th U(0,1) order statistic (becomes 'lower') 99 | lower <- qbeta( alpha/2, a2, b ) #Exp. lower CL for 'a'th U(0,1) order statistic (becomes 'upper') 100 | } else { 101 | upper <- rep(1,length(E)) #Exp. upper CL for 'a'th U(0,1) order statistic (becomes 'lower') 102 | lower <- qbeta( alpha, a2, b ) #Exp. 
lower CL for 'a'th U(0,1) order statistic (becomes 'upper') 103 | } 104 | if (df==2) { 105 | polygon( c( E, rev(E) ), c( -2*log(upper), rev(E) ), col="grey", border = NA ) #'lower' band after trans 106 | polygon( c( E, rev(E) ), c( -2*log(lower), rev(E) ), col="grey", border = NA ) #'upper' band after trans 107 | } else { 108 | if (datatype=="stdnorm") { 109 | polygon( c( E, rev(E) ), c( qnorm(upper,lower.tail=FALSE), rev(E) ), col="grey", border = NA ) 110 | polygon( c( E, rev(E) ), c( qnorm(lower,lower.tail=FALSE), rev(E) ), col="grey", border = NA ) 111 | } else { 112 | polygon( c( E, rev(E) ), c( qchisq(upper,df=df,lower.tail=FALSE), rev(E) ), col="grey", border = NA ) #'lower' band 113 | polygon( c( E, rev(E) ), c( qchisq(lower,df=df,lower.tail=FALSE), rev(E) ), col="grey", border = NA ) #'upper' band 114 | } 115 | } 116 | } 117 | abline( 0, 1, col="red" ) #plot 1:1 line 118 | abline(h=ynums, v=xnums, col="lightgray", lty="dotted") #plot grid 119 | points( E, O, pch=pch, cex=cex, col=col ) #Finally, plot points 120 | if (print==TRUE) return( data.frame( O=O, E=E ) ) 121 | } 122 | --------------------------------------------------------------------------------