├── AddChromosomeNumber.sh
├── DropDuplicatedPositions.sh
├── DropDuplicatedSNPs.sh
├── ExtractAncestryOutliers.sh
├── FilterByInfoAll.sh
├── GenewiseManhattanPlotinR.R
├── Get_Covariates.R
├── ID_Build.py
├── IdHets.R
├── IndividualIBD.R
├── Iterative_Missingness.sh
├── LICENSE
├── MakeChunks.sh
├── MakeKeepIDs.sh
├── Make_glist.sh
├── ManhattanPlotinR.R
├── ManhattanPlotinRBumblebee.R
├── Manhattan_Plot_For_DTP.R
├── Master_imputation_script_posterior_sampled_haps.sh
├── MergeImputedChunks.sh
├── Modified_submit_impute2_jobs_to_cluster.R
├── PC-VS-OUTCOME_IN_R_FULL.R
├── PC-VS-OUTCOME_IN_R_SHORT.R
├── PC_Plot_1KG.R
├── PC_Plot_1KG_Greyed.R
├── PlotPCs.R
├── Prototype_imputation_job_posterior_sampled_haps.sh
├── QQPlot_For_DTP.R
├── QQPlotinR.R
├── QQPlotinR_Alternate.R
├── README.md
├── Relabel_rs.sh
├── ReplaceDots.sh
├── highLDregions4bim_b37.awk
├── highLDregions4bim_b38.awk
├── manhattan_DOG_TRY.R
├── manhattan_v2.R
├── manhattan_v2_bumblebee.R
└── qq_plot_v7.R

--------------------------------------------------------------------------------
/AddChromosomeNumber.sh:
--------------------------------------------------------------------------------
# Prefix each row of the per-chromosome IMPUTE2 output with its chromosome
# number, replacing the original first field.
results=$1
for i in {1..22}
do
    # FIX: the original wrote "awk -v i=$i results=$results '...'", which makes
    # awk treat "results=$results" as the program text and the quoted program
    # as a filename.  The awk variable was never used, so it is dropped.
    awk -v i=$i '{s=""; for (j=2; j <= NF; j++) s=s $j " "; print i, s}' $results/Chr$i.impute2 > New_Chromosome$i.impute2
done
# NOTE(review): only the X-chromosome file skips its first row (NR==1) —
# presumably a header line unique to the X output; confirm against the data.
awk 'NR==1 {print; next}{ s = ""; for (j = 2; j <= NF; j++) s = s $j " "; print "X", s }' $results/ChrX.impute2 > New_ChromosomeX.impute2

--------------------------------------------------------------------------------
/DropDuplicatedPositions.sh:
--------------------------------------------------------------------------------
source Config.conf

# List variant IDs that occur more than once after the rs-only renaming.
# FIX: the original used a Unicode en-dash ("uniq –d"), which uniq would treat
# as a filename; it must be the ASCII option -d to print duplicated lines.
awk '{print $2}' $root.post_imputation_final_rs_only.bim | \
sort | \
uniq -d > More_Duplicates_Removed

# Drop the duplicated variants from the binary fileset in place.
./plink2 \
--bfile $root.post_imputation_final \
--exclude More_Duplicates_Removed \
--make-bed \
--out $root.post_imputation_final

# Strip the ":pos" suffix from rs IDs, then mirror .bed/.fam alongside the
# rewritten .bim so the *_rs_only fileset is complete.
awk 'BEGIN {OFS = "\t"} $2 ~ /^rs/{gsub(":.*", "", $2) }1' $root.post_imputation_final.bim > $root.post_imputation_final_rs_only.bim
cp $root.post_imputation_final.bed $root.post_imputation_final_rs_only.bed
cp $root.post_imputation_final.fam $root.post_imputation_final_rs_only.fam

--------------------------------------------------------------------------------
/DropDuplicatedSNPs.sh:
--------------------------------------------------------------------------------
source Config.conf

# Annotate every variant with a chr:pos key, list the keys that occur more
# than once, and emit the variant IDs sitting at those duplicated positions.
awk '{print $0, $1":"$4}' $root.post_imputation_updated.bim > $root.post_imputation_updated_positions
awk '{print $1":"$4}' $root.post_imputation_updated.bim | sort | uniq -d > $root.post_imputation_updated_duplicated_positions
grep -w -f $root.post_imputation_updated_duplicated_positions $root.post_imputation_updated_positions | awk '{print $2}' > $root.post_imputation_updated_duplicated_IDs

--------------------------------------------------------------------------------
/ExtractAncestryOutliers.sh:
--------------------------------------------------------------------------------
source Config.conf

# Pull the sample IDs smartpca flagged as REMOVED outliers; the FID:IID key is
# split back into two whitespace-separated fields for PLINK.
awk '/REMOVED/ {print $3}' $root.pop_strat_outliers_smartpca.log | sed 's/:/ /g' > $root.pop_strat_outliers.outliers

--------------------------------------------------------------------------------
/FilterByInfoAll.sh:
--------------------------------------------------------------------------------
source Config.conf

# Keep variants with imputation INFO >= 0.8 (column 5 of the info file),
# then subset each chromosome's genotype file to those variant IDs.
gunzip $root.whole_genome.impute2_info.gz
awk '$5 >= 0.8' $root.whole_genome.impute2_info > $root.whole_genome_filtered.impute2_info
gzip $root.whole_genome.impute2_info
for i in {1..22}
do
    # First pass collects passing IDs ($2); second pass keeps matching rows.
    # NOTE(review): "Chromsome" is a typo in the output names, kept as-is in
    # case downstream steps expect these exact filenames.
    awk 'FNR==NR { a[$2]; next } $2 in a' $root.whole_genome_filtered.impute2_info New_Chromosome$i.impute2 > Filtered_Chromsome$i.impute2
done
awk 'FNR==NR { a[$2]; next } $2 in a' $root.whole_genome_filtered.impute2_info New_ChromosomeX.impute2 > Filtered_ChromsomeX.impute2
-------------------------------------------------------------------------------- /GenewiseManhattanPlotinR.R: -------------------------------------------------------------------------------- 1 | source("manhattan_v2_bumblebee.R") 2 | args <- commandArgs(TRUE) 3 | 4 | data <- args[1] 5 | chr <- args[2] 6 | bp <- args[3] 7 | p <- args[4] 8 | out <- args[5] 9 | gws <- as.numeric(args[6]) 10 | 11 | gwas1 <- read.table(data,head=T) 12 | data_to_plot <- data.frame(CHR=gwas1[,chr], BP=gwas1[,bp], P=gwas1[,p]) 13 | 14 | grey_zone <- gws*0.0001 / 0.00000005 15 | 16 | pdf(out,width=8,height=6) 17 | manhattan(data_to_plot, GWthresh=-log10(gws), GreyZoneThresh=-log10(grey_zone), DrawGWline=TRUE) 18 | dev.off() 19 | -------------------------------------------------------------------------------- /Get_Covariates.R: -------------------------------------------------------------------------------- 1 | args <- commandArgs(TRUE) 2 | root <- args[1] 3 | covar<- args[2] 4 | PCAEVEC<-read.table(paste(root,".dataname_pop_strat_includes.pca.evec",sep=""),head=T) 5 | colnames(PCAEVEC)<-c("IID","FID","PC1", "PC2", "PC3", "PC4", "PC5", "PC6", "PC7", "PC8", "PC9", "PC10", "PC11", "PC12", "PC13", "PC14", "PC15", "PC16", "PC17", "PC18", "PC19", "PC20", "PC21", "PC22", "PC23", "PC24", "PC25", "PC26", "PC27", "PC28", "PC29", "PC30", "PC31", "PC32", "PC33", "PC34", "PC35", "PC36", "PC37", "PC38", "PC39", "PC40", "PC41", "PC42", "PC43", "PC44", "PC45", "PC46", "PC47", "PC48", "PC49", "PC50", "PC51", "PC52", "PC53", "PC54", "PC55", "PC56", "PC57", "PC58", "PC59", "PC60", "PC61", "PC62", "PC63", "PC64", "PC65", "PC66", "PC67", "PC68", "PC69", "PC70", "PC71", "PC72", "PC73", "PC74", "PC75", "PC76", "PC77", "PC78", "PC79", "PC80", "PC81", "PC82", "PC83", "PC84", "PC85", "PC86", "PC87", "PC88", "PC89", "PC90", "PC91", "PC92", "PC93", "PC94", "PC95", "PC96", "PC97", "PC98", "PC99", "PC100") 6 | COVARIATES<-read.table(covar,head=T) 7 | COVAR_WITH_PCs<-merge(PCAEVEC, COVARIATES) 8 | 
write.table(COVAR_WITH_PCs, file=paste(root,".covariates_file.txt",sep=""), quote=F, row.names=F, col.names=T) 9 | -------------------------------------------------------------------------------- /ID_Build.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python -tt 2 | 3 | ##Basic build identifier 4 | ##Run as ID_Build.py PLINK_CHROMSOME_6_FILE.bim 5 | 6 | import sys 7 | import numpy as np 8 | import pandas as pd 9 | 10 | # Define a main() function 11 | def main(): 12 | if len(sys.argv) >= 2: 13 | with open(sys.argv[1]) as bim_6_file: 14 | Ref_hg18_dict = {'24979336' : 'C', 15 | '32537182' : 'T', 16 | '32647698' : 'A', 17 | '24871357' : 'X', 18 | '32429204' : 'X', 19 | '32539620' : 'X', 20 | '24871129' : 'X', 21 | '32461427' : 'X', 22 | '32571843' : 'X', 23 | '168465326' : 'T', 24 | '134105573' : 'T', 25 | '116083117' : 'T', 26 | '17839694' : 'T', 27 | '11943869' : 'G', 28 | '105624138' : 'T', 29 | '130013753' : 'G', 30 | '168776041' : 'T', 31 | '168722477' : 'X', 32 | '134063880' : 'X', 33 | '115976424' : 'X', 34 | '17731715' : 'X', 35 | '11834883' : 'X', 36 | '105517445' : 'X', 37 | '129972060' : 'X', 38 | '169034116' : 'X', 39 | '168321797' : 'X', 40 | '133742742' : 'X', 41 | '115655260' : 'X', 42 | '17731484' : 'X', 43 | '11834650' : 'X', 44 | '105069570' : 'X', 45 | '129650915' : 'X', 46 | '168633436' : 'X'} 47 | Ref_hg19_dict = {'24979336' : 'X', 48 | '32537182' : 'X', 49 | '32647698' : 'A', 50 | '24871357' : 'C', 51 | '32429204' : 'T', 52 | '32539620' : 'A', 53 | '24871129' : 'X', 54 | '32461427' : 'X', 55 | '32571843' : 'X', 56 | '168465326' : 'X', 57 | '134105573' : 'X', 58 | '116083117' : 'A', 59 | '17839694' : 'X', 60 | '11943869' : 'X', 61 | '105624138' : 'X', 62 | '130013753' : 'X', 63 | '168776041' : 'X', 64 | '168722477' : 'T', 65 | '134063880' : 'T', 66 | '115976424' : 'T', 67 | '17731715' : 'T', 68 | '11834883' : 'G', 69 | '105517445' : 'T', 70 | '129972060' : 'G', 71 | '169034116' : 'T', 72 | 
'168321797' : 'X', 73 | '133742742' : 'X', 74 | '115655260' : 'X', 75 | '17731484' : 'X', 76 | '11834650' : 'X', 77 | '105069570' : 'X', 78 | '129650915' : 'X', 79 | '168633436' : 'X'} 80 | Ref_hg38_dict = {'24979336' : 'X', 81 | '32537182' : 'X', 82 | '32647698' : 'X', 83 | '24871357' : 'X', 84 | '32429204' : 'X', 85 | '32539620' : 'X', 86 | '24871129' : 'C', 87 | '32461427' : 'T', 88 | '32571843' : 'A', 89 | '168465326' : 'X', 90 | '134105573' : 'X', 91 | '116083117' : 'X', 92 | '17839694' : 'X', 93 | '11943869' : 'X', 94 | '105624138' : 'X', 95 | '130013753' : 'X', 96 | '168776041' : 'X', 97 | '168722477' : 'X', 98 | '134063880' : 'X', 99 | '115976424' : 'X', 100 | '17731715' : 'X', 101 | '11834883' : 'X', 102 | '105517445' : 'X', 103 | '129972060' : 'X', 104 | '169034116' : 'X', 105 | '168321797' : 'T', 106 | '133742742' : 'T', 107 | '115655260' : 'T', 108 | '17731484' : 'T', 109 | '11834650' : 'G', 110 | '105069570' : 'T', 111 | '129650915' : 'G', 112 | '168633436' : 'T'} 113 | bim_6_df = pd.read_table(bim_6_file, delim_whitespace=True, header=None, prefix='V', index_col=0, usecols=[3,4]) 114 | if np.sum(bim_6_df.index.isin([24979336])) == 1: 115 | bim_6_dict = {'24979336' : bim_6_df.loc[24979336,'V4'].upper()} 116 | else: 117 | bim_6_dict = {'24979336' : 'X'} 118 | if np.sum(bim_6_df.index.isin([32537182])) == 1: 119 | bim_6_dict['32537182'] = bim_6_df.loc[32537182,'V4'].upper() 120 | else: 121 | bim_6_dict['32537182'] = 'X' 122 | if np.sum(bim_6_df.index.isin([32647698])) == 1: 123 | bim_6_dict['32647698'] = bim_6_df.loc[32647698, 'V4'].upper() 124 | else: 125 | bim_6_dict['32647698'] = 'X' 126 | if np.sum(bim_6_df.index.isin([24871357])) == 1: 127 | bim_6_dict['24871357'] = bim_6_df.loc[24871357, 'V4'].upper() 128 | else: 129 | bim_6_dict['24871357'] = 'X' 130 | if np.sum(bim_6_df.index.isin([32429204])) == 1: 131 | bim_6_dict['32429204'] = bim_6_df.loc[32429204, 'V4'].upper() 132 | else: 133 | bim_6_dict['32429204'] = 'X' 134 | if 
np.sum(bim_6_df.index.isin([32539620])) == 1: 135 | bim_6_dict['32539620'] = bim_6_df.loc[32539620, 'V4'].upper() 136 | else: 137 | bim_6_dict['32539620'] = 'X' 138 | if np.sum(bim_6_df.index.isin([24871129])) == 1: 139 | bim_6_dict['24871129'] = bim_6_df.loc[24871129, 'V4'].upper() 140 | else: 141 | bim_6_dict['24871129'] = 'X' 142 | if np.sum(bim_6_df.index.isin([32461427])) == 1: 143 | bim_6_dict['32461427'] = bim_6_df.loc[32461427, 'V4'].upper() 144 | else: 145 | bim_6_dict['32461427'] = 'X' 146 | if np.sum(bim_6_df.index.isin([32571843])) == 1: 147 | bim_6_dict['32571843'] = bim_6_df.loc[32571843, 'V4'].upper() 148 | else: 149 | bim_6_dict['32571843'] = 'X' 150 | if np.sum(bim_6_df.index.isin([168465326])) == 1: 151 | bim_6_dict['168465326'] = bim_6_df.loc[168465326, 'V4'].upper() 152 | else: 153 | bim_6_dict['168465326'] = 'X' 154 | if np.sum(bim_6_df.index.isin([134105573])) == 1: 155 | bim_6_dict['134105573'] = bim_6_df.loc[134105573, 'V4'].upper() 156 | else: 157 | bim_6_dict['134105573'] = 'X' 158 | if np.sum(bim_6_df.index.isin([116083117])) == 1: 159 | bim_6_dict['116083117'] = bim_6_df.loc[116083117, 'V4'].upper() 160 | else: 161 | bim_6_dict['116083117'] = 'X' 162 | if np.sum(bim_6_df.index.isin([17839694])) == 1: 163 | bim_6_dict['17839694'] = bim_6_df.loc[17839694, 'V4'].upper() 164 | else: 165 | bim_6_dict['17839694'] = 'X' 166 | if np.sum(bim_6_df.index.isin([11943869])) == 1: 167 | bim_6_dict['11943869'] = bim_6_df.loc[11943869, 'V4'].upper() 168 | else: 169 | bim_6_dict['11943869'] = 'X' 170 | if np.sum(bim_6_df.index.isin([105624138])) == 1: 171 | bim_6_dict['105624138'] = bim_6_df.loc[105624138, 'V4'].upper() 172 | else: 173 | bim_6_dict['105624138'] = 'X' 174 | if np.sum(bim_6_df.index.isin([130013753])) == 1: 175 | bim_6_dict['130013753'] = bim_6_df.loc[130013753, 'V4'].upper() 176 | else: 177 | bim_6_dict['130013753'] = 'X' 178 | if np.sum(bim_6_df.index.isin([168776041])) == 1: 179 | bim_6_dict['168776041'] = bim_6_df.loc[168776041, 
'V4'].upper() 180 | else: 181 | bim_6_dict['168776041'] = 'X' 182 | if np.sum(bim_6_df.index.isin([168722477])) == 1: 183 | bim_6_dict['168722477'] = bim_6_df.loc[168722477, 'V4'].upper() 184 | else: 185 | bim_6_dict['168722477'] = 'X' 186 | if np.sum(bim_6_df.index.isin([134063880])) == 1: 187 | bim_6_dict['134063880'] = bim_6_df.loc[134063880, 'V4'].upper() 188 | else: 189 | bim_6_dict['134063880'] = 'X' 190 | if np.sum(bim_6_df.index.isin([115976424])) == 1: 191 | bim_6_dict['115976424'] = bim_6_df.loc[115976424, 'V4'].upper() 192 | else: 193 | bim_6_dict['115976424'] = 'X' 194 | if np.sum(bim_6_df.index.isin([17731715])) == 1: 195 | bim_6_dict['17731715'] = bim_6_df.loc[17731715, 'V4'].upper() 196 | else: 197 | bim_6_dict['17731715'] = 'X' 198 | if np.sum(bim_6_df.index.isin([11834883])) == 1: 199 | bim_6_dict['11834883'] = bim_6_df.loc[11834883, 'V4'].upper() 200 | else: 201 | bim_6_dict['11834883'] = 'X' 202 | if np.sum(bim_6_df.index.isin([105517445])) == 1: 203 | bim_6_dict['105517445'] = bim_6_df.loc[105517445, 'V4'].upper() 204 | else: 205 | bim_6_dict['105517445'] = 'X' 206 | if np.sum(bim_6_df.index.isin([129972060])) == 1: 207 | bim_6_dict['129972060'] = bim_6_df.loc[129972060, 'V4'].upper() 208 | else: 209 | bim_6_dict['129972060'] = 'X' 210 | if np.sum(bim_6_df.index.isin([169034116])) == 1: 211 | bim_6_dict['169034116'] = bim_6_df.loc[169034116, 'V4'].upper() 212 | else: 213 | bim_6_dict['169034116'] = 'X' 214 | if np.sum(bim_6_df.index.isin([168321797])) == 1: 215 | bim_6_dict['168321797'] = bim_6_df.loc[168321797, 'V4'].upper() 216 | else: 217 | bim_6_dict['168321797'] = 'X' 218 | if np.sum(bim_6_df.index.isin([133742742])) == 1: 219 | bim_6_dict['133742742'] = bim_6_df.loc[133742742, 'V4'].upper() 220 | else: 221 | bim_6_dict['133742742'] = 'X' 222 | if np.sum(bim_6_df.index.isin([115655260])) == 1: 223 | bim_6_dict['115655260'] = bim_6_df.loc[115655260, 'V4'].upper() 224 | else: 225 | bim_6_dict['115655260'] = 'X' 226 | if 
np.sum(bim_6_df.index.isin([17731484])) == 1: 227 | bim_6_dict['17731484'] = bim_6_df.loc[17731484, 'V4'].upper() 228 | else: 229 | bim_6_dict['17731484'] = 'X' 230 | if np.sum(bim_6_df.index.isin([11834650])) == 1: 231 | bim_6_dict['11834650'] = bim_6_df.loc[11834650, 'V4'].upper() 232 | else: 233 | bim_6_dict['11834650'] = 'X' 234 | if np.sum(bim_6_df.index.isin([105069570])) == 1: 235 | bim_6_dict['105069570'] = bim_6_df.loc[105069570, 'V4'].upper() 236 | else: 237 | bim_6_dict['105069570'] = 'X' 238 | if np.sum(bim_6_df.index.isin([129650915])) == 1: 239 | bim_6_dict['129650915'] = bim_6_df.loc[129650915, 'V4'].upper() 240 | else: 241 | bim_6_dict['129650915'] = 'X' 242 | if np.sum(bim_6_df.index.isin([168633436])) == 1: 243 | bim_6_dict['168633436'] = bim_6_df.loc[168633436, 'V4'].upper() 244 | else: 245 | bim_6_dict['168633436'] = 'X' 246 | hg_18_score = 0 247 | hg_19_score = 0 248 | hg_38_score = 0 249 | Test = ''.join('{}'.format(val) for key, val in sorted(bim_6_dict.items())) 250 | hg_18_ref = ''.join('{}'.format(val) for key, val in sorted(Ref_hg18_dict.items())) 251 | hg_19_ref = ''.join('{}'.format(val) for key, val in sorted(Ref_hg19_dict.items())) 252 | hg_38_ref = ''.join('{}'.format(val) for key, val in sorted(Ref_hg38_dict.items())) 253 | for i in range(len(Test)): 254 | if Test[i] == 'X': 255 | continue 256 | if Test[i] != 'X': 257 | if hg_18_ref[i] == Test[i]: 258 | hg_18_score += 1 259 | if hg_19_ref[i] == Test[i]: 260 | hg_19_score += 1 261 | if hg_38_ref[i] == Test[i]: 262 | hg_38_score += 1 263 | hg_18_length = 0 264 | hg_19_length = 0 265 | hg_38_length = 0 266 | for i in range(len(hg_18_ref)): 267 | if hg_18_ref[i] == 'X': 268 | continue 269 | if hg_18_ref[i] != 'X': 270 | hg_18_length += 1 271 | for i in range(len(hg_19_ref)): 272 | if hg_19_ref[i] == 'X': 273 | continue 274 | if hg_19_ref[i] != 'X': 275 | hg_19_length += 1 276 | for i in range(len(hg_38_ref)): 277 | if hg_38_ref[i] == 'X': 278 | continue 279 | if hg_38_ref[i] != 'X': 280 
| hg_38_length += 1 281 | print '\n' 282 | if hg_18_score > hg_19_score: 283 | if hg_18_score > hg_38_score: 284 | print 'Probable build is hg18' 285 | if hg_18_score < hg_38_score: 286 | print 'Probable build is hg38' 287 | if hg_18_score == hg_38_score: 288 | print 'Cannot determine build' 289 | if hg_18_score < hg_19_score: 290 | if hg_19_score > hg_38_score: 291 | print 'Probable build is hg19' 292 | if hg_19_score < hg_38_score: 293 | print 'Probable build is hg38' 294 | if hg_19_score == hg_38_score: 295 | print 'Cannot determine build' 296 | if hg_18_score == hg_19_score: 297 | if hg_18_score < hg_38_score: 298 | print 'Probable build is hg38' 299 | if hg_18_score >= hg_38_score: 300 | print 'Cannot determine build' 301 | print '\nhg18 reference:', hg_18_ref 302 | print 'Test:', Test 303 | print 'Match:', hg_18_score, 'out of', hg_18_length 304 | print '\nhg19 reference:', hg_19_ref 305 | print 'Test:', Test 306 | print 'Match:', hg_19_score, 'out of', hg_19_length 307 | print '\nhg38 reference:', hg_38_ref 308 | print 'Test:', Test 309 | print 'Match:', hg_38_score, 'out of', hg_38_length 310 | else: 311 | print 'Give me a PLINK .bim file please. Chromosome 6, not pruned for preference' 312 | 313 | # This is the standard boilerplate that calls the main() function. 
314 | if __name__ == '__main__': 315 | main() 316 | -------------------------------------------------------------------------------- /IdHets.R: -------------------------------------------------------------------------------- 1 | data_dir <- getwd() 2 | setwd(data_dir) 3 | args <- commandArgs(TRUE) 4 | root <- args[1] 5 | # default 6 | ibc_file=paste(root,".ibc",sep="") 7 | sdcut=3 # number of sds at which to impose cut offs 8 | # get args 9 | t=commandArgs() 10 | if (charmatch("-args",t,nomatch=-1)>=0) args = t[((1:length(t))[t=="-args"]+1):length(t)] else args="" 11 | if (charmatch("ibc_file=",args,nomatch=-1)>=0) ibc_file = strsplit(args[charmatch("ibc_file=",args)],split="=")[[1]][2] 12 | if (charmatch("sdcut=",args,nomatch=-1)>=0) sdcut = strsplit(args[charmatch("sdcut=",args)],split="=")[[1]][2] 13 | ## 14 | d <- read.table(ibc_file,head=T); 15 | het_outliers_3sd <- abs(scale(d$Fhat2))>3 16 | write.table(d[het_outliers_3sd,],file=paste(root,".LD_het_outliers.txt",sep=""), sep="\t",quote=F,row.names=F); 17 | write.table(d[het_outliers_3sd,c(1,2)],file=paste(root,".LD_het_outliers_sample_exclude",sep=""), sep="\t",quote=F,row.names=F,col.names=F); 18 | -------------------------------------------------------------------------------- /IndividualIBD.R: -------------------------------------------------------------------------------- 1 | args <- commandArgs(TRUE) 2 | root <- args[1] 3 | sigma <- as.numeric(args[2]) 4 | U<-read.table(paste(root,".IBD.genome",sep=""),head=T) # read in table 5 | V_ONE<-with(U, data.frame(FID1, IID1, PI_HAT)) # get variables of interest - reference individual 6 | V_TWO<-with(U, data.frame(FID2, IID2, PI_HAT)) # get variables of interest - test individual 7 | names(V_TWO)<-c("FID1", "IID1", "PI_HAT") 8 | V<-as.data.frame(rbind(V_ONE, V_TWO)) 9 | names(V)<-c("FID1","IID1","PI_HAT") 10 | W<- aggregate(V$PI_HAT,FUN=mean,by=list(V$FID1, V$IID1)) #calculate average pi hat 11 | names(W)<-c("FID","IID","MEAN_PI_HAT") #rename columns 12 | 
X<-mean(W$MEAN_PI_HAT) #calculate mean of average pi hats 13 | Y<-sd(W$MEAN_PI_HAT) #calculate standard deviation of average pi hats 14 | Z<-X+(sigma*Y) # calculate threshold, here 6 SDs from mean 15 | sink(paste(root,".IBD_INDIV.txt",sep="")) 16 | W #print average pi hats 17 | sink(paste(root,".IBD_INDIV_outliers.txt",sep="")) 18 | subset(W,W$MEAN_PI_HAT>=Z)[,1:2] #print outliers 19 | -------------------------------------------------------------------------------- /Iterative_Missingness.sh: -------------------------------------------------------------------------------- 1 | source ./Config.conf 2 | 3 | aspercent=$(echo $1 " / 100" | bc -l) 4 | genomind_1=$(echo "1-"$aspercent | bc -l) 5 | 6 | $plink \ 7 | --bfile $root.common \ 8 | --geno $genomind_1 \ 9 | --make-bed \ 10 | --out $root.common_SNP$1 11 | 12 | #Remove samples with completeness < 90% 13 | 14 | $plink \ 15 | --bfile $root.common_SNP$1 \ 16 | --mind $genomind_1 \ 17 | --make-bed \ 18 | --out $root.common_sample$1.SNP$1 19 | 20 | newstep=$(($1+$3)) 21 | 22 | for i in $(seq $newstep $3 $2) 23 | 24 | do 25 | 26 | aspercent=$(echo $i " / 100" | bc -l) 27 | genomind=$(echo "1-"$aspercent | bc -l) 28 | prefix=$(($i-$3)) 29 | 30 | $plink \ 31 | --bfile $root.common_sample$prefix.SNP$prefix \ 32 | --geno $genomind \ 33 | --make-bed \ 34 | --out $root.common_sample$prefix.SNP$i 35 | 36 | $plink \ 37 | --bfile $root.common_sample$prefix.SNP$i \ 38 | --mind $genomind \ 39 | --make-bed \ 40 | --out $root.common_sample$i.SNP$i 41 | 42 | done 43 | 44 | $plink \ 45 | --bfile $root.common_sample$2.SNP$2 \ 46 | --make-bed \ 47 | --out $root.filtered 48 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Joni Coleman 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files 
(the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/MakeChunks.sh:
--------------------------------------------------------------------------------
source Config.conf

# Bin variants into 5Mb chunks per chromosome, then write per-chromosome
# chunk-boundary files for the imputation jobs.
# NOTE(review): the first awk program is truncated in this dump (the body of
# the "for (j=0; j ..." loop is missing); reproduced as found — recover the
# complete line from the original repository before running.
awk -v const=5000000 -v max=51 '{a[$1,int($4/const)]++; b[$1]} END{for (i in b) {for (j=0; j Chunks
awk '$4 != "" {print $2, $3 > "analysis_chunks_5Mb_chr"$1".txt"} ' < Chunks
awk '$4 != "" {print > "Chunks_chr"$1".txt"}' < Chunks

--------------------------------------------------------------------------------
/MakeKeepIDs.sh:
--------------------------------------------------------------------------------
# Collect sample IDs from each HapMap population's chr22 genotype header.
# FIX: the original looped over "{CEU, CHB, JPT, YRI}" — brace expansion with
# spaces is taken literally, so $pop was "{CEU,", "CHB," etc. — and used
# "{pop}" (literal) instead of "${pop}" inside the filenames.
for pop in CEU CHB JPT YRI
do
    # Header row -> one ID per line; drop the first five non-ID fields.
    head -1 genotypes_chr22_${pop}_r28_nr.b36_fwd.txt | sed 's/ /\n/g' | sort | sed '1,5d' > keepIDs${pop}
done
cat keepIDs* > keepids.txt

--------------------------------------------------------------------------------
/Make_glist.sh:
--------------------------------------------------------------------------------
# Build a gene list (chrom, min-start, max-end, gene) from $1, keeping
# chromosomes 1-22 and X and dropping *_g pseudo-entries; write to $2.
awk '{
if (!($4 in min)) {
min[$4]=$2; max[$4]=$3; chrom[$4]=$1
} else {
if ($2 < min[$4]) min[$4]=$2
if ($3 > max[$4]) max[$4]=$3
}
}
END {
for (name2 in min)
print chrom[name2], min[name2], max[name2], name2}' $1 | \
sort -k 1 -n | \
sed '1d' | \
awk '$1 >= 1 && $1 <= 22 || $1 =="X" {print $0}' | \
grep -v _g > $2

--------------------------------------------------------------------------------
/ManhattanPlotinR.R:
--------------------------------------------------------------------------------
# Standard Manhattan plot of the post-imputation association results.
source("manhattan_v2.R")
args <- commandArgs(TRUE)
root <- args[1]
gwas1 <- read.table(paste(root, ".post_imputation_final_analysis_FOR_MP", sep=""), head=T)
data_to_plot <- data.frame(CHR=gwas1$CHR, BP=gwas1$BP, P=gwas1$P)
pdf(paste(root, ".post_imputation_final_analysis_MP.pdf", sep=""), width=8, height=6)
manhattan(data_to_plot, GWthresh=-log10(5e-8), GreyZoneThresh=-log10(1e-4), DrawGWline=TRUE)
dev.off()

--------------------------------------------------------------------------------
/ManhattanPlotinRBumblebee.R:
--------------------------------------------------------------------------------
# As ManhattanPlotinR.R but with the "bumblebee" colour scheme.
source("manhattan_v2_bumblebee.R")
args <- commandArgs(TRUE)
root <- args[1]
gwas1 <- read.table(paste(root, ".post_imputation_final_analysis_FOR_MP", sep=""), head=T)
data_to_plot <- data.frame(CHR=gwas1$CHR, BP=gwas1$BP, P=gwas1$P)
pdf(paste(root, ".post_imputation_final_analysis_MP.pdf", sep=""), width=8, height=6)
manhattan(data_to_plot, GWthresh=-log10(5e-8), GreyZoneThresh=-log10(1e-4), DrawGWline=TRUE)
dev.off()

--------------------------------------------------------------------------------
/Manhattan_Plot_For_DTP.R:
--------------------------------------------------------------------------------
# Manhattan plot of an arbitrary results file given on the command line.
source("manhattan_v2.R")
args <- commandArgs(TRUE)
root <- args[1]
gwas1 <- read.table(root, head=T)
data_to_plot <- data.frame(CHR=gwas1$CHR, BP=gwas1$BP, P=gwas1$P)
pdf(paste(root, ".MP.pdf", sep=""), width=8, height=6)
manhattan(data_to_plot, GWthresh=-log10(5e-8), GreyZoneThresh=-log10(1e-4), DrawGWline=TRUE)
dev.off()

--------------------------------------------------------------------------------
/Master_imputation_script_posterior_sampled_haps.sh:
--------------------------------------------------------------------------------
#!/bin/bash
#$ -cwd

# Generate one qsub command per imputation chunk for every autosome.
for chrom in {1..22}
do
impute2_examples/modified_submit_impute2_jobs_to_cluster.R chr=${chrom} post.avg.impute.run=TRUE
done
sed -i '{s/ qsub/\n\qsub/g}' impute2_examples/qsublist.sh
sed -i '1d' impute2_examples/qsublist.sh

# 45 jobs per batch; batches are chained so the cluster is not flooded.
split -l 45 impute2_examples/qsublist.sh impute2_examples/qsub

# FIX: SGE qsub options take a single dash (-N, -hold_jid); the original
# "--N" / "--hold_jid" would be rejected by qsub.
sed -i '0,/qsub/{s/qsub/qsub -N job1/}' impute2_examples/qsubaa
sed -i '0,/qsub/{s/qsub/qsub -N job2/}' impute2_examples/qsubab
sed -i '0,/qsub/{s/qsub/qsub -N job3/}' impute2_examples/qsubac
sed -i '0,/qsub/{s/qsub/qsub -N job4/}' impute2_examples/qsubad
sed -i '0,/qsub/{s/qsub/qsub -N job5/}' impute2_examples/qsubae
sed -i '0,/qsub/{s/qsub/qsub -N job6/}' impute2_examples/qsubaf ##etc., for as many jobs as are needed

sed -i '{s/qsub/qsub -hold_jid job1/g}' impute2_examples/qsubab
sed -i '{s/qsub/qsub -hold_jid job2/g}' impute2_examples/qsubac
sed -i '{s/qsub/qsub -hold_jid job3/g}' impute2_examples/qsubad
sed -i '{s/qsub/qsub -hold_jid job4/g}' impute2_examples/qsubae
sed -i '{s/qsub/qsub -hold_jid job5/g}' impute2_examples/qsubaf ##etc. for as many jobs as are needed

sh impute2_examples/qsubaa
sh impute2_examples/qsubab
sh impute2_examples/qsubac
sh impute2_examples/qsubad
sh impute2_examples/qsubae
sh impute2_examples/qsubaf

--------------------------------------------------------------------------------
/MergeImputedChunks.sh:
--------------------------------------------------------------------------------
# Concatenate per-chunk IMPUTE2 outputs into one file per chromosome.
# NOTE(review): "mv gwas_data_chr$i* Chr$i" assumes Chr$i already exists in the
# working directory (only results-directory/Chr$i is created here), and the
# chr1*/chr2* globs will also match chr10-19/chr20-22 unless the filenames
# have a delimiter after the chromosome number — verify before running.
for i in {1..22}
do
mkdir results-directory/Chr$i
mv gwas_data_chr$i* Chr$i
cat Chr$i/*.impute2 > Chr$i/Chr$i.impute2
cat Chr$i/*.impute2_info > Chr$i/Chr$i.impute2_info
mv Chr$i/Chr$i.impute2* results-directory/
done
mkdir results-directory/ChrX
mv gwas_data_chrX* ChrX
cat ChrX/*.impute2 > ChrX/ChrX.impute2
cat ChrX/*.impute2_info > ChrX/ChrX.impute2_info
mv ChrX/ChrX.impute2* results-directory/

--------------------------------------------------------------------------------
/Modified_submit_impute2_jobs_to_cluster.R:
--------------------------------------------------------------------------------
#!/usr/bin/Rscript --vanilla

root.dir <- "."
data.dir <- paste(root.dir, "Data/", sep="")

# default settings; can change on command line
chr <- 10
chunk.size <- 5 # chunk size in Mb

# exactly one of the following must be set to TRUE on the command line
phasing.run <- FALSE            # is this a phasing run?
best.guess.impute.run <- FALSE  # imputation using best-guess haplotypes?
post.avg.impute.run <- FALSE    # imputation using posterior-sampled haplotypes?

## process command-line arguments of the form key=value
args <- strsplit(commandArgs(TRUE), split='=')
keys <- vector("character")

if (length(args) > 0) {
  for (i in 1:length(args)) {
    key <- args[[i]][1]
    value <- args[[i]][2]
    keys <- c(keys, key)

    if (exists(key)) {
      # replace default value of key with input value
      assign(key, value)
    }
    else {
      cat("\n")
      stop(paste("Unrecognized option [", key, "].\n\n", sep=""))
    }
  }
}

# housekeeping: command-line values arrive as strings
phasing.run <- as.logical(phasing.run)
best.guess.impute.run <- as.logical(best.guess.impute.run)
post.avg.impute.run <- as.logical(post.avg.impute.run)

# exit the script if it is not clear what type of IMPUTE2 job we want to run
stopifnot(phasing.run + best.guess.impute.run + post.avg.impute.run == 1)


# read in file with chunk boundary definitions
chunk.file <- paste(data.dir, "analysis_chunks_", chunk.size, "Mb_chr", chr, ".txt", sep="")
chunks <- read.table(chunk.file, head=T, as.is=T)

# append one qsub command per analysis chunk on this chromosome to qsublist.sh
sink(paste(root.dir, "qsublist.sh", sep=""), append=T)
for (i in 1:nrow(chunks)) {
  system.call <- paste(" qsub ",
                       ifelse(phasing.run, "./prototype_phasing_job.sh ", ""),
                       ifelse(best.guess.impute.run, "./prototype_imputation_job_best_guess_haps.sh ", ""),
                       ifelse(post.avg.impute.run, "/root/to/impute2_examples/prototype_imputation_job_posterior_sampled_haps.sh ", ""),
                       chr, " ", chunks[i,1], " ", chunks[i,2],
                       sep="")
  cat(system.call)
}
# FIX: the sink was never closed in the original, leaving output buffered
# until process exit.
sink()

--------------------------------------------------------------------------------
/PC-VS-OUTCOME_IN_R_FULL.R:
--------------------------------------------------------------------------------
# Full lm() summaries of the phenotype against cumulatively added PCs (1..100).
args <- commandArgs(TRUE)
root <- args[1]
pheno <- args[2]
PCAEVEC <- read.table(paste(root, ".pca.evec", sep=""), head=T)
# IDs, 100 PCs, then the smartpca phenotype column (names generated, not typed).
colnames(PCAEVEC) <- c("FID", "IID", paste0("PC", 1:100), "Pheno")
PHENOTYPE <- read.table(pheno, head=T)
PCAPHENO <- merge(PCAEVEC, PHENOTYPE)
sink(paste(root, ".PC_Output_Associations_FULL.txt", sep=""))
for (i in 1:100) {
  # columns 3..(i+2) are PC1..PCi; column 104 is the merged phenotype
  DATA <- as.data.frame(PCAPHENO[, c(3:(i+2))])
  print(summary(lm(PCAPHENO[, 104] ~ ., data=DATA)))
}
sink()

--------------------------------------------------------------------------------
/PC-VS-OUTCOME_IN_R_SHORT.R:
--------------------------------------------------------------------------------
# One-line-per-PC summary: p-value of the newly added PC and its incremental
# r-squared over the model with one fewer PC.
args <- commandArgs(TRUE)
root <- args[1]
pheno <- args[2]
PCAEVEC <- read.table(paste(root, ".pca.evec", sep=""), head=T)
colnames(PCAEVEC) <- c("FID", "IID", paste0("PC", 1:100), "Pheno")
PHENOTYPE <- read.table(pheno, head=T)
PCAPHENO <- merge(PCAEVEC, PHENOTYPE)
sink(paste(root, ".PC_Output_Associations_SHORT.txt", sep=""))
writeLines(c(" PC P R-squared"))
options(scipen=999)
# PERF: each model is now fitted once per iteration (the original refitted the
# same lm up to three times); prev_r2 carries the (i-1)-PC model's r-squared.
DATA <- as.data.frame(PCAPHENO[, 3])
fit <- summary(lm(PCAPHENO[, 104] ~ DATA[, 1], data=DATA))
print(c(1, fit$coefficients[2, 4], fit$r.squared))
prev_r2 <- fit$r.squared
for (i in 2:100) {
  DATA <- as.data.frame(PCAPHENO[, c(3:(i+2))])
  fit <- summary(lm(PCAPHENO[, 104] ~ ., data=DATA))
  print(c(i, fit$coefficients[(i+1), 4], fit$r.squared - prev_r2))
  prev_r2 <- fit$r.squared
}
sink()

--------------------------------------------------------------------------------
/PC_Plot_1KG.R:
--------------------------------------------------------------------------------
# Pairwise scatter plots of the first five 1KG-projected PCs, coloured by
# population, one page per pair.
args <- commandArgs(TRUE)
root <- args[1]
PCAEVEC <- read.table(paste(root, ".1kg.LD_pop_strat.pca.evec_RENAMED", sep=""), head=T)
colnames(PCAEVEC) <- c("ID","PC1","PC2","PC3","PC4","PC5","PC6","PC7","PC8","PC9","PC10","Pop")
library(ggplot2)
pdf(paste(root, "1kg.LD_pop_strat_PCA.pdf", sep=""))
# print() is required for ggplot objects when the script is source()d;
# top-level autoprint only happens under Rscript.
print(with(PCAEVEC, qplot(PC1, PC2, colour=Pop)))
print(with(PCAEVEC, qplot(PC1, PC3, colour=Pop)))
print(with(PCAEVEC, qplot(PC1, PC4, colour=Pop)))
print(with(PCAEVEC, qplot(PC1, PC5, colour=Pop)))
print(with(PCAEVEC, qplot(PC2, PC3, colour=Pop)))
print(with(PCAEVEC, qplot(PC2, PC4, colour=Pop)))
print(with(PCAEVEC, qplot(PC2, PC5, colour=Pop)))
print(with(PCAEVEC, qplot(PC3, PC4, colour=Pop)))
print(with(PCAEVEC, qplot(PC3, PC5, colour=Pop)))
print(with(PCAEVEC, qplot(PC4, PC5, colour=Pop)))
dev.off()
-------------------------------------------------------------------------------- /PC_Plot_1KG_Greyed.R: -------------------------------------------------------------------------------- 1 | ###Author: JRIC 2 | ###Date: 2018-07-13 3 | ###Purpose: Plot user data projected on 1KG PCs, greying out 1KG individuals 4 | 5 | ##Load packages 6 | library(ggplot2) 7 | library(colorspace) 8 | 9 | ##Initialise command line arguments 10 | args <- commandArgs(TRUE) 11 | 12 | ##Load data 13 | root <- args[1] 14 | PCAEVEC<-read.table(paste(root,".1kg.LD_pop_strat.pca.evec_RENAMED",sep=""), head=T) 15 | 16 | ##Rename columns 17 | colnames(PCAEVEC) <- c("ID","PC1","PC2","PC3","PC4","PC5","PC6","PC7","PC8","PC9","PC10","Pop") 18 | 19 | ##Define colour palette 20 | ThousandGenomesPalette<-heat_hcl(length(unique(PCAEVEC$Pop)), h = c(300, 75), c. = c(35, 95), l = c(15, 90), power = c(0.8, 1.2), fixup = TRUE, gamma = NULL, alpha = 1) 21 | names(ThousandGenomesPalette)<-unique(PCAEVEC$Pop) 22 | 23 | ThousandGenomesPops<-c("LWK","MXL","PUR","TSI","YRI","ASW","CEU","CHB","CHS","CLM","FIN","GBR","IBS","JPT") 24 | 25 | ThousandGenomesPalette[names(ThousandGenomesPalette) %in% ThousandGenomesPops] <- "#CCCCCC" 26 | ThousandGenomesPalette[!names(ThousandGenomesPalette) %in% ThousandGenomesPops] <- heat_hcl(length(unique(PCAEVEC$Pop)) - 14, h = c(300, 75), c. 
= c(35, 95), l = c(15, 90), power = c(0.8, 1.2), fixup = TRUE, gamma = NULL, alpha = 1) 27 | 28 | ##Print pairwise comparisons of PC1-5 to pdf 29 | pdf(paste(root,"1kg.LD_pop_strat_PCA.pdf",sep="")) 30 | with(PCAEVEC, qplot(PC1,PC2,colour=Pop) + scale_colour_manual(values = ThousandGenomesPalette)) 31 | with(PCAEVEC, qplot(PC1,PC3,colour=Pop) + scale_colour_manual(values = ThousandGenomesPalette)) 32 | with(PCAEVEC, qplot(PC1,PC4,colour=Pop) + scale_colour_manual(values = ThousandGenomesPalette)) 33 | with(PCAEVEC, qplot(PC1,PC5,colour=Pop) + scale_colour_manual(values = ThousandGenomesPalette)) 34 | with(PCAEVEC, qplot(PC2,PC3,colour=Pop) + scale_colour_manual(values = ThousandGenomesPalette)) 35 | with(PCAEVEC, qplot(PC2,PC4,colour=Pop) + scale_colour_manual(values = ThousandGenomesPalette)) 36 | with(PCAEVEC, qplot(PC2,PC5,colour=Pop) + scale_colour_manual(values = ThousandGenomesPalette)) 37 | with(PCAEVEC, qplot(PC3,PC4,colour=Pop) + scale_colour_manual(values = ThousandGenomesPalette)) 38 | with(PCAEVEC, qplot(PC3,PC5,colour=Pop) + scale_colour_manual(values = ThousandGenomesPalette)) 39 | with(PCAEVEC, qplot(PC4,PC5,colour=Pop) + scale_colour_manual(values = ThousandGenomesPalette)) 40 | dev.off() 41 | -------------------------------------------------------------------------------- /PlotPCs.R: -------------------------------------------------------------------------------- 1 | args <- commandArgs(TRUE) 2 | root <- args[1] 3 | pcx <- as.numeric(args[2]) 4 | pcy <- as.numeric(args[3]) 5 | 6 | PCAEVEC<-read.table(paste(root,".pca.evec",sep=""),head=T) 7 | colnames(PCAEVEC)<-c("FID","IID","PC1", "PC2", "PC3", "PC4", "PC5", "PC6", "PC7", "PC8", "PC9", "PC10", "PC11", "PC12", "PC13", "PC14", "PC15", "PC16", "PC17", "PC18", "PC19", "PC20", "PC21", "PC22", "PC23", "PC24", "PC25", "PC26", "PC27", "PC28", "PC29", "PC30", "PC31", "PC32", "PC33", "PC34", "PC35", "PC36", "PC37", "PC38", "PC39", "PC40", "PC41", "PC42", "PC43", "PC44", "PC45", "PC46", "PC47", "PC48", 
"PC49", "PC50", "PC51", "PC52", "PC53", "PC54", "PC55", "PC56", "PC57", "PC58", "PC59", "PC60", "PC61", "PC62", "PC63", "PC64", "PC65", "PC66", "PC67", "PC68", "PC69", "PC70", "PC71", "PC72", "PC73", "PC74", "PC75", "PC76", "PC77", "PC78", "PC79", "PC80", "PC81", "PC82", "PC83", "PC84", "PC85", "PC86", "PC87", "PC88", "PC89", "PC90", "PC91", "PC92", "PC93", "PC94", "PC95", "PC96", "PC97", "PC98", "PC99", "PC100", "Pheno") 8 | library(ggplot2) 9 | pcx<-2+pcx 10 | pcy<-2+pcy 11 | pdf(paste(root,"_PC",(pcx-2),"_PC",(pcy-2),".pdf",sep="")) 12 | qplot(PCAEVEC[,pcx],PCAEVEC[,pcy], data=PCAEVEC, color=Pheno) + xlab(paste("PC",(pcx-2),sep="")) + ylab(paste("PC",(pcy-2),sep="")) 13 | dev.off() 14 | -------------------------------------------------------------------------------- /Prototype_imputation_job_posterior_sampled_haps.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #$-S /bin/sh 3 | 4 | CHR=$1 5 | CHUNK_START=`printf "%.0f" $2` 6 | CHUNK_END=`printf "%.0f" $3` 7 | 8 | # directories 9 | ROOT_DIR="./" 10 | DATA_DIR=${ROOT_DIR}downloaded_references/ 11 | RESULTS_DIR=${ROOT_DIR}results_directory/ 12 | 13 | # executable 14 | IMPUTE2_EXEC=bin/impute2 15 | 16 | # parameters 17 | NE=20000 18 | iter=30 19 | burnin=10 20 | k=80 21 | k_hap=500 22 | 23 | # reference data files 24 | GENMAP_FILE=${DATA_DIR}genetic_map_chr${CHR}_combined_b37.txt 25 | HAPS_FILE=${DATA_DIR} ALL.chr${CHR}.integrated_phase1_v3.20101123.snps_indels_svs.genotypes.nomono.haplotypes.gz 26 | LEGEND_FILE=${DATA_DIR} ALL.chr${CHR}.integrated_phase1_v3.20101123.snps_indels_svs.genotypes.nomono.legend.gz 27 | STRAND_FILE=${DATA_DIR}dataname_{CHR}.strand 28 | 29 | # GWAS data files 30 | GWAS_GTYPE_FILE=${DATA_DIR}Chr${CHR}.gen 31 | 32 | # main output file 33 | OUTPUT_FILE=${RESULTS_DIR}gwas_data_chr${CHR}.pos${CHUNK_START}-${CHUNK_END}.posterior_sampled_haps_imputation.impute2 34 | 35 | ## impute genotypes from posterior--sampled GWAS haplotypes 36 | 
$IMPUTE2_EXEC \ 37 | -m $GENMAP_FILE \ 38 | -g $GWAS_GTYPE_FILE \ 39 | -strand_g $STRAND_FILE \ 40 | -h $HAPS_FILE \ 41 | -l $LEGEND_FILE \ 42 | -Ne $NE \ 43 | -iter $iter \ 44 | -burnin $burnin \ 45 | -k $k \ 46 | -k_hap $k_hap \ 47 | -int $CHUNK_START $CHUNK_END \ 48 | -allow_large_regions \ 49 | -o $OUTPUT_FILE 50 | -------------------------------------------------------------------------------- /QQPlot_For_DTP.R: -------------------------------------------------------------------------------- 1 | source("qq_plot_v7.R") 2 | args <- commandArgs(TRUE) 3 | root <- args[1] 4 | gwas1<-read.table(paste(root, sep=""),head=T) 5 | x1<-gwas1$P 6 | pdf(paste(root,".QQ.pdf",sep=""),width=8,height=6) 7 | qq.plot(x1, alpha=0.05, datatype="pvalue", scaletype="pvalue", df=1, plot.concentration.band=TRUE, one.sided=FALSE,frac=0.1, print=F, xat=NULL, yat=NULL, main=NULL, xlab=NULL, ylab=NULL, pch="x", cex=0.5, col="black") 8 | text(x=12,y=4,paste("lambda--median=", format((median(qchisq(p=1-x1,df=1)))/0.4549,digits=3),sep="")) 9 | dev.off() 10 | 11 | ###This version of the QQ plot wrapper also writes values of lambda median on the plot, using p--values for all SNPs plotted.### 12 | -------------------------------------------------------------------------------- /QQPlotinR.R: -------------------------------------------------------------------------------- 1 | source("qq_plot_v7.R") 2 | args <- commandArgs(TRUE) 3 | root <- args[1] 4 | gwas1<-read.table(paste(root,".post_imputation_final_analysis_p",sep=""), head=T) 5 | x1<-gwas1$P 6 | pdf(paste(root,".post_imputation_final_analysis_QQ.pdf",sep=""),width=8,height=6) 7 | qq.plot(x1, alpha=0.05, datatype="pvalue", scaletype="pvalue", df=1, plot.concentration.band=TRUE, one.sided=FALSE,frac=0.1, print=F, xat=NULL, yat=NULL, main=NULL, xlab=NULL, ylab=NULL, pch="x", cex=0.5, col="black") 8 | text(x=12,y=4,paste("lambda--median=", format((median(qchisq(p=1-x1,df=1)))/0.4549,digits=3),sep="")) 9 | dev.off() 10 | 11 | ###This version of 
the QQ plot wrapper also writes values of lambda median on the plot, using p--values for all SNPs plotted.### 12 | -------------------------------------------------------------------------------- /QQPlotinR_Alternate.R: -------------------------------------------------------------------------------- 1 | library(data.table) 2 | source("~/Desktop/gwas_scripts/qq_plot_v7.R") 3 | gwas1<-fread("daner_PGC_BIP32b_mds7a_0416a_INFO3_AF1",data.table=F) 4 | x1<-gwas1$P 5 | png("daner_PGC_BIP32b_mds7a_0416a_INFO3_AF1.png",width=2400,height=3000, res=300) 6 | qq.plot(x1, alpha=0.05, datatype="pvalue", scaletype="pvalue", df=1, plot.concentration.band=TRUE, one.sided=FALSE,frac=1, print=F, xat=NULL, yat=NULL, main="QQ Plot", xlab=NULL, ylab=NULL, pch="x", cex=1, col="black", cex.lab=1.5, cex.main=1.5) 7 | text(x=20,y=4,paste(expression(lambda[median]),"=", format((median(qchisq(p=1-x1,df=1)))/0.4549,digits=3),sep=" "), cex=1.5) 8 | dev.off() 9 | 10 | # gwas2<-fread("daner_PGC_BIP32b_mds7a_0416a_INFO6_AF1",data.table=F) 11 | # x2<-gwas2$P 12 | # png("daner_PGC_BIP32b_mds7a_0416a_INFO6_AF1.png",width=4800,height=6000, res=300) 13 | # qq.plot(x2, alpha=0.05, datatype="pvalue", scaletype="pvalue", df=1, plot.concentration.band=TRUE, one.sided=FALSE,frac=0.1, print=F, xat=NULL, yat=NULL, main=NULL, xlab=NULL, ylab=NULL, pch="x", cex=0.5, col="black") 14 | # text(x=12,y=4,paste("lambda-median=", format((median(qchisq(p=1-x1,df=1)))/0.4549,digits=3),sep="")) 15 | # dev.off() 16 | 17 | ###This version of the QQ plot wrapper also writes values of lambda median on the plot, using p--values for all SNPs plotted.### 18 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |  2 | 3 | # gwas_scripts 4 | ###### GWAS codebook (Coleman et al, 2015, Briefings in Functional Genomics), version 1.0.0 5 | ##### Please address questions, comments and improvements to [my google 
group](https://groups.google.com/forum/#!forum/gwas-questions) 6 | ##### If you use the scripts and advice herein, please consider citing our paper, the full text of which is available on the publisher's website: 7 | ##### Quality control, imputation and analysis of genome-wide genotyping data from the Illumina HumanCoreExome microarray. Jonathan R. I. Coleman; Jack Euesden; Hamel Patel; Amos A. Folarin; Stephen Newhouse; Gerome Breen. Briefings in Functional Genomics 2016; [doi:10.1093/bfgp/elv037](http://bfg.oxfordjournals.org/content/15/4/298) 8 | 9 | 10 | The scripts in this repo are referenced in the publication referenced above, which provides a straight-forward guide to the quality control, imputation and analysis of genome-wide genotype data. Scripts can be tested using the toy PLINK dataset kindly provided by Shaun Purcell on the PLINK 1.07 website: [example.zip](https://zzz.bwh.harvard.edu/plink/dist/example.zip). 11 | 12 | 13 | This pipeline is designed to provide a useful resource for using genome-wide data from low-coverage arrays and smaller projects. As projects grow larger and more complex, it may be valuable to consult software creators' websites to seek more sophisticated analysis methods. A brief list of these is provided at the end of this document - I will gladly consider suggested inclusions. 14 | 15 | For the quality control, imputation and analysis of large scale genome-wide genotype data, it is highly recommended to look at Ricopili, the pipeline of the Psychiatric Genomics Consortium, which is currently being deposited in [this repo](https://github.com/Nealelab/ricopili) and is documented [here](https://sites.google.com/a/broadinstitute.org/ricopili/). All credit for Ricopili goes to its creators. 
16 | 17 | Within this protocol, the following software is used: 18 | 19 | • [PLINK](http://zzz.bwh.harvard.edu/plink/) / [PLINK2](https://www.cog-genomics.org/plink2) 20 | 21 | • [R](http://www.r-project.org/) 22 | 23 | • [EIGENSOFT](http://www.hsph.harvard.edu/alkes-price/software/) 24 | 25 | • [IMPUTE](https://mathgen.stats.ox.ac.uk/impute/impute_v2.html) 26 | 27 | 28 | The protocol runs in a UNIX environment, and makes use of some of the basic software of the UNIX operating system. It should run on a Mac, but not in Windows. An exception to this is the GCTA MLMA GWAS analyses described at the end of the protocol - such analyses are only implemented in the Linux version of GCTA. Most sections of this protocol are designed to be usable simply by pasting into the command line – variables are set when each command is run, and should be straight-forward to modify. 29 | 30 | # Procedure 31 | 32 | ##### Recalling and rare-variant calling 33 | 34 | Not covered by this protocol, see [this protocol](https://confluence.brc.iop.kcl.ac.uk:8493/display/PUB/Production+Version%3A+Illumina+Exome+Chip+SOP+v1.4), which presents best-practice for recalling the raw genotype data using Illumina GenomeStudio, and https://github.com/KHP-Informatics/chip_gt, which implements and compares the results of [ZCall](https://github.com/jigold/zCall) and [Opticall](https://www.sanger.ac.uk/resources/software/opticall/). [This Nature Protocols paper](http://www.ncbi.nlm.nih.gov/pmc/articles/PMC4441213/) is also very good. 35 | 36 | ##### Reformat of data from the rare caller pipeline 37 | 38 | The Human Core Exome array contains some SNPs called "SNP…" In order to make ZCall run effectively, it is necessary to change the name of these SNPs, e.g. 
to "xxx…" This can be done using the UNIX program sed 39 | 40 | ```{sed} 41 | sed 's/SNP/xxx/g' < rootname.report > rootname.updated.report 42 | ``` 43 | 44 | Following the implementation of the rare caller pipeline, it is recommended to review the concordance between ZCall and Opticall − concordance is expected to be high (>99%). 45 | 46 | ##### Define names and locations of important files and software: 47 | 48 | ```{UNIX} 49 | printf "root=/path/to/rootname 50 | pheno=/path/to/external_pheno.phe 51 | covar=/path/to/covariates.cov 52 | genders=/path/to/external_genders.txt 53 | names=/path/to/external_individual_names.txt 54 | keeps=/path/to/samples_to_keep.txt 55 | excludes=/path/to/samples_to_exclude.txt 56 | insnps=/path/to/SNPs_to_keep.txt 57 | outsnps=/path/to/SNPs_to_exclude.txt 58 | plink=/path/to/plink2 59 | R=/path/to/R" > Config.conf 60 | ``` 61 | 62 | File formats are the [PLINK file formats](http://pngu.mgh.harvard.edu/~purcell/plink/data.shtml). 63 | 64 | "rootname" is the prefix of the PLINK binary files obtained from the Exome-chip pipeline (i.e. the .bed file from the ZCall branch has the name "rootname_filt_Zcall_UA.bed"), and "/path/to/" is the location of these files on the computer. 65 | 66 | NB: not all of these files may be relevant to your study. 67 | 68 | ##### Review the PLINK binary (.bed, .bim, .fam) files from Exome-chip pipeline 69 | 70 | _Check individuals_ 71 | 72 | ```{UNIX} 73 | less $root.fam 74 | ``` 75 | 76 | _Check SNPs_ 77 | 78 | ```{UNIX} 79 | less $root.bim 80 | ``` 81 | 82 | ##### Update files 83 | 84 | Phenotypes, individual names, genders, or SNP alleles may be lost in preparatory steps. These can be updated using external files. 
85 | 86 | _Update phenotype_ 87 | 88 | ```{PLINK} 89 | $plink \ 90 | --bfile $root \ 91 | --pheno $pheno \ 92 | --make-bed \ 93 | --out $root.updated_pheno 94 | ``` 95 | 96 | _Update genders_ 97 | 98 | ```{PLINK} 99 | $plink \ 100 | --bfile $root \ 101 | --update-sex $genders \ 102 | --make-bed \ 103 | --out $root.updated_genders 104 | ``` 105 | 106 | _Update sample names_ 107 | 108 | ```{PLINK} 109 | $plink \ 110 | --bfile $root \ 111 | --update-ids $names \ 112 | --make-bed \ 113 | --out $root.updated_names 114 | ``` 115 | 116 | _Select individuals for analysis_ 117 | 118 | ```{PLINK} 119 | $plink \ 120 | --bfile $root \ 121 | --keep $keeps \ 122 | --make-bed \ 123 | --out $root.kept_names 124 | ``` 125 | 126 | Or: 127 | 128 | ```{PLINK} 129 | $plink \ 130 | --bfile $root \ 131 | --remove $excludes \ 132 | --make-bed \ 133 | --out $root.kept_names 134 | ``` 135 | 136 | _Select SNPs for analysis_ 137 | 138 | ```{PLINK} 139 | $plink \ 140 | --bfile $root \ 141 | --extract $insnps \ 142 | --make-bed \ 143 | --out $root.kept_samples 144 | ``` 145 | 146 | Or: 147 | 148 | ```{PLINK} 149 | $plink \ 150 | --bfile $root \ 151 | --exclude $outsnps \ 152 | --make-bed \ 153 | --out $root.kept_samples 154 | ``` 155 | 156 | ##### Filter for common SNPs 157 | 158 | ```{PLINK} 159 | $plink \ 160 | --bfile $root \ 161 | --maf 0.01 \ 162 | --make-bed \ 163 | --out $root.common 164 | ``` 165 | 166 | This assumes no updates were made, otherwise modify the --bfile command to point to that file (e.g. $root.updated_names). 167 | 168 | 169 | ##### Filter for call rate iteratively 170 | 171 | ```{bash} 172 | sh ./Iterative_Missingness.sh [begin] [final] [steps] 173 | ``` 174 | _Removes SNPs then samples at increasingly high cut-offs. E.g. 
To remove at 90% to 99%, in steps of 1%:_ 175 | 176 | ```{bash} 177 | sh ./Iterative_Missingness.sh 90 99 1 178 | ``` 179 | 180 | 181 | ##### Review call rates to ensure all missing SNPs and individuals have been dropped 182 | 183 | _Generate files for individual call rates and variant call rates._ 184 | 185 | ```{PLINK} 186 | $plink \ 187 | --bfile $root.filtered \ 188 | --missing \ 189 | --out $root.filtered_missing 190 | ``` 191 | 192 | _Examine the lowest call rates for variants:_ 193 | 194 | ```{UNIX} 195 | sort -k 5 -gr $root.filtered_missing.lmiss | head 196 | ``` 197 | 198 | Check no variants above threshold remain in column 5 (proportion missing). 199 | 200 | _Examine the lowest call rates for individuals:_ 201 | 202 | ```{UNIX} 203 | sort -k 6 -gr $root.filtered_missing.imiss | head 204 | ``` 205 | Check no individuals above threshold remain in column 6 (proportion missing). 206 | 207 | ##### Assess SNPs for deviation from Hardy-Weinberg Equilibrium 208 | 209 | _--hardy calculates HWE test p-values:_ 210 | 211 | ```{PLINK} 212 | $plink \ 213 | --bfile $root.filtered \ 214 | --hardy \ 215 | --out $root.hw_p_values 216 | ``` 217 | 218 | _--hwe removes deviant SNPs past a given threshold, 1x10^-5 below:_ 219 | 220 | ```{PLINK} 221 | $plink \ 222 | --bfile $root.filtered \ 223 | --hwe 0.00001 \ 224 | --make-bed \ 225 | --out $root.hw_dropped 226 | ``` 227 | NB: in case-control datasets, the default behaviour of hwe is to work on controls only 228 | 229 | ##### Prune data file for linkage disequilibrium 230 | 231 | _Using a window of 1500 variants and a shift of 150 variants between windows, with an r2 cut-off of 0.2:_ 232 | 233 | ```{PLINK} 234 | $plink \ 235 | --bfile $root.hw_dropped \ 236 | --indep-pairwise 1500 150 0.2 \ 237 | --out $root.LD_one 238 | ``` 239 | 240 | _Extract pruned-in SNPs_ 241 | 242 | ```{PLINK} 243 | $plink \ 244 | --bfile $root.hw_dropped \ 245 | --extract $root.LD_one.prune.in \ 246 | --make-bed \ 247 | --out $root.LD_two 248 | ```
249 | 250 | _Exclude high-LD and non-autosomal regions from the pruned file (see [Mike Weale's website](https://sites.google.com/site/mikeweale))_ 251 | 252 | ```{AWK} 253 | awk -f highLDregions4bim_b37.awk $root.LD_two.bim > highLDexcludes 254 | ``` 255 | ```{AWK} 256 | awk '($1 < 1) || ($1 > 22) {print $2}' $root.LD_two.bim > autosomeexcludes 257 | ``` 258 | ```{bash} 259 | cat highLDexcludes autosomeexcludes > highLD_and_autosomal_excludes 260 | ``` 261 | ```{PLINK} 262 | $plink \ 263 | --bfile $root.LD_two \ 264 | --exclude highLD_and_autosomal_excludes \ 265 | --make-bed \ 266 | --out $root.LD_three 267 | ``` 268 | 269 | ##### Add phenotype to differentiate groups 270 | 271 | _E.g. Add site of collection ("Site") from an external phenotype file:_ 272 | 273 | ```{PLINK} 274 | $plink \ 275 | --bfile $root.LD_three \ 276 | --pheno $pheno \ 277 | --pheno-name Site \ 278 | --make-bed \ 279 | --out $root.LD_four 280 | ``` 281 | 282 | ##### Compare genotypic and phenotypic gender 283 | 284 | _Ensure there is a separate XY region for the pseudoautosomal region on X:_ 285 | 286 | Most chips have the pseudoautosomal region mapped separately already. 287 | Requires entry of genome build, below this is hg37 ("b37"). 288 | 289 | ```{PLINK} 290 | $plink \ 291 | --bfile $root.LD_two \ 292 | --split-x b37 \ 293 | --make-bed \ 294 | --out $root.LD_split 295 | ``` 296 | 297 | _Compare phenotypic gender to X chromosome heterogeneity and Y chromosome SNP count:_ 298 | 299 | ```{PLINK} 300 | $plink \ 301 | --bfile $root.LD_split \ 302 | --check-sex ycount 0.2 0.8 0 1 \ 303 | --out $root.sex_check 304 | ``` 305 | 306 | IDs identified as discordant (not the phenotypic gender) or for which F is between 0.2 and 0.8 (not assigned a gender by PLINK), should be reviewed with the collection site where possible. This command also takes into account the number of Y chromosome SNPs present, to counteract the unreliable nature of the F statistic in assigning female gender.
The number of Y SNPs with calls in females can be set as part of ycount (above females have a maximum of 0, and males a maximum of 1), and will depend on the recalling method used and sample size. An additional check can be made by assessing whole-genome heterogeneity for all samples (see below) at this point – discordant gender may be the result of unusual heterogeneity 307 | 308 | _Remove discordant IDs that cannot be resolved:_ 309 | 310 | This command assumes a PLINK-format file of IDs for discordant individuals called "discordant_individuals.txt". 311 | 312 | ```{PLINK} 313 | $plink \ 314 | --bfile $root.LD_four \ 315 | --remove discordant_individuals.txt \ 316 | --make-bed \ 317 | --out $root.LD_five 318 | 319 | $plink \ 320 | --bfile $root.hw_dropped \ 321 | --remove discordant_individuals.txt \ 322 | --make-bed \ 323 | --out $root.sexcheck_cleaned 324 | ``` 325 | 326 | ##### Pairwise identical-by-descent (IBD) check 327 | 328 | ```{PLINK} 329 | $plink \ 330 | --bfile $root.LD_five \ 331 | --genome \ 332 | --make-bed \ 333 | --out $root.IBD 334 | ``` 335 | 336 | _Remove one sample from each pair with pi-hat (% IBD) above threshold (0.1875 below):_ 337 | 338 | ```{AWK} 339 | awk '$10 >= 0.1875 {print $1, $2}' $root.IBD.genome > $root.IBD_outliers.txt 340 | ``` 341 | 342 | ```{PLINK} 343 | $plink \ 344 | --bfile $root.IBD \ 345 | --remove $root.IBD_outliers.txt \ 346 | --make-bed \ 347 | --out $root.no_close_relatives 348 | ``` 349 | 350 | _Calculate average IBD per individual using R, output outliers (defined as more than ***sigma*** standard deviations above the mean, as provided by the user):_ 351 | 352 | ```{R} 353 | $R --file=IndividualIBD.R --args $root [sigma] 354 | ``` 355 | 356 | Exclude outliers from both LD-stripped and all SNP binary files 357 | 358 | ```{PLINK} 359 | $plink \ 360 | --bfile $root.LD_five \ 361 | --remove $root.IBD_INDIV_outliers.txt \ 362 | --make-bed \ 363 | --out $root.LD_IBD 364 | 365 | $plink \ 366 | --bfile 
$root.sexcheck_cleaned \ 367 | --remove $root.IBD_INDIV_outliers.txt \ 368 | --make-bed \ 369 | --out $root.IBD_cleaned 370 | ``` 371 | 372 | ##### Population stratification by principal component analysis in EIGENSOFT 373 | 374 | Consult [https://sites.google.com/site/mikeweale/software/eigensoftplus]. 375 | 376 | ___Run EIGENSOFT using LD-pruned binary___ 377 | 378 | _Convert files to EIGENSOFT format using CONVERTF_ 379 | 380 | Requires par file to convert from packedped format to eigenstrat format 381 | 382 | ```{UNIX} 383 | convertf -p <(printf "genotypename: "$root".LD_IBD.bed 384 | snpname: "$root".LD_IBD.bim 385 | indivname: "$root".LD_IBD.fam 386 | outputformat: EIGENSTRAT 387 | genotypeoutname: "$root".pop_strat.eigenstratgeno 388 | snpoutname: "$root".pop_strat.snp 389 | indivoutname: "$root".pop_strat.ind") 390 | ``` 391 | 392 | _Run SmartPCA, removing no outliers_ 393 | 394 | Produces 100 PCs 395 | 396 | ```{perl} 397 | smartpca.perl \ 398 | -i $root.pop_strat.eigenstratgeno \ 399 | -a $root.pop_strat.snp \ 400 | -b $root.pop_strat.ind \ 401 | -o $root.pop_strat.pca \ 402 | -p $root.pop_strat.plot \ 403 | -e $root.pop_strat.eval \ 404 | -l $root.pop_strat_smartpca.log \ 405 | -m 0 \ 406 | -t 100 \ 407 | -k 100 \ 408 | -s 6 409 | ``` 410 | 411 | Note that the order of the inputs is important. 412 | 413 | Inputs explained: 414 | 415 | -i is the genotype file 416 | -a is the SNP names 417 | -b is the individual names 418 | -o is the output eigenvectors ( $root.pop_strat.pca.evec) 419 | -p plots the output file. This is only activated if gnuplot is installed, but is a necessary inclusion for smartpca to run. If gnuplot is not installed, this does not affect the running of smartpca. If gnuplot is installed, this produces a plot of the first component on the second. 420 | -e is the output eigenvalues 421 | -l is the log, including a list of individuals defined as outliers. 422 | -m sets the number of outlier removal iterations. 
This is initially set to 0, so no outliers are removed. 423 | -t sets the number of components from which outliers should be removed. If -m is 0, this value has no effect. 424 | -k is the number of components to be output 425 | -s defines the minimum number of standard deviations from the mean of each component an individual must be to be counted as an outlier. 426 | 427 | _Minor edit to allow import into R_ 428 | 429 | Remove leading tab and split ID into two columns. 430 | 431 | ```{sed} 432 | sed -i -e 's/^[ \t]*//' -e 's/:/ /g' $root.pop_strat.pca.evec 433 | ``` 434 | 435 | _Calculate association between PCs and outcome measure in R_ 436 | 437 | Both scripts require the same IDs to be in $root.pca.evec and $pheno, and look at 100 PCs by default. 438 | 439 | *Short version (outputs the variance explained by each component and its significance when added to a model including the previous components):* 440 | 441 | ```{R} 442 | $R --file=PC-VS-OUTCOME_IN_R_SHORT.R --args $root.pop_strat $pheno 443 | ``` 444 | 445 | *Long version (outputs the full results of the linear model, adding each component in turn):* 446 | 447 | ```{R} 448 | $R --file= PC-VS-OUTCOME_IN_R_FULL.R --args $root.pop_strat $pheno 449 | ``` 450 | 451 | _Run SmartPCA again to remove outliers_ 452 | 453 | Run as above ("Run SmartPCA, removing no outliers"), but change $root suffix to pop_strat_outliers. 
454 | Set –m 5 and –t x (where ***x*** is the number of PCs significantly associated with the outcome measure) 455 | 456 | ```{perl} 457 | smartpca.perl \ 458 | -i $root.pop_strat.eigenstratgeno \ 459 | -a $root.pop_strat.snp \ 460 | -b $root.pop_strat.ind \ 461 | -o $root.pop_strat_outliers.pca \ 462 | -p $root.pop_strat_outliers.plot \ 463 | -e $root.pop_strat_outliers.eval \ 464 | -l $root.pop_strat_outliers_smartpca.log \ 465 | -m 5 \ 466 | -t x \ 467 | -k 100 \ 468 | -s 6 469 | ``` 470 | 471 | _Plot principal components in R_ 472 | 473 | Plot before and after outlier exclusion to allow visual inspection of which samples are dropped 474 | 475 | Plot first component against second in R and colour by phenotype - this requires [ggplot2](http://ggplot2.org/) to be installed. 476 | 477 | ```{sed} 478 | sed -i -e 's/^[ \t]*//' -e 's/:/ /g' $root.pop_strat_outliers.pca.evec 479 | ``` 480 | 481 | ```{R} 482 | $R --file=PlotPCs.R --args $root.pop_strat 1 2 483 | 484 | $R --file=PlotPCs.R --args $root.pop_strat_outliers 1 2 485 | ``` 486 | 487 | This script can be modified to plot any of the first 100 components against each other by changing 1 and 2 above. The design of the plot is extremely modifiable - see [http://docs.ggplot2.org/current/]. 
488 | 489 | _Extract outliers_ 490 | 491 | ```{bash} 492 | sh ./ExtractAncestryOutliers.sh 493 | ``` 494 | 495 | ```{PLINK} 496 | $plink \ 497 | --bfile $root.LD_IBD \ 498 | --remove $root.pop_strat_outliers.outliers \ 499 | --make-bed \ 500 | --out $root.LD_pop_strat 501 | 502 | $plink \ 503 | --bfile $root.IBD_cleaned \ 504 | --remove $root.pop_strat_outliers.outliers \ 505 | --make-bed \ 506 | --out $root.pop_strat 507 | ``` 508 | 509 | _Re-run to assess which components to include as covariates in the final analysis_ 510 | 511 | Run ConvertF: 512 | 513 | ```{perl} 514 | convertf -p <(printf "genotypename: $root.LD_pop_strat.bed 515 | snpname: $root.LD_pop_strat.bim 516 | indivname: $root.LD_pop_strat.fam 517 | outputformat: EIGENSTRAT 518 | genotypeoutname: $root.PCS_for_covariates.eigenstratgeno 519 | snpoutname: $root.PCS_for_covariates.snp 520 | indivoutname: $root.PCS_for_covariates.ind") 521 | ``` 522 | 523 | Run SmartPCA: 524 | 525 | ```{perl} 526 | smartpca.perl \ 527 | -i $root.PCS_for_covariates.eigenstratgeno \ 528 | -a $root.PCS_for_covariates.snp \ 529 | -b $root.PCS_for_covariates.ind \ 530 | -o $root.PCS_for_covariates.pca \ 531 | -p $root.PCS_for_covariates.plot \ 532 | -e $root.PCS_for_covariates.eval \ 533 | -l $root.PCS_for_covariates_smartpca.log \ 534 | -m 0 \ 535 | -t 100 \ 536 | -k 100 \ 537 | -s 6 \ 538 | ``` 539 | 540 | Calculate association (short version): 541 | 542 | ```{R} 543 | $R --file=PC-VS-OUTCOME_IN_R_SHORT.R --args $root.PCS_for_covariates 544 | ``` 545 | 546 | Include components significantly associated with outcome as covariates in the final analysis, or add PCs in turn until inflation falls to an accepted level (lambda ≈ 1). 547 | 548 | ##### Optional (but useful): plot individuals on components drawn from the HapMap reference populations to assess likely ancestry groupings. 
549 | 550 | Details of this procedure can be found at [Timothee Flutre's OpenWetWare](http://openwetware.org/wiki/User:Timothee_Flutre/Notebook/Postdoc/2012/01/22). 551 | 552 | Note that the http://hapmap.ncbi.nlm.nih.gov/downloads/genotypes/2010-08_phaseII+III/forward/ domain referenced by Dr Flutre has since been retired. The HapMap samples are available at [ftp://ftp.ncbi.nlm.nih.gov/hapmap/genotypes/2010-08_phaseII+III/forward/](ftp://ftp.ncbi.nlm.nih.gov/hapmap/genotypes/2010-08_phaseII+III/forward/). 553 | 554 | _Manually extract HapMap and own cohort individual names_ 555 | 556 | ```{bash} 557 | sh ./MakeKeepIDs.sh 558 | ``` 559 | 560 | _Use keepids.txt at this section:_ 561 | 562 | for pop in {CEU,CHB,JPT,YRI}; do echo ${pop}; \ 563 | hapmap2impute.py -i genotypes_CHR_${pop}_r28_nr.b36_fwd.txt.gz -n keepids.txt -o genotypes_hapmap_r28_b37_${pop}.impute.gz -b snps_hapmap_r28_nr_b37.bed.gz -s list_snps_redundant.txt; done 564 | zcat genotypes_hapmap_r28_b37_CEU.impute.gz | wc -l 565 | 3907899 566 | zcat genotypes_hapmap_r28_b37_CHB.impute.gz | wc -l 567 | 3933013 568 | zcat genotypes_hapmap_r28_b37_JPT.impute.gz | wc -l 569 | 3931282 570 | zcat genotypes_hapmap_r28_b37_YRI.impute.gz | wc -l 571 | 3862842 572 | 573 | More populations now exist than those listed in Flutre’s script; these can be obtained in the same manner. 574 | 575 | ##### Alternative - Use 1000 Genomes Phase 1 data to achieve the same 576 | 577 | Much the same process can be used to assess sample ethnicity by projecting on PCs from the 1000 Genomes samples. 578 | 579 | From $root.IBD_cleaned: 580 | 581 | _Obtain 1KG Phase 1 data from PLINK2 website_ 582 | 583 | **WARNING: FILE > 1GB** 584 | 585 | ```{bash} 586 | wget https://www.dropbox.com/s/k9ptc4kep9hmvz5/1kg_phase1_all.tar.gz?dl=1 587 | ``` 588 | 589 | Note code below creates numerical phenotypes for the 1KG populations.
**CHANGE THESE IF THEY WILL OVERLAP WITH YOUR PHENOTYPE DATA!** 590 | 591 | _Obtain 1KG Population info from 1KG_ 592 | 593 | ```{bash} 594 | wget ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/working/20130606_sample_info/20130606_g1k.ped 595 | 596 | grep -f <(awk '{print $2}' 1kg_phase1_all.fam) <(awk 'NR > 1 {print 0, $2, $7}' 20130606_g1k.ped) > 1KG_Phenos.txt 597 | 598 | sed -i -e 's/ASW/3/g' -e 's/CEU/4/g' -e 's/CHB/5/g' -e 's/CHS/6/g' -e 's/CLM/7/g' -e 's/FIN/8/g' -e 's/GBR/10/g' -e 's/IBS/11/g' -e 's/JPT/12/g' -e 's/LWK/13/g' -e 's/MXL/14/g' -e 's/PUR/15/g' -e 's/TSI/16/g' -e 's/YRI/17/g' 1KG_Phenos.txt 599 | ``` 600 | 601 | Provided your data has sufficient common variants (as with most microarrays), you can be fairly brutal with selecting variants for ancestry estimation. Ultimately, good estimation can be achieved with ~20K variants (see [Price et al, 2006](http://www.ncbi.nlm.nih.gov/pubmed/16862161)). 602 | 603 | _Limit files to SNPs with rs IDs_ 604 | 605 | ```{bash} 606 | fgrep rs $root.IBD_cleaned.bim > $root.IBD_cleaned.rsids.txt 607 | ``` 608 | 609 | _Get rs ID variant names_ 610 | 611 | ```{bash} 612 | awk '{print $2}' $root.IBD_cleaned.rsids.txt > $root.IBD_cleaned.rsid_names.txt 613 | ``` 614 | 615 | _Extract rs IDs from root_ 616 | 617 | ```{PLINK} 618 | $plink \ 619 | --bfile $root.IBD_cleaned \ 620 | --extract $root.IBD_cleaned.rsid_names.txt \ 621 | --chr 1-22 \ 622 | --make-bed \ 623 | --out $root.IBD_cleaned.rsids.autosomal 624 | ``` 625 | 626 | _Extract rs IDs from 1KG (and add phenotypes)_ 627 | 628 | ```{PLINK} 629 | $plink \ 630 | --bfile 1kg_phase1_all \ 631 | --extract $root.IBD_cleaned.rsid_names.txt \ 632 | --pheno 1KG_Phenos.txt \ 633 | --make-bed \ 634 | --out 1kg_phase1_all.rsids.autosomal 635 | ``` 636 | 637 | _Obtain SNPs present in both files_ 638 | 639 | ```{bash} 640 | awk '{print $2}' 1kg_phase1_all.rsids.autosomal.bim > 1kg_phase1_all.rsids_names.txt 641 | ``` 642 | 643 | _Extract 1KG SNPs from root_ 644 | 645 | 
```{PLINK} 646 | $plink \ 647 | --bfile $root.IBD_cleaned.rsids.autosomal \ 648 | --extract 1kg_phase1_all.rsids_names.txt \ 649 | --make-bed \ 650 | --out $root.IBD_cleaned.intersection 651 | ``` 652 | 653 | _Dry run bmerge to identify SNPs PLINK will fail on_ 654 | 655 | ```{PLINK} 656 | $plink \ 657 | --bfile $root.IBD_cleaned.intersection \ 658 | --bmerge 1kg_phase1_all.rsids.autosomal \ 659 | --merge-mode 6 \ 660 | --out $root.1KG.IBD_cleaned_failures 661 | ``` 662 | 663 | _Add variants with multiple positions to missnp_ 664 | 665 | ```{bash} 666 | fgrep \'rs $root.1KG.IBD_cleaned_failures.log |\ 667 | awk '{print $7}' |\ 668 | sed -e "s/'//g" -e "s/.//g" > $root.1KG.IBD_cleaned_failures.multiple.positions.txt 669 | 670 | cat $root.1KG.IBD_cleaned_failures.missnp $root.1KG.IBD_cleaned_failures.multiple.positions.txt > $root.1KG.IBD_cleaned_failures.multiple.positions.missnp 671 | ``` 672 | 673 | _Exclude mismatched SNPs and variants with multiple positions_ 674 | 675 | ```{PLINK} 676 | $plink \ 677 | --bfile $root.IBD_cleaned.intersection \ 678 | --exclude $root.1KG.IBD_cleaned_failures.multiple.positions.missnp \ 679 | --make-bed \ 680 | --out $root.IBD_cleaned.intersection_for_merge 681 | ``` 682 | 683 | _Merge root and 1KG_ 684 | 685 | ```{PLINK} 686 | $plink \ 687 | --bfile $root.IBD_cleaned.intersection_for_merge \ 688 | --bmerge 1kg_phase1_all.rsids.autosomal \ 689 | --out $root.1kg.pop_strat 690 | ``` 691 | 692 | _Filter missing variants, rare variants and HWE_ 693 | 694 | ```{PLINK} 695 | $plink \ 696 | --bfile $root.1kg.pop_strat \ 697 | --geno 0.01 \ 698 | --maf 0.01 \ 699 | --hwe 0.0001 \ 700 | --make-bed \ 701 | --out $root.1kg.pop_strat.for_prune 702 | ``` 703 | 704 | _LD Pruning_ 705 | 706 | ```{PLINK} 707 | $plink \ 708 | --bfile $root.1kg.pop_strat.for_prune \ 709 | --indep-pairwise 1500 150 0.2 \ 710 | --out $root.1kg.pop_strat.prune 711 | ``` 712 | ```{PLINK} 713 | $plink \ 714 | --bfile $root.1kg.pop_strat.for_prune \ 715 | --extract 
$root.1kg.pop_strat.prune.prune.in \ 716 | --make-bed \ 717 | --out $root.1kg.LD_pop_strat 718 | ``` 719 | 720 | _Run convertf to make EIGENSTRAT file_ 721 | 722 | ```{perl} 723 | convertf -p <(printf "genotypename: $root.1kg.LD_pop_strat.bed 724 | snpname: $root.1kg.LD_pop_strat.bim 725 | indivname: $root.1kg.LD_pop_strat.fam 726 | outputformat: EIGENSTRAT 727 | genotypeoutname: $root.1kg.LD_pop_strat.eigenstratgeno 728 | snpoutname: $root.1kg.LD_pop_strat.snp 729 | indivoutname: $root.1kg.LD_pop_strat.ind") 730 | ``` 731 | 732 | _Generate poplist for projection_ 733 | 734 | ```{bash} 735 | awk '{print $3}' 1KG_Phenos.txt | sort | uniq > $root.1kg.LD_poplist.txt 736 | ``` 737 | 738 | _Run Smartpca, projecting on 1KG samples only_ 739 | 740 | ```{perl} 741 | smartpca.perl \ 742 | -i $root.1kg.LD_pop_strat.eigenstratgeno \ 743 | -a $root.1kg.LD_pop_strat.snp \ 744 | -b $root.1kg.LD_pop_strat.ind \ 745 | -o $root.1kg.LD_pop_strat.pca \ 746 | -p $root.1kg.LD_pop_strat.plot \ 747 | -e $root.1kg.LD_pop_strat.eigenvalues \ 748 | -l $root.1kg.LD_pop_strat.log \ 749 | -w $root.1kg.LD_poplist.txt \ 750 | -m 0 751 | ``` 752 | 753 | Note that the command below relabels the phenotype column as xCHANGE, where x is the phenotype, and then relabels the 1KG populations with their names for graphing. 
**Modify the sed command to allow your samples to be labelled usefully!** 754 | 755 | _Modify $root.1kg.LD_pop_strat.pca.evec for R_ 756 | 757 | ```{bash} 758 | awk 'NR > 1 {print $1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12"CHANGE"}' $root.1kg.LD_pop_strat.pca.evec > $root.1kg.LD_pop_strat.pca.evec_RENAMED 759 | 760 | sed -i -e 's/13CHANGE/LWK/g' -e 's/14CHANGE/MXL/g' -e 's/15CHANGE/PUR/g' -e 's/16CHANGE/TSI/g' -e 's/17CHANGE/YRI/g' -e 's/3CHANGE/ASW/g' -e 's/4CHANGE/CEU/g' -e 's/5CHANGE/CHB/g' -e 's/6CHANGE/CHS/g' -e 's/7CHANGE/CLM/g' -e 's/8CHANGE/FIN/g' -e 's/10CHANGE/GBR/g' -e 's/11CHANGE/IBS/g' -e 's/12CHANGE/JPT/g' $root.1kg.LD_pop_strat.pca.evec_RENAMED 761 | ``` 762 | 763 | _Plot PCs_ 764 | 765 | ```{bash} 766 | Rscript PC_Plot_1KG.R $root 767 | ``` 768 | 769 | ##### Heterozygosity Test 770 | 771 | _Test for unusual patterns of genome-wide heterozygosity in LD-pruned data_ 772 | 773 | ```{PLINK} 774 | $plink \ 775 | --bfile $root.LD_pop_strat \ 776 | --ibc \ 777 | --out $root.het 778 | ``` 779 | 780 | _Exclude samples identified as outliers_ 781 | 782 | ```{R} 783 | R --file=IdHets.R --args $root.het 784 | ``` 785 | 786 | ```{PLINK} 787 | $plink \ 788 | --bfile $root.LD_pop_strat \ 789 | --remove $root.het.LD_het_outliers_sample_exclude \ 790 | --make-bed \ 791 | --out $root.LD_het_cleaned 792 | 793 | $plink \ 794 | --bfile $root.pop_strat \ 795 | --remove $root.het.LD_het_outliers_sample_exclude \ 796 | --make-bed \ 797 | --out $root.het_cleaned 798 | ``` 799 | 800 | ##### Imputation 801 | 802 | ___THIS CODE SHOULD BE CONSIDERED ARCHIVAL - IT IS RECOMMENDED THAT YOU NOW PERFORM IMPUTATION BY SUBMITTING YOUR DATA TO THE [TOPMED](https://imputation.biodatacatalyst.nhlbi.nih.gov/#!), [MICHIGAN](https://imputationserver.sph.umich.edu/index.html) OR [SANGER](https://imputation.sanger.ac.uk/) IMPUTATION SERVERS, WHICH ARE FASTER AND USE MORE UP-TO-DATE REFERENCE PANELS.___ 803 | 804 | *Consult 
[http://genome.sph.umich.edu/wiki/IMPUTE2:_1000_Genomes_Imputation_Cookbook] and [https://mathgen.stats.ox.ac.uk/impute/prephasing_and_imputation_with_impute2.tgz]* 805 | 806 | *Download reference files from [http://mathgen.stats.ox.ac.uk/impute/impute_v2.html]* 807 | 808 | *Copy impute2_examples folder from [https://mathgen.stats.ox.ac.uk/impute/prephasing_and_imputation_with_impute2.tgz] to work folder* 809 | 810 | _Download relevant strand file from [http://www.well.ox.ac.uk/~wrayner/strand/] and split by chromosome_ 811 | 812 | ```{AWK} 813 | awk '{print $3, $5 > "$root."$2".strand"}' HumanCoreExome-12v1-0_B-b37.strand 814 | ``` 815 | 816 | _Convert PLINK binary to GEN files (IMPUTE2 input)_ 817 | 818 | ```{PLINK} 819 | $plink \ 820 | --bfile $root.het_cleaned \ 821 | --recode oxford \ 822 | --out $root.for_impute 823 | ``` 824 | 825 | _Split whole--genome .gen into chromosome .gen_ 826 | 827 | ```{AWK} 828 | awk '{print > "Chr"$1".gen"}' $root.for_impute.gen 829 | ``` 830 | 831 | _Check split has proceeded correctly – total line number of all chromosome .gen files should total $root.for_impute.gen_ 832 | 833 | ```{bash} 834 | wc -l *.gen 835 | ``` 836 | 837 | _Generate chunk files for each chromosome_ 838 | 839 | ```{bash} 840 | sh ./MakeChunks.sh 841 | ``` 842 | 843 | This makes two sets of files: 844 | 845 | Chunks_chr[1-23].txt 846 | 847 | 30000001 3.5e+07 875 848 | 35000001 4e+07 500 849 | 40000001 4.5e+07 85 850 | 45000001 5e+07 424 851 | 50000001 5.5e+07 693 852 | 853 | These files list the base positions of the edges of each chromosome chunk and the number of SNPs in each chunk. 854 | Consult this file and merge chunks with few SNPs (e.g. less than 100) with neighbouring chunks. 
855 | 856 | analysis_chunks_5Mb_chr[1-23].txt 857 | 858 | 30000001 3.5e+07 859 | 35000001 4e+07 860 | 40000001 5e+07 861 | 50000001 5.5e+07 862 | 863 | These files are the input for IMPUTE2 864 | 865 | *Modify the submit_impute2_jobs_to_cluster.R script (from impute2_examples) to accept chunk files without headers* 866 | 867 | ```{bash} 868 | From: 869 | # read in file with chunk boundary definitions 870 | chunk.file <- paste(data.dir,"analysis_chunks_",chunk.size,"Mb_chr",chr,".txt", sep="") 871 | chunks <- read.table(chunk.file, head=T, as.is=T) 872 | 873 | To: 874 | 875 | # read in file with chunk boundary definitions 876 | chunk.file <- paste(data.dir,"analysis_chunks_",chunk.size,"Mb_chr",chr,".txt", sep="") 877 | chunks <- read.table(chunk.file, head=F, as.is=T). 878 | ``` 879 | *Modify scripts in impute2_examples folder (prototype_imputation_job_posterior_sampled_haps.sh master_imputation_script_posterior_sampled_haps.sh and submit_impute_jobs_to_cluster.R) to fit personal needs.* 880 | 881 | Likely to need to limit number of jobs submitted to remain within local SunGridEngine rules – liaise with local system administrator to establish local best practice. 
Also likely to need to amend script to allow 5Mb jobs to run - for example, to increase virtual memory allowance to 15Gb, add the following to the header of the prototype_imputation_job_posterior_sampled_haps.sh script: 882 | 883 | ```bash 884 | #$ -l h_vmem=15G 885 | ``` 886 | 887 | *Submit jobs* 888 | 889 | **NB – this runs over 600 jobs on your cluster if not controlled!** 890 | 891 | ```{bash} 892 | sh ./master_imputation_script_posterior_sampled_haps.sh 893 | ``` 894 | 895 | _Adapt scripts for imputing X chromosome (running the different X map and legend files), and run_ 896 | 897 | Consult [http://mathgen.stats.ox.ac.uk/impute/impute_v2.html] 898 | 899 | *Merge imputed chunks together (.impute2 and .impute2_info) to form a file for each chromosome* 900 | 901 | ```{bash} 902 | sh ./MergeImputedChunks.sh 903 | ``` 904 | 905 | _Add chromosome number to each SNP in each chromosome.impute2 file_ 906 | 907 | ```{bash} 908 | sh ./AddChromosomeNumber.sh 909 | ``` 910 | 911 | _Merge by-chromosome info files to form a file for the whole genome_ 912 | 913 | ```{bash} 914 | cat results-directory/*.impute2_info > path/to/results-directory/$root.whole_genome.impute2_info 915 | ``` 916 | 917 | For December 2013 release of reference data (Phase1 Integrated), there are several aspects that require clean-up - these do not appear to apply to the Phase 3 release. Steps marked in bold are required for the Phase1 Integrated release, but may not be needed for Phase3. 918 | 919 | __Exomic variants are named "." 
It is necessary to make these unique (as chr:position)__ 920 | 921 | ```{bash} 922 | sh ./ReplaceDots.sh 923 | ``` 924 | 925 | _Filter imputed data (.impute2 files) by info metric_ 926 | 927 | ```{bash} 928 | sh ./FilterByInfoAll.sh [threshold] 929 | ``` 930 | 931 | _Merge filtered by-chromosome .impute2 files to make a single whole-genome file_ 932 | 933 | ```{bash} 934 | cat results-directory/*_New_filtered.impute2 > \ 935 | /results-directory/$root.whole_genome_filtered.impute2 936 | ``` 937 | 938 | __Remove duplicate SNPs from .impute2 file__ 939 | 940 | ```{AWK} 941 | awk '{print $2}' $root.whole_genome_filtered.impute2 | \ 942 | sort | uniq -c | awk '$1 !=1 {print $0}' > Duplicates 943 | 944 | awk '{print $2}' $root.whole_genome_filtered.impute2 | sort | uniq -d > Duplicates_cleaned 945 | ``` 946 | 947 | These produce two files called Duplicates and Duplicates_cleaned that list the duplicated SNPs in the file with and without the number of instances respectively 948 | 949 | ```{bash} 950 | grep -vwF -f Duplicates_cleaned $root.whole_genome_filtered.impute2 > Temp1 951 | ``` 952 | 953 | Removes all lines with an instance of a duplicated rs# from $root.whole_genome_filtered.impute2 and outputs to Temp1: 954 | 955 | ```{AWK} 956 | awk '{print $2}' Temp1 | sort | uniq -d > DuplicatesRemoved 957 | ``` 958 | 959 | Repeats the check for duplicates – this file should now be empty; check with 960 | 961 | ```{bash} 962 | less DuplicatesRemoved 963 | ``` 964 | 965 | Compare file lengths; the length of Temp1 should be the length of $root.whole_genome_filtered.impute2 minus the number of duplicated SNPs removed 966 | 967 | ```{bash} 968 | wc -l Temp1 $root.whole_genome_filtered.impute2 969 | mv Temp1 $root.whole_genome_filtered_cleaned.impute2 970 | ``` 971 | 972 | *Convert IMPUTE2 to hard-called PLINK format* 973 | 974 | ```{PLINK} 975 | $plink \ 976 | --gen $root.whole_genome_filtered_cleaned.impute2 \ 977 | --sample $root.for_impute.sample \ 978 | --hard-call-threshold 
0.8 \ 979 | --make-bed \ 980 | --out $root.post_imputation 981 | ``` 982 | 983 | ***NB: if SNP does not pass threshold, it is set as missing!*** 984 | 985 | At this point, it is recommended to gzip all IMPUTE2 files - note that this will be a large job. 986 | 987 | ```{bash} 988 | gzip *impute2* 989 | ``` 990 | 991 | ##### Post-imputation quality control 992 | 993 | *Remove rare SNPs depending on sample size and dataset characteristics* 994 | 995 | ```{PLINK} 996 | $plink \ 997 | --bfile $root.post_imputation \ 998 | --maf 0.01 \ 999 | --make-bed \ 1000 | --out $root.post_imputation_common 1001 | ``` 1002 | 1003 | *Remove missing SNPs, including those set as missing above* 1004 | 1005 | ```{PLINK} 1006 | $plink \ 1007 | --bfile $root.post_imputation_common \ 1008 | --geno 0.02 \ 1009 | --make-bed \ 1010 | --out $root.post_imputation_updated 1011 | ``` 1012 | 1013 | _Drop duplicated variants from imputation_ 1014 | 1015 | ```{bash} 1016 | sh ./DropDuplicatedSNPs.sh 1017 | ``` 1018 | ```{PLINK} 1019 | $plink \ 1020 | --bfile $root.post_imputation_updated \ 1021 | --exclude $root.post_imputation_updated_duplicated_IDs \ 1022 | --make-bed \ 1023 | --out $root.post_imputation_final 1024 | ``` 1025 | 1026 | *Convert imputed rs IDs back to rs… format* 1027 | 1028 | ```{bash} 1029 | sh ./Relabel_rs.sh 1030 | ``` 1031 | 1032 | Some rs IDs are imperfectly mapped, resulting in duplications with imputed IDs, so remove these accidental duplicates. 
1033 | 1034 | ```{bash} 1035 | sh ./DropDuplicatedPositions.sh 1036 | ``` 1037 | 1038 | ##### Association testing in PLINK/PLINK2 1039 | 1040 | *Generate covariates file, merging $covar and $root.dataname_pop_strat_includes.pca.evec (output from SMARTPCA) files* 1041 | 1042 | ```{R} 1043 | $R --file=Get_Covariates.R --args $root $covar 1044 | ``` 1045 | 1046 | Relabels header and adds additional covariates (.pca.evec contains all PCs included in the SmartPCA analysis) 1047 | Script assumes a covariate file with the same column names for IDs (FID and IID), but no shared column names with the .pca.evec file (which is assumed to contain 100 PCs). 1048 | 1049 | 1050 | *Run association against phenotype* 1051 | 1052 | Phenotype here assumed to be in $pheno as the only phenotype (otherwise use --mpheno [column number]) and called "Outcome". 1053 | 1054 | ```{PLINK} 1055 | $plink \ 1056 | --bfile $root.post_imputation_final \ 1057 | --logistic/--linear (depending whether phenotype of interest is dichotomous or continuous) \ 1058 | --pheno $pheno \ 1059 | --pheno-name Outcome \ 1060 | --covar $covar \ 1061 | --covar-number 1-10 \ 1062 | --hide-covar \ 1063 | --parameters 1-11 \ 1064 | --out $root.post_imputation_conc_analysis 1065 | ``` 1066 | 1067 | Consider coding of phenotype – may require the use of --1 as an option if coding is in 0,1 format (rather than 1,2 format) 1068 | 1069 | --covar-number indicates which covariates to include. --covar-name can also be used for this 1070 | --hide-covar hides results of association tests between phenotype and covariates 1071 | --parameters specifies models to include in the analysis (see www.cog-genomics.org/plink2) 1072 | 1. Allelic dosage additive effect (or homozygous minor dummy variable) 1073 | 2. Dominance deviation, if present 1074 | 3. --condition{-list} covariate(s), if present 1075 | 4. --covar covariate(s), if present 1076 | 5. Genotype x non-sex covariate 'interaction' terms, if present 1077 | 6. Sex, if present 1078 | 7. 
Sex-genotype interaction(s), if present 1079 | 1080 | _Investigate further any SNP that is highly associated with the phenotype, and exclude from analysis if justified_ 1081 | 1082 | Run BLAT, [available on the UCSC Genome Browser](https://genome.ucsc.edu/cgi-bin/hgBlat?command=start) on the probe sequence (available from the array manifest) for all highly associated genotyped SNPs as a test of how well mapped/unique the sequence is, particularly with regards to similarity to sequences on the sex chromosomes. Discard any associated SNP that does not map uniquely. 1083 | 1084 | All association details here assume an additive model – see PLINK website to implement other models (but see [Knight and Lewis, 2012](http://www.ncbi.nlm.nih.gov/pubmed/22383645) for discussion of statistical issues of performing tests using multiple models). More association tests are available in PLINK and PLINK2. 1085 | 1086 | ##### Using GCTA for Genomic-relatedness-matrix Restricted Maximum Likelihood (GREML) and Mixed Linear Model Association (MLMA) 1087 | 1088 | _Make GRM_ 1089 | 1090 | Thresholds below: MAF 1%, IBD 0.025 1091 | 1092 | ```{GCTA} 1093 | ./gcta \ 1094 | --bfile $root.post_imputation_final \ 1095 | --autosome \ 1096 | --maf 0.01 \ 1097 | --grm-cutoff 0.025 \ 1098 | --make-grm \ 1099 | --out $root.post_imputation_final_grm 1100 | ``` 1101 | 1102 | GRM is created here from imputed data - see text for discussion of the benefits of this. 
1103 | 1104 | 1105 | _Generate principal components_ 1106 | 1107 | ```{GCTA} 1108 | ./gcta \ 1109 | --grm $root.post_imputation_final_grm \ 1110 | --pca \ 1111 | --out $root.post_imputation_final_pca 1112 | ``` 1113 | 1114 | _Univariate GREML, including principal components as continuous covariates_ 1115 | 1116 | ```{GCTA} 1117 | ./gcta \ 1118 | --grm $root.post_imputation_final_grm \ 1119 | --pheno $pheno \ 1120 | --covar $covar \ 1121 | --qcovar $root.post_imputation_final_pca \ 1122 | --reml \ 1123 | --out $root.post_imputation_final_greml 1124 | ``` 1125 | 1126 | The number of principal components generated can be varied to assess the effect of their inclusion - if components are included as covariates for population stratification in GWAS, it is suggested to include the same number in GREML. 1127 | 1128 | This script assumes the covariates file contains only discrete covariates – if there are continuous covariates in the covariates file, these should be removed from the $covar file and added to the $root.post imputation_final_pca file. 
1129 | 1130 | _Run MLMA-LOCO for autosomes_ 1131 | 1132 | ```{GCTA} 1133 | ./gcta \ 1134 | --bfile $root.post_imputation_final \ 1135 | --pheno $pheno \ 1136 | --covar $covar \ 1137 | --qcovar $root.post_imputation_final_pca \ 1138 | --mlma-loco \ 1139 | --out $root.post_imputation_final_mlma_analysis 1140 | ``` 1141 | 1142 | 1143 | _Run MLMA for X chromosome_ 1144 | 1145 | ```{PLINK} 1146 | ./plink \ 1147 | --bfile $root.post_imputation_final \ 1148 | --chr X \ 1149 | --make-bed \ 1150 | --out $root.post_imputation_final_X 1151 | ``` 1152 | ```{GCTA} 1153 | ./gcta \ 1154 | --grm $root.post_imputation_final_grm \ 1155 | --bfile $root.post_imputation_final_X \ 1156 | --pheno $pheno \ 1157 | --covar $covar \ 1158 | --qcovar $root.post_imputation_final_pca \ 1159 | --mlma \ 1160 | --out $root.post_imputation_final_mlma_analysis_X 1161 | ``` 1162 | 1163 | _Merge results files together_ 1164 | 1165 | ```{bash} 1166 | sed -i '1d' $root.post_imputation_final_mlma_analysis_X.mlma 1167 | cat $root.post_imputation_final_mlma_analysis.mlmaloco $root.post_imputation_final_mlma_analysis_X.mlma > $root.post_imputation_final_mlma_analysis_combined.mlmaloco 1168 | ``` 1169 | 1170 | ##### SNP Clumping to identify independent hits 1171 | 1172 | _Limit associations to lowest p-value in each region of linkage disequilibrium_ 1173 | 1174 | ```{PLINK} 1175 | $plink \ 1176 | --bfile $root.post_imputation_final \ 1177 | --clump $root.post_imputation_final_analysis.assoc.logistic \ 1178 | --clump-p1 1 \ 1179 | --clump-p2 1 \ 1180 | --clump-r2 0.25 \ 1181 | --clump-kb 250 \ 1182 | --out $root.post_imputation_final_analysis_clumped 1183 | ``` 1184 | 1185 | --clump-p1 is the p-value threshold below which to consider SNPs for inclusion as the reported SNP from the clump 1186 | --clump-p2 is the p-value threshold below which to consider SNPs for inclusion in the clump 1187 | --clump-r2 is the LD R2 threshold above which SNPs must be to be included in the same clump 1188 | --clump-kb is the 
maximum distance a clump SNP can be from the reported SNP 1189 | 1190 | The options given here will generate clumps of all SNPs in LD (above R2 = 0.25), with a maximum size of 500kb, considering all SNPs regardless of p-value 1191 | 1192 | ##### Annotation of Results 1193 | 1194 | *Download all RefSeq genes from [UCSC](https://genome.ucsc.edu/)* 1195 | 1196 | Go to [Table Browser](https://genome.ucsc.edu/cgi-bin/hgTables) 1197 | 1198 | 1. Pick Group: Genes and Gene Prediction Tracks 1199 | 2. Pick Track: RefSeq Genes 1200 | 3. Pick Table: refGene 1201 | 4. Pick Region: genome 1202 | 5. Pick Output Format: Selected fields… 1203 | 6. Click Get Output 1204 | 7. Tick Chrom, cdsStart, cdsEnd and name2 1205 | 8. Click GetOutput 1206 | 1207 | Transfer output to file GeneList.txt 1208 | 1209 | _Slight reformat of gene list, then make glist_hg19_ 1210 | 1211 | ```{bash} 1212 | sed -i 's/#chrom/Chrom/g' GeneList.txt 1213 | sed -i 's/chr//g' GeneList.txt 1214 | sh ./Make_glist.sh GeneList.txt glist_hg19 1215 | ``` 1216 | 1217 | _Annotation in PLINK/PLINK2_ 1218 | 1219 | Annotates variants with genes within 250kb 1220 | 1221 | ```{PLINK} 1222 | $plink \ 1223 | --annotate $root.post_imputation_final_analysis_clumped.clumped \ 1224 | ranges=glist_hg19 \ 1225 | --border 250 \ 1226 | --out $root.post_imputation_final_analysis_annotated 1227 | ``` 1228 | 1229 | 1230 | Alternatively, export results to a web tool such as [http://jjwanglab.org/gwasrap] 1231 | 1232 | ##### Plot Manhattan and QQ plots 1233 | 1234 | _Select top million hits for Manhattan plot_ 1235 | 1236 | ```{bash} 1237 | head -1000001 $root.post_imputation_final_analysis.assoc.logistic > $root.post_imputation_final_analysis_for_MP 1238 | ``` 1239 | 1240 | _Run Manhattan plot and QQ plot scripts in R_ 1241 | 1242 | ```{R} 1243 | $R --file=ManhattanPlotinR.R --args $root 1244 | $R --file=QQPlotinR.R --args $root 1245 | ``` 1246 | 1247 | QQ plot currently plots top 10% of the data - this can be altered by changing the 
"frac" option 1248 | Both of these plots can be output in different graphic file formats (.jpeg, .tiff, .png) - please refer to the [R documentation](http://stat.ethz.ch/R-manual/R-devel/library/grDevices/html/00Index.html) 1249 | 1250 | # Files in this GitHub repo 1251 | 1252 | 1253 | #### README.md: This file! 1254 | 1255 | 1256 | ## Quality Control 1257 | 1258 | 1259 | #### Iterative_Missingness.sh: Remove SNPs missing in more than 10% of samples, then samples missing more than 10% of SNPs, then repeat for 5% and 1%. 1260 | 1261 | **Usage: Iterative_Missingness.sh** 1262 | ```{bash} 1263 | Iterative_Missingness.sh 1264 | ``` 1265 | 1266 | 1267 | #### highLDregions4bim_b37.awk: Awk script to remove regions of high-LD from LD-pruned files. Original script from M.Weale, adapted using Ensembl. 1268 | 1269 | **Usage: highLDregions4bim_b37.awk** 1270 | ```{awk} 1271 | awk -f highLDregions4bim_b37.awk input.file > output.file 1272 | ``` 1273 | 1274 | 1275 | #### IndividualIBD.R: In R, calculate and print average identity-by-descent relatedness, and print outliers at > 6 SD from mean 1276 | 1277 | **Usage: IndividualIBD.R** 1278 | ```{R} 1279 | R --file=IndividualIBD.R 1280 | ``` 1281 | 1282 | 1283 | #### parfile.par: Provide parameters to the Convertf programme from the EIGENSOFT suite, to convert files from PLINK format to EIGENSOFT format. 
1284 | 1285 | **Usage: parfile.par** 1286 | ```{bash} 1287 | convertf -p parfile.par 1288 | ``` 1289 | 1290 | 1291 | #### PC-VS-OUTCOME_IN_R_FULL.R: Regress 100 PCs step-wise on outcome, print full results of each regression to file 1292 | 1293 | **Usage: PC-VS-OUTCOME_IN_R_FULL.R** 1294 | ```{R} 1295 | R --file=PC-VS-OUTCOME_IN_R_FULL.R 1296 | ``` 1297 | 1298 | 1299 | #### PC-VS-OUTCOME_IN_R_SHORT.R: Regress 100 PCs step-wise on outcome, print variance explained for each principal component when added to model, and the p-value for this variance explained, to file 1300 | 1301 | **Usage: PC-VS-OUTCOME_IN_R_SHORT.R** 1302 | ```{R} 1303 | R --file=PC-VS-OUTCOME_IN_R_SHORT.R 1304 | ``` 1305 | 1306 | 1307 | #### PlotPCs.R: Use qplot option from ggplot2 to plot samples on first two principal components from PCA 1308 | 1309 | **Usage: PlotPCs.R** 1310 | ```{R} 1311 | R --file=PlotPCs.R 1312 | ``` 1313 | 1314 | 1315 | #### ExtractAncestryOutliers.sh: Pull out outlying samples from PCA (as defined by EIGENSOFT) for removal from PLINK binary. 1316 | 1317 | **Usage: ExtractAncestryOutliers.sh** 1318 | ```{bash} 1319 | ExtractAncestryOutliers.sh 1320 | ``` 1321 | 1322 | 1323 | #### MakeKeepIDs.sh: Make keepids.txt file for use with T.Flutre's HapMap3 OpenWetWare cookbook 1324 | 1325 | **Usage: MakeKeepIDs.sh** 1326 | ```{bash} 1327 | MakeKeepIDs.sh 1328 | ``` 1329 | 1330 | 1331 | #### IdHets.R: Identify samples with unusual genome-wide heterozygosity (> or < 3SD from mean). Credit: Amos Folarin. 1332 | 1333 | **Usage: IdHets.R** 1334 | ```{R} 1335 | R --file=IdHets.R 1336 | ``` 1337 | 1338 | 1339 | ## Imputation in Impute2 1340 | 1341 | 1342 | #### MakeChunks.sh: Generate two sets of 5Mb chunk files for imputation: Chunks_chr... prints a list of chunks for that chromosome, with SNP number; analysis_chunks... prints the input chunk file for Impute2 for that chromosome. 
1343 | 1344 | **Usage: MakeChunks.sh** 1345 | ```{bash} 1346 | MakeChunks.sh 1347 | ``` 1348 | 1349 | 1350 | #### Master_imputation_script_posterior_sampled_haps.sh: Script to control submission of posterior-sampling imputation jobs to a SGE-based cluster. Modified from scripts provided with Impute2 1351 | #### Modified_submit_impute2_jobs_to_cluster.R: R script for submitting Impute2 jobs to a SGE-controlled cluster. Modified from scripts provided with Impute2. 1352 | #### Prototype_imputation_job_posterior_sampled_haps.sh: Posterior-sampling imputation job script for Impute2. Modified from scripts provided with Impute2 1353 | 1354 | **Usage: Master_imputation_script_posterior_sampled_haps.sh** 1355 | ```{bash} 1356 | Master_imputation_script_posterior_sampled_haps.sh 1357 | ``` 1358 | This runs multiple instances of **Prototype_imputation_job_posterior_sampled_haps.sh** 1359 | ```{bash} 1360 | Prototype_imputation_job_posterior_sampled_haps.sh 1361 | ``` 1362 | 1363 | 1364 | ## Post-imputation quality control 1365 | 1366 | 1367 | #### MergeImputedChunks.sh: Script to move imputed 5Mb chunks to chromosome-specific folders, and merge them into by-chromosome .impute2 and .impute2_info files. 1368 | 1369 | **Usage: MergeImputedChunks.sh** 1370 | ```{bash} 1371 | MergeImputedChunks.sh 1372 | ``` 1373 | 1374 | 1375 | #### AddChromosomeNumber.sh: Post-imputation clean-up script to add chromosome number to by-chromosome .impute2 files 1376 | 1377 | **Usage: AddChromosomeNumber.sh** 1378 | ```{bash} 1379 | AddChromosomeNumber.sh /path/to/results_directory 1380 | ``` 1381 | 1382 | 1383 | #### ReplaceDots.sh: Post-imputation clean-up file for Phase1_Integrated 1KG reference. Converts exon variants called "." 
to "chr/position" 1384 | 1385 | **Usage: ReplaceDots.sh** 1386 | ```{bash} 1387 | ReplaceDots.sh 1388 | ``` 1389 | 1390 | 1391 | #### FilterByInfoAll.sh: Post-imputation QC script to filter whole-genome impute2_info file by info score, and then keep only these variants from the by-chromosome impute2_info files 1392 | 1393 | **Usage: FilterByInfoAll.sh** 1394 | ```{bash} 1395 | FilterByInfoAll.sh 1396 | ``` 1397 | 1398 | 1399 | #### DropDuplicatedSNPs.sh: Post-imputation QC script to remove duplicated positions in the post-imputation file (which are usually indels or multiallelic variants) 1400 | 1401 | **Usage: DropDuplicatedSNPs.sh** 1402 | ```{bash} 1403 | DropDuplicatedSNPs.sh 1404 | ``` 1405 | 1406 | 1407 | #### Relabel_rs.sh: Relabel imputed SNPs with an rs id with the rs id only 1408 | 1409 | **Usage: Relabel_rs.sh** 1410 | ```{bash} 1411 | Relabel_rs.sh 1412 | ``` 1413 | 1414 | 1415 | #### DropDuplicatedPositions.sh: Some rs IDs are imperfectly mapped, resulting in duplications with imputed IDs, so remove these accidental genotyped duplicates. 1416 | 1417 | 1418 | **Usage: DropDuplicatedPositions.sh** 1419 | ```{bash} 1420 | DropDuplicatedPositions.sh 1421 | ``` 1422 | 1423 | 1424 | ## Association testing 1425 | 1426 | 1427 | #### Get_Covariates.R: Script to create covariate file for association analyses in PLINK from the principal components from PCA and an external covariates file. 
1428 | 1429 | **Usage: Get_Covariates.sh** 1430 | ```{R} 1431 | R --file=Get_Covariates.R 1432 | ``` 1433 | 1434 | 1435 | ## Post-GWAS 1436 | 1437 | 1438 | #### Make_glist.sh: Script to make a glist-file from the GeneList file downloaded from UCSC Table Browser, for use in annotation in PLINK 1439 | 1440 | **Usage: Make_glist.sh** 1441 | ```{bash} 1442 | Make_glist.sh 1443 | ``` 1444 | 1445 | 1446 | #### ManhattanPlotinR.R: Wrapper script for running Mike Weale's manhattan_v2.R script to generate Manhattan plots from association result 1447 | #### manhattan_v2.R: Mike Weale's Manhattan plot script 1448 | 1449 | **Usage: ManhattanPlotinR.R** 1450 | ```{R} 1451 | R --file=ManhattanPlotinR.R 1452 | ``` 1453 | 1454 | #### QQPlotinR.R: Wrapper script to generate QQ plots from association results, using Mike Weale's qq_plot_v7.R 1455 | #### qq_plot_v7.R: Mike Weale's QQ-plot script 1456 | 1457 | **Usage: QQPlotinR.R** 1458 | ```{R} 1459 | R --file=QQPlotinR.R 1460 | ``` 1461 | 1462 | #### ID_Build.py: Python script for quickly identifying the genome build of an unknown PLINK binary. 1463 | 1464 | **Usage: ID_Build.py** 1465 | ```{bash} 1466 | $plink \ 1467 | --bfile $root \ 1468 | --chr 6 \ 1469 | --make-bed \ 1470 | --out $root.chr6 1471 | 1472 | python ID_Build.py $root.chr6.bim 1473 | ``` 1474 | 1475 | # Valuable web resources 1476 | 1477 | [Genotype recalling pipeline](http://confluence.brc.iop.kcl.ac.uk:8090/x/4AAm) 1478 | 1479 | [Rare variant recalling pipeline](http://core.brc.iop.kcl.ac.uk/2013/04/08/exome-chip-rare-caller-pipeline/) 1480 | 1481 | [Alternative recalling and quality control pipeline](http://www.ncbi.nlm.nih.gov/pmc/articles/PMC4441213/) 1482 | 1483 | [PLINK 1.07](http://pngu.mgh.harvard.edu/~purcell/plink/) 1484 | 1485 | [PLINK 1.9](https://www.cog-genomics.org/plink2/) 1486 | 1487 | [R](http://www.r-project.org/) 1488 | 1489 | [EIGENSOFT, EAGLE, HAPLOSNP, LDScore... 
JUST SO MUCH GOOD STUFF](http://www.hsph.harvard.edu/alkes-price/software/) 1490 | 1491 | [IMPUTE2](https://mathgen.stats.ox.ac.uk/impute/impute_v2.html) 1492 | 1493 | [IMPUTE2 Cookbook](http://genome.sph.umich.edu/wiki/IMPUTE2:_1000_Genomes_Imputation_Cookbook) 1494 | 1495 | [William Rayner's strand files for microarrays](http://www.well.ox.ac.uk/~wrayner/strand/) 1496 | 1497 | [Mike Weale's Page](https://sites.google.com/site/mikeweale) 1498 | 1499 | [Tim Flutre's OpenWetWare script for HapMap3 PC plot](http://openwetware.org/wiki/User:Timothee_Flutre/Notebook/Postdoc/2012/01/22) 1500 | 1501 | [GWASRAP - Post-GWAS annotation](http://jjwanglab.org/gwasrap) 1502 | 1503 | [PRSice - Polygenic Risk Scoring software](http://prsice.info) 1504 | 1505 | [GCTA - Software suite for mixed linear modelling and heritability analyses](http://cnsgenomics.com/software/gcta/) 1506 | 1507 | # Acknowledgements 1508 | 1509 | Thank you to the following people for their advice and input on the contents of this cookbook: 1510 | * Gerome Breen 1511 | * Steve Newhouse 1512 | * Amos Folarin 1513 | * Richard Dobson 1514 | * Cass Johnston 1515 | * Hamel Patel 1516 | * Jack Euesden 1517 | * Jemma Walker 1518 | * Niamh Mullins 1519 | * Cathryn Lewis 1520 | * Paul O'Reilly 1521 | * Mike Weale 1522 | 1523 | In addition, thank you to the authors of the resources listed above, and the genetics community for their comments. 
1524 | -------------------------------------------------------------------------------- /Relabel_rs.sh: -------------------------------------------------------------------------------- 1 | source Config.conf 2 | 3 | awk 'BEGIN {OFS = "\t"} $2 ~ /^rs/{gsub(":.*", "", $2) }1' $root.post_imputation_final.bim > $root.post_imputation_final_rs_only.bim 4 | cp $root.post_imputation_final.bed $root.post_imputation_final_rs_only.bed 5 | cp $root.post_imputation_final.fam $root.post_imputation_final_rs_only.fam 6 | -------------------------------------------------------------------------------- /ReplaceDots.sh: -------------------------------------------------------------------------------- 1 | source Config.conf 2 | 3 | for i in {1..22} 4 | do 5 | awk '$2=="." {$2= $1 ":" $3} {print}' < New_Chromosome$i.impute2 > Temp1 6 | mv Temp1 New_Chromosome$i.impute2 7 | 8 | done 9 | 10 | awk -v i=$i '$2=="." {$2= $1 ":" $3} {print}' < $root.whole_genome.impute2_info > Temp2 11 | mv Temp2 $root.whole_genome.impute2_info 12 | -------------------------------------------------------------------------------- /highLDregions4bim_b37.awk: -------------------------------------------------------------------------------- 1 | ($1 == 1) && ($4 >= 48287981) && ($4 <= 52287979) {print $2} 2 | ($1 == 2) && ($4 >= 86088343) && ($4 <= 101041482) {print $2} 3 | ($1 == 2) && ($4 >= 134666269) && ($4 <= 138166268) {print $2} 4 | ($1 == 2) && ($4 >= 183174495) && ($4 <= 190174494) {print $2} 5 | ($1 == 3) && ($4 >= 47524997) && ($4 <= 50024996) {print $2} 6 | ($1 == 3) && ($4 >= 83417311) && ($4 <= 86917310) {print $2} 7 | ($1 == 3) && ($4 >= 88917311) && ($4 <= 96017310) {print $2} 8 | ($1 == 5) && ($4 >= 44464244) && ($4 <= 50464243) {print $2} 9 | ($1 == 5) && ($4 >= 97972101) && ($4 <= 100472101) {print $2} 10 | ($1 == 5) && ($4 >= 128972102) && ($4 <= 131972101) {print $2} 11 | ($1 == 5) && ($4 >= 135472102) && ($4 <= 138472101) {print $2} 12 | ($1 == 6) && ($4 >= 25392022) && ($4 <= 33392022) {print 
$2} 13 | ($1 == 6) && ($4 >= 56892042) && ($4 <= 63942041) {print $2} 14 | ($1 == 6) && ($4 >= 139958308) && ($4 <= 142458307) {print $2} 15 | ($1 == 7) && ($4 >= 55225792) && ($4 <= 66555850) {print $2} 16 | ($1 == 8) && ($4 >= 7962591) && ($4 <= 11962591) {print $2} 17 | ($1 == 8) && ($4 >= 42880844) && ($4 <= 49837447) {print $2} 18 | ($1 == 8) && ($4 >= 111930825) && ($4 <= 114930824) {print $2} 19 | ($1 == 10) && ($4 >= 36959995) && ($4 <= 43679994) {print $2} 20 | ($1 == 11) && ($4 >= 46043425) && ($4 <= 57243424) {print $2} 21 | ($1 == 11) && ($4 >= 87860353) && ($4 <= 90860352) {print $2} 22 | ($1 == 12) && ($4 >= 33108734) && ($4 <= 41713733) {print $2} 23 | ($1 == 12) && ($4 >= 111037281) && ($4 <= 113537280) {print $2} 24 | ($1 == 20) && ($4 >= 32536340) && ($4 <= 35066586) {print $2} 25 | -------------------------------------------------------------------------------- /highLDregions4bim_b38.awk: -------------------------------------------------------------------------------- 1 | ($1 == 1) && ($4 >= 47822309) && ($4 <= 51822307) {print $2} 2 | ($1 == 2) && ($4 >= 85861220) && ($4 <= 100425020) {print $2} 3 | ($1 == 2) && ($4 >= 133908698) && ($4 <= 137408698) {print $2} 4 | ($1 == 2) && ($4 >= 182309768) && ($4 <= 189309768) {print $2} 5 | ($1 == 3) && ($4 >= 47483507) && ($4 <= 49987563) {print $2} 6 | ($1 == 3) && ($4 >= 83368160) && ($4 <= 86868160) {print $2} 7 | ($1 == 3) && ($4 >= 88868161) && ($4 <= 96298466) {print $2} 8 | ($1 == 5) && ($4 >= 44464142) && ($4 <= 51168409) {print $2} 9 | ($1 == 5) && ($4 >= 98636397) && ($4 <= 101136397) {print $2} 10 | ($1 == 5) && ($4 >= 129636409) && ($4 <= 132636409) {print $2} 11 | ($1 == 5) && ($4 >= 136136413) && ($4 <= 139136412) {print $2} 12 | ($1 == 6) && ($4 >= 25391794) && ($4 <= 33424245) {print $2} 13 | ($1 == 6) && ($4 >= 57027244) && ($4 <= 63232136) {print $2} 14 | ($1 == 6) && ($4 >= 139637171) && ($4 <= 142137170) {print $2} 15 | ($1 == 7) && ($4 >= 55158099) && ($4 <= 67090863) {print $2} 16 | 
($1 == 8) && ($4 >= 8105069) && ($4 <= 12105082) {print $2} 17 | ($1 == 8) && ($4 >= 43025701) && ($4 <= 48924888) {print $2} 18 | ($1 == 8) && ($4 >= 110918596) && ($4 <= 113918595) {print $2} 19 | ($1 == 10) && ($4 >= 36671067) && ($4 <= 43184546) {print $2} 20 | ($1 == 11) && ($4 >= 46021874) && ($4 <= 57475951) {print $2} 21 | ($1 == 11) && ($4 >= 88127185) && ($4 <= 91127184) {print $2} 22 | ($1 == 12) && ($4 >= 32955800) && ($4 <= 41319931) {print $2} 23 | ($1 == 12) && ($4 >= 110599476) && ($4 <= 113099475) {print $2} 24 | ($1 == 20) && ($4 >= 33948534) && ($4 <= 36438183) {print $2} 25 | -------------------------------------------------------------------------------- /manhattan_DOG_TRY.R: -------------------------------------------------------------------------------- 1 | #Generic Manhattan plot function for PLINK-formatted data (chr X,Y,XY and MT are represented as 23,24,25,26) 2 | #Wrapper function written by Mike Weale and Richard Gunning. Internal "wgplot" function written by Matt Settles. 3 | #Version 2 (12 Mar 2013) 4 | #Arguments: 5 | #x Data frame to be plotted. x$CHR contains chromosome (numeric). x$BP contains SNP position (numeric). x$P contains association p-value (numeric) 6 | #GWthresh Numeric. Indicates where "genomewide significance" threshold should be drawn 7 | #GreyZoneThresh Numeric. Indicates a sub-genomewide-sig "grey zone" where SNPs are shown with a larger point size 8 | #DrawGWline Boolean. If TRUE, then a red line at the "genomewide significance" threshold is plotted 9 | #cutoff Numeric. 
Any p-vlaues less than cutoff are forced equal to cutoff 10 | #Example: 11 | #source("manhattan_v2.R") 12 | #d = read.table("myplinkresults.logistic", header=TRUE, as.is=TRUE) 13 | #X=data.frame(CHR=d$CHR, BP=d$BP, P=d$P) 14 | #manhattan( X, DrawGWline=FALSE ) 15 | 16 | manhattan <- function( x, GWthresh=-log10(5e-8), GreyZoneThresh=-log10(1e-4), DrawGWline=TRUE, cutoff=0 ) 17 | { 18 | 19 | x$P[ x$PGreyZoneThresh)&(ptmpGreyZoneThresh)&(ptmpGWthresh], ptmp[ptmp>GWthresh], pch=20, col=color[i]) 68 | } 69 | 70 | #drawthreshold 71 | if (DrawGWline) { 72 | abline(h=(GWthresh), col="red") 73 | } 74 | } 75 | 76 | 77 | 78 | #From http://bioinfo-mite.crb.wsu.edu/Rcode/wgplot.R 79 | #See also https://stat.ethz.ch/pipermail/r-help/2008-November/180812.html 80 | ############################################################################### 81 | ### 82 | ### Whole Genome Significance plot 83 | ### Matt Settles 84 | ### Bioinformatics Core 85 | ### Washington State University, Pullman, WA 86 | ### 87 | ### Created July 7, 2008 88 | ### 89 | ### July 8, 2008 - fixed color goof 90 | ############################################################################### 91 | ############## 92 | ### things to add 93 | ### marker name on plot for significant markers 94 | ############## 95 | 96 | ### THERE ARE ERRORS IN GAPS MHTPLOT, SO THIS IS A FIX 97 | ## data a data frame with three columns representing chromosome, position and p values logged or unlogged 98 | ## logscale a flag to indicate if p value are to be log-transformed, FALSE means already logtransformed 99 | ## base the base of the logarithm, when logscale =TRUE 100 | ## cutoffs the cutt-offs where horizontal line(s) are drawn 101 | ## color the color for different chromosome(s), and random if unspecified 102 | ## labels labels for the x-axis, length = number of chromosomes 103 | ## xlabel label to be placed on the X axis 104 | ## ylabel lable to be placed on the Y axis 105 | ## ... 
other options in compatible with the R plot function 106 | 107 | ## USAGE 108 | # source("http://bioinfo-mite.crb.wsu.edu/Rcode/wgplot.R") 109 | ## fake example with Affy500k data 110 | # affy <-c(40220, 41400, 33801, 32334, 32056, 31470, 25835, 27457, 22864, 28501, 26273, 111 | # 24954, 19188, 15721, 14356, 15309, 11281, 14881, 6399, 12400, 7125, 6207) 112 | # CM <- cumsum(affy) 113 | # n.markers <- sum(affy) 114 | # n.chr <- length(affy) 115 | # test <- data.frame(chr=rep(1:n.chr,affy),pos=1:n.markers,p=runif(n.markers)) 116 | # png("wgplot.png",units="in",width=8,height=5,res=300) 117 | # par(las="2",cex=0.6,pch=21,bg="white") 118 | # wgplot(test,cutoffs = c(1,3, 5, 7, 9),color=palette()[2:5],labels=as.character(1:22)) 119 | # title("Whole Genome Associaton Plot of Significance for Chromosomes 1 to 22") 120 | # dev.off() 121 | ## 122 | "wgplot" <- 123 | function (data, 124 | logscale = TRUE, 125 | base = 10, 126 | cutoffs = c(3, 5, 7, 9), 127 | siglines = NULL, 128 | sigcolors = "red", 129 | color = sample(colors(), 26), 130 | chrom = as.character(c(1:39)), 131 | startbp = NULL, 132 | endbp = NULL, 133 | labels = as.character(c(1:22,"X","Y","XY","MT")), 134 | xlabel = "Chromosome", 135 | ylabel = "-Log10(p-value)", ...) 
136 | { 137 | if (any(is.na(data))) 138 | data <- data[-unique(which(is.na(data))%%nrow(data)),] 139 | keep <- which(data[,1] %in% chrom) 140 | data <- data[keep,] 141 | if (!is.null(startbp) & !is.null(endbp) & length(chrom) == 1){ 142 | keep <- which(data[,2] >= startbp & data[,2] <= endbp) 143 | data <- data[keep,] 144 | } 145 | 146 | 147 | chr <- data[, 1] 148 | pos <- data[, 2] 149 | p <- data[, 3] 150 | 151 | ### remove any NAs 152 | which(is.na(data[,2])) 153 | #chr <- replace(chr,which(chr == "X"),"100") 154 | #chr <- replace(chr,which(chr == "Y"),"101") 155 | #chr <- replace(chr,which(chr == "XY"),"102") 156 | #chr <- replace(chr,which(chr == "MT"),"103") 157 | 158 | ord <- order(as.numeric(chr),as.numeric(pos)) 159 | chr <- chr[ord] 160 | pos <- pos[ord] 161 | p <- p[ord] 162 | 163 | lens.chr <- as.vector(table(as.numeric(chr))) 164 | CM <- cumsum(lens.chr) 165 | n.markers <- sum(lens.chr) 166 | n.chr <- length(lens.chr) 167 | id <- 1:n.chr 168 | color <- rep(color,ceiling(n.chr/length(color))) 169 | if (logscale) 170 | p <- -log(p,base) 171 | if ( any(diff(pos) < 0) ) { 172 | cpos <- cumsum(c(0,pos[which(!duplicated(chr))-1])) 173 | pos <- pos + rep(cpos,lens.chr) 174 | 175 | mids <- cpos + diff(c(cpos,max(pos)))/2 176 | } 177 | 178 | par(xaxt = "n", yaxt = "n") 179 | plot(c(pos,pos[1]), c(9,p), type = "n", xlab = xlabel, ylab = ylabel, axes = FALSE, ...) 180 | for (i in 1:n.chr) { 181 | u <- CM[i] 182 | l <- CM[i] - lens.chr[i] + 1 183 | cat("Plotting points ", l, "-", u, "\n") 184 | points(pos[l:u], p[l:u], col = color[i], ...) 
185 | } 186 | par(xaxt = "s", yaxt = "s") 187 | axis(1, at = c(0, pos[round(CM)],max(pos)),FALSE) 188 | text(mids, par("usr")[3] - 0.5, srt = 0, pos=2,cex=0.5,offset= -0.2, 189 | labels = labels[1:n.chr], xpd = TRUE) 190 | #axis(side=1, at = pos[round(CM-lens.chr/2)],tick=FALSE, labels= labels[1:n.chr]) 191 | #abline(h = cutoffs) 192 | axis(side=2, at = cutoffs ) 193 | if (!is.null(siglines)) 194 | abline(h = -log(siglines,base),col=sigcolors) 195 | 196 | #mtext(eval(expression(cutoffs)), 2, at = cutoffs) 197 | 198 | } 199 | 200 | 201 | 202 | -------------------------------------------------------------------------------- /manhattan_v2.R: -------------------------------------------------------------------------------- 1 | #Generic Manhattan plot function for PLINK-formatted data (chr X,Y,XY and MT are represented as 23,24,25,26) 2 | #Wrapper function written by Mike Weale and Richard Gunning. Internal "wgplot" function written by Matt Settles. 3 | #Version 2 (12 Mar 2013) 4 | #Arguments: 5 | #x Data frame to be plotted. x$CHR contains chromosome (numeric). x$BP contains SNP position (numeric). x$P contains association p-value (numeric) 6 | #GWthresh Numeric. Indicates where "genomewide significance" threshold should be drawn 7 | #GreyZoneThresh Numeric. Indicates a sub-genomewide-sig "grey zone" where SNPs are shown with a larger point size 8 | #DrawGWline Boolean. If TRUE, then a red line at the "genomewide significance" threshold is plotted 9 | #cutoff Numeric. 
Any p-vlaues less than cutoff are forced equal to cutoff 10 | #Example: 11 | #source("manhattan_v2.R") 12 | #d = read.table("myplinkresults.logistic", header=TRUE, as.is=TRUE) 13 | #X=data.frame(CHR=d$CHR, BP=d$BP, P=d$P) 14 | #manhattan( X, DrawGWline=FALSE ) 15 | 16 | manhattan <- function( x, GWthresh=-log10(5e-8), GreyZoneThresh=-log10(1e-4), DrawGWline=TRUE, cutoff=0 ) 17 | { 18 | 19 | x$P[ x$PGreyZoneThresh)&(ptmpGreyZoneThresh)&(ptmpGWthresh], ptmp[ptmp>GWthresh], pch=20, col=color[i]) 68 | } 69 | 70 | #drawthreshold 71 | if (DrawGWline) { 72 | abline(h=(GWthresh), col="red") 73 | } 74 | } 75 | 76 | 77 | 78 | #From http://bioinfo-mite.crb.wsu.edu/Rcode/wgplot.R 79 | #See also https://stat.ethz.ch/pipermail/r-help/2008-November/180812.html 80 | ############################################################################### 81 | ### 82 | ### Whole Genome Significance plot 83 | ### Matt Settles 84 | ### Bioinformatics Core 85 | ### Washington State University, Pullman, WA 86 | ### 87 | ### Created July 7, 2008 88 | ### 89 | ### July 8, 2008 - fixed color goof 90 | ############################################################################### 91 | ############## 92 | ### things to add 93 | ### marker name on plot for significant markers 94 | ############## 95 | 96 | ### THERE ARE ERRORS IN GAPS MHTPLOT, SO THIS IS A FIX 97 | ## data a data frame with three columns representing chromosome, position and p values logged or unlogged 98 | ## logscale a flag to indicate if p value are to be log-transformed, FALSE means already logtransformed 99 | ## base the base of the logarithm, when logscale =TRUE 100 | ## cutoffs the cutt-offs where horizontal line(s) are drawn 101 | ## color the color for different chromosome(s), and random if unspecified 102 | ## labels labels for the x-axis, length = number of chromosomes 103 | ## xlabel label to be placed on the X axis 104 | ## ylabel lable to be placed on the Y axis 105 | ## ... 
other options in compatible with the R plot function 106 | 107 | ## USAGE 108 | # source("http://bioinfo-mite.crb.wsu.edu/Rcode/wgplot.R") 109 | ## fake example with Affy500k data 110 | # affy <-c(40220, 41400, 33801, 32334, 32056, 31470, 25835, 27457, 22864, 28501, 26273, 111 | # 24954, 19188, 15721, 14356, 15309, 11281, 14881, 6399, 12400, 7125, 6207) 112 | # CM <- cumsum(affy) 113 | # n.markers <- sum(affy) 114 | # n.chr <- length(affy) 115 | # test <- data.frame(chr=rep(1:n.chr,affy),pos=1:n.markers,p=runif(n.markers)) 116 | # png("wgplot.png",units="in",width=8,height=5,res=300) 117 | # par(las="2",cex=0.6,pch=21,bg="white") 118 | # wgplot(test,cutoffs = c(1,3, 5, 7, 9),color=palette()[2:5],labels=as.character(1:22)) 119 | # title("Whole Genome Associaton Plot of Significance for Chromosomes 1 to 22") 120 | # dev.off() 121 | ## 122 | "wgplot" <- 123 | function (data, 124 | logscale = TRUE, 125 | base = 10, 126 | cutoffs = c(3, 5, 7, 9), 127 | siglines = NULL, 128 | sigcolors = "red", 129 | color = sample(colors(), 26), 130 | chrom = as.character(c(1:22,"X","Y","XY","MT")), 131 | startbp = NULL, 132 | endbp = NULL, 133 | labels = as.character(c(1:22,"X","Y","XY","MT")), 134 | xlabel = "Chromosome", 135 | ylabel = expression(log[10]*" p-value"), ...) 
136 | { 137 | if (any(is.na(data))) 138 | data <- data[-unique(which(is.na(data))%%nrow(data)),] 139 | keep <- which(data[,1] %in% chrom) 140 | data <- data[keep,] 141 | if (!is.null(startbp) & !is.null(endbp) & length(chrom) == 1){ 142 | keep <- which(data[,2] >= startbp & data[,2] <= endbp) 143 | data <- data[keep,] 144 | } 145 | 146 | 147 | chr <- data[, 1] 148 | pos <- data[, 2] 149 | p <- data[, 3] 150 | 151 | ### remove any NAs 152 | which(is.na(data[,2])) 153 | chr <- replace(chr,which(chr == "X"),"100") 154 | chr <- replace(chr,which(chr == "Y"),"101") 155 | chr <- replace(chr,which(chr == "XY"),"102") 156 | chr <- replace(chr,which(chr == "MT"),"103") 157 | 158 | ord <- order(as.numeric(chr),as.numeric(pos)) 159 | chr <- chr[ord] 160 | pos <- pos[ord] 161 | p <- p[ord] 162 | 163 | lens.chr <- as.vector(table(as.numeric(chr))) 164 | CM <- cumsum(lens.chr) 165 | n.markers <- sum(lens.chr) 166 | n.chr <- length(lens.chr) 167 | id <- 1:n.chr 168 | color <- rep(color,ceiling(n.chr/length(color))) 169 | if (logscale) 170 | p <- -log(p,base) 171 | if ( any(diff(pos) < 0) ) { 172 | cpos <- cumsum(c(0,pos[which(!duplicated(chr))-1])) 173 | pos <- pos + rep(cpos,lens.chr) 174 | 175 | mids <- cpos + diff(c(cpos,max(pos)))/2 176 | } 177 | 178 | par(xaxt = "n", yaxt = "n") 179 | plot(c(pos,pos[1]), c(9,p), type = "n", xlab = xlabel, ylab = ylabel, axes = FALSE, ...) 180 | for (i in 1:n.chr) { 181 | u <- CM[i] 182 | l <- CM[i] - lens.chr[i] + 1 183 | cat("Plotting points ", l, "-", u, "\n") 184 | points(pos[l:u], p[l:u], col = color[i], ...) 
185 | } 186 | par(xaxt = "s", yaxt = "s") 187 | axis(1, at = c(0, pos[round(CM)],max(pos)),FALSE) 188 | text(mids, par("usr")[3] - 0.5, srt = 0, pos=2,cex=0.5,offset= -0.2, 189 | labels = labels[1:n.chr], xpd = TRUE) 190 | #axis(side=1, at = pos[round(CM-lens.chr/2)],tick=FALSE, labels= labels[1:n.chr]) 191 | #abline(h = cutoffs) 192 | axis(side=2, at = cutoffs ) 193 | if (!is.null(siglines)) 194 | abline(h = -log(siglines,base),col=sigcolors) 195 | 196 | #mtext(eval(expression(cutoffs)), 2, at = cutoffs) 197 | 198 | } 199 | 200 | 201 | 202 | -------------------------------------------------------------------------------- /manhattan_v2_bumblebee.R: -------------------------------------------------------------------------------- 1 | #Generic Manhattan plot function for PLINK-formatted data (chr X,Y,XY and MT are represented as 23,24,25,26) 2 | #Wrapper function written by Mike Weale and Richard Gunning. Internal "wgplot" function written by Matt Settles. 3 | #Version 2 (12 Mar 2013) 4 | #Arguments: 5 | #x Data frame to be plotted. x$CHR contains chromosome (numeric). x$BP contains SNP position (numeric). x$P contains association p-value (numeric) 6 | #GWthresh Numeric. Indicates where "genomewide significance" threshold should be drawn 7 | #GreyZoneThresh Numeric. Indicates a sub-genomewide-sig "grey zone" where SNPs are shown with a larger point size 8 | #DrawGWline Boolean. If TRUE, then a red line at the "genomewide significance" threshold is plotted 9 | #cutoff Numeric. 
Any p-vlaues less than cutoff are forced equal to cutoff 10 | #Example: 11 | #source("manhattan_v2.R") 12 | #d = read.table("myplinkresults.logistic", header=TRUE, as.is=TRUE) 13 | #X=data.frame(CHR=d$CHR, BP=d$BP, P=d$P) 14 | #manhattan( X, DrawGWline=FALSE ) 15 | 16 | manhattan <- function( x, GWthresh=-log10(5e-8), GreyZoneThresh=-log10(1e-4), DrawGWline=TRUE, cutoff=0 ) 17 | { 18 | 19 | x$P[ x$PGreyZoneThresh)&(ptmpGreyZoneThresh)&(ptmpGWthresh], ptmp[ptmp>GWthresh], pch=20, col=color[i]) 70 | } 71 | 72 | #drawthreshold 73 | if (DrawGWline) { 74 | abline(h=(GWthresh), col="red") 75 | } 76 | } 77 | 78 | 79 | 80 | #From http://bioinfo-mite.crb.wsu.edu/Rcode/wgplot.R 81 | #See also https://stat.ethz.ch/pipermail/r-help/2008-November/180812.html 82 | ############################################################################### 83 | ### 84 | ### Whole Genome Significance plot 85 | ### Matt Settles 86 | ### Bioinformatics Core 87 | ### Washington State University, Pullman, WA 88 | ### 89 | ### Created July 7, 2008 90 | ### 91 | ### July 8, 2008 - fixed color goof 92 | ############################################################################### 93 | ############## 94 | ### things to add 95 | ### marker name on plot for significant markers 96 | ############## 97 | 98 | ### THERE ARE ERRORS IN GAPS MHTPLOT, SO THIS IS A FIX 99 | ## data a data frame with three columns representing chromosome, position and p values logged or unlogged 100 | ## logscale a flag to indicate if p value are to be log-transformed, FALSE means already logtransformed 101 | ## base the base of the logarithm, when logscale =TRUE 102 | ## cutoffs the cutt-offs where horizontal line(s) are drawn 103 | ## color the color for different chromosome(s), and random if unspecified 104 | ## labels labels for the x-axis, length = number of chromosomes 105 | ## xlabel label to be placed on the X axis 106 | ## ylabel lable to be placed on the Y axis 107 | ## ... 
other options in compatible with the R plot function 108 | 109 | ## USAGE 110 | # source("http://bioinfo-mite.crb.wsu.edu/Rcode/wgplot.R") 111 | ## fake example with Affy500k data 112 | # affy <-c(40220, 41400, 33801, 32334, 32056, 31470, 25835, 27457, 22864, 28501, 26273, 113 | # 24954, 19188, 15721, 14356, 15309, 11281, 14881, 6399, 12400, 7125, 6207) 114 | # CM <- cumsum(affy) 115 | # n.markers <- sum(affy) 116 | # n.chr <- length(affy) 117 | # test <- data.frame(chr=rep(1:n.chr,affy),pos=1:n.markers,p=runif(n.markers)) 118 | # png("wgplot.png",units="in",width=8,height=5,res=300) 119 | # par(las="2",cex=0.6,pch=21,bg="white") 120 | # wgplot(test,cutoffs = c(1,3, 5, 7, 9),color=palette()[2:5],labels=as.character(1:22)) 121 | # title("Whole Genome Associaton Plot of Significance for Chromosomes 1 to 22") 122 | # dev.off() 123 | ## 124 | "wgplot" <- 125 | function (data, 126 | logscale = TRUE, 127 | base = 10, 128 | cutoffs = c(3, 5, 7, 9), 129 | siglines = NULL, 130 | sigcolors = "red", 131 | color = sample(colors(), 26), 132 | chrom = as.character(c(1:22,"X","Y","XY","MT")), 133 | startbp = NULL, 134 | endbp = NULL, 135 | labels = as.character(c(1:22,"X","Y","XY","MT")), 136 | xlabel = "Chromosome", 137 | ylabel = expression(log[10]*" p-value"), ...) 
138 | { 139 | if (any(is.na(data))) 140 | data <- data[-unique(which(is.na(data))%%nrow(data)),] 141 | keep <- which(data[,1] %in% chrom) 142 | data <- data[keep,] 143 | if (!is.null(startbp) & !is.null(endbp) & length(chrom) == 1){ 144 | keep <- which(data[,2] >= startbp & data[,2] <= endbp) 145 | data <- data[keep,] 146 | } 147 | 148 | 149 | chr <- data[, 1] 150 | pos <- data[, 2] 151 | p <- data[, 3] 152 | 153 | ### remove any NAs 154 | which(is.na(data[,2])) 155 | chr <- replace(chr,which(chr == "X"),"100") 156 | chr <- replace(chr,which(chr == "Y"),"101") 157 | chr <- replace(chr,which(chr == "XY"),"102") 158 | chr <- replace(chr,which(chr == "MT"),"103") 159 | 160 | ord <- order(as.numeric(chr),as.numeric(pos)) 161 | chr <- chr[ord] 162 | pos <- pos[ord] 163 | p <- p[ord] 164 | 165 | lens.chr <- as.vector(table(as.numeric(chr))) 166 | CM <- cumsum(lens.chr) 167 | n.markers <- sum(lens.chr) 168 | n.chr <- length(lens.chr) 169 | id <- 1:n.chr 170 | color <- rep(color,ceiling(n.chr/length(color))) 171 | if (logscale) 172 | p <- -log(p,base) 173 | if ( any(diff(pos) < 0) ) { 174 | cpos <- cumsum(c(0,pos[which(!duplicated(chr))-1])) 175 | pos <- pos + rep(cpos,lens.chr) 176 | 177 | mids <- cpos + diff(c(cpos,max(pos)))/2 178 | } 179 | 180 | par(xaxt = "n", yaxt = "n") 181 | plot(c(pos,pos[1]), c(9,p), type = "n", xlab = xlabel, ylab = ylabel, axes = FALSE, ...) 182 | for (i in 1:n.chr) { 183 | u <- CM[i] 184 | l <- CM[i] - lens.chr[i] + 1 185 | cat("Plotting points ", l, "-", u, "\n") 186 | points(pos[l:u], p[l:u], col = color[i], ...) 
187 | } 188 | par(xaxt = "s", yaxt = "s") 189 | axis(1, at = c(0, pos[round(CM)],max(pos)),FALSE) 190 | text(mids, par("usr")[3] - 0.5, srt = 0, pos=2,cex=0.5,offset= -0.2, 191 | labels = labels[1:n.chr], xpd = TRUE) 192 | #axis(side=1, at = pos[round(CM-lens.chr/2)],tick=FALSE, labels= labels[1:n.chr]) 193 | #abline(h = cutoffs) 194 | axis(side=2, at = cutoffs ) 195 | if (!is.null(siglines)) 196 | abline(h = -log(siglines,base),col=sigcolors) 197 | 198 | #mtext(eval(expression(cutoffs)), 2, at = cutoffs) 199 | 200 | } 201 | 202 | 203 | 204 | -------------------------------------------------------------------------------- /qq_plot_v7.R: -------------------------------------------------------------------------------- 1 | #Generic QQ plot function with concentration bands, for either P-values or chi-sq values 2 | #Function written by Mike Weale and Tom Price, King's College London. 3 | #Version 7 (23 Feb 2013) (a) fixes plots issues if p=0 values in the dataset; (b) allows "iplot" vector giving indices of sorted quantiles (O vector - largest values first) to be plotted) 4 | #Concentration bands are plotted using the pointwise method of Quesenberry & Hale (1980) J. Statist. Comput. Simul. 11:41-53 5 | #The method proceeds from noting that the kth order statistic from a sample of n i.i.d. U(0,1) statistics has a Beta(k,n+1-k) distribution. 6 | #Arguments: 7 | #x the data vector to be plotted 8 | #alpha the alpha level for the concentration band (if plotted) 9 | #datatype "pvalue" (default) indicates x contains p-values. "chisq" indicates x contains chi-square values. "stdnorm" indicates x contains z values. 10 | #scaletype "pvalue" (default) indicates x- and y-axis scale to be in -log10(p-value) units. "quantile" indicates x- and y-axis scale to be in quantile units (=chisq units for pvalues). Note if datatype="stdnorm" then scaletype is forced ="quantile" 11 | #df degrees of freedom for chi-square scale used in Q-Q plot. 
Default=1 (as this is the most common test type) 12 | #plot.concentration.band Flag to indicate whether concentration band is to be plotted. Default=TRUE. 13 | #one.sided Flab to indicate if one-sided (upper) or two-sided concentration band required. Default=FALSE 14 | #frac=1 Fraction of total data to be plotted. E.g. set frac=0.1 to plot only the top 10% of data points 15 | #iplot If set, a vector of indices for which ordered quantiles (O vector - largest values first) to be plotted. 16 | # e.g. set iplot=c( (1:1e4), sort(sample((1e4:length(x)),1e4)) ) to select all of 1st 10k values + random set of remaining 10k values 17 | # Note if iplot is set, then frac is forced=1 18 | #print If set, a dataframe of O and E values are returned. Default=FALSE 19 | #xat If set, a vector seting x tick positions. For p-values, sets 10^x positions 20 | #yat If set, a vector seting y tick positions. For p-values, sets 10^y positions 21 | # ... other graphical parameters to be passed to plot function 22 | #Returns (if print==TRUE): 23 | #Dataframe with two columns: $O=sorted observed values, $E=sorted expected values 24 | #e.g. 25 | #p = pnorm(c(rnorm(1e4),rnorm(10)-5)) #mixture of 'null' and 'hit' p-values 26 | #qq.plot(p) 27 | # 28 | qq.plot <- function( x, alpha=0.05, datatype="pvalue", scaletype="pvalue", df=1, plot.concentration.band=TRUE, one.sided=FALSE, frac=1, iplot=NULL, print=FALSE, xat=NULL, yat=NULL, main=NULL, xlab=NULL, ylab=NULL, pch="x", cex=0.5, col="black", ... ) 29 | { 30 | pname <- paste(deparse(substitute(x), 500), collapse="\n") #Name of vector passed as "x" to be used in plot title etc. 
31 | if (!is.null(iplot)) frac=1 #Forces frac=1 if "iplot" used to chose points to plot 32 | #Some validity checks on x 33 | if (!is.numeric(x)) 34 | stop("'x' must be numeric") 35 | nmissing = sum(is.na(x)) 36 | x <- x[ !is.na(x) ] #To deal with missing data values (these don't get plotted) 37 | if ((datatype=="pvalue")&((min(x)<0)|(max(x)>1))) 38 | stop("'x' must be >=0 and <=1") 39 | if ((datatype=="chisq")&(min(x)<0)) 40 | stop("'x' must be >=0") 41 | nzero = sum(x==0) 42 | #Some warnings on missing values (and, if pvalues, on x=0) 43 | if (nmissing>0) 44 | warning(nmissing, " missing values (excluded)") 45 | if ((nzero>0)&(datatype=="pvalue")) { 46 | warning(nzero, " zero values (plotted with same value as lowest non-zero p-value)") 47 | x[x==0] <- min(x[x>0]) 48 | } 49 | if (datatype=="stdnorm") {df=0; scaletype="ordinal"} 50 | n <- length(x) 51 | starti = floor((n-1)*(1-frac)) +1 #i for the first sorted datapoint to be plotted. 52 | lena = n-starti+1 #Number of datapoints to be plotted 53 | if (!is.null(iplot)) a2=iplot else a2=(1:lena) #indices to be plotted 54 | b <- n+1-a2 #indices used in determining concentration band 55 | #Find E and O under relevant inv. 
chisq transformation 56 | if ((df==2)&(datatype!="stdnorm")) { #short-cut for df=2 (chisq or pval data): use -2log-transformed expected U(0,1) order statistics (high values first) 57 | E <- -2*log(a2/(n+1)) 58 | if (datatype=="pvalue") O <- -2*log(sort(x)[a2]) #Note obs data no need to transform if already chisq or z value (high values first) 59 | } else { 60 | if (datatype=="stdnorm") E <- qnorm(a2/(n+1),lower.tail=FALSE) #invnorm-transformed expected U(0,1) order statistics (put high scores first) 61 | if (datatype!="stdnorm") E <- qchisq(a2/(n+1),df=df,lower.tail=FALSE) #invchisq-transformed expected U(0,1) order statistics (put high scores first) 62 | if (datatype=="pvalue") O <- qchisq(sort(x)[a2],df=df,lower.tail=FALSE) #Take lowest pvalues, transform to chisq (highest/most interesting values first) 63 | } 64 | if (datatype!="pvalue") O <- sort(x, decreasing=TRUE)[a2] #Sort x (chisq or norm), highest (most interesting) values first 65 | #Derive "pretty" tick places for log10 p-value scale, if necessary 66 | #Note that by this stage, O/E will either contain chisq-scale or normal-scale values, and both are sorted 67 | if (scaletype=="pvalue") { #Note scaletype forced="quantile" for stdnorm data, so here all data is on chisq scale 68 | if (!is.null(xat)) x4Lx=xat else x4Lx = pretty( -log10( pchisq(c(E[1],E[length(E)]),df=df,lower.tail=FALSE) ) ) 69 | if (!is.null(yat)) y4Ly=yat else y4Ly = pretty( -log10( pchisq(c(O[1],O[length(O)]),df=df,lower.tail=FALSE) ) ) 70 | xnums = qchisq(10^-x4Lx,df=df,lower.tail=FALSE) #Get same locations on actual chisq scale 71 | ynums = qchisq(10^-y4Ly,df=df,lower.tail=FALSE) #Get same locations on actual chisq scale 72 | Lx <- parse( text=paste("10^-",x4Lx,sep="") ) 73 | Ly <- parse( text=paste("10^-",y4Ly,sep="") ) 74 | } else { #"Else" covers both chisq and stdnorm-scaled data 75 | if (!is.null(xat)) xnums=xat else xnums=pretty(c(E[1],E[length(E)])) 76 | if (!is.null(yat)) ynums=yat else ynums=pretty(c(O[1],O[length(O)])) 77 | Lx 
<- parse( text=as.character(xnums) ) 78 | Ly <- parse( text=as.character(ynums) ) 79 | } 80 | #Do Q-Q plot 81 | if (is.null(main)) { 82 | if (datatype=="stdnorm") main=paste("Q-Q plot (on stdnorm) of " ,pname, sep="") 83 | else main=paste("Q-Q plot (on chisq[",df,"]) of " ,pname, sep="") 84 | } 85 | if (is.null(xlab)) { 86 | if (scaletype=="pvalue") xlab="Expected p-value" 87 | else xlab="Expected quantile" 88 | } 89 | if (is.null(ylab)) { 90 | if (scaletype=="pvalue") ylab="Observed p-value" 91 | else ylab="Observed quantile" 92 | } 93 | plot( c(E[1],E[length(E)]), c(O[1],O[length(O)]), main = main, xlab = xlab, ylab = ylab, type = "n", xaxt = "n", yaxt = "n", ... ) #Just plots the outside box 94 | axis(1, at=xnums, labels=Lx ) 95 | axis(2, at=ynums, labels=Ly ) 96 | if (plot.concentration.band==TRUE) { #Note that conc band won't draw if x has too many datapoints 97 | if (one.sided==FALSE) { 98 | upper <- qbeta( 1-alpha/2, a2, b ) #Exp. upper CL for 'a'th U(0,1) order statistic (becomes 'lower') 99 | lower <- qbeta( alpha/2, a2, b ) #Exp. lower CL for 'a'th U(0,1) order statistic (becomes 'upper') 100 | } else { 101 | upper <- rep(1,length(E)) #Exp. upper CL for 'a'th U(0,1) order statistic (becomes 'lower') 102 | lower <- qbeta( alpha, a2, b ) #Exp. 
lower CL for 'a'th U(0,1) order statistic (becomes 'upper') 103 | } 104 | if (df==2) { 105 | polygon( c( E, rev(E) ), c( -2*log(upper), rev(E) ), col="grey", border = NA ) #'lower' band after trans 106 | polygon( c( E, rev(E) ), c( -2*log(lower), rev(E) ), col="grey", border = NA ) #'upper' band after trans 107 | } else { 108 | if (datatype=="stdnorm") { 109 | polygon( c( E, rev(E) ), c( qnorm(upper,lower.tail=FALSE), rev(E) ), col="grey", border = NA ) 110 | polygon( c( E, rev(E) ), c( qnorm(lower,lower.tail=FALSE), rev(E) ), col="grey", border = NA ) 111 | } else { 112 | polygon( c( E, rev(E) ), c( qchisq(upper,df=df,lower.tail=FALSE), rev(E) ), col="grey", border = NA ) #'lower' band 113 | polygon( c( E, rev(E) ), c( qchisq(lower,df=df,lower.tail=FALSE), rev(E) ), col="grey", border = NA ) #'upper' band 114 | } 115 | } 116 | } 117 | abline( 0, 1, col="red" ) #plot 1:1 line 118 | abline(h=ynums, v=xnums, col="lightgray", lty="dotted") #plot grid 119 | points( E, O, pch=pch, cex=cex, col=col ) #Finally, plot points 120 | if (print==TRUE) return( data.frame( O=O, E=E ) ) 121 | } 122 | --------------------------------------------------------------------------------