├── AFRsubset_fortutorial.txt ├── BetaScan.py ├── BetaScan_python2.py └── README.md /AFRsubset_fortutorial.txt: -------------------------------------------------------------------------------- 1 | NA18489 2 | NA18504 3 | NA18511 4 | NA18516 5 | NA18523 6 | NA18486 7 | NA18498 8 | NA18501 9 | NA18520 10 | NA18934 11 | -------------------------------------------------------------------------------- /BetaScan.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import numpy as np 3 | import argparse 4 | import math 5 | 6 | 7 | def find_win_indx(prev_start_i, prev_end_i, snp_i, data_list, win_size): 8 | """Takes in the previous indices of the start_ing and end of the window, 9 | then returns the appropriate start_ing and ending index for the next SNP 10 | 11 | Parameters: 12 | prev_start_i: start_ing index in the array of SNP for the previous core SNP's window, inclusive 13 | prev_end_i: ending index in the array for the previous SNP's window, inclusive 14 | snp_i, the index in the array for the current SNP under consideration 15 | data_list: the numpy array of all SNP locations & frequencies 16 | """ 17 | 18 | loc_snp = data_list[snp_i, 0] # the coordinates of the core SNP 19 | win_start = loc_snp - win_size/2 20 | 21 | # array index of start of window, inclusive 22 | firstI = prev_start_i + np.searchsorted(data_list[prev_start_i:, 0], win_start, side='left') 23 | winEnd = loc_snp + win_size/2 24 | 25 | # array index of end of window, exclusive 26 | endI = prev_end_i - 1 + np.searchsorted(data_list[prev_end_i:, 0], winEnd, side='right') 27 | return (firstI, endI) 28 | 29 | 30 | def calc_beta_folded(snp_freq_list, core_freq, num_ind, p): 31 | """Calculates the value of the folded beta statistic 32 | 33 | Parameters: 34 | SNPFreq: freq of SNP under consideration, ranges from 1 to sample size 35 | core_freq: freq of coresite, ranges from 0 to 1 36 | p: the p parameter specificying sharpness of peak 37 | num_ind: the number of haploid individuals used to calculate frequency of core site 38 | """ 39 | 40 | if snp_freq_list.size == 0: 41 | return 0 42 | a1 = np.sum(1./np.arange(1, num_ind)) 43 | thetaW = len(snp_freq_list[:, 0])/a1 44 | thetaBNum = np.sum(calc_d(snp_freq_list[:, 0]/snp_freq_list[:, 1], core_freq, p)) 45 | 46 | i = np.arange(1, num_ind) 47 | thetaBDenom = np.sum((1./i)*calc_d(i/float(num_ind), core_freq, p)) 48 | 49 | thetaB = thetaBNum/thetaBDenom 50 | return thetaB - thetaW 51 | 52 | 53 | def calc_beta_unfolded(snp_freq_list, core_freq, num_ind, p): 54 | """Calculates the unfolded version of Beta from Siewert and Voight 55 | For use when the ancestral and derived alleles can be confidently called 56 | 57 | Parameters: 58 | snp_freq_list: a list of frequencies, one for each SNP in the window, 59 | first column ranges from 1 to number of individuals, second columns is # individuals 60 | core_freq: the frequency of the core SNP, must range from 0 to 1, exclusive 61 | num_ind: number of individuals used to calculate the core site frequency 62 | p: value of parameter p 63 | """ 64 | if snp_freq_list.size == 0: 65 | return 0 66 | a1 = np.sum(1./np.arange(1, num_ind)) 67 | thetaW = len(snp_freq_list[:, 0])/a1 68 | thetaBNum = sum(calc_d(snp_freq_list[:, 0]/snp_freq_list[:, 1], core_freq, p) * snp_freq_list[:, 0]) 69 | thetaBDenom = np.sum(calc_d(np.arange(1, num_ind)/float(num_ind), core_freq, p)) 70 | thetaB = thetaBNum/thetaBDenom 71 | return thetaB - thetaW 72 | 73 | 74 | def calc_thetabeta_unfolded(snp_freq_list, core_freq, num_ind, p): 75 | """Calculates theta_Beta usign the unfolded SFS 76 | 77 | Parameters: 78 | SNPFreq: freq of SNP under consideration, ranges from 1 to sample size 79 | core_freq: freq of coresite, ranges from 0 to 1 80 | p: the p parameter specificying sharpness of peak 81 | num_ind: the number of haploid individuals used to calculate frequency of core site 82 | """ 83 | 84 | if snp_freq_list.size == 0: 85 | return 0 86 | thetaBNum = np.sum(calc_d(snp_freq_list[:, 0]/snp_freq_list[:, 1], core_freq, p) * snp_freq_list[:, 0]) 87 | 88 | thetaBDenom = np.sum(calc_d(np.arange(1, num_ind)/float(num_ind), core_freq, p)) 89 | 90 | thetaB = thetaBNum/thetaBDenom 91 | return thetaB 92 | 93 | 94 | def calc_thetabeta_folded(snp_freq_list, core_freq, num_ind, p): 95 | """Calculates theta_Beta using the folded SFS 96 | 97 | Parameters: 98 | SNPFreq: freq of SNP under consideration, ranges from 1 to sample size 99 | core_freq: freq of coresite, ranges from 0 to 1 100 | p: the p parameter specificying sharpness of peak 101 | num_ind: the number of haploid individuals used to calculate frequency of core site 102 | """ 103 | 104 | if snp_freq_list.size == 0: 105 | return 0 106 | thetaBNum = np.sum(calc_d(snp_freq_list[:, 0]/snp_freq_list[:, 1], core_freq, p)) 107 | 108 | thetaBDenom = np.sum((1./np.arange(1, num_ind))*calc_d(np.arange(1, num_ind) / float(num_ind), core_freq, p)) 109 | 110 | thetaB = thetaBNum/thetaBDenom 111 | return thetaB 112 | 113 | 114 | def calc_thetaw_unfolded(snp_freq_list, num_ind): 115 | """Calculates watterson's theta 116 | 117 | Parameters: 118 | snp_freq_list: a list of frequencies, one for each SNP in the window, 119 | first column ranges from 1 to number of individuals, second columns is # individuals 120 | num_ind: number of individuals used to calculate the core site frequency 121 | """ 122 | if snp_freq_list.size == 0: 123 | return 0 124 | 125 | a1 = np.sum(1./np.arange(1, num_ind)) 126 | 127 | thetaW = len(snp_freq_list[:, 0])/a1 128 | return thetaW 129 | 130 | 131 | def calc_theta_d(snp_freq_list, c, n): 132 | """ 133 | Calculates theta_D 134 | 135 | Parameters: 136 | c: Speciation time in coalescent units 137 | n: Sample Size 138 | """ 139 | if snp_freq_list.size == 0: 140 | return 0 141 | 142 | S = np.where(snp_freq_list[:, 0] == snp_freq_list[:, 1])[0].shape[0] 143 | return S/(c+1./n) 144 | 145 | 146 | def calc_beta_2(snp_freq_list, c, n, core_freq, p): 147 | SNPs = snp_freq_list[np.where(snp_freq_list[:, 0] != snp_freq_list[:, 1])] 148 | return calc_thetabeta_unfolded(SNPs, core_freq, n, p) - calc_theta_d(snp_freq_list, c, n) 149 | 150 | 151 | def calc_var_theta_d(c, n, theta): 152 | """Calculates the variance of Theta_S 153 | 154 | Parameters: 155 | c: Speciation time in coalescent units 156 | n: Sample Size 157 | theta: genome-wide estimate of the mutation rate 158 | """ 159 | i = np.arange(2, n+1) 160 | x = np.sum(1./(i**2.*(i-1)**2.)) 161 | return (1./(c+1./n))**2.*(theta**2.+c*theta+theta/n+theta**2.*x) 162 | 163 | 164 | def calc_t_b2(snp_freq_list, core_freq, c, n, p, theta, var_dic): 165 | ''' 166 | 167 | Parameters: 168 | core_freq: freq of SNP under consideration, ranges from 1 to sample size 169 | n: sample size of core SNP 170 | p: the p parameter specifying sharpness of peak 171 | theta: genome-wide estimate of the mutation rate 172 | ''' 173 | notSubsList_noCore = snp_freq_list[np.where(snp_freq_list[:, 0] != snp_freq_list[:, 1])] 174 | thetaB = calc_thetabeta_unfolded(notSubsList_noCore, core_freq/n, n, p) 175 | thetasubs = calc_theta_d(snp_freq_list, c, n) 176 | if not (n, core_freq, theta) in var_dic: 177 | VarD = calc_var_theta_d(c, n, theta) 178 | VarB = calc_var_theta(n, theta, core_freq, p, False) 179 | denom = math.sqrt(VarD+VarB) 180 | var_dic[(n, core_freq, theta)] = denom 181 | else: 182 | denom = var_dic[(n, core_freq, theta)] 183 | return (thetaB-thetasubs)/denom 184 | 185 | 186 | def calc_d(freq, x, p): 187 | """Calculates the value of d, the similarity measure 188 | 189 | Parameters: 190 | freq: freq of SNP under consideration, ranges from 0 to 1 191 | x: freq of coresite, ranges from 0 to 1 192 | p: the p parameter specifying sharpness of peak 193 | """ 194 | xf = min(x, 1.-x) 195 | f = np.minimum(freq, 1.-freq) 196 | maxdiff = np.maximum(xf, .5-xf) 197 | corr = ((maxdiff-np.abs(xf-f))/maxdiff)**p 198 | return corr 199 | 200 | 201 | def calc_t_unfolded(snp_freq_list, core_freq, snp_n, p, theta, var_dic): 202 | """ 203 | Using equation 8 from Achaz 2009 204 | 205 | Parameters: 206 | core_freq: freq of SNP under consideration, ranges from 1 to sample size 207 | snp_n: sample size of core SNP 208 | p: the p parameter specifying sharpness of peak 209 | theta: genome-wide estimate of the mutation rate 210 | """ 211 | 212 | x = float(core_freq)/snp_n 213 | 214 | num = np.sum(snp_freq_list[:, 0]/snp_freq_list[:, 1]*snp_n*omegai(snp_freq_list[:, 0]/snp_freq_list[:, 1], 215 | snp_n, x, p)) 216 | if not (snp_n, core_freq, theta) in var_dic: 217 | denom = math.sqrt(an(snp_n, x, p) * theta + Bn(snp_n, x, p) * theta**2.) 218 | var_dic[(snp_n, core_freq, theta)] = denom 219 | else: 220 | denom = var_dic[(snp_n, core_freq, theta)] 221 | return num/denom 222 | 223 | 224 | def calc_var_theta(n, theta, core_freq, p, wattersons): 225 | """ 226 | Calculates variance of a given estimator of theta, eq 7 from Achaz. 227 | 228 | Parameters: 229 | core_freq: freq of SNP under consideration, ranges from 1 to sample size 230 | n: sample size of core SNP 231 | p: the p parameter specifying sharpness of peak 232 | theta: genome-wide estimate of the mutation rate 233 | wattersons: whether to calculate wattersons theta instead of 234 | """ 235 | wVector = None 236 | if wattersons: 237 | wVector = 1./np.arange(1, n) 238 | else: 239 | wVector = calc_d(np.arange(1, n)/float(n), float(core_freq)/n, p) 240 | t1 = np.sum(wVector)**(-2.) 241 | t2 = theta*np.sum(wVector**2. * np.arange(1, n)) 242 | 243 | i = np.arange(1, n) 244 | s1 = np.sum(wVector**2*i**2*sigma(n, np.column_stack([i, i]))) 245 | 246 | coords = np.asarray([(j, i) for i in range(1, n) for j in range(i+1, n)]) 247 | iind = np.asarray([i-1 for i in range(1, n) for j in range(i+1, n)]) 248 | jind = np.asarray([j-1 for i in range(1, n) for j in range(i+1, n)]) 249 | 250 | s2 = np.sum(coords[:, 0] * coords[:, 1] * wVector[iind] * wVector[jind] * sigma(n, coords)) 251 | 252 | t3 = theta**2.*(s1+2.*s2) 253 | return t1*(t2+t3) 254 | 255 | 256 | def calc_var_theta_fold(n, theta, core_freq, p): 257 | """ 258 | Parameters: 259 | core_freq: freq of SNP under consideration, ranges from 1 to sample size 260 | n: sample size of core SNP 261 | p: the p parameter specifying sharpness of peak 262 | theta: genome-wide estimate of the mutation rate 263 | """ 264 | 265 | wVector = calc_d(np.arange(1, int(n/2)+1)/float(n), float(core_freq)/n, p) 266 | r = np.arange(1, int(n/2)+1) 267 | t1 = sum(wVector*(1./r+1./(n-r)) * 1./(1+(r == n-r)))**-2. 268 | t2 = sum([wVector[i-1]**2.*(phi(n, i)*theta+rho_p_ii(n, i)*theta**2.) for i in range(1, int(n/2)+1)]) 269 | 270 | coords = np.asarray([(j, i) for i in range(1, int(n/2)+1) for j in range(1, i)]) 271 | t3 = np.sum(wVector[coords[:, 0]-1]*wVector[coords[:, 1]-1] * rho_p_ij(n, coords[:, 0], coords[:, 1]) * theta**2.) 272 | 273 | return t1*(t2+2.*t3) 274 | 275 | 276 | def calc_cov_folded(n, theta, core_freq, p): 277 | """ 278 | Parameters: 279 | core_freq: freq of SNP under consideration, ranges from 1 to sample size 280 | n: sample size of core SNP 281 | p: the p parameter specifying sharpness of peak 282 | theta: genome-wide estimate of the mutation rate 283 | """ 284 | r = np.arange(1, int(n/2)+1) 285 | wVector = calc_d(r/float(n), float(core_freq)/n, p) 286 | t1 = 1./sum(wVector*(1./r+1./(n-r))*1./(1.+(r == n-r))) 287 | t2 = 1./sum((1./r+1./(n-r))*1./(1+(r == n-r))) 288 | coords = np.asarray([(i, j) for i in range(1, int(n/2)+1) for j in range(1, int(n/2)+1)]) 289 | t3 = np.sum(wVector[coords[:, 0]-1]*rho_p_ij(n, coords[:, 0], coords[:, 1]) * theta**2.) 290 | return t1*t2*t3 291 | 292 | 293 | def calc_var_folded_beta(n, theta, core_freq, p): 294 | """ 295 | Parameters: 296 | n: sample size of core SNP 297 | theta: genome-wide estimate of the mutation rate 298 | core_freq: freq of SNP under consideration, ranges from 1 to sample size 299 | p: the p parameter specifying sharpness of peak 300 | """ 301 | return calc_var_theta_fold(n, theta, core_freq, p) + calc_var_theta(n, theta, core_freq, p, True) - \ 302 | 2. * calc_cov_folded(n, theta, core_freq, p) 303 | 304 | 305 | def omegai(i, snp_n, x, p): 306 | """Calculates 9a 307 | 308 | Parameters: 309 | i:freq of SNP under consideration, ranges between 0 and 1 310 | snp_n: number of chromosomes used to calculate frequency of core SNP 311 | x: freq of coresite, ranges from 0 to 1 312 | p: the p parameter specifying sharpness of peak 313 | """ 314 | n1num = calc_d(i, x, p) 315 | n1denom = np.sum(calc_d(np.arange(1., snp_n)/snp_n, x, p)) 316 | n1 = n1num/n1denom 317 | n2 = (1./(i*snp_n)) / (np.sum(1./np.arange(1., snp_n))) 318 | return n1 - n2 319 | 320 | 321 | def phi(n, i): 322 | """ 323 | Calculates equation 12a of Achaz 324 | 325 | Parameters: 326 | n:sample size 327 | i: frequency of SNP, in number of individuals 328 | """ 329 | return n/((1.+(i == n-i)) * i * (n-i)) 330 | 331 | 332 | def rho_p_ii(n, i): 333 | """ 334 | Calculates equation 12b of Achaz 335 | 336 | Parameters: 337 | n:sample size 338 | i: frequency of SNP, in number of individuals 339 | """ 340 | return (sigma(n, np.column_stack([i, i]))+sigma(n, np.column_stack([n-i, n-i]))+2. 341 | * sigma(n, np.column_stack([i, n-i]))) / (1.+(i == (n-i)))**2. 342 | 343 | 344 | def rho_p_ij(n, i, j): 345 | """ 346 | Calculates equation 12c of Achaz 347 | 348 | Parameters: 349 | n:sample size 350 | i: frequency of SNP, in number of individuals 351 | j: second frequency 352 | """ 353 | return (sigma(n, np.column_stack([i, j]))+sigma(n, np.column_stack([i, n-j])) + sigma(n, np.column_stack([n-i, j])) 354 | + sigma(n, np.column_stack([n-i, n-j]))) / ((1.+(i == n-i)) * (1. + (j == n-j))) 355 | 356 | 357 | def an(snp_n, x, p): 358 | """ 359 | Calculates alpha_n from Achaz 2009, eq 9b 360 | 361 | snp_n: Sample size 362 | x: frequency, ranges from 0 to 1 363 | p: value of p parameter 364 | """ 365 | i = np.arange(1, snp_n) 366 | return np.sum(i * omegai(i/float(snp_n), snp_n, x, p)**2.) 367 | 368 | 369 | def Bn(snp_n, x, p): 370 | ''' 371 | Returns Beta_N from Achaz 2009, eq 9c 372 | 373 | Parameters: 374 | snp_n: Sample size 375 | x: frequency, ranges from 0 to 1 376 | p: value of p parameter 377 | ''' 378 | 379 | i = np.arange(1, snp_n) 380 | n1 = np.sum(i**2.*omegai(i/float(snp_n), snp_n, x, p)**2.*sigma(snp_n, np.column_stack([i, i]))) 381 | 382 | coords = np.asarray([(j, i) for i in range(1, snp_n) for j in range(1, i)]) 383 | s2 = np.sum(coords[:, 0]*coords[:, 1]*omegai(coords[:, 0]/float(snp_n), snp_n, x, p) 384 | * omegai(coords[:, 1]/float(snp_n), snp_n, x, p)*sigma(snp_n, coords)) 385 | 386 | n2 = 2.*s2 387 | return n1+n2 388 | 389 | 390 | def calc_t_fold(snp_freq_list, core_freq, snp_n, p, theta, var_dic): 391 | """ 392 | 393 | Parameters: 394 | core_freq: freq of SNP under consideration, ranges from 1 to sample size 395 | snp_n: sample size of core SNP 396 | p: the p parameter specifying sharpness of peak 397 | theta: genome-wide estimate of the mutation rate 398 | """ 399 | 400 | x = float(core_freq)/snp_n 401 | num = calc_beta_folded(snp_freq_list, x, snp_n, p) 402 | if not (snp_n, core_freq, theta) in var_dic: 403 | denom = math.sqrt(calc_var_folded_beta(snp_n, theta, core_freq, p)) 404 | var_dic[(snp_n, core_freq, theta)] = denom 405 | else: 406 | denom = var_dic[(snp_n, core_freq, theta)] 407 | return num/denom 408 | 409 | 410 | def sigma(n, ij): 411 | """ 412 | Returns sigma from eq 2 or 3 in Fu 1995 413 | 414 | Parameters: 415 | n: sample size 416 | ij: 2-d array of integers with 2 cols and no rows 417 | """ 418 | np.seterr(all='raise') 419 | res = np.zeros(ij.shape[0]) 420 | # i must be greater than j 421 | ij[:, 0], ij[:, 1] = ij.max(axis=1), ij.min(axis=1) 422 | ci = np.logical_and(ij[:, 0] == ij[:, 1], ij[:, 0] == n/2) 423 | 424 | # Using eq 2 425 | if np.any(ci) > 0: 426 | res[ci] = 2.*((fu_an_vec([n]) - fu_an_vec(ij[ci, 0]))/(float(n)-ij[ci, 0]))-(1./(ij[ci, 0]**2.)) 427 | 428 | ci = np.logical_and(ij[:, 0] == ij[:, 1], ij[:, 0] < n/2) 429 | if np.any(ci) > 0: 430 | res[ci] = fu_Bn(n, ij[ci, 0]+1) 431 | 432 | ci = np.logical_and(ij[:, 0] == ij[:, 1], ij[:, 0] > n/2) 433 | if np.any(ci) > 0: 434 | res[ci] = fu_Bn(n, ij[ci, 0])-1./(ij[ci, 0]**2.) 435 | 436 | # using eq 3 437 | ci = np.logical_and(ij[:, 0] > ij[:, 1], ij[:, 0]+ij[:, 1] == n) 438 | if np.any(ci) > 0: 439 | res[ci] = (fu_an_vec([n])-fu_an_vec(ij[ci, 0]))/(n-ij[ci, 0]) + \ 440 | (fu_an_vec([n])-fu_an_vec(ij[ci, 1]))/(n-ij[ci, 1]) 441 | - (fu_Bn(n, ij[ci, 0]) + fu_Bn(n, ij[ci, 1]+1))/2. - 1./(ij[ci, 0]*ij[ci, 1]) 442 | 443 | ci = np.logical_and(ij[:, 0] > ij[:, 1], ij[:, 0]+ij[:, 1] < n) 444 | if np.any(ci) > 0: 445 | res[ci] = (fu_Bn(n, ij[ci, 0]+1)-fu_Bn(n, ij[ci, 0]))/2. 446 | 447 | ci = np.logical_and(ij[:, 0] > ij[:, 1], ij[:, 0]+ij[:, 1] > n) 448 | if np.any(ci) > 0: 449 | res[ci] = (fu_Bn(n, ij[ci, 1])-fu_Bn(n, ij[ci, 1]+1))/2.-(1./(ij[ci, 0] * ij[ci, 1])) 450 | 451 | return res 452 | 453 | 454 | def fu_an_vec(n): 455 | """Calculates a_n from Fu 1995, eq 4""" 456 | a = np.insert(np.cumsum(1./np.arange(1, np.amax(n))), 0, 0) 457 | return a[np.asarray(n)-1] # minus one for sum being only to n-1 458 | 459 | 460 | def fu_Bn(n, i): 461 | """Calculates Beta_n(i) from Fu 1995, eq 5""" 462 | 463 | r = 2.0 * n/((n-i+1.)*(n-i)) * (fu_an_vec([n+1])-fu_an_vec(i)) - (2./(n-i)) 464 | return r 465 | 466 | 467 | def find_local_theta(theta_map, start_i, coordinate): 468 | """ 469 | Given a numpy array of mutation rates finds the theta corresponding to the window that coordinate is in. 470 | Starts searching at the prior window index to save time 471 | """ 472 | for i in range(start_i, theta_map.shape[0]): 473 | if coordinate < theta_map[i, 1] and coordinate >= theta_map[i, 0]: 474 | return (theta_map[i, 2], i) 475 | print(sys.exit("Error: Coordinate " + str(coordinate)+" is found in the SNP input file, but is not in any \ 476 | of the windows in the theta_map file.")) 477 | 478 | 479 | def main(): 480 | 481 | # Loads the input parameters given by the user 482 | parser = argparse.ArgumentParser() 483 | parser.add_argument("-i", help="Name of input file with all SNPs", type=str, required=True) 484 | parser.add_argument("-o", help="Output file", type=str, default="/dev/stdout") 485 | parser.add_argument("-w", help="Maximum Window Size (in bp) to calculate Beta in for a single test SNP", type=int, 486 | default=1000) 487 | parser.add_argument("-onewin", help="Calculate Beta on window which uses all SNPs in input file instead of using \ 488 | distance-based window", default=False, action="store_true") 489 | parser.add_argument("-p", help="Power to raise difference measure by", type=int, default=2) 490 | parser.add_argument("-fold", help="Use folded SFS version", action="store_true") 491 | parser.add_argument("-B2", help="Use the Beta2 statistic. Substiution data with an outgroup must be provided.", 492 | action="store_true") 493 | parser.add_argument("-m", help="Minimum folded core SNP frequency, exclusive. Must be between 0 and 0.5.", 494 | type=float, default=0) 495 | parser.add_argument("-std", help="Instead of returning Beta value, return normalized Beta Statistic", default=False, 496 | action="store_true") 497 | parser.add_argument("-theta", help="Estimated genome wide theta value per basepair. Used for calculation of \ 498 | variance. It's equal to 2*l*N_e*u, where u is the locus neutral mutation rate, Ne is the \ 499 | effective population size and l is the ploidy", type=float) 500 | parser.add_argument("-theta_map", help="Filename of map of mutation rates. This file should contain estimated\ 501 | mutation rates in windows across the genomic area you are applying Beta on.", type=str) 502 | parser.add_argument("-thetaPerSNP", help="Filename of map of mutation rates. This file should contain estimated\ 503 | mutation rates around each SNP. This file should be two columns: position and estimated theta\ 504 | rate.", type=str) 505 | parser.add_argument("-DivTime", help="Divergence time, in coalescent units, between the two species. Only needed\ 506 | if using B^(2). This can be estimated using the BALLET software, or you can use prior \ 507 | estimates for your species of interest. In practice, this value affects power very little, \ 508 | but will affect the standardized statistic. To convert from generations (g) to coalescent \ 509 | units (c), the formula is g=c*Ne*2 where Ne is the effective population size.", type=float) 510 | 511 | args = parser.parse_args() 512 | output = open(args.o, 'w') 513 | 514 | # Check for valid file format and parameters 515 | try: 516 | SNPs = np.loadtxt(args.i, dtype=float) 517 | except IOError: 518 | print(sys.exit("Error: Input file cannot be found")) 519 | except: 520 | print(sys.exit("Error: Input file in wrong format")) 521 | if args.m < 0 or args.m > .5: 522 | print(sys.exit("Error: Parameter m must be between 0 and 0.5.")) 523 | if args.p <= 0: 524 | print(sys.exit("Error: Parameter p must be positive.")) 525 | if len(SNPs.shape) <= 1: 526 | print(sys.exit("Error: Because the core SNP is excluded from calculations, there must be at least two SNPs in\ 527 | the input file.")) 528 | if args.std and args.theta is None and args.theta_map is None and args.thetaPerSNP is None: 529 | print(sys.exit("Error: In order to normalize Beta statistics, a theta value must be provided using the -theta\ 530 | or -theta_map flags.")) 531 | if args.onewin and (args.theta_map is not None or args.thetaPerSNP is not None): 532 | print(sys.exit("Error: onewin and theta_map options are not compatible. onewin clculates the mutation rate in\ 533 | the given window of arbitrary size")) 534 | if args.w < 2: 535 | print(sys.exit("Error: Window size must be 2 bp or above. However, you probably want to use a window size much\ 536 | larger than 2.")) 537 | if args.std and args.theta_map is None and args.theta <= 0 and args.thetaPerSNP is None: 538 | print(sys.exit("Error: You must provide an estimate of theta (population-scaled mutation rate) and it must be a\ 539 | positive value.")) 540 | if args.p > 50: 541 | print(sys.exit("Error: P is too large. Reduce value to prevent python numerical errors. See manual for more \ 542 | information.")) 543 | if args.fold and args.B2: 544 | print(sys.exit("Error: You cannot use both B1* (folded Beta) and B2. B1* is for when you have no outgroup, \ 545 | and B2 is for when you can call substiutions with an outgroup. See manual for guidance about \ 546 | which to use.")) 547 | if args.DivTime is not None and args.DivTime > 1000: 548 | print(sys.exit("Error: Your divergence time seems very high. Divergence time should be in coalescent units,\ 549 | not generations or years.")) 550 | if args.B2 and not np.any(SNPs[:, 1] == SNPs[:, 2]): 551 | print(sys.exit("Error: You chose to calculate Beta2, but your input file contains no substiutions. If you do \ 552 | not have substiution data, please use Beta1 or Beta1*.")) 553 | if args.B2 and args.DivTime is None: 554 | print(sys.exit("You must provide a divergence time using the -DivTime flag to use B2")) 555 | if args.theta_map is not None and args.thetaPerSNP is not None: 556 | print(sys.exit("You can use -theta_map or -thetaPerSNP but not both.")) 557 | 558 | if args.onewin: 559 | if args.fold: 560 | output.write("Position\tBeta1*_std\n") 561 | elif args.B2: 562 | output.write("Position\tBeta2_std\n") 563 | else: 564 | output.write("Position\tBeta1_std\n") 565 | elif not args.std and args.fold: 566 | output.write("Position\tBeta1*\n") 567 | elif args.std and args.fold: 568 | output.write("Position\tBeta1*\tBeta1*_std\n") 569 | elif args.std and not args.B2: 570 | output.write("Position\tBeta1\tBeta1_std\n") 571 | elif not args.B2: 572 | output.write("Position\tBeta1\n") 573 | elif args.B2 and not args.std: 574 | output.write("Position\tBeta2\n") 575 | else: 576 | output.write("Position\tBeta2\tBeta2_std\n") 577 | 578 | if not args.B2 and np.any(SNPs[:, 1] == SNPs[:, 2]): 579 | SNPs = SNPs[(SNPs[:, 1] != SNPs[:, 2]) & (SNPs[:, 1] != 0)] 580 | 581 | prev_start_i = 0 582 | prev_end_i = 0 583 | var_dic = {} # records variance calculations so don't need to be recalculated 584 | theta_map = None 585 | if args.theta_map is not None: 586 | theta_map = np.loadtxt(args.theta_map, dtype=float) 587 | elif args.thetaPerSNP is not None: 588 | theta_map = np.loadtxt(args.thetaPerSNP, dtype=float) 589 | 590 | curr_theta_i = 0 591 | 592 | if args.onewin: 593 | theta = calc_thetaw_unfolded(SNPs[:, 1:], int(np.mean(SNPs[:, 2]))) 594 | for snp_i in range(len(SNPs)): 595 | loc = SNPs[snp_i, 0] 596 | if len(SNPs) == 1: 597 | T = 0 598 | output.write(str(loc)+"\t"+str(round(T, 6))+"\n") 599 | break 600 | 601 | freqCount = float(SNPs[snp_i, 1]) 602 | sample_n = int(SNPs[snp_i, 2]) 603 | freq = freqCount/sample_n 604 | SNPSet = np.delete(SNPs, snp_i, axis=0)[:, 1:] 605 | if int(freqCount) != sample_n and freq < 1.0-args.m and freq > args.m and sample_n > 3: 606 | if args.fold: 607 | T = calc_t_fold(SNPSet, freqCount, sample_n, args.p, theta, var_dic) 608 | elif args.B2: 609 | T = calc_t_b2(SNPSet, freqCount, args.DivTime, sample_n, args.p, theta, var_dic) 610 | else: 611 | T = calc_t_unfolded(SNPSet, freqCount, sample_n, args.p, theta, var_dic) 612 | output.write(str(loc)+"\t"+str(round(T, 6))+"\n") 613 | elif freq > 1.0 or freq < 0: 614 | print(sys.exit("Error: Input file contains SNP of invalid frequency on line "+str(snp_i)+".")) 615 | elif freq < 1.0-args.m and freq > args.m and sample_n <= 3: 616 | print(sys.exit("Error: Sample size must be greater than 3 haploid individuals to make inference,\ 617 | or else theta_beta will always equal theta_watterson's. You may wish to increase\ 618 | the m paramter value to exclude this SNP from being a core SNP.")) 619 | else: 620 | for snp_i in range(len(SNPs)): 621 | loc = int(SNPs[snp_i, 0]) 622 | freqCount = float(SNPs[snp_i, 1]) 623 | sample_n = int(SNPs[snp_i, 2]) 624 | freq = freqCount/sample_n 625 | 626 | if int(freqCount) != sample_n and freq < 1.0-args.m and freq > args.m and sample_n > 3: 627 | sI, endI = find_win_indx(prev_start_i, prev_end_i, snp_i, SNPs, args.w) 628 | prev_start_i = sI 629 | prev_end_i = endI 630 | B = None 631 | T = None 632 | if endI > sI: 633 | 634 | SNPSet = np.take(SNPs, list(range(sI, snp_i))+list(range(snp_i+1, endI+1)), axis=0)[:, 1:] 635 | if args.fold: 636 | B = calc_beta_folded(SNPSet, freqCount/sample_n, sample_n, args.p) 637 | elif not args.fold and not args.B2: 638 | B = calc_beta_unfolded(SNPSet, freqCount/sample_n, sample_n, args.p) 639 | elif args.B2: 640 | B = calc_beta_2(SNPSet, args.DivTime, sample_n, freqCount/sample_n, args.p) 641 | 642 | if args.theta_map is not None or args.thetaPerSNP is not None: 643 | theta = None 644 | if args.thetaPerSNP is not None: 645 | theta = theta_map[np.where(theta_map[:, 0] == int(loc)), 1] 646 | if len(theta[0]) == 1: 647 | theta = float(theta) 648 | elif len(theta[0]) > 1: 649 | theta = float(theta[0][0]) 650 | else: 651 | print(sys.exit("SNP at location "+str(loc)+" is not in thetaPerSNP file or is found \ 652 | more than once")) 653 | else: 654 | theta, curr_theta_i = find_local_theta(theta_map, curr_theta_i, loc) 655 | if args.fold: 656 | T = calc_t_fold(SNPSet, freqCount, sample_n, args.p, theta * args.w, var_dic) 657 | elif args.B2: 658 | T = calc_t_b2(SNPSet, freqCount, args.DivTime, sample_n, args.p, theta*args.w, var_dic) 659 | else: 660 | T = calc_t_unfolded(SNPSet, freqCount, sample_n, args.p, theta*args.w, var_dic) 661 | elif args.std: 662 | if args.fold: 663 | T = calc_t_fold(SNPSet, freqCount, sample_n, args.p, args.theta * args.w, var_dic) 664 | elif args.B2: 665 | T = calc_t_b2(SNPSet, freqCount, args.DivTime, sample_n, args.p, args.theta*args.w, var_dic) 666 | else: 667 | T = calc_t_unfolded(SNPSet, freqCount, sample_n, args.p, args.theta * args.w, var_dic) 668 | 669 | if endI == sI: 670 | B = 0 671 | T = 0 672 | if not args.std: 673 | output.write(str(loc)+"\t"+str(round(B, 6))+"\n") # Remove thetas 674 | else: 675 | output.write(str(loc)+"\t"+str(round(B, 6))+"\t"+str(round(T, 6))+"\n") 676 | elif freq > 1.0 or freq < 0: 677 | print(sys.exit("Error: Input file contains SNP of invalid frequency on line "+str(snp_i)+".")) 678 | elif freq < 1.0-args.m and freq > args.m and sample_n <= 3: 679 | print(sys.exit("Error: Sample size must be greater than 3 haploid individuals to make inference, \ 680 | or else theta_beta will always equal theta_watterson's. You may wish to increase the \ 681 | m paramter value to exclude this SNP from being a core SNP.")) 682 | 683 | 684 | if __name__ == "__main__": 685 | main() 686 | -------------------------------------------------------------------------------- /BetaScan_python2.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import numpy as np 3 | from StringIO import StringIO 4 | import argparse 5 | import math 6 | import os 7 | 8 | 9 | def find_win_indx(prevStarti, prevEndi, SNPi, dataList, winSize): 10 | """Takes in the previous indices of the starting and end of the window, 11 | then returns the appropriate starting and ending index for the next SNP 12 | 13 | Parameters 14 | prevStarti: the starting index in the array of SNP for the previous core SNP's window, inclusive 15 | prevEndi: the ending index in the array for the previous SNP's window, inclusive 16 | SNPi, the index in the array for the current SNP under consideration 17 | dataList: the numpy array of all SNP locations & frequencies 18 | """ 19 | 20 | locSNP = dataList[SNPi,0] #the coordinates of the core SNP 21 | winStart = locSNP-winSize/2 22 | firstI= prevStarti + np.searchsorted(dataList[prevStarti:,0],winStart,side='left') #array index of start of window, inclusive 23 | winEnd = locSNP + winSize/2 24 | endI = prevEndi - 1 + np.searchsorted(dataList[prevEndi:,0],winEnd,side='right') #array index of end of window, exclusive 25 | return (firstI,endI) 26 | 27 | 28 | def calc_beta_folded(SNPFreqList, coreFreq, numInd,p): 29 | """Calculates the value of d, the similarity measure, times i, the frequency from Siewert et al. 30 | #SNPFreq: freq of SNP under consideration, ranges from 1 to sample size 31 | #coreFreq: freq of coresite, ranges from 0 to 1 32 | #p: the p parameter specificying sharpness of peak 33 | #numInd: the number of haploid individuals used to calculate frequency of core site 34 | """ 35 | 36 | if SNPFreqList.size==0: 37 | return 0 38 | a1 = np.sum(1./np.arange(1,numInd)) 39 | thetaW = len(SNPFreqList[:,0])/a1 40 | thetaBNum = np.sum(calcD(SNPFreqList[:,0]/SNPFreqList[:,1],coreFreq,p)) 41 | 42 | i = np.arange(1,numInd) 43 | thetaBDenom = np.sum((1./i)*calcD(i/float(numInd),coreFreq,p)) 44 | 45 | thetaB = thetaBNum/thetaBDenom 46 | return thetaB - thetaW 47 | 48 | 49 | 50 | def calc_beta_unfolded(SNPFreqList, coreFreq, numInd,p): 51 | """Calculates the unfolded version of Beta from Siewert and Voight 52 | For use when the ancestral and derived alleles can be confidently called 53 | 54 | Parameters: 55 | SNPFreqList: a list of frequencies, one for each SNP in the window, 56 | first column ranges from 1 to number of individuals, second columns is # individuals 57 | coreFreq: the frequency of the core SNP, must range from 0 to 1, exclusive 58 | numInd: number of individuals used to calculate the core site frequency 59 | p: value of parameter p 60 | """ 61 | if SNPFreqList.size==0: 62 | return 0 63 | a1 = np.sum(1./np.arange(1,numInd)) 64 | thetaW = len(SNPFreqList[:,0])/a1 65 | thetaBNum = sum(calcD(SNPFreqList[:,0]/SNPFreqList[:,1],coreFreq,p)*SNPFreqList[:,0]) 66 | thetaBDenom = np.sum(calcD(np.arange(1,numInd)/float(numInd),coreFreq,p)) 67 | thetaB = thetaBNum/thetaBDenom 68 | return thetaB - thetaW 69 | 70 | 71 | def calc_thetabeta_unfolded(SNPFreqList, coreFreq, numInd,p): 72 | """Calculates theta_Beta usign the unfolded SFS 73 | #SNPFreq: freq of SNP under consideration, ranges from 1 to sample size 74 | #coreFreq: freq of coresite, ranges from 0 to 1 75 | #p: the p parameter specificying sharpness of peak 76 | #numInd: the number of haploid individuals used to calculate frequency of core site 77 | """ 78 | 79 | if SNPFreqList.size==0: 80 | return 0 81 | thetaBNum = np.sum(calcD(SNPFreqList[:,0]/SNPFreqList[:,1],coreFreq,p)*SNPFreqList[:,0]) 82 | 83 | thetaBDenom = np.sum(calcD(np.arange(1,numInd)/float(numInd),coreFreq,p)) 84 | 85 | thetaB = thetaBNum/thetaBDenom 86 | return thetaB 87 | 88 | 89 | def calc_thetabeta_folded(SNPFreqList, coreFreq, numInd,p): 90 | """Calculates theta_Beta using the folded SFS 91 | #SNPFreq: freq of SNP under consideration, ranges from 1 to sample size 92 | #coreFreq: freq of coresite, ranges from 0 to 1 93 | #p: the p parameter specificying sharpness of peak 94 | #numInd: the number of haploid individuals used to calculate frequency of core site 95 | """ 96 | 97 | if SNPFreqList.size==0: 98 | return 0 99 | thetaBNum = np.sum(calcD(SNPFreqList[:,0]/SNPFreqList[:,1],coreFreq,p)) 100 | 101 | thetaBDenom = np.sum((1./np.arange(1,numInd))*calcD(np.arange(1,numInd)/float(numInd),coreFreq,p)) 102 | 103 | thetaB = thetaBNum/thetaBDenom 104 | return thetaB 105 | 106 | 107 | 108 | def calc_thetaw_unfolded(SNPFreqList, numInd): 109 | """Calculates watterson's theta 110 | 111 | Parameters: 112 | SNPFreqList: a list of frequencies, one for each SNP in the window, 113 | first column ranges from 1 to number of individuals, second columns is # individuals 114 | numInd: number of individuals used to calculate the core site frequency 115 | """ 116 | if SNPFreqList.size==0: 117 | return 0 118 | 119 | a1 = np.sum(1./np.arange(1,numInd)) 120 | 121 | thetaW = len(SNPFreqList[:,0])/a1 122 | return thetaW 123 | 124 | 125 | #Calculates theta_D 126 | def calcThetaD(SNPFreqList,c,n): 127 | """ 128 | c: Speciation time in coalescent units 129 | n: Sample Size 130 | """ 131 | if SNPFreqList.size==0: 132 | return 0 133 | 134 | S = np.where(SNPFreqList[:,0]==SNPFreqList[:,1])[0].shape[0] 135 | return S/(c+1./n) 136 | 137 | 138 | def calcBeta2(SNPFreqList,c,n,coreFreq,p): 139 | SNPs = SNPFreqList[np.where(SNPFreqList[:,0]!=SNPFreqList[:,1])] 140 | return calc_thetabeta_unfolded(SNPs,coreFreq,n,p)-calcThetaD(SNPFreqList,c,n) 141 | 142 | 143 | #Calculates the variance of Theta_S 144 | def calcVarThetaD(c,n,theta): 145 | i = np.arange(2,n+1) 146 | x = np.sum(1./(i**2.*(i-1)**2.)) 147 | return (1./(c+1./n))**2.*(theta**2.+c*theta+theta/n+theta**2.*x) 148 | 149 | 150 | 151 | def calcT_B2(SNPFreqList,coreFreq,c,n,p,theta,varDic): 152 | ''' 153 | #coreFreq: freq of SNP under consideration, ranges from 1 to sample size 154 | #n: sample size of core SNP 155 | #p: the p parameter specifying sharpness of peak 156 | #theta: genome-wide estimate of the mutation rate 157 | ''' 158 | notSubsList_noCore = SNPFreqList[np.where(SNPFreqList[:,0]!=SNPFreqList[:,1])] 159 | thetaB = calc_thetabeta_unfolded(notSubsList_noCore,coreFreq/n,n,p) 160 | thetasubs = calcThetaD(SNPFreqList,c,n) 161 | if not (n,coreFreq,theta) in varDic: 162 | VarD = calcVarThetaD(c,n,theta) 163 | VarB = calcVTheta(n,theta,coreFreq,p,False) 164 | denom = math.sqrt(VarD+VarB) 165 | varDic[(n,coreFreq,theta)] = denom 166 | else: 167 | denom = varDic[(n,coreFreq,theta)] 168 | return (thetaB-thetasubs)/denom 169 | 170 | 171 | 172 | def calcD(freq,x,p): 173 | """Calculates the value of d, the similarity measure 174 | #freq: freq of SNP under consideration, ranges from 0 to 1 175 | #x: freq of coresite, ranges from 0 to 1 176 | #p: the p parameter specifying sharpness of peak 177 | """ 178 | xf = min(x,1.-x) 179 | f = np.minimum(freq,1.-freq) 180 | maxdiff = np.maximum(xf,.5-xf) 181 | corr = ((maxdiff-np.abs(xf-f))/maxdiff)**p 182 | return corr 183 | 184 | 185 | #Using equation 8 from Achaz 2009 186 | def calcT_unfold(SNPFreqList, coreFreq, SNPn, p, theta,varDic): 187 | """ 188 | #coreFreq: freq of SNP under consideration, ranges from 1 to sample size 189 | #SNPn: sample size of core SNP 190 | #p: the p parameter specifying sharpness of peak 191 | #theta: genome-wide estimate of the mutation rate 192 | """ 193 | 194 | x = float(coreFreq)/SNPn 195 | 196 | num = np.sum(SNPFreqList[:,0]/SNPFreqList[:,1]*SNPn*omegai(SNPFreqList[:,0]/SNPFreqList[:,1],SNPn, x,p)) 197 | if not (SNPn,coreFreq,theta) in varDic: 198 | denom = math.sqrt(an(SNPn,x,p)*theta+ Bn(SNPn,x,p)*theta**2.) 199 | varDic[(SNPn,coreFreq,theta)] = denom 200 | else: 201 | denom = varDic[(SNPn,coreFreq,theta)] 202 | return num/denom 203 | 204 | 205 | #Calculates variance of a given estimator of theta, eq 7 from Achaz. 206 | def calcVTheta(n,theta,coreFreq,p,wattersons): 207 | """ 208 | #coreFreq: freq of SNP under consideration, ranges from 1 to sample size 209 | #n: sample size of core SNP 210 | #p: the p parameter specifying sharpness of peak 211 | #theta: genome-wide estimate of the mutation rate 212 | #Wattersons: whether to calculate wattersons theta instead of 213 | """ 214 | wVector = None 215 | if wattersons==True: 216 | wVector = 1./np.arange(1,n) 217 | else: 218 | wVector = calcD(np.arange(1,n)/float(n),float(coreFreq)/n,p) 219 | t1 = np.sum(wVector)**(-2.) 220 | t2 = theta*np.sum(wVector**2.*np.arange(1,n)) 221 | 222 | i = np.arange(1,n) 223 | s1 = np.sum(wVector**2*i**2*sigma(n,np.column_stack([i,i]))) 224 | 225 | 226 | coords = np.asarray([(j,i) for i in range(1,n) for j in range(i+1,n)]) 227 | iind = np.asarray([i-1 for i in range(1,n) for j in range(i+1,n)]) 228 | jind = np.asarray([j-1 for i in range(1,n) for j in range(i+1,n)]) 229 | 230 | s2 = np.sum(coords[:,0]*coords[:,1]*wVector[iind]*wVector[jind]*sigma(n,coords)) 231 | 232 | t3 = theta**2.*(s1+2.*s2) 233 | return t1*(t2+t3) 234 | 235 | 236 | def calcVTheta_fold(n,theta,coreFreq,p): 237 | """ 238 | #coreFreq: freq of SNP under consideration, ranges from 1 to sample size 239 | #n: sample size of core SNP 240 | #p: the p parameter specifying sharpness of peak 241 | #theta: genome-wide estimate of the mutation rate 242 | #Wattersons: whether to calculate wattersons theta instead of 243 | """ 244 | 245 | wVector = calcD(np.arange(1,n/2+1)/float(n),float(coreFreq)/n,p) 246 | r = np.arange(1,n/2+1) 247 | t1 = sum(wVector*(1./r+1./(n-r))*1./(1+(r==n-r)))**-2. 248 | t2 = sum([wVector[i-1]**2.*(phi(n,i)*theta+rho_p_ii(n,i)*theta**2.) for i in range(1,n/2+1)]) 249 | 250 | coords = np.asarray([(j,i) for i in range(1,n/2+1) for j in range(1,i)]) 251 | t3 = np.sum(wVector[coords[:,0]-1]*wVector[coords[:,1]-1]*rho_p_ij(n,coords[:,0],coords[:,1])*theta**2.) 252 | 253 | return t1*(t2+2.*t3) 254 | 255 | 256 | def calcCovFolded(n,theta,coreFreq,p): 257 | """ 258 | #coreFreq: freq of SNP under consideration, ranges from 1 to sample size 259 | #n: sample size of core SNP 260 | #p: the p parameter specifying sharpness of peak 261 | #theta: genome-wide estimate of the mutation rate 262 | """ 263 | r = np.arange(1,n/2+1) 264 | wVector = calcD(r/float(n),float(coreFreq)/n,p) 265 | t1 = 1./sum(wVector*(1./r+1./(n-r))*1./(1.+(r==n-r))) 266 | t2 = 1./sum((1./r+1./(n-r))*1./(1+(r==n-r))) 267 | coords = np.asarray([(i,j) for i in range(1,n/2+1) for j in range(1,n/2+1)]) 268 | t3 = np.sum(wVector[coords[:,0]-1]*rho_p_ij(n,coords[:,0],coords[:,1])*theta**2.) 269 | return t1*t2*t3 270 | 271 | 272 | def calcVarFoldedBeta(n,theta,coreFreq,p): 273 | """ 274 | #coreFreq: freq of SNP under consideration, ranges from 1 to sample size 275 | #n: sample size of core SNP 276 | #p: the p parameter specifying sharpness of peak 277 | #theta: genome-wide estimate of the mutation rate 278 | #Wattersons: whether to calculate wattersons theta instead of 279 | """ 280 | return calcVTheta_fold(n,theta,coreFreq,p)+calcVTheta(n,theta,coreFreq,p,True)-2.*calcCovFolded(n,theta,coreFreq,p) 281 | 282 | 283 | def omegai(i,SNPn,x,p): 284 | """Calculates 9a 285 | #i:freq of SNP under consideration, ranges between 0 and 1 286 | #SNPn: number of chromosomes used to calculate frequency of core SNP 287 | #x: freq of coresite, ranges from 0 to 1 288 | #p: the p parameter specifying sharpness of peak 289 | """ 290 | n1num = calcD(i,x,p) 291 | n1denom = np.sum(calcD(np.arange(1.,SNPn)/SNPn,x,p)) 292 | n1 = n1num/n1denom 293 | n2 = (1./(i*SNPn)) /(np.sum(1./np.arange(1.,SNPn))) 294 | return n1 - n2 295 | 296 | 297 | #Eq 12a of Achaz 298 | def phi(n,i): 299 | #n:sample size 300 | #i: frequency of SNP, in number of individuals 301 | return n/((1.+(i==n-i))*i*(n-i)) 302 | 303 | #eq 12b of Achaz 304 | def rho_p_ii(n,i): 305 | #n:sample size 306 | #i: frequency of SNP, in number of individuals 307 | return (sigma(n,np.column_stack([i,i]))+sigma(n,np.column_stack([n-i,n-i]))+2.*sigma(n,np.column_stack([i,n-i])))/(1.+(i==(n-i)))**2. 308 | 309 | 310 | #eq 12c of Achaz 311 | def rho_p_ij(n,i,j): 312 | 313 | return (sigma(n,np.column_stack([i,j]))+sigma(n,np.column_stack([i,n-j]))+sigma(n,np.column_stack([n-i,j]))+sigma(n,np.column_stack([n-i,n-j])))/((1.+(i==n-i))*(1.+(j==n-j))) 314 | 315 | 316 | #Returns alpha_n from Achaz 2009, eq 9b 317 | def an(SNPn,x,p): 318 | ''' 319 | SNPn: Sample size 320 | x: frequency, ranges from 0 to 1 321 | p: value of p parameter 322 | ''' 323 | i=np.arange(1,SNPn) 324 | return np.sum(i*omegai(i/float(SNPn),SNPn,x,p)**2.) 325 | 326 | 327 | #Returns Beta_N from Achaz 2009, eq 9c 328 | def Bn(SNPn,x,p): 329 | ''' 330 | SNPn: Sample size 331 | x: frequency, ranges from 0 to 1 332 | p: value of p parameter 333 | ''' 334 | 335 | i = np.arange(1,SNPn) 336 | n1 = np.sum(i**2.*omegai(i/float(SNPn),SNPn,x,p)**2.*sigma(SNPn,np.column_stack([i,i]))) 337 | 338 | 339 | coords = np.asarray([(j,i) for i in range(1,SNPn) for j in range(1,i)]) 340 | s2 = np.sum(coords[:,0]*coords[:,1]*omegai(coords[:,0]/float(SNPn),SNPn,x,p)*omegai(coords[:,1]/float(SNPn),SNPn,x,p)*sigma(SNPn,coords)) 341 | 342 | n2=2.*s2 343 | return n1+n2 344 | 345 | 346 | def calcT_fold(SNPFreqList, coreFreq, SNPn, p, theta, varDic): 347 | """ 348 | #coreFreq: freq of SNP under consideration, ranges from 1 to sample size 349 | #SNPn: sample size of core SNP 350 | #p: the p parameter specifying sharpness of peak 351 | #theta: genome-wide estimate of the mutation rate 352 | """ 353 | 354 | x = float(coreFreq)/SNPn 355 | num = calc_beta_folded(SNPFreqList, x, SNPn,p) 356 | if not (SNPn,coreFreq,theta) in varDic: 357 | denom = math.sqrt(calcVarFoldedBeta(SNPn,theta,coreFreq,p)) 358 | varDic[(SNPn,coreFreq,theta)] = denom 359 | else: 360 | denom = varDic[(SNPn,coreFreq,theta)] 361 | return num/denom 362 | 363 | 364 | #Returns sigma from eq 2 or 3 in Fu 1995 365 | def sigma(n,ij): 366 | ''' 367 | n: sample size 368 | ij: 2-d array of integers with 2 cols and no rows 369 | ''' 370 | np.seterr(all='raise') 371 | res = np.zeros(ij.shape[0]) 372 | #i must be greater than j 373 | 374 | ij[:,0],ij[:,1] = ij.max(axis=1),ij.min(axis=1) #flip coordinates if i is less than j 375 | ci = np.logical_and(ij[:,0]==ij[:,1], ij[:,0]==n/2) 376 | 377 | 378 | #Using eq 2 379 | if np.any(ci)>0: 380 | res[ci] = 2.*((Fu_an_vec([n])-Fu_an_vec(ij[ci,0]))/(float(n)-ij[ci,0]))-(1./(ij[ci,0]**2.)) 381 | 382 | ci = np.logical_and(ij[:,0]==ij[:,1], ij[:,0]0: 384 | res[ci] = Fu_Bn(n,ij[ci,0]+1) 385 | 386 | #below is line causing issue 387 | ci = np.logical_and(ij[:,0]==ij[:,1], ij[:,0]>n/2) 388 | 389 | if np.any(ci)>0: 390 | res[ci] = Fu_Bn(n,ij[ci,0])-1./(ij[ci,0]**2.) 391 | 392 | 393 | 394 | #using eq 3 395 | ci = np.logical_and(ij[:,0]>ij[:,1], ij[:,0]+ij[:,1]==n) 396 | if np.any(ci)>0: 397 | res[ci] = (Fu_an_vec([n])-Fu_an_vec(ij[ci,0]))/(n-ij[ci,0]) + (Fu_an_vec([n])-Fu_an_vec(ij[ci,1]))/(n-ij[ci,1]) - (Fu_Bn(n,ij[ci,0])+Fu_Bn(n,ij[ci,1]+1))/2. - 1./(ij[ci,0]*ij[ci,1]) 398 | 399 | ci = np.logical_and(ij[:,0]>ij[:,1], ij[:,0]+ij[:,1]0: 401 | res[ci] = (Fu_Bn(n,ij[ci,0]+1)-Fu_Bn(n,ij[ci,0]))/2. 402 | 403 | ci = np.logical_and(ij[:,0]>ij[:,1], ij[:,0]+ij[:,1]>n) 404 | if np.any(ci)>0: 405 | res[ci] = (Fu_Bn(n,ij[ci,1])-Fu_Bn(n,ij[ci,1]+1))/2.-(1./(ij[ci,0]*ij[ci,1])) 406 | 407 | return res 408 | 409 | 410 | #return a_n from Fu 1995, eq 4 411 | def Fu_an_vec(n): 412 | a = np.insert(np.cumsum(1./np.arange(1,np.amax(n))),0,0) 413 | return a[np.asarray(n)-1] #minus one for sum being only to n-1 414 | 415 | 416 | #returns Beta_n(i) from Fu 1995, eq 5 417 | def Fu_Bn(n,i): 418 | r = 2.0*n/((n-i+1.)*(n-i)) * (Fu_an_vec([n+1])-Fu_an_vec(i)) - (2./(n-i)) 419 | 420 | return r 421 | 422 | 423 | #Given a numpy array of mutation rates finds the theta corresponding to the window that coordinate is in. 424 | #Starts searching at the prior window index to save time 425 | def findLocalTheta(thetaMap,startI,coordinate): 426 | for i in range(startI,thetaMap.shape[0]): 427 | if coordinate=thetaMap[i,0]: 428 | return (thetaMap[i,2],i) 429 | print sys.exit("Error: Coordinate "+str(coordinate)+" is found in the SNP input file, but is not in any of the windows in the thetaMap file.") 430 | 431 | 432 | 433 | 434 | 435 | def main(): 436 | 437 | #Loads the input parameters given by the user 438 | parser = argparse.ArgumentParser() 439 | parser.add_argument("-i", help="Name of input file with all SNPs",type=str,required=True) 440 | parser.add_argument("-o", help="Output file",type=str,default="/dev/stdout") 441 | parser.add_argument("-w", help="Maximum Window Size (in bp) to calculate Beta in for a single test SNP",type=int,default=1000) 442 | parser.add_argument("-onewin",help="Calculate Beta on window which uses all SNPs in input file instead of using distance-based window",default=False,action="store_true") 443 | parser.add_argument("-p", help="Power to raise difference measure by",type=int,default=2) 444 | parser.add_argument("-fold", help="Use folded SFS version",action="store_true") 445 | parser.add_argument("-B2",help="Use the Beta2 statistic. To use this, substiution data with an outgroup is needed.",action="store_true") 446 | parser.add_argument("-m", help="Minimum folded core SNP frequency, exclusive. Must be between 0 and 0.5.",type=float,default=0) 447 | parser.add_argument("-std",help="Instead of returning Beta value, return normalized Beta Statistic",default=False,action="store_true") 448 | parser.add_argument("-theta",help="Estimated genome wide theta value per basepair. Used for calculation of variance. It's equal to 2*l*N_e*u, where u is the locus neutral mutation rate, Ne is the effective population size and l is the ploidy",type=float) 449 | parser.add_argument("-thetaMap",help="Filename of map of mutation rates. This file should contain estimated mutation rates in windows across the genomic area you are applying Beta on.",type=str) 450 | parser.add_argument("-thetaPerSNP",help="Filename of map of mutation rates. This file should contain estimated mutation rates around each SNP. This file should be two columns: position and estimated theta rate.",type=str) 451 | 452 | parser.add_argument("-DivTime",help="Divergence time, in coalescent units, between the two species. Only needed if using B^(2). This can be estimated using the BALLET software, or you can use prior estimates for your species of interest. In practice, this value affects power very little, but will affect the standardized statistic. To convert from generations (g) to coalescent units (c), the formula is g=c*Ne*2 where Ne is the effective population size.",type=float) 453 | 454 | args = parser.parse_args() 455 | output = open(args.o,'w') 456 | 457 | 458 | #Check for valid file format and parameters 459 | try: 460 | SNPs = np.loadtxt(args.i,dtype=float) 461 | except IOError: 462 | print sys.exit("Error: Input file cannot be found") 463 | except: 464 | print sys.exit("Error: Input file in wrong format") 465 | if args.m<0 or args.m>.5: 466 | print sys.exit("Error: Parameter m must be between 0 and 0.5.") 467 | if args.p<=0: 468 | print sys.exit("Error: Parameter p must be positive.") 469 | if len(SNPs.shape)<=1: 470 | print sys.exit("Error: Because the core SNP is excluded from calculations, there must be at least two SNPs in the input file.") 471 | if args.std and args.theta==None and args.thetaMap==None and args.thetaPerSNP==None: 472 | print sys.exit("Error: In order to normalize Beta statistics, a theta value must be provided using the -theta or -thetaMap flags.") 473 | if args.onewin and (args.thetaMap!=None or args.thetaPerSNP!=None): 474 | print sys.exit("Error: onewin and thetaMap options are not compatible. onewin clculates the mutation rate in the given window of arbitrary size") 475 | if args.w<2: 476 | print sys.exit("Error: Window size must be 2 bp or above. However, you probably want to use a window size much larger than 2.") 477 | if args.std and args.thetaMap==None and args.theta<=0 and args.thetaPerSNP==None: 478 | print sys.exit("Error: You must provide an estimate of theta (population-scaled mutation rate) and it must be a positive value.") 479 | if args.p>50: 480 | print sys.exit("Error: P is too large. Reduce value to prevent python numerical errors. See manual for more information.") 481 | if args.fold and args.B2: 482 | print sys.exit("Error: You cannot use both B1* (folded Beta) and B2. B1* is for when you have no outgroup, and B2 is for when you can call substiutions with an outgroup. See manual for guidance about which to use.") 483 | if args.DivTime>1000: 484 | print sys.exit("Error: Your divergence time seems very high. Divergence time should be in coalescent units, not generations or years.") 485 | if args.B2 and not np.any(SNPs[:, 1] == SNPs[:, 2]): 486 | print sys.exit("Error: You chose to calculate Beta2, but your input file contains no substiutions. If you do not have substiution data, please use Beta1 or Beta1*.") 487 | if args.B2 and args.DivTime==None: 488 | print sys.exit("You must provide a divergence time using the -DivTime flag to use B2") 489 | if args.thetaMap!=None and args.thetaPerSNP!=None: 490 | print sys.exit("You can use -thetaMap or -thetaPerSNP but not both.") 491 | if not args.std and args.fold: 492 | output.write("Position\tBeta1*\n") 493 | elif args.std and args.fold: 494 | output.write("Position\tBeta1*\tBeta1*_std\n") 495 | elif args.std and not args.B2: 496 | output.write("Position\tBeta1\tBeta1_std\n") 497 | elif not args.B2: 498 | output.write("Position\tBeta1\n") 499 | elif args.B2 and not args.std: 500 | output.write("Position\tBeta2\n") 501 | else: 502 | output.write("Position\tBeta2\tBeta2_std\n") 503 | 504 | if not args.B2 and np.any(SNPs[:, 1] == SNPs[:, 2]): 505 | SNPs = SNPs[(SNPs[:,1]!=SNPs[:,2]) & (SNPs[:,1]!=0)] 506 | 507 | prevStarti = 0 508 | prevEndi = 0 509 | varDic = {} #records variance calculations so don't need to be recalculated 510 | thetaMap = None 511 | if args.thetaMap != None: 512 | thetaMap = np.loadtxt(args.thetaMap,dtype=float) 513 | elif args.thetaPerSNP != None: 514 | thetaMap = np.loadtxt(args.thetaPerSNP,dtype=float) 515 | 516 | currThetaMapI = 0 517 | 518 | if args.onewin: 519 | theta = calc_thetaw_unfolded(SNPs[:,1:], int(np.mean(SNPs[:,2]))) 520 | for SNPi in range(len(SNPs)): 521 | loc = SNPs[SNPi,0] 522 | if len(SNPs)==1: 523 | T = 0 524 | output.write(str(loc)+"\t"+str(round(T,6))+"\n") 525 | break 526 | 527 | freqCount = float(SNPs[SNPi,1]) 528 | sampleN = int(SNPs[SNPi,2]) 529 | freq = freqCount/sampleN 530 | SNPSet = np.delete(SNPs, SNPi,axis=0)[:,1:] 531 | if int(freqCount)!=sampleN and freq<1.0-args.m and freq>args.m and sampleN>3: 532 | if args.fold: 533 | T = calcT_fold(SNPSet,freqCount,sampleN,args.p,theta,varDic) 534 | elif args.B2: 535 | T = calcT_B2(SNPSet,freqCount,args.DivTime,sampleN,args.p,theta,varDic) 536 | else: 537 | T = calcT_unfold(SNPSet,freqCount,sampleN,args.p,theta,varDic) 538 | output.write(str(loc)+"\t"+str(round(T,6))+"\n") 539 | elif freq>1.0 or freq<0: 540 | print sys.exit("Error: Input file contains SNP of invalid frequency on line "+str(SNPi)+".") 541 | elif freq<1.0-args.m and freq>args.m and sampleN<=3: 542 | print sys.exit("Error: Sample size must be greater than 3 haploid individuals to make inference, or else theta_beta will always equal theta_watterson's. You may wish to increase the m paramter value to exclude this SNP from being a core SNP.") 543 | else: 544 | for SNPi in range(len(SNPs)): 545 | loc = int(SNPs[SNPi,0]) 546 | freqCount = float(SNPs[SNPi,1]) 547 | sampleN = int(SNPs[SNPi,2]) 548 | freq = freqCount/sampleN 549 | 550 | if int(freqCount)!=sampleN and freq<1.0-args.m and freq>args.m and sampleN>3: 551 | SNPLocs = SNPs[:,0] 552 | sI,endI = find_win_indx(prevStarti, prevEndi, SNPi, SNPs, args.w) 553 | prevStarti = sI 554 | prevEndi = endI 555 | B = None 556 | ThetaB = None 557 | ThetaD = None 558 | T = None 559 | if endI>sI: 560 | SNPSet = np.take(SNPs,range(sI,SNPi)+range(SNPi+1,endI+1),axis=0)[:,1:] 561 | if args.fold: 562 | B = calc_beta_folded(SNPSet,freqCount/sampleN,sampleN,args.p) 563 | elif not args.fold and not args.B2: 564 | B = calc_beta_unfolded(SNPSet,freqCount/sampleN,sampleN,args.p) 565 | elif args.B2: 566 | B = calcBeta2(SNPSet,args.DivTime,sampleN,freqCount/sampleN,args.p) 567 | 568 | if args.thetaMap!=None or args.thetaPerSNP!=None: 569 | theta = None 570 | if args.thetaPerSNP!=None: 571 | theta = thetaMap[np.where(thetaMap[:,0]==int(loc)),1] 572 | if len(theta[0])==1: 573 | theta = float(theta) 574 | elif len(theta[0])>1: 575 | theta = float(theta[0][0]) 576 | else: 577 | print sys.exit("SNP at location "+str(loc)+" is not in thetaPerSNP file or is found more than once") 578 | else: 579 | theta,currThetaMapI = findLocalTheta(thetaMap,currThetaMapI,loc) 580 | print currThetaMapI 581 | if args.fold: 582 | T = calcT_fold(SNPSet,freqCount,sampleN,args.p,theta*args.w,varDic) 583 | elif args.B2: 584 | 585 | T = calcT_B2(SNPSet,freqCount,args.DivTime,sampleN,args.p,theta*args.w,varDic) 586 | else: 587 | T = calcT_unfold(SNPSet,freqCount,sampleN,args.p,theta*args.w,varDic) 588 | elif args.std: 589 | if args.fold: 590 | T = calcT_fold(SNPSet,freqCount,sampleN,args.p,args.theta*args.w,varDic) 591 | elif args.B2: 592 | T = calcT_B2(SNPSet,freqCount,args.DivTime,sampleN,args.p,args.theta*args.w,varDic) 593 | else: 594 | T = calcT_unfold(SNPSet,freqCount,sampleN,args.p,args.theta*args.w,varDic) 595 | 596 | if endI==sI: 597 | B=0 598 | ThetaB=0 599 | ThetaD=0 600 | T=0 601 | if not args.std: 602 | output.write(str(loc)+"\t"+str(round(B,6))+"\n") #Remove thetas 603 | else: 604 | output.write(str(loc)+"\t"+str(round(B,6))+"\t"+str(round(T,6))+"\n") 605 | elif freq>1.0 or freq<0: 606 | print sys.exit("Error: Input file contains SNP of invalid frequency on line "+str(SNPi)+".") 607 | elif freq<1.0-args.m and freq>args.m and sampleN<=3: 608 | print sys.exit("Error: Sample size must be greater than 3 haploid individuals to make inference, or else theta_beta will always equal theta_watterson's. You may wish to increase the m paramter value to exclude this SNP from being a core SNP.") 609 | 610 | 611 | 612 | if __name__ == "__main__": 613 | main() 614 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # BetaScan 2 | Welcome! BetaScan implements the β statistic to detect ancient balancing selection, as described in [Siewert & Voight, 2017](https://academic.oup.com/mbe/article/doi/10.1093/molbev/msx209/3988103/Detecting-Long-term-Balancing-Selection-using) and [Siewert & Voight, 2020](https://academic.oup.com/gbe/article/12/2/3873/5721358). For in-depth instructions, please read the [BetaScan wiki](https://github.com/ksiewert/BetaScan/wiki). 3 | 4 | Update: 11/8/22: The newest version of BetaScan now uses python3! If you want to continue using the old python2 code, you can use BetaScan_python2.py. Otherwise, use BetaScan.py 5 | 6 | ## Basic Usage 7 | To run BetaScan on an input file named SNPFreqs.txt with default parameters: 8 | ``` 9 | python BetaScan.py -i SNPFreqs.txt 10 | ``` 11 | If you have a folded site frequency spectrum, you must include the -fold flag to calculate β(1)\*. If your data includes substitutions with an outgroup, you can use the -B2 flag, which calculates β(2). However, if you use -B2 you must include an estimate of speciation time. See the [Usage page of the wiki](https://github.com/ksiewert/BetaScan/wiki/Basic-Usage) for details on how you can estimate. 12 | 13 | If you also want to standardize β by its variance, you can do so using the -std flag. This flag must be accompanied by an estimate of the mutation rate using the -theta flag. Once again, see the [Usage page of the wiki](https://github.com/ksiewert/BetaScan/wiki/Basic-Usage) for details on how you can estimate the mutation rate. 14 | 15 | ## Questions? Comments? 16 | Any feedback or questions are very welcome. You can e-mail Katie at ksiewert@hsph.harvard.edu or post a github issue. We know that programs written by other people can be difficult to use, so we’ve tried our best to make this program simple and intuitive. That being said, bioinformatics is bioinformatics, and issues will arise, so don’t hesitate to contact us! 17 | 18 | ## References 19 | The original Beta statistics are described in [Detecting Long-Term Balancing Selection Using Allele Frequency Correlation, MBE 2017](https://academic.oup.com/mbe/article/doi/10.1093/molbev/msx209/3988103/Detecting-Long-term-Balancing-Selection-using). 20 | 21 | Recent updates to BetaScan, including the β(2) statistic and standardization are now published in [BetaScan2: Standardized statistics to detect balancing selection utilizing substitution data, GBE 2020](https://academic.oup.com/gbe/advance-article/doi/10.1093/gbe/evaa013/5721358). 22 | 23 | ## 1000 Genomes Beta Scores 24 | If you would like the β(1) scores for each population in the 1000 Genomes dataset, they are available [here](https://doi.org/10.5281/zenodo.7842399). If you just want to look at the top 1% highest scoring haplotypes in each population, that data is also available [here](https://doi.org/10.5281/zenodo.7842399). These scores are based on hg19. 25 | 26 | β(2) Scores are available for the YRI, CEU and CHB populations are available [here](https://doi.org/10.5281/zenodo.7842447). These scores are also based on hg19. 27 | 28 | --------------------------------------------------------------------------------