"""Packaging script for the ``mendelianerror`` module."""
import io

from setuptools import setup

# Read the long description through a context manager so the file handle is
# closed deterministically, and decode it explicitly as UTF-8 so that
# installation does not depend on the platform's default locale encoding.
with io.open('README.md', encoding='utf-8') as readme:
    long_description = readme.read()

setup(version='0.0.3',
      name='mendelianerror',
      py_modules=['mendelianerror'],
      description="probability of mendelian error in trios",
      # installs a `mendelianerror` command that dispatches to
      # mendelianerror._main
      entry_points={
          'console_scripts': ['mendelianerror = mendelianerror:_main']},
      long_description=long_description,
      author="Brent Pedersen",
      author_email="bpederse@gmail.com",
      zip_safe=False,
      classifiers=[
          'Development Status :: 4 - Beta',
          'Intended Audience :: Science/Research',
          'License :: OSI Approved :: MIT License',
          'Topic :: Scientific/Engineering :: Bio-Informatics'])
this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | mendelian-error 2 | =============== 3 | 4 | This package attempts to assign a probability to a mendelian error event in a trio. 5 | For example, given a mother with genotype 'C/C' and a father with genotype 'C/C' 6 | a child with genotype 'C/T' will be a "mendelian error", in this case, a candidate 7 | *de novo* mutation. 8 | 9 | We can filter candidates based on their genotype likelihoods. For example if the 10 | child had a genotype likelihood of -2,-1,-20 then we are likely to consider this 11 | a genotyping error because the homozygous reference ("C/C" with GL -2) is fairly 12 | close to the GL for het (with GL -1). If the genotype likelihood field was 13 | -20,0,-20, then the call is confidently het. We can use the genotype likelihoods 14 | to assign a probability: 15 | 16 | ```Python 17 | 18 | >>> from mendelianerror import mendelian_error 19 | # everyone is homref. this should have a low probability of an error: 20 | >>> father = mother = child = [-0.1, -8.0, -8.0] 21 | >>> mendelian_error(mother, father, child) 22 | 7.55...e-08 23 | 24 | 25 | # parents are hom, child is het. 
"""
Calculate the probability of a mendelian error given the genotype likelihoods
from a trio.

The module exposes :func:`mendelian_error` for use as a library and
:func:`main` / :func:`_main` for annotating a multi-sample VCF from the
command line.
"""

import gzip
import sys
from math import log10

# Public API.  The CLI-only helpers (`test`, `_main`) are deliberately left out
# so that `from mendelianerror import *` does not pull in a function named
# `test` that exits the interpreter when called.
__all__ = ["nan", "LowGenotypeException", "rescale", "mendelian_error",
           "xopen", "main"]

nan = float('nan')

# Meta-information lines declaring the two INFO keys that `main` appends.
# NOTE(review): the header bodies were empty ("##INFO=") in the source this was
# recovered from; they are reconstructed here from the keys written below
# (MEP, MER) -- confirm the Description wording against the upstream project.
_INFO_HEADERS = (
    '##INFO=<ID=MEP,Number=1,Type=Float,'
    'Description="probability of mendelian error">\n',
    '##INFO=<ID=MER,Number=1,Type=Float,'
    'Description="log10 odds of mendelian error">\n',
)


class LowGenotypeException(Exception):
    """Raised by `rescale` when the likelihoods sum to (almost) zero."""


def rescale(li):
    """Return `li` divided by its sum, so the entries add up to 1.

    Raises `LowGenotypeException` when the sum is below 1e-40, i.e. when
    there is too little probability mass to normalise.
    """
    s = float(sum(li))
    if s < 1e-40:
        raise LowGenotypeException(
            "sum of genotype likelihoods (%g) is below 1e-40" % s)
    return [v / s for v in li]


def mendelian_error(mother, father, child, pls=False):
    """
    Return the probability of a mendelian error given the log10 genotype
    likelihoods. A large value indicates a high probability of a mendelian
    error. Low values mean that the genotype-likelihoods indicate enough
    uncertainty that it could be a genotyping error.

    `mother`, `father` and `child` are each a sequence of 3 values ordered
    (hom_ref, het, hom_alt). With `pls=True` the values are interpreted as
    phred-scaled likelihoods and divided by -10 before use.

    Returns a float in [0, 1], or None when any input does not have exactly
    3 values (a warning is written to stderr) or when the likelihoods of a
    sample are too small to normalise.


    # everyone is het:
    >>> het = (-2.0, -0.1, -2.0)
    >>> mendelian_error(het, het, het)
    0.047...

    # parents are hom, child is het.
    >>> father = mother = [-0.6, -2.5, -2.5]
    >>> child = [-2.5, -0.6, -2.5]
    >>> mendelian_error(mother, father, child)
    0.987...

    # same as above, but more certainty in the called genotypes:
    >>> child[1] = 1e-6
    >>> mother[0] = father[0] = 1e-6
    >>> mendelian_error(mother, father, child)
    0.996...

    # everyone is confidently homozygous alt
    >>> child = father = mother = [-11.0, -11.0, -0.1]
    >>> mendelian_error(mother, father, child)
    7.55...e-11

    # everyone is less confidently homozygous refs:
    >>> child = father = mother = [-0.1, -2.0, -2.0]
    >>> mendelian_error(mother, father, child)
    0.071...

    # mother and father are homozygous alts
    >>> mother = father = [-3.0, -3.0, -0.1]

    # child is het
    >>> child = [-3., -0.1, -3.]
    >>> mendelian_error(mother, father, child)
    0.998...

    # but when the hom-alt call is close...
    >>> child = [-3., -0.1, -0.15]
    >>> mendelian_error(mother, father, child)
    0.53...

    # mother is hom_ref, dad is het, child is hom_alt
    >>> mother, father, child = (-0.1, -2, -2), (-2, -0.1, -2), (-2, -2, -0.1)
    >>> mendelian_error(mother, father, child)
    0.976...

    # mother is hom_ref, dad is hom_alt, child is hom_ref
    >>> mother, father, child = (-0.1, -2.5, -2.5), (-2.5, -2.5, -0.1), (-0.1, -2.5, -2.5)
    >>> mendelian_error(mother, father, child)
    0.993...

    # same, but child is hom_alt
    >>> mendelian_error(mother, father, (-5, -5, -0.01))
    0.994...

    # child should be het:
    >>> mendelian_error(mother, father, (-3, 0, -3))
    0.75...

    # NOTE: does oddish things if all have very low, equal values.
    >>> mendelian_error([-16.2, -16.2, -16.2], [-14.4, -15.0, -22.6], [-24.9, -21.2, -20.9])
    0.8629...

    >>> mendelian_error([-15.5, -15.8, -19.7], [-11.8, -9.9, -22.9], [-69.7, -55.9, -58.3])

    >>> mendelian_error([-3.4, -0, -2.9], [-0, -1.8, -23.0], [-6.7, 0.0, -10.7])
    0.742...

    >>> mendelian_error([34, 0, 29], [0, 18, 23], [67, 0, 107], pls=True)
    0.74...

    """
    if len(mother) != 3 or len(father) != 3 or len(child) != 3:
        sys.stderr.write("WARNING: found a multi-allelic site. run VCF through vt decompose\n")
        return None

    if pls:
        mother = [m / -10.0 for m in mother]
        father = [f / -10.0 for f in father]
        child = [c / -10.0 for c in child]
    try:
        # back from log10 space, then normalise each sample to sum to 1
        M = rescale([10.**m for m in mother])
        F = rescale([10.**f for f in father])
        C = rescale([10.**c for c in child])
    except LowGenotypeException:
        return None

    # by ref, and alt, we mean hom_ref, hom_alt
    p_two_ref = M[0] * F[0]
    p_two_het = M[1] * F[1]
    p_two_alt = M[2] * F[2]

    # only 1 of the parents is ...
    p_one_ref = (M[0] + F[0]) / 2 - p_two_ref
    p_one_het = (M[1] + F[1]) / 2 - p_two_het
    p_one_alt = (M[2] + F[2]) / 2 - p_two_alt
    # divide by 2 because parents independent.

    # all options covered because, e.g. p(two_ref) == p(zero_alt)
    total = sum((p_one_ref, p_one_het, p_one_alt,
                 p_two_ref, p_two_het, p_two_alt))
    assert abs(total - 1) < 1e-4, abs(total - 1)

    ##################
    # Non-violations #
    ##################
    # a. everyone is reference
    a = p_two_ref * C[0]
    # b. everyone is hom alt
    b = p_two_alt * C[2]
    # c. 1 het and 1 ref parent. child matches
    c = p_one_het * p_one_ref * (C[0] + C[1])
    # d. 1 het and 1 alt parent. child matches
    d = p_one_het * p_one_alt * (C[1] + C[2])
    # e. both parents hets. (child can be anything)
    e = p_two_het
    # f. one hom ref, one hom alt. child is het
    f = p_one_ref * p_one_alt * C[1]

    p_not_error = a + b + c + d + e + f
    # Floating-point rounding can push the difference marginally outside
    # [0, 1]; clamp so callers can safely take log-odds of the result.
    return min(1.0, max(0.0, 1.0 - p_not_error))


def xopen(f):
    """Open `f` for reading text: "-" is stdin, "*.gz" is gzip-decompressed."""
    if f == "-":
        return sys.stdin
    if f.endswith(".gz"):
        # text mode, so iteration yields str (not bytes) like the other branches
        return gzip.open(f, "rt")
    return open(f)


def _parse_gls(fmt, samples):
    """Return per-sample log10 likelihoods, read from PL if present else GL.

    Raises ValueError/IndexError when the field is absent or not numeric.
    """
    if "PL" in fmt:
        gli = fmt.index("PL")
        return [[int(p) / -10. for p in s[gli].split(",")] for s in samples]
    gli = fmt.index("GL")
    return [[float(g) for g in s[gli].split(",")] for s in samples]


def _shrink(gl):
    """Divide `gl` by 10 repeatedly until its sum is no longer below -50."""
    # -inf can never be brought above the threshold by dividing, so it is
    # excluded from the loop condition to guarantee termination.
    while float("-inf") < sum(gl) < -50:
        gl = [v / 10. for v in gl]
    return gl


def main(fh, father, mother, child, cutoff=None, out=None):
    """Annotate the VCF lines in `fh` with MEP/MER INFO fields.

    `father`, `mother` and `child` are sample names from the #CHROM header.
    Records whose error probability is below `cutoff` (or could not be
    computed) are dropped; with `cutoff=None` nothing is filtered. Records
    lacking usable PL/GL values are echoed unchanged only when `cutoff == 1`.
    Output is written to `out` (default: sys.stdout).

    Raises ValueError if a requested sample is not in the header or if a
    record appears before the #CHROM line.
    """
    if out is None:
        out = sys.stdout
    idxs = None

    for line in fh:
        if line.startswith("##"):
            out.write(line)
            continue
        if line.startswith("#CHROM"):
            out.writelines(_INFO_HEADERS)
            sample_names = line.rstrip().split("\t")[9:]
            try:
                idxs = [9 + sample_names.index(s)
                        for s in (father, mother, child)]
            except ValueError:
                raise ValueError("samples %r, %r, %r must all be present in "
                                 "the VCF header" % (father, mother, child))
            out.write(line)
            continue

        if idxs is None:
            raise ValueError("VCF record found before the #CHROM header line")

        fields = line.rstrip().split("\t")
        samples = [fields[i].split(":") for i in idxs]
        fmt = fields[8].split(":")
        try:
            gls = _parse_gls(fmt, samples)
        except (IndexError, ValueError):  # not info for at least 1 sample
            if cutoff == 1:
                out.write(line)
            continue

        gls = [_shrink(gl) for gl in gls]
        # gls is ordered (father, mother, child); the computation treats the
        # two parents symmetrically, so the parent order does not matter.
        p = mendelian_error(gls[0], gls[1], gls[2])
        # Explicit form of the Python 2 ordering rules the original relied
        # on: None compared lower than any number.
        if cutoff is not None and (p is None or p < cutoff):
            continue

        if p is None:
            mep = mer = nan
        else:
            mep = p
            if p == 1:
                mer = 100
            elif p == 0:
                # NOTE(review): 0 is also the log-odds of p == 0.5; a large
                # negative sentinel may have been intended -- confirm.
                mer = 0
            else:
                mer = log10(p / (1.0 - p))

        annotation = "MEP=%.8g;MER=%.8g" % (mep, mer)
        # "." (or empty) marks a missing INFO column; replace it rather than
        # appending to it.
        if fields[7] in ("", "."):
            fields[7] = annotation
        else:
            fields[7] += ";" + annotation
        out.write("\t".join(fields) + "\n")


def test(simulate=False, n=100000):
    """Self-test hook behind the `test` command-line argument.

    By default this exits the interpreter immediately. With `simulate=True`
    it evaluates `n` random trios, prints the extreme cases and, if pylab is
    importable, shows a histogram of the probabilities.
    """
    if not simulate:
        sys.exit()

    from random import randint

    def gen3():
        return [randint(-70, 1) / 10. for _ in range(3)]

    ps = []
    for _ in range(n):
        a, b, c = gen3(), gen3(), gen3()
        ps.append(mendelian_error(a, b, c))
        trio = " ".join(str(tuple(gl)) for gl in (a, b, c))
        if ps[-1] > 0.999999:
            sys.stdout.write("mendelian error: %s\n" % trio)
        elif ps[-1] < 0.00001:
            sys.stdout.write("expected : %s\n" % trio)
    try:
        import pylab as pl
        pl.hist(ps, 50)
        pl.show()
    except ImportError:
        pass


def _main():
    """Console entry point: parse arguments and annotate the given VCF."""
    if len(sys.argv) > 1 and sys.argv[1] == "test":
        sys.exit(test())
    import argparse
    a = argparse.ArgumentParser()
    a.add_argument("vcf")
    a.add_argument("father_id", help="sample name of father in the vcf header")
    a.add_argument("mother_id", help="sample name of mother in the vcf header")
    a.add_argument("kid_id", help="sample name of kid in the vcf header")
    # NOTE(review): with the default of 1.0 only records with probability
    # exactly 1 (plus records without PL/GL) are printed -- confirm intended.
    a.add_argument("--cutoff", type=float, default=1.0,
                   help="don't print variants with a mendelian error "
                        "probability below this")

    p = a.parse_args()

    fh = xopen(p.vcf)
    try:
        main(fh, p.father_id, p.mother_id, p.kid_id, p.cutoff)
    finally:
        # never close the interpreter's own stdin
        if fh is not sys.stdin:
            fh.close()


if __name__ == "__main__":
    import doctest
    sys.stderr.write(str(doctest.testmod(
        optionflags=(doctest.NORMALIZE_WHITESPACE | doctest.ELLIPSIS |
                     doctest.REPORT_ONLY_FIRST_FAILURE),
        verbose=0)) + "\n")
    _main()