├── phasing
    ├── __init__.py
    ├── io
    │   ├── __init__.py
    │   ├── SAMMPileUpReader.py
    │   ├── coordinate_mapper.py
    │   ├── MPileUpVariantCaller.py
    │   ├── BioReaders.py
    │   └── VariantPhaser.py
    ├── utils
    │   └── paint_bam_post_phaser.py
    └── mag_phaser.py
├── MagPhase.conda_env.yml
├── setup.py
├── LICENSE
└── README.md


/phasing/__init__.py:
--------------------------------------------------------------------------------
1 | __author__ = 'etseng@pacb.com'
2 | 


--------------------------------------------------------------------------------
/phasing/io/__init__.py:
--------------------------------------------------------------------------------
1 | __author__ = 'etseng@pacb.com'
2 | 


--------------------------------------------------------------------------------
/MagPhase.conda_env.yml:
--------------------------------------------------------------------------------
 1 | name: MagPhase.env
 2 | channels:
 3 |   - conda-forge
 4 |   - bioconda
 5 |   - anaconda
 6 |   - r
 7 |   - defaults
 8 | dependencies:
 9 |   - bcbiogff
10 |   - biopython
11 |   - bx-python
12 |   - numpy
13 |   - psutil
14 |   - pybedtools
15 |   - pysam
16 |   - python>=3.7.6
17 |   - pyvcf
18 |   - samtools
19 |   - scipy
20 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | from setuptools import setup, Extension, find_packages
 2 | import sys
 3 | 
 4 | __author__ = "etseng@pacb.com"
 5 | version = "v1.0.0"
 6 | 
 7 | setup(
 8 |     name = 'magphase',
 9 |     version=version,
10 |     author='Elizabeth Tseng',
11 |     author_email='etseng@pacb.com',
12 |     zip_safe=False,
13 |     packages = ['phasing.io'],
14 | 
15 |     install_requires=[
16 |         'biopython',
17 |         'bx-python>=0.7.3',
18 |         'scipy',
19 |         'pysam'
20 |         ],
21 |     scripts = [
22 | 			   'phasing/mag_phaser.py',
23 |                'phasing/utils/paint_bam_post_phaser.py'
24 |                ],
25 |     )
26 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | Copyright (c) 2021, Pacific Biosciences of California, Inc.
 2 |  
 3 | All rights reserved.
 4 |  
 5 | Redistribution and use in source and binary forms, with or without modification, are permitted (subject to the limitations in the disclaimer below) provided that the following conditions are met:
 6 |  
 7 |     *  Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
 8 |  
 9 |     *  Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
10 |   
11 |     *  Neither the name of Pacific Biosciences nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
12 |  
13 | NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
14 |  
15 | 
16 | 


--------------------------------------------------------------------------------
/phasing/utils/paint_bam_post_phaser.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | import os, sys
 4 | from csv import DictReader
 5 | import pysam
 6 | 
 7 | def paint_bam_post_phaser(input_bam, output_bam, read_info, chrom, start, end):
 8 |     reader = pysam.AlignmentFile(input_bam, 'rb')
 9 |     fout = pysam.AlignmentFile(output_bam, 'wb', header=reader.header)
10 |     for r in reader.fetch(chrom, start, end):
11 |         d = r.to_dict()
12 |         newtags = []
13 |         for k in d['tags']:
14 |             if not k.startswith('RG:Z'):
15 |                 newtags.append(k)
16 |         if r.qname not in read_info:
17 |             newtags.append('RG:Z:unassigned')
18 |         else:
19 |             hapstr = read_info[r.qname]
20 |             newtags.append('RG:Z:' + hapstr)
21 |         d['tags'] = newtags
22 |         fout.write(pysam.AlignedSegment.from_dict(d, r.header))
23 |     fout.close()
24 | 
# Command-line entry point: load the read-to-haplotype table, then paint the BAM region.
if __name__ == "__main__":
    from argparse import ArgumentParser
    parser = ArgumentParser()
    parser.add_argument("input_bam")
    parser.add_argument("output_bam")
    parser.add_argument("read_hap_info", help="Human readable read-to-hap info file from Isophase/Magphase output")
    parser.add_argument("-c", "--chrom", required=True, help='Chromosome')
    parser.add_argument("-s", "--start", required=True, type=int, help="Start location")
    parser.add_argument("-e", "--end", required=True, type=int, help="End location")

    args = parser.parse_args()

    # tab-delimited file with (at least) the columns read_id and haplotype;
    # the handle is closed as soon as the table has been loaded
    with open(args.read_hap_info, newline='') as handle:
        read_info = {row['read_id']: row['haplotype'] for row in DictReader(handle, delimiter='\t')}

    paint_bam_post_phaser(args.input_bam, args.output_bam, read_info, args.chrom, args.start, args.end)


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # MagPhase
  2 | Phasing for metagenomics using PacBio long reads
  3 | 
  4 | Current Version (07/13/2021): MagPhase v1.0
  5 | 
  6 | 
  7 | ## What is MagPhase?
  8 | 
  9 | MagPhase is for phasing of metagenomics data using long reads. 
 10 | 
 11 | MagPhase is a modified version of [IsoPhase](https://github.com/Magdoll/cDNA_Cupcake/wiki/IsoPhase:-Haplotyping-using-Iso-Seq-data) which was originally designed for isoform-level phasing of PacBio Iso-Seq (full-length transcript sequencing) data.
 12 | 
 13 | ## How MagPhase works
 14 | 
 15 | ![](https://github.com/Magdoll/images_public/blob/master/IsoPhase_MagPhase/magphase_workflow_for_DerekPaper.png?raw=true)
 16 | 
 17 | MagPhase takes an alignment BAM file of HiFi reads aligned to the assembled contigs and a BED file that denotes the regions to phase.
 18 | 
 19 | For each region, individual SNPs are called. Then, reads are used to infer the "haplotypes" (or lineages).
 20 | 
 21 | The output of MagPhase consists of individual SNP information and the inferred list of haplotypes. 
 22 | 
 23 | 
 24 | ## Requirements & Installation
 25 | 
 26 | ### Prerequisites
 27 | 
 28 | * Python (3.7+)
 29 | * minimap2
 30 | 
 31 | ### Python-related libraries
 32 | 
 33 | * biopython
 34 | * bx-python
 35 | * scipy
 36 | * pysam
 37 | * pyvcf
 38 | 
 39 | ### Installation using (Ana)Conda 
 40 | 
 41 | We recommend using [Anaconda](https://www.anaconda.com/products/individual) to set up your conda environment. Currently only Linux environments are supported.
 42 | 
 43 | (1) Install Conda Environment
 44 | 
 45 | ```
 46 | export PATH=$PATH:<path_to_anaconda>/bin
 47 | conda -V
 48 | conda update conda
 49 | ```
 50 | 
 51 | (2) Clone the Github repo and install using the yml script
 52 | 
 53 | ```
 54 | git clone https://github.com/Magdoll/MagPhase.git
 55 | cd MagPhase
 56 | conda env create -f MagPhase.conda_env.yml
 57 | source activate MagPhase.env
 58 | ```
 59 | 
 60 | (3) Once you have activated the virtual environment, you should see your prompt changing to something like this:
 61 | 
 62 | ```
 63 | (MagPhase.env)$
 64 | ```
 65 | 
 66 | (4) Compile and install MagPhase
 67 | 
 68 | ```
 69 | (MagPhase.env)$ python setup.py build
 70 | (MagPhase.env)$ python setup.py install
 71 | ```
 72 | 
 73 | 
 74 | ## Example Usage
 75 | 
 76 | The usage for `mag_phaser.py` is as follows:
 77 | 
 78 | ```
 79 | $ mag_phaser.py -h
 80 | usage: mag_phaser.py [-h] -a ASSEMBLY -b BAMFILE -o OUTPUT -g GENES [-p PVAL_CUTOFF] [--bhFDR BHFDR]
 81 | 
 82 | optional arguments:
 83 |   -h, --help            show this help message and exit
 84 |   -a ASSEMBLY, --assembly ASSEMBLY
 85 |                         The mag assembly file in fasta format
 86 |   -b BAMFILE, --bamfile BAMFILE
 87 |                         Aligned reads in bam file format [full path needed!]
 88 |   -o OUTPUT, --output OUTPUT
 89 |                         output prefix
 90 |   -g GENES, --genes GENES
 91 |                         SCG gene bed file
 92 |   -p PVAL_CUTOFF, --pval_cutoff PVAL_CUTOFF
 93 |                         P value cutoff for variant calls
 94 |   --bhFDR BHFDR         FDR to be used for the Benjamini–Hochberg correction. Default: None (not used).
 95 | 
 96 | ```
 97 | 
 98 | where `-a` provides the MAG assembly contig FASTA file, `-b` provides the BAM file of HiFi reads aligned to those contigs, and `-g` provides a BED file that contains the individual regions to be phased.
 99 | 
100 | `-p` and `--bhFDR` control the p-value cutoff for SNP calling. It is recommended that you use the Benjamini–Hochberg correction for better SNP detection (since correction for multiple testing can drastically reduce the number of SNPs called). We recommend using `--bhFDR 0.01` for general metagenomics applications.
101 | 
102 | 
103 | ## Output Interpretation
104 | 
105 | An example run:
106 | 
107 | ```
108 | mag_phaser.py -a all_contigs.fasta -b all_contigs.bubbles.ccs.filtered.sorted.bam -g 1377.shortmaps.bed --bhFDR 0.01 -o 1377.strain
109 | ```
110 | 
111 | will produce the following files:
112 | 
113 | ```
114 | 1377.strain.human_readable_by_hap.txt  
115 | 1377.strain.human_readable_by_pos.txt  
116 | 1377.strain.human_readable_by_read.txt
117 | ```
118 | 
119 | Within a region (as provided by the `-g` BED file), if there were phasing results (note: not all regions can be phased, as there could be no SNPs present), the haplotypes are represented as a string of concatenated SNPs. For example, if there are three SNPs in this region at genomic positions 101, 150, and 220, and there are four haplotypes, then the representation in the `_hap.txt` file would be:
120 | 
121 | |haplotype|hapIdx|contig|count|
122 | |---|---|---|---|
123 | |ATT|0  |contig\_1337|10|
124 | |CTG|1  |contig\_1337|12|
125 | |ATG|2  |contig\_1337|2|
126 | |?AG|3  |contig\_1337|4|
127 | 
128 | Note not all haplotypes will cover all SNP positions, so some haplotypes may have a `?` indicating lack of bases at that SNP location.
129 | 
130 | The SNP position would be stored in the `_pos.txt` file:
131 | 
132 | |haplotype|contig      |pos|varIdx|base|
133 | |---------|------------|---|------|----|
134 | |ATT      |contig\_1337|101|1     |REF |
135 | |ATT      |contig\_1337|150|2     |REF |
136 | |ATT      |contig\_1337|220|3     |REF |
137 | |CTG      |contig\_1337|101|1     |ALT0 |
138 | |CTG      |contig\_1337|150|2     |REF |
139 | |CTG      |contig\_1337|220|3     |ALT0 |
140 | |ATG      |contig\_1337|101|1     |REF |
141 | |ATG      |contig\_1337|150|2     |REF |
142 | |ATG      |contig\_1337|220|3     |ALT0 |
143 | |?AG      |contig\_1337|150|2     |ALT0 |
144 | |?AG      |contig\_1337|220|3     |ALT0 |
145 | 
146 | 


--------------------------------------------------------------------------------
/phasing/mag_phaser.py:
--------------------------------------------------------------------------------
  1 | import os, re, sys
  2 | import subprocess
  3 | 
  4 | try:
  5 |     import vcf
  6 | except ImportError:
  7 |     print("Cannot import vcf! Please install pyvcf!", file=sys.stderr)
  8 |     sys.exit(-1)
  9 | 
 10 | from phasing.io import SAMMPileUpReader as sam
 11 | from phasing.io import MPileUpVariantCaller as VC
 12 | from phasing.io import VariantPhaser
 13 | 
 14 | 
# Defaults used by main() when constructing the variant caller.
MIN_COVERAGE = 10     # minimum number of FL reads for a gene to do SNP calling and phasing
ERR_SUB = 0.005       # passed as err_sub to the variant caller; presumably the expected substitution error rate -- TODO confirm
PVAL_CUTOFF = 0.1     # default value of the -p/--pval_cutoff command-line option
 18 | 
 19 | def parse_user_input():
 20 |     from argparse import ArgumentParser
 21 |     parser = ArgumentParser(
 22 |             description = "A pipeline for aligning sequence data on a slurm cluster"
 23 |             )
 24 |     parser.add_argument('-a', '--assembly',
 25 |                         help="The mag assembly file in fasta format",
 26 |                         required=True, type=str
 27 |                         )
 28 |     parser.add_argument('-b', '--bamfile',
 29 |                         help="Aligned reads in bam file format [full path needed!]",
 30 |                         required=True, type=str
 31 |                         )
 32 |     parser.add_argument('-o', '--output',
 33 |                         help="output prefix",
 34 |                         required=True, type=str
 35 |                         )
 36 |     parser.add_argument('-g', '--genes',
 37 |                         help='SCG gene bed file',
 38 |                         required =True, type=str
 39 |                         )
 40 |     parser.add_argument('-p', '--pval_cutoff',
 41 |                         help="P value cutoff for variant calls",
 42 |                         default=PVAL_CUTOFF, type=float
 43 |                         )
 44 |     parser.add_argument("--bhFDR", default=None,
 45 |                         type=float,
 46 |                         help="FDR to be used for the Benjamini–Hochberg correction. Default: None (not used).")
 47 | 
 48 | 
 49 |     return parser.parse_args(), parser
 50 | 
 51 | def main(args, parser):
 52 |     args = parser.parse_args()
 53 | 
 54 |     if args.bhFDR is not None:
 55 |         print("--bhFDR {0} is given! Will be using Benjamini–Hochberg correction insteaad. --pval_cutoff is ignored.".format(args.bhFDR))
 56 | 
 57 | 
 58 |     # remove potential past run output
 59 |     past_files = [args.output+'.NO_SNPS_FOUND',
 60 |              args.output+'.NO_HAPS_FOUND',
 61 |              args.output+'.snps',
 62 |              args.output+'.log',
 63 |              args.output+'.human_readable.txt',
 64 |              args.output+'.vcf',
 65 |              args.output+'.cleaned.human_readable.txt',
 66 |              args.output+'.cleaned.vcf']
 67 | 
 68 |     for file in past_files:
 69 |         if os.path.exists(file):
 70 |             os.remove(file)
 71 | 
 72 |     snpsfound = False
 73 |     # (0) generate pileups
 74 |     f_human1 = open(args.output + '.human_readable_by_pos.txt', 'w')
 75 |     f_human1.write("haplotype\thapIdx\tcontig\tpos\tvarIdx\tbase\tcount\n")
 76 |     f_human2 = open(args.output + '.human_readable_by_hap.txt', 'w')
 77 |     f_human2.write("haplotype\thapIdx\tcontig\tcount\n")
 78 |     f_human3 = open(args.output + '.human_readable_by_read.txt', 'w')
 79 |     f_human3.write("read_id\thaplotype\thapIdx\n")
 80 | 
 81 |     for mpileupFile, contig, start, end in elitePileups(args.bamfile, args.genes, args.assembly, args.output):
 82 |         # (1) read the mpileup and vall variants
 83 |         reader = sam.MPileUpReader(mpileupFile)
 84 |         recs = [r for r in reader]
 85 |         vc = VC.MagMPileUPVariant(recs, min_cov=MIN_COVERAGE, err_sub=ERR_SUB, expected_strand='+-',
 86 |                                   pval_cutoff=args.pval_cutoff,
 87 |                                   bhFDR=args.bhFDR)
 88 |         vc.call_variant()
 89 |         print(vc.variant)
 90 | 
 91 |         if len(vc.variant) != 0:
 92 |             snpsfound = True
 93 |         else:
 94 |             continue
 95 | 
 96 |         # we write SNPs with the bases separated by "/" not "|" becuz we haven't phased them yet
 97 |         with open(args.output + '.snps', 'a+') as f_snp:
 98 |             for pos, v in vc.variant.items():
 99 |                 f_snp.write("{contig}\t{pos}\t{bases}\t{counts}\n".format(\
100 |                     contig=contig,\
101 |                     pos=pos+1,\
102 |                     bases="/".join([b for (b,c) in v]),\
103 |                     counts="/".join([str(c) for (b,c) in v])))
104 | 
105 |         # (2) for each CCS read, assign a haplotype (or discard if outlier)
106 |         pp = VariantPhaser.MagVariantPhaser(vc)
107 |         pp.phase_variant(args.bamfile, [contig, start, end], args.output, partial_ok=True)
108 |         print(pp.haplotypes)
109 |         pp.haplotypes.get_haplotype_vcf_assignment()
110 |         pp.haplotypes.write_haplotype_to_humanreadable(contig, f_human1, f_human2, f_human3, pp.seq_hap_info)
111 |         os.remove(mpileupFile)
112 |     f_human1.close()
113 |     f_human2.close()
114 |     f_human3.close()
115 | 
116 |     if not snpsfound:
117 |         os.system("touch {out}.NO_SNPS_FOUND".format(out=args.output))
118 |         os.remove(args.output + '.human_readable.txt')
119 |         print("No SNPs found. END.", file=sys.stderr)
120 | 
121 | 
def elitePileups(aligned_bam : str, gene_bed : str, assembly : str, outprefix : str):
    """
    Run `samtools mpileup` once per BED region and yield the resulting pileup files.

    This is a generator: the pileup of a region is only produced when that region is requested.
    Blank lines and lines starting with '#' in the BED file are skipped; only the first three
    columns (chrom, start, end) are used, any further columns are ignored.

    :param aligned_bam: BAM file of reads aligned to the assembly
    :param gene_bed: gene bed to extract for making pileup
    :param assembly: assembly fasta file, passed to samtools as the reference (-f)
    :param outprefix: prefix of the pileup files, named <outprefix>.<chrom>_<start>_<end>.pileup
    :return: yields (pileup filename, chrom, start, end) with start/end as int
    :raises ValueError: if a BED line has fewer than three columns or non-integer coordinates
    """
    with open(gene_bed) as bed:
        for line in bed:
            # contig_4047     8476    8850    contig_4047_5
            line = line.strip()
            if not line or line.startswith('#'):
                continue
            fields = line.split()
            if len(fields) < 3:
                raise ValueError("Expected at least 3 columns in BED line but saw: {0}".format(line))
            chrom, s, e = fields[:3]

            outfile = "{p}.{c}_{s}_{e}.pileup".format(p=outprefix, c=chrom, s=s, e=e)
            # NOTE(review): the BED coordinates are passed to samtools unchanged; BED starts are
            # conventionally 0-based while samtools regions are 1-based -- confirm this is intended.
            # An argument list (no shell) keeps paths containing spaces or shell characters intact.
            cmd = ["samtools", "mpileup",
                   "-r", "{c}:{s}-{e}".format(c=chrom, s=s, e=e),
                   "-f", assembly,
                   "-s", aligned_bam]
            with open(outfile, 'w') as out:
                try:
                    # check_call raises on a non-zero exit status instead of returning it
                    subprocess.check_call(cmd, stdout=out)
                except (subprocess.CalledProcessError, OSError):
                    print("FAILED TO RUN CMD: {0} > {1}. Abort!".format(" ".join(cmd), outfile))
                    sys.exit(-1)
            yield outfile, chrom, int(s), int(e)
142 | 
143 | 
# Script entry point: parse the command line, then run SNP calling and phasing.
if __name__ == "__main__":
    args, parser = parse_user_input()
    main(args, parser)
147 | 


--------------------------------------------------------------------------------
/phasing/io/SAMMPileUpReader.py:
--------------------------------------------------------------------------------
  1 | __author__ = 'etseng@pacb.com'
  2 | 
  3 | """
  4 | Parser for `samtools mpileup`
  5 | 
  6 | http://www.htslib.org/doc/samtools-1.1.html
  7 | 1. chr
  8 | 2. 1-based position
  9 | 3. ref base
 10 | 4. coverage
 11 | 5. readBase
 12 | 6. base qualities
 13 | 7. alignment qualities
 14 | 
 15 | readBase:
 16 | .  match to ref
 17 | ,  match to ref on rev
 18 | > or <    ref skipping  (ex: like 37N)
 19 | ACGTN  mismatch on + strand
 20 | acgtn  mismatch on - strand
 21 | +{number}{AGCTNagctn} - insertion of some {number}
 22 | -{number}{...} deletion of some {number}  # also means in next {number}, you will see a *
 23 | ^ begin of read, followed by one character encoding the mapping quality (ASCII value minus 33)
 24 | $ end of read
 25 | """
 26 | 
 27 | import os, sys, re
 28 | import pdb
 29 | from collections import Counter
 30 | 
 31 | class MPileUpRecord(object):
 32 |     def __init__(self, chr, pos, ref, cov, readBase, baseQuals, alnQuals):
 33 |         """
 34 |         In addition to storing the 7 cols from mpileup,
 35 |         nalso stores
 36 |         counter: Counter of (key) -> (obs count in pileup)
 37 |         """
 38 |         self.chr = chr
 39 |         self.pos = pos
 40 |         self.ref = ref.upper() # let ref base always be upper case
 41 |         self.cov = cov
 42 |         self.nCov = None # this is the coverage of non-indel, non-skipped, which would be ACGTNacgtn
 43 |         self.nType = None # this is the number of non-indel, non-skipped bases accumulated at this record
 44 |         self.readBase = readBase
 45 |         self.baseQuals = baseQuals
 46 |         self.alnQuals = alnQuals
 47 | 
 48 |         self.counts = Counter()
 49 |         self.parse_readBase()
 50 | 
 51 |     def __str__(self):
 52 |         return """
 53 |         chr: {c}
 54 |         pos: {p} (1-based)
 55 |         ref: {r}
 56 |         cov: {v}
 57 |         nCov: {n}
 58 |         counts: {t}""".format(c=self.chr, p=self.pos+1, r=self.ref, v=self.cov, n=self.nCov, t=self.counts)
 59 | 
 60 |     def parse_readBase(self):
 61 |         """
 62 |         fill in self.counts
 63 |         """
 64 |         def not_indel_end_pos(i):
 65 |             return i >= len(self.readBase)-1 or self.readBase[i+1] not in ('+', '-', '$')
 66 | 
 67 |         rex = re.compile('(\d+)')
 68 |         def read_indel(start_index):
 69 |             m = rex.search(self.readBase, start_index)
 70 |             num = int(self.readBase[m.start():m.end()])
 71 |             return m.start(), m.end()+num
 72 | 
 73 |         sanity_counter = 0 # use this to track how many "reads" we've parsed to make sure parsing is correct
 74 |                            # this number should agree with self.cov which is 4-th column in mpileup
 75 |         i = 0 # pointer for current location in string self.readBase
 76 |         while i < len(self.readBase):
 77 |             b = self.readBase[i]
 78 |             if b in '<>': # ignore skipped refs
 79 |                 sanity_counter += 1
 80 |                 i += 1
 81 |                 continue
 82 |             elif b == '*': # deletion, just advance
 83 |                 i += 1
 84 |                 sanity_counter += 1
 85 |                 continue
 86 |             elif b == '^': # start of read followed by ascii and either a comma or dot (ex: ^I.)
 87 |                 i += 3
 88 |                 sanity_counter += 1
 89 |                 continue
 90 |             elif b == '$': # end of read, DO NOT advance counter
 91 |                 i += 1
 92 |                 continue
 93 |             elif b == '.': # could be followed by indels or $, careful don't double count
 94 |                 self.counts[self.ref] += 1
 95 |                 sanity_counter += 1
 96 |                 i += 1
 97 |             elif b == ',': # # could be followed by indels or $, careful don't double count
 98 |                 self.counts[self.ref.lower()] += 1
 99 |                 sanity_counter += 1
100 |                 i += 1
101 |             elif b in 'ATCGNatcgn':
102 |                 self.counts[b] += 1
103 |                 sanity_counter += 1
104 |                 i += 1
105 |             elif b == '-': # DO NOT ADVANCE the sanity counter! otherwise double counting
106 |                 start, end = read_indel(i+1)
107 |                 self.counts["-"+self.readBase[start:end]] += 1
108 |                 i = end
109 |             elif b == '+': # insertion should be +{number}{bases}
110 |                 start, end = read_indel(i+1)
111 |                 self.counts["+"+self.readBase[start:end]] += 1
112 |                 i = end
113 |             else:
114 |                 raise Exception("Unknown {0} in readBase!".format(b))
115 | 
116 |         assert self.cov == sanity_counter or (self.readBase=='*' and self.cov==0)
117 |         # set nCov which is cov provided by non-indel non-skipped bases
118 |         self.nCov = 0
119 |         self.nType = 0
120 |         for x in 'ATCGNatcgn':
121 |             self.nCov += self.counts[x]
122 |             if self.counts[x] > 0: self.nType += 1
123 | 
124 | 
class MPileUpReader(object):
    """
    Iterator over a `samtools mpileup` output file, yielding one MPileUpRecord per line.

    The file is opened in the constructor and closed automatically once the end of the file
    is reached; it can also be closed early with close() or by using the reader in a
    with-statement.
    """
    def __init__(self, filename):
        self.filename = filename
        self.f = open(filename)

    def __iter__(self):
        return self

    def __next__(self):
        if self.f.closed:
            # already exhausted (or closed by the caller): keep signalling the end
            raise StopIteration
        line = self.f.readline()
        if not line:
            # readline() returns '' only at end of file
            self.close()
            raise StopIteration
        return self.parseLine(line)

    def close(self):
        """Close the underlying file; safe to call more than once."""
        self.f.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

    def parseLine(self, line):
        """
        Convert one tab-separated mpileup line into an MPileUpRecord (pos is stored 0-based).

        Raises Exception if the line does not have 7, 15 or 4 columns.
        """
        raw = line.strip().split('\t')
        # NOTE(review): 15 columns are accepted but only the first 7 are used -- confirm which
        # mpileup variant produces them
        if (len(raw)==7 or len(raw)==15):
            return MPileUpRecord(chr=raw[0],
                                 pos=int(raw[1])-1,
                                 ref=raw[2],
                                 cov=int(raw[3]),
                                 readBase=raw[4],
                                 baseQuals=raw[5],
                                 alnQuals=raw[6])
        elif len(raw)==4:
            # only way to have only 4 columns is because after --min-BQ filtering there are no bases
            # ex:
            # fake    8728    T       3       .$.$.   ;q:     ]]]
            # fake    8729    T       0
            return MPileUpRecord(chr=raw[0],
                                 pos=int(raw[1])-1,
                                 ref=raw[2],
                                 cov=0,
                                 readBase='',
                                 baseQuals='',
                                 alnQuals='')
        else:
            raise Exception("Expected to have 7 cols in mpileup record "
                            "but saw only {0}, abort! Line was: {1}".format(len(raw), line))
167 | 
168 | 
169 | 


--------------------------------------------------------------------------------
/phasing/io/coordinate_mapper.py:
--------------------------------------------------------------------------------
  1 | import bisect
  2 | from bx.intervals import Interval
  3 | from Bio.Seq import Seq
  4 | 
  5 | def iter_cigar_string(cigar_string):
  6 |     num = cigar_string[0]
  7 |     for s in cigar_string[1:]:
  8 |         if str.isalpha(s):
  9 |             yield int(num), s
 10 |             num = ''
 11 |         else:
 12 |             num += s
 13 | 
 14 | 
 15 | def make_exons_from_base_mapping(mapping, start, end, strand):
 16 |     """
 17 |     mapping is 0-based index on transcript  --> 0-based index  on genome
 18 |     however beware of strand!
 19 |     """
 20 | 
 21 |     output = [mapping[start]]
 22 |     for i in range(start+1, end):
 23 |         cur_pos, cur_is_junction= mapping[i]
 24 |         if cur_is_junction and mapping[i]!=output[-1]:
 25 |             # if the last position is the same, DON'T APPEND (was an indel)
 26 |             output.append(mapping[i])
 27 |     cur_pos, cur_is_junction = mapping[end]
 28 |     if mapping[end]!=output[-1]:
 29 |         output.append(mapping[end])
 30 | 
 31 |     # remember for Interval it is 0-based start, 1-based end
 32 |     # if len(output) is odd, must be 1bp into an exon
 33 |     # ex: [(xxx,True), (xxx,True), (xxx,False)] or
 34 |     #     [.....(xxx,True), xxx(True)]
 35 |     #print output
 36 |     if len(output)==1:
 37 |         output = [output[0], output[0]] # just duplicate it
 38 |     elif len(output)%2==1:
 39 |         if output[0][1] and output[1][1]:
 40 |             output.insert(0, output[0])
 41 |         elif output[-1][1] and output[-2][1]:
 42 |             output.append(output[-1])
 43 |     #    print "modified:", output
 44 |     if strand == '+':
 45 |         return [Interval(output[i][0],output[i+1][0]+1) for i in range(0, len(output), 2)]
 46 |     else: # - strand
 47 |         return [Interval(output[i][0],output[i-1][0]+1)  for i in range(len(output)-1,-1,-2)]
 48 | 
 49 | 
 50 | 
 51 | def get_base_to_base_mapping_from_sam(exons, cigar_string, qStart, qEnd, strand, include_junction_info=False):
 52 |     """
 53 |     For PacBio data which can have indels w.r.t genome =___=
 54 | 
 55 |     ex:
 56 |         cigar: 1S105M407N548M
 57 |         sStart-sEnd: 948851-949911
 58 |         qStart-qEnd: 2-655
 59 |         segments: [Interval(start=948851, end=948956), Interval(start=949363, end=949911)]
 60 | 
 61 |     Returns: dict of 0-based position --> 0-based ref position
 62 |     """
 63 |     cur_exon_i = 0
 64 |     cur_nt_loc = qStart
 65 |     cur_genome_loc = exons[0].start
 66 | 
 67 |     start_soft_clip = qStart > 0
 68 | 
 69 |     last_base_is_junction = False
 70 |     qLen = qEnd
 71 | 
 72 |     mapping = {}
 73 | 
 74 |     for num, s in iter_cigar_string(cigar_string):
 75 |         if s == 'S': # soft clipping at the ends, ignore
 76 |             if start_soft_clip:
 77 |                 assert num == qStart
 78 |                 for i in range(num): mapping[i] = (cur_genome_loc, False)
 79 |                 start_soft_clip = False
 80 |             else:
 81 |                 # soft clipping at the end
 82 |                 # advance the mapping but not cur_nt_loc (otherwise will be diff from qEnd)
 83 |                 for i in range(num): 
 84 |                     mapping[cur_nt_loc+i] = (cur_genome_loc, False)
 85 |                     #cur_nt_loc += 1
 86 |                     #print cur_nt_loc
 87 |                     # for soft clipping at the end, do NOT progress cur_nt_loc!
 88 |                     # we are now "outside" the alignment, otherwise
 89 |                     # assert cur_nt_loc == qEnd will be wrong at the end
 90 |                 #cur_nt_loc -= 1
 91 |                 qLen += num # query length must be qEnd + soft clipped end
 92 |         elif s == 'N': # intron, move to next ref exon
 93 |             mapping[cur_nt_loc-1] = (mapping[cur_nt_loc-1][0], True)
 94 |             assert cur_genome_loc == exons[cur_exon_i].end
 95 |             cur_exon_i += 1
 96 |             cur_genome_loc = exons[cur_exon_i].start
 97 |             last_base_is_junction = True
 98 |         elif s == 'M':
 99 |             # for the next "num" matches are all 1:1
100 |             for i in range(num):
101 |                 if cur_nt_loc in mapping and mapping[cur_nt_loc][1]:
102 |                     # if this is true, then last mapping must be 'D' and was a junction
103 |                     # so we do nothing -- keep it
104 |                     pass
105 |                 else:
106 |                     mapping[cur_nt_loc] = (cur_genome_loc, last_base_is_junction)
107 |                 last_base_is_junction = False
108 |                 cur_nt_loc += 1
109 |                 cur_genome_loc += 1
110 |             assert cur_genome_loc <= exons[cur_exon_i].end
111 |         elif s == 'I': # insertion w.r.t to genome
112 |             for i in range(num):
113 |                 mapping[cur_nt_loc] = (cur_genome_loc, last_base_is_junction)
114 |                 cur_nt_loc += 1
115 |                 last_base_is_junction = False
116 |         elif s == 'D': # deletion w.r.t. to genome
117 |             # if last_base_is_junction is True, we want to make sure it makes it in mapping
118 |             mapping[cur_nt_loc] = (cur_genome_loc, last_base_is_junction)
119 |             last_base_is_junction = False
120 |             cur_genome_loc += num
121 | # BELOW IS WRONG
122 | #            for i in xrange(num):
123 | #                mapping[cur_nt_loc] = cur_genome_loc
124 | #                cur_genome_loc += 1
125 |             assert cur_genome_loc <= exons[cur_exon_i].end
126 |     assert cur_nt_loc == qEnd or (cur_nt_loc==qEnd-1 and s=='S')
127 | 
128 |     if strand == '-':
129 |         mapping = dict((qLen-1-k, v) for k,v in mapping.items())
130 | 
131 |     if not include_junction_info:
132 |         for k in mapping:
133 |             mapping[k] = mapping[k][0]
134 | 
135 |     return mapping
136 | 
137 | 
138 | 
def get_exon_coordinates(exons, start, end):
    """
    Return the set of "exons" (genome location) that
    is where the nucleotide start-end is

    start is 0-based
    end is 1-based
    exons is a set of Interval (0-based start, 1-based end)

    ex: exons [100,200) and [300,400), start=10, end=150
        --> [Interval(110, 200), Interval(300, 350)]

    An end of up to 30 bp past the transcript length is tolerated and trimmed to the
    transcript end.
    """
    acc_lens = [0] # ex: [0, 945, 1065, 1141, 1237] accumulative length of exons
    for e in exons:
        acc_lens.append(acc_lens[-1] + (e.end - e.start))
    len_of_transcript = acc_lens[-1]
    # confirm that start-end is in the range of the transcript!
    assert 0 <= start < end <= len_of_transcript + 30 # allow a 30-bp slack due to PacBio indels

    end = min(end, len_of_transcript) # trim it to the end if necessary (for PacBio)

    # starts at i-th exon and ends at j-th exon, i and j are both 1-based
    # start is inclusive, so an exon boundary belongs to the NEXT exon (bisect_right);
    # end is exclusive, so an exon boundary still belongs to the PREVIOUS exon (bisect_left)
    i = bisect.bisect_right(acc_lens, start)
    j = bisect.bisect_left(acc_lens, end)

    # for the first exon, the offset is start-acc+e.start
    # for the last exon, the end point is end-acc+e.start
    first_start = start - acc_lens[i-1] + exons[i-1].start
    if i == j:
        return [Interval(first_start, end - acc_lens[i-1] + exons[i-1].start)]

    head = [Interval(first_start, exons[i-1].end)]
    if end == len_of_transcript:
        # the end is the end: every remaining exon is covered completely
        return head + exons[i:]
    return head + exons[i:j-1] + \
        [Interval(exons[j-1].start, end - acc_lens[j-1] + exons[j-1].start)]
177 |     
def consistute_genome_seq_from_exons(genome_dict, _chr, exons, strand):
    """
    Build the transcript sequence by concatenating the genomic sequence of each exon.

    genome_dict is expected to be SeqReaders.LazyFastaReader
    exons is a list of [Interval(start, end)]

    :param genome_dict: mapping of chromosome name to a record exposing `.seq`
    :param _chr: chromosome name to look up in genome_dict
    :param exons: exons to concatenate, in the order given
    :param strand: '+' returns the concatenation as is; any other value returns
                   its reverse complement
    :return: the sequence as a plain str
    """
    genome_seq = genome_dict[_chr].seq
    seq = ''.join(str(genome_seq[e.start:e.end]) for e in exons)

    if strand == '+':
        return seq
    # str() replaces Seq.tostring(), which current Biopython no longer provides
    return str(Seq(seq).reverse_complement())
193 | 
194 | 


--------------------------------------------------------------------------------
/phasing/io/MPileUpVariantCaller.py:
--------------------------------------------------------------------------------
  1 | 
  2 | """
  3 | Call variant based on a list of SAMMPileUpRecord where list[i] is the record of i-th position
  4 | 
  5 | Most of the code follows Juliet's code at
  6 | https://github.com/PacificBiosciences/minorseq/blob/develop/src/AminoAcidCaller.cpp
  7 | 
  8 | """
  9 | import os, sys
 10 | import scipy.stats as stats
 11 | from collections import Counter, namedtuple
 12 | 
 13 | BHtuple = namedtuple('BHtuple', ['pval', 'record'])
 14 | 
 15 | class MPileUPVariant(object):
 16 |     def __init__(self, record_list, min_cov, err_sub, expected_strand, pval_cutoff=0.01, bhFDR=None):
 17 |         """
 18 |         :param record_list: list of SAMMPileUpRecord
 19 |         :param min_cov: minimum coverage to call variant
 20 |         :param err_sub: substitution error, right now a fixed float
 21 |         :param expected_strand: expected strand of the transcript (+ or -)
 22 |         """
 23 |         self.record_by_pos = dict((r.pos, r) for r in record_list)
 24 |         self.min_cov = min_cov
 25 |         self.err_sub = err_sub
 26 | 
 27 |         self.pval_cutoff = pval_cutoff
 28 |         self.bhFDR       = bhFDR   # is None, this is not used; other wise do Benjamini–Hochberg
 29 |         self.expected_strand = expected_strand
 30 | 
 31 | 
 32 |         self.prep_records()
 33 |         self.positions_to_call = self.get_positions_to_call()
 34 | 
 35 |         # must first call positions to call, then prep records, then number of tests
 36 |         self.number_of_tests = sum(self.record_by_pos[pos].clean_type for pos in self.positions_to_call)
 37 | 
 38 |         self.variant = {} # position --> in sorted order, (base, count)
 39 |         self.ref_base = {} # position --> ref base
 40 | 
 41 |         self.call_variant()
 42 | 
 43 | 
 44 |     def is_in_or_near_HP(self, pos, hp_size=4):
 45 |         """
 46 |         We define a HP region as stretches of 4 or more same nucleotides
 47 |         :return: True/False for in/hear HP region
 48 |         """
 49 |         def find_hp_region_size(cur):
 50 |             if cur not in self.record_by_pos: return 0
 51 |             end = cur+1
 52 |             while end in self.record_by_pos and self.record_by_pos[end].ref == self.record_by_pos[cur].ref:
 53 |                 end += 1
 54 |             start = cur-1
 55 |             while start in self.record_by_pos and self.record_by_pos[start].ref == self.record_by_pos[cur].ref:
 56 |                 start -= 1
 57 |             # hp region is from start+1 to end
 58 |             return end-(start+1)
 59 | 
 60 |         return (find_hp_region_size(pos) >= hp_size) or \
 61 |                (find_hp_region_size(pos-1) >= hp_size) or \
 62 |                (find_hp_region_size(pos+1) >= hp_size)
 63 | 
 64 | 
 65 |     def get_positions_to_call(self):
 66 |         """
 67 |         Identify list of positions to try to call SNPs. Must have:
 68 |         1. minimum coverage >= min_cov
 69 |         2. the first and second most frequent base are NOT an indel
 70 |         3. not next to or inside a homopolymer region
 71 |         4. has at least two or more keys
 72 |         """
 73 |         positions_to_call = []
 74 |         for pos in self.record_by_pos:
 75 |             if self.record_by_pos[pos].clean_type < 2: continue # only one base at this position, skip
 76 |             elif self.record_by_pos[pos].clean_cov < self.min_cov: continue # insufficient cov, skip
 77 |             else:
 78 |                 # find the first and second most freq base in the "non-clean" counts
 79 |                 m = self.record_by_pos[pos].clean_counts.most_common()
 80 |                 # ex: m = [('a', 10), ('-ct', 20), ('+t', 10)]
 81 |                 if m[0][0][0]in ('+','-') or m[1][0][0] in ('+','-') or self.is_in_or_near_HP(pos): continue
 82 |                 else:
 83 |                     positions_to_call.append(pos)
 84 |         return positions_to_call
 85 | 
 86 |     def prep_records(self):
 87 |         """
 88 |         Prepare the records by:
 89 |         1. remove all 'N' bases
 90 |         2. remove all bases that were not on the expected strand
 91 |         3. remove all indels
 92 | 
 93 |         Creates three new vars: clean_counts, clean_cov, clean_type
 94 |         DOES NOT ALTER the original counts or other variables!!!
 95 | 
 96 |         If + strand, then ATCG
 97 |         If - strand, then atcg
 98 |         """
 99 |         for pos in self.record_by_pos:
100 |             r = self.record_by_pos[pos]
101 |             if self.expected_strand == '+-':
102 |                 # for metagenomics, we don't care the strand
103 |                 # so instead we will convert everything to upper case later in the counts
104 |                 bases = 'ATCG'
105 |             elif self.expected_strand == '+':
106 |                 bases = 'ATCG'
107 |             elif self.expected_strand == '-':
108 |                 bases = 'atcg'
109 | 
110 |             if self.expected_strand == '+-':
111 |                 # convert lower case to upper case
112 |                 for k in 'atcg':
113 |                     if k in r.counts:
114 |                         r.counts[k.upper()] += r.counts[k]
115 |                         del r.counts[k]
116 | 
117 |             r.clean_counts = Counter(r.counts)
118 |             keys = list(r.counts.keys())
119 |             for k in keys:
120 |                 if k not in bases:
121 |                     del r.clean_counts[k]
122 |             r.clean_cov = sum(r.clean_counts.values())
123 |             r.clean_type = len(r.clean_counts)
124 | 
125 |     def call_variant(self):
126 |         """
127 |         mirrors AminoAcidCaller::CallVariants() in
128 |         https://github.com/PacificBiosciences/minorseq/blob/develop/src/AminoAcidCaller.cpp
129 | 
130 |         For each position (that has sufficient coverage),
131 |          do Fisher exact test w/ correction
132 |          if p-val < threshold, then store it.
133 | 
134 |         Stores results in self.variant as:
135 | 
136 |         self.variant[position] = desc list of (base, count).
137 |         NOTE: base must be either all in lower case (which means - strand)
138 |               or call upper case (+ strand).
139 |               If - strand and ('a', 10), it means the ref base in A on the + strand,
140 |               and the transcript should be T on the - strand.
141 | 
142 |         Only positions with more than the ref base is stored.
143 |         """
144 |         if self.bhFDR is None: # use Bonferroni correction
145 |             for pos in self.positions_to_call:
146 |                 r = self.record_by_pos[pos]
147 |                 alt_variant = []
148 |                 for base, count in r.clean_counts.most_common()[1:]:
149 |                     assert not base.startswith('+') and not base.startswith('-') # clean counts should NOT have indels
150 |                     exp = r.clean_cov * self.err_sub
151 |                     odds, pval = stats.fisher_exact([[count, r.clean_cov-count], [exp, r.clean_cov-exp]], alternative='greater')
152 |                     pval *= self.number_of_tests
153 |                     if pval < self.pval_cutoff: # store variant if below cutoff
154 |                         alt_variant.append((base, count))
155 |                 if len(alt_variant) > 0: # only record this variant if there's at least two haps
156 |                     self.variant[pos] = [r.clean_counts.most_common()[0]] + alt_variant
157 |                     self.ref_base[pos] = r.ref
158 |         else: # use Benjamini–Hochberg procedure
159 |             # see: https://www.statisticshowto.com/benjamini-hochberg-procedure/
160 |             pval_dict = {} # (pos, base) -> BHtuple(pval, record)
161 |             for pos in self.positions_to_call:
162 |                 r = self.record_by_pos[pos]
163 |                 for base, count in r.clean_counts.most_common()[1:]:
164 |                     assert not base.startswith('+') and not base.startswith('-') # clean counts should NOT have indels
165 |                     exp = r.clean_cov * self.err_sub
166 |                     odds, pval = stats.fisher_exact([[count, r.clean_cov-count], [exp, r.clean_cov-exp]], alternative='greater')
167 |                     if pval <= self.pval_cutoff:      # With this filtration, the sequencing errors position will not be stored in pval_dict.
168 |                         pval_dict[(pos, base)] = BHtuple(pval=pval, record=r)
169 | 
170 |             # now we have all the pvals, rank them
171 |             keys_pos_base = list(pval_dict.keys())
172 |             keys_pos_base.sort(key=lambda x: pval_dict[x].pval)
173 |             self.number_of_tests = len(keys_pos_base)
174 |             # find the largest p value that is smaller than the critical value.
175 |             largest_good_rank1 = 0
176 |             for rank0,(pos, base) in enumerate(keys_pos_base):
177 |                 pval = pval_dict[(pos, base)].pval
178 |                 bh_val = ((rank0+1)/self.number_of_tests) * self.bhFDR # Only significant positions will be used to adjust bh_val
179 |                 if pval < bh_val:
180 |                     largest_good_rank1 = rank0+1
181 |                     print(f"pos:{pos} base:{base} pval:{pval} bh:{bh_val}")
182 |             for (pos,base) in keys_pos_base[:largest_good_rank1]:
183 |                 r = pval_dict[(pos,base)].record
184 |                 if pos not in self.variant:
185 |                     self.ref_base[pos] = r.ref
186 |                     self.variant[pos] = [r.clean_counts.most_common()[0]]
187 |                 self.variant[pos] += [(base, r.clean_counts[base])]
188 | 
189 | 
190 | class MagMPileUPVariant(MPileUPVariant):
191 |     def __init__(self, record_list, min_cov, err_sub, expected_strand, pval_cutoff=0.01, bhFDR=None):
192 |         self.ref_name = {} # position --> ref contig
193 |         super().__init__(record_list, min_cov, err_sub, expected_strand, pval_cutoff, bhFDR)
194 | 
195 |     def call_variant(self):
196 |         """
197 |         mirrors AminoAcidCaller::CallVariants() in
198 |         https://github.com/PacificBiosciences/minorseq/blob/develop/src/AminoAcidCaller.cpp
199 | 
200 |         For each position (that has sufficient coverage),
201 |          do Fisher exact test w/ correction
202 |          if p-val < threshold, then store it.
203 | 
204 |         Stores results in self.variant as:
205 | 
206 |         self.variant[position] = desc list of (base, count).
207 |         NOTE: base must be either all in lower case (which means - strand)
208 |               or call upper case (+ strand).
209 |               If - strand and ('a', 10), it means the ref base in A on the + strand,
210 |               and the transcript should be T on the - strand.
211 | 
212 |         Only positions with more than the ref base is stored.
213 |         """
214 |         if self.bhFDR is None: # use Bonferroni correction
215 |             for pos in self.positions_to_call:
216 |                 r = self.record_by_pos[pos]
217 |                 alt_variant = []
218 |                 for base, count in r.clean_counts.most_common()[1:]:
219 |                     assert not base.startswith('+') and not base.startswith('-') # clean counts should NOT have indels
220 |                     exp = r.clean_cov * self.err_sub
221 |                     odds, pval = stats.fisher_exact([[count, r.clean_cov-count], [exp, r.clean_cov-exp]], alternative='greater')
222 |                     pval *= self.number_of_tests
223 |                     if pval < self.pval_cutoff: # store variant if below cutoff
224 |                         alt_variant.append((base, count))
225 |                 if len(alt_variant) > 0: # only record this variant if there's at least two haps
226 |                     self.variant[pos] = [r.clean_counts.most_common()[0]] + alt_variant
227 |                     self.ref_base[pos] = r.ref
228 |                     self.ref_name[pos] = r.chr
229 | 
230 |         else: # use Benjamini–Hochberg procedure
231 |             # see: https://www.statisticshowto.com/benjamini-hochberg-procedure/
232 |             pval_dict = {} # (pos, base) -> BHtuple(pval, record)
233 |             for pos in self.positions_to_call:
234 |                 r = self.record_by_pos[pos]
235 |                 for base, count in r.clean_counts.most_common()[1:]:
236 |                     assert not base.startswith('+') and not base.startswith('-') # clean counts should NOT have indels
237 |                     exp = r.clean_cov * self.err_sub
238 |                     odds, pval = stats.fisher_exact([[count, r.clean_cov-count], [exp, r.clean_cov-exp]], alternative='greater')
239 |                     if pval <= self.pval_cutoff:      # With this filtration, the sequencing errors position will not be stored in pval_dict.
240 |                         pval_dict[(pos, base)] = BHtuple(pval=pval, record=r)
241 |             # now we have all the pvals, rank them
242 |             keys_pos_base = list(pval_dict.keys())
243 |             keys_pos_base.sort(key=lambda x: pval_dict[x].pval)
244 |             self.number_of_tests = len(keys_pos_base)
245 |             # find the largest p value that is smaller than the critical value.
246 |             largest_good_rank1 = 0
247 |             for rank0,(pos, base) in enumerate(keys_pos_base):
248 |                 pval = pval_dict[(pos, base)].pval
249 |                 bh_val = ((rank0+1)/self.number_of_tests) * self.bhFDR # Only significant positions will be used to adjust bh_val
250 |                 if pval < bh_val:
251 |                     largest_good_rank1 = rank0+1
252 |                     print(f"pos:{pos} base:{base} pval:{pval} bh:{bh_val}")
253 |             for (pos,base) in keys_pos_base[:largest_good_rank1]:
254 |                 r = pval_dict[(pos,base)].record
255 |                 if pos not in self.variant:
256 |                     self.ref_base[pos] = r.ref
257 |                     self.ref_name[pos] = r.chr
258 |                     self.variant[pos] = [r.clean_counts.most_common()[0]]
259 |                 self.variant[pos] += [(base, r.clean_counts[base])]
260 | 
261 | 
262 | 
263 | 
264 | 


--------------------------------------------------------------------------------
/phasing/io/BioReaders.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | """
  3 | Should always be faithful duplicate of sequence/BioReaders.py
  4 | Duplicated here for tofu installation. This one is called via cupcake.io.BioReaders.
  5 | """
  6 | 
  7 | import re, sys
  8 | from collections import namedtuple
  9 | 
# Span with an exclusive end; SAMRecord.parse_cigar builds these as
# (segment start, segment start + aligned reference length).
Interval = namedtuple('Interval', ['start', 'end'])
 11 |                                  
 12 | class SimpleSAMReader:
 13 |     """
 14 |     A simplified SAM reader meant for speed. Skips CIGAR & FLAG parsing; identity/coverage calculation.
 15 |     """
 16 |     SAMheaders = ['@HD', '@SQ', '@RG', '@PG', '@CO']    
 17 |     def __init__(self, filename, has_header):
 18 |         self.filename = filename
 19 |         self.f = open(filename)
 20 |         self.header = ''
 21 |         if has_header:
 22 |             while True:
 23 |                 cur = self.f.tell()
 24 |                 line = self.f.readline()
 25 |                 if line[:3] not in SimpleSAMReader.SAMheaders:
 26 |                     break
 27 |                 self.header += line
 28 |             self.f.seek(cur)
 29 |     
 30 |     def __iter__(self):
 31 |         return self
 32 |     
 33 |     def __next__(self):
 34 |         line = self.f.readline().strip()
 35 |         if len(line) == 0:
 36 |             raise StopIteration
 37 |         return SimpleSAMRecord(line)    
 38 |     
 39 |   
 40 | class SimpleSAMRecord:
 41 |     cigar_rex = re.compile('(\d+)([MIDSHN])')
 42 |     SAMflag = namedtuple('SAMflag', ['is_paired', 'strand', 'PE_read_num'])
 43 |     def __init__(self, record_line):
 44 |         """
 45 |         Simple bare bones version: only has
 46 |         
 47 |         qID, sID, sStart, sEnd, qStart, qEnd, cigar
 48 |         
 49 |         Simplified assumptions:
 50 |         -- must be end-to-end alignment (so qStart always 0)
 51 |         -- must be unspliced (no 'N' in cigar string)
 52 |         """
 53 |         self.qID = None
 54 |         self.sID = None
 55 |         self.sStart = None
 56 |         self.sEnd = None
 57 |         self.qStart = 0
 58 |         self.qEnd = None # length of SEQ
 59 |         self.cigar = None
 60 | 
 61 |         self.process(record_line)
 62 | 
 63 |     def __str__(self):
 64 |         msg = \
 65 |         """
 66 |         qID: {q}
 67 |         sID: {s}
 68 |         sStart-sEnd: {ss}-{se}
 69 |         qStart-qEnd: {qs}-{qe}
 70 |         cigar: {c}
 71 |         """.format(q=self.qID, s=self.sID, \
 72 |             ss=self.sStart, se=self.sEnd, qs=self.qStart, qe=self.qEnd, c=self.cigar)
 73 |         return msg
 74 | 
 75 |     def parse_cigar(self, cigar, start):
 76 |         """
 77 |         M - match
 78 |         I - insertion w.r.t. to ref
 79 |         D - deletion w.r.t. to ref
 80 |         N - skipped (which means splice junction)
 81 |         S - soft clipped
 82 |         H - hard clipped (not shown in SEQ)
 83 |         = - read match
 84 |         X - read mismatch
 85 | 
 86 |         ex: 50M43N3D
 87 | 
 88 |         NOTE: sets qStart & qEnd, which are often incorrect because of different ways to write CIGAR strings
 89 |               instead rely on XS/XE flags (from blasr or pbalign.py) to overwrite this later!!!
 90 | 
 91 |         Returns: genomic segment locations (using <start> as offset)
 92 |         """
 93 |         cur_end = start
 94 |         q_aln_len = 0
 95 |         for (num, type) in re.findall('(\d+)(\S)', cigar):
 96 |             num = int(num)
 97 |             if type == 'I':
 98 |                 q_aln_len += num
 99 |             elif type in ('M', '=', 'X'):
100 |                 cur_end += num
101 |                 q_aln_len += num
102 |             elif type == 'D':
103 |                 cur_end += num
104 |         self.qEnd = self.qStart + q_aln_len
105 |         self.sEnd = cur_end
106 | 
107 |             
108 |     def process(self, record_line):
109 |         """
110 |         Only process cigar to get qEnd and sEnd
111 |         """
112 |         raw = record_line.split('\t')
113 |         self.qID = raw[0]
114 |         self.sID = raw[2]
115 |         if self.sID == '*': # means no match! STOP here
116 |             return
117 |         self.sStart = int(raw[3]) - 1
118 |         self.cigar = raw[5]
119 |         self.parse_cigar(self.cigar, self.sStart)
120 |         #self.flag = SimpleSAMRecord.parse_sam_flag(int(raw[1]))
121 | 
122 |     
123 | 
class SAMReader:
    """
    Iterate over a SAM file, yielding one SAMRecord per line until the first
    empty line or end of file. The underlying file stays open until close() is
    called; the reader can also be used as a context manager.
    """
    SAMheaders = ['@HD', '@SQ', '@RG', '@PG', '@CO']

    def __init__(self, filename, has_header, ref_len_dict=None, query_len_dict=None):
        """
        :param filename: path of the SAM file to read
        :param has_header: if True, leading header lines are consumed into
                           self.header and iteration starts right after them
        :param ref_len_dict: optional dict handed to every record created
        :param query_len_dict: optional dict handed to every record created
        """
        self.filename = filename
        self.f = open(filename)
        self.header = ''
        self.ref_len_dict = ref_len_dict
        self.query_len_dict = query_len_dict
        if has_header:
            try:
                self._read_header()
            except Exception:
                # do not leak the handle if the header cannot be read
                self.f.close()
                raise

    def _read_header(self):
        """Accumulate header lines, then rewind to the start of the first non-header line."""
        while True:
            cur = self.f.tell()
            line = self.f.readline()
            if line[:3] not in SAMReader.SAMheaders:
                break
            self.header += line
        self.f.seek(cur)

    def __iter__(self):
        return self

    def __next__(self):
        line = self.f.readline().strip()
        if len(line) == 0:
            raise StopIteration
        return SAMRecord(line, self.ref_len_dict, self.query_len_dict)

    def close(self):
        """Close the underlying file."""
        self.f.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
149 |     
150 | 
class SAMRecord:
    """
    One SAM alignment line: IDs, coordinates, genomic segments, flag and, when
    the needed information is available, coverage and identity.
    """
    SAMflag = namedtuple('SAMflag', ['is_paired', 'strand', 'PE_read_num'])
    # any <count><op> pair; compiled once instead of on every parse_cigar() call
    _cigar_op_rex = re.compile(r'(\d+)(\S)')

    def __init__(self, record_line=None, ref_len_dict=None, query_len_dict=None):
        """
        Designed to handle BowTie SAM output for unaligned reads (PE read not yet supported)
        Can handle map to transfrag (no splicing) and genome (splicing)

        :param record_line: tab-separated SAM line; if None nothing is parsed
        :param ref_len_dict: optional dict of reference ID --> length, enables sLen/sCoverage
        :param query_len_dict: optional dict of query ID --> length, enables qLen/qCoverage
        """
        self.qID = None
        self.sID = None
        self.sStart = None
        self.sEnd = None
        self.segments = None
        self.num_nonmatches = None
        self.num_ins = None
        self.num_del = None
        self.num_mat_or_sub = None

        self.qCoverage = None
        self.sCoverage = None

        self.sLen = None
        self.qLen = None
        # qStart, qEnd might get changed in parse_cigar
        self.qStart = 0
        self.qEnd = None # length of SEQ

        self.cigar = None
        self.flag = None

        self.identity = None
        self.record_line = record_line
        if record_line is not None:
            self.process(record_line, ref_len_dict, query_len_dict)

    def __str__(self):
        msg =\
        """
        qID: {q}
        sID: {s}
        cigar: {c}
        sStart-sEnd: {ss}-{se}
        qStart-qEnd: {qs}-{qe}
        segments: {seg}
        flag: {f}
        
        coverage (of query): {qcov}
        coverage (of subject): {scov}
        alignment identity: {iden}
        """.format(q=self.qID, s=self.sID, seg=self.segments, c=self.cigar, f=self.flag,\
            ss=self.sStart, se=self.sEnd, qs=self.qStart, qe=self.qEnd, iden=self.identity,\
            qcov=self.qCoverage, scov=self.sCoverage)
        return msg

    def __eq__(self, other):
        # let Python fall back to its default handling for foreign types
        # instead of failing on a missing attribute
        if not isinstance(other, SAMRecord):
            return NotImplemented
        return self.qID == other.qID and self.sID == other.sID and\
               self.sStart == other.sStart and self.sEnd == other.sEnd and\
               self.segments == other.segments and self.qCoverage == other.qCoverage and\
               self.sCoverage == other.sCoverage and self.qLen == other.qLen and\
               self.sLen == other.sLen and self.qStart == other.qStart and\
               self.cigar == other.cigar and self.flag == other.flag and self.identity == other.identity


    def process(self, record_line, ref_len_dict, query_len_dict):
        """
        Fill in the record from one SAM line. For an unaligned line (reference
        name '*') only qID and sID are set.

        If SAM is from pbalign.py output, then have flags:
            XS: 1-based qStart, XE: 1-based qEnd, XQ: query length, NM: number of non-matches
        Of these, only NM is read here.

        0. qID
        1. flag
        2. sID
        3. 1-based offset sStart
        4. mapping quality (ignore)
        5. cigar
        6. name of ref of mate alignment (ignore)
        7. 1-based offset sStart of mate (ignore)
        8. inferred fragment length (ignore)
        9. sequence (ignore)
        10. read qual (ignore)
        11. optional fields
        """
        raw = record_line.split('\t')
        self.qID = raw[0]
        self.sID = raw[2]
        if self.sID == '*': # means no match! STOP here
            return
        self.sStart = int(raw[3]) - 1
        self.cigar = raw[5]
        self.segments = self.parse_cigar(self.cigar, self.sStart)
        self.sEnd = self.segments[-1].end
        self.flag = SAMRecord.parse_sam_flag(int(raw[1]))

        # process optional fields
        # XM: number of mismatches
        # NM: edit distance (sub/ins/del)
        for x in raw[11:]:
            if x.startswith('NM:i:'):
                self.num_nonmatches = int(x[5:])

        if ref_len_dict is not None:
            self.sCoverage = (self.sEnd - self.sStart) * 1. / ref_len_dict[self.sID]
            self.sLen = ref_len_dict[self.sID]

        # NOTE(review): nothing in this method sets qLen before this point, so
        # the flip only happens when qLen was assigned beforehand -- confirm
        # whether that is intended for plain SAM input.
        if self.flag.strand == '-' and self.qLen is not None:
            self.qStart, self.qEnd = self.qLen - self.qEnd, self.qLen - self.qStart

        if query_len_dict is not None: # over write qLen and qCoverage, should be done LAST
            self.qLen = query_len_dict[self.qID]
            self.qCoverage = (self.qEnd - self.qStart) * 1. / self.qLen

        if self.num_nonmatches is not None:
            self.identity = 1. - (self.num_nonmatches * 1. / (self.num_del + self.num_ins + self.num_mat_or_sub))


    def parse_cigar(self, cigar, start):
        """
        M - match
        I - insertion w.r.t. to ref
        D - deletion w.r.t. to ref
        N - skipped (which means splice junction)
        S - soft clipped
        H - hard clipped (not shown in SEQ)
        = - read match
        X - read mismatch

        ex: 50M43N3D

        NOTE: sets qStart & qEnd, which are often incorrect because of different ways to write CIGAR strings

        Also resets and fills num_del, num_ins and num_mat_or_sub.

        :param cigar: CIGAR string
        :param start: 0-based reference start of the alignment
        :raises Exception: on a CIGAR operation other than the ones listed above
        Returns: genomic segment locations (using <start> as offset)
        """
        segments = []
        cur_start = start
        cur_end = start
        first_thing = True
        q_aln_len = 0
        self.num_del = 0
        self.num_ins = 0
        self.num_mat_or_sub = 0
        for num, op in self._cigar_op_rex.findall(cigar):
            num = int(num)
            if op == 'H' or op == 'S':
                # only a clip that opens the CIGAR shifts the query start
                if first_thing:
                    self.qStart += num
            elif op == 'I':
                q_aln_len += num
                self.num_ins += num
            elif op in ('M','=','X'):
                cur_end += num
                q_aln_len += num
                self.num_mat_or_sub += num
            elif op == 'D':
                cur_end += num
                self.num_del += num
            elif op == 'N': # junction, make a new segment
                segments.append(Interval(cur_start, cur_end))
                cur_start = cur_end + num
                cur_end = cur_start
            else:
                raise Exception("Unrecognized cigar character {0}!".format(op))
            first_thing = False
        if cur_start != cur_end:
            segments.append(Interval(cur_start, cur_end))
        self.qEnd = self.qStart + q_aln_len
        return segments

    @classmethod
    def parse_sam_flag(cls, flag):
        """
        Heng Li's SAM https://samtools.github.io/hts-specs/SAMv1.pdf
        1 -- read is one of a pair
        2 -- alignment is one end of proper PE alignment          (IGNORE)
        4 -- read has no reported alignments                      (IGNORE)
        8 -- read is one of a pair and has no reported alignments (IGNORE)
        16 -- reverse ref strand
        32 -- other mate is aligned to ref strand
        64 -- first mate in pair
        128 -- second mate in pair
        256 -- not primary alignment
        512 -- not passing filters
        1024 -- PCR or optical duplicate
        2048 -- supplementary alignment

        Each bit is tested on its own, so any combination of bits decodes
        consistently. If both 64 and 128 are set, PE_read_num is 2.

        Return: SAMflag
        """
        if flag & 128:
            PE_read_num = 2
        elif flag & 64:
            PE_read_num = 1
        else:
            PE_read_num = 0
        strand = '-' if flag & 16 else '+'
        is_paired = bool(flag & 1)
        return SAMRecord.SAMflag(is_paired, strand, PE_read_num)
367 |             
368 | 
class BLASRSAMReader(SAMReader):
    """SAMReader whose iteration yields BLASRSAMRecord objects."""
    def __next__(self):
        # an empty (or whitespace-only) line, like end of file, ends the iteration
        stripped = self.f.readline().strip()
        if not stripped:
            raise StopIteration
        return BLASRSAMRecord(stripped, self.ref_len_dict, self.query_len_dict)
375 | 
class BLASRSAMRecord(SAMRecord):
    """SAM record parser for BLASR / pbalign style SAM lines."""

    def process(self, record_line, ref_len_dict=None, query_len_dict=None):
        """
        Parse one tab-delimited SAM line and populate this record's attributes.

        SAM files from pbalign.py have following optional fields:
            XS: 1-based qStart, XE: 1-based qEnd, XQ: query length, NM: number of non-matches

        0. qID
        1. flag
        2. sID
        3. 1-based offset sStart
        4. mapping quality (ignore)
        5. cigar
        6. name of ref of mate alignment (ignore)
        7. 1-based offset sStart of mate (ignore)
        8. inferred fragment length (ignore)
        9. sequence (ignore)
        10. read qual (ignore)
        11. optional fields

        :param record_line: a single SAM alignment line (no trailing newline needed)
        :param ref_len_dict: optional dict of sID --> reference length; fills sCoverage and sLen
        :param query_len_dict: optional dict of qID --> query length; overrides qLen and qCoverage
        :raises KeyError: if <query_len_dict> is given but has no entry for this query

        NOTE(review): qStart, qEnd, qLen, num_del, num_ins and num_mat_or_sub are read here
        but not assigned here; presumably SAMRecord / parse_cigar sets them -- confirm.
        """
        raw = record_line.split('\t')
        self.qID = raw[0]
        self.sID = raw[2]
        if self.sID == '*': # means no match! STOP here
            return
        self.sStart = int(raw[3]) - 1  # SAM is 1-based, we store 0-based
        self.cigar = raw[5]
        self.segments = self.parse_cigar(self.cigar, self.sStart)
        self.sEnd = self.segments[-1].end
        self.flag = SAMRecord.parse_sam_flag(int(raw[1]))

        # In Yuan Li's BLASR-to-SAM, XQ:i:<subread length>
        # see https://github.com/PacificBiosciences/blasr/blob/master/common/datastructures/alignmentset/SAMAlignment.h
        for x in raw[11:]:
            if x.startswith('XQ:i:'): # XQ should come last, after XS and XE
                _qLen = int(x[5:])
                if _qLen > 0: # this is for GMAP's SAM, which has XQ:i:0
                    self.qLen = _qLen
            elif x.startswith('XS:i:'): # must be PacBio's SAM, need to update qStart
                qs = int(x[5:]) - 1 # XS is 1-based
                if qs > 0:
                    # qStart must still be the un-shifted value before we apply the XS offset
                    assert self.qStart == 0, "qStart: {0}".format(self.qStart)
                    self.qStart = qs
                    self.qEnd += qs
            elif x.startswith('XE:i:'): # must be PacBio's SAM and comes after XS:i:
                qe = int(x[5:])     # XE is 1-based
                assert self.qEnd - self.qStart == qe - 1 # qEnd should've been updated already, confirm this
            elif x.startswith('NM:i:'): # number of non-matches
                self.num_nonmatches = int(x[5:])
                self.identity = 1. - (self.num_nonmatches * 1. / (self.num_del + self.num_ins + self.num_mat_or_sub))

        if ref_len_dict is not None:
            self.sCoverage = (self.sEnd - self.sStart) * 1. / ref_len_dict[self.sID]
            self.sLen = ref_len_dict[self.sID]

        # for minus-strand hits, flip the query coordinates to the other end of the query
        if self.flag.strand == '-' and self.qLen is not None:
            self.qStart, self.qEnd = self.qLen - self.qEnd, self.qLen - self.qStart

        if self.qLen is not None:
            self.qCoverage = (self.qEnd - self.qStart) * 1. / self.qLen

        if query_len_dict is not None: # over write qLen and qCoverage, should be done LAST
            try:
                self.qLen = query_len_dict[self.qID]
            except KeyError: # HACK for blasr's extended qID
                k = self.qID.rfind('/')
                if k < 0:
                    # Without a '/', slicing with rfind()'s -1 would drop the last character
                    # and could silently match a different query, so fail explicitly instead.
                    raise KeyError("Unable to find qID {0} in the input fasta/fastq!".format(self.qID))
                self.qLen = query_len_dict[self.qID[:k]]
            self.qCoverage = (self.qEnd - self.qStart) * 1. / self.qLen
443 |             
444 |             
class GMAPSAMReader(SAMReader):
    """SAMReader variant whose iteration yields GMAPSAMRecord objects."""

    def __next__(self):
        """
        Return the next alignment as a GMAPSAMRecord, skipping '@' header lines.

        Iteration stops at the first line that is empty after stripping
        (this includes end-of-file).
        """
        record_line = self.f.readline().strip()
        # header can occur at file end if the SAM was sorted, so keep skipping '@' lines
        while record_line.startswith('@'):
            record_line = self.f.readline().strip()
        if not record_line:
            raise StopIteration
        return GMAPSAMRecord(record_line, self.ref_len_dict, self.query_len_dict)
454 |     
class GMAPSAMRecord(SAMRecord):
    """SAM record parser for GMAP style SAM lines."""

    def process(self, record_line, ref_len_dict=None, query_len_dict=None):
        """
        Parse one tab-delimited SAM line and populate this record's attributes.

        Optional fields that are interpreted here:
            NM:i: number of non-matches (used to compute identity)
            XS:A: strand; unless it is '?', it overrides the strand derived from the SAM flag

        0. qID
        1. flag
        2. sID
        3. 1-based offset sStart
        4. mapping quality (ignore)
        5. cigar
        6. name of ref of mate alignment (ignore)
        7. 1-based offset sStart of mate (ignore)
        8. inferred fragment length (ignore)
        9. sequence (ignore)
        10. read qual (ignore)
        11. optional fields

        :param record_line: a single SAM alignment line
        :param ref_len_dict: optional dict of sID --> reference length; fills sCoverage and sLen
        :param query_len_dict: optional dict of qID --> query length; overrides qLen and qCoverage
        :raises KeyError: if neither qID nor qID truncated at its last '/' is in <query_len_dict>
        :raises Exception: if qID is not in <query_len_dict> and contains no '/'

        NOTE(review): qStart, qEnd, qLen, num_del, num_ins and num_mat_or_sub are read here
        but not assigned here; presumably SAMRecord / parse_cigar sets them -- confirm.
        """
        raw = record_line.split('\t')
        self.qID = raw[0]
        self.sID = raw[2]
        if self.sID == '*': # means no match! STOP here
            return
        self.sStart = int(raw[3]) - 1  # SAM is 1-based, we store 0-based
        self.cigar = raw[5]
        self.segments = self.parse_cigar(self.cigar, self.sStart)
        self.sEnd = self.segments[-1].end
        self.flag = SAMRecord.parse_sam_flag(int(raw[1])) # strand can be overwritten by XS:A flag
        # strand as given by the SAM flag; kept as backup for debugging and never
        # touched again, so it survives even if XS:A replaces self.flag below
        self._flag_strand = self.flag.strand
        for x in raw[11:]:
            if x.startswith('NM:i:'): # number of non-matches
                self.num_nonmatches = int(x[5:])
                self.identity = 1. - (self.num_nonmatches * 1. / (self.num_del + self.num_ins + self.num_mat_or_sub))
            elif x.startswith('XS:A:'): # strand information
                _s = x[5:]
                if _s != '?':
                    self.flag = SAMRecord.SAMflag(self.flag.is_paired, _s, self.flag.PE_read_num)

        if ref_len_dict is not None:
            self.sCoverage = (self.sEnd - self.sStart) * 1. / ref_len_dict[self.sID]
            self.sLen = ref_len_dict[self.sID]

        # for minus-strand hits, flip the query coordinates to the other end of the query
        if self.flag.strand == '-' and self.qLen is not None:
            self.qStart, self.qEnd = self.qLen - self.qEnd, self.qLen - self.qStart

        if self.qLen is not None:
            self.qCoverage = (self.qEnd - self.qStart) * 1. / self.qLen

        if query_len_dict is not None: # over write qLen and qCoverage, should be done LAST
            try:
                self.qLen = query_len_dict[self.qID]
            except KeyError: # HACK for blasr's extended qID
                k = self.qID.rfind('/')
                if k < 0:
                    raise Exception("Unable to find qID {0} in the input fasta/fastq!".format(self.qID))
                try:
                    self.qLen = query_len_dict[self.qID[:k]]
                except KeyError:
                    # neither the full ID nor the truncated ID is known; report the full ID
                    raise KeyError("Unable to find qID {0} in the input fasta/fastq!".format(self.qID))
            self.qCoverage = (self.qEnd - self.qStart) * 1. / self.qLen
520 |                 
521 | 


--------------------------------------------------------------------------------
/phasing/io/VariantPhaser.py:
--------------------------------------------------------------------------------
  1 | __author__ = 'etseng@pacb.com'
  2 | 
  3 | import pdb
  4 | from collections import defaultdict, namedtuple, Counter
  5 | from csv import DictReader
  6 | import vcf
  7 | import pysam
  8 | from Bio.Seq import Seq
  9 | from Bio import SeqIO
 10 | from .BioReaders import GMAPSAMReader
 11 | from .coordinate_mapper import get_base_to_base_mapping_from_sam
 12 | 
 13 | 
# Minimal VCF text used as a header template: Haplotypes.write_haplotype_to_vcf()
# writes it to 'template.vcf' and reads it back with vcf.VCFReader so that the
# output writer has header definitions (DP, AF, GT, HQ) to start from.
__VCF_EXAMPLE__ = \
"""
##fileformat=VCFv4.2
##INFO=<ID=DP,Number=1,Type=Integer,Description="Total Depth">
##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##FORMAT=<ID=HQ,Number=2,Type=Integer,Description="Haplotype Quality">
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT
20      1       .       G       A,T     .       PASS    AF=0.5;DB       GT
"""
 24 | 
 25 | def type_fa_or_fq(file):
 26 |     file = file.upper()
 27 |     if file.endswith('.FA') or file.endswith('.FASTA'): return 'fasta'
 28 |     else: return 'fastq'
 29 | 
 30 | 
 31 | class VariantPhaser(object):
 32 |     def __init__(self, vc):
 33 |         """
 34 |         :param vc: MPileUPVariant instance.
 35 |         """
 36 |         self.vc = vc
 37 |         self.min_var_pos = min(vc.variant)  # mininum 0-based position of a called variant
 38 |         self.max_var_pos = max(vc.variant)  # maximum 0-based position of a called variant
 39 |         self.accepted_vars_by_pos = {} # 0-based pos --> list of accepted, (NOT strand sense) base
 40 |         self.count_of_vars_by_pos = {} # 0-based pos --> (NOT strand sense, but ref-based) base --> count
 41 |         self.accepted_pos = [] # sorted list of variant positions (0-based, ref)
 42 | 
 43 |         # process vc.variant which is
 44 |         # dict of 0-based pos --> desc list of (base, count)
 45 |         # ex: {1565: [('a', 49), ('g', 36)]}
 46 |         # lower case means at pos 1565, we expect - strand mapping and
 47 |         # seq base is 'T' on the sense strand
 48 |         # this converts to self.accepted_vars_by_pos[1565] = ['A', 'G']
 49 |         # later, when we are matchin back to transcript seq, need to watch for strand!
 50 |         for pos, vars in vc.variant.items():
 51 |             self.accepted_vars_by_pos[pos] = [_base.upper() for _base,_count in vars]
 52 |             self.count_of_vars_by_pos[pos] = dict((_base.upper(), _count) for _base,_count in vars)
 53 | 
 54 |         self.accepted_pos = list(self.accepted_vars_by_pos.keys())
 55 |         self.accepted_pos.sort()
 56 | 
 57 |         self.haplotypes = Haplotypes(self.accepted_pos, self.vc.ref_base, self.count_of_vars_by_pos)
 58 |         self.seq_hap_info = {} # haplotype assignment, key: (CCS) seqid, value: haplotype index
 59 | 
 60 | 
 61 |     def phase_variant(self, sam_filename, input_fa_or_fq, output_prefix, partial_ok=False):
 62 |         """
 63 |         :param sam_filename: CCS SAM filename. Can be unsorted.
 64 |         :param input_fa_or_fq: Input CCS fasta/fastq filename.
 65 |         :param output_prefix: Output prefix. Writes to xxx.log.
 66 |         :param partial_ok: default False. if True, (CCS) reads don't need to cover all SNP positions.
 67 | 
 68 |         For each alignment:
 69 |         1. discard if did not map to the strand expected
 70 |         2. discard if did not map to the full range of variants (unless <partial_ok> is True)
 71 |         3. discard if at var positions have non-called bases (outliers)
 72 |         """
 73 |         f_log = open(output_prefix+'.log', 'w')
 74 | 
 75 |         seq_dict = SeqIO.to_dict(SeqIO.parse(open(input_fa_or_fq), type_fa_or_fq(input_fa_or_fq)))
 76 |         for r in GMAPSAMReader(sam_filename, True, query_len_dict=dict((k, len(seq_dict[k].seq)) for k in seq_dict)):
 77 |             if r.sID == '*':
 78 |                 f_log.write("Ignore {0} because: unmapped.\n".format(r.qID))
 79 |                 continue
 80 |             if r.flag.strand != self.vc.expected_strand:
 81 |                 f_log.write("Ignore {0} because: strand is {1}.\n".format(r.qID, r.flag.strand))
 82 |                 continue # ignore
 83 |             if not partial_ok and (r.sStart > self.min_var_pos or r.sEnd < self.max_var_pos):
 84 |                 f_log.write("Ignore {0} because: aln too short, from {1}-{2}.\n".format(r.qID, r.sStart+1, r.sEnd))
 85 |                 continue
 86 | 
 87 |             i, msg = self.match_haplotype(r, str(seq_dict[r.qID].seq).upper(), partial_ok)
 88 |             if i is None: # read is rejected for reason listed in <msg>
 89 |                 f_log.write("Ignore {0} because: {1}.\n".format(r.qID, msg))
 90 |                 continue
 91 |             else:
 92 |                 f_log.write("{0} phased: haplotype {1}={2}\n".format(r.qID, i, self.haplotypes[i]))
 93 |                 print("{0} has haplotype {1}:{2}".format(r.qID, i, self.haplotypes[i]))
 94 |                 self.seq_hap_info[r.qID] = i
 95 | 
 96 | 
 97 |     def match_haplotype(self, r, s, partial_ok=False):
 98 |         """
 99 |         Match an alignment record to existing haplotypes or create a new one.
100 |         Helper function for self.phase_variant()
101 |         :param r: CCS alignment (SAM record)
102 |         :param s: CCS sequence (in strand), must be plain str and every base is upper case
103 |         :param partial_ok: default False. if True, (CCS) reads don't need to cover all SNP positions.
104 | 
105 |         :return: (haplotype_index, msg) or (None, msg) if variants don't match w/ called SNPs
106 |         """
107 |         assert type(s) is str and str.isupper(s)
108 |         assert r.flag.strand == self.vc.expected_strand
109 |         # m: mapping of 0-based seq --> 0-based ref position
110 |         # rev_map: mapping of 0-based ref position --> 0-based seq
111 |         m = get_base_to_base_mapping_from_sam(r.segments, r.cigar, r.qStart, r.qEnd, r.flag.strand)
112 |         ref_m = dict((v,k) for k,v in m.items())
113 | 
114 |         # go through each variant
115 |         # <hap> to represent the concatenated string of all variant positions for this seq
116 |         # ex: if there are three var positions, a hap would be "ATG" or "A?G" (if partial_ok is True), etc.
117 |         hap = ''
118 |         impute_later = False
119 |         for ref_pos in self.accepted_pos:
120 |             if ref_pos not in ref_m:
121 |                 if partial_ok: # read does not cover one of the SNP positions, so use "?"
122 |                     hap += "?"
123 |                 else:
124 |                     return None, "Does not have base at ref_pos {0}.\n".format(ref_pos)
125 |             else:
126 |                 base = s[ref_m[ref_pos]]
127 |                 if self.vc.expected_strand == '-': # must convert the base to the rev comp
128 |                     base = str(Seq(base).reverse_complement()).upper()
129 |                 if base in self.accepted_vars_by_pos[ref_pos]:
130 |                     hap += base
131 |                 else: # contains a base at a variant position that is not called. Try to impute.
132 |                     hap += base
133 |                     impute_later = True
134 | 
135 |         if all(b=='?' for b in hap):
136 |             return None, "Does not cover any variant base."
137 | 
138 |         if impute_later:
139 |             impute_i = self.haplotypes.impute_haplotype(hap, min_score=3)
140 |             if impute_i is None:
141 |                 return None, "Seq {0} contained non-called variant. Impute failed.\n".format(hap)
142 |             else:
143 |                 return impute_i, "IMPUTED"
144 |         return self.haplotypes.match_or_add_haplotype(hap_string=hap)
145 | 
146 | 
147 | 
def phase_isoforms(read_stat_filename, seqids, phaser):
    """
    Tally, per isoform, how many full-length CCS reads support each haplotype.

    :param read_stat_filename: the tab-delimited .read_stat file that has columns <id>, <is_fl> and <pbid>,
                               where <id> is CCS id and <pbid> is PB.X.Y
    :param seqids: CCS IDs that were used to create the haplotypes.
    :param phaser: VariantPhaser object that contains the haplotype and seqid->haplotype information.

    :return: dict of isoform --> dict of haplotype index --> count, ex: {'PB.45.1': {0:10, 1:20}}
             which means PB.45.1 has haplotype 0 supported by 10 CCS reads and hap 1 supported by 20 CCS reads.
             Isoforms with no phased member are omitted.

    *NOTE* currently uses FL CCS reads only (even if the SNPs may have been called by FL+nFL CCS SAM)
    """
    # from read stat, gather which isoforms have which (CCS) seq members.
    isoforms = defaultdict(list) # key: PB.X.Y, value: list of seqid members
    with open(read_stat_filename) as f_stat:
        for r in DictReader(f_stat, delimiter='\t'):
            if r['id'] in seqids and r['is_fl'] == 'Y':
                isoforms[r['pbid']].append(r['id'])

    # for each isoform, look at the CCS membership to know which haplotypes are expressed
    result = {} # isoform --> dict of haplotype_index --> CCS count supporting it
    for _iso, _seqids in isoforms.items():
        # some CCS (seqids) may not have been used by the phaser, so account for that
        tally = Counter(phaser.seq_hap_info[seqid] for seqid in _seqids
                        if seqid in phaser.seq_hap_info)
        if tally:
            result[_iso] = dict(tally)
    return result
175 | 
176 | 
class Haplotypes(object):
    """
    Storing haplotypes for a loci.

    self.haplotypes[i] is the i-th haplotype.
    if N = len(self.haplotypes[i]), then there are N variants along the loci.
    self.hap_var_positions[j] means that the j-th variant corresponds to (0-based) position on the ref genome.
    A '?' in a haplotype string means the haplotype does not cover that variant position.
    """
    def __init__(self, var_positions, ref_at_pos, count_of_vars_by_pos):
        """
        :param var_positions: sorted list of (0-based) variant positions
        :param ref_at_pos: dict of (0-based) variant position --> ref base at this position
        :param count_of_vars_by_pos: 0-based pos --> (NOT strand sense, but ref-based) base --> count
        """
        self.haplotypes = [] # haplotypes, where haplotypes[i] is the i-th distinct haplotype of all var concat
        self.hap_var_positions = var_positions
        self.ref_at_pos = ref_at_pos # dict of (0-based) pos --> ref base
        self.alt_at_pos = None # init: None, later: dict of (0-based) pos --> unique list of alt bases
        self.count_of_vars_by_pos = count_of_vars_by_pos
        self.haplotype_vcf_index = None # init: None, later: list (by hap index) of dict (0-based) var pos --> phase (0 for ref, 1+ for alt, '.' for not covered)

        # sanity check: all variant positions must be present
        self.sanity_check()

    def __getitem__(self, ith):
        """
        Returns the <i>-th haplotype
        """
        return self.haplotypes[ith]

    def __str__(self):
        return """
        var positions: {pp}
        haplotypes: \n{h}
        """.format(pp=",".join(map(str,self.hap_var_positions)),
                   h="\n".join(self.haplotypes))

    def sanity_check(self):
        """
        Sanity check the following:
        -- variant positions are properly recorded and concordant
        -- alt bases are truly alt and unique
        -- all haplotypes are the same length

        :raises AssertionError: if any of the checks fails
        """
        for pos in self.hap_var_positions:
            assert pos in self.ref_at_pos

        if self.alt_at_pos is not None:
            for pos, alts in self.alt_at_pos.items():
                # ref base must not be in alt
                assert self.ref_at_pos[pos] not in alts
                # alt bases must be unique
                assert len(alts) == len(set(alts))

        if len(self.haplotypes) >= 1:
            n = len(self.haplotypes[0])
            assert n == len(self.hap_var_positions)
            for hap_str in self.haplotypes[1:]:
                assert len(hap_str) == n


    def match_or_add_haplotype(self, hap_string):
        """
        If <hap_string> is an existing haplotype, return the index.
        Otherwise, add to known haplotypes and return the new index.

        :return: <index>, "FOUND" or "NEW"
        """
        try:
            return self.haplotypes.index(hap_string), "FOUND"
        except ValueError:
            self.haplotypes.append(hap_string)
            return len(self.haplotypes) - 1, "NEW"

    def impute_haplotype(self, hap_string, min_score):
        """
        :param hap_string: a hap string, possibly with '?'s or non-called bases.
        :param min_score: minimum similarity with existing haplotype to accept assignment
        :return: <index> of an existing haplotype, or None if not sufficiently matched

        Impute haplotype and only return a match if:
        (a) score (similarity) is >= min_score
        (b) the matching score for the best one is higher than the second best match

        The score is the number of positions where both strings carry the same base.
        Positions that are '?' (not covered) are never counted: two unknowns agreeing
        is not evidence that the haplotypes are similar.
        """
        scored = [] # list of (score, haplotype index), only for haplotypes with score > 0
        for i, known_hap in enumerate(self.haplotypes):
            score = sum(1 for a, b in zip(hap_string, known_hap) if a == b and a != '?')
            if score > 0:
                scored.append((score, i))
        if not scored:
            return None
        # stable sort on score only, so ties keep haplotype index order
        scored.sort(key=lambda x: x[0], reverse=True)
        best_score, best_index = scored[0]
        if best_score >= min_score and (len(scored) == 1 or best_score > scored[1][0]):
            return best_index
        return None

    def get_haplotype_vcf_assignment(self):
        """
        Must be called before self.write_haplotype_to_vcf()
        This is preparing for writing out VCF. We need to know, for each variant position,
        the ref base (already filled in self.ref_at_pos) and the alt bases (self.alt_at_pos).
        For each haplotype in (self.haplotypes), we need to know whether the i-th variant is the
        ref (index 0), or some alt base (index 1 and onwards).

        Populates two variables:

        self.haplotype_vcf_index: hap index --> pos --> phase index (0 for ref, 1+ for alt, '.' if not covered)
        self.alt_at_pos: dict of <0-based pos> --> alt bases (that is, not ref) at this position
        """
        self.haplotype_vcf_index = [{} for _ in self.haplotypes]
        self.alt_at_pos = {}

        # what happens in the case of partial phasing
        # ex: self.haplotypes[0] = "A?G", this means when it comes to the second pos, pos2,
        # in the VCF we would want to write out .|. for diploid, . for haploid, etc
        # so let's set self.haplotype_vcf_index[0][pos2] = '.' to indicate that

        for i, pos in enumerate(self.hap_var_positions):
            ref = self.ref_at_pos[pos]
            # need to go through the haplotype bases, if ref is already represented, then don't put it in alt
            alts = []
            self.alt_at_pos[pos] = alts
            for hap_i, hap_str in enumerate(self.haplotypes):
                base = hap_str[i]
                if base == '?': # means this haplotype does not cover this position!
                    phase = '.'
                elif base == ref: # is the ref base
                    phase = 0
                else: # is an alt base, register it if it's not already there
                    if base not in alts:
                        alts.append(base)
                    phase = alts.index(base) + 1 # always +1, because alt starts at 1 (0 is ref)
                self.haplotype_vcf_index[hap_i][pos] = phase
            # in the case where partial_ok=False, it's possible some alt are never presented by a haplotype
            # we must check that all variants are presented here
            for _base in self.count_of_vars_by_pos[pos]:
                if (_base not in self.ref_at_pos[pos]) and (_base not in alts):
                    alts.append(_base)


    def write_haplotype_to_vcf(self, fake_genome_mapping_filename, isoform_tally, output_prefix):
        """
        Write <output_prefix>.vcf and <output_prefix>.human_readable.txt.

        The following functions must first be called first:
        -- self.get_haplotype_vcf_assignment

        :param fake_genome_mapping_filename: comma-delimited file of <fake pos>,<chr>,<ref pos> (0-based positions)
        :param isoform_tally: dict of isoform --> dict of haplotype index --> count; isoforms become the VCF samples
        :param output_prefix: prefix for the two output files
        :raises Exception: if self.get_haplotype_vcf_assignment() has not been called

        Side effect: a file named 'template.vcf' is (over)written in the current working directory.
        """
        if self.haplotype_vcf_index is None or self.alt_at_pos is None:
            raise Exception("Must call self.get_haplotype_vcf_assignment() first!")

        self.sanity_check()

        name_isoforms = sorted(isoform_tally.keys())

        # write a fake VCF example so we can read the headers in
        with open('template.vcf', 'w') as f:
            f.write(__VCF_EXAMPLE__)
        with open('template.vcf') as f_template:
            reader = vcf.VCFReader(f_template)
            reader.samples = name_isoforms
            f_vcf = vcf.Writer(open(output_prefix+'.vcf', 'w'), reader)

        try:
            # human readable text:
            # first line: assoc VCF filename
            # second line: haplotype, list of sorted isoforms
            # third line onwards: haplotype and assoc count
            with open(output_prefix+'.human_readable.txt', 'w') as f_human:
                f_human.write("Associated VCF file: {0}.vcf\n".format(output_prefix))
                f_human.write("haplotype\t{samples}\n".format(samples="\t".join(name_isoforms)))
                for hap_index, hap_str in enumerate(self.haplotypes):
                    f_human.write(hap_str)
                    for _iso in name_isoforms:
                        if hap_index in isoform_tally[_iso]:
                            f_human.write("\t{0}".format(isoform_tally[_iso][hap_index]))
                        else:
                            f_human.write("\t0")
                    f_human.write('\n')

            # read fake genome mapping file
            fake_map = {} # 0-based position on fake --> (chr, 0-based ref position)
            with open(fake_genome_mapping_filename) as f:
                for line in f:
                    fake_pos, ref_chr, ref_pos = line.strip().split(',')
                    fake_map[int(fake_pos)] = (ref_chr, int(ref_pos))

            # for each position, write out the ref and alt bases
            # then fill in for each isoform (aka "sample"):
            #  if this isoform only shows one allele, then it's just that allele (0 for ref, 1+ otherwise)
            #  if this isoform shows 2+ allele, then the first allele is indicated by self.haplotypes[0]
            samp_ft = vcf.model.make_calldata_tuple(['GT', 'HQ'])
            for pos in self.hap_var_positions:
                ref_chr, ref_pos = fake_map[pos]
                total_count = sum(self.count_of_vars_by_pos[pos].values())
                alt_freq = ["{0:.2f}".format(self.count_of_vars_by_pos[pos][b]*1./total_count) for b in self.alt_at_pos[pos]]
                rec = vcf.model._Record(CHROM=ref_chr,
                                  POS=ref_pos+1,
                                  ID='.',
                                  REF=self.ref_at_pos[pos],
                                  ALT=[vcf.model._Substitution(b) for b in self.alt_at_pos[pos]],
                                  QUAL='.',
                                  FILTER='PASS',
                                  INFO={'AF':alt_freq, 'DP':total_count},
                                  FORMAT="GT:HQ",
                                  sample_indexes=None)
                rec.samples = []
                for _iso in name_isoforms:
                    # isoform_tally[_iso] is a dict of haplotype index --> count
                    # the phase for the base at this pos is thus haplotype_vcf_index[hap_index][pos]
                    # we always need to show the phases in haplotype index order sorted
                    hap_indices = sorted(isoform_tally[_iso].keys())
                    genotype = "|".join(str(self.haplotype_vcf_index[hap_index][pos]) for hap_index in hap_indices)
                    counts = ",".join(str(isoform_tally[_iso][hap_index]) for hap_index in hap_indices)
                    rec.samples.append(vcf.model._Call(rec, _iso, samp_ft(*[genotype, counts])))
                f_vcf.write_record(rec)
        finally:
            # make sure the VCF stream is flushed and closed even if something above failed
            f_vcf.close()
403 | 
404 | 
def get_base_to_base_mapping_from_aligned_pairs(reftuple, qLen, strand):
    """
    Build a query-to-reference base mapping from a list of aligned pairs.

    :param reftuple: sequence of (query pos, ref pos) pairs, where either member may be None
                     (presumably the output of pysam's get_aligned_pairs() -- confirm with caller)
    :param qLen: query length, used only to flip query positions when <strand> is '-'
    :param strand: '+' or '-'

    Returns: dict of 0-based query position --> 0-based ref position.
    A query base with no ref position of its own is mapped to the most recently seen
    ref position (or to the ref position of the first pair, which may be None, if none
    has been seen yet). Pairs without a query position are skipped.
    An empty <reftuple> yields an empty dict.
    """
    if len(reftuple) == 0:
        return {}

    cur_genome_loc = reftuple[0][1]

    mapping = {}
    for qpos, rpos in reftuple:
        if qpos is not None:
            mapping[qpos] = rpos if rpos is not None else cur_genome_loc
        if rpos is not None:
            cur_genome_loc = rpos

    if strand == '-':
        # query positions are re-expressed counting from the other end of the query
        mapping = dict((qLen-1-k, v) for k, v in mapping.items())

    return mapping
426 | 
427 | 
428 | class MagVariantPhaser(object):
429 |     def __init__(self, vc):
430 |         """
431 |         :param vc: MPileUPVariant instance.
432 |         """
433 |         self.vc = vc
434 |         self.min_var_pos = min(vc.variant)  # mininum 0-based position of a called variant
435 |         self.max_var_pos = max(vc.variant)  # maximum 0-based position of a called variant
436 |         self.accepted_vars_by_pos = {} # 0-based pos --> list of accepted, (NOT strand sense) base
437 |         self.count_of_vars_by_pos = {} # 0-based pos --> (NOT strand sense, but ref-based) base --> count
438 |         self.accepted_pos = [] # sorted list of variant positions (0-based, ref)
439 | 
440 |         # process vc.variant which is
441 |         # dict of 0-based pos --> desc list of (base, count)
442 |         # ex: {1565: [('a', 49), ('g', 36)]}
443 |         # lower case means at pos 1565, we expect - strand mapping and
444 |         # seq base is 'T' on the sense strand
445 |         # this converts to self.accepted_vars_by_pos[1565] = ['A', 'G']
446 |         # later, when we are matchin back to transcript seq, need to watch for strand!
447 |         for pos, vars in vc.variant.items():
448 |             self.accepted_vars_by_pos[pos] = [_base.upper() for _base,_count in vars]
449 |             self.count_of_vars_by_pos[pos] = dict((_base.upper(), _count) for _base,_count in vars)
450 | 
451 |         self.accepted_pos = list(self.accepted_vars_by_pos.keys())
452 |         self.accepted_pos.sort()
453 | 
454 |         self.haplotypes = MagHaplotypes(self.accepted_pos, [self.vc.ref_name[p] for p in self.accepted_pos], self.vc.ref_base, self.count_of_vars_by_pos)
455 |         self.seq_hap_info = {} # haplotype assignment, key: (CCS) seqid, value: haplotype index
456 | 
457 | 
458 |     def phase_variant(self, sam_filename, coordstr, output_prefix, partial_ok=False):
459 |         """
460 |         :param sam_filename: CCS SAM filename. Can be unsorted.
461 |         :param coordstr: list of [contig, start, end]
462 |         :param output_prefix: Output prefix. Writes to xxx.log.
463 |         :param partial_ok: default False. if True, (CCS) reads don't need to cover all SNP positions.
464 | 
465 |         For each alignment:
466 |         1. discard if did not map to the strand expected
467 |         2. discard if did not map to the full range of variants (unless <partial_ok> is True)
468 |         3. discard if at var positions have non-called bases (outliers)
469 |         """
470 |         f_log = open(output_prefix+'.log', 'a+')
471 | 
472 |         contig, start, end = coordstr
473 | 
474 |         secondary_align_counts = 0
475 |         tot_align_counts = 0
476 |         with pysam.AlignmentFile(sam_filename, 'rb') as samfile:
477 |             for s in samfile.fetch(contig, start, end):
478 |                 tot_align_counts += 1
479 |                 if s.reference_name == '*':
480 |                     f_log.write("Ignore {0} because: unmapped.\n".format(s.query_name))
481 |                     continue
482 |                 if not partial_ok and (s.reference_start > self.min_var_pos or s.reference_end < self.max_var_pos):
483 |                     f_log.write("Ignore {0} because: aln too short, from {1}-{2}.\n".format(s.query_name, s.referenc_start+1, s.reference_end))
484 |                     continue
485 |                 if s.is_secondary:
486 |                     secondary_align_counts += 1
487 |                     continue
488 |                 seqstr = s.query_sequence.upper()
489 |                 i, msg = self.match_haplotype(s, seqstr, partial_ok)
490 |                 if i is None: # read is rejected for reason listed in <msg>
491 |                     f_log.write("Ignore {0} because: {1}.\n".format(s.query_name, msg))
492 |                     continue
493 |                 else:
494 |                     f_log.write("{0} phased: haplotype {1}={2}\n".format(s.query_name, i, self.haplotypes[i]))
495 |                     print("{0} has haplotype {1}:{2}".format(s.query_name, i, self.haplotypes[i]))
496 |                     self.seq_hap_info[s.query_name] = i
497 |         f_log.write(f'Encountered {secondary_align_counts} out of {tot_align_counts} read alignments')
498 | 
499 | 
500 |     def match_haplotype(self, r, s, partial_ok=False):
501 |         """
502 |         Match an alignment record to existing haplotypes or create a new one.
503 |         Helper function for self.phase_variant()
504 |         :param r: CCS alignment (pysam record)
505 |         :param s: CCS sequence (in strand), must be plain str and every base is upper case
506 |         :param partial_ok: default False. if True, (CCS) reads don't need to cover all SNP positions.
507 | 
508 |         :return: (haplotype_index, msg) or (None, msg) if variants don't match w/ called SNPs
509 |         """
510 |         try:
511 |             assert type(s) is str and str.isupper(s)
512 |         except Exception as e:
513 |             print(f'exception: {s}')
514 |         # m: mapping of 0-based seq --> 0-based ref position
515 |         # rev_map: mapping of 0-based ref position --> 0-based seq
516 |         strand = '-' if r.is_reverse else '+'
517 |         m = get_base_to_base_mapping_from_aligned_pairs(r.get_aligned_pairs(), len(r.query_sequence), strand)
518 |         ref_m = dict((v,k) for k,v in m.items())
519 | 
520 |         # go through each variant
521 |         # <hap> to represent the concatenated string of all variant positions for this seq
522 |         # ex: if there are three var positions, a hap would be "ATG" or "A?G" (if partial_ok is True), etc.
523 |         hap = ''
524 |         impute_later = False
525 |         for ref_pos in self.accepted_pos:
526 |             if ref_pos not in ref_m:
527 |                 if partial_ok: # read does not cover one of the SNP positions, so use "?"
528 |                     hap += "?"
529 |                 else:
530 |                     return None, "Does not have base at ref_pos {0}.\n".format(ref_pos)
531 |             else:
532 |                 base = s[ref_m[ref_pos]]
533 |                 if base in self.accepted_vars_by_pos[ref_pos]:
534 |                     hap += base
535 |                 else: # contains a base at a variant position that is not called. Try to impute.
536 |                     hap += base
537 |                     impute_later = True
538 | 
539 |         if all(b=='?' for b in hap):
540 |             return None, "Does not cover any variant base."
541 | 
542 |         if impute_later:
543 |             impute_i = self.haplotypes.impute_haplotype(hap, min_score=3)
544 |             if impute_i is None:
545 |                 return None, "Seq {0} contained non-called variant. Impute failed.\n".format(hap)
546 |             else:
547 |                 return impute_i, "IMPUTED"
548 |         return self.haplotypes.match_or_add_haplotype(hap_string=hap)
549 | 
550 | 
class MagHaplotypes(object):
    """
    Store the haplotypes observed at one locus.

    self.haplotypes[i] is the i-th haplotype: a string with one character per
    variant position, where '?' marks a position the haplotype does not cover.
    If N = len(self.haplotypes[i]), then there are N variants along the locus.
    self.hap_var_positions[j] is the (0-based) reference position of the j-th variant.
    """
    def __init__(self, var_positions, chrs, ref_at_pos, count_of_vars_by_pos):
        """
        :param var_positions: sorted list of (0-based) variant positions
        :param chrs: contig names, stored as-is in self.chrs
        :param ref_at_pos: dict of (0-based) variant position --> ref base at this position
        :param count_of_vars_by_pos: 0-based pos --> (NOT strand sense, but ref-based) base --> count
        :raises AssertionError: if a variant position has no entry in <ref_at_pos>
        """
        self.haplotypes = [] # haplotypes, where haplotypes[i] is the i-th distinct haplotype of all var concat
        self.hap_var_positions = var_positions
        self.ref_at_pos = ref_at_pos # dict of (0-based) pos --> ref base
        self.alt_at_pos = None # init: None, later: dict of (0-based) pos --> unique list of alt bases
        self.count_of_vars_by_pos = count_of_vars_by_pos
        self.haplotype_vcf_index = None # init: None, later: list (by hap index) of dict (0-based) var pos --> phase (0 for ref, 1+ for alt, '.' if not covered)
        self.chrs = chrs # contig names where chrs[i] is the i-th contig name

        # sanity check: all variant positions must be present
        self.sanity_check()

    def __getitem__(self, ith):
        """
        Returns the <i>-th haplotype
        """
        return self.haplotypes[ith]

    def __str__(self):
        return """
        var positions: {pp}
        haplotypes: \n{h}
        """.format(pp=",".join(map(str,self.hap_var_positions)),
                   h="\n".join(self.haplotypes))

    def sanity_check(self):
        """
        Sanity check the following:
        -- variant positions are properly recorded and concordant
        -- alt bases are truly alt and unique
        -- all haplotypes are the same length

        :raises AssertionError: on the first violated condition. The error is raised
            explicitly (not via `assert`) so the checks still run under `python -O`.
        """
        for pos in self.hap_var_positions:
            if pos not in self.ref_at_pos:
                raise AssertionError("variant position {0} has no ref base".format(pos))

        if self.alt_at_pos is not None:
            for pos, alts in self.alt_at_pos.items():
                # ref base must not be in alt
                if self.ref_at_pos[pos] in alts:
                    raise AssertionError("ref base listed as alt at position {0}".format(pos))
                # alt bases must be unique
                if len(alts) != len(set(alts)):
                    raise AssertionError("duplicate alt bases at position {0}".format(pos))

        # every haplotype must have exactly one character per variant position
        n = len(self.hap_var_positions)
        for hap_str in self.haplotypes:
            if len(hap_str) != n:
                raise AssertionError(
                    "haplotype {0} does not have {1} variant positions".format(hap_str, n))


    def match_or_add_haplotype(self, hap_string):
        """
        If <hap_string> is an existing haplotype, return the index.
        Otherwise, add to known haplotypes and return the new index.

        :return: <index>, "FOUND" or "NEW"
        """
        try:
            return self.haplotypes.index(hap_string), "FOUND"
        except ValueError:
            self.haplotypes.append(hap_string)
            return len(self.haplotypes) - 1, "NEW"

    def impute_haplotype(self, hap_string, min_score):
        """
        :param hap_string: a hap string, possibly with '?'s and/or non-called bases.
        :param min_score: minimum similarity with existing haplotype to accept assignment
        :return: <index> of an existing haplotype, or None if not sufficiently matched

        The score against a known haplotype is the number of positions where both
        carry the same base. A '?' in <hap_string> never counts as a match, even when
        the known haplotype also has '?' there: two uncovered positions are not
        evidence that the haplotypes agree.

        Impute haplotype and only return a match if:
        (a) score (similarity) is >= min_score
        (b) the matching score for the best one is higher than the second best match
        """
        scored = []  # list of (score, haplotype index), only for score > 0
        for index, known in enumerate(self.haplotypes):
            score = sum(1 for mine, theirs in zip(hap_string, known)
                        if mine != '?' and mine == theirs)
            if score > 0:
                scored.append((score, index))
        if not scored:
            return None
        # sort on score only; the stable sort keeps earlier haplotypes first among ties
        scored.sort(key=lambda pair: pair[0], reverse=True)
        best_score, best_index = scored[0]
        is_unique_best = len(scored) == 1 or best_score > scored[1][0]
        if best_score >= min_score and is_unique_best:
            return best_index
        return None

    def get_haplotype_vcf_assignment(self):
        """
        Must be called before self.write_haplotype_to_humanreadable()
        This is preparing for writing out VCF. We need to know, for each variant position,
        the ref base (already filled in self.ref_at_pos) and the alt bases (self.alt_at_pos).
        For each haplotype in (self.haplotypes), we need to know whether the i-th variant is the
        ref (index 0), or some alt base (index 1 and onwards).

        Populates two variables:

        self.haplotype_vcf_index: hap index --> pos --> phase index (0 for ref, 1+ for alt, '.' if not covered)
        self.alt_at_pos: dict of <0-based pos> --> alt bases (i.e. not the ref) at this position
        """
        self.haplotype_vcf_index = [{} for _ in self.haplotypes]
        self.alt_at_pos = {}

        # what happens in the case of partial phasing
        # ex: self.haplotypes[0] = "A?G", this means when it comes to the second pos, pos2,
        # in the VCF we would want to write out .|. for diploid, . for haploid, etc
        # so let's set self.haplotype_vcf_index[0][pos2] = '.' to indicate that

        for i, pos in enumerate(self.hap_var_positions):
            ref = self.ref_at_pos[pos]
            # alt bases are collected in order of first appearance among the haplotypes
            alts = []
            self.alt_at_pos[pos] = alts
            for hap_i, hap_str in enumerate(self.haplotypes):
                base = hap_str[i]
                if base == '?':  # means this haplotype does not cover this position!
                    phase = '.'
                elif base == ref:  # is the ref base
                    phase = 0
                else:  # is an alt base, register it if it's not already there
                    if base not in alts:
                        alts.append(base)
                    phase = alts.index(base) + 1  # always +1, because alt starts at 1 (0 is ref)
                self.haplotype_vcf_index[hap_i][pos] = phase
            # in the case where partial_ok=False, it's possible some alt are never presented by a haplotype
            # we must check that all variants are presented here
            for _base in self.count_of_vars_by_pos[pos]:
                if _base != ref and _base not in alts:
                    alts.append(_base)


    def write_haplotype_to_humanreadable(self, contig, f_human1, f_human2, f_human3, seq_hap_info):
        """
        The following function must be called first:
        -- self.get_haplotype_vcf_assignment

        :param contig: contig name written into every line of f_human1 and f_human2
        :param f_human1: human readable tab file handle, one SNP per line
            (haplotype, hapIdx, contig, 1-based pos, 1-based varIdx, REF/ALT<k>, count)
        :param f_human2: human readable tab file handle, one allele per line
            (haplotype, hapIdx, contig, number of reads assigned)
        :param f_human3: human readable tab file handle, CCS read to haplotype assignment, one read per line
            (read_id, haplotype, hapIdx)
        :param seq_hap_info: dict of read id --> haplotype index
        :raises Exception: if self.get_haplotype_vcf_assignment() has not been called
        """
        if self.haplotype_vcf_index is None or self.alt_at_pos is None:
            raise Exception("Must call self.get_haplotype_vcf_assignment() first!")

        self.sanity_check()

        # No header lines are written; the column layout is given in the docstring.

        hap_count = Counter()
        for ccs_id, hap_index in seq_hap_info.items():
            hap_count[hap_index] += 1
            hap_str = self.haplotypes[hap_index]
            f_human3.write(f'{ccs_id}\t{hap_str}\t{hap_index}\n')

        for hap_index, hap_str in enumerate(self.haplotypes):
            f_human2.write(f'{hap_str}\t{hap_index}\t{contig}\t{hap_count[hap_index]}\n')
            for pos_index, pos in enumerate(self.hap_var_positions):
                i = self.haplotype_vcf_index[hap_index][pos]
                if i == '.':  # means this haplotype does not include this position, skip!
                    continue
                assert isinstance(i, int)  # internal invariant: phase is 0, 1+ or '.'
                if i == 0:
                    base = self.ref_at_pos[pos]
                    label = "REF"
                else:
                    base = self.alt_at_pos[pos][i - 1]
                    label = "ALT" + str(i - 1)  # ALT labels are 0-based: first alt is ALT0
                count = self.count_of_vars_by_pos[pos][base]
                f_human1.write(
                    f'{hap_str}\t{hap_index}\t{contig}\t{pos + 1}\t{pos_index + 1}\t{label}\t{count}\n')
742 | 
743 | 


--------------------------------------------------------------------------------