├── phasing ├── __init__.py ├── io │ ├── __init__.py │ ├── SAMMPileUpReader.py │ ├── coordinate_mapper.py │ ├── MPileUpVariantCaller.py │ ├── BioReaders.py │ └── VariantPhaser.py ├── utils │ └── paint_bam_post_phaser.py └── mag_phaser.py ├── MagPhase.conda_env.yml ├── setup.py ├── LICENSE └── README.md /phasing/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'etseng@pacb.com' 2 | -------------------------------------------------------------------------------- /phasing/io/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'etseng@pacb.com' 2 | -------------------------------------------------------------------------------- /MagPhase.conda_env.yml: -------------------------------------------------------------------------------- 1 | name: MagPhase.env 2 | channels: 3 | - conda-forge 4 | - bioconda 5 | - anaconda 6 | - r 7 | - defaults 8 | dependencies: 9 | - bcbiogff 10 | - biopython 11 | - bx-python 12 | - numpy 13 | - psutil 14 | - pybedtools 15 | - pysam 16 | - python>=3.7.6 17 | - pyvcf 18 | - samtools 19 | - scipy 20 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, Extension, find_packages 2 | import sys 3 | 4 | __author__ = "etseng@pacb.com" 5 | version = "v1.0.0" 6 | 7 | setup( 8 | name = 'magphase', 9 | version=version, 10 | author='Elizabeth Tseng', 11 | author_email='etseng@pacb.com', 12 | zip_safe=False, 13 | packages = ['phasing.io'], 14 | 15 | install_requires=[ 16 | 'biopython', 17 | 'bx-python>=0.7.3', 18 | 'scipy', 19 | 'pysam' 20 | ], 21 | scripts = [ 22 | 'phasing/mag_phaser.py', 23 | 'phasing/utils/paint_bam_post_phaser.py' 24 | ], 25 | ) 26 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2021, Pacific Biosciences of California, Inc. 2 | 3 | All rights reserved. 4 | 5 | Redistribution and use in source and binary forms, with or without modification, are permitted (subject to the limitations in the disclaimer below) provided that the following conditions are met: 6 | 7 | * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 8 | 9 | * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 10 | 11 | * Neither the name of Pacific Biosciences nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 12 | 13 | NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 14 | 15 | 16 | -------------------------------------------------------------------------------- /phasing/utils/paint_bam_post_phaser.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import os, sys 4 | from csv import DictReader 5 | import pysam 6 | 7 | def paint_bam_post_phaser(input_bam, output_bam, read_info, chrom, start, end): 8 | reader = pysam.AlignmentFile(input_bam, 'rb') 9 | fout = pysam.AlignmentFile(output_bam, 'wb', header=reader.header) 10 | for r in reader.fetch(chrom, start, end): 11 | d = r.to_dict() 12 | newtags = [] 13 | for k in d['tags']: 14 | if not k.startswith('RG:Z'): 15 | newtags.append(k) 16 | if r.qname not in read_info: 17 | newtags.append('RG:Z:unassigned') 18 | else: 19 | hapstr = read_info[r.qname] 20 | newtags.append('RG:Z:' + hapstr) 21 | d['tags'] = newtags 22 | fout.write(pysam.AlignedSegment.from_dict(d, r.header)) 23 | fout.close() 24 | 25 | if __name__ == "__main__": 26 | from argparse import ArgumentParser 27 | parser = ArgumentParser() 28 | parser.add_argument("input_bam") 29 | parser.add_argument("output_bam") 30 | parser.add_argument("read_hap_info", help="Human readable read-to-hap info file from Isophase/Magphase output") 31 | parser.add_argument("-c", "--chrom", required=True, help='Chromosome') 32 | parser.add_argument("-s", "--start", required=True, type=int, help="Start location") 33 | parser.add_argument("-e", "--end", required=True, type=int, help="End location") 34 | 35 | args = parser.parse_args() 36 | 37 | read_info = {} 38 | for r in DictReader(open(args.read_hap_info), delimiter='\t'): 39 | read_info[r['read_id']] = r['haplotype'] 40 | 41 | paint_bam_post_phaser(args.input_bam, args.output_bam, read_info, args.chrom, args.start, args.end) -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # MagPhase 2 | Phasing for metagenomics using PacBio long reads 3 | 4 | Current Version (07/13/2021): MagPhase v1.0 5 | 6 | 7 | ## What is MagPhase? 8 | 9 | MagPhase is for phasing of metagenomics data using long reads. 10 | 11 | MagPhase is a modified version of [IsoPhase](https://github.com/Magdoll/cDNA_Cupcake/wiki/IsoPhase:-Haplotyping-using-Iso-Seq-data) which was originally designed for isoform-level phasing of PacBio Iso-Seq (full-length transcript sequencing) data. 12 | 13 | ## How MagPhase works 14 | 15 | ![](https://github.com/Magdoll/images_public/blob/master/IsoPhase_MagPhase/magphase_workflow_for_DerekPaper.png?raw=true) 16 | 17 | MagPhase takes an alignment BAM file of HiFi reads aligned to the assembled contigs and a BED file that denotes the regions to phase. 18 | 19 | For each region, individual SNPs are called. Then, reads are used to infer the "haplotypes" (or lineages). 20 | 21 | The output of MagPhase consists of individual SNP information and the inferred list of haplotypes. 22 | 23 | 24 | ## Requirements & Installation 25 | 26 | ### Prerequisites 27 | 28 | * Python (3.7+) 29 | * minimap2 30 | 31 | ### Python-related libraries 32 | 33 | * biopython 34 | * bx-python 35 | * scipy 36 | * pysam 37 | * pyvcf 38 | 39 | ### Installation using (Ana)Conda 40 | 41 | We recommend using [Anaconda](https://www.anaconda.com/products/individual) to set up your conda environment. Currently only Linux environments are supported. 42 | 43 | (1) Install Conda Environment 44 | 45 | ``` 46 | export PATH=$PATH:/bin 47 | conda -V 48 | conda update conda 49 | ``` 50 | 51 | (2) Clone the Github repo and install using the yml script 52 | 53 | ``` 54 | git clone https://github.com/Magdoll/MagPhase.git 55 | cd MagPhase 56 | conda env create -f MagPhase.conda_env.yml 57 | source activate MagPhase.env 58 | ``` 59 | 60 | (3) Once you have activated the virtual environment, you should see your prompt changing to something like this: 61 | 62 | ``` 63 | (MagPhase.env)$ 64 | ``` 65 | 66 | (4) Compile and install MagPhase 67 | 68 | ``` 69 | (MagPhase.env)$ python setup.py build 70 | (MagPhase.env)$ python setup.py install 71 | ``` 72 | 73 | 74 | ## Example Usage 75 | 76 | The usage for `mag_phaser.py` is as follows: 77 | 78 | ``` 79 | $ mag_phaser.py -h 80 | usage: mag_phaser.py [-h] -a ASSEMBLY -b BAMFILE -o OUTPUT -g GENES [-p PVAL_CUTOFF] [--bhFDR BHFDR] 81 | 82 | optional arguments: 83 | -h, --help show this help message and exit 84 | -a ASSEMBLY, --assembly ASSEMBLY 85 | The mag assembly file in fasta format 86 | -b BAMFILE, --bamfile BAMFILE 87 | Aligned reads in bam file format [full path needed!] 88 | -o OUTPUT, --output OUTPUT 89 | output prefix 90 | -g GENES, --genes GENES 91 | SCG gene bed file 92 | -p PVAL_CUTOFF, --pval_cutoff PVAL_CUTOFF 93 | P value cutoff for variant calls 94 | --bhFDR BHFDR FDR to be used for the Benjamini–Hochberg correction. Default: None (not used). 95 | 96 | ``` 97 | 98 | where `-a` provides MAG assembly contig fasta file. `-b` provides the aligned HiFi reads to the contig fasta. `-g` provides a BED file that contains the individual regions to be phased. 99 | 100 | `-p` and `--bhFDR` controls the p-value cutoff for SNP calling. It is recommended that you use the Benjamini–Hochberg correction for better SNP detection (since correction for multiple testing can drastically reduce number of SNPs called). We recommend using `--bhFDR 0.01` for general metagenomics applications. 101 | 102 | 103 | ## Output Interpretation 104 | 105 | An example run: 106 | 107 | ``` 108 | mag_phaser.py -a all_contigs.fasta -b all_contigs.bubbles.ccs.filtered.sorted.bam -g 1377.shortmaps.bed --bhFDR 0.01 -o 1377.strain 109 | ``` 110 | 111 | will produce the following files: 112 | 113 | ``` 114 | 1377.strain.human_readable_by_hap.txt 115 | 1377.strain.human_readable_by_pos.txt 116 | 1377.strain.human_readable_by_read.txt 117 | ``` 118 | 119 | Within a region (as provided by the `-g` BED file), if there were phasing results (note: note all regions can be phased, as there could be no SNPs present), the haplotypes are represetned as a string of concatenated SNPs. For example, if there are three SNPs in this region at genomic position 101, 150, and 220, and there are four haplotypes, then the representation in the `_hap.txt` would be: 120 | 121 | |haplotype|hapIdx|contig|count| 122 | |---|---|---|---| 123 | |ATT|0 |contig\_1337|10| 124 | |CTG|1 |contig\_1337|12| 125 | |ATG|2 |contig\_1337|2| 126 | |?AG|3 |contig\_1337|4| 127 | 128 | Note not all haplotypes will cover all SNP positions, so some haplotypes may have a `?` indicating lack of bases at that SNP location. 129 | 130 | The SNP position would be stored in the `_pos.txt` file: 131 | 132 | |haplotype|contig |pos|varIdx|base| 133 | |---------|------------|---|------|----| 134 | |ATT |contig\_1337|101|1 |REF | 135 | |ATT |contig\_1337|150|2 |REF | 136 | |ATT |contig\_1337|220|3 |REF | 137 | |CTG |contig\_1337|101|1 |ALT0 | 138 | |CTG |contig\_1337|150|2 |REF | 139 | |CTG |contig\_1337|220|3 |ALT0 | 140 | |ATG |contig\_1337|101|1 |REF | 141 | |ATG |contig\_1337|150|2 |REF | 142 | |ATG |contig\_1337|220|3 |ALT0 | 143 | |?AG |contig\_1337|150|2 |ALT0 | 144 | |?AG |contig\_1337|220|3 |ALT0 | 145 | 146 | -------------------------------------------------------------------------------- /phasing/mag_phaser.py: -------------------------------------------------------------------------------- 1 | import os, re, sys 2 | import subprocess 3 | 4 | try: 5 | import vcf 6 | except ImportError: 7 | print("Cannot import vcf! Please install pyvcf!", file=sys.stderr) 8 | sys.exit(-1) 9 | 10 | from phasing.io import SAMMPileUpReader as sam 11 | from phasing.io import MPileUpVariantCaller as VC 12 | from phasing.io import VariantPhaser 13 | 14 | 15 | MIN_COVERAGE = 10 # minimum number of FL reads for a gene to do SNP calling and phasing 16 | ERR_SUB = 0.005 17 | PVAL_CUTOFF = 0.1 18 | 19 | def parse_user_input(): 20 | from argparse import ArgumentParser 21 | parser = ArgumentParser( 22 | description = "A pipeline for aligning sequence data on a slurm cluster" 23 | ) 24 | parser.add_argument('-a', '--assembly', 25 | help="The mag assembly file in fasta format", 26 | required=True, type=str 27 | ) 28 | parser.add_argument('-b', '--bamfile', 29 | help="Aligned reads in bam file format [full path needed!]", 30 | required=True, type=str 31 | ) 32 | parser.add_argument('-o', '--output', 33 | help="output prefix", 34 | required=True, type=str 35 | ) 36 | parser.add_argument('-g', '--genes', 37 | help='SCG gene bed file', 38 | required =True, type=str 39 | ) 40 | parser.add_argument('-p', '--pval_cutoff', 41 | help="P value cutoff for variant calls", 42 | default=PVAL_CUTOFF, type=float 43 | ) 44 | parser.add_argument("--bhFDR", default=None, 45 | type=float, 46 | help="FDR to be used for the Benjamini–Hochberg correction. Default: None (not used).") 47 | 48 | 49 | return parser.parse_args(), parser 50 | 51 | def main(args, parser): 52 | args = parser.parse_args() 53 | 54 | if args.bhFDR is not None: 55 | print("--bhFDR {0} is given! Will be using Benjamini–Hochberg correction insteaad. --pval_cutoff is ignored.".format(args.bhFDR)) 56 | 57 | 58 | # remove potential past run output 59 | past_files = [args.output+'.NO_SNPS_FOUND', 60 | args.output+'.NO_HAPS_FOUND', 61 | args.output+'.snps', 62 | args.output+'.log', 63 | args.output+'.human_readable.txt', 64 | args.output+'.vcf', 65 | args.output+'.cleaned.human_readable.txt', 66 | args.output+'.cleaned.vcf'] 67 | 68 | for file in past_files: 69 | if os.path.exists(file): 70 | os.remove(file) 71 | 72 | snpsfound = False 73 | # (0) generate pileups 74 | f_human1 = open(args.output + '.human_readable_by_pos.txt', 'w') 75 | f_human1.write("haplotype\thapIdx\tcontig\tpos\tvarIdx\tbase\tcount\n") 76 | f_human2 = open(args.output + '.human_readable_by_hap.txt', 'w') 77 | f_human2.write("haplotype\thapIdx\tcontig\tcount\n") 78 | f_human3 = open(args.output + '.human_readable_by_read.txt', 'w') 79 | f_human3.write("read_id\thaplotype\thapIdx\n") 80 | 81 | for mpileupFile, contig, start, end in elitePileups(args.bamfile, args.genes, args.assembly, args.output): 82 | # (1) read the mpileup and vall variants 83 | reader = sam.MPileUpReader(mpileupFile) 84 | recs = [r for r in reader] 85 | vc = VC.MagMPileUPVariant(recs, min_cov=MIN_COVERAGE, err_sub=ERR_SUB, expected_strand='+-', 86 | pval_cutoff=args.pval_cutoff, 87 | bhFDR=args.bhFDR) 88 | vc.call_variant() 89 | print(vc.variant) 90 | 91 | if len(vc.variant) != 0: 92 | snpsfound = True 93 | else: 94 | continue 95 | 96 | # we write SNPs with the bases separated by "/" not "|" becuz we haven't phased them yet 97 | with open(args.output + '.snps', 'a+') as f_snp: 98 | for pos, v in vc.variant.items(): 99 | f_snp.write("{contig}\t{pos}\t{bases}\t{counts}\n".format(\ 100 | contig=contig,\ 101 | pos=pos+1,\ 102 | bases="/".join([b for (b,c) in v]),\ 103 | counts="/".join([str(c) for (b,c) in v]))) 104 | 105 | # (2) for each CCS read, assign a haplotype (or discard if outlier) 106 | pp = VariantPhaser.MagVariantPhaser(vc) 107 | pp.phase_variant(args.bamfile, [contig, start, end], args.output, partial_ok=True) 108 | print(pp.haplotypes) 109 | pp.haplotypes.get_haplotype_vcf_assignment() 110 | pp.haplotypes.write_haplotype_to_humanreadable(contig, f_human1, f_human2, f_human3, pp.seq_hap_info) 111 | os.remove(mpileupFile) 112 | f_human1.close() 113 | f_human2.close() 114 | f_human3.close() 115 | 116 | if not snpsfound: 117 | os.system("touch {out}.NO_SNPS_FOUND".format(out=args.output)) 118 | os.remove(args.output + '.human_readable.txt') 119 | print("No SNPs found. END.", file=sys.stderr) 120 | 121 | 122 | def elitePileups(aligned_bam : str, gene_bed : str, assembly : str, outprefix : str) -> str: 123 | """ 124 | 125 | :param aligned_bam: 126 | :param gene_bed: gene bed to extract for making pileup 127 | :param assembly: 128 | :param outprefix: 129 | :return: 130 | """ 131 | for line in open(gene_bed): 132 | # contig_4047 8476 8850 contig_4047_5 133 | chrom, s, e, name = line.strip().split() 134 | 135 | outfile = "{p}.{c}_{s}_{e}.pileup".format(p=outprefix, c=chrom, s=s, e=e) 136 | cmd = "samtools mpileup -r {c}:{s}-{e} -f {asm} -s {bam} > {o}".format(\ 137 | c=chrom, s=s, e=e, asm=assembly, bam=aligned_bam, o=outfile) 138 | if subprocess.check_call(cmd, shell=True)!=0: 139 | print("FAILED TO RUN CMD: {0}. Abort!".format(cmd)) 140 | sys.exit(-1) 141 | yield outfile, chrom, int(s), int(e) 142 | 143 | 144 | if __name__ == "__main__": 145 | args, parser = parse_user_input() 146 | main(args, parser) 147 | -------------------------------------------------------------------------------- /phasing/io/SAMMPileUpReader.py: -------------------------------------------------------------------------------- 1 | __author__ = 'etseng@pacb.com' 2 | 3 | """ 4 | Parser for `samtools mpileup` 5 | 6 | http://www.htslib.org/doc/samtools-1.1.html 7 | 1. chr 8 | 2. 1-based position 9 | 3. ref base 10 | 4. coverage 11 | 5. readBase 12 | 6. base qualities 13 | 7. alignment qualities 14 | 15 | readBase: 16 | . match to ref 17 | , match to ref on rev 18 | > or < ref skipping (ex: like 37N) 19 | ACGTN mismatch on + strand 20 | acgn mismatch on - strand 21 | +{number}{AGCTNagctn} - insertion of some {number} 22 | -{number}{...} deletion of some {number} # also means in next {number}, you will see a * 23 | ^ begin of read, followed by asci-33 for quality 24 | $ end of read 25 | """ 26 | 27 | import os, sys, re 28 | import pdb 29 | from collections import Counter 30 | 31 | class MPileUpRecord(object): 32 | def __init__(self, chr, pos, ref, cov, readBase, baseQuals, alnQuals): 33 | """ 34 | In addition to storing the 7 cols from mpileup, 35 | nalso stores 36 | counter: Counter of (key) -> (obs count in pileup) 37 | """ 38 | self.chr = chr 39 | self.pos = pos 40 | self.ref = ref.upper() # let ref base always be upper case 41 | self.cov = cov 42 | self.nCov = None # this is the coverage of non-indel, non-skipped, which would be ACGTNacgtn 43 | self.nType = None # this is the number of non-indel, non-skipped bases accumulated at this record 44 | self.readBase = readBase 45 | self.baseQuals = baseQuals 46 | self.alnQuals = alnQuals 47 | 48 | self.counts = Counter() 49 | self.parse_readBase() 50 | 51 | def __str__(self): 52 | return """ 53 | chr: {c} 54 | pos: {p} (1-based) 55 | ref: {r} 56 | cov: {v} 57 | nCov: {n} 58 | counts: {t}""".format(c=self.chr, p=self.pos+1, r=self.ref, v=self.cov, n=self.nCov, t=self.counts) 59 | 60 | def parse_readBase(self): 61 | """ 62 | fill in self.counts 63 | """ 64 | def not_indel_end_pos(i): 65 | return i >= len(self.readBase)-1 or self.readBase[i+1] not in ('+', '-', '$') 66 | 67 | rex = re.compile('(\d+)') 68 | def read_indel(start_index): 69 | m = rex.search(self.readBase, start_index) 70 | num = int(self.readBase[m.start():m.end()]) 71 | return m.start(), m.end()+num 72 | 73 | sanity_counter = 0 # use this to track how many "reads" we've parsed to make sure parsing is correct 74 | # this number should agree with self.cov which is 4-th column in mpileup 75 | i = 0 # pointer for current location in string self.readBase 76 | while i < len(self.readBase): 77 | b = self.readBase[i] 78 | if b in '<>': # ignore skipped refs 79 | sanity_counter += 1 80 | i += 1 81 | continue 82 | elif b == '*': # deletion, just advance 83 | i += 1 84 | sanity_counter += 1 85 | continue 86 | elif b == '^': # start of read followed by ascii and either a comma or dot (ex: ^I.) 87 | i += 3 88 | sanity_counter += 1 89 | continue 90 | elif b == '$': # end of read, DO NOT advance counter 91 | i += 1 92 | continue 93 | elif b == '.': # could be followed by indels or $, careful don't double count 94 | self.counts[self.ref] += 1 95 | sanity_counter += 1 96 | i += 1 97 | elif b == ',': # # could be followed by indels or $, careful don't double count 98 | self.counts[self.ref.lower()] += 1 99 | sanity_counter += 1 100 | i += 1 101 | elif b in 'ATCGNatcgn': 102 | self.counts[b] += 1 103 | sanity_counter += 1 104 | i += 1 105 | elif b == '-': # DO NOT ADVANCE the sanity counter! otherwise double counting 106 | start, end = read_indel(i+1) 107 | self.counts["-"+self.readBase[start:end]] += 1 108 | i = end 109 | elif b == '+': # insertion should be +{number}{bases} 110 | start, end = read_indel(i+1) 111 | self.counts["+"+self.readBase[start:end]] += 1 112 | i = end 113 | else: 114 | raise Exception("Unknown {0} in readBase!".format(b)) 115 | 116 | assert self.cov == sanity_counter or (self.readBase=='*' and self.cov==0) 117 | # set nCov which is cov provided by non-indel non-skipped bases 118 | self.nCov = 0 119 | self.nType = 0 120 | for x in 'ATCGNatcgn': 121 | self.nCov += self.counts[x] 122 | if self.counts[x] > 0: self.nType += 1 123 | 124 | 125 | class MPileUpReader(object): 126 | def __init__(self, filename): 127 | self.filename = filename 128 | self.f = open(filename) 129 | 130 | def __iter__(self): 131 | return self 132 | 133 | def __next__(self): 134 | cur = self.f.tell() 135 | line = self.f.readline() 136 | if self.f.tell() == cur: 137 | raise StopIteration 138 | return self.parseLine(line) 139 | 140 | def parseLine(self, line): 141 | raw = line.strip().split('\t') 142 | if (len(raw)==7 or len(raw)==15): 143 | cov = int(raw[3]) 144 | #if cov > 0: 145 | return MPileUpRecord(chr=raw[0],\ 146 | pos=int(raw[1])-1,\ 147 | ref=raw[2], 148 | cov=int(raw[3]), 149 | readBase=raw[4], 150 | baseQuals=raw[5], 151 | alnQuals=raw[6]) 152 | elif len(raw)==4: 153 | # only way to have only 4 columns is because after --min-BQ filtering there are no bases 154 | # ex: 155 | # fake 8728 T 3 .$.$. ;q: ]]] 156 | # fake 8729 T 0 157 | return MPileUpRecord(chr=raw[0],\ 158 | pos=int(raw[1])-1,\ 159 | ref=raw[2], 160 | cov=0, 161 | readBase='', 162 | baseQuals='', 163 | alnQuals='') 164 | else: 165 | raise Exception("Expected to have 7 cols in mpileup record \ 166 | but saw only {0}, abort! Line was: {1}".format(len(raw), line)) 167 | 168 | 169 | -------------------------------------------------------------------------------- /phasing/io/coordinate_mapper.py: -------------------------------------------------------------------------------- 1 | import bisect 2 | from bx.intervals import Interval 3 | from Bio.Seq import Seq 4 | 5 | def iter_cigar_string(cigar_string): 6 | num = cigar_string[0] 7 | for s in cigar_string[1:]: 8 | if str.isalpha(s): 9 | yield int(num), s 10 | num = '' 11 | else: 12 | num += s 13 | 14 | 15 | def make_exons_from_base_mapping(mapping, start, end, strand): 16 | """ 17 | mapping is 0-based index on transcript --> 0-based index on genome 18 | however beware of strand! 19 | """ 20 | 21 | output = [mapping[start]] 22 | for i in range(start+1, end): 23 | cur_pos, cur_is_junction= mapping[i] 24 | if cur_is_junction and mapping[i]!=output[-1]: 25 | # if the last position is the same, DON'T APPEND (was an indel) 26 | output.append(mapping[i]) 27 | cur_pos, cur_is_junction = mapping[end] 28 | if mapping[end]!=output[-1]: 29 | output.append(mapping[end]) 30 | 31 | # remember for Interval it is 0-based start, 1-based end 32 | # if len(output) is odd, must be 1bp into an exon 33 | # ex: [(xxx,True), (xxx,True), (xxx,False)] or 34 | # [.....(xxx,True), xxx(True)] 35 | #print output 36 | if len(output)==1: 37 | output = [output[0], output[0]] # just duplicate it 38 | elif len(output)%2==1: 39 | if output[0][1] and output[1][1]: 40 | output.insert(0, output[0]) 41 | elif output[-1][1] and output[-2][1]: 42 | output.append(output[-1]) 43 | # print "modified:", output 44 | if strand == '+': 45 | return [Interval(output[i][0],output[i+1][0]+1) for i in range(0, len(output), 2)] 46 | else: # - strand 47 | return [Interval(output[i][0],output[i-1][0]+1) for i in range(len(output)-1,-1,-2)] 48 | 49 | 50 | 51 | def get_base_to_base_mapping_from_sam(exons, cigar_string, qStart, qEnd, strand, include_junction_info=False): 52 | """ 53 | For PacBio data which can have indels w.r.t genome =___= 54 | 55 | ex: 56 | cigar: 1S105M407N548M 57 | sStart-sEnd: 948851-949911 58 | qStart-qEnd: 2-655 59 | segments: [Interval(start=948851, end=948956), Interval(start=949363, end=949911)] 60 | 61 | Returns: dict of 0-based position --> 0-based ref position 62 | """ 63 | cur_exon_i = 0 64 | cur_nt_loc = qStart 65 | cur_genome_loc = exons[0].start 66 | 67 | start_soft_clip = qStart > 0 68 | 69 | last_base_is_junction = False 70 | qLen = qEnd 71 | 72 | mapping = {} 73 | 74 | for num, s in iter_cigar_string(cigar_string): 75 | if s == 'S': # soft clipping at the ends, ignore 76 | if start_soft_clip: 77 | assert num == qStart 78 | for i in range(num): mapping[i] = (cur_genome_loc, False) 79 | start_soft_clip = False 80 | else: 81 | # soft clipping at the end 82 | # advance the mapping but not cur_nt_loc (otherwise will be diff from qEnd) 83 | for i in range(num): 84 | mapping[cur_nt_loc+i] = (cur_genome_loc, False) 85 | #cur_nt_loc += 1 86 | #print cur_nt_loc 87 | # for soft clipping at the end, do NOT progress cur_nt_loc! 88 | # we are now "outside" the alignment, otherwise 89 | # assert cur_nt_loc == qEnd will be wrong at the end 90 | #cur_nt_loc -= 1 91 | qLen += num # query length must be qEnd + soft clipped end 92 | elif s == 'N': # intron, move to next ref exon 93 | mapping[cur_nt_loc-1] = (mapping[cur_nt_loc-1][0], True) 94 | assert cur_genome_loc == exons[cur_exon_i].end 95 | cur_exon_i += 1 96 | cur_genome_loc = exons[cur_exon_i].start 97 | last_base_is_junction = True 98 | elif s == 'M': 99 | # for the next "num" matches are all 1:1 100 | for i in range(num): 101 | if cur_nt_loc in mapping and mapping[cur_nt_loc][1]: 102 | # if this is true, then last mapping must be 'D' and was a junction 103 | # so we do nothing -- keep it 104 | pass 105 | else: 106 | mapping[cur_nt_loc] = (cur_genome_loc, last_base_is_junction) 107 | last_base_is_junction = False 108 | cur_nt_loc += 1 109 | cur_genome_loc += 1 110 | assert cur_genome_loc <= exons[cur_exon_i].end 111 | elif s == 'I': # insertion w.r.t to genome 112 | for i in range(num): 113 | mapping[cur_nt_loc] = (cur_genome_loc, last_base_is_junction) 114 | cur_nt_loc += 1 115 | last_base_is_junction = False 116 | elif s == 'D': # deletion w.r.t. to genome 117 | # if last_base_is_junction is True, we want to make sure it makes it in mapping 118 | mapping[cur_nt_loc] = (cur_genome_loc, last_base_is_junction) 119 | last_base_is_junction = False 120 | cur_genome_loc += num 121 | # BELOW IS WRONG 122 | # for i in xrange(num): 123 | # mapping[cur_nt_loc] = cur_genome_loc 124 | # cur_genome_loc += 1 125 | assert cur_genome_loc <= exons[cur_exon_i].end 126 | assert cur_nt_loc == qEnd or (cur_nt_loc==qEnd-1 and s=='S') 127 | 128 | if strand == '-': 129 | mapping = dict((qLen-1-k, v) for k,v in mapping.items()) 130 | 131 | if not include_junction_info: 132 | for k in mapping: 133 | mapping[k] = mapping[k][0] 134 | 135 | return mapping 136 | 137 | 138 | 139 | def get_exon_coordinates(exons, start, end): 140 | """ 141 | Return the set of "exons" (genome location) that 142 | is where the nucleotide start-end is 143 | 144 | start is 0-based 145 | end is 1-based 146 | exons is a set of Interval (0-based start, 1-based end) 147 | """ 148 | acc_lens = [0] # ex: [0, 945, 1065, 1141, 1237] accumulative length of exons 149 | len_of_transcript = 0 150 | for e in exons: 151 | _len = e.end - e.start 152 | acc_lens.append(acc_lens[-1] + _len) 153 | len_of_transcript += _len 154 | # confirm that start-end is in the range of the transcript! 155 | assert 0 <= start < end <= len_of_transcript + 30 # allow a 30-bp slack due to PacBio indels 156 | 157 | end = min(end, len_of_transcript) # trim it to the end if necessary (for PacBio) 158 | 159 | 160 | i = bisect.bisect_right(acc_lens, start) 161 | j = bisect.bisect_right(acc_lens, end) 162 | 163 | # starts at i-th exon and ends at j-th exon, i and j are both 1-based 164 | # for the first exon, the offset is start-acc+e.start 165 | # for the last exon, the end point is end-acc+e.start 166 | if i == j: 167 | return [Interval(start-acc_lens[i-1]+exons[i-1].start, 168 | end-acc_lens[i-1]+exons[i-1].start)] 169 | else: 170 | if j >= len(exons): # the end is the end 171 | return [Interval(start-acc_lens[i-1]+exons[i-1].start, exons[i-1].end)] + \ 172 | exons[i:] 173 | else: 174 | return [Interval(start-acc_lens[i-1]+exons[i-1].start, exons[i-1].end)] + \ 175 | exons[i:j-1] + \ 176 | [Interval(exons[j-1].start, end-acc_lens[j-1]+exons[j-1].start)] 177 | 178 | def consistute_genome_seq_from_exons(genome_dict, _chr, exons, strand): 179 | """ 180 | genome_dict is expected to be SeqReaders.LazyFastaReader 181 | exons is a list of [Interval(start, end)] 182 | """ 183 | seq = '' 184 | genome_seq = genome_dict[_chr].seq 185 | for e in exons: 186 | seq += str(genome_seq[e.start:e.end]) 187 | 188 | seq = Seq(seq) 189 | if strand == '+': 190 | return seq.tostring() 191 | else: 192 | return seq.reverse_complement().tostring() 193 | 194 | -------------------------------------------------------------------------------- /phasing/io/MPileUpVariantCaller.py: -------------------------------------------------------------------------------- 1 | 2 | """ 3 | Call variant based on a list of SAMMPileUpRecord where list[i] is the record of i-th position 4 | 5 | Most of the code follows Juliet's code at 6 | https://github.com/PacificBiosciences/minorseq/blob/develop/src/AminoAcidCaller.cpp 7 | 8 | """ 9 | import os, sys 10 | import scipy.stats as stats 11 | from collections import Counter, namedtuple 12 | 13 | BHtuple = namedtuple('BHtuple', ['pval', 'record']) 14 | 15 | class MPileUPVariant(object): 16 | def __init__(self, record_list, min_cov, err_sub, expected_strand, pval_cutoff=0.01, bhFDR=None): 17 | """ 18 | :param record_list: list of SAMMPileUpRecord 19 | :param min_cov: minimum coverage to call variant 20 | :param err_sub: substitution error, right now a fixed float 21 | :param expected_strand: expected strand of the transcript (+ or -) 22 | """ 23 | self.record_by_pos = dict((r.pos, r) for r in record_list) 24 | self.min_cov = min_cov 25 | self.err_sub = err_sub 26 | 27 | self.pval_cutoff = pval_cutoff 28 | self.bhFDR = bhFDR # is None, this is not used; other wise do Benjamini–Hochberg 29 | self.expected_strand = expected_strand 30 | 31 | 32 | self.prep_records() 33 | self.positions_to_call = self.get_positions_to_call() 34 | 35 | # must first call positions to call, then prep records, then number of tests 36 | self.number_of_tests = sum(self.record_by_pos[pos].clean_type for pos in self.positions_to_call) 37 | 38 | self.variant = {} # position --> in sorted order, (base, count) 39 | self.ref_base = {} # position --> ref base 40 | 41 | self.call_variant() 42 | 43 | 44 | def is_in_or_near_HP(self, pos, hp_size=4): 45 | """ 46 | We define a HP region as stretches of 4 or more same nucleotides 47 | :return: True/False for in/hear HP region 48 | """ 49 | def find_hp_region_size(cur): 50 | if cur not in self.record_by_pos: return 0 51 | end = cur+1 52 | while end in self.record_by_pos and self.record_by_pos[end].ref == self.record_by_pos[cur].ref: 53 | end += 1 54 | start = cur-1 55 | while start in self.record_by_pos and self.record_by_pos[start].ref == self.record_by_pos[cur].ref: 56 | start -= 1 57 | # hp region is from start+1 to end 58 | return end-(start+1) 59 | 60 | return (find_hp_region_size(pos) >= hp_size) or \ 61 | (find_hp_region_size(pos-1) >= hp_size) or \ 62 | (find_hp_region_size(pos+1) >= hp_size) 63 | 64 | 65 | def get_positions_to_call(self): 66 | """ 67 | Identify list of positions to try to call SNPs. Must have: 68 | 1. minimum coverage >= min_cov 69 | 2. the first and second most frequent base are NOT an indel 70 | 3. not next to or inside a homopolymer region 71 | 4. has at least two or more keys 72 | """ 73 | positions_to_call = [] 74 | for pos in self.record_by_pos: 75 | if self.record_by_pos[pos].clean_type < 2: continue # only one base at this position, skip 76 | elif self.record_by_pos[pos].clean_cov < self.min_cov: continue # insufficient cov, skip 77 | else: 78 | # find the first and second most freq base in the "non-clean" counts 79 | m = self.record_by_pos[pos].clean_counts.most_common() 80 | # ex: m = [('a', 10), ('-ct', 20), ('+t', 10)] 81 | if m[0][0][0]in ('+','-') or m[1][0][0] in ('+','-') or self.is_in_or_near_HP(pos): continue 82 | else: 83 | positions_to_call.append(pos) 84 | return positions_to_call 85 | 86 | def prep_records(self): 87 | """ 88 | Prepare the records by: 89 | 1. remove all 'N' bases 90 | 2. remove all bases that were not on the expected strand 91 | 3. remove all indels 92 | 93 | Creates three new vars: clean_counts, clean_cov, clean_type 94 | DOES NOT ALTER the original counts or other variables!!! 95 | 96 | If + strand, then ATCG 97 | If - strand, then atcg 98 | """ 99 | for pos in self.record_by_pos: 100 | r = self.record_by_pos[pos] 101 | if self.expected_strand == '+-': 102 | # for metagenomics, we don't care the strand 103 | # so instead we will convert everything to upper case later in the counts 104 | bases = 'ATCG' 105 | elif self.expected_strand == '+': 106 | bases = 'ATCG' 107 | elif self.expected_strand == '-': 108 | bases = 'atcg' 109 | 110 | if self.expected_strand == '+-': 111 | # convert lower case to upper case 112 | for k in 'atcg': 113 | if k in r.counts: 114 | r.counts[k.upper()] += r.counts[k] 115 | del r.counts[k] 116 | 117 | r.clean_counts = Counter(r.counts) 118 | keys = list(r.counts.keys()) 119 | for k in keys: 120 | if k not in bases: 121 | del r.clean_counts[k] 122 | r.clean_cov = sum(r.clean_counts.values()) 123 | r.clean_type = len(r.clean_counts) 124 | 125 | def call_variant(self): 126 | """ 127 | mirrors AminoAcidCaller::CallVariants() in 128 | https://github.com/PacificBiosciences/minorseq/blob/develop/src/AminoAcidCaller.cpp 129 | 130 | For each position (that has sufficient coverage), 131 | do Fisher exact test w/ correction 132 | if p-val < threshold, then store it. 133 | 134 | Stores results in self.variant as: 135 | 136 | self.variant[position] = desc list of (base, count). 137 | NOTE: base must be either all in lower case (which means - strand) 138 | or call upper case (+ strand). 139 | If - strand and ('a', 10), it means the ref base in A on the + strand, 140 | and the transcript should be T on the - strand. 141 | 142 | Only positions with more than the ref base is stored. 143 | """ 144 | if self.bhFDR is None: # use Bonferroni correction 145 | for pos in self.positions_to_call: 146 | r = self.record_by_pos[pos] 147 | alt_variant = [] 148 | for base, count in r.clean_counts.most_common()[1:]: 149 | assert not base.startswith('+') and not base.startswith('-') # clean counts should NOT have indels 150 | exp = r.clean_cov * self.err_sub 151 | odds, pval = stats.fisher_exact([[count, r.clean_cov-count], [exp, r.clean_cov-exp]], alternative='greater') 152 | pval *= self.number_of_tests 153 | if pval < self.pval_cutoff: # store variant if below cutoff 154 | alt_variant.append((base, count)) 155 | if len(alt_variant) > 0: # only record this variant if there's at least two haps 156 | self.variant[pos] = [r.clean_counts.most_common()[0]] + alt_variant 157 | self.ref_base[pos] = r.ref 158 | else: # use Benjamini–Hochberg procedure 159 | # see: https://www.statisticshowto.com/benjamini-hochberg-procedure/ 160 | pval_dict = {} # (pos, base) -> BHtuple(pval, record) 161 | for pos in self.positions_to_call: 162 | r = self.record_by_pos[pos] 163 | for base, count in r.clean_counts.most_common()[1:]: 164 | assert not base.startswith('+') and not base.startswith('-') # clean counts should NOT have indels 165 | exp = r.clean_cov * self.err_sub 166 | odds, pval = stats.fisher_exact([[count, r.clean_cov-count], [exp, r.clean_cov-exp]], alternative='greater') 167 | if pval <= self.pval_cutoff: # With this filtration, the sequencing errors position will not be stored in pval_dict. 168 | pval_dict[(pos, base)] = BHtuple(pval=pval, record=r) 169 | 170 | # now we have all the pvals, rank them 171 | keys_pos_base = list(pval_dict.keys()) 172 | keys_pos_base.sort(key=lambda x: pval_dict[x].pval) 173 | self.number_of_tests = len(keys_pos_base) 174 | # find the largest p value that is smaller than the critical value. 175 | largest_good_rank1 = 0 176 | for rank0,(pos, base) in enumerate(keys_pos_base): 177 | pval = pval_dict[(pos, base)].pval 178 | bh_val = ((rank0+1)/self.number_of_tests) * self.bhFDR # Only significant positions will be used to adjust bh_val 179 | if pval < bh_val: 180 | largest_good_rank1 = rank0+1 181 | print(f"pos:{pos} base:{base} pval:{pval} bh:{bh_val}") 182 | for (pos,base) in keys_pos_base[:largest_good_rank1]: 183 | r = pval_dict[(pos,base)].record 184 | if pos not in self.variant: 185 | self.ref_base[pos] = r.ref 186 | self.variant[pos] = [r.clean_counts.most_common()[0]] 187 | self.variant[pos] += [(base, r.clean_counts[base])] 188 | 189 | 190 | class MagMPileUPVariant(MPileUPVariant): 191 | def __init__(self, record_list, min_cov, err_sub, expected_strand, pval_cutoff=0.01, bhFDR=None): 192 | self.ref_name = {} # position --> ref contig 193 | super().__init__(record_list, min_cov, err_sub, expected_strand, pval_cutoff, bhFDR) 194 | 195 | def call_variant(self): 196 | """ 197 | mirrors AminoAcidCaller::CallVariants() in 198 | https://github.com/PacificBiosciences/minorseq/blob/develop/src/AminoAcidCaller.cpp 199 | 200 | For each position (that has sufficient coverage), 201 | do Fisher exact test w/ correction 202 | if p-val < threshold, then store it. 203 | 204 | Stores results in self.variant as: 205 | 206 | self.variant[position] = desc list of (base, count). 207 | NOTE: base must be either all in lower case (which means - strand) 208 | or call upper case (+ strand). 209 | If - strand and ('a', 10), it means the ref base in A on the + strand, 210 | and the transcript should be T on the - strand. 211 | 212 | Only positions with more than the ref base is stored. 213 | """ 214 | if self.bhFDR is None: # use Bonferroni correction 215 | for pos in self.positions_to_call: 216 | r = self.record_by_pos[pos] 217 | alt_variant = [] 218 | for base, count in r.clean_counts.most_common()[1:]: 219 | assert not base.startswith('+') and not base.startswith('-') # clean counts should NOT have indels 220 | exp = r.clean_cov * self.err_sub 221 | odds, pval = stats.fisher_exact([[count, r.clean_cov-count], [exp, r.clean_cov-exp]], alternative='greater') 222 | pval *= self.number_of_tests 223 | if pval < self.pval_cutoff: # store variant if below cutoff 224 | alt_variant.append((base, count)) 225 | if len(alt_variant) > 0: # only record this variant if there's at least two haps 226 | self.variant[pos] = [r.clean_counts.most_common()[0]] + alt_variant 227 | self.ref_base[pos] = r.ref 228 | self.ref_name[pos] = r.chr 229 | 230 | else: # use Benjamini–Hochberg procedure 231 | # see: https://www.statisticshowto.com/benjamini-hochberg-procedure/ 232 | pval_dict = {} # (pos, base) -> BHtuple(pval, record) 233 | for pos in self.positions_to_call: 234 | r = self.record_by_pos[pos] 235 | for base, count in r.clean_counts.most_common()[1:]: 236 | assert not base.startswith('+') and not base.startswith('-') # clean counts should NOT have indels 237 | exp = r.clean_cov * self.err_sub 238 | odds, pval = stats.fisher_exact([[count, r.clean_cov-count], [exp, r.clean_cov-exp]], alternative='greater') 239 | if pval <= self.pval_cutoff: # With this filtration, the sequencing errors position will not be stored in pval_dict. 240 | pval_dict[(pos, base)] = BHtuple(pval=pval, record=r) 241 | # now we have all the pvals, rank them 242 | keys_pos_base = list(pval_dict.keys()) 243 | keys_pos_base.sort(key=lambda x: pval_dict[x].pval) 244 | self.number_of_tests = len(keys_pos_base) 245 | # find the largest p value that is smaller than the critical value. 246 | largest_good_rank1 = 0 247 | for rank0,(pos, base) in enumerate(keys_pos_base): 248 | pval = pval_dict[(pos, base)].pval 249 | bh_val = ((rank0+1)/self.number_of_tests) * self.bhFDR # Only significant positions will be used to adjust bh_val 250 | if pval < bh_val: 251 | largest_good_rank1 = rank0+1 252 | print(f"pos:{pos} base:{base} pval:{pval} bh:{bh_val}") 253 | for (pos,base) in keys_pos_base[:largest_good_rank1]: 254 | r = pval_dict[(pos,base)].record 255 | if pos not in self.variant: 256 | self.ref_base[pos] = r.ref 257 | self.ref_name[pos] = r.chr 258 | self.variant[pos] = [r.clean_counts.most_common()[0]] 259 | self.variant[pos] += [(base, r.clean_counts[base])] 260 | 261 | 262 | 263 | 264 | -------------------------------------------------------------------------------- /phasing/io/BioReaders.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Should always be faithful duplicate of sequence/BioReaders.py 4 | Duplicated here for tofu installation. This one is called via cupcake.io.BioReaders. 5 | """ 6 | 7 | import re, sys 8 | from collections import namedtuple 9 | 10 | Interval = namedtuple('Interval', ['start', 'end']) 11 | 12 | class SimpleSAMReader: 13 | """ 14 | A simplified SAM reader meant for speed. Skips CIGAR & FLAG parsing; identity/coverage calculation. 15 | """ 16 | SAMheaders = ['@HD', '@SQ', '@RG', '@PG', '@CO'] 17 | def __init__(self, filename, has_header): 18 | self.filename = filename 19 | self.f = open(filename) 20 | self.header = '' 21 | if has_header: 22 | while True: 23 | cur = self.f.tell() 24 | line = self.f.readline() 25 | if line[:3] not in SimpleSAMReader.SAMheaders: 26 | break 27 | self.header += line 28 | self.f.seek(cur) 29 | 30 | def __iter__(self): 31 | return self 32 | 33 | def __next__(self): 34 | line = self.f.readline().strip() 35 | if len(line) == 0: 36 | raise StopIteration 37 | return SimpleSAMRecord(line) 38 | 39 | 40 | class SimpleSAMRecord: 41 | cigar_rex = re.compile('(\d+)([MIDSHN])') 42 | SAMflag = namedtuple('SAMflag', ['is_paired', 'strand', 'PE_read_num']) 43 | def __init__(self, record_line): 44 | """ 45 | Simple bare bones version: only has 46 | 47 | qID, sID, sStart, sEnd, qStart, qEnd, cigar 48 | 49 | Simplified assumptions: 50 | -- must be end-to-end alignment (so qStart always 0) 51 | -- must be unspliced (no 'N' in cigar string) 52 | """ 53 | self.qID = None 54 | self.sID = None 55 | self.sStart = None 56 | self.sEnd = None 57 | self.qStart = 0 58 | self.qEnd = None # length of SEQ 59 | self.cigar = None 60 | 61 | self.process(record_line) 62 | 63 | def __str__(self): 64 | msg = \ 65 | """ 66 | qID: {q} 67 | sID: {s} 68 | sStart-sEnd: {ss}-{se} 69 | qStart-qEnd: {qs}-{qe} 70 | cigar: {c} 71 | """.format(q=self.qID, s=self.sID, \ 72 | ss=self.sStart, se=self.sEnd, qs=self.qStart, qe=self.qEnd, c=self.cigar) 73 | return msg 74 | 75 | def parse_cigar(self, cigar, start): 76 | """ 77 | M - match 78 | I - insertion w.r.t. to ref 79 | D - deletion w.r.t. to ref 80 | N - skipped (which means splice junction) 81 | S - soft clipped 82 | H - hard clipped (not shown in SEQ) 83 | = - read match 84 | X - read mismatch 85 | 86 | ex: 50M43N3D 87 | 88 | NOTE: sets qStart & qEnd, which are often incorrect because of different ways to write CIGAR strings 89 | instead rely on XS/XE flags (from blasr or pbalign.py) to overwrite this later!!! 90 | 91 | Returns: genomic segment locations (using as offset) 92 | """ 93 | cur_end = start 94 | q_aln_len = 0 95 | for (num, type) in re.findall('(\d+)(\S)', cigar): 96 | num = int(num) 97 | if type == 'I': 98 | q_aln_len += num 99 | elif type in ('M', '=', 'X'): 100 | cur_end += num 101 | q_aln_len += num 102 | elif type == 'D': 103 | cur_end += num 104 | self.qEnd = self.qStart + q_aln_len 105 | self.sEnd = cur_end 106 | 107 | 108 | def process(self, record_line): 109 | """ 110 | Only process cigar to get qEnd and sEnd 111 | """ 112 | raw = record_line.split('\t') 113 | self.qID = raw[0] 114 | self.sID = raw[2] 115 | if self.sID == '*': # means no match! STOP here 116 | return 117 | self.sStart = int(raw[3]) - 1 118 | self.cigar = raw[5] 119 | self.parse_cigar(self.cigar, self.sStart) 120 | #self.flag = SimpleSAMRecord.parse_sam_flag(int(raw[1])) 121 | 122 | 123 | 124 | class SAMReader: 125 | SAMheaders = ['@HD', '@SQ', '@RG', '@PG', '@CO'] 126 | def __init__(self, filename, has_header, ref_len_dict=None, query_len_dict=None): 127 | self.filename = filename 128 | self.f = open(filename) 129 | self.header = '' 130 | self.ref_len_dict = ref_len_dict 131 | self.query_len_dict = query_len_dict 132 | if has_header: 133 | while True: 134 | cur = self.f.tell() 135 | line = self.f.readline() 136 | if line[:3] not in SAMReader.SAMheaders: 137 | break 138 | self.header += line 139 | self.f.seek(cur) 140 | 141 | def __iter__(self): 142 | return self 143 | 144 | def __next__(self): 145 | line = self.f.readline().strip() 146 | if len(line) == 0: 147 | raise StopIteration 148 | return SAMRecord(line, self.ref_len_dict, self.query_len_dict) 149 | 150 | 151 | class SAMRecord: 152 | SAMflag = namedtuple('SAMflag', ['is_paired', 'strand', 'PE_read_num']) 153 | def __init__(self, record_line=None, ref_len_dict=None, query_len_dict=None): 154 | """ 155 | Designed to handle BowTie SAM output for unaligned reads (PE read not yet supported) 156 | Can handle map to transfrag (no splicing) and genome (splicing) 157 | """ 158 | self.qID = None 159 | self.sID = None 160 | self.sStart = None 161 | self.sEnd = None 162 | self.segments = None 163 | self.num_nonmatches = None 164 | self.num_ins = None 165 | self.num_del = None 166 | self.num_mat_or_sub = None 167 | 168 | self.qCoverage = None 169 | self.sCoverage = None 170 | 171 | self.sLen = None 172 | self.qLen = None 173 | # qStart, qEnd might get changed in parse_cigar 174 | self.qStart = 0 175 | self.qEnd = None # length of SEQ 176 | 177 | self.cigar = None 178 | self.flag = None 179 | 180 | self.identity = None 181 | self.record_line = record_line 182 | if record_line is not None: 183 | self.process(record_line, ref_len_dict, query_len_dict) 184 | 185 | def __str__(self): 186 | msg =\ 187 | """ 188 | qID: {q} 189 | sID: {s} 190 | cigar: {c} 191 | sStart-sEnd: {ss}-{se} 192 | qStart-qEnd: {qs}-{qe} 193 | segments: {seg} 194 | flag: {f} 195 | 196 | coverage (of query): {qcov} 197 | coverage (of subject): {scov} 198 | alignment identity: {iden} 199 | """.format(q=self.qID, s=self.sID, seg=self.segments, c=self.cigar, f=self.flag,\ 200 | ss=self.sStart, se=self.sEnd, qs=self.qStart, qe=self.qEnd, iden=self.identity,\ 201 | qcov=self.qCoverage, scov=self.sCoverage) 202 | return msg 203 | 204 | def __eq__(self, other): 205 | return self.qID == other.qID and self.sID == other.sID and\ 206 | self.sStart == other.sStart and self.sEnd == other.sEnd and\ 207 | self.segments == other.segments and self.qCoverage == other.qCoverage and\ 208 | self.sCoverage == other.sCoverage and self.qLen == other.qLen and\ 209 | self.sLen == other.sLen and self.qStart == other.qStart and\ 210 | self.cigar == other.cigar and self.flag == other.flag and self.identity == other.identity 211 | 212 | 213 | def process(self, record_line, ref_len_dict, query_len_dict): 214 | """ 215 | If SAM is from pbalign.py output, then have flags: 216 | XS: 1-based qStart, XE: 1-based qEnd, XQ: query length, NM: number of non-matches 217 | 218 | ignore_XQ should be False for BLASR/pbalign.py's SAM, True for GMAP's SAM 219 | 220 | 0. qID 221 | 1. flag 222 | 2. sID 223 | 3. 1-based offset sStart 224 | 4. mapping quality (ignore) 225 | 5. cigar 226 | 6. name of ref of mate alignment (ignore) 227 | 7. 1-based offset sStart of mate (ignore) 228 | 8. inferred fragment length (ignore) 229 | 9. sequence (ignore) 230 | 10. read qual (ignore) 231 | 11. optional fields 232 | """ 233 | raw = record_line.split('\t') 234 | self.qID = raw[0] 235 | self.sID = raw[2] 236 | if self.sID == '*': # means no match! STOP here 237 | return 238 | self.sStart = int(raw[3]) - 1 239 | self.cigar = raw[5] 240 | self.segments = self.parse_cigar(self.cigar, self.sStart) 241 | self.sEnd = self.segments[-1].end 242 | self.flag = SAMRecord.parse_sam_flag(int(raw[1])) 243 | 244 | # process optional fields 245 | # XM: number of mismatches 246 | # NM: edit distance (sub/ins/del) 247 | for x in raw[11:]: 248 | if x.startswith('NM:i:'): 249 | self.num_nonmatches = int(x[5:]) 250 | 251 | if ref_len_dict is not None: 252 | self.sCoverage = (self.sEnd - self.sStart) * 1. / ref_len_dict[self.sID] 253 | self.sLen = ref_len_dict[self.sID] 254 | 255 | if self.flag.strand == '-' and self.qLen is not None: 256 | self.qStart, self.qEnd = self.qLen - self.qEnd, self.qLen - self.qStart 257 | 258 | if query_len_dict is not None: # over write qLen and qCoverage, should be done LAST 259 | self.qLen = query_len_dict[self.qID] 260 | self.qCoverage = (self.qEnd - self.qStart) * 1. / self.qLen 261 | 262 | if self.num_nonmatches is not None: 263 | self.identity = 1. - (self.num_nonmatches * 1. / (self.num_del + self.num_ins + self.num_mat_or_sub)) 264 | 265 | 266 | def parse_cigar(self, cigar, start): 267 | """ 268 | M - match 269 | I - insertion w.r.t. to ref 270 | D - deletion w.r.t. to ref 271 | N - skipped (which means splice junction) 272 | S - soft clipped 273 | H - hard clipped (not shown in SEQ) 274 | = - read match 275 | X - read mismatch 276 | 277 | ex: 50M43N3D 278 | 279 | NOTE: sets qStart & qEnd, which are often incorrect because of different ways to write CIGAR strings 280 | 281 | Returns: genomic segment locations (using as offset) 282 | """ 283 | segments = [] 284 | cur_start = start 285 | cur_end = start 286 | first_thing = True 287 | q_aln_len = 0 288 | self.num_del = 0 289 | self.num_ins = 0 290 | self.num_mat_or_sub = 0 291 | for (num, type) in re.findall('(\d+)(\S)', cigar): 292 | num = int(num) 293 | if type == 'H' or type == 'S': 294 | if first_thing: 295 | self.qStart += num 296 | elif type == 'I': 297 | q_aln_len += num 298 | self.num_ins += num 299 | elif type in ('M','=','X'): 300 | cur_end += num 301 | q_aln_len += num 302 | self.num_mat_or_sub += num 303 | elif type == 'D': 304 | cur_end += num 305 | self.num_del += num 306 | elif type == 'N': # junction, make a new segment 307 | segments.append(Interval(cur_start, cur_end)) 308 | cur_start = cur_end + num 309 | cur_end = cur_start 310 | else: 311 | raise Exception("Unrecognized cigar character {0}!".format(type)) 312 | first_thing = False 313 | if cur_start != cur_end: 314 | segments.append(Interval(cur_start, cur_end)) 315 | self.qEnd = self.qStart + q_aln_len 316 | return segments 317 | 318 | @classmethod 319 | def parse_sam_flag(self, flag): 320 | """ 321 | Heng Li's SAM https://samtools.github.io/hts-specs/SAMv1.pdf 322 | 1 -- read is one of a pair 323 | 2 -- alignment is one end of proper PE alignment (IGNORE) 324 | 4 -- read has no reported alignments (IGNORE) 325 | 8 -- read is one of a pair and has no reported alignments (IGNORE) 326 | 16 -- reverse ref strand 327 | 32 -- other mate is aligned to ref strand 328 | 64 -- first mate in pair 329 | 128 -- second mate in pair 330 | 256 -- not primary alignment 331 | 512 -- not passing filters 332 | 1024 -- PCR or optical duplicate 333 | 2048 -- supplementary alignment 334 | 335 | Return: SAMflag 336 | """ 337 | PE_read_num = 0 338 | strand = '+' 339 | if flag >= 2048: # supplementary alignment 340 | flag -= 2048 341 | if flag >= 1024: #PCR or optical duplicate, should never see this... 342 | flag -= 1024 343 | if flag >= 512: #not passing QC, should never see this 344 | flag -= 512 345 | if flag >= 256: #secondary alignment, OK to see this if option given in BowTie 346 | flag -= 256 347 | if flag >= 128: 348 | PE_read_num = 2 349 | flag -= 128 350 | elif flag >= 64: 351 | PE_read_num = 1 352 | flag -= 64 353 | if flag >= 32: 354 | flag -= 32 355 | if flag >= 16: 356 | strand = '-' 357 | flag -= 16 358 | if flag >= 8: 359 | flag -= 8 360 | if flag >= 4: 361 | flag -= 4 362 | if flag >= 2: 363 | flag -= 2 364 | assert flag == 0 or flag == 1 365 | is_paired = flag == 1 366 | return SAMRecord.SAMflag(is_paired, strand, PE_read_num) 367 | 368 | 369 | class BLASRSAMReader(SAMReader): 370 | def __next__(self): 371 | line = self.f.readline().strip() 372 | if len(line) == 0: 373 | raise StopIteration 374 | return BLASRSAMRecord(line, self.ref_len_dict, self.query_len_dict) 375 | 376 | class BLASRSAMRecord(SAMRecord): 377 | def process(self, record_line, ref_len_dict=None, query_len_dict=None): 378 | """ 379 | SAM files from pbalign.py have following optional fields: 380 | XS: 1-based qStart, XE: 1-based qEnd, XQ: query length, NM: number of non-matches 381 | 382 | 0. qID 383 | 1. flag 384 | 2. sID 385 | 3. 1-based offset sStart 386 | 4. mapping quality (ignore) 387 | 5. cigar 388 | 6. name of ref of mate alignment (ignore) 389 | 7. 1-based offset sStart of mate (ignore) 390 | 8. inferred fragment length (ignore) 391 | 9. sequence (ignore) 392 | 10. read qual (ignore) 393 | 11. optional fields 394 | """ 395 | raw = record_line.split('\t') 396 | self.qID = raw[0] 397 | self.sID = raw[2] 398 | if self.sID == '*': # means no match! STOP here 399 | return 400 | self.sStart = int(raw[3]) - 1 401 | self.cigar = raw[5] 402 | self.segments = self.parse_cigar(self.cigar, self.sStart) 403 | self.sEnd = self.segments[-1].end 404 | self.flag = SAMRecord.parse_sam_flag(int(raw[1])) 405 | 406 | # In Yuan Li's BLASR-to-SAM, XQ:i: 407 | # see https://github.com/PacificBiosciences/blasr/blob/master/common/datastructures/alignmentset/SAMAlignment.h 408 | for x in raw[11:]: 409 | if x.startswith('XQ:i:'): # XQ should come last, after XS and XE 410 | _qLen = int(x[5:]) 411 | if _qLen > 0: # this is for GMAP's SAM, which has XQ:i:0 412 | self.qLen = _qLen 413 | elif x.startswith('XS:i:'): # must be PacBio's SAM, need to update qStart 414 | qs = int(x[5:]) - 1 # XS is 1-based 415 | if qs > 0: 416 | print("qStart:", self.qStart) 417 | assert self.qStart == 0 418 | self.qStart = qs 419 | self.qEnd += qs 420 | elif x.startswith('XE:i:'): # must be PacBio's SAM and comes after XS:i: 421 | qe = int(x[5:]) # XE is 1-based 422 | assert self.qEnd - self.qStart == qe - 1 # qEnd should've been updated already, confirm this 423 | elif x.startswith('NM:i:'): # number of non-matches 424 | self.num_nonmatches = int(x[5:]) 425 | self.identity = 1. - (self.num_nonmatches * 1. / (self.num_del + self.num_ins + self.num_mat_or_sub)) 426 | 427 | if ref_len_dict is not None: 428 | self.sCoverage = (self.sEnd - self.sStart) * 1. / ref_len_dict[self.sID] 429 | self.sLen = ref_len_dict[self.sID] 430 | 431 | if self.flag.strand == '-' and self.qLen is not None: 432 | self.qStart, self.qEnd = self.qLen - self.qEnd, self.qLen - self.qStart 433 | 434 | if self.qLen is not None: 435 | self.qCoverage = (self.qEnd - self.qStart) * 1. / self.qLen 436 | 437 | if query_len_dict is not None: # over write qLen and qCoverage, should be done LAST 438 | try: 439 | self.qLen = query_len_dict[self.qID] 440 | except KeyError: # HACK for blasr's extended qID 441 | self.qLen = query_len_dict[self.qID[:self.qID.rfind('/')]] 442 | self.qCoverage = (self.qEnd - self.qStart) * 1. / self.qLen 443 | 444 | 445 | class GMAPSAMReader(SAMReader): 446 | def __next__(self): 447 | while True: 448 | line = self.f.readline().strip() 449 | if len(line) == 0: 450 | raise StopIteration 451 | if not line.startswith('@'): # header can occur at file end if the SAM was sorted 452 | break 453 | return GMAPSAMRecord(line, self.ref_len_dict, self.query_len_dict) 454 | 455 | class GMAPSAMRecord(SAMRecord): 456 | def process(self, record_line, ref_len_dict=None, query_len_dict=None): 457 | """ 458 | SAM files from pbalign.py have following optional fields: 459 | XS: 1-based qStart, XE: 1-based qEnd, XQ: query length, NM: number of non-matches 460 | 461 | 0. qID 462 | 1. flag 463 | 2. sID 464 | 3. 1-based offset sStart 465 | 4. mapping quality (ignore) 466 | 5. cigar 467 | 6. name of ref of mate alignment (ignore) 468 | 7. 1-based offset sStart of mate (ignore) 469 | 8. inferred fragment length (ignore) 470 | 9. sequence (ignore) 471 | 10. read qual (ignore) 472 | 11. optional fields 473 | """ 474 | raw = record_line.split('\t') 475 | self.qID = raw[0] 476 | self.sID = raw[2] 477 | if self.sID == '*': # means no match! STOP here 478 | return 479 | self.sStart = int(raw[3]) - 1 480 | self.cigar = raw[5] 481 | self.segments = self.parse_cigar(self.cigar, self.sStart) 482 | self.sEnd = self.segments[-1].end 483 | self.flag = SAMRecord.parse_sam_flag(int(raw[1])) # strand can be overwritten by XS:A flag 484 | self._flag_strand = self.flag.strand # serve as backup for debugging 485 | # In Yuan Li's BLASR-to-SAM, XQ:i: 486 | # see https://github.com/PacificBiosciences/blasr/blob/master/common/datastructures/alignmentset/SAMAlignment.h 487 | for x in raw[11:]: 488 | if x.startswith('NM:i:'): # number of non-matches 489 | self.num_nonmatches = int(x[5:]) 490 | self.identity = 1. - (self.num_nonmatches * 1. / (self.num_del + self.num_ins + self.num_mat_or_sub)) 491 | elif x.startswith('XS:A:'): # strand ifnormation 492 | _s = x[5:] 493 | if _s!='?': 494 | self._flag_strand = self.flag.strand # serve as backup for debugging 495 | self.flag = SAMRecord.SAMflag(self.flag.is_paired, _s, self.flag.PE_read_num) 496 | 497 | if ref_len_dict is not None: 498 | self.sCoverage = (self.sEnd - self.sStart) * 1. / ref_len_dict[self.sID] 499 | self.sLen = ref_len_dict[self.sID] 500 | 501 | if self.flag.strand == '-' and self.qLen is not None: 502 | self.qStart, self.qEnd = self.qLen - self.qEnd, self.qLen - self.qStart 503 | 504 | if self.qLen is not None: 505 | self.qCoverage = (self.qEnd - self.qStart) * 1. / self.qLen 506 | 507 | if query_len_dict is not None: # over write qLen and qCoverage, should be done LAST 508 | try: 509 | self.qLen = query_len_dict[self.qID] 510 | except KeyError: # HACK for blasr's extended qID 511 | k = self.qID.rfind('/') 512 | if k >= 0: 513 | try: 514 | self.qLen = query_len_dict[self.qID[:self.qID.rfind('/')]] 515 | except KeyError: 516 | self.qLen = query_len_dict[self.qID] 517 | else: 518 | raise Exception("Unable to find qID {0} in the input fasta/fastq!".format(self.qID)) 519 | self.qCoverage = (self.qEnd - self.qStart) * 1. / self.qLen 520 | 521 | -------------------------------------------------------------------------------- /phasing/io/VariantPhaser.py: -------------------------------------------------------------------------------- 1 | __author__ = 'etseng@pacb.com' 2 | 3 | import pdb 4 | from collections import defaultdict, namedtuple, Counter 5 | from csv import DictReader 6 | import vcf 7 | import pysam 8 | from Bio.Seq import Seq 9 | from Bio import SeqIO 10 | from .BioReaders import GMAPSAMReader 11 | from .coordinate_mapper import get_base_to_base_mapping_from_sam 12 | 13 | 14 | __VCF_EXAMPLE__ = \ 15 | """ 16 | ##fileformat=VCFv4.2 17 | ##INFO= 18 | ##INFO= 19 | ##FORMAT= 20 | ##FORMAT= 21 | #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT 22 | 20 1 . G A,T . PASS AF=0.5;DB GT 23 | """ 24 | 25 | def type_fa_or_fq(file): 26 | file = file.upper() 27 | if file.endswith('.FA') or file.endswith('.FASTA'): return 'fasta' 28 | else: return 'fastq' 29 | 30 | 31 | class VariantPhaser(object): 32 | def __init__(self, vc): 33 | """ 34 | :param vc: MPileUPVariant instance. 35 | """ 36 | self.vc = vc 37 | self.min_var_pos = min(vc.variant) # mininum 0-based position of a called variant 38 | self.max_var_pos = max(vc.variant) # maximum 0-based position of a called variant 39 | self.accepted_vars_by_pos = {} # 0-based pos --> list of accepted, (NOT strand sense) base 40 | self.count_of_vars_by_pos = {} # 0-based pos --> (NOT strand sense, but ref-based) base --> count 41 | self.accepted_pos = [] # sorted list of variant positions (0-based, ref) 42 | 43 | # process vc.variant which is 44 | # dict of 0-based pos --> desc list of (base, count) 45 | # ex: {1565: [('a', 49), ('g', 36)]} 46 | # lower case means at pos 1565, we expect - strand mapping and 47 | # seq base is 'T' on the sense strand 48 | # this converts to self.accepted_vars_by_pos[1565] = ['A', 'G'] 49 | # later, when we are matchin back to transcript seq, need to watch for strand! 50 | for pos, vars in vc.variant.items(): 51 | self.accepted_vars_by_pos[pos] = [_base.upper() for _base,_count in vars] 52 | self.count_of_vars_by_pos[pos] = dict((_base.upper(), _count) for _base,_count in vars) 53 | 54 | self.accepted_pos = list(self.accepted_vars_by_pos.keys()) 55 | self.accepted_pos.sort() 56 | 57 | self.haplotypes = Haplotypes(self.accepted_pos, self.vc.ref_base, self.count_of_vars_by_pos) 58 | self.seq_hap_info = {} # haplotype assignment, key: (CCS) seqid, value: haplotype index 59 | 60 | 61 | def phase_variant(self, sam_filename, input_fa_or_fq, output_prefix, partial_ok=False): 62 | """ 63 | :param sam_filename: CCS SAM filename. Can be unsorted. 64 | :param input_fa_or_fq: Input CCS fasta/fastq filename. 65 | :param output_prefix: Output prefix. Writes to xxx.log. 66 | :param partial_ok: default False. if True, (CCS) reads don't need to cover all SNP positions. 67 | 68 | For each alignment: 69 | 1. discard if did not map to the strand expected 70 | 2. discard if did not map to the full range of variants (unless is True) 71 | 3. discard if at var positions have non-called bases (outliers) 72 | """ 73 | f_log = open(output_prefix+'.log', 'w') 74 | 75 | seq_dict = SeqIO.to_dict(SeqIO.parse(open(input_fa_or_fq), type_fa_or_fq(input_fa_or_fq))) 76 | for r in GMAPSAMReader(sam_filename, True, query_len_dict=dict((k, len(seq_dict[k].seq)) for k in seq_dict)): 77 | if r.sID == '*': 78 | f_log.write("Ignore {0} because: unmapped.\n".format(r.qID)) 79 | continue 80 | if r.flag.strand != self.vc.expected_strand: 81 | f_log.write("Ignore {0} because: strand is {1}.\n".format(r.qID, r.flag.strand)) 82 | continue # ignore 83 | if not partial_ok and (r.sStart > self.min_var_pos or r.sEnd < self.max_var_pos): 84 | f_log.write("Ignore {0} because: aln too short, from {1}-{2}.\n".format(r.qID, r.sStart+1, r.sEnd)) 85 | continue 86 | 87 | i, msg = self.match_haplotype(r, str(seq_dict[r.qID].seq).upper(), partial_ok) 88 | if i is None: # read is rejected for reason listed in 89 | f_log.write("Ignore {0} because: {1}.\n".format(r.qID, msg)) 90 | continue 91 | else: 92 | f_log.write("{0} phased: haplotype {1}={2}\n".format(r.qID, i, self.haplotypes[i])) 93 | print("{0} has haplotype {1}:{2}".format(r.qID, i, self.haplotypes[i])) 94 | self.seq_hap_info[r.qID] = i 95 | 96 | 97 | def match_haplotype(self, r, s, partial_ok=False): 98 | """ 99 | Match an alignment record to existing haplotypes or create a new one. 100 | Helper function for self.phase_variant() 101 | :param r: CCS alignment (SAM record) 102 | :param s: CCS sequence (in strand), must be plain str and every base is upper case 103 | :param partial_ok: default False. if True, (CCS) reads don't need to cover all SNP positions. 104 | 105 | :return: (haplotype_index, msg) or (None, msg) if variants don't match w/ called SNPs 106 | """ 107 | assert type(s) is str and str.isupper(s) 108 | assert r.flag.strand == self.vc.expected_strand 109 | # m: mapping of 0-based seq --> 0-based ref position 110 | # rev_map: mapping of 0-based ref position --> 0-based seq 111 | m = get_base_to_base_mapping_from_sam(r.segments, r.cigar, r.qStart, r.qEnd, r.flag.strand) 112 | ref_m = dict((v,k) for k,v in m.items()) 113 | 114 | # go through each variant 115 | # to represent the concatenated string of all variant positions for this seq 116 | # ex: if there are three var positions, a hap would be "ATG" or "A?G" (if partial_ok is True), etc. 117 | hap = '' 118 | impute_later = False 119 | for ref_pos in self.accepted_pos: 120 | if ref_pos not in ref_m: 121 | if partial_ok: # read does not cover one of the SNP positions, so use "?" 122 | hap += "?" 123 | else: 124 | return None, "Does not have base at ref_pos {0}.\n".format(ref_pos) 125 | else: 126 | base = s[ref_m[ref_pos]] 127 | if self.vc.expected_strand == '-': # must convert the base to the rev comp 128 | base = str(Seq(base).reverse_complement()).upper() 129 | if base in self.accepted_vars_by_pos[ref_pos]: 130 | hap += base 131 | else: # contains a base at a variant position that is not called. Try to impute. 132 | hap += base 133 | impute_later = True 134 | 135 | if all(b=='?' for b in hap): 136 | return None, "Does not cover any variant base." 137 | 138 | if impute_later: 139 | impute_i = self.haplotypes.impute_haplotype(hap, min_score=3) 140 | if impute_i is None: 141 | return None, "Seq {0} contained non-called variant. Impute failed.\n".format(hap) 142 | else: 143 | return impute_i, "IMPUTED" 144 | return self.haplotypes.match_or_add_haplotype(hap_string=hap) 145 | 146 | 147 | 148 | def phase_isoforms(read_stat_filename, seqids, phaser): 149 | """ 150 | :param read_stat_filename: the .read_stat file that has columns and , where is CCS id and is PB.X.Y 151 | :param seqids: CCS IDs that were used to create the haplotypes. 152 | :param phaser: VariantPhaser object that contains the haplotype and seqid->haplotype information. 153 | 154 | :return: list of (isoform, dict of haplotype count), ex: {'PB.45.1': {0:10, 1:20}} 155 | which means PB.45.1 has haplotype 0 supported by 10 CCS reads and hap 1 supported by 20 CCS reads. 156 | 157 | *NOTE* currently uses FL CCS reads only (even if the SNPs may have been called by FL+nFL CCS SAM) 158 | """ 159 | result = {} # dict of (isoform, dict of haplotype_index --> CCS count supporting it 160 | # from read stat, gather which isoforms have which (CCS) seq members. 161 | isoforms = defaultdict(lambda: []) # key: PB.X.Y, value: list of seqid members 162 | for r in DictReader(open(read_stat_filename), delimiter='\t'): 163 | if r['id'] in seqids and r['is_fl']=='Y': 164 | isoforms[r['pbid']].append(r['id']) 165 | 166 | # for each isoform, look at the CCS membership to know which haplotypes are expressed 167 | for _iso, _seqids in isoforms.items(): 168 | tally = defaultdict(lambda: 0) # haplotype index --> count (of CCS) 169 | for seqid in _seqids: 170 | if seqid in phaser.seq_hap_info: # some CCS (seqids) may not have been used by the phaser, so account for that 171 | tally[phaser.seq_hap_info[seqid]] += 1 172 | if len(tally) > 0: 173 | result[_iso] = dict(tally) 174 | return result 175 | 176 | 177 | class Haplotypes(object): 178 | """ 179 | Storing haplotypes for a loci. 180 | 181 | self.haplotype[i] is the i-th haplotype. 182 | if N = len(self.haplotype[i]), then there are N variants along the loci. 183 | self.hap_var_positions[j] means that the j-th variant corressponds to (0-based) position on the ref genome. 184 | """ 185 | def __init__(self, var_positions, ref_at_pos, count_of_vars_by_pos): 186 | """ 187 | :param var_positions: sorted list of (0-based) variant positions 188 | :param ref_at_pos: dict of (0-based) variant position --> ref base at this position 189 | :param count_of_vars_by_pos: 0-based pos --> (NOT strand sense, but ref-based) base --> count 190 | """ 191 | self.haplotypes = [] # haplotypes, where haplotypes[i] is the i-th distinct haplotype of all var concat 192 | self.hap_var_positions = var_positions 193 | self.ref_at_pos = ref_at_pos # dict of (0-based) pos --> ref base 194 | self.alt_at_pos = None # init: None, later: dict of (0-based) pos --> unique list of alt bases 195 | self.count_of_vars_by_pos = count_of_vars_by_pos 196 | self.haplotype_vcf_index = None # init: None, later: dict of (hap index) --> (0-based) var pos --> phase (0 for ref, 1+ for alt) 197 | 198 | # sanity check: all variant positions must be present 199 | self.sanity_check() 200 | 201 | def __getitem__(self, ith): 202 | """ 203 | Returns the -th haplotype 204 | """ 205 | return self.haplotypes[ith] 206 | 207 | def __str__(self): 208 | return """ 209 | var positions: {pp} 210 | haplotypes: \n{h} 211 | """.format(pp=",".join(map(str,self.hap_var_positions)), 212 | h="\n".join(self.haplotypes)) 213 | 214 | def sanity_check(self): 215 | """ 216 | Sanity check the following: 217 | -- variant positions are properly recorded and concordant 218 | -- alt bases are truly alt and unique 219 | -- all haplotypes are the same length 220 | """ 221 | for pos in self.hap_var_positions: 222 | assert pos in self.ref_at_pos 223 | 224 | if self.alt_at_pos is not None: 225 | for pos in self.alt_at_pos: 226 | # ref base must not be in alt 227 | assert self.ref_at_pos[pos] not in self.alt_at_pos[pos] 228 | # alt bases must be unique 229 | assert len(self.alt_at_pos[pos]) == len(set(self.alt_at_pos[pos])) 230 | 231 | if len(self.haplotypes) >= 1: 232 | n = len(self.haplotypes[0]) 233 | assert n == len(self.hap_var_positions) 234 | for hap_str in self.haplotypes[1:]: 235 | assert len(hap_str) == n 236 | 237 | 238 | def match_or_add_haplotype(self, hap_string): 239 | """ 240 | If is an existing haplotype, return the index. 241 | Otherwise, add to known haplotypes and return the new index. 242 | 243 | :return: , "FOUND" or "NEW" 244 | """ 245 | if hap_string in self.haplotypes: 246 | i = self.haplotypes.index(hap_string) 247 | return i, "FOUND" 248 | else: 249 | i = len(self.haplotypes) 250 | self.haplotypes.append(hap_string) 251 | return i, "NEW" 252 | 253 | def impute_haplotype(self, hap_string, min_score): 254 | """ 255 | :param hap_string: a hap string with '?'s. 256 | :param min_sim: minimum similarity with existing haplotype to accept assignment 257 | :return: of an existing haplotype, or None if not sufficiently matched 258 | 259 | Impute haplotype and only return a match if: 260 | (a) score (similarity) is >= min_score 261 | (b) the matching score for the best one is higher than the second best match 262 | """ 263 | sim_tuple = namedtuple('sim_tuple', 'index score') 264 | sims = [] # list of sim_tuple 265 | hap_str_len = len(hap_string) 266 | for i in range(len(self.haplotypes)): 267 | # Liz note: currently NOT checking whether existing haplotypes have '?'. I'm assuming no '?'. 268 | score = sum((hap_string[k]==self.haplotypes[i][k]) for k in range(hap_str_len)) 269 | if score > 0: 270 | sims.append(sim_tuple(index=i, score=score)) 271 | if len(sims) == 0: 272 | return None 273 | sims.sort(key=lambda x: x.score, reverse=True) 274 | if sims[0].score >= min_score and (len(sims)==1 or sims[0].score > sims[1].score): 275 | return sims[0].index 276 | else: 277 | return None 278 | 279 | def get_haplotype_vcf_assignment(self): 280 | """ 281 | Must be called before self.write_haplotype_to_vcf() 282 | This is preparing for writing out VCF. We need to know, for each variant position, 283 | the ref base (already filled in self.ref_at_pos) and the alt bases (self.alt_at_pos). 284 | For each haplotype in (self.haplotype), we need to know the whether the i-th variant is the 285 | ref (index 0), or some alt base (index 1 and onwards). 286 | 287 | Propagates two variables: 288 | 289 | self.haplotype_vcf_index: hap index --> pos --> phase index (0 for ref, 1+ for alt) 290 | self.alt_at_pos: dict of <0-based pos> --> alt bases (not is not ref) at this position 291 | """ 292 | self.haplotype_vcf_index = [{} for i in range(len(self.haplotypes))] 293 | self.alt_at_pos = {} 294 | 295 | # what happens in the case of partial phasing 296 | # ex: self.haplotypes[0] = "A?G", this means when it comes to the second pos, pos2, 297 | # in the VCF we would want to write out .|. for diploid, . for haploid, etc 298 | # so let's set self.haplotype_vcf_index[0][pos2] = '.' to indicate that 299 | 300 | for i,pos in enumerate(self.hap_var_positions): 301 | ref = self.ref_at_pos[pos] 302 | # need to go through the haplotype bases, if ref is already represented, then don't put it in alt 303 | self.alt_at_pos[pos] = [] 304 | for hap_i, hap_str in enumerate(self.haplotypes): 305 | base = hap_str[i] 306 | if base=='?': # means this haplotype does not cover this position! 307 | self.haplotype_vcf_index[hap_i][pos] = '.' 308 | elif base==ref: # is the ref base 309 | self.haplotype_vcf_index[hap_i][pos] = 0 310 | else: # is an alt base, see if it's already there 311 | if base in self.alt_at_pos[pos]: 312 | j = self.alt_at_pos[pos].index(base) 313 | self.haplotype_vcf_index[hap_i][pos] = j + 1 # always +1, buz alt starts at 1 (0 is ref) 314 | else: 315 | j = len(self.alt_at_pos[pos]) 316 | self.alt_at_pos[pos].append(base) 317 | self.haplotype_vcf_index[hap_i][pos] = j + 1 # always +1, buz alt starts at 1 (0 is ref) 318 | # in the case where partial_ok=False, it's possible some alt are never presented by a haplotype 319 | # we must check that all variants are presented here 320 | for _base in self.count_of_vars_by_pos[pos]: 321 | if (_base not in self.ref_at_pos[pos]) and (_base not in self.alt_at_pos[pos]): 322 | self.alt_at_pos[pos].append(_base) 323 | 324 | 325 | def write_haplotype_to_vcf(self, fake_genome_mapping_filename, isoform_tally, output_prefix): 326 | """ 327 | The following functions must first be called first: 328 | -- self.get_haplotype_vcf_assignment 329 | """ 330 | if self.haplotype_vcf_index is None or self.alt_at_pos is None: 331 | raise Exception("Must call self.get_haplotype_vcf_assignment() first!") 332 | 333 | self.sanity_check() 334 | 335 | name_isoforms = list(isoform_tally.keys()) 336 | name_isoforms.sort() 337 | 338 | # write a fake VCF example so we can read the headers in 339 | with open('template.vcf', 'w') as f: 340 | f.write(__VCF_EXAMPLE__) 341 | reader = vcf.VCFReader(open('template.vcf')) 342 | reader.samples = name_isoforms 343 | f_vcf = vcf.Writer(open(output_prefix+'.vcf', 'w'), reader) 344 | 345 | 346 | # human readable text: 347 | # first line: assoc VCF filename 348 | # second line: haplotype, list of sorted isoforms 349 | # third line onwards: haplotype and assoc count 350 | f_human = open(output_prefix+'.human_readable.txt', 'w') 351 | f_human.write("Associated VCF file: {0}.vcf\n".format(output_prefix)) 352 | f_human.write("haplotype\t{samples}\n".format(samples="\t".join(name_isoforms))) 353 | for hap_index,hap_str in enumerate(self.haplotypes): 354 | f_human.write(hap_str) 355 | for _iso in name_isoforms: 356 | if hap_index in isoform_tally[_iso]: 357 | f_human.write("\t{0}".format(isoform_tally[_iso][hap_index])) 358 | else: 359 | f_human.write("\t0") 360 | f_human.write('\n') 361 | f_human.close() 362 | 363 | 364 | # read fake genome mapping file 365 | fake_map = {} # 0-based position on fake --> (chr, 0-based ref position) 366 | with open(fake_genome_mapping_filename) as f: 367 | for line in f: 368 | fake_pos, ref_chr, ref_pos = line.strip().split(',') 369 | fake_map[int(fake_pos)] = (ref_chr, int(ref_pos)) 370 | 371 | 372 | # for each position, write out the ref and alt bases 373 | # then fill in for each isoform (aka "sample"): 374 | # if this isoform only shows one allele, then it's just that allele (0 for ref, 1+ otherwise) 375 | # if this isoform shows 2+ allele, then the first allele is indicated by self.haplotypes[0] 376 | for i,pos in enumerate(self.hap_var_positions): 377 | ref_chr, ref_pos = fake_map[pos] 378 | total_count = sum(self.count_of_vars_by_pos[pos].values()) 379 | alt_freq = ["{0:.2f}".format(self.count_of_vars_by_pos[pos][b]*1./total_count) for b in self.alt_at_pos[pos]] 380 | rec = vcf.model._Record(CHROM=ref_chr, 381 | POS=ref_pos+1, 382 | ID='.', 383 | REF=self.ref_at_pos[pos], 384 | ALT=[vcf.model._Substitution(b) for b in self.alt_at_pos[pos]], 385 | QUAL='.', 386 | FILTER='PASS', 387 | INFO={'AF':alt_freq, 'DP':total_count}, 388 | FORMAT="GT:HQ", 389 | sample_indexes=None) 390 | samp_ft = vcf.model.make_calldata_tuple(['GT', 'HQ']) 391 | rec.samples = [] 392 | for _iso in name_isoforms: 393 | # isoform_tally[_iso] is a dict of haplotype index --> count 394 | # the index for thos base at this pos would thus be haplotype_vcf_index[hap_index][i] 395 | # we always need to show the phases in haplotype index order sorted 396 | hap_indices = list(isoform_tally[_iso].keys()) 397 | hap_indices.sort() 398 | genotype = "|".join(str(self.haplotype_vcf_index[hap_index][pos]) for hap_index in hap_indices) 399 | counts = ",".join(str(isoform_tally[_iso][hap_index]) for hap_index in hap_indices) 400 | rec.samples.append(vcf.model._Call(rec, _iso, samp_ft(*[genotype, counts]))) 401 | f_vcf.write_record(rec) 402 | f_vcf.close() 403 | 404 | 405 | def get_base_to_base_mapping_from_aligned_pairs(reftuple, qLen, strand): 406 | """ 407 | Returns: dict of 0-based position --> 0-based ref position 408 | """ 409 | cur_genome_loc = reftuple[0][1] 410 | 411 | mapping = {} 412 | for qpos, rpos in reftuple: 413 | if qpos is not None and rpos is not None: 414 | mapping[qpos] = (rpos, True) 415 | elif qpos is not None: 416 | mapping[qpos] = (cur_genome_loc, None) 417 | if rpos is not None: cur_genome_loc = rpos 418 | 419 | if strand == '-': 420 | mapping = dict((qLen-1-k, v) for k,v in mapping.items()) 421 | 422 | for k in mapping: 423 | mapping[k] = mapping[k][0] 424 | 425 | return mapping 426 | 427 | 428 | class MagVariantPhaser(object): 429 | def __init__(self, vc): 430 | """ 431 | :param vc: MPileUPVariant instance. 432 | """ 433 | self.vc = vc 434 | self.min_var_pos = min(vc.variant) # mininum 0-based position of a called variant 435 | self.max_var_pos = max(vc.variant) # maximum 0-based position of a called variant 436 | self.accepted_vars_by_pos = {} # 0-based pos --> list of accepted, (NOT strand sense) base 437 | self.count_of_vars_by_pos = {} # 0-based pos --> (NOT strand sense, but ref-based) base --> count 438 | self.accepted_pos = [] # sorted list of variant positions (0-based, ref) 439 | 440 | # process vc.variant which is 441 | # dict of 0-based pos --> desc list of (base, count) 442 | # ex: {1565: [('a', 49), ('g', 36)]} 443 | # lower case means at pos 1565, we expect - strand mapping and 444 | # seq base is 'T' on the sense strand 445 | # this converts to self.accepted_vars_by_pos[1565] = ['A', 'G'] 446 | # later, when we are matchin back to transcript seq, need to watch for strand! 447 | for pos, vars in vc.variant.items(): 448 | self.accepted_vars_by_pos[pos] = [_base.upper() for _base,_count in vars] 449 | self.count_of_vars_by_pos[pos] = dict((_base.upper(), _count) for _base,_count in vars) 450 | 451 | self.accepted_pos = list(self.accepted_vars_by_pos.keys()) 452 | self.accepted_pos.sort() 453 | 454 | self.haplotypes = MagHaplotypes(self.accepted_pos, [self.vc.ref_name[p] for p in self.accepted_pos], self.vc.ref_base, self.count_of_vars_by_pos) 455 | self.seq_hap_info = {} # haplotype assignment, key: (CCS) seqid, value: haplotype index 456 | 457 | 458 | def phase_variant(self, sam_filename, coordstr, output_prefix, partial_ok=False): 459 | """ 460 | :param sam_filename: CCS SAM filename. Can be unsorted. 461 | :param coordstr: list of [contig, start, end] 462 | :param output_prefix: Output prefix. Writes to xxx.log. 463 | :param partial_ok: default False. if True, (CCS) reads don't need to cover all SNP positions. 464 | 465 | For each alignment: 466 | 1. discard if did not map to the strand expected 467 | 2. discard if did not map to the full range of variants (unless is True) 468 | 3. discard if at var positions have non-called bases (outliers) 469 | """ 470 | f_log = open(output_prefix+'.log', 'a+') 471 | 472 | contig, start, end = coordstr 473 | 474 | secondary_align_counts = 0 475 | tot_align_counts = 0 476 | with pysam.AlignmentFile(sam_filename, 'rb') as samfile: 477 | for s in samfile.fetch(contig, start, end): 478 | tot_align_counts += 1 479 | if s.reference_name == '*': 480 | f_log.write("Ignore {0} because: unmapped.\n".format(s.query_name)) 481 | continue 482 | if not partial_ok and (s.reference_start > self.min_var_pos or s.reference_end < self.max_var_pos): 483 | f_log.write("Ignore {0} because: aln too short, from {1}-{2}.\n".format(s.query_name, s.referenc_start+1, s.reference_end)) 484 | continue 485 | if s.is_secondary: 486 | secondary_align_counts += 1 487 | continue 488 | seqstr = s.query_sequence.upper() 489 | i, msg = self.match_haplotype(s, seqstr, partial_ok) 490 | if i is None: # read is rejected for reason listed in 491 | f_log.write("Ignore {0} because: {1}.\n".format(s.query_name, msg)) 492 | continue 493 | else: 494 | f_log.write("{0} phased: haplotype {1}={2}\n".format(s.query_name, i, self.haplotypes[i])) 495 | print("{0} has haplotype {1}:{2}".format(s.query_name, i, self.haplotypes[i])) 496 | self.seq_hap_info[s.query_name] = i 497 | f_log.write(f'Encountered {secondary_align_counts} out of {tot_align_counts} read alignments') 498 | 499 | 500 | def match_haplotype(self, r, s, partial_ok=False): 501 | """ 502 | Match an alignment record to existing haplotypes or create a new one. 503 | Helper function for self.phase_variant() 504 | :param r: CCS alignment (pysam record) 505 | :param s: CCS sequence (in strand), must be plain str and every base is upper case 506 | :param partial_ok: default False. if True, (CCS) reads don't need to cover all SNP positions. 507 | 508 | :return: (haplotype_index, msg) or (None, msg) if variants don't match w/ called SNPs 509 | """ 510 | try: 511 | assert type(s) is str and str.isupper(s) 512 | except Exception as e: 513 | print(f'exception: {s}') 514 | # m: mapping of 0-based seq --> 0-based ref position 515 | # rev_map: mapping of 0-based ref position --> 0-based seq 516 | strand = '-' if r.is_reverse else '+' 517 | m = get_base_to_base_mapping_from_aligned_pairs(r.get_aligned_pairs(), len(r.query_sequence), strand) 518 | ref_m = dict((v,k) for k,v in m.items()) 519 | 520 | # go through each variant 521 | # to represent the concatenated string of all variant positions for this seq 522 | # ex: if there are three var positions, a hap would be "ATG" or "A?G" (if partial_ok is True), etc. 523 | hap = '' 524 | impute_later = False 525 | for ref_pos in self.accepted_pos: 526 | if ref_pos not in ref_m: 527 | if partial_ok: # read does not cover one of the SNP positions, so use "?" 528 | hap += "?" 529 | else: 530 | return None, "Does not have base at ref_pos {0}.\n".format(ref_pos) 531 | else: 532 | base = s[ref_m[ref_pos]] 533 | if base in self.accepted_vars_by_pos[ref_pos]: 534 | hap += base 535 | else: # contains a base at a variant position that is not called. Try to impute. 536 | hap += base 537 | impute_later = True 538 | 539 | if all(b=='?' for b in hap): 540 | return None, "Does not cover any variant base." 541 | 542 | if impute_later: 543 | impute_i = self.haplotypes.impute_haplotype(hap, min_score=3) 544 | if impute_i is None: 545 | return None, "Seq {0} contained non-called variant. Impute failed.\n".format(hap) 546 | else: 547 | return impute_i, "IMPUTED" 548 | return self.haplotypes.match_or_add_haplotype(hap_string=hap) 549 | 550 | 551 | class MagHaplotypes(object): 552 | """ 553 | Storing haplotypes for a loci. 554 | 555 | self.haplotype[i] is the i-th haplotype. 556 | if N = len(self.haplotype[i]), then there are N variants along the loci. 557 | self.hap_var_positions[j] means that the j-th variant corressponds to (0-based) position on the ref genome. 558 | """ 559 | def __init__(self, var_positions, chrs, ref_at_pos, count_of_vars_by_pos): 560 | """ 561 | :param var_positions: sorted list of (0-based) variant positions 562 | :param ref_at_pos: dict of (0-based) variant position --> ref base at this position 563 | :param count_of_vars_by_pos: 0-based pos --> (NOT strand sense, but ref-based) base --> count 564 | """ 565 | self.haplotypes = [] # haplotypes, where haplotypes[i] is the i-th distinct haplotype of all var concat 566 | self.hap_var_positions = var_positions 567 | self.ref_at_pos = ref_at_pos # dict of (0-based) pos --> ref base 568 | self.alt_at_pos = None # init: None, later: dict of (0-based) pos --> unique list of alt bases 569 | self.count_of_vars_by_pos = count_of_vars_by_pos 570 | self.haplotype_vcf_index = None # init: None, later: dict of (hap index) --> (0-based) var pos --> phase (0 for ref, 1+ for alt) 571 | self.chrs = chrs # contig names where chrs[i] is the i-th contig name 572 | 573 | # sanity check: all variant positions must be present 574 | self.sanity_check() 575 | 576 | def __getitem__(self, ith): 577 | """ 578 | Returns the -th haplotype 579 | """ 580 | return self.haplotypes[ith] 581 | 582 | def __str__(self): 583 | return """ 584 | var positions: {pp} 585 | haplotypes: \n{h} 586 | """.format(pp=",".join(map(str,self.hap_var_positions)), 587 | h="\n".join(self.haplotypes)) 588 | 589 | def sanity_check(self): 590 | """ 591 | Sanity check the following: 592 | -- variant positions are properly recorded and concordant 593 | -- alt bases are truly alt and unique 594 | -- all haplotypes are the same length 595 | """ 596 | for pos in self.hap_var_positions: 597 | assert pos in self.ref_at_pos 598 | 599 | if self.alt_at_pos is not None: 600 | for pos in self.alt_at_pos: 601 | # ref base must not be in alt 602 | assert self.ref_at_pos[pos] not in self.alt_at_pos[pos] 603 | # alt bases must be unique 604 | assert len(self.alt_at_pos[pos]) == len(set(self.alt_at_pos[pos])) 605 | 606 | if len(self.haplotypes) >= 1: 607 | n = len(self.haplotypes[0]) 608 | assert n == len(self.hap_var_positions) 609 | for hap_str in self.haplotypes[1:]: 610 | assert len(hap_str) == n 611 | 612 | 613 | def match_or_add_haplotype(self, hap_string): 614 | """ 615 | If is an existing haplotype, return the index. 616 | Otherwise, add to known haplotypes and return the new index. 617 | 618 | :return: , "FOUND" or "NEW" 619 | """ 620 | if hap_string in self.haplotypes: 621 | i = self.haplotypes.index(hap_string) 622 | return i, "FOUND" 623 | else: 624 | i = len(self.haplotypes) 625 | self.haplotypes.append(hap_string) 626 | return i, "NEW" 627 | 628 | def impute_haplotype(self, hap_string, min_score): 629 | """ 630 | :param hap_string: a hap string with '?'s. 631 | :param min_sim: minimum similarity with existing haplotype to accept assignment 632 | :return: of an existing haplotype, or None if not sufficiently matched 633 | 634 | Impute haplotype and only return a match if: 635 | (a) score (similarity) is >= min_score 636 | (b) the matching score for the best one is higher than the second best match 637 | """ 638 | sim_tuple = namedtuple('sim_tuple', 'index score') 639 | sims = [] # list of sim_tuple 640 | hap_str_len = len(hap_string) 641 | for i in range(len(self.haplotypes)): 642 | # Liz note: currently NOT checking whether existing haplotypes have '?'. I'm assuming no '?'. 643 | score = sum((hap_string[k]==self.haplotypes[i][k]) for k in range(hap_str_len)) 644 | if score > 0: 645 | sims.append(sim_tuple(index=i, score=score)) 646 | if len(sims) == 0: 647 | return None 648 | sims.sort(key=lambda x: x.score, reverse=True) 649 | if sims[0].score >= min_score and (len(sims)==1 or sims[0].score > sims[1].score): 650 | return sims[0].index 651 | else: 652 | return None 653 | 654 | def get_haplotype_vcf_assignment(self): 655 | """ 656 | Must be called before self.write_haplotype_to_vcf() 657 | This is preparing for writing out VCF. We need to know, for each variant position, 658 | the ref base (already filled in self.ref_at_pos) and the alt bases (self.alt_at_pos). 659 | For each haplotype in (self.haplotype), we need to know the whether the i-th variant is the 660 | ref (index 0), or some alt base (index 1 and onwards). 661 | 662 | Propagates two variables: 663 | 664 | self.haplotype_vcf_index: hap index --> pos --> phase index (0 for ref, 1+ for alt) 665 | self.alt_at_pos: dict of <0-based pos> --> alt bases (not is not ref) at this position 666 | """ 667 | self.haplotype_vcf_index = [{} for i in range(len(self.haplotypes))] 668 | self.alt_at_pos = {} 669 | 670 | # what happens in the case of partial phasing 671 | # ex: self.haplotypes[0] = "A?G", this means when it comes to the second pos, pos2, 672 | # in the VCF we would want to write out .|. for diploid, . for haploid, etc 673 | # so let's set self.haplotype_vcf_index[0][pos2] = '.' to indicate that 674 | 675 | for i,pos in enumerate(self.hap_var_positions): 676 | ref = self.ref_at_pos[pos] 677 | # need to go through the haplotype bases, if ref is already represented, then don't put it in alt 678 | self.alt_at_pos[pos] = [] 679 | for hap_i, hap_str in enumerate(self.haplotypes): 680 | base = hap_str[i] 681 | if base=='?': # means this haplotype does not cover this position! 682 | self.haplotype_vcf_index[hap_i][pos] = '.' 683 | elif base==ref: # is the ref base 684 | self.haplotype_vcf_index[hap_i][pos] = 0 685 | else: # is an alt base, see if it's already there 686 | if base in self.alt_at_pos[pos]: 687 | j = self.alt_at_pos[pos].index(base) 688 | self.haplotype_vcf_index[hap_i][pos] = j + 1 # always +1, buz alt starts at 1 (0 is ref) 689 | else: 690 | j = len(self.alt_at_pos[pos]) 691 | self.alt_at_pos[pos].append(base) 692 | self.haplotype_vcf_index[hap_i][pos] = j + 1 # always +1, buz alt starts at 1 (0 is ref) 693 | # in the case where partial_ok=False, it's possible some alt are never presented by a haplotype 694 | # we must check that all variants are presented here 695 | for _base in self.count_of_vars_by_pos[pos]: 696 | if (_base not in self.ref_at_pos[pos]) and (_base not in self.alt_at_pos[pos]): 697 | self.alt_at_pos[pos].append(_base) 698 | 699 | 700 | def write_haplotype_to_humanreadable(self, contig, f_human1, f_human2, f_human3, seq_hap_info): 701 | """ 702 | The following functions must first be called first: 703 | -- self.get_haplotype_vcf_assignment 704 | f_human1 : human readable tab file handle, one SNP per line 705 | f_human2: human readable tab file handle, one allele per line 706 | f_human3: human readable tab file handle, CCS read to haplotype assignment, one read per line 707 | """ 708 | if self.haplotype_vcf_index is None or self.alt_at_pos is None: 709 | raise Exception("Must call self.get_haplotype_vcf_assignment() first!") 710 | 711 | self.sanity_check() 712 | 713 | # f_human1.write("haplotype\thapIdx\tcontig\tpos\tvarIdx\tbase\tcount\n") 714 | # f_human2.write("haplotype\thapIdx\tcontig\tcount\n") 715 | # f_human3.write("read_id\thaplotype\thapIdx\n") 716 | 717 | hap_count = Counter() 718 | for ccs_id, hap_index in seq_hap_info.items(): 719 | hap_count[hap_index] += 1 720 | hap_str = self.haplotypes[hap_index] 721 | f_human3.write(f'{ccs_id}\t{hap_str}\t{hap_index}\n') 722 | 723 | for hap_index,hap_str in enumerate(self.haplotypes): 724 | f_human2.write(f'{hap_str}\t{hap_index}\t{contig}\t') 725 | f_human2.write(str(hap_count[hap_index]) + '\n') 726 | for pos_index,pos in enumerate(self.hap_var_positions): 727 | i = self.haplotype_vcf_index[hap_index][pos] 728 | if i == '.': # means this haplotype does not include this position, skip! 729 | continue 730 | assert type(i) is int 731 | f_human1.write(f'{hap_str}\t{hap_index}\t{contig}\t') 732 | f_human1.write(str(pos+1)+'\t') 733 | f_human1.write(str(pos_index+1)+'\t') 734 | if i == 0: 735 | base = self.ref_at_pos[pos] 736 | f_human1.write("REF\t") 737 | else: 738 | base = self.alt_at_pos[pos][i-1] 739 | f_human1.write("ALT" + str(i-1) + '\t') 740 | #if i>0: pdb.set_trace() 741 | f_human1.write(str(self.count_of_vars_by_pos[pos][base]) + '\n') 742 | 743 | --------------------------------------------------------------------------------