├── Dockerfile
├── LICENSE.txt
├── README.md
├── debreak_detect.py
├── debreak_merge.py
├── debreak_merge_clustering.py
├── denovo_baseerror.py
├── denovo_correct.py
├── denovo_plot.py
├── denovo_static.py
├── inspector-correct.py
├── inspector.py
└── testdata
    ├── contig_test.fa
    └── read_test.fastq.gz


/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM ubuntu:18.04
 2 | 
 3 | 
 4 | RUN apt-get update \
 5 |     && apt-get install -y --no-install-recommends \
 6 |         build-essential \
 7 |         bzip2 \
 8 |         curl \
 9 |         git \
10 |         less \
11 |         sudo \
12 |         vim \
13 |         wget \
14 |         zlib1g-dev \
15 | 	libbz2-dev \
16 | 	liblzma-dev \
17 |     && rm -rf /var/lib/apt/lists/*
18 | 
19 | RUN sudo apt -y update
20 | RUN sudo apt -y upgrade
21 | RUN sudo apt -y install python2.7 python-pip
22 | RUN pip install pysam
23 | RUN python -m pip install -U matplotlib
24 | RUN pip install statsmodels==0.10.1
25 | 
26 | 
27 | RUN curl -L  https://github.com/samtools/samtools/releases/download/1.9/samtools-1.9.tar.bz2 | tar -jxvf -
28 | WORKDIR samtools-1.9
29 | RUN ./configure --without-curses 
30 | RUN make && make install
31 | WORKDIR ..
32 | 
33 | 
34 | RUN curl -L https://github.com/lh3/minimap2/releases/download/v2.15/minimap2-2.15_x64-linux.tar.bz2 | tar -jxvf -
35 | ENV PATH="minimap2-2.15_x64-linux/:${PATH}"
36 | 
37 | RUN pip install setuptools
38 | RUN git clone https://github.com/fenderglass/Flye
39 | WORKDIR Flye
40 | RUN git checkout tags/2.8.3 -b inspector-flye
41 | RUN python setup.py install
42 | WORKDIR ..
43 | RUN git clone https://github.com/Maggi-Chen/Inspector.git
44 | ENV PATH="Inspector/:${PATH}"
45 | 
46 | 


--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
 1 | The MIT License
 2 | 
 3 | Copyright (c) 2020-     University of Alabama at Birmingham
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining
 6 | a copy of this software and associated documentation files (the
 7 | "Software"), to deal in the Software without restriction, including
 8 | without limitation the rights to use, copy, modify, merge, publish,
 9 | distribute, sublicense, and/or sell copies of the Software, and to
10 | permit persons to whom the Software is furnished to do so, subject to
11 | the following conditions:
12 | 
13 | The above copyright notice and this permission notice shall be
14 | included in all copies or substantial portions of the Software.
15 | 
16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
20 | BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
21 | ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
22 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23 | SOFTWARE.
24 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Inspector
  2 | 
  3 | A reference-free assembly evaluator.
  4 | 
  5 | Author: Maggi Chen
  6 | 
  7 | Email: maggic@uab.edu
  8 | 
  9 | Draft date: Apr. 20, 2021
 10 | 
 11 | ## Quick Start
 12 | ```sh
 13 | git clone https://github.com/ChongLab/Inspector.git
 14 | cd Inspector/
 15 | ./inspector.py -h
 16 | 
 17 | # Evaluate assembly with raw reads
 18 | inspector.py -c contig.fa -r rawreads.1.fastq rawreads.2.fastq -o inspector_out/ --datatype clr 
 19 | # Evaluate assembly with hifi reads
 20 | inspector.py -c contig.fa -r ccsreads.1.fastq ccsreads.2.fastq -o inspector_out/ --datatype hifi
 21 | 
 22 | # With reference-based evaluation
 23 | inspector.py -c contig.fa -r rawreads.1.fastq --ref reference.fa -o inspector_out/ --datatype clr
 24 | 
 25 | # Reference-based only evaluation
 26 | inspector.py -c contig.fa -r emptyfile --ref reference.fa -o inspector_out/ 
 27 | 
 28 | # Error correction
 29 | inspector-correct.py -i inspector_out/ --datatype pacbio-hifi -o inspector_out/
 30 | 
 31 | ```
 32 | 
 33 | 
 34 | 
 35 | ## Description
 36 | 
 37 | Inspector is a tool for assembly evaluation with long read data. The input includes a contig file, long reads (PacBio CLR, PacBio HiFi, Oxford Nanopore, or mixed platform), and a reference genome (optional). The output includes a summary report, a read-to-contig alignment file, and lists of structural errors and small-scale errors. This program was tested on an x86_64 Linux system with 128GB of physical memory.
 38 | 
 39 | ## Dependencies
 40 | 
 41 | Dependencies for Inspector:
 42 | 
 43 | * python  
 44 | * pysam
 45 | * statsmodels (tested with version 0.10.1)
 46 | 
 47 | * minimap2  (tested with version 2.10 and 2.15)
 48 | * samtools  (tested with version 1.9)
 49 | 
 50 | 
 51 | Dependencies for Inspector error correction module:
 52 | * flye  (tested with version 2.8.3)
 53 | 
 54 | 
 55 | ## Installation
 56 | 
 57 | To create an environment with conda or mamba (recommended):
 58 | ```
 59 | mamba create --name ins inspector
 60 | mamba activate ins
 61 | 
 62 | ```
 63 | Git install after installing all the dependencies. 
 64 | ```
 65 | git clone https://github.com/ChongLab/Inspector.git
 66 | export PATH=$PWD/Inspector/:$PATH
 67 | ```
 68 | 
 69 | 
 70 | 
 71 | A subset of human genome assembly is available as testing dataset to validate successful installation. The contig_test.fa includes two contigs (1.4Mbp and 10Kbp). The read_test.fastq.gz includes ~60X PacBio HiFi reads belonging to these two contigs. There are 3 structural errors and 281 small-scale errors present in the testing dataset.
 72 | ```
 73 | cd Inspector/
 74 | ./inspector.py -c testdata/contig_test.fa -r testdata/read_test.fastq.gz -o test_out/ --datatype hifi 
 75 | ./inspector-correct.py -i test_out/ --datatype pacbio-hifi 
 76 | ```
 77 | (The Inspector evaluation on testing dataset should finish within several minutes with 4 CPUs and 400MB memory.
 78 | The Inspector error correction should finish within 10-15 minutes with 4 CPUs and 500MB memory.)
 79 | 
 80 | 
 81 | ## General usage
 82 | 
 83 | 
 84 | ```
 85 | 
 86 | inspector.py [-h] -c contig.fa -r raw_reads.fa -o output_dict/
 87 |   required arguments:
 88 |   --contig,-c           FASTA/FASTQ file containing contig sequences to be evaluated
 89 |   --read,-r             A list of FASTA/FASTQ files containing long read sequences
 90 | 
 91 |   optional arguments:
 92 |   -h, --help            Show this help message and exit
 93 |   --version             Show program's version number and exit
 94 |   --datatype,-d         Input read type. (clr, hifi, nanopore, mixed) [clr]
 95 |   --ref                 OPTIONAL reference genome in .fa format
 96 |   --thread,-t           Number of threads. [8]
 97 |   --min_contig_length   Minimal length for a contig to be evaluated [10000]
 98 |   --min_contig_length_assemblyerror    Minimal contig length for assembly error detection. [1000000]
 99 |   --pvalue              Maximal p-value for small-scale error identification [0.01 for HiFi, 0.05 for others]
100 |   --skip_read_mapping   Skip the step of mapping reads to contig
101 |   --skip_structural_error       Skip the step of identifying large structural errors
102 |   --skip_structural_error_detect       Skip the step of detecting large structural errors
103 |   --skip_base_error     Skip the step of identifying small-scale errors
104 |   --skip_base_error_detect      Skip the step of detecting small-scale errors from pileup
105 | 
106 | 
107 | 
108 | inspector-correct.py [-h] -i inspector_out/ --datatype pacbio-raw 
109 |   required arguments:
110 |   --inspector,-i        Inspector evaluation directory with original file names
111 |   --datatype            Type of read used for Inspector evaluation. Required for structural error correction
112 |   --outpath,-o          Output directory
113 |   --flyetimeout         Maximal runtime for local assembly with Flye
114 |   --thread,-t           Number of threads
115 |   --skip_structural     Do not correct structural errors. Local assembly will not be performed
116 |   --skip_baseerror      Do not correct small-scale errors
117 |   
118 | 
119 | ```
120 | 
121 | ## Use cases
122 | Inspector evaluates the contigs and identifies assembly errors with sequencing reads. You can use reads from single platform:
123 | ```
124 | inspector.py -c contig.fa -r rawreads.1.fastq rawreads.2.fastq -o inspector_out/ --datatype clr
125 | ```
126 | Or use a mixed data type:
127 | ```
128 | inspector.py -c contig.fa -r rawreads.fastq nanopore.fastq -o inspector_out/ --datatype mixed
129 | ```
130 | Reference-based evaluation is also supported:
131 | (Note that reported assembly error from reference-based mode will contain genetic variants)
132 | ```
133 | inspector.py -c contig.fa -r rawreads.fastq --ref reference.fa -o inspector_out/ --datatype clr
134 | ```
135 | If only the continuity analysis is needed, simply provide an empty file for --read:
136 | ```
137 | inspector.py -c contig.fa -r emptyfile -o inspector_out/ --skip_base_error --skip_structural_error
138 | ```
139 | Do not use the '--skip' options unless you are repeating the evaluation with the same contig and read files in the same output directory. They can save time when testing different options for error detection.
140 | 
141 | 
142 | 
143 | Inspector provides an error-correction module to improve assembly accuracy. High-accuracy reads are recommended, especially for small-scale error correction:
144 | (Note that only reads from single platform are supported for error correction.)
145 | ```
146 | inspector-correct.py -i inspector_out/ --datatype pacbio-hifi -o inspector_out/ 
147 | ```
148 | 
149 | 
150 | ## Output file descriptions
151 | Inspector writes its evaluation reports into a new directory to avoid conflicts in file names. Inspector error correction uses the evaluation results to generate a corrected assembly. The output directory of Inspector includes:
152 | 
153 | ### summary_statistics
154 | An evaluation report of the input assembly. This file includes the contig continuity statistics reports, the read mapping summary, number of structural and small-scale errors, the QV score, and the contig alignment summary from reference-based mode when available. An assembly with expected total length, high read-to-contig mapping rate, low number of structural and small-scale errors, and high QV score indicates a high assembly quality. When the reference genome is provided, a higher genome coverage and NA50 also indicates more complete assembly.
155 | 
156 | ### structural_error.bed
157 | This file includes all structural errors identified in the assembly. <br />
158 | The first, second and third column indicate the contig, start and end position of structural error. For Expansions and Inversions, the size of error equals the distance between start and end position. For Collapses, the collapsed sequences are missing in the contigs, therefore the EndPosition is StartPosition+1. The length of collapsed sequence should be inferred from the Size column. For HaplotypeSwitches, Inspector normally considers the haplotype containing Expansion-like pattern as haplotype 1, and considers the haplotype with Collapse-like pattern as haplotype 2. The position of error in haplotype1 and haplotype2 are separated by ";" in the second and third columns. <br />
159 | The fourth column indicates the number of error-supporting reads. A high number of error-supporting reads indicates a confident error call. <br />
160 | The fifth and sixth columns indicate the type and size of the error. For HaplotypeSwitch, the error sizes in the two haplotypes are usually different. Inspector normally lists the size in haplotype1 and haplotype2, corresponding to the position columns. <br />
161 | Column seven to twelve include other information about the structural errors. These are kept for developmental purpose. 
162 | 
163 | ### small_scale_error.bed
164 | This file includes all small-scale errors identified in the assembly. <br />
165 | The first, second and third column indicate the contig, start and end position of small-scale errors. Similar to the structural errors, the distance between StartPosition and EndPosition equals error size for small expansions and equals 1 for small collapses. <br />
166 | The fourth and fifth column indicate the base in the contig and in the reads. <br />
167 | The sixth and seventh column indicate the number of error-supporting reads and the local sequencing depth. A high supporting-read-to-depth ratio means a confident error call. <br />
168 | The eighth column indicates the type of error. <br />
169 | The ninth column indicates the p-value from the binomial test. 
170 | 
171 | ### contig_corrected.fa 
172 | The output corrected assembly of Inspector error-correction module. <br />
173 | Only contigs contained in the valid_contig.fa file (longer than --min_contig_length) are corrected. The small-scale errors listed in small_scale_error.bed should all be fixed. The structural errors in structural_error.bed are fixed if the local de novo assembly generates a full-length contig that can be confidently aligned to the original error region. Otherwise, the original sequence will be retained. 
174 | 
175 | 
176 | 
177 | ## Citing Inspector
178 | If you use Inspector, please cite
179 | > Chen, Y., Zhang, Y., Wang, A.Y. et al. Accurate long-read de novo assembly evaluation with Inspector. Genome Biol 22, 312 (2021). https://doi.org/10.1186/s13059-021-02527-4
180 | 


--------------------------------------------------------------------------------
/debreak_detect.py:
--------------------------------------------------------------------------------
  1 | import pysam
  2 | import time
  3 | import os
  4 | 
  5 | def cigardeletion_ref(flag,chrom,position,cigar,min_size,max_size): 
  6 | 	pos=int(position)
  7 | 	numbers='1234567890'
  8 | 	num=''
  9 | 	reflen=0
 10 | 	readlen=0
 11 | 	leftclip=0
 12 | 	rightclip=0
 13 | 	deletions=[]
 14 | 	insertions=[]
 15 | 	for c in cigar:
 16 | 		if c in numbers:
 17 | 			num+=c; continue
 18 | 		if c in 'MNP=X':
 19 | 			readlen+=int(num); reflen+=int(num);  num='';  continue
 20 | 		if c=='I':
 21 | 			if  int(num)>=min_size and int(num)<=max_size:
 22 | 				insertions+=[[chrom,pos+reflen,int(num),'I-cigar',readlen+leftclip]]
 23 | 			readlen+=int(num)
 24 | 			num=''; continue
 25 | 		if c == 'D':
 26 | 			if  int(num)>=min_size and int(num)<=max_size:
 27 | 				deletions+=[[chrom,pos+reflen,int(num),'D-cigar',readlen+leftclip]]
 28 | 			reflen+=int(num);  num='';  continue
 29 | 
 30 | 		if c in 'SH':
 31 | 			if readlen==0:
 32 | 				leftclip=int(num)
 33 | 			else:
 34 | 				rightclip=int(num)
 35 | 			num=''; continue
 36 | 	testif=1
 37 | 	window=500
 38 | 	while testif==1:
 39 | 		testif=0
 40 | 		if len(deletions)==1:
 41 | 			break
 42 | 		i=len(deletions)-1
 43 | 		while i>0:
 44 | 			gaplength=deletions[i][1]-deletions[i-1][1]-deletions[i-1][2]
 45 | 			if gaplength <= window:
 46 | 				deletions=deletions[:i-1]+[[chrom,deletions[i-1][1],deletions[i-1][2]+deletions[i][2],'D-cigar',deletions[i-1][4]]]+deletions[i+1:]
 47 | 				testif=1;break
 48 | 			else:
 49 | 				i-=1
 50 | 	testif=1
 51 | 	while testif==1:
 52 | 		testif=0
 53 | 		if len(insertions)==1:
 54 | 			break
 55 | 		i=len(insertions)-1
 56 | 		while i>0:
 57 | 			gaplength=insertions[i][1]-insertions[i-1][1]
 58 | 			l1=insertions[i][2]
 59 | 			l2=insertions[i-1][2]
 60 | 			window=200 if max(l1,l2)<100 else 400
 61 | 			window=400 if window==400 and max(l1,l2) <500 else 600
 62 | 			if gaplength >window :
 63 | 				i-=1
 64 | 			else:
 65 | 				insertions=insertions[:i-1]+[[chrom,insertions[i-1][1],l1+l2,'I-cigar',insertions[i-1][4]]]+insertions[i+1:]
 66 | 				testif=1;break
 67 | 
 68 | 	svcallset=deletions+insertions
 69 | 	return [svcallset,reflen,[leftclip,readlen,rightclip]]
 70 | 
 71 | 
 72 | def cigardeletion(flag,chrom,position,cigar,min_size,max_size):	#input a read line, return list of deletions
 73 | 	flag=int(flag)
 74 | 	if flag<=16:
 75 | 		detect_cigar_sv=True
 76 | 	else:
 77 | 		detect_cigar_sv=True
 78 | 	pos=int(position)
 79 | 	numbers='1234567890'
 80 | 	num=''
 81 | 	reflen=0
 82 | 	readlen=0
 83 | 	leftclip=0
 84 | 	rightclip=0
 85 | 	deletions=[]
 86 | 	insertions=[]
 87 | 	for c in cigar:
 88 | 		if c in numbers:
 89 | 			num+=c
 90 | 			continue
 91 | 		if c in 'MNP=X':
 92 | 			readlen+=int(num); reflen+=int(num);  num='';  continue
 93 | 		if c=='I':
 94 | 			if detect_cigar_sv and int(num)>=min_size and int(num)<=max_size:
 95 | 				insertions+=[[chrom,pos+reflen,int(num),'I-cigar',readlen]]
 96 | 			readlen+=int(num)
 97 | 			num=''; continue
 98 | 		if c == 'D':
 99 | 			if detect_cigar_sv and  int(num)>=min_size and int(num)<=max_size:
100 | 				deletions+=[[chrom,pos+reflen,int(num),'D-cigar']]
101 | 			reflen+=int(num);  num='';  continue
102 | 		if c in 'SH':
103 | 			if readlen==0:
104 | 				leftclip=int(num)
105 | 			else:
106 | 				rightclip=int(num)
107 | 			num=''; continue
108 | 	#merge deletions
109 | 	if detect_cigar_sv:
110 | 		testif=1
111 | 		window=500
112 | 		while testif==1:
113 | 			testif=0
114 | 			if len(deletions)==1:
115 | 				break
116 | 			i=len(deletions)-1
117 | 			while i>0:
118 | 				gaplength=deletions[i][1]-deletions[i-1][1]-deletions[i-1][2]
119 | 				if gaplength <= window:
120 | 					deletions=deletions[:i-1]+[[chrom,deletions[i-1][1],deletions[i-1][2]+deletions[i][2],'D-cigar']]+deletions[i+1:]
121 | 					testif=1
122 | 					break
123 | 				else:
124 | 					i-=1
125 | 		#merge insertions
126 | 		testif=1
127 | 		while testif==1:
128 | 			testif=0
129 | 			if len(insertions)==1:
130 | 				break
131 | 			i=len(insertions)-1
132 | 			while i>0:
133 | 				l1=insertions[i][2]
134 | 				l2=insertions[i-1][2]
135 | 				gaplength=insertions[i][1]-insertions[i-1][1]
136 | 				window=200 if max(l1,l2)<100 else 400
137 | 				window=400 if window==400 and max(l1,l2) <500 else 600
138 | 				if gaplength >window :
139 | 					i-=1
140 | 				else:
141 | 					insertions=insertions[:i-1]+[[chrom,insertions[i-1][1],l1+l2,'I-cigar',insertions[i-1][4]]]+insertions[i+1:]
142 | 					testif=1
143 | 					break
144 | 	
145 | 	svcallset=deletions+insertions
146 | 	return [svcallset,reflen,[leftclip,readlen,rightclip]]
147 | 
148 | 
def segmentdeletion_ref(segments,min_size,max_size,if_contig):
	"""Call insertions, deletions and inversions from the split alignments of one query.

	Each segment is read by index as
	[name, flag, chrom, start, end, [left_clip, aligned_len, right_clip], mapq].
	Segments whose aligned length is below min(5% of the query length, 20000)
	are dropped first. Every remaining segment is then compared with the
	segments after it that lie on the same chrom and align at least 300 bases.

	if_contig switches to the wider junction/overlap windows used below.

	Returns a list of tab-separated strings with the fields
	chrom, position, size, type ('I-segment', 'D-segment' or 'INV-segment'),
	name of the earlier segment, flag field, mean mapq of the pair, query offset.
	"""
	def is_reverse(seg):
		return int(seg[1])%32>15

	def order_pair(anchor,other):
		# Returns (left, right) by reference start, or None when the pair is skipped.
		if other[3]>anchor[3] and other[4]-anchor[4]>-200:
			return anchor,other
		if other[3]<anchor[3] and anchor[4]-other[4]>-200:
			return other,anchor
		return None

	def record(position,size,svtype,anchor,other,flagfield,offset):
		mean_mapq=(int(other[6])+int(anchor[6]))//2
		return '\t'.join([anchor[2],str(position),str(size),svtype,anchor[0],str(flagfield),str(mean_mapq),str(offset)])

	usable=[seg for seg in segments if seg[5][1]>=min(0.05*(seg[5][0]+seg[5][1]+seg[5][2]),20000)]
	if len(usable)<2:
		return []

	calls=[]
	for idx in range(len(usable)-1):
		anchor=usable[idx]
		chrom=anchor[2]
		anchor_rev=is_reverse(anchor)

		# Partition the later segments by strand relative to the anchor;
		# short ones and those on another chrom are not used.
		same_strand=[]
		opposite_strand=[]
		for other in usable[idx+1:]:
			other_rev=is_reverse(other)
			if other[5][1]<300 or other[2]!=chrom:
				continue
			if other_rev==anchor_rev:
				same_strand.append(other)
			else:
				opposite_strand.append(other)

		# Same strand: insertion and/or deletion between the two pieces.
		for other in same_strand:
			pair=order_pair(anchor,other)
			if pair is None:
				continue
			left,right=pair
			linfo=left[5]
			rinfo=right[5]
			if anchor_rev:
				offset=linfo[2]
			else:
				offset=linfo[0]+linfo[1]

			junction_gap=right[3]-left[4]
			ins_window=min(2000,linfo[1]//2,rinfo[1]//2) if if_contig else 300
			if abs(junction_gap)<=ins_window:
				ins_size=rinfo[0]-linfo[1]-linfo[0]-junction_gap
				if min_size<=ins_size<=max_size:
					calls.append(record(min(right[3],left[4]),ins_size,'I-segment',anchor,other,int(other[1])+int(anchor[1]),offset))

			query_overlap=linfo[0]+linfo[1]-rinfo[0]
			if if_contig:
				low,high=-5000,5000
			else:
				low,high=-200,2000	#Test for rescue FN
			if low<query_overlap<high:
				del_size=junction_gap+query_overlap
				if min_size<=del_size<=max_size:
					calls.append(record(left[4]-max(0,query_overlap),del_size,'D-segment',anchor,other,other[1],offset))

		# Opposite strand: inversion, measured at the end or at the start coordinates.
		for other in opposite_strand:
			pair=order_pair(anchor,other)
			if pair is None:
				continue
			left,right=pair
			linfo=left[5]
			rinfo=right[5]
			low=-2000 if if_contig else -200
			left_forward=int(left[1])%32<16

			end_overlap=rinfo[0]+rinfo[1]-linfo[2]
			if low<end_overlap<500 and (right[4]-left[4])>=max(100,end_overlap):
				inv_size=right[4]-left[4]-end_overlap
				if min_size<=inv_size<=max_size:
					offset=linfo[0]+linfo[1] if left_forward else linfo[2]
					calls.append(record(left[4],inv_size,'INV-segment',anchor,other,other[1],offset))
					continue

			start_overlap=rinfo[1]+rinfo[2]-linfo[0]
			if low<start_overlap<500 and (right[3]-left[3])>=max(100,start_overlap):
				inv_size=right[3]-left[3]-start_overlap
				if min_size<=inv_size<=max_size:
					offset=linfo[0] if left_forward else linfo[1]+linfo[2]
					calls.append(record(left[3],inv_size,'INV-segment',anchor,other,other[1],offset))
	return calls
237 | 
238 | 
239 | 
240 | 
241 | 
242 | 
def segmentdeletion(segments,min_size,max_size,if_contig):  #input a list of segments,return list of deletions
	"""Call insertions, deletions and inversions from the split alignments of one read.

	Each segment is read by index as
	[name, flag, chrom, start, end, [left_clip, aligned_len, right_clip], mapq].
	Nothing is called unless at least one segment has a flag <=16. Segments
	whose aligned length is below min(5% of the read length, 20000) are then
	dropped, and every remaining segment is compared with the segments after it
	that lie on the same chrom and align at least 300 bases.

	if_contig switches to the wider junction/overlap windows used below.

	Returns a list of tab-separated strings with the fields
	chrom, position, size, type ('I-segment', 'D-segment' or 'INV-segment'),
	name of the earlier segment, flag field, mean mapq of the pair.
	"""
	def is_reverse(seg):
		return int(seg[1])%32>15

	def order_pair(anchor,other):
		# Returns (left, right) by reference start, or None when the pair is skipped.
		if other[3]>anchor[3] and other[4]-anchor[4]>-200:
			return anchor,other
		if other[3]<anchor[3] and anchor[4]-other[4]>-200:
			return other,anchor
		return None

	def record(position,size,svtype,anchor,other,flagfield):
		mean_mapq=(int(other[6])+int(anchor[6]))//2
		return '\t'.join([anchor[2],str(position),str(size),svtype,anchor[0],str(flagfield),str(mean_mapq)])

	low_flag_segments=[seg for seg in segments if int(seg[1])<=16]
	if not low_flag_segments:
		return []
	usable=[seg for seg in segments if seg[5][1]>=min(0.05*(seg[5][0]+seg[5][1]+seg[5][2]),20000)]
	if len(usable)<2:
		return []

	calls=[]
	for idx in range(len(usable)-1):
		anchor=usable[idx]
		chrom=anchor[2]
		anchor_rev=is_reverse(anchor)

		# Partition the later segments by strand relative to the anchor;
		# short ones and those on another chrom are not used.
		same_strand=[]
		opposite_strand=[]
		for other in usable[idx+1:]:
			other_rev=is_reverse(other)
			if other[5][1]<300 or other[2]!=chrom:
				continue
			if other_rev==anchor_rev:
				same_strand.append(other)
			else:
				opposite_strand.append(other)

		for other in same_strand:
			pair=order_pair(anchor,other)
			if pair is None:
				continue
			left,right=pair
			linfo=left[5]
			rinfo=right[5]
			junction_gap=right[3]-left[4]

			#insertion:
			ins_window=min(2000,linfo[1]//2,rinfo[1]//2) if if_contig else 300
			if abs(junction_gap)<=ins_window:
				ins_size=rinfo[0]-linfo[1]-linfo[0]-junction_gap
				if min_size<=ins_size<=max_size:
					calls.append(record(min(right[3],left[4]),ins_size,'I-segment',anchor,other,int(other[1])+int(anchor[1])))

			#deletion:
			query_overlap=linfo[0]+linfo[1]-rinfo[0]
			if if_contig:
				low,high=-5000,5000
			else:
				low,high=-200,2000	#Test for rescue FN
			if low<query_overlap<high:
				del_size=junction_gap+query_overlap
				if min_size<=del_size<=max_size:
					calls.append(record(left[4]-max(0,query_overlap),del_size,'D-segment',anchor,other,other[1]))

		#inversion:
		for other in opposite_strand:
			pair=order_pair(anchor,other)
			if pair is None:
				continue
			left,right=pair
			linfo=left[5]
			rinfo=right[5]
			low=-2000 if if_contig else -200

			# First try the breakpoint measured at the end coordinates ...
			end_overlap=rinfo[0]+rinfo[1]-linfo[2]
			if low<end_overlap<500 and (right[4]-left[4])>=max(100,end_overlap):
				inv_size=right[4]-left[4]-end_overlap
				if min_size<=inv_size<=max_size:
					calls.append(record(left[4],inv_size,'INV-segment',anchor,other,other[1]))
					continue

			# ... then the one measured at the start coordinates.
			start_overlap=rinfo[1]+rinfo[2]-linfo[0]
			if low<start_overlap<500 and (right[3]-left[3])>=max(100,start_overlap):
				inv_size=right[3]-left[3]-start_overlap
				if min_size<=inv_size<=max_size:
					calls.append(record(left[3],inv_size,'INV-segment',anchor,other,other[1]))
	return calls
346 | 
347 | 
348 | 
349 | 
def detect_sortbam(workpath,min_size,max_size,chrom):
	"""Collect SV candidates and mapping statistics for one contig of read_to_contig.bam.

	Arguments:
		workpath  -- directory prefix (used by plain string concatenation) holding
		             'read_to_contig.bam', 'debreak_workspace/' and 'map_depth/'.
		min_size  -- smallest SV size written to the temp file.
		max_size  -- largest SV size accepted.
		chrom     -- contig whose alignments are fetched.

	Writes
		workpath+'debreak_workspace/read_to_contig_'+chrom+'.debreak.temp'
		    one line per CIGAR-derived call (from cigardeletion) and per
		    split-alignment call (from segmentdeletion);
		workpath+'map_depth/{maplength,readnum,splitread}_large_'+chrom
		    written only when the summed query length is non-zero.

	Always returns 0. Both the BAM handle and the output files are closed even
	when an exception is raised while reading.
	"""
	segmentreads={}
	totalmaplength=0
	number_read=0
	split_num=0

	bam=pysam.AlignmentFile(workpath+'read_to_contig.bam', "rb")
	try:
		with open(workpath+'debreak_workspace/read_to_contig_'+chrom+'.debreak.temp','w') as temp_out:
			for align in bam.fetch(chrom,):
				# Secondary alignments are ignored; records without a CIGAR
				# are skipped too, since the clip lookup below would fail on them.
				if align.is_secondary or not align.cigar:
					continue
				readname=align.query_name
				flag=align.flag
				position=align.reference_start+1
				refend=align.reference_end+1
				# [left clip, aligned query length, right clip]; CIGAR operation
				# codes 4 and 5 are soft and hard clipping.
				cigar_info=[0,0,0]
				if align.cigar[0][0] in [4,5]:
					cigar_info[0]=align.cigar[0][1]
				if align.cigar[-1][0] in [4,5]:
					cigar_info[2]=align.cigar[-1][1]
				cigar_info[1]=align.query_alignment_length
				mappingquality=align.mapping_quality
				readinfo=[readname,flag,chrom,position,refend,cigar_info,mappingquality]

				# Depth statistics only count non-supplementary alignments.
				if not align.is_supplementary:
					totalmaplength+=align.query_length
					number_read+=1

				# CIGAR calls: operations of >=5 bases are collected and merged
				# first, and only merged calls reaching min_size are written.
				cigarinfo=cigardeletion(flag,chrom,position,align.cigarstring,5,max_size)
				for d in cigarinfo[0]:
					if int(d[2])>=min_size:
						temp_out.write(d[0]+'\t'+str(d[1])+'\t'+str(d[2])+'\t'+d[3]+'\t'+readname+'\t'+str(flag)+'\t'+str(mappingquality)+'\n')

				# Keep the alignment for split-read analysis only when a partner
				# alignment named in the SA tag is on this contig.
				if align.is_supplementary:
					keep_segment=align.get_tag("SA").split(',')[0]==chrom
				else:
					keep_segment=False
					if align.has_tag("SA"):
						if align.mapping_quality > 50:
							split_num+=1
						keep_segment=chrom in [c.split(',')[0] for c in align.get_tag("SA").split(';')[:-1]]
				if keep_segment:
					segmentreads.setdefault(readname,[]).append(readinfo)

			for readgroup in segmentreads:
				# Reads with a single kept segment or more than 20 are not analysed.
				if len(segmentreads[readgroup])<2 or len(segmentreads[readgroup])>20:
					continue
				for d in segmentdeletion(segmentreads[readgroup],min_size,max_size,False):
					temp_out.write(d+'\n')
	finally:
		bam.close()

	if totalmaplength!=0:
		for label,value in (('maplength',totalmaplength),('readnum',number_read),('splitread',split_num)):
			with open(workpath+'map_depth/'+label+'_large_'+chrom,'w') as stat_out:
				stat_out.write(str(value)+'\n')
	return 0
426 | 
427 | 
def detect_sortbam_nosv(writepath,chrom,contig_type):
	"""Collect mapping statistics for one contig without calling SVs.

	Arguments:
		writepath   -- directory prefix (used by plain string concatenation)
		               holding 'read_to_contig.bam' and 'map_depth/'.
		chrom       -- contig whose alignments are fetched.
		contig_type -- label inserted into the output file names.

	Counts, over alignments that are neither secondary nor supplementary, the
	summed query length, the number of alignments, and the number that carry an
	SA tag with mapping quality above 50. When the summed length is non-zero the
	three values are written to
	writepath+'map_depth/{maplength,readnum,splitread}_'+contig_type+'_'+chrom.

	Always returns 0. The BAM handle, which was previously left open, is now
	closed even when reading fails.
	"""
	print('Collect info from '+chrom)
	totalmaplength=0
	number_read=0
	split_num=0
	samfile=pysam.AlignmentFile(writepath+'read_to_contig.bam',"rb")
	try:
		for align in samfile.fetch(chrom,):
			if align.is_secondary or align.is_supplementary:
				continue
			totalmaplength+=align.query_length
			number_read+=1
			if align.has_tag("SA") and align.mapping_quality > 50:
				split_num+=1
	finally:
		samfile.close()

	if totalmaplength!=0:
		for label,value in (('maplength',totalmaplength),('readnum',number_read),('splitread',split_num)):
			with open(writepath+'map_depth/'+label+'_'+contig_type+'_'+chrom,'w') as stat_out:
				stat_out.write(str(value)+'\n')
	return 0
455 | 
456 | 
457 | 
def detect_sam_ref(filename,readpath,writepath,min_size,max_size):
	"""Scan a SAM text file and write SV candidates to a `.debreak.temp` file.

	Input is `readpath + filename`; output is `writepath + filename[:-4] +
	'.debreak.temp'`. Header lines (starting with '@') and records whose flag
	is not one of 0, 16, 4, 256, 272, 2048, 2064 are ignored. Flag 4 records
	are counted as unmapped. Flag 256/272 records only contribute their read
	name to the set of multi-mapped reads.

	For the remaining records the CIGAR is analysed by `cigardeletion_ref`;
	records whose second CIGAR summary value is below 10000 or below 1% of the
	summary total are dropped. CIGAR-derived events are written immediately;
	for reverse-strand records (flag % 32 >= 16) the last column is mirrored
	against the summed CIGAR summary. Consecutive records with the same read
	name are grouped and passed to `segmentdeletion_ref` once the name changes
	(so the input is presumably grouped by read name -- TODO confirm).

	Returns [unmapped, mapped, number_of_multimapped_read_names,
	totalmappedlength].
	"""
	lastname=''
	segments=['']
	unmapped=0
	mapped=0
	# A set gives O(1) membership; only its size is reported.
	multimap=set()
	totalmappedlength=0
	with open(readpath+filename,'r') as f, open(writepath+filename[:-4]+'.debreak.temp','w') as g:
		for c in f:
			# remove header lines, secondary alignments, alignments on scaffolds
			if c[0]=='@':
				continue
			fields=c.split('\t')
			flag=fields[1]
			if flag not in ('0','16','4','256','272','2048','2064'):
				continue
			if flag=='4':
				unmapped+=1
				continue
			readname=fields[0]
			if flag in ('256','272'):
				multimap.add(readname)
				continue

			# detect the deletion from cigar
			chrom=fields[2]
			position=int(fields[3])
			mappingquality=fields[4]
			cigar=fields[5]
			cigarinfo=cigardeletion_ref('0',chrom,position,cigar,min_size,max_size)

			if flag in ('0','16'):
				mapped+=1
			totalmappedlength+=cigarinfo[2][1]

			totalreadlength=sum(cigarinfo[2])
			if cigarinfo[2][1]<10000 or cigarinfo[2][1]<0.01*totalreadlength:
				continue

			refend=position+cigarinfo[1]
			reverse=int(flag)%32>=16
			for d in cigarinfo[0]:
				if not reverse:
					readpos=d[4]
				elif 'I-cigar' in d:
					# reverse strand insertion: mirror and step over the inserted length
					readpos=totalreadlength-d[4]-d[2]
				else:
					readpos=totalreadlength-d[4]
				g.write(d[0]+'\t'+str(d[1])+'\t'+str(d[2])+'\t'+d[3]+'\t'+readname+'\t'+flag+'\t'+mappingquality+'\t'+str(readpos)+'\n')

			readinfo=[readname,flag,chrom,position,refend,cigarinfo[2],mappingquality]
			if readname!=lastname:
				# Read name changed: flush the segments collected for the previous read.
				if 1<len(segments):
					for d in segmentdeletion_ref(segments,min_size,max_size,True):
						g.write(d+'\n')
				lastname=readname
				segments=[readinfo]
			else:
				segments+=[readinfo]
		if 1<len(segments):
			for d in segmentdeletion_ref(segments,min_size,max_size,True):
				g.write(d+'\n')

	return [unmapped,mapped,len(multimap),totalmappedlength]
535 | 
536 | 		
537 | 
538 | 


--------------------------------------------------------------------------------
/debreak_merge.py:
--------------------------------------------------------------------------------
  1 | import time
  2 | import os
  3 | import pysam 
  4 | import denovo_static
  5 | import sys
  6 | 
  7 | 
  8 | def mergerpossort(a):
  9 | 	return int(a.split('\t')[1])
 10 | 
 11 | def mergerlensort(a):
 12 | 	return int(a.split('\t')[2])
 13 | 
 14 | def m_samechr_insertion(samechrom):
 15 | 	ins=[]
 16 | 	samechrom.sort(key=mergerpossort)
 17 | 	samechrom+=['last_end\t999999999999\t999999999999\t1\t60\t0']
 18 | 	candi=[]
 19 | 	last=samechrom[0]
 20 | 	for event in samechrom:
 21 | 		maxlen=max(int(event.split('\t')[2]),int(last.split('\t')[2]))
 22 | 		window=max(300,maxlen+10)
 23 | 		window=min(800,window)
 24 | 		if int(event.split('\t')[1]) < int(last.split('\t')[1])+window :
 25 | 			candi+=[event]
 26 | 			last=sorted(candi,key=mergerlensort)[-1]
 27 | 			continue
 28 | 		if len(candi)==1:
 29 | 			ins+=[candi[0]]
 30 | 		else:
 31 | 			position=0; length=0; count=0; quality=0; sd=0; readnames=''
 32 | 			candi.sort(key=mergerlensort,reverse=True)
 33 | 			for can in candi:
 34 | 				if int(can.split('\t')[2])>=0.5*length:
 35 | 					can_count=int(can.split('\t')[3])
 36 | 					position=int((position*count+int(can.split('\t')[1])*can_count)/(count+can_count))
 37 | 					length=int((length*count+int(can.split('\t')[2])*can_count)/(count+can_count))
 38 | 					quality=(quality*count+float(can.split('\t')[4])*can_count)/(count+can_count)
 39 | 					sd=(sd*count+float(can.split('\t')[5])*can_count)/(count+can_count)
 40 | 					readnames+=can.split('\t')[6]+';'
 41 | 					count+=can_count
 42 | 			readnames=readnames[:-1]
 43 | 			ins+=[candi[0].split('\t')[0]+'\t'+str(position)+'\t'+str(length)+'\t'+str(count)+'\t'+str(quality)+'\t'+str(sd)+'\t'+readnames+'\tUnique']
 44 | 		candi=[event]
 45 | 		last=event
 46 | 	return ins
 47 | 
 48 | def sort_mostspupport(a):
 49 | 	return [int(a.split('\t')[3]),int(a.split('\t')[2])]
 50 | 
 51 | def m_samechr_deletion(samechrom):
 52 | 	dels=[]
 53 | 	samechrom.sort(key=mergerpossort)
 54 | 	samechrom+=['last_end\t999999999999\t999999999999\t1\t60\t0']
 55 | 	candi=[]
 56 | 	last=samechrom[0]
 57 | 	for event in samechrom:
 58 | 		maxlen=max(int(event.split('\t')[2]),int(last.split('\t')[2]))
 59 | 		window=max(300,maxlen+10)
 60 | 		window=min(800,window)
 61 | 		if int(event.split('\t')[1]) < int(last.split('\t')[1])+int(last.split('\t')[2])+200 and int(event.split('\t')[1])-int(last.split('\t')[1]) < window:
 62 | 			candi+=[event]
 63 | 			last=event
 64 | 			continue
 65 | 		if len(candi)==1:
 66 | 			dels+=[candi[0]]
 67 | 		else:
 68 | 			position=0; length=0; count=0; quality=0; sd=0; readnames=''
 69 | 			candi.sort(key=sort_mostspupport,reverse=True)
 70 | 			for can in candi:
 71 | 				if int(can.split('\t')[2])>=0.5*length:
 72 | 					can_count=int(can.split('\t')[3])
 73 | 					position=int((position*count+int(can.split('\t')[1])*can_count)/(count+can_count))
 74 | 					length=int((length*count+int(can.split('\t')[2])*can_count)/(count+can_count))
 75 | 					quality=(quality*count+float(can.split('\t')[4])*can_count)/(count+can_count)
 76 | 					sd=(sd*count+float(can.split('\t')[5])*can_count)/(count+can_count)
 77 | 					readnames+=can.split('\t')[6]+';'
 78 | 					count+=can_count
 79 | 			readnames=readnames[:-1]
 80 | 			dels+=[candi[0].split('\t')[0]+'\t'+str(position)+'\t'+str(length)+'\t'+str(count)+'\t'+str(quality)+'\t'+str(sd)+'\t'+readnames+'\tUnique']
 81 | 		candi=[event]
 82 | 		last=event
 83 | 	return dels
 84 | 
 85 | def mergertra(a):
 86 | 	return 	[a.split('\t')[2],int(a.split('\t')[3])]
 87 | 
 88 | def m_samechr_translocation(samechrom):
 89 | 	samechrom.sort(key=mergertra)
 90 | 	iftrue=0
 91 | 	while iftrue==0:
 92 | 		iftrue=1
 93 | 		for i in range(len(samechrom)-1):
 94 | 			if samechrom[i].split('\t')[2]==samechrom[i+1].split('\t')[2] and abs(int(samechrom[i].split('\t')[3])-int(samechrom[i+1].split('\t')[3]))<=800:
 95 | 				iftrue=0
 96 | 				if samechrom[i].split('\t')[0]==samechrom[i+1].split('\t')[0] and abs(int(samechrom[i].split('\t')[1])-int(samechrom[i+1].split('\t')[1]))<=1000:
 97 | 					count1=int(samechrom[i].split('\t')[4]); count2=int(samechrom[i+1].split('\t')[4])
 98 | 					pos1=(int(samechrom[i].split('\t')[1])*count1+int(samechrom[i+1].split('\t')[1])*count2)//(count1+count2)
 99 | 					pos2=(int(samechrom[i].split('\t')[3])*count1+int(samechrom[i+1].split('\t')[3])*count2)//(count1+count2)
100 | 					quality=(float(samechrom[i].split('\t')[5])*count1+float(samechrom[i+1].split('\t')[5])*count2)/(count1+count2)
101 | 					sd1=(float(samechrom[i].split('\t')[6])*count1+float(samechrom[i+1].split('\t')[6])*count2)/(count1+count2)
102 | 					sd2=(float(samechrom[i].split('\t')[7])*count1+float(samechrom[i+1].split('\t')[7])*count2)/(count1+count2)
103 | 					readname=samechrom[i].split('\t')[8]+';'+samechrom[i+1].split('\t')[8]
104 | 					mergedtra=samechrom[i].split('\t')[0]+'\t'+str(pos1)+'\t'+samechrom[i].split('\t')[2]+'\t'+str(pos2)+'\t'+str(count1+count2)+'\t'+str(quality)+'\t'+str(sd1)+'\t'+str(sd2)+'\t'+readname+'\tTranslocation'
105 | 					samechrom=samechrom[:i]+[mergedtra]+samechrom[i+2:]
106 | 				else:
107 | 					count1=int(samechrom[i].split('\t')[4]); count2=int(samechrom[i+1].split('\t')[4])
108 | 					if count1>=count2:
109 | 						samechrom.remove(samechrom[i+1])
110 | 					else:
111 | 						samechrom.remove(samechrom[i])
112 | 				break
113 | 	return samechrom
114 | 
def standerd_varition(length):
	"""Return the population standard deviation of the numbers in `length`.

	Raises ZeroDivisionError for an empty sequence.
	"""
	n=float(len(length))
	mean=sum(length)/n
	variance=0.0
	for value in length:
		variance+=(value-mean)**2/n
	return variance**0.5
122 | 
123 | 
def mergeinfosecpos(a):
	"""Sort key: integer value of the fourth tab-separated column of `a`."""
	columns=a.split('\t')
	return int(columns[3])
126 | 
def mergeinfo_translocation(candi,min_support):
	"""Merge a group of per-read translocation records into one call.

	Records are tab-separated; columns are read here as: 0 first contig,
	1 first position, 2 second contig, 3 second position, 4 quality,
	5 read name.

	Only records pointing at the most frequent second contig are kept, then
	only those whose second position lies within 800 of the median second
	position. If at least `min_support` records remain, a one-element list
	with the merged record (mean positions, support count, mean quality, the
	two position standard deviations, ';'-joined read names, tag
	'Translocation') is returned; otherwise an empty list.

	Ties for the most frequent second contig are resolved in favour of the
	contig seen first in `candi`. The original picked from a set, so the
	winner depended on hash ordering and was not reproducible.
	"""
	chrom=candi[0].split('\t')[0]
	secchr=[c.split('\t')[2] for c in candi]
	# max() over the list returns the first element with the highest count.
	secchr=max(secchr,key=secchr.count)
	candi=sorted([c for c in candi if c.split('\t')[2]==secchr],key=mergeinfosecpos)

	middle=len(candi)//2
	if len(candi)%2==0:
		median=(int(candi[middle-1].split('\t')[3])+int(candi[middle].split('\t')[3]))//2
	else:
		median=int(candi[middle].split('\t')[3])
	candi=[c for c in candi if abs(int(c.split('\t')[3])-median)<=800]

	if len(candi)<min_support:
		return []
	rows=[c.split('\t') for c in candi]
	pos1=[int(r[1]) for r in rows]
	pos2=[int(r[3]) for r in rows]
	qual=[float(r[4]) for r in rows]
	sd1=standerd_varition(pos1)
	sd2=standerd_varition(pos2)
	readnames=';'.join([r[5] for r in rows])
	return [chrom+'\t'+str(sum(pos1)//len(pos1))+'\t'+secchr+'\t'+str(sum(pos2)//len(pos2))+'\t'+str(len(candi))+'\t'+str(sum(qual)/len(qual))+'\t'+str(sd1)+'\t'+str(sd2)+'\t'+readnames+'\tTranslocation']
153 | 
154 | 
def assign_candi_insertion(candi,mean1,mean2):
	"""Split records into two groups by which of two lengths they are nearer to.

	The length of a record is the integer in its third tab-separated column.
	A record goes to group 1 when it is at least as close to `mean1` as to
	`mean2`, otherwise to group 2.

	Returns [group1, group2, mean1_new, mean2_new] where the new means are
	the floor-divided average lengths of each group. An empty group keeps the
	mean it was given (the original raised ZeroDivisionError in that case).
	"""
	group1=[]
	group2=[]
	for c in candi:
		size=int(c.split('\t')[2])
		if abs(size-mean1)<=abs(mean2-size):
			group1.append(c)
		else:
			group2.append(c)
	mean1_new=mean1
	mean2_new=mean2
	if group1:
		mean1_new=sum([int(c.split('\t')[2]) for c in group1])//len(group1)
	if group2:
		mean2_new=sum([int(c.split('\t')[2]) for c in group2])//len(group2)
	return [group1,group2,mean1_new,mean2_new]
166 | 
167 | 
168 | 
def mergeinfolengthsort(a):
	"""Sort key: integer value of the third tab-separated column of `a`."""
	columns=a.split('\t')
	return int(columns[2])
171 | 
def mergeinfo_insertion(candi,min_support):
	"""Merge a group of per-read records into at most one call.

	`candi` is sorted in place by length (third column). When there are at
	least 1.5 * min_support records and the upper-quartile length exceeds
	three times the lower-quartile length, the records are split into two
	length groups (three rounds of `assign_candi_insertion`) and each group
	with at least `min_support` members is merged separately:
	  * both groups yield a call: only the one with the larger support
	    (column 3; the first on a tie) is returned, tagged 'CompoundSV';
	  * exactly one yields a call: it is returned tagged 'Unique';
	  * neither: an empty list.
	Otherwise all records are merged as one event and tagged 'Unique'.

	NOTE(review): the original appended 'Unique' after 'CompoundSV' in the
	two-call case because the `len==1` check ran after the list had been
	reduced to one element, producing a record with two tags and one extra
	column. The branches are now exclusive.
	"""
	candi.sort(key=mergeinfolengthsort)

	if len(candi)>=1.5*min_support:
		upper=int(candi[len(candi)*3//4].split('\t')[2])
		lower=int(candi[len(candi)//4].split('\t')[2])
		if upper>3*lower:
			svgroups=assign_candi_insertion(candi,upper,lower)
			for _ in range(2):
				svgroups=assign_candi_insertion(candi,svgroups[2],svgroups[3])
			mergedsv=[]
			for group in svgroups[:2]:
				if len(group)>=min_support:
					mergedsv+=mergeinfo_insertion_oneevent(group,min_support)
			if len(mergedsv)==2:
				# Keep only the better supported of the two candidate events.
				if int(mergedsv[0].split('\t')[3])>=int(mergedsv[1].split('\t')[3]):
					return [mergedsv[0]+'\tCompoundSV']
				return [mergedsv[1]+'\tCompoundSV']
			if len(mergedsv)==1:
				return [mergedsv[0]+'\tUnique']
			return mergedsv
	mergedsv=mergeinfo_insertion_oneevent(candi,min_support)
	if len(mergedsv)==1:
		return [mergedsv[0]+'\tUnique']
	return []
202 | 
def mergeinfo_insertion_oneevent(candi,min_support):
	"""Merge records describing one event into a single untagged record.

	`candi` is sorted in place by length (third column) and trimmed in place:
	while more than min_support-2 records remain, the longest record is
	dropped if it exceeds 1.5x the median length, else the shortest is
	dropped if the median exceeds 1.5x its length.

	If at least `min_support` records remain, returns a one-element list with
	`contig, mean position, mean length, support, mean quality, length
	standard deviation, ';'-joined read names`; otherwise an empty list.
	Columns read: 0 contig, 1 position, 2 length, 3 quality, 4 read name.
	"""
	def _size(record):
		return int(record.split('\t')[2])

	candi.sort(key=mergeinfolengthsort)
	while len(candi)>min_support-2:
		if _size(candi[-1]) > 1.5*_size(candi[len(candi)//2]):
			candi.remove(candi[-1])
		elif _size(candi[len(candi)//2]) > 1.5*_size(candi[0]):
			candi.remove(candi[0])
		else:
			break

	if len(candi)<min_support:
		return []
	contig=candi[0].split('\t')[0]
	positions=[int(c.split('\t')[1]) for c in candi]
	sizes=[int(c.split('\t')[2]) for c in candi]
	qualities=[float(c.split('\t')[3]) for c in candi]
	mean_position=sum(positions)//len(positions)
	mean_quality=sum(qualities)/float(len(qualities))
	spread=standerd_varition(sizes)
	mean_size=sum(sizes)//len(sizes)
	names=''
	for c in candi:
		names+=c.split('\t')[4]+';'
	names=names[:-1]
	return [contig+'\t'+str(mean_position)+'\t'+str(mean_size)+'\t'+str(len(candi))+'\t'+str(mean_quality)+'\t'+str(spread)+'\t'+names]
229 | 
230 | 
def counttimesort_tra(a):
	"""Sort key: [int(column 1), int(column 3)] of the tab-separated record `a`."""
	columns=a.split('\t')
	return [int(columns[1]),int(columns[3])]
233 | 
def counttime_translocation(samechrom,min_support):
	"""Group translocation records by first position and merge each group.

	`samechrom` is sorted in place by (first position, second position). A
	group collects every record whose first position (column 1) is within
	800 of the group's first record; groups with at least `min_support`
	records are passed to `mergeinfo_translocation`.

	Returns the concatenated merge results. An empty input returns an empty
	list, matching counttime_insertion / counttime_deletion (the original
	raised IndexError).
	"""
	if not samechrom:
		return []
	samechrom.sort(key=counttimesort_tra)
	samechrtra=[]
	start=int(samechrom[0].split('\t')[1])
	candi=[]
	window=800
	for event in samechrom:
		position=int(event.split('\t')[1])
		if position<=start+window:
			candi.append(event)
			continue
		if len(candi)>=min_support:
			samechrtra+=mergeinfo_translocation(candi,min_support)
		candi=[event]
		start=position
	if len(candi)>=min_support:
		samechrtra+=mergeinfo_translocation(candi,min_support)
	return samechrtra
252 | 
253 | 
def counttimesort(a):
	"""Sort key: [int(column 1), int(column 2)] of the tab-separated record `a`."""
	columns=a.split('\t')
	return [int(columns[1]),int(columns[2])]
256 | 
257 | 
def counttime_insertion(samechrom,min_support):
	"""Group records by position and merge each sufficiently large group.

	`samechrom` is sorted in place by (position, length). A group starts at
	an anchor record and first accepts records within 100 of the anchor
	position. The first record that falls outside widens the reach once,
	based on the group's floor-averaged length: <=100 -> 200, <=500 -> 400,
	otherwise 800. A record outside the (possibly widened) reach closes the
	group and becomes the next anchor.

	Groups with at least `min_support` records are passed to
	`mergeinfo_insertion`; the concatenated results are returned.
	"""
	if samechrom==[]:
		return []
	samechrom.sort(key=counttimesort)
	merged=[]
	anchor=int(samechrom[0].split('\t')[1])
	group=[]
	size_total=0
	reach=100
	for record in samechrom:
		cols=record.split('\t')
		pos=int(cols[1])
		if pos>anchor+reach and reach==100:
			# First miss for this group: widen the reach according to mean length.
			mean_size=size_total//len(group)
			if mean_size<=100:
				reach=200
			elif mean_size<=500:
				reach=400
			else:
				reach=800
		if pos<=anchor+reach:
			group.append(record)
			size_total+=int(cols[2])
			continue
		if len(group)>=min_support:
			merged+=mergeinfo_insertion(group,min_support)
		group=[record]
		size_total=int(cols[2])
		anchor=pos
		reach=100
	if len(group)>=min_support:
		merged+=mergeinfo_insertion(group,min_support)
	return merged
294 | 
295 | 
296 | 
def counttime_deletion(samechrom,min_support):
	"""Group records by position and merge each sufficiently large group.

	`samechrom` is sorted in place by (position, length). Records within 100
	of the first record of a group join it. The first record outside that
	span widens it once, using the floor-averaged length of the group
	(<=100 -> 200, 101..500 -> 400, >500 -> 800); a record still outside
	closes the group and opens a new one.

	Groups holding at least `min_support` records are handed to
	`mergeinfo_insertion`, and all results are returned in one list.
	"""
	if samechrom==[]:
		return []
	samechrom.sort(key=counttimesort)
	results=[]
	bucket=[]
	sizes=[]
	origin=int(samechrom[0].split('\t')[1])
	span=100
	for item in samechrom:
		where=int(item.split('\t')[1])
		inside=where<=origin+span
		if not inside and span==100:
			average=sum(sizes)//len(sizes)
			span=200 if average<=100 else (400 if average<=500 else 800)
			inside=where<=origin+span
		if inside:
			bucket+=[item]
			sizes+=[int(item.split('\t')[2])]
			continue
		if len(bucket)>=min_support:
			results+=mergeinfo_insertion(bucket,min_support)
		bucket=[item]
		sizes=[int(item.split('\t')[2])]
		origin=where
		span=100
	if len(bucket)>=min_support:
		results+=mergeinfo_insertion(bucket,min_support)
	return results
333 | 
def merge_deletion(min_support,min_quality,readpath,samechrom_deletion,chrom,svtype,upper_bound):
	"""Merge per-read records of one contig and write the merged calls.

	Steps:
	  1. drop records whose column 3 (as float) is below `min_quality`;
	     return True if nothing is left (no file is written);
	  2. group/merge with `counttime_deletion`, keep calls whose support
	     (column 3) is at least `min_support`, and write them to
	     `readpath + svtype + '-info-' + chrom`;
	  3. calls tagged 'Unique' are merged further by `m_samechr_deletion`,
	     calls tagged 'CompoundSV' are passed through;
	  4. keep calls with support >= min_support (and <= 30 * min_support when
	     `upper_bound` is true), append the SV type name and write them to
	     `readpath + svtype + '-merged-' + chrom`.

	`svtype` must be 'del', 'dup' or 'inv' whenever there is a call to write;
	otherwise ValueError is raised (the original failed with an unbound
	local variable).

	Returns the list of merged call strings, or True when all input was
	filtered out in step 1.
	"""
	samechrom_deletion=[c for c in samechrom_deletion if float(c.split('\t')[3])>=min_quality]
	if samechrom_deletion==[]:
		return True
	deletions=counttime_deletion(samechrom_deletion,min_support)
	deletions=[c for c in deletions if int(c.split('\t')[3])>=min_support]
	with open(readpath+svtype+'-info-'+chrom,'w') as f:
		for d in deletions:
			f.write(d+'\n')
	real=[c for c in deletions if c.split('\t')[-1]=='Unique']
	comp=[c for c in deletions if c.split('\t')[-1]=='CompoundSV']
	cleaneddels=m_samechr_deletion(real)+comp
	cleaneddels.sort(key=counttimesort)
	if upper_bound:
		cleaneddels=[c for c in cleaneddels if min_support<=int(c.split('\t')[3])<=min_support*30]
	else:
		cleaneddels=[c for c in cleaneddels if min_support<=int(c.split('\t')[3])]
	sv_type={'del':'Deletion','dup':'Duplication','inv':'Inversion'}.get(svtype)
	merged_result=[]
	with open(readpath+svtype+'-merged-'+chrom,'w') as f:
		if cleaneddels and sv_type is None:
			raise ValueError('unsupported svtype: '+repr(svtype))
		for d in cleaneddels:
			f.write(d+'\t'+sv_type+'\n')
			merged_result.append(d+'\t'+sv_type)
	return merged_result
369 | 
def  merge_insertion(min_support,min_quality,readpath,samechrom_insertion,chrom,svtype,upper_bound):
	"""Merge per-read records of one contig and write the merged calls.

	Steps:
	  1. drop records whose column 3 (as float) is below `min_quality`;
	     return True if nothing is left (no file is written);
	  2. group/merge with `counttime_insertion` and write the result to
	     `readpath + svtype + '-info-' + chrom`;
	  3. calls tagged 'Unique' are merged further by `m_samechr_insertion`,
	     calls tagged 'CompoundSV' are passed through;
	  4. keep calls with support (column 3) >= min_support (and
	     <= 30 * min_support when `upper_bound` is true), append the SV type
	     name and write them to `readpath + svtype + '-merged-' + chrom`.

	`svtype` must be 'ins' or 'inv' whenever there is a call to write;
	otherwise ValueError is raised (the original failed with an unbound
	local variable).

	Returns the list of merged call strings, or True when all input was
	filtered out in step 1.
	"""
	samechrom_insertion=[c for c in samechrom_insertion if float(c.split('\t')[3])>=min_quality]
	if samechrom_insertion==[]:
		return True

	insertions=counttime_insertion(samechrom_insertion,min_support)
	with open(readpath+svtype+'-info-'+chrom,'w') as f:
		for d in insertions:
			f.write(d+'\n')
	real=[c for c in insertions if c.split('\t')[-1]=='Unique']
	compound=[c for c in insertions if c.split('\t')[-1]=='CompoundSV']
	cleanedins=m_samechr_insertion(real)+compound
	cleanedins.sort(key=counttimesort)

	if upper_bound:
		cleanedins=[c for c in cleanedins if min_support<=int(c.split('\t')[3])<=30*min_support]
	else:
		cleanedins=[c for c in cleanedins if min_support<=int(c.split('\t')[3])]
	sv_type={'ins':'Insertion','inv':'Inversion'}.get(svtype)
	merged_result=[]
	with open(readpath+svtype+'-merged-'+chrom,'w') as f:
		if cleanedins and sv_type is None:
			raise ValueError('unsupported svtype: '+repr(svtype))
		for d in cleanedins:
			f.write(d+'\t'+sv_type+'\n')
			merged_result.append(d+'\t'+sv_type)

	return merged_result
404 | 
def finalsorttra(a):
	"""Sort key: [column 0 as text, column 1 as int] of the tab-separated record `a`."""
	columns=a.split('\t')
	return [columns[0],int(columns[1])]
407 | 
def merge_translocation(min_support,min_qual,readpath,samechrom_translocation,chrom,upper_bound):
	"""Merge per-read translocation records of one contig and write the calls.

	Records whose column 4 (as float) is below `min_qual` are dropped; if
	nothing is left, True is returned and no file is written. The rest is
	grouped by `counttime_translocation`, collapsed by
	`m_samechr_translocation`, optionally limited to calls whose support
	(column 4) is at most 30 * min_support when `upper_bound` is true, sorted
	by (first contig, first position) and written to
	`readpath + 'tra-merged-' + chrom`.

	Returns the list of merged call strings, or True for the empty case.
	"""
	samechrom_translocation=[c for c in samechrom_translocation if float(c.split('\t')[4])>=min_qual]
	if samechrom_translocation==[]:
		return True
	translocations=counttime_translocation(samechrom_translocation,min_support)
	translocations=m_samechr_translocation(translocations)
	if upper_bound:
		translocations=[c for c in translocations if int(c.split('\t')[4])<=30*min_support]

	translocations.sort(key=finalsorttra)
	with open(readpath+'tra-merged-'+chrom,'w') as f:
		for d in translocations:
			f.write(d+'\n')
	return list(translocations)
427 | 
428 | 
429 | 
430 | 
def genotype(depth,outpath):
	"""Annotate every record of assembly_errors.bed with a genotype.

	Reads `outpath + 'assembly_errors.bed'` (tab-separated; columns 0..3 are
	read as contig, start, stop and a supporting count) and counts
	alignments in `outpath + 'read_to_contig.bam'` over the 100 positions
	before `start` (clipped at 0) and the 100 positions after `stop`.
	The genotype is '1/1' when the count in column 3 is at least 60% of the
	smaller of the two coverages, otherwise '1/0'. Records with a negative
	start are skipped.

	Each kept record is written to `outpath + 'assembly_errors.bed-gt_test'`
	followed by genotype, left coverage, right coverage and their minimum.

	`depth` is accepted for interface compatibility but not used.
	Returns True.

	The flanking coverage is computed for every record. The original only
	computed it for records containing 'Expansion', 'Inversion' or
	'Collapse' and otherwise reused the previous record's values (or failed
	on the first record).
	"""
	with open(outpath+'assembly_errors.bed','r') as bed:
		allae=bed.read().split('\n')[:-1]

	samfile=pysam.AlignmentFile(outpath+'read_to_contig.bam',"rb")
	try:
		with open(outpath+'assembly_errors.bed-gt_test','w') as f:
			for c in allae:
				fields=c.split('\t')
				chrom=fields[0]
				start=int(fields[1])
				stop=int(fields[2])
				if start<0:
					continue
				leftcov=samfile.count(chrom,max(start-100,0),start,read_callback='all')
				rightcov=samfile.count(chrom,stop,stop+100,read_callback='all')
				if int(fields[3])>=0.6*min(leftcov,rightcov):
					gtinfo='1/1'
				else:
					gtinfo='1/0'
				f.write(c+'\t'+gtinfo+'\t'+str(leftcov)+'\t'+str(rightcov)+'\t'+str(min(rightcov,leftcov))+'\n')
	finally:
		# The original never closed the BAM handle.
		samfile.close()
	# Writing per-type counts to 'summary_statistics' was disabled in the
	# original (kept only as inert string blocks) and is not performed.
	return True
490 | 
491 | 
492 | 
493 | 


--------------------------------------------------------------------------------
/debreak_merge_clustering.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import time
  3 | import sys
  4 | import pysam
  5 | 
  6 | def mergeinfolengthsort(a):
  7 | 	return int(a.split('\t')[2])
  8 | 
  9 | def mergeinfo_insertion(candi,min_support):
 10 | 	candi.sort(key=mergeinfolengthsort)
 11 | 
 12 | 	if len(candi)>=1.5*min_support:
 13 | 		upper=int(candi[len(candi)*3//4].split('\t')[2])
 14 | 		lower=int(candi[len(candi)//4].split('\t')[2])
 15 | 		if upper>1.75*lower and upper-lower>50:
 16 | 			svgroups=assign_candi_insertion(candi,upper,lower)
 17 | 			svgroups=assign_candi_insertion(candi,svgroups[2],svgroups[3])
 18 | 			svgroups=assign_candi_insertion(candi,svgroups[2],svgroups[3])
 19 | 			mergedsv=[]
 20 | 			if len(svgroups[0])>=min_support:
 21 | 				mergedsv+=mergeinfo_insertion_oneevent(svgroups[0],min_support)
 22 | 			if len(svgroups[1])>=min_support:
 23 | 				mergedsv+=mergeinfo_insertion_oneevent(svgroups[1],min_support)
 24 | 			if len(mergedsv)==2:
 25 | 				mergedsv=[c+'\tCompoundSV' for c in mergedsv]
 26 | 			if len(mergedsv)==1:
 27 | 				mergedsv=[mergedsv[0]+'\tUnique']
 28 | 			return mergedsv
 29 | 	mergedsv=mergeinfo_insertion_oneevent(candi,min_support)
 30 | 	if len(mergedsv)==1:
 31 | 		return [mergedsv[0]+'\tUnique']
 32 | 	else:
 33 | 		return []
 34 | 
 35 | def assign_candi_insertion(candi,mean1,mean2):
 36 | 	group1=[]
 37 | 	group2=[]
 38 | 	for c in candi:
 39 | 		if abs(int(c.split('\t')[2])-mean1)<=abs(mean2-int(c.split('\t')[2])):
 40 | 			group1+=[c]
 41 | 		else:
 42 | 			group2+=[c]
 43 | 	mean1_new=sum([int(c.split('\t')[2]) for c in group1])//len(group1)
 44 | 	mean2_new=sum([int(c.split('\t')[2]) for c in group2])//len(group2)
 45 | 	return [group1,group2,mean1_new,mean2_new]
 46 | 
 47 | 
 48 | def mergeinfo_insertion_oneevent(candi,min_support):
 49 | 	candi.sort(key=mergeinfolengthsort)
 50 | 	min_support=max(2,min_support)
 51 | 	while len(candi)>max(2,min_support-2):
 52 | 		if int(candi[-1].split('\t')[2]) > 2* int(candi[len(candi)//2].split('\t')[2]) and  int(candi[-1].split('\t')[2]) -int(candi[len(candi)//2].split('\t')[2]) >30:
 53 | 			candi.remove(candi[-1])
 54 | 			continue
 55 | 		if int(candi[len(candi)//2].split('\t')[2]) >  2*int(candi[0].split('\t')[2]) and int(candi[len(candi)//2].split('\t')[2]) -int(candi[0].split('\t')[2]) >30:
 56 | 			candi.remove(candi[0])
 57 | 			continue
 58 | 		break
 59 | 	if len(candi)>=max(2,min_support):
 60 | 		chrom=candi[0].split('\t')[0]
 61 | 		position=[int(c.split('\t')[1]) for c in candi]
 62 | 		length=[int(c.split('\t')[2]) for c in candi]
 63 | 		quality=[float(c.split('\t')[6]) for c in candi]
 64 | 		position=sum(position)//len(position)
 65 | 		quality=sum(quality)/float(len(quality))
 66 | 		length=sum(length)//len(length)
 67 | 		readnames=''
 68 | 		for c in candi:
 69 | 			readnames+=c.split('\t')[4]+';'
 70 | 		readnames=readnames[:-1]
 71 | 		numread=len(readnames.split(';'))
 72 | 		return[chrom+'\t'+str(position)+'\t'+str(length)+'\t'+str(len(candi))+'\t'+str(numread)+'\t'+str(quality)+'\t'+'\t'+readnames]
 73 | 	else:
 74 | 		return []
 75 | 
 76 | def counttimesort(a):
 77 | 	return [int(a.split('\t')[1]),int(a.split('\t')[2])]
 78 | 
 79 | def cluster(outpath,chrom,contiglength,mins,maxdepth):
 80 | 	allsv=open(outpath+'debreak_workspace/read_to_contig_'+chrom+'.debreak.temp','r').read().split('\n')[:-1]
 81 | 
 82 | 	# Large DEL
 83 | 	largesv=[c for c in allsv if 'D-' in c and int(c.split('\t')[2])>2000]
 84 | 	window=1600
 85 | 	largesv.sort(key=counttimesort)
 86 | 	largedel=[]
 87 | 	start=0
 88 | 	candi=[]
 89 | 	for event in largesv:
 90 | 		if int(event.split('\t')[1])<=start+window:
 91 | 			candi+=[event]
 92 | 			continue
 93 | 		if len(candi)>=mins:
 94 | 			largedel+=mergeinfo_insertion(candi,mins)
 95 | 		candi=[event]
 96 | 		start=int(event.split('\t')[1])
 97 | 	if len(candi)>=mins:
 98 | 		largedel+=mergeinfo_insertion(candi,mins)
 99 | 		candi=[]
100 | 
101 | 	#smaller DEL
102 | 	allsv=[c for c in allsv if 'D-' in c and int(c.split('\t')[2])<=3000]
103 | 	genomeposition=[0]*contiglength
104 | 
105 | 	for c in allsv:
106 | 		start=int(c.split('\t')[1])
107 | 		end=int(c.split('\t')[1])+int(c.split('\t')[2])
108 | 		original=genomeposition[start-1:end-1]
109 | 		new=[mm+1 for mm in original]
110 | 		genomeposition[start-1:end-1]=new
111 | 	svregion=[]
112 | 	inblock=False
113 | 	threshold=3
114 | 
115 | 	for i in range(len(genomeposition)):
116 | 		if inblock:
117 | 			if genomeposition[i]>=max(maxdep/10.0,threshold):
118 | 				localdep+=[genomeposition[i]]
119 | 				if genomeposition[i]>maxdep:
120 | 					maxdep=genomeposition[i]
121 | 			else:
122 | 				inblock=False
123 | 				end=i
124 | 				if maxdep<=maxdepth:
125 | 					peakpos=localdep.index(maxdep)
126 | 					peakleftsize=0
127 | 					for i in range(peakpos):
128 | 						if localdep[peakpos-i-1]>=maxdep/10.0:
129 | 							peakleftsize+=1
130 | 						else:
131 | 							break
132 | 					svregion+=[(start+peakpos-peakleftsize,end,maxdep)]
133 | 				start=0
134 | 				end=0
135 | 				maxdep=0
136 | 
137 | 		else:
138 | 			if genomeposition[i] > threshold:
139 | 				inblock=True
140 | 				localdep=[genomeposition[i]]
141 | 				start=i
142 | 				maxdep=genomeposition[i]
143 | 
144 | 	svregion=[c for c in svregion if c[2] < maxdepth]
145 | 	allsvinfo={}
146 | 	for c in svregion:
147 | 		allsvinfo[c]=[]
148 | 
149 | 	for c in allsv:
150 | 		start=int(c.split('\t')[1])
151 | 		end=start+int(c.split('\t')[2])
152 | 		for d in svregion:
153 | 			if min(end,d[1])-max(d[0],start)>0:
154 | 				allsvinfo[d]+=[c]
155 | 
156 | 	sv=[]
157 | 	for c in svregion:
158 | 		svinfo=allsvinfo[c]
159 | 		sv+=mergeinfo_insertion(svinfo,mins)
160 | 
161 | 	newsv=[]
162 | 	for c in largedel:
163 | 		testif=0
164 | 		for d in sv:
165 | 			if min(int(c.split('\t')[1])+int(c.split('\t')[2]), int(d.split('\t')[1])+int(d.split('\t')[2])) - max(int(c.split('\t')[1]),int(d.split('\t')[1]))>0 and 0.8*int(d.split('\t')[2])<=int(c.split('\t')[2])<=int(d.split('\t')[2])/0.8:
166 | 				testif=1; break
167 | 		if testif==0:
168 | 			newsv+=[c]
169 | 	newsv+=sv
170 | 	newsv.sort(key=counttimesort)
171 | 
172 | 
173 | 	if newsv==[]:
174 | 		return 0
175 | 
176 | 	f=open(outpath+'ae_merge_workspace/del_merged_'+chrom,'w')
177 | 	for c in newsv:
178 | 		f.write(c+'\n')
179 | 	f.close()
180 | 
181 | 	return 0
182 | 
183 | 
184 | 
def cluster_ins(outpath,chrom,contiglength,mins,maxdepth,svtype):
	"""Cluster insertion or inversion records of one contig and write calls.

	Input: `outpath + 'debreak_workspace/read_to_contig_' + chrom +
	'.debreak.temp'`, one tab-separated record per line. Lines containing
	'I-' are used when `svtype` is 'ins', lines containing 'INV-' otherwise.
	Column 1 is read as position and column 2 as length.

	Two passes are made:
	  * records longer than 2000 are grouped by position (a group spans 1600
	    from its anchor, the first anchor being 0) and groups with at least
	    `mins` records are merged with `mergeinfo_insertion`;
	  * records up to 3000 long each add one to a per-position depth array
	    (size `contiglength`) over position +/- 100; stretches whose depth
	    stays above the threshold become regions, the records within 50 of a
	    region are merged with `mergeinfo_insertion`, and each call gets the
	    contig name and the region's start, end and peak depth appended.
	Large calls that overlap a region call of similar size (within a factor
	0.8) are dropped; the rest is sorted and written to
	`outpath + 'ae_merge_workspace/ins_merged_' + chrom` (or `inv_merged_`).
	Nothing is written when there are no calls.

	Always returns 0.
	"""
	with open(outpath+'debreak_workspace/read_to_contig_'+chrom+'.debreak.temp','r') as infile:
		allsv=infile.read().split('\n')[:-1]

	tag='I-' if svtype=='ins' else 'INV-'

	# Large INS
	largesv=[c for c in allsv if tag in c and int(c.split('\t')[2])>2000]
	window=1600
	largesv.sort(key=counttimesort)
	largedel=[]
	start=0
	candi=[]
	for event in largesv:
		if int(event.split('\t')[1])<=start+window:
			candi+=[event]
			continue
		if len(candi)>=mins:
			largedel+=mergeinfo_insertion(candi,mins)
		candi=[event]
		start=int(event.split('\t')[1])
	if len(candi)>=mins:
		largedel+=mergeinfo_insertion(candi,mins)

	# Small INS
	allsv=[c for c in allsv if tag in c and int(c.split('\t')[2])<=3000]
	genomeposition=[0]*contiglength

	for c in allsv:
		start=int(c.split('\t')[1])-100
		end=int(c.split('\t')[1])+100
		# Clamp at 0: for events within 100 of the contig start the slice
		# start was negative and wrapped around to the end of the contig, so
		# the increment landed on the wrong positions or on nothing at all.
		lo=max(start-1,0)
		hi=max(end-1,0)
		genomeposition[lo:hi]=[mm+1 for mm in genomeposition[lo:hi]]

	svregion=[]
	inblock=False
	threshold=3

	# Walk the depth array; a block opens above `threshold` and closes when
	# depth falls below max(10% of the block's peak so far, threshold).
	# A block still open at the end of the contig is not emitted.
	for i in range(len(genomeposition)):
		if inblock:
			if genomeposition[i]>=max(maxdep/10.0,threshold):
				localdep+=[genomeposition[i]]
				if genomeposition[i]>maxdep:
					maxdep=genomeposition[i]
			else:
				inblock=False
				end=i
				if maxdep<=maxdepth:
					# Region start: walk left from the first peak position
					# while depth stays at or above 10% of the peak.
					peakpos=localdep.index(maxdep)
					peakleftsize=0
					for k in range(peakpos):
						if localdep[peakpos-k-1]>=maxdep/10.0:
							peakleftsize+=1
						else:
							break
					svregion+=[(start+peakpos-peakleftsize,end,maxdep)]
				start=0;end=0;maxdep=0
		elif genomeposition[i] > threshold:
			inblock=True
			localdep=[genomeposition[i]]
			start=i
			maxdep=genomeposition[i]

	svregion=[c for c in svregion if c[2] < maxdepth]
	allsvinfo={}
	for c in svregion:
		allsvinfo[c]=[]

	for c in allsv:
		start=int(c.split('\t')[1])-50
		end=start+100
		for d in svregion:
			if min(end,d[1])-max(d[0],start)>0:
				allsvinfo[d]+=[c]
	sv=[]
	for c in svregion:
		for m in mergeinfo_insertion(allsvinfo[c],mins):
			sv+=[m+'\t'+chrom+'\t'+str(c[0])+'\t'+str(c[1])+'\t'+str(c[2])]

	# Keep a large call only if no region call overlaps it with similar size.
	newsv=[]
	for c in largedel:
		cpos=int(c.split('\t')[1])
		clen=int(c.split('\t')[2])
		testif=0
		for d in sv:
			dpos=int(d.split('\t')[1])
			dlen=int(d.split('\t')[2])
			if min(cpos+clen,dpos+dlen)-max(cpos,dpos)>0 and 0.8*dlen<=clen<=dlen/0.8:
				testif=1; break
		if testif==0:
			newsv+=[c]
	newsv+=sv
	newsv.sort(key=counttimesort)

	if newsv==[]:
		return 0

	if svtype=='ins':
		outname=outpath+'ae_merge_workspace/ins_merged_'+chrom
	else:
		outname=outpath+'ae_merge_workspace/inv_merged_'+chrom
	with open(outname,'w') as f:
		for c in newsv:
			f.write(c+'\n')
	return 0
298 | 
299 | 
def genotype(depth,outpath):
	"""Annotate every record of assembly_errors.bed with a genotype call.

	Reads outpath+'assembly_errors.bed', counts reads of
	outpath+'read_to_contig.bam' in the 100-bp windows directly left and
	right of each record, and writes outpath+'assembly_errors.bed-gt'.
	Each output line is the input line followed by four extra columns:
	genotype ('1/1' or '1/0'), left count, right count and the smaller of
	the two counts.

	Parameters:
		depth   -- accepted for interface compatibility; not used here.
		outpath -- working directory prefix (concatenated as-is, so it must
		           end with a path separator).

	Returns None.
	"""
	with open(outpath+'assembly_errors.bed','r') as bedfile:
		allae=bedfile.read().split('\n')[:-1]
	samfile=pysam.AlignmentFile(outpath+'read_to_contig.bam',"rb")
	f=open(outpath+'assembly_errors.bed-gt','w')
	try:
		for c in allae:
			fields=c.split('\t')
			chrom=fields[0]
			start=int(fields[1])
			stop=int(fields[2])
			if start<0:
				continue
			# The flanking windows are the same for every error type, so they
			# are computed unconditionally; this avoids reusing the previous
			# record's counts when a line carries an unexpected type label.
			leftcov=samfile.count(chrom,max(start-100,0),start,read_callback='all')
			rightcov=samfile.count(chrom,stop,stop+100,read_callback='all')
			mincov=min(leftcov,rightcov)
			# Column 4 holds the supporting-read count of the record.
			if int(fields[3])>=0.6*mincov:
				gtinfo='1/1'
			else:
				gtinfo='1/0'
			f.write(c+'\t'+gtinfo+'\t'+str(leftcov)+'\t'+str(rightcov)+'\t'+str(mincov)+'\n')
	finally:
		f.close()
		samfile.close()
326 | 
def filterae(depth,outpath,min_size,datatype):
	"""Merge, filter and report the genotyped structural-error candidates.

	Reads outpath+'assembly_errors.bed-gt', pairs nearby Expansion and
	Collapse records on the same contig, reformats every record, keeps the
	ones passing the size/support/depth filters, writes them to
	outpath+'structural_error.bed', appends counts to
	outpath+'summary_statistics', and removes the intermediate
	'assembly_errors.bed*' and 'read_to_contig.debreak.temp' files.

	Parameters:
		depth    -- depth value from which the accepted window
		            [depth/2, depth*2) for the Depth_Min column is derived.
		outpath  -- working directory prefix (concatenated as-is).
		min_size -- minimum error size to report.
		datatype -- 'hifi' selects a support ratio of 0.8, anything else 0.7.

	Returns the summed size of the reported errors, where each Inversion
	counts as 200 and every other record counts as min(its sizes, 10000).
	"""
	allsv=open(outpath+'assembly_errors.bed-gt','r').read().split('\n')[:-1]
	# Minimum fraction of Depth_Min that the supporting reads must reach.
	if datatype=='hifi':
		rat=0.8
	else:
		rat=0.7

	highcov=depth*2
	lowcov=depth/2
	# Records are classified by substring match on the whole line.
	exp=[c for c in allsv if 'Exp' in c]
	col=[c for c in allsv if 'Col' in c]
	inv=[c for c in allsv if 'Inv' in c]
	new=[]
	exponly=[]
	for i in range(len(exp)):
		c=exp[i].split('\t')
		testif=0
		for d in col:
			# A collapse is paired with this expansion when it is on the same contig,
			# starts within 250 bp of the expansion interval, and the expansion size
			# is below 20x the collapse size.
			if c[0]==d.split('\t')[0] and int(c[1])-250<=int(d.split('\t')[1])<=250+int(c[2]) and int(c[5].split('=')[1])<20*int(d.split('\t')[5].split('=')[1]):
				testif=1
				# dict.fromkeys de-duplicates the ';'-separated read names while keeping order.
				expread=c[6].split(';');goodexp=len(list(dict.fromkeys((expread))))
				colread=d.split('\t')[6].split(';'); goodcol=len(list(dict.fromkeys((colread))))
				totaln=len(list(dict.fromkeys((expread+colread))))
				# Comparable support on both sides (ratio of column 4 within [0.33, 3]).
				if 0.33<=int(c[3])/float(d.split('\t')[3])<=3:
					# Small union of read names relative to the two groups: the two records
					# are collapsed into one with the net size difference (nothing is
					# emitted when the difference does not exceed min_size).
					if totaln<min(goodexp+goodcol//2,goodcol+goodexp//2):
						expsize=int(c[5].split('=')[1])
						colsize=int(d.split('\t')[5].split('=')[1])
						if expsize>colsize+min_size:
							new+=[c[0]+'\t'+c[1]+'\t'+c[2]+'\t'+str(goodexp)+'\t'+c[4]+'\tSize='+str(expsize-colsize)+'\t'+c[7]+'\t'+c[8]+'\t'+c[9]+'\t'+c[10]+'\t'+';'.join(expread)]
						if expsize<colsize-min_size:
							dd=d.split('\t')
							new+=[c[0]+'\t'+dd[1]+'\t'+dd[2]+'\t'+str(goodcol)+'\tCollapse\tSize='+str(colsize-expsize)+'\t'+dd[7]+'\t'+dd[8]+'\t'+dd[9]+'\t'+dd[10]+'\t'+';'.join(colread)]
						# Removing while iterating is safe only because of the immediate break.
						col.remove(d);break
					else:
						# Otherwise the pair is reported as one HaplotypeSwitch record carrying
						# both coordinate pairs, both sizes, and both read groups.
						new+=[c[0]+'\t'+c[1]+';'+d.split('\t')[1]+'\t'+c[2]+';'+d.split('\t')[2]+'\t'+str(totaln)+'\tHaplotypeSwitch\tSize='+str(int(c[2])-int(c[1]))+';'+d.split('\t')[5].split('=')[1]+'\t-/-\t'+c[8]+'\t'+c[9]+'\t'+c[10]+'\t'+';'.join(expread)+':'+';'.join(colread)+'\t'+str(goodexp)+';'+str(goodcol)]
				# Strongly unbalanced support: keep only the better supported side.
				if 0.33>int(c[3])/float(d.split('\t')[3]):
					dd=d.split('\t')
					new+=[c[0]+'\t'+dd[1]+'\t'+dd[2]+'\t'+str(goodcol)+'\tCollapse\tSize='+str(int(c[2])-int(c[1]))+';'+d.split('\t')[5].split('=')[1]+'\t-/-\t'+dd[8]+'\t'+dd[9]+'\t'+dd[10]+'\t'+';'.join(colread)]
				if int(c[3])/float(d.split('\t')[3])>3:
					new+=[c[0]+'\t'+c[1]+'\t'+c[2]+'\t'+str(goodexp)+'\tExpansion\tSize='+str(int(c[2])-int(c[1]))+';'+d.split('\t')[5].split('=')[1]+'\t-/-\t'+c[8]+'\t'+c[9]+'\t'+c[10]+'\t'+';'.join(expread)]
				col.remove(d);break
		if testif==0:
			exponly+=[exp[i]]
	# allsv now aliases the list of merged records; the += below extends that same list.
	allsv=new
	# Unpaired expansions and the collapses that were never consumed: recount support
	# from unique read names and move the read-name column to the end.
	for c in exponly+col:
		c=c.split('\t')
		expread=c[6].split(';');goodexp=len(list(dict.fromkeys((expread))))
		allsv+=[c[0]+'\t'+c[1]+'\t'+c[2]+'\t'+str(goodexp)+'\t'+c[4]+'\t'+c[5]+'\t'+c[7]+'\t'+c[8]+'\t'+c[9]+'\t'+c[10]+'\t'+';'.join(expread)]
	
	# Inversion lines have no Size column on input (read names are read from column 6);
	# the size is written as end-start.
	for c in inv:
		c=c.split('\t')
		expread=c[5].split(';');goodexp=len(list(dict.fromkeys((expread))))
		allsv+=[c[0]+'\t'+c[1]+'\t'+c[2]+'\t'+str(goodexp)+'\t'+c[4]+'\tSize='+str(int(c[2])-int(c[1]))+'\t'+c[6]+'\t'+c[7]+'\t'+c[8]+'\t'+c[9]+'\t'+';'.join(expread)]
	new=[]
	for c in allsv:
		# Drop records whose largest listed size is below min_size.
		if  max([int(mm) for mm in c.split('\t')[5].split('=')[1].split(';')])<min_size:
			continue
		# Keep records with >=10 supporting reads, support >= rat*Depth_Min, and
		# Depth_Min (column 10 of the reformatted line) inside [lowcov, highcov).
		if int(c.split('\t')[3]) >=10 and int(c.split('\t')[3])>=rat*int(c.split('\t')[9]) and lowcov<=int(c.split('\t')[9])<highcov:
			new+=[c]; continue
	f=open(outpath+'structural_error.bed','w')
	f.write('#Contig_Name\tStart_Position\tEnd_Position\tSupporting_Read\tType\tSize\tHaplotype_Info\tDepth_Left\tDepth_Right\tDepth_Min\tSupporting_Read_Name\tHaplotype_Switch_Info\n')
	for c in new:
		f.write(c+'\n')
	f.close()

	# From here on exp/col/inv are reused as plain counters.
	exp=len([mm for mm in new if 'Exp' in mm])
	col=len([mm for mm in new if 'Col' in mm])
	het=len([mm for mm in new if 'Haplo' in mm])
	inv=len([mm for mm in new if 'Inv' in mm])
	f=open(outpath+'summary_statistics','a')
	f.write('Structural error\t'+str(len(new))+'\nExpansion\t'+str(exp)+'\nCollapse\t'+str(col))
	f.write('\nHaplotype switch\t'+str(het)+'\nInversion\t'+str(inv)+'\n')
	f.close()

	# Intermediate files are removed through the shell; the glob also matches the -gt file.
	os.system('rm '+outpath+'assembly_errors.bed*')
	os.system('rm '+outpath+'read_to_contig.debreak.temp')
	totalbase=0
	for c in new:
		if 'Inv' in c:
			totalbase+=200;continue
		# Smallest listed size, capped at 10000.
		size=min([int(mm) for mm in c.split('\t')[5].split('=')[1].split(';')]+[10000])
		totalbase+=size

	return totalbase
411 | 
412 | 
413 | 
414 | 


--------------------------------------------------------------------------------
/denovo_baseerror.py:
--------------------------------------------------------------------------------
  1 | import re
  2 | import os
  3 | import statsmodels.stats.proportion
  4 | import statsmodels.stats.multitest
  5 | 
  6 | def find2(li):
  7 | 	num=0
  8 | 	val=''
  9 | 	for c in  li:
 10 | 		if li.count(c)>num:
 11 | 			val=c
 12 | 	return val
 13 | 
def getsnv(path,chrom,mincount,maxcov,mindepth):
	"""Call small-scale error candidates on one contig from a samtools pileup.

	Runs `samtools mpileup` for `chrom`, scans the pileup line by line and
	writes candidate records to
	path+'base_error_workspace/baseerror_<chrom>.bed' (tab separated:
	contig, start, end, contig base(s), read base(s), supporting reads,
	depth, type). The pileup file is deleted afterwards, the bed file is
	deleted too when no candidate was written, and the number of "valid"
	positions is appended to path+'base_error_workspace/validbase'.

	Parameters:
		path     -- working directory prefix (concatenated as-is).
		chrom    -- contig name passed to samtools with -r.
		mincount -- minimum number of supporting reads; also the lower depth
		            bound for a position to be examined.
		maxcov   -- upper depth bound.
		mindepth -- lower depth bound for counting valid positions; the
		            boolean False means "use maxcov/10.0".

	Returns 0.
	"""
	logf=open(path+'Inspector.log','a')
	logf.write('Start small-scale error detection for '+chrom+'\n')
	logf.close()
	g=open(path+'base_error_workspace/baseerror_'+chrom+'.bed','w')
	os.system('samtools mpileup -Q 0 '+path+'read_to_contig.bam -r '+chrom+' -o '+path+'base_error_workspace/base_'+chrom+'.pileup -f '+path+'valid_contig.fa')
	f=open(path+'base_error_workspace/base_'+chrom+'.pileup','r')
	a=f.readline()
	numbaseerror=0
	validctgbase=0
	# The type check keeps a numeric mindepth of 0 from being treated as "unset".
	if mindepth==False and type(mindepth)==bool:
		mindepth=maxcov/10.0

	while a!='':
		# Pileup columns used below: [0] contig, [1] position, [2] reference base,
		# [3] depth, [4] read bases.
		if a.split('\t')[2]!='N' and mindepth<=int(a.split('\t')[3]) <=maxcov:
			validctgbase+=1
		# Skip positions with too few reads, or too many once '*' entries are discounted.
		if int(a.split('\t')[3]) <mincount or int(a.split('\t')[3])-a.split('\t')[4].count('*') > maxcov:
			a=f.readline(); continue
		# Normalise the read-base string: ',' becomes '.', '^' plus the character
		# following it is removed, and lower-case bases are upper-cased.
		info=a.split('\t')[4]
		info=info.replace(',','.')
		info=re.sub('\^.','',info)
		info=info.replace('a','A')
		info=info.replace('t','T')
		info=info.replace('c','C')
		info=info.replace('g','G')
		depth=int(a.split('\t')[3])-info.count('*')
		# Required support: at least mincount reads and at least 20% of the depth.
		min_supp=max(mincount,depth*0.2)
		ins=info.count('+')
		dels=info.count('-')
		ifindel=False
		if ins>=min_supp:
			ifindel=True;
			insinfp=info.split('+')[1:]
			insseq=[]
			for m in insinfp:
				# Each chunk starts with the decimal length of the inserted sequence.
				num='';inum=0
				for dd in m:
					if dd in '1234567890':
						num+=dd; inum+=1
					else:
						break
				# Only insertions no longer than mincount//2 are kept; longer ones
				# are removed from the support count.
				if int(num)<=mincount//2:
					insseq+=[m[inum:][:int(num)]]
				else:
					ins-=1
			if ins>=min_supp :
				# The reported sequence is whatever find2 selects from the kept insertions.
				mostf1=find2(insseq)
				numbaseerror+=1
				g.write(a.split('\t')[0]+'\t'+str(int(a.split('\t')[1])-1)+'\t'+a.split('\t')[1]+'\t-\t'+mostf1+'\t'+str(ins)+'\t'+str(depth)+'\tSmallCollapse\n')
		if dels>=min_supp:
			ifindel=True;
			insinfp=info.split('-')[1:]
			insseq=[]
			for m in insinfp:
				# Same length-prefixed parsing as for insertions.
				num='';inum=0
				for dd in m:
					if dd in '1234567890':
						num+=dd; inum+=1
					else:
						break
				if int(num)<=mincount//2:
					insseq+=[m[inum:][:int(num)]]
				else:
					dels-=1
			if dels>=min_supp:
				mostf1=find2(insseq)
				numbaseerror+=1
				# The end coordinate is extended by the length of the reported deleted sequence.
				g.write(a.split('\t')[0]+'\t'+str(int(a.split('\t')[1])-1)+'\t'+str(int(a.split('\t')[1])+len(mostf1)-1)+'\t'+mostf1+'\t-\t'+str(dels)+'\t'+str(depth)+'\tSmallExpansion\n')

		# No substitution is considered when more than 80% of the reads match ('.') or are '*'.
		if info.count('.')+info.count('*')>0.8*int(a.split('\t')[3]) :
			a=f.readline(); continue
		acount=info.count('A')
		tcount=info.count('T')
		ccount=info.count('C')
		gcount=info.count('G')

		# Letters that belong to indel sequences were counted above as bases; collect
		# those sequences and subtract their letters again.
		# NOTE(review): the first test looks at the whole line `a`, not `info` - confirm intent.
		if '+'  in a or '-'  in info:
			insseq=''
			if '+' in info:
				insinfp=info.split('+')[1:]
				for m in insinfp:
					num=''
					inum=0
					for dd in m:
						if dd in '1234567890':
							num+=dd; inum+=1
						else:
							break
					insseq+=m[inum:][:int(num)]
			if '-' in info:
				insinfp=info.split('-')[1:]
				for m in insinfp:
					num=''
					inum=0
					for dd in m:
						if dd in '1234567890':
							num+=dd; inum+=1
						else:
							break
					insseq+=m[inum:][:int(num)]

			insacount=insseq.count('A')
			instcount=insseq.count('T')
			insccount=insseq.count('C')
			insgcount=insseq.count('G')

			acount-=insacount; tcount-=instcount; ccount-=insccount; gcount-=insgcount

		if max(acount,tcount,ccount,gcount) >=min_supp:
			# The checks are not exclusive: on a tie the last matching base (A, T, C, G order) wins.
			if max(acount,tcount,ccount,gcount)==acount:
				altbase='A'
			if max(acount,tcount,ccount,gcount)==tcount:
				altbase='T'
			if max(acount,tcount,ccount,gcount)==ccount:
				altbase='C'
			if max(acount,tcount,ccount,gcount)==gcount:
				altbase='G'
			numbaseerror+=1
			g.write(a.split('\t')[0]+'\t'+str(int(a.split('\t')[1])-1)+'\t'+a.split('\t')[1]+'\t'+a.split('\t')[2]+'\t'+altbase+'\t'+str(max(acount,tcount,ccount,gcount))+'\t'+str(depth)+'\tBaseSubstitution\n')

		a=f.readline()
	f.close()
	os.system('rm '+path+'base_error_workspace/base_'+chrom+'.pileup')
	g.close()
	# An empty bed file is removed rather than left behind.
	if numbaseerror==0:
		os.system('rm '+path+'base_error_workspace/baseerror_'+chrom+'.bed')
	f=open(path+'base_error_workspace/validbase','a')
	f.write(str(validctgbase)+'\n')
	f.close()
	return 0
144 | 
145 | 
def count_baseerrror(path,ctgtotallen,datatype,ave_depth):
	"""Filter the per-contig small-error candidates and write the final report.

	Concatenates path+'base_error_workspace/baseerror_*bed', keeps the
	candidates that pass the read-fraction and binomial-test thresholds,
	writes them (with the statistic appended) to
	path+'small_scale_error.bed' and appends summary counts to
	path+'summary_statistics'.

	Parameters:
		path        -- working directory prefix (concatenated as-is).
		ctgtotallen -- total contig length used for the per-Mbp rate.
		datatype    -- 'hifi' selects the stricter threshold set.
		ave_depth   -- average depth; low values relax the p-value cutoff.

	Returns the number of retained small-scale errors.

	The function name keeps its historical spelling because callers use it.
	"""
	os.system('cat '+path+'base_error_workspace/baseerror_*bed > '+path+'base_error_workspace/allbaseerror.bed')
	with open(path+'base_error_workspace/allbaseerror.bed','r') as bedfile:
		allsnv=bedfile.read().split('\n')[:-1]
	snv=0;indelins=0;indeldel=0

	baseerror=[]
	iii=0
	if datatype=='hifi':
		propvalue=0.5
		pcutoff=0.01
		readcutoff=0.75
		if ave_depth<25:
			pcutoff=0.02
		if ave_depth<15:
			pcutoff=0.1
	else:
		propvalue=0.4
		pcutoff=0.05
		readcutoff=0.5
		if ave_depth<25:
			pcutoff=0.1

	for c in allsnv:
		fields=c.split('\t')
		nread=int(fields[5])
		depth=int(fields[6])
		# Cheap pre-filter on the fraction of supporting reads.
		if nread<readcutoff*depth:
			continue
		# NOTE(review): this sums one-sided binomial test results for every count
		# from nread to depth, so `p` is not a plain p-value - kept unchanged,
		# confirm that this is intended.
		p=0
		for i in range(nread,depth+1):
			p+=statsmodels.stats.proportion.binom_test(i, depth, prop=propvalue, alternative='larger')

		if p<pcutoff :
			iii+=1
			baseerror+=[c+'\t'+str(p)]
			if 'BaseSubstitution' in c:
				snv+=1
			if 'SmallExpansion' in c:
				indeldel+=1
			if 'SmallCollapse' in c:
				indelins+=1

	per=float(iii)/ctgtotallen*1000000
	with open(path+'small_scale_error.bed','w') as f:
		f.write('#Contig_Name\tStart_Position\tEnd_Position\tBase_Contig\tBase_Read\tSupporting_Read\tDepth\tType\tPvalue\n')
		for c in baseerror:
			f.write(c+'\n')
	# The summary file was previously left open; `with` guarantees it is flushed and closed.
	with open(path+'summary_statistics','a') as f:
		f.write('\n\nSmall-scale assembly error /per Mbp\t'+str(per)+'\nTotal small-scale assembly error\t'+str(iii)+'\nBase substitution\t'+str(snv)+'\nSmall-scale expansion\t'+str(indeldel)+'\n')
		f.write('Small-scale collapse\t'+str(indelins)+'\n')

	return iii
202 | 
203 | 	
204 | 
205 | 


--------------------------------------------------------------------------------
/denovo_correct.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import pysam
  3 | import sys
  4 | import random
  5 | import time
  6 | import multiprocessing
  7 | import subprocess
  8 | 
  9 | def sort_snp(a):
 10 | 	return int(a.split('\t')[1])
 11 | 
 12 | def get_snpcut_start(snp):
 13 | 	if snp.split('\t')[7] in ['BaseSubstitution', 'SmallCollapse']:
 14 | 		return int(snp.split('\t')[1])+1
 15 | 	else:
 16 | 		return int(snp.split('\t')[2])+1
 17 | 
 18 | def get_snpcut_end(snp):
 19 | 	if 'BaseSubstitution' == snp.split('\t')[7]:
 20 | 		return int(snp.split('\t')[1])
 21 | 	else:
 22 | 		return int(snp.split('\t')[1])+1
 23 | 
 24 | def base_correction(ctgseq,snpset,ctg):
 25 | 	t1=time.time()
 26 | 	snpset.sort(key=sort_snp)
 27 | 	bad=[]
 28 | 	for i in range(len(snpset)-1):
 29 | 		if 'BaseSubstitution' in snpset[i+1] and 'SmallExpansion' in snpset[i] and min(int(snpset[i+1].split('\t')[2]),int(snpset[i].split('\t')[2])+1)-max(int(snpset[i+1].split('\t')[1]),int(snpset[i].split('\t')[1])+1)>0:
 30 | 			bad+=[snpset[i],snpset[i+1]]
 31 | 	snpset=[c for c in snpset if c not in bad]
 32 | 	cutposinfo=[]
 33 | 	if snpset==[]:
 34 | 		return (ctgseq,snpset)
 35 | 	for i in range(len(snpset)):
 36 | 		cutinfo=[0,0,'']
 37 | 		if i>0:
 38 | 			cutinfo[0]=get_snpcut_start(snpset[i-1])
 39 | 		cutinfo[1]=get_snpcut_end(snpset[i])
 40 | 		if 'SmallExpansion' == snpset[i].split('\t')[7]:
 41 | 			cutinfo[2]=''
 42 | 		elif snpset[i].split('\t')[7]== 'BaseSubstitution' or snpset[i].split('\t')[7]== 'SmallCollapse':
 43 | 			cutinfo[2]=snpset[i].split('\t')[4]
 44 | 		else:
 45 | 			print ('Warning: Possible error in small-error correction.')
 46 | 		cutposinfo+=[cutinfo]
 47 | 	newseq=''
 48 | 	for cutinfo in cutposinfo:
 49 | 		newseq+=ctgseq[cutinfo[0]:cutinfo[1]]+cutinfo[2]
 50 | 	newseq+=ctgseq[get_snpcut_start(snpset[-1]):]
 51 | 	t2=time.time()
 52 | 	print ('Base error correction for ',ctg,' finished. Time cost: ',t2-t1)
 53 | 	return (newseq,snpset)
 54 | 
 55 | def call_flye_timeout(datatype,outpath,aeinfo,outtime):
 56 | 	testp = multiprocessing.dummy.Pool(1)
 57 | 	testres = testp.apply_async(call_flye, args=(datatype,outpath,aeinfo))
 58 | 	try:
 59 | 		testout = testres.get(outtime)  # Wait timeout seconds for func to complete.
 60 | 		return testout
 61 | 	except multiprocessing.TimeoutError:
 62 | 		print ('Flye assembly time out for ',aeinfo)
 63 | 		raise
 64 | 
 65 | def call_flye(datatype,outpath,aeinfo):
 66 | 	tt0=time.time()
 67 | 	os.system('flye --'+datatype+' '+outpath+'assemble_workspace/read_ass_'+aeinfo+'.fa -o '+outpath+'assemble_workspace/flye_out_'+aeinfo+'/ -t 4  ')
 68 | 	tt1=time.time()
 69 | 	print ('FLYETIME for ',aeinfo,tt1-tt0)
 70 | 	return 0
 71 | 
 72 | 
 73 | def findpos(aeset,snpset,bamfile,outpath,datatype,thread,outtime):
 74 | 	snpsetshift=[c for c in snpset if 'Small' in c]
 75 | 	snpsetshift.sort(key=sort_snp)
 76 | 	new=[]
 77 | 	bam=pysam.AlignmentFile(bamfile,'rb')
 78 | 	ctg=aeset[0].split('\t')[0]
 79 | 	aeinfolist={}
 80 | 	for c in aeset:
 81 | 		if 'Inversion' in c:
 82 | 			continue
 83 | 		if 'HaplotypeSwitch' in c:
 84 | 			if int(c.split('\t')[11].split(';')[0])>=int(c.split('\t')[11].split(';')[1]):
 85 | 				readgroup=c.split('\t')[10].split(':')[0].split(';')
 86 | 				aestart=int(c.split('\t')[1].split(';')[0])
 87 | 				aeend=int(c.split('\t')[2].split(';')[0])
 88 | 				aesize=c.split('\t')[5].split('=')[1].split(';')[0]
 89 | 				aeinfo=ctg+'__'+str(aestart)+'__'+str(aeend)+'__'+str(aesize)+'__exp'
 90 | 				aeinfolist[c]=aeinfo
 91 | 			else:
 92 | 				readgroup=c.split('\t')[10].split(':')[1].split(';')
 93 | 				aestart=int(c.split('\t')[1].split(';')[1])
 94 | 				aeend=int(c.split('\t')[2].split(';')[1])
 95 | 				aesize=c.split('\t')[5].split('=')[1].split(';')[1]
 96 | 				aeinfo=ctg+'__'+str(aestart)+'__'+str(aeend)+'__'+str(aesize)+'__col'
 97 | 				aeinfolist[c]=aeinfo
 98 | 		else:
 99 | 			readgroup=c.split('\t')[10].split(';')
100 | 			aestart=int(c.split('\t')[1])
101 | 			aeend=int(c.split('\t')[2])
102 | 			aesize=c.split('\t')[5].split('=')[1].split(';')[0]
103 | 			aeinfo=ctg+'__'+str(aestart)+'__'+str(aeend)+'__'+str(aesize)+'__exp' if 'Exp' in c else ctg+'__'+str(aestart)+'__'+str(aeend)+'__'+str(aesize)+'__col'
104 | 			aeinfolist[c]=aeinfo
105 | 
106 | 		f=open(outpath+'assemble_workspace/read_ass_'+aeinfo+'.fa','w')
107 | 		allread=bam.fetch(ctg,max(0,aestart-2000),aeend+2000)
108 | 		iii=0
109 | 		for read in allread:
110 | 			if read.query_name not in readgroup or read.flag>16:
111 | 				continue
112 | 			f.write('>'+read.query_name+'\n'+read.query_sequence+'\n')
113 | 			iii+=1
114 | 		f.close()
115 | 	
116 | 	flyerun=multiprocessing.Pool(thread)	
117 | 	for c in aeinfolist:
118 | 		aeinfo=aeinfolist[c]
119 | 		flyerun.apply_async(call_flye_timeout,args=(datatype,outpath,aeinfo,outtime))
120 | 	flyerun.close()
121 | 	flyerun.join()
122 | 
123 | 	for c in aeinfolist:
124 | 		aeinfo=aeinfolist[c]
125 | 		try:
126 | 			allctg=open(outpath+'assemble_workspace/flye_out_'+aeinfo+'/assembly.fasta','r').read().split('>')[1:]
127 | 		except:
128 | 			allctg=[]	
129 | 			print ('Inspector Assembly Fail ' ,aeinfo)
130 | 			os.system('rm -rf '+outpath+'assemble_workspace/flye_out_'+aeinfo+'/')
131 | 			continue
132 | 		if len(allctg)==1:
133 | 			f=open(outpath+'assemble_workspace/new_contig_'+ctg+'.fa','a')
134 | 			newassseq=''.join(allctg[0].split('\n')[1:-1])
135 | 			f.write('>'+aeinfo+'__newctg\n'+newassseq+'\n')
136 | 			f.close()
137 | 		else:
138 | 			print ('Inspector Multi/No Alignment ' ,aeinfo)
139 | 		os.system('rm -rf '+outpath+'assemble_workspace/flye_out_'+aeinfo+'/')
140 | 		
141 | 		shiftpos=0
142 | 		for d in snpsetshift:
143 | 			if int(d.split('\t')[2])<=aestart:
144 | 				if 'SmallCollapse' in d:
145 | 					shiftpos+=len(d.split('\t')[4])
146 | 				else:
147 | 					shiftpos-=len(d.split('\t')[3])
148 | 			else:
149 | 				break
150 | 		c=c.split('\t')
151 | 		new+=[ctg+'\t'+str(aestart+shiftpos)+'\t'+str(aeend+shiftpos)+'\t'+c[4]+'\t'+aesize+'\t'+aeinfo]
152 | 	return new
153 | 
154 | 
def substitute_seq(ctgseq,newseq,ctgstart,ctgend,newstart,newend,diffsize):
	"""Replace the region around an error in `ctgseq` with assembled sequence.

	The replacement is newseq[newstart:newend]; it substitutes the contig
	from 1000 bp before `ctgstart` to 1000 bp after `ctgend`. The change is
	rejected when the observed length difference disagrees strongly with the
	expected `diffsize`. When both 100-bp ends of the replacement match the
	contig (>=90 identical positions, trying offsets -5..4), the matching
	offsets are applied to the cut points.

	Returns (sequence, True) after a substitution, (ctgseq, False) otherwise.
	"""
	newpart=newseq[newstart:newend]
	# The old region is taken 10 bp wider on both sides to leave room for the offsets.
	oldpart=ctgseq[ctgstart-1000-10:ctgend+1000+10]
	realdiff=len(newpart)-len(oldpart)
	ratio=realdiff/float(diffsize)
	ratio_off=ratio>2 or ratio<0.5
	if ratio_off and abs(diffsize-realdiff)>300 and realdiff*diffsize>0:
		return (ctgseq,False)

	head=newpart[:100]
	tail=newpart[-100:]
	leftside=check_same(head,oldpart[10:110])
	rightside=check_same(tail,oldpart[-110:-10])

	shift1=0
	if leftside<90:
		for shift1 in range(-5,5):
			leftside=check_same(head,oldpart[10+shift1:110+shift1])
			if leftside>=90:
				break
	shift2=0
	if rightside<90:
		for shift2 in range(-5,5):
			rightside=check_same(tail,oldpart[-110+shift2:-10+shift2])
			if rightside>=90:
				break

	if leftside>=90 and rightside>=90:
		return (ctgseq[:ctgstart-1000+shift1]+newpart+ctgseq[ctgend+1000+shift2:],True)
	# The ends could not be anchored: substitute at the unshifted cut points.
	return (ctgseq[:ctgstart-1000]+newpart+ctgseq[ctgend+1000:],True)
181 | 
def ae_correct_within(seq,read,start,end,size):
	"""Correct an error using a single alignment that covers both flanks.

	Looks up the query positions aligned to reference positions start-1000
	and end+1000 in `read` and, when both are found (non-zero), hands the
	query slice to substitute_seq.

	Returns (sequence, corrected flag); (seq, False) when an anchor is missing.
	"""
	left_anchor=start-1000
	right_anchor=end+1000
	readstart=0
	readend=0
	for pair in read.get_aligned_pairs():
		if pair[1]==left_anchor:
			readstart=pair[0]
		if pair[1]==right_anchor:
			readend=pair[0]
			break
	if readstart==0 or readend==0:
		return (seq,False)
	(seq,ifcorr)=substitute_seq(seq,read.query_sequence,start,end,readstart,readend,size)
	return (seq,ifcorr)
196 | 
197 | 
def ae_correct_between(seq,align,start,end,size):
	"""Correct an error using several alignments of the same assembled contig.

	The left anchor (reference position start-1000) is searched in
	alignments covering the left flank, the right anchor (end+1000) in
	alignments covering the right flank. When both query positions are
	found (non-zero), the slice of align[0].query_sequence between them is
	handed to substitute_seq.

	Returns (sequence, corrected flag); (seq, False) when an anchor is missing.
	"""
	left_anchor=start-1000
	right_anchor=end+1000
	readstart=0
	readend=0
	for segment in align:
		covers_left=segment.reference_start < left_anchor and segment.reference_end > start-100
		if covers_left:
			for pair in segment.get_aligned_pairs():
				if pair[1]==left_anchor:
					readstart=pair[0]
		covers_right=segment.reference_start < end+100 and segment.reference_end > right_anchor
		if covers_right:
			for pair in segment.get_aligned_pairs():
				if pair[1]==right_anchor:
					readend=pair[0]
	if readstart==0 or readend==0:
		return (seq,False)
	(seq,ifcorr)=substitute_seq(seq,align[0].query_sequence,start,end,readstart,readend,size)
	return (seq,ifcorr)
216 | 
217 | 
def ae_correct_expcol(seq,align,aetype):
	"""Correct one expansion/collapse using the alignments of its new contig.

	Start, end and size are parsed from the '__'-separated query name of the
	first alignment; the size is negated for aetype 'exp'. The first
	alignment spanning both 1-kb flanks is used on its own; otherwise, when
	at least two alignments exist, the anchors are searched across them.

	Returns (sequence, corrected flag).
	"""
	name_parts=align[0].query_name.split('__')
	start=int(name_parts[1])
	end=int(name_parts[2])
	size=int(name_parts[3])
	if aetype=='exp':
		size=0-size

	for read in align:
		spans_left=read.reference_start < start-1000
		if spans_left and end+1000<read.reference_end:
			return ae_correct_within(seq,read,start,end,size)
	if len(align)<2:
		return (seq,False)
	return ae_correct_between(seq,align,start,end,size)
235 | 
def check_same(a,b):
	"""Count the positions at which sequences `a` and `b` hold the same item.

	Only the common prefix length is compared, so sequences of different
	lengths (for example a slice truncated at a contig end) give the number
	of matches in the overlapping part instead of raising IndexError.
	"""
	numsame=0
	for (x,y) in zip(a,b):
		if x==y:
			numsame+=1
	return numsame
244 | 
def sortctg(a):
	"""Sort key for '__'-separated contig names: the second component as an integer."""
	pieces=a.split('__')
	return int(pieces[1])
247 | 
248 | 
def _ae_correct_group(ctgseq,group):
	"""Apply the correction described by one group of same-name alignments."""
	if group==[]:
		return (ctgseq,False)
	# Names look like '<ctg>__<start>__<end>__<size>__<exp|col>__newctg'; the
	# type is read from its own field so that a contig name containing
	# 'exp' or 'col' cannot trigger the wrong (or a second) correction.
	aetype=group[0].query_name.split('__')[-2]
	if aetype not in ('exp','col'):
		return (ctgseq,False)
	return ae_correct_expcol(ctgseq,group,aetype)

def ae_correction(ctgseq,aeset,outpath):
	"""Correct the structural errors of one contig with its local assemblies.

	Writes the current contig sequence to
	outpath+'assemble_workspace/old_contig_<ctg>.fa', rewrites
	'new_contig_<ctg>.fa' sorted by error start in descending order (so
	corrections are applied from the contig end towards its start), aligns
	the new contigs to the old one with minimap2 and applies one correction
	per aligned new contig.

	Parameters:
		ctgseq  -- contig sequence to correct.
		aeset   -- error lines of this contig; sorted in place, used for the
		           contig name and for the log message.
		outpath -- working directory prefix (concatenated as-is).

	Returns (sequence, number of corrected errors); (ctgseq, 0) when no
	local assembly file exists for the contig.
	"""
	ctg=aeset[0].split('\t')[0]
	workspace=outpath+'assemble_workspace/'
	oldfasta=workspace+'old_contig_'+ctg+'.fa'
	newfasta=workspace+'new_contig_'+ctg+'.fa'
	samname=workspace+'ctgalignment_'+ctg+'.sam'

	f=open(oldfasta,'w')
	f.write('>old_ctg_'+ctg+'\n'+ctgseq+'\n')
	f.close()
	try:
		f=open(newfasta,'r')
	except (IOError,OSError):
		# No local assembly succeeded for this contig.
		return (ctgseq,0)
	allctg=f.read().split('>')[1:]
	f.close()

	newcontig={}
	ctgname=[]
	for c in allctg:
		lines=c.split('\n')
		newcontig[lines[0]]=lines[1]
		ctgname+=[lines[0]]
	ctgname.sort(key=sortctg,reverse=True)
	f=open(newfasta,'w')
	for c in ctgname:
		f.write('>'+c+'\n'+newcontig[c]+'\n')
	f.close()

	os.system('minimap2 -a '+oldfasta+' '+newfasta+' --MD --eqx -t 6 --secondary=no  -Y  > '+samname)

	alignfile=pysam.AlignmentFile(samname,'r')
	aeset.sort(key=sort_snp,reverse=True)

	lastreadname=''
	samectg=[]
	numcorr=0
	try:
		# Consecutive records with the same query name belong to one new contig.
		for aligninfo in alignfile.fetch(until_eof=True):
			if aligninfo.flag==4:
				print (aligninfo.query_name,' contig not aligned.')
				continue
			if aligninfo.query_name == lastreadname:
				samectg+=[aligninfo]
				continue
			(ctgseq,ifcorr)=_ae_correct_group(ctgseq,samectg)
			if ifcorr:
				numcorr+=1
			lastreadname=aligninfo.query_name
			samectg=[aligninfo]
		# Flush the last group.
		(ctgseq,ifcorr)=_ae_correct_group(ctgseq,samectg)
		if ifcorr:
			numcorr+=1
	finally:
		alignfile.close()

	logf=open(outpath+'Inspector_correct.log','a')
	logf.write('total ae'+str(len(aeset))+', corrected error '+str(numcorr)+'\n')
	logf.close()
	return (ctgseq,numcorr)
374 | 
375 | 
376 | 
def error_correction_large(ctg,oldseq,aeset,snpset,bamfile,outpath,datatype,thread,flyeouttime):
	"""Correct small-scale and structural errors of one contig.

	Small-scale corrections are applied first; when structural errors are
	given, their supporting reads are re-assembled and the resulting contigs
	are substituted into the sequence. The result is written to
	outpath+'contig_corrected_<ctg>.fa' and the elapsed time is appended to
	outpath+'Inspector_correct.log'. Returns 0.
	"""
	began=time.time()
	(newseq,snpset)=base_correction(oldseq,snpset,ctg)
	if aeset!=[]:
		aeset=findpos(aeset,snpset,bamfile,outpath,datatype,thread,flyeouttime)
		# findpos may return an empty list when no region could be assembled.
		if aeset!=[]:
			(newseq,numcorr)=ae_correction(newseq,aeset,outpath)

	fasta=open(outpath+'contig_corrected_'+ctg+'.fa','w')
	fasta.write('>'+ctg+'\n'+newseq+'\n')
	fasta.close()

	finished=time.time()
	logf=open(outpath+'Inspector_correct.log','a')
	logf.write('TIME used for structural error correction of '+ctg+': '+str(finished-began)+'\n')
	logf.close()
	return 0
392 | 
def error_correction_small(ctg,oldseq,snpset,bamfile,outpath,datatype):
	"""Correct only the small-scale errors of one contig.

	The corrected sequence is written to
	outpath+'contig_corrected_<ctg>.fa' and the elapsed time is appended to
	outpath+'Inspector_correct.log'. `bamfile` and `datatype` are accepted
	but not used. Returns 0.
	"""
	began=time.time()
	(newseq,snpset)=base_correction(oldseq,snpset,ctg)

	fasta=open(outpath+'contig_corrected_'+ctg+'.fa','w')
	fasta.write('>'+ctg+'\n'+newseq+'\n')
	fasta.close()

	finished=time.time()
	logf=open(outpath+'Inspector_correct.log','a')
	logf.write('TIME used for small error correction of '+ctg+':'+str(finished-began)+'\n')
	logf.close()
	return 0
405 | 
406 | 
407 | 
408 | 


--------------------------------------------------------------------------------
/denovo_plot.py:
--------------------------------------------------------------------------------
  1 | import matplotlib
  2 | matplotlib.use('Agg')
  3 | import pysam
  4 | import matplotlib.pyplot as plt
  5 | 
  6 | def plot_n100(outpath,minlen):
  7 | 	ctglen=open(outpath+'contig_length_info','r').read().split('\n')[:-1]
  8 | 	ctglen=[int(c.split('\t')[1]) for c in ctglen if int(c.split('\t')[1]) >= minlen]
  9 | 
 10 | 	n100=[]
 11 | 	x100=[]
 12 | 	ctglen.sort(reverse=True)
 13 | 	totallen=sum(ctglen)
 14 | 	addlen=0
 15 | 	lastlen=0
 16 | 	for i in range(100):
 17 | 		x100+=[i+1]
 18 | 
 19 | 		while addlen < (i+1)/100.0*totallen:
 20 | 			try:
 21 | 				lastlen=ctglen.pop(1)
 22 | 				addlen+=lastlen
 23 | 			except:
 24 | 				break
 25 | 		n100+=[lastlen]
 26 | 	plt.plot(x100,n100,linewidth=2)
 27 | 	plt.xlabel('N1-N100')
 28 | 	plt.ylabel('Contig Length /bp')
 29 | 	plt.savefig(outpath+'plot_n1n100.pdf')
 30 | 	print ('end n100')
 31 | 	return 0
 32 | 
 33 | 
 34 | def plot_na100(outpath):
 35 | 	samfile=pysam.AlignmentFile(outpath+'contig_to_ref.sam','r')
 36 | 	allread=samfile.fetch()
 37 | 	alignlen=[]
 38 | 	for align in allread:
 39 | 		if align.flag==4:
 40 | 			continue
 41 | 		alignlen+=[align.query_alignment_length]
 42 | 	n100=[]
 43 | 	x100=[]
 44 | 	alignlen.sort(reverse=True)
 45 | 	totallen=sum(alignlen)
 46 | 	addlen=0
 47 | 	lastlen=0
 48 | 	for i in range(100):
 49 | 		x100+=[i+1]
 50 | 		while addlen < (i+1)/100.0*totallen:
 51 | 			try:
 52 | 				lastlen=alignlen.pop(1)
 53 | 				addlen+=lastlen
 54 | 			except:
 55 | 				break
 56 | 		n100+=[lastlen]
 57 | 	plt.plot(x100,n100,linewidth=2)
 58 | 	plt.xlabel('NA1-NA100')
 59 | 	plt.ylabel('Contig Length /bp')
 60 | 	plt.savefig(outpath+'plot_na1na100.pdf')
 61 | 	print ('end na100')
 62 | 	return 0
 63 | 
 64 | 
 65 | 
 66 | def findpos(c,ctglength,step,startrefpos,ctgstartpos):
 67 | 	temppos=[]
 68 | 	ctgname=c.query_name
 69 | 	refpos=c.reference_start
 70 | 	cigar=c.cigarstring
 71 | 	if 'S' not in cigar.split('M')[0].split('=')[0] and 'H' not in cigar.split('M')[0].split('=')[0]:
 72 | 		ctgpos=0
 73 | 	else:
 74 | 		ctgpos=int(cigar.split('M')[0].split('=')[0].split('S')[0].split('H')[0])
 75 | 	currpos=ctgpos
 76 | 
 77 | 	num=''
 78 | 
 79 | 	for m in cigar:
 80 | 		num+=m; continue
 81 | 		if m in 'M=X':
 82 | 			ctgpos+=int(num); refpos+=int(num); num=''
 83 | 		if m=='I':
 84 | 			ctgpos+=int(num);num=''
 85 | 		if m =='D':
 86 | 			refpos+=int(num); num='';continue
 87 | 		if m in 'SH':
 88 | 			if refpos>c.reference_start:
 89 | 				break
 90 | 			else:
 91 | 				num=''
 92 | 		while  ctgpos>=currpos+step:
 93 | 			if ctgpos>=currpos+step*2:
 94 | 				temppos+=[[currpos+step,refpos+startrefpos,ctgname]]
 95 | 			else:
 96 | 				temppos+=[[ctgpos,refpos+startrefpos,ctgname]]
 97 | 			currpos+=step
 98 | 	if c.flag in [16,2064]:
 99 | 		temppos=[ [ctglength-mm[0],mm[1],mm[2]] for mm in temppos]
100 | 	updatestart=[]
101 | 	for mm in temppos :
102 | 		updatestart+=[[mm[0]+ctgstartpos,mm[1],mm[2]]]
103 | 
104 | 	return updatestart
105 | 
106 | 
107 | 
def plot_dotplot(outpath):
	"""Draw a synteny dot plot of the contigs against the longest reference.

	Uses outpath+'contig_to_ref.bam' and outpath+'contig_length_info'. Only
	contigs whose summed aligned length on the longest reference sequence
	is at least 1% of that reference are drawn; they are laid out one after
	another on the x axis. The figure is written to
	outpath+'plot_synteny.pdf'. Returns 0.
	"""
	print ('start dot plot')
	samfile=pysam.AlignmentFile(outpath+'contig_to_ref.bam','rb')
	try:
		allchrom=samfile.references
		allchromlen=samfile.lengths
		maxreflen=max(allchromlen)
		idex=allchromlen.index(maxreflen)
		maxchrom=allchrom[idex]
		print (maxchrom)
		# Sampling distance grows with the reference length.
		if maxreflen >= 10000000:
			step=10000
		elif maxreflen>=1000000:
			step=1000
		else:
			step=100

		with open(outpath+'contig_length_info','r') as lenfile:
			ctgleninfo=lenfile.read().split('\n')[:-1]
		ctglen={}
		for c in ctgleninfo:
			ctglen[c.split('\t')[0]]=int(c.split('\t')[1])
		alignedctg={}
		for align in samfile.fetch(maxchrom):
			if align.query_name not in alignedctg:
				alignedctg[align.query_name]=align.query_alignment_length
			else:
				alignedctg[align.query_name]+=align.query_alignment_length
		longalignctg=[c for c in alignedctg if alignedctg[c] >= maxreflen/100.0]

		print (len(longalignctg))

		# x offset of every drawn contig: contigs are concatenated in list order.
		startpos=0
		contig_startpos={}
		for c in longalignctg:
			contig_startpos[c]=startpos
			startpos+=ctglen[c]
		allpos=[]

		for align in samfile.fetch(maxchrom):
			# contig_startpos has exactly the drawn contigs as keys.
			if align.query_name not in contig_startpos:
				continue
			ctglength=ctglen[align.query_name]
			ctgstart=contig_startpos[align.query_name]
			allpos+=findpos(align,ctglength,step,0,ctgstart)
	finally:
		samfile.close()

	# Each point is [contig position, reference position, contig name].
	plotx=[c[0] for c in allpos]
	ploty=[c[1] for c in allpos]
	plotcolor=[c[2] for c in allpos]

	# One numeric colour value per contig.
	allcolor=set(plotcolor)
	colors={}
	i=1
	for c in allcolor:
		colors[c]=i
		i+=10
	plotcolor2=[colors[c] for c in plotcolor]
	size=[1]*len(plotcolor2)
	# Use a fresh figure so curves from other plot functions are not overlaid.
	plt.figure()
	plt.scatter(plotx,ploty,s=size,c=plotcolor2)
	# x holds contig positions and y reference positions; the labels follow the data.
	plt.xlabel('Contig Position')
	plt.ylabel('Reference Position')
	plt.savefig(outpath+'plot_synteny.pdf')
	plt.close()
	return 0
175 | 
176 | 
177 | 
178 | 
179 | 
180 | 
181 | 
182 | 
183 | 


--------------------------------------------------------------------------------
/denovo_static.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import subprocess
  3 | import pysam
  4 | import sys
  5 | import gzip
  6 | 
  7 | def simple(contigfile,outpath,min_size,min_size_assemblyerror):
  8 | 	if len(contigfile)==2:
  9 | 		halp=True
 10 | 	else:
 11 | 		halp=False
 12 | 	halpnum=1
 13 | 	f=open(outpath+'valid_contig.fa','w')
 14 | 	length=[]
 15 | 	maxlen=0
 16 | 	largecontiglength={}
 17 | 	all_contigs=[]
 18 | 	largecontigs=[]
 19 | 	map_contigs=[]
 20 | 	totallength=0
 21 | 	totallength_large=0
 22 | 	contig_length_info=[]
 23 | 
 24 | 
 25 | 	for contig in contigfile:
 26 | 		if contig.endswith('.gz'):
 27 | 			contig=gzip.open(contig,'rt')
 28 | 		else:
 29 | 			contig=open(contig,'r')
 30 | 		allcontig=contig.read().split('>')[1:]
 31 | 		for c in allcontig:
 32 | 			c=c.split('\n')[:-1]
 33 | 			contig_name=c[0].split(' ')[0]
 34 | 			if halp:
 35 | 				contig_name='HAP_'+str(halpnum)+'_'+contig_name
 36 | 			all_contigs+=[contig_name]
 37 | 			seq=''
 38 | 			length1=0
 39 | 			for cc in c[1:]:
 40 | 				seq+=cc
 41 | 				length1+=len(cc)
 42 | 			length+=[length1]
 43 | 			contig_length_info+=[contig_name+'\t'+str(length1)]
 44 | 			if length1>maxlen:
 45 | 				maxlen=length1
 46 | 				maxcontig=contig_name
 47 | 			if length1>=min_size:
 48 | 				f.write('>'+contig_name+'\n'+seq+'\n')
 49 | 				totallength+=length1
 50 | 				map_contigs+=[contig_name]
 51 | 			if length1>=min_size_assemblyerror:
 52 | 				largecontigs+=[contig_name]
 53 | 				largecontiglength[contig_name]=length1
 54 | 				totallength_large+=length1
 55 | 
 56 | 		halpnum+=1
 57 | 
 58 | 	f.close()
 59 | 
 60 | 	length.sort(reverse=True)
 61 | 	f=open(outpath+'contig_length_info','w')
 62 | 	for c in contig_length_info:
 63 | 		f.write(c+'\n')
 64 | 	f.close()
 65 | 	f=open(outpath+'summary_statistics','w')
 66 | 	f.write('Statics of contigs:\n')
 67 | 
 68 | 	iii=0
 69 | 	total=sum(length)//2
 70 | 	for c in length:
 71 | 		iii+=c
 72 | 		if iii>=total:
 73 | 			n50=c; break
 74 | 	length_ae=[c for c in length if c > min_size_assemblyerror]
 75 | 
 76 | 
 77 | 	f.write('Number of contigs\t'+str(len(length))+'\n')
 78 | 	f.write('Number of contigs > '+str(min_size)+' bp\t'+str(len(map_contigs))+'\n')
 79 | 	f.write('Number of contigs >'+str(min_size_assemblyerror)+' bp\t'+str(len(length_ae))+'\n')
 80 | 	f.write('Total length\t'+str(sum(length))+'\n')
 81 | 	f.write('Total length of contigs > '+str(min_size)+' bp\t'+str(totallength)+'\n')
 82 | 	f.write('Total length of contigs >'+str(min_size_assemblyerror)+'bp\t'+str(sum(length_ae))+'\n')
 83 | 
 84 | 	if len(length)==0:
 85 | 		logf=open(outpath+'Inspector.log','a')
 86 | 		logf.write('Error: No contigs found. Check if input file is empty or if --min_contig_length is too high.\n')
 87 | 		f.close()
 88 | 		logf.close()
 89 | 		quit()
 90 | 
 91 | 	if len(length_ae)==0:
 92 | 		logf=open(outpath+'Inspector.log','a')
 93 | 		logf.write('Warning: No contigs larger than '+str(min_size_assemblyerror)+'bp. No structural errors will be reported. Check if --min_contig_length_assemblyerror is too high.\n')
 94 | 		logf.close()
 95 | 
 96 | 	f.write('Longest contig\t'+str(length[0])+'\n')
 97 | 	if len(length)>1:
 98 | 		f.write('Second longest contig length\t'+str(length[1])+'\n')
 99 | 	f.write('N50\t'+str(n50)+'\n')
100 | 
101 | 	
102 | 
103 | 	iii=0; total=sum(length)//2; n50=0
104 | 	for c in length:
105 | 		iii+=c
106 | 		if iii>total:
107 | 			n50=c; break
108 | 	f.write('N50 of contigs >1Mbp\t'+str(n50)+'\n\n\n')
109 | 	f.close()
110 | 	return [all_contigs,map_contigs,largecontigs,totallength,totallength_large,maxcontig,maxlen,largecontiglength]
111 | 
112 | 
def _sum_count_file(path):
	"""Return the sum of the integers stored one per line in ``path``."""
	with open(path,'r') as handle:
		return sum([int(line) for line in handle if line.strip()])


def mapping_info_ctg(outpath,largechrom,smallchrom,contiglength,contiglength_large):
	"""Append read-to-contig alignment statistics to ``summary_statistics``.

	The per-contig count files in ``outpath + 'map_depth/'`` are concatenated
	into ``all_*`` files, then mapping rate, split-read rate and depth are
	written, first over all files and then over the ``*_large_*`` files.

	Args:
		outpath: output directory prefix (concatenated directly with file names).
		largechrom: not used by this function.
		smallchrom: not used by this function.
		contiglength: denominator of the overall depth.
		contiglength_large: denominator of the large-contig depth.

	Returns:
		0 when the alignment holds no read at all; otherwise the last depth
		value that was computed (large-contig depth, or the overall depth if
		the large-contig section failed before computing it).
	"""
	depthdir=outpath+'map_depth/'
	f=open(outpath+'summary_statistics','a')
	try:
		f.write('Read to Contig alignment:\n')

		# Placeholder files make sure each "*_large_*" pattern below matches
		# at least one file.
		os.system('touch '+depthdir+'maplength_large_null')
		os.system('touch '+depthdir+'readnum_large_null')
		os.system('touch '+depthdir+'splitread_large_null')

		for prefix in ('maplength','readnum','splitread'):
			os.system('cat '+depthdir+prefix+'_large_* > '+depthdir+'all_'+prefix+'_large')
			os.system('cat '+depthdir+prefix+'_* > '+depthdir+'all_'+prefix+'_total')

		bamfile=pysam.AlignmentFile(outpath+'read_to_contig.bam','rb')
		try:
			unmapped=int(bamfile.unmapped)
		finally:
			bamfile.close()

		mapped=_sum_count_file(depthdir+'all_readnum_total')
		totalread=mapped+unmapped
		if totalread==0:
			logf=open(outpath+'Inspector.log','a')
			logf.write('Warning: No reads found in read_to_contig alignment.\n')
			logf.close()
			return 0
		mapprate=round(10000*float(mapped)/(totalread))/100.0
		f.write('Mapping rate /%\t'+str(mapprate)+'\n')

		splitread=_sum_count_file(depthdir+'all_splitread_total')
		# With reads present but none mapped the ratio is undefined; report 0
		# instead of failing with ZeroDivisionError.
		if mapped:
			splrate=round(10000*float(splitread)/mapped)/100.0
		else:
			splrate=0.0
		f.write('Split-read rate /%\t'+str(splrate)+'\n')

		mappedlen=_sum_count_file(depthdir+'all_maplength_total')
		if contiglength:
			cov=round(10000*float(mappedlen)/contiglength)/10000.0
		else:
			cov=0.0
		f.write('Depth\t'+str(cov)+'\n')

		try:
			mapped=_sum_count_file(depthdir+'all_readnum_large')
			mapprate=round(10000*float(mapped)/(totalread))/100.0
			f.write('Mapping rate in large contigs /%\t'+str(mapprate)+'\n')

			splitread=_sum_count_file(depthdir+'all_splitread_large')
			splrate=round(10000*float(splitread)/mapped)/100.0
			f.write('Split-read rate in large contigs /%\t'+str(splrate)+'\n')

			mappedlen=_sum_count_file(depthdir+'all_maplength_large')
			cov=round(10000*float(mappedlen)/contiglength_large)/10000.0
			f.write('Depth in large conigs\t'+str(cov)+'\n\n\n')
		except Exception:
			# Best effort: e.g. no read mapped to a large contig.
			logf=open(outpath+'Inspector.log','a')
			logf.write('Warning: Failed to characterize read alignment in large contigs. \n')
			logf.close()
	finally:
		# The summary file is closed on every path, including the early return.
		f.close()

	return cov
176 | 
177 | 
178 | 
def sort_sv(a):
	"""Sort key for a tab-separated record: [first column, second column as int]."""
	columns=a.split('\t')
	name=columns[0]
	position=int(columns[1])
	return [name,position]
181 | 
182 | 
def assembly_info_cluster(outpath,min_size,max_size):
	"""Merge the clustered error calls and write ``assembly_errors.bed``.

	The ``del_merged_*``, ``ins_merged_*`` and ``inv_merged_*`` files of
	``outpath + 'ae_merge_workspace/'`` are concatenated, records whose third
	column lies outside ``[min_size, max_size]`` are dropped, and the rest is
	written as Expansion, Collapse and Inversion lines, in that order.

	Returns:
		0
	"""
	workspace=outpath+"ae_merge_workspace/"
	for pattern,merged in (("del_merged_*","deletion-merged"),("ins_merged_*","insertion-merged"),("inv_merged_*","inversion-merged")):
		os.system("cat "+workspace+pattern+" > "+workspace+merged)

	# (merged file, label, end is start+size?, write the Size= column?)
	layout=(
		('deletion-merged','Expansion',True,True),
		('insertion-merged','Collapse',False,True),
		('inversion-merged','Inversion',True,False),
	)
	bed=open(outpath+'assembly_errors.bed','w')
	for merged,label,end_spans_size,has_size_column in layout:
		records=open(workspace+merged,'r').read().split('\n')[:-1]
		kept=[rec for rec in records if min_size<=int(rec.split('\t')[2])<=max_size]
		for rec in kept:
			col=rec.split('\t')
			if end_spans_size:
				end=int(col[1])+int(col[2])
			else:
				end=int(col[1])+1
			line=col[0]+'\t'+col[1]+'\t'+str(end)+'\t'+col[3]+'\t'+label
			if has_size_column:
				line+='\tSize='+col[2]
			bed.write(line+'\t'+col[7]+'\n')
	bed.close()
	return 0
205 | 
206 | 
def assembly_info(outpath):
	"""Merge per-contig error calls into ``assembly_errors.bed`` and count them.

	The ``del-``, ``ins-``, ``dup-`` and ``inv-merged-*`` files of ``outpath``
	are concatenated (the per-contig ``*-info-*`` / ``*-merged-*`` files are
	removed afterwards), the bed file is written and three counts are
	appended to ``summary_statistics``.

	Returns:
		0
	"""
	os.system("cat "+outpath+"del-merged-* > "+outpath+"deletion-merged")
	os.system("cat "+outpath+"ins-merged-* > "+outpath+"insertion-merged")
	os.system("cat "+outpath+"dup-merged-* > "+outpath+"duplication-merged")
	os.system("cat "+outpath+"inv-merged-* > "+outpath+"inversion-merged")
	os.system("rm "+outpath+"*-info-* "+outpath+"*-merged-*")

	with open(outpath+'deletion-merged','r') as handle:
		alldel=handle.read().split('\n')[:-1]
	with open(outpath+'insertion-merged','r') as handle:
		allins=handle.read().split('\n')[:-1]
	with open(outpath+'duplication-merged','r') as handle:
		alldup=handle.read().split('\n')[:-1]
	with open(outpath+'inversion-merged','r') as handle:
		allinv=handle.read().split('\n')[:-1]

	# Duplications are counted together with insertions.
	allins+=alldup
	allsv=alldel+allins
	allsv.sort(key=sort_sv)

	f=open(outpath+'assembly_errors.bed','w')
	for record in allsv:
		col=record.split('\t')
		# Exactly one branch per record.  The second test used to run on the
		# already split list, where 'in' means something different.
		# NOTE(review): records matching neither 'Del' nor 'Ins' (presumably
		# duplications) are counted below but not written - confirm intent.
		if 'Del' in record:
			f.write(col[0]+'\t'+col[1]+'\t'+str(int(col[1])+int(col[2]))+'\t'+col[3]+'\tExpansion\tSize='+col[2]+'\t'+col[6]+'\n')
		elif 'Ins' in record:
			f.write(col[0]+'\t'+col[1]+'\t'+str(int(col[1])+1)+'\t'+col[3]+'\tCollapse\tSize='+col[2]+'\t'+col[6]+'\n')
	for record in allinv:
		col=record.split('\t')
		f.write(col[0]+'\t'+col[1]+'\t'+str(int(col[1])+int(col[2]))+'\t'+col[3]+'\tInversion\t'+col[6]+'\n')
	f.close()

	f=open(outpath+'summary_statistics','a')
	f.write('Number of assembly collapse\t'+str(len(allins))+'\n')
	f.write('Number of assembly expansion\t'+str(len(alldel))+'\n')
	f.write('Number of assembly inversion\t'+str(len(allinv))+'\n')
	f.close()
	return 0
243 | 
def assembly_info_ref(outpath):
	"""Merge contig-to-reference calls into ``structural_errors_ref.bed``.

	The ``del-``, ``ins-``, ``dup-`` and ``inv-merged-*`` files of ``outpath``
	are concatenated into ``*_ref`` files (the per-contig ``*-info-*`` /
	``*-merged-*`` files are removed afterwards).  Insertions and duplications
	are written as Expansion, deletions as Collapse, and the counts are
	appended to ``summary_statistics``.

	Returns:
		0
	"""
	os.system("cat "+outpath+"del-merged-* > "+outpath+"deletion-merged_ref")
	os.system("cat "+outpath+"ins-merged-* > "+outpath+"insertion-merged_ref")
	os.system("cat "+outpath+"dup-merged-* > "+outpath+"duplication-merged_ref")
	os.system("cat "+outpath+"inv-merged-* > "+outpath+"inversion-merged_ref")
	os.system("rm "+outpath+"*-info-* "+outpath+"*-merged-*")

	with open(outpath+'deletion-merged_ref','r') as handle:
		alldel=handle.read().split('\n')[:-1]
	with open(outpath+'insertion-merged_ref','r') as handle:
		allins=handle.read().split('\n')[:-1]
	with open(outpath+'duplication-merged_ref','r') as handle:
		alldup=handle.read().split('\n')[:-1]
	with open(outpath+'inversion-merged_ref','r') as handle:
		allinv=handle.read().split('\n')[:-1]

	allins+=alldup
	allsv=alldel+allins
	allsv.sort(key=sort_sv)

	f=open(outpath+'structural_errors_ref.bed','w')
	for record in allsv:
		col=record.split('\t')
		# Exactly one branch per record.  The second test used to run on the
		# already split list, where 'in' means something different.
		if 'Ins' in record or 'Dup' in record:
			f.write(col[0]+'\t'+col[1]+'\t'+str(int(col[1])+int(col[2]))+'\tExpansion\n')
		elif 'Del' in record:
			f.write(col[0]+'\t'+col[1]+'\t'+str(int(col[1])+1)+'\tCollapse\tSize='+col[2]+'\n')
	for record in allinv:
		col=record.split('\t')
		f.write(col[0]+'\t'+col[1]+'\t'+str(int(col[1])+int(col[2]))+'\tInversion\n')
	f.close()

	f=open(outpath+'summary_statistics','a')
	f.write('Assembly errors from contig to reference:\n')
	f.write('Number of assembly collapse\t'+str(len(alldel))+'\n')
	f.write('Number of assembly expansion\t'+str(len(allins))+'\n')
	f.write('Number of assembly inversion\t'+str(len(allinv))+'\n\n\n')
	f.close()
	#os.system("rm "+outpath+"*ion-merged")
	return 0
282 | 
283 | 
def basepair_error(outpath,evaluated_bases=100000):
	"""Summarise ``assembly_basepair_error.vcf`` into ``summary_statistics``.

	Records are classified by the ``TYPE=snp`` / ``TYPE=ins`` / ``TYPE=del`` /
	``TYPE=mnp`` tag found in the line; the number of affected bases is taken
	from the lengths of the 4th and 5th columns.

	Args:
		outpath: output directory prefix (concatenated directly with file names).
		evaluated_bases: denominator of the accuracy estimate.  The default
			keeps the value that used to be hard-coded.

	Returns:
		0
	"""
	mismatch=0
	dels=0
	ins=0
	snp=0
	mnp=0
	with open(outpath+'assembly_basepair_error.vcf','r') as vcf:
		for a in vcf:
			if a[0]=='#':
				continue
			col=a.split('\t')
			if 'TYPE=snp' in a:
				mismatch+=1
				snp+=1
			if 'TYPE=ins' in a:
				mismatch+=len(col[4])-len(col[3])
				ins+=1
			if 'TYPE=del' in a:
				mismatch+=len(col[3])-len(col[4])
				dels+=1
			if 'TYPE=mnp' in a:
				mismatch+=len(col[4])
				mnp+=1
	accuracy=1-mismatch/float(evaluated_bases)
	# Context managers close (and flush) both files; neither used to be closed.
	with open(outpath+'summary_statistics','a') as summary:
		summary.write('Number of small collapse\t'+str(ins)+'\n')
		summary.write('Number of small expansion\t'+str(dels)+'\n')
		summary.write('Number of single basepair error\t'+str(snp)+'\n')
		summary.write('Number of multiple basepair error\t'+str(mnp)+'\n')
		summary.write('Base pair accuracy\t'+str(accuracy)+'\n\n\n')
	return 0
316 | 
def basepair_error_ref(outpath,largestchr):
	"""Report small-scale differences of one contig against the reference.

	Scans ``contig_to_ref.sam`` for the records whose first column equals
	``largestchr`` (records with flag 256 or 272 are skipped) and walks their
	CIGAR strings.  ``X`` runs and insertions/deletions of at most 10 bp are
	listed in ``small_scale_error_ref.bed``; the counts and the resulting
	accuracy are appended to ``summary_statistics``.

	Args:
		outpath: output directory prefix (concatenated directly with file names).
		largestchr: value of the first SAM column to be evaluated.

	Returns:
		0
	"""
	mismatch=0
	totallength=0
	ins=0; dels=0; snp=0
	svs=[]
	with open(outpath+'contig_to_ref.sam','r') as samfile:
		for a in samfile:
			if a[0]=='@':
				continue
			col=a.split('\t')
			if col[0]!=largestchr or col[1] in ['256','272']:
				continue
			chrom=col[2]
			pos=int(col[3])
			num=''
			length=0
			for c in col[5]:
				if c in '1234567890':
					num+=c
				elif c in 'SH':
					num=''
				elif c in 'M=':
					length+=int(num); num=''
				elif c == 'X':
					if int(num)==1:
						svs+=[chrom+'\t'+str(pos+length)+'\t'+str(pos+length)+'\tSNP']
					else:
						svs+=[chrom+'\t'+str(pos+length)+'\t'+str(pos+length+int(num)-1)+'\tMNP\tsize='+num]
					length+=int(num); mismatch+=int(num); num=''
					snp+=1
				elif c == 'I':
					if int(num)<=10:
						svs+=[chrom+'\t'+str(pos+length)+'\t'+str(pos+length+1)+'\tExpansion\tsize='+num]
						mismatch+=int(num); ins+=1
					num=''
				elif c == 'D':
					if int(num)<=10:
						svs+=[chrom+'\t'+str(pos+length)+'\t'+str(pos+length+int(num))+'\tCollapse']
						mismatch+=int(num); dels+=1
					length+=int(num)
					num=''
				else:
					# Unsupported operator (or '*'): drop its count so that it
					# cannot be glued onto the next operation.
					num=''
			totallength+=length

	# No aligned base for the requested contig: report 'NA' instead of
	# failing with ZeroDivisionError.
	if totallength:
		accuracy=round((1-mismatch/float(totallength))*10000)/10000.0
	else:
		accuracy='NA'

	with open(outpath+'summary_statistics','a') as summary:
		summary.write('Base pair accuracy of longest contig from contig to reference:\n')
		summary.write('Number of small assembly collapse\t'+str(dels)+'\n')
		summary.write('Number of small assembly extension\t'+str(ins)+'\n')
		summary.write('Number of single basepair error\t'+str(snp)+'\n')
		summary.write('Base pair accuracy\t'+str(accuracy)+'\n\n\n')
	with open(outpath+'small_scale_error_ref.bed','w') as bed:
		for c in svs:
			bed.write(c+'\n')
	return 0
375 | 
def sortblock(a):
	"""Sort key for a coverage block: its first two fields as a list."""
	first=a[0]
	second=a[1]
	return [first,second]
378 | 
def count_ref_coverage(refcoveredall):
	"""Count reference bases covered at depth 1, depth 2 and depth above 2.

	Args:
		refcoveredall: list of ``[chrom, start, end, depth]`` blocks, which
			may overlap.  Where blocks overlap their depths are added.

	Returns:
		Tuple ``(base1, base2, base3)``: number of bases whose summed depth
		is exactly 1, exactly 2 and greater than 2.
	"""
	# Sweep over block boundaries: every block adds its depth at its start and
	# removes it at its end.  This gives the same totals as repeatedly
	# splitting overlapping pairs, without re-sorting after every split.
	events_by_chrom={}
	for block in refcoveredall:
		events=events_by_chrom.setdefault(block[0],[])
		events.append((block[1],block[3]))
		events.append((block[2],-block[3]))

	base1=0
	base2=0
	base3=0
	for events in events_by_chrom.values():
		events.sort()
		depth=0
		lastpos=None
		for pos,delta in events:
			# `depth` is the coverage of the half-open span [lastpos, pos).
			if lastpos is not None and pos>lastpos:
				span=pos-lastpos
				if depth==1:
					base1+=span
				elif depth==2:
					base2+=span
				elif depth>2:
					base3+=span
			depth+=delta
			lastpos=pos
	return (base1,base2,base3)
413 | 	
414 | 
def get_ref_align_info(path,totallength):
	"""Append reference-based coverage statistics to ``summary_statistics``.

	Reads ``contig_to_ref.sam`` from ``path``.  Every record whose flag is not
	4 contributes one depth-1 block on the reference; records with flag 0,
	2048, 16 or 2064 also contribute their aligned query length to the NA50
	and to the mapped-length ratio.

	Args:
		path: output directory prefix (concatenated directly with file names).
		totallength: denominator of the mapped-length ratio.

	Returns:
		List of the reference sequence names found in the alignment header.
	"""
	samfile=pysam.AlignmentFile(path+'contig_to_ref.sam','r')
	maplen=[]
	refcovered=[]
	for c in samfile.fetch():
		if c.flag==4:
			continue
		readlen=c.query_alignment_length
		refcovered+=[[c.reference_name,c.reference_start,c.reference_end,1]]
		# The clip is read from the first CIGAR operation for flags 0/2048 and
		# from the last one for flags 16/2064.
		if c.flag in [0,2048]:
			clipinfo=c.cigartuples[0]
		elif c.flag in [16,2064]:
			clipinfo=c.cigartuples[-1]
		else:
			continue
		# CIGAR operation codes 4 and 5 are soft and hard clips.
		leftclip = clipinfo[1] if clipinfo[0]==5 or clipinfo[0]==4 else 0
		maplen+=[[c.query_name,leftclip,leftclip+readlen]]

	# Header data is read before the alignment file is closed; the handle
	# used to stay open because its variable was reused for the summary file.
	totalrefbase=sum(samfile.lengths)
	allrefchrom=list(samfile.references)
	samfile.close()

	n50info=[c[2]-c[1] for c in maplen]
	n50info.sort(reverse=True)
	lenacc=0
	na50=0
	info=sum(n50info)
	for c in n50info:
		lenacc+=c
		if lenacc>=0.5*info:
			na50=c;break

	assembly_maplenratio=float(info)/totallength
	(base1,base2,base3)=count_ref_coverage(refcovered)

	base0=totalrefbase-base1-base2-base3
	f=open(path+'summary_statistics','a')
	f.write('\n\n\nReference-based mode:\n')
	# NOTE(review): the two values labelled '/%' (genome coverage and mapping
	# ratio) are written as fractions, unlike the Depth lines - confirm.
	f.write('Genome Coverage /% '+str(float(base1+base2+base3)/totalrefbase)+'\nReference base with Depth=0 (including Ns): '+str(base0)+';\t'+str(base0/float(totalrefbase)*100)+'%\n')
	f.write('Reference base with Depth=1 '+str(base1)+';\t'+str(base1/float(totalrefbase)*100)+'%\n')
	f.write('Reference base with Depth=2 '+str(base2)+';\t'+str(base2/float(totalrefbase)*100)+'%\n')
	f.write('Reference base with Depth>2 '+str(base3)+';\t'+str(base3/float(totalrefbase)*100)+'%\n')
	f.write('Assembly contig mapping ratio (length) /%'+str(assembly_maplenratio)+'\n')
	f.write('Assembly contig NA50 '+str(na50)+'\n')
	f.close()

	return allrefchrom
463 | 
464 | 
def get_ref_chroms(outpath):
	"""Collect the reference sequences listed in the header of ``contig_to_ref.sam``.

	Only the leading block of ``@`` lines is read.  In every ``@SQ`` line the
	second column is expected to hold the name and the third the length.
	The number of lines of ``contig_to_ref.depth`` whose third column is 0 is
	subtracted from the total length.

	Args:
		outpath: output directory prefix (concatenated directly with file names).

	Returns:
		Tuple ``(chroms, length, longestchr, longestlen, covered)``.
	"""
	chroms=[]
	length=0
	longestlen=0
	longestchr=''
	with open(outpath+'contig_to_ref.sam','r') as samfile:
		# Iterating stops cleanly at end of file; indexing the empty string
		# returned at EOF used to raise IndexError for header-only files.
		for a in samfile:
			if not a.startswith('@'):
				break
			if a[:3]=='@SQ':
				col=a.rstrip('\n').split('\t')
				# Split on the first ':' only so that names containing ':'
				# are kept whole.
				chromname=col[1].split(':',1)[1]
				chromlen=int(col[2].split(':',1)[1])
				chroms+=[chromname]
				length+=chromlen
				if chromlen>longestlen:
					longestlen=chromlen
					longestchr=chromname
	uncovered=int(subprocess.check_output("awk \'$3==0\' "+outpath+'contig_to_ref.depth | wc -l',shell=True))
	covered=length-uncovered
	return (chroms,length,longestchr,longestlen,covered)
483 | 
484 | 
def check_depth_ref(outpath,ref):
	"""Append a depth histogram of ``contig_to_ref.depth`` to ``summary_statistics``.

	Lines of the depth file are counted by the value of their third column
	(0, 1, 2, greater than 2) and written as counts and percentages.

	Args:
		outpath: output directory prefix (concatenated directly with file names).
		ref: not used by this function.

	Returns:
		0
	"""
	depthfile=outpath+'contig_to_ref.depth'
	cov0=int(subprocess.check_output("awk \'$3==0\' "+depthfile+' | wc -l',shell=True))
	cov1=int(subprocess.check_output("awk \'$3==1\' "+depthfile+' | wc -l',shell=True))
	cov2=int(subprocess.check_output("awk \'$3==2\' "+depthfile+' | wc -l',shell=True))
	cov3=int(subprocess.check_output("awk \'$3>2\' "+depthfile+' | wc -l',shell=True))

	total=cov0+cov1+cov2+cov3

	def percent(count):
		# An empty depth file gives total == 0; report 0.0 rather than
		# failing with ZeroDivisionError.
		if total==0:
			return 0.0
		return count*100.00/total

	if total==0:
		coverage=0.0
	else:
		coverage=1-round(10000*float(cov0)/total)/10000.0

	with open(outpath+'summary_statistics','a') as f:
		f.write('#BP with cov=0   '+str(cov0)+',  '+str(percent(cov0))+'\n')
		f.write('#BP with cov=1   '+str(cov1)+',  '+str(percent(cov1))+'\n')
		f.write('#BP with cov=2   '+str(cov2)+',  '+str(percent(cov2))+'\n')
		f.write('#BP with cov>2   '+str(cov3)+',  '+str(percent(cov3))+'\n')
		f.write('Coverage:  '+str(coverage)+'\n')

	return 0
502 | 
503 | 


--------------------------------------------------------------------------------
/inspector-correct.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | import argparse
  3 | import multiprocessing
  4 | import sys
  5 | import denovo_correct as inspector_correct
  6 | import os
  7 | from datetime import datetime
  8 | import time 
  9 | 
 10 | 
 11 | t0=time.time()
 12 | parser=argparse.ArgumentParser(description='Assembly error correction based on Inspector assembly evaluation', usage='inspector-correct.py [-h] -i inspector_out/ --datatype pacbio-raw ')
 13 | parser.add_argument('-v','--version', action='version', version='Inspector_correct_v1.0')
 14 | parser.add_argument('-i','--inspector',type=str,default=False,help='Inspector evaluation directory. Original file names are required.',required=True)
 15 | parser.add_argument('--datatype',type=str,default=False,help='Type of read used for Inspector evaluation. This option is required for structural error correction when performing local assembly with Flye. (pacbio-raw, pacbio-hifi, nano-raw,pacbio-corr, nano-corr)',required=True)
 16 | parser.add_argument('-o','--outpath',type=str,default=False,help='output directory')
 17 | parser.add_argument('--flyetimeout',type=int,default=1200,help='Maximal runtime for local assembly with Flye. Unit is second. [1200]')
 18 | parser.add_argument('--skip_structural',action='store_true',default=False,help='Do not correct structural errors. Local assembly will not be performed.')
 19 | parser.add_argument('--skip_baseerror',action='store_true',default=False,help='Do not correct base errors.')
 20 | parser.add_argument('-t','--thread',type=int,default=8,help='number of threads')
 21 | 
 22 | if len(sys.argv)==1:
 23 | 	parser.print_help()
 24 | 	sys.exit(1)
 25 | 
 26 | inscor_args=parser.parse_args()
 27 | if inscor_args.inspector[-1]!='/':
 28 | 	readpath=inscor_args.inspector+'/'
 29 | else:
 30 | 	readpath=inscor_args.inspector
 31 | if not inscor_args.outpath:
 32 | 	outpath=readpath
 33 | else:
 34 | 	if inscor_args.outpath[-1]!='/':
 35 | 		outpath=inscor_args.outpath+'/'
 36 | 	else:
 37 | 		outpath=inscor_args.outpath
 38 | if not os.path.exists(outpath):
 39 | 	os.mkdir(outpath)
 40 | 
 41 | 
 42 | logf=open(outpath+'Inspector_correct.log','a')
 43 | logf.write('Inspector assembly error correction starting... '+datetime.now().strftime("%d/%m/%Y %H:%M:%S")+'\n')
 44 | 
 45 | if not inscor_args.skip_structural and not inscor_args.datatype:
 46 | 	logf.write('Error:  No data type (--datatype) given!\nFor Debreak usage, use -h or --help\n')
 47 | 	sys.exit(1)
 48 | 
 49 | if inscor_args.datatype not in ['pacbio-raw','pacbio-hifi', 'pacbio-corr', 'nano-raw','nano-corr']:
 50 | 	logf.write('Error:  Data type (--datatype) not valid. Supported read types are: pacbio-raw, pacbio-hifi, pacbio-corr, nano-raw, nano-corr.\n')
 51 | 	sys.exit(1)
 52 | 
 53 | 
 54 | t1=time.time()
 55 | logf.write('TIME for validating parameter'+str(t1-t0)+'\n')
 56 | 
 57 | try:
 58 | 	allctg=open(readpath+'valid_contig.fa','r').read().split('>')[1:]
 59 | except:
 60 | 	logf.write('Error: Contig file not valid. Please keep original file name in the inspector output directory.\nCheck if file is valid: '+readpath+'valid_contig.fa\n')
 61 | 	sys.exit(1)
 62 | ctginfo={}
 63 | for c in allctg:
 64 | 	ctginfo[c.split('\n')[0]]=c.split('\n')[1]
 65 | 
 66 | 
 67 | t2=time.time()
 68 | logf.write('TIME for reading contig and length'+str(t2-t1)+'\n')
 69 | newsnplist=[]
 70 | 
 71 | if not inscor_args.skip_baseerror:
 72 | 	try:
 73 | 		allsnplist=open(readpath+'small_scale_error.bed','r').read().split('\n')[1:-1]
 74 | 	except:
 75 | 		logf.write('Warning: small-scale eror bed file not found. Check file \'small_scale_error.bed\' in Inspector evaluation directory. Continue without small-scale error correction.\n')
 76 | 		allsnplist=[]
 77 | else:
 78 | 	allsnplist=[]
 79 | 
 80 | snpctg={}
 81 | 
 82 | if not inscor_args.skip_structural:
 83 | 	os.system('mkdir '+outpath+'assemble_workspace/')
 84 | 	try:
 85 | 		allaelist=open(readpath+'structural_error.bed','r').read().split('\n')[1:-1]
 86 | 	except:
 87 | 		logf.write('Warning: structural eror bed file not found. Check file \'structural_error.bed\' in Inspector evaluation directory. Continue without structural error correction.\n')
 88 | 		allaelist=[]
 89 | else:
 90 | 	allaelist=[]
 91 | 
 92 | snpctg={}
 93 | aectg={}
 94 | for ctgname in ctginfo:
 95 | 	snpctg[ctgname]=[]
 96 | 	aectg[ctgname]=[]
 97 | for aeinfo in allsnplist:
 98 | 	snpctg[aeinfo.split('\t')[0]]+=[aeinfo]
 99 | for aeinfo in allaelist:
100 | 	aectg[aeinfo.split('\t')[0]]+=[aeinfo]
101 | 
102 | allsnplist=[]
103 | allaelist=[]
104 | bamfile=readpath+'read_to_contig.bam'
105 | 
106 | t3=time.time()
107 | logf.write('TIME for reading assembly errors'+str(t3-t2)+'\n')
108 | logf.close()
109 | 
110 | 
111 | for chrominfo in ctginfo:
112 | 	inspector_correct.error_correction_large(chrominfo,ctginfo[chrominfo],aectg[chrominfo],snpctg[chrominfo],bamfile,outpath,inscor_args.datatype,inscor_args.thread//3,inscor_args.flyetimeout)
113 | 
114 | t4=time.time()
115 | logf=open(outpath+'Inspector_correct.log','a')
116 | logf.write('TIME for correcting all contigs'+str(t4-t3)+'\n')
117 | logf.close()
118 | 
119 | f=open(outpath+'contig_corrected.fa','w')
120 | for chrominfo in ctginfo:
121 | 	try:
122 | 		correctedinfo=open(outpath+'contig_corrected_'+chrominfo+'.fa','r').read()
123 | 		f.write(correctedinfo)
124 | 	except:
125 | 		logf=open(outpath+'Inspector_correct.log','a')
126 | 		logf.write('Warning: corrected contig ',chrominfo,'not found.\n')
127 | 		logf.close()
128 | f.close()
129 | t5=time.time()
130 | logf=open(outpath+'Inspector_correct.log','a')
131 | logf.write('TIME for writing corrected contig'+str(t5-t4)+'\n')
132 | os.system('rm '+outpath+'contig_corrected_*fa')
133 | logf.write('Inspector error correction finished. Bye.\n')
134 | logf.close()
135 | 
136 | 
137 | 


--------------------------------------------------------------------------------
/inspector.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | import os
  3 | import argparse
  4 | import denovo_static
  5 | import debreak_detect
  6 | import debreak_merge_clustering as debreak_cluster
  7 | import debreak_merge
  8 | import multiprocessing
  9 | import math 
 10 | import time
 11 | from datetime import datetime
 12 | t0=time.time()
 13 | 
 14 | parser=argparse.ArgumentParser(description='de novo assembly evaluator', usage='inspector.py [-h] -c contig.fa -r raw_reads.fastq -o output_dict/')
 15 | parser.add_argument('--version', action='version', version='Inspector_v1.3.1')
 16 | parser.add_argument('-c','--contig',action='append', dest='contigfile',default=[],help='assembly contigs in FASTA format',required=True)
 17 | parser.add_argument('-r','--read',type=str,default=False,help='sequencing reads in FASTA/FASTQ format',required=True,nargs='+')
 18 | parser.add_argument('-d','--datatype',type=str,default='clr',help='Input read type. (clr, hifi, nanopore) [clr]')
 19 | parser.add_argument('-o','--outpath',type=str,default='./adenovo_evaluation-out/',help='output directory')
 20 | parser.add_argument('--ref',type=str,default=False,help='OPTIONAL reference genome in FASTA format')
 21 | 
 22 | parser.add_argument('-t','--thread',type=int,default=8,help='number of threads. [8]')
 23 | parser.add_argument('--min_depth',type=int,default=False,help='minimal read-alignment depth for a contig base to be considered in QV calculation. [20%% of average depth]')
 24 | parser.add_argument('--min_contig_length',type=int,default=10000,help='minimal length for a contig to be evaluated. [10000]')
 25 | parser.add_argument('--min_contig_length_assemblyerror',type=int,default=1000000,help='minimal contig length for assembly error detection. [1000000]')
 26 | parser.add_argument('--min_assembly_error_size',type=int,default=50,help='minimal size for assembly errors. [50]')
 27 | parser.add_argument('--max_assembly_error_size',type=int,default=4000000,help='maximal size for assembly errors. [4000000]')
 28 | parser.add_argument('--noplot',action='store_true',default=False,help='do not make plots')
 29 | parser.add_argument('--skip_read_mapping',action='store_true',default=False,help='skip the step of mapping reads to contig.')
 30 | parser.add_argument('--skip_structural_error',action='store_true',default=False,help='skip the step of identifying large structural errors.')
 31 | parser.add_argument('--skip_structural_error_detect',action='store_true',default=False,help='skip the step of detecting large structural errors.')
 32 | parser.add_argument('--skip_base_error',action='store_true',default=False,help='skip the step of identifying small-scale errors.')
 33 | parser.add_argument('--skip_base_error_detect',action='store_true',default=False,help='skip the step of detecting small-scale errors from pileup.')
 34 | 
 35 | denovo_args=parser.parse_args()
 36 | 
 37 | if denovo_args.outpath[-1]!='/':
 38 | 	denovo_args.outpath+='/'
 39 | if not os.path.exists(denovo_args.outpath):
 40 | 	os.mkdir(denovo_args.outpath)
 41 | 
 42 | logf=open(denovo_args.outpath+'Inspector.log','a')
 43 | logf.write('Inspector starting... '+datetime.now().strftime("%d/%m/%Y %H:%M:%S")+'\n')
 44 | logf.write("Start Assembly evaluation with contigs: " + str(denovo_args.contigfile)+'\n')
 45 | validate_read=[]
 46 | for inputfile in denovo_args.read:
 47 | 	try:
 48 | 		open(inputfile,'r')
 49 | 		validate_read+=[inputfile]
 50 | 	except:
 51 | 		logf.write('Warning: cannot open input file \"'+inputfile+'\". Removed from list.'+'\n')
 52 | if len(validate_read)==0:
 53 | 	logf.write('Error: No valida input read file. Abort.\n')
 54 | 	quit()
 55 | 
 56 | if denovo_args.datatype not in ['clr','hifi','nanopore']:
 57 | 	logf.write('Warning: Invalid input datatype (--datatype/-d). Should be one of the following: clr, ccs, nanopore. Use clr as default.\n')
 58 | 	denovo_args.datatype='clr'
 59 | 
 60 | # Check input arguments
 61 | if len(denovo_args.contigfile)==1:
 62 | 	singlecontig=True
 63 | elif len(denovo_args.contigfile)==2:
 64 | 	singlecontig=False
 65 | else:
 66 | 	logf.write('Error: Input contig file should be either 1 fasta file or two halploid.fa files. Check input -c/--contig.\n')
 67 | 	quit()
 68 | 
 69 | if not denovo_args.skip_base_error:
 70 | 	import denovo_baseerror
 71 | logf.close()
 72 | 
 73 | # Simple statistics of contigs	
 74 | contiginfo=denovo_static.simple(denovo_args.contigfile,denovo_args.outpath,denovo_args.min_contig_length,denovo_args.min_contig_length_assemblyerror)
 75 | chromosomes=contiginfo[0]
 76 | chromosomes_map=contiginfo[1]
 77 | chromosomes_large=contiginfo[2]
 78 | largecontig_length=contiginfo[7]
 79 | chromosomes_small=[mmm for mmm in chromosomes_map if mmm not in chromosomes_large]
 80 | totalcontiglen=contiginfo[3]
 81 | totalcontiglen_large=contiginfo[4]
 82 | 
 83 | 
 84 | t1=time.time()
 85 | logf=open(denovo_args.outpath+'Inspector.log','a')
 86 | logf.write('TIME: Before read mapping '+str(t1-t0)+'\n')
 87 | logf.close()
 88 | 
 89 | 
 90 | # Read alignment
 91 | if not denovo_args.skip_read_mapping:
 92 | 	inputfileid=1
 93 | 	for inputfile in validate_read:
 94 | 		os.system("minimap2 -a -Q  -N 1 -I 10G -t " + str(denovo_args.thread) + "  "+denovo_args.outpath+"valid_contig.fa " + inputfile + " | samtools sort -@ " + str(denovo_args.thread) + " -o  "+denovo_args.outpath+"read_to_contig_"+str(inputfileid)+".bam")
 95 | 		inputfileid+=1
 96 | 	if len(validate_read)>1:
 97 | 		os.system("samtools merge  "+denovo_args.outpath+"read_to_contig.bam  "+denovo_args.outpath+"read_to_contig_*.bam")
 98 | 		os.system("rm "+str(denovo_args.outpath)+"read_to_contig_*.bam")
 99 | 	else:
100 | 		os.system("mv "+denovo_args.outpath+"read_to_contig_1.bam "+denovo_args.outpath+"read_to_contig.bam")
101 | 	os.system("samtools index "+str(denovo_args.outpath)+"read_to_contig.bam")
102 | t2=time.time()
103 | logf=open(denovo_args.outpath+'Inspector.log','a')
104 | logf.write('TIME: Read Alignment: '+str(t2-t1)+'\n')
105 | logf.close()
106 | 
107 | 
108 | # Structural assembly error detection
109 | if not denovo_args.skip_structural_error_detect:
110 | 	os.system("mkdir "+denovo_args.outpath+"map_depth/")
111 | 	if not denovo_args.skip_structural_error:
112 | 		os.system("mkdir "+denovo_args.outpath+"debreak_workspace/")
113 | 		debreak_det=multiprocessing.Pool(denovo_args.thread)
114 | 		for i in range(len(chromosomes_large)):
115 | 			debreak_det.apply_async(debreak_detect.detect_sortbam,args=(denovo_args.outpath,denovo_args.min_assembly_error_size,denovo_args.max_assembly_error_size,chromosomes_large[i]))
116 | 		for i in range(len(chromosomes_small)):
117 | 			debreak_det.apply_async(debreak_detect.detect_sortbam_nosv,args=(denovo_args.outpath,chromosomes_small[i],'small'))
118 | 		debreak_det.close()
119 | 		debreak_det.join()
120 | 		os.system("cat "+denovo_args.outpath+"debreak_workspace/read_to_contig_*debreak.temp > "+denovo_args.outpath+"read_to_contig.debreak.temp")
121 | 	else:
122 | 		debreak_det=multiprocessing.Pool(denovo_args.thread)
123 | 		for i in range(len(chromosomes_large)):
124 | 			debreak_det.apply_async(debreak_detect.detect_sortbam_nosv,args=(denovo_args.outpath,chromosomes_large[i],'large'))
125 | 		for i in range(len(chromosomes_small)):
126 | 			debreak_det.apply_async(debreak_detect.detect_sortbam_nosv,args=(denovo_args.outpath,chromosomes_small[i],'small'))
127 | 		debreak_det.close()
128 | 		debreak_det.join()
129 | 
130 | 
131 | cov=denovo_static.mapping_info_ctg(denovo_args.outpath,chromosomes_large,chromosomes_small,totalcontiglen,totalcontiglen_large)
132 | minsupp=max(1,round(cov/10.0))
133 | 
134 | t3=time.time()
135 | logf=open(denovo_args.outpath+'Inspector.log','a')
136 | logf.write('TIME: Structural error signal detection: '+str(t3-t2)+'\n')
137 | logf.close()
138 | 
139 | 
140 | aelen_structuralerror=0
141 | if not denovo_args.skip_structural_error:
142 | 	os.system('mkdir '+denovo_args.outpath+'ae_merge_workspace')
143 | 	for chrom in largecontig_length:
144 | 		contiglength=largecontig_length[chrom]
145 | 		debreak_cluster.cluster(denovo_args.outpath,chrom,contiglength,minsupp,cov*2)
146 | 		debreak_cluster.cluster_ins(denovo_args.outpath,chrom,contiglength,minsupp,cov*2,'ins')
147 | 		debreak_cluster.cluster_ins(denovo_args.outpath,chrom,contiglength,minsupp,cov*2,'inv')
148 | 	denovo_static.assembly_info_cluster(denovo_args.outpath,denovo_args.min_assembly_error_size,denovo_args.max_assembly_error_size)
149 | 	debreak_cluster.genotype(cov,denovo_args.outpath)
150 | 	
151 | 	aelen_structuralerror=debreak_cluster.filterae(cov,denovo_args.outpath,denovo_args.min_assembly_error_size,denovo_args.datatype)
152 | 
153 | t4=time.time()
154 | logf=open(denovo_args.outpath+'Inspector.log','a')
155 | logf.write('TIME: Structural error clustering : '+str(t4-t3)+'\n')
156 | logf.close()
157 | 
158 | # SNP & indel detection
159 | aelen_baseerror=0
160 | if not denovo_args.skip_base_error:
161 | 	if not denovo_args.skip_base_error_detect:
162 | 		os.system('samtools faidx '+denovo_args.outpath+'valid_contig.fa')
163 | 		debreak_det=multiprocessing.Pool(denovo_args.thread)
164 | 		os.system('mkdir '+denovo_args.outpath+'base_error_workspace')
165 | 		for chrom in chromosomes_map:
166 | 			debreak_det.apply_async(denovo_baseerror.getsnv,args=(denovo_args.outpath,chrom,cov*2/5,cov*2,denovo_args.min_depth))
167 | 		debreak_det.close()
168 | 		debreak_det.join()
169 | 
170 | 	aelen_baseerror=denovo_baseerror.count_baseerrror(denovo_args.outpath,totalcontiglen,denovo_args.datatype,cov)
171 | 
172 | t5=time.time()
173 | logf=open(denovo_args.outpath+'Inspector.log','a')
174 | logf.write('TIME: Small-scale error detection: '+str(t5-t4)+'\n')
175 | logf.close()
176 | 
177 | #QV
178 | if aelen_structuralerror+aelen_baseerror>0:
179 | 	try:
180 | 		allvalidnum=open(denovo_args.outpath+'base_error_workspace/validbase','r').read().split('\n')[:-1]
181 | 		validctgbase=sum([int(validnum) for validnum in allvalidnum])
182 | 	except:
183 | 		validctgbase=totalcontiglen
184 | 	qv=-10 * math.log10(float(aelen_baseerror+aelen_structuralerror)/validctgbase)
185 | 
186 | 	f=open(denovo_args.outpath+'summary_statistics','a')
187 | 	f.write('\nQV\t'+str(qv)+'\n')
188 | 	f.close()
189 | 
190 | 
191 | t6=time.time()
192 | logf=open(denovo_args.outpath+'Inspector.log','a')
193 | logf.write('TIME: QV calculation: '+str(t6-t5)+'\n')
194 | logf.close()
195 | 
196 | # Reference-based evaluation
197 | if denovo_args.ref:
198 | 	mapinfo=os.system("minimap2 -a -I 10G --eqx -x asm5 -t " + str(denovo_args.thread//2) + " "+denovo_args.ref+" " + denovo_args.outpath + "valid_contig.fa  --secondary=no > "+ denovo_args.outpath+"contig_to_ref.sam")
199 | 	os.system("samtools sort -@ " + str(denovo_args.thread//2) + " "+ denovo_args.outpath+"contig_to_ref.sam -o  " + denovo_args.outpath+"contig_to_ref.bam")
200 | 	os.system("samtools index "+ denovo_args.outpath+"contig_to_ref.bam")
201 | 	chromosomes=denovo_static.get_ref_align_info(denovo_args.outpath,totalcontiglen)
202 | 	mapping_info=debreak_detect.detect_sam_ref("contig_to_ref.sam",denovo_args.outpath,denovo_args.outpath,denovo_args.min_assembly_error_size,denovo_args.max_assembly_error_size)
203 | 	
204 | 	minsupp=1
205 | 
206 | 	allsvsignal=open(denovo_args.outpath+'contig_to_ref.debreak.temp','r').read().split('\n')[:-1]
207 | 	rawdelcalls={}; rawinscalls={};rawdupcalls={};rawinvcalls={}
208 | 	for chrom in chromosomes:
209 | 		rawdelcalls[chrom]=[c.split('\t')[0]+'\t'+c.split('\t')[1]+'\t'+c.split('\t')[2]+'\t'+c.split('\t')[6]+'\t'+c.split('\t')[4] for c in allsvsignal if 'D-' in c and c.split('\t')[0]==chrom]
210 | 		rawinscalls[chrom]=[c.split('\t')[0]+'\t'+c.split('\t')[1]+'\t'+c.split('\t')[2]+'\t'+c.split('\t')[6]+'\t'+c.split('\t')[4] for c in allsvsignal if 'I-' in c and c.split('\t')[0]==chrom]
211 | 		rawdupcalls[chrom]=[c.split('\t')[0]+'\t'+c.split('\t')[1]+'\t'+c.split('\t')[2]+'\t'+c.split('\t')[6]+'\t'+c.split('\t')[4] for c in allsvsignal if 'DUP-' in c and c.split('\t')[0]==chrom]
212 | 		rawinvcalls[chrom]=[c.split('\t')[0]+'\t'+c.split('\t')[1]+'\t'+c.split('\t')[2]+'\t'+c.split('\t')[6]+'\t'+c.split('\t')[4] for c in allsvsignal if 'INV-' in c and c.split('\t')[0]==chrom]
213 | 	for chrom in chromosomes:
214 | 		debreak_merge.merge_insertion(minsupp,0,denovo_args.outpath,rawinscalls[chrom],chrom,'ins',True,)
215 | 		debreak_merge.merge_deletion(minsupp,0,denovo_args.outpath,rawdelcalls[chrom],chrom,'del',True,)
216 | 		debreak_merge.merge_deletion(minsupp,0,denovo_args.outpath,rawdupcalls[chrom],chrom,'dup',True,)
217 | 		debreak_merge.merge_insertion(minsupp,0,denovo_args.outpath,rawinvcalls[chrom],chrom,'inv',True,)
218 | 
219 | 	denovo_static.assembly_info_ref(denovo_args.outpath)
220 | 	
221 | 	denovo_static.basepair_error_ref(denovo_args.outpath,contiginfo[5])
222 | 
223 | t7=time.time()
224 | logf=open(denovo_args.outpath+'Inspector.log','a')
225 | logf.write('TIME: Reference-based mode: '+str(t7-t6)+'\n')
226 | logf.close()
227 | 
228 | # Plots
229 | if not denovo_args.noplot:
230 | 	try:
231 | 		import denovo_plot
232 | 		denovo_plot.plot_n100(denovo_args.outpath,denovo_args.min_contig_length)
233 | 	except:
234 | 		logf=open(denovo_args.outpath+'Inspector.log','a')
235 | 		logf.write('Warning: Failed to plot N1_N100.\n')
236 | 		logf.close()
237 | 	if denovo_args.ref:
238 | 		try:
239 | 			import denovo_plot
240 | 			denovo_plot.plot_na100(denovo_args.outpath)
241 | 			denovo_plot.plot_dotplot(denovo_args.outpath)
242 | 		except:
243 | 			logf=open(denovo_args.outpath+'Inspector.log','a')
244 | 			logf.write('Warning: Failed to plot NA1_NA100 and Dotplots.\n')
245 | 			logf.close()
246 | t8=time.time()
247 | logf=open(denovo_args.outpath+'Inspector.log','a')
248 | logf.write('TIME: Generate plots: '+str(t8-t7)+'\n')
249 | logf.write('Inspector evaluation finished. Bye.\n')
250 | logf.close()
251 | 
252 | 
253 | 


--------------------------------------------------------------------------------
/testdata/read_test.fastq.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ChongLab/Inspector/0e08f882181cc0e0e0fa749cd87fb74a278ea0f0/testdata/read_test.fastq.gz


--------------------------------------------------------------------------------