├── .gitattributes
├── .gitignore
├── .gitmodules
├── AUTHORS
├── COPYING
├── Dockerfile
├── README.md
├── pyproject.toml
├── setup.py
├── tiddit
    ├── DBSCAN.py
    ├── __init__.py
    ├── __main__.py
    ├── graphlib.pyx
    ├── silverfish.pyx
    ├── tiddit_cluster.pyx
    ├── tiddit_contig_analysis.pyx
    ├── tiddit_coverage.pyx
    ├── tiddit_coverage_analysis.pyx
    ├── tiddit_gc.pyx
    ├── tiddit_signal.pyx
    ├── tiddit_stats.py
    ├── tiddit_variant.pyx
    └── tiddit_vcf_header.py
└── versioned_singularity
    ├── README.md
    ├── TIDDIT.2.10.0
    ├── TIDDIT.2.12.0
    ├── TIDDIT.2.12.1
    ├── TIDDIT.2.7.1
    ├── TIDDIT.2.8.0
    └── TIDDIT.2.8.1


/.gitattributes:
--------------------------------------------------------------------------------
1 | TIDDIT.simg filter=lfs diff=lfs merge=lfs -text
2 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | bin/*
2 | build/*
3 | include/*
4 | 


--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "SVDB"]
2 | 	path = SVDB
3 | 	url = https://github.com/SciLifeLab/SVDB.git
4 | 


--------------------------------------------------------------------------------
/AUTHORS:
--------------------------------------------------------------------------------
1 | Francesco Vezzi francesco.vezzi@scilifelab.se
2 | Jesper Eisfeldt jesper.eisfeldt@scilifelab.se
3 | Daniel Nilsson
4 | 


--------------------------------------------------------------------------------
/COPYING:
--------------------------------------------------------------------------------
1 | All the tools distributed with this package are distributed under GNU General Public License version 3.0 (GPLv3).


--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM condaforge/mambaforge:24.9.2-0
 2 | 
 3 | WORKDIR /app
 4 | 
 5 | ## Set TIDDIT version
 6 | ARG TIDDIT_VERSION=3.9.1
 7 | 
 8 | ## Add some info
 9 | LABEL base_image="python:3.8-slim"
10 | LABEL software="TIDDIT.py"
11 | LABEL software.version=${TIDDIT_VERSION}
12 | 
13 | ## Download and extract
14 | RUN conda install conda-forge::unzip
15 | RUN conda install -c conda-forge pip gcc joblib
16 | RUN conda install -c bioconda numpy cython pysam bwa
17 | 
18 | RUN wget https://github.com/SciLifeLab/TIDDIT/archive/TIDDIT-${TIDDIT_VERSION}.zip && \
19 |     unzip TIDDIT-${TIDDIT_VERSION}.zip && \
20 |     rm TIDDIT-${TIDDIT_VERSION}.zip
21 | 
22 | ## Install
23 | RUN cd TIDDIT-TIDDIT-${TIDDIT_VERSION}  && \
24 | 	pip install -e .
25 | 
26 | ENTRYPOINT ["tiddit"]
27 | CMD ["--help"]
28 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | DESCRIPTION
  2 | ==============
  3 | TIDDIT: Is a tool to used to identify  chromosomal rearrangements using Mate Pair or Paired End sequencing data. TIDDIT identifies intra and inter-chromosomal translocations, deletions, tandem-duplications and inversions, using supplementary alignments as well as discordant pairs.
  4 | TIDDIT searches for discordant reads and split reads (supplementary alignments). TIDDIT also performs local assembly using a custom local de novo assembler.
  5 | Next all signals (contigs, split-reads, and discordant pairs) are clustered using DBSCAN. The resulting clusters are filtered and annotated, and reported as SV depending on the statistics.
  6 | TIDDIT has two analysis modules. The sv mode, which is used to search for structural variants. And the cov mode that analyse the read depth of a bam file and generates a coverage report.
  7 | On a 30X human genome, the TIDDIT SV module typically completetes within 5 hours, and requires less than 10Gb ram.
  8 | 
  9 | INSTALLATION
 10 | ==============
 11 | TIDDIT requires python3 (=> 3.8), cython, pysam, and Numpy.
 12 | 
 13 | 
 14 | By default, tiddit will require, bwa, fermi2 and ropebwt2 for local assembly; local assembly may be disabled through the "--skip_assembly" parameter.
 15 | 
 16 | Installation
 17 | 
 18 | Cloning from Git Hub:
 19 | 
 20 | ```
 21 | git clone https://github.com/SciLifeLab/TIDDIT.git
 22 | ```
 23 | 
 24 | To install TIDDIT:
 25 | ```
 26 | cd tiddit
 27 | pip install -e .
 28 | ```
 29 | 
 30 | Next install bwa, I recommend using conda:
 31 | 
 32 | conda install bwa
 33 | 
 34 | You may also compile bwa yourself. Remember to add executables to path, or provide path through the command line parameters.
 35 | ```
 36 | tiddit --help
 37 | tiddit --sv --help
 38 | tiddit --cov --help
 39 | ```
 40 | 
 41 | Optionally, the assembly calling may be turned off using the "--skip_assembly" option.
 42 | 
 43 | TIDDIT may be installed using bioconda:
 44 | ```
 45 | conda install tiddit
 46 | ```
 47 | 
 48 | or using the docker image on biocontainers
 49 | ```
 50 | docker pull quay.io/biocontainers/tiddit:<tag>
 51 | ```
 52 | visit https://quay.io/repository/biocontainers/tiddit?tab=tags for tags.
 53 | 
 54 | 
 55 | The SV module
 56 | =============
 57 | The main TIDDIT module, detects structural variant using discordant pairs, split reads and coverage information
 58 | 
 59 |     python tiddit --sv [Options] --bam in.bam --ref reference.fa
 60 | 
 61 | Where bam is the input bam or cram file. And reference.fasta is the reference fasta used to align the sequencing data: TIDDIT will crash if the reference fasta is different from the one used to align the reads. The reads of the input bam file must be sorted on genome position.
 62 | 
 63 | TIDDIT may be fine-tuned by altering these optional parameters:
 64 | 
 65 | 
 66 | 	-o	output prefix(default=output)
 67 | 	-i	paired reads maximum allowed insert size. Pairs aligning on the same chr at a distance higher than this are considered candidates for SV (default= 99.9th percentile of insert size)
 68 | 	-d	expected reads orientations, possible values "innie" (-> <-) or "outtie" (<- ->). Default: major orientation within the dataset
 69 | 	-p	Minimum number of supporting pairs in order to call a variant (default 3)
 70 | 	-r	Minimum number of supporting split reads to call a variant (default 3)
 71 | 	--threads	Number of threads (default 1)
 72 | 	-q	Minimum mapping quality to consider an alignment (default 5)
 73 | 	-n	the ploidy of the organism,(default = 2)
 74 | 	-e	clustering distance parameter, discordant pairs closer than this distance are considered to belong to the same variant(default = sqrt(insert-size*2)*12)
 75 | 	-c	average coverage, overwrites the estimated average coverage (useful for exome or panel data)
 76 | 	-l	min-pts parameter (default=3),must be set >= 2
 77 | 	-s	Number of reads to sample when computing library statistics(default=25000000)
 78 | 	-z 	minimum variant size (default=50), variants smaller than this will not be printed ( z < 10 is not recomended)
 79 | 	--force_ploidy	force the ploidy to be set to -n across the entire genome (i.e skip coverage normalisation of chromosomes)
 80 | 	 --force_overwrite     force the analysis and overwrite any data in the output folder
 81 | 	--n_mask	exclude regions from coverage calculation if they contain more than this fraction of N (default = 0.5)
 82 | 	--skip_assembly	Skip running local assembly, tiddit will perform worse, but wont require fermi2, bwa, ropebwt and bwa indexed ref
 83 | 	--bwa	path to bwa executable file(default=bwa)
 84 | 	--fermi2	path to fermi2 executable file (default=fermi2)
 85 | 	--ropebwt2	path to ropebwt2 executable file (default=ropebwt2)
 86 | 	--p_ratio	minimum discordant pair/normal pair ratio at the breakpoint junction(default=0.1)
 87 | 	--r_ratio	minimum split read/coverage ratio at the breakpoint junction(default=0.1)
 88 | 	--max_coverage	filter call if X times higher than chromosome average coverage (default=4)
 89 | 	--min_contig	 Skip calling on small contigs (default < 10000 bp)
 90 | 
 91 | 
 92 | 
 93 | output:
 94 | 
 95 | TIDDIT SV module produces two output files, a vcf file containing SV calls, and a tab file dscribing the estimated ploidy and coverage across each contig.
 96 |          
 97 |                                 
 98 | The cov module
 99 | ==============
100 | Computes the coverge of different regions of the bam file
101 | 
102 |     python TIDDIT.py --cov [Options] --bam bam
103 |     
104 | optional parameters:
105 | 
106 |     -o - the prefix of the output files
107 |     -z - compute the coverage within bins of a specified size across the entire genome, default bin size is 500
108 |     -w - generate a wig file instead of bed
109 |  --ref - reference sequence (fasta), required for reading cram file.
110 | 
111 | Filters
112 | =============
113 | TIDDIT uses four different filters to detect low quality calls. The filter field of variants passing these tests are set to "PASS". If a variant fail any of these tests, the filter field is set to the filter failing that variant. These are the four filters empoyed by TIDDIT:
114 | 
115 |     Expectedlinks
116 | 	Less than <p_ratio> fraction of the spanning pairs or <r_ratio> fraction reads support the variant
117 |     FewLinks
118 |         The number of discordant pairs supporting the variant is too low compared to the number of discordant pairs within that genomic region.
119 |     Unexpectedcoverage
120 |         High coverage
121 | 
122 | Failed Variants may be removed using tools such as VCFtools or grep. Removing these variants greatly improves the precision of TIDDIT, but may reduce the sensitivity. It is adviced to remove filtered variants or prioritize the variants that have passed the quality checks.
123 | This command may be usedto filter the TIDDIT vcf:
124 | 
125 | 	grep -E "#|PASS" input.vcf > output.filtered.vcf
126 | 
127 | Quality column
128 | =============
129 | The scores in the quality column are calculated using non parametric sampling: 1000 points/genomic positions are sampled across each chromosome. And the number of read-pairs and reads spanning these points are counted.
130 | The variant support of each call is compared to these values, and the quality column is set to he lowest percentile higher than the (variant support*ploidy).
131 | 
132 | Note: SVs usually occur in repetetive regions, hence these scores are expected to be relatively low. A true variant may have a low score, and the score itself depends on the input data (mate-pair vs pe for instance).
133 | 
134 | Merging the vcf files
135 | =====================
136 | I usually merge vcf files using SVDB (https://github.com/J35P312)
137 | 
138 | svdb --merge --vcf file1.vcf file2.vcf --bnd_distance 500 --overlap 0.6 > merged.vcf
139 | 
140 | Merging of vcf files could be useful for tumor-normal analysis or for analysing a pedigree. But also to combine the output of multiple callers.
141 | 
142 | Tumor normal example
143 | ===================
144 | 
145 | run the tumor sample using a lower ratio treshold (to allow for subclonal events, and to account for low purity)
146 | 
147 | python TIDDIT.py --sv --p_ratio 0.10 --bam tumor.bam -o tumor --ref reference.fasta
148 | grep -E "#|PASS" tumor.vcf > tumor.pass.vcf
149 | 
150 | run the normal sample
151 | 
152 | python TIDDIT.py --sv --bam normal.bam -o normal --ref reference.fasta
153 | grep -E "#|PASS" normal.vcf > normal.pass.vcf
154 | 
155 | merge files:
156 | 
157 | svdb --merge --vcf tumor.pass.vcf normal.pass.vcf --bnd_distance 500 --overlap 0.6 > Tumor_normal.vcf
158 | 
159 | The output vcf should be filtered further and annotated (using a local-frequency database for instance)
160 | 
161 | Annotation
162 | ==========
163 | genes may be annotated using vep or snpeff. NIRVANA may be used for annotating CNVs, and SVDB may be used as a frequency database
164 | 
165 | Algorithm
166 | =========
167 | 
168 | Discordant pairs, split reads (supplementary alignments), and contigs are extracted. A discordant pair is any pair having a larger insert size than the  -i paramater, or a pair where the reads map to different chromosomes.
169 | supplementary alignments and discordant pairs are only extracted if their mapping quality exceed the -q parameter. Contigs are generated by assembling all reads with supplementary alignment using fermi2
170 | 
171 | The most recent version of TIDDIT uses an algorithm similar to DBSCAN: A cluster is formed if -l or more signals are located within the -e distance. Once a cluster is formed, more signals may be added if these signals are within the
172 | -e distance of -l signals within a cluster.
173 | 
174 | A cluster is rejected if it contains less than -r plus -p signals. If the cluster is rejected, it will not be printed to the vcf file.
175 | 
176 | If the cluster is not rejected, it will be printed to file, even if it fails any quality filter. 
177 | 
178 | The sensitivity and precision may be controlled using the -q,r,p, and -l parameters. 
179 | 
180 | LICENSE
181 | ==============
182 | All the tools distributed with this package are distributed under GNU General Public License version 3.0 (GPLv3). 
183 | 
184 | 
185 | 
186 | 


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = ["setuptools","pysam","numpy","cython","joblib"]
3 | build-backend = "setuptools.build_meta"
4 | 
5 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | from setuptools import setup
 2 | import numpy
 3 | import pyximport
 4 | import pysam
 5 | pyximport.install()
 6 | 
 7 | 
 8 | try:
 9 |     from Cython.Build import cythonize
10 |     has_cython = True
11 | except ImportError:
12 |     has_cython = False
13 | 
14 | if has_cython:
15 |     ext_modules = cythonize([
16 |         "tiddit/tiddit_signal.pyx",
17 |         "tiddit/tiddit_coverage.pyx",
18 |         "tiddit/tiddit_cluster.pyx",
19 |         "tiddit/tiddit_coverage_analysis.pyx",
20 |         "tiddit/tiddit_gc.pyx",
21 |         "tiddit/silverfish.pyx",
22 |         "tiddit/graphlib.pyx",
23 |         "tiddit/tiddit_variant.pyx",
24 |         "tiddit/tiddit_contig_analysis.pyx"])
25 | else:
26 |     ext_modules = []
27 | 
28 | setup(
29 |     name = 'tiddit',
30 |     version = '3.9.3',
31 | 
32 | 
33 |     url = "https://github.com/SciLifeLab/TIDDIT",
34 |     author = "Jesper Eisfeldt",
35 |     author_email= "jesper.eisfeldt@scilifelab.se",
36 |     ext_modules = ext_modules,
37 |     extra_link_args=pysam.get_libraries(),
38 |     define_macros=pysam.get_defines(),
39 |     include_dirs=[numpy.get_include()]+pysam.get_include(),
40 |     packages = ['tiddit'],
41 |     install_requires = ['numpy','pysam','joblib'],
42 |     entry_points = {'console_scripts': ['tiddit = tiddit.__main__:main']},
43 | 
44 | )
45 | 


--------------------------------------------------------------------------------
/tiddit/DBSCAN.py:
--------------------------------------------------------------------------------
  1 | import numpy
  2 | #The functions of this script perform DBSCAN and returns the variatn clusters
  3 | 
  4 | #Call the cluser algorithm, the statistics function, and returns the final cluster
  5 | def generate_clusters(chrA,chrB,coordinates,library_stats,args):
  6 | 	candidates=[]
  7 | 	coordinates=coordinates[numpy.lexsort((coordinates[:,1],coordinates[:,0]))]
  8 | 	min_pts=args.l
  9 | 	if chrA == chrB and library_stats["ploidies"][chrA] > args.n*2:
 10 | 		min_pts=int(round(args.l/float(args.n)*library_stats["ploidies"][chrA]))
 11 | 
 12 | 	db=main(coordinates[:,0:2],args.e,min_pts)
 13 | 	unique_labels = set(db)
 14 | 
 15 | 	for var in unique_labels:
 16 | 		if var == -1:
 17 | 			continue
 18 | 		class_member_mask = (db == var)
 19 | 		candidate_signals =coordinates[class_member_mask]
 20 | 		resolution=candidate_signals[:,-2]
 21 | 		support=len(set(candidate_signals[:,-1]))
 22 | 		discordants=True
 23 | 		if len(set(resolution)) == 1 and max(resolution) == 1:
 24 | 			disordants=False
 25 | 
 26 | 		if discordants and support >= args.p:
 27 | 			candidates.append( analyse_pos(candidate_signals,discordants,library_stats,args) )
 28 | 		elif not discordants and support >= args.r and chrA == chrB:
 29 | 			candidates.append( analyse_pos(candidate_signals,discordants,library_stats,args) )
 30 | 
 31 | 	return(candidates)
 32 | 
 33 | def x_coordinate_clustering(data,epsilon,m):
 34 | 	clusters=numpy.zeros(len(data))
 35 | 	for i in range(0,len(clusters)):
 36 | 		clusters[i]=-1
 37 | 	cluster_id=-1
 38 | 	cluster=False
 39 | 
 40 | 	for i in range(0,len(data)-m+1):
 41 | 
 42 | 		distances=[]
 43 | 		current=data[i,:]
 44 | 		points=data[i+1:i+m+1,:]
 45 | 
 46 | 		#print points
 47 | 		distances=[]
 48 | 		for point in points:
 49 | 			distances.append(abs(point[0]-current[0]))
 50 | 
 51 | 		if max(distances) < epsilon:
 52 | 			#add to the cluster
 53 | 			if cluster:
 54 | 				clusters[i+m-1]=cluster_id
 55 | 				#define a new cluster
 56 | 			else:
 57 | 				cluster_id+=1
 58 | 				cluster=True
 59 | 				for j in range(i,i+m):
 60 | 					clusters[j]=cluster_id
 61 | 		else:
 62 | 			cluster=False
 63 | 
 64 | 	return(clusters,cluster_id)
 65 | 
 66 | def y_coordinate_clustering(data,epsilon,m,cluster_id,clusters):
 67 | 
 68 | 	cluster_id_list=set(clusters)
 69 | 	for cluster in cluster_id_list:
 70 | 		if cluster == -1:
 71 | 			continue
 72 | 		class_member_mask = (clusters == cluster)
 73 | 		indexes=numpy.where(class_member_mask)[0]
 74 | 		signals=data[class_member_mask]
 75 | 
 76 | 		y_coordinates=[]
 77 | 
 78 | 
 79 | 		for i  in range(0,len(signals)):
 80 | 			y_coordinates.append([signals[i][1],indexes[i]])
 81 | 		y_coordinates.sort(key=lambda x:x[0])
 82 | 		
 83 | 		sub_clusters=numpy.zeros(len(indexes))
 84 | 		for i in range(0,len(sub_clusters)):
 85 | 			sub_clusters[i]=-1
 86 | 
 87 | 		active_cluster=False
 88 | 		sub_cluster_id=0
 89 | 		y_coordinates=numpy.array(y_coordinates)
 90 | 		for i in range(0,len(y_coordinates)-m+1):
 91 | 			distances=[]
 92 | 			current=y_coordinates[i,:]
 93 | 			next=y_coordinates[i+1:i+m,:]
 94 | 
 95 | 			distances=[]
 96 | 			for pos in next:
 97 | 				distances.append(abs(pos[0]-current[0]))	
 98 | 
 99 | 			if max(distances) < epsilon:
100 | 				#add to the cluster
101 | 				if active_cluster:
102 | 					sub_clusters[i+m-1]=sub_cluster_id
103 | 					#define a new cluster
104 | 				else:
105 | 					sub_cluster_id+=1
106 | 					active_cluster=True
107 | 					for j in range(i,i+m):
108 | 						sub_clusters[j]=sub_cluster_id
109 | 			else:
110 | 				active_cluster=False
111 | 
112 | 		for i in range(0,len(sub_clusters)):
113 | 			if sub_clusters[i] == 1:
114 | 				clusters[ y_coordinates[i][1] ]	= cluster
115 | 
116 | 			elif sub_clusters[i] > -1:
117 | 				clusters[ y_coordinates[i][1] ]=sub_clusters[i] +cluster_id-1
118 | 			elif sub_clusters[i] == -1:
119 | 				clusters[ y_coordinates[i][1] ] = -1
120 | 
121 | 		if sub_cluster_id > 1:
122 | 			cluster_id += sub_cluster_id-1
123 | 	return(clusters,cluster_id)
124 | 
125 | def main(data,epsilon,m):
126 | 	clusters,cluster_id=x_coordinate_clustering(data,epsilon,m)
127 | 	clusters,cluster_id=y_coordinate_clustering(data,epsilon,m,cluster_id,clusters)
128 | 
129 | 	return(clusters)
130 | 
131 | 
132 | #hej=numpy.array([[1,2],[1,2],[1,2],[10,11]])
133 | #print(main(hej,0.1,2))
134 | #print(hej)
135 | 


--------------------------------------------------------------------------------
/tiddit/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SciLifeLab/TIDDIT/eb97e0ec34494edaf6688d04d4533c72f8f5fd59/tiddit/__init__.py


--------------------------------------------------------------------------------
/tiddit/__main__.py:
--------------------------------------------------------------------------------
  1 | import sys
  2 | import json
  3 | import argparse
  4 | import time
  5 | import pysam
  6 | import os
  7 | import shutil
  8 | import glob
  9 | 
 10 | import tiddit.tiddit_stats as tiddit_stats
 11 | import tiddit.tiddit_signal as tiddit_signal
 12 | import tiddit.tiddit_vcf_header as tiddit_vcf_header
 13 | import tiddit.tiddit_coverage_analysis as tiddit_coverage_analysis
 14 | import tiddit.tiddit_coverage as tiddit_coverage
 15 | import tiddit.tiddit_cluster as tiddit_cluster
 16 | import tiddit.tiddit_variant as tiddit_variant
 17 | import tiddit.tiddit_contig_analysis as tiddit_contig_analysis
 18 | import tiddit.tiddit_gc as tiddit_gc
 19 | 
 20 | def main():
 21 | 	version="3.9.3"
 22 | 	parser = argparse.ArgumentParser("""tiddit-{}""".format(version),add_help=False)
 23 | 	parser.add_argument("--sv"	 , help="call structural variation", required=False, action="store_true")
 24 | 	parser.add_argument("--cov"        , help="generate a coverage bed file", required=False, action="store_true")
 25 | 	args, unknown = parser.parse_known_args()
 26 | 
 27 | 	if args.sv == True:
 28 | 
 29 | 		parser = argparse.ArgumentParser("""tiddit --sv --bam inputfile [-o prefix] --ref ref.fasta""")
 30 | 		#required parameters
 31 | 		parser.add_argument('--sv'       , help="call structural variation", required=False, action="store_true")
 32 | 		parser.add_argument('--force_overwrite'       , help="force the analysis and overwrite any data in the output folder", required=False, action="store_true")
 33 | 		parser.add_argument('--bam', type=str,required=True, help="coordinate sorted bam file(required)")
 34 | 		parser.add_argument('--ref', type=str, help="reference fasta",required=True)
 35 | 
 36 | 		#related to sv calling
 37 | 		parser.add_argument('-o', type=str,default="output", help="output prefix(default=output)")
 38 | 		parser.add_argument('-i', type=int, help="paired reads maximum allowed insert size. Pairs aligning on the same chr at a distance higher than this are considered candidates for SV (default= 99.9th percentile of insert size)")
 39 | 		parser.add_argument('-d', type=str,help="expected reads orientations, possible values \"innie\" (-> <-) or \"outtie\" (<- ->). Default: major orientation within the dataset")
 40 | 		parser.add_argument('-p', type=int,default=3, help="Minimum number of supporting pairs in order to call a variant (default 3)")
 41 | 		parser.add_argument('--threads', type=int,default=1, help="Number of threads (default=1)")
 42 | 		parser.add_argument('-r', type=int,default=3, help="Minimum number of supporting split reads to call a variant (default 3)")
 43 | 		parser.add_argument('-q', type=int,default=5, help="Minimum mapping quality to consider an alignment (default 5)")
 44 | 		parser.add_argument('-n', type=int,default=2, help="the ploidy of the organism,(default = 2)")
 45 | 		parser.add_argument('-e', type=int, help="clustering distance  parameter, discordant pairs closer than this distance are considered to belong to the same variant(default = sqrt(insert-size*2)*12)")
 46 | 		parser.add_argument('-c', type=float, help="average coverage, overwrites the estimated average coverage (useful for exome or panel data)")
 47 | 		parser.add_argument('-l', type=int,default=3, help="min-pts parameter (default=3),must be set >= 2")
 48 | 		parser.add_argument('-s', type=int,default=25000000, help="Number of reads to sample when computing library statistics(default=25000000)")
 49 | 		parser.add_argument('--force_ploidy',action="store_true", help="force the ploidy to be set to -n across the entire genome (i.e skip coverage normalisation of chromosomes)")
 50 | 		parser.add_argument('--n_mask',type=float,default=0.5, help="exclude regions from coverage calculation if they contain more than this fraction of N (default = 0.5)")
 51 | 
 52 | 		#stuff related to filtering
 53 | 		parser.add_argument('--p_ratio', type=float,default=0.1, help="minimum discordant pair/normal pair ratio at the breakpoint junction(default=0.1)")
 54 | 		parser.add_argument('--r_ratio', type=float,default=0.1, help="minimum split read/coverage ratio at the breakpoint junction(default=0.1)")
 55 | 		parser.add_argument('--max_coverage', type=float,default=4, help="filter call if X times higher than chromosome average coverage (default=4)")
 56 | 		parser.add_argument('--min_contig', type=int,default=10000, help="Skip calling on small contigs (default < 10000 bp)")
 57 | 		parser.add_argument('-z', type=int,default=50, help="minimum variant size (default=50), variants smaller than this will not be printed ( z < 10 is not recomended)")
 58 | 
 59 | 		#assembly related stuff
 60 | 		parser.add_argument('--skip_assembly', action="store_true", help="Skip running local assembly, tiddit will perform worse, but wont require bwa and bwa indexed ref, and will complete quicker")
 61 | 		parser.add_argument('--bwa', type=str,default="bwa", help="path to bwa executable file(default=bwa)")
 62 | 		parser.add_argument('--min_clip', type=int,default=4, help="Minimum clip reads to initiate local assembly of a region(default=4)")
 63 | 		parser.add_argument('--padding', type=int,default=100, help="Extend the local assembly by this number of bases (default=100bp)")
 64 | 		parser.add_argument('--min_pts_clips', type=int,default=3, help="min-pts parameter for the clustering of candidates for local assembly (default=3)")
 65 | 		parser.add_argument('--max_assembly_reads', type=int,default=100000, help="Skip assembly of regions containing too many reads (default=100000 reads)")
 66 | 		parser.add_argument('--max_local_assembly_region', type=int,default=2000, help="maximum size of the clip read cluster for being considered a local assembly candidate (default=2000 bp)")
 67 | 		parser.add_argument('--min_anchor_len', type=int,default=60, help="minimum mapped bases to be considered a clip read  (default=60 bp)")
 68 | 		parser.add_argument('--min_clip_len', type=int,default=25, help="minimum clipped bases to be considered a clip read (default=25 bp)")
 69 | 		parser.add_argument('--min_contig_len', type=int,default=200, help="minimum contig length for SV analysis (default=200 bp)")
 70 | 		parser.add_argument('-k', type=int,default=91, help="kmer lenght used by the local assembler (default=91 bp)")
 71 | 		args= parser.parse_args()
 72 | 
 73 | 		if args.l < 2:
 74 | 			print ("error, too low --l value!")
 75 | 			quit()
 76 | 
 77 | 		if not args.skip_assembly:
 78 | 			if not os.path.isfile(args.bwa) and not shutil.which(args.bwa):
 79 | 				print("error, BWA executable missing, add BWA to path, or specify using --bwa, or skip local assembly (--skip_assembly)")
 80 | 				quit()
 81 | 
 82 | 			if not glob.glob("{}*.bwt*".format(args.ref)):
 83 | 				print ("error, The reference must be indexed using bwa index; run bwa index, or skip local assembly (--skip_assembly)")
 84 | 				quit()
 85 | 
 86 | 
 87 | 		if not os.path.isfile(args.ref):
 88 | 			print ("error,  could not find the reference file")
 89 | 			quit()
 90 | 
 91 | 
 92 | 		if not (args.bam.endswith(".bam") or args.bam.endswith(".cram")):
 93 | 			print ("error, the input file is not a bam file, make sure that the file extension is .bam or .cram")
 94 | 			quit()
 95 | 
 96 | 		if not os.path.isfile(args.bam):
 97 | 			print ("error,  could not find the bam file")
 98 | 			quit()
 99 | 
100 | 		bam_file_name=args.bam
101 | 		samfile = pysam.AlignmentFile(bam_file_name, "r",reference_filename=args.ref)
102 | 
103 | 		bam_header=samfile.header
104 | 		chromosomes=[]
105 | 
106 | 		for chr in bam_header["SQ"]:
107 | 			chromosomes.append(chr["SN"])
108 | 		samfile.close()
109 | 
110 | 
111 | 		try:
112 | 			sample_id=bam_header["RG"][0]["SM"]
113 | 		except:
114 | 			sample_id=bam_file_name.split("/")[-1].split(".")[0]
115 | 
116 | 
117 | 		samples=[sample_id]
118 | 
119 | 		contigs=[]
120 | 		contig_number={}
121 | 		contig_length={}
122 | 		i=0
123 | 		for contig in bam_header["SQ"]:
124 | 			contigs.append(contig["SN"])
125 | 			contig_number[contig["SN"]]=i
126 | 			contig_length[ contig["SN"] ]=contig["LN"]
127 | 			i+=1
128 | 
129 | 		prefix=args.o
130 | 		try:
131 | 			os.mkdir(f"{prefix}_tiddit")
132 | 			os.mkdir(f"{prefix}_tiddit/clips")
133 | 		except:
134 | 			if args.force_overwrite:
135 | 				pass
136 | 			else:
137 | 				print("Eror output folder exists")
138 | 				quit()
139 | 
140 | 		#if not args.skip_index:
141 | 		t=time.time()
142 | 		print("Creating index")
143 | 		pysam.index("-c","-m","4","-@",str(args.threads),bam_file_name,"{}_tiddit/{}.csi".format(args.o,sample_id))
144 | 		print("Created index in: " + str(time.time()-t) )
145 | 
146 | 		min_mapq=args.q
147 | 		max_ins_len=100000
148 | 		n_reads=args.s
149 | 
150 | 		library=tiddit_stats.statistics(bam_file_name,args.ref,min_mapq,max_ins_len,n_reads)
151 | 		if args.i:
152 | 			max_ins_len=args.i
153 | 		else:
154 | 			max_ins_len=library["percentile_insert_size"]
155 | 
156 | 
157 | 		t=time.time()
158 | 		coverage_data=tiddit_signal.main(bam_file_name,args.ref,prefix,min_mapq,max_ins_len,sample_id,args.threads,args.min_contig,False,args.min_anchor_len,args.min_clip_len)
159 | 		print("extracted signals in:")
160 | 		print(t-time.time())
161 | 
162 | 		gc_dictionary=tiddit_gc.main(args.ref,chromosomes,args.threads,50,0.5)
163 | 
164 | 		t=time.time()
165 | 		library=tiddit_coverage_analysis.determine_ploidy(coverage_data,contigs,library,args.n,prefix,args.c,args.ref,50,bam_header,gc_dictionary)
166 | 		print("calculated coverage in:")
167 | 		print(time.time()-t)
168 | 
169 | 
170 | 		if not args.skip_assembly:
171 | 
172 | 			t=time.time()
173 | 			tiddit_contig_analysis.main(prefix,sample_id,library,contigs,coverage_data,args)
174 | 			print("Clip read assembly in:")
175 | 			print(time.time()-t)
176 | 
177 | 		vcf_header=tiddit_vcf_header.main( bam_header,library,sample_id,version )
178 | 
179 | 		if not args.e:
180 | 			args.e=int(library["avg_insert_size"]/2.0)
181 | 		if not args.e:
182 | 			args.e=50
183 | 
184 | 		t=time.time()
185 | 		sv_clusters=tiddit_cluster.main(prefix,contigs,contig_length,samples,library["mp"],args.e,args.l,max_ins_len,args.min_contig,args.skip_assembly,args.r)
186 | 
187 | 		print("generated clusters in")
188 | 		print(time.time()-t)
189 | 
190 | 		f=open(prefix+".vcf","w")
191 | 		f.write(vcf_header+"\n")
192 | 
193 | 		t=time.time()
194 | 		variants=tiddit_variant.main(bam_file_name,sv_clusters,args,library,min_mapq,samples,coverage_data,contig_number,max_ins_len,gc_dictionary)
195 | 		print("analyzed clusters in")
196 | 		print(time.time()-t)
197 | 
198 | 		for chr in contigs:
199 | 			if not chr in variants:
200 | 				continue
201 | 			for variant in sorted(variants[chr], key=lambda x: x[0]):
202 | 				f.write( "\t".join(variant[1])+"\n" )
203 | 		f.close()
204 | 		quit()
205 | 
206 | 	elif args.cov:
207 | 		parser = argparse.ArgumentParser("""tiddit --cov --bam inputfile [-o prefix]""")
208 | 		parser.add_argument('--cov'        , help="generate a coverage bed/wig file", required=False, action="store_true")
209 | 		parser.add_argument('--bam', type=str,required=True, help="coordinate sorted bam file(required)")
210 | 		parser.add_argument('-o', type=str,default="output", help="output prefix(default=output)")
211 | 		parser.add_argument('-z', type=int,default=500, help="use bins of specified size(default = 500bp) to measure the coverage of the entire bam file, set output to stdout to print to stdout")
212 | 		parser.add_argument('-w'        , help="generate wig instead of bed", required=False, action="store_true")
213 | 		parser.add_argument('-q'        , help="minimum mapping quality(default=20)", required=False, default=20)
214 | 		parser.add_argument('--ref', type=str, help="reference fasta, used for reading cram")
215 | 		args= parser.parse_args()
216 | 
217 | 		if not os.path.isfile(args.bam):
218 | 			print ("error,  could not find the bam file")
219 | 			quit()
220 | 
221 | 		samfile = pysam.AlignmentFile(args.bam, "r",reference_filename=args.ref)
222 | 		bam_header=samfile.header
223 | 		coverage_data,end_bin_size=tiddit_coverage.create_coverage(bam_header,args.z)
224 | 		n_reads=0
225 | 		for read in samfile.fetch(until_eof=True):
226 | 
227 | 			if read.is_unmapped or read.is_duplicate:
228 | 				continue
229 | 
230 | 			t=time.time()
231 | 			if read.mapq >= args.q:
232 | 				n_reads+=1
233 | 
234 | 				read_position=read.reference_start
235 | 				read_end=read.reference_end
236 | 				read_reference_name=read.reference_name
237 | 
238 | 				coverage_data[read_reference_name]=tiddit_coverage.update_coverage(read_position,read_end,args.z,coverage_data[read_reference_name],end_bin_size[read_reference_name])
239 | 
240 | 		if args.w:
241 | 			tiddit_coverage.print_coverage(coverage_data,bam_header,args.z,"wig",args.o +".wig")
242 | 		else:
243 | 			tiddit_coverage.print_coverage(coverage_data,bam_header,args.z,"bed",args.o +".bed")
244 | 
245 | 	else:
246 | 		parser.print_help()
247 | 
248 | if __name__ == '__main__':
249 | 	main()
250 | 


--------------------------------------------------------------------------------
/tiddit/graphlib.pyx:
--------------------------------------------------------------------------------
  1 | class graph:
  2 | 
  3 | 	def __init__(self):
  4 | 		self.predecessors={}
  5 | 		self.sucessors={}
  6 | 		self.kmers={}
  7 | 		self.vertices={}
  8 | 		self.vertice_set=set([])
  9 | 		self.in_branch_points=set([])
 10 | 		self.out_branch_points=set([])
 11 | 		self.starting_points=set([])
 12 | 		self.end_points=set([])
 13 | 
 14 | 	#add node to the graph
 15 | 	def add_kmer(self,kmer,read):
 16 | 
 17 | 		if not kmer in self.kmers:
 18 | 			self.kmers[kmer]=set([])
 19 | 			self.predecessors[kmer]=set([])
 20 | 			self.sucessors[kmer]=set([])
 21 | 			self.starting_points.add(kmer)
 22 | 			self.end_points.add(kmer)
 23 | 
 24 | 		self.kmers[kmer].add(read)
 25 | 		
 26 | 	#add vertices between nodes
 27 | 	def add_vertice(self,kmer1,kmer2,read):
 28 | 
 29 | 		self.add_kmer(kmer1,read)
 30 | 		self.add_kmer(kmer2,read)
 31 | 		
 32 | 		if not kmer1 in self.vertices:
 33 | 			self.vertices[kmer1]={}
 34 | 
 35 | 		if kmer1 in self.end_points:
 36 | 			self.end_points.remove(kmer1)
 37 | 
 38 | 		if not kmer2 in self.vertices[kmer1]:
 39 | 			self.vertices[kmer1][kmer2]=set([])
 40 | 
 41 | 		if kmer2 in self.starting_points:
 42 | 			self.starting_points.remove(kmer2)
 43 | 
 44 | 		self.vertices[kmer1][kmer2].add(read)
 45 | 			
 46 | 		self.vertice_set.add((kmer1,kmer2))
 47 | 
 48 | 		self.sucessors[kmer1].add(kmer2)
 49 | 		if len(self.sucessors[kmer1]) > 1:
 50 | 			self.in_branch_points.add(kmer1)
 51 | 
 52 | 		self.predecessors[kmer2].add(kmer1)
 53 | 		if len(self.predecessors[kmer2]) > 1:
 54 | 			self.out_branch_points.add(kmer2)
 55 | 
 56 | 	def delete_vertice(self,kmer1,kmer2):
 57 | 	
 58 | 		if kmer1 in self.vertices:
 59 | 			if kmer2 in self.vertices[kmer1]:
 60 | 				del self.vertices[kmer1][kmer2]
 61 | 
 62 | 			if len(self.vertices[kmer1]) == 0:
 63 | 				del self.vertices[kmer1]
 64 | 
 65 | 		if kmer1 in self.in_branch_points:
 66 | 			if len(self.sucessors[kmer1]) < 3:
 67 | 				self.in_branch_points.remove(kmer1)
 68 | 
 69 | 		if kmer1 in self.sucessors:
 70 | 			if kmer2 in self.sucessors[kmer1]:
 71 | 				self.sucessors[kmer1].remove(kmer2)
 72 | 
 73 | 			if not self.sucessors[kmer1]:
 74 | 				self.end_points.add(kmer1)
 75 | 
 76 | 		if kmer2 in self.out_branch_points:
 77 | 			if len(self.predecessors[kmer2]) < 3:
 78 | 				self.out_branch_points.remove(kmer2)
 79 | 
 80 | 		if kmer1 in self.predecessors[kmer2]:
 81 | 			self.predecessors[kmer2].remove(kmer1)
 82 | 
 83 | 		if not len(self.predecessors[kmer2]):
 84 | 			self.starting_points.add(kmer2)
 85 | 
 86 | 		if (kmer1,kmer2) in self.vertice_set:
 87 | 			self.vertice_set.remove((kmer1,kmer2))
 88 | 
 89 | 	def delete_kmer(self,kmer):
 90 | 		if kmer in self.kmers:
 91 | 			del self.kmers[kmer]
 92 | 
 93 | 		sucessors=set([])
 94 | 		for sucessor in self.sucessors[kmer]:
 95 | 			sucessors.add(sucessor)
 96 | 
 97 | 		predecessors=set([])
 98 | 		for predecessor in self.predecessors[kmer]:
 99 | 			predecessors.add(predecessor)
100 | 
101 | 		for predecessor in predecessors:
102 | 			self.delete_vertice(predecessor,kmer)
103 | 
104 | 		for sucessor in sucessors:
105 | 			self.delete_vertice(kmer,sucessor)
106 | 


--------------------------------------------------------------------------------
/tiddit/silverfish.pyx:
--------------------------------------------------------------------------------
  1 | import time
  2 | import sys
  3 | import tiddit.graphlib as graphlib
  4 | import copy
  5 | 
  6 | def build_kmer_hist(seq,kmer_hist,k):
  7 | 	seq_len=len(seq)
  8 | 	kmers=[]
  9 | 	for i in range(0,seq_len-k+1):
 10 | 
 11 | 		kmer=seq[i:i+k]
 12 | 		if len(kmer) < k:
 13 | 			break
 14 | 		kmers.append(kmer)
 15 | 
 16 | 		if not kmer in kmer_hist:
 17 | 			kmer_hist[kmer]=0
 18 | 
 19 | 		kmer_hist[kmer]+=1
 20 | 
 21 | 	return(kmers,kmer_hist)	
 22 | 
 23 | def find_chain(graph,start,ends):
 24 | 	chain=[start]
 25 | 	current_node=start
 26 | 	if start in ends:
 27 | 		return(chain)
 28 | 		
 29 | 	while True:
 30 | 		current_node=graph.sucessors[current_node]
 31 | 
 32 | 		if not current_node or len(current_node) > 1 or current_node == start or current_node in ends:
 33 | 			return(chain)
 34 | 		current_node=list(current_node)[0]
 35 | 		chain.append(current_node)
 36 | 		if current_node in ends:
 37 | 			return(chain)
 38 | 
 39 | def drop_kmers(graph,min_support):
 40 | 	kmers=list(graph.kmers.keys())
 41 | 	for kmer in kmers:
 42 | 		if len(graph.kmers[kmer]) < min_support:
 43 | 			graph.delete_kmer(kmer)
 44 | 	return(graph)
 45 | 
 46 | def trim_edges(graph,min_weight):
 47 | 	edge_list=list(graph.vertice_set)
 48 | 	for edge in edge_list:
 49 | 		if len(graph.vertices[edge[0]][edge[1]]) < min_weight:
 50 | 			graph.delete_vertice(edge[0],edge[1])
 51 | 	return(graph)
 52 | 
 53 | def remove_tips(graph,min_tip_length):
 54 | 	branch_start=graph.in_branch_points
 55 | 	branch_end=graph.out_branch_points
 56 | 	starting_point=graph.starting_points
 57 | 
 58 | 	switches=branch_end.union(branch_start)
 59 | 	for start in starting_point.union(branch_start,branch_end):	
 60 | 		chains=[]
 61 | 		for node in graph.sucessors[start]:
 62 | 			chains.append([start]+find_chain(graph,node,switches))
 63 | 
 64 | 		for chain in chains:
 65 | 			if len(chain) < 20 and chain[-1] in graph.end_points:
 66 | 				for node in chain:
 67 | 					graph.delete_kmer(node)
 68 | 
 69 | 	return(graph)
 70 | 
 71 | 
 72 | def chain_typer(chain,graph):
 73 | 
 74 | 	if chain[0] in graph.starting_points:
 75 | 		return("starting_point")
 76 | 	elif chain[-1] in graph.end_points:
 77 | 		return("end_point")
 78 | 
 79 | 	elif chain[0] in graph.in_branch_points:
 80 | 		if chain[-1] in graph.out_branch_points:
 81 | 			return("in_out")
 82 | 		elif chain[-1] in graph.in_branch_points:
 83 | 			return("in_in")
 84 | 
 85 | 	elif chain[0] in graph.out_branch_points:
 86 | 		if chain[-1] in graph.out_branch_points:
 87 | 			return("out_out")
 88 | 
 89 | 		elif chain[-1] in graph.in_branch_points:
 90 | 			return("out_in")
 91 | 
 92 | 	return("unknown")
 93 | 
 94 | def forward_scaffold(scaffold,chains,graph,chain_numbers):
 95 | 	results=[]
 96 | 	
 97 | 	for i in range(0,len(chains)):
 98 | 		if i in chain_numbers:
 99 | 			continue
100 | 
101 | 		if chains[i][0][0] == chains[scaffold][0][-1]:
102 | 			r=forward_scaffold(i,chains,graph,set([i]) | chain_numbers )
103 | 
104 | 			for j in range(0,len(r)):
105 | 				results.append( [ chains[scaffold][0]+r[j][0][1:],r[j][1] | set([scaffold]) ] )
106 | 			
107 | 	if not results:
108 | 		results=[ [chains[scaffold][0], set([scaffold]) ] ]
109 | 
110 | 	return(results)
111 | 
112 | def backward_scaffold(scaffold,chains,graph,chain_numbers):
113 | 	results=[]
114 | 	
115 | 	for i in range(0,len(chains)):
116 | 		if i in chain_numbers:
117 | 			continue
118 | 
119 | 		if chains[i][0][-1] == chains[scaffold][0][0]:
120 | 			r=backward_scaffold(i,chains,graph,set([i]) | chain_numbers )
121 | 
122 | 			for j in range(0,len(r)):
123 | 				results.append( [ r[j][0]+chains[scaffold][0][1:],r[j][1] | set([scaffold]) ] )
124 | 			
125 | 	if not results:
126 | 		results=[ [chains[scaffold][0], set([scaffold]) ] ]
127 | 
128 | 	return(results)
129 | 
130 | def main(reads,k,min_support):
131 | 
132 | 	time_all=time.time()
133 | 
134 | 	kmers={}
135 | 	time_kmerize=time.time()
136 | 	graph = graphlib.graph()
137 | 
138 | 	kmer_hist={}
139 | 	for read in reads:
140 | 		if len(reads[read]) < k:
141 | 			continue
142 | 
143 | 		read_kmers,kmer_hist=build_kmer_hist(reads[read],kmer_hist,k)
144 | 		kmers[read]=read_kmers
145 | 
146 | 	for read in kmers:
147 | 		if len(reads[read]) < k+1:
148 | 			continue
149 | 
150 | 
151 | 		for i in range(1,len(kmers[read])):
152 | 
153 | 			if kmer_hist[kmers[read][i-1]] < min_support and kmer_hist[kmers[read][i]] < min_support:
154 | 				continue
155 | 
156 | 			if kmer_hist[kmers[read][i]] < min_support and kmer_hist[kmers[read][i-1]] >= min_support:
157 | 				graph.add_kmer(kmers[read][i-1],read)
158 | 
159 | 			elif kmer_hist[kmers[read][i]] >= min_support and kmer_hist[kmers[read][i-1]] < min_support:
160 | 				graph.add_kmer(kmers[read][i],read)
161 | 
162 | 			if kmer_hist[kmers[read][i]] >= min_support and kmer_hist[kmers[read][i]] >= min_support:
163 | 				graph.add_vertice(kmers[read][i-1],kmers[read][i],read)		
164 | 
165 | 	if not reads:
166 | 		return([])
167 | 	
168 | 	graph=drop_kmers(graph,min_support)
169 | 	graph=trim_edges(graph,min_support)
170 | 	graph=remove_tips(graph,10)
171 | 
172 | 	branch_start=graph.in_branch_points
173 | 	branch_end=graph.out_branch_points
174 | 	starting_point=graph.starting_points
175 | 
176 | 	chains=[]
177 | 	switches=branch_end.union(branch_start)
178 | 	for start in starting_point.union(branch_start,branch_end):	
179 | 		for node in graph.sucessors[start]:
180 | 
181 | 			chain=[start]+find_chain(graph,node,switches)
182 | 			chain_type=chain_typer(chain,graph)
183 | 			chains.append([chain,chain_type])
184 | 
185 | 	scaffolds=[]
186 | 	for i in range(0,len(chains)):
187 | 		chain=chains[i][0]
188 | 		start=chain[0]
189 | 		end=chain[-1]
190 | 
191 | 		scaffold=[]	
192 | 
193 | 		if chains[i][1] == "end_point":
194 | 			results=backward_scaffold(i,chains,graph,set([i]) )
195 | 			scaffolds+=results
196 | 
197 | 		elif chains[i] == "start_point":
198 | 			results=forward_scaffold(i,chains,graph,set([i]) )
199 | 			scaffolds+=results
200 | 		else:
201 | 			forward=forward_scaffold(i,chains,graph,set([i]) )
202 | 			for forward_result in forward:
203 | 				backward_result=backward_scaffold(i,chains,graph,forward_result[1] )
204 | 				for result in backward_result:
205 | 					scaffolds.append([  result[0]+forward_result[0][len(chains[i][0])-1:],forward_result[1] | result[1]])
206 | 
207 | 	results=[]
208 | 	for i in range(0,len(scaffolds)):
209 | 	
210 | 		skip=False
211 | 		for j in range(0,len(scaffolds)):
212 | 			if j ==i or j < i:
213 | 				continue
214 | 			if not len(scaffolds[i][-1]-scaffolds[j][-1]):
215 | 				skip=True
216 | 
217 | 		if skip:
218 | 			continue
219 | 
220 | 		scaffolded_chains=list(map(str,scaffolds[i][-1]))
221 | 
222 | 		out=[]
223 | 		for j in range(1,len(scaffolds[i][0])):
224 | 			out.append(scaffolds[i][0][j][-1])
225 | 
226 | 		results.append(scaffolds[i][0][0]+"".join(out))
227 | 	return(results)
228 | 
229 | min_branch_length=2
230 | min_overlap=0.2
231 | max_overlap=0.8
232 | 


--------------------------------------------------------------------------------
/tiddit/tiddit_cluster.pyx:
--------------------------------------------------------------------------------
  1 | import sys
  2 | import os
  3 | import tiddit.DBSCAN as DBSCAN
  4 | import numpy
  5 | from collections import Counter
  6 | 
  7 | def find_discordant_pos(fragment,is_mp):
  8 | 	if is_mp:
  9 | 		if fragment[5] == "False" and fragment[8] == "True":
 10 | 			posA=fragment[3]
 11 | 			posB=fragment[7]
 12 | 		elif fragment[5] == "False" and fragment[8] == "False":
 13 | 			posA=fragment[3]
 14 | 			posB=fragment[6]
 15 | 		elif fragment[5] == "True" and fragment[8] == "True":		
 16 | 			posA=fragment[4]
 17 | 			posB=fragment[7]
 18 | 		else:
 19 | 			posA=fragment[4]
 20 | 			posB=fragment[6]
 21 | 
 22 | 	else:
 23 | 		if fragment[5] == "False" and fragment[8] == "True":
 24 | 			posA=fragment[4]
 25 | 			posB=fragment[6]
 26 | 		elif fragment[5] == "False" and fragment[8] == "False":
 27 | 			posA=fragment[4]
 28 | 			posB=fragment[7]
 29 | 		elif fragment[5] == "True" and fragment[8] == "True":
 30 | 			posA=fragment[3]
 31 | 			posB=fragment[6]
 32 | 
 33 | 		else:
 34 | 			posA=fragment[3]
 35 | 			posB=fragment[7]
 36 | 
 37 | 	return(posA,posB)
 38 | 
 39 | def main(prefix,chromosomes,contig_length,samples,is_mp,epsilon,m,max_ins_len,min_contig,skip_assembly,min_reads):
 40 | 
 41 | 	discordants={}
 42 | 	contigs=set([])
 43 | 	splits={}
 44 | 	positions={}
 45 | 
 46 | 	i=0
 47 | 	for sample in samples:
 48 | 		for line in open("{}_tiddit/discordants_{}.tab".format(prefix,sample)):
 49 | 			content=line.rstrip().split("\t")
 50 | 			chrA=content[1]
 51 | 			chrB=content[2]
 52 | 			if contig_length[chrA] < min_contig or contig_length[chrB] < min_contig:
 53 | 				continue
 54 | 
 55 | 			if not chrA in positions:
 56 | 				positions[chrA]={}
 57 | 			if not chrB in positions[chrA]:
 58 | 				positions[chrA][chrB]=[]
 59 | 
 60 | 			if not chrA in discordants:
 61 | 				discordants[chrA]={}
 62 | 			if not chrB in discordants[chrA]:
 63 | 				discordants[chrA][chrB]=[]
 64 | 
 65 | 			posA,posB=find_discordant_pos(content,is_mp)
 66 | 
 67 | 			if int(posA) > contig_length[chrA]:
 68 | 				posA=contig_length[chrA]
 69 | 				if int(posB) > contig_length[chrB]:
 70 | 					posA=contig_length[chrB]
 71 | 	
 72 | 			discordants[chrA][chrB].append([content[0],sample,"D",posA,content[5],posB,content[8],i,int(content[3]),int(content[4]),int(content[6]),int(content[7])])
 73 | 			positions[chrA][chrB].append([int(posA),int(posB),i])
 74 | 			i+=1
 75 | 
 76 | 		for line in open("{}_tiddit/splits_{}.tab".format(prefix,sample)):
 77 | 			content=line.rstrip().split("\t")
 78 | 			chrA=content[1]
 79 | 			chrB=content[2]
 80 | 			if contig_length[chrA] < min_contig or contig_length[chrB] < min_contig:
 81 | 				continue
 82 | 
 83 | 			if not chrA in positions:
 84 | 				positions[chrA]={}
 85 | 			if not chrB in positions[chrA]:
 86 | 				positions[chrA][chrB]=[]
 87 | 
 88 | 			if not chrA in discordants:
 89 | 				discordants[chrA]={}
 90 | 			if not chrB in discordants[chrA]:
 91 | 				discordants[chrA][chrB]=[]
 92 | 
 93 | 			posA=content[3]
 94 | 			posB=content[5]
 95 | 
 96 | 			if int(posA) > contig_length[chrA]:
 97 | 				posA=contig_length[chrA]
 98 | 			if int(posB) > contig_length[chrB]:
 99 | 				posB=contig_length[chrB]
100 | 
101 | 			discordants[chrA][chrB].append([content[0],sample,"S",posA,content[4],posB,content[6],i,int(content[7]),int(content[8]),int(content[9]),int(content[10])])
102 | 			positions[chrA][chrB].append([int(posA),int(posB),i])
103 | 			i+=1
104 | 
105 | 		if not skip_assembly:
106 | 			for line in open("{}_tiddit/contigs_{}.tab".format(prefix,sample)):
107 | 				content=line.rstrip().split("\t")
108 | 				chrA=content[1]
109 | 				chrB=content[2]
110 | 
111 | 				if contig_length[chrA] < min_contig or contig_length[chrB] < min_contig:
112 | 					continue
113 | 
114 | 
115 | 				if not chrA in positions:
116 | 					positions[chrA]={}
117 | 				if not chrB in positions[chrA]:
118 | 					positions[chrA][chrB]=[]
119 | 
120 | 				if not chrA in discordants:
121 | 					discordants[chrA]={}
122 | 				if not chrB in discordants[chrA]:
123 | 					discordants[chrA][chrB]=[]
124 | 
125 | 
126 | 				posA=content[3]
127 | 				posB=content[5]
128 | 
129 | 				if int(posA) > contig_length[chrA]:
130 | 					posA=contig_length[chrA]
131 | 				if int(posB) > contig_length[chrB]:
132 | 					posB=contig_length[chrB]
133 | 
134 | 				discordants[chrA][chrB].append([content[0],sample,"A",posA,content[4],posB,content[6],i,int(content[7]),int(content[8]),int(content[9]),int(content[10])])
135 | 				positions[chrA][chrB].append([int(posA),int(posB),i])
136 | 				contigs.add(i)
137 | 				i+=1
138 | 
139 | 	candidates={}
140 | 	for chrA in chromosomes:
141 | 		if not chrA in positions:
142 | 			continue
143 | 		if not chrA in candidates:
144 | 			candidates[chrA]={}
145 | 
146 | 		for chrB in chromosomes:
147 | 			if not chrB in positions[chrA]:
148 | 				continue
149 | 			if not chrB in candidates[chrA]:
150 | 				candidates[chrA][chrB]={}
151 | 
152 | 			positions[chrA][chrB]=numpy.array(sorted(positions[chrA][chrB],key=lambda l:l[0]))
153 | 			
154 | 			clusters=DBSCAN.main(positions[chrA][chrB],epsilon,m)
155 | 			positions[chrA][chrB]=list(positions[chrA][chrB])
156 | 			cluster_pos=[]
157 | 			for i in range(0,len(positions[chrA][chrB])):
158 | 				cluster_pos.append(list(positions[chrA][chrB][i])+[clusters[i]] )
159 | 
160 | 			cluster_pos= sorted(cluster_pos,key=lambda l:l[2])
161 | 			n_ctg_clusters=0
162 | 			for i in range(0,len(cluster_pos)):
163 | 				candidate=int(cluster_pos[i][-1])
164 | 				if candidate == -1 and not (chrA == chrB and discordants[chrA][chrB][i][2] == "A" and ( int(discordants[chrA][chrB][i][5])- int(discordants[chrA][chrB][i][3]) ) < max_ins_len*2):
165 | 					continue
166 | 				elif candidate == -1 and discordants[chrA][chrB][i][2] == "A":
167 | 					candidate=len(cluster_pos)+n_ctg_clusters
168 | 					n_ctg_clusters+=1
169 | 
170 | 				if not candidate in candidates[chrA][chrB]:
171 | 					candidates[chrA][chrB][candidate]={}
172 | 					candidates[chrA][chrB][candidate]["signal_type"]={}
173 | 					candidates[chrA][chrB][candidate]["samples"]=set([])
174 | 					candidates[chrA][chrB][candidate]["sample_discordants"]={}
175 | 					candidates[chrA][chrB][candidate]["sample_splits"]={}
176 | 					candidates[chrA][chrB][candidate]["sample_contigs"]={}
177 | 
178 | 
179 | 					candidates[chrA][chrB][candidate]["N_discordants"]=0
180 | 					candidates[chrA][chrB][candidate]["discordants"]=set([])
181 | 					candidates[chrA][chrB][candidate]["N_splits"]=0
182 | 					candidates[chrA][chrB][candidate]["splits"]=set([])
183 | 					candidates[chrA][chrB][candidate]["N_contigs"]=0
184 | 					candidates[chrA][chrB][candidate]["contigs"]=set([])
185 | 
186 | 
187 | 					candidates[chrA][chrB][candidate]["n_signals"]=0
188 | 
189 | 					candidates[chrA][chrB][candidate]["posA"]=0
190 | 					candidates[chrA][chrB][candidate]["positions_A"]={}
191 | 					candidates[chrA][chrB][candidate]["positions_A"]["contigs"]=[]
192 | 					candidates[chrA][chrB][candidate]["positions_A"]["splits"]=[]
193 | 					candidates[chrA][chrB][candidate]["positions_A"]["discordants"]=[]
194 | 					candidates[chrA][chrB][candidate]["positions_A"]["orientation_contigs"]=[]
195 | 					candidates[chrA][chrB][candidate]["positions_A"]["orientation_splits"]=[]
196 | 					candidates[chrA][chrB][candidate]["positions_A"]["orientation_discordants"]=[]
197 | 					candidates[chrA][chrB][candidate]["positions_A"]["start"]=[]
198 | 					candidates[chrA][chrB][candidate]["positions_A"]["end"]=[]
199 | 
200 | 					candidates[chrA][chrB][candidate]["start_A"]=0
201 | 					candidates[chrA][chrB][candidate]["end_A"]=0
202 | 
203 | 					candidates[chrA][chrB][candidate]["posB"]=0
204 | 					candidates[chrA][chrB][candidate]["positions_B"]={}
205 | 					candidates[chrA][chrB][candidate]["positions_B"]["contigs"]=[]
206 | 					candidates[chrA][chrB][candidate]["positions_B"]["splits"]=[]
207 | 					candidates[chrA][chrB][candidate]["positions_B"]["discordants"]=[]
208 | 					candidates[chrA][chrB][candidate]["positions_B"]["orientation_contigs"]=[]
209 | 					candidates[chrA][chrB][candidate]["positions_B"]["orientation_splits"]=[]
210 | 					candidates[chrA][chrB][candidate]["positions_B"]["orientation_discordants"]=[]
211 | 					candidates[chrA][chrB][candidate]["positions_B"]["start"]=[]
212 | 					candidates[chrA][chrB][candidate]["positions_B"]["end"]=[]
213 | 
214 | 					candidates[chrA][chrB][candidate]["start_B"]=0
215 | 					candidates[chrA][chrB][candidate]["end_B"]=0
216 | 
217 | 				if not discordants[chrA][chrB][i][1] in candidates[chrA][chrB][candidate]["samples"]:
218 | 					candidates[chrA][chrB][candidate]["sample_discordants"][discordants[chrA][chrB][i][1]]=set([])
219 | 					candidates[chrA][chrB][candidate]["sample_splits"][discordants[chrA][chrB][i][1]]=set([])
220 | 					candidates[chrA][chrB][candidate]["sample_contigs"][discordants[chrA][chrB][i][1]]=set([])
221 | 
222 | 				candidates[chrA][chrB][candidate]["samples"].add(discordants[chrA][chrB][i][1])
223 | 
224 | 				candidates[chrA][chrB][candidate]["positions_A"]["start"].append(discordants[chrA][chrB][i][8])
225 | 				candidates[chrA][chrB][candidate]["positions_A"]["end"].append(discordants[chrA][chrB][i][9])
226 | 
227 | 				candidates[chrA][chrB][candidate]["positions_B"]["start"].append(discordants[chrA][chrB][i][10])
228 | 				candidates[chrA][chrB][candidate]["positions_B"]["end"].append(discordants[chrA][chrB][i][11])
229 | 
230 | 				if discordants[chrA][chrB][i][2] == "D":
231 | 					candidates[chrA][chrB][candidate]["discordants"].add(discordants[chrA][chrB][i][0])
232 | 					candidates[chrA][chrB][candidate]["positions_A"]["discordants"].append(int(discordants[chrA][chrB][i][3]))
233 | 					candidates[chrA][chrB][candidate]["positions_A"]["orientation_discordants"].append(discordants[chrA][chrB][i][4])
234 | 
235 | 					candidates[chrA][chrB][candidate]["positions_B"]["discordants"].append(int(discordants[chrA][chrB][i][5]))
236 | 					candidates[chrA][chrB][candidate]["positions_B"]["orientation_discordants"].append(discordants[chrA][chrB][i][6])
237 | 					candidates[chrA][chrB][candidate]["sample_discordants"][discordants[chrA][chrB][i][1]].add(discordants[chrA][chrB][i][0])
238 | 
239 | 				elif discordants[chrA][chrB][i][2] == "S":
240 | 					candidates[chrA][chrB][candidate]["splits"].add(discordants[chrA][chrB][i][0])
241 | 					candidates[chrA][chrB][candidate]["positions_A"]["splits"].append(int(discordants[chrA][chrB][i][3]))
242 | 					candidates[chrA][chrB][candidate]["positions_A"]["orientation_splits"].append(discordants[chrA][chrB][i][4])
243 | 
244 | 					candidates[chrA][chrB][candidate]["positions_B"]["splits"].append(int(discordants[chrA][chrB][i][5]))
245 | 					candidates[chrA][chrB][candidate]["positions_B"]["orientation_splits"].append(discordants[chrA][chrB][i][6])
246 | 					candidates[chrA][chrB][candidate]["sample_splits"][discordants[chrA][chrB][i][1]].add(discordants[chrA][chrB][i][0])
247 | 				else:
248 | 					candidates[chrA][chrB][candidate]["contigs"].add(discordants[chrA][chrB][i][0])
249 | 					candidates[chrA][chrB][candidate]["positions_A"]["contigs"].append(int(discordants[chrA][chrB][i][3]))
250 | 					candidates[chrA][chrB][candidate]["positions_A"]["orientation_contigs"].append(discordants[chrA][chrB][i][4])
251 | 
252 | 					candidates[chrA][chrB][candidate]["positions_B"]["contigs"].append(int(discordants[chrA][chrB][i][5]))
253 | 					candidates[chrA][chrB][candidate]["positions_B"]["orientation_contigs"].append(discordants[chrA][chrB][i][6])
254 | 					candidates[chrA][chrB][candidate]["sample_contigs"][discordants[chrA][chrB][i][1]].add(discordants[chrA][chrB][i][0])
255 | 
256 | 
257 | 
258 | 	for chrA in candidates:
259 | 		for chrB in candidates[chrA]:
260 | 			for candidate in candidates[chrA][chrB]:
261 | 				candidates[chrA][chrB][candidate]["N_discordants"]=len(candidates[chrA][chrB][candidate]["discordants"])
262 | 				candidates[chrA][chrB][candidate]["N_splits"]=len(candidates[chrA][chrB][candidate]["splits"])
263 | 				candidates[chrA][chrB][candidate]["N_contigs"]=len(candidates[chrA][chrB][candidate]["contigs"])
264 | 
265 | 
266 | 				if candidates[chrA][chrB][candidate]["N_splits"] and min_reads <= candidates[chrA][chrB][candidate]["N_splits"]:
267 | 					candidates[chrA][chrB][candidate]["posA"]=Counter(candidates[chrA][chrB][candidate]["positions_A"]["splits"]).most_common(1)[0][0]
268 | 					candidates[chrA][chrB][candidate]["posB"]=Counter(candidates[chrA][chrB][candidate]["positions_B"]["splits"]).most_common(1)[0][0]
269 | 
270 | 				elif candidates[chrA][chrB][candidate]["N_contigs"]:
271 | 					candidates[chrA][chrB][candidate]["posA"]=Counter(candidates[chrA][chrB][candidate]["positions_A"]["contigs"]).most_common(1)[0][0]
272 | 					candidates[chrA][chrB][candidate]["posB"]=Counter(candidates[chrA][chrB][candidate]["positions_B"]["contigs"]).most_common(1)[0][0]
273 | 
274 | 				elif candidates[chrA][chrB][candidate]["N_splits"]:
275 | 					candidates[chrA][chrB][candidate]["posA"]=Counter(candidates[chrA][chrB][candidate]["positions_A"]["splits"]).most_common(1)[0][0]
276 | 					candidates[chrA][chrB][candidate]["posB"]=Counter(candidates[chrA][chrB][candidate]["positions_B"]["splits"]).most_common(1)[0][0]		
277 | 
278 | 				else:
279 | 					reverse_A = candidates[chrA][chrB][candidate]["positions_A"]["orientation_discordants"].count("True")
280 | 					forward_A = candidates[chrA][chrB][candidate]["positions_A"]["orientation_discordants"].count("False")
281 | 
282 | 					reverse_B = candidates[chrA][chrB][candidate]["positions_B"]["orientation_discordants"].count("True") 
283 | 					forward_B = candidates[chrA][chrB][candidate]["positions_B"]["orientation_discordants"].count("False")
284 | 
285 | 					if ( reverse_A >= 5*forward_A or reverse_A*5 <= forward_A ) and ( reverse_B >= 5*forward_B or reverse_B*5 <= forward_B ):
286 | 						A_reverse=False
287 | 						if reverse_A > forward_A:
288 | 							A_reverse=True
289 | 
290 | 						B_reverse=False
291 | 						if reverse_B > forward_B:
292 | 							B_reverse=True
293 | 
294 | 						if is_mp:
295 | 							if A_reverse and not B_reverse:
296 | 								candidates[chrA][chrB][candidate]["posA"]=max(candidates[chrA][chrB][candidate]["positions_A"]["discordants"])
297 | 								candidates[chrA][chrB][candidate]["posB"]=min(candidates[chrA][chrB][candidate]["positions_B"]["discordants"])
298 | 
299 | 							elif not A_reverse and B_reverse:
300 | 								candidates[chrA][chrB][candidate]["posA"]=min(candidates[chrA][chrB][candidate]["positions_A"]["discordants"])
301 | 								candidates[chrA][chrB][candidate]["posB"]=max(candidates[chrA][chrB][candidate]["positions_B"]["discordants"])
302 | 
303 | 							elif A_reverse and B_reverse:
304 | 								candidates[chrA][chrB][candidate]["posA"]=max(candidates[chrA][chrB][candidate]["positions_A"]["discordants"])
305 | 								candidates[chrA][chrB][candidate]["posB"]=max(candidates[chrA][chrB][candidate]["positions_B"]["discordants"])
306 | 
307 | 							else:
308 | 								candidates[chrA][chrB][candidate]["posA"]=min(candidates[chrA][chrB][candidate]["positions_A"]["discordants"])
309 | 								candidates[chrA][chrB][candidate]["posB"]=min(candidates[chrA][chrB][candidate]["positions_B"]["discordants"])
310 | 
311 | 						else:
312 | 							if not A_reverse and B_reverse:
313 | 								candidates[chrA][chrB][candidate]["posA"]=max(candidates[chrA][chrB][candidate]["positions_A"]["discordants"])
314 | 								candidates[chrA][chrB][candidate]["posB"]=min(candidates[chrA][chrB][candidate]["positions_B"]["discordants"])
315 | 
316 | 							elif A_reverse and not B_reverse:
317 | 								candidates[chrA][chrB][candidate]["posA"]=min(candidates[chrA][chrB][candidate]["positions_A"]["discordants"])
318 | 								candidates[chrA][chrB][candidate]["posB"]=max(candidates[chrA][chrB][candidate]["positions_B"]["discordants"])
319 | 
320 | 							elif not A_reverse and not B_reverse:
321 | 								candidates[chrA][chrB][candidate]["posA"]=max(candidates[chrA][chrB][candidate]["positions_A"]["discordants"])
322 | 								candidates[chrA][chrB][candidate]["posB"]=max(candidates[chrA][chrB][candidate]["positions_B"]["discordants"])
323 | 
324 | 							else:
325 | 								candidates[chrA][chrB][candidate]["posA"]=min(candidates[chrA][chrB][candidate]["positions_A"]["discordants"])
326 | 								candidates[chrA][chrB][candidate]["posB"]=min(candidates[chrA][chrB][candidate]["positions_B"]["discordants"])
327 | 
328 | 					else:
329 | 						candidates[chrA][chrB][candidate]["posA"]=Counter(candidates[chrA][chrB][candidate]["positions_A"]["discordants"]).most_common(1)[0][0]
330 | 						candidates[chrA][chrB][candidate]["posB"]=Counter(candidates[chrA][chrB][candidate]["positions_B"]["discordants"]).most_common(1)[0][0]
331 | 
332 | 				candidates[chrA][chrB][candidate]["startB"]=min(candidates[chrA][chrB][candidate]["positions_B"]["start"])
333 | 				candidates[chrA][chrB][candidate]["endB"]=max(candidates[chrA][chrB][candidate]["positions_B"]["end"])
334 | 
335 | 				candidates[chrA][chrB][candidate]["startA"]=min(candidates[chrA][chrB][candidate]["positions_A"]["start"])
336 | 				candidates[chrA][chrB][candidate]["endA"]=max(candidates[chrA][chrB][candidate]["positions_A"]["end"])
337 | 
338 | 	return(candidates)
339 | 
340 | #chromosomes=["1","2","3","4","5"]
341 | #samples=["SweGen0001"]
342 | #prefix=sys.argv[1]
343 | #hej=main(prefix,chromosomes,samples,False,150,2)
344 | ##print(hej["3"]["5"])
345 | #for entry in hej["3"]["5"]:
346 | #	print(hej["3"]["5"][entry])
347 | #
348 | 


--------------------------------------------------------------------------------
/tiddit/tiddit_contig_analysis.pyx:
--------------------------------------------------------------------------------
  1 | import pysam
  2 | from subprocess import Popen, PIPE, DEVNULL
  3 | from joblib import Parallel, delayed
  4 | 
  5 | from pysam.libcalignmentfile cimport AlignmentFile, AlignedSegment
  6 | 
  7 | import numpy
  8 | import math
  9 | import os
 10 | 
 11 | import time
 12 | 
 13 | import tiddit.DBSCAN as DBSCAN
 14 | import tiddit.silverfish as silverfish
 15 | import tiddit.tiddit_signal as tiddit_signal
 16 | 
 17 | 
 18 | def read_contigs(aligned_contigs,prefix,sample_id,min_size):
 19 | 	samfile = pysam.AlignmentFile(aligned_contigs, "r")
 20 | 	bam_header=samfile.header
 21 | 	
 22 | 
 23 | 	split_contigs={}
 24 | 	for chrA in bam_header["SQ"]:
 25 | 		for chrB in bam_header["SQ"]:
 26 | 			if chrA["SN"] <= chrB["SN"]:
 27 | 
 28 | 				if not chrA["SN"] in split_contigs:
 29 | 					split_contigs[chrA["SN"]] = {}
 30 | 
 31 | 				split_contigs[chrA["SN"]][chrB["SN"]]={}
 32 | 
 33 | 	for read in samfile.fetch(until_eof=True):
 34 | 
 35 | 		if read.is_unmapped:
 36 | 			continue
 37 | 
 38 | 		if read.has_tag("SA") and not (read.is_supplementary or read.is_secondary):
 39 | 			split=tiddit_signal.SA_analysis(read,-2,"SA",read.reference_name)
 40 | 
 41 | 			if split:
 42 | 				if not split[2] in split_contigs[split[0]][split[1]]:
 43 | 					split_contigs[split[0]][split[1]][split[2]]=[]
 44 | 				split_contigs[split[0]][split[1]][split[2]]+=split[3:]
 45 | 
 46 | 			
 47 | 		elif read.has_tag("XA") and not (read.is_supplementary or read.is_secondary):
 48 | 			XA=read.get_tag("XA")
 49 | 			if XA.count(";") == 1:
 50 | 				if ",-" in XA:
 51 | 					XA=XA.replace(",-",",")
 52 | 					xa_list=XA.split(",")
 53 | 					xa_list.insert(2,"-")
 54 | 					XA=",".join(xa_list)
 55 | 				else:
 56 | 					XA=XA.replace(",+",",")
 57 | 					xa_list=XA.split(",")
 58 | 					xa_list.insert(2,"+")
 59 | 					XA=",".join(xa_list)
 60 | 
 61 | 				read.set_tag("XA",XA)
 62 | 				split=tiddit_signal.SA_analysis(read,-2,"XA",read.reference_name)
 63 | 
 64 | 				if split:
 65 | 					if not split[2] in split_contigs[split[0]][split[1]]:
 66 | 						split_contigs[split[0]][split[1]][split[2]]=[]
 67 | 					split_contigs[split[0]][split[1]][split[2]]+=split[3:]
 68 | 
 69 | 		elif not (read.is_supplementary or read.is_secondary) and len(read.cigartuples) > 2:
 70 | 
 71 | 			current_bp=read.reference_start
 72 | 			for i in range(0,len(read.cigartuples)-1):
 73 | 				if read.cigartuples[i][0] == 2 and read.cigartuples[i][1] > min_size:
 74 | 					
 75 | 					split_contigs[read.reference_name][read.reference_name]["{}_d_{}".format(read.query_name,i)]=[current_bp,read.is_reverse,current_bp+read.cigartuples[i][1],read.is_reverse,read.reference_start,current_bp,current_bp+read.cigartuples[i][1],read.reference_end]
 76 | 				current_bp+=read.cigartuples[i][1]
 77 | 
 78 | 	f=open("{}_tiddit/contigs_{}.tab".format(prefix,sample_id),"w")
 79 | 	positions=set([])
 80 | 	for chrA in split_contigs:
 81 | 		for chrB in split_contigs[chrA]:
 82 | 			for fragment in split_contigs[chrA][chrB]:
 83 | 
 84 | 				p=(chrA,chrB,split_contigs[chrA][chrB][fragment][0],split_contigs[chrA][chrB][fragment][2])
 85 | 				if p in positions:
 86 | 					continue
 87 | 
 88 | 				f.write("{}\t{}\t{}\t{}\n".format(fragment,chrA,chrB,"\t".join(map(str, split_contigs[chrA][chrB][fragment] )))  )
 89 | 				positions.add(p)
 90 | 
 91 | 	f.close()
 92 | 
 93 | def local_assembly(args,sample_id,prefix,regions,chr):
 94 | 
 95 | 	if os.path.isfile("{}_tiddit/clips/clips.fa.assembly.{}.clean.mag".format(prefix,chr)):
 96 | 		os.remove("{}_tiddit/clips/clips.fa.assembly.{}.clean.mag".format(prefix,chr))
 97 | 
 98 | 	cdef AlignmentFile samfile  = AlignmentFile(args.bam, "r",reference_filename=args.ref,index_filename="{}_tiddit/{}.csi".format(args.o,sample_id))
 99 | 	mag=open( "{}_tiddit/clips/clips.fa.assembly.{}.clean.mag".format(prefix,chr) ,"w")
100 | 	contig=1
101 | 	for region in regions[chr]:
102 | 
103 | 		n_reads=0
104 | 		proper=0
105 | 		low_mapq=0
106 | 
107 | 		if region[2]-region[1] > args.max_local_assembly_region:
108 | 			continue
109 | 
110 | 		reads={}
111 | 		for read in samfile.fetch(region[0], region[1], region[2]):
112 | 			if read.is_supplementary or read.is_duplicate or read.is_secondary:
113 | 				continue
114 | 			n_reads+=1
115 | 			if read.mapq < 10:
116 | 				low_mapq+=1
117 | 			if read.is_proper_pair:
118 | 				proper+=1
119 | 
120 | 			reads[str(n_reads)]=read.query_sequence
121 | 			if n_reads > 50000:
122 | 				break
123 | 
124 | 
125 | 		if n_reads > args.max_assembly_reads:
126 | 			continue
127 | 
128 | 		if low_mapq/n_reads > 0.25 or proper/n_reads < 0.75:
129 | 			continue
130 | 	
131 | 		results=silverfish.main(reads,args.k,args.min_clip)
132 | 		del reads
133 | 
134 | 		for result in results:
135 | 			if len(result) > args.min_contig_len:
136 | 				mag.write(f">{chr}_{region[1]}_{region[2]}_{contig}\n")
137 | 				mag.write(result+"\n")
138 | 			contig+=1
139 | 
140 | 	mag.close()
141 | 	return( "{}_tiddit/clips/clips.fa.assembly.{}.clean.mag".format(prefix,chr) )
142 | 
143 | def main(prefix,sample_id,library,contigs,coverage_data,args):
144 | 
145 | 	clips={}
146 | 	c=[]
147 | 
148 | 	for line in open("{}_tiddit/clips_{}.fa".format(prefix,sample_id)):
149 | 
150 | 		if line[0] == ">":
151 | 			c.append(line.strip())
152 | 			pos=int(line.strip().split("|")[-1])
153 | 			chr=line.strip().split("|")[-2]
154 | 			if not chr in clips:
155 | 				clips[chr]=[[],[]]
156 | 
157 | 		else:
158 | 			c.append(line.strip())
159 | 			clips[chr][0].append( "\n".join(c) )
160 | 			clips[chr][1].append([pos,0])
161 | 			c=[]
162 | 
163 | 	regions={}
164 | 
165 | 
166 | 	assembly_l=args.min_pts_clips
167 | 	
168 | 	for chr in clips:
169 | 		regions[chr]=[]
170 | 
171 | 
172 | 		l=assembly_l
173 | 		if library[ "avg_coverage_{}".format(chr) ]/library["avg_coverage"] > 5:
174 | 			l=args.l*int(round(library[ "avg_coverage_{}".format(chr) ]/library["avg_coverage"]/2.0))
175 | 
176 | 		clusters,cluster_id = DBSCAN.x_coordinate_clustering(numpy.array(clips[chr][1]),50,l)
177 | 		cluster_stats={}
178 | 
179 | 		for i in range(0,len(clusters)):
180 | 			if clusters[i] == -1:
181 | 				continue
182 | 			if not clusters[i] in cluster_stats:
183 | 				cluster_stats[clusters[i]]=[0,[]]
184 | 			cluster_stats[clusters[i]][0]+=1
185 | 			cluster_stats[clusters[i]][1].append( clips[chr][1][i][0] )
186 | 
187 | 		for cluster in cluster_stats:
188 | 
189 | 			if cluster_stats[cluster][0] < args.min_clip:
190 | 				continue
191 | 
192 | 			clip_coverage=max(coverage_data[chr][ int(math.floor(min(cluster_stats[cluster][1])/50.0)):int(math.floor(max(cluster_stats[cluster][1])/50.0))+1 ])
193 | 
194 | 			if clip_coverage/library[ "avg_coverage_{}".format(chr) ] > args.max_coverage:
195 | 				continue
196 | 
197 | 			regions[chr].append([chr,min(cluster_stats[cluster][1] )-args.padding,max(cluster_stats[cluster][1])+args.padding])
198 | 
199 | 			if regions[chr][-1][1] < 1:
200 | 				regions[chr][-1][1]=1
201 | 
202 | 	del clips
203 | 
204 | 	contigs=Parallel(n_jobs=args.threads,timeout=99999)( delayed(local_assembly)(args,sample_id,prefix,regions,chr) for chr in regions)
205 | 
206 | 	mag=open(f"{prefix}_tiddit/clips.fa.assembly.clean.mag","w")
207 | 	for contig in contigs:
208 | 		for line in open(contig):
209 | 			mag.write(line.rstrip()+"\n")
210 | 	mag.close()
211 | 
212 | 	os.system("{} mem -t {} -x intractg {} {}_tiddit/clips.fa.assembly.clean.mag  1> {}_tiddit/clips.sam 2> /dev/null".format(args.bwa,args.threads,args.ref,prefix,prefix))
213 | 
214 | 	read_contigs("{}_tiddit/clips.sam".format(prefix) , prefix, sample_id, args.z)
215 | 


--------------------------------------------------------------------------------
/tiddit/tiddit_coverage.pyx:
--------------------------------------------------------------------------------
 1 | import sys
 2 | import time
 3 | cimport numpy
 4 | import numpy
 5 | import math
 6 | cimport cython
 7 | @cython.boundscheck(False)
 8 | @cython.wraparound(False)
 9 | 
10 | def create_coverage(bam_header,bin_size,c="all"):
11 | 	coverage_data={}
12 | 	end_bin_size={}
13 | 
14 | 	for contig in bam_header["SQ"]:
15 | 		if c == "all" or contig["SN"] == c:
16 | 			bins= int(math.ceil(contig["LN"]/float(bin_size)))
17 | 			coverage_data[ contig["SN"] ]=numpy.zeros(bins)
18 | 			end_bin_size[contig["SN"]]=contig["LN"]-(bins-1)*bin_size
19 | 			if c != "all":
20 | 				return(coverage_data[ contig["SN"] ],end_bin_size[contig["SN"]])
21 | 	return(coverage_data,end_bin_size)
22 | def print_coverage(coverage_data,bam_header,bin_size,file_type,outfile):
23 | 	f=open(outfile,"w",buffering=819200)
24 | 
25 | 	if file_type == "bed":
26 | 		f.write("#chromosome\tstart\tend\tcoverage\n")
27 | 	elif file_type == "wig":
28 | 		f.write("track type=wiggle_0 name=\"Coverage\" description=\"Per bin average coverage\"\n")
29 | 
30 | 	for contig in bam_header["SQ"]:
31 | 		if file_type == "wig":
32 | 			f.write("fixedStep chrom={} start=1 step={}\n".format(contig["SN"],bin_size))
33 | 
34 | 		for i in range(0,len(coverage_data[ contig["SN"] ]) ) :
35 | 			if file_type == "bed":
36 | 				bin_end=(i+1)*bin_size+1
37 | 
38 | 				if i == len(coverage_data[ contig["SN"] ]) -1:
39 | 					bin_end=contig["LN"]
40 | 
41 | 				f.write("{}\t{}\t{}\t{}\n".format(contig["SN"],1+i*bin_size,bin_end,coverage_data[ contig["SN"] ][i] ) )
42 | 			elif file_type == "wig":
43 | 				f.write("{}\n".format( coverage_data[ contig["SN"] ][i] ))
44 | 
45 | 	f.close()
46 | 
47 | ctypedef numpy.double_t DTYPE_t
48 | def update_coverage(long ref_start,long ref_end,int bin_size,numpy.ndarray[DTYPE_t, ndim=1] coverage_data,int end_bin_size):
49 | 
50 | 	cdef int first_bin=ref_start//bin_size
51 | 	cdef int end_bin=(ref_end-1)//bin_size
52 | 
53 | 	cdef float bases_first_bin
54 | 
55 | 	if end_bin == first_bin:
56 | 		bases_first_bin=ref_end-ref_start
57 | 		coverage_data[first_bin]=bases_first_bin/bin_size+coverage_data[first_bin]
58 | 
59 | 		return(coverage_data)
60 | 
61 | 	bases_first_bin=((first_bin+1)*bin_size)-ref_start
62 | 	coverage_data[first_bin]=bases_first_bin/bin_size+coverage_data[first_bin]
63 | 	cdef float bases_last_bin=(ref_end-1)-end_bin*bin_size
64 | 
65 | 
66 | 	if end_bin < len(coverage_data)-1:
67 | 		coverage_data[end_bin]=bases_last_bin/bin_size+coverage_data[end_bin]
68 | 	else:
69 | 		coverage_data[end_bin]=bases_last_bin/end_bin_size+coverage_data[end_bin]
70 | 
71 | 	for i in range(first_bin+1,end_bin):
72 | 		coverage_data[i]=1.0+coverage_data[i]
73 | 
74 | 	return(coverage_data)
75 | 
76 | #bam_file_name=sys.argv[1]
77 | 
78 | #samfile = pysam.AlignmentFile(bam_file_name, "r")
79 | #bam_header=samfile.header
80 | #bin_size=50
81 | #file_type="bed"
82 | #outfile=sys.argv[2]
83 | #min_q=10
84 | 
85 | #coverage_data,end_bin_size=create_coverage(bam_header,bin_size)
86 | 
87 | #for read in samfile.fetch():
88 | #	coverage_data=update_coverage(read,bin_size,coverage_data,min_q,end_bin_size)
89 | 
90 | #print_coverage(coverage_data,bam_header,bin_size,file_type,outfile)
91 | 
92 | #samfile.close()
93 | 


--------------------------------------------------------------------------------
/tiddit/tiddit_coverage_analysis.pyx:
--------------------------------------------------------------------------------
 1 | import numpy
 2 | import re
 3 | cimport numpy
 4 | import pysam
 5 | import gzip
 6 | 
 7 | import tiddit.tiddit_coverage as tiddit_coverage
 8 | 
 9 | def determine_ploidy(dict coverage_data,contigs,dict library,int ploidy,str prefix,c, str reference_fasta,int bin_size,bam_header,gc):
10 | 
11 | 	f=open( "{}.ploidies.tab".format(prefix),"w" )
12 | 	f.write("Chromosome\tPloidy\tPloidy_rounded\tMean_coverage\n")
13 | 	all_cov=[]
14 | 	for chromosome in coverage_data:
15 | 		tmp=[]
16 | 		for i in range(0,len(coverage_data[chromosome])):
17 | 			if coverage_data[chromosome][i] > 0 and gc[chromosome][i] != -1:
18 | 
19 | 				tmp.append(coverage_data[chromosome][i])
20 | 				all_cov.append(coverage_data[chromosome][i])
21 | 		
22 | 		library[ "avg_coverage_{}".format(chromosome) ]=numpy.median(tmp)
23 | 		if numpy.isnan(library[ "avg_coverage_{}".format(chromosome) ]):
24 | 			library[ "avg_coverage_{}".format(chromosome) ]=0
25 | 
26 | 	if not c:
27 | 		library["avg_coverage"]=numpy.median(all_cov)
28 | 	else:
29 | 		library["avg_coverage"]=c
30 | 
31 | 	for chromosome in contigs:
32 | 		if not chromosome in coverage_data:
33 | 			continue
34 | 
35 | 		avg_coverage_contig=library[ "avg_coverage_{}".format(chromosome) ]
36 | 		library["contig_ploidy_{}".format(chromosome)]=int(round(ploidy*avg_coverage_contig/library["avg_coverage"]))
37 | 		f.write("{}\t{}\t{}\t{}\n".format(chromosome,avg_coverage_contig/library["avg_coverage"]*ploidy,library["contig_ploidy_{}".format(chromosome)],avg_coverage_contig))
38 | 	
39 | 
40 | 	f.close()
41 | 	return(library)
42 | 
43 | 


--------------------------------------------------------------------------------
/tiddit/tiddit_gc.pyx:
--------------------------------------------------------------------------------
 1 | import pysam
 2 | import numpy
 3 | import math
 4 | from joblib import Parallel, delayed
 5 | 
 6 | def binned_gc(fasta_path,contig,bin_size,n_cutoff):
 7 | 	fasta=pysam.FastaFile(fasta_path)
 8 | 	contig_length=fasta.get_reference_length(contig)
 9 | 	number_of_bins=int(math.ceil(contig_length/bin_size))
10 | 	
11 | 	contig_gc=numpy.zeros(number_of_bins,dtype=numpy.int8)
12 | 
13 | 	next_start=0
14 | 	for bin in range(0,number_of_bins):
15 | 		slice=fasta.fetch(contig, next_start, next_start+bin_size)
16 | 		n=0
17 | 		gc=0
18 | 		number_of_chars=0
19 | 
20 | 		for character in slice:
21 | 			number_of_chars += 1
22 | 			if character == "N" or character == "n":
23 | 				n+=1
24 | 			elif character == "C" or character == "c" or character == "G" or character == "g":
25 | 				gc+=1
26 | 
27 | 		if n/bin_size > n_cutoff:
28 | 			contig_gc[bin]=-1
29 | 		else:
30 | 			contig_gc[bin] = round(100*gc/number_of_chars)
31 | 
32 | 		next_start+=bin_size
33 | 	return([contig,contig_gc])
34 | 
35 | def main(reference,contigs,threads,bin_size,n_cutoff):
36 | 	gc_list=Parallel(n_jobs=threads)( delayed(binned_gc)(reference,contig,bin_size,n_cutoff) for contig in contigs)
37 | 
38 | 	gc_dictionary={}
39 | 	for gc in gc_list:
40 | 		gc_dictionary[gc[0]]=gc[1]
41 | 
42 | 	return(gc_dictionary)
43 | 
44 | 
45 | #contigs=["1","2","3","4","5","6","7","8","9","10","11","12","13","14","15","16","17"]
46 | 


--------------------------------------------------------------------------------
/tiddit/tiddit_signal.pyx:
--------------------------------------------------------------------------------
  1 | import pysam
  2 | import sys
  3 | import os
  4 | import itertools
  5 | import time
  6 | from joblib import Parallel, delayed
  7 | from pysam.libcalignmentfile cimport AlignmentFile, AlignedSegment
  8 | 
  9 | import tiddit.tiddit_coverage as tiddit_coverage
 10 | 
 11 | def find_SA_query_range(SA):
 12 | 	cdef a =pysam.AlignedSegment()
 13 | 	a.reference_start=int( SA[1] )
 14 | 
 15 | 	if SA[2] == "+":
 16 | 		a.flag = 64
 17 | 	else:
 18 | 		a.flag = 80
 19 | 
 20 | 	cdef list SA_cigar=[]
 21 | 	SC = ["".join(x) for _, x in itertools.groupby(SA[3], key=str.isdigit)]		
 22 | 
 23 | 	cdef dict s_to_op={"M":0,"S":4,"H":5,"D":2,"I":1}
 24 | 	for i in range(0,int(len(SC)/2)):
 25 | 		op=s_to_op[SC[i*2+1]]
 26 | 		SA_cigar.append( (op,int(SC[i*2])) )
 27 | 
 28 | 	a.cigar = tuple(SA_cigar)
 29 | 	return(a)
 30 | 
 31 | def SA_analysis(read,min_q,tag,reference_name):
 32 | 	#print(read.query_alignment_start,read.query_alignment_end,read.is_reverse,read.cigarstring)
 33 | 	suplementary_alignments=read.get_tag(tag).rstrip(";").split(";")
 34 | 	
 35 | 	if len(suplementary_alignments) > 1:
 36 | 		SA_lengths=[]
 37 | 		ok_q=[]
 38 | 		for i in range(0,len(suplementary_alignments)):
 39 | 			SA_data=suplementary_alignments[0].split(",")
 40 | 			if int(SA_data[4]) >= min_q:
 41 | 				ok_q.append(i)
 42 | 				supplementry_alignment=find_SA_query_range(SA_data)
 43 | 				SA_lengths.append(supplementry_alignment.query_alignment_end-supplementry_alignment.query_alignment_start)
 44 | 
 45 | 		longest_aln=0
 46 | 		for i in range(0,len(ok_q)):
 47 | 			if SA_lengths[i] > SA_lengths[longest_aln]:
 48 | 				longest_aln=i
 49 | 
 50 | 		#all alignments fail quality treshold
 51 | 		if len(ok_q) == 0:
 52 | 			return()
 53 | 
 54 | 		#only one SA pass mapping quality treshold		
 55 | 		elif len(ok_q) == 1:
 56 | 			suplementary_alignments[0]=suplementary_alignments[ ok_q[0] ]
 57 | 		#many SA pass mapping quality treshold, pick the longest alignment.
 58 | 		else:
 59 | 			suplementary_alignments[0]=suplementary_alignments[ longest_aln ]
 60 | 
 61 | 	SA_data=suplementary_alignments[0].split(",")
 62 | 	SA_pos=int(SA_data[1])
 63 | 
 64 | 	if int(SA_data[4]) < min_q:
 65 | 		return()
 66 | 
 67 | 	cdef long read_query_alignment_end=read.query_alignment_end
 68 | 
 69 | 	clip_before=False
 70 | 
 71 | 	supplementry_alignment=find_SA_query_range(SA_data)
 72 | 	if supplementry_alignment.query_alignment_start < read.query_alignment_start:
 73 | 		clip_before=True
 74 | 
 75 | 	if not clip_before:
 76 | 		if read.is_reverse:
 77 | 
 78 | 
 79 | 			split_pos=read.reference_start+1
 80 | 		else:
 81 | 			split_pos=read.reference_end+1		
 82 | 	else:
 83 | 		if read.is_reverse:
 84 | 			split_pos=read.reference_end+1
 85 | 		else:
 86 | 			split_pos=read.reference_start+1
 87 | 
 88 | 	SA_chr=SA_data[0]
 89 | 
 90 | 	startA=read.reference_start+1
 91 | 	endA=read.reference_end+1
 92 | 
 93 | 	startB=supplementry_alignment.reference_start
 94 | 	endB=supplementry_alignment.reference_end
 95 | 
 96 | 	if clip_before:
 97 | 		if SA_data[2] == "-":
 98 | 
 99 | 			SA_split_pos=supplementry_alignment.reference_start
100 | 		else:
101 | 			SA_split_pos=supplementry_alignment.reference_end
102 | 	else:
103 | 		if SA_data[2] == "-":
104 | 			SA_split_pos=supplementry_alignment.reference_end
105 | 
106 | 		else:
107 | 			SA_split_pos=supplementry_alignment.reference_start
108 | 
109 | 
110 | 	if SA_chr < reference_name:
111 | 		chrA=SA_chr
112 | 		chrB=reference_name
113 | 		tmp=split_pos
114 | 		split_pos=SA_split_pos
115 | 		SA_split_pos=tmp
116 | 
117 | 		startB=read.reference_start+1
118 | 		endB=read.reference_end+1
119 | 		startA=supplementry_alignment.reference_start
120 | 		endA=supplementry_alignment.reference_end
121 | 
122 | 
123 | 	else:
124 | 		chrA=reference_name
125 | 		chrB=SA_chr
126 | 
127 | 		if chrA == chrB:
128 | 			if SA_split_pos < split_pos:
129 | 				tmp=split_pos
130 | 				split_pos=SA_split_pos
131 | 				SA_split_pos=tmp
132 | 
133 | 				startB=read.reference_start+1
134 | 				endB=read.reference_end+1
135 | 				startA=supplementry_alignment.reference_start
136 | 				endA=supplementry_alignment.reference_end
137 | 
138 | 	split=[]
139 | 	if "-" == SA_data[2]:
140 | 		split=[chrA,chrB,read.query_name,split_pos,read.is_reverse,SA_split_pos, True,startA,endA,startB,endB]
141 | 	else:
142 | 		split=[chrA,chrB,read.query_name,split_pos,read.is_reverse,SA_split_pos,False,startA,endA,startB,endB]
143 | 		#splits[chrA][chrB][read.query_name]+=[split_pos,read.is_reverse,SA_split_pos,False]
144 | 
145 | 	return(split)
146 | 
147 | def worker(str chromosome, str bam_file_name,str ref,str prefix,int min_q,int max_ins,str sample_id, int bin_size,skip_index,int min_anchor_len,int min_clip_len):
148 | 	print("Collecting signals on contig: {}".format(chromosome))
149 | 
150 | 	bam_index="{}_tiddit/{}.csi".format(prefix,sample_id)
151 | 	if skip_index:
152 | 		bam_index=False
153 | 
154 | 	cdef AlignmentFile samfile = pysam.AlignmentFile(bam_file_name, "r",reference_filename=ref,index_filename=bam_index)
155 | 	bam_header=samfile.header
156 | 	coverage_data,end_bin_size=tiddit_coverage.create_coverage(bam_header,bin_size,chromosome)	
157 | 
158 | 	cdef list clips=[]
159 | 	cdef list data=[]
160 | 	cdef list splits=[]
161 | 
162 | 	cdef int clip_dist=100
163 | 
164 | 	cdef long read_position
165 | 	cdef long read_end
166 | 	cdef int mapq
167 | 	cdef AlignedSegment read
168 | 
169 | 	for read in samfile.fetch(chromosome,until_eof=True):
170 | 
171 | 		if read.is_unmapped or read.is_duplicate:
172 | 			continue
173 | 
174 | 		read_chromosome=read.reference_name
175 | 		mate_chromosome=read.next_reference_name
176 | 		read_position=read.reference_start
177 | 		read_end=read.reference_end
178 | 		read_mapq=read.mapq
179 | 		read_supplementary=read.is_supplementary
180 | 
181 | 		if read_mapq >= min_q:
182 | 			coverage_data=tiddit_coverage.update_coverage(read_position,read_end,bin_size,coverage_data,end_bin_size)
183 | 
184 | 		if read_supplementary or read.is_secondary:
185 | 			continue
186 | 
187 | 
188 | 		if read_mapq < min_q:
189 | 			continue
190 | 
191 | 		if ( abs(read.isize) < max_ins and mate_chromosome == read_chromosome ):
192 | 			cigar_tuple=read.cigartuples
193 | 			if (cigar_tuple[0][0] == 4 and cigar_tuple[0][1] > min_clip_len) and (cigar_tuple[-1][0] == 0 and cigar_tuple[-1][1] > min_anchor_len):
194 | 				clips.append([">{}|{}|{}\n".format(read.query_name,read_chromosome,read_position+1),read.query_sequence+"\n"])
195 | 
196 | 			elif cigar_tuple[-1][0] == 4 and cigar_tuple[-1][1] > min_clip_len and (cigar_tuple[0][0] == 0 and cigar_tuple[0][1] > min_anchor_len):
197 | 				clips.append([">{}|{}|{}\n".format(read.query_name,read_chromosome,read_position+1),read.query_sequence+"\n"])
198 | 
199 | 		if read.has_tag("SA"):
200 | 			split=SA_analysis(read,min_q,"SA",read_chromosome)
201 | 			if split:
202 | 				splits.append(split)
203 | 
204 | 		if read.mate_is_unmapped:
205 | 			continue
206 | 
207 | 		if not read.is_paired:
208 | 			continue
209 | 
210 | 
211 | 		if ( abs(read.isize) > max_ins or mate_chromosome != read_chromosome ):
212 | 			read_query_name=read.query_name
213 | 
214 | 			if mate_chromosome < read_chromosome:
215 | 				chrA=mate_chromosome
216 | 				chrB=read_chromosome
217 | 			else:
218 | 				chrA=read_chromosome
219 | 				chrB=mate_chromosome
220 | 
221 | 			data.append([chrA,chrB,read_query_name,read_position+1,read_end+1,read.is_reverse,read_chromosome])
222 | 
223 | 	f=open("{}_tiddit/clips/{}.fa".format(prefix,chromosome),"w")
224 | 	for clip in clips:
225 | 		f.write("".join(clip))
226 | 	f.close()
227 | 
228 | 	return(chromosome,data,splits,coverage_data, "{}_tiddit/clips/{}.fa".format(prefix,chromosome) )
229 | 
230 | def main(str bam_file_name,str ref,str prefix,int min_q,int max_ins,str sample_id, int threads, int min_contig,skip_index,int min_anchor_len,int min_clip_len):
231 | 
232 | 	cdef AlignmentFile samfile = pysam.AlignmentFile(bam_file_name, "r",reference_filename=ref)
233 | 	bam_header=samfile.header
234 | 	samfile.close()
235 | 	cdef int bin_size=50
236 | 	file_type="wig"
237 | 	outfile=prefix+".tiddit_coverage.wig"
238 | 
239 | 	cdef long t_tot=0
240 | 
241 | 	cdef dict data={}
242 | 	cdef dict splits={}
243 | 	cdef dict coverage_data={}
244 | 	cdef list clip_fasta=[]
245 | 	chromosomes=[]
246 | 	
247 | 	for chrA in bam_header["SQ"]:
248 | 		if chrA["LN"] < min_contig:
249 | 			continue
250 | 
251 | 		chromosomes.append(chrA["SN"])
252 | 		data[chrA["SN"]]={}
253 | 		splits[chrA["SN"]]={}
254 | 		for chrB in bam_header["SQ"]:
255 | 			data[chrA["SN"]][chrB["SN"]]={}
256 | 			splits[chrA["SN"]][chrB["SN"]]={}
257 | 
258 | 	t=time.time()
259 | 	res=Parallel(n_jobs=threads)( delayed(worker)(chromosome,bam_file_name,ref,prefix,min_q,max_ins,sample_id,bin_size,skip_index,min_anchor_len,min_clip_len) for chromosome in chromosomes )
260 | 
261 | 	chromosomes=set(chromosomes)
262 | 	for i in range(0,len(res)):
263 | 		coverage_data[ res[i][0] ] = res[i][3]
264 | 
265 | 		if not res[i][0] in chromosomes:
266 | 			continue
267 | 
268 | 		for signal in res[i][1]:
269 | 			if not signal[0] in data:
270 | 				continue
271 | 	
272 | 			if not signal[2] in data[ signal[0] ][ signal[1] ]:
273 | 				data[ signal[0] ][signal[1]][signal[2]]=[]
274 | 			data[ signal[0] ][signal[1]][signal[2]].append(signal[3:])
275 | 
276 | 		for signal in res[i][2]:
277 | 			if not signal[0] in splits:
278 | 				continue
279 | 
280 | 			if not signal[2] in splits[ signal[0] ][ signal[1] ]:
281 | 				splits[ signal[0] ][signal[1]][signal[2]]=[]
282 | 			splits[ signal[0] ][signal[1]][signal[2]]+=signal[3:]
283 | 
284 | 		clip_fasta.append(res[i][4])
285 | 
286 | 	t_tot=time.time()-t
287 | 
288 | 	print("total",t_tot)
289 | 	#print("coverage",t_update)
290 | 	#print("split",t_split)
291 | 	#print("disc",t_disc)
292 | 
293 | 	#print("writing coverage wig")
294 | 	#tiddit_coverage.print_coverage(coverage_data,bam_header,bin_size,file_type,outfile)
295 | 
296 | 	print("Writing signals to file")
297 | 
298 | 	f=open("{}_tiddit/discordants_{}.tab".format(prefix,sample_id),"w")
299 | 
300 | 	for chrA in data:
301 | 		for chrB in data[chrA]:
302 | 			for fragment in data[chrA][chrB]:
303 | 				if len(data[chrA][chrB][fragment]) < 2:
304 | 					continue
305 | 
306 | 				if chrA == chrB:
307 | 					if data[chrA][chrB][fragment][1][-1] < data[chrA][chrB][fragment][0][-1]:
308 | 						out=data[chrA][chrB][fragment][1][0:-1]+data[chrA][chrB][fragment][0][0:-1]
309 | 					else:
310 | 						out=data[chrA][chrB][fragment][0][0:-1]+data[chrA][chrB][fragment][1][0:-1]
311 | 				else:
312 | 					if data[chrA][chrB][fragment][0][-1] == chrA:
313 | 						out=data[chrA][chrB][fragment][0][0:-1]+data[chrA][chrB][fragment][1][0:-1]
314 | 					else:
315 | 						out=data[chrA][chrB][fragment][1][0:-1]+data[chrA][chrB][fragment][0][0:-1]
316 | 
317 | 				f.write("{}\t{}\t{}\t{}\n".format(fragment,chrA,chrB,"\t".join(map(str, out )))  )
318 | 	f.close()
319 | 
320 | 	f=open("{}_tiddit/splits_{}.tab".format(prefix,sample_id),"w")
321 | 
322 | 	for chrA in splits:
323 | 		for chrB in splits[chrA]:
324 | 			for fragment in splits[chrA][chrB]:
325 | 				f.write("{}\t{}\t{}\t{}\n".format(fragment,chrA,chrB,"\t".join(map(str, splits[chrA][chrB][fragment] )))  )
326 | 	f.close()
327 | 
328 | 	f=open("{}_tiddit/clips_{}.fa".format(prefix,sample_id),"w")
329 | 	for clips in clip_fasta:
330 | 		for clip in open(clips):
331 | 			f.write(clip)
332 | 	f.close()
333 | 
334 | 	return(coverage_data)
335 | 


--------------------------------------------------------------------------------
/tiddit/tiddit_stats.py:
--------------------------------------------------------------------------------
 1 | import pysam
 2 | import numpy
 3 | import time
 4 | 
 5 | def statistics(bam_file_name,ref,min_mapq,max_ins_len,n_reads):
 6 | 	library={}
 7 | 	samfile = pysam.AlignmentFile(bam_file_name, "r",reference_filename=ref)
 8 | 
 9 | 	insert_size=[]
10 | 	read_length=[]
11 | 	is_innie=0
12 | 	is_outtie=0
13 | 
14 | 	n_sampled=0
15 | 	t=time.time()
16 | 
17 | 	for read in samfile.fetch():
18 | 
19 | 		read_length.append( read.query_length )	
20 | 		n_sampled+=1
21 | 
22 | 		if n_sampled > n_reads:
23 | 			break
24 | 
25 | 		if read.mate_is_unmapped:
26 | 			continue
27 | 
28 | 		if read.is_reverse == read.mate_is_reverse:
29 | 			continue
30 | 
31 | 		if read.next_reference_name != read.reference_name or (read.template_length > max_ins_len):
32 | 			continue
33 | 
34 | 		if read.next_reference_start < read.reference_start:
35 | 			continue	
36 | 
37 | 		if read.is_supplementary or read.is_secondary or read.is_duplicate or read.mapq < min_mapq:
38 | 			continue
39 | 
40 | 		insert_size.append( read.template_length )
41 | 
42 | 		if read.is_reverse and not read.mate_is_reverse:
43 | 			is_outtie+=1
44 | 		else:
45 | 			is_innie+=1
46 | 
47 | 
48 | 	samfile.close()
49 | 
50 | 	library["avg_read_length"]=numpy.average(read_length)
51 | 	if len(insert_size):
52 | 		library["avg_insert_size"]=numpy.average(insert_size)
53 | 		library["std_insert_size"]=numpy.std(insert_size)
54 | 		library["percentile_insert_size"]=numpy.percentile(insert_size, 99.9)
55 | 	else:
56 | 		library["avg_insert_size"]=0
57 | 		library["std_insert_size"]=0
58 | 		library["percentile_insert_size"]=0
59 | 
60 | 		
61 | 
62 | 	print("LIBRARY STATISTICS")
63 | 	if is_innie > is_outtie:
64 | 		library["mp"]=False
65 | 		print("\tPair orientation = Forward-Reverse")
66 | 	else:
67 | 		print("\tPair orientation = Reverse-Forward")
68 | 		library["mp"]=True
69 | 
70 | 
71 | 	print("\tAverage Read length = {}".format(library["avg_read_length"]) )
72 | 	print("\tAverage insert size = {}".format(library["avg_insert_size"]) )
73 | 	print("\tStdev insert size = {}".format(library["std_insert_size"] ) )
74 | 	print("\t99.95 percentile insert size = {}".format( library["percentile_insert_size"]) )
75 | 	print("Calculated statistics in: " + str( t-time.time() ))
76 | 	print("")
77 | 
78 | 	return(library)
79 | 


--------------------------------------------------------------------------------
/tiddit/tiddit_variant.pyx:
--------------------------------------------------------------------------------
  1 | import time
  2 | import math
  3 | import numpy
  4 | from joblib import Parallel, delayed
  5 | 
  6 | import pysam
  7 | from pysam.libcalignmentfile cimport AlignmentFile, AlignedSegment
  8 | 
  9 | def percentile(a, q):
 10 | 	size = len(a)
 11 | 	percentiles=[]
 12 | 
 13 | 	sorted_a=sorted(a)
 14 | 
 15 | 	for v in q:
 16 | 		if not size:
 17 | 			percentiles.append(0)
 18 | 		else:
 19 | 			percentiles.append( sorted_a[ int(math.ceil((size * v) / 100.0)) - 1 ] )
 20 | 
 21 | 	return(percentiles)
 22 | 
 23 | 
 24 | def scoring(scoring_dict,percentiles):
 25 | 	score=[0]
 26 | 	if scoring_dict["n_contigs"]:
 27 | 		score.append(50)
 28 | 
 29 | 	if scoring_dict["n_discordants"]:
 30 | 		score.append(0)
 31 | 		for p in percentiles["FA"]:
 32 | 			if scoring_dict["n_discordants"]/(scoring_dict["refFA"]+scoring_dict["n_discordants"]) >= p:
 33 | 				score[-1]+=5
 34 | 
 35 | 		score.append(0)
 36 | 		for p in percentiles["FB"]:
 37 | 			if scoring_dict["n_discordants"]/(scoring_dict["refFB"]+scoring_dict["n_discordants"]) >= p:
 38 | 				score[-1]+=5
 39 | 
 40 | 
 41 | 	if scoring_dict["n_splits"]:
 42 | 		score.append(0)
 43 | 		for p in percentiles["RA"]:
 44 | 			if scoring_dict["n_splits"]/(scoring_dict["refRA"]+scoring_dict["n_splits"]) >= p:
 45 | 				score[-1]+=5
 46 | 
 47 | 		score.append(0)
 48 | 		for p in percentiles["RB"]:
 49 | 			if scoring_dict["n_splits"]/(scoring_dict["refRB"]+scoring_dict["n_splits"]) >= p:
 50 | 				score[-1]+=5
 51 | 
 52 | 	return(max(score))
 53 | 
 54 | def get_region(AlignmentFile samfile,str chr,int start,int end,int bp,int min_q,int max_ins, contig_number):
 55 | 
 56 | 	cdef int low_q=0
 57 | 	cdef int n_reads=0
 58 | 	cdef long bases=0
 59 | 	cdef int n_discs=0
 60 | 	cdef int n_splits=0
 61 | 
 62 | 	cdef int crossing_r=0
 63 | 	cdef int crossing_f=0
 64 | 
 65 | 	bam_header=samfile.header
 66 | 	contig_length=bam_header["SQ"][contig_number[chr]]["LN"]
 67 | 
 68 | 	q_start=start
 69 | 	q_end=end+max_ins
 70 | 
 71 | 	if q_end > contig_length:
 72 | 		q_end=contig_length
 73 | 
 74 | 	if q_start >= q_end:
 75 | 		q_start=q_end-10
 76 | 
 77 | 	cdef long read_reference_start
 78 | 	cdef long read_reference_end
 79 | 
 80 | 	cdef long r_start
 81 | 	cdef long r_end
 82 | 
 83 | 	cdef AlignedSegment read
 84 | 
 85 | 	for read in samfile.fetch(chr, q_start, q_end):
 86 | 		if read.is_unmapped:
 87 | 			continue
 88 | 
 89 | 		read_reference_start=read.reference_start
 90 | 
 91 | 		if not read.mate_is_unmapped:
 92 | 			if read.next_reference_start > end and read_reference_start > end:
 93 | 				continue
 94 | 		else:
 95 | 			if read_reference_start > end:
 96 | 				continue
 97 | 
 98 | 		if read.is_duplicate:
 99 | 			continue
100 | 
101 | 		if not (read_reference_start > end):
102 | 			n_reads+=1
103 | 			if read.mapq < min_q:
104 | 				low_q+=1
105 | 
106 | 		if read.mapq < min_q:
107 | 			continue
108 | 
109 | 		read_reference_end=read.reference_end
110 | 		read_reference_name=read.reference_name
111 | 		read_next_reference_name=read.next_reference_name
112 | 
113 | 		r_start=read_reference_start
114 | 		r_end=read_reference_end
115 | 
116 | 		if read_reference_start < bp-20 and r_end > bp+20:
117 | 			crossing_r+=1
118 | 
119 | 		mate_bp_read= (read.next_reference_start < bp-50 and r_end > bp+50)
120 | 		discordant= ( abs(read.isize) > max_ins or read_next_reference_name != read_reference_name )
121 | 
122 | 		if mate_bp_read and not discordant:
123 | 			crossing_f+=1
124 | 
125 | 		if read_reference_end < start:
126 | 			continue
127 | 		elif read_reference_start > end:
128 | 			continue
129 | 
130 | 		if read_reference_start < start:
131 | 			r_start=start
132 | 
133 | 		if read_reference_end > end:
134 | 			r_end=end
135 | 
136 | 		bases+=r_end-r_start+1
137 | 
138 | 		if read.has_tag("SA"):
139 | 			n_splits+=1
140 | 
141 | 		if discordant:
142 | 			n_discs+=1
143 | 
144 | 	coverage= bases/(end-start+1)
145 | 
146 | 	if n_reads > 0:
147 | 		frac_low_q=low_q/float(n_reads)
148 | 	else:
149 | 		frac_low_q=0
150 | 
151 | 	return(coverage,frac_low_q,n_discs,n_splits,crossing_f,crossing_r)
152 | 
153 | def find_sv_type(chrA,chrB,inverted,non_inverted,args,sample_data,samples,library):
154 | 	if chrA != chrB:
155 | 		return("BND",".")
156 | 
157 | 	p=library["contig_ploidy_{}".format(chrA)]
158 | 	for sample in samples:
159 | 		if library[ "avg_coverage_{}".format(chrA) ] != 0:
160 | 			cn=int(round(sample_data[sample]["covM"]*p/library[ "avg_coverage_{}".format(chrA) ]))
161 | 		else:
162 | 			cn=int(round(sample_data[sample]["covM"]*args.n/library[ "avg_coverage" ]))
163 | 
164 | 
165 | 	#mitochondria or similar
166 | 	if p > args.n*10:
167 | 		if cn > p*1.05:
168 | 			if inverted:
169 | 				return("DUP:INV",cn)
170 | 			else:
171 | 				return("DUP:TANDEM",cn)
172 | 		elif cn < p*0.95:
173 | 			return("DEL",cn)
174 | 		elif inverted > non_inverted:
175 | 			return("INV",cn)
176 | 		else:
177 | 			return("BND",cn)
178 | 	else:
179 | 		if cn > p:
180 | 			if inverted:
181 | 				return("DUP:INV",cn)
182 | 			else:
183 | 				return("DUP:TANDEM",cn)
184 | 
185 | 		if inverted > non_inverted:
186 | 			return("INV",cn)
187 | 		elif cn < p:
188 | 			return("DEL",cn)
189 | 		else:
190 | 			return("BND",cn)
191 | 
192 | def sv_filter(sample_data,args,chrA,chrB,posA,posB,max_ins_len,n_discordants,n_splits,library,n_discs_regionA,n_discs_regionB,n_splits_regionA,n_splits_regionB,n_contigs):
193 | 	filt="PASS"
194 | 	for sample in sample_data:
195 | 
196 | 		#filter SV in high coverage regions
197 | 		if sample_data[sample]["covA"] > args.max_coverage*library[ "avg_coverage_{}".format(chrA) ]:
198 | 			return("UnexpectedCoverage")
199 | 		if sample_data[sample]["covB"] > args.max_coverage*library[ "avg_coverage_{}".format(chrB) ]:
200 | 			return("UnexpectedCoverage")
201 | 		if sample_data[sample]["covM"] > args.max_coverage*library[ "avg_coverage_{}".format(chrB) ]:
202 | 			return("UnexpectedCoverage")
203 | 
204 | 		#if abs(posA-posB) > max_ins_len:
205 | 		if not n_contigs:
206 | 			if (n_discordants < args.p_ratio * sample_data[sample]["refFA"] or n_discordants < args.p_ratio * sample_data[sample]["refFB"]) and (n_splits < args.r_ratio * sample_data[sample]["refRA"] or n_splits < args.r_ratio * sample_data[sample]["refRB"]):
207 | 				return("BelowExpectedLinks")
208 | 
209 | 			if (n_discordants < args.p_ratio * sample_data[sample]["covA"] or n_discordants < args.p_ratio * sample_data[sample]["covB"]) and (n_splits < args.r_ratio * sample_data[sample]["covA"] or n_splits < args.r_ratio * sample_data[sample]["covB"]):
210 | 				return("BelowExpectedLinks")
211 | 
212 | 			if n_discordants > n_splits:
213 | 				if n_discordants < 0.25*n_discs_regionA or n_discordants < 0.25*n_discs_regionB:
214 | 					return("FewLinks")
215 | 
216 | 			else:
217 | 				if n_splits < 0.25*n_splits_regionA or n_splits < 0.25*n_splits_regionB:
218 | 					return("FewLinks")
219 | 
220 | 			if sample_data[sample]["QA"] > 0.2 or sample_data[sample]["QB"] > 0.2:
221 | 				return("RegionalQ")
222 | 
223 | 			if n_discordants == 0 and (chrA != chrB):
224 | 				return("SplitsVSDiscs")
225 | 
226 | 		#interchromsomal translocation, supported only by contigs
227 | 		if n_contigs and (chrA != chrB):
228 | 			if n_discordants < args.p:
229 | 				return("BelowExpectedLinks")
230 | 
231 | 		#large variant, supported only by contigs but not discordant pairs
232 | 		elif n_contigs and (chrA == chrB and max_ins_len*3 < abs(posB-posA) ):
233 | 			if n_discordants < args.p:
234 | 				return("BelowExpectedLinks")
235 | 
236 | 	return(filt)
237 | 
238 | def define_variant(str chrA, str bam_file_name,dict sv_clusters,args,dict library,int min_mapq,samples,dict coverage_data,contig_number,max_ins_len,contig_seqs,gc):
239 | 	cdef AlignmentFile samfile  = AlignmentFile(bam_file_name, "r",reference_filename=args.ref,index_filename="{}_tiddit/{}.csi".format(args.o,samples[0]))
240 | 	variants=[]
241 | 
242 | 	var_n=0
243 | 	for chrB in sv_clusters[chrA]:
244 | 
245 | 		for cluster in sv_clusters[chrA][chrB]:
246 | 
247 | 			n_discordants=sv_clusters[chrA][chrB][cluster]["N_discordants"]
248 | 			n_splits=sv_clusters[chrA][chrB][cluster]["N_splits"]
249 | 			n_contigs=sv_clusters[chrA][chrB][cluster]["N_contigs"]
250 | 
251 | 			if (n_discordants < args.p and n_splits < args.r) and not n_contigs:
252 | 					continue
253 | 
254 | 			posA=sv_clusters[chrA][chrB][cluster]["posA"]
255 | 			posB=sv_clusters[chrA][chrB][cluster]["posB"]
256 | 
257 | 			if chrA == chrB and posA > posB:
258 | 				posT=posA
259 | 				posA=posB
260 | 				posB=posT
261 | 
262 | 			if chrA == chrB and abs(posA-posB) < args.z:
263 | 				continue
264 | 
265 | 			s=int(math.floor(sv_clusters[chrA][chrB][cluster]["startA"]/50.0))
266 | 			e=int(math.floor(sv_clusters[chrA][chrB][cluster]["endA"]/50.0))+1
267 | 			avg_a=numpy.average(coverage_data[chrA][s:e])
268 | 
269 | 			if avg_a > args.max_coverage*library[ "avg_coverage_{}".format(chrA) ]:
270 | 				continue
271 | 			elif (args.max_coverage*n_discordants/avg_a < args.p_ratio/2 and args.max_coverage*n_splits/avg_a < args.r_ratio/2) and not n_contigs:
272 | 				continue
273 | 
274 | 			s=int(math.floor(sv_clusters[chrA][chrB][cluster]["startB"]/50.0))
275 | 			e=int(math.floor(sv_clusters[chrA][chrB][cluster]["endB"]/50.0))+1
276 | 
277 | 			avg_b=numpy.average(coverage_data[chrB][s:e])
278 | 
279 | 			if avg_b == 0:
280 | 				continue
281 | 			elif avg_b > args.max_coverage*library[ "avg_coverage_{}".format(chrB) ]:
282 | 				continue
283 | 			elif (args.max_coverage*n_discordants/avg_b < args.p_ratio/2 and args.max_coverage*n_splits/avg_b < args.r_ratio/2) and not n_contigs:
284 | 				continue
285 | 
286 | 			var_n+=1
287 | 			sample_data={}
288 | 			for sample in samples:
289 | 
290 | 				coverageA,frac_low_qA,n_discsA,n_splitsA,crossing_f_A,crossing_r_A=get_region(samfile,chrA,sv_clusters[chrA][chrB][cluster]["startA"],sv_clusters[chrA][chrB][cluster]["endA"],posA,min_mapq,max_ins_len,contig_number)
291 | 				coverageB,frac_low_qB,n_discsB,n_splitsB,crossing_f_B,crossing_r_B=get_region(samfile,chrB,sv_clusters[chrA][chrB][cluster]["startB"],sv_clusters[chrA][chrB][cluster]["endB"],posB,min_mapq,max_ins_len,contig_number)
292 | 
293 | 				sample_data[sample]={}
294 | 				sample_data[sample]={"covA":coverageA,"QA":frac_low_qA,"discA":n_discsA,"splitA":n_splitsA,"refRA":crossing_r_A,"refFA":crossing_f_A}
295 | 				sample_data[sample].update({"covB":coverageB,"QB":frac_low_qB,"discB":n_discsB,"splitB":n_splitsB,"refRB":crossing_r_B,"refFB":crossing_f_B})
296 | 
297 | 				if chrA != chrB:
298 | 					sample_data[sample]["covM"]=0
299 | 				elif abs(posB - posA) < 1000:
300 | 					if posA < posB:
301 | 						coverageM,_,_,_,_,_=get_region(samfile,chrA,posA,posB,posA,min_mapq,max_ins_len,contig_number)
302 | 					else:
303 | 						coverageM,_,_,_,_,_=get_region(samfile,chrA,posB,posA,posB,min_mapq,max_ins_len,contig_number)
304 | 
305 | 					sample_data[sample]["covM"]=coverageM
306 | 				else:
307 | 					s=int(math.floor(posA/50.0))
308 | 					e=int(math.floor(posB/50.0))+1
309 | 					coverage_between=coverage_data[chrA][s:e]
310 | 					gc_between=gc[chrA][s:e]
311 | 					coverage_between=coverage_between[ gc_between > -1 ]
312 | 					if len(coverage_between) > 4:
313 | 						sample_data[sample]["covM"]=numpy.average(coverage_between)
314 | 					else:
315 | 						sample_data[sample]["covM"]=library[ "avg_coverage_{}".format(chrA) ]
316 | 						
317 | 
318 | 			inverted=0
319 | 			non_inverted=0
320 | 			for i in range(0,len(sv_clusters[chrA][chrB][cluster]["positions_A"]["orientation_discordants"]) ):
321 | 				if sv_clusters[chrA][chrB][cluster]["positions_A"]["orientation_discordants"][i] == sv_clusters[chrA][chrB][cluster]["positions_B"]["orientation_discordants"][i]:
322 | 					inverted+=1
323 | 				else:
324 | 					non_inverted+=1
325 | 
326 | 			for i in range(0,len(sv_clusters[chrA][chrB][cluster]["positions_A"]["orientation_splits"]) ):
327 | 				if not sv_clusters[chrA][chrB][cluster]["positions_A"]["orientation_splits"][i] == sv_clusters[chrA][chrB][cluster]["positions_B"]["orientation_splits"][i]:
328 | 					inverted+=1
329 | 				else:
330 | 					non_inverted+=1
331 | 
332 | 			for i in range(0,len(sv_clusters[chrA][chrB][cluster]["positions_A"]["orientation_contigs"]) ):
333 | 				if not sv_clusters[chrA][chrB][cluster]["positions_A"]["orientation_contigs"][i] == sv_clusters[chrA][chrB][cluster]["positions_B"]["orientation_contigs"][i]:
334 | 					inverted+=1
335 | 				else:
336 | 					non_inverted+=1
337 | 
338 | 			svtype,cn=find_sv_type(chrA,chrB,inverted,non_inverted,args,sample_data,samples,library)
339 | 
340 | 			filt=sv_filter(sample_data,args,chrA,chrB,posA,posB,max_ins_len,n_discordants,n_splits,library,sample_data[sample]["discA"],sample_data[sample]["discB"],sample_data[sample]["splitA"],sample_data[sample]["splitB"],n_contigs)
341 | 			format_col="GT:CN:COV:DV:RV:LQ:RR:DR"
342 | 
343 | 			#configure filters for CNV based on Read depth
344 | 			for sample in samples:
345 | 
346 | 				covA=sample_data[sample]["covA"]
347 | 				covM=sample_data[sample]["covM"]
348 | 				covB=sample_data[sample]["covB"]
349 | 
350 | 				if "DEL" in svtype:
351 | 					#homozygout del based on coverage
352 | 					if cn == 0:
353 | 						filt="PASS"
354 | 
355 | 
356 | 					#normal coverage on the flanking regions, abnormal inbetween
357 | 					if covA > covM*(cn+0.9) and covB > covM*(cn+0.9):
358 | 						filt="PASS"
359 | 
360 | 				#too few reads, but clear DR signal
361 | 				elif "DUP" in svtype and filt == "BelowExpectedLinks":
362 | 					filt="PASS"
363 | 				scoring_dict={"n_contigs":n_contigs, "n_discordants":n_discordants,"n_splits":n_splits,"covA":covA,"covM":covM,"covB":covB,"refRA":sample_data[sample]["refRA"],"refRB":sample_data[sample]["refRB"],"refFA":sample_data[sample]["refFA"],"refFB":sample_data[sample]["refFB"]}
364 | 
365 | 			if svtype != "BND":
366 | 				info=["SVTYPE={}".format(svtype),"SVLEN={}".format(posB-posA),"END={}".format(posB)]
367 | 				alt="<{}>".format(svtype)
368 | 
369 | 				info+=["REGIONA={},{}".format(sv_clusters[chrA][chrB][cluster]["startA"],sv_clusters[chrA][chrB][cluster]["endA"])]
370 | 				info+=["REGIONB={},{}".format(sv_clusters[chrA][chrB][cluster]["startB"],sv_clusters[chrA][chrB][cluster]["endB"])]
371 | 				info+=["LFA={},{}".format(sample_data[sample]["discA"],sample_data[sample]["splitA"])]
372 | 				info+=["LFB={},{}".format(sample_data[sample]["discB"],sample_data[sample]["splitB"])]
373 | 				info+=["LTE={},{}".format(n_discordants,n_splits)]
374 | 
375 | 				if n_contigs:
376 | 					for c in sv_clusters[chrA][chrB][cluster]["contigs"]:
377 | 						if "_d_" in c:
378 | 							c=c.split("_d_")[0]
379 | 
380 | 						ctgs=[ contig_seqs[c] ]
381 | 					info+=["CTG={}".format("|".join(ctgs) )]
382 | 
383 | 				else:
384 | 					info+=["CTG=."]
385 | 
386 | 
387 | 
388 | 				info=";".join(info)
389 | 				variant=[chrA,str(posA),"SV_{}_1".format(var_n),"N",alt,".",filt,info,format_col]
390 | 				for sample in samples:
391 | 					GT="./."
392 | 
393 | 					if len(sv_clusters[chrA][chrB][cluster]["sample_splits"][sample]) >= args.r or len(sv_clusters[chrA][chrB][cluster]["sample_discordants"][sample]) >= args.p:
394 | 						GT="0/1"
395 | 					if sample_data[sample]["refRB"] < 0.1*len(sv_clusters[chrA][chrB][cluster]["sample_splits"][sample]) or sample_data[sample]["refRA"] < 0.1*len(sv_clusters[chrA][chrB][cluster]["sample_splits"][sample]):
396 | 						GT="1/1"
397 | 					if sample_data[sample]["refFB"] < 0.1*len(sv_clusters[chrA][chrB][cluster]["sample_discordants"][sample]) or sample_data[sample]["refFA"] < 0.1*len(sv_clusters[chrA][chrB][cluster]["sample_discordants"][sample]):
398 | 						GT="1/1"
399 | 					if n_contigs and (not len(sv_clusters[chrA][chrB][cluster]["sample_discordants"][sample]) and not len(sv_clusters[chrA][chrB][cluster]["sample_splits"][sample])):
400 | 						if sample_data[sample]["covB"]:
401 | 							if sample_data[sample]["refRB"]/sample_data[sample]["covB"] < 0.2:
402 | 								GT="1/1"
403 | 							else:
404 | 								GT="0/1"
405 | 						else:
406 | 							GT="1/1"
407 | 
408 | 						if sample_data[sample]["covA"]:
409 | 							if sample_data[sample]["refRA"]/sample_data[sample]["covA"] < 0.2:
410 | 								GT="1/1"
411 | 							else:
412 | 								GT="0/1"
413 | 						else:
414 | 							GT="1/1"
415 | 
416 | 
417 | 					if "DEL" in alt:
418 | 						if cn == 0:
419 | 							GT = "1/1"
420 | 						else:
421 | 							GT= "0/1"
422 | 					elif "DUP" in alt:
423 | 						if cn >= 2*library["contig_ploidy_{}".format(chrA)]:
424 | 							GT= "1/1"
425 | 						else:
426 | 							GT="0/1"
427 | 
428 | 					variant.append( "{}:{}:{},{},{}:{}:{}:{},{}:{},{}:{},{}".format(GT,cn,sample_data[sample]["covA"],sample_data[sample]["covM"],sample_data[sample]["covB"],n_discordants,n_splits,sample_data[sample]["QA"],sample_data[sample]["QB"],sample_data[sample]["refRA"],sample_data[sample]["refRB"],sample_data[sample]["refFA"],sample_data[sample]["refFB"]) )
429 | 				variants.append([chrA,posA,variant,scoring_dict])
430 | 			else:
431 | 				info=["SVTYPE=BND".format(svtype)]
432 | 				inverted=False
433 | 				before=True
434 | 
435 | 				if posA == sv_clusters[chrA][chrB][cluster]["endA"]:
436 | 					before=False
437 | 
438 | 				if inverted  > non_inverted:
439 | 					inverted=True
440 | 
441 | 				if not inverted and not before:
442 | 					alt_str_a="N[{}:{}[".format(chrB,posB)
443 | 					alt_str_b="]{}:{}]N".format(chrA,posA)
444 | 				elif not inverted and before:
445 | 					alt_str_a="]{}:{}]N".format(chrB,posB)
446 | 					alt_str_b="N[{}:{}[".format(chrA,posA)
447 | 				elif inverted and  not before:
448 | 					alt_str_a="N]{}:{}]".format(chrB,posB)
449 | 					alt_str_b="[{}:{}[N".format(chrA,posA)
450 | 				else:
451 | 					alt_str_a="[{}:{}[N".format(chrB,posB)
452 | 					alt_str_b="N]{}:{}]".format(chrA,posA)
453 | 
454 | 				info+=["REGIONA={},{}".format(sv_clusters[chrA][chrB][cluster]["startA"],sv_clusters[chrA][chrB][cluster]["endA"])]
455 | 				info+=["REGIONB={},{}".format(sv_clusters[chrA][chrB][cluster]["startB"],sv_clusters[chrA][chrB][cluster]["endB"])]
456 | 				info+=["LFA={},{}".format(sample_data[sample]["discA"],sample_data[sample]["splitA"])]
457 | 				info+=["LFB={},{}".format(sample_data[sample]["discA"],sample_data[sample]["splitA"])]
458 | 				info+=["LTE={},{}".format(n_discordants,n_splits)]
459 | 
460 | 				if n_contigs:
461 | 					for c in sv_clusters[chrA][chrB][cluster]["contigs"]:
462 | 						if "_d_" in c:
463 | 							c=c.split("_d_")[0]
464 |  
465 | 						ctgs=[ contig_seqs[c] ]
466 | 					info+=["CTG={}".format("|".join(ctgs) )]
467 | 
468 | 				else:
469 | 					info+=["CTG=."]
470 | 
471 | 
472 | 
473 | 				info=";".join(info)
474 | 				variant=[chrA,str(posA),"SV_{}_1".format(var_n),"N",alt_str_a,".",filt,info,format_col]
475 | 				for sample in samples:
476 | 					GT="./."
477 | 					if len(sv_clusters[chrA][chrB][cluster]["sample_splits"][sample]) >= args.r or len(sv_clusters[chrA][chrB][cluster]["sample_discordants"][sample]) >= args.p:
478 | 						GT="0/1"
479 | 					if sample_data[sample]["refRB"] < 0.1*len(sv_clusters[chrA][chrB][cluster]["sample_splits"][sample]) or sample_data[sample]["refRA"] < 0.1*len(sv_clusters[chrA][chrB][cluster]["sample_splits"][sample]):
480 | 						GT="1/1"
481 | 					if sample_data[sample]["refFB"] < 0.1*len(sv_clusters[chrA][chrB][cluster]["sample_discordants"][sample]) or sample_data[sample]["refFA"] < 0.1*len(sv_clusters[chrA][chrB][cluster]["sample_discordants"][sample]):
482 | 						GT="1/1"
483 | 					if n_contigs and (not len(sv_clusters[chrA][chrB][cluster]["sample_discordants"][sample]) and not len(sv_clusters[chrA][chrB][cluster]["sample_splits"][sample])):
484 | 						if sample_data[sample]["covB"]:
485 | 							if sample_data[sample]["refRB"]/sample_data[sample]["covB"] < 0.2:
486 | 								GT="1/1"
487 | 							else:
488 | 								GT="0/1"
489 | 						else:
490 | 								GT="1/1"
491 | 
492 | 						if sample_data[sample]["covA"]:
493 | 							if sample_data[sample]["refRA"]/sample_data[sample]["covA"] < 0.2:
494 | 								GT="1/1"
495 | 							else:
496 | 								GT="0/1"
497 | 
498 | 
499 | 						else:
500 | 								GT="1/1"
501 | 
502 | 
503 | 
504 | 					variant.append( "{}:{}:{},{},{}:{}:{}:{},{}:{},{}:{},{}".format(GT,cn,sample_data[sample]["covA"],sample_data[sample]["covM"],sample_data[sample]["covB"],n_discordants,n_splits,sample_data[sample]["QA"],sample_data[sample]["QB"],sample_data[sample]["refRA"],sample_data[sample]["refRB"],sample_data[sample]["refFA"],sample_data[sample]["refFB"]) )
505 | 				variants.append([chrA,posA,variant,scoring_dict])
506 | 
507 | 
508 | 				variant=[chrB,str(posB),"SV_{}_2".format(var_n),"N",alt_str_b,".",filt,info,format_col]
509 | 				for sample in samples:
510 | 					GT="./."
511 | 					if len(sv_clusters[chrA][chrB][cluster]["sample_splits"][sample]) >= args.r or len(sv_clusters[chrA][chrB][cluster]["sample_discordants"][sample]) >= args.p:
512 | 						GT="0/1"
513 | 					if sample_data[sample]["refRB"] < 0.1*len(sv_clusters[chrA][chrB][cluster]["sample_splits"][sample]) or sample_data[sample]["refRA"] < 0.1*len(sv_clusters[chrA][chrB][cluster]["sample_splits"][sample]):
514 | 						GT="1/1"
515 | 					if sample_data[sample]["refFB"] < 0.1*len(sv_clusters[chrA][chrB][cluster]["sample_discordants"][sample]) or sample_data[sample]["refFA"] < 0.1*len(sv_clusters[chrA][chrB][cluster]["sample_discordants"][sample]):
516 | 						GT="1/1"
517 | 					if n_contigs and (not len(sv_clusters[chrA][chrB][cluster]["sample_discordants"][sample]) and not len(sv_clusters[chrA][chrB][cluster]["sample_splits"][sample])):
518 | 						if sample_data[sample]["covB"]:
519 | 							if sample_data[sample]["refRB"]/sample_data[sample]["covB"] < 0.2:
520 | 								GT="1/1"
521 | 							else:
522 | 								GT="0/1"
523 | 						else:
524 | 								GT="1/1"
525 | 
526 | 						if sample_data[sample]["covA"]:
527 | 							if sample_data[sample]["refRA"]/sample_data[sample]["covA"] < 0.2:
528 | 								GT="1/1"
529 | 							else:
530 | 								GT="0/1"
531 | 						else:
532 | 								GT="1/1"
533 | 
534 | 
535 | 
536 | 
537 | 					variant.append( "{}:{}:{},{},{}:{}:{}:{},{}:{},{}:{},{}".format(GT,cn,sample_data[sample]["covA"],sample_data[sample]["covM"],sample_data[sample]["covB"],n_discordants,n_splits,sample_data[sample]["QA"],sample_data[sample]["QB"],sample_data[sample]["refRA"],sample_data[sample]["refRB"],sample_data[sample]["refFA"],sample_data[sample]["refFB"]) )
538 | 				variants.append([chrB,posB,variant, scoring_dict ])
539 | 
540 | 	samfile.close()
541 | 	return(variants)
542 | 
543 | def main(str bam_file_name,dict sv_clusters,args,dict library,int min_mapq,samples,dict coverage_data,contig_number,max_ins_len,gc):
544 | 	contig_seqs={}
545 | 	new_seq=False
546 | 	if not args.skip_assembly:
547 | 		for line in open("{}_tiddit/clips.fa.assembly.clean.mag".format(args.o)):
548 | 
549 | 			if line[0] == ">":
550 | 				name=line[1:].rstrip()
551 | 			else:
552 | 				contig_seqs[name]=line.strip("\n")
553 | 
554 | 			#if not new_seq and line[0] == "@" and "\t" in line:
555 | 			#	name=line.split("\t")[0][1:]
556 | 			#	new_seq=True
557 | 
558 | 			#elif new_seq:
559 | 			#	contig_seqs[name]=line.strip("\n")
560 | 			#	new_seq=False
561 | 
562 | 
563 | 	variants={}
564 | 	for chrA in sv_clusters:
565 | 		variants[chrA]=[]
566 | 		for chrB in sv_clusters[chrA]:
567 | 			variants[chrB]=[]
568 | 
569 | 	variants_list=Parallel(n_jobs=args.threads,prefer="threads")( delayed(define_variant)(chrA,bam_file_name,sv_clusters,args,library,min_mapq,samples,coverage_data,contig_number,max_ins_len,contig_seqs,gc) for chrA in sv_clusters)
570 | 
571 | 	ratios={"fragments_A":[],"fragments_B":[],"reads_A":[],"reads_B":[]}
572 | 	for v in variants_list:
573 | 		for variant in v:
574 | 			if variant[3]["n_discordants"]:
575 | 				ratios["fragments_A"].append(variant[3]["n_discordants"]/(variant[3]["refFA"]+variant[3]["n_discordants"]) )
576 | 				ratios["fragments_B"].append(variant[3]["n_discordants"]/(variant[3]["refFB"]+variant[3]["n_discordants"]) )
577 | 
578 | 			if variant[3]["n_splits"]:
579 | 				ratios["reads_A"].append(variant[3]["n_splits"]/(variant[3]["refRA"]+variant[3]["n_splits"]) )
580 | 				ratios["reads_B"].append(variant[3]["n_splits"]/(variant[3]["refRB"]+variant[3]["n_splits"]) )
581 | 
582 | 
583 | 	p=[1,5,10,20,30,40,50,60,70,75,80,85,90,95,97.5,99]
584 | 
585 | 	percentiles={"FA":percentile(ratios["fragments_A"],p),"FB":percentile(ratios["fragments_B"],p),"RA":percentile(ratios["reads_A"],p),"RB":percentile(ratios["reads_B"],p)}
586 | 
587 | 	for v in variants_list:
588 | 		for variant in v:	
589 | 			score=scoring(variant[3],percentiles)
590 | 			variant[2][5]=str(score)
591 | 			variants[ variant[0] ].append( [ variant[1],variant[2] ] )
592 | 
593 | 	return(variants)
594 | 


--------------------------------------------------------------------------------
/tiddit/tiddit_vcf_header.py:
--------------------------------------------------------------------------------
 1 | import pysam
 2 | import sys
 3 | 
 4 | def main(bam_header,library,sample_id,version):
 5 | 
 6 | 	vcf_header=[]	
 7 | 
 8 | 	vcf_header.append("##fileformat=VCFv4.1")
 9 | 	vcf_header.append("##source=TIDDIT-" + version)
10 | 
11 |         #declare the events classified by TIDDIT
12 | 
13 | 	vcf_header.append("##ALT=<ID=DEL,Description=\"Deletion\">")
14 | 	vcf_header.append("##ALT=<ID=DUP,Description=\"Duplication\">")
15 | 	vcf_header.append("##ALT=<ID=DUP:TANDEM,Description=\"Tandem duplication\">")
16 | 	vcf_header.append("##ALT=<ID=DUP:INV,Description=\"Inverted tandem duplication\">")
17 | 	vcf_header.append("##ALT=<ID=INV,Description=\"Inversion\">")
18 | 	vcf_header.append("##ALT=<ID=INS,Description=\"Insertion\">")
19 | 	vcf_header.append("##ALT=<ID=BND,Description=\"Break end\">")
20 | 
21 | 	#print chromosomes and length
22 | 	for contig in bam_header["SQ"]:
23 | 		#print(contig)
24 | 		vcf_header.append("##contig=<ID={},length={}>".format(contig["SN"],contig["LN"]) )
25 | 
26 | 	#declare the info field
27 | 
28 | 	vcf_header.append("##INFO=<ID=SVTYPE,Number=1,Type=String,Description=\"Type of structural variant\">")
29 | 	vcf_header.append("##INFO=<ID=END,Number=1,Type=Integer,Description=\"End of an intra-chromosomal variant\">")
30 | 	vcf_header.append("##INFO=<ID=SVLEN,Number=.,Type=Integer,Description=\"Difference in length between REF and ALT alleles\">")
31 | 	vcf_header.append("##INFO=<ID=LFA,Number=2,Type=Integer,Description=\"Read-pairs and split reads in region A\">")
32 | 	vcf_header.append("##INFO=<ID=LFB,Number=2,Type=Integer,Description=\"Read-pairs and split reads in region B\">")
33 | 	vcf_header.append("##INFO=<ID=LTE,Number=2,Type=Integer,Description=\"Read-pairs and split reads supporting the event\">")
34 | 	vcf_header.append("##INFO=<ID=CTG,Number=1,Type=String,Description=\"Sequence of contig\">")
35 | 	vcf_header.append("##INFO=<ID=REGIONA,Number=2,Type=Integer,Description=\"Start and end of regionB\">")
36 | 	vcf_header.append("##INFO=<ID=REGIONB,Number=2,Type=Integer,Description=\"Start and end of regionB\">")
37 | 
38 | 	#Declare the filters
39 | 
40 | 	vcf_header.append("##FILTER=<ID=BelowExpectedLinks,Description=\"The number of links or reads between A and B is too small\">")
41 | 	vcf_header.append("##FILTER=<ID=FewLinks,Description=\"Unexpectedly low fraction of discordant reads betwen A and B\">")
42 | 	vcf_header.append("##FILTER=<ID=UnexpectedCoverage,Description=\"The coverage of the window on chromosome B or A is higher than 4*average coverage\">")
43 | 	vcf_header.append("##FILTER=<ID=Smear,Description=\"Window A and Window B overlap\">")
44 | 	vcf_header.append("##FILTER=<ID=RegionalQ,Description=\"The mapping quality of the region is lower than the user set limit\">")
45 | 	vcf_header.append("##FILTER=<ID=MinSize,Description=\"The variant is smaller than the user set limit\">")
46 | 	vcf_header.append("##FILTER=<ID=Ploidy,Description=\"Intrachromosomal variant on a chromosome having 0 ploidy\">")
47 | 	vcf_header.append("##FILTER=<ID=SplitsVSDiscs,Description=\"large variant supported mainly by split reads (and not discorant pairs) \">")
48 | 	vcf_header.append("##FILTER=<ID=Density,Description=\"The discordant reads cluster too tightly\">")
49 | 
50 | 	#set format
51 | 
52 | 	vcf_header.append("##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">")
53 | 	vcf_header.append("##FORMAT=<ID=CN,Number=1,Type=Integer,Description=\"Copy number genotype for imprecise events\">")
54 | 	vcf_header.append("##FORMAT=<ID=DV,Number=1,Type=Integer,Description=\"Number of paired-ends that support the event\">")
55 | 	vcf_header.append("##FORMAT=<ID=RV,Number=1,Type=Integer,Description=\"Number of split reads that support the event\">")
56 | 	vcf_header.append("##FORMAT=<ID=DR,Number=2,Type=Integer,Description=\"Number of paired-ends that supporting the reference allele (breakpoint A, and B)\">")
57 | 	vcf_header.append("##FORMAT=<ID=RR,Number=2,Type=Integer,Description=\"Number of reads supporting the reference allele (breakpoint A, and B)\">")
58 | 	vcf_header.append("##FORMAT=<ID=COV,Number=3,Type=Float,Description=\"Coverage (at A,B, and between)\">")
59 | 	vcf_header.append("##FORMAT=<ID=LQ,Number=2,Type=Float,Description=\"Fraction of low quality reads\">")
60 | 
61 | 	#library statistics line
62 | 	vcf_header.append("##LibraryStats=TIDDIT-{} Coverage={}  ReadLength={} MeanInsertSize={} STDInsertSize={} Reverse_Forward={}".format(version,library["avg_coverage"],library["avg_read_length"],library["avg_insert_size"],library["std_insert_size"],library["mp"] ) ) 
63 | 
64 | 	#command used to launch tiddit
65 | 	vcf_header.append("##TIDDITcmd=\"" + " ".join(sys.argv) + "\"")
66 | 
67 | 	vcf_header.append("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t"+sample_id)
68 | 	return("\n".join(vcf_header))
69 | 
70 | 
71 | #generate test header
72 | #bam_file_name=sys.argv[1]
73 | 
74 | #samfile = pysam.AlignmentFile(bam_file_name, "r")
75 | #bam_header=samfile.header
76 | #samfile.close()
77 | 
78 | #try:
79 | #	sample_id=header["RG"][0]["SM"]
80 | #
81 | #except:
82 | #	sample_id=bam_file_name.split("/")[-1].split(".")[0]
83 | 
84 | #library={}
85 | #version="4.0.0"
86 | 
87 | #library["avg_read_length"]=151
88 | #library["avg_insert_size"]=350
89 | #library["std_insert_size"]=400
90 | #library["mp"]=True
91 | #library["avg_coverage"]=35
92 | 
93 | 
94 | #print(main(bam_header,library,sample_id,version))
95 | 
96 | 


--------------------------------------------------------------------------------
/versioned_singularity/README.md:
--------------------------------------------------------------------------------
1 | This folder contains singularity recipies for various TIDDIT releases
2 | 


--------------------------------------------------------------------------------
/versioned_singularity/TIDDIT.2.10.0:
--------------------------------------------------------------------------------
 1 | BootStrap: debootstrap
 2 | OSVersion: trusty
 3 | MirrorURL: http://us.archive.ubuntu.com/ubuntu/
 4 | 
 5 | 
 6 | %runscript
 7 |     echo "This is what happens when you run the container..."
 8 | 
 9 | 
10 | %post
11 |     echo "Hello from inside the container"
12 |     sed -i 's/$/ universe/' /etc/apt/sources.list
13 |     apt-get update
14 |     apt-get -y --force-yes install build-essential cmake make zlib1g-dev python python-dev python-setuptools git wget libbz2-dev unzip
15 |     easy_install pip
16 |     pip install numpy cython
17 | 
18 |     wget https://github.com/SciLifeLab/TIDDIT/archive/TIDDIT-2.10.0.zip
19 |     unzip TIDDIT-2.10.0.zip
20 | 
21 |     mv TIDDIT-TIDDIT-2.10.0/* /bin/
22 |     cd /bin/ && ./INSTALL.sh
23 |     chmod +x /bin/TIDDIT.py
24 | 


--------------------------------------------------------------------------------
/versioned_singularity/TIDDIT.2.12.0:
--------------------------------------------------------------------------------
 1 | BootStrap: debootstrap
 2 | OSVersion: trusty
 3 | MirrorURL: http://us.archive.ubuntu.com/ubuntu/
 4 | 
 5 | 
 6 | %runscript
 7 |     echo "This is what happens when you run the container..."
 8 | 
 9 | 
10 | %post
11 |     echo "Hello from inside the container"
12 |     sed -i 's/$/ universe/' /etc/apt/sources.list
13 |     apt-get update
14 |     apt-get -y --force-yes install build-essential cmake make zlib1g-dev python python-dev python-setuptools git wget libbz2-dev unzip
15 |     easy_install pip
16 |     pip install numpy cython
17 | 
18 |     wget https://github.com/SciLifeLab/TIDDIT/archive/TIDDIT-2.12.0.zip
19 |     unzip TIDDIT-2.12.0.zip
20 | 
21 |     mv TIDDIT-TIDDIT-2.12.0/* /bin/
22 |     cd /bin/ && ./INSTALL.sh
23 |     chmod +x /bin/TIDDIT.py
24 | 


--------------------------------------------------------------------------------
/versioned_singularity/TIDDIT.2.12.1:
--------------------------------------------------------------------------------
 1 | BootStrap: debootstrap
 2 | OSVersion: trusty
 3 | MirrorURL: http://us.archive.ubuntu.com/ubuntu/
 4 | 
 5 | %environment
 6 |     SHELL=/bin/bash
 7 |     PATH=/opt/anaconda/bin:${PATH}
 8 | 
 9 | %runscript
10 |     alias python=python3
11 |     PATH=/opt/anaconda/bin:${PATH}
12 |     echo "This is what happens when you run the container..."
13 | 
14 | 
15 | %post
16 |     echo "Hello from inside the container"
17 |     sed -i 's/$/ universe/' /etc/apt/sources.list
18 |     apt-get update
19 |     apt-get upgrade
20 |     apt-get -y --force-yes install build-essential cmake make zlib1g-dev python python-dev python-setuptools git wget libbz2-dev unzip
21 | 
22 |     cd /root/ && wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh
23 |     cd /root/ && chmod 700 ./Miniconda3-latest-Linux-x86_64.sh
24 |     cd /root/ && bash ./Miniconda3-latest-Linux-x86_64.sh -b -p /opt/anaconda/ 
25 | 
26 |     export PATH=/opt/anaconda/bin:${PATH} 
27 | 
28 |     pip install numpy cython
29 | 
30 |     wget https://github.com/SciLifeLab/TIDDIT/archive/TIDDIT-2.12.1.zip
31 |     unzip TIDDIT-2.12.1.zip
32 | 
33 |     mv TIDDIT-TIDDIT-2.12.1/* /bin/
34 |     cd /bin/ && ./INSTALL.sh
35 |     chmod +x /bin/TIDDIT.py
36 | 


--------------------------------------------------------------------------------
/versioned_singularity/TIDDIT.2.7.1:
--------------------------------------------------------------------------------
 1 | BootStrap: debootstrap
 2 | OSVersion: trusty
 3 | MirrorURL: http://us.archive.ubuntu.com/ubuntu/
 4 | 
 5 | 
 6 | %runscript
 7 |     echo "This is what happens when you run the container..."
 8 | 
 9 | 
10 | %post
11 |     echo "Hello from inside the container"
12 |     sed -i 's/$/ universe/' /etc/apt/sources.list
13 |     apt-get update
14 |     apt-get -y --force-yes install build-essential cmake make zlib1g-dev python python-dev python-setuptools git wget libbz2-dev unzip
15 |     easy_install pip
16 |     pip install numpy cython
17 | 
18 |     wget https://github.com/SciLifeLab/TIDDIT/archive/TIDDIT-2.7.1.zip
19 |     unzip TIDDIT-2.7.1.zip
20 | 
21 |     mv TIDDIT-TIDDIT-2.7.1/* /bin/
22 |     cd /bin/ && ./INSTALL.sh
23 |     chmod +x /bin/TIDDIT.py
24 |     cd /
25 | 
26 | 
27 |     mkdir reference
28 |     cd reference
29 |     wget ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_32/gencode.v32.annotation.gtf.gz
30 |     wget ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_32/GRCh38.primary_assembly.genome.fa.gz
31 |     wget ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_32/gencode.v32.transcripts.fa.gz
32 | 
33 |     wget ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_32/GRCh37_mapping/gencode.v32lift37.annotation.gtf.gz
34 |     wget ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_32/GRCh37_mapping/gencode.v32lift37.transcripts.fa.gz
35 |     wget ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_32/GRCh37_mapping/GRCh37.primary_assembly.genome.fa.gz
36 | 
37 |     wget ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_32/GRCh37_mapping/gencode.v32lift37.metadata.HGNC.gz
38 |     wget ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_32/gencode.v32.metadata.HGNC.gz
39 | 
40 |     wget https://hgdownload.cse.ucsc.edu/goldenPath/panPan2/bigZips/panPan2.fa.gz
41 |     wget https://hgdownload.cse.ucsc.edu/goldenPath/panTro5/bigZips/panTro5.fa.gz
42 |     
43 | 


--------------------------------------------------------------------------------
/versioned_singularity/TIDDIT.2.8.0:
--------------------------------------------------------------------------------
 1 | BootStrap: debootstrap
 2 | OSVersion: trusty
 3 | MirrorURL: http://us.archive.ubuntu.com/ubuntu/
 4 | 
 5 | 
 6 | %runscript
 7 |     echo "This is what happens when you run the container..."
 8 | 
 9 | 
10 | %post
11 |     echo "Hello from inside the container"
12 |     sed -i 's/$/ universe/' /etc/apt/sources.list
13 |     apt-get update
14 |     apt-get -y --force-yes install build-essential cmake make zlib1g-dev python python-dev python-setuptools git wget libbz2-dev unzip
15 |     easy_install pip
16 |     pip install numpy cython
17 | 
18 |     wget https://github.com/SciLifeLab/TIDDIT/archive/TIDDIT-2.8.0.zip
19 |     unzip TIDDIT-2.8.0.zip
20 | 
21 |     mv TIDDIT-TIDDIT-2.8.0/* /bin/
22 |     cd /bin/ && ./INSTALL.sh
23 |     chmod +x /bin/TIDDIT.py
24 | 
25 | 
26 | 


--------------------------------------------------------------------------------
/versioned_singularity/TIDDIT.2.8.1:
--------------------------------------------------------------------------------
 1 | BootStrap: debootstrap
 2 | OSVersion: trusty
 3 | MirrorURL: http://us.archive.ubuntu.com/ubuntu/
 4 | 
 5 | 
 6 | %runscript
 7 |     echo "This is what happens when you run the container..."
 8 | 
 9 | 
10 | %post
11 |     echo "Hello from inside the container"
12 |     sed -i 's/$/ universe/' /etc/apt/sources.list
13 |     apt-get update
14 |     apt-get -y --force-yes install build-essential cmake make zlib1g-dev python python-dev python-setuptools git wget libbz2-dev unzip
15 |     easy_install pip
16 |     pip install numpy cython
17 | 
18 |     wget https://github.com/SciLifeLab/TIDDIT/archive/TIDDIT-2.8.1.zip
19 |     unzip TIDDIT-2.8.1.zip
20 | 
21 |     mv TIDDIT-TIDDIT-2.8.1/* /bin/
22 |     cd /bin/ && ./INSTALL.sh
23 |     chmod +x /bin/TIDDIT.py
24 | 


--------------------------------------------------------------------------------