├── .DS_Store ├── bin ├── .DS_Store ├── runRAILS.sh ├── runRAILSminimapSTREAM.sh ├── runRAILSminimap.sh ├── cobbler.pl └── RAILS ├── paper ├── paper.pdf ├── paper.md └── paper.bib ├── rails-logo.png ├── .github └── workflows │ └── stale.yml ├── Dockerfile ├── readme.md └── LICENSE /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bcgsc/RAILS/HEAD/.DS_Store -------------------------------------------------------------------------------- /bin/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bcgsc/RAILS/HEAD/bin/.DS_Store -------------------------------------------------------------------------------- /paper/paper.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bcgsc/RAILS/HEAD/paper/paper.pdf -------------------------------------------------------------------------------- /rails-logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bcgsc/RAILS/HEAD/rails-logo.png -------------------------------------------------------------------------------- /.github/workflows/stale.yml: -------------------------------------------------------------------------------- 1 | name: 'Close stale issues and PRs' 2 | on: 3 | schedule: 4 | - cron: '30 1 * * *' 5 | 6 | jobs: 7 | stale: 8 | runs-on: ubuntu-latest 9 | steps: 10 | - uses: actions/stale@v4 11 | with: 12 | stale-issue-message: 'This issue has been automatically marked as stale because it has not had recent activity. It will be closed if no further activity occurs. Thank you for your interest in RAILS and Cobbler!' 
13 | days-before-stale: 30 14 | days-before-close: 5 15 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # 2 | # This container will allow you to run RAILS and Cobbler 3 | # 4 | FROM debian:testing 5 | 6 | # 7 | # Authorship 8 | # 9 | LABEL maintainer="rwarren@bcgsc.ca" 10 | 11 | # 12 | # Update and Install dependencies (samtools and minimap2 are invoked by the run*.sh pipelines) 13 | # 14 | RUN apt-get update -qq && apt-get install -y bwa minimap2 samtools wget cpanminus 15 | 16 | # 17 | # Download the software from the release assets (a /tree/ URL returns an HTML page, not the tarball) -- NOTE(review): confirm the v1.5.0 asset name on the releases page 18 | # 19 | RUN wget https://github.com/bcgsc/RAILS/releases/download/v1.5.0/rails_v1-5-0.tar.gz && tar xvfz rails_v1-5-0.tar.gz && rm rails_v1-5-0.tar.gz 20 | 21 | # 22 | # Set the default working directory 23 | # 24 | WORKDIR /RAILS_v1.5.0 25 | -------------------------------------------------------------------------------- /bin/runRAILS.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #RLW 2016 -- Cobbler gap-filling followed by RAILS scaffolding, using bwa mem alignments. Arguments: $1 assembly FASTA, $2 long-sequence FASTA, $3 anchoring bases (-d), $4 min. sequence identity (-i), $5 path to samtools (-p, also used for the SAM-to-BAM conversions), $6 bwa threads (-t) 3 | if [ $# -ne 6 ]; then 4 | echo "Usage: $(basename $0) ASSEMBLY_FASTA LONG_SEQS_FASTA ANCHORING_BASES MIN_IDENTITY SAMTOOLS_PATH THREADS" 5 | exit 1 6 | fi 7 | 8 | echo Resolving ambiguous bases -Ns- in $1 assembly using long sequences $2 9 | echo reformatting file $1 10 | ### WARNING: MAKE SURE YOUR INPUT FASTA IS ONE SEQUENCE PER LINE, WITH NO LINE BREAKS! 11 | echo WARNING: MAKE SURE YOUR INPUT FASTA IS ONE SEQUENCE PER LINE WITH NO LINE BREAKS! 12 | cat $1 | perl -ne 'if(/^\>/){$scafnum++;}else{my $len=length($_);my @scaftigs=split(/N+/i,$_);my $scaftignum=0;foreach my $scaftig(@scaftigs){ my $len=length($scaftig);$scaftignum++; print ">wga$scafnum";print "."; print "$scaftignum,$len\n$scaftig\n";}}' > $1-formatted.fa 13 | echo reformatting file $2 14 | cat $2 | perl -ne 'if(/^\>/){$ct++;}else{my $len=length($_);print ">seq$ct,$len\n$_";}' > $2-formatted.fa 15 | echo Building sequence database index out of your $1-formatted.fa assembly contigs.. 16 | bwa index $1-formatted.fa 17 | echo Aligning long sequences $2-formatted.fa to your contigs.. 
18 | ### YOU MAY CONSIDER: SETTING THE MORE STRINGENT bwa mem -x intractg OPTION AND ADJUSTING -t to higher values for speed 19 | bwa mem -a -t $6 $1-formatted.fa $2-formatted.fa | "$5" view -Sb - > $2_vs_$1_gapfilling.bam 20 | echo Scaffolding $1-formatted.fa using $2-formatted.fa and filling gaps with sequences in $2-formatted.fa 21 | echo $2-formatted.fa > $2-formatted.fof 22 | echo $2_vs_$1_gapfilling.bam > $2_vs_$1_gapfilling.fof 23 | cobbler.pl -f $1 -s $2_vs_$1_gapfilling.fof -d $3 -i $4 -b $2_vs_$1_$3_$4_gapsFill -q $2-formatted.fof -p $5 24 | echo Process terminated. 25 | echo RAILS scaffolding $1.gapsFill.fa sequences using long seqs $2 -- anchoring sequence threshold $3 bp 26 | echo reformatting file $1.gapsFill.fa 27 | cat $2_vs_$1_$3_$4_gapsFill.fa | perl -ne 'if(/^\>/){$ct++;}else{my $len=length($_);print ">wga$ct,$len\n$_";}' > $2_vs_$1_$3_$4_gapsFill-formatted.fa 28 | echo Building sequence database index out of your $2_vs_$1_$3_$4_gapsFill-formatted.fa assembly contigs.. 29 | bwa index $2_vs_$1_$3_$4_gapsFill-formatted.fa 30 | echo Aligning long sequences $2-formatted.fa to your contigs.. 31 | ### YOU MAY CONSIDER: SETTING THE MORE STRINGENT bwa mem -x intractg OPTION AND ADJUSTING -t to higher values for speed 32 | bwa mem -a -t $6 $2_vs_$1_$3_$4_gapsFill-formatted.fa $2-formatted.fa | "$5" view -Sb - > $2_vs_$1_scaffolding.bam 33 | echo Scaffolding $2_vs_$1_$3_$4_gapsFill-formatted.fa using $2-formatted.fa and filling new gaps with sequences in $2-formatted.fa 34 | echo $2-formatted.fa > $2-formatted.fof 35 | echo $2_vs_$1_scaffolding.bam > $2_vs_$1_scaffolding.fof 36 | RAILS -f $2_vs_$1_$3_$4_gapsFill-formatted.fa -s $2_vs_$1_scaffolding.fof -d $3 -i $4 -b $2_vs_$1_$3_$4_rails -q $2-formatted.fof -p $5 37 | echo RAILS process terminated. 
38 | -------------------------------------------------------------------------------- /bin/runRAILSminimapSTREAM.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #RLW 2016,2019 3 | 4 | if [ $# -ne 9 ]; then 5 | echo "Usage: $(basename $0) " 6 | exit 1 7 | fi 8 | 9 | echo Resolving ambiguous bases -Ns- in $1 assembly using long sequences $2 10 | #------------------------- 11 | echo reformatting file $1 12 | ### WARNING: MAKE SURE YOUR INPUT FASTA IS ONE SEQUENCE PER LINE, WITH NO LINE BREAKS! 13 | echo WARNING: MAKE SURE YOUR INPUT FASTA IS ONE SEQUENCE PER LINE WITH NO LINE BREAKS! 14 | cat $1 | perl -ne 'if(/^\>/){$scafnum++;}else{my $len=length($_);my @scaftigs=split(/N+/i,$_);my $scaftignum=0;foreach my $scaftig(@scaftigs){ my $len=length($scaftig);$scaftignum++; print ">wga$scafnum";print "."; print "$scaftignum,$len\n$scaftig\n";}}' > $1-formatted.fa 15 | echo reformatting file $2 16 | cat $2 | perl -ne 'if(/^\>/){$ct++;}else{my $len=length($_);print ">seq$ct,$len\n$_";}' > $2-formatted.fa 17 | echo $2-formatted.fa > $2-formatted.fof 18 | #-------------------------- 19 | # Cobbler 20 | #-------------------------- 21 | echo Aligning and Cobbler gap-filling with long sequences $2-formatted.fa.. 
22 | 23 | 24 | if [ $7 == 'ont' ]; then 25 | echo Running minimap2 with preset map-ont 26 | minimap2 -x map-ont -I50g -N 10 -a -t $9 $1-formatted.fa $2-formatted.fa | cobbler.pl -f $1 -s stream -l $6 -g $5 -d $3 -i $4 -b $2_vs_$1_$3_$4_gapsFill -q $2-formatted.fof -p $8 27 | 28 | elif [ $7 == 'pacbio' ]; then 29 | echo Running minimap2 with preset map-pb 30 | minimap2 -x map-pb -I50g -N 10 -a -t $9 $1-formatted.fa $2-formatted.fa | cobbler.pl -f $1 -s stream -l $6 -g $5 -d $3 -i $4 -b $2_vs_$1_$3_$4_gapsFill -q $2-formatted.fof -p $8 31 | 32 | else 33 | echo Running minimap2 with no preset 34 | minimap2 -I50g -N 10 -a -t $9 $1-formatted.fa $2-formatted.fa | cobbler.pl -f $1 -s stream -l $6 -g $5 -d $3 -i $4 -b $2_vs_$1_$3_$4_gapsFill -q $2-formatted.fof -p $8 35 | 36 | fi 37 | 38 | echo Process terminated. 39 | #-------------------------- 40 | echo RAILS scaffolding $1.gapsFill.fa sequences and gap-filling using long seqs $2 -- anchoring sequence threshold $3 bp 41 | echo reformatting file $1.gapsFill.fa 42 | cat $2_vs_$1_$3_$4_gapsFill.fa | perl -ne 'if(/^\>/){$ct++;}else{my $len=length($_);print ">wga$ct,$len\n$_";}' > $2_vs_$1_$3_$4_gapsFill-formatted.fa 43 | #-------------------------- 44 | # RAILS 45 | #-------------------------- 46 | echo long sequences $2-formatted.fa alignments to your contigs..RAILS scaffolding and gap-filling 47 | 48 | if [ $7 == 'ont' ]; then 49 | echo Running minimap2 with preset map-ont 50 | minimap2 -x map-ont -I50g -N 10 -a -t $9 $2_vs_$1_$3_$4_gapsFill-formatted.fa $2-formatted.fa | RAILS -f $2_vs_$1_$3_$4_gapsFill-formatted.fa -s stream -l $6 -g $5 -d $3 -i $4 -b $2_vs_$1_$3_$4_rails -q $2-formatted.fof -p $8 51 | 52 | elif [ $7 == 'pacbio' ]; then 53 | echo Running minimap2 with preset map-pb 54 | minimap2 -x map-pb -I50g -N 10 -a -t $9 $2_vs_$1_$3_$4_gapsFill-formatted.fa $2-formatted.fa | RAILS -f $2_vs_$1_$3_$4_gapsFill-formatted.fa -s stream -l $6 -g $5 -d $3 -i $4 -b $2_vs_$1_$3_$4_rails -q $2-formatted.fof -p $8 55 | 56 | 
else 57 | echo Running minimap2 with no preset 58 | minimap2 -I50g -N 10 -a -t $9 $2_vs_$1_$3_$4_gapsFill-formatted.fa $2-formatted.fa | RAILS -f $2_vs_$1_$3_$4_gapsFill-formatted.fa -s stream -l $6 -g $5 -d $3 -i $4 -b $2_vs_$1_$3_$4_rails -q $2-formatted.fof -p $8 59 | 60 | fi 61 | 62 | #-------------------------- 63 | echo RAILS process terminated. 64 | -------------------------------------------------------------------------------- /bin/runRAILSminimap.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #RLW 2016,2019 3 | 4 | if [ $# -ne 9 ]; then 5 | echo "Usage: $(basename $0) " 6 | exit 1 7 | fi 8 | 9 | echo Resolving ambiguous bases -Ns- in $1 assembly using long sequences $2 10 | #------------------------- 11 | echo reformatting file $1 12 | ### WARNING: MAKE SURE YOUR INPUT FASTA IS ONE SEQUENCE PER LINE, WITH NO LINE BREAKS! 13 | echo WARNING: MAKE SURE YOUR INPUT FASTA IS ONE SEQUENCE PER LINE WITH NO LINE BREAKS! 14 | cat $1 | perl -ne 'if(/^\>/){$scafnum++;}else{my $len=length($_);my @scaftigs=split(/N+/i,$_);my $scaftignum=0;foreach my $scaftig(@scaftigs){ my $len=length($scaftig);$scaftignum++; print ">wga$scafnum";print "."; print "$scaftignum,$len\n$scaftig\n";}}' > $1-formatted.fa 15 | echo reformatting file $2 16 | cat $2 | perl -ne 'if(/^\>/){$ct++;}else{my $len=length($_);print ">seq$ct,$len\n$_";}' > $2-formatted.fa 17 | #-------------------------- 18 | echo Aligning long sequences $2-formatted.fa to your contigs.. 
19 | 20 | if [ $7 == 'ont' ]; then 21 | echo Running minimap2 with preset map-ont 22 | minimap2 -x map-ont -I50g -N 10 -a -t $9 $1-formatted.fa $2-formatted.fa | samtools view -Sb - > $2_vs_$1_gapfilling.bam 23 | 24 | elif [ $7 == 'pacbio' ]; then 25 | echo Running minimap2 with preset map-pb 26 | minimap2 -x map-pb -I50g -N 10 -a -t $9 $1-formatted.fa $2-formatted.fa | samtools view -Sb - > $2_vs_$1_gapfilling.bam 27 | 28 | else 29 | echo Running minimap2 with no preset 30 | minimap2 -I50g -N 10 -a -t $9 $1-formatted.fa $2-formatted.fa | samtools view -Sb - > $2_vs_$1_gapfilling.bam 31 | 32 | fi 33 | 34 | #-------------------------- 35 | echo Gap-filling $1-formatted.fa using $2-formatted.fa 36 | echo $2-formatted.fa > $2-formatted.fof 37 | echo $2_vs_$1_gapfilling.bam > $2_vs_$1_gapfilling.fof 38 | echo Running cobbler.pl -f $1 -s $2_vs_$1_gapfilling.fof -l $6 -g $5 -d $3 -i $4 -b $2_vs_$1_$3_$4_gapsFill -q $2-formatted.fof -p $8 ... 39 | cobbler.pl -f $1 -s $2_vs_$1_gapfilling.fof -l $6 -g $5 -d $3 -i $4 -b $2_vs_$1_$3_$4_gapsFill -q $2-formatted.fof -p $8 40 | echo Process terminated. 41 | #-------------------------- 42 | echo RAILS scaffolding $1.gapsFill.fa sequences and gap-filling using long seqs $2 -- anchoring sequence threshold $3 bp 43 | echo reformatting file $1.gapsFill.fa 44 | cat $2_vs_$1_$3_$4_gapsFill.fa | perl -ne 'if(/^\>/){$ct++;}else{my $len=length($_);print ">wga$ct,$len\n$_";}' > $2_vs_$1_$3_$4_gapsFill-formatted.fa 45 | #-------------------------- 46 | echo Aligning long sequences $2-formatted.fa to your contigs.. 
47 | 48 | if [ $7 == 'ont' ]; then 49 | echo Running minimap2 with preset map-ont 50 | minimap2 -x map-ont -I50g -N 10 -a -t $9 $2_vs_$1_$3_$4_gapsFill-formatted.fa $2-formatted.fa | samtools view -Sb - > $2_vs_$1_scaffolding.bam 51 | 52 | elif [ $7 == 'pacbio' ]; then 53 | echo Running minimap2 with preset map-pb 54 | minimap2 -x map-pb -I50g -N 10 -a -t $9 $2_vs_$1_$3_$4_gapsFill-formatted.fa $2-formatted.fa | samtools view -Sb - > $2_vs_$1_scaffolding.bam 55 | 56 | else 57 | echo Running minimap2 with no preset 58 | minimap2 -I50g -N 10 -a -t $9 $2_vs_$1_$3_$4_gapsFill-formatted.fa $2-formatted.fa | samtools view -Sb - > $2_vs_$1_scaffolding.bam 59 | 60 | fi 61 | 62 | #-------------------------- 63 | echo Scaffolding $2_vs_$1_$3_$4_gapsFill-formatted.fa using $2-formatted.fa and filling new gaps with sequences in $2-formatted.fa 64 | echo $2-formatted.fa > $2-formatted.fof 65 | echo $2_vs_$1_scaffolding.bam > $2_vs_$1_scaffolding.fof 66 | echo Running RAILS -f $2_vs_$1_$3_$4_gapsFill-formatted.fa -s $2_vs_$1_scaffolding.fof -l $6 -g $5 -d $3 -i $4 -b $2_vs_$1_$3_$4_rails -q $2-formatted.fof -p $8 ... 67 | RAILS -f $2_vs_$1_$3_$4_gapsFill-formatted.fa -s $2_vs_$1_scaffolding.fof -l $6 -g $5 -d $3 -i $4 -b $2_vs_$1_$3_$4_rails -q $2-formatted.fof -p $8 68 | #-------------------------- 69 | echo RAILS process terminated. 
70 | -------------------------------------------------------------------------------- /paper/paper.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: 'RAILS and Cobbler: Scaffolding and automated finishing of draft genomes using long DNA sequences' 3 | tags: 4 | - De novo sequence assembly 5 | - Genome scaffolding 6 | - Automated genome finishing 7 | - LINKS 8 | - RAILS 9 | authors: 10 | - name: Rene L Warren 11 | orcid: 0000-0002-9890-2293 12 | affiliation: 1 13 | affiliations: 14 | - name: BC Cancer Agency, Genome Sciences Centre, Vancouver, BC, Canada 15 | index: 1 16 | date: 10 November 2016 17 | bibliography: paper.bib 18 | --- 19 | 20 | # Summary 21 | 22 | Despite major advances in DNA sequencing technologies we do not yet have complete genome sequences. 23 | Producing high-quality, contiguous, draft assemblies *de novo* is of paramount importance as it informs on genetic content and organization of the genome [@Pagani2012]. The past decade has seen improvements in sequence throughput, a substantially lower DNA sequencing cost and increased read lengths. Whereas the base accuracy of short (currently ~250 bp) read lengths such as those from Illumina have improved (>99%), the base accuracy of long sequence read platforms (Pacific Biosciences, Oxford Nanopore) remains low for generating reference-grade genome assemblies without read error correction. Gap-filling tools designed to help finish draft genomes in an automated fashion, which includes our own [@Paulino2015], have been recently developed [@Tsai2010, @Boetzer2012]. They are typically designed to work with short sequencing reads, not high-quality long sequences from other draft assemblies. 
24 | In many such projects that employ short sequence reads for *de novo* assembly, a k-mer graph assembly approach is often favored, as it effectively discards errors and spurious sequences, albeit at the cost of long-range information loss and limited ability to resolve long repeats. However, researchers routinely produce various assembly drafts varying the parameter k length in search of the most contiguous assembly. This multitude of assembly drafts is comprised of sequences with untapped potential, representing a wealth of information for gap-filling and scaffolding. 25 | Here, I make available two bioinformatics software tools, Cobbler and RAILS [@RAILS] to exploit this information for automated finishing and scaffolding with long DNA sequences, respectively. They can be used to scaffold & finish high-quality draft genome assemblies with any long, preferably high-quality, sequences such as scaftigs/contigs from another genome draft. They both rely on accurate, long DNA sequences to patch gaps in existing genome assembly drafts. More specifically, Cobbler is a utility to automatically patch gaps (ambiguous regions in a draft assembly, represented by N's). It does so by first aligning the long sequences to the assembly, tallying the alignments and replacing N's with the sequences from these long DNA sequences. RAILS is an all-in-one scaffolder and gap-filler. Its process is similar to that of Cobbler. It scaffolds a given genome draft with the help of long DNA sequences (contig sequences are ordered/oriented using alignment information) using the scaffolding engine I originally developed for SSAKE [@Warren15022007] and LINKS [@Warren2015]. The newly created gaps are automatically filled with the DNA string of the provided long DNA sequences. 
In a simulated long sequences experiment (1, 2.5, 5, 15 kbp sequences) designed from the human genome reference, Cobbler closed >65% of gaps in a human genome assembly draft (Table 1; test provided with the distribution, correlation of closed gaps with length estimates from draft assembly R=0.8253). Using the same sequence data, RAILS further scaffolded that same baseline assembly from (N50 length) 5.6 to 7.3 Mbp, representing a 30% increase in contiguity (Table 2). RAILS and Cobbler are implemented in PERL and run on any system where PERL is installed. 26 | 27 | **Table 1.** Patching gaps in a genome assembly draft with Cobbler, using simulated 1, 2.5, 5 and 15 kbp long sequences from human genome reference GRCh38. 28 | 29 | Metric | Value 30 | ---- | ----: 31 | Total gaps | 148,091 32 | Number of gaps patched | 95,523 33 | Proportion of gaps patched | 65.1% 34 | Average length (bp) | 343.39 35 | Length st.dev +/- | 931.12 36 | Total bases added | 32,801,755 37 | Largest gap resolved (bp) | 13,662 38 | Shortest gap resolved (bp) | 1 39 | 40 | **Table 2.** Assembly statistics on human genome scaffolding and finishing post Cobbler and RAILS (reporting sequences 500 bp and larger). 
41 | 42 | Stage | n:500 | n:N50 | n:NG50 | NG50 (bp) | N50 (bp)| max (bp) | sum (bp) 43 | --------- | ------: | -----: | -----: | ---------: | ---------: | ---------: | -------: 44 | Baseline | 65,905 | 145 | 164 | 5,144,025 | 5,597,244 | 26.41e6 | 2.794e9 45 | Cobbler | 65,905 | 145 | 161 | 5,312,196 | 5,658,133 | 26.66e6 | 2.827e9 46 | RAILS | 64,210 | 113 | 125 | 6,935,685 | 7,266,542 | 32.14e6 | 2.836e9 47 | 48 | # References 49 | -------------------------------------------------------------------------------- /paper/paper.bib: -------------------------------------------------------------------------------- 1 | @online{RAILS, 2 | author = {Rene L Warren}, 3 | title = {RAILS and Cobbler: Scaffolding and automated finishing of draft genomes using long sequences}, 4 | year = 2016, 5 | url = {https://github.com/warrenlr/RAILS}, 6 | urldate = {2016-11-10} 7 | } 8 | 9 | @Article{Pagani2012, 10 | author="Pagani, I. 11 | and Liolios, K. 12 | and Jansson, J. 13 | and Chen, I. -. M. 14 | and Smirnova, T. 15 | and Nosrat, B. 16 | and Markowitz, V. M. 17 | and Kyrpides, N. C.", 18 | title="The Genomes OnLine Database (GOLD) v. 4: status of genomic and metagenomic projects and their associated metadata", 19 | journal="Nucleic Acids Res", 20 | year="2012", 21 | volume="40", 22 | doi="10.1093/nar/gkr1100", 23 | url="http://dx.doi.org/10.1093/nar/gkr1100" 24 | } 25 | 26 | @Article{Paulino2015, 27 | author="Paulino, Daniel 28 | and Warren, Ren{\'e} L. 29 | and Vandervalk, Benjamin P. 30 | and Raymond, Anthony 31 | and Jackman, Shaun D. 
32 | and Birol, Inan{\c{c}}", 33 | title="Sealer: a scalable gap-closing application for finishing draft genomes", 34 | journal="BMC Bioinformatics", 35 | year="2015", 36 | volume="16", 37 | number="1", 38 | pages="230", 39 | abstract="While next-generation sequencing technologies have made sequencing genomes faster and more affordable, deciphering the complete genome sequence of an organism remains a significant bioinformatics challenge, especially for large genomes. Low sequence coverage, repetitive elements and short read length make de novo genome assembly difficult, often resulting in sequence and/or fragment ``gaps'' -- uncharacterized nucleotide (N) stretches of unknown or estimated lengths. Some of these gaps can be closed by re-processing latent information in the raw reads. Even though there are several tools for closing gaps, they do not easily scale up to processing billion base pair genomes.", 40 | issn="1471-2105", 41 | doi="10.1186/s12859-015-0663-4", 42 | url="http://dx.doi.org/10.1186/s12859-015-0663-4" 43 | } 44 | 45 | @Article{Tsai2010, 46 | author="Tsai, Isheng J. 47 | and Otto, Thomas D. 48 | and Berriman, Matthew", 49 | title="Improving draft assemblies by iterative mapping and assembly of short reads to eliminate gaps", 50 | journal="Genome Biology", 51 | year="2010", 52 | volume="11", 53 | number="4", 54 | pages="R41", 55 | abstract="Advances in sequencing technology allow genomes to be sequenced at vastly decreased costs. However, the assembled data frequently are highly fragmented with many gaps. We present a practical approach that uses Illumina sequences to improve draft genome assemblies by aligning sequences against contig ends and performing local assemblies to produce gap-spanning contigs. 
The continuity of a draft genome can thus be substantially improved, often without the need to generate new data.", 56 | issn="1474-760X", 57 | doi="10.1186/gb-2010-11-4-r41", 58 | url="http://dx.doi.org/10.1186/gb-2010-11-4-r41" 59 | } 60 | 61 | @Article{Boetzer2012, 62 | author="Boetzer, Marten 63 | and Pirovano, Walter", 64 | title="Toward almost closed genomes with GapFiller", 65 | journal="Genome Biology", 66 | year="2012", 67 | volume="13", 68 | number="6", 69 | pages="R56", 70 | abstract="De novo assembly is a commonly used application of next-generation sequencing experiments. The ultimate goal is to puzzle millions of reads into one complete genome, although draft assemblies usually result in a number of gapped scaffold sequences. In this paper we propose an automated strategy, called GapFiller, to reliably close gaps within scaffolds using paired reads. The method shows good results on both bacterial and eukaryotic datasets, allowing only few errors. As a consequence, the amount of additional wetlab work needed to close a genome is drastically reduced. The software is available at http://www.baseclear.com/bioinformatics-tools/ .", 71 | issn="1474-760X", 72 | doi="10.1186/gb-2012-13-6-r56", 73 | url="http://dx.doi.org/10.1186/gb-2012-13-6-r56" 74 | } 75 | 76 | @Article{Warren2015, 77 | author="Warren, Ren{\'e} L. 78 | and Yang, Chen 79 | and Vandervalk, Benjamin P. 80 | and Behsaz, Bahar 81 | and Lagman, Albert 82 | and Jones, Steven J. M. 83 | and Birol, Inan{\c{c}}", 84 | title="LINKS: Scalable, alignment-free scaffolding of draft genomes with long reads", 85 | journal="GigaScience", 86 | year="2015", 87 | volume="4", 88 | number="1", 89 | pages="35", 90 | abstract="Owing to the complexity of the assembly problem, we do not yet have complete genome sequences. 
The difficulty in assembling reads into finished genomes is exacerbated by sequence repeats and the inability of short reads to capture sufficient genomic information to resolve those problematic regions. In this regard, established and emerging long read technologies show great promise, but their current associated higher error rates typically require computational base correction and/or additional bioinformatics pre-processing before they can be of value.", 91 | issn="2047-217X", 92 | doi="10.1186/s13742-015-0076-3", 93 | url="http://dx.doi.org/10.1186/s13742-015-0076-3" 94 | } 95 | 96 | @article{Warren15022007, 97 | author = {Warren, René L. and Sutton, Granger G. and Jones, Steven J. M. and Holt, Robert A.}, 98 | title = {Assembling millions of short DNA sequences using SSAKE}, 99 | volume = {23}, 100 | number = {4}, 101 | pages = {500-501}, 102 | year = {2007}, 103 | doi = {10.1093/bioinformatics/btl629}, 104 | abstract ={Summary: Novel DNA sequencing technologies with the potential for up to three orders magnitude more sequence throughput than conventional Sanger sequencing are emerging. The instrument now available from Solexa Ltd, produces millions of short DNA sequences of 25 nt each. Due to ubiquitous repeats in large genomes and the inability of short sequences to uniquely and unambiguously characterize them, the short read length limits applicability for de novo sequencing. However, given the sequencing depth and the throughput of this instrument, stringent assembly of highly identical sequences can be achieved. We describe SSAKE, a tool for aggressively assembling millions of short nucleotide sequences by progressively searching through a prefix tree for the longest possible overlap between any two sequences. 
SSAKE is designed to help leverage the information from short sequence reads by stringently assembling them into contiguous sequences that can be used to characterize novel sequencing targets.Availability:http://www.bcgsc.ca/bioinfo/software/ssakeContact:rwarren@bcgsc.ca}, 105 | URL = {http://bioinformatics.oxfordjournals.org/content/23/4/500.abstract}, 106 | eprint = {http://bioinformatics.oxfordjournals.org/content/23/4/500.full.pdf+html}, 107 | journal = {Bioinformatics} 108 | } 109 | 110 | 111 | -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | [![Release](https://img.shields.io/github/release/bcgsc/RAILS.svg)](https://github.com/bcgsc/RAILS/releases) 2 | [![Downloads](https://img.shields.io/github/downloads/bcgsc/RAILS/total?logo=github)](https://github.com/bcgsc/RAILS/releases/download/v1.5.1/rails_v1-5-1.tar.gz) 3 | [![Issues](https://img.shields.io/github/issues/bcgsc/RAILS.svg)](https://github.com/bcgsc/RAILS/issues) 4 | [![link](https://img.shields.io/badge/RAILScobbler-manuscript-brightgreen)](https://doi.org/10.21105/joss.00116) 5 | Thank you for your [![Stars](https://img.shields.io/github/stars/bcgsc/RAILS.svg)](https://github.com/bcgsc/RAILS/stargazers) 6 | 7 | ![Logo](https://github.com/bcgsc/RAILS/blob/master/rails-logo.png) 8 | 9 | # RAILS v1.5.1 and Cobbler v0.6.1 10 | ## Rene L. Warren, 2014-present 11 | 12 | ### Contents 13 | -------- 14 | 1. [Name](#name) 15 | 2. [Description](#des) 16 | 3. [What's new](#new) 17 | 4. [Implementation and requirements](#imp) 18 | 5. [Community guidelines](#guide) 19 | 6. [Installation](#install) 20 | 7. [Dependencies](#dep) 21 | 8. [Test data](#test) 22 | 9. [Citing RAILS/Cobbler](#citing) 23 | 10. [Usage](#usage) 24 | 11. [Algorithm](#algo) 25 | 12. [Runs on human](#runs) 26 | 13. [License preamble](#license) 27 | -------- 28 | 29 | 30 | ### Name 31 | ------------- 32 | 33 |
 34 | RAILS: Radial Assembly Improvement by Long Sequence Scaffolding
 35 | 
 36 | Cobbler: Gap-filling with long sequences
 37 | 
38 | 39 | ### Description 40 | ------------- 41 | 42 | RAILS and Cobbler are genomics applications for scaffolding and automated finishing of genome assemblies with long DNA sequences. 43 | They can be used to scaffold & finish high-quality draft genome assemblies with any long, preferably high-quality, sequences such as scaftigs/contigs from another genome draft. 44 | 45 | They both rely on accurate, long DNA sequences to patch gaps in existing genome assembly drafts. 46 | 47 | Cobbler is a utility to automatically patch gaps (ambiguous regions in a draft assembly, represented by N's). 48 | It does so by first aligning the long sequences to the assembly, tallying the alignments and replacing N's with the sequences from these long DNA sequences. 49 | 50 | RAILS is an all-in-one scaffolder and gap-filler. Its process is similar to that of Cobbler. It scaffolds your genome draft with the help of long DNA sequences (contig sequences are ordered/oriented using alignment information). The newly created gaps are automatically filled with the DNA sequence of the provided long DNA sequence. 51 | 52 | You can test the software by executing "runme.sh" in the test folder. A simulated SARS genome assembly is provided to test the software. 53 | 54 | ### What's new in v1.5.1 55 | 56 | Remove requirement on samtools when running in "stream" mode 57 | 58 | 59 | ### What's new in v1.5.0 60 | 61 | Ability to stream the .sam output of your favorite aligner directly into cobbler/RAILS (tested with minimap2/human data -- see runRAILSminimapSTREAM.sh) 62 | 63 | 64 | ### What's new in v1.4.2 65 | 66 | Improved documentation, minor fixes, support for minimap2 (see runRAILSminimap.sh in the test folder) 67 | 68 | 69 | ### What's new in v1.4.1 70 | 71 | 1. Save in memory gap sequence from highest-matching read for both cobbler and RAILS 72 | 2. Track the number of reads support in cobbler (-l) and RAILS, and allow cutoff when scaffolding (-l and -a), with latter (RAILS) 73 | 3. 
Remove the hardcoded two-hit requirement for a read in RAILS. Instead, process two best hits for each read aligning different sequences 74 | 4. Implement grace (-g) option, which effectively simulates read trimming (valuable for Nanopore read mapping (suggested -g 250 to -g 500)) 75 | 5. bug fixes (-list.tsv (cobbler) reported some instances of gap-fill regions not fixed in the assembly). cobbler gap-fill table now lists #supporting reads for each gap filled 76 | 77 | 78 | ### Implementation and requirements 79 | ------------- 80 | 81 | RAILS and Cobbler are implemented in PERL and run on any OS where PERL is installed. 82 | Both tools require samtools (tested with v1.8) to read sequence alignment bamfiles. 83 | The runRAILS.sh pipeline requires bwa (see Dependencies below for tested version). 84 | The runRAILSminimap.sh and runRAILSminimapSTREAM.sh pipelines require minimap2. 85 | Please make sure these tools are in your PATH before running the above pipelines. 86 | 87 | 88 | ### Community guidelines 89 | ------------- 90 | 91 | I encourage the community to contribute to the development of this software, by providing suggestions for improving the code and/or directly contributing to the open source code for these tools. Users and developers may report software issues, bug fix requests, comments, etc, at https://github.com/bcgsc/RAILS/issues 92 | 93 | 94 | ### Installation 95 | ------------- 96 | 97 | Download the tar ball, gunzip and extract the files on your system using: 98 | 99 | 
100 | gunzip rails_v1-5-1.tar.gz
101 | tar -xvf rails_v1-5-1.tar
102 | 
103 | 104 | Please ensure that both cobbler.pl and RAILS are in your PATH. 105 | 106 | Alternatively, individual tools are available for download/cloning within the github repository 107 | 108 | 109 | ### Dependencies 110 | ------------- 111 | 112 | Make sure you have installed bwa (Version: 0.7.15-r1140) or minimap2 (2.15-r905) and that they are in your PATH. 113 | Make sure you have installed samtools (Version: 1.8) and that it is in your PATH. 114 | 115 | Other versions of bwa, minimap2 & samtools may or may not be compatible and they have not been tested. Users may choose to use other versions than the ones specified here, as they see fit, but are expected to thoroughly test the behavior on their own. 116 | 117 | Compatible tools may be used, but have not been tested fully (eg. sambamba) 118 | 119 | 120 | ### Test data 121 | ------------- 122 | 123 | 
124 | Go to ./test
125 | (cd test)
126 | 
127 | You may need to change both runme.sh and runmeHuman.sh to specify the path of samtools on your system
128 | 
129 | 1. SARS:
130 | execute runme.sh
131 | (./runme.sh)
132 | 
133 | 2. Human:
134 | execute runmeHuman.sh (will take a while to run with bwa mem (~12h). With minimap2, this test will take ~1h.)
135 | (./runmeHuman.sh)
136 | 
137 | 138 | 139 | ### Citing RAILS/Cobbler 140 | ------------- 141 | 142 | Thank you for your [![Stars](https://img.shields.io/github/stars/bcgsc/RAILS.svg)](https://github.com/bcgsc/RAILS/stargazers) and for using, developing and promoting this free software! 143 | 144 | If you use RAILS or Cobbler for your research, please cite: 145 | 146 | 
147 | Warren RL. 2016. RAILS and Cobbler: Scaffolding and automated finishing
148 | of draft genomes using long DNA sequences. The Journal of Open Source
149 | Software. doi: 10.21105/joss.00116
150 | 
151 | [![link](https://img.shields.io/badge/RAILScobbler-manuscript-brightgreen)](https://doi.org/10.21105/joss.00116) 152 | 153 | 154 | ### Usage 155 | ------------- 156 | 157 |
158 | ./runRAILS.sh
159 | Usage: runRAILS.sh     
160 | 
161 | this pipeline will:
162 | 1. reformat the assembly file $1
163 | 2. rename the long sequence file $2
164 | 3. Build a database index with bwa
165 | 4. Align the reformatted long sequences to your re-formatted baseline assembly
166 | 5. Run Cobbler to gap-fill regions of ambiguity
167 | 6. Reformat Cobbler's .fa file
168 | 7. Build a database index of it with bwa
169 | 8. Align the reformatted long sequences to your re-formatted cobbler assembly
170 | 9. Run RAILS to generate a newly scaffolded assembly draft
171 | 
172 | Usage: ./cobbler.pl [v0.6.1]
173 | -f  Assembled Sequences to further scaffold (Multi-FASTA format NO LINE BREAKS, required)
174 | -q  File of filenames containing long Sequences queried (Multi-FASTA format NO LINE BREAKS, required)
175 | -s  File of filenames containing full path to BAM file(s) (use v0.2 for reading SAM files) or simply type: stream for streaming the .sam output of minimap2 or favorite aligner
176 | -p  Full path to samtools (known to work/tested with v1.8, required if reading BAM files)
177 | -d  Anchoring bases on contig edges (ie. minimum required alignment size on contigs, default -d 1000, optional)
178 | -i  Minimum sequence identity fraction (0 to 1), default -i 0.9, optional
179 | -l  Minimum number of long sequence support per gap, default -l 1, optional
180 | -g  Grace length (bp), default -g 1, optional
181 | -t  LIST of names/header, long sequences to avoid using for merging/gap-filling scaffolds (optional)
182 | -b  Base name for your output files (optional)
183 | -v  Runs in verbose mode (-v 1 = yes, default = no, optional)
184 | IMPORTANT: the order of files in -q and -s MUST match!
185 | 
186 | 
187 | Usage: ./RAILS [v1.5.1]
188 | -f  Assembled Sequences to further scaffold (Multi-Fasta format, required)
189 | -q  File of filenames containing long Sequences queried (Multi-Fasta format, required)
190 | -s  File of filenames containing full path to BAM file(s) or simply type: stream for streaming the .sam output of minimap2 or favorite aligner
191 | -p  Full path to samtools (known to work/tested with v1.8, required if reading BAM files)
192 | -d  Anchoring bases on contig edges (ie. minimum required alignment size on contigs, default -d 1000, optional)
193 | -i  Minimum sequence identity fraction (0 to 1), default -i 0.9, optional
194 | -t  LIST of names/header, long sequences to avoid using for merging/gap-filling scaffolds (optional)
195 | -l  Minimum number of links to compute scaffold (default -l 1, optional)
196 | -a  Maximum link ratio between two best contig pairs *higher values lead to least accurate scaffolding* (default -a 0.99, optional)
197 | -g  Grace length (bp), default -g 1, optional
198 | -b  Base name for your output files (optional)
199 | -v  Runs in verbose mode (-v 1 = yes, default = no, optional)
200 | IMPORTANT: the order of files in -q and -s MUST match!
201 | 
202 | 
203 | 
204 | 205 | ### Algorithm 206 | ------------- 207 | 208 | The pipeline is detailed in the provided script runRAILS.sh. PLEASE ensure the draft assembly is FASTA-formatted with one sequence per line (NO LINE BREAKS) 209 | 210 | Cobbler's process: 211 | 212 | The assembly draft sequence supplied to Cobbler is first broken up at the ambiguous regions of the assembly (Ns) to create scaftigs. 213 | In runRAILS.sh, these scaftigs are renamed, tracking their scaffold of origin (renumbered incrementally) and their position within it (also numbered incrementally). 214 | A bwa index is created and the long sequence file, also re-numbered, is aligned to the scaftigs. 215 | Cobbler is supplied with the alignment file (-s sam file) and the long reads files (-q option), specifying the minimum length of anchoring bases (-d) aligning at the edge of scaftigs and the minimum sequence identity of the alignment (-i). When 1 or more long sequences align unambiguously to the 3'end of a scaftig and the 5'end of its neighbour, the gap is patched with the sequence of that long sequence. If no long sequences are suitable, or the -d and -i conditions are not met, the original Ns are placed back between those scaftigs. 216 | 217 | RAILS process: 218 | 219 | In RAILS, the process is similar to that of Cobbler, except that the draft assembly is not broken up at Ns, since the goal is to merge distinct sequences into larger ones. Long sequences are aligned to the draft assembly sequences, orienting and ordering sequences and simultaneously filling the gaps between them, using DNA bases from the long sequences. 220 | 221 | Scaffolding in RAILS is done using the LINKS scaffolder code (Warren et al. 2015), the unpublished scaffolding engine in the widely-used SSAKE assembler (Warren et al. 2007), and the foundation of the SSPACE-LongRead scaffolder (Boetzer and Pirovano, 2014). 
222 | 223 | The grace (-g) parameter may be used to set the MAXIMUM length of unaligned bases allowed at the end of each (long) sequencing read alignment to the draft genome assembly. For example, setting -g 250 tells cobbler/RAILS to consider a sequencing read with a soft-clip of up to 250 bp at its 5' or 3' end. 224 | 225 | Output: For both Cobbler and RAILS, a summary of the gaps closed and their lengths is provided (.tsv) as a text file. 226 | A fasta file (.fa) of the finished and/or scaffolded draft is generated for both along with a log file reporting basic success statistics. 227 | 228 |
229 | Boetzer M, Pirovano W. 2014. SSPACE-LongRead: scaffolding bacterial draft genomes using long read sequence information. BMC Bioinformatics.15:211. DOI: 10.1186/1471-2105-15-211
230 | 
231 | Warren RL, Yang C, Vandervalk BP, Behsaz B, Lagman A, Jones SJ, Birol I. 2015. LINKS: Scalable, alignment-free scaffolding of draft genomes with long reads. GigaScience 4:35. DOI: 10.1186/s13742-015-0076-3
232 | 
233 | Warren RL, Sutton GG, Jones SJM, Holt RA.  2007.  Assembling millions of short DNA sequences using SSAKE.  Bioinformatics. 23(4):500-501. DOI: 10.1093/bioinformatics/btl629
234 | 
235 | 236 | 237 | ### Runs on human 238 | ------------- 239 | 240 | On a human HG004 ABySS draft assembly, cobbler filled over 65% of the gaps using 1, 2.5, 5, 15 kb long DNA sequences simulated from the human genome reference. The Pearson correlation between the predicted gap sizes and the size of patched gaps is R=0.8150. 241 | 242 | 243 | **Table 1.** Patching gaps with Cobbler (v0.2) using 1, 2.5, 5, 15kbp simulated long sequences from human genome reference GRCh38. 244 | 245 | Metric | Value 246 | ---- | ----: 247 | Total gaps | 148,091 248 | Number of gaps patched | 95,523 249 | Proportion of gaps patched | 65.1% 250 | Average length (bp) | 343.39 251 | Length st.dev +/- | 931.12 252 | Total bases added | 32,801,755 253 | Largest gap resolved (bp) | 13,662 254 | Shortest gap resolved (bp) | 1 255 | 256 | RAILS (v1.1) was used to further contiguate the human baseline assembly draft and automatically close gaps within it: 257 | 258 | **Table 2.** RAILS scaffolding and gap-filling summary on a human assembly baseline, using 1, 2.5, 5, 15kbp simulated long sequences from human genome reference GRCh38. 259 | 260 | Metric | Value 261 | ---- | ----: 262 | Number of merges induced | 6,029 263 | Average closed gap length (bp) | 1,136.71 264 | Closed gap length st.dev +/- | 2,511.69 265 | Total bases added | 6,853,222 266 | Largest gap resolved (bp) | 14,471 267 | Shortest gap resolved (bp) | 1 268 | 269 | 6,029 merges resulted from RAILS scaffolding of the baseline human assembly draft (1,695 >= 500bp) 270 | The scaffold N50 length increased from 5.6 to 7.3 Mbp, a 30% increase in N50 length. 271 | 272 | 273 | **Table 3.** Assembly statistics on human genome scaffolding and finishing post Cobbler and RAILS (reporting sequences 500 bp and larger). 
274 | 275 | Stage | n:500 | n:N50 | n:NG50 | NG50 (bp) | N50 (bp)| max (bp) | sum (bp) 276 | --------- | ------: | -----: | -----: | ---------: | ---------: | ---------: | -------: 277 | Baseline | 65,905 | 145 | 164 | 5,144,025 | 5,597,244 | 26.41e6 | 2.794e9 278 | Cobbler | 65,905 | 145 | 161 | 5,312,196 | 5,658,133 | 26.66e6 | 2.827e9 279 | RAILS | 64,210 | 113 | 125 | 6,935,685 | 7,266,542 | 32.14e6 | 2.836e9 280 | 281 | 282 | ### License preamble 283 | ------------- 284 | 285 | RAILS and Cobbler Copyright (c) 2014-present British Columbia Cancer Agency Branch. All rights reserved. 286 | 287 | RAILS and Cobbler are released under the GNU General Public License v3 288 | 289 | This program is free software: you can redistribute it and/or modify 290 | it under the terms of the GNU General Public License as published by 291 | the Free Software Foundation, version 3. 292 | 293 | This program is distributed in the hope that it will be useful, 294 | but WITHOUT ANY WARRANTY; without even the implied warranty of 295 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 296 | GNU General Public License for more details. 297 | 298 | You should have received a copy of the GNU General Public License 299 | along with this program. If not, see <https://www.gnu.org/licenses/>. 
300 | 301 | -------------------------------------------------------------------------------- /bin/cobbler.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | 3 | #AUTHOR 4 | # Rene Warren 5 | # rwarren at bcgsc.ca 6 | 7 | 8 | #NAME 9 | #RAILS: Radial Assembly Improvement by Long Sequence Scaffolding 10 | #Scaffolding and gap-closure using alignment of long sequences 11 | 12 | #SYNOPSIS 13 | 14 | #DOCUMENTATION 15 | # readme.md distributed with this software 16 | # We hope this code is useful to you -- Please send comments & suggestions to rwarren * bcgsc.ca 17 | # If you use RAILS, the RAILS code or ideas, please cite our work 18 | # 19 | 20 | #LICENSE 21 | # LINKS, RAILS and Cobbler Copyright (c) 2014-2019 Canada's Michael Smith Genome Science Centre. All rights reserved. 22 | 23 | use strict; 24 | use Getopt::Std; 25 | use Net::SMTP; 26 | use vars qw($opt_f $opt_s $opt_d $opt_i $opt_v $opt_b $opt_t $opt_q $opt_l $opt_g $opt_p); 27 | getopts('f:s:d:v:b:t:i:q:g:l:p:'); 28 | my ($base_name,$anchor,$seqid,$verbose,$minreads,$grace)=("",1000,0.9,0,1,1); 29 | 30 | my $version = "[v0.6.1]"; 31 | my $dev = "rwarren\@bcgsc.ca"; 32 | my $SAMPATH = ""; 33 | 34 | #------------------------------------------------- 35 | 36 | if(! $opt_f || ! $opt_s || ! $opt_q){ 37 | print "Usage: $0 $version\n"; 38 | print "-f Assembled Sequences to further scaffold (Multi-FASTA format NO LINE BREAKS, required)\n"; 39 | print "-q File of filenames containing long Sequences queried (Multi-FASTA format NO LINE BREAKS, required)\n"; 40 | print "-s File of filenames containing full path to BAM file(s) (use v0.2 for reading SAM files) or simply type: stream for streaming the .sam output of minimap2 or favorite aligner\n"; 41 | print "-p Full path to samtools (known to work/tested with v1.8, required if reading BAM files)\n"; 42 | print "-d Anchoring bases on contig edges (ie. 
minimum required alignment size on contigs, default -d $anchor, optional)\n"; 43 | print "-i Minimum sequence identity fraction (0 to 1), default -i $seqid, optional\n"; 44 | print "-l Minimum number of long sequence support per gap, default -l $minreads, optional\n"; 45 | print "-g Grace length (bp), default -g $grace, optional\n"; 46 | print "-t LIST of names/header, long sequences to avoid using for merging/gap-filling scaffolds (optional)\n"; 47 | print "-b Base name for your output files (optional)\n"; 48 | print "-v Runs in verbose mode (-v 1 = yes, default = no, optional)\n"; 49 | die "IMPORTANT: the order of files in -q and -s MUST match!\n"; 50 | } 51 | 52 | my $file = $opt_f; 53 | my $fof = $opt_s; 54 | my $queryfof = $opt_q; 55 | $anchor = $opt_d if($opt_d); 56 | $seqid = $opt_i if($opt_i); 57 | $verbose = $opt_v if($opt_v); 58 | my $listfile = $opt_t if($opt_t); 59 | $base_name = $opt_b if($opt_b); 60 | $grace = $opt_g if($opt_g); 61 | $minreads = $opt_l if($opt_l); 62 | $SAMPATH = $opt_p if($opt_p); 63 | 64 | my $assemblyruninfo=""; 65 | 66 | 67 | if(! -e $file){ 68 | die "Invalid file: $file -- fatal\n"; 69 | } 70 | 71 | if(! -e $SAMPATH && $fof ne "stream"){ 72 | die "Invalid: $SAMPATH -- fatal\n"; 73 | } 74 | 75 | ### Naming output files 76 | if ($base_name eq ""){ 77 | 78 | $base_name = $file . ".scaff_s-" . $fof . "_q-" . $queryfof . "_d" . $anchor . "_i" . $seqid . "_l" . $minreads . "_g" . $grace . "_t" . $listfile; 79 | my $pid_num = getpgrp(0); 80 | $base_name .= "_pid" . $pid_num; 81 | } 82 | 83 | my $log = $base_name . ".log"; 84 | my $newassemblyfile = $base_name . ".fa"; 85 | my $tsvfile = $base_name . 
"-list.tsv"; 86 | 87 | open (LOG, ">$log") || die "Can't write to $log -- fatal\n"; 88 | 89 | 90 | #------------------------------------------------- 91 | 92 | my $init_message = "\nRunning: $0 $version\n-f $file\n-q $queryfof\n-s $fof\n"; 93 | 94 | $init_message .= "-d $anchor\n-i $seqid\n-l $minreads\n-g $grace\n-t $listfile\n"; 95 | 96 | print $init_message; 97 | print LOG $init_message; 98 | $assemblyruninfo=$init_message . "\n"; 99 | 100 | #------------------------------------------------- 101 | 102 | my $date = `date`; 103 | chomp($date); 104 | 105 | my $reading_reads_message = "\n=>Reading bam: $date\n"; 106 | print $reading_reads_message; 107 | print LOG $reading_reads_message; 108 | $assemblyruninfo.=$reading_reads_message; 109 | my $tigpair; 110 | my $initpos=0; 111 | my $totalpairs=0; 112 | 113 | ### READ Query read FOF 114 | my @qryfilearray; 115 | open(QRYFOF,$queryfof) || die "Can't open $queryfof for reading -- fatal.\n"; 116 | while(){ 117 | chomp; 118 | push @qryfilearray, $_; 119 | } 120 | close QRYFOF; 121 | 122 | 123 | #XXX 124 | 125 | if(-f $fof){ 126 | 127 | my $ctline=0; 128 | 129 | open(FOF,$fof) || die "Can't open $fof for reading -- fatal.\n"; 130 | while(){ 131 | chomp; 132 | my $bamfile = $_; 133 | my $rh = &readSeqMemory($qryfilearray[$ctline]);### ONLY READ READ SEQUENCE IN MEMORY FOR THOSE MATCHING BAM (SAME ORDER NEEDED) 134 | print "Parsing alignment file $bamfile...\n"; 135 | $tigpair = &readBam($tigpair,$bamfile,$anchor,$seqid,$listfile,$initpos,$rh,$grace); 136 | print "done.\n"; 137 | $ctline++; 138 | } 139 | close FOF; 140 | 141 | }elsif($opt_s eq "stream"){### STREAMING SUPPORT 142 | 143 | my $rh = &readSeqMemory($qryfilearray[0]);### ONLY SUPPORT ONE READ FILE 144 | print "Parsing alignment file...\n"; 145 | $tigpair=&readSam($tigpair,$opt_s,$anchor,$seqid,$listfile,$initpos,$rh,$grace); 146 | print "done.\n"; 147 | 148 | } 149 | 150 | 151 | my $date = `date`; 152 | chomp($date); 153 | my $patchmsg = "done.\nFixing ambiguous 
bases (Ns): $date\n"; 154 | print $patchmsg; 155 | print LOG $patchmsg; 156 | $assemblyruninfo.=$patchmsg; 157 | my ($gsl,$totalgap) = &patchGaps($file,$tigpair,$newassemblyfile,$tsvfile,$minreads); 158 | 159 | my $date = `date`; 160 | chomp($date); 161 | my ($avg,$sum,$max,$min) = &average($gsl); 162 | my $sd = &stdev($gsl); 163 | my $final_message = "done: $date\n\n--------------- $0 Summary ---------------\nNumber of gaps patched : %i out of %i (%.2f %%)\nAverage length (bp) : %.2f\nLength st.dev +/- : %.2f\nTotal bases added : %i\nLargest gap resolved (bp) : %i\nShortest gap resolved (bp) : %i\n---------------------------------------------\n"; 164 | my @arrsg=@$gsl; 165 | my $numgaps = $#arrsg+1; 166 | my $percentclosed = $numgaps / $totalgap *100; 167 | printf $final_message, ($numgaps,$totalgap,$percentclosed,$avg,$sd,$sum,$max,$min); 168 | printf LOG $final_message, ($numgaps,$totalgap,$percentclosed,$avg,$sd,$sum,$max,$min); 169 | 170 | $assemblyruninfo .= "done: $date\n\n--------------- $0 Summary ---------------\nNumber of gaps patched : $numgaps out of $totalgap ($percentclosed %) \nAverage length (bp) : $avg\nLength st.dev +/- : $sd\nTotal bases added : $sum\nLargest gap resolved (bp) : $max\nShortest gap resolved (bp) : $min\n---------------------------------------------\n"; 171 | 172 | exit; 173 | 174 | ###for dev. 
test purposes 175 | eval{ 176 | my $wdir = `pwd`; 177 | chomp($wdir); 178 | my $smtp = Net::SMTP->new('mailhost'); 179 | $smtp->mail("RAILS\@bcgsc.ca"); 180 | $smtp->to($dev); 181 | $smtp->data(); 182 | $smtp->datasend("Subject: Your $0 run\n"); 183 | $smtp->datasend("At: $wdir\n"); 184 | $smtp->datasend($assemblyruninfo); 185 | $smtp->dataend(); 186 | $smtp->quit; 187 | }; 188 | 189 | exit; 190 | 191 | 192 | #----------------- 193 | sub readSeqMemory{ 194 | 195 | my $file = shift; 196 | 197 | my $fh; 198 | my $prev="NA"; 199 | my $seq=""; 200 | open(FA,$file) || die "Cannot open $file for reading -- fatal.\n"; 201 | while(){ 202 | chomp; 203 | if (/\>(\S+)/){ 204 | my $head=$1; 205 | if($prev ne $head && $prev ne "NA"){ 206 | $fh->{$prev} = $seq; 207 | } 208 | $prev = $head; 209 | $seq=''; 210 | }elsif(/^(\S+)$/){ 211 | $seq .= uc($1); 212 | } 213 | } 214 | $fh->{$prev} = $seq; 215 | 216 | close FA; 217 | 218 | return $fh; 219 | } 220 | 221 | #---------------- 222 | sub patchGaps{ 223 | my ($file,$tigpair,$newfile,$gaplist,$minreads) = @_; 224 | 225 | my $tignames; 226 | my $head =""; 227 | my $ctseq=0; 228 | open(IN,$file) || die "Error reading $file -- fatal.\n"; 229 | open(OUT,">$newfile") || die "Error writing $newfile -- fatal.\n";; 230 | open(TSV,">$gaplist") || die "Error reading $gaplist -- fatal.\n"; 231 | my $filledct=0; 232 | my $totalgap=0; 233 | my @gapspatched; 234 | print "\nSequences processed:\n"; 235 | print TSV "scaffold\tscaftig\tgapLength\tgapFilledLength\treadSupportCount\n"; 236 | while(){ 237 | chomp; 238 | if(/^\>(\S+)/){ 239 | print OUT "$_\n"; 240 | $head=$1; 241 | $ctseq++; 242 | $tignames->{$ctseq}=$head; 243 | print "\r$ctseq"; 244 | $|++; 245 | }else{ 246 | my @scaftigs = split(/N+/i,$_); 247 | my @gaps = split(/[ABCDEFGHIJKLMOPQRSTUVWXYZ]+/i,$_);###Anything but Ns 248 | my $numgap = $#gaps; 249 | $totalgap += $numgap if($numgap>0) ; 250 | #print "@gaps $numgap\n";#XXXXX 251 | 252 | my $scaftignum=0; 253 | my $gappos=1;######ASSUMES 
SCAFFOLDS NEVER START WITH Ns 254 | foreach my $scaftig(@scaftigs){ 255 | print OUT "$scaftig"; 256 | my $len = length($scaftig); 257 | $scaftignum++; 258 | my $num = $ctseq . "." . $scaftignum; 259 | my $nextscaftig = $scaftignum+1; 260 | #print "$num\n"; 261 | if(defined $tigpair->{$num}){ 262 | my $list = $tigpair->{$num}; 263 | my $next = $ctseq . "." . $nextscaftig; 264 | 265 | if(defined $tigpair->{$num}{$next}{'seq'} && length($tigpair->{$num}{$next}{'seq'})>0 && $tigpair->{$num}{$next}{'distr'}>=$minreads){###MIN READ LOGIC 266 | print OUT "$tigpair->{$num}{$next}{'seq'}"; ### gap-filling 267 | $filledct++; 268 | my $gaplen = length($gaps[$gappos]); 269 | my $filledlen = length($tigpair->{$num}{$next}{'seq'}); 270 | print TSV "$ctseq\t$scaftignum\t$gaplen\t$filledlen\t$tigpair->{$num}{$next}{'distr'}\n"; 271 | push @gapspatched, $filledlen if($filledlen > 0); 272 | #print "Scaftig $num -- $next\n $tigpair->{$num}{$next}{'distr'}\n$tigpair->{$num}{$next}{'seq'}\n$tigpair->{$num}{$next}{'configuration'}\n$tigpair->{$num}{$next}{'origin'}\nGAP:$gappos :: $gaps[$gappos]\n" if(defined $tigpair->{$num}{$next}); 273 | }else{### Does not pass filters, put back the Ns 274 | if($gaps[$gappos] ne ""){ 275 | print OUT "$gaps[$gappos]"; 276 | my $gaplen = length($gaps[$gappos]); 277 | print TSV "$ctseq\t$scaftignum\t$gaplen\t\t$tigpair->{$num}{$next}{'distr'}\n";### will still indicate read support 278 | } 279 | } 280 | }else{### Does not pass filters, put back the Ns 281 | print OUT "$gaps[$gappos]"; 282 | my $gaplen = length($gaps[$gappos]); 283 | print TSV "$ctseq\t$scaftignum\t$gaplen\t\n"; 284 | } 285 | $gappos++; 286 | } 287 | print OUT "\n"; 288 | } 289 | } 290 | close IN; 291 | close OUT; 292 | close TSV; 293 | print "\ndone.\n"; 294 | my $endmessage = "Filled $filledct out of $totalgap gaps (gaps are defined by any stretch of Ns in your assembly)\nGap-filled assembly: $newfile\nList of gap lengths: $gaplist\n"; 295 | print LOG "$endmessage"; 296 | print 
"$endmessage"; 297 | $assemblyruninfo .= $endmessage; 298 | 299 | return \@gapspatched,$totalgap; 300 | } 301 | 302 | #--------------- 303 | sub readSam{ 304 | 305 | my ($tigpair,$bamfile,$anchor,$seqid,$listfile,$initpos,$rh,$grace) = @_; 306 | 307 | my $mem; 308 | if(-f $listfile){ 309 | open(IN,$listfile) || die "Can't read $listfile -- fatal.\n"; 310 | while(){ 311 | chomp; 312 | $mem->{$_}=1; 313 | } 314 | close IN; 315 | } 316 | my $bt; 317 | my $track_all; 318 | #HS9_159:6:1308:13492:64472 272 scaffold43,6983,f43Z6983 6439 0 536M * 0 0 * * NM:i:0 AS:i:536 319 | #HS9_159:6:1308:13492:64472 0 scaffold30,32025,f30Z32025 25411 0 536M * 0 0 GCTTATAAAAGAAGGTGCAATTGATCCTTGCCTTACGCCTACAAAGGAGGGTAGGTGCGATTGGTCCTTACATTCTTACGCCGCTTAGGAAGCTAGGCGAGATAGGATGGGTTCTAGAGCACCTAACTAGCTTTACACGCCGAATCCAGACCTGCCGGCTACCATCCGGATTCATACTAGATAACATAAAGGAGAGAACAACTGTTCAAAGAACAACTCGGAGAACATTTGTATCCGGTGGTTGGGGCATTGCGTGCTATACCAACTACCTCAGGTGCGCGAGGTCTCATTCCTTTTCCAAGCCCAATAAAGAAAAAATATCATTAGTGATGGTGAATCCCGTTTATATAAGTAAGTTGCATTCTTATCTAAGTAAGTGGGCTTTCCTAAGTCACTTATTGGGTGGGGGGCCCCTGTCGAGTGAGCCATCCTTCCTCACCCTCTCTTTTGTTGGGCGAGCCATCTTTCCTTTTATACGATTCGATCCAGTAGATAAGGAAGACCGACCGAGAACAACCAATGGCCTTCCCTGGGGG * NM:i:0 AS:i:536 XS:i:536 320 | #HS9_159:6:1308:13492:64472 272 scaffold22,90777,f22Z90777 90233 0 536M * 0 0 * * NM:i:0 AS:i:536 321 | my $t; 322 | my $ct=0; 323 | 324 | my %options = (); 325 | 326 | print join( 327 | "\t", 328 | 'qname', 329 | 'qstart', 330 | 'qend', 331 | 'qalen', 332 | 'qlen', 333 | 'rname', 334 | 'rstart', 335 | 'rend', 336 | 'ralen', 337 | 'rlen', 338 | 'edit_dist', 339 | ) . 
"\n" if $options{header}; 340 | 341 | 342 | my %rlength = (); 343 | 344 | while(<>){###Stream from STDIN 345 | 346 | chomp; 347 | $ct++; 348 | 349 | my @a=split(/\t/); 350 | my @b=split(/\,/,$a[2]); 351 | my @c=split(/\,/,$a[0]); 352 | 353 | if ($options{rlen} && /^\@SQ\s+SN:(\S+)\s+LN:(\S+)/) { 354 | $rlength{$1} = $2; 355 | } 356 | next unless @a >= 10; 357 | my $line = $_; 358 | my $qname = $a[0]; 359 | my $rname = $a[2]; 360 | my $rstart = $a[3]; 361 | my $cigar = $a[5]; 362 | my $qseq = $a[9]; 363 | # Query 364 | my $qstart = 1; 365 | $_ = $cigar; 366 | s/^(\d+)[SH]/$qstart += $1/eg; 367 | my $qalen = 0; 368 | $_ = $cigar; 369 | s/(\d+)[M=XI]/$qalen += $1/eg; 370 | my $qend = $qstart + $qalen - 1; 371 | $_ = $cigar; 372 | my $end_clip_len = 0; 373 | s/(\d+)[SH]$/$end_clip_len += $1/eg; 374 | my $qlen = $c[1]; 375 | #if ($qalen > 0) { 376 | # $qlen = ($qstart-1) + $qalen + $end_clip_len; 377 | #} elsif ($qseq ne "*") { 378 | # $qlen = length($a[9]); 379 | #} 380 | 381 | # Reference 382 | my $ralen = 0; 383 | $_ = $cigar; 384 | s/(\d+)[M=XDN]/$ralen += $1/eg; 385 | my $rend = $rstart + $ralen - 1; 386 | my $rlen = $b[1]; 387 | #if ($options{rlen} && exists($rlength{$rname})) { 388 | # $rlen = $rlength{$rname}; 389 | #} 390 | 391 | # Calculate edit distance including clipping 392 | my $edit_dist = ''; 393 | if ($line =~ /NM:i:(\d+)/) { 394 | $edit_dist = $1;# + $qstart - 1 + $end_clip_len; 395 | } 396 | 397 | # if ($rname eq '*') { 398 | # # case: query sequence is unmapped 399 | # print join("\t", $qname, $qstart, $qend, $qalen, $qlen) . "\n"; 400 | # } else { 401 | # print join("\t", $qname, $qstart, $qend, $qalen, $qlen, $rname, $rstart, $rend, $ralen, $rlen); 402 | # print "\t$edit_dist" if length($edit_dist) > 0; 403 | # print "\n"; 404 | # } 405 | 406 | my $read = $a[0] . "-" . 
$ct; 407 | my $si=0; 408 | $si = ($qalen - $edit_dist) / $qalen if($qalen); 409 | 410 | 411 | if($si >= $seqid && $qalen >= $anchor && (( $rstart <= $grace && ($qlen-$qend)<= $grace) || ($qstart<=$grace && ($rlen-$rend)<=$grace ) )){ ### this indicates anchoring bases, within $anchor of edges 412 | 413 | 414 | print "$si >= $seqid && $qalen >= $anchor && (( $rstart <= $grace && ($qlen-$qend)<= $grace) || ($qstart<=$grace && ($rlen-$rend)<=$grace\n" if($verbose); 415 | my $dir; 416 | my $start; 417 | my $end; 418 | ###Coordinates on the scaffolds 419 | if($rstart <= $grace && ($qlen-$qend)<= $grace){ 420 | $start = $rend; 421 | $end = $rstart; 422 | }else{ 423 | $start = $rstart; 424 | $end = $rend; 425 | } 426 | my $orient=""; 427 | if($a[1]==272 || $a[1]==16 || $a[1]==2064){ ### matches on negative strand 428 | $orient="r"; 429 | my $tmpstart = $qlen - $qend; 430 | my $tmpend = $qlen - $qstart; 431 | $qstart = $tmpstart; 432 | $qend = $tmpend; 433 | }else{ 434 | $orient="f"; 435 | } 436 | ###tracks from a read perspective 437 | my ($numtig,$scaftignum,$sz)=($2,$1,$3) if($a[2]=~/\D+((\d+)\.\d+),(\d+)/);### scaffoldNUMBER,LENGTH eg. wga1,1301 438 | $t->{$a[0]}{$scaftignum}{'orient'}= $dir . $orient ; 439 | $t->{$a[0]}{$scaftignum}{'real'}=$read;###my $read = $a[0] . "-" . 
$ct; 440 | $t->{$a[0]}{$scaftignum}{'length'}=$qlen; 441 | $track_all->{$read}{'tig'}=$numtig; 442 | $track_all->{$read}{'scaftig'}=$scaftignum; 443 | $track_all->{$read}{'start'}=$start; 444 | $track_all->{$read}{'end'}=$end; 445 | $track_all->{$read}{'multiple'}=1; 446 | $track_all->{$read}{'sam'}=$line; 447 | $track_all->{$read}{'orient'}=$orient; 448 | $track_all->{$read}{'qalen'}=$qalen; 449 | $track_all->{$read}{'qstart'}=$qstart; 450 | $track_all->{$read}{'qend'}=$qend; 451 | $track_all->{$read}{'si'}=$si; ### added 11APR2018 the read with most matching bases is chosen for gapfill (patch seq) 452 | # print "$line\n\n"; 453 | } 454 | } 455 | close IN;###End SAM parse 456 | my ($occ,$same)=(0,0);###TRACK REDUNDANCY 457 | 458 | foreach my $rd(keys %$t){ 459 | my $scafflist=$t->{$rd}; 460 | my $num = keys(%$scafflist); 461 | my $prevscaff = "NA"; 462 | foreach my $scaff(sort {$a<=>$b} keys %$scafflist){ 463 | if($prevscaff ne "NA"){ 464 | #if($num==2){###maps on two different scaftigs only 465 | #print "$num!\n"; 466 | my @arr; 467 | my $totalreadlength=0; 468 | my $current = $scafflist->{$scaff}{'real'}; 469 | my $prev = $scafflist->{$prevscaff}{'real'}; 470 | $totalreadlength = $scafflist->{$scaff}{'length'}; 471 | 472 | my ($p_s,$p_t)=($1,$2) if($track_all->{$prev}{'scaftig'}=~/(\d+)\.(\d+)/); 473 | my ($c_s,$c_t)=($1,$2) if($track_all->{$current}{'scaftig'}=~/(\d+)\.(\d+)/); 474 | my $prev_match = $p_s . "." . ($p_t + 1); 475 | my $curr_match = $c_s . "." . ($c_t + 1); 476 | 477 | #print "$track_all->{$current}{'tig'} == $track_all->{$prev}{'tig'} $track_all->{$current}{'scaftig'} $track_all->{$prev}{'scaftig'}\n"; 478 | #print "$track_all->{$current}{'scaftig'} ... 
$track_all->{$prev}{'scaftig'}\n"; 479 | if($track_all->{$current}{'tig'} == $track_all->{$prev}{'tig'} && (($track_all->{$current}{'scaftig'} eq $prev_match) || ($track_all->{$prev}{'scaftig'} eq $curr_match ))){###ADDED OCT2016, make sure on same scaffold and consecutive 480 | 481 | my ($one,$two)=($track_all->{$current}{'scaftig'},$track_all->{$prev}{'scaftig'}); 482 | if($p_t < $c_t){ 483 | ($one,$two)=($track_all->{$prev}{'scaftig'},$track_all->{$current}{'scaftig'}) 484 | } 485 | #print "$one,$two\n"; 486 | ### this will track the best anchoring long reads for the merge/gapfill 487 | my $m1 = $track_all->{$current}{'qalen'} * $track_all->{$current}{'si'}; 488 | my $m2 = $track_all->{$prev}{'qalen'} * $track_all->{$prev}{'si'}; 489 | my $matchbases = $m1 + $m2; 490 | 491 | if(! defined $mem->{$rd} && ($track_all->{$current}{'qstart'} > $track_all->{$prev}{'qend'} || $track_all->{$prev}{'qstart'} > $track_all->{$current}{'qend'}) && $rh->{$rd} ne ""){### WILL TRACK BEST ANCHORING BASES 492 | $tigpair->{$one}{$two}{'distr'}++; 493 | if($matchbases > $bt->{$one}{$two}{'bestmatch'}){ 494 | $bt->{$one}{$two}{'bestmatch'} = $matchbases; 495 | my $pos=0; 496 | $pos = $track_all->{$prev}{'qend'} if($track_all->{$current}{'qstart'} > $track_all->{$prev}{'qend'}); 497 | $pos = $track_all->{$current}{'qend'} if($track_all->{$prev}{'qstart'} > $track_all->{$current}{'qend'}); 498 | my $gapseqlen = $totalreadlength - ($track_all->{$prev}{'qalen'} + $track_all->{$current}{'qalen'}); 499 | print ">$rd $pos @ $gapseqlen \n$rh->{$rd}\n\n" if($verbose); 500 | my $patch = substr($rh->{$rd},$pos,$gapseqlen-1); 501 | 502 | print "GAP:$patch\n" if($verbose); 503 | 504 | ###JUST SOME TEST CODE 505 | if(defined $tigpair->{$track_all->{$prev}{'scaftig'}}{$track_all->{$current}{'scaftig'}}{'seq'}){ 506 | $occ++; 507 | $same if($patch eq $tigpair->{$track_all->{$prev}{'scaftig'}}{$track_all->{$current}{'scaftig'}}{'seq'}); 508 | 509 | print "$prev 
($track_all->{$prev}{'scaftig'})...$current ($track_all->{$current}{'scaftig'})\n$tigpair->{$track_all->{$prev}{'tig'}}{$track_all->{$current}{'tig'}}{'seq'}\nNEW GAP:\n$patch\n" if($verbose); 510 | 511 | #if($patch ne $tigpair->{$track_all->{$prev}{'scaftig'}}{$track_all->{$current}{'scaftig'}}{'seq'}){print "NOT SAME\n\n";}else{print "SAME\n\n";} 512 | } 513 | 514 | if($track_all->{$prev}{'orient'} eq $track_all->{$current}{'orient'}){### ff or rr 515 | $patch = &reverseComplement($patch) if($track_all->{$prev}{'orient'} eq "r"); 516 | $tigpair->{$one}{$two}{'seq'}=lc($patch); 517 | $tigpair->{$one}{$two}{'origin'}=$rd; 518 | } 519 | print ">>>> $track_all->{$prev}{'scaftig'} $track_all->{$current}{'scaftig'} $patch\n" if($verbose); 520 | 521 | print "$track_all->{$prev}{'sam'}\n$track_all->{$current}{'sam'}\n x====x ($totalreadlength) ($track_all->{$prev}{'qstart'}-$track_all->{$prev}{'qend'}:$track_all->{$prev}{'qalen'} $track_all->{$prev}{'orient'}) AND ($track_all->{$current}{'qstart'}-$track_all->{$current}{'qend'}:$track_all->{$current}{'qalen'} $track_all->{$current}{'orient'})\n===x x==== sc$track_all->{$prev}{'tig'} ($track_all->{$prev}{'start'}-$track_all->{$prev}{'end'}:$track_all->{$prev}{'qalen'} $track_all->{$prev}{'orient'}) AND sc$track_all->{$current}{'tig'} ($track_all->{$current}{'start'}-$track_all->{$current}{'end'}:$track_all->{$current}{'qalen'} $track_all->{$current}{'orient'}) \n\n" if($verbose); 522 | }###save for bestmatch only 523 | } 524 | } 525 | }#IF PREV NE NA 526 | $prevscaff = $scaff; 527 | }#foreach scatigs, ordered 528 | } 529 | 530 | print "\nRedundant same contig combo linking:$occ\nSame gap sequence fill:$same\n\n"; 531 | 532 | return $tigpair; 533 | } 534 | 535 | 536 | #--------------- 537 | sub readBam{ 538 | 539 | my ($tigpair,$bamfile,$anchor,$seqid,$listfile,$initpos,$rh,$grace) = @_; 540 | 541 | my $mem; 542 | if(-f $listfile){ 543 | open(IN,$listfile) || die "Can't read $listfile -- fatal.\n"; 544 | while(){ 545 | 
chomp; 546 | $mem->{$_}=1; 547 | } 548 | close IN; 549 | } 550 | my $bt; 551 | my $track_all; 552 | #HS9_159:6:1308:13492:64472 272 scaffold43,6983,f43Z6983 6439 0 536M * 0 0 * * NM:i:0 AS:i:536 553 | #HS9_159:6:1308:13492:64472 0 scaffold30,32025,f30Z32025 25411 0 536M * 0 0 GCTTATAAAAGAAGGTGCAATTGATCCTTGCCTTACGCCTACAAAGGAGGGTAGGTGCGATTGGTCCTTACATTCTTACGCCGCTTAGGAAGCTAGGCGAGATAGGATGGGTTCTAGAGCACCTAACTAGCTTTACACGCCGAATCCAGACCTGCCGGCTACCATCCGGATTCATACTAGATAACATAAAGGAGAGAACAACTGTTCAAAGAACAACTCGGAGAACATTTGTATCCGGTGGTTGGGGCATTGCGTGCTATACCAACTACCTCAGGTGCGCGAGGTCTCATTCCTTTTCCAAGCCCAATAAAGAAAAAATATCATTAGTGATGGTGAATCCCGTTTATATAAGTAAGTTGCATTCTTATCTAAGTAAGTGGGCTTTCCTAAGTCACTTATTGGGTGGGGGGCCCCTGTCGAGTGAGCCATCCTTCCTCACCCTCTCTTTTGTTGGGCGAGCCATCTTTCCTTTTATACGATTCGATCCAGTAGATAAGGAAGACCGACCGAGAACAACCAATGGCCTTCCCTGGGGG * NM:i:0 AS:i:536 XS:i:536 554 | #HS9_159:6:1308:13492:64472 272 scaffold22,90777,f22Z90777 90233 0 536M * 0 0 * * NM:i:0 AS:i:536 555 | my $t; 556 | my $ct=0; 557 | 558 | my %options = (); 559 | 560 | print join( 561 | "\t", 562 | 'qname', 563 | 'qstart', 564 | 'qend', 565 | 'qalen', 566 | 'qlen', 567 | 'rname', 568 | 'rstart', 569 | 'rend', 570 | 'ralen', 571 | 'rlen', 572 | 'edit_dist', 573 | ) . 
"\n" if $options{header}; 574 | 575 | 576 | my %rlength = (); 577 | 578 | my $ERRLOG = $bamfile.".bampreprocessor.err.log".$$.time(); 579 | my $cmd = "$SAMPATH view $bamfile 2>$ERRLOG|"; 580 | open(IN,$cmd) || die "Error reading $bamfile -- fatal.\n"; 581 | while(){ 582 | 583 | chomp; 584 | $ct++; 585 | 586 | my @a=split(/\t/); 587 | my @b=split(/\,/,$a[2]); 588 | my @c=split(/\,/,$a[0]); 589 | 590 | if ($options{rlen} && /^\@SQ\s+SN:(\S+)\s+LN:(\S+)/) { 591 | $rlength{$1} = $2; 592 | } 593 | next unless @a >= 10; 594 | my $line = $_; 595 | my $qname = $a[0]; 596 | my $rname = $a[2]; 597 | my $rstart = $a[3]; 598 | my $cigar = $a[5]; 599 | my $qseq = $a[9]; 600 | # Query 601 | my $qstart = 1; 602 | $_ = $cigar; 603 | s/^(\d+)[SH]/$qstart += $1/eg; 604 | my $qalen = 0; 605 | $_ = $cigar; 606 | s/(\d+)[M=XI]/$qalen += $1/eg; 607 | my $qend = $qstart + $qalen - 1; 608 | $_ = $cigar; 609 | my $end_clip_len = 0; 610 | s/(\d+)[SH]$/$end_clip_len += $1/eg; 611 | my $qlen = $c[1]; 612 | #if ($qalen > 0) { 613 | # $qlen = ($qstart-1) + $qalen + $end_clip_len; 614 | #} elsif ($qseq ne "*") { 615 | # $qlen = length($a[9]); 616 | #} 617 | 618 | # Reference 619 | my $ralen = 0; 620 | $_ = $cigar; 621 | s/(\d+)[M=XDN]/$ralen += $1/eg; 622 | my $rend = $rstart + $ralen - 1; 623 | my $rlen = $b[1]; 624 | #if ($options{rlen} && exists($rlength{$rname})) { 625 | # $rlen = $rlength{$rname}; 626 | #} 627 | 628 | # Calculate edit distance including clipping 629 | my $edit_dist = ''; 630 | if ($line =~ /NM:i:(\d+)/) { 631 | $edit_dist = $1;# + $qstart - 1 + $end_clip_len; 632 | } 633 | 634 | # if ($rname eq '*') { 635 | # # case: query sequence is unmapped 636 | # print join("\t", $qname, $qstart, $qend, $qalen, $qlen) . "\n"; 637 | # } else { 638 | # print join("\t", $qname, $qstart, $qend, $qalen, $qlen, $rname, $rstart, $rend, $ralen, $rlen); 639 | # print "\t$edit_dist" if length($edit_dist) > 0; 640 | # print "\n"; 641 | # } 642 | 643 | my $read = $a[0] . "-" . 
$ct; 644 | my $si=0; 645 | $si = ($qalen - $edit_dist) / $qalen if($qalen); 646 | 647 | 648 | if($si >= $seqid && $qalen >= $anchor && (( $rstart <= $grace && ($qlen-$qend)<= $grace) || ($qstart<=$grace && ($rlen-$rend)<=$grace ) )){ ### this indicates anchoring bases, within $anchor of edges 649 | 650 | 651 | print "$si >= $seqid && $qalen >= $anchor && (( $rstart <= $grace && ($qlen-$qend)<= $grace) || ($qstart<=$grace && ($rlen-$rend)<=$grace\n" if($verbose); 652 | my $dir; 653 | my $start; 654 | my $end; 655 | ###Coordinates on the scaffolds 656 | if($rstart <= $grace && ($qlen-$qend)<= $grace){ 657 | $start = $rend; 658 | $end = $rstart; 659 | }else{ 660 | $start = $rstart; 661 | $end = $rend; 662 | } 663 | my $orient=""; 664 | if($a[1]==272 || $a[1]==16 || $a[1]==2064){ ### matches on negative strand 665 | $orient="r"; 666 | my $tmpstart = $qlen - $qend; 667 | my $tmpend = $qlen - $qstart; 668 | $qstart = $tmpstart; 669 | $qend = $tmpend; 670 | }else{ 671 | $orient="f"; 672 | } 673 | ###tracks from a read perspective 674 | my ($numtig,$scaftignum,$sz)=($2,$1,$3) if($a[2]=~/\D+((\d+)\.\d+),(\d+)/);### scaffoldNUMBER,LENGTH eg. wga1,1301 675 | $t->{$a[0]}{$scaftignum}{'orient'}= $dir . $orient ; 676 | $t->{$a[0]}{$scaftignum}{'real'}=$read;###my $read = $a[0] . "-" . 
$ct; 677 | $t->{$a[0]}{$scaftignum}{'length'}=$qlen; 678 | $track_all->{$read}{'tig'}=$numtig; 679 | $track_all->{$read}{'scaftig'}=$scaftignum; 680 | $track_all->{$read}{'start'}=$start; 681 | $track_all->{$read}{'end'}=$end; 682 | $track_all->{$read}{'multiple'}=1; 683 | $track_all->{$read}{'sam'}=$line; 684 | $track_all->{$read}{'orient'}=$orient; 685 | $track_all->{$read}{'qalen'}=$qalen; 686 | $track_all->{$read}{'qstart'}=$qstart; 687 | $track_all->{$read}{'qend'}=$qend; 688 | $track_all->{$read}{'si'}=$si; ### added 11APR2018 the read with most matching bases is chosen for gapfill (patch seq) 689 | # print "$line\n\n"; 690 | } 691 | } 692 | close IN;###End SAM parse 693 | my ($occ,$same)=(0,0);###TRACK REDUNDANCY 694 | 695 | foreach my $rd(keys %$t){ 696 | my $scafflist=$t->{$rd}; 697 | my $num = keys(%$scafflist); 698 | my $prevscaff = "NA"; 699 | foreach my $scaff(sort {$a<=>$b} keys %$scafflist){ 700 | if($prevscaff ne "NA"){ 701 | #if($num==2){###maps on two different scaftigs only 702 | #print "$num!\n"; 703 | my @arr; 704 | my $totalreadlength=0; 705 | my $current = $scafflist->{$scaff}{'real'}; 706 | my $prev = $scafflist->{$prevscaff}{'real'}; 707 | $totalreadlength = $scafflist->{$scaff}{'length'}; 708 | 709 | my ($p_s,$p_t)=($1,$2) if($track_all->{$prev}{'scaftig'}=~/(\d+)\.(\d+)/); 710 | my ($c_s,$c_t)=($1,$2) if($track_all->{$current}{'scaftig'}=~/(\d+)\.(\d+)/); 711 | my $prev_match = $p_s . "." . ($p_t + 1); 712 | my $curr_match = $c_s . "." . ($c_t + 1); 713 | 714 | #print "$track_all->{$current}{'tig'} == $track_all->{$prev}{'tig'} $track_all->{$current}{'scaftig'} $track_all->{$prev}{'scaftig'}\n"; 715 | #print "$track_all->{$current}{'scaftig'} ... 
$track_all->{$prev}{'scaftig'}\n"; 716 | if($track_all->{$current}{'tig'} == $track_all->{$prev}{'tig'} && (($track_all->{$current}{'scaftig'} eq $prev_match) || ($track_all->{$prev}{'scaftig'} eq $curr_match ))){###ADDED OCT2016, make sure on same scaffold and consecutive 717 | 718 | my ($one,$two)=($track_all->{$current}{'scaftig'},$track_all->{$prev}{'scaftig'}); 719 | if($p_t < $c_t){ 720 | ($one,$two)=($track_all->{$prev}{'scaftig'},$track_all->{$current}{'scaftig'}) 721 | } 722 | #print "$one,$two\n"; 723 | ### this will track the best anchoring long reads for the merge/gapfill 724 | my $m1 = $track_all->{$current}{'qalen'} * $track_all->{$current}{'si'}; 725 | my $m2 = $track_all->{$prev}{'qalen'} * $track_all->{$prev}{'si'}; 726 | my $matchbases = $m1 + $m2; 727 | 728 | if(! defined $mem->{$rd} && ($track_all->{$current}{'qstart'} > $track_all->{$prev}{'qend'} || $track_all->{$prev}{'qstart'} > $track_all->{$current}{'qend'}) && $rh->{$rd} ne ""){### WILL TRACK BEST ANCHORING BASES 729 | $tigpair->{$one}{$two}{'distr'}++; 730 | if($matchbases > $bt->{$one}{$two}{'bestmatch'}){ 731 | $bt->{$one}{$two}{'bestmatch'} = $matchbases; 732 | my $pos=0; 733 | $pos = $track_all->{$prev}{'qend'} if($track_all->{$current}{'qstart'} > $track_all->{$prev}{'qend'}); 734 | $pos = $track_all->{$current}{'qend'} if($track_all->{$prev}{'qstart'} > $track_all->{$current}{'qend'}); 735 | my $gapseqlen = $totalreadlength - ($track_all->{$prev}{'qalen'} + $track_all->{$current}{'qalen'}); 736 | print ">$rd $pos @ $gapseqlen \n$rh->{$rd}\n\n" if($verbose); 737 | my $patch = substr($rh->{$rd},$pos,$gapseqlen-1); 738 | 739 | print "GAP:$patch\n" if($verbose); 740 | 741 | ###JUST SOME TEST CODE 742 | if(defined $tigpair->{$track_all->{$prev}{'scaftig'}}{$track_all->{$current}{'scaftig'}}{'seq'}){ 743 | $occ++; 744 | $same if($patch eq $tigpair->{$track_all->{$prev}{'scaftig'}}{$track_all->{$current}{'scaftig'}}{'seq'}); 745 | 746 | print "$prev 
#-----------------------
# reverseComplement(SEQ)
#   Returns the reverse complement of a nucleotide string.
#   The input is upper-cased first, then A/T, G/C and the IUPAC ambiguity
#   codes Y/R, K/M, B/V, D/H are swapped; any other character (e.g. N) is
#   left unchanged. The result is always a single upper-case string,
#   regardless of the calling context.
sub reverseComplement{
   my ($seq) = @_;
   # Work on a lexical copy: the caller's $_ must not be modified.
   $seq = uc($seq);
   $seq =~ tr/ATGCYRKMBDHV/TACGRYMKVHDB/;
   # Force scalar context: in list context reverse() would reverse the
   # argument list instead of the characters of the string.
   return scalar reverse($seq);
}

#----------------
# average(ARRAYREF)
#   Returns the list (mean, sum, maximum, minimum) of the numbers in ARRAYREF.
#   Dies when the array is empty.
#   Call it in list context: in scalar context Perl's comma operator would
#   hand back only the last element of the returned list (the minimum).
sub average{
   my $data = shift;
   if (not @$data) {
      die("Empty arrayn -- maybe the scaffold merging step did not necessitate gap filling. It is also possible that your version of samtools is not supported. This script was tested with samtools v1.8.");
   }
   my $total = 0;
   # Seed the extrema with the first value so they are correct for any range
   # of inputs (negative numbers, values above 1,000,000).
   my $max = $data->[0];
   my $min = $data->[0];
   foreach my $value (@$data) {
      $total += $value;
      $max = $value if($value > $max);
      $min = $value if($value < $min);
   }
   my $average = $total / @$data;
   return ($average,$total,$max,$min);
}

#----------------
# stdev(ARRAYREF)
#   Returns the sample standard deviation (n-1 denominator) of the numbers in
#   ARRAYREF. A single-element array yields 0; an empty array dies inside
#   average().
sub stdev{
   my $data = shift;
   if(@$data == 1){
      return 0;
   }
   # List assignment picks the FIRST value returned by average() (the mean).
   # A plain scalar assignment would receive the last one (the minimum).
   my ($average) = average($data);
   my $sqtotal = 0;
   foreach my $value (@$data) {
      $sqtotal += ($average-$value) ** 2;
   }
   my $std = ($sqtotal / (@$data-1)) ** 0.5;
   return $std;
}

## We hope this code is useful to you -- Please send comments & suggestions to rwarren at bcgsc.ca
You can apply 23 | it to your programs, too. 24 | 25 | When we speak of free software, we are referring to freedom, not 26 | price. Our General Public Licenses are designed to make sure that you 27 | have the freedom to distribute copies of free software (and charge for 28 | them if you wish), that you receive source code or can get it if you 29 | want it, that you can change the software or use pieces of it in new 30 | free programs, and that you know you can do these things. 31 | 32 | To protect your rights, we need to prevent others from denying you 33 | these rights or asking you to surrender the rights. Therefore, you 34 | have certain responsibilities if you distribute copies of the 35 | software, or if you modify it: responsibilities to respect the freedom 36 | of others. 37 | 38 | For example, if you distribute copies of such a program, whether 39 | gratis or for a fee, you must pass on to the recipients the same 40 | freedoms that you received. You must make sure that they, too, receive 41 | or can get the source code. And you must show them these terms so they 42 | know their rights. 43 | 44 | Developers that use the GNU GPL protect your rights with two steps: 45 | (1) assert copyright on the software, and (2) offer you this License 46 | giving you legal permission to copy, distribute and/or modify it. 47 | 48 | For the developers' and authors' protection, the GPL clearly explains 49 | that there is no warranty for this free software. For both users' and 50 | authors' sake, the GPL requires that modified versions be marked as 51 | changed, so that their problems will not be attributed erroneously to 52 | authors of previous versions. 53 | 54 | Some devices are designed to deny users access to install or run 55 | modified versions of the software inside them, although the 56 | manufacturer can do so. This is fundamentally incompatible with the 57 | aim of protecting users' freedom to change the software. 
The 58 | systematic pattern of such abuse occurs in the area of products for 59 | individuals to use, which is precisely where it is most unacceptable. 60 | Therefore, we have designed this version of the GPL to prohibit the 61 | practice for those products. If such problems arise substantially in 62 | other domains, we stand ready to extend this provision to those 63 | domains in future versions of the GPL, as needed to protect the 64 | freedom of users. 65 | 66 | Finally, every program is threatened constantly by software patents. 67 | States should not allow patents to restrict development and use of 68 | software on general-purpose computers, but in those that do, we wish 69 | to avoid the special danger that patents applied to a free program 70 | could make it effectively proprietary. To prevent this, the GPL 71 | assures that patents cannot be used to render the program non-free. 72 | 73 | The precise terms and conditions for copying, distribution and 74 | modification follow. 75 | 76 | ### TERMS AND CONDITIONS 77 | 78 | #### 0. Definitions. 79 | 80 | "This License" refers to version 3 of the GNU General Public License. 81 | 82 | "Copyright" also means copyright-like laws that apply to other kinds 83 | of works, such as semiconductor masks. 84 | 85 | "The Program" refers to any copyrightable work licensed under this 86 | License. Each licensee is addressed as "you". "Licensees" and 87 | "recipients" may be individuals or organizations. 88 | 89 | To "modify" a work means to copy from or adapt all or part of the work 90 | in a fashion requiring copyright permission, other than the making of 91 | an exact copy. The resulting work is called a "modified version" of 92 | the earlier work or a work "based on" the earlier work. 93 | 94 | A "covered work" means either the unmodified Program or a work based 95 | on the Program. 
96 | 97 | To "propagate" a work means to do anything with it that, without 98 | permission, would make you directly or secondarily liable for 99 | infringement under applicable copyright law, except executing it on a 100 | computer or modifying a private copy. Propagation includes copying, 101 | distribution (with or without modification), making available to the 102 | public, and in some countries other activities as well. 103 | 104 | To "convey" a work means any kind of propagation that enables other 105 | parties to make or receive copies. Mere interaction with a user 106 | through a computer network, with no transfer of a copy, is not 107 | conveying. 108 | 109 | An interactive user interface displays "Appropriate Legal Notices" to 110 | the extent that it includes a convenient and prominently visible 111 | feature that (1) displays an appropriate copyright notice, and (2) 112 | tells the user that there is no warranty for the work (except to the 113 | extent that warranties are provided), that licensees may convey the 114 | work under this License, and how to view a copy of this License. If 115 | the interface presents a list of user commands or options, such as a 116 | menu, a prominent item in the list meets this criterion. 117 | 118 | #### 1. Source Code. 119 | 120 | The "source code" for a work means the preferred form of the work for 121 | making modifications to it. "Object code" means any non-source form of 122 | a work. 123 | 124 | A "Standard Interface" means an interface that either is an official 125 | standard defined by a recognized standards body, or, in the case of 126 | interfaces specified for a particular programming language, one that 127 | is widely used among developers working in that language. 
128 | 129 | The "System Libraries" of an executable work include anything, other 130 | than the work as a whole, that (a) is included in the normal form of 131 | packaging a Major Component, but which is not part of that Major 132 | Component, and (b) serves only to enable use of the work with that 133 | Major Component, or to implement a Standard Interface for which an 134 | implementation is available to the public in source code form. A 135 | "Major Component", in this context, means a major essential component 136 | (kernel, window system, and so on) of the specific operating system 137 | (if any) on which the executable work runs, or a compiler used to 138 | produce the work, or an object code interpreter used to run it. 139 | 140 | The "Corresponding Source" for a work in object code form means all 141 | the source code needed to generate, install, and (for an executable 142 | work) run the object code and to modify the work, including scripts to 143 | control those activities. However, it does not include the work's 144 | System Libraries, or general-purpose tools or generally available free 145 | programs which are used unmodified in performing those activities but 146 | which are not part of the work. For example, Corresponding Source 147 | includes interface definition files associated with source files for 148 | the work, and the source code for shared libraries and dynamically 149 | linked subprograms that the work is specifically designed to require, 150 | such as by intimate data communication or control flow between those 151 | subprograms and other parts of the work. 152 | 153 | The Corresponding Source need not include anything that users can 154 | regenerate automatically from other parts of the Corresponding Source. 155 | 156 | The Corresponding Source for a work in source code form is that same 157 | work. 158 | 159 | #### 2. Basic Permissions. 
160 | 161 | All rights granted under this License are granted for the term of 162 | copyright on the Program, and are irrevocable provided the stated 163 | conditions are met. This License explicitly affirms your unlimited 164 | permission to run the unmodified Program. The output from running a 165 | covered work is covered by this License only if the output, given its 166 | content, constitutes a covered work. This License acknowledges your 167 | rights of fair use or other equivalent, as provided by copyright law. 168 | 169 | You may make, run and propagate covered works that you do not convey, 170 | without conditions so long as your license otherwise remains in force. 171 | You may convey covered works to others for the sole purpose of having 172 | them make modifications exclusively for you, or provide you with 173 | facilities for running those works, provided that you comply with the 174 | terms of this License in conveying all material for which you do not 175 | control copyright. Those thus making or running the covered works for 176 | you must do so exclusively on your behalf, under your direction and 177 | control, on terms that prohibit them from making any copies of your 178 | copyrighted material outside their relationship with you. 179 | 180 | Conveying under any other circumstances is permitted solely under the 181 | conditions stated below. Sublicensing is not allowed; section 10 makes 182 | it unnecessary. 183 | 184 | #### 3. Protecting Users' Legal Rights From Anti-Circumvention Law. 185 | 186 | No covered work shall be deemed part of an effective technological 187 | measure under any applicable law fulfilling obligations under article 188 | 11 of the WIPO copyright treaty adopted on 20 December 1996, or 189 | similar laws prohibiting or restricting circumvention of such 190 | measures. 
191 | 192 | When you convey a covered work, you waive any legal power to forbid 193 | circumvention of technological measures to the extent such 194 | circumvention is effected by exercising rights under this License with 195 | respect to the covered work, and you disclaim any intention to limit 196 | operation or modification of the work as a means of enforcing, against 197 | the work's users, your or third parties' legal rights to forbid 198 | circumvention of technological measures. 199 | 200 | #### 4. Conveying Verbatim Copies. 201 | 202 | You may convey verbatim copies of the Program's source code as you 203 | receive it, in any medium, provided that you conspicuously and 204 | appropriately publish on each copy an appropriate copyright notice; 205 | keep intact all notices stating that this License and any 206 | non-permissive terms added in accord with section 7 apply to the code; 207 | keep intact all notices of the absence of any warranty; and give all 208 | recipients a copy of this License along with the Program. 209 | 210 | You may charge any price or no price for each copy that you convey, 211 | and you may offer support or warranty protection for a fee. 212 | 213 | #### 5. Conveying Modified Source Versions. 214 | 215 | You may convey a work based on the Program, or the modifications to 216 | produce it from the Program, in the form of source code under the 217 | terms of section 4, provided that you also meet all of these 218 | conditions: 219 | 220 | - a) The work must carry prominent notices stating that you modified 221 | it, and giving a relevant date. 222 | - b) The work must carry prominent notices stating that it is 223 | released under this License and any conditions added under 224 | section 7. This requirement modifies the requirement in section 4 225 | to "keep intact all notices". 226 | - c) You must license the entire work, as a whole, under this 227 | License to anyone who comes into possession of a copy. 
This 228 | License will therefore apply, along with any applicable section 7 229 | additional terms, to the whole of the work, and all its parts, 230 | regardless of how they are packaged. This License gives no 231 | permission to license the work in any other way, but it does not 232 | invalidate such permission if you have separately received it. 233 | - d) If the work has interactive user interfaces, each must display 234 | Appropriate Legal Notices; however, if the Program has interactive 235 | interfaces that do not display Appropriate Legal Notices, your 236 | work need not make them do so. 237 | 238 | A compilation of a covered work with other separate and independent 239 | works, which are not by their nature extensions of the covered work, 240 | and which are not combined with it such as to form a larger program, 241 | in or on a volume of a storage or distribution medium, is called an 242 | "aggregate" if the compilation and its resulting copyright are not 243 | used to limit the access or legal rights of the compilation's users 244 | beyond what the individual works permit. Inclusion of a covered work 245 | in an aggregate does not cause this License to apply to the other 246 | parts of the aggregate. 247 | 248 | #### 6. Conveying Non-Source Forms. 249 | 250 | You may convey a covered work in object code form under the terms of 251 | sections 4 and 5, provided that you also convey the machine-readable 252 | Corresponding Source under the terms of this License, in one of these 253 | ways: 254 | 255 | - a) Convey the object code in, or embodied in, a physical product 256 | (including a physical distribution medium), accompanied by the 257 | Corresponding Source fixed on a durable physical medium 258 | customarily used for software interchange. 
259 | - b) Convey the object code in, or embodied in, a physical product 260 | (including a physical distribution medium), accompanied by a 261 | written offer, valid for at least three years and valid for as 262 | long as you offer spare parts or customer support for that product 263 | model, to give anyone who possesses the object code either (1) a 264 | copy of the Corresponding Source for all the software in the 265 | product that is covered by this License, on a durable physical 266 | medium customarily used for software interchange, for a price no 267 | more than your reasonable cost of physically performing this 268 | conveying of source, or (2) access to copy the Corresponding 269 | Source from a network server at no charge. 270 | - c) Convey individual copies of the object code with a copy of the 271 | written offer to provide the Corresponding Source. This 272 | alternative is allowed only occasionally and noncommercially, and 273 | only if you received the object code with such an offer, in accord 274 | with subsection 6b. 275 | - d) Convey the object code by offering access from a designated 276 | place (gratis or for a charge), and offer equivalent access to the 277 | Corresponding Source in the same way through the same place at no 278 | further charge. You need not require recipients to copy the 279 | Corresponding Source along with the object code. If the place to 280 | copy the object code is a network server, the Corresponding Source 281 | may be on a different server (operated by you or a third party) 282 | that supports equivalent copying facilities, provided you maintain 283 | clear directions next to the object code saying where to find the 284 | Corresponding Source. Regardless of what server hosts the 285 | Corresponding Source, you remain obligated to ensure that it is 286 | available for as long as needed to satisfy these requirements. 
287 | - e) Convey the object code using peer-to-peer transmission, 288 | provided you inform other peers where the object code and 289 | Corresponding Source of the work are being offered to the general 290 | public at no charge under subsection 6d. 291 | 292 | A separable portion of the object code, whose source code is excluded 293 | from the Corresponding Source as a System Library, need not be 294 | included in conveying the object code work. 295 | 296 | A "User Product" is either (1) a "consumer product", which means any 297 | tangible personal property which is normally used for personal, 298 | family, or household purposes, or (2) anything designed or sold for 299 | incorporation into a dwelling. In determining whether a product is a 300 | consumer product, doubtful cases shall be resolved in favor of 301 | coverage. For a particular product received by a particular user, 302 | "normally used" refers to a typical or common use of that class of 303 | product, regardless of the status of the particular user or of the way 304 | in which the particular user actually uses, or expects or is expected 305 | to use, the product. A product is a consumer product regardless of 306 | whether the product has substantial commercial, industrial or 307 | non-consumer uses, unless such uses represent the only significant 308 | mode of use of the product. 309 | 310 | "Installation Information" for a User Product means any methods, 311 | procedures, authorization keys, or other information required to 312 | install and execute modified versions of a covered work in that User 313 | Product from a modified version of its Corresponding Source. The 314 | information must suffice to ensure that the continued functioning of 315 | the modified object code is in no case prevented or interfered with 316 | solely because modification has been made. 
317 | 318 | If you convey an object code work under this section in, or with, or 319 | specifically for use in, a User Product, and the conveying occurs as 320 | part of a transaction in which the right of possession and use of the 321 | User Product is transferred to the recipient in perpetuity or for a 322 | fixed term (regardless of how the transaction is characterized), the 323 | Corresponding Source conveyed under this section must be accompanied 324 | by the Installation Information. But this requirement does not apply 325 | if neither you nor any third party retains the ability to install 326 | modified object code on the User Product (for example, the work has 327 | been installed in ROM). 328 | 329 | The requirement to provide Installation Information does not include a 330 | requirement to continue to provide support service, warranty, or 331 | updates for a work that has been modified or installed by the 332 | recipient, or for the User Product in which it has been modified or 333 | installed. Access to a network may be denied when the modification 334 | itself materially and adversely affects the operation of the network 335 | or violates the rules and protocols for communication across the 336 | network. 337 | 338 | Corresponding Source conveyed, and Installation Information provided, 339 | in accord with this section must be in a format that is publicly 340 | documented (and with an implementation available to the public in 341 | source code form), and must require no special password or key for 342 | unpacking, reading or copying. 343 | 344 | #### 7. Additional Terms. 345 | 346 | "Additional permissions" are terms that supplement the terms of this 347 | License by making exceptions from one or more of its conditions. 348 | Additional permissions that are applicable to the entire Program shall 349 | be treated as though they were included in this License, to the extent 350 | that they are valid under applicable law. 
If additional permissions 351 | apply only to part of the Program, that part may be used separately 352 | under those permissions, but the entire Program remains governed by 353 | this License without regard to the additional permissions. 354 | 355 | When you convey a copy of a covered work, you may at your option 356 | remove any additional permissions from that copy, or from any part of 357 | it. (Additional permissions may be written to require their own 358 | removal in certain cases when you modify the work.) You may place 359 | additional permissions on material, added by you to a covered work, 360 | for which you have or can give appropriate copyright permission. 361 | 362 | Notwithstanding any other provision of this License, for material you 363 | add to a covered work, you may (if authorized by the copyright holders 364 | of that material) supplement the terms of this License with terms: 365 | 366 | - a) Disclaiming warranty or limiting liability differently from the 367 | terms of sections 15 and 16 of this License; or 368 | - b) Requiring preservation of specified reasonable legal notices or 369 | author attributions in that material or in the Appropriate Legal 370 | Notices displayed by works containing it; or 371 | - c) Prohibiting misrepresentation of the origin of that material, 372 | or requiring that modified versions of such material be marked in 373 | reasonable ways as different from the original version; or 374 | - d) Limiting the use for publicity purposes of names of licensors 375 | or authors of the material; or 376 | - e) Declining to grant rights under trademark law for use of some 377 | trade names, trademarks, or service marks; or 378 | - f) Requiring indemnification of licensors and authors of that 379 | material by anyone who conveys the material (or modified versions 380 | of it) with contractual assumptions of liability to the recipient, 381 | for any liability that these contractual assumptions directly 382 | impose on those 
licensors and authors. 383 | 384 | All other non-permissive additional terms are considered "further 385 | restrictions" within the meaning of section 10. If the Program as you 386 | received it, or any part of it, contains a notice stating that it is 387 | governed by this License along with a term that is a further 388 | restriction, you may remove that term. If a license document contains 389 | a further restriction but permits relicensing or conveying under this 390 | License, you may add to a covered work material governed by the terms 391 | of that license document, provided that the further restriction does 392 | not survive such relicensing or conveying. 393 | 394 | If you add terms to a covered work in accord with this section, you 395 | must place, in the relevant source files, a statement of the 396 | additional terms that apply to those files, or a notice indicating 397 | where to find the applicable terms. 398 | 399 | Additional terms, permissive or non-permissive, may be stated in the 400 | form of a separately written license, or stated as exceptions; the 401 | above requirements apply either way. 402 | 403 | #### 8. Termination. 404 | 405 | You may not propagate or modify a covered work except as expressly 406 | provided under this License. Any attempt otherwise to propagate or 407 | modify it is void, and will automatically terminate your rights under 408 | this License (including any patent licenses granted under the third 409 | paragraph of section 11). 410 | 411 | However, if you cease all violation of this License, then your license 412 | from a particular copyright holder is reinstated (a) provisionally, 413 | unless and until the copyright holder explicitly and finally 414 | terminates your license, and (b) permanently, if the copyright holder 415 | fails to notify you of the violation by some reasonable means prior to 416 | 60 days after the cessation. 
417 | 418 | Moreover, your license from a particular copyright holder is 419 | reinstated permanently if the copyright holder notifies you of the 420 | violation by some reasonable means, this is the first time you have 421 | received notice of violation of this License (for any work) from that 422 | copyright holder, and you cure the violation prior to 30 days after 423 | your receipt of the notice. 424 | 425 | Termination of your rights under this section does not terminate the 426 | licenses of parties who have received copies or rights from you under 427 | this License. If your rights have been terminated and not permanently 428 | reinstated, you do not qualify to receive new licenses for the same 429 | material under section 10. 430 | 431 | #### 9. Acceptance Not Required for Having Copies. 432 | 433 | You are not required to accept this License in order to receive or run 434 | a copy of the Program. Ancillary propagation of a covered work 435 | occurring solely as a consequence of using peer-to-peer transmission 436 | to receive a copy likewise does not require acceptance. However, 437 | nothing other than this License grants you permission to propagate or 438 | modify any covered work. These actions infringe copyright if you do 439 | not accept this License. Therefore, by modifying or propagating a 440 | covered work, you indicate your acceptance of this License to do so. 441 | 442 | #### 10. Automatic Licensing of Downstream Recipients. 443 | 444 | Each time you convey a covered work, the recipient automatically 445 | receives a license from the original licensors, to run, modify and 446 | propagate that work, subject to this License. You are not responsible 447 | for enforcing compliance by third parties with this License. 448 | 449 | An "entity transaction" is a transaction transferring control of an 450 | organization, or substantially all assets of one, or subdividing an 451 | organization, or merging organizations. 
If propagation of a covered 452 | work results from an entity transaction, each party to that 453 | transaction who receives a copy of the work also receives whatever 454 | licenses to the work the party's predecessor in interest had or could 455 | give under the previous paragraph, plus a right to possession of the 456 | Corresponding Source of the work from the predecessor in interest, if 457 | the predecessor has it or can get it with reasonable efforts. 458 | 459 | You may not impose any further restrictions on the exercise of the 460 | rights granted or affirmed under this License. For example, you may 461 | not impose a license fee, royalty, or other charge for exercise of 462 | rights granted under this License, and you may not initiate litigation 463 | (including a cross-claim or counterclaim in a lawsuit) alleging that 464 | any patent claim is infringed by making, using, selling, offering for 465 | sale, or importing the Program or any portion of it. 466 | 467 | #### 11. Patents. 468 | 469 | A "contributor" is a copyright holder who authorizes use under this 470 | License of the Program or a work on which the Program is based. The 471 | work thus licensed is called the contributor's "contributor version". 472 | 473 | A contributor's "essential patent claims" are all patent claims owned 474 | or controlled by the contributor, whether already acquired or 475 | hereafter acquired, that would be infringed by some manner, permitted 476 | by this License, of making, using, or selling its contributor version, 477 | but do not include claims that would be infringed only as a 478 | consequence of further modification of the contributor version. For 479 | purposes of this definition, "control" includes the right to grant 480 | patent sublicenses in a manner consistent with the requirements of 481 | this License. 
482 | 483 | Each contributor grants you a non-exclusive, worldwide, royalty-free 484 | patent license under the contributor's essential patent claims, to 485 | make, use, sell, offer for sale, import and otherwise run, modify and 486 | propagate the contents of its contributor version. 487 | 488 | In the following three paragraphs, a "patent license" is any express 489 | agreement or commitment, however denominated, not to enforce a patent 490 | (such as an express permission to practice a patent or covenant not to 491 | sue for patent infringement). To "grant" such a patent license to a 492 | party means to make such an agreement or commitment not to enforce a 493 | patent against the party. 494 | 495 | If you convey a covered work, knowingly relying on a patent license, 496 | and the Corresponding Source of the work is not available for anyone 497 | to copy, free of charge and under the terms of this License, through a 498 | publicly available network server or other readily accessible means, 499 | then you must either (1) cause the Corresponding Source to be so 500 | available, or (2) arrange to deprive yourself of the benefit of the 501 | patent license for this particular work, or (3) arrange, in a manner 502 | consistent with the requirements of this License, to extend the patent 503 | license to downstream recipients. "Knowingly relying" means you have 504 | actual knowledge that, but for the patent license, your conveying the 505 | covered work in a country, or your recipient's use of the covered work 506 | in a country, would infringe one or more identifiable patents in that 507 | country that you have reason to believe are valid. 
508 | 509 | If, pursuant to or in connection with a single transaction or 510 | arrangement, you convey, or propagate by procuring conveyance of, a 511 | covered work, and grant a patent license to some of the parties 512 | receiving the covered work authorizing them to use, propagate, modify 513 | or convey a specific copy of the covered work, then the patent license 514 | you grant is automatically extended to all recipients of the covered 515 | work and works based on it. 516 | 517 | A patent license is "discriminatory" if it does not include within the 518 | scope of its coverage, prohibits the exercise of, or is conditioned on 519 | the non-exercise of one or more of the rights that are specifically 520 | granted under this License. You may not convey a covered work if you 521 | are a party to an arrangement with a third party that is in the 522 | business of distributing software, under which you make payment to the 523 | third party based on the extent of your activity of conveying the 524 | work, and under which the third party grants, to any of the parties 525 | who would receive the covered work from you, a discriminatory patent 526 | license (a) in connection with copies of the covered work conveyed by 527 | you (or copies made from those copies), or (b) primarily for and in 528 | connection with specific products or compilations that contain the 529 | covered work, unless you entered into that arrangement, or that patent 530 | license was granted, prior to 28 March 2007. 531 | 532 | Nothing in this License shall be construed as excluding or limiting 533 | any implied license or other defenses to infringement that may 534 | otherwise be available to you under applicable patent law. 535 | 536 | #### 12. No Surrender of Others' Freedom. 537 | 538 | If conditions are imposed on you (whether by court order, agreement or 539 | otherwise) that contradict the conditions of this License, they do not 540 | excuse you from the conditions of this License. 
If you cannot convey a 541 | covered work so as to satisfy simultaneously your obligations under 542 | this License and any other pertinent obligations, then as a 543 | consequence you may not convey it at all. For example, if you agree to 544 | terms that obligate you to collect a royalty for further conveying 545 | from those to whom you convey the Program, the only way you could 546 | satisfy both those terms and this License would be to refrain entirely 547 | from conveying the Program. 548 | 549 | #### 13. Use with the GNU Affero General Public License. 550 | 551 | Notwithstanding any other provision of this License, you have 552 | permission to link or combine any covered work with a work licensed 553 | under version 3 of the GNU Affero General Public License into a single 554 | combined work, and to convey the resulting work. The terms of this 555 | License will continue to apply to the part which is the covered work, 556 | but the special requirements of the GNU Affero General Public License, 557 | section 13, concerning interaction through a network will apply to the 558 | combination as such. 559 | 560 | #### 14. Revised Versions of this License. 561 | 562 | The Free Software Foundation may publish revised and/or new versions 563 | of the GNU General Public License from time to time. Such new versions 564 | will be similar in spirit to the present version, but may differ in 565 | detail to address new problems or concerns. 566 | 567 | Each version is given a distinguishing version number. If the Program 568 | specifies that a certain numbered version of the GNU General Public 569 | License "or any later version" applies to it, you have the option of 570 | following the terms and conditions either of that numbered version or 571 | of any later version published by the Free Software Foundation. 
If the 572 | Program does not specify a version number of the GNU General Public 573 | License, you may choose any version ever published by the Free 574 | Software Foundation. 575 | 576 | If the Program specifies that a proxy can decide which future versions 577 | of the GNU General Public License can be used, that proxy's public 578 | statement of acceptance of a version permanently authorizes you to 579 | choose that version for the Program. 580 | 581 | Later license versions may give you additional or different 582 | permissions. However, no additional obligations are imposed on any 583 | author or copyright holder as a result of your choosing to follow a 584 | later version. 585 | 586 | #### 15. Disclaimer of Warranty. 587 | 588 | THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY 589 | APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT 590 | HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT 591 | WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT 592 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 593 | A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND 594 | PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE 595 | DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR 596 | CORRECTION. 597 | 598 | #### 16. Limitation of Liability. 
599 | 600 | IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING 601 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR 602 | CONVEYS THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, 603 | INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES 604 | ARISING OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT 605 | NOT LIMITED TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR 606 | LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM 607 | TO OPERATE WITH ANY OTHER PROGRAMS), EVEN IF SUCH HOLDER OR OTHER 608 | PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. 609 | 610 | #### 17. Interpretation of Sections 15 and 16. 611 | 612 | If the disclaimer of warranty and limitation of liability provided 613 | above cannot be given local legal effect according to their terms, 614 | reviewing courts shall apply local law that most closely approximates 615 | an absolute waiver of all civil liability in connection with the 616 | Program, unless a warranty or assumption of liability accompanies a 617 | copy of the Program in return for a fee. 618 | 619 | END OF TERMS AND CONDITIONS 620 | 621 | ### How to Apply These Terms to Your New Programs 622 | 623 | If you develop a new program, and you want it to be of the greatest 624 | possible use to the public, the best way to achieve this is to make it 625 | free software which everyone can redistribute and change under these 626 | terms. 627 | 628 | To do so, attach the following notices to the program. It is safest to 629 | attach them to the start of each source file to most effectively state 630 | the exclusion of warranty; and each file should have at least the 631 | "copyright" line and a pointer to where the full notice is found. 632 | 633 | RAILS and Cobbler: Scaffolding and automated finishing of draft genomes using long DNA sequences 634 | Copyright (C) 2014-2016 British Columbia Cancer Agency Branch. Rene Warren. 
All rights reserved. 635 | 636 | This program is free software: you can redistribute it and/or modify 637 | it under the terms of the GNU General Public License as published by 638 | the Free Software Foundation, either version 3 of the License, or 639 | (at your option) any later version. 640 | 641 | This program is distributed in the hope that it will be useful, 642 | but WITHOUT ANY WARRANTY; without even the implied warranty of 643 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 644 | GNU General Public License for more details. 645 | 646 | You should have received a copy of the GNU General Public License 647 | along with this program. If not, see https://www.gnu.org/licenses/. 648 | 649 | Also add information on how to contact you by electronic and paper 650 | mail. 651 | 652 | If the program does terminal interaction, make it output a short 653 | notice like this when it starts in an interactive mode: 654 | 655 | RAILS and Cobbler Copyright (c) 2014-2016 British Columbia Cancer Agency Branch. Rene Warren. All rights reserved. 656 | This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. 657 | This is free software, and you are welcome to redistribute it 658 | under certain conditions; type `show c' for details. 659 | 660 | The hypothetical commands \`show w' and \`show c' should show the 661 | appropriate parts of the General Public License. Of course, your 662 | program's commands might be different; for a GUI interface, you would 663 | use an "about box". 664 | 665 | You should also get your employer (if you work as a programmer) or 666 | school, if any, to sign a "copyright disclaimer" for the program, if 667 | necessary. For more information on this, and how to apply and follow 668 | the GNU GPL, see https://www.gnu.org/licenses/. 669 | 670 | The GNU General Public License does not permit incorporating your 671 | program into proprietary programs.
If your program is a subroutine 672 | library, you may consider it more useful to permit linking proprietary 673 | applications with the library. If this is what you want to do, use the 674 | GNU Lesser General Public License instead of this License. But first, 675 | please read https://www.gnu.org/licenses/why-not-lgpl.html. 676 | -------------------------------------------------------------------------------- /bin/RAILS: --------------------------------------------------------------------------------
#!/usr/bin/env perl

#AUTHOR
# Rene Warren
# rwarren at bcgsc.ca
#
#
# NAME
# RAILS: Radial Assembly Improvement by Long Sequence Scaffolding
# Scaffolding and gap-closure using alignment of long sequences
#
# SYNOPSIS
#
# DOCUMENTATION
# readme.md distributed with this software
# We hope this code is useful to you -- Please send comments & suggestions to rwarren * bcgsc.ca
# If you use RAILS, the RAILS code or ideas, please cite our work
#
#
# LICENSE
# LINKS, RAILS and Cobbler Copyright (c) 2014-2019 Canada's Michael Smith Genome Science Centre. All rights reserved.

use strict;
use Getopt::Std;
use Net::SMTP;
### each option is declared once (-p used to be listed twice)
use vars qw($opt_f $opt_s $opt_d $opt_i $opt_e $opt_l $opt_a $opt_v $opt_b $opt_t $opt_p $opt_q $opt_g);
getopts('f:s:d:e:l:a:v:b:t:p:i:q:g:');
my ($base_name,$anchor,$seqid,$insert_stdev,$min_links,$max_link_ratio,$grace,$verbose)=("",1000,0.9,1.0,1,0.99,1,0);

my $version = "[v1.5.1]";
my $dev = "rwarren\@bcgsc.ca";
my $SAMPATH = "";
#-------------------------------------------------

### -f (assembly), -s (BAM file-of-filenames or "stream") and -q (query file-of-filenames) are mandatory
if(! $opt_f || ! $opt_s || ! $opt_q){
    print "Usage: $0 $version\n";
    print "-f Assembled Sequences to further scaffold (Multi-Fasta format, required)\n";
    print "-q File of filenames containing long Sequences queried (Multi-Fasta format, required)\n";
    print "-s File of filenames containing full path to BAM file(s) or simply type: stream for streaming the .sam output of minimap2 or favorite aligner\n";
    print "-p Full path to samtools (known to work/tested with v1.8, required if reading BAM files)\n";
    print "-d Anchoring bases on contig edges (ie. minimum required alignment size on contigs, default -d $anchor, optional)\n";
    print "-i Minimum sequence identity fraction (0 to 1), default -i $seqid, optional\n";
    print "-t LIST of names/header, long sequences to avoid using for merging/gap-filling scaffolds (optional)\n";
    #print "-e Error (%) allowed on -d distance e.g. -e 0.1 == distance +/- 10% (default -e $insert_stdev, optional)\n";
    print "-l Minimum number of links to compute scaffold (default -l $min_links, optional)\n";
    print "-a Maximum link ratio between two best contig pairs *higher values lead to least accurate scaffolding* (default -a $max_link_ratio, optional)\n";
    print "-g Grace length (bp), default -g $grace, optional\n";
    print "-b Base name for your output files (optional)\n";
    print "-v Runs in verbose mode (-v 1 = yes, default = no, optional)\n";
    die "IMPORTANT: the order of files in -q and -s MUST match!\n";
}

my $file = $opt_f;
my $fof = $opt_s;
my $queryfof = $opt_q;
$anchor = $opt_d if($opt_d);
$seqid = $opt_i if($opt_i);
$verbose = $opt_v if($opt_v);
#DO NOT UNCOMMENT THE FOLLOWING LINES, ONLY DEFAULTS TO BE USED WITH RAILS
$min_links = $opt_l if($opt_l);
$max_link_ratio = $opt_a if($opt_a);
#$insert_stdev = $opt_e if($opt_e);
### declared unconditionally: "my ... if" has undefined behaviour (perlsyn)
my $listfile = $opt_t ? $opt_t : "";
$base_name = $opt_b if($opt_b);
$grace = $opt_g if($opt_g);
$SAMPATH = $opt_p if($opt_p);

my $assemblyruninfo="";


if(! -e $file){
    die "Invalid file: $file -- fatal\n";
}

### samtools is only needed when BAM files are read
if(! -e $SAMPATH && $fof ne "stream"){
    die "Invalid: $SAMPATH -- fatal\n";
}

### Naming output files
if ($base_name eq ""){

    $base_name = $file . ".scaff_s-" . $fof . "_q-" . $queryfof . "_d" . $anchor . "_i" . $seqid . "_e" . $insert_stdev . "_l" . $min_links . "_a" . $max_link_ratio . "_g" . $grace . "_t" . $listfile;

    my $pid_num = getpgrp(0);
    $base_name .= "_pid" . $pid_num;
}

my $log = $base_name . ".log";
my $scaffold = $base_name . ".scaffolds";
my $issues = $base_name . ".pairing_issues";
my $distribution = $base_name . ".pairing_distribution.csv";

### LOG and SC stay bareword handles: they are global to the file
open (LOG, '>', $log) || die "Can't write to $log -- fatal\n";


#-------------------------------------------------

my $init_message = "\nRunning: $0 $version\n-f $file\n-q $queryfof\n-s $fof\n";

$init_message .= "-d $anchor\n-i $seqid\n-e $insert_stdev\n-l $min_links\n-a $max_link_ratio\n-g $grace\n-t $listfile\n";

print $init_message;
print LOG $init_message;
$assemblyruninfo=$init_message . "\n";

#-------------------------------------------------

my $date = `date`;
chomp($date);

my $reading_reads_message = "\n=>Reading bam: $date\n";
print $reading_reads_message;
print LOG $reading_reads_message;
$assemblyruninfo.=$reading_reads_message;
my $matepair;
my $tigpair;
my $initpos=0;
my $totalpairs=0;
my ($track_all,$tig_length,$tignames);

### READ Query read FOF
my @qryfilearray;
open(my $qryfof_fh, '<', $queryfof) || die "Can't open $queryfof for reading -- fatal.\n";
while(my $qryfile = <$qryfof_fh>){
    chomp($qryfile);
    push @qryfilearray, $qryfile;
}
close $qryfof_fh;


if(-f $fof){

    my $ctline=0;

    open(my $fof_fh, '<', $fof) || die "Can't open $fof for reading -- fatal.\n";
    while(my $bamfile = <$fof_fh>){
        chomp($bamfile);
        ### -q and -s are paired line by line; a missing partner cannot be recovered from
        if(! defined $qryfilearray[$ctline]){
            die "No query sequence file listed in $queryfof for alignment file $bamfile -- fatal.\n";
        }
        my $rh = readSeqMemory($qryfilearray[$ctline]);### ONLY READ READ SEQUENCE IN MEMORY FOR THOSE MATCHING BAM (SAME ORDER NEEDED)
        print "Parsing alignment file $bamfile...\n";
        ($matepair,$track_all,$tigpair)=readBam($matepair,$track_all,$tigpair,$bamfile,$anchor,$seqid,$listfile,$matepair,$initpos,$rh,$grace);
        print "done.\n";
        $ctline++;
    }
    close $fof_fh;

}elsif($opt_s eq "stream"){### STREAMING SUPPORT

    my $rh = readSeqMemory($qryfilearray[0]);### ONLY SUPPORT ONE READ FILE
    print "Parsing alignment file...\n";
    ($matepair,$track_all,$tigpair)=readSam($matepair,$track_all,$tigpair,$opt_s,$anchor,$seqid,$listfile,$matepair,$initpos,$rh,$grace);
    print "done.\n";

}else{
    ### neither a readable file of filenames nor the keyword "stream": nothing could be parsed
    die "Invalid -s value: $fof (expected a file of BAM filenames, or: stream) -- fatal\n";
}



($tig_length,$tignames) = readContigs($file);

open (SC, '>', $scaffold) || die "\nCan't write to $scaffold -- fatal\n";
$date = `date`;
chomp($date);
my $sc_start_message = "\n=>Scaffolding initiated: $date\n";
print $sc_start_message;
print LOG $sc_start_message;
$assemblyruninfo.= $sc_start_message . "\n";

my $contigpairs = pairContigs($matepair,$track_all,$tig_length,$issues,$distribution,$totalpairs,$verbose);
buildScaffolds($contigpairs,$tig_length,$verbose);

close SC;
$date = `date`;
chomp($date);

my $sc_end_message = "=>Scaffolding ended: $date\n";
print $sc_end_message;
print LOG $sc_end_message;
$assemblyruninfo.= $sc_end_message . "\n";
print "Scaffolds layout in: $scaffold\n";

$date = `date`;
chomp($date);

my $sc_fasta_message = "=>Making fasta file: $date\n";
print $sc_fasta_message;
print LOG $sc_fasta_message;
$assemblyruninfo.= $sc_fasta_message . "\n";

my $tighash = readContigsMemory($file);
my ($scaffold_fasta,$gsl,$merges) = buildScaffoldFasta($scaffold,$tighash,$tigpair);

$date = `date`;
chomp($date);

print "Scaffolds fasta in: $scaffold_fasta\n";

### $gsl is the list of closed-gap lengths returned by buildScaffoldFasta
my ($avg,$sum,$max,$min) = average($gsl);
my $sd = stdev($gsl);
my $final_message = "--------------- $0 Summary ---------------\nNumber of merges induced : %i\nAverage closed gap length (bp) : %.2f\nClosed gap length st.dev +/- : %.2f\nTotal bases added : %i\nLargest gap resolved (bp) : %i\nShortest gap resolved (bp) : %i\n---------------------------------------------\n*0 bp gaps are not counted towards the average\n";
my $numgaps = scalar(@$gsl);
printf $final_message, ($merges,$avg,$sd,$sum,$max,$min);
printf LOG $final_message, ($merges,$avg,$sd,$sum,$max,$min);

$assemblyruninfo .= "done: $date\n\n--------------- $0 Summary ---------------\nNumber of merges : $numgaps\nAverage closed gap length (bp) : $avg\nClosed gap length st.dev +/- : $sd\nTotal bases added : $sum\nLargest gap resolved (bp) : $max\nShortest gap resolved (bp) : $min\n---------------------------------------------\n*0 bp gaps are not counted towards the average\n";

close
LOG;

exit;

###for dev. test purposes
### NOTE: never reached -- the script exits on the statement above
eval{
    my $wdir = `pwd`;
    chomp($wdir);
    my $smtp = Net::SMTP->new('mailhost');
    $smtp->mail("RAILS\@bcgsc.ca");
    $smtp->to($dev);
    $smtp->data();
    $smtp->datasend("Subject: Your $0 run\n");
    $smtp->datasend("At: $wdir\n");
    $smtp->datasend($assemblyruninfo);
    $smtp->dataend();
    $smtp->quit;
};

exit;

#----------------
### average: summary statistics over a list of numbers
###   in : reference to a non-empty array of numbers
###   out: list (mean, sum, largest, smallest)
###   dies when the array is empty
### Call it in list context: in scalar context a Perl "return LIST"
### yields only the last element (the smallest value), not the mean.
sub average{
    my $data = shift;
    if (not @$data) {
        die("Empty array -- maybe the scaffold merging step did not necessitate gap filling. It is also possible that your version of samtools is not supported. This script was tested with samtools v1.8.");
    }
    my $total = 0;
    ### seed the extremes from the data instead of fixed bounds (0 / 1000000)
    my $max = $data->[0];
    my $min = $data->[0];
    foreach my $value (@$data) {
        $total += $value;
        $max = $value if($value > $max);
        $min = $value if($value < $min);
    }
    my $average = $total / @$data;
    return $average,$total,$max,$min;
}
#----------------
### stdev: sample standard deviation (n-1 denominator)
###   in : reference to an array of numbers
###   out: 0 for a single value, otherwise the standard deviation
###   dies (through average) when the array is empty
sub stdev{
    my $data = shift;
    if(@$data == 1){
        return 0;
    }
    ### list assignment keeps the first returned value, i.e. the mean
    my ($average) = average($data);
    my $sqtotal = 0;
    foreach my $value (@$data) {
        $sqtotal += ($average-$value) ** 2;
    }
    my $std = ($sqtotal / (@$data-1)) ** 0.5;
    return $std;
}

#----------------
### readContigs: scan a FASTA file and record the name and length of each record
###   in : path to the FASTA file
###   out: (hashref record number => sequence length,
###         hashref record number => first word of the header)
### Records are numbered from 1 in file order. Lengths are summed over all
### sequence lines of a record, so wrapped FASTA and blank lines are handled.
sub readContigs{
    my $file = shift;

    my ($tig_length,$tignames);
    my $cttig=0;
    open(my $in, '<', $file) || die "Error reading $file -- fatal.\n";

    print "\nContigs processed:\n";
    $| = 1; ### unbuffered STDOUT so the progress counter refreshes in place
    while(my $line = <$in>){
        chomp($line);
        if($line =~ /^\>(\S+)/){
            $cttig++;
            $tignames->{$cttig}=$1;
            print "\r$cttig";
        }else{
            $tig_length->{$cttig} += length($line);
        }
    }
    close $in;

    return $tig_length,$tignames;
}

#---------------
sub readSam{

    my
($matepair,$track_all,$tigpair,$bamfile,$anchor,$seqid,$listfile,$matepair,$initpos,$rh,$grace) = @_;

    ### readSam: parse SAM records streamed on STDIN and record, for every long
    ### sequence anchored on contig edges, the contig pair it links and the
    ### sequence lying between the two anchors.
    ###   $matepair  hashref updated with {first alignment}{second alignment} entries ('is','bt')
    ###   $track_all hashref updated with per-alignment details (tig, coordinates, orientation, identity)
    ###   $tigpair   hashref updated with per contig-pair patch ('distr','seq','configuration','origin')
    ###   $bamfile, $initpos  accepted for call compatibility, not used here
    ###   $anchor    minimum number of aligned query bases
    ###   $seqid     minimum identity, computed as (aligned - NM) / aligned
    ###   $listfile  optional file of sequence names to exclude, one per line
    ###   $rh        hashref: sequence name => sequence string
    ###   $grace     bases tolerated between an alignment end and the contig/read edge
    ### Returns ($matepair,$track_all,$tigpair).
    ### Reference and query names are split on "," and their second field is used
    ### as the length (e.g. wga1,1301). $verbose is the file-level verbosity flag.

    my $mem;### names of long sequences to ignore
    if(-f $listfile){
        open(my $listfh, '<', $listfile) || die "Can't read $listfile -- fatal.\n";
        while(my $name = <$listfh>){
            chomp($name);
            $mem->{$name}=1;
        }
        close $listfh;
    }
    my $bt;### best anchoring score recorded per contig pair
    ### Example records (sequence shortened):
    #HS9_159:6:1308:13492:64472 272 scaffold43,6983,f43Z6983 6439 0 536M * 0 0 * * NM:i:0 AS:i:536
    #HS9_159:6:1308:13492:64472 0 scaffold30,32025,f30Z32025 25411 0 536M * 0 0 GCTTATAAAAG... * NM:i:0 AS:i:536 XS:i:536
    my $t;### per sequence name: retained alignments keyed by reference name
    my $ct=0;

    while(my $line = <>){###will stream from STDIN

        chomp($line);
        $ct++;

        my @a=split(/\t/,$line);
        next unless @a >= 10;### header lines and truncated records
        my @b=split(/\,/,$a[2]);
        my @c=split(/\,/,$a[0]);

        my $rstart = $a[3];
        my $cigar = $a[5];

        # Query
        my $qstart = 1;
        $qstart += $1 if($cigar =~ /^(\d+)[SH]/);### leading clip shifts the query start
        my $qalen = 0;
        $qalen += $1 while($cigar =~ /(\d+)[M=XI]/g);### operations consuming query bases
        my $qend = $qstart + $qalen - 1;
        my $qlen = $c[1];

        # Reference
        my $ralen = 0;
        $ralen += $1 while($cigar =~ /(\d+)[M=XDN]/g);### operations consuming reference bases
        my $rend = $rstart + $ralen - 1;
        my $rlen = $b[1];

        # Edit distance from the NM tag (clipping is not added)
        my $edit_dist = 0;
        if ($line =~ /NM:i:(\d+)/) {
            $edit_dist = $1;
        }

        my $read = $a[0] . "-" . $ct;### unique id for this alignment record
        my $si=0;
        $si = ($qalen - $edit_dist) / $qalen if($qalen);


        if($si >= $seqid && $qalen >= $anchor && (( $rstart <= $grace && ($qlen-$qend)<= $grace) || ($qstart<=$grace && ($rlen-$rend)<=$grace ) )){ ### this indicates anchoring bases, within $anchor of edges

            my $start;
            my $end;
            ###Coordinates on the scaffolds
            if($rstart <= $grace && ($qlen-$qend)<= $grace){
                $start = $rend;
                $end = $rstart;
            }else{
                $start = $rstart;
                $end = $rend;
            }
            my $orient="";
            ### FLAG bit 0x10 marks a reverse-strand alignment whatever other bits are set
            ### (covers 16, 272 and 2064 as well as any other combination)
            if($a[1] & 16){ ### matches on negative strand
                $orient="r";
                my $tmpstart = $qlen - $qend;
                my $tmpend = $qlen - $qstart;
                $qstart = $tmpstart;
                $qend = $tmpend;
            }else{
                $orient="f";
            }

            $t->{$a[0]}{$a[2]}{'order'}=$ct;
            $t->{$a[0]}{$a[2]}{'orient'}=$orient ;
            $t->{$a[0]}{$a[2]}{'real'}=$read;
            $t->{$a[0]}{$a[2]}{'length'}=$qlen;
            if($a[2]=~/\D+(\d+)\,(\d+)/){### scaffoldNUMBER,LENGTH eg. wga1,1301
                my $numtig=$1;
                $track_all->{$read}{'tig'}=$numtig;
                $track_all->{$read}{'start'}=$start;
                $track_all->{$read}{'end'}=$end;
                $track_all->{$read}{'multiple'}=1;
                $track_all->{$read}{'sam'}=$line;
                $track_all->{$read}{'orient'}=$orient;
                $track_all->{$read}{'qalen'}=$qalen;### tracks anchor size
                $track_all->{$read}{'qstart'}=$qstart;
                $track_all->{$read}{'qend'}=$qend;
                $track_all->{$read}{'si'}=$si; ### tracks sequence identity
            }
        }
    }
    ###End SAM parse (STDIN is left open; no file handle was opened for the stream)
    my ($occ,$same)=(0,0);###TRACK REDUNDANCY

    foreach my $rd(keys %$t){
        my $scafflist=$t->{$rd};

        my $prev="";
        my $current="";
        my $totalreadlength=0;
        my $counttig = 0;
        ### keep the first two retained alignments of this sequence, in input order
        GETCONTIG:
        foreach my $scaff(sort {$scafflist->{$a}{'order'}<=>$scafflist->{$b}{'order'}} keys %$scafflist){### best contig alignments listed first
            $counttig++;
            $current = $scafflist->{$scaff}{'real'};
            $prev = $current if($prev eq "");
            $totalreadlength = $scafflist->{$scaff}{'length'};
            last GETCONTIG if($counttig==2);
        }

        ### $one/$two: the two contig numbers, smallest first
        my ($one,$two)=($track_all->{$current}{'tig'},$track_all->{$prev}{'tig'});
        if($track_all->{$prev}{'tig'}<$track_all->{$current}{'tig'}){
            ($one,$two)=($track_all->{$prev}{'tig'},$track_all->{$current}{'tig'})
        }

        ### this will track the best anchoring long reads for the merge/gapfill
        my $m1 = $track_all->{$current}{'qalen'} * $track_all->{$current}{'si'};
        my $m2 = $track_all->{$prev}{'qalen'} * $track_all->{$prev}{'si'};
        my $matchbases = $m1 + $m2;

        if(! defined $mem->{$rd} && ($track_all->{$current}{'qstart'} > $track_all->{$prev}{'qend'} || $track_all->{$prev}{'qstart'} > $track_all->{$current}{'qend'}) && $rh->{$rd} ne ""){### WILL TRACK BEST ANCHORING BASES

            ### This is used to count #support linkages and -a
            $matepair->{$prev}{$current}{'is'} = $totalreadlength;# - ($track_all->{$prev}{'qalen'} + $track_all->{$current}{'qalen'} );
            $matepair->{$prev}{$current}{'bt'}=0;

            if($matchbases > $bt->{$one}{$two}{'bestmatch'}){###conditional to track best patch sequence only

                $bt->{$one}{$two}{'bestmatch'} = $matchbases; ### beenthere, bt, tracks that two contigs have been merged by a read (first one it saw here) .. to allow patch to match the chosen read support
                my $pos=0;
                $pos = $track_all->{$prev}{'qend'} if($track_all->{$current}{'qstart'} > $track_all->{$prev}{'qend'});
                $pos = $track_all->{$current}{'qend'} if($track_all->{$prev}{'qstart'} > $track_all->{$current}{'qend'});
                my $gapseqlen = $totalreadlength - ($track_all->{$prev}{'qalen'} + $track_all->{$current}{'qalen'});
                print ">$rd $pos @ $gapseqlen \n$rh->{$rd}\n\n" if($verbose);
                my $patch = substr($rh->{$rd},$pos,$gapseqlen-1);

                #$patch = &reverseComplement($patch) if($track_all->{$prev}{'orient'} eq "-" && $track_all->{$current}{'orient'} eq "-" );

                print "GAP:$patch\n" if($verbose);

                ###JUST SOME TEST CODE
                if(defined $tigpair->{$track_all->{$prev}{'tig'}}{$track_all->{$current}{'tig'}}{'seq'}){### previous patch saved
                    $occ++;
                    $same++ if($patch eq $tigpair->{$track_all->{$prev}{'tig'}}{$track_all->{$current}{'tig'}}{'seq'});
                }

                ### EVEN THOUGH A COUNTER 'distr' TRACKS NUMBER OF SUPPORT FOR TIG A AND B, LAST COMBO OVERRIDES REST THIS COULD BE PROBLEMATIC WHEN MULTIPLE READS SUPPORT SAME CONTIGS, SEE NOTE ABOVE AND REASON TO LIMIT SUPPORT TO FIRST READ SEEN
                $tigpair->{$track_all->{$prev}{'tig'}}{$track_all->{$current}{'tig'}}{'distr'}++;
                $tigpair->{$track_all->{$prev}{'tig'}}{$track_all->{$current}{'tig'}}{'seq'}=$patch;
                $tigpair->{$track_all->{$prev}{'tig'}}{$track_all->{$current}{'tig'}}{'configuration'}=$track_all->{$prev}{'orient'} . $track_all->{$prev}{'tig'} . $track_all->{$current}{'orient'} . $track_all->{$current}{'tig'};
                $tigpair->{$track_all->{$prev}{'tig'}}{$track_all->{$current}{'tig'}}{'origin'}=$rd;

                print ">>>> $track_all->{$prev}{'tig'} $track_all->{$current}{'tig'} $patch\n" if($verbose);

                print "$track_all->{$prev}{'sam'}\n$track_all->{$current}{'sam'}\n x====x ($totalreadlength) ($track_all->{$prev}{'qstart'}-$track_all->{$prev}{'qend'}:$track_all->{$prev}{'qalen'} $track_all->{$prev}{'orient'}) AND ($track_all->{$current}{'qstart'}-$track_all->{$current}{'qend'}:$track_all->{$current}{'qalen'} $track_all->{$current}{'orient'})\n===x x==== sc$track_all->{$prev}{'tig'} ($track_all->{$prev}{'start'}-$track_all->{$prev}{'end'}:$track_all->{$prev}{'qalen'} $track_all->{$prev}{'orient'}) AND sc$track_all->{$current}{'tig'} ($track_all->{$current}{'start'}-$track_all->{$current}{'end'}:$track_all->{$current}{'qalen'} $track_all->{$current}{'orient'}) \n\n" if($verbose);
            }###matchbase used to track best patch sequence
        }
    }

    print "\nRedundant same contig combo linking:$occ\nSame gap sequence fill:$same\n\n";
    return $matepair,$track_all,$tigpair;
}


#---------------
sub readBam{

    my ($matepair,$track_all,$tigpair,$bamfile,$anchor,$seqid,$listfile,$matepair,$initpos,$rh,$grace) = @_;

    my
$mem; 509 | if(-f $listfile){ 510 | open(IN,$listfile) || die "Can't read $listfile -- fatal.\n"; 511 | while(){ 512 | chomp; 513 | $mem->{$_}=1; 514 | } 515 | close IN; 516 | } 517 | my $bt; 518 | #HS9_159:6:1308:13492:64472 272 scaffold43,6983,f43Z6983 6439 0 536M * 0 0 * * NM:i:0 AS:i:536 519 | #HS9_159:6:1308:13492:64472 0 scaffold30,32025,f30Z32025 25411 0 536M * 0 0 GCTTATAAAAGAAGGTGCAATTGATCCTTGCCTTACGCCTACAAAGGAGGGTAGGTGCGATTGGTCCTTACATTCTTACGCCGCTTAGGAAGCTAGGCGAGATAGGATGGGTTCTAGAGCACCTAACTAGCTTTACACGCCGAATCCAGACCTGCCGGCTACCATCCGGATTCATACTAGATAACATAAAGGAGAGAACAACTGTTCAAAGAACAACTCGGAGAACATTTGTATCCGGTGGTTGGGGCATTGCGTGCTATACCAACTACCTCAGGTGCGCGAGGTCTCATTCCTTTTCCAAGCCCAATAAAGAAAAAATATCATTAGTGATGGTGAATCCCGTTTATATAAGTAAGTTGCATTCTTATCTAAGTAAGTGGGCTTTCCTAAGTCACTTATTGGGTGGGGGGCCCCTGTCGAGTGAGCCATCCTTCCTCACCCTCTCTTTTGTTGGGCGAGCCATCTTTCCTTTTATACGATTCGATCCAGTAGATAAGGAAGACCGACCGAGAACAACCAATGGCCTTCCCTGGGGG * NM:i:0 AS:i:536 XS:i:536 520 | #HS9_159:6:1308:13492:64472 272 scaffold22,90777,f22Z90777 90233 0 536M * 0 0 * * NM:i:0 AS:i:536 521 | my $t; 522 | my $ct=0; 523 | 524 | my %options = (); 525 | 526 | print join( 527 | "\t", 528 | 'qname', 529 | 'qstart', 530 | 'qend', 531 | 'qalen', 532 | 'qlen', 533 | 'rname', 534 | 'rstart', 535 | 'rend', 536 | 'ralen', 537 | 'rlen', 538 | 'edit_dist', 539 | ) . 
"\n" if $options{header}; 540 | 541 | 542 | my %rlength = (); 543 | 544 | my $ERRLOG = $bamfile.".bampreprocessor.err.log".$$.time(); 545 | my $cmd = "$SAMPATH view $bamfile 2>$ERRLOG|";###read BAM 546 | open(IN,$cmd) || die "Error reading $bamfile -- fatal.\n"; 547 | while(){ 548 | 549 | chomp; 550 | $ct++; 551 | 552 | my @a=split(/\t/); 553 | my @b=split(/\,/,$a[2]); 554 | my @c=split(/\,/,$a[0]); 555 | 556 | 557 | if ($options{rlen} && /^\@SQ\s+SN:(\S+)\s+LN:(\S+)/) { 558 | $rlength{$1} = $2; 559 | } 560 | next unless @a >= 10; 561 | my $line = $_; 562 | my $qname = $a[0]; 563 | my $rname = $a[2]; 564 | my $rstart = $a[3]; 565 | my $cigar = $a[5]; 566 | my $qseq = $a[9]; 567 | # Query 568 | my $qstart = 1; 569 | $_ = $cigar; 570 | s/^(\d+)[SH]/$qstart += $1/eg; 571 | my $qalen = 0; 572 | $_ = $cigar; 573 | s/(\d+)[M=XI]/$qalen += $1/eg; 574 | my $qend = $qstart + $qalen - 1; 575 | $_ = $cigar; 576 | my $end_clip_len = 0; 577 | s/(\d+)[SH]$/$end_clip_len += $1/eg; 578 | my $qlen = $c[1]; 579 | 580 | # Reference 581 | my $ralen = 0; 582 | $_ = $cigar; 583 | s/(\d+)[M=XDN]/$ralen += $1/eg; 584 | my $rend = $rstart + $ralen - 1; 585 | my $rlen = $b[1]; 586 | 587 | # Calculate edit distance including clipping 588 | my $edit_dist = ''; 589 | if ($line =~ /NM:i:(\d+)/) { 590 | $edit_dist = $1;# + $qstart - 1 + $end_clip_len; 591 | } 592 | 593 | my $read = $a[0] . "-" . 
$ct; 594 | my $si=0; 595 | $si = ($qalen - $edit_dist) / $qalen if($qalen); 596 | 597 | 598 | if($si >= $seqid && $qalen >= $anchor && (( $rstart <= $grace && ($qlen-$qend)<= $grace) || ($qstart<=$grace && ($rlen-$rend)<=$grace ) )){ ### this indicates anchoring bases, within $anchor of edges 599 | 600 | 601 | my $start; 602 | my $end; 603 | ###Coordinates on the scaffolds 604 | if($rstart <= $grace && ($qlen-$qend)<= $grace){ 605 | $start = $rend; 606 | $end = $rstart; 607 | }else{ 608 | $start = $rstart; 609 | $end = $rend; 610 | } 611 | my $orient=""; 612 | if($a[1]==272 || $a[1]==16 || $a[1]==2064){ ### matches on negative strand 613 | $orient="r"; 614 | my $tmpstart = $qlen - $qend; 615 | my $tmpend = $qlen - $qstart; 616 | $qstart = $tmpstart; 617 | $qend = $tmpend; 618 | }else{ 619 | $orient="f"; 620 | } 621 | 622 | $t->{$a[0]}{$a[2]}{'order'}=$ct; 623 | $t->{$a[0]}{$a[2]}{'orient'}=$orient ; 624 | $t->{$a[0]}{$a[2]}{'real'}=$read; 625 | $t->{$a[0]}{$a[2]}{'length'}=$qlen; 626 | if($a[2]=~/\D+(\d+)\,(\d+)/){### scaffoldNUMBER,LENGTH eg. 
wga1,1301 627 | my ($numtig,$sz)=($1,$2); 628 | $track_all->{$read}{'tig'}=$numtig; 629 | $track_all->{$read}{'start'}=$start; 630 | $track_all->{$read}{'end'}=$end; 631 | $track_all->{$read}{'multiple'}=1; 632 | $track_all->{$read}{'sam'}=$line; 633 | $track_all->{$read}{'orient'}=$orient; 634 | $track_all->{$read}{'qalen'}=$qalen;### tracks anchor size 635 | $track_all->{$read}{'qstart'}=$qstart; 636 | $track_all->{$read}{'qend'}=$qend; 637 | $track_all->{$read}{'si'}=$si; ### tracks sequence identity 638 | } 639 | } 640 | } 641 | close IN;###End SAM parse 642 | my ($occ,$same)=(0,0);###TRACK REDUNDANCY 643 | 644 | foreach my $rd(keys %$t){ 645 | my $scafflist=$t->{$rd}; 646 | my $num = keys(%$scafflist); 647 | 648 | #if($num==2){###maps on two different scaffolds only 649 | my @arr; 650 | my $prev=""; 651 | my $current=""; 652 | my $totalreadlength=0; 653 | my $counttig = 0; 654 | #foreach my $scaff(keys %$scafflist){ 655 | GETCONTIG: 656 | foreach my $scaff(sort {$scafflist->{$a}{'order'}<=>$scafflist->{$b}{'order'}} keys %$scafflist){### best contig alignments listed first 657 | $counttig++; 658 | $current = $scafflist->{$scaff}{'real'}; 659 | $prev = $current if($prev eq ""); 660 | $totalreadlength = $scafflist->{$scaff}{'length'}; 661 | last GETCONTIG if($counttig==2); 662 | } 663 | 664 | my ($one,$two)=($track_all->{$current}{'tig'},$track_all->{$prev}{'tig'}); 665 | if($track_all->{$prev}{'tig'}<$track_all->{$current}{'tig'}){ 666 | ($one,$two)=($track_all->{$prev}{'tig'},$track_all->{$current}{'tig'}) 667 | } 668 | 669 | ### this will track the best anchoring long reads for the merge/gapfill 670 | my $m1 = $track_all->{$current}{'qalen'} * $track_all->{$current}{'si'}; 671 | my $m2 = $track_all->{$prev}{'qalen'} * $track_all->{$prev}{'si'}; 672 | my $matchbases = $m1 + $m2; 673 | 674 | if(! 
defined $mem->{$rd} && ($track_all->{$current}{'qstart'} > $track_all->{$prev}{'qend'} || $track_all->{$prev}{'qstart'} > $track_all->{$current}{'qend'}) && $rh->{$rd} ne ""){### WILL TRACK BEST ANCHORING BASES 675 | 676 | ### This is used to count #support linkages and -a 677 | $matepair->{$prev}{$current}{'is'} = $totalreadlength;# - ($track_all->{$prev}{'qalen'} + $track_all->{$current}{'qalen'} ); 678 | $matepair->{$prev}{$current}{'bt'}=0; 679 | 680 | if($matchbases > $bt->{$one}{$two}{'bestmatch'}){###conditional to track best patch sequence only 681 | 682 | $bt->{$one}{$two}{'bestmatch'} = $matchbases; ### beenthere, bt, tracks that two contigs have been merged by a read (first one it saw here) .. to allow patch to match the chosen read support 683 | my $pos=0; 684 | $pos = $track_all->{$prev}{'qend'} if($track_all->{$current}{'qstart'} > $track_all->{$prev}{'qend'}); 685 | $pos = $track_all->{$current}{'qend'} if($track_all->{$prev}{'qstart'} > $track_all->{$current}{'qend'}); 686 | my $gapseqlen = $totalreadlength - ($track_all->{$prev}{'qalen'} + $track_all->{$current}{'qalen'}); 687 | print ">$rd $pos @ $gapseqlen \n$rh->{$rd}\n\n" if($verbose); 688 | my $patch = substr($rh->{$rd},$pos,$gapseqlen-1); 689 | 690 | #$patch = &reverseComplement($patch) if($track_all->{$prev}{'orient'} eq "-" && $track_all->{$current}{'orient'} eq "-" ); 691 | 692 | print "GAP:$patch\n" if($verbose); 693 | 694 | ###JUST SOME TEST CODE 695 | if(defined $tigpair->{$track_all->{$prev}{'tig'}}{$track_all->{$current}{'tig'}}{'seq'}){### previous patch saved 696 | $occ++; 697 | $same++ if($patch eq $tigpair->{$track_all->{$prev}{'tig'}}{$track_all->{$current}{'tig'}}{'seq'}); 698 | #print "$prev ($track_all->{$prev}{'tig'})...$current ($track_all->{$current}{'tig'})\n$tigpair->{$track_all->{$prev}{'tig'}}{$track_all->{$current}{'tig'}}{'seq'}\nNEW GAP:\n$patch\n"; 699 | #if($patch ne $tigpair->{$track_all->{$prev}{'tig'}}{$track_all->{$current}{'tig'}}{'seq'}){print "NOT 
SAME\n\n";}else{print "SAME\n\n";} 700 | } 701 | 702 | ### EVEN THOUGH A COUNTER 'distr' TRACKS NUMBER OF SUPPORT FOR TIG A AND B, LAST COMBO OVERRIDES REST THIS COULD BE PROBLEMATIC WHEN MULTIPLE READS SUPPORT SAME CONTIGS, SEE NOTE ABOVE AND REASON TO LIMIT SUPPORT TO FIRST READ SEEN 703 | $tigpair->{$track_all->{$prev}{'tig'}}{$track_all->{$current}{'tig'}}{'distr'}++; 704 | $tigpair->{$track_all->{$prev}{'tig'}}{$track_all->{$current}{'tig'}}{'seq'}=$patch; 705 | $tigpair->{$track_all->{$prev}{'tig'}}{$track_all->{$current}{'tig'}}{'configuration'}=$track_all->{$prev}{'orient'} . $track_all->{$prev}{'tig'} . $track_all->{$current}{'orient'} . $track_all->{$current}{'tig'}; 706 | $tigpair->{$track_all->{$prev}{'tig'}}{$track_all->{$current}{'tig'}}{'origin'}=$rd; 707 | 708 | print ">>>> $track_all->{$prev}{'tig'} $track_all->{$current}{'tig'} $patch\n" if($verbose); 709 | 710 | print "$track_all->{$prev}{'sam'}\n$track_all->{$current}{'sam'}\n x====x ($totalreadlength) ($track_all->{$prev}{'qstart'}-$track_all->{$prev}{'qend'}:$track_all->{$prev}{'qalen'} $track_all->{$prev}{'orient'}) AND ($track_all->{$current}{'qstart'}-$track_all->{$current}{'qend'}:$track_all->{$current}{'qalen'} $track_all->{$current}{'orient'})\n===x x==== sc$track_all->{$prev}{'tig'} ($track_all->{$prev}{'start'}-$track_all->{$prev}{'end'}:$track_all->{$prev}{'qalen'} $track_all->{$prev}{'orient'}) AND sc$track_all->{$current}{'tig'} ($track_all->{$current}{'start'}-$track_all->{$current}{'end'}:$track_all->{$current}{'qalen'} $track_all->{$current}{'orient'}) \n\n" if($verbose); 711 | }###matchbase used to track best patch sequence 712 | } 713 | #} 714 | } 715 | 716 | print "\nRedundant same contig combo linking:$occ\nSame gap sequence fill:$same\n\n"; 717 | return $matepair,$track_all,$tigpair; 718 | } 719 | 720 | #----------------------- 721 | sub reverseComplement{ 722 | $_ = shift; 723 | $_ = uc(); 724 | tr/ATGCYRKMBDHV/TACGRYMKVHDB/; 725 | return (reverse()); 726 | } 727 | 728 | 
#------------------------------------
#Order and orient contigs into scaffolds
#
# Seeds a scaffold from every contig that has not been placed yet, longest
# contig first, extends it to the right and then to the left with
# computeLayout, and prints one "scaffoldN,total,chain" record per scaffold
# to the SC filehandle (a global handle this sub does not open).
#
#   $pair               - link graph, $pair->{TIG}{TIG}{'links'|'gaps'} with
#                         TIG being "f" or "r" followed by the contig number
#                         (the shape pairContigs returns)
#   $tig_length         - hashref: contig number => contig length
#   $contig_size_cutoff - scaffolds with a summed contig length below this
#                         value are not printed
#   $verbose            - accepted for interface compatibility, unused here
#
# The inputs are consumed: placed contigs are deleted from $pair and
# $tig_length.
sub buildScaffolds {

    my ($pair, $tig_length, $contig_size_cutoff, $verbose) = @_;

    my $seen_it;
    my $sc_ct = 0;

    #print SC "Scaffold Number,Scaffold Size (only contig lengths considered),Scaffold Chain: e.g. _f127z7068k12a0.58m42_r3090z62k7r0.14m76_ means: contig127(+ strand=f), size 7068 (z) has 12 links (k), link ratio of 0.58 (a) and with a mean gap/overlap of 42nt (m) with reverse (r) of contig3090 (size 62) on the right.\n";

    SEED:
    foreach my $tig (sort { $tig_length->{$b} <=> $tig_length->{$a} } keys %$tig_length) {

        ## prevents re-using a contig as seed if it's already been incorporated into a scaffold
        next SEED if (defined $seen_it->{$tig});

        $sc_ct++;

        my $ftig = "f" . $tig;
        my $rtig = "r" . $tig;

        my $chainleft  = "";
        my $chainright = $ftig . "Z" . $tig_length->{$tig};   ### the seed is tagged with an upper-case Z
        my $total      = $tig_length->{$tig};

        ($total, $chainright, $seen_it) = computeLayout("R", $chainright, $ftig, $pair, $tig_length, $total, $seen_it, $tig);
        ($total, $chainleft, $seen_it)  = computeLayout("L", $chainleft, $rtig, $pair, $tig_length, $total, $seen_it, $tig);

        $seen_it->{$tig}++;

        delete $pair->{$ftig};
        delete $pair->{$rtig};
        delete $tig_length->{$tig};

        my $scaffold = $chainleft . $chainright;
        print SC "scaffold" . $sc_ct . ",$total,$scaffold\n" if ($total >= $contig_size_cutoff);
    }
}

#------------------------------------
# links contigs together into a chain - must satisfy user-defined criterions (-k -a)
#
# Starting at $tig, repeatedly follows the best supported link in $pair and
# appends ("R") or prepends ("L") the linked contig to $chain until no
# acceptable link is left.
#
#   $ext             - "R" to grow the chain rightwards, anything else leftwards
#   $chain           - chain text built so far
#   $tig             - oriented contig to extend from, e.g. "f12" or "r12"
#   $pair            - link graph (see buildScaffolds); visited entries are deleted
#   $tig_length      - hashref: contig number => length; visited entries are deleted
#   $total           - summed contig length so far
#   $seen_it         - hashref of contig numbers already placed; updated
#   $orig_tig_number - contig number of the scaffold seed
#
# Reads the file-level $min_links, $max_link_ratio and $verbose variables.
# Returns ($total, $chain, $seen_it).
sub computeLayout {

    my ($ext, $chain, $tig, $pair, $tig_length, $total, $seen_it, $orig_tig_number) = @_;

    my $orig_tig = $tig;

    EXTENSION:
    while (1) {

        ### list assignment, not "my $x = $1 if (...)": the conditional form keeps a stale value between iterations
        my ($tnum) = $tig =~ /[fr](\d+)/;
        my $tnumf = "f" . $tnum;
        my $tnumr = "r" . $tnum;

        if (defined $seen_it->{$tnum}) {
            print "NO MORE MATCH FOR $tig in hash: pair>>\n" if ($verbose);
            last EXTENSION;
        }

        $seen_it->{$tnum}++ if ($tnumf ne $orig_tig);

        print "Attempt to extend $tig\n" if ($verbose);
        my $list = $pair->{$tig} || {};
        my ($match1, $link1, $gaps1, $match2, $link2, $gaps2) = ("", 0, 0, "", 0, 0);

        ### only the two most supported links matter: the best one and its runner-up
        my @ranked = sort { $list->{$b}{'links'} <=> $list->{$a}{'links'} } keys %$list;
        if (@ranked) {
            ($match1, $link1, $gaps1) = ($ranked[0], $list->{$ranked[0]}{'links'}, $list->{$ranked[0]}{'gaps'});
            print "$tig links best $match1 (links:$link1 total sz:$gaps1)\n" if ($verbose);
        }
        if (@ranked > 1) {
            ($match2, $link2, $gaps2) = ($ranked[1], $list->{$ranked[1]}{'links'}, $list->{$ranked[1]}{'gaps'});
            print "$tig links second best $match2 (links:$link2 total sz:$gaps2)\n" if ($verbose);
        }

        ###ratio
        my $ratio = 0.00;
        $ratio = $link2 / $link1 if ($link1); ## relative ratio of the two most abundant contig pairs
        if ($ratio =~ /(\d+\.\d{2})/) { $ratio = $1; }   ### truncated to two decimals
        ###mean
        my $mean = 0;
        $mean = int($gaps1 / $link1) if ($link1);

        my ($tempnum) = $match1 =~ /[fr](\d+)/;

        #### Assessment
        ### an empty $match1 (no candidate at all) always ends the extension, whatever $min_links is
        if ($match1 eq "" || defined $seen_it->{$tempnum} || $link1 < $min_links || $ratio > $max_link_ratio || $tempnum == $orig_tig_number) {
            print "defined seen_it->{ $tempnum } || $link1 < $min_links || $ratio > $max_link_ratio\n L1:$link1 L2:$link2 M1:$match1 M2:$match2 G1:$gaps1 G2:$gaps2 " if ($verbose);
            last EXTENSION;
        }

        ### pass filter
        print "$ext extension. mean: $mean links:$link1 linkratio:$ratio\n" if ($verbose);

        if ($ext eq "R") {
            $chain .= "k" . $link1 . "a" . $ratio . "m" . $mean . "_" . $match1 . "z" . $tig_length->{$tempnum};
        }
        else {
            ### walking leftwards the partner is written in the opposite orientation
            my $temp_match = ($match1 =~ /^r/ ? "f" : "r") . $tempnum;
            $chain = $temp_match . "z" . $tig_length->{$tempnum} . "k" . $link1 . "a" . $ratio . "m" . $mean . "_" . $chain;
        }
        $total += $tig_length->{$tempnum};

        print "NEXT TIG TO LOOK AT= $match1\n" if ($verbose);
        $tig = $match1;

        print "Will flag $tnum as seen (only if $tnumf != $orig_tig)." if ($verbose);

        if ($tnumf ne $orig_tig) {
            delete $pair->{$tnumf};
            delete $pair->{$tnumr};
            delete $tig_length->{$tnum};
        }
        else {
            ### same tig as the start: only this entry goes, everything else about the tig is kept
            delete $pair->{$tnumf};
        }
    }### pair is defined
    return $total, $chain, $seen_it;
}

#------------------------------------
# Gap (positive) or overlap (negative) implied between contig i and contig j.
#
#   $insert_size - expected distance spanned by the pair
#   $length_i    - length of the left contig
#   $start_i     - position of the pair on the left contig
#   $start_j     - position of the pair on the right contig
sub getDistance {

    my ($insert_size, $length_i, $start_i, $start_j) = @_;

    #    L  ------  --------- R
    #     i    ->        <-    j
    #          ....  ......    insert_span
    #          ============    insert_size

    my $insert_span = ($length_i - $start_i) + $start_j;

    return $insert_size - $insert_span;
}

#-----------------
#build contig pairs based on template information - must satisfy user-defined criterions (-d -e)
#
#   $matepair     - $matepair->{read_a}{read_b}{'is'|'bt'}; 'bt' is set to 1
#                   once a pair has been processed
#   $track        - per-read placement: {'tig','start','end','multiple'}
#   $tig_length   - hashref: contig number => length
#   $issues       - file written with every pair that could not be satisfied
#   $distribution - CSV file written with "distance,count" rows for pairs
#                   whose two reads sit on the same contig
#   $totalpairs   - number reported in the log
#   $verbose      - print progress to STDOUT
#
# Reads the file-level $insert_stdev and $fof variables and prints its
# statistics to the LOG filehandle (a global handle this sub does not open).
# Returns $pair, a hashref $pair->{TIG}{TIG}{'links'|'gaps'} (undef when no
# link was accepted).
sub pairContigs {

    my ($matepair, $track, $tig_length, $issues, $distribution, $totalpairs, $verbose) = @_;
    my ($ct_illogical, $ct_ok_contig, $ct_ok_pairs, $ct_problem_pairs, $ct_iz_issues, $ct_single, $ct_both) = (0, 0, 0, 0, 0, 0, 0);
    #$verbose=1;# XX for testing purposes
    my ($pair, $err, $track_insert);

    print "Pairing contigs...\n" if ($verbose);

    open(my $pet, '>', $issues) || die "Can't open $issues for writing -- fatal\n";

    ### Accepts a link (stored for both equivalent strand readings) when the
    ### calculated distance is within tolerance, otherwise reports it.
    my $link_contigs = sub {
        my ($from, $to, $mirror_from, $mirror_to, $d, $min_allowed, $issue) = @_;

        if ($d >= $min_allowed) {
            $pair->{$from}{$to}{'links'}++;
            $pair->{$from}{$to}{'gaps'} += $d;
            $pair->{$mirror_from}{$mirror_to}{'links'}++;
            $pair->{$mirror_from}{$mirror_to}{'gaps'} += $d;
            $ct_ok_pairs++;
        }
        else {
            my $err_pair = $from . "-" . $to;
            $err->{$err_pair}{'links'}++;
            $err->{$err_pair}{'gaps'} += $d;
            $ct_problem_pairs++;
            print {$pet} "Pairs unsatisfied in distance within a contig pair. $issue CALCULATED DISTANCE APART: $d < $min_allowed\n";
        }
    };

    foreach my $read_a (keys %$matepair) {
        my $mateslist = $matepair->{$read_a};

        READ_B:
        foreach my $read_b (keys %$mateslist) {

            next READ_B unless ($mateslist->{$read_b}{'bt'} == 0 && $track->{$read_a}{'multiple'} == 1 && $track->{$read_b}{'multiple'} == 1);

            ##below indicates this specific pair has been seen
            $mateslist->{$read_b}{'bt'} = 1;

            my $insert_size = $mateslist->{$read_b}{'is'};
            my $min_allowed = -1 * ($insert_stdev * $insert_size);
            my ($low_iz, $up_iz) = ($insert_size + $min_allowed, $insert_size - $min_allowed);
            print "Pair read1=$read_a read2=$read_b\n" if ($verbose);

            ### at least one read of the pair is not placed on a contig
            unless (defined $track->{$read_a}{'tig'} && defined $track->{$read_b}{'tig'}) {
                $ct_single++;
                next READ_B;
            }

            ### both reads placed
            $ct_both++;

            my $tig_a = $track->{$read_a}{'tig'};
            my $tig_b = $track->{$read_b}{'tig'};

            my ($ftig_a, $rtig_a) = ("f" . $tig_a, "r" . $tig_a);
            my ($ftig_b, $rtig_b) = ("f" . $tig_b, "r" . $tig_b);

            my $A_length = $tig_length->{$tig_a};
            my $A_start  = $track->{$read_a}{'start'};
            my $A_end    = $track->{$read_a}{'end'};

            my $B_length = $tig_length->{$tig_b};
            my $B_start  = $track->{$read_b}{'start'};
            my $B_end    = $track->{$read_b}{'end'};

            if ($tig_a != $tig_b) {####paired reads located on <> contigs

                ####Determine most likely possibility
                if ($A_start < $A_end) {

                    if ($B_end < $B_start) {####-> <- ::: A-> <-B / rB -> <- rA
                        my $d = getDistance($insert_size, $A_length, $A_start, $B_start);
                        print "A-> <-B WITH $tig_a -> <- $tig_b GAP $d A=$A_length ($A_start-$A_end) B=$B_length ($B_start-$B_end) Alen, Astart,Bstart\n" if ($verbose);
                        $link_contigs->($ftig_a, $ftig_b, $rtig_b, $rtig_a, $d, $min_allowed,
                            "A-> <-B WITH tig#$tig_a -> $d <- tig#$tig_b, A=$A_length nt (start:$A_start, end:$A_end) B=$B_length nt (start:$B_start, end:$B_end)");
                    }
                    else {#### -> -> ::: A-> <-rB / B-> <-rA
                        my $rB_start = $B_length - $B_start;
                        my $d = getDistance($insert_size, $A_length, $A_start, $rB_start);
                        print "A-> <-rB WITH $tig_a -> <- r.$tig_b GAP $d A=$A_length ($A_start-$A_end) B=$B_length ($B_start-$B_end) Alen,Astart,rBstart\n" if ($verbose);
                        $link_contigs->($ftig_a, $rtig_b, $ftig_b, $rtig_a, $d, $min_allowed,
                            "A-> <-rB WITH tig#$tig_a -> $d <- tig#r.$tig_b, A=$A_length nt (start:$A_start, end:$A_end) B=$B_length nt (start:$B_start, end:$B_end)");
                    }
                }
                else {

                    if ($B_end > $B_start) {####<- -> ::: B-> <-A / rA -> <- rB
                        my $d = getDistance($insert_size, $B_length, $B_start, $A_start);
                        print "B-> <-A WITH $tig_b -> <- $tig_a GAP $d A=$A_length ($A_start-$A_end) B=$B_length ($B_start-$B_end) Blen,Bstart,Astart\n" if ($verbose);
                        $link_contigs->($ftig_b, $ftig_a, $rtig_a, $rtig_b, $d, $min_allowed,
                            "B-> <-A WITH tig#$tig_b -> $d <- tig#$tig_a, B=$B_length nt (start:$B_start, end:$B_end) A=$A_length nt (start:$A_start, end:$A_end)");
                    }
                    else {####<- <- ::: rB-> <-A / rA-> <-B
                        my $rB_start = $B_length - $B_start;
                        my $d = getDistance($insert_size, $B_length, $rB_start, $A_start);
                        print "rB-> <-A WITH r.$tig_b -> <- $tig_a GAP $d A=$A_length ($A_start-$A_end) B=$B_length ($B_start-$B_end) Blen,rBstart,Astart\n" if ($verbose);
                        $link_contigs->($rtig_b, $ftig_a, $rtig_a, $ftig_b, $d, $min_allowed,
                            "rB-> <-A WITH tig#r.$tig_b -> $d <- tig#$tig_a, B=$B_length nt (start:$B_start, end:$B_end) A=$A_length nt (start:$A_start, end:$A_end)");
                    }
                }
            }
            else {###Clone, paired reads located on the same contig -- could be used to investigate misassemblies

                print "Pair ($read_a and $read_b) located on same contig $tig_a ($A_length nt)\n" if ($verbose);

                my $pet_size;
                if ($A_start > $B_start && ($B_start < $B_end) && ($A_start > $A_end)) {      # B --> <-- A
                    $pet_size = $A_start - $B_start;
                }
                elsif ($B_start > $A_start && ($B_start > $B_end) && ($A_start < $A_end)) {   # A --> <-- B
                    $pet_size = $B_start - $A_start;
                }

                if (!defined $pet_size) {
                    $ct_illogical++;
                    print {$pet} "Pairs unsatisfied in pairing logic within a contig. Pair ($read_a - $read_b) on contig $tig_a ($A_length nt) Astart:$A_start Aend:$A_end Bstart:$B_start Bend:$B_end\n";
                }
                else {
                    $track_insert->{$pet_size}++;
                    if ($pet_size >= $low_iz && $pet_size <= $up_iz) {
                        $ct_ok_contig++;
                    }
                    else {
                        print {$pet} "Pairs unsatisfied in distance within a contig. Pair ($read_a - $read_b) on contig $tig_a ($A_length nt) Astart:$A_start Aend:$A_end Bstart:$B_start Bend:$B_end CALCULATED DISTANCE APART: $pet_size\n";
                        $ct_iz_issues++;
                    }
                }
            }
        }#pairing read b
    }#read a

    ### summary of contig pair issues
    print {$pet} "------------- Putative issues with contig pairing - Summary ----------------\n";
    foreach my $err_pair (sort { $err->{$b}{'links'} <=> $err->{$a}{'links'} } keys %$err) {
        my $mean_iz = 0;
        $mean_iz = $err->{$err_pair}{'gaps'} / $err->{$err_pair}{'links'} if ($err->{$err_pair}{'links'});
        print {$pet} "Pair $err_pair has $err->{$err_pair}{'links'} links and mean distance = $mean_iz\n";
    }
    close $pet;

    my $satisfied = $ct_ok_pairs + $ct_ok_contig;
    my $unsatisfied = $ct_problem_pairs + $ct_iz_issues + $ct_illogical;
    my $ct_both_reads = $ct_both * 2;

    print LOG "\n===========PAIRED K-MER STATS===========\n";
    print LOG "Total number of pairs extracted from -s $fof: $totalpairs\n";
    print LOG "At least one sequence/pair missing from contigs: $ct_single\n";
    print LOG "Assembled pairs: $ct_both ($ct_both_reads sequences)\n";
    print LOG "\tSatisfied in distance/logic within contigs (i.e. -> <-, distance on target: $ct_ok_contig\n";
    print LOG "\tUnsatisfied in distance within contigs (i.e. distance out-of-bounds): $ct_iz_issues\n";
    print LOG "\tUnsatisfied pairing logic within contigs (i.e. illogical pairing ->->, <-<- or <-->): $ct_illogical\n";
    print LOG "\t---\n";
    print LOG "\tSatisfied in distance/logic within a given contig pair (pre-scaffold): $ct_ok_pairs\n";
    print LOG "\tUnsatisfied in distance within a given contig pair (i.e. calculated distances out-of-bounds): $ct_problem_pairs\n";
    print LOG "\t---\n";
    ### the heading below is printed, but no per-insert-size breakdown follows it
    print LOG "Total satisfied: $satisfied\tunsatisfied: $unsatisfied\n\nBreakdown by insert sizes:\n";
    print LOG "============================================\n";

    open(my $csv, '>', $distribution) || die "Can't open $distribution for writing -- fatal";

    foreach my $is (sort { $a <=> $b } keys %$track_insert) {
        print {$csv} "$is,$track_insert->{$is}\n";
    }

    close $csv;
    return $pair;
}

#-------------------
# Loads a FASTA file into memory.
# Returns a hashref: first word of each header => upper-cased sequence
# (sequence lines of a record are concatenated).
sub readSeqMemory {

    my $file = shift;

    my $fh = {};
    my $prev = "NA";
    my $seq = "";
    open(my $fa, '<', $file) || die "Cannot open $file for reading -- fatal.\n";
    while (my $line = <$fa>) {
        chomp $line;
        if ($line =~ /\>(\S+)/) {
            my $head = $1;
            #$seq =~ s/[BDEFHIJKLMOPQRSUVWXYZ]/N/g;
            if ($prev ne $head && $prev ne "NA") {
                $fh->{$prev} = $seq;
            }
            $prev = $head;
            $seq = '';
        }
        elsif ($line =~ /^(\S+)$/) {
            $seq .= uc($1);
        }
    }
    ### flush the last record; an input without any header or sequence leaves no "NA" entry behind
    $fh->{$prev} = $seq unless ($prev eq "NA" && $seq eq "");

    close $fa;

    return $fh;
}

#-------------------
# Loads a contig FASTA file into memory.
# Returns a hashref: 1-based ordinal of the record in the file => sequence.
# Header text is ignored and the case of the sequence is preserved.
sub readContigsMemory {

    my $file = shift;

    my $fh = {};
    my $prev = "NA";
    my $seq = "";
    my $cttig = 0;
    open(my $fa, '<', $file) || die "Cannot open $file for reading -- fatal.\n";
    while (my $line = <$fa>) {
        chomp $line;
        if ($line =~ /\>(\S+)/) {
            $cttig++;
            my $head = $cttig;
            #RLW2016 $seq =~ s/[BDEFHIJKLMOPQRSUVWXYZ]/N/g;
            if ($prev ne $head && $prev ne "NA" && $seq ne "") {
                $fh->{$prev} = $seq;
            }
            $prev = $head;
            $seq = '';
        }
        elsif ($line =~ /^(\S+)$/) {
            #RLW2016 $seq .= uc($1);
            $seq .= $1;
        }
    }
    ### flush the last record; an input without any header or sequence leaves no "NA" entry behind
    $fh->{$prev} = $seq unless ($prev eq "NA" && $seq eq "");
    close $fa;

    return $fh;
}

#-------------------
# Turns the scaffold CSV written by buildScaffolds into FASTA.
#
#   $dotscaffold - scaffold CSV to read; output goes to "$dotscaffold.fa" and
#                  the reads that supplied gap sequences are listed in
#                  "${dotscaffold}_GAPseqList.txt"
#   $fh          - hashref: contig number => sequence
#   $tigpair     - $tigpair->{tigA}{tigB}{'seq'|'origin'|'configuration'}:
#                  patch sequence joining two contigs, the read it came from
#                  and the orientation/contig string it was recorded with
#
# Between two consecutive contigs only a recorded patch is inserted; when no
# patch exists nothing is placed between them.
# Returns (fasta file name, arrayref of inserted patch lengths, number of joins).
sub buildScaffoldFasta {

    my ($dotscaffold, $fh, $tigpair) = @_;

    open(my $in, '<', $dotscaffold) || die "Cannot open $dotscaffold for reading -- fatal.\n";

    my $scaffold_fasta = $dotscaffold . ".fa";
    my $gaplist = $dotscaffold . "_GAPseqList.txt";

    open(my $out, '>', $scaffold_fasta) || die "can't write to $scaffold_fasta -- fatal\n";
    open(my $list, '>', $gaplist) || die "can't write to $gaplist -- fatal\n";

    my @gsl;
    my $merges = 0;

    ### read into a lexical: reverseComplement overwrites $_
    while (my $line = <$in>) {
        chomp $line;
        my @field = split(/\,/, $line);
        my @tig;

        if ($field[2] =~ /\_/) {
            @tig = split(/\_/, $field[2]);
        }
        else {
            push @tig, $field[2];
        }

        my $scaffseq = "";
        $merges += $#tig;

        foreach my $idx (0 .. $#tig) {

            my ($orient, $tnum) = $tig[$idx] =~ /([fr])(\d+)z(\d+)(\S+)?/i;
            next unless (defined $orient);

            my $seq = $fh->{$tnum};
            $seq = reverseComplement($seq) if ($orient eq "r");

            my $gapseq = "";
            my $next = $tig[$idx + 1];###lookup ahead
            if (defined $next && $next =~ /([fr])(\d+)z(\d+)(\S+)?/i) {
                my ($forient, $ftnum) = ($1, $2);

                ### if rc a forward alignment then must rc a patch
                if (defined $tigpair->{$tnum}{$ftnum}{'seq'}) {
                    my $link = $tigpair->{$tnum}{$ftnum};
                    print {$list} "$link->{'origin'}\n";
                    $gapseq = lc($link->{'seq'});

                    my ($str1, $tig1, $str2, $tig2) = $link->{'configuration'} =~ /([rf])(\d+)([rf])(\d+)/;
                    if ($tig1 == $tnum && $tig2 == $ftnum && $str1 ne $orient) {
                        ### NOTE(review): unlike the lc() case above, this keeps whatever case reverseComplement returns -- confirm intended
                        $gapseq = reverseComplement($link->{'seq'});
                    }
                }
                elsif (defined $tigpair->{$ftnum}{$tnum}{'seq'}) {
                    my $link = $tigpair->{$ftnum}{$tnum};
                    print {$list} "$link->{'origin'}\n";
                    $gapseq = lc($link->{'seq'});

                    my ($str1, $tig1, $str2, $tig2) = $link->{'configuration'} =~ /([rf])(\d+)([rf])(\d+)/;
                    if ($tig1 == $ftnum && $tig2 == $tnum && $str1 ne $forient) {
                        $gapseq = reverseComplement($link->{'seq'});
                    }
                }
            }

            my $gl = length($gapseq);
            push @gsl, $gl if ($gl);
            $scaffseq .= $seq . $gapseq;
        }#each tig

        my $newlength = length($scaffseq);
        print {$out} ">$field[0] $newlength $line\n$scaffseq\n";
    }

    close $in;
    close $out;
    close $list;
    return $scaffold_fasta, \@gsl, $merges;
}

## We hope this code is useful to you -- Please send comments & suggestions to rwarren at bcgsc.ca