├── .ackrc ├── .gitignore ├── README.md ├── article ├── Aligners │ ├── 10_MagicBLAST │ │ └── align.tcsh │ ├── 20_HISAT2_relaxed │ │ └── align.tcsh │ ├── 21_HISAT2 │ │ └── align.tcsh │ ├── 30_STAR │ │ └── align.tcsh │ ├── 31_STARlong │ │ └── align.tcsh │ ├── 32_STAR.2.6c │ │ └── align.tcsh │ └── 40_TopHat2 │ │ └── align.tcsh ├── MagicBlastPaperMasterScript.tcsh ├── README └── scripts │ ├── AliQC.py │ ├── MagicBlastPaperMasterScript.tcsh │ ├── README │ ├── directErrorCount.awk │ ├── directErrorCount.tcsh │ ├── gff2cig.awk │ ├── introns_precision_recall.awk │ ├── mapping_accuracy.header │ ├── schtroumpf │ ├── submit │ ├── tags │ ├── transpose │ └── transpose.awk ├── docs ├── _config.yml ├── _includes │ ├── disqus.html │ ├── footer.html │ ├── google_analytics.html │ ├── header.html │ └── navigation.html ├── _layouts │ ├── default.html │ └── page.html ├── _posts │ ├── .gitkeep │ ├── 2016-12-29-blastdb.md │ ├── 2016-12-29-copyright.md │ ├── 2016-12-29-exeptions.md │ ├── 2016-12-29-fasta.md │ ├── 2016-12-29-multithreading.md │ ├── 2016-12-29-output.md │ ├── 2016-12-29-paired.md │ ├── 2016-12-29-rnavsdna.md │ ├── 2016-12-29-sra.md │ ├── 2016-12-29-tutorial.md │ ├── 2017-09-13-release.md │ ├── 2017-11-14-download.md │ ├── 2020-05-15-license.md │ ├── 2020-07-14-feedback.md │ └── 2021-05-06-cloud-sra.md ├── css │ ├── main.css │ └── syntax.css └── index.md └── magicblast-tools ├── .gitignore ├── README.md ├── base.py ├── combine-genome-transcripts.py ├── get-introns.py ├── get-transcripts.py ├── gff.py ├── gtf.py ├── requirements.txt ├── sam.py └── txt.py /.ackrc: -------------------------------------------------------------------------------- 1 | --ignore-dir=_site 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.sw? 
2 | _site 3 | _pages 4 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## Magic-BLAST documentation: 2 | https://ncbi.github.io/magicblast 3 | 4 | ## Publication: 5 | Boratyn GM, Thierry-Mieg J, Thierry-Mieg D, Busby B, Madden TL. (2019) **Magic-BLAST, an accurate RNA-seq aligner for long and short reads.** *BMC Bioinformatics* 20: 405. \[[article](https://bmcbioinformatics.biomedcentral.com/articles/10.1186/s12859-019-2996-x)\] 6 | 7 | ## Scripts to recreate published experimental results: 8 | https://github.com/ncbi/magicblast/tree/master/article 9 | -------------------------------------------------------------------------------- /article/Aligners/10_MagicBLAST/align.tcsh: -------------------------------------------------------------------------------- 1 | #!/bin/tcsh 2 | # =========================================================================== 3 | # 4 | # PUBLIC DOMAIN NOTICE 5 | # National Center for Biotechnology Information 6 | # 7 | # This software/database is a "United States Government Work" under the 8 | # terms of the United States Copyright Act. It was written as part of 9 | # the author's official duties as a United States Government employee and 10 | # thus cannot be copyrighted. This software/database is freely available 11 | # to the public for use. The National Library of Medicine and the U.S. 12 | # Government have not placed any restriction on its use or reproduction. 13 | # 14 | # Although all reasonable efforts have been taken to ensure the accuracy 15 | # and reliability of the software and data, the NLM and the U.S. 16 | # Government do not and cannot warrant the performance or results that 17 | # may be obtained by using this software or data. The NLM and the U.S. 
18 | # Government disclaim all warranties, express or implied, including 19 | # warranties of performance, merchantability or fitness for any particular 20 | # purpose. 21 | # 22 | # Please cite the author in any work or product based on this material. 23 | # 24 | # =========================================================================== 25 | # 26 | # Author: Jean Thierry-Mieg 27 | # 28 | set mm=$1 29 | set run=$2 30 | set genome=$3 31 | set reads=$4 32 | set mates=$5 33 | 34 | set genomeF=Fasta/$run/genome.gz 35 | set out=$mm/$run/$mm.$run 36 | 37 | set nThreads=4 38 | set genomeDir=Aligners/$mm/$genome 39 | if (! -e $genomeDir) mkdir -p $genomeDir 40 | 41 | mkdir -p $genomeDir 42 | 43 | # create a BLAST database if not present 44 | if (! -e $genomeDir/genome.nhr) then 45 | echo $genomeDir/genome.nhr 46 | gunzip -c Fasta/$run/genome.gz > $genomeDir/my_genome.fasta 47 | bin/makeblastdb -in $genomeDir/my_genome.fasta -out $genomeDir/genome -dbtype nucl -parse_seqids 48 | \rm $genomeDir/my_genome.fasta 49 | endif 50 | 51 | mkdir -p $mm/$run 52 | 53 | # run magicblast 54 | echo "$mm $run $reads $mates" 55 | ls -ls $genomeDir/genome.nhr 56 | echo "$reads" 57 | ls -ls $reads 58 | echo "$reads" 59 | set infmt=`echo $reads | gawk '/fastq/{print "fastq";next}/fasta/{print "fasta"}'` 60 | 61 | 62 | if (-e $genomeDir/genome.nhr && -e $reads && ! 
-e $mm/$run/$mm.$run.sam) then 63 | set mmm="-query_mate $mates" 64 | if (X$mates == X) set mmm="" 65 | echo "time bin/magicblast -query $reads $mmm -infmt $infmt -db $genomeDir/genome -num_threads $nThreads " 66 | time bin/magicblast -query $reads $mmm -infmt $infmt -db $genomeDir/genome -num_threads $nThreads > $out.sam_unsorted 67 | ls -ls $out.sam_unsorted 68 | time sort $out.sam_unsorted > $out.sam_sorted 69 | gzip $out.sam_sorted 70 | ls -ls $out.sam_sorted.gz 71 | # rm $mm/$run/$mm.$run.sam_unsorted 72 | else 73 | echo "Did not find $genomeDir/genome.nhr or $reads or found $mm/$run/$mm.$run.sam" 74 | ls -ls $mm/$run/$mm.$run.sam 75 | endif 76 | 77 | 78 | -------------------------------------------------------------------------------- /article/Aligners/20_HISAT2_relaxed/align.tcsh: -------------------------------------------------------------------------------- 1 | ../21_HISAT2/align.tcsh -------------------------------------------------------------------------------- /article/Aligners/21_HISAT2/align.tcsh: -------------------------------------------------------------------------------- 1 | #!/bin/tcsh 2 | # =========================================================================== 3 | # 4 | # PUBLIC DOMAIN NOTICE 5 | # National Center for Biotechnology Information 6 | # 7 | # This software/database is a "United States Government Work" under the 8 | # terms of the United States Copyright Act. It was written as part of 9 | # the author's official duties as a United States Government employee and 10 | # thus cannot be copyrighted. This software/database is freely available 11 | # to the public for use. The National Library of Medicine and the U.S. 12 | # Government have not placed any restriction on its use or reproduction. 13 | # 14 | # Although all reasonable efforts have been taken to ensure the accuracy 15 | # and reliability of the software and data, the NLM and the U.S. 
16 | # Government do not and cannot warrant the performance or results that 17 | # may be obtained by using this software or data. The NLM and the U.S. 18 | # Government disclaim all warranties, express or implied, including 19 | # warranties of performance, merchantability or fitness for any particular 20 | # purpose. 21 | # 22 | # Please cite the author in any work or product based on this material. 23 | # 24 | # =========================================================================== 25 | # 26 | # Author: Jean Thierry-Mieg 27 | # 28 | set mm=$1 29 | set run=$2 30 | set target=$3 31 | set reads=$4 32 | set mates=$5 33 | 34 | set genome=Fasta/$run/genome.gz 35 | set out=$mm/$run/$mm.$run 36 | mkdir -p $mm/$run 37 | set mm2=21_HISAT2 38 | 39 | set nThreads=4 40 | set genomeDir=Aligners/$mm2/$target 41 | if (! -e $genomeDir) mkdir -p $genomeDir 42 | 43 | if (! $?TMPDIR) then 44 | set TMPDIR=/tmp 45 | if (-d /export/home/TMP) set TMPDIR=/export/home/TMP 46 | endif 47 | 48 | # construct the hisat genome index 49 | if (-e $genome && ! -e $genomeDir/genome.1.ht2) then 50 | echo "gunzip -c $genome > $genomeDir/mygenome.fasta" 51 | gunzip -c $genome > $genomeDir/mygenome.fasta 52 | ls -ls $genomeDir/mygenome.fasta 53 | if (-e $genomeDir/mygenome.fasta) then 54 | time bin/hisat2-master/hisat2-build $genomeDir/mygenome.fasta $genomeDir/genome 55 | touch $genomeDir/done 56 | # \rm $genomeDir/mygenome.fasta 57 | endif 58 | endif 59 | 60 | if (-e $genomeDir/genome.1.ht2) then 61 | echo "HISAT2 index is ready" 62 | else 63 | echo "missing HISAT2 index $genomeDir/genome.1.ht2" 64 | goto done 65 | endif 66 | 67 | if ($reads == "") then 68 | echo "missing parametets 4 which should be the reads file" 69 | goto done 70 | endif 71 | 72 | if (! 
-e $reads) then 73 | echo "cannot find the reads file $reads" 74 | goto done 75 | endif 76 | 77 | if ($mates == "") then 78 | set rr="-U $reads" 79 | else 80 | set rr="-1 $reads -2 $mates" 81 | endif 82 | 83 | set type=`echo $reads | gawk '{t="f"}/fastq/{t="q"}{print t}'` 84 | 85 | set params="" 86 | if ($mm == 20_HISAT2_relaxed) then 87 | set params="--min-score L,0.0,-2" 88 | endif 89 | 90 | if (-e $out.sam || -e $out.sam_sorted.gz) then 91 | echo "$out.sam ready" 92 | else 93 | if (! -d $mm/$run) mkdir -p $mm/$run 94 | uname -a 95 | echo " time bin/hisat2-master/hisat2 -$type -x $genomeDir/genome $rr $params -p $nThreads -S $out.sam" 96 | time bin/hisat2-master/hisat2 -$type -x $genomeDir/genome $rr $params -p $nThreads -S $out.sam 97 | endif 98 | 99 | if (-e $out.sam && ! -e $out.sam_sorted.gz) then 100 | if (-e $TMPDIR/$mm/$run) \rm -rf $TMPDIR/$mm/$run 101 | mkdir -p $TMPDIR/$mm/$run 102 | cat $out.sam | sort -T $TMPDIR/$mm/$run | gzip > $out.sam_sorted.gz 103 | endif 104 | if (-e $out.sam && -e $out.sam_sorted.gz) then 105 | \rm $out.sam 106 | endif 107 | 108 | if (-e $TMPDIR/$mm/$run) \rm -rf $TMPDIR/$mm/$run 109 | 110 | done: 111 | echo done 112 | -------------------------------------------------------------------------------- /article/Aligners/30_STAR/align.tcsh: -------------------------------------------------------------------------------- 1 | ../31_STARlong/align.tcsh -------------------------------------------------------------------------------- /article/Aligners/31_STARlong/align.tcsh: -------------------------------------------------------------------------------- 1 | #!/bin/tcsh 2 | # =========================================================================== 3 | # 4 | # PUBLIC DOMAIN NOTICE 5 | # National Center for Biotechnology Information 6 | # 7 | # This software/database is a "United States Government Work" under the 8 | # terms of the United States Copyright Act. 
It was written as part of 9 | # the author's official duties as a United States Government employee and 10 | # thus cannot be copyrighted. This software/database is freely available 11 | # to the public for use. The National Library of Medicine and the U.S. 12 | # Government have not placed any restriction on its use or reproduction. 13 | # 14 | # Although all reasonable efforts have been taken to ensure the accuracy 15 | # and reliability of the software and data, the NLM and the U.S. 16 | # Government do not and cannot warrant the performance or results that 17 | # may be obtained by using this software or data. The NLM and the U.S. 18 | # Government disclaim all warranties, express or implied, including 19 | # warranties of performance, merchantability or fitness for any particular 20 | # purpose. 21 | # 22 | # Please cite the author in any work or product based on this material. 23 | # 24 | # =========================================================================== 25 | # 26 | # Author: Jean Thierry-Mieg 27 | # 28 | set mm=$1 29 | set run=$2 30 | set target=$3 31 | set reads=$4 32 | set mates=$5 33 | 34 | set genome=Fasta/$run/genome.gz 35 | set out=$mm/$run/$mm.$run 36 | mkdir -p $mm/$run 37 | 38 | set nThreads=4 39 | set genomeDir=Aligners/$mm/$target 40 | if (! -e $genomeDir) mkdir -p $genomeDir 41 | 42 | if (! $?TMPDIR) then 43 | set TMPDIR=/tmp 44 | if (-d /export/home/TMP) set TMPDIR=/export/home/TMP 45 | endif 46 | 47 | # --sjdbOverhang 100 : best for long reads >= 100, do not give is no gff file 48 | if (-e $genome && ! 
-e $genomeDir/SAindex) then 49 | echo "gunzip -c $genome > $genomeDir/mygenome.fasta" 50 | gunzip -c $genome > $genomeDir/mygenome.fasta 51 | ls -ls $genomeDir/mygenome.fasta 52 | if (-e $genomeDir/mygenome.fasta) then 53 | # lmem03 8threads 1h49 elapsed, 513%, 32673u+1117s 54 | time bin/STARlong --runMode genomeGenerate --runThreadN $nThreads --genomeDir $genomeDir --genomeFastaFiles $genomeDir/mygenome.fasta 55 | touch $genomeDir/done 56 | #\rm $genomeDir/mygenome.fasta 57 | endif 58 | endif 59 | 60 | if (-e $genomeDir/SAindex) then 61 | echo "STAR index is ready" 62 | else 63 | echo "missing STAR index $genomeDir/SAindex" 64 | goto done 65 | endif 66 | 67 | if (0) then 68 | --readFilesCommand gunzip -c # allow .gz on input 69 | --outSAMtype SAM/BAM/None [Unsorted/SortedByCoordinates] 70 | --outSAMattributes All 71 | --outSAMattributes Standard 72 | --outFileNamePrefix $out 73 | --outTmpDir $TMPDIR 74 | --outFilterMatchNmin 24 75 | --outFilterScoreMin 24 76 | --outFilterMismatchNmax 100000 77 | --outFilterMismatchNoverLmax 100000 78 | --twopassMode Basic 79 | --genomeLoad LoadAndRemove 80 | --genomeLoad NoSharedMemory # only one compatible with 2-pass mode 81 | endif 82 | echo $out'Aligned.out.sam' 83 | 84 | if (-e $out'Aligned.out.sam' && ! -e $out.sam_sorted.gz) then 85 | cat $out'Aligned.out.sam' | sort -T $TMPDIR | gzip > $mm/$run/$mm.$run.sam_sorted.gz 86 | endif 87 | 88 | set mySTAR=STAR 89 | if ($mm == 31_STARlong) set mySTAR=STARlong 90 | if ($mm == 30_STAR) then 91 | set mySTAR=STAR_1pass 92 | endif 93 | 94 | if (-e $out.sam || -e $out.sam_sorted.gz) then 95 | echo "$out.sam ready" 96 | else 97 | if (! 
-d $mm/$run) mkdir -p $mm/$run 98 | mkdir -p $TMPDIR/$mm 99 | if (-e $TMPDIR/$mm/$run) \rm -rf $TMPDIR/$mm/$run 100 | if ($mm == 30_STAR) then 101 | uname -a 102 | echo " time bin/$mySTAR --runThreadN $nThreads --genomeDir $genomeDir --readFilesCommand gunzip -c --outSAMtype SAM --outSAMattributes Standard --outFileNamePrefix $out --outTmpDir $TMPDIR/$mm/$run --genomeLoad NoSharedMemory --readFilesIn $reads $mates" 103 | time bin/$mySTAR --runThreadN $nThreads --genomeDir $genomeDir --readFilesCommand gunzip -c --outSAMtype SAM --outSAMattributes Standard --outFileNamePrefix $out --outTmpDir $TMPDIR/$mm/$run --genomeLoad NoSharedMemory --readFilesIn $reads $mates 104 | else 105 | uname -a 106 | echo " time bin/$mySTAR --runThreadN $nThreads --genomeDir $genomeDir --readFilesCommand gunzip -c --outSAMtype SAM --outSAMattributes Standard --outFileNamePrefix $out --outTmpDir $TMPDIR/$mm/$run --outFilterMatchNmin 24 --outFilterScoreMin 24 --outFilterMismatchNmax 100000 --outFilterMismatchNoverLmax .5 --genomeLoad NoSharedMemory --twopassMode Basic --seedPerReadNmax 100000 --readFilesIn $reads $mates" 107 | time bin/$mySTAR --runThreadN $nThreads --genomeDir $genomeDir --readFilesCommand gunzip -c --outSAMtype SAM --outSAMattributes Standard --outFileNamePrefix $out --outTmpDir $TMPDIR/$mm/$run --outFilterMatchNmin 24 --outFilterScoreMin 24 --outFilterMismatchNmax 100000 --outFilterMismatchNoverLmax .5 --genomeLoad NoSharedMemory --twopassMode Basic --seedPerReadNmax 100000 --readFilesIn $reads $mates 108 | endif 109 | endif 110 | 111 | 112 | if (-e $out'Aligned.out.sam' && ! 
-e $mm/$run/$mm.$run.sam_sorted.gz) then 113 | cat $out'Aligned.out.sam' | sort -T $TMPDIR | gzip > $mm/$run/$mm.$run.sam_sorted.gz 114 | endif 115 | if (-e $out'Aligned.out.sam' && -e $mm/$run/$mm.$run.sam_sorted.gz) then 116 | \rm $out'Aligned.out.sam' 117 | endif 118 | 119 | 120 | done: 121 | echo done 122 | -------------------------------------------------------------------------------- /article/Aligners/32_STAR.2.6c/align.tcsh: -------------------------------------------------------------------------------- 1 | ../31_STARlong/align.tcsh -------------------------------------------------------------------------------- /article/Aligners/40_TopHat2/align.tcsh: -------------------------------------------------------------------------------- 1 | #!/bin/tcsh 2 | # =========================================================================== 3 | # 4 | # PUBLIC DOMAIN NOTICE 5 | # National Center for Biotechnology Information 6 | # 7 | # This software/database is a "United States Government Work" under the 8 | # terms of the United States Copyright Act. It was written as part of 9 | # the author's official duties as a United States Government employee and 10 | # thus cannot be copyrighted. This software/database is freely available 11 | # to the public for use. The National Library of Medicine and the U.S. 12 | # Government have not placed any restriction on its use or reproduction. 13 | # 14 | # Although all reasonable efforts have been taken to ensure the accuracy 15 | # and reliability of the software and data, the NLM and the U.S. 16 | # Government do not and cannot warrant the performance or results that 17 | # may be obtained by using this software or data. The NLM and the U.S. 18 | # Government disclaim all warranties, express or implied, including 19 | # warranties of performance, merchantability or fitness for any particular 20 | # purpose. 21 | # 22 | # Please cite the author in any work or product based on this material. 
23 | # 24 | # =========================================================================== 25 | # 26 | # Author: Jean Thierry-Mieg, Greg Boratyn 27 | # 28 | 29 | set mm=$1 30 | set run=$2 31 | set genome=$3 32 | set reads=$4 33 | set mates=$5 34 | 35 | set genomeF=Fasta/$run/genome.gz 36 | set out=$mm/$run/$mm.$run 37 | 38 | set nThreads=4 39 | set genomeDir=Aligners/$mm/$genome 40 | if (! -e $genomeDir) mkdir -p $genomeDir 41 | 42 | mkdir -p $genomeDir 43 | set path=($path `pwd`/bin/bowtie2) 44 | 45 | # create an index if not present 46 | if (! -e $genomeDir/genome.1.bt2) then 47 | echo $genomeDir/genome.1.bt2 48 | gunzip -c Fasta/$run/genome.gz > $genomeDir/my_genome.fasta 49 | bin/bowtie2/bowtie2-build $genomeDir/my_genome.fasta $genomeDir/genome 50 | \rm $genomeDir/my_genome.fasta 51 | endif 52 | 53 | mkdir -p $mm/$run 54 | 55 | # run tophat 56 | echo "$mm $run $reads $mates" 57 | ls -ls $genomeDir/genome.1.bt2 58 | echo "$reads" 59 | ls -ls $reads 60 | echo "$reads" 61 | 62 | if (-e $genomeDir/genome.1.bt2 && -e $reads && ! 
-e $mm/$run/$mm.$run.sam) then 63 | echo "time bin/tophat2/tophat2 -p $nThreads -o ${out}_dir $genomeDir/genome $reads $mates " 64 | time bin/tophat2/tophat2 -p $nThreads -o ${out}_dir $genomeDir/genome $reads $mates 65 | samtools view -h ${out}_dir/accepted_hits.bam >$out.sam_unsorted 66 | time sort $mm/$run/$mm.$run.sam_unsorted > $mm/$run/$mm.$run.sam_sorted 67 | gzip $mm/$run/$mm.$run.sam_sorted 68 | \rm $mm/$run/$mm.$run.sam_unsorted 69 | else 70 | echo "Did not find $genomeDir/genome.1.ht2 or $reads or found $mm/$run/$mm.$run.sam" 71 | ls -ls $mm/$run/$mm.$run.sam 72 | endif 73 | 74 | 75 | -------------------------------------------------------------------------------- /article/MagicBlastPaperMasterScript.tcsh: -------------------------------------------------------------------------------- 1 | scripts/MagicBlastPaperMasterScript.tcsh -------------------------------------------------------------------------------- /article/README: -------------------------------------------------------------------------------- 1 | # Aug 1st, 2018 2 | # Author: Jean Thierry-Mieg, NCBI/NLM/NIH 3 | # For questions, please email mieg@ncbi.nlm.nih.gov 4 | 5 | 6 | The present directory contains the main scripts used in the Magic-BLAST paper analysis 7 | and can be used to replicate our analysis. 
8 | 9 | =========== 10 | 11 | The link for the paper is: https://github.com/ncbi/magicblast/tree/master/article 12 | 13 | Please clone the site: 14 | git clone https://github.com/ncbi/magicblast 15 | cd magicblast/article 16 | you should now see the present README file and 2 directories 17 | scripts: see scripts/README for a description of the content 18 | Aligners: see Aligners/README for a description of the content 19 | and a link to the main script: 20 | MagicBlastPaperMasterScript.tcsh 21 | 22 | Try 23 | tcsh MagicBlastPaperMasterScript.tcsh --help 24 | to see the list of the commands 25 | 26 | ============ 27 | 28 | You can control the number of runs to be analyzed 29 | by editing the definition of the variable $runs, around line 50 30 | of the main script MagicBlastPaperMasterScript.tcsh 31 | 32 | You can control the number of aligners to be analyzed 33 | by editing the definition of the variable $methods, around line 27 34 | of the main script MagicBlastPaperMasterScript.tcsh 35 | 36 | ============ 37 | 38 | init: 39 | 40 | Please run 41 | tcsh MagicBlastPaperMasterScript.tcsh init 42 | 43 | This command should download from NCBI the linux binaries and the reference genomes used in this benchmark 44 | i.e. the 2 Baruzzo genomes for their human and their P.falciparum test sets ls -ls Reference_genome/ 45 | and the GRCh38 human genome which is used for iRefSeq and the Illumina, Roche and Pacbio runs 46 | 47 | 845712 -rw-r--r--. 1 mieg biodata 866005452 Aug 6 15:37 GRCh38.genome.fasta.gz 48 | 4 -rw-r--r--. 1 mieg biodata 1230 Aug 6 15:37 HG19.Baruzzo.genome.TM.txt 49 | 915076 -rw-r--r--. 1 mieg biodata 937034891 Aug 6 15:37 HG19.Baruzzo.genome.fasta.gz 50 | 4 -rw-r--r--. 1 mieg biodata 869 Aug 6 15:37 PFAL.Baruzzo.genome.TM.txt 51 | 6260 -rw-r--r--. 
1 mieg biodata 6408381 Aug 6 15:37 PFAL.Baruzzo.genome.fasta.gz 52 | 53 | init also creates the Fasta directory, but does not yet import the runs 54 | 55 | The binaries are compiled for Linux 64bits Intel processors. See bin/README for details 56 | 57 | ============ 58 | 59 | download: 60 | 61 | Please run 62 | tcsh MagicBlastPaperMasterScript.tcsh download 63 | This command may take a long time depending on the quality of your network connection 64 | it will load from NCBI the fasta/fastq files of all 18 runs: 65 | 9 Baruzzo HG19, 9 PFAL, Illumina, PacBio, Roche 66 | plus the iRefSeq fasta and gff files. 67 | 68 | All these files are copied into Fasta/$run 69 | Because some files are paired-end files, we expect 41 files and 4 symbolic links 70 | ls -ls Fasta/*/*.fast[aq].gz | wc -l 71 | 72 | ============ 73 | ============ 74 | 75 | There is now a choice, you may either download from NCBI the precomputed SAM files 76 | or recompute them yourself 77 | 78 | ============ 79 | 80 | sam: 81 | To download the precomputed SAM files, please run 82 | tcsh MagicBlastPaperMasterScript.tcsh sam 83 | This command may take a very long time depending on the quality of your network connection. 84 | It will load from NCBI the sam files for the 18 runs and for all aligners 85 | 86 | ============ 87 | 88 | align: 89 | To align the data on your own machine, please run 90 | tcsh MagicBlastPaperMasterScript.tcsh align 91 | Notice that the script works in lazy mode: 92 | if some sam files have been downloaded, the corresponding run will not be realigned 93 | 94 | Realigning may take a very long time and requires very large RAM, some cases 95 | demand more than 32 Gb of RAM. If you do not have a large hardware, we rather 96 | recommend that you download the precomputed SAM files as explained above. 97 | If you do realign, you may want to study and configure the self documented 98 | file scripts/submit which can help to fan out the alignments on a compute far. 
99 | 100 | 101 | ============ 102 | ============ 103 | 104 | Analysis. 105 | 106 | Once the sam files are available, they are analyzed by the following three sub-commands 107 | 108 | aliqc : run AliQC.py on each run in the background (slow) 109 | Notice that the command ' MagicBlastPaperMasterScript.tcsh aliqc' requires HTSeq 110 | which is installed in scripts/HTSeq. This is expected to work, but if you encounter 111 | a problem, please study the additional information in the file scripts/HTSeq/README 112 | 113 | This command scans the SAM files, compares them to the reference genome and construct 114 | detailed statistics on the quality of the alignment and the nature of the mismatches. 115 | accuracy : analyze the intron found in all the BAM files and compares them to the benchmark truth 116 | This command runs a dedicated C program, see info in the file bin/README, which 117 | compares the introns discovered in the SAM file to the benchmark truth. 118 | export : export QC and ROC curve of intron discovery and the histogram of aligned lengths 119 | The final tab delimited tables will appear in the RESULTS directory. 120 | 121 | 122 | ============= 123 | ============= 124 | 125 | 126 | 127 | 128 | 129 | -------------------------------------------------------------------------------- /article/scripts/MagicBlastPaperMasterScript.tcsh: -------------------------------------------------------------------------------- 1 | #!/bin/tcsh 2 | # =========================================================================== 3 | # 4 | # PUBLIC DOMAIN NOTICE 5 | # National Center for Biotechnology Information 6 | # 7 | # This software/database is a "United States Government Work" under the 8 | # terms of the United States Copyright Act. It was written as part of 9 | # the author's official duties as a United States Government employee and 10 | # thus cannot be copyrighted. This software/database is freely available 11 | # to the public for use. 
The National Library of Medicine and the U.S. 12 | # Government have not placed any restriction on its use or reproduction. 13 | # 14 | # Although all reasonable efforts have been taken to ensure the accuracy 15 | # and reliability of the software and data, the NLM and the U.S. 16 | # Government do not and cannot warrant the performance or results that 17 | # may be obtained by using this software or data. The NLM and the U.S. 18 | # Government disclaim all warranties, express or implied, including 19 | # warranties of performance, merchantability or fitness for any particular 20 | # purpose. 21 | # 22 | # Please cite the author in any work or product based on this material. 23 | # 24 | # =========================================================================== 25 | # 26 | # 27 | ## MagicBLAST_paper_master_script.tcsh 28 | ## 29 | ## MagicBLAST paper, june 2018, master script 30 | ## Author, Greg Boratyn, Danielle Thierry-Mieg, Jean Thierry-Mieg, Ben Busby, Tom Madden 31 | ## email for this script: mieg@ncbi.nlm.nih.gov 32 | 33 | ## This is a tcsh executable script 34 | ## To see the on line help, run it under the tcsh interpretor using the command 35 | ## MagicBLAST_paper_master_script.tcsh 36 | 37 | if ($# == 0) goto phase_Help 38 | if ($1 == help) goto phase_Help 39 | if ($1 == '-help') goto phase_Help 40 | if ($1 == '--help') goto phase_Help 41 | 42 | ############################################################################# 43 | ## Metadata 44 | 45 | ## Aligners 46 | # List of aligners used in the analysis 47 | # The number in front serves to order the tables in a systematic way 48 | # one can insert new versions of each program by inserting new numbers 49 | # but since the numbers are erased in the final tables, the number AND the names 50 | # must be unique 51 | setenv methods "10_MagicBLAST 20_HISAT2_relaxed 21_HISAT2 30_STAR 31_STARlong 32_STAR.2.6c 40_TopHat2" 52 | # setenv methods "21_HISAT2 31_STARlong" 53 | # setenv methods "21_HISAT2" 54 | # 
setenv methods "40_TopHat2" 55 | # setenv methods "20_HISAT2_relaxed 21_HISAT2 " 56 | # setenv methods "10_MagicBLAST" 57 | # setenv methods "30_STAR 31_STARlong 32_STAR.2.6c" 58 | # setenv methods "10_MagicBLAST 20_HISAT2_relaxed 21_HISAT2 30_STAR 31_STARlong 32_STAR.2.6c" 59 | ############################################################################# 60 | ## Datasets 61 | ## Each dataset must be aligned on the reference genome carrying the relevant truth 62 | # Experimental human datasets, to be aligned on the NCBI human genome 63 | setenv main_runs "iRefSeq PacBio Roche Illumina" 64 | 65 | # Baruzzo datasets, to be aligned on the Baruzzo human reference genome 66 | setenv HG19_r1_runs "HG19t1r1 HG19t2r1 HG19t3r1" 67 | setenv HG19_r2_runs "HG19t1r2 HG19t2r2 HG19t3r2" 68 | setenv HG19_r3_runs "HG19t1r3 HG19t2r3 HG19t3r3" 69 | setenv HG19_runs "$HG19_r1_runs $HG19_r2_runs $HG19_r3_runs" 70 | 71 | # Baruzzo datasets, to be aligned on the Baruzzo malaria reference genome 72 | setenv PFAL_r1_runs "PFALt1r1 PFALt2r1 PFALt3r1" 73 | setenv PFAL_r2_runs "PFALt1r2 PFALt2r2 PFALt3r2" 74 | setenv PFAL_r3_runs "PFALt1r3 PFALt2r3 PFALt3r3" 75 | setenv PFAL_runs "$PFAL_r1_runs $PFAL_r2_runs $PFAL_r3_runs" 76 | 77 | setenv runs "$main_runs $HG19_runs $PFAL_runs" 78 | 79 | # Additional PacBio runs from Brain and Testis 80 | setenv pacbio_runs "SRR5189652 SRR5189667" 81 | # Additional long paired end Illumina reads 82 | # 250_250 (from metastatic melanoma) and 300+300 from MCF7 cells) 83 | setenv long_illumina_runs "SRR5438850 SRR5437876" 84 | # setenv runs "$main_runs $HG19_runs $PFAL_runs $pacbio_runs $long_illumina_runs" 85 | # setenv runs "$long_illumina_runs" 86 | # setenv runs "$pacbio_runs" 87 | # setenv runs "PacBio Illumina" 88 | # setenv runs "Roche PacBio iRefSeq " 89 | # setenv runs "$runs PFALt1r1S HG19t1r1_50 $pacbio_runs" 90 | # setenv runs "HG19t1r1_50" 91 | # setenv runs "PFALt1r1S" 92 | # setenv runs "$pacbio_runs" 93 | # setenv runs "$runs HG19t1r1_50 
PFALt1r1S" 94 | setenv runs "$main_runs $HG19_runs $PFAL_runs $pacbio_runs $long_illumina_runs PFALt1r1S HG19t1r1_50 " 95 | 96 | 97 | # This adapter is present in the PacBio SRR runs and gives a peak at 32 aligned bases = polyA + first 8 bp of adaptor 98 | # AAAAAAAAAAAAAAAAAAAAAAAAAAAAGTACTCT GCGTTGATACCACTGCTTAGATCGGAAGAG 99 | ############################################################################# 100 | ## Fasta and Fastq files 101 | ## All runs fasta or fastq files are in the directories Fasta/$run 102 | ## If they are absent, the script will download them from NCBI 103 | # 104 | # The script assumes that all files are gzipped, and called $run/$run*.fast[aq].gz 105 | # their logical name, i.e. PacBio, links to their SRA identificator, e.g. SRR5009494. 106 | # The iRefSeq and the Baruzzo files are given in fasta format 107 | # The Illumina, Pabio and Roche fils are given in .fastq format 108 | # Some runs are paired-ends: 109 | # -Illumina paired end run has 2 files called SRR534301_1 and SRR534301_2 110 | # -Baruzzo paired end runs are called .forward and .reverse 111 | # In the Roche file Fasta/Roche/SRR899420.fastq we removed all read_1 (all 4-bases long) 112 | # and kept only the 24577 read_2. 113 | # 114 | # For convenience and completeness, we also copied in Fasta/iRefSeq the original gff file 115 | # The iRefSeq fasta file can be extracted from the gff file using the command option 116 | # MagicBLAST_paper_master_script.tcsh make_iRefSeq 117 | # of the the present script 118 | 119 | foreach run ($runs) 120 | if (! 
-d Fasta/$run) mkdir -p Fasta/$run 121 | end 122 | 123 | ############################################################################# 124 | ## Reference genome 125 | # The main genome is NCBI release 19, limited to the main chromosome and 126 | # the mitochondria excluding the patches and the alternates 127 | setenv main_genome GRCh38.genome.fasta.gz 128 | # Baruzzo benchmark reference genome, available from 129 | setenv HG19_genome HG19.Baruzzo.genome.fasta.gz 130 | setenv PFAL_genome PFAL.Baruzzo.genome.fasta.gz 131 | 132 | ## Automatic download of the BenchMark fastq files from NCBI 133 | set FTP="ftp://ftp.ncbi.nlm.nih.gov/blast/demo/magicblast_article/" 134 | if (! -d Reference_genome) mkdir Reference_genome 135 | pushd Reference_genome 136 | foreach ff (GRCh38.genome.fasta.gz HG19.Baruzzo.genome.TM.txt HG19.Baruzzo.genome.fasta.gz PFAL.Baruzzo.genome.TM.txt PFAL.Baruzzo.genome.fasta.gz) 137 | if (! -e $ff) then 138 | wget $FTP/Reference_genome/$ff 139 | endif 140 | end 141 | popd 142 | 143 | if (1) then 144 | foreach run ($HG19_runs) 145 | if (! -d Fasta/$run || -e Fasta/$run/target) continue 146 | pushd Fasta/$run 147 | ln -s ../../Reference_genome/HG19.Baruzzo.genome.fasta.gz genome.gz 148 | echo HG19 > target 149 | popd 150 | end 151 | foreach run ($PFAL_runs) 152 | if (! -d Fasta/$run || -e Fasta/$run/target) continue 153 | pushd Fasta/$run 154 | ln -s ../../Reference_genome/PFAL.Baruzzo.genome.fasta.gz genome.gz 155 | echo PFAL > target 156 | popd 157 | end 158 | foreach run ($main_runs) 159 | if (! -d Fasta/$run || -e Fasta/$run/target) continue 160 | pushd Fasta/$run 161 | ln -s ../../Reference_genome/GRCh38.genome.fasta.gz genome.gz 162 | echo GRCh38 > target 163 | popd 164 | end 165 | foreach run ($pacbio_runs $long_illumina_runs) 166 | if (! 
-d Fasta/$run || -e Fasta/$run/target) continue 167 | pushd Fasta/$run 168 | ln -s ../../Reference_genome/GRCh38.genome.fasta.gz genome.gz 169 | echo GRCh38 > target 170 | popd 171 | end 172 | touch Fasta/genomes_are_assigned 173 | 174 | foreach run ($pacbio_runs iRefSeq PacBio) 175 | if (-d Fasta/$run) touch Fasta/$run/isLongRun 176 | end 177 | endif 178 | 179 | ############################################################################# 180 | ## Automatic download of the binaries from the NCBI ftp site 181 | 182 | set FTP="ftp://ftp.ncbi.nlm.nih.gov/blast/demo/magicblast_article" 183 | if (! -d bin || ! -d scripts/HTSeq) then 184 | if (-e bin/binaries.linux64.tar.gz) then 185 | mv binaries.linux64.tar.gz . 186 | endif 187 | if (! -e binaries.linux64.tar.gz) then 188 | wget $FTP/binaries.linux64.tar.gz 189 | endif 190 | if (! -e binaries.linux64.tar.gz) then 191 | echo "FATAL ERROR: The automatic download of the binaries from $FTP/binaries.linux64.tar.gz failed" 192 | echo "May be the network connection did not work, please try manually to run the command" 193 | echo " wget $FTP/binaries.linux64.tar.gz" 194 | echo "if it does not work please email mieg@ncbi.nlm.nih.gov" 195 | endif 196 | if (-e binaries.linux64.tar.gz) then 197 | echo "expanding binaries.linux64.tar.gz, please wait" 198 | gunzip -c binaries.linux64.tar.gz | tar xf - 199 | mv binaries.linux64.tar.gz bin 200 | endif 201 | endif 202 | 203 | ############################################################################# 204 | ## BAM files 205 | ## The BAM files are named $method/$run/$method.$run.bam 206 | ## All runs were aligned on the relevant appropriate genome by all aligners 207 | ## but it did not always work. 
Some files are missing, for example 30_STAR.iRefSeq.bam, 208 | ## because STAR crashed on long reads 209 | 210 | ############################################################################## 211 | ## utilities 212 | setenv TMPDIR /tmp 213 | if (-d /export/home/TMP) setenv TMPDIR /export/home/TMP 214 | if (! -d tmp) mkdir tmp 215 | if (! -d RESULTS) mkdir RESULTS 216 | 217 | ############################################################################## 218 | ############################################################################## 219 | ## Executable and source code 220 | ## Our scripts are in the scripts directory, e.g. scripts/AliQC.py 221 | ## Our executables are compiled for generic LINUX 64 bits machine in the bin directory 222 | ## e.g. magicblast, dna2dna, sam2gold 223 | ## Our source code is available for analysis and recompilation in machine optimized mode 224 | ## in the source_code directory, together with instructions in the corresponding README file. 225 | 226 | echo -n "## MagicBlastPaperMasterScript.tcsh $1 : " 227 | date 228 | 229 | echo "runs = $runs" 230 | echo "methods = $methods" 231 | 232 | if ($1 == init) goto phase_Init 233 | if ($1 == download) goto phase_Download 234 | if ($1 == align) goto phase_Align 235 | if ($1 == Make_iRefSeq) goto phase_Make_iRefSeq 236 | if ($1 == count) goto phase_Count 237 | if ($1 == sam) goto phase_Sam 238 | if ($1 == accuracy) goto phase_Accuracy 239 | if ($1 == aliqc) goto phase_aliqc 240 | if ($1 == errors) goto phase_DirectErrorCount 241 | if ($1 == export) goto phase_Export 242 | if ($1 == aliLn) goto phase_aliLn 243 | if ($1 == subs) goto phase_Count_subtitutions_in_benchmark 244 | echo "Unknown command : $1, please try --help" 245 | goto phaseLoop 246 | 247 | phase_Help: 248 | 249 | echo "\nusage scripts/MagicBlastPaperMasterScript.tcsh, where command is one of:" 250 | echo 'help : This online help' 251 | echo 'init : in tcsh, "source README init" will set the variables $runs, $methods which may be
convenient' 252 | echo 'download : Automatic download of the fastq files, please monitor carefully the results' 253 | echo 'Make_iRefSeq : create the iRefSeq fasta file and intron file from the gff and the genome file' 254 | echo 'count : Count the reads in each fasta/fastq file' 255 | echo 'sam : Automatically download the sam files from NCBI (rather than running the aligners locally)' 256 | echo 'align : run for all runs all aligners for which the script Aligners/$method/align.tcsh is defined' 257 | echo 'aliqc : run AliQC.py on each run in the background (slow), presupposes that HTSeq is installed' 258 | echo 'accuracy : measure intron and alignment accuracy relative to the gold standard' 259 | echo 'errors : count the errors in the BAM files using the NM:i:x optional field' 260 | echo 'subs : count substitutions in the human and malaria Baruzzo benchmarks' 261 | echo 'aliLn : export histogram of aligned lengths' 262 | echo 'export : export QC and ROC curve of intron discovery' 263 | 264 | goto phaseLoop 265 | 266 | phase_Init: 267 | goto phaseLoop 268 | 269 | ############################################################################## 270 | ############################################################################# 271 | ## Automatic download of the BenchMark fastq files from NCBI 272 | phase_Download: 273 | 274 | ## NCBI repository for the datafiles used in the paper 275 | set FTP="ftp://ftp.ncbi.nlm.nih.gov/blast/demo/magicblast_article/" 276 | 277 | echo "checking $HG19_runs $PFAL_runs" 278 | foreach run ( $HG19_runs $PFAL_runs) 279 | if (! -e Fasta/$run/$run.reverse.fasta.gz) then 280 | pushd Fasta/$run 281 | echo "Trying to download $run from $FTP" 282 | wget $FTP/Fasta/$run/$run.cig.gz 283 | wget $FTP/Fasta/$run/$run.forward.fasta.gz 284 | wget $FTP/Fasta/$run/$run.reverse.fasta.gz 285 | popd 286 | endif 287 | end 288 | 289 | set run=HG19t1r1_50 290 | if (! 
-d Fasta/$run) then 291 | echo "preparing the 5+50 clipped run" 292 | mkdir Fasta/$run 293 | pushd Fasta/$run 294 | ln -s ../../Reference_genome/HG19.Baruzzo.genome.fasta.gz genome.gz 295 | ../../bin/dna2dna -i ../HG19t1r1/HG19t1r1.forward.fasta.gz -gzo -o $run.forward -rightClipAt 50 296 | ../../bin/dna2dna -i ../HG19t1r1/HG19t1r1.reverse.fasta.gz -gzo -o $run.reverse -rightClipAt 50 297 | popd 298 | endif 299 | 300 | set run=PFALt1r1S 301 | if (! -d Fasta/$run) then 302 | echo "preparing the subsampled run" 303 | mkdir Fasta/$run 304 | pushd Fasta/$run 305 | ln -s ../../Reference_genome/PFAL.Baruzzo.genome.fasta.gz genome.gz 306 | ../../bin/dna2dna -i ../PFALt1r1/PFALt1r1.forward.fasta.gz -gzo -o $run.forward -subsample 100 307 | ../../bin/dna2dna -i ../PFALt1r1/PFALt1r1.reverse.fasta.gz -gzo -o $run.reverse -subsample 100 308 | popd 309 | endif 310 | 311 | echo "checking iRefSeq" 312 | foreach run (iRefSeq) 313 | if (! -e Fasta/$run/$run.fasta.gz) then 314 | pushd Fasta/$run 315 | echo "Trying to download $run from$FTP" 316 | wget $FTP/Fasta/$run/$run.cig.gz 317 | wget $FTP/Fasta/$run/$run.fasta.gz 318 | wget $FTP/Fasta/$run/GRCh38_genome.gff.gz 319 | ln -s GRCh38_genome.gff.gz genome.gff.gz 320 | gunzip -c $run.fasta.gz | ../../bin/dna2dna -getTM > $run.TM 321 | popd 322 | endif 323 | end 324 | 325 | echo "checking Roche" 326 | foreach run (Roche) 327 | if (! -e Fasta/$run/$run.fasta.gz) then 328 | pushd Fasta/$run 329 | echo "Trying to download $run from$FTP" 330 | wget $FTP/Fasta/$run/$run.fasta.gz 331 | gunzip -c $run.fasta.gz | ../../bin/dna2dna -getTM > $run.TM 332 | popd 333 | endif 334 | end 335 | 336 | ############################################################################# 337 | ## .cig TRUTH Files 338 | ## The Baruzzo benchmark is providing the original position of the simulated reads 339 | ## in their .cig format, which is analogous, but not identical, to the SAM format. 
340 | ## Since the fasta and the .cig files both come from Baruzzo, we located them in Fasta/$run.cig.gz 341 | ## For convenience, we reformatted the RefSeq gff file into a similar Fasta/iRefSeq/iRefSeq.cig.gz 342 | if (-e Fasta/iRefSeq/genome.gff.gz && ! -e Fasta/iRefSeq/iRefSeq.cig.gz) then 343 | gunzip -c Fasta/iRefSeq/genome.gff.gz | gawk -F '\t' -f scripts/gff2cig.awk | gzip > Fasta/iRefSeq/iRefSeq.cig.gz 344 | endif 345 | ## To compare the BAM files produced by the different aligners to the .cig "truth" 346 | ## we developed a C code called sam2gold (see below) 347 | 348 | ############################################################################# 349 | ## Automatic download of the Illumina Roche pacBio from SRA 350 | 351 | foreach run (PacBio Illumina) 352 | if (! -d Fasta/$run) continue 353 | if ($run == PacBio) set run2=SRR5009494 354 | if ($run == Roche) set run2=SRR899420 355 | if ($run == Illumina) set run2=SRR534301 356 | if (! -e Fasta/$run/$run2.fastq.gz && ! -e Fasta/$run/$run2'_1'.fastq.gz) then 357 | set n=`bin/fastq-dump --help | wc -l` 358 | if ($n < 10) then 359 | echo "Sorry, the executable bin/fastq-dump available from NCBI SRA and needed to download the $run run is not found" 360 | goto phaseLoop 361 | endif 362 | set sf="" 363 | if ($run == Illumina) set sf="--split-files" 364 | echo "Trying to download $run2 from SRA" 365 | bin/fastq-dump $sf -O Fasta/$run $run2 366 | if (-e Fasta/$run/$run2.fastq || -e Fasta/$run/$run2'_1'.fastq) then 367 | gzip Fasta/$run/$run2*.fastq 368 | pushd Fasta/$run 369 | if (-e $run2.fastq.gz) ln -s $run2.fastq.gz $run.fastq.gz 370 | if (-e $run2'_1'.fastq.gz) ln -s $run2'_1'.fastq.gz $run'_1'.fastq.gz 371 | if (-e $run2'_2'.fastq.gz) ln -s $run2'_2'.fastq.gz $run'_2'.fastq.gz 372 | popd 373 | endif 374 | if (-e ~/ncbi/public/sra/$run2) \rm -rf ~/ncbi/public/sra/$run2 375 | endif 376 | end 377 | 378 | ############################################################################# 379 | ## Automatic download of 
the fastq files from SRA 380 | 381 | foreach run ($pacbio_runs) 382 | if (! -d Fasta/$run) continue 383 | if (! -e Fasta/$run/$run.fasta.gz && ! -e Fasta/$run/$run.fastq.gz) then 384 | set n=`bin/fastq-dump --help | wc -l` 385 | if ($n < 10) then 386 | echo "Sorry, the executable bin/fastq-dump available from NCBI SRA and needed to download the $run run is not found" 387 | goto phaseLoop 388 | endif 389 | echo "Trying to download $run from SRA" 390 | bin/fastq-dump -O Fasta/$run $run 391 | gzip Fasta/$run/$run.fastq 392 | endif 393 | end 394 | 395 | foreach run ($long_illumina_runs) 396 | if (! -d Fasta/$run) continue 397 | if (! -e Fasta/$run/$run'_1'.fasta.gz && ! -e Fasta/$run/$run'_1'.fastq.gz) then 398 | set n=`bin/fastq-dump --help | wc -l` 399 | if ($n < 10) then 400 | echo "Sorry, the executable bin/fastq-dump available from NCBI SRA and needed to download the $run run is not found" 401 | goto phaseLoop 402 | endif 403 | echo "Trying to download $run from SRA" 404 | bin/fastq-dump -O Fasta/$run --split-files $run 405 | gzip Fasta/$run/$run*.fastq 406 | endif 407 | end 408 | 409 | goto phaseLoop 410 | 411 | ############################################################################## 412 | ############################################################################## 413 | ## Count the number of reads, the shortest, the longest read in every fasta/fastq file 414 | ## using the utility bin/dna2dna (compiled for Linux 64bits) 415 | ## The source code is part of the aceview/magic distribution in the source_code directory 416 | 417 | phase_Count: 418 | echo 'counting the number of reads in each fasta/fastq file' 419 | foreach run ($runs) 420 | if (-e Fasta/$run/$run.fasta.gz && ! -e Fasta/$run/$run.count) then 421 | echo "counting $run, please wait" 422 | bin/dna2dna -i Fasta/$run/$run.fasta.gz -I fasta -count -o Fasta/$run/$run 423 | endif 424 | if (-e Fasta/$run/$run.fastq.gz && ! 
-e Fasta/$run/$run.count) then 425 | echo "counting $run, please wait" 426 | bin/dna2dna -i Fasta/$run/$run.fastq.gz -I fastq -count -o Fasta/$run/$run 427 | endif 428 | if (-e Fasta/$run/$run'_1'.fastq.gz && ! -e Fasta/$run/$run'_1'.count) then 429 | echo "counting $run, please wait" 430 | bin/dna2dna -i Fasta/$run/$run'_1'.fastq.gz -I fastq -count -o Fasta/$run/$run'_1' 431 | bin/dna2dna -i Fasta/$run/$run'_2'.fastq.gz -I fastq -count -o Fasta/$run/$run'_2' 432 | endif 433 | if (-e Fasta/$run/$run.forward.fasta.gz && ! -e Fasta/$run/$run.forward.count) then 434 | echo "counting $run, please wait" 435 | bin/dna2dna -i Fasta/$run/$run.forward.fasta.gz -I fasta -count -o Fasta/$run/$run.forward 436 | bin/dna2dna -i Fasta/$run/$run.reverse.fasta.gz -I fasta -count -o Fasta/$run/$run.reverse 437 | endif 438 | set nreads=`cat Fasta/$run/*.count | gawk '/^Fragment_kept/{n+=$2}END{print n}'` 439 | echo "$run contains $nreads reads" 440 | end 441 | 442 | goto phaseLoop 443 | 444 | ############################################################################## 445 | ############################################################################## 446 | ## Create the iRefSeq fasta file and intron file from the gff and the genome file 447 | 448 | phase_Make_iRefSeq: 449 | 450 | ## In practice, the file Fasta/iRefSeq/iRefSeq.fasta.gz is downloaded from 451 | ## ftp://ftp.ncbi.nlm.nih.gov/blast/demo/magicblast_article/ 452 | ## The scrip is given here for transparency and to allow the reconstruction 453 | ## of the iRefSeq in the future from a diferent gff file and reference genome 454 | 455 | if (! -e Fasta/iRefSeq/iRefSeq.fasta.gz) then 456 | echo "Creating Fasta/iRefSeq/iRefSeq.fasta.gz using the genome and the gff3 anntation" 457 | if (! -e Fasta/iRefSeq/genome.gz) then 458 | echo "Missing file Fasta/iRefSeq/genome.gz, I cannot create the iRefSeq fasta file" 459 | goto phaseLoop 460 | endif 461 | if (! 
-e Fasta/iRefSeq/genome.gff.gz) then 462 | echo "Missing file Fasta/iRefSeq/genome.gff.gz, I cannot create the iRefSeq fasta file" 463 | goto phaseLoop 464 | endif 465 | 466 | echo "Found the genome and the gff file, constructing the fasta in Fasta/iRefSeq/tmp" 467 | if (! -d Fasta/iRefSeq/tmp) mkdir Fasta/iRefSeq/tmp 468 | pushd Fasta/iRefSeq/tmp 469 | # This script is surprisingly complex, sorry, because we are trying to identify the NMs which map as well 470 | # at different locus of the genome, but while doing so, we unfortunately discovered a number of 471 | # irregularities in the definition of the RefSeqs that we try to compensate 472 | 473 | # To simplify the matter, we directly provide the iRefSeq fasta and gff files on our ftp site. 474 | 475 | # extract the NM_ from the gff file 476 | zcat ../genome.gff.gz | grep NM_ | grep NC_ > NM.gff 477 | # we could directly export the fasta file with the command 478 | # ../../../bin/dna2dna -gff3 NM.gff -gtfRemap iRefSeq -gtfGenome ../genome.gz -o iRefSeq -O fasta 479 | # but some NM have a single identifier and yet map on 2 chroms 480 | # by not providing the genome we only export the 6 columns sponge file 481 | ../../../bin/dna2dna -gff3 NM.gff -gtfRemap iRefSeq -o iRefSeq -O fasta 482 | set nNM=`cat iRefSeq.[fr].sponge | cut -f 1 | sort -u | wc -l` 483 | echo "Number of NM_ $nNM (is 45065)" 484 | set nNM_chr=`cat iRefSeq.[fr].sponge | cut -f 1,3 | sort -u | wc -l` 485 | echo "Number of NM_chrom $nNM_chr (is 45108)" 486 | set nG=`cat iRefSeq.[fr].sponge | cut -f 6 | sort -u | wc -l` 487 | echo "Number of genes with NM $nG (is 19269)" 488 | echo "Evaluating the mapping multiplicity of the iRefSeq" 489 | # to fix the issue that the same NM may map on several chromosomes 45108 = 45065 = 43 cases 490 | # we merge the chrom and the NM in column 1 to create a disambiguated shadow file 491 | cat iRefSeq.[fr].sponge | gawk -F '\t' '{printf("%s:%s\t%s\t%s\t%s\t%s\t%s\n",$1,$3,$2,$3,$4,$5,$6);}' > NM_chr.sponge 492 | # the 
sponge file has the NM the gene and the coordinates of all exons, hence the sequence 493 | # we now construct the fasta file 494 | ../../../bin/dna2dna -shadow NM_chr.sponge -i ../genome.gz -o iRefSeq -O fasta -maxLineLn 80 -gzo 495 | # measure the number of distinct NM with identical sponge (hence DNA) and mapping 496 | cat iRefSeq.[fr].sponge | grep NM_ | sort > _t 497 | cat _t | gawk '{nm=$1;z[nm]=z[nm] "_" $3 "_" $4 "_" $5;}END{for(k in z){g[z[k]]=k;u[z[k]]+=1;}for (v in u) {if(u[v]>1)print u[v],g[v]}}' | sort -k 1n > iRefSeq.mapping_multiplicity 498 | ../../../bin/dna2dna -i iRefSeq.fasta.gz -count -o iRefSeq 499 | \mv iRefSeq.fasta.gz iRefSeq.count .. 500 | # extract (NOTE(review): iRefSeq.fasta.gz was moved to .. on the previous line, the 3 commands below may need ../iRefSeq.fasta.gz -- verify) 501 | ../../../bin/dna2dna -i iRefSeq.fasta.gz -O raw | sort > _t1 502 | # count the distinct NM sequences : 44914 503 | ../../../bin/dna2dna -i iRefSeq.fasta.gz -O raw | cut -f 1 | sort -u | wc 504 | ../../../bin/dna2dna -i iRefSeq.fasta.gz -getTM > ../iRefSeq.TM 505 | 506 | # map the NM on the NM to find who is identical or included in the other 507 | clipalign -i iRefSeq.fasta.gz -t iRefSeq.fasta.gz -errMax 0 -o nm2nm -maxHit 24 -minAli 140 508 | bestali -i nm2nm.hits -exportBest -o nm2nm33 509 | # now count NM mapping exactly in NM with a different geneid -> 143, we add 43+43 for the 43 NM which map on X and Y 510 | cat nm2nm33.hits | gawk '{if($2-$4==0 && index($1,"|"$9">")==0)print}' > nm2nm.2genes.hits 511 | wc nm2nm.2genes.hits 512 | cat nm2nm.2genes.hits | gawk -F '\t' '{n[$1]++;}END{for(k in n)u[n[k]]++;for (v in u) {if(v>0)k+=u[v];kk+=v*u[v];print v, u[v];}print k, kk}' | sort -k 1n 513 | # we now have 303 NM mapping on another NM with a different gene name, however some distinct genes have same gene coordinates 514 | # extract the extreme coords of the NM from the sponge file 515 | cat NM_chr.sponge | gawk -F '\t' '{nm=$1;a1=$4;if($5a2)a2=$4;if(aa2[nm]-a1)aa1[nm]=-a1;}END{for(nm in aa1) printf("%s\t%d\t%d\n",nm,-aa1[nm],aa2[nm]);}' > NM_chr.segment 516 | # reanalyze the nm 2 nm hits file and 
eliminate the lines with overlapping coordinates 517 | echo ZZZZZ > ZZZZZ 518 | cat NM_chr.segment ZZZZZ nm2nm.2genes.hits | gawk -F '\t' '/^ZZZZZ/{zz++;next;}{nm=$1;if(zz+0<1){aa1[nm]=$2;aa2[nm]=$3;split(nm,aa,":");chrom[nm]=aa[2];next;}}{split($1,aa,"|");nm1=aa[1];nm2=$11;ok=1;if (chrom[nm1]==chrom[nm2] && aa1[nm1] aa1[nm2])ok=0;if (ok==1)print}' > nm2nm.2genes.hits.no_doublons 519 | cat NM_chr.segment ZZZZZ nm2nm.2genes.hits | gawk -F '\t' '/^ZZZZZ/{zz++;next;}{nm=$1;if(zz+0<1){aa1[nm]=$2;aa2[nm]=$3;split(nm,aa,":");chrom[nm]=aa[2];next;}}{split($1,aa,"|");nm1=aa[1];nm2=$11;ok=1;if (chrom[nm1]==chrom[nm2] && aa1[nm1] aa1[nm2])ok=0;if (ok==0)print}' > nm2nm.2genes.hits.doublons 520 | # final count of the repeated NM : 291 NM have several mappings 521 | cat nm2nm.2genes.hits.no_doublons | gawk -F '\t' '{n[$1]++;}END{for(k in n)u[n[k]]++;for (v in u) {if(v>0)k+=u[v];kk+=v*u[v];print v, u[v];}print kk}' | sort -k 1n 522 | # so finally we have 291 NM have multiple mapping just by looking at the annotated NM themselves + (43 + 43) from the pseudo autosomal region with single NM and geneid total 291+86=379 523 | cat NM_chr.segment| gawk '{split($1,aa,":");n[aa[1]]++;chrom[aa[1]]=aa[2];}END{for (nm in n)if(n[nm]>1)print nm,n[nm],chrom[nm];}' > NM.pseudo_autosomal_region.mapping_twice 524 | wc NM.pseudo_autosomal_region.mapping_twice 525 | cat nm2nm.2genes.hits.no_doublons | gawk '{split($1,aa,"|");print aa[3] "="$9}' | sed -e 's/>/ gene_pairs 526 | ## construct a cig file for the refseq 527 | # Use NM_...:chrom... as NM identifiers because in the pseudo autosomal region, the same NM maps in 2 places: one NM_ 2 locus 528 | # whereas a usual palindromic exactly duplicated genes has 1 NM per locus, i.e. 
2 NM 2 locus 529 | # this raises the number of NM supported introns from 210357 to 210509 530 | 531 | zcat ../genome.gff.gz | gawk -F '\t' '{if ($3 != "exon") next;}{split($9,aa,"Genbank:");split(aa[2],bb,",");split(bb[1],cc,";");seq=cc[1];if(substr(seq,1,2)!="NM" && substr(seq,1,2)!="zNR")next;seq=seq ":" $1; chrom[seq]=$1;nx[seq]++;i=nx[seq];a1[seq,i]=$4;a2[seq,i]=$5;strand[seq]=$7;}END{for (seq in nx){n=nx[seq];printf("%s\t%s",seq,chrom[seq]);if(strand[seq]=="+"){printf("\t%d\t%d\t", a1[seq,1], a2[seq,n]) ;for(i = 1 ; i <=n ; i++){if(i>1){dx=a1[seq,i]-a2[seq,i-1]-1;printf("%dN",dx);}dx=a2[seq,i]-a1[seq,i]+1;printf("%dM",dx);}}else {printf("\t%d\t%d\t", a1[seq,n], a2[seq,1]) ;for(i = n ; i >=1 ; i--){if(i iRefSeq.cig.gz 532 | mv iRefSeq.cig.gz .. 533 | 534 | endif 535 | 536 | goto phaseLoop 537 | 538 | ############################################################################## 539 | ############################################################################## 540 | ## SAM 541 | ## download the precomputed SAM files from NCBI 542 | phase_Sam: 543 | 544 | set FTP="ftp://ftp.ncbi.nlm.nih.gov/blast/demo/magicblast_article/" 545 | foreach run ($runs) 546 | foreach method ($methods) 547 | 548 | ## the preferred methos is to download the aligned files from NCBI 549 | 550 | # For HISAT and STAR we have a special version of the code to align long runs 551 | # so in these cases we do not atempt to align the long runs with the short code 552 | if ($method == 30_STAR || $method == 32_STAR.2.6c) then 553 | if (-e Fasta/$run/isLongRun) continue 554 | endif 555 | # and vice versa 556 | if ( $method == 20_HISAT2_relaxed) then 557 | if (! -e Fasta/$run/isLongRun) continue 558 | endif 559 | 560 | if (! 
-e $method/$run/$method.$run.sam_sorted.gz) then 561 | mkdir -p $method/$run 562 | pushd $method/$run 563 | wget $FTP/SAM/$method.$run.sam_sorted.gz 564 | popd 565 | endif 566 | 567 | end 568 | end 569 | 570 | goto phaseLoop 571 | 572 | ############################################################################## 573 | ############################################################################## 574 | ## ALIGN Run all aligners on all runs 575 | 576 | phase_Align: 577 | 578 | foreach run ($runs) 579 | foreach method ($methods) 580 | 581 | if (! -e Aligners/$method/align.tcsh) then 582 | echo "missing script Aligners/$method/align.tcsh" 583 | continue 584 | endif 585 | 586 | # For HISAT and STAR we have a special version of the code to align long runs 587 | # so in these cases we do not atempt to align the long runs with the short code 588 | if ($method == 30_STAR || $method == 32_STAR.2.6c) then 589 | if (-e Fasta/$run/isLongRun) continue 590 | endif 591 | # and vice versa 592 | if ( $method == 20_HISAT2_relaxed) then 593 | if (! -e Fasta/$run/isLongRun) continue 594 | endif 595 | 596 | if (-e Aligners/$method/align.tcsh && ! 
-e $method/$run/$method.$run.sam_sorted.gz) then 597 | set read_1="x" 598 | set read_2="" 599 | if (-e Fasta/$run/$run'_1.fastq.gz') set read_1=Fasta/$run/$run'_1.fastq.gz' 600 | if (-e Fasta/$run/$run'_2.fastq.gz') set read_2=Fasta/$run/$run'_2.fastq.gz' 601 | if (-e Fasta/$run/$run'_1.fasta.gz') set read_1=Fasta/$run/$run'_1.fasta.gz' 602 | if (-e Fasta/$run/$run'_2.fasta.gz') set read_2=Fasta/$run/$run'_2.fasta.gz' 603 | if (-e Fasta/$run/$run.fasta.gz) set read_1=Fasta/$run/$run.fasta.gz 604 | if (-e Fasta/$run/$run.fastq.gz) set read_1=Fasta/$run/$run.fastq.gz 605 | if (-e Fasta/$run/$run.forward.fastq.gz) set read_1=Fasta/$run/$run.forward.fastq.gz 606 | if (-e Fasta/$run/$run.reverse.fastq.gz) set read_2=Fasta/$run/$run.reverse.fastq.gz 607 | if (-e Fasta/$run/$run.forward.fasta.gz) set read_1=Fasta/$run/$run.forward.fasta.gz 608 | if (-e Fasta/$run/$run.reverse.fasta.gz) set read_2=Fasta/$run/$run.reverse.fasta.gz 609 | 610 | if (! -e $read_1) then 611 | echo "Run $run Missing read file $read_1" 612 | ls -ls Fasta/$run/*fast* 613 | continue 614 | endif 615 | set target=`cat Fasta/$run/target` 616 | if (! 
-d $method/$run) mkdir -p $method/$run 617 | echo "align $method/$run" 618 | scripts/submit $method/$run "Aligners/$method/align.tcsh $method $run $target $read_1 $read_2" 619 | # scripts/submit $method/$run "Aligners/$method/align.tcsh $method $run $target $read_1 $read_2" 64G UGE4 620 | endif 621 | end 622 | end 623 | 624 | goto phaseLoop 625 | 626 | ############################################################################## 627 | ############################################################################## 628 | ## Intron, exon, insertion deletion comparison to the TRUTH 629 | ## Compare the alignment results, provided in BAM format 630 | ## to the GOLD standard truth from iRefSeq and Baruzzo given in .cig format 631 | ## The source C-code is part of the aceview/magic distribution www.aceview.org/Software 632 | ## The executable for LINUX 64 bits is in bin 633 | ## 634 | ## sam2gold produces several output files 635 | ## .qc a small self documented statistics table in tab delimited format 636 | ## .aliqc the same statistics in a more computer friendly tag-values tab delimited format 637 | ## .Intron a table giving the coordinates of all introns, with support in GOLD or BAM 638 | ## .Deletion a table giving the coordinates of all deletions, with support in GOLD or BAM 639 | ## .Insertion a table giving the coordinates of all insertions, with support in GOLD or BAM 640 | 641 | phase_Accuracy: 642 | foreach run ($runs) 643 | if (-e Fasta/iRefSeq/iRefSeq.cig.gz && ! -e Fasta/$run/$run.cig.gz) then 644 | pushd Fasta/$run 645 | ln -s ../iRefSeq/iRefSeq.cig.gz $run.cig.gz 646 | popd 647 | endif 648 | end 649 | # Illumina $HG19_runs $PFAL_runs 650 | #$main_runs $pacbio_runs $long_illumina_runs $HG19_runs $PFAL_runs $methods 651 | foreach run ($runs) 652 | foreach mm ($methods) 653 | if (-e Fasta/$run/$run.cig.gz && -e $mm/$run/$mm.$run.sam_sorted.gz && ! 
-e $mm/$run/$mm.$run.delins.tsv) then 654 | echo $mm/$run/$mm.$run.sam_sorted.gz 655 | set arp="" 656 | set arp=`echo $mm | gawk 'BEGIN{arp="";}{if(index($1,"STAR")>0) arp="-addReadPairSuffix"}END{printf("%s",arp);}'` 657 | \rm $mm/$run/$mm.$run.sam2gold.* 658 | scripts/submit $mm/$run/$mm.$run.sam2gold "bin/sam2gold $arp -g $run..GOLD:Fasta/$run/$run.cig.gz -i $run..$mm':'$mm/$run/$mm.$run.sam_sorted.gz -o $mm/$run/$mm.$run" 659 | endif 660 | end 661 | end 662 | 663 | goto phaseLoop 664 | 665 | ############################################################################## 666 | ############################################################################## 667 | ## Alignment quality control 668 | ## Evaluate in great details the intrinsic quality of the alignment results, provided in BAM format 669 | ## This analysis does not refer to the gold truth 670 | ## This is a python.2.7 scripts given in scripts/AliQC.py 671 | ## It was developped in collaboration with Joe Meehan, FDA, for the MAQC/SEQC project 672 | ## There is a dependency, one must fisrt install HTSeq as explained in the previous section 673 | ## 674 | ## aliqc produces a computer friendly tag-values tab delimited format .aliqc.tsv 675 | ## In the following section aliqc is used again to merge these file into a single table 676 | 677 | phase_aliqc: 678 | 679 | # create sam_sorted only once 680 | set ok=1 681 | foreach mm ($methods) 682 | foreach run ($runs) 683 | if (-e $mm/$run/$mm.$run.bam && ! 
-e $mm/$run/$mm.$run.sam_sorted.gz) then 684 | set ok=0 685 | echo "transformng $mm/$run/$mm.$run.bam into .sam_sorted.gz" 686 | scripts/submit $mm/$run/$mm.$run.samview "samtools view $mm/$run/$mm.$run.bam | sort -T $TMPDIR | gzip > $mm/$run/$mm.$run.sam_sorted.gz" 687 | endif 688 | end 689 | end 690 | if ($ok == 0) goto phaseLoop 691 | 692 | # use sam_sorted.gz rather than bam (aliqc --BAM also works, but it need to call sort which is very costly) 693 | # PacBio Roche iRefSeq SRR5189652 SRR5189667 HG19t1r1 HG19t1r2 HG19t2r1 HG19t2r2 HG19t3r1 PFALt1r1 PFALt1r2 PFALt1r3 PFALt2r1 PFALt2r2 PFALt2r3 PFALt3r1 PFALt3r2 PFALt3r3 SRR5438850 694 | foreach minAli (0 50 80) 695 | foreach run ($runs) 696 | foreach mm ($methods) 697 | if (-e $mm/$run/$mm.$run.sam_sorted.gz && -e Fasta/$run/genome.gz && ! -e $mm/$run/$mm.$run.minAli$minAli.aliqc.tsv) then 698 | echo "Running AliQC.py on $mm/$run/$mm.$run.sam_sorted.gz" 699 | set nreads=`cat Fasta/$run/*.count | gawk '/^Fragment_kept/{n+=$2}END{print n}'` 700 | if ($nreads == 0) then 701 | echo "missing file Fasta/$run/$run.count, please run phase 1" 702 | else 703 | echo "Running AliQC.py $mm $run" 704 | scripts/submit $mm/$run/aliqc.minali$minAli "python3 scripts/AliQC.py --SAMSORTEDGZ -i $mm/$run/$mm.$run.sam_sorted.gz -r $run..$mm.minAli$minAli -f Fasta/$run/genome.gz -o $mm/$run/$mm.$run.minAli$minAli --nreads $nreads --minAli $minAli" 705 | endif 706 | endif 707 | end 708 | end 709 | end 710 | 711 | goto phaseLoop 712 | 713 | ############################################################################## 714 | ############################################################################## 715 | ## Direct statistics of the error counts reported in the BAM files 716 | ## The AliQC.py code, above, parses the bam file and the genome 717 | ## and computed its own evaluation of the number of error per aligned read 718 | ## In the present script, we rely on the presence in the BAM files of the NM:i:x 719 | ## optional field, 
collate the X values and report the statistics 720 | ## Hopefully, the 2 methods should be compatible, but they do not have 721 | ## to agree exactly since aliqc counts a double or triple deletion as a single event 722 | ## and some aligners may report it as 2 or 3 errors 723 | 724 | phase_DirectErrorCount: 725 | echo phase_DirectErrorCount 726 | foreach mm ($methods) 727 | foreach run ($runs) 728 | echo "phase_DirectErrorCount $mm $run" 729 | if (-e $mm/$run/$mm.$run.sam_sorted.gz && ! -e $mm/$run/$mm.$run.nerrors) then 730 | scripts/submit $mm/$run/$mm.$run.nerrors "scripts/directErrorCount.tcsh $mm $run" 731 | endif 732 | end 733 | end 734 | 735 | # export the results in a single human readable table 736 | set toto=RESULTS/Error_counts_using_NM_optional_tag.txt 737 | echo -n "## $toto :" > $toto 738 | if (-e $toto.1) \rm $toto.1 739 | date >> $toto 740 | foreach mm ($methods) 741 | foreach run ($runs) 742 | if (-e $mm/$run/$mm.$run.nerrors) then 743 | echo -n "$mm\t$run\t" >> $toto.1 744 | cat $mm/$run/$mm.$run.nerrors | gawk '/^#/{next;}{n[$1]=$2;if($1>max)max=$1;}END{for(i=-5;i<=max;i++)printf("\t%d",n[i]);printf("\n")}' >> $toto.1 745 | endif 746 | end 747 | end 748 | cat $toto.1 | gawk '{if(NF>max)max=NF;}END{printf("\t\t");max-=6;for(i=-5;i<=max;i++)printf("\t%d",i);printf("\n")}' > $toto.2 749 | cat $toto.2 $toto.1 | gawk -F '\t' -f scripts/transpose.awk >> $toto 750 | \rm $toto.1 $toto.2 751 | echo "The table of errors using the optional NM tag of the BAM files is in" 752 | echo $toto 753 | 754 | goto phaseLoop 755 | 756 | ############################################################################## 757 | ############################################################################## 758 | ## Count the substitutions as declared in the Baruzzo Benchmark 759 | ## The statistics only measures the substitutions in the r3 reads 760 | ## fully and continuously aligned on the plus strand of chromosome 8 761 | ## This seems sufficient since it involves in each 
case at least 100,000 reads 762 | 763 | phase_Count_subtitutions_in_benchmark: 764 | 765 | # there are several phases in the calculation 766 | # 1: select the full reads, characterized by a cigar string 100M 767 | if (! -e SUBS) mkdir SUBS 768 | foreach sp (HG19 PFAL) 769 | foreach tt (t1 t2 t3) 770 | if (-e Fasta/$sp$tt'r3'/$sp$tt'r3'.cig.gz && ! -e SUBS/subs.$sp$tt) then 771 | zcat Fasta/$sp$tt'r3'/$sp$tt'r3'.cig.gz | grep chr8 | grep '+' | grep 100M | cut -f 1,2,3,4,8 > SUBS/subs.$sp$tt 772 | endif 773 | end 774 | end 775 | 776 | # 2: construct a 6 columns tab delimited shadow file, summarizing the coordinate of the alignments 777 | foreach sp (HG19 PFAL) 778 | foreach tt (t1 t2 t3) 779 | if (-e SUBS/subs.$sp$tt && ! -e SUBS/subs.$sp$tt.shadow ) then 780 | cat SUBS/subs.$sp$tt | gawk -F '\t' '{printf("%s\t1\t100\t%s\t%d\t%d\n",$1,$2,$3,$4);}' > SUBS/subs.$sp$tt.shadow 781 | endif 782 | end 783 | end 784 | 785 | # 3: isolate the genome of chromosome 8, using the dna2dna utility 786 | foreach sp (HG19 PFAL) 787 | if (-e Reference_genome/$sp.Baruzzo.genome.fasta.gz && ! -e Reference_genome/$sp.Baruzzo.chr8.fasta.gz) then 788 | bin/dna2dna -i Reference_genome/$sp.Baruzzo.genome.fasta.gz -I fasta -O fasta -keepName -o Reference_genome/$sp.Baruzzo.chr8 -gzo 789 | endif 790 | end 791 | 792 | # 4: Export the corresponding genomic segment in raw format 793 | # The raw format has just 2 tab delimited columns: atgcatgc identifier 794 | # Notice that dna2dna is very versatile, it can directly export messenger RNAs given a genome and a gff file. 795 | # try bin/dna2dna --help for a full list of functionalities 796 | foreach sp (HG19 PFAL) 797 | foreach tt (t1 t2 t3) 798 | if (-e SUBS/subs.$sp$tt.shadow && -e Reference_genome/$sp.Baruzzo.chr8.fasta.gz && ! 
-e SUBS/subs.$sp$tt.raw) then 799 | dna2dna -i Reference_genome/$sp.Baruzzo.chr8.fasta.gz -shadow SUBS/subs.$sp$tt.shadow -O raw -keepName > SUBS/subs.$sp$tt.raw 800 | endif 801 | end 802 | end 803 | 804 | # 5: the first subs file contains in column 1 and 5 the name and sequence of each read 805 | # the raw file contains in column 2 and 1 the name and sequence of each corresponding genomic segment 806 | # Both sequences are exactly 100 bases long, hence a simple awk script is sufficient to count the mismatching bases 807 | echo ZZZZZ > SUBS/ZZZZZ 808 | foreach sp (HG19 PFAL) 809 | foreach tt (t1 t2 t3) 810 | cat SUBS/subs.$sp$tt SUBS/ZZZZZ SUBS/subs.$sp$tt.raw | gawk -F '\t' '/^ZZZZZ/{zz=1;}{if(zz<1){seq[$1]=$5;next;}if (seq[$2]){s1=seq[$2];s2=toupper($1);n=0;for(i=1;i<=100;i++)if(substr(s1,i,1) != substr(s2,i,1))n++;print n}}' | tags | sort -k 1n > SUBS/subs.$sp$tt.txt & 811 | end 812 | end 813 | 814 | # 6: produce a final synthetic table 815 | set toto=RESULTS/mm_stats.Baruzzo.txt 816 | echo -n "### $toto : " > $toto 817 | date >> $toto 818 | foreach sp (HG19 PFAL) 819 | foreach tt (t1 t2 t3) 820 | cat SUBS/subs.$sp$tt.txt | gawk '{n1+=$2;n2+=$1*$2;printf ("%s\t%d\t%d\t%d\t%d\n",t,$1,$2,n1,n2);}' t=$sp$tt >> $toto 821 | end 822 | end 823 | echo "The distribution of substitutions in chromosome 8 in tbe Baruzzo benchmark" 824 | echo "have been exported in $toto" 825 | head -12 $toto 826 | 827 | goto phaseLoop 828 | 829 | ############################################################################## 830 | ############################################################################## 831 | ## Exportation of global, human readable, quality control tables 832 | ## These tables where used directly to prepare the plots and table of the 833 | ## Magic-BLAST paper 834 | 835 | phase_Export: 836 | 837 | if (! 
-d RESULTS) mkdir RESULTS 838 | 839 | # collate the aliqc.tsv tables from all runs using again the AliQC.py scripts with --table option 840 | # this will produce 3 output tables 841 | foreach minAli (0 50 80) 842 | if (-e toto) \rm toto 843 | foreach method ($methods) 844 | cat $method/*/*.minAli$minAli.aliqc.tsv >> toto 845 | end 846 | cat toto | python3 scripts/AliQC.py --view table --split --minAli $minAli -o RESULTS/magicblast.table.minAli$minAli 847 | end 848 | 849 | # reformat the 3 output tables 850 | foreach minAli (0 50 80) 851 | foreach type (mismatch_histo_and_types mismatch_per_cycle_per_kb_aligned aligned_reads_multiplicity_length_aligned) 852 | set toto=RESULTS/magicblast.table.minAli$minAli.$type 853 | cat $toto.tsv | head -12 | gawk -F '\t' '/^##/{next;}/^#/{print}' > $toto.txt 854 | cat $toto.tsv | gawk -F '\t' '/^###/{next;}/^##/{print}' >> $toto.txt 855 | cat $toto.tsv | gawk -F '\t' '/^#/{next;}{print}' | sort -k 1,1 -k 2,2 -k 3,3 | grep -v r2 | grep -v r3 | sed -e 's/r1//g' | gawk -F '\t' '{if ($2 != old2){printf("\n");old2=$2;old3="";}}{if ($3 != old3){printf("\n");old3=$3}}{print}' >> $toto.txt 856 | cat $toto.tsv | gawk -F '\t' '/^###/{print}' >> $toto.txt 857 | end 858 | end 859 | 860 | set toto=RESULTS/Mapping_accuracy.txt 861 | echo -n "## $toto :" > $toto 862 | date >> $toto 863 | cat scripts/mapping_accuracy.header >> $toto 864 | 865 | cat */*/*.introns.tsv | sed -e 's/\.\./\t/' | grep GoldMap | grep -v GOLD | sort -k 2,2 -k 3,3 | gawk -F '\t' -f scripts/introns_precision_recall.awk | sort -u | sed -e 's/t1r/T1R/g' -e 's/t2r/T2R/g' -e 's/t3r/T3R/g' | gawk -F '\t' '/^#/{print;next;}{if($2!=old)printf("\n");old=$2;print;}' | sed -e 's/PFAL/Malaria/g' -e 's/STARlongzz/STAR long/g' >> $toto 866 | 867 | set toto=RESULTS/Mapping_accuracy.light.txt 868 | echo -n "## $toto :" > $toto 869 | date >> $toto 870 | cat scripts/mapping_accuracy.header >> $toto 871 | 872 | cat */*/*.introns.tsv | grep -v R2 | grep -v R3 | sed -e 's/R1//g' | grep -v SRR 
| grep -v Simulated | sed -e 's/\.\./\t/' | grep GoldMap | grep -v GOLD | sort -k 2,2 -k 3,3 | gawk -F '\t' -f scripts/introns_precision_recall.awk | sort -u | sed -e 's/r1//g' | grep -v r2 | grep -v r3 | gawk -F '\t' '/^#/{print;next;}{if($2!=old)printf("\n");old=$2;print;}' >> $toto 873 | 874 | # goto phaseLoop 875 | 876 | ##### INTRONS report Insertion Deletion 877 | 878 | foreach type (Intron Insertion Deletion) 879 | 880 | set toto1="RESULTS/$type"_per_coverage_stats.txt 881 | set toto1L="RESULTS/$type"_per_coverage_stats.light.txt 882 | set toto3="RESULTS/$type"_per_coverage_stats.best.txt 883 | set toto4="RESULTS/$type"_per_coverage_stats.1support.txt 884 | set toto3L="RESULTS/$type"_per_coverage_stats.best.light.txt 885 | set toto4L="RESULTS/$type"_per_coverage_stats.1support.light.txt 886 | echo -n "### $toto1 :" > $toto1 887 | echo -n "### $toto3 :" > $toto3 888 | echo -n "### $toto4 :" > $toto4 889 | echo -n "### $toto1L :" > $toto1L 890 | echo -n "### $toto3L :" > $toto3L 891 | echo -n "### $toto4L :" > $toto4L 892 | date >> $toto1 893 | date >> $toto3 894 | date >> $toto4 895 | date >> $toto1L 896 | date >> $toto3L 897 | date >> $toto4L 898 | 899 | if ($type == Intron) then 900 | echo "## An alignment supporting an intron is defined by a line in the SAM/BAM file where the CIGAR contains an N with minimal accepted intron length 50 bases" > $toto1.1 901 | echo "## When a read is aligned at multiple sites, each of its alignments supporting an intron is counted" >> $toto1.1 902 | echo "## Some spliced genes are truly repeated, some are very similar. 
If one rejects all multiply aligned reads, the introns of these genes cannot be detected," >> $toto1.1 903 | echo "## Therefore, we keep the introns detected by multiply aligned reads, and do not artificially overestimate the specificity of methods unable to select the true positions" >> $toto1.1 904 | echo "## Note that in the benchmark, all reads are uniquely aligned, yet some support multiple neighboring introns" >> $toto1.1 905 | else 906 | echo "## An alignment supporting an indel is defined by a line in the SAM/BAM file where the CIGAR contains an I or o D" > $toto1.1 907 | echo "## When a read is aligned at multiple sites, each of its alignments supporting an indel is counted" >> $toto1.1 908 | endif 909 | 910 | echo -n "# Species\tRun\tMethod\tMinimal $type support" >> $toto1.1 911 | echo "\t$type in benchmark\t$type discovered in method\tFP: False positive $type\tTP: True positive $type\tFN: False negative $type\t$type discovery precision p=TP/(TP+FP)\t$type discovery recall r=TP/(TP+FN)\t$type discovery F-score 2pr/(p+r)" >> $toto1.1 912 | 913 | cat $toto1.1 >> $toto1 914 | cat $toto1.1 >> $toto1L 915 | 916 | if (-e $toto1.2) \rm $toto1.2 917 | 918 | foreach mm ($methods) 919 | cat $mm/*/*.delins.tsv | grep $type | sed -e 's/on_/on/' -e "s/$type//" | grep -v GOLD | sort -k 1,1n -k 2,2 | sed -e 's/\.\./\t/' -e 's/g_//' | gawk '/^#/{next;}/GOLD/{next;}{support=$1;species="Human";run=$2;if(support > 200 && run != "Illumina")next;if(substr(run,1,4)=="PFAL")species="Malaria";method=$3;printf("%s\t%s\t%s\t%d",species,run,method,support); printf("\t%d\t%d\t%d\t%d\t%d\t%s\t%s\t%s\n", $5, $6,$8,$7,$9,$10,$11,$12);}' >> $toto1.2 920 | end 921 | 922 | cat $toto1.2 | sort -k 1,1 -k 2,2 -k 3,3 -k 4,4n > $toto1.3 923 | cat $toto1.3 | gawk -F '\t' '/^#/{print;next;}{if($3!=old)printf("\n\n");old=$3;print;}' >> $toto1 924 | cat $toto1.3 | grep -v r2 | grep -v r3 | grep -v SRR | grep -v Simulated | gawk -F '\t' '/^#/{print;next;}{if($3!=old)printf("\n\n");old=$3;print;}' 
>> $toto1L 925 | 926 | echo "## Limited to best support-depth" >> $toto3 927 | cat $toto1.1 >> $toto3 928 | cat $toto1.3 | gawk -F '\t' 'BEGIN{best=0;}{z=$1 "\t" $2 "\t" $3; if (z != old) {old=z;if(bestScore)print best;bestScore=$12;best=$0;}if($12>bestScore){bestScore=$12;best=$0;}}END{if(bestScore)print best;}' | gawk -F '\t' '/^#/{print;next;}{if($2!=old)printf("\n");old=$2;print;}' >> $toto3 929 | 930 | echo "## Limited to best support-depth" >> $toto3L 931 | cat $toto1.1 >> $toto3L 932 | cat $toto1.3 | grep -v r2 | grep -v r3 | grep -v SRR | grep -v Simulated | gawk -F '\t' 'BEGIN{best=0;}{z=$1 "\t" $2 "\t" $3; if (z != old) {old=z;if(bestScore)print best;bestScore=$12;best=$0;}if($12>bestScore){bestScore=$12;best=$0;}}END{if(bestScore)print best;}' | gawk -F '\t' '/^#/{print;next;}{if($2!=old)printf("\n");old=$2;print;}' >> $toto3L 933 | 934 | echo "## Limited to 1 support" >> $toto4 935 | cat $toto1.1 >> $toto4 936 | cat $toto1.3 | gawk -F '\t' '{if($4==1)print}' | gawk -F '\t' '/^#/{print;next;}{if($2!=old)printf("\n");old=$2;print;}' >> $toto4 937 | 938 | echo "## Limited to 1 support" >> $toto4L 939 | cat $toto1.1 >> $toto4L 940 | cat $toto1.3 | grep -v r2 | grep -v r3 | grep -v SRR | grep -v Simulated | gawk -F '\t' '{if($4==1)print}' | gawk -F '\t' '/^#/{print;next;}{if($2!=old)printf("\n");old=$2;print;}' >> $toto4L 941 | 942 | 943 | #\rm $toto1.[123] 944 | end 945 | # goto phaseLoop # fall through to aliLn 946 | 947 | ############################################################################## 948 | ############################################################################## 949 | ####### aliLn 950 | 951 | phase_aliLn: 952 | 953 | set toto=RESULTS/Aligned_length.histo.txt 954 | echo -n "## $toto : " > $toto 955 | date >> $toto 956 | 957 | \rm $toto.* 958 | foreach run (iRefSeq Roche PacBio Illumina SRR5189652 SRR5189667 SRR5437876 SRR5438850) 959 | 960 | if (! 
-e Fasta/$run/$run.TM) then 961 | if (-e Fasta/$run/$run.fastq.gz) then 962 | bin/dna2dna -i Fasta/$run/$run.fastq.gz -I fastq -getTM > Fasta/$run/$run.TM 963 | endif 964 | endif 965 | if (! -e Fasta/$run/$run.TM) then 966 | if (-e Fasta/$run/$run.fasta.gz) then 967 | bin/dna2dna -i Fasta/$run/$run.fasta.gz -I fasta -getTM > Fasta/$run/$run.TM 968 | endif 969 | endif 970 | if (! -e Fasta/$run/$run.TM) then 971 | if (-e Fasta/$run/$run'_1'.fasta.gz) then 972 | zcat Fasta/$run/$run'_'?.fasta.gz | bin/dna2dna -I fasta -getTM > Fasta/$run/$run.TM 973 | endif 974 | endif 975 | 976 | set delta=1 977 | if ($run == Roche) set delta=10 978 | if ($run == PacBio) set delta=30 979 | if ($run == SRR5189652) set delta=30 980 | if ($run == SRR5189667) set delta=30 981 | if ($run == SRR5437876) set delta=30 982 | if ($run == SRR5438850) set delta=30 983 | if ($run == iRefSeq) set delta=100 984 | foreach mm ($methods) 985 | if (-e $mm/$run/$mm.$run.aliLn) then 986 | cat $mm/$run/$mm.$run.aliLn | gawk '{k=int(($1+delta - 1)/delta) ; if(k>900)k=900;nn[k] += $2;if(k>kMax)kMax =k;}END{for (i = 0 ; i <= kMax ; i++) printf ("%s\t%s\t%d\t%d\n", rr, mm, delta*i, nn[i]+0);}' delta=$delta mm=$mm rr=$run >> $toto.1 987 | endif 988 | end 989 | end 990 | 991 | echo "Illumina\ttruth\t101\t217498656" >> $toto.1 992 | echo "SRR5437876\ttruth\t300\t32935604" >> $toto.1 993 | 994 | cat Fasta/iRefSeq/iRefSeq.TM | gawk -F '\t' '/^#/{next;}{k=int(($2+delta - 1)/delta) ; if(k>900)k=900;nn[k]++;if(k>kMax)kMax =k;}END{for (i = 0 ; i <= kMax ; i++) printf ("%s\t%s\t%d\t%d\n", rr, mm, delta*i, nn[i]+0);}' delta=100 mm="truth" rr=iRefSeq > $toto.1a 995 | cat Fasta/PacBio/PacBio.TM | gawk -F '\t' '/^#/{next;}{k=int(($2+delta - 1)/delta) ; if(k>900)k=900;nn[k]++;if(k>kMax)kMax =k;}END{for (i = 0 ; i <= kMax ; i++) printf ("%s\t%s\t%d\t%d\n", rr, mm, delta*i, nn[i]+0);}' delta=30 mm="truth" rr=PacBio >> $toto.1a 996 | cat Fasta/SRR5189652/SRR5189652.TM | gawk -F '\t' '/^#/{next;}{if($2<19)next;k=int(($2+delta - 
1)/delta) ; if(k>900)k=900;nn[k]++;if(k>kMax)kMax =k;}END{for (i = 0 ; i <= kMax ; i++) printf ("%s\t%s\t%d\t%d\n", rr, mm, delta*i, nn[i]+0);}' delta=30 mm="truth" rr=SRR5189652 >> $toto.1a 997 | cat Fasta/SRR5189667/SRR5189667.TM | gawk -F '\t' '/^#/{next;}{if($2<19)next;k=int(($2+delta - 1)/delta) ; if(k>900)k=900;nn[k]++;if(k>kMax)kMax =k;}END{for (i = 0 ; i <= kMax ; i++) printf ("%s\t%s\t%d\t%d\n", rr, mm, delta*i, nn[i]+0);}' delta=30 mm="truth" rr=SRR5189667 >> $toto.1a 998 | cat Fasta/Roche/Roche.TM | gawk -F '\t' '/^#/{next;}{if($2<19)next;k=int(($2+delta - 1)/delta) ; if(k>900)k=900;nn[k]++;if(k>kMax)kMax =k;}END{for (i = 0 ; i <= kMax ; i++) printf ("%s\t%s\t%d\t%d\n", rr, mm, delta*i, nn[i]+0);}' delta=10 mm="truth" rr=Roche >> $toto.1a 999 | 1000 | 1001 | 1002 | echo "Human_T1\ttruth\t100\t20000000" > $toto.t0 1003 | echo "Human_T2\ttruth\t100\t20000000" >> $toto.t0 1004 | echo "Human_T3\ttruth\t100\t20000000" >> $toto.t0 1005 | echo "Malaria_T1\ttruth\t100\t20000000" >> $toto.t0 1006 | echo "Malaria_T2\ttruth\t100\t20000000" >> $toto.t0 1007 | echo "Malaria_T3\ttruth\t100\t20000000" >> $toto.t0 1008 | 1009 | set delta=1 1010 | foreach tt (1 2 3) 1011 | if (-e $toto.t$tt) \rm $toto.t$tt 1012 | foreach mm ($methods) 1013 | cat $mm/HG19t$tt'r1'/$mm.HG19t$tt'r1'.aliLn | gawk '{k=int(($1+delta - 1)/delta) ; if(k>900)k=900;nn[k] += $2;if(k>kMax)kMax =k;}END{for (i = 0 ; i <= kMax ; i++) printf ("%s\t%s\t%d\t%d\n", rr, mm, delta*i, nn[i]+0);}' delta=$delta mm=$mm rr=Human_T$tt >> $toto.t$tt 1014 | cat $mm/PFALt$tt'r1'/$mm.PFALt$tt'r1'.aliLn | gawk '{k=int(($1+delta - 1)/delta) ; if(k>900)k=900;nn[k] += $2;if(k>kMax)kMax =k;}END{for (i = 0 ; i <= kMax ; i++) printf ("%s\t%s\t%d\t%d\n", rr, mm, delta*i, nn[i]+0);}' delta=$delta mm=$mm rr=Malaria_T$tt >> $toto.t$tt 1015 | end 1016 | 1017 | end 1018 | 1019 | cat $toto.t[0123] >> $toto.1 1020 | cat $toto.1 | sort -k 1,1 -k 3,3n -k 2,2 > $toto.2 1021 | cat $toto.1a | sort -k 1,1 -k 3,3n -k 2,2 >> $toto.2 1022 | 
1023 | echo -n "## $toto :" > $toto 1024 | if (-e $toto.5) \rm $toto.5 1025 | date >> $toto 1026 | echo "## Histogram of length to be aligned in truth dataset,and aligned by each program. Each read is counted only once, at the location of its BAM primary alignment (excluding the secondaries with flag 256)" >> $toto 1027 | foreach rr (iRefSeq Roche PacBio Illumina $pacbio_runs $long_illumina_runs) 1028 | echo "# $rr\tTruth" > $toto.4 1029 | cat $toto.2 | grep $rr | gawk -F '\t' '{k=$3;n=$4;m=$2;if(k>kMax)kMax=k;kk[k]=1;nn[m,k]=n;}END{kk[0]=1;for (k=0;k<=kMax;k++)if(kk[k]>0)printf("%d\t%d\n", k,nn["truth",k]);}' >> $toto.4 1030 | echo "\n\n" >> $toto.4 1031 | cat $toto.4 | scripts/transpose >> $toto.5 1032 | echo "\n\n" >> $toto.5 1033 | end 1034 | # 1035 | foreach rr (iRefSeq Roche PacBio Illumina $pacbio_runs $long_illumina_runs) 1036 | echo -n "# $rr\tTruth" > $toto.4 1037 | foreach mm ($methods) 1038 | echo -n "\t$mm" >> $toto.4 1039 | end 1040 | cat $toto.2 | grep $rr | gawk -F '\t' '{k=$3;n=$4;m=$2;if(k>kMax)kMax=k;kk[k]=1;nn[m,k]=n;}END{kk[0]=1;for (k=0;k<=kMax;k++)if(kk[k]>0)printf("%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\n", k,nn["truth",k],nn["10_MagicBLAST",k],nn["20_HISAT2_relaxed",k],nn["21_HISAT2",k],nn["30_STAR",k],nn["31_STARlong",k],nn["32_STAR.2.6c",k],nn["40_TopHat2",k]);}' >> $toto.4 1041 | echo "\n\n" >> $toto.4 1042 | cat $toto.4 | scripts/transpose >> $toto.5 1043 | echo "\n\n" >> $toto.5 1044 | end 1045 | 1046 | foreach rr (Human_T1 Human_T2 Human_T3 Malaria_T1 Malaria_T2 Malaria_T3) 1047 | echo -n "# $rr\tTruth" > $toto.4 1048 | foreach mm ($methods) 1049 | echo -n "\t$mm" >> $toto.4 1050 | end 1051 | cat $toto.2 | grep $rr | gawk -F '\t' '{k=$3;n=$4;m=$2;if(k>kMax)kMax=k;kk[k]=1;nn[m,k]=n;}END{kk[0]=1;for (k=0;k<=kMax;k++)if(kk[k]>0)printf("%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\n", k,nn["truth",k],nn["10_MagicBLAST",k],nn["20_HISAT2_relaxed",k],nn["21_HISAT2",k],nn["30_STAR",k],nn["31_STARlong",k],nn["32_STAR.2.6c",k],nn["40_TopHat2",k]);}' >> 
$toto.4 1052 | echo "\n\n" >> $toto.4 1053 | cat $toto.4 | scripts/transpose >> $toto.5 1054 | echo "\n\n" >> $toto.5 1055 | end 1056 | 1057 | cat $toto.5 | scripts/transpose | sed -e 's/Truth/Actual reads/g' >> $toto 1058 | 1059 | cat RESULTS/Intron_per_coverage_stats.txt | gawk -F '\t' '/^#/{next}{z=$1 "\t" $2 "\t" $3;k=0+n[z];if($4>k)n[z]=$4;}END{for(k in n)printf("%s\t%d\n",k,n[k]);printf("toto\n");}' | sort | gawk -F '\t' 'BEGIN{printf("# Species\tRun\tMethods\tMaximal intron support\n")}{z=$1 "\t" $2;if (z != oldz){if(length(oldz)>3)printf("%s\t%s\t%s\n",oldz,substr(m[oldz],2),substr(n[oldz],2));m[z]="";n[z]="";}oldz=z;m[z]=m[z]","$3;n[z]=n[z]","$4;}' > RESULTS/Intron_per_coverage_stats.title.txt 1060 | \rm $toto.* 1061 | 1062 | goto phaseLoop 1063 | 1064 | ############################################################################## 1065 | ############################################################################## 1066 | ## Done 1067 | 1068 | phase6: 1069 | phaseLoop: 1070 | echo -n "$1 done : " 1071 | date 1072 | 1073 | ############################################################################## 1074 | ############################################################################## 1075 | ############################################################################## 1076 | ############################################################################## 1077 | 1078 | -------------------------------------------------------------------------------- /article/scripts/README: -------------------------------------------------------------------------------- 1 | # Aug 1st, 2018 2 | # Author: Jean Thierry-Mieg, NCBI/NLM/NIH 3 | # For questions, please email mieg@ncbi.nlm.nih.gov 4 | 5 | 6 | The present directory contains scripts used in the Magic-BLAST paper analysis. 7 | 8 | =========== 9 | 10 | MagicBlastPaperMasterScript.tcsh 11 | is the main scripts driving the whole analysis, it calls the other scripts and the C executables 12 | present in the bin directory. 
It is described in the supplementary material of the paper. 13 | 14 | How to use this script is described in the file ../README in the parent directory 15 | 16 | Please check the file ../bin/README and verify if the executables work correctly on your machine 17 | The main difficulty may come from the installation of HTSeq needed by AliQC.py. 18 | 19 | ========== 20 | 21 | AliQC.py 22 | is a python code developed in collaboration with Joe Meehan, FDA, for the 23 | MAQC/SEQC project led by Leming Shi several years ago. It was ported 24 | to python3 for the present project. The code has several functionalities; 25 | try 'python3 AliQC.py --help', and it is called with the proper parameters 26 | by the master script. The purpose is to scan the SAM files, which can appear 27 | indifferently as BAM, SAM, or SAM.gz files, and to extract detailed statistics 28 | on the alignments and error patterns. 29 | Notice that AliQC.py does not import a truth file, so all 30 | the measurements are intrinsic to the SAM file. 31 | Comparisons to the 'truth' of the benchmarks are delegated to the C code 'bin/sam2gold'. 32 | AliQC calls HTSeq to parse the SAM files. 33 | 34 | ========== 35 | 36 | HTSeq is described in the following publication: 37 | 38 | Simon Anders, Paul Theodor Pyl, Wolfgang Huber 39 | HTSeq — A Python framework to work with high-throughput sequencing data 40 | Bioinformatics 2015 Jan 15; 31(2): 166–169. 41 | 42 | It is a python library that is used by AliQC.py solely to parse the SAM files into 43 | an object-oriented format, the analysis itself being programmed in AliQC.py. 44 | It is very possible that HTSeq will not work on your machine because it directly 45 | imports precompiled python object modules (.so files).
If it does not work, please 46 | follow the instructions in scripts/HTSeq/README 47 | 48 | =========== 49 | 50 | transpose (which calls transpose.awk) 51 | is a simple utility to transpose any tab delimited file 52 | Usage 53 | cat f1 | scripts/transpose > f2 54 | It is used by the other scripts. 55 | 56 | =========== 57 | 58 | tags 59 | is a simple utility to sort and count entries in the first column of a file 60 | Usage 61 | tags f1 # in this case the first column ends on the first tab or space 62 | tags -t f1 # in this case the first column ends on the first tab 63 | It is used by the other scripts. 64 | 65 | =========== 66 | 67 | submit 68 | is a script to control the parallelization of large programs 69 | It is by default configured to allow 4 programs to run in parallel, 70 | see line 51, e.g. edit (NCORE = 4) into (NCORE = 8). 71 | By selecting 'farm = local' on line 16, the submissions would 72 | become purely sequential. If you have access to a SLURM compute-farm, 73 | you could submit on the farm by uncommenting line 23 74 | and editing your accounts around line 260. Other farms would be 75 | relatively easy to configure. 76 | 77 | ========== 78 | 79 | directErrorCount.awk, gff2cig.awk, introns_precision_recall.awk, transpose.awk 80 | are format changing scripts called by the other scripts 81 | 82 | =========== 83 | =========== 84 | 85 | -------------------------------------------------------------------------------- /article/scripts/directErrorCount.awk: -------------------------------------------------------------------------------- 1 | # =========================================================================== 2 | # 3 | # PUBLIC DOMAIN NOTICE 4 | # National Center for Biotechnology Information 5 | # 6 | # This software/database is a "United States Government Work" under the 7 | # terms of the United States Copyright Act. 
It was written as part of 8 | # the author's official duties as a United States Government employee and 9 | # thus cannot be copyrighted. This software/database is freely available 10 | # to the public for use. The National Library of Medicine and the U.S. 11 | # Government have not placed any restriction on its use or reproduction. 12 | # 13 | # Although all reasonable efforts have been taken to ensure the accuracy 14 | # and reliability of the software and data, the NLM and the U.S. 15 | # Government do not and cannot warrant the performance or results that 16 | # may be obtained by using this software or data. The NLM and the U.S. 17 | # Government disclaim all warranties, express or implied, including 18 | # warranties of performance, merchantability or fitness for any particular 19 | # purpose. 20 | # 21 | # Please cite the author in any work or product based on this material. 22 | # 23 | # =========================================================================== 24 | # 25 | # Author: Jean Thierry-Mieg mieg@ncbi.nlm.nih.gov 26 | # 27 | 28 | { 29 | flag=$2 ; 30 | if (int((flag % 512)/256) ==1) 31 | next ; 32 | 33 | i = index($0,"NM:i:") ; 34 | if (i < 1) 35 | i = index($0,"nM:i:") ; 36 | if (i < 1) 37 | next ; 38 | split(substr($0,i),aa," ") ; 39 | err = aa[1] ; 40 | gsub (/nM/,"NM",err) ; 41 | print err ; 42 | cigar=$6 ; 43 | if (cigar ~ /^([0-9]*[MN])*$/) 44 | { 45 | if (err=="NM:i:0") # complete no error 46 | print "NM:i:-1" ; 47 | else 48 | print "NM:i:-2"; # complete with error 49 | } 50 | else 51 | { 52 | if (err=="NM:i:0") # partial no error 53 | print "NM:i:-3" ; 54 | else # partial with error 55 | print "NM:i:-4"; 56 | } 57 | } 58 | 59 | 60 | -------------------------------------------------------------------------------- /article/scripts/directErrorCount.tcsh: -------------------------------------------------------------------------------- 1 | #!/bin/tcsh 2 | # =========================================================================== 3 | # 4 | # 
PUBLIC DOMAIN NOTICE 5 | # National Center for Biotechnology Information 6 | # 7 | # This software/database is a "United States Government Work" under the 8 | # terms of the United States Copyright Act. It was written as part of 9 | # the author's official duties as a United States Government employee and 10 | # thus cannot be copyrighted. This software/database is freely available 11 | # to the public for use. The National Library of Medicine and the U.S. 12 | # Government have not placed any restriction on its use or reproduction. 13 | # 14 | # Although all reasonable efforts have been taken to ensure the accuracy 15 | # and reliability of the software and data, the NLM and the U.S. 16 | # Government do not and cannot warrant the performance or results that 17 | # may be obtained by using this software or data. The NLM and the U.S. 18 | # Government disclaim all warranties, express or implied, including 19 | # warranties of performance, merchantability or fitness for any particular 20 | # purpose. 21 | # 22 | # Please cite the author in any work or product based on this material. 
23 | # 24 | # =========================================================================== 25 | # 26 | # Author: Jean Thierry-Mieg mieg@ncbi.nlm.nih.gov 27 | # 28 | ## Direct statistics of the error counts reported in the BAM files 29 | ## We rely on the presence in the BAM files of the NM:i:x 30 | ## optional filed, collate the X values and report the statistics 31 | 32 | set mm=$1 33 | set run=$2 34 | 35 | set nreads=`cat Fasta/$run/*.count | gawk '/^Fragment_kept/{n+=$2}END{print n}'` 36 | 37 | zcat $mm/$run/$mm.$run.sam_sorted.gz | gawk -F '\t' -f scripts/directErrorCount.awk | scripts/tags | gawk '{split ( $1,aa,":" ) ; x=aa[3] ; y = $2 ; if(x>=0) ali+= y ; printf ( "%d\t%d\n",x,y ) ; }END{printf ("-5\t%d\n",nreads - ali)}' nreads=$nreads | sort -k 1,1nr | gawk '{if (line<1)print "-6:"method": -5:unaligned -4:partial with error, -3:partial no error, -2: complete with error, -1: complete no errors, 0: no error, 1,2,3...:n errors, columns 3 and 4 are cumuls";line++;if($1>=0){n1 += $2 ; n2 += $1*$2 ;}printf ( "%d\t%d\t%d\t%d\n",$1,$2,n1,n2 ) ; }' method=$mm nreads=$nreads | sort -k 1,1n > $mm/$run/$mm.$run.nerrors 38 | 39 | -------------------------------------------------------------------------------- /article/scripts/gff2cig.awk: -------------------------------------------------------------------------------- 1 | # =========================================================================== 2 | # 3 | # PUBLIC DOMAIN NOTICE 4 | # National Center for Biotechnology Information 5 | # 6 | # This software/database is a "United States Government Work" under the 7 | # terms of the United States Copyright Act. It was written as part of 8 | # the author's official duties as a United States Government employee and 9 | # thus cannot be copyrighted. This software/database is freely available 10 | # to the public for use. The National Library of Medicine and the U.S. 11 | # Government have not placed any restriction on its use or reproduction. 
12 | # 13 | # Although all reasonable efforts have been taken to ensure the accuracy 14 | # and reliability of the software and data, the NLM and the U.S. 15 | # Government do not and cannot warrant the performance or results that 16 | # may be obtained by using this software or data. The NLM and the U.S. 17 | # Government disclaim all warranties, express or implied, including 18 | # warranties of performance, merchantability or fitness for any particular 19 | # purpose. 20 | # 21 | # Please cite the author in any work or product based on this material. 22 | # 23 | # =========================================================================== 24 | # 25 | # Author: Jean Thierry-Mieg 26 | # 27 | # 28 | 29 | { 30 | if ($3 != "exon") 31 | next ; 32 | } 33 | { 34 | split ($9,aa,"Genbank:") ; 35 | split(aa[2],bb,",") ; split(bb[1],cc,";") ; seq=cc[1] ; 36 | 37 | if(substr(seq,1,2) != "NM" && substr(seq,1,2) != "zNR") 38 | next ; 39 | 40 | seq=seq ":" $1 ; chrom[seq] = $1 ; 41 | nx[seq]++ ; i=nx[seq] ; 42 | a1[seq,i] = $4 ; a2[seq,i] = $5 ; strand[seq]=$7; 43 | } 44 | END { 45 | for (seq in nx) 46 | { 47 | n = nx[seq] ; 48 | printf ("%s\t%s",seq,chrom[seq]) ; 49 | if (strand[seq] == "+") 50 | { 51 | printf ("\t%d\t%d\t", a1[seq,1], a2[seq,n]) ; 52 | for(i = 1 ; i <=n ; i++) 53 | { 54 | if (i>1) 55 | { 56 | dx = a1[seq,i] -a2[seq,i-1] - 1 ; 57 | printf("%dN",dx); 58 | } 59 | dx = a2[seq,i] - a1[seq,i] + 1; 60 | printf ("%dM",dx); 61 | } 62 | } 63 | else 64 | { 65 | printf ("\t%d\t%d\t", a1[seq,n], a2[seq, 1]) ; 66 | for (i = n ; i >=1 ; i--) 67 | { 68 | if (i0) c = nu / (u+nu) ; 43 | printf ("\t%.4f", c) ; 44 | 45 | fp1 =$12 ; fp2 = $13 ; fp = fp1 + fp2 ; 46 | tp1 =$10 ; tp2 = $11 ; tp = tp1 + tp2 ; 47 | fn = $16 ; 48 | printf ("\t\t%d\t%d\t%d\t%d\t%d\t%d\t%d", fp,fp1,fp2, tp1,tp2, tp, fn) ; 49 | 50 | p = 0 ; r = 0 ; f = 0 ; c = 0 ; 51 | if (tp > 0) 52 | { 53 | p = tp / (tp + fp) ; 54 | r = tp / (tp + fn) ; 55 | f = 2 * p * r / (p+r) ; 56 | c = tp2 / tp ; 57 | } 58 | printf 
("\t%.4f\t%.4f\t%.4f\t%.4f",p,r,f,c) ; 59 | 60 | fp1 = $8 ; fp2 = $9 ; fp = fp1 + fp2 ; 61 | tp1 = $6 ; tp2 = $7 ; tp = tp1 + tp2 ; 62 | fn=$16; 63 | c = 0 ; if (tp > 0) c = tp1 / tp ; 64 | 65 | printf ("\t\t%d\t%d\t%d\t%d\t%d\t%d\t%d", fp, fp1, fp2, tp1, tp2, tp, fn) ; 66 | 67 | p = 0 ; r = 0 ; f = 0 ; c = 0 ; 68 | if (tp > 0) 69 | { 70 | p = tp / (tp+fp); 71 | r = tp / (tp+fn); 72 | f = 2 * p * r / (p+r) ; 73 | c = tp2 /tp ; 74 | } 75 | printf ("\t%.4f\t%.4f\t%.4f\t%.4f",p,r,f,c) ; 76 | 77 | printf("\n"); 78 | } 79 | -------------------------------------------------------------------------------- /article/scripts/mapping_accuracy.header: -------------------------------------------------------------------------------- 1 | ## This table evaluates the accuracy of the mapping by comparing in each method, read by read, the coordinates of the ends of its alignments to the unique reference alignment provided by the Baruzzo benchmark [Nature Methods 14.2 pp135-139, 2017] 2 | ## Each read is aligned uniquely in the benchmark. In each method, the aligner may recover the true chromosomal coordinates of the first and last base of the read (TP1), or provide a partial alignment included in the true segment (TP2), or overlap the true position but extend out of it (FP1), or align completely elsewhere (FP2), or fail to align (FN). 3 | ## If a read aligns at several sites, it will count as exact (or partial) if at least one of its alignments has the same coordinates as the benchmark unique alignment (or is included). This applies to the first half of the table, columns 10 to 20, which reports on reads. 4 | ## In contrast, in the second half of the table, columns 22 to 32, which reports on alignments, each alignment is counted in its category. 5 | ## The precision (p=(TP1+TP2)/(TP1+TP2+FP1+FP2)), recall (r=(TP1+TP2)/(TP1+TP2+FN)) and F score (f=2 p r/(p+r)) are computed for reads and alignments. 6 | ## The percentage of aligned at multiple sites is shown. 
The percentage of reads well mapped, at the correct position, but only partially aligned is also computed. 7 | # Species Run Method Reads in benchmark Reads aligned in method Reads uniquely aligned, U Reads multi-aligned, NU % reads multiply aligned NU/(U+NU) Misaligned reads some partial, FP=FP1+FP2 Overlapping the truth, but extending out, FP1 Mismapped reads not overlapping the truth, FP2 Reads at exact same site as benchmark, TP1 Read at correct site, but partial, TP2 Correct mapping, some partial, TP=TP1+TP2 Unaligned FN Read mapping precision TP/(TP+FP) Read mapping recall TP/(TP+FN) Read mapping F-score 2pr/(p+r) % Well mapped but partial TP2/(TP1+TP2) Misalignments some partial, FP=FP1+FP2 Overlapping the truth, but extending out, FP1 Mismappings not overlapping the truth, FP2 Exact alignments identical to benchmark, TP1 Exact but partial alignments, TP2 Correct alignments, some partial, TP=TP1+TP2 Unaligned, FN Alignment mapping precision TP/(TP+FP) Alignment mapping recall TP/(TP+FN) Alignment mapping F-score 2pr/(p+r) % Well mapped but partial TP2/(TP1+TP2) 8 | -------------------------------------------------------------------------------- /article/scripts/schtroumpf: -------------------------------------------------------------------------------- 1 | #!/bin/tcsh -f 2 | # =========================================================================== 3 | # 4 | # PUBLIC DOMAIN NOTICE 5 | # National Center for Biotechnology Information 6 | # 7 | # This software/database is a "United States Government Work" under the 8 | # terms of the United States Copyright Act. It was written as part of 9 | # the author's official duties as a United States Government employee and 10 | # thus cannot be copyrighted. This software/database is freely available 11 | # to the public for use. The National Library of Medicine and the U.S. 12 | # Government have not placed any restriction on its use or reproduction. 
13 | # 14 | # Although all reasonable efforts have been taken to ensure the accuracy 15 | # and reliability of the software and data, the NLM and the U.S. 16 | # Government do not and cannot warrant the performance or results that 17 | # may be obtained by using this software or data. The NLM and the U.S. 18 | # Government disclaim all warranties, express or implied, including 19 | # warranties of performance, merchantability or fitness for any particular 20 | # purpose. 21 | # 22 | # Please cite the author in any work or product based on this material. 23 | # 24 | # =========================================================================== 25 | # 26 | # Author Jean Thierry-Mieg 27 | # 28 | # 29 | 30 | set out_err="$1" 31 | set pgm="$2" 32 | 33 | # echo "schtroumpf start" 34 | (eval "/bin/time -p $pgm" > $out_err.out) >& $out_err.err 35 | # echo "schtroumpf end" 36 | 37 | ############################################## 38 | -------------------------------------------------------------------------------- /article/scripts/submit: -------------------------------------------------------------------------------- 1 | #!/bin/tcsh -f 2 | # =========================================================================== 3 | # 4 | # PUBLIC DOMAIN NOTICE 5 | # National Center for Biotechnology Information 6 | # 7 | # This software/database is a "United States Government Work" under the 8 | # terms of the United States Copyright Act. It was written as part of 9 | # the author's official duties as a United States Government employee and 10 | # thus cannot be copyrighted. This software/database is freely available 11 | # to the public for use. The National Library of Medicine and the U.S. 12 | # Government have not placed any restriction on its use or reproduction. 13 | # 14 | # Although all reasonable efforts have been taken to ensure the accuracy 15 | # and reliability of the software and data, the NLM and the U.S. 
16 | # Government do not and cannot warrant the performance or results that 17 | # may be obtained by using this software or data. The NLM and the U.S. 18 | # Government disclaim all warranties, express or implied, including 19 | # warranties of performance, merchantability or fitness for any particular 20 | # purpose. 21 | # 22 | # Please cite the author in any work or product based on this material. 23 | # 24 | # =========================================================================== 25 | # 26 | # Author Jean Thierry-Mieg 27 | # 28 | 29 | set reserve_unit="" 30 | set out_err="$1" 31 | set pgm="$2" 32 | set farm_requested="$3" 33 | # exit 0 34 | ############################################## 35 | ## Please select how you want to submit the job 36 | ## by removing the # in front of one of the options 37 | ## If you add a new way, or find a problem 38 | ## please mail the edited script to 39 | ## mieg@ncbi.nlm.nih.gov 40 | 41 | set farm=ZERO 42 | # set farm=local 43 | set farm=MULTICORE 44 | # set farm=UGE 45 | # set farm=BIOWULF 46 | # set farm=SGE 47 | # set farm=LSF 48 | # set farm=PARASOL 49 | # set farm=SLURM 50 | # set farm=TH2 51 | # set farm=FUDAN 52 | 53 | if ($?MAGIC_SUBMIT) then 54 | set farm=$MAGIC_SUBMIT 55 | endif 56 | 57 | if ($?MAGIC_FARM) then 58 | set farm=$MAGIC_FARM 59 | endif 60 | 61 | if ($farm == ZERO) then 62 | echo "FATAL ERROR: please edit the first few lines of script subnit as documented in that file to specify your hardware" 63 | endif 64 | if ($farm != local && $farm != MULTICORE && $farm != UGE && $farm != BIOWULF &&$farm != TH2 && $farm != SLURM && $farm != FUDAN) then 65 | echo 'FATAL ERROR: sorry, the only allowed value for $MAGIC_SUBMIT are' 66 | echo ' local : single threading on local machine' 67 | echo ' MULTICORE : mutithreading on local machine, please configure NCORE (default 8) in scripts/submit' 68 | echo ' UGE : unified grid engine, successor to SGE: Sun Grid Engine, please cofigure the queue name (default unified) in 
scripts/submit' 69 | echo ' SLURM : slurm clusters' 70 | echo ' BIOWULF : NIH Helix/Biowulf' 71 | echo ' FUDAN : ad-hoc for Fudan university' 72 | echo " currently you are requesting the unknown value: $farm" 73 | echo ' Please configure scripts/submit or define $MAGIC_SUBMIT properly, nay help on new config for additional hardware would be welcome, thank you' 74 | exit 1 75 | endif 76 | 77 | set NCORE=4 # number of jobs to run in parallel in MULTICORE case 78 | set mem_requested="16G" 79 | set reserve_mem=8 80 | set request_multicore="" 81 | if ($farm_requested == "32G") then 82 | set farm_requested="" 83 | set mem_requested="32G" 84 | set reserve_mem=32 85 | else if ($farm_requested == "64G") then 86 | set farm_requested="" 87 | set mem_requested="64G" 88 | set reserve_mem=64 89 | else if ($farm_requested == "128G") then 90 | set farm_requested="" 91 | set mem_requested="128G" 92 | set reserve_mem=128 93 | else if ($farm_requested == "32G4T") then 94 | set farm_requested="" 95 | set mem_requested="32G" 96 | set reserve_mem=32 97 | set request_multicore="-pe multicore 4 -R y" 98 | else if ($farm_requested == "1G") then 99 | set farm_requested="" 100 | set mem_requested="1G" 101 | set reserve_mem=1 102 | else if ($farm_requested == "UGE4") then 103 | set farm_requested="" 104 | set mem_requested="32G" 105 | set reserve_mem=32 106 | set farm=UGE4 107 | else 108 | if ($farm_requested != "") set farm=$3 109 | endif 110 | 111 | if (! $?MAGIC) setenv MAGIC XXX 112 | ############################################## 113 | ## Verify that the farm variable has been configured 114 | if ($farm == ZERO) then 115 | echo "# FATAL ERROR in file $0" 116 | echo "# You need to decide if the codes should run locally" 117 | echo "# or be submitted to a compute farm using SGE, LSF ..." 
118 | echo "# Please remove one of the # in lines 14 to 18 of file scripts/submit" 119 | echo "# Then test the configuration " 120 | echo " $0 test 'echo hello_world1 > test.txt'" 121 | echo " $0 wait" 122 | echo " $0 test 'echo hello_world2 >> test.txt'" 123 | echo " $0 wait" 124 | echo "# Then check the content of the files test.out, test.err, test.txt" 125 | echo "# test.err should contain timing info, test.out may be empty" 126 | echo "# test.txt should contain hello_world1 and 2" 127 | exit 1 128 | endif 129 | 130 | ############################################## 131 | ## Localize self 132 | if (! $?scripts) then 133 | set scripts=`echo $0 | gawk '{i=index($1,"submit");a="./";if (i>1)a=substr($1,1,i-1);printf("%s",a);}'` 134 | endif 135 | # echo "scripts=$scripts" 136 | 137 | ############################################## 138 | ## Execute the program on the local machine 139 | 140 | if ($farm == local) then 141 | 142 | if ("$out_err" == run || "$pgm" == run) exit 0 143 | 144 | if ("$out_err" == wait || "$pgm" == wait) exit 0 145 | if ("$out_err" == wait5 || "$pgm" == wait5) exit 0 146 | # no need to wait on a local machine 147 | 148 | echo "submit: pgm=###$pgm### stdout/err=###$out_err.out/err###" 149 | (eval "/bin/time -p $pgm" > $out_err.out) >& $out_err.err 150 | 151 | exit 0 152 | 153 | endif 154 | 155 | ############################################## 156 | ## submit a program locally on a multicore machine 157 | 158 | if ($farm == MULTICORE) then 159 | 160 | # We use twice the number of jobs that should exe in parallel 161 | # because for each we run a wrapping script and the real command 162 | 163 | @ NCORE2 = 2 * $NCORE 164 | 165 | count_jobs: 166 | # The objective is to count all jobs submitted by this system 167 | # i do not know how to cath the process id i just submitted 168 | # so i grep on schtroumpf that I include in all my summitted commands 169 | 170 | # on MAC, probably a Unix BSD legacy, ps -lf does not give the full line, 171 | # if $USER is 
too long, it may not be listed entirely, prefer $uid
/netopt/uge/ncbi/common/settings.csh 223 | unsetenv LD_RUN_PATH 224 | unsetenv LD_LIBRARY_PATH 225 | unsetenv DYLD_LIBRARY_PATH 226 | set reserve_unit=G 227 | endif 228 | 229 | if ($farm == SGE || $farm == UGE || $farm == UGE4) then 230 | 231 | setenv SGE_SUMMARY stderr 232 | setenv SGE_NOMAIL 233 | 234 | if ("$out_err" == run || "$pgm" == run) exit 0 235 | 236 | if ("$out_err" == wait || "$pgm" == wait || "$out_err" == wait5 || "$pgm" == wait5) then 237 | if ("$out_err" == wait5 || "$pgm" == wait5) then 238 | $scripts/qusage 5 239 | else 240 | $scripts/qusage 1 241 | endif 242 | else 243 | if (! $?queue) set queue=unified 244 | # set queue=low 245 | set xx="" 246 | if ($queue == test) set xx=',express=TRUE' 247 | 248 | set isBlastp="" 249 | if ("$pgm" =~ "*blastp*") set isBlastp="-l blastp_frosty" 250 | echo "SGE submit: pgm=###$pgm### stdout/err=###$out_err.out/err### queue=###$queue### isBlasp=###$isBlastp###" 251 | 252 | if ($queue == profiling) then 253 | set numero=`qsub -V -b y -j n -N $MAGIC -P unified $isBlastp -l h_rt=86400,h_vmem=64G,mem_free=64G,m_mem_free=32G,cputype="*E5-2650*",m_core=16,mem_total=125G -o $out_err.out -e $out_err.err "/bin/time -p $pgm"` 254 | else if ($farm == UGE4) then 255 | echo qsub -V -b y -j n -N $MAGIC -P $queue -pe multicore 4 -R y $isBlastp -l h_rt=86400,h_vmem=$mem_requested,mem_free=$mem_requested,m_mem_free=$reserve_mem$reserve_unit$xx -o $out_err.out -e $out_err.err \"/bin/time -p $pgm\" 256 | set numero=`qsub -V -b y -j n -N $MAGIC -P $queue -pe multicore 4 -R y $isBlastp -l h_rt=86400,h_vmem=$mem_requested,mem_free=$mem_requested,m_mem_free=$reserve_mem$reserve_unit$xx -o $out_err.out -e $out_err.err "/bin/time -p $pgm"` 257 | else 258 | # new syntax as of 2018 on UGE_ncbi qsub -l m_mem_free=4G,h_rt=14400,h_vmem=10G 259 | echo qsub -V -b y -j n -N $MAGIC -P $queue $isBlastp $request_multicore -l h_rt=86400,h_vmem=$mem_requested,m_mem_free=$reserve_mem$reserve_unit$xx -o $out_err.out -e $out_err.err \"/bin/time -p 
$pgm\" 260 | set numero=`qsub -V -b y -j n -N $MAGIC -P $queue $isBlastp $request_multicore -l h_rt=86400,h_vmem=$mem_requested,m_mem_free=$reserve_mem$reserve_unit$xx -o $out_err.out -e $out_err.err "/bin/time -p $pgm"` 261 | endif 262 | 263 | # multithreaded tasks 264 | # qsub -P unified -N $MAGIC -pe multicore 4 -R y job 265 | # see https://confluence.ncbi.nlm.nih.gov/pages/viewpage.action?pageId=13402915 266 | # "-R y" tells SGE to reserve slots on a host, so that a single core job doesn't run first and keep your job waiting forever 267 | 268 | # limit the speed of submissions to 1 per second, 14s is nice for ALIGN phase in SEQC_main 269 | if (1) sleep .1 270 | if ($?MAGIC_SUBMIT_DELAY) then 271 | if ($MAGIC_SUBMIT_DELAY > 0) sleep $MAGIC_SUBMIT_DELAY 272 | else 273 | echo sleep .1 274 | endif 275 | echo "####numero=$numero####" 276 | endif 277 | 278 | exit 0 279 | 280 | endif 281 | 282 | ############################################## 283 | ## Submit a program to the SLURM system 284 | 285 | # SLURM is used on helix.nih and in obsolete Fudan 286 | 287 | if ($farm == SLURM) then 288 | setenv farm SLURM 289 | set partition=CLUSTER 290 | if ("$out_err" == run || "$pgm" == run) exit 0 291 | 292 | if ("$out_err" == wait || "$pgm" == wait || "$out_err" == wait5 || "$pgm" == wait5) then 293 | if ("$out_err" == wait5 || "$pgm" == wait5) then 294 | $scripts/qusage 5 295 | else 296 | $scripts/qusage 1 297 | endif 298 | else 299 | set isShell=`echo $pgm | gawk '{if(substr($1,1,8) == "scripts/")n=1;}END{print n+0;}'` 300 | yhresubmit: 301 | if ($isShell == 1) then 302 | echo "sbatch -N 1 -J $MAGIC --share --mem=24000 --time-min=120 --distribution=cyclic -o $out_err.out -e $out_err.err $pgm" 303 | sbatch -N 1 -J $MAGIC --share --mem=24000 --time-min=120 --distribution=cyclic -o $out_err.out -e $out_err.err $pgm 304 | set Status=$status 305 | else 306 | echo "srun -J $MAGIC -o $out_err.out -e $out_err.err $pgm" 307 | srun -J $MAGIC -o $out_err.out -e $out_err.err $pgm 308 | 
set Status=$status 309 | endif 310 | if ($Status != 0) then 311 | sleep 5 312 | goto yhresubmit 313 | endif 314 | 315 | # limit the speed of submissions to 1 per second, 14s is nice for ALIGN phase in SEQC_main 316 | if (1) sleep .1 317 | if ($?MAGIC_SUBMIT_DELAY) then 318 | if ($MAGIC_SUBMIT_DELAY > 0) sleep $MAGIC_SUBMIT_DELAY 319 | else 320 | echo sleep .1 321 | endif 322 | 323 | endif 324 | 325 | exit 0 326 | 327 | endif 328 | 329 | ############################################## 330 | ## Submit a program to the BIOWULF cluster 331 | 332 | # BIOWULF 333 | 334 | if ($farm == BIOWULF) then 335 | setenv farm BIOWULF 336 | # setenv TMPDIR "/lscratch/magic.$MAGIC" 337 | # setenv TMPDIR "/lscratch/$SLURM_JOBID" 338 | # partition: norm (normal) quick (<4h) 339 | set partition=quick 340 | if ("$out_err" == run || "$pgm" == run) exit 0 341 | 342 | if ("$out_err" == wait || "$pgm" == wait || "$out_err" == wait5 || "$pgm" == wait5) then 343 | if ("$out_err" == wait5 || "$pgm" == wait5) then 344 | $scripts/qusage 5 345 | else 346 | $scripts/qusage 1 347 | endif 348 | else 349 | set isShell=`echo $pgm | gawk '{if(substr($1,1,8) == "scripts/" || index($1,".tcsh")>0)n=1;}END{print n+0;}'` 350 | yhresubmit: 351 | if ($isShell == 1) then 352 | echo "sbatch --ntasks 1 -J $MAGIC --share --gres=lscratch:100 --mem=24g --time-min=120 --distribution=cyclic --partition=$partition --output $out_err.out --error $out_err.err $pgm" 353 | sbatch --ntasks 1 -J $MAGIC --share --gres=lscratch:100 --mem=24g --time-min=120 --distribution=cyclic --partition=$partition --output $out_err.out --error $out_err.err $pgm 354 | set Status=$status 355 | else 356 | echo "srun -J $MAGIC -o $out_err.out -e $out_err.err $pgm" 357 | srun -J $MAGIC -o $out_err.out -e $out_err.err $pgm 358 | set Status=$status 359 | endif 360 | if ($Status != 0) then 361 | sleep 5 362 | goto yhresubmit 363 | endif 364 | 365 | # limit the speed of submissions to 1 per second, 14s is nice for ALIGN phase in SEQC_main 366 | if (1) 
sleep .1 367 | if ($?MAGIC_SUBMIT_DELAY) then 368 | if ($MAGIC_SUBMIT_DELAY > 0) sleep $MAGIC_SUBMIT_DELAY 369 | else 370 | echo sleep .1 371 | endif 372 | 373 | endif 374 | 375 | exit 0 376 | 377 | endif 378 | 379 | ############################################## 380 | ## Submit a program to the FUDAN system 381 | 382 | # FUDAN is used in Fudan 383 | 384 | if ($farm == FUDAN) then 385 | setenv farm FUDAN 386 | set partition=CLUSTER 387 | if ("$out_err" == run || "$pgm" == run) exit 0 388 | 389 | if ("$out_err" == wait || "$pgm" == wait || "$out_err" == wait5 || "$pgm" == wait5) then 390 | if ("$out_err" == wait5 || "$pgm" == wait5) then 391 | $scripts/qusage 5 392 | else 393 | $scripts/qusage 1 394 | endif 395 | else 396 | set isShell=`echo $pgm | gawk '{if(substr($1,1,8) == "scripts/")n=1;}END{print n+0;}'` 397 | fudanresubmit: 398 | if ($isShell == 1) then 399 | echo "qsub -N $MAGIC --share --mem=24000 --time-min=600 -o $out_err.out -e $out_err.err $pgm" 400 | qsub -N $MAGIC --share --mem=24000 --time-min=600 -o $out_err.out -e $out_err.err $pgm 401 | set Status=$status 402 | else 403 | echo "srun -N $MAGIC -o $out_err.out -e $out_err.err $pgm" 404 | srun -N $MAGIC -o $out_err.out -e $out_err.err $pgm 405 | set Status=$status 406 | endif 407 | if ($Status != 0) then 408 | sleep 5 409 | goto fudanresubmit 410 | endif 411 | 412 | # limit the speed of submissions to 1 per second, 14s is nice for ALIGN phase in SEQC_main 413 | if (1) sleep .1 414 | if ($?MAGIC_SUBMIT_DELAY) then 415 | if ($MAGIC_SUBMIT_DELAY > 0) sleep $MAGIC_SUBMIT_DELAY 416 | else 417 | echo sleep .1 418 | endif 419 | 420 | endif 421 | 422 | exit 0 423 | 424 | endif 425 | 426 | ############################################## 427 | ## Submit a program to the TH2 super computer 428 | 429 | # TH2 used on the chinese super computer 430 | 431 | if ($farm == TH2) then 432 | setenv farm TH2 433 | if ("$out_err" == run || "$pgm" == run) exit 0 434 | 435 | if ("$out_err" == wait || "$pgm" == wait || 
"$out_err" == wait5 || "$pgm" == wait5) then 436 | if ("$out_err" == wait5 || "$pgm" == wait5) then 437 | $scripts/qusage 5 438 | else 439 | $scripts/qusage 1 440 | endif 441 | else 442 | set isShell=`echo $pgm | gawk '{if(substr($1,1,8) == "scripts/")n=1;}END{print n+0;}'` 443 | yhresubmit: 444 | if ($isShell == 1) then 445 | echo "yhbatch -p nsfc1 -N 1 -J $MAGIC --share --mem=16000 --time-min=600 --distribution=cyclic -o $out_err.out -e $out_err.err $pgm" 446 | yhbatch -p nsfc1 -N 1 -J $MAGIC --share --mem=16000 --time-min=600 --distribution=cyclic -o $out_err.out -e $out_err.err $pgm 447 | set Status=$status 448 | else 449 | echo "yhrun -p nsfc1 -J $MAGIC -o $out_err.out -e $out_err.err $pgm" 450 | yhrun -p nsfc1 -J $MAGIC -o $out_err.out -e $out_err.err $pgm 451 | set Status=$status 452 | endif 453 | if ($Status != 0) then 454 | sleep 5 455 | goto yhresubmit 456 | endif 457 | 458 | # limit the speed of submissions to 1 per second, 14s is nice for ALIGN phase in SEQC_main 459 | if (1) sleep .1 460 | if ($?MAGIC_SUBMIT_DELAY) then 461 | if ($MAGIC_SUBMIT_DELAY > 0) sleep $MAGIC_SUBMIT_DELAY 462 | else 463 | echo sleep .1 464 | endif 465 | 466 | endif 467 | 468 | exit 0 469 | 470 | endif 471 | 472 | ############################################## 473 | ## Submit a program to the Parasol parallel batch system of Jim Kent 474 | 475 | if ($farm == PARASOL) then 476 | 477 | if ("$out_err" == wait || "$pgm" == wait) then 478 | if (-e tmp/parasol.job.list) then 479 | para problems 480 | mv tmp/parasol.job.list tmp/parasol.job.done.$$ 481 | endif 482 | else if ("$out_err" == run || "$pgm" == run) then 483 | parasol make tmp/Parasol/job.list 484 | else 485 | if (! 
-d $CaliRootDir/tmp/Parasol) mkdir $CaliRootDir/tmp/Parasol 486 | set toto=$CaliRootDir/tmp/Parasol/job.$$ 487 | echo '#!/bin/csh -e' > $toto 488 | echo -n "cd " >> $toto 489 | pwd >> $toto 490 | echo "(/bin/time -p $pgm > $out_err.out) >& $out_err.err " >> $toto 491 | echo "csh $toto" >> $CaliRootDir/tmp/Parasol/job.list 492 | endif 493 | 494 | exit 0 495 | 496 | endif 497 | 498 | ############################################## 499 | ## Submit a program to the LSF queue 500 | 501 | if ($farm == LSF) then 502 | 503 | if ("$pgm" == wait) then 504 | echo " i do not remember the LSF syntax to synchronize, maybe 'bjobs' please edit scripts/submit and mail me the fix, thanks" 505 | else if ("$pgm" != run) then 506 | if (! $?queue) set queue=unified 507 | echo " i do not remember the LSF syntax, please edit scripts/submit" 508 | echo "LSF submit: pgm=###$pgm### stdout/err=###$out_err.out/err### queue=###$queue###" 509 | bsub -q $queue -R "select[$linux] rusage[cpu=1:mem=8000:duration=30000]" -o $out_err.err -e $out_err.err "/bin/time -p $pgm" 510 | 511 | if ($?MAGIC_SUBMIT_DELAY) then 512 | if ($MAGIC_SUBMIT_DELAY > 0) sleep $MAGIC_SUBMIT_DELAY 513 | else 514 | sleep 1 515 | endif 516 | 517 | endif 518 | exit 1 519 | 520 | endif 521 | 522 | echo "ERROR : Unknown third parameter $farm in" 523 | echo "ERROR : $0 $*" 524 | echo "ERROR : The only acceptable values are: local, MULTICORE, SGE and LSF" 525 | echo "ERROR : Please edit the script or modify the call to the script" 526 | exit 1 527 | 528 | ############################################## 529 | ############################################## 530 | 531 | foreach ii (1 2 3 4 5 6 7 8 9 10 11 12 13 14) 532 | pushd B$ii 533 | ln -s tmp.$ii tmp 534 | if (! -d tmp.$ii) mkdir tmp.$ii 535 | ln -s ../TARGET 536 | ln -s tmp.$ii/Batch$ii Fastc 537 | ln -s ../LIMITS 538 | mkdir RESULTS 539 | mkdir MetaDB 540 | cd MetaDB 541 | ln -s ../../MetaDB/database 542 | ln -s ../../MetaDB/wspec 543 | cd .. 
544 | ln -s ../bin 545 | ln -s ../metaData 546 | ln -s ../scripts 547 | ln -s scripts/MAGIC 548 | cp ../ZZZZZ . 549 | cp ../ZZZZZ.gz . 550 | ln -s bin/xacembly bly 551 | ln -s ./bin/tacembly tbly 552 | popd 553 | end 554 | 555 | ############################################# 556 | 557 | # find suspended jobs 558 | 559 | qstat > _qs1 560 | qwhat > _qs2 561 | cat _qs1 ZZZZZ _qs2 | gawk '/^ZZZZZ/{zz++;next;}{if(zz<1){if($5=="S")ss[$1]=1;next;}if(ss[$1]==1)print}' > _qs3 562 | cat _qs3 | gawk '{if($4=="scripts/alignAndSelectBest.tcsh")printf("qdel %s\n",$1);}' > _qs4 563 | -------------------------------------------------------------------------------- /article/scripts/tags: -------------------------------------------------------------------------------- 1 | #!/bin/tcsh -f 2 | # =========================================================================== 3 | # 4 | # PUBLIC DOMAIN NOTICE 5 | # National Center for Biotechnology Information 6 | # 7 | # This software/database is a "United States Government Work" under the 8 | # terms of the United States Copyright Act. It was written as part of 9 | # the author's official duties as a United States Government employee and 10 | # thus cannot be copyrighted. This software/database is freely available 11 | # to the public for use. The National Library of Medicine and the U.S. 12 | # Government have not placed any restriction on its use or reproduction. 13 | # 14 | # Although all reasonable efforts have been taken to ensure the accuracy 15 | # and reliability of the software and data, the NLM and the U.S. 16 | # Government do not and cannot warrant the performance or results that 17 | # may be obtained by using this software or data. The NLM and the U.S. 18 | # Government disclaim all warranties, express or implied, including 19 | # warranties of performance, merchantability or fitness for any particular 20 | # purpose. 21 | # 22 | # Please cite the author in any work or product based on this material. 
23 | # 24 | # =========================================================================== 25 | # 26 | # Author: Jean Thierry-Mieg 27 | # 28 | 29 | if ("$1" == "-t") then 30 | gawk -F '\t' '{n[$1]++;}END{for (k in n)printf("%s\t%d\n",k,n[k]);}' $2 | sort 31 | else 32 | gawk '{n[$1]++;}END{for (k in n)printf("%s\t%d\n",k,n[k]);}' $1 | sort 33 | endif 34 | -------------------------------------------------------------------------------- /article/scripts/transpose: -------------------------------------------------------------------------------- 1 | gawk -F '\t' -f scripts/transpose.awk 2 | -------------------------------------------------------------------------------- /article/scripts/transpose.awk: -------------------------------------------------------------------------------- 1 | { j++ ; for (i = 1 ; i <= NF ; i++)a[i,j] = $i ; if(NF>iMax) iMax = NF ; jMax = j ;} 2 | END { 3 | for (i = 1 ; i <= iMax ; i++) 4 | { 5 | printf ("%s", a[i,1]) ; 6 | for (j = 2 ; j <= jMax ; j++) 7 | printf ("\t%s", a[i,j]) ; 8 | printf ("\n",i) ; 9 | } 10 | } 11 | 12 | -------------------------------------------------------------------------------- /docs/_config.yml: -------------------------------------------------------------------------------- 1 | # Site title and subtitle. This is used in _includes/header.html 2 | title: 'NCBI Magic-BLAST' 3 | subtitle: 'RNA-seq mapping tool' 4 | 5 | # if you wish to integrate disqus on pages set your shortname here 6 | disqus_shortname: '' 7 | 8 | # if you use google analytics, add your tracking id here 9 | google_analytics_id: '' 10 | 11 | # Enable/show navigation. 
There are three options:
2 | 13 | 14 | -------------------------------------------------------------------------------- /docs/_includes/footer.html: -------------------------------------------------------------------------------- 1 | Download {{ site.title }} 2 | -------------------------------------------------------------------------------- /docs/_includes/google_analytics.html: -------------------------------------------------------------------------------- 1 | 10 | -------------------------------------------------------------------------------- /docs/_includes/header.html: -------------------------------------------------------------------------------- 1 |

{{ site.title }} 2 | {% if site.subtitle %}{{ site.subtitle }}{% endif %} 3 |

4 | -------------------------------------------------------------------------------- /docs/_includes/navigation.html: -------------------------------------------------------------------------------- 1 | 17 | -------------------------------------------------------------------------------- /docs/_layouts/default.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | {{ site.title }}{% if page.title %} : {{ page.title }}{% endif %} 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 |
18 |
19 | 22 |
23 | 24 |
25 | {% assign post_count = site.posts|size %} 26 | {% if site.navigation != 0 and site.navigation == 1 or post_count > 0 %} 27 | 30 | 31 |
32 | {{ content }} 33 |
34 | {% else %} 35 |
36 | {{ content }} 37 |
38 | {% endif %} 39 |
40 | 41 |
42 | 45 |
46 |
47 | 48 | 97 | 98 | 99 | -------------------------------------------------------------------------------- /docs/_layouts/page.html: -------------------------------------------------------------------------------- 1 | --- 2 | layout: default 3 | --- 4 | 5 | 10 | 11 | {{ content }} 12 | -------------------------------------------------------------------------------- /docs/_posts/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ncbi/magicblast/63d24b02fe26d6360aa23de148521e3cdea1cdbd/docs/_posts/.gitkeep -------------------------------------------------------------------------------- /docs/_posts/2016-12-29-blastdb.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: page 3 | category: cook 4 | title: "Create a BLAST database" 5 | order: 1 6 | --- 7 | 8 | First you need to create a BLAST database for your genome or transcriptome. For your reference sequences in a FASTA file, use this command line: 9 | 10 | ``` 11 | makeblastdb -in -dbtype nucl -parse_seqids -out -title "Database title" 12 | ``` 13 | 14 | The ```-parse_seqids``` option is required to keep the original sequence identifiers. Otherwise makeblastdb will generate its own identifiers, ```-title``` is optional. 15 | 16 | For more information on makeblastdb see [NCBI BLAST+ Command Line User Manual](https://www.ncbi.nlm.nih.gov/books/NBK279688/). 17 | 18 | Magic-BLAST will work with a genome in a FASTA file, but will be very slow for anything larger than a bacterial genome (about 5 million bases), so we do not recommend it. 
19 | 20 | 21 |   22 | 23 | 24 | #### Example 25 | 26 | To create a BLAST database from the reference file ```my_reference.fa``` 27 | 28 | ``` 29 | $ cat my_reference.fa 30 | >sequence_1 Homo sapiens hemoglobin subunit alpha 2 (HBA2), mRNA 31 | CATAAACCCTGGCGCGCTCGCGGGCCGGCACTCTTCTGGTCCCCACAGACTCAGAGAGAACCCACCATGG 32 | TGCTGTCTCCTGCCGACAAGACCAACGTCAAGGCCGCCTGGGGTAAGGTCGGCGCGCACGCTGGCGAGTA 33 | TGGTGCGGAGGCCCTGGAGAGGATGTTCCTGTCCTTCCCCACCACCAAGACCTACTTCCCGCACTTCGAC 34 | CTGAGCCACGGCTCTGCCCAGGTTAAGGGCCACGGCAAGAAGGTGGCCGACGCGCTGACCAACGCCGTGG 35 | CGCACGTGGACGACATGCCCAACGCGCTGTCCGCCCTGAGCGACCTGCACGCGCACAAGCTTCGGGTGGA 36 | CCCGGTCAACTTCAAGCTCCTAAGCCACTGCCTGCTGGTGACCCTGGCCGCCCACCTCCCCGCCGAGTTC 37 | ACCCCTGCGGTGCACGCCTCCCTGGACAAGTTCCTGGCTTCTGTGAGCACCGTGCTGACCTCCAAATACC 38 | GTTAAGCTGGAGCCTCGGTAGCCGTTCCTCCTGCCCGCTGGGCCTCCCAACGGGCCCTCCTCCCCTCCTT 39 | GCACCGGCCCTTCCTGGTCTTTGAATAAAGTCTGAGTGGGCAGCAAAAAAAAAAAAAAAAAA 40 | >sequence_2 Homo sapiens alpha one globin (HBA1) mRNA, complete cds 41 | CAGACTCAGAGAGAACCCACCATGGTGCTGTCTCCTGCCGACAAGACCAACGTCAAGGCCGCCTGGGGTA 42 | AGGTCGGCGCGCACGCTGGCGAGTATGGTGCGGAGGCCCTGGAGAGGATGTTCCTGTCCTTCCCCACCAC 43 | CAAGACCTACTTCCCGCACTTCGACCTGAGCCACGGCTCTGCCCAGGTTAAGGGCCACGGCAAGAAGGTG 44 | GCCGACGCGCTGACCAACGCCGTGGCGCACGTGGACGACATGCCCAACGCGCTGTCCGCCCTGAGCGACC 45 | TGCACGCGCACAAGCTTCGGGTGGACCCGGTCAACTTCAAGCTCCTAAGCCACTGCCTGCTGGTGACCCT 46 | GGCCGCCCACCTCCCCGCCGAGTTCACCCCTGCGGTGCACGCCTCCCTGGACAAGTTCCTGGCTTCTGTG 47 | AGCACCGTGCTGACCTCCAAATACCGTTAAGCTGGAGCCTCGGTGGCCATGCTTCTTGCCCCTTGGGC 48 | ``` 49 | 50 | use this command line 51 | 52 | ``` 53 | makeblastdb -in my_reference.fa -out my_reference -parse_seqids -dbtype nucl 54 | ``` 55 | 56 | Note that the word following '>' is a sequence identifier that will be used in Magic-BLAST reports. The identifier should be unique. 57 | 58 | 59 | ### Download a genome 60 | 61 | There are several ways to download whole genomes, transcriptomes, or selected sequences from NCBI. 
62 | 63 | #### NCBI Datasets 64 | 65 | You can search and download genome and transcript sequences from [NCBI Datasets Genome](https://www.ncbi.nlm.nih.gov/datasets/genome/) page. Search for an organism, select an assembly, and you will see download options. 66 | 67 | For example, you can download the GRCh38 assembly of the human genome, from [https://www.ncbi.nlm.nih.gov/datasets/genome/GCF_000001405.40/](https://www.ncbi.nlm.nih.gov/datasets/genome/GCF_000001405.40/). 68 | 69 | #### NCBI EDirect tools 70 | 71 | To download human chromosome 1 using [NCBI EDirect tools](https://github.com/NCBI-Hackathons/EDirectCookbook) use: 72 | 73 | ``` 74 | search -db nucleotide -query NC_000001 | efetch -format fasta >NC_000001.fa 75 | ``` 76 | 77 | -------------------------------------------------------------------------------- /docs/_posts/2016-12-29-copyright.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: page 3 | category: dev 4 | title: "Copyright" 5 | order: 1 6 | --- 7 | With the exception of certain third-party files summarized below, this software is a "United States Government Work" under the terms of the United States Copyright Act. It was written as part of the authors' official duties as United States Government employees and thus cannot be copyrighted. This software is freely available to the public for use. The National Library of Medicine and the U.S. Government have not placed any restriction on its use or reproduction. 8 | -------------------------------------------------------------------------------- /docs/_posts/2016-12-29-exeptions.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: page 3 | category: dev 4 | title: "Exceptions" 5 | order: 2 6 | --- 7 | Location: configure 8 | 9 | Authors: Free Software Foundation, Inc. 
10 | 11 | License: Unrestricted; at top of file 12 | 13 | *** 14 | 15 | Location: config.guess, config.sub 16 | 17 | Authors: FSF 18 | 19 | License: Unrestricted when distributed with the Toolkit; standalone, GNU General Public License [gpl.txt] 20 | 21 | *** 22 | 23 | Location: {src,include}/dbapi/driver/ftds*/freetds 24 | 25 | Authors: See src/dbapi/driver/ftds*/freetds/AUTHORS 26 | 27 | License: GNU Library/Lesser General Public License 28 | [src/dbapi/driver/ftds*/freetds/COPYING.LIB] 29 | 30 | *** 31 | 32 | Location: include/dbapi/driver/odbc/unix_odbc 33 | 34 | Authors: Peter Harvey and Nick Gorham 35 | 36 | License: GNU LGPL 37 | 38 | *** 39 | 40 | Location: {src,include}/gui/widgets/FLU 41 | 42 | Authors: Jason Bryan 43 | 44 | License: GNU LGPL 45 | 46 | *** 47 | 48 | Location: {src,include}/gui/widgets/Fl_Table 49 | 50 | Authors: Greg Ercolano 51 | 52 | License: GNU LGPL 53 | 54 | *** 55 | 56 | Location: include/util/bitset 57 | 58 | Author: Anatoliy Kuznetsov 59 | 60 | License: MIT [include/util/bitset/license.txt] 61 | 62 | *** 63 | 64 | Location: {src,include}/util/compress/bzip2 65 | 66 | Author: Julian R Seward 67 | 68 | License: BSDish [src/util/compress/bzip2/LICENSE] 69 | 70 | *** 71 | 72 | Location: {src,include}/util/compress/zlib 73 | 74 | Authors: Jean-loup Gailly and Mark Adler 75 | 76 | License: BSDish [include/util/compress/zlib/zlib.h] 77 | 78 | *** 79 | 80 | Location: {src,include}/util/regexp 81 | 82 | Author: Philip Hazel 83 | 84 | License: BSDish [src/util/regexp/doc/LICENCE] 85 | 86 | *** 87 | 88 | Location: {src,include}/misc/xmlwrapp 89 | 90 | Author: Peter J Jones at al. 
[src/misc/xmlwrapp/AUTHORS] 91 | 92 | License: BSDish [src/misc/xmlwrapp/LICENSE] 93 | -------------------------------------------------------------------------------- /docs/_posts/2016-12-29-fasta.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: page 3 | category: cook 4 | title: "Reads in FASTA or FASTQ" 5 | order: 3 6 | --- 7 | 8 | If your reads are in a local FASTA file use this command line: 9 | 10 | ``` 11 | magicblast -query reads.fa -db my_reference 12 | ``` 13 | 14 | If your reads are in a local FASTQ file use this command line: 15 | 16 | ``` 17 | magicblast -query reads.fastq -db my_reference -infmt fastq 18 | ``` 19 | -------------------------------------------------------------------------------- /docs/_posts/2016-12-29-multithreading.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: page 3 | category: cook 4 | title: "Multi-threading" 5 | order: 6 6 | --- 7 | 8 | To use multiple CPUs, specify the maximal number of threads with the 9 | ```-num_threads``` parameter: 10 | 11 | ``` 12 | magicblast -query reads.fa -db genome -num_threads 10 13 | ``` 14 | -------------------------------------------------------------------------------- /docs/_posts/2016-12-29-output.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: page 3 | category: doc 4 | title: "Output" 5 | order: 1 6 | --- 7 | 8 | By default, results are provided to the standard output in the SAM format. 9 | Use ```-out ``` option to redirect output to a file. 10 | Use ```-outfmt``` option to specify the output format: 11 | 12 | ```-outfmt sam``` : SAM format (default) 13 | 14 | ```-outfmt tabular``` : exports a simple tab delimited format defined below. 
15 | 16 | The output can be also compressed, using the ```-gzo``` flag: 17 | 18 | ``` 19 | magicblast -query reads.fa -db genome -out output.gz -gzo 20 | ``` 21 | 22 | #### Unaligned reads 23 | 24 | By default Magic-BLAST reports unaligned reads, with unmapped bit (4) set in SAM flags or '*' in the second column of the tabular output. If you do not want unmapped reads reported, use ```-no_unaligned``` option: 25 | 26 | ``` 27 | magicblast -query reads -db reference -no_unaligned 28 | ``` 29 | 30 | 31 | #### Tabular output format 32 | 33 | The tabular output format shows one alignment per line with these tab 34 | delimited fields: 35 | 36 | 1. Query/read sequence identifier 37 | 2. Reference sequence identifier 38 | 3. Percent identity of the alignment 39 | 4. Not used 40 | 5. Not used 41 | 6. Not used 42 | 7. Alignment start position on the query sequence 43 | 8. Alignment stop position on the query sequence 44 | 9. Alignment start position on the reference sequence 45 | 10. Alignment stop position on the reference sequence 46 | 11. Not used 47 | 12. Not used 48 | 13. Alignment score 49 | 14. Query strand 50 | 15. Reference sequence strand 51 | 16. Query/read length 52 | 17. Alignment as extended BTOP string 53 | This is the same BTOP string as in BLAST tabular output with a 54 | few extensions: 55 | - a number represents this many matches, 56 | - two bases represent a mismatch and show query and reference base, 57 | - base and gap or gap and base, show a gap in query or reference, 58 | - ^\^ represents an intron of this number of bases, 59 | - \_\\_ represents an insertion (gap in reference) of this number of bases, 60 | - %\% represents a deletion (gap in read) of this number of bases, 61 | - (\) shows number of query bases that are shared between 62 | two parts of a spliced alignment; used when proper splice sites 63 | were not found 64 | 18. Number of different alignments reported for this query sequence 65 | 19. Not used 66 | 20. 
Compartment - a unique identifier for all alignments that belong to 67 | a single fragment. These can be two alignments for a pair of reads 68 | or alignments to exons that were not spliced. 69 | 21. Reverse complemented unaligned query sequence from the beginning 70 | of the query, or '-' if the query aligns to the left edge 71 | 22. Unaligned sequence at the end of the query, or '-' 72 | 23. Reference sequence identifier where the mate is aligned, if 73 | different from the identifier in column 2, otherwise '-' 74 | 24. Alignment start position on the reference sequence for the mate, or 75 | '-' if no alignment for the mate was found; a negative number 76 | denotes a divergent pair 77 | 25. Composite alignment score for all exons that belong to the fragment 78 | -------------------------------------------------------------------------------- /docs/_posts/2016-12-29-paired.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: page 3 | category: cook 4 | title: "Paired reads" 5 | order: 4 6 | --- 7 | 8 | For SRA accessions Magic-BLAST determines whether reads are paired and maps them appropriately. 9 | 10 | For reads in FASTA and FASTQ files paired reads can either be in a single file, or two files. 11 | 12 | ##### Single file 13 | 14 | For paired reads presented as successive entries in a single FASTA or FASTQ 15 | file, i.e. 
read 1 and 2 of fragment 1, then read 1 and 2 of fragment 2, 16 | etc., simply add the parameter ```-paired```: 17 | 18 | ``` 19 | magicblast -query reads.fa -db genome -paired 20 | ``` 21 | 22 | or 23 | 24 | ``` 25 | magicblast -query reads.fastq -db genome -paired -infmt fastq 26 | ``` 27 | 28 | ##### Two files 29 | 30 | For paired reads presented in two parallel files, use these options: 31 | 32 | ``` 33 | magicblast -query reads.fa -query_mate mates.fa -db genome 34 | ``` 35 | 36 | or 37 | 38 | ``` 39 | magicblast -query reads.fastq -query_mate mates.fastq -db genome -infmt fastq 40 | ``` 41 | -------------------------------------------------------------------------------- /docs/_posts/2016-12-29-rnavsdna.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: page 3 | category: cook 4 | title: "RNA vs DNA" 5 | order: 5 6 | --- 7 | 8 | ##### Splicing 9 | 10 | By default, Magic-BLAST aligns RNA reads to a genome and reports spliced 11 | alignments, possibly spanning several exons. To disable spliced alignments, 12 | use the ```-splice F``` option. 13 | 14 | For example, mapping RNA or DNA reads to a bacterial genome: 15 | 16 | ``` 17 | magicblast -sra SRR5647973 -db salmonella_enterica_genome -splice F 18 | ``` 19 | 20 |   21 | 22 | ##### Transcriptome 23 | 24 | Use the ```-reftype transcriptome``` option, to 25 | map reads to a transcriptome database. For example: 26 | 27 | ``` 28 | magicblast -query reads.fa -db my_transcripts -reftype transcriptome 29 | ``` 30 | 31 | The ```-reftype transcriptome``` option is a shorthand for ```-splice F -limit_lookup F```, so the above call is equivalent to: 32 | 33 | ```magicblast -query reads.fa -db my_transcripts -splice F -limit_lookup F``` 34 | 35 | Magic-Blast finds alignments between a read and a genome based on initial 36 | common word in both. Many genomes contain interspersed repeats that make mapping much more time consuming. 
To make mapping faster we disregard words that appear too often in the reference. This is not desirable when mapping to transcripts, because a transcript with many variants could be considered a repeat. The ```-limit_lookup F``` option turns this functionality off. 37 | -------------------------------------------------------------------------------- /docs/_posts/2016-12-29-sra.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: page 3 | category: cook 4 | title: "Use NCBI SRA repository" 5 | order: 2 6 | --- 7 | 8 | If you are mapping an experiment from [NCBI Sequence Read Archive](https://www.ncbi.nlm.nih.gov/sra), use ```-sra ``` option: 9 | 10 | ``` 11 | magicblast -sra -db 12 | ``` 13 | 14 |   15 | 16 | 17 | For example: 18 | 19 | ``` 20 | magicblast -sra SRR1237994 -db my_reference 21 | ``` 22 | 23 |   24 | 25 | 26 | To map several SRA runs use comma-separated list of accessions: 27 | 28 | ``` 29 | magicblast -sra SRR1237994,SRR1237993 -db my_reference 30 | ``` 31 | or a list of accessions in text file, one per line: 32 | ``` 33 | echo SRR1237994 >accessions 34 | echo SRR1237993 >>accessions 35 | magicblast -sra_batch accessions -db my_reference 36 | ``` 37 | 38 |   39 | 40 | See [Create BLAST database](../cook/blastdb.html) to see how to create a BLAST database. 41 | 42 | -------------------------------------------------------------------------------- /docs/_posts/2016-12-29-tutorial.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: page 3 | category: doc 4 | title: "How Magic-BLAST works" 5 | order: 0 6 | --- 7 | #### Index vs. BLAST database 8 | 9 | Unlike most mapping tools, Magic-BLAST does not build an index of a genome and instead it builds an index of a batch of reads and scans a BLAST database for potential matches. BLAST database can be created from a FASTA file in seconds or minutes instead of hours for most indices. 
It also allows for mapping to or searching arbitrarily large collections of sequences. 10 | 11 | Magic-BLAST can also work with a genome as a FASTA file. However it is not recommended for more than a few million bases, because mapping to a FASTA file is much slower than to a BLAST database. 12 | 13 | #### Seed and extend 14 | 15 | Magic-BLAST works similarly to other BLAST programs. First it finds a seed alignment, an exact 16-base match and extends alignment to the left and right. Shorter alignments are combined over introns if splice signals are found. For paired reads, the cumulative pair score is used to select the best mapping. 16 | 17 | #### Database word counts 18 | 19 | To avoid mapping to repeats Magic-BLAST scans the database and counts 16-base words. Those that appear more than 10 times are not extended. 20 | 21 | This functionality can be turned off with ```-limit_lookup F``` option and should not be used when mapping to transcripts. 22 | 23 | 24 | -------------------------------------------------------------------------------- /docs/_posts/2017-09-13-release.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: page 3 | category: release 4 | title: "Release Notes" 5 | order: 0 6 | --- 7 | 8 | #### Magic-BLAST 1.7.2: April 19, 2023 9 | 10 | ##### Bug fixes 11 | * Fix for missing libraries in MacOs 12 | 13 | 14 | --- 15 | 16 | 17 | #### Magic-BLAST 1.7.1: February 17, 2023 18 | 19 | ##### Bug fixes 20 | * Fix for the missing nghttp2.dll in Windows 21 | * Fix for incorrectly reported version number 22 | 23 | 24 | --- 25 | 26 | 27 | #### Magic-BLAST 1.7.0: October 14, 2022 28 | 29 | ##### New features 30 | * Magic-BLAST now reports read quality scores in SAM output if they were provided in the input. 31 | * Magic-BLAST now reports Mapping Quality (MAPQ) score in the SAM output. The scores are 60 for a single alignment, and *round*(-10 *log*_10(1 - 1 / num_alignments)) for more than one alignment. 
32 | * Magic-BLAST now reports splice signals in the tabular format BTOP string. To get the old BTOP format without splice signals, set the environment variable ```BTOP_NO_SPLICE_SIGNALS```. 33 | * New ```-tag ``` option to add a user label to each alignment, in the last column of the tabular report, and ```XU:Z:``` tag in the SAM format. 34 | 35 | 36 | ##### Bug fixes 37 | * Fixed incorrect SAM flags for unaligned paired reads 38 | * Only correct values (2 -- 255) are allowed for ```-max_db_word_count``` parameter. 39 | * Fixes for reporting unique paired alignments. 40 | 41 | 42 | --- 43 | 44 | 45 | #### Magic-BLAST 1.6.0: May 6, 2021 46 | 47 | ##### New features 48 | * Usage reporting - Help improve Magic-BLAST by sharing limited information about your search. [Details on the information collected, how it is used, and how to opt-out](https://www.ncbi.nlm.nih.gov/books/NBK563686/). 49 | * Reads from NCBI SRA repository are streamed from the cloud when ```-sra``` or ```-sra_batch``` option is used. [More details here]({{ site.baseurl }}{% link _posts/2021-05-06-cloud-sra.md %}). 50 | * NCBI taxonomy IDs are reported in SAM output if they are present in the target BLAST database. 51 | * Unaligned reads can be reported separately from the aligned ones, using ```-out_unaligned ``` option. One can also select the format with ```-unaligned_fmt``` option. Choices are SAM, tabular, and FASTA. The default format is the same as one for the main report (SAM or tabular, selected with ```-outfmt``` option). 52 | * A file with list of SRA accessions can be provided to Magic-BLAST via the ```-sra_batch``` option. 53 | 54 | 55 | ##### Bug fixes 56 | * Magic-BLAST correctly reports database sequence accessions for BLAST databases without gis. 57 | * Fixed discontinuity in adaptive score threshold function. 
Below are the new thresholds: 58 | 59 | Read length (r) | Score threshold 60 | --------------|:---------------: 61 | r <= 20 | r 62 | 20 < r <= 34 | 20 63 | 50 < r < 200 | 0.6 * r 64 | r >= 200 | 120 65 | 66 | 67 | --- 68 | 69 | 70 | #### Magic-BLAST 1.5.0: August 22, 2019 71 | 72 | ##### New features 73 | * Support for the new BLAST database version (BLASTDBv5) that allows for limiting search by taxonomy ([more information about database version 5 here](https://ftp.ncbi.nlm.nih.gov/blast/db/v5/blastdbv5.pdf)) 74 | * New option ```-md_tag```: SAM MD tag is no longer reported by default. To have it included in SAM report, use ```-md_tag``` option. 75 | * New symbol in tabular report BTOP string: ```%%``` that represents a deletion (gap in read) of this number of bases. 76 | * New adaptive alignment score threshold, calculated based on read length (score thresholds below). This is the default behavior. Users can change alignment score threshold with the ```-score``` option and set it either to a constant or a linear function of read length. 77 | 78 | Read length (r) | Score threshold 79 | --------------|:---------------: 80 | r <= 20 | r 81 | 20 < r <= 30 | 20 82 | 30 < r <= 50 | r - 10 83 | 50 < r < 200 | 0.6 * r 84 | r >= 200 | 120 85 | 86 | 87 | ##### Improvements 88 | * Improved multi-threading and run time. 89 | * Improved alignment heuristics that allow for larger error rates and better alignments for long reads. 90 | * Magic-BLAST aligns nanopore reads. 91 | * NCBI accessions instead of gis are reported in SAM and tabular reports. 92 | * Short, low-complexity alignments are no longer reported. 93 | * The default value for ```-max_db_word_count``` parameter was lowered from 60 to 30. 16-base words that appear in the genome more than this number of times will be filtered. 94 | * The maximum insert size for properly aligned pairs is 1,000,000 bases for spliced alignments (RNA-seq) and 100,000 bases for non-spliced alignments (genomic). 
The alignments for pairs with larger insert size are still reported, but SAM flag for properly aligned pair is not set. 95 | 96 | 97 | ##### Bug fixes 98 | * SAM MD tag reports correct number of matching bases around an intron. 99 | * Using ```-max_db_word_count``` option no longer requires explicit use of ```-limit_lookup``` option. 100 | * Magic-BLAST no longer crashes with an empty sequence in FASTQ file. 101 | 102 | 103 | 104 | --- 105 | 106 | 107 | 108 | #### Magic-BLAST 1.4.0: August 10, 2018 109 | 110 | ##### New features 111 | * New option: ```-no_discordant``` to report only concordant read pairs 112 | * Report strand-specific alignments with ```-fr``` and ```-rf``` flags for forward-reversed and reversed-forward 113 | * New option to control repeat filtering: ```-max_db_word_count```. 16-base words that appear in the genome more than this number of times will be filtered (default is 60). 114 | 115 | 116 | ##### Improvements 117 | * Improved sensitivity: count for frequent database words to be filtered was increased to 60 (used to be 10). This threshold can be changed with the ```-max_db_word_count``` option. 118 | * Non-canonical splice signals now require longer alignments on the exon, with score at least 50 on both sides of an intron. 
119 | * More informative error messages for SRA access 120 | * Much better handling of non-cannonical splice sites and compositionally biased genomes 121 | 122 | 123 | ##### Bug fixes 124 | * Alignments no longer stop prematurely 125 | * Fix for not returning unmapped reads when none is aligned 126 | * Magic-BLAST no longer reports zero-length introns 127 | * Parameters of the score threshold as a function of read length are no longer 100 smaller than specified by the user 128 | 129 | 130 | 131 | --- 132 | 133 | 134 | #### Magic-BLAST 1.3.0: September 15, 2017 135 | 136 | ##### New features 137 | * The alignment cutoff score (```-score``` option) can be expressed as either a constant or a function of read length in this format: L,b,a for a * length + b 138 | * Maximum edit distance cutoff for alignments can be specified with ```-max_edit_dist``` option 139 | * SRA caching is now turned off by default and can be turned on with the ```-sra_cache``` flag. When accessing sequences in NCBI SRA repository the data can be cached in a local file so that it is not downloaded over the network again when reused later. It may result in very large local files and is only needed if you use SRA accessions multiple times, have very limited network bandwidth and a large disk space. 
140 | 141 | 142 | ##### Improvements 143 | * Unmapped reads are reported in the SAM and Tabular reports, use ```-no_unaligned``` option to not have unaligned reads reported 144 | * The XS tag (used by transcript assemblers) is now reported in SAM output 145 | * The score threshold (```-score``` option) now applies to the whole spliced alignment 146 | * The query batch size (number of reads processed at a time) is now controlled with environment variable BATCH_SIZE expressed in cumulative number of bases 147 | * The default mismatch and gap extension penalties are now set to 4 148 | * Improved sensitivity and run time 149 | 150 | 151 | ##### Bug fixes 152 | * Read ids for pairs are printed properly in the SAM format (no ".1" ".2" or "/1" "/2") 153 | * The secondary alignment bit (256) is set in SAM flags 154 | * Maximum intron length option ```-max_intron_length``` works properly 155 | 156 | 157 | --- 158 | 159 | 160 | #### Magic-BLAST 1.2.0: February 17, 2017 161 | 162 | ##### Improvements 163 | 164 | * Improved multi-threading for larger genomes 165 | * Improved splice site detection 166 | 167 | ##### Bug fixes 168 | 169 | * Magic-BLAST now works with multiple SRA accessions 170 | * Fixed the macOS dmg installer that used to remove BLAST+ binaries 171 | * The -seqidlist option is no longer ignored 172 | 173 | 174 | --- 175 | 176 | 177 | #### Magic-BLAST 1.1.0: November 4, 2016 178 | 179 | ##### Improvements 180 | 181 | * -sra option connects to NCBI via HTTPS 182 | * Results are formatted with 'bare' accessions 183 | * Tabular output includes a header with column titles 184 | 185 | ##### Bug fixes: 186 | 187 | * Fixed SAM flag values 188 | 189 | 190 | --- 191 | 192 | 193 | #### Magic-BLAST 1.0.0: August 19, 2016 194 | 195 | * First release 196 | 197 | -------------------------------------------------------------------------------- /docs/_posts/2017-11-14-download.md: -------------------------------------------------------------------------------- 1 | --- 2 | 
layout: page 3 | category: doc 4 | title: "Download" 5 | order: 2 6 | --- 7 | 8 | #### Source code and Linux, MacOs, and Windows binaries 9 | 10 | Download source code and Linux, MacOs, and Windows binaries from [https://ftp.ncbi.nlm.nih.gov/blast/executables/magicblast/LATEST](https://ftp.ncbi.nlm.nih.gov/blast/executables/magicblast/LATEST) 11 | 12 |   13 | 14 | #### Bioconda 15 | 16 | You can also install Magic-BLAST from [Bioconda](https://anaconda.org/bioconda/magicblast): 17 | 18 | ``` 19 | conda install -c bioconda magicblast 20 | ``` 21 | 22 |   23 | 24 | #### Docker 25 | 26 | We also provide `ncbi/magicblast` docker image: 27 | 28 | ``` 29 | docker pull ncbi/magicblast 30 | ``` 31 | 32 | For more information about `ncbi/magicblast` image see [https://github.com/ncbi/docker/tree/master/magicblast](https://github.com/ncbi/docker/tree/master/magicblast) 33 | 34 | Magic-BLAST is also a part of `ncbi/blast-workbench` image that contains additional tools: [BLAST+ package](https://www.ncbi.nlm.nih.gov/books/NBK279690/) (including `makeblastdb` program to create a BLAST database), and [EDirect](https://dataguide.nlm.nih.gov/edirect/documentation.html): 35 | 36 | ``` 37 | docker pull ncbi/blast-workbench 38 | ``` 39 | 40 | For more information about `ncbi/blast-workbench` image see [https://github.com/ncbi/docker/tree/master/blast-workbench](https://github.com/ncbi/docker/tree/master/blast-workbench) 41 | 42 | -------------------------------------------------------------------------------- /docs/_posts/2020-05-15-license.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: page 3 | category: dev 4 | title: "License" 5 | order: 0 6 | --- 7 | 8 | Please refer to the [license file](https://www.ncbi.nlm.nih.gov/IEB/ToolBox/CPP_DOC/lxr/source/scripts/projects/magicblast/LICENSE) for license information. 
9 | -------------------------------------------------------------------------------- /docs/_posts/2020-07-14-feedback.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: page 3 | category: doc 4 | title: "Feedback" 5 | order: 3 6 | --- 7 | 8 | For feedback, bug reports, questions, suggestions, or feature requests, please 9 | - e-mail , or 10 | - open an issue in 11 | 12 | 13 | -------------------------------------------------------------------------------- /docs/_posts/2021-05-06-cloud-sra.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: page 3 | category: cook 4 | title: "Access SRA reads in the cloud" 5 | order: 7 6 | --- 7 | 8 | NCBI provides SRA data in the cloud to support large scale hyper parallel data analyses. Starting with version 1.6.0 Magic-BLAST downloads SRA data from the cloud providers rather than NCBI servers. Benefits include uninterrupted downloads, faster download speeds, and huge bandwidth for parallel downloads to multiple running processes or cloud instances. For more information see [SRA in the Cloud](https://www.ncbi.nlm.nih.gov/sra/docs/sra-cloud). 9 | 10 | To take advantage of the increased download bandwidth when running Magic-BLAST in the cloud, please configure SRA downloads, using ```vdb-config -i``` program from [SRA Toolkit](https://github.com/ncbi/sra-tools). 
11 | 12 | Here are webinars prepared by NCBI SRA group for how to set up a cloud instance and configure SRA downloads: 13 | * [Download SRA reads from Amazon Web Services (AWS)](https://youtu.be/rjjrHnZfymU?list=PLH-TjWpFfWrt5MNqU7Jvsk73QefO3ADwD) 14 | * [Download SRA reads from Google Cloud Platform (GCP)](https://youtu.be/RNmBINl0bxc?list=PLH-TjWpFfWrt5MNqU7Jvsk73QefO3ADwD) 15 | -------------------------------------------------------------------------------- /docs/css/main.css: -------------------------------------------------------------------------------- 1 | body { 2 | font-weight: 400; 3 | text-shadow: 0 1px 1px rgba(255, 255, 255, 0.7); 4 | } 5 | 6 | pre, code, pre code { 7 | border: none; 8 | border-radius: 0; 9 | background-color: #f9f9f9; 10 | font-size: 0.85em; 11 | } 12 | 13 | .highlight { 14 | background-color: #f9f9f9; 15 | } 16 | 17 | pre { 18 | font-size: 1em; 19 | } 20 | 21 | code { 22 | color: inherit; 23 | } 24 | 25 | #header { 26 | border-bottom: 1px solid #eee; 27 | margin-bottom: 20px; 28 | } 29 | 30 | #header a:hover { 31 | text-decoration: none; 32 | } 33 | 34 | #footer { 35 | margin: 20px 0; 36 | font-size: 0.85em; 37 | color: #999; 38 | text-align: center; 39 | } 40 | 41 | #content > .page-header:first-child { 42 | margin-top: 0; 43 | } 44 | 45 | #content > .page-header:first-child h2 { 46 | margin-top: 0; 47 | } 48 | 49 | 50 | #navigation { 51 | font-size: 0.9em; 52 | } 53 | 54 | #navigation li a { 55 | padding-left: 10px; 56 | padding-right: 10px; 57 | } 58 | 59 | #navigation .nav-header { 60 | padding-left: 0; 61 | padding-right: 0; 62 | } 63 | 64 | body.rtl { 65 | direction: rtl; 66 | } 67 | 68 | body.rtl #header .brand { 69 | float: right; 70 | margin-left: 5px; 71 | } 72 | body.rtl .row-fluid [class*="span"] { 73 | float: right !important; 74 | margin-left: 0; 75 | margin-right: 2.564102564102564%; 76 | } 77 | body.rtl .row-fluid [class*="span"]:first-child { 78 | margin-right: 0; 79 | } 80 | 81 | body.rtl ul, body.rtl ol { 82 | 
margin: 0 25px 10px 0; 83 | } 84 | 85 | table { 86 | margin-bottom: 1rem; 87 | border: 1px solid #e5e5e5; 88 | border-collapse: collapse; 89 | } 90 | 91 | td, th { 92 | padding: .25rem .5rem; 93 | border: 1px solid #e5e5e5; 94 | } 95 | -------------------------------------------------------------------------------- /docs/css/syntax.css: -------------------------------------------------------------------------------- 1 | .highlight .hll { background-color: #ffffcc } 2 | .highlight { background: #ffffff; } 3 | .highlight .c { color: #888888 } /* Comment */ 4 | .highlight .err { color: #a61717; background-color: #e3d2d2 } /* Error */ 5 | .highlight .k { color: #008800; font-weight: bold } /* Keyword */ 6 | .highlight .cm { color: #888888 } /* Comment.Multiline */ 7 | .highlight .cp { color: #cc0000; font-weight: bold } /* Comment.Preproc */ 8 | .highlight .c1 { color: #888888 } /* Comment.Single */ 9 | .highlight .cs { color: #cc0000; font-weight: bold; background-color: #fff0f0 } /* Comment.Special */ 10 | .highlight .gd { color: #000000; background-color: #ffdddd } /* Generic.Deleted */ 11 | .highlight .ge { font-style: italic } /* Generic.Emph */ 12 | .highlight .gr { color: #aa0000 } /* Generic.Error */ 13 | .highlight .gh { color: #333333 } /* Generic.Heading */ 14 | .highlight .gi { color: #000000; background-color: #ddffdd } /* Generic.Inserted */ 15 | .highlight .go { color: #888888 } /* Generic.Output */ 16 | .highlight .gp { color: #555555 } /* Generic.Prompt */ 17 | .highlight .gs { font-weight: bold } /* Generic.Strong */ 18 | .highlight .gu { color: #666666 } /* Generic.Subheading */ 19 | .highlight .gt { color: #aa0000 } /* Generic.Traceback */ 20 | .highlight .kc { color: #008800; font-weight: bold } /* Keyword.Constant */ 21 | .highlight .kd { color: #008800; font-weight: bold } /* Keyword.Declaration */ 22 | .highlight .kn { color: #008800; font-weight: bold } /* Keyword.Namespace */ 23 | .highlight .kp { color: #008800 } /* Keyword.Pseudo */ 24 | 
.highlight .kr { color: #008800; font-weight: bold } /* Keyword.Reserved */ 25 | .highlight .kt { color: #888888; font-weight: bold } /* Keyword.Type */ 26 | .highlight .m { color: #0000DD; font-weight: bold } /* Literal.Number */ 27 | .highlight .s { color: #dd2200; background-color: #fff0f0 } /* Literal.String */ 28 | .highlight .na { color: #336699 } /* Name.Attribute */ 29 | .highlight .nb { color: #003388 } /* Name.Builtin */ 30 | .highlight .nc { color: #bb0066; font-weight: bold } /* Name.Class */ 31 | .highlight .no { color: #003366; font-weight: bold } /* Name.Constant */ 32 | .highlight .nd { color: #555555 } /* Name.Decorator */ 33 | .highlight .ne { color: #bb0066; font-weight: bold } /* Name.Exception */ 34 | .highlight .nf { color: #0066bb; font-weight: bold } /* Name.Function */ 35 | .highlight .nl { color: #336699; font-style: italic } /* Name.Label */ 36 | .highlight .nn { color: #bb0066; font-weight: bold } /* Name.Namespace */ 37 | .highlight .py { color: #336699; font-weight: bold } /* Name.Property */ 38 | .highlight .nt { color: #bb0066; font-weight: bold } /* Name.Tag */ 39 | .highlight .nv { color: #336699 } /* Name.Variable */ 40 | .highlight .ow { color: #008800 } /* Operator.Word */ 41 | .highlight .w { color: #bbbbbb } /* Text.Whitespace */ 42 | .highlight .mf { color: #0000DD; font-weight: bold } /* Literal.Number.Float */ 43 | .highlight .mh { color: #0000DD; font-weight: bold } /* Literal.Number.Hex */ 44 | .highlight .mi { color: #0000DD; font-weight: bold } /* Literal.Number.Integer */ 45 | .highlight .mo { color: #0000DD; font-weight: bold } /* Literal.Number.Oct */ 46 | .highlight .sb { color: #dd2200; background-color: #fff0f0 } /* Literal.String.Backtick */ 47 | .highlight .sc { color: #dd2200; background-color: #fff0f0 } /* Literal.String.Char */ 48 | .highlight .sd { color: #dd2200; background-color: #fff0f0 } /* Literal.String.Doc */ 49 | .highlight .s2 { color: #dd2200; background-color: #fff0f0 } /* Literal.String.Double */ 
50 | .highlight .se { color: #0044dd; background-color: #fff0f0 } /* Literal.String.Escape */ 51 | .highlight .sh { color: #dd2200; background-color: #fff0f0 } /* Literal.String.Heredoc */ 52 | .highlight .si { color: #3333bb; background-color: #fff0f0 } /* Literal.String.Interpol */ 53 | .highlight .sx { color: #22bb22; background-color: #f0fff0 } /* Literal.String.Other */ 54 | .highlight .sr { color: #008800; background-color: #fff0ff } /* Literal.String.Regex */ 55 | .highlight .s1 { color: #dd2200; background-color: #fff0f0 } /* Literal.String.Single */ 56 | .highlight .ss { color: #aa6600; background-color: #fff0f0 } /* Literal.String.Symbol */ 57 | .highlight .bp { color: #003388 } /* Name.Builtin.Pseudo */ 58 | .highlight .vc { color: #336699 } /* Name.Variable.Class */ 59 | .highlight .vg { color: #dd7700 } /* Name.Variable.Global */ 60 | .highlight .vi { color: #3333bb } /* Name.Variable.Instance */ 61 | .highlight .il { color: #0000DD; font-weight: bold } /* Literal.Number.Integer.Long */ 62 | -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: page 3 | title: "NCBI Magic-BLAST Documentation" 4 | --- 5 | 6 | Magic-BLAST is a tool for mapping large next-generation RNA or DNA sequencing 7 | runs against a whole genome or transcriptome. Each alignment optimizes 8 | a composite score, taking into account simultaneously the two reads of 9 | a pair, and in case of RNA-seq, locating the candidate introns and adding 10 | up the score of all exons. This is very different from other versions of 11 | BLAST, where each exon is scored as a separate hit and read-pairing is 12 | ignored. 
13 | 14 | Magic-BLAST incorporates within the NCBI BLAST code framework ideas 15 | developed in the NCBI Magic pipeline, in particular hit extensions by 16 | local walk and jump [(http://www.ncbi.nlm.nih.gov/pubmed/26109056)](http://www.ncbi.nlm.nih.gov/pubmed/26109056), and recursive clipping of 17 | mismatches near the edges of the reads, which avoids accumulating 18 | artefactual mismatches near splice sites and is needed to distinguish 19 | short indels from substitutions near the edges. 20 | 21 | More details about the algorithm and comparison with other similar tools are published here: 22 | 23 | Boratyn GM, Thierry-Mieg J, Thierry-Mieg D, Busby B, Madden TL. (2019) **Magic-BLAST, an accurate RNA-seq aligner for long and short reads.** *BMC Bioinformatics* 20: 405. \[[article](https://bmcbioinformatics.biomedcentral.com/articles/10.1186/s12859-019-2996-x)\] 24 | 25 | We call the whole next generation run (from Illumina, Roche-454, ABI, or 26 | another sequencing platform excluding SOLiD), a query. The input reads may 27 | be provided as SRA accession or a file in a SRA, FASTA, and FASTQ format. 28 | Read pairs can be presented as parallel files, or as successive reads in a 29 | single file. 30 | 31 | The reference genome or transcriptome can be given as a BLAST database 32 | or a FASTA file. It is preferable to use BLAST database for large genomes, 33 | such as human, or transcript collections, such as all of RefSeq, Ensembl, 34 | or AceView. See here on [how to create a BLAST database](/magicblast/cook/blastdb.html). 35 | 36 | The full list of options is listed when you use ```-help``` option. 37 | 38 | Thank you for trying this tool and providing us with feedback. Please, 39 | let us know of any desired enhancement, problem or difficulty. 40 | 41 | E-mail blast-help@ncbi.nlm.nih.gov with questions or comments. 
42 | -------------------------------------------------------------------------------- /magicblast-tools/.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | *~ -------------------------------------------------------------------------------- /magicblast-tools/README.md: -------------------------------------------------------------------------------- 1 | Magic-BLAST tools 2 | === 3 | 4 | A few scripts to postprocess SAM/BAM files. 5 | 6 | 7 | Features 8 | -------- 9 | * get-introns.py - collect intron locations. 10 | * get-transcripts.py - assemble transcript sequences from a genome using a GFF or GTF annotation. 11 | * combine-genome-transcripts.py - iterate over read alignments to a genome and transcripts, select better scoring alignments, remap transcript alignments to the genome, and save them in a SAM or BAM file. 12 | 13 | 14 | ## Dependencies 15 | The programs in this directory work with Python 3.6+ and require these packages: 16 | * pysam 17 | * pyfaidx 18 | * pandas 19 | 20 | To install them run: 21 | ``` 22 | pip install -r requirements.txt 23 | ``` 24 | 25 | 26 | Usage 27 | ----- 28 | 29 | ### Get intron locations from a SAM/BAM file 30 | ``` 31 | get-introns.py --bam --gff --introns 32 | ``` 33 | 34 | The output is a tab-delimited file with intron locations marked as KNOWN or NEW. 35 | 36 | 37 | ### Assemble transcript sequences from a genome 38 | ``` 39 | get-transcripts.py --genome --gff 40 | ``` 41 | The transcripts will be written to the standard output.
42 | 43 | 44 | ### Combine alignments to genome and transcripts 45 | ``` 46 | combine-genome-transcripts.py --to-genome --to-transcripts --gff --out [-b] 47 | ``` 48 | -------------------------------------------------------------------------------- /magicblast-tools/base.py: -------------------------------------------------------------------------------- 1 | #============================================================================ 2 | # 3 | # PUBLIC DOMAIN NOTICE 4 | # National Center for Biotechnology Information 5 | # 6 | # This software/database is a "United States Government Work" under the 7 | # terms of the United States Copyright Act. It was written as part of 8 | # the author's official duties as a United States Government employee and 9 | # thus cannot be copyrighted. This software/database is freely available 10 | # to the public for use. The National Library of Medicine and the U.S. 11 | # Government have not placed any restriction on its use or reproduction. 12 | # 13 | # Although all reasonable efforts have been taken to ensure the accuracy 14 | # and reliability of the software and data, the NLM and the U.S. 15 | # Government do not and cannot warrant the performance or results that 16 | # may be obtained by using this software or data. The NLM and the U.S. 17 | # Government disclaim all warranties, express or implied, including 18 | # warranties of performance, merchantability or fitness for any particular 19 | # purpose. 20 | # 21 | # Please cite the author in any work or product based on this material. 
22 | # 23 | # =========================================================================== 24 | # 25 | # Author: Greg Boratyn boratyng@ncbi.nlm.nih.gov 26 | # 27 | # --------------------------------------------------------------------------- 28 | 29 | """Basic definitions used in other libraries""" 30 | 31 | from collections import namedtuple 32 | 33 | Intron = namedtuple('Intron', ['seqid', 'start', 'end', 'strand']) 34 | Exon = namedtuple('Exon', ['seqid', 'start', 'end', 'strand']) 35 | mRNA = namedtuple('mRNA', ['seqid', 'start', 'end', 'strand', 'exons', 'attributes']) 36 | -------------------------------------------------------------------------------- /magicblast-tools/combine-genome-transcripts.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | #============================================================================ 3 | # 4 | # PUBLIC DOMAIN NOTICE 5 | # National Center for Biotechnology Information 6 | # 7 | # This software/database is a "United States Government Work" under the 8 | # terms of the United States Copyright Act. It was written as part of 9 | # the author's official duties as a United States Government employee and 10 | # thus cannot be copyrighted. This software/database is freely available 11 | # to the public for use. The National Library of Medicine and the U.S. 12 | # Government have not placed any restriction on its use or reproduction. 13 | # 14 | # Although all reasonable efforts have been taken to ensure the accuracy 15 | # and reliability of the software and data, the NLM and the U.S. 16 | # Government do not and cannot warrant the performance or results that 17 | # may be obtained by using this software or data. The NLM and the U.S. 18 | # Government disclaim all warranties, express or implied, including 19 | # warranties of performance, merchantability or fitness for any particular 20 | # purpose. 
21 | # 22 | # Please cite the author in any work or product based on this material. 23 | # 24 | # =========================================================================== 25 | # 26 | # Author: Greg Boratyn boratyng@ncbi.nlm.nih.gov 27 | # 28 | # --------------------------------------------------------------------------- 29 | 30 | """Compare mapping to genome with mapping to transcripts""" 31 | 32 | import sam 33 | import pysam 34 | import gff 35 | import gtf 36 | import argparse 37 | import bisect 38 | import sys 39 | import contextlib 40 | 41 | 42 | def is_equal(query_name_1, query_name_2): 43 | """Determine whether two read ids represent the same read""" 44 | q1 = query_name_1 45 | q2 = query_name_2 46 | if q1[-1] in ['a', 'b']: 47 | q1 = q1[:-1] 48 | if q2[-1] in ['a', 'b']: 49 | q2 = q2[:-1] 50 | 51 | return q1 == q2 52 | 53 | def get_aligns(stream): 54 | """Get all alignments for a single reads. The SAM or BAM file must be 55 | sorted by read name""" 56 | 57 | result = [] 58 | for align in stream: 59 | if len(result) == 0 or \ 60 | is_equal(result[0].query_name, align.query_name): 61 | result.append(align) 62 | else: 63 | yield result 64 | result = [align] 65 | 66 | if len(result) > 0: 67 | yield result 68 | 69 | 70 | def index_aligns(aligns): 71 | """Index alignments by reference name and reference start position""" 72 | result = {} 73 | for i in aligns: 74 | 75 | k = (i.reference_name, i.reference_start) 76 | result[k] = i 77 | 78 | return result 79 | 80 | 81 | def get_score(aligns): 82 | """Get alignment score, composite for paired reads""" 83 | # for single reads return score of the first alignment 84 | # we assume that only top scoring alignments are reported 85 | if aligns[0].flag & 1 == 0: 86 | return aligns[0].get_tag('AS') if aligns[0].flag & 4 == 0 else 0 87 | 88 | # for paired reads report sum of scores for properly paired alignments 89 | # (bit 2 set), and single read score for other cases 90 | scores = [] 91 | forward = {} 92 | reverse = {} 93 
| for i in aligns: 94 | # if alignment is not properly paired, save score 95 | if i.flag & 2 == 0: 96 | scores.append(i.get_tag('AS') if i.flag & 4 == 0 else 0) 97 | continue 98 | 99 | # for properly paired alignments, find the mate alignment and add 100 | # scores 101 | d = None 102 | if i.flag & 64: 103 | d = forward 104 | elif i.flag & 128: 105 | d = reverse 106 | else: 107 | raise ValueError(f'Unrecognised paired flags for alignment: {i}') 108 | 109 | k = (i.reference_name, i.reference_start) 110 | d[k] = i 111 | 112 | for i in forward.values(): 113 | k = (i.next_reference_name, i.next_reference_start) 114 | if k not in reverse: 115 | raise ValueError(f'Missing mate for {i}') 116 | scores.append(i.get_tag('AS') + reverse[k].get_tag('AS')) 117 | 118 | return max(scores) 119 | 120 | 121 | def compare_alignments(align_1, align_2): 122 | """Compare alignmet scores. 123 | Return 1 if align_1 score is larger than align_2 score, zero if 124 | align_1 score == align_2 score, and -1 if align_1 score is smaller than 125 | align_2 score""" 126 | score_1 = get_score(align_1) 127 | score_2 = get_score(align_2) 128 | 129 | if score_1 == score_2: 130 | return 0 131 | elif score_1 > score_2: 132 | return 1 133 | return -1 134 | 135 | 136 | def get_alignment_end(align): 137 | """Find end position of a SAM alignment""" 138 | if align.flag & 4: 139 | raise ValueError(f'Read unaligned: {align}') 140 | 141 | pos = align.reference_start 142 | for op, num in align.cigartuples: 143 | if op in [sam.CIGAR_MATCH, sam.CIGAR_DELETION, sam.CIGAR_INTRON]: 144 | pos += num 145 | 146 | return pos 147 | 148 | 149 | def transcript2genome(positions, align, transcript): 150 | """Translate a position on a transcript to position on a genome""" 151 | 152 | # find cumulative exon lengths 153 | exon_lens = [e.end - e.start + 1 for e in transcript.exons] 154 | cum_exon_lens = [sum(exon_lens[:i + 1]) for i in range(0, len(exon_lens))] 155 | 156 | for pos in positions: 157 | 158 | # if transcript is 
annotated on the negative negative strand of the 159 | # genome, go from the end of the read alignment to the transcript: 160 | # find alignment position from sequence end and reverse CIGAR 161 | if transcript.strand == '-': 162 | align_start = sum(exon_lens) - get_alignment_end(align) + 1 163 | 164 | ind = bisect.bisect_left(cum_exon_lens, align_start) 165 | if ind > 0 and cum_exon_lens[ind] == align_start: 166 | ind += 1 167 | if ind == 0: 168 | yield transcript.exons[ind].start + align_start - 1 169 | else: 170 | yield transcript.exons[ind].start + align_start - 1 - \ 171 | cum_exon_lens[ind - 1] 172 | 173 | 174 | def reverse_complement(sequence): 175 | """Reverse complement a sequence""" 176 | complement = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A', 'N': 'N'} 177 | result = [] 178 | for i in reversed(sequence): 179 | result.append(complement.get(i, 'N')) 180 | return ''.join(result) 181 | 182 | 183 | def remap_to_genome(align, annot, mates = None): 184 | """Remap alignment to a transctript to alignment on a genome. 
Generates 185 | a string, single line SAM output""" 186 | # if a read is unaligned retrun the same SAM line 187 | if align.flag & 4: 188 | return align.to_string() 189 | 190 | if align.reference_name not in annot: 191 | raise ValueError(f'{align.reference_name} not present in annotation') 192 | 193 | transcript = annot[align.reference_name] 194 | 195 | flag = align.flag 196 | sequence = align.query_sequence 197 | if transcript.strand == '-': 198 | flag ^= 16 199 | sequence = reverse_complement(sequence) 200 | 201 | exon_lens = [e.end - e.start + 1 for e in transcript.exons] 202 | 203 | # alignment of read to transcript left-most position 204 | align_start = align.reference_start + 1 205 | cigartuples = align.cigartuples 206 | 207 | # if transcript is annotated on the negative negative strand of the genome 208 | # go from the end of the read alignment to the transcript: 209 | # find alignment position from sequence end and reverse CIGAR 210 | if transcript.strand == '-': 211 | align_start = sum(exon_lens) - get_alignment_end(align) + 1 212 | cigartuples = reversed(cigartuples) 213 | 214 | # find read alignment to the genome start position 215 | cum_exon_lens = [sum(exon_lens[:i + 1]) for i in range(0, len(exon_lens))] 216 | ind = bisect.bisect_left(cum_exon_lens, align_start) 217 | if ind > 0 and cum_exon_lens[ind] == align_start: 218 | ind += 1 219 | if ind == 0: 220 | start = transcript.exons[ind].start + align_start - 1 221 | else: 222 | start = transcript.exons[ind].start + align_start - 1 - \ 223 | cum_exon_lens[ind - 1] 224 | 225 | op2cigar = {sam.CIGAR_MATCH: 'M', sam.CIGAR_INSERTION: 'I', 226 | sam.CIGAR_DELETION: 'D', sam.CIGAR_SOFT_CLIP: 'S'} 227 | 228 | # generate new CIGAR 229 | cigar = '' 230 | g_pos = start 231 | for op, num in cigartuples: 232 | if op == sam.CIGAR_INTRON: 233 | raise RuntimeError('Bad alignment') 234 | 235 | if op in [sam.CIGAR_MATCH, sam.CIGAR_DELETION]: 236 | 237 | bases_left = num 238 | while bases_left > 0: 239 | 240 | # if rthe 
current alignment segment ends within current exon 241 | if g_pos + bases_left <= transcript.exons[ind].end: 242 | cigar += f'{bases_left}{op2cigar[op]}' 243 | g_pos += bases_left 244 | break 245 | else: 246 | 247 | # if the current alignment segment spans an intron 248 | exon_bases = transcript.exons[ind].end - g_pos + 1 249 | bases_left -= exon_bases 250 | g_pos += exon_bases 251 | cigar += f'{exon_bases}{op2cigar[op]}' 252 | if bases_left > 0: 253 | intron = transcript.exons[ind + 1].start - transcript.exons[ind].end - 1 254 | cigar += f'{intron}N' 255 | g_pos += intron 256 | ind += 1 257 | 258 | elif op in [sam.CIGAR_INSERTION, sam.CIGAR_SOFT_CLIP]: 259 | cigar += f'{num}{op2cigar[op]}' 260 | else: 261 | raise ValueError(f'Unsupported CIGAR operation: {op}') 262 | 263 | # compute mate start for paired reads 264 | mate_name = '*' 265 | mate_start = 0 266 | if align.flag & 1 and align.next_reference_name is not None : 267 | if align.next_reference_name == align.reference_name: 268 | mate_name = '=' 269 | mate_start = transcript2genome([align.next_reference_start], align, 270 | transcript) 271 | 272 | else: 273 | # get alignment start for a new transcript 274 | mate_transcript = annot[align.next_reference_name] 275 | mate_name = annot[align.next_reference_name].seqid 276 | 277 | mate_align = mates[(align.next_reference_name, 278 | align.next_reference_start)] 279 | 280 | mate_start = transcript2genome([align.next_reference_start], 281 | mate_align, 282 | mate_transcript) 283 | 284 | 285 | result = f'{align.query_name}\t{flag}\t{annot[align.reference_name].seqid}'\ 286 | f'\t{start}\t255\t{cigar}\t{mate_name}\t{mate_start}\t0' \ 287 | f'\t{sequence}\t*'\ 288 | f'\tNH:i:{align.get_tag("NH")}\tAS:i:{align.get_tag("AS")}'\ 289 | f'\tNM:i:{align.get_tag("NM")}' 290 | 291 | return result 292 | 293 | 294 | @contextlib.contextmanager 295 | def wopen(filename = None, mode = 'w', template = None): 296 | """Open stream for writing either to stdout or a file""" 297 | if 'b' 
in mode: 298 | ff = pysam.AlignmentFile(filename, mode, template = template) 299 | else: 300 | if filename and filename != '-' and filename != 'stdout': 301 | ff = open(filename, 'w') 302 | else: 303 | ff = sys.stdout 304 | print(template.text, file=ff) 305 | 306 | try: 307 | yield ff 308 | finally: 309 | if ff is not sys.stdout: 310 | ff.close() 311 | 312 | 313 | if __name__ == '__main__': 314 | 315 | parser = argparse.ArgumentParser(description='Add annotation to Magic-BLAST mapping') 316 | parser.add_argument('--to-genome', metavar='FILE', dest='genome', type=str, 317 | help='BAM file with mapping to a genome') 318 | parser.add_argument('--to-transcripts', metavar='FILE', dest='transcripts', 319 | type=str, help='BAM file with mapping to a transcripts') 320 | parser.add_argument('--gff', metavar='FILE', dest='gfffile', type=str, 321 | help='Genome annotations file') 322 | parser.add_argument('--out', metavar='FILE', dest='outfile', type=str, 323 | help='Output SAM file', default='-') 324 | parser.add_argument('-b', dest='isbam', action='store_true', 325 | help='Output BAM') 326 | 327 | 328 | args = parser.parse_args() 329 | 330 | # read annotation file 331 | with open(args.gfffile) as f: 332 | if args.gfffile.endswith('gff'): 333 | m = gff.get_mrnas(f) 334 | 335 | # index mRNAs by accession 336 | mrnas = {} 337 | for k in m.keys(): 338 | new_key = ':'.join([m[k].attributes['Name'], m[k].seqid]) 339 | if not new_key.startswith('NM_'): 340 | continue 341 | 342 | if new_key in mrnas: 343 | raise ValueError(f'{new_key} already present in mrnas') 344 | 345 | m[k].exons.sort(key = lambda x: x.start) 346 | mrnas[new_key] = m[k] 347 | else: 348 | m = gtf.get_transcripts(f) 349 | mrnas = m 350 | 351 | mode = 'w' 352 | isbam = args.isbam or args.outfile.endswith('bam') 353 | if isbam: 354 | mode = mode + 'b' 355 | 356 | # read and compare BAM files 357 | with sam.open_sam_or_bam(args.genome) as fg, sam.open_sam_or_bam(args.transcripts) as ft, wopen(args.outfile, mode = 
mode, template = fg) as out: 358 | for genome, transcript in zip(get_aligns(fg), get_aligns(ft)): 359 | 360 | # print(f'genome:\t{genome[0]}') 361 | # print(f'transcript\t{transcript[0]}') 362 | # print('') 363 | 364 | duplicates = {} 365 | if compare_alignments(genome, transcript) >= 0: 366 | for i in genome: 367 | if isbam: 368 | out.write(i) 369 | else: 370 | print(i.to_string(), file=out) 371 | else: 372 | 373 | if transcript[0].flag & 1: 374 | indexed = index_aligns(transcript) 375 | 376 | for i in transcript: 377 | remapped = remap_to_genome(i, mrnas, indexed) 378 | fields = remapped.split() 379 | key = (fields[2], fields[3]) 380 | # check if a similar alignment was already produced; 381 | # alignments to RNA variants may result in the 382 | # same alignment on the genome 383 | if key not in duplicates: 384 | duplicates[key] = 1 385 | if isbam: 386 | out.write(pysam.AlignedSegment.fromstring( 387 | remapped, fg.header)) 388 | else: 389 | print(remapped, file=out) 390 | 391 | -------------------------------------------------------------------------------- /magicblast-tools/get-introns.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | #============================================================================ 3 | # 4 | # PUBLIC DOMAIN NOTICE 5 | # National Center for Biotechnology Information 6 | # 7 | # This software/database is a "United States Government Work" under the 8 | # terms of the United States Copyright Act. It was written as part of 9 | # the author's official duties as a United States Government employee and 10 | # thus cannot be copyrighted. This software/database is freely available 11 | # to the public for use. The National Library of Medicine and the U.S. 12 | # Government have not placed any restriction on its use or reproduction. 
13 | # 14 | # Although all reasonable efforts have been taken to ensure the accuracy 15 | # and reliability of the software and data, the NLM and the U.S. 16 | # Government do not and cannot warrant the performance or results that 17 | # may be obtained by using this software or data. The NLM and the U.S. 18 | # Government disclaim all warranties, express or implied, including 19 | # warranties of performance, merchantability or fitness for any particular 20 | # purpose. 21 | # 22 | # Please cite the author in any work or product based on this material. 23 | # 24 | # =========================================================================== 25 | # 26 | # Author: Greg Boratyn boratyng@ncbi.nlm.nih.gov 27 | # 28 | # --------------------------------------------------------------------------- 29 | 30 | """Parse a SAM/BAM file and get an intron support table""" 31 | 32 | import sam 33 | import gff 34 | import gtf 35 | import txt 36 | import gzip 37 | import pickle 38 | import argparse 39 | import sys 40 | import numpy as np 41 | import pandas as pd 42 | from pyfaidx import Fasta 43 | from collections import Counter 44 | from collections import defaultdict 45 | 46 | def get_splice_signals(introns, fasta_filename): 47 | """Find splice signals for introns and return as dictionary indexed by introns""" 48 | sites = {} 49 | genome = Fasta(fasta_filename) 50 | for i in introns: 51 | if i.seqid not in genome: 52 | sites[i] = 'xxxx' 53 | else: 54 | # indices into pyfaidx sequences are zero-based 55 | sites[i] = genome[i.seqid][(i.start - 1):(i.start + 1)].seq.upper() + genome[i.seqid][(i.end - 2):i.end].seq.upper() 56 | 57 | return sites 58 | 59 | 60 | def print_splice_signal_histogram(introns, signals): 61 | """Find and print histogram of splice signals for a given set of introns""" 62 | hist = Counter() 63 | for i in introns: 64 | hist[signals[i]] += 1 65 | for s, c in sorted([(s, hist[s]) for s in hist], key=lambda x: x[1], reverse=True): 66 | print('{0}\t{1}'.format(s, c)) 
67 | print('') 68 | 69 | 70 | class EmptySpliceSignal: 71 | """No splice signal, class needed for defaultdict""" 72 | def __call__(self): 73 | """Return a constant representing empty splice signal""" 74 | return '---' 75 | 76 | 77 | if __name__ == '__main__': 78 | 79 | parser = argparse.ArgumentParser(description='Generate intron support table from a SAM/BAM file') 80 | parser.add_argument('--bam', metavar='FILE', dest='bamfile', type=str, 81 | help='BAM file') 82 | parser.add_argument('--gff', metavar='FILE', dest='gfffile', type=str, 83 | help='GFF file') 84 | parser.add_argument('--genome', metavar='FILE', dest='genome', type=str, 85 | help='Genome FASTA file') 86 | parser.add_argument('--filter-by', metavar='LIST', dest='filter_by', 87 | type=str, help='Filter reads') 88 | parser.add_argument('--filter-annot', metavar='STRING', 89 | dest='filter_annot', type=str, 90 | help='Filter annotated introns by transcript accession') 91 | parser.add_argument('--introns', metavar='FILE', dest='introns', type=str, 92 | help='Output file for the intron table' 93 | '(default: stdout)', default='-') 94 | parser.add_argument('--spec', dest='spec', action='store_true', 95 | help='Show results for sensitivity and specificity' 96 | ' analysis') 97 | parser.add_argument('--splice_histogram', dest='splice_histogram', 98 | action='store_true', 99 | help='Show splice signal histogram') 100 | parser.add_argument('--numbers', dest='numbers', action='store_true', 101 | help='Show numbers of annotated and unannotated ' 102 | 'introns') 103 | parser.add_argument('--sort', metavar='FILE', dest='sort', type=str, 104 | help='Output sort file for ROC score computation') 105 | parser.add_argument('--counts', metavar='FILE', dest='counts', type=str, 106 | help='Tab delimited file with numbers of read ' 107 | 'placements for weighting read contrinution.' 
108 | 'Format: read_id, mate_id (1, 2), number of placements') 109 | parser.add_argument('--max-count', metavar='FILE', dest='max_count', 110 | type=int, help='Maximum count for computation of ' 111 | 'weighted read alignment counts') 112 | 113 | 114 | args = parser.parse_args() 115 | 116 | # if not args.gfffile: 117 | # raise InputError('Annotation file not specified, use --gff option') 118 | 119 | gff_introns = {} 120 | if args.gfffile: 121 | print('Reading annotatiotns', file=sys.stderr) 122 | 123 | if args.gfffile.endswith('pickle'): 124 | f = open(args.gfffile, 'rb') 125 | gff_introns = pickle.load(f) 126 | f.close() 127 | else: 128 | if args.gfffile.endswith('.gz'): 129 | f = gzip.GzipFile(args.gfffile, 'r') 130 | else: 131 | f = open(args.gfffile) 132 | 133 | if args.gfffile.endswith('.gff') or args.gfffile.endswith('.gff.gz'): 134 | gff_introns = gff.get_splice_sites(f, 135 | accession=args.filter_annot) 136 | elif args.gfffile.endswith('.gtf') or args.gfffile.endswith('.gtf.gz'): 137 | gff_introns = gtf.get_splice_sites(f) 138 | elif args.gfffile.endswith('.sam') or args.gfffile.endswith('.sam.gz'): 139 | gff_introns = sam.get_introns_with_reads(args.gfffile) 140 | elif args.gfffile.endswith('.txt'): 141 | gff_introns = txt.get_introns(f) 142 | else: 143 | raise InputError('Unrecognized annotation file extension, '\ 144 | 'must be one of these: .gff, .gtf, .sam, '\ 145 | '.pickle') 146 | f.close() 147 | print('{0} introns in the annotation'.format(len(gff_introns))) 148 | print('done', file=sys.stderr) 149 | else: 150 | gff_introns = {} 151 | 152 | placements = None 153 | if args.counts: 154 | max_count = None 155 | if args.max_count: 156 | max_count = args.max_count 157 | placements = sam.read_placements(args.counts, max_count) 158 | 159 | # parse command line filtering arguments 160 | filter_by = None 161 | if args.filter_by: 162 | filter_by = {} 163 | a = args.filter_by.split() 164 | for w, i in zip(a[::2], a[1::2]): 165 | if w == 'read_id': 166 | f = 
open(i) 167 | filter_by[w] = {i: 1 for i in f.read().splitlines()} 168 | f.close() 169 | else: 170 | filter_by[w] = int(i) 171 | print(' filtering reads by {0}\t{1}'.format(w, i), file=sys.stderr) 172 | 173 | 174 | # get introns from a SAM/BAM file 175 | print('Reading SAM/BAM file', file=sys.stderr) 176 | introns = sam.get_introns_with_reads(args.bamfile, force_single = True, 177 | filter_by = filter_by, 178 | placements = placements) 179 | print('done', file=sys.stderr) 180 | 181 | splice_signals = defaultdict(EmptySpliceSignal()) 182 | if args.genome: 183 | splice_signals = get_splice_signals(introns, args.genome) 184 | 185 | if args.splice_histogram: 186 | print_splice_signal_histogram(introns, splice_signals) 187 | 188 | 189 | # sort introns by end position, start position, reference sequence id 190 | keys = sorted(introns.keys(), key=lambda x: x.end) 191 | keys.sort(key=lambda x: x.start) 192 | keys.sort(key=lambda x: x.seqid) 193 | 194 | # print intron coverage table 195 | f = sys.stdout 196 | if args.introns != '-': 197 | f = open(args.introns, 'w') 198 | for k in keys: 199 | known = 'NEW' 200 | if k in gff_introns: 201 | known = 'KNOWN' 202 | print('{0}\t{1}\t{2}\t{3}\t{4}\t{5}'\ 203 | .format(k.seqid, k.start, k.end, introns[k], known, 204 | splice_signals[k]), file=f) 205 | if args.introns != '-': 206 | f.close() 207 | 208 | 209 | # create a dataframe to compute sensitivity, precision, and a sort file 210 | if args.spec or args.sort or args.numbers: 211 | 212 | d = {'intron': [':'.join(str(i) for i in [k.seqid, k.start, k.end]) \ 213 | for k in keys], 214 | 'coverage': [introns[k] for k in keys], 215 | 'annotation': ['KNOWN' if k in gff_introns else 'NEW' \ 216 | for k in keys]} 217 | 218 | df = pd.DataFrame(data=d) 219 | 220 | num_annot = df['annotation'].apply(func=lambda x: 0 if x == 'KNOWN' \ 221 | else 1) 222 | df['key'] = 2 * df['coverage'] + num_annot 223 | df.sort_values(by='key', ascending=False, inplace=True) 224 | # df = 
df.reindex(columns=['intron', 'coverage', 'annotation', 'key']) 225 | 226 | # print sensitivity/specificity report 227 | if args.spec: 228 | num_known = len(gff_introns) 229 | print('Number of known introns: {0}'.format(num_known)) 230 | for i in [1, 2, 3, 5]: 231 | print('Coverage >= {0}'.format(i)) 232 | idx = (df['coverage'] >= i) 233 | num_tp = df[idx & (df['annotation'] == 'KNOWN')].shape[0] 234 | sensitivity = float(num_tp) / float(num_known) 235 | specificity = float(num_tp) / float(df[idx].shape[0]) 236 | print('Sensitivity: {0}'.format(sensitivity)) 237 | print('Precision: {0}'.format(specificity)) 238 | print('') 239 | 240 | # print number of annotated and unannotated introns 241 | if args.numbers: 242 | num_known = len(gff_introns) 243 | print('Number of known introns: {0}'.format(num_known)) 244 | for i in [1, 2, 3, 5]: 245 | print('Coverage >= {0}'.format(i)) 246 | idx = (df['coverage'] >= i) 247 | num_tp = df[idx & (df['annotation'] == 'KNOWN')].shape[0] 248 | num_fp = df[idx & (df['annotation'] != 'KNOWN')].shape[0] 249 | print('Number of annotated introns: {0}'.format(num_tp)) 250 | print('Number of unannotated introns: {0}'.format(num_fp)) 251 | print('') 252 | 253 | # print the sort file 254 | if args.sort: 255 | with open(args.sort, 'w') as f: 256 | n = 0 257 | for r in df.iterrows(): 258 | print('\t'.join(str(i) for i in [n, r[1]['coverage'], \ 259 | '+' if r[1]['annotation'] == 'KNOWN' else '-', \ 260 | r[1]['intron']]), file=f) 261 | n += 1 262 | 263 | 264 | 265 | -------------------------------------------------------------------------------- /magicblast-tools/get-transcripts.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | #============================================================================ 3 | # 4 | # PUBLIC DOMAIN NOTICE 5 | # National Center for Biotechnology Information 6 | # 7 | # This software/database is a "United States Government Work" under the 8 | # terms 
of the United States Copyright Act. It was written as part of 9 | # the author's official duties as a United States Government employee and 10 | # thus cannot be copyrighted. This software/database is freely available 11 | # to the public for use. The National Library of Medicine and the U.S. 12 | # Government have not placed any restriction on its use or reproduction. 13 | # 14 | # Although all reasonable efforts have been taken to ensure the accuracy 15 | # and reliability of the software and data, the NLM and the U.S. 16 | # Government do not and cannot warrant the performance or results that 17 | # may be obtained by using this software or data. The NLM and the U.S. 18 | # Government disclaim all warranties, express or implied, including 19 | # warranties of performance, merchantability or fitness for any particular 20 | # purpose. 21 | # 22 | # Please cite the author in any work or product based on this material. 23 | # 24 | # =========================================================================== 25 | # 26 | # Author: Greg Boratyn boratyng@ncbi.nlm.nih.gov 27 | # 28 | # --------------------------------------------------------------------------- 29 | 30 | """Get transcript sequences from a genome and a GTF file""" 31 | 32 | import gtf 33 | import gff 34 | from pyfaidx import Fasta 35 | import argparse 36 | 37 | if __name__ == '__main__': 38 | 39 | parser = argparse.ArgumentParser(description='Get transcript sequences from a genome and a GTF file') 40 | parser.add_argument('--genome', metavar='FILE', dest='genome', type=str, 41 | help='Reference sequence in FASTA format') 42 | parser.add_argument('--gff', metavar='FILE', dest='gff', type=str, 43 | help='GFF or GTF file') 44 | parser.add_argument('--select', metavar='STRING', dest='select', type=str, 45 | help='Print only sequences whose id contain provided ' 46 | 'string') 47 | 48 | args = parser.parse_args() 49 | 50 | f = open(args.gff) 51 | if args.gff.endswith('.gtf'): 52 | transcripts = 
gtf.get_transcripts(f) 53 | elif args.gff.endswith('.gff'): 54 | transcripts = gff.get_mrnas(f) 55 | else: 56 | raise ValueError('Unrecognized file extension for: {}. Only GFF or GTF' 57 | ' files are allowed'.format(args.gff)) 58 | f.close() 59 | 60 | genome = Fasta(args.genome) 61 | 62 | for i in transcripts: 63 | 64 | strand = transcripts[i].exons[0].strand 65 | sequence = '' 66 | 67 | exons = sorted(transcripts[i].exons, key=lambda x: x.start, 68 | reverse = (strand == '-')) 69 | 70 | seqid = transcripts[i].seqid 71 | 72 | for exon in exons: 73 | 74 | if exon.strand != strand: 75 | raise ValueError('Mismatched strands for transcript: {0}'.\ 76 | format(i)) 77 | 78 | if strand == '-': 79 | sequence += genome[seqid][(exon.start - 1):(exon.end)].\ 80 | reverse.complement.seq.upper() 81 | else: 82 | sequence += genome[seqid][(exon.start - 1):(exon.end)].\ 83 | seq.upper() 84 | 85 | seqid = i 86 | if 'Name' in transcripts[i].attributes: 87 | # A few mRNAs align to both X and Y chromosomes in slightly 88 | # different locations, so we are adding reference id to sequence 89 | # id to distinguish between the two alignments 90 | seqid = transcripts[i].attributes['Name'] + ':' + transcripts[i].seqid 91 | 92 | if args.select and args.select not in seqid: 93 | continue 94 | 95 | print('>{0}'.format(seqid)) 96 | for n in range(0, len(sequence), 80): 97 | print('{0}'.format(sequence[n:(n + 80)])) 98 | 99 | 100 | -------------------------------------------------------------------------------- /magicblast-tools/gff.py: -------------------------------------------------------------------------------- 1 | #============================================================================ 2 | # 3 | # PUBLIC DOMAIN NOTICE 4 | # National Center for Biotechnology Information 5 | # 6 | # This software/database is a "United States Government Work" under the 7 | # terms of the United States Copyright Act. 
# It was written as part of
# the author's official duties as a United States Government employee and
# thus cannot be copyrighted. This software/database is freely available
# to the public for use. The National Library of Medicine and the U.S.
# Government have not placed any restriction on its use or reproduction.
#
# Although all reasonable efforts have been taken to ensure the accuracy
# and reliability of the software and data, the NLM and the U.S.
# Government do not and cannot warrant the performance or results that
# may be obtained by using this software or data. The NLM and the U.S.
# Government disclaim all warranties, express or implied, including
# warranties of performance, merchantability or fitness for any particular
# purpose.
#
# Please cite the author in any work or product based on this material.
#
# ===========================================================================
#
# Author: Greg Boratyn boratyng@ncbi.nlm.nih.gov
#
# ---------------------------------------------------------------------------

"""GFF file parser"""

from collections import namedtuple
from base import Intron
from base import Exon
from base import mRNA
import sys
import re


# One tab-delimited GFF line; '.' columns are represented as None.
Record = namedtuple('Record', ['seqid', 'source', 'feature', 'start',
                               'end', 'score', 'strand', 'frame',
                               'attribute'])

def parse(line):
    """Parse a single GFF line and return a Record.

    '.' fields become None; start, end and frame are converted to int and
    score to float.  Raises ValueError on a malformed numeric field.
    """
    fields = line.rstrip().split('\t')
    try:
        r = Record(
            seqid = None if fields[0] == '.' else fields[0],
            source = None if fields[1] == '.' else fields[1],
            feature = None if fields[2] == '.' else fields[2],
            start = None if fields[3] == '.' else int(fields[3]),
            end = None if fields[4] == '.' else int(fields[4]),
            score = None if fields[5] == '.' else float(fields[5]),
            strand = None if fields[6] == '.' else fields[6],
            frame = None if fields[7] == '.' else int(fields[7]),
            attribute = None if fields[8] == '.' else fields[8]
        )
    except ValueError:
        # BUG FIX: the original printed the offending line and then fell
        # through to `return r` with r unbound, masking the real problem
        # with an UnboundLocalError.  Report the line and re-raise instead.
        print(line)
        raise

    return r


def get_introns(stream):
    """Collect introns from a gff stream and return as a dictionary.

    Only records whose feature column is 'intron' are used; each becomes
    a key (an Intron namedtuple) mapped to 1.
    """
    introns = {}
    for line in stream:
        if line.startswith('#'):
            continue

        r = parse(line)
        if r.feature != 'intron':
            continue

        introns[Intron(seqid = r.seqid, start = r.start, end = r.end,
                       strand = r.strand)] = 1

    return introns


def get_mrnas(stream, source = None):
    """Collect mRNA extents with exons.

    Returns a dictionary keyed by the feature's ID= attribute value,
    holding mRNA namedtuples.  Exon features are attached to the mRNA
    named by their Parent= attribute; 5' UTR and intron features are also
    turned into placeholder mRNA entries for downstream processing.

    stream -- iterable of GFF lines (str or bytes)
    source -- if given, only records with this source column are used
    """
    mrnas = {}

    for raw_line in stream:
        if isinstance(raw_line, str):
            line = raw_line
        elif isinstance(raw_line, bytes):
            line = raw_line.decode()
        else:
            # BUG FIX: the original raised InputError, a name that is not
            # defined anywhere, so this path itself crashed with NameError.
            raise TypeError('Unsupported stream data')

        if line.startswith('#'):
            continue

        f = parse(line)

        if source is not None and f.source != source:
            continue

        if f.feature == 'mRNA':
            m = re.search(r'ID=(\w[\w:]*)', f.attribute)
            if not m:
                raise ValueError('mRNA id could not be found')
            index = m.group(1)
            exons = []
            if index in mrnas:
                if mrnas[index].start is not None:
                    raise RuntimeError('mRNA with the same id is already present')
                # BUG FIX: the original referenced the undefined name
                # `mrna` here (NameError); keep exons collected before the
                # parent mRNA record was seen.
                exons = mrnas[index].exons

            attributes = {}
            for a in f.attribute.rstrip().split(';'):
                r = a.split('=')
                if len(r) == 2:
                    attributes[r[0]] = r[1]

            mrnas[index] = mRNA(seqid = f.seqid, start = f.start, end = f.end,
                                strand = f.strand, exons = exons,
                                attributes = attributes)

        if f.feature == 'exon':
            m = re.search(r'Parent=(\w[\w:]*)', f.attribute)
            if not m:
                # there seem to be exons not assigned to mRNAs
                print('WARNING: Exon without parent: {0}'.format(line))
                continue

            index = m.group(1)
            if index not in mrnas:
                # exons whose parent feature was not collected are skipped
                continue

            mrnas[index].exons.append(Exon(seqid = f.seqid, start = f.start,
                                           end = f.end, strand = f.strand))

        # these things appear in RNA-seq, but are not mRNA
        if f.feature == 'five_prime_UTR':
            m = re.search(r'ID=(id\d\d*);', f.attribute)
            if not m:
                raise ValueError("5'UTR id could not be found")

            index = m.group(1)
            if index not in mrnas:
                mrnas[index] = mRNA(seqid = f.seqid, start = None, end = None,
                                    strand = None, exons = [], attributes = {})

            mrnas[index].exons.append(Exon(seqid = f.seqid, start = f.start,
                                           end = f.end, strand = f.strand))

        # this is a hack: there are introns with no exons in the gff file;
        # we create fake two-base flanking exons for easier processing
        if f.feature == 'intron':
            m = re.search(r'ID=(id\d\d*);', f.attribute)
            if not m:
                raise ValueError('Intron id could not be found')

            index = m.group(1)
            if index in mrnas:
                raise RuntimeError('mRNA element already present')

            exon1 = Exon(seqid = f.seqid, start = f.start - 2,
                         end = f.start - 1, strand = f.strand)
            exon2 = Exon(seqid = f.seqid, start = f.end + 1, end = f.end + 2,
                         strand = f.strand)
            mrnas[index] = mRNA(seqid = f.seqid, start = None, end = None,
                                strand = None, exons = [exon1, exon2],
                                attributes = {})


    return mrnas
def get_splice_sites_from_exons(exons, use_strand):
    """Derive splice sites from a list of exons.

    Each pair of adjacent exons (ordered by start position) implies one
    intron between them.  Returns a dictionary keyed by Intron with all
    values set to 1.
    """
    result = {}
    ordered = sorted(exons, key=lambda e: e.start)
    for left, right in zip(ordered, ordered[1:]):
        strand = left.strand if use_strand else None
        intron = Intron(seqid = left.seqid, start = left.end + 1,
                        end = right.start - 1, strand = strand)
        result[intron] = 1

    return result

def get_splice_sites(stream, source = None, use_strand = False, accession = None):
    """Collect splice sites from the mRNAs in a GFF stream and return
    them as a dictionary of introns.

    accession -- if given, only mRNAs whose Name attribute starts with
                 this prefix contribute splice sites
    """
    sites = {}
    mrnas = get_mrnas(stream, source)
    for key in mrnas:
        rna = mrnas[key]
        if accession is not None and 'Name' in rna.attributes and \
                not rna.attributes['Name'].startswith(accession):
            continue

        for intron in get_splice_sites_from_exons(rna.exons, use_strand):
            sites[intron] = 1
    return sites



if __name__ == '__main__':

    import argparse

    parser = argparse.ArgumentParser(description='Generate a list of introns from a GFF file')
    parser.add_argument('gfffile', metavar='FILE', type=str, help='GFF file')

    args = parser.parse_args()

    with open(args.gfffile) as f:
        introns = get_splice_sites(f, use_strand = True)

    for k in introns:
        print(f'{k.seqid}\t{k.start}\t{k.end}\t{k.strand}')


# ---------------------------------------------------------------------------
# (next file in dump: magicblast-tools/gtf.py)
#============================================================================
#
# PUBLIC DOMAIN NOTICE
# National Center for Biotechnology Information
#
# This software/database is a "United States Government Work" under the
# terms of the United States Copyright Act.
# It was written as part of
# the author's official duties as a United States Government employee and
# thus cannot be copyrighted. This software/database is freely available
# to the public for use. The National Library of Medicine and the U.S.
# Government have not placed any restriction on its use or reproduction.
#
# Although all reasonable efforts have been taken to ensure the accuracy
# and reliability of the software and data, the NLM and the U.S.
# Government do not and cannot warrant the performance or results that
# may be obtained by using this software or data. The NLM and the U.S.
# Government disclaim all warranties, express or implied, including
# warranties of performance, merchantability or fitness for any particular
# purpose.
#
# Please cite the author in any work or product based on this material.
#
# ===========================================================================
#
# Author: Greg Boratyn boratyng@ncbi.nlm.nih.gov
#
# ---------------------------------------------------------------------------

"""GTF file parser"""

from collections import namedtuple
from base import Intron
from base import Exon
from base import mRNA
import sys
import re


# One tab-delimited GTF line; '.' columns are represented as None.
Record = namedtuple('Record', ['seqid', 'source', 'feature', 'start',
                               'end', 'score', 'strand', 'frame',
                               'attribute'])

def parse(line):
    """Parse a single GTF line and return a Record.

    '.' fields become None; start, end and frame are converted to int and
    score to float.  Raises ValueError on a malformed numeric field.
    """
    fields = line.rstrip().split('\t')
    r = Record(
        seqid = None if fields[0] == '.' else fields[0],
        source = None if fields[1] == '.' else fields[1],
        feature = None if fields[2] == '.' else fields[2],
        start = None if fields[3] == '.' else int(fields[3]),
        end = None if fields[4] == '.' else int(fields[4]),
        score = None if fields[5] == '.' else float(fields[5]),
        strand = None if fields[6] == '.' else fields[6],
        frame = None if fields[7] == '.' else int(fields[7]),
        attribute = None if fields[8] == '.' else fields[8]
    )
    return r



def get_transcripts(stream):
    """Collect transcripts as collections of CDS/exons.

    Returns a dictionary keyed by transcript_id, holding mRNA namedtuples
    whose exons lists are filled from 'exon' features.

    stream -- iterable of GTF lines (str or bytes)
    """
    transcripts = {}

    for raw_line in stream:
        if isinstance(raw_line, str):
            line = raw_line
        elif isinstance(raw_line, bytes):
            line = raw_line.decode()
        else:
            # BUG FIX: the original raised InputError, a name that is not
            # defined anywhere, so this path itself crashed with NameError.
            raise TypeError('Unsupported stream data')

        if line.startswith('#') or not line.strip():
            continue

        f = parse(line)
        if f.feature == 'exon':
            m = re.search(r'transcript_id "([a-zA-Z0-9\.]+)";', f.attribute)
            if not m:
                # BUG FIX: the original message said 'Gene id' although the
                # pattern looks for the transcript id
                raise ValueError('Transcript id could not be found')
            index = m.group(1)
            if index not in transcripts:
                transcripts[index] = mRNA(seqid = f.seqid, start = None,
                                          end = None, strand = f.strand,
                                          exons = [], attributes = '')

            transcripts[index].exons.append(Exon(seqid = f.seqid,
                                                 start = f.start, end = f.end,
                                                 strand = f.strand))

    return transcripts


def get_splice_sites_from_exons(exons, use_strand):
    """Collect splice sites from a list of exons and return them as a
    dictionary: each pair of adjacent exons implies one intron."""
    sites = {}
    sorted_exons = sorted(exons, key=lambda x: x.start)
    for f, s in zip(sorted_exons, sorted_exons[1:]):
        strand = None
        if use_strand:
            strand = f.strand
        sites[Intron(seqid = f.seqid, start = f.end + 1, end = s.start - 1,
                     strand = strand)] = 1

    return sites

def get_splice_sites(stream, use_strand = False):
    """Get splice sites from the transcripts in a GTF stream and return
    them as a dictionary of introns."""
    sites = {}
    transcripts = get_transcripts(stream)
    for r in transcripts:
        s = get_splice_sites_from_exons(transcripts[r].exons, use_strand)
        for i in s:
            sites[i] = 1
    return sites



if __name__ == '__main__':

    import argparse

    parser = argparse.ArgumentParser(description='Generate a list of introns from a GTF file')
    parser.add_argument('gtffile', metavar='FILE', type=str, help='GTF file')

    args = parser.parse_args()

    with open(args.gtffile) as f:
        introns = get_splice_sites(f, use_strand = True)

    for k in introns:
        # NOTE(review): gff.py prints k.start here without the -2 offset;
        # this looks like a deliberate coordinate adjustment for GTF input,
        # but the inconsistency should be confirmed
        print(f'{k.seqid}\t{k.start - 2}\t{k.end}\t{k.strand}')

# ---------------------------------------------------------------------------
# (next file in dump: magicblast-tools/requirements.txt)
# pysam
# pyfaidx
# pandas
# ---------------------------------------------------------------------------
# (next file in dump: magicblast-tools/sam.py)
#============================================================================
#
# PUBLIC DOMAIN NOTICE
# National Center for Biotechnology Information
#
# This software/database is a "United States Government Work" under the
# terms of the United States Copyright Act. It was written as part of
# the author's official duties as a United States Government employee and
# thus cannot be copyrighted. This software/database is freely available
# to the public for use. The National Library of Medicine and the U.S.
# Government have not placed any restriction on its use or reproduction.
#
# Although all reasonable efforts have been taken to ensure the accuracy
# and reliability of the software and data, the NLM and the U.S.
# Government do not and cannot warrant the performance or results that
# may be obtained by using this software or data. The NLM and the U.S.
# Government disclaim all warranties, express or implied, including
# warranties of performance, merchantability or fitness for any particular
# purpose.
#
# Please cite the author in any work or product based on this material.
#
# ===========================================================================
#
# Author: Greg Boratyn boratyng@ncbi.nlm.nih.gov
#
# ---------------------------------------------------------------------------

"""Useful functions for getting information from a SAM/BAM file that work on
top of pysam"""

import pysam
from collections import Counter
from collections import namedtuple
from collections import defaultdict
from base import Intron
import re


# pysam CIGAR operation codes
CIGAR_MATCH = 0
CIGAR_INSERTION = 1
CIGAR_DELETION = 2
CIGAR_INTRON = 3
CIGAR_SOFT_CLIP = 4


def open_sam_or_bam(filename):
    """Open a SAM or BAM file (selected by the .bam extension) and return
    a pysam.AlignmentFile handle."""
    is_bam = ""
    if filename.endswith('.bam'):
        is_bam = 'b'
    return pysam.AlignmentFile(filename, 'r' + is_bam)


def get_standard_read_name(r, force_single = False, trim = False):
    """Standardize read names. Some programs change read names.
    If force_single is true, .1 and .2 will be added to read names so that
    they can be treated as single. If trim is true, then the last 2
    characters of read name will be trimmed. This is to remove .1 and .2
    from a SAM or BAM file.

    NOTE(review): trim is applied after force_single appends a suffix, so
    setting both would strip the suffix just added — confirm callers never
    combine the two.
    """
    read_name = r.query_name
    # standardise read accessions
    # hisat puts .R. before read number and removes .1 and .2
    read_name = re.sub(r'\.R\.', '.', read_name)
    if force_single and r.flag & 1:
        if r.flag & 64:
            read_name += '.1'
        else:
            read_name += '.2'

    if trim:
        read_name = read_name[:-2]

    return read_name



def get_introns_from_cigar(position, cigar):
    """Get intron start and stop positions from a SAM alignment position
    and pysam cigar tuples.

    position -- alignment start on the reference (caller chooses the base)
    cigar    -- iterable of (op, length) pysam cigar tuples

    Returns a list of (start, end) pairs, both ends inclusive, in the
    same coordinate system as position.
    """
    introns = []

    # ops that consume the reference: M (0), = (7), X (8), D (2);
    # N (3) is an intron; I (1) and S (4) consume only the query
    s_offset = position

    for op, num in cigar:
        if op in [0, 7, 8, 2]:
            s_offset += num
        elif op == 3:
            introns.append((s_offset, s_offset + num - 1))
            s_offset += num

    return introns


def get_exons_from_cigar(position, cigar):
    """Get exon start and stop positions from a SAM alignment position and
    pysam cigar tuples.  Exons are the reference spans between N (intron)
    operations; starts are reported shifted by one from position's base."""
    exons = []
    s_offset = position
    start = s_offset
    for op, num in cigar:
        # M (0), D (2), = (7), X (8) consume the reference within an exon
        if op in [0, 2, 7, 8]:
            s_offset += num
        elif op == 3:
            exons.append((start + 1, s_offset))
            s_offset += num
            start = s_offset

    exons.append((start + 1, s_offset))

    return exons


def get_exons(line):
    """Get a list of exons from a single SAM alignment"""
    return get_exons_from_cigar(line.reference_start, line.cigartuples)


def do_filter(r, filter_by):
    """Apply filter to an alignment, return True if alignment passes.

    filter_by -- dictionary mapping filter name to threshold/value:
      edit_distance      -- maximum NM tag value
      score              -- minimum score as computed by get_score()
      count              -- maximum NH tag value (number of placements)
      edit_distance_clip -- maximum NM plus soft-clipped bases
      read_id            -- container of allowed (normalized) read names

    Raises ValueError for an unknown filter name or a non-dict argument.
    """

    if not isinstance(filter_by, dict):
        raise ValueError('filter_by argument must be a dictionary')

    for k in filter_by:
        if k == 'edit_distance':
            edist = r.get_tag('NM')
            if edist > filter_by[k]:
                return False

        elif k == 'score':
            score = get_score(r)
            if score < filter_by[k]:
                return False

        elif k == 'count':
            count = r.get_tag('NH')
            if count > filter_by[k]:
                return False

        elif k == 'edit_distance_clip':
            # soft-clipped bases count toward the edit distance here
            edist = 0
            for op, num in r.cigartuples:
                if op == CIGAR_SOFT_CLIP:
                    edist += num
            edist += r.get_tag('NM')
            if edist > filter_by[k]:
                return False

        elif k == 'read_id':
            # same .R. normalization as get_standard_read_name
            read_name = r.query_name.replace('.R.', '.')
            if read_name not in filter_by[k]:
                return False

        else:
            raise ValueError('Unrecognised filter name: {0}'.format(k))


    return True


def get_introns_(stream, force_single, filter_by):
    """Get intron positions from a SAM stream.

    Returns a Counter keyed by Intron (strand is always None) holding the
    number of supporting alignments.  force_single is accepted for
    interface symmetry with get_introns_with_reads_ but is not used here.
    """
    introns = Counter()

    for read in stream:

        # skip unaligned reads
        if read.flag & 4:
            continue

        # apply alignment filters
        if filter_by is not None:
            # BUG FIX: the original called do_filter(r, filter_by); `r` is
            # undefined in this scope, so any use of filter_by crashed
            # with NameError.
            if not do_filter(read, filter_by):
                continue

        # reference name is per-read, not per-intron: look it up once
        subject = stream.getrname(read.reference_id)

        # read.reference_start is zero based
        for (f, t) in get_introns_from_cigar(read.reference_start + 1,
                                             read.cigartuples):
            introns[Intron(seqid = subject, start = f, end = t,
                           strand = None)] += 1

    return introns


def get_introns(filename, force_single, filter_by, trim):
    """Get intron positions from a SAM/BAM file.

    NOTE(review): trim is accepted but not forwarded to get_introns_ —
    confirm whether it should affect read-name handling here.
    """
    f = open_sam_or_bam(filename)
    introns = get_introns_(f, force_single, filter_by)
    f.close()
    return introns


def get_introns_with_reads_(stream, force_single, filter_by, trim, with_reads,
                            placements):
    """Get intron positions from a SAM stream.

    Depending on the arguments, the result maps each Intron to either a
    set of supporting read names (with_reads), a raw alignment count
    (placements is None), or a count weighted by 1/number-of-placements
    of the read.
    """
    introns = None
    if with_reads:
        introns = defaultdict(set)
    else:
        if placements is None:
            introns = defaultdict(int)
        else:
            introns = defaultdict(float)

    for r in stream:

        # skip unaligned reads (no CIGAR)
        if r.cigartuples is None:
            continue

        # apply alignment filters
        if filter_by is not None:
            if not do_filter(r, filter_by):
                continue

        # per-read invariants, hoisted out of the per-intron loop
        # (dead computations of strand and clipped-base count removed)
        subject = stream.getrname(r.reference_id)
        # standardise read accessions
        # hisat puts .R. before read number and removes .1 and .2
        read_name = get_standard_read_name(r, force_single = force_single,
                                           trim = trim)

        # r.reference_start is zero based
        for (f, t) in get_introns_from_cigar(r.reference_start + 1,
                                             r.cigartuples):
            i = Intron(seqid = subject, start = f, end = t, strand = None)
            if with_reads:
                introns[i].add(read_name)
            else:
                if placements is None:
                    introns[i] += 1
                else:
                    introns[i] += 1.0 / placements[read_name]

    return introns


def get_introns_with_reads(filename, force_single = False, filter_by = None,
                           trim = False, with_reads = False, placements = None):
    """Get intron positions with supporting reads from a SAM/BAM file;
    see get_introns_with_reads_ for the meaning of the arguments."""
    f = open_sam_or_bam(filename)
    introns = get_introns_with_reads_(f, force_single = force_single,
                                      filter_by = filter_by, trim = trim,
                                      with_reads = with_reads,
                                      placements = placements)
    f.close()
    return introns


def get_score(line):
    """Compute alignment score from CIGAR and edit distance for a single
    line of a SAM/BAM file.  This is preferred to relying on the AS tag so
    that mappers with different scoring schemes can be compared.

    Scoring: +1 per aligned base, -8 per gap base; mismatches (edit
    distance minus gap bases) cost the penalty plus the loss of the
    match bonus.
    """
    penalty = 8
    score = 0

    # get edit distance
    edit_dist = line.get_tag('NM')

    # cigartuples is the modern name for the deprecated .cigar alias
    for op, num in line.cigartuples:
        # score matches (and mismatches as matches)
        if op == CIGAR_MATCH:
            score += num
        # score gaps; NM counts inserted/deleted bases, so subtract them
        # from edit_dist to leave only mismatches
        elif op in [CIGAR_INSERTION, CIGAR_DELETION]:
            score -= num * penalty
            edit_dist -= num

    # add penalty for mismatches and subtract match scores for them
    score -= (edit_dist * penalty) + edit_dist
    return score

# ---------------------------------------------------------------------------
# (next file in dump: magicblast-tools/txt.py)
#============================================================================
#
# PUBLIC DOMAIN NOTICE
# National Center for Biotechnology Information
#
# This software/database is a "United States Government Work" under the
# terms of the United States Copyright Act. It was written as part of
# the author's official duties as a United States Government employee and
# thus cannot be copyrighted. This software/database is freely available
# to the public for use. The National Library of Medicine and the U.S.
# Government have not placed any restriction on its use or reproduction.
#
# Although all reasonable efforts have been taken to ensure the accuracy
# and reliability of the software and data, the NLM and the U.S.
# Government do not and cannot warrant the performance or results that
# may be obtained by using this software or data. The NLM and the U.S.
# Government disclaim all warranties, express or implied, including
# warranties of performance, merchantability or fitness for any particular
# purpose.
#
# Please cite the author in any work or product based on this material.
#
# ===========================================================================
#
# Author: Greg Boratyn boratyng@ncbi.nlm.nih.gov
#
# ---------------------------------------------------------------------------

"""Parse a text junctions file"""

from base import Intron

def get_introns(stream):
    """Get introns locations and return them as a dictionary.

    Each input line carries a location in its second whitespace-separated
    column, formatted as seqid:start-end; the reported intron is the span
    strictly between those coordinates (start + 1 .. end - 1).
    """
    result = {}
    for line in stream:
        location = line.rstrip().split()[1].split(':')
        seqid = location[0]
        bounds = location[1].split('-')
        key = Intron(seqid = seqid, start = int(bounds[0]) + 1,
                     end = int(bounds[1]) - 1, strand = None)
        result[key] = 1

    return result