├── .ackrc ├── .gitignore ├── README.md ├── article ├── Aligners │ ├── 10_MagicBLAST │ │ └── align.tcsh │ ├── 20_HISAT2_relaxed │ │ └── align.tcsh │ ├── 21_HISAT2 │ │ └── align.tcsh │ ├── 30_STAR │ │ └── align.tcsh │ ├── 31_STARlong │ │ └── align.tcsh │ ├── 32_STAR.2.6c │ │ └── align.tcsh │ └── 40_TopHat2 │ │ └── align.tcsh ├── MagicBlastPaperMasterScript.tcsh ├── README └── scripts │ ├── AliQC.py │ ├── MagicBlastPaperMasterScript.tcsh │ ├── README │ ├── directErrorCount.awk │ ├── directErrorCount.tcsh │ ├── gff2cig.awk │ ├── introns_precision_recall.awk │ ├── mapping_accuracy.header │ ├── schtroumpf │ ├── submit │ ├── tags │ ├── transpose │ └── transpose.awk ├── docs ├── _config.yml ├── _includes │ ├── disqus.html │ ├── footer.html │ ├── google_analytics.html │ ├── header.html │ └── navigation.html ├── _layouts │ ├── default.html │ └── page.html ├── _posts │ ├── .gitkeep │ ├── 2016-12-29-blastdb.md │ ├── 2016-12-29-copyright.md │ ├── 2016-12-29-exeptions.md │ ├── 2016-12-29-fasta.md │ ├── 2016-12-29-multithreading.md │ ├── 2016-12-29-output.md │ ├── 2016-12-29-paired.md │ ├── 2016-12-29-rnavsdna.md │ ├── 2016-12-29-sra.md │ ├── 2016-12-29-tutorial.md │ ├── 2017-09-13-release.md │ ├── 2017-11-14-download.md │ ├── 2020-05-15-license.md │ ├── 2020-07-14-feedback.md │ └── 2021-05-06-cloud-sra.md ├── css │ ├── main.css │ └── syntax.css └── index.md └── magicblast-tools ├── .gitignore ├── README.md ├── base.py ├── combine-genome-transcripts.py ├── get-introns.py ├── get-transcripts.py ├── gff.py ├── gtf.py ├── requirements.txt ├── sam.py └── txt.py /.ackrc: -------------------------------------------------------------------------------- 1 | --ignore-dir=_site 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.sw? 
2 | _site 3 | _pages 4 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## Magic-BLAST documentation: 2 | https://ncbi.github.io/magicblast 3 | 4 | ## Publication: 5 | Boratyn GM, Thierry-Mieg J, Thierry-Mieg D, Busby B, Madden TL. (2019) **Magic-BLAST, an accurate RNA-seq aligner for long and short reads.** *BMC Bioinformatics* 20: 405. \[[article](https://bmcbioinformatics.biomedcentral.com/articles/10.1186/s12859-019-2996-x)\] 6 | 7 | ## Scripts to recreate published experimental results: 8 | https://github.com/ncbi/magicblast/tree/master/article 9 | -------------------------------------------------------------------------------- /article/Aligners/10_MagicBLAST/align.tcsh: -------------------------------------------------------------------------------- 1 | #!/bin/tcsh 2 | # =========================================================================== 3 | # 4 | # PUBLIC DOMAIN NOTICE 5 | # National Center for Biotechnology Information 6 | # 7 | # This software/database is a "United States Government Work" under the 8 | # terms of the United States Copyright Act. It was written as part of 9 | # the author's official duties as a United States Government employee and 10 | # thus cannot be copyrighted. This software/database is freely available 11 | # to the public for use. The National Library of Medicine and the U.S. 12 | # Government have not placed any restriction on its use or reproduction. 13 | # 14 | # Although all reasonable efforts have been taken to ensure the accuracy 15 | # and reliability of the software and data, the NLM and the U.S. 16 | # Government do not and cannot warrant the performance or results that 17 | # may be obtained by using this software or data. The NLM and the U.S. 
18 | # Government disclaim all warranties, express or implied, including 19 | # warranties of performance, merchantability or fitness for any particular 20 | # purpose. 21 | # 22 | # Please cite the author in any work or product based on this material. 23 | # 24 | # =========================================================================== 25 | # 26 | # Author: Jean Thierry-Mieg 27 | # 28 | set mm=$1 29 | set run=$2 30 | set genome=$3 31 | set reads=$4 32 | set mates=$5 33 | 34 | set genomeF=Fasta/$run/genome.gz 35 | set out=$mm/$run/$mm.$run 36 | 37 | set nThreads=4 38 | set genomeDir=Aligners/$mm/$genome 39 | if (! -e $genomeDir) mkdir -p $genomeDir 40 | 41 | mkdir -p $genomeDir 42 | 43 | # create a BLAST database if not present 44 | if (! -e $genomeDir/genome.nhr) then 45 | echo $genomeDir/genome.nhr 46 | gunzip -c Fasta/$run/genome.gz > $genomeDir/my_genome.fasta 47 | bin/makeblastdb -in $genomeDir/my_genome.fasta -out $genomeDir/genome -dbtype nucl -parse_seqids 48 | \rm $genomeDir/my_genome.fasta 49 | endif 50 | 51 | mkdir -p $mm/$run 52 | 53 | # run magicblast 54 | echo "$mm $run $reads $mates" 55 | ls -ls $genomeDir/genome.nhr 56 | echo "$reads" 57 | ls -ls $reads 58 | echo "$reads" 59 | set infmt=`echo $reads | gawk '/fastq/{print "fastq";next}/fasta/{print "fasta"}'` 60 | 61 | 62 | if (-e $genomeDir/genome.nhr && -e $reads && ! 
-e $mm/$run/$mm.$run.sam) then 63 | set mmm="-query_mate $mates" 64 | if (X$mates == X) set mmm="" 65 | echo "time bin/magicblast -query $reads $mmm -infmt $infmt -db $genomeDir/genome -num_threads $nThreads " 66 | time bin/magicblast -query $reads $mmm -infmt $infmt -db $genomeDir/genome -num_threads $nThreads > $out.sam_unsorted 67 | ls -ls $out.sam_unsorted 68 | time sort $out.sam_unsorted > $out.sam_sorted 69 | gzip $out.sam_sorted 70 | ls -ls $out.sam_sorted.gz 71 | # rm $mm/$run/$mm.$run.sam_unsorted 72 | else 73 | echo "Did not find $genomeDir/genome.nhr or $reads or found $mm/$run/$mm.$run.sam" 74 | ls -ls $mm/$run/$mm.$run.sam 75 | endif 76 | 77 | 78 | -------------------------------------------------------------------------------- /article/Aligners/20_HISAT2_relaxed/align.tcsh: -------------------------------------------------------------------------------- 1 | ../21_HISAT2/align.tcsh -------------------------------------------------------------------------------- /article/Aligners/21_HISAT2/align.tcsh: -------------------------------------------------------------------------------- 1 | #!/bin/tcsh 2 | # =========================================================================== 3 | # 4 | # PUBLIC DOMAIN NOTICE 5 | # National Center for Biotechnology Information 6 | # 7 | # This software/database is a "United States Government Work" under the 8 | # terms of the United States Copyright Act. It was written as part of 9 | # the author's official duties as a United States Government employee and 10 | # thus cannot be copyrighted. This software/database is freely available 11 | # to the public for use. The National Library of Medicine and the U.S. 12 | # Government have not placed any restriction on its use or reproduction. 13 | # 14 | # Although all reasonable efforts have been taken to ensure the accuracy 15 | # and reliability of the software and data, the NLM and the U.S. 
16 | # Government do not and cannot warrant the performance or results that 17 | # may be obtained by using this software or data. The NLM and the U.S. 18 | # Government disclaim all warranties, express or implied, including 19 | # warranties of performance, merchantability or fitness for any particular 20 | # purpose. 21 | # 22 | # Please cite the author in any work or product based on this material. 23 | # 24 | # =========================================================================== 25 | # 26 | # Author: Jean Thierry-Mieg 27 | # 28 | set mm=$1 29 | set run=$2 30 | set target=$3 31 | set reads=$4 32 | set mates=$5 33 | 34 | set genome=Fasta/$run/genome.gz 35 | set out=$mm/$run/$mm.$run 36 | mkdir -p $mm/$run 37 | set mm2=21_HISAT2 38 | 39 | set nThreads=4 40 | set genomeDir=Aligners/$mm2/$target 41 | if (! -e $genomeDir) mkdir -p $genomeDir 42 | 43 | if (! $?TMPDIR) then 44 | set TMPDIR=/tmp 45 | if (-d /export/home/TMP) set TMPDIR=/export/home/TMP 46 | endif 47 | 48 | # construct the hisat genome index 49 | if (-e $genome && ! -e $genomeDir/genome.1.ht2) then 50 | echo "gunzip -c $genome > $genomeDir/mygenome.fasta" 51 | gunzip -c $genome > $genomeDir/mygenome.fasta 52 | ls -ls $genomeDir/mygenome.fasta 53 | if (-e $genomeDir/mygenome.fasta) then 54 | time bin/hisat2-master/hisat2-build $genomeDir/mygenome.fasta $genomeDir/genome 55 | touch $genomeDir/done 56 | # \rm $genomeDir/mygenome.fasta 57 | endif 58 | endif 59 | 60 | if (-e $genomeDir/genome.1.ht2) then 61 | echo "HISAT2 index is ready" 62 | else 63 | echo "missing HISAT2 index $genomeDir/genome.1.ht2" 64 | goto done 65 | endif 66 | 67 | if ($reads == "") then 68 | echo "missing parametets 4 which should be the reads file" 69 | goto done 70 | endif 71 | 72 | if (! 
-e $reads) then 73 | echo "cannot find the reads file $reads" 74 | goto done 75 | endif 76 | 77 | if ($mates == "") then 78 | set rr="-U $reads" 79 | else 80 | set rr="-1 $reads -2 $mates" 81 | endif 82 | 83 | set type=`echo $reads | gawk '{t="f"}/fastq/{t="q"}{print t}'` 84 | 85 | set params="" 86 | if ($mm == 20_HISAT2_relaxed) then 87 | set params="--min-score L,0.0,-2" 88 | endif 89 | 90 | if (-e $out.sam || -e $out.sam_sorted.gz) then 91 | echo "$out.sam ready" 92 | else 93 | if (! -d $mm/$run) mkdir -p $mm/$run 94 | uname -a 95 | echo " time bin/hisat2-master/hisat2 -$type -x $genomeDir/genome $rr $params -p $nThreads -S $out.sam" 96 | time bin/hisat2-master/hisat2 -$type -x $genomeDir/genome $rr $params -p $nThreads -S $out.sam 97 | endif 98 | 99 | if (-e $out.sam && ! -e $out.sam_sorted.gz) then 100 | if (-e $TMPDIR/$mm/$run) \rm -rf $TMPDIR/$mm/$run 101 | mkdir -p $TMPDIR/$mm/$run 102 | cat $out.sam | sort -T $TMPDIR/$mm/$run | gzip > $out.sam_sorted.gz 103 | endif 104 | if (-e $out.sam && -e $out.sam_sorted.gz) then 105 | \rm $out.sam 106 | endif 107 | 108 | if (-e $TMPDIR/$mm/$run) \rm -rf $TMPDIR/$mm/$run 109 | 110 | done: 111 | echo done 112 | -------------------------------------------------------------------------------- /article/Aligners/30_STAR/align.tcsh: -------------------------------------------------------------------------------- 1 | ../31_STARlong/align.tcsh -------------------------------------------------------------------------------- /article/Aligners/31_STARlong/align.tcsh: -------------------------------------------------------------------------------- 1 | #!/bin/tcsh 2 | # =========================================================================== 3 | # 4 | # PUBLIC DOMAIN NOTICE 5 | # National Center for Biotechnology Information 6 | # 7 | # This software/database is a "United States Government Work" under the 8 | # terms of the United States Copyright Act. 
It was written as part of 9 | # the author's official duties as a United States Government employee and 10 | # thus cannot be copyrighted. This software/database is freely available 11 | # to the public for use. The National Library of Medicine and the U.S. 12 | # Government have not placed any restriction on its use or reproduction. 13 | # 14 | # Although all reasonable efforts have been taken to ensure the accuracy 15 | # and reliability of the software and data, the NLM and the U.S. 16 | # Government do not and cannot warrant the performance or results that 17 | # may be obtained by using this software or data. The NLM and the U.S. 18 | # Government disclaim all warranties, express or implied, including 19 | # warranties of performance, merchantability or fitness for any particular 20 | # purpose. 21 | # 22 | # Please cite the author in any work or product based on this material. 23 | # 24 | # =========================================================================== 25 | # 26 | # Author: Jean Thierry-Mieg 27 | # 28 | set mm=$1 29 | set run=$2 30 | set target=$3 31 | set reads=$4 32 | set mates=$5 33 | 34 | set genome=Fasta/$run/genome.gz 35 | set out=$mm/$run/$mm.$run 36 | mkdir -p $mm/$run 37 | 38 | set nThreads=4 39 | set genomeDir=Aligners/$mm/$target 40 | if (! -e $genomeDir) mkdir -p $genomeDir 41 | 42 | if (! $?TMPDIR) then 43 | set TMPDIR=/tmp 44 | if (-d /export/home/TMP) set TMPDIR=/export/home/TMP 45 | endif 46 | 47 | # --sjdbOverhang 100 : best for long reads >= 100, do not give is no gff file 48 | if (-e $genome && ! 
-e $genomeDir/SAindex) then 49 | echo "gunzip -c $genome > $genomeDir/mygenome.fasta" 50 | gunzip -c $genome > $genomeDir/mygenome.fasta 51 | ls -ls $genomeDir/mygenome.fasta 52 | if (-e $genomeDir/mygenome.fasta) then 53 | # lmem03 8threads 1h49 elapsed, 513%, 32673u+1117s 54 | time bin/STARlong --runMode genomeGenerate --runThreadN $nThreads --genomeDir $genomeDir --genomeFastaFiles $genomeDir/mygenome.fasta 55 | touch $genomeDir/done 56 | #\rm $genomeDir/mygenome.fasta 57 | endif 58 | endif 59 | 60 | if (-e $genomeDir/SAindex) then 61 | echo "STAR index is ready" 62 | else 63 | echo "missing STAR index $genomeDir/SAindex" 64 | goto done 65 | endif 66 | 67 | if (0) then 68 | --readFilesCommand gunzip -c # allow .gz on input 69 | --outSAMtype SAM/BAM/None [Unsorted/SortedByCoordinates] 70 | --outSAMattributes All 71 | --outSAMattributes Standard 72 | --outFileNamePrefix $out 73 | --outTmpDir $TMPDIR 74 | --outFilterMatchNmin 24 75 | --outFilterScoreMin 24 76 | --outFilterMismatchNmax 100000 77 | --outFilterMismatchNoverLmax 100000 78 | --twopassMode Basic 79 | --genomeLoad LoadAndRemove 80 | --genomeLoad NoSharedMemory # only one compatible with 2-pass mode 81 | endif 82 | echo $out'Aligned.out.sam' 83 | 84 | if (-e $out'Aligned.out.sam' && ! -e $out.sam_sorted.gz) then 85 | cat $out'Aligned.out.sam' | sort -T $TMPDIR | gzip > $mm/$run/$mm.$run.sam_sorted.gz 86 | endif 87 | 88 | set mySTAR=STAR 89 | if ($mm == 31_STARlong) set mySTAR=STARlong 90 | if ($mm == 30_STAR) then 91 | set mySTAR=STAR_1pass 92 | endif 93 | 94 | if (-e $out.sam || -e $out.sam_sorted.gz) then 95 | echo "$out.sam ready" 96 | else 97 | if (! 
-d $mm/$run) mkdir -p $mm/$run 98 | mkdir -p $TMPDIR/$mm 99 | if (-e $TMPDIR/$mm/$run) \rm -rf $TMPDIR/$mm/$run 100 | if ($mm == 30_STAR) then 101 | uname -a 102 | echo " time bin/$mySTAR --runThreadN $nThreads --genomeDir $genomeDir --readFilesCommand gunzip -c --outSAMtype SAM --outSAMattributes Standard --outFileNamePrefix $out --outTmpDir $TMPDIR/$mm/$run --genomeLoad NoSharedMemory --readFilesIn $reads $mates" 103 | time bin/$mySTAR --runThreadN $nThreads --genomeDir $genomeDir --readFilesCommand gunzip -c --outSAMtype SAM --outSAMattributes Standard --outFileNamePrefix $out --outTmpDir $TMPDIR/$mm/$run --genomeLoad NoSharedMemory --readFilesIn $reads $mates 104 | else 105 | uname -a 106 | echo " time bin/$mySTAR --runThreadN $nThreads --genomeDir $genomeDir --readFilesCommand gunzip -c --outSAMtype SAM --outSAMattributes Standard --outFileNamePrefix $out --outTmpDir $TMPDIR/$mm/$run --outFilterMatchNmin 24 --outFilterScoreMin 24 --outFilterMismatchNmax 100000 --outFilterMismatchNoverLmax .5 --genomeLoad NoSharedMemory --twopassMode Basic --seedPerReadNmax 100000 --readFilesIn $reads $mates" 107 | time bin/$mySTAR --runThreadN $nThreads --genomeDir $genomeDir --readFilesCommand gunzip -c --outSAMtype SAM --outSAMattributes Standard --outFileNamePrefix $out --outTmpDir $TMPDIR/$mm/$run --outFilterMatchNmin 24 --outFilterScoreMin 24 --outFilterMismatchNmax 100000 --outFilterMismatchNoverLmax .5 --genomeLoad NoSharedMemory --twopassMode Basic --seedPerReadNmax 100000 --readFilesIn $reads $mates 108 | endif 109 | endif 110 | 111 | 112 | if (-e $out'Aligned.out.sam' && ! 
-e $mm/$run/$mm.$run.sam_sorted.gz) then 113 | cat $out'Aligned.out.sam' | sort -T $TMPDIR | gzip > $mm/$run/$mm.$run.sam_sorted.gz 114 | endif 115 | if (-e $out'Aligned.out.sam' && -e $mm/$run/$mm.$run.sam_sorted.gz) then 116 | \rm $out'Aligned.out.sam' 117 | endif 118 | 119 | 120 | done: 121 | echo done 122 | -------------------------------------------------------------------------------- /article/Aligners/32_STAR.2.6c/align.tcsh: -------------------------------------------------------------------------------- 1 | ../31_STARlong/align.tcsh -------------------------------------------------------------------------------- /article/Aligners/40_TopHat2/align.tcsh: -------------------------------------------------------------------------------- 1 | #!/bin/tcsh 2 | # =========================================================================== 3 | # 4 | # PUBLIC DOMAIN NOTICE 5 | # National Center for Biotechnology Information 6 | # 7 | # This software/database is a "United States Government Work" under the 8 | # terms of the United States Copyright Act. It was written as part of 9 | # the author's official duties as a United States Government employee and 10 | # thus cannot be copyrighted. This software/database is freely available 11 | # to the public for use. The National Library of Medicine and the U.S. 12 | # Government have not placed any restriction on its use or reproduction. 13 | # 14 | # Although all reasonable efforts have been taken to ensure the accuracy 15 | # and reliability of the software and data, the NLM and the U.S. 16 | # Government do not and cannot warrant the performance or results that 17 | # may be obtained by using this software or data. The NLM and the U.S. 18 | # Government disclaim all warranties, express or implied, including 19 | # warranties of performance, merchantability or fitness for any particular 20 | # purpose. 21 | # 22 | # Please cite the author in any work or product based on this material. 
23 | # 24 | # =========================================================================== 25 | # 26 | # Author: Jean Thierry-Mieg, Greg Boratyn 27 | # 28 | 29 | set mm=$1 30 | set run=$2 31 | set genome=$3 32 | set reads=$4 33 | set mates=$5 34 | 35 | set genomeF=Fasta/$run/genome.gz 36 | set out=$mm/$run/$mm.$run 37 | 38 | set nThreads=4 39 | set genomeDir=Aligners/$mm/$genome 40 | if (! -e $genomeDir) mkdir -p $genomeDir 41 | 42 | mkdir -p $genomeDir 43 | set path=($path `pwd`/bin/bowtie2) 44 | 45 | # create an index if not present 46 | if (! -e $genomeDir/genome.1.bt2) then 47 | echo $genomeDir/genome.1.bt2 48 | gunzip -c Fasta/$run/genome.gz > $genomeDir/my_genome.fasta 49 | bin/bowtie2/bowtie2-build $genomeDir/my_genome.fasta $genomeDir/genome 50 | \rm $genomeDir/my_genome.fasta 51 | endif 52 | 53 | mkdir -p $mm/$run 54 | 55 | # run tophat 56 | echo "$mm $run $reads $mates" 57 | ls -ls $genomeDir/genome.1.bt2 58 | echo "$reads" 59 | ls -ls $reads 60 | echo "$reads" 61 | 62 | if (-e $genomeDir/genome.1.bt2 && -e $reads && ! 
-e $mm/$run/$mm.$run.sam) then 63 | echo "time bin/tophat2/tophat2 -p $nThreads -o ${out}_dir $genomeDir/genome $reads $mates " 64 | time bin/tophat2/tophat2 -p $nThreads -o ${out}_dir $genomeDir/genome $reads $mates 65 | samtools view -h ${out}_dir/accepted_hits.bam >$out.sam_unsorted 66 | time sort $mm/$run/$mm.$run.sam_unsorted > $mm/$run/$mm.$run.sam_sorted 67 | gzip $mm/$run/$mm.$run.sam_sorted 68 | \rm $mm/$run/$mm.$run.sam_unsorted 69 | else 70 | echo "Did not find $genomeDir/genome.1.ht2 or $reads or found $mm/$run/$mm.$run.sam" 71 | ls -ls $mm/$run/$mm.$run.sam 72 | endif 73 | 74 | 75 | -------------------------------------------------------------------------------- /article/MagicBlastPaperMasterScript.tcsh: -------------------------------------------------------------------------------- 1 | scripts/MagicBlastPaperMasterScript.tcsh -------------------------------------------------------------------------------- /article/README: -------------------------------------------------------------------------------- 1 | # Aug 1st, 2018 2 | # Author: Jean Thierry-Mieg, NCBI/NLM/NIH 3 | # For questions, please email mieg@ncbi.nlm.nih.gov 4 | 5 | 6 | The present directory contains the main scripts used in the Magic-BLAST paper analysis 7 | and can be used to replicate our analysis. 
8 | 9 | =========== 10 | 11 | The link for the paper is: https://github.com/ncbi/magicblast/tree/master/article 12 | 13 | Please clone the site: 14 | git clone https://github.com/ncbi/magicblast 15 | cd magicblast/article 16 | you should now see the present README file and 2 directories 17 | scripts: see scripts/README for a description of the content 18 | Aligners: see Aligners/README for a description of the content 19 | and a link to the main script: 20 | MagicBlastPaperMasterScript.tcsh 21 | 22 | Try 23 | tcsh MagicBlastPaperMasterScript.tcsh --help 24 | to see the list of the commands 25 | 26 | ============ 27 | 28 | You can control the number of runs to be analyzed 29 | by editing the definition of the variable $runs, around line 50 30 | of the main script MagicBlastPaperMasterScript.tcsh 31 | 32 | You can control the number of aligners to be analyzed 33 | by editing the definition of the variable $methods, around line 27 34 | of the main script MagicBlastPaperMasterScript.tcsh 35 | 36 | ============ 37 | 38 | init: 39 | 40 | Please run 41 | tcsh MagicBlastPaperMasterScript.tcsh init 42 | 43 | This command should download from NCBI the linux binaries and the reference genomes used in this benchmark 44 | i.e. the 2 Baruzzo genomes for their human and their P.falciparum test sets ls -ls Reference_genome/ 45 | and the GRCh38 human genome which is used for iRefSeq and the Illumina, Roche and Pacbio runs 46 | 47 | 845712 -rw-r--r--. 1 mieg biodata 866005452 Aug 6 15:37 GRCh38.genome.fasta.gz 48 | 4 -rw-r--r--. 1 mieg biodata 1230 Aug 6 15:37 HG19.Baruzzo.genome.TM.txt 49 | 915076 -rw-r--r--. 1 mieg biodata 937034891 Aug 6 15:37 HG19.Baruzzo.genome.fasta.gz 50 | 4 -rw-r--r--. 1 mieg biodata 869 Aug 6 15:37 PFAL.Baruzzo.genome.TM.txt 51 | 6260 -rw-r--r--. 
1 mieg biodata 6408381 Aug 6 15:37 PFAL.Baruzzo.genome.fasta.gz 52 | 53 | init also creates the Fasta directory, but does not yet import the runs 54 | 55 | The binaries are compiled for Linux 64bits Intel processors. See bin/README for details 56 | 57 | ============ 58 | 59 | download: 60 | 61 | Please run 62 | tcsh MagicBlastPaperMasterScript.tcsh download 63 | This command may take a long time depending on the quality of your network connection 64 | it will load from NCBI the fasta/fastq files of all 18 runs: 65 | 9 Baruzzo HG19, 9 PFAL, Illumina, PacBio, Roche 66 | plus the iRefSeq fasta and gff files. 67 | 68 | All these files are copied into Fasta/$run 69 | Because some files are paired-end files, we expect 41 files and 4 symbolic links 70 | ls -ls Fasta/*/*.fast[aq].gz | wc -l 71 | 72 | ============ 73 | ============ 74 | 75 | There is now a choice, you may either download from NCBI the precomputed SAM files 76 | or recompute them yourself 77 | 78 | ============ 79 | 80 | sam: 81 | To download the precomputed SAM files, please run 82 | tcsh MagicBlastPaperMasterScript.tcsh sam 83 | This command may take a very long time depending on the quality of your network connection. 84 | It will load from NCBI the sam files for the 18 runs and for all aligners 85 | 86 | ============ 87 | 88 | align: 89 | To align the data on your own machine, please run 90 | tcsh MagicBlastPaperMasterScript.tcsh align 91 | Notice that the script works in lazy mode: 92 | if some sam files have been downloaded, the corresponding run will not be realigned 93 | 94 | Realigning may take a very long time and requires very large RAM, some cases 95 | demand more than 32 Gb of RAM. If you do not have a large hardware, we rather 96 | recommend that you download the precomputed SAM files as explained above. 97 | If you do realign, you may want to study and configure the self documented 98 | file scripts/submit which can help to fan out the alignments on a compute far. 
99 | 100 | 101 | ============ 102 | ============ 103 | 104 | Analysis. 105 | 106 | Once the sam files are available, they are analyzed by the following three sub-commands 107 | 108 | aliqc : run AliQC.py on each run in the background (slow) 109 | Notice that the command ' MagicBlastPaperMasterScript.tcsh aliqc' requires HTSeq 110 | which is installed in scripts/HTSeq. This is expected to work, but if you encounter 111 | a problem, please study the additional information in the file scripts/HTSeq/README 112 | 113 | This command scans the SAM files, compares them to the reference genome and construct 114 | detailed statistics on the quality of the alignment and the nature of the mismatches. 115 | accuracy : analyze the intron found in all the BAM files and compares them to the benchmark truth 116 | This command runs a dedicated C program, see info in the file bin/README, which 117 | compares the introns discovered in the SAM file to the benchmark truth. 118 | export : export QC and ROC curve of intron discovery and the histogram of aligned lengths 119 | The final tab delimited tables will appear in the RESULTS directory. 120 | 121 | 122 | ============= 123 | ============= 124 | 125 | 126 | 127 | 128 | 129 | -------------------------------------------------------------------------------- /article/scripts/MagicBlastPaperMasterScript.tcsh: -------------------------------------------------------------------------------- 1 | #!/bin/tcsh 2 | # =========================================================================== 3 | # 4 | # PUBLIC DOMAIN NOTICE 5 | # National Center for Biotechnology Information 6 | # 7 | # This software/database is a "United States Government Work" under the 8 | # terms of the United States Copyright Act. It was written as part of 9 | # the author's official duties as a United States Government employee and 10 | # thus cannot be copyrighted. This software/database is freely available 11 | # to the public for use. 
The National Library of Medicine and the U.S. 12 | # Government have not placed any restriction on its use or reproduction. 13 | # 14 | # Although all reasonable efforts have been taken to ensure the accuracy 15 | # and reliability of the software and data, the NLM and the U.S. 16 | # Government do not and cannot warrant the performance or results that 17 | # may be obtained by using this software or data. The NLM and the U.S. 18 | # Government disclaim all warranties, express or implied, including 19 | # warranties of performance, merchantability or fitness for any particular 20 | # purpose. 21 | # 22 | # Please cite the author in any work or product based on this material. 23 | # 24 | # =========================================================================== 25 | # 26 | # 27 | ## MagicBLAST_paper_master_script.tcsh 28 | ## 29 | ## MagicBLAST paper, june 2018, master script 30 | ## Author, Greg Boratyn, Danielle Thierry-Mieg, Jean Thierry-Mieg, Ben Busby, Tom Madden 31 | ## email for this script: mieg@ncbi.nlm.nih.gov 32 | 33 | ## This is a tcsh executable script 34 | ## To see the on line help, run it under the tcsh interpretor using the command 35 | ## MagicBLAST_paper_master_script.tcsh 36 | 37 | if ($# == 0) goto phase_Help 38 | if ($1 == help) goto phase_Help 39 | if ($1 == '-help') goto phase_Help 40 | if ($1 == '--help') goto phase_Help 41 | 42 | ############################################################################# 43 | ## Metadata 44 | 45 | ## Aligners 46 | # List of aligners used in the analysis 47 | # The number in front serves to order the tables in a systematic way 48 | # one can insert new versions of each program by inserting new numbers 49 | # but since the numbers are erased in the final tables, the number AND the names 50 | # must be unique 51 | setenv methods "10_MagicBLAST 20_HISAT2_relaxed 21_HISAT2 30_STAR 31_STARlong 32_STAR.2.6c 40_TopHat2" 52 | # setenv methods "21_HISAT2 31_STARlong" 53 | # setenv methods "21_HISAT2" 54 | # 
setenv methods "40_TopHat2" 55 | # setenv methods "20_HISAT2_relaxed 21_HISAT2 " 56 | # setenv methods "10_MagicBLAST" 57 | # setenv methods "30_STAR 31_STARlong 32_STAR.2.6c" 58 | # setenv methods "10_MagicBLAST 20_HISAT2_relaxed 21_HISAT2 30_STAR 31_STARlong 32_STAR.2.6c" 59 | ############################################################################# 60 | ## Datasets 61 | ## Each dataset must be aligned on the reference genome carrying the relevant truth 62 | # Experimental human datasets, to be aligned on the NCBI human genome 63 | setenv main_runs "iRefSeq PacBio Roche Illumina" 64 | 65 | # Baruzzo datasets, to be aligned on the Baruzzo human reference genome 66 | setenv HG19_r1_runs "HG19t1r1 HG19t2r1 HG19t3r1" 67 | setenv HG19_r2_runs "HG19t1r2 HG19t2r2 HG19t3r2" 68 | setenv HG19_r3_runs "HG19t1r3 HG19t2r3 HG19t3r3" 69 | setenv HG19_runs "$HG19_r1_runs $HG19_r2_runs $HG19_r3_runs" 70 | 71 | # Baruzzo datasets, to be aligned on the Baruzzo malaria reference genome 72 | setenv PFAL_r1_runs "PFALt1r1 PFALt2r1 PFALt3r1" 73 | setenv PFAL_r2_runs "PFALt1r2 PFALt2r2 PFALt3r2" 74 | setenv PFAL_r3_runs "PFALt1r3 PFALt2r3 PFALt3r3" 75 | setenv PFAL_runs "$PFAL_r1_runs $PFAL_r2_runs $PFAL_r3_runs" 76 | 77 | setenv runs "$main_runs $HG19_runs $PFAL_runs" 78 | 79 | # Additional PacBio runs from Brain and Testis 80 | setenv pacbio_runs "SRR5189652 SRR5189667" 81 | # Additional long paired end Illumina reads 82 | # 250_250 (from metastatic melanoma) and 300+300 from MCF7 cells) 83 | setenv long_illumina_runs "SRR5438850 SRR5437876" 84 | # setenv runs "$main_runs $HG19_runs $PFAL_runs $pacbio_runs $long_illumina_runs" 85 | # setenv runs "$long_illumina_runs" 86 | # setenv runs "$pacbio_runs" 87 | # setenv runs "PacBio Illumina" 88 | # setenv runs "Roche PacBio iRefSeq " 89 | # setenv runs "$runs PFALt1r1S HG19t1r1_50 $pacbio_runs" 90 | # setenv runs "HG19t1r1_50" 91 | # setenv runs "PFALt1r1S" 92 | # setenv runs "$pacbio_runs" 93 | # setenv runs "$runs HG19t1r1_50 
PFALt1r1S" 94 | setenv runs "$main_runs $HG19_runs $PFAL_runs $pacbio_runs $long_illumina_runs PFALt1r1S HG19t1r1_50 " 95 | 96 | 97 | # This adapter is present in the PacBio SRR runs and gives a peak at 32 aligned bases = polyA + first 8 bp of adaptor 98 | # AAAAAAAAAAAAAAAAAAAAAAAAAAAAGTACTCT GCGTTGATACCACTGCTTAGATCGGAAGAG 99 | ############################################################################# 100 | ## Fasta and Fastq files 101 | ## All runs fasta or fastq files are in the directories Fasta/$run 102 | ## If they are absent, the script will download them from NCBI 103 | # 104 | # The script assumes that all files are gzipped, and called $run/$run*.fast[aq].gz 105 | # their logical name, i.e. PacBio, links to their SRA identificator, e.g. SRR5009494. 106 | # The iRefSeq and the Baruzzo files are given in fasta format 107 | # The Illumina, Pabio and Roche fils are given in .fastq format 108 | # Some runs are paired-ends: 109 | # -Illumina paired end run has 2 files called SRR534301_1 and SRR534301_2 110 | # -Baruzzo paired end runs are called .forward and .reverse 111 | # In the Roche file Fasta/Roche/SRR899420.fastq we removed all read_1 (all 4-bases long) 112 | # and kept only the 24577 read_2. 113 | # 114 | # For convenience and completeness, we also copied in Fasta/iRefSeq the original gff file 115 | # The iRefSeq fasta file can be extracted from the gff file using the command option 116 | # MagicBLAST_paper_master_script.tcsh make_iRefSeq 117 | # of the the present script 118 | 119 | foreach run ($runs) 120 | if (! 
-d Fasta/$run) mkdir -p Fasta/$run 121 | end 122 | 123 | ############################################################################# 124 | ## Reference genome 125 | # The main genome is NCBI release 19, limited to the main chromosome and 126 | # the mitochondria excluding the patches and the alternates 127 | setenv main_genome GRCh38.genome.fasta.gz 128 | # Baruzzo benchmark reference genome, available from 129 | setenv HG19_genome HG19.Baruzzo.genome.fasta.gz 130 | setenv PFAL_genome PFAL.Baruzzo.genome.fasta.gz 131 | 132 | ## Automatic download of the BenchMark fastq files from NCBI 133 | set FTP="ftp://ftp.ncbi.nlm.nih.gov/blast/demo/magicblast_article/" 134 | if (! -d Reference_genome) mkdir Reference_genome 135 | pushd Reference_genome 136 | foreach ff (GRCh38.genome.fasta.gz HG19.Baruzzo.genome.TM.txt HG19.Baruzzo.genome.fasta.gz PFAL.Baruzzo.genome.TM.txt PFAL.Baruzzo.genome.fasta.gz) 137 | if (! -e $ff) then 138 | wget $FTP/Reference_genome/$ff 139 | endif 140 | end 141 | popd 142 | 143 | if (1) then 144 | foreach run ($HG19_runs) 145 | if (! -d Fasta/$run || -e Fasta/$run/target) continue 146 | pushd Fasta/$run 147 | ln -s ../../Reference_genome/HG19.Baruzzo.genome.fasta.gz genome.gz 148 | echo HG19 > target 149 | popd 150 | end 151 | foreach run ($PFAL_runs) 152 | if (! -d Fasta/$run || -e Fasta/$run/target) continue 153 | pushd Fasta/$run 154 | ln -s ../../Reference_genome/PFAL.Baruzzo.genome.fasta.gz genome.gz 155 | echo PFAL > target 156 | popd 157 | end 158 | foreach run ($main_runs) 159 | if (! -d Fasta/$run || -e Fasta/$run/target) continue 160 | pushd Fasta/$run 161 | ln -s ../../Reference_genome/GRCh38.genome.fasta.gz genome.gz 162 | echo GRCh38 > target 163 | popd 164 | end 165 | foreach run ($pacbio_runs $long_illumina_runs) 166 | if (! 
-d Fasta/$run || -e Fasta/$run/target) continue 167 | pushd Fasta/$run 168 | ln -s ../../Reference_genome/GRCh38.genome.fasta.gz genome.gz 169 | echo GRCh38 > target 170 | popd 171 | end 172 | touch Fasta/genomes_are_assigned 173 | 174 | foreach run ($pacbio_runs iRefSeq PacBio) 175 | if (-d Fasta/$run) touch Fasta/$run/isLongRun 176 | end 177 | endif 178 | 179 | ############################################################################# 180 | ## Automatic download of the binaries from the NCBI ftp site 181 | 182 | set FTP="ftp://ftp.ncbi.nlm.nih.gov/blast/demo/magicblast_article" 183 | if (! -d bin || ! -d scripts/HTSeq) then 184 | if (-e bin/binaries.linux64.tar.gz) then 185 | mv binaries.linux64.tar.gz . 186 | endif 187 | if (! -e binaries.linux64.tar.gz) then 188 | wget $FTP/binaries.linux64.tar.gz 189 | endif 190 | if (! -e binaries.linux64.tar.gz) then 191 | echo "FATAL ERROR: The automatic download of the binaries from $FTP/binaries.linux64.tar.gz failed" 192 | echo "May be the network connection did not work, please try manually to run the command" 193 | echo " wget $FTP/binaries.linux64.tar.gz" 194 | echo "if it does not work please email mieg@ncbi.nlm.nih.gov" 195 | endif 196 | if (-e binaries.linux64.tar.gz) then 197 | echo "expanding binaries.linux64.tar.gz, please wait" 198 | gunzip -c binaries.linux64.tar.gz | tar xf - 199 | mv binaries.linux64.tar.gz bin 200 | endif 201 | endif 202 | 203 | ############################################################################# 204 | ## BAM files 205 | ## The BAM files are named $method/$run/$method.$run.bam 206 | ## All runs were aligned on the relevant appropriate genome by all aligners 207 | ## but it did not always work. 
Some files are missing, for example 30_STAR.iRefSeq.bam, 208 | ## because STAR crashed on long reads 209 | 210 | ############################################################################## 211 | ## utilities 212 | setenv TMPDIR /tmp 213 | if (-d /export/home/TMP) setenv TMPDIR /export/home/TMP 214 | if (! -d tmp) mkdir tmp 215 | if (! -d RESULTS) mkdir RESULTS 216 | 217 | ############################################################################## 218 | ############################################################################## 219 | ## Executable and source code 220 | ## Our scripts are in the scripts directory, e.g. scripts/AliQC.py 221 | ## Our executables are compiled for generic LINUX 64 bits machine in the bin directory 222 | ## e.g. magicblast, dna2dna, sam2gold 223 | ## Our source code is available for analysis and recompilation in machine optimized mode 224 | ## in the source_code directory, together with instructions in the corresponding README file. 225 | 226 | echo -n "## MagicBlastPaperMasterScript.tcsh $1 : " 227 | date 228 | 229 | echo "runs = $runs" 230 | echo "methods = $methods" 231 | 232 | if ($1 == init) goto phase_Init 233 | if ($1 == download) goto phase_Download 234 | if ($1 == align) goto phase_Align 235 | if ($1 == Make_iRefSeq) goto phase_Make_iRefSeq 236 | if ($1 == count) goto phase_Count 237 | if ($1 == sam) goto phase_Sam 238 | if ($1 == accuracy) goto phase_Accuracy 239 | if ($1 == aliqc) goto phase_aliqc 240 | if ($1 == errors) goto phase_DirectErrorCount 241 | if ($1 == export) goto phase_Export 242 | if ($1 == aliLn) goto phase_aliLn 243 | if ($1 == subs) goto phase_Count_subtitutions_in_benchmark 244 | echo "Unknown command : $1, please try --help" 245 | goto phaseLoop 246 | 247 | phase_Help: 248 | 249 | echo "\nusage scripts/MagicBlastPaperMasterScript.tcsh, where command is one of:" 250 | echo 'help : This online help' 251 | echo 'init : in tcsh, "source README init" will set the variables $runs, $methods which may be
convenient' 252 | echo 'download : Automatic download of the fastq files, please monitor carefully the results' 253 | echo 'Make_iRefSeq : create the iRefSeq fasta file and intron file from the gff and the genome file' 254 | echo 'count : Count the reads in each fasta/fastq file' 255 | echo 'sam : Automatically download the sam files from NCBI (rather than running the aligners locally)' 256 | echo 'align : run for all runs all aligners for which the script Aligners/$method/align.tcsh is defined' 257 | echo 'aliqc : run AliQC.py on each run in the background (slow), presupposes that HTSeq is installed' 258 | echo 'accuracy : measure intron and alignment accuracy relative to the gold standard' 259 | echo 'errors : count the errors in the BAM files using the NM:i:x optional field' 260 | echo 'subs : count substitutions in the human and malaria Baruzzo benchmarks' 261 | echo 'aliLn : export histogram of aligned lengths' 262 | echo 'export : export QC and ROC curve of intron discovery' 263 | 264 | goto phaseLoop 265 | 266 | phase_Init: 267 | goto phaseLoop 268 | 269 | ############################################################################## 270 | ############################################################################# 271 | ## Automatic download of the BenchMark fastq files from NCBI 272 | phase_Download: 273 | 274 | ## NCBI repository for the datafiles used in the paper 275 | set FTP="ftp://ftp.ncbi.nlm.nih.gov/blast/demo/magicblast_article/" 276 | 277 | echo "checking $HG19_runs $PFAL_runs" 278 | foreach run ( $HG19_runs $PFAL_runs) 279 | if (! -e Fasta/$run/$run.reverse.fasta.gz) then 280 | pushd Fasta/$run 281 | echo "Trying to download $run from $FTP" 282 | wget $FTP/Fasta/$run/$run.cig.gz 283 | wget $FTP/Fasta/$run/$run.forward.fasta.gz 284 | wget $FTP/Fasta/$run/$run.reverse.fasta.gz 285 | popd 286 | endif 287 | end 288 | 289 | set run=HG19t1r1_50 290 | if (! 
-d Fasta/$run) then 291 | echo "preparing the 5+50 clipped run" 292 | mkdir Fasta/$run 293 | pushd Fasta/$run 294 | ln -s ../../Reference_genome/HG19.Baruzzo.genome.fasta.gz genome.gz 295 | ../../bin/dna2dna -i ../HG19t1r1/HG19t1r1.forward.fasta.gz -gzo -o $run.forward -rightClipAt 50 296 | ../../bin/dna2dna -i ../HG19t1r1/HG19t1r1.reverse.fasta.gz -gzo -o $run.reverse -rightClipAt 50 297 | popd 298 | endif 299 | 300 | set run=PFALt1r1S 301 | if (! -d Fasta/$run) then 302 | echo "preparing the subsampled run" 303 | mkdir Fasta/$run 304 | pushd Fasta/$run 305 | ln -s ../../Reference_genome/PFAL.Baruzzo.genome.fasta.gz genome.gz 306 | ../../bin/dna2dna -i ../PFALt1r1/PFALt1r1.forward.fasta.gz -gzo -o $run.forward -subsample 100 307 | ../../bin/dna2dna -i ../PFALt1r1/PFALt1r1.reverse.fasta.gz -gzo -o $run.reverse -subsample 100 308 | popd 309 | endif 310 | 311 | echo "checking iRefSeq" 312 | foreach run (iRefSeq) 313 | if (! -e Fasta/$run/$run.fasta.gz) then 314 | pushd Fasta/$run 315 | echo "Trying to download $run from$FTP" 316 | wget $FTP/Fasta/$run/$run.cig.gz 317 | wget $FTP/Fasta/$run/$run.fasta.gz 318 | wget $FTP/Fasta/$run/GRCh38_genome.gff.gz 319 | ln -s GRCh38_genome.gff.gz genome.gff.gz 320 | gunzip -c $run.fasta.gz | ../../bin/dna2dna -getTM > $run.TM 321 | popd 322 | endif 323 | end 324 | 325 | echo "checking Roche" 326 | foreach run (Roche) 327 | if (! -e Fasta/$run/$run.fasta.gz) then 328 | pushd Fasta/$run 329 | echo "Trying to download $run from$FTP" 330 | wget $FTP/Fasta/$run/$run.fasta.gz 331 | gunzip -c $run.fasta.gz | ../../bin/dna2dna -getTM > $run.TM 332 | popd 333 | endif 334 | end 335 | 336 | ############################################################################# 337 | ## .cig TRUTH Files 338 | ## The Baruzzo benchmark is providing the original position of the simulated reads 339 | ## in their .cig format, which is analogous, but not identical, to the SAM format. 
340 | ## Since the fasta and the .cig files both come from Baruzzo, we located them in Fasta/$run.cig.gz 341 | ## For convenience, we reformatted the RefSeq gff file into a similar Fasta/iRefSeq/iRefSeq.cig.gz 342 | if (-e Fasta/iRefSeq/genome.gff.gz && ! -e Fasta/iRefSeq/iRefSeq.cig.gz) then 343 | gunzip -c Fasta/iRefSeq/genome.gff.gz | gawk -F '\t' -f scripts/gff2cig.awk | gzip > Fasta/iRefSeq/iRefSeq.cig.gz 344 | endif 345 | ## To compare the BAM files produced by the different aligners to the .cig "truth" 346 | ## we developed a C code called sam2gold (see below) 347 | 348 | ############################################################################# 349 | ## Automatic download of the Illumina Roche pacBio from SRA 350 | 351 | foreach run (PacBio Illumina) 352 | if (! -d Fasta/$run) continue 353 | if ($run == PacBio) set run2=SRR5009494 354 | if ($run == Roche) set run2=SRR899420 355 | if ($run == Illumina) set run2=SRR534301 356 | if (! -e Fasta/$run/$run2.fastq.gz && ! -e Fasta/$run/$run2'_1'.fastq.gz) then 357 | set n=`bin/fastq-dump --help | wc -l` 358 | if ($n < 10) then 359 | echo "Sorry, the executable bin/fastq-dump available from NCBI SRA and needed to download the $run run is not found" 360 | goto phaseLoop 361 | endif 362 | set sf="" 363 | if ($run == Illumina) set sf="--split-files" 364 | echo "Trying to download $run2 from SRA" 365 | bin/fastq-dump $sf -O Fasta/$run $run2 366 | if (-e Fasta/$run/$run2.fastq || -e Fasta/$run/$run2'_1'.fastq) then 367 | gzip Fasta/$run/$run2*.fastq 368 | pushd Fasta/$run 369 | if (-e $run2.fastq.gz) ln -s $run2.fastq.gz $run.fastq.gz 370 | if (-e $run2'_1'.fastq.gz) ln -s $run2'_1'.fastq.gz $run'_1'.fastq.gz 371 | if (-e $run2'_2'.fastq.gz) ln -s $run2'_2'.fastq.gz $run'_2'.fastq.gz 372 | popd 373 | endif 374 | if (-e ~/ncbi/public/sra/$run2) \rm -rf ~/ncbi/public/sra/$run2 375 | endif 376 | end 377 | 378 | ############################################################################# 379 | ## Automatic download of 
the fastq files from SRA 380 | 381 | foreach run ($pacbio_runs) 382 | if (! -d Fasta/$run) continue 383 | if (! -e Fasta/$run/$run.fasta.gz && ! -e Fasta/$run/$run.fastq.gz) then 384 | set n=`bin/fastq-dump --help | wc -l` 385 | if ($n < 10) then 386 | echo "Sorry, the executable bin/fastq-dump available from NCBI SRA and needed to download the $run run is not found" 387 | goto phaseLoop 388 | endif 389 | echo "Trying to download $run from SRA" 390 | bin/fastq-dump -O Fasta/$run $run 391 | gzip Fasta/$run/$run.fastq 392 | endif 393 | end 394 | 395 | foreach run ($long_illumina_runs) 396 | if (! -d Fasta/$run) continue 397 | if (! -e Fasta/$run/$run'_1'.fasta.gz && ! -e Fasta/$run/$run'_1'.fastq.gz) then 398 | set n=`bin/fastq-dump --help | wc -l` 399 | if ($n < 10) then 400 | echo "Sorry, the executable bin/fastq-dump available from NCBI SRA and needed to download the $run run is not found" 401 | goto phaseLoop 402 | endif 403 | echo "Trying to download $run from SRA" 404 | bin/fastq-dump -O Fasta/$run --split-files $run 405 | gzip Fasta/$run/$run*.fastq 406 | endif 407 | end 408 | 409 | goto phaseLoop 410 | 411 | ############################################################################## 412 | ############################################################################## 413 | ## Count the number of reads, the shortest, the longest read in every fasta/fastq file 414 | ## using the utility bin/dna2dna (compiled for Linux 64bits) 415 | ## The source code is part of the aceview/magic distribution in the source_code directory 416 | 417 | phase_Count: 418 | echo 'counting the number of reads in each fasta/fastq file' 419 | foreach run ($runs) 420 | if (-e Fasta/$run/$run.fasta.gz && ! -e Fasta/$run/$run.count) then 421 | echo "counting $run, please wait" 422 | bin/dna2dna -i Fasta/$run/$run.fasta.gz -I fasta -count -o Fasta/$run/$run 423 | endif 424 | if (-e Fasta/$run/$run.fastq.gz && ! 
-e Fasta/$run/$run.count) then 425 | echo "counting $run, please wait" 426 | bin/dna2dna -i Fasta/$run/$run.fastq.gz -I fastq -count -o Fasta/$run/$run 427 | endif 428 | if (-e Fasta/$run/$run'_1'.fastq.gz && ! -e Fasta/$run/$run'_1'.count) then 429 | echo "counting $run, please wait" 430 | bin/dna2dna -i Fasta/$run/$run'_1'.fastq.gz -I fastq -count -o Fasta/$run/$run'_1' 431 | bin/dna2dna -i Fasta/$run/$run'_2'.fastq.gz -I fastq -count -o Fasta/$run/$run'_2' 432 | endif 433 | if (-e Fasta/$run/$run.forward.fasta.gz && ! -e Fasta/$run/$run.forward.count) then 434 | echo "counting $run, please wait" 435 | bin/dna2dna -i Fasta/$run/$run.forward.fasta.gz -I fasta -count -o Fasta/$run/$run.forward 436 | bin/dna2dna -i Fasta/$run/$run.reverse.fasta.gz -I fasta -count -o Fasta/$run/$run.reverse 437 | endif 438 | set nreads=`cat Fasta/$run/*.count | gawk '/^Fragment_kept/{n+=$2}END{print n}'` 439 | echo "$run contains $nreads reads" 440 | end 441 | 442 | goto phaseLoop 443 | 444 | ############################################################################## 445 | ############################################################################## 446 | ## Create the iRefSeq fasta file and intron file from the gff and the genome file 447 | 448 | phase_Make_iRefSeq: 449 | 450 | ## In practice, the file Fasta/iRefSeq/iRefSeq.fasta.gz is downloaded from 451 | ## ftp://ftp.ncbi.nlm.nih.gov/blast/demo/magicblast_article/ 452 | ## The scrip is given here for transparency and to allow the reconstruction 453 | ## of the iRefSeq in the future from a diferent gff file and reference genome 454 | 455 | if (! -e Fasta/iRefSeq/iRefSeq.fasta.gz) then 456 | echo "Creating Fasta/iRefSeq/iRefSeq.fasta.gz using the genome and the gff3 anntation" 457 | if (! -e Fasta/iRefSeq/genome.gz) then 458 | echo "Missing file Fasta/iRefSeq/genome.gz, I cannot create the iRefSeq fasta file" 459 | goto phaseLoop 460 | endif 461 | if (! 
-e Fasta/iRefSeq/genome.gff.gz) then 462 | echo "Missing file Fasta/iRefSeq/genome.gff.gz, I cannot create the iRefSeq fasta file" 463 | goto phaseLoop 464 | endif 465 | 466 | echo "Found the genome and the gff file, constructing the fasta in Fasta/iRefSeq/tmp" 467 | if (! -d Fasta/iRefSeq/tmp) mkdir Fasta/iRefSeq/tmp 468 | pushd Fasta/iRefSeq/tmp 469 | # This script is surprisingly complex, sorry, because we are trying to identify the NMs which map as well 470 | # at different locus of the genome, but while doing so, we unfortunately discovered a number of 471 | # irregularities in the definition of the RefSeqs that we try to compensate 472 | 473 | # To simplify the matter, we directly provide the iRefSeq fasta and gff files on our ftp site. 474 | 475 | # extract the NM_ from the gff file 476 | zcat ../genome.gff.gz | grep NM_ | grep NC_ > NM.gff 477 | # we could directly export the fasta file with the command 478 | # ../../../bin/dna2dna -gff3 NM.gff -gtfRemap iRefSeq -gtfGenome ../genome.gz -o iRefSeq -O fasta 479 | # but some NM have a single identifier and yet map on 2 chroms 480 | # by not providing the genome we only export the 6 columns sponge file 481 | ../../../bin/dna2dna -gff3 NM.gff -gtfRemap iRefSeq -o iRefSeq -O fasta 482 | set nNM=`cat iRefSeq.[fr].sponge | cut -f 1 | sort -u | wc -l` 483 | echo "Number of NM_ $nNM (is 45065)" 484 | set nNM_chr=`cat iRefSeq.[fr].sponge | cut -f 1,3 | sort -u | wc -l` 485 | echo "Number of NM_chrom $nNM_chr (is 45108)" 486 | set nG=`cat iRefSeq.[fr].sponge | cut -f 6 | sort -u | wc -l` 487 | echo "Number of genes with NM $nG (is 19269)" 488 | echo "Evaluating the mapping multiplicity of the iRefSeq" 489 | # to fix the issue that the same NM may map on several chromosomes 45108 = 45065 = 43 cases 490 | # we merge the chrom and the NM in column 1 to create a disambiguated shadow file 491 | cat iRefSeq.[fr].sponge | gawk -F '\t' '{printf("%s:%s\t%s\t%s\t%s\t%s\t%s\n",$1,$3,$2,$3,$4,$5,$6);}' > NM_chr.sponge 492 | # the 
sponge file has the NM the gene and the coordinates of all exons, hence the sequence 493 | # we now construct the fasta file 494 | ../../../bin/dna2dna -shadow NM_chr.sponge -i ../genome.gz -o iRefSeq -O fasta -maxLineLn 80 -gzo 495 | # measure the number of distinct NM with identical sponge (hence DNA) and mapping 496 | cat iRefSeq.[fr].sponge | grep NM_ | sort > _t 497 | cat _t | gawk '{nm=$1;z[nm]=z[nm] "_" $3 "_" $4 "_" $5;}END{for(k in z){g[z[k]]=k;u[z[k]]+=1;}for (v in u) {if(u[v]>1)print u[v],g[v]}}' | sort -k 1n > iRefSeq.mapping_multiplicity 498 | ../../../bin/dna2dna -i iRefSeq.fasta.gz -count -o iRefSeq 499 | \mv iRefSeq.fasta.gz iRefSeq.count .. 500 | # extract (NOTE(review): iRefSeq.fasta.gz was moved to .. on the previous line, the 3 commands below may need ../iRefSeq.fasta.gz -- verify) 501 | ../../../bin/dna2dna -i iRefSeq.fasta.gz -O raw | sort > _t1 502 | # count the distinct NM sequences : 44914 503 | ../../../bin/dna2dna -i iRefSeq.fasta.gz -O raw | cut -f 1 | sort -u | wc 504 | ../../../bin/dna2dna -i iRefSeq.fasta.gz -getTM > ../iRefSeq.TM 505 | 506 | # map the NM on the NM to find who is identical or included in the other 507 | clipalign -i iRefSeq.fasta.gz -t iRefSeq.fasta.gz -errMax 0 -o nm2nm -maxHit 24 -minAli 140 508 | bestali -i nm2nm.hits -exportBest -o nm2nm33 509 | # now count NM mapping exactly in NM with a different geneid -> 143, we add 43+43 for the 43 NM which map on X and Y 510 | cat nm2nm33.hits | gawk '{if($2-$4==0 && index($1,"|"$9">")==0)print}' > nm2nm.2genes.hits 511 | wc nm2nm.2genes.hits 512 | cat nm2nm.2genes.hits | gawk -F '\t' '{n[$1]++;}END{for(k in n)u[n[k]]++;for (v in u) {if(v>0)k+=u[v];kk+=v*u[v];print v, u[v];}print k, kk}' | sort -k 1n 513 | # we now have 303 NM mapping on another NM with a different gene name, however some distinct genes have same gene coordinates 514 | # extract the extreme coords of the NM from the sponge file 515 | cat NM_chr.sponge | gawk -F '\t' '{nm=$1;a1=$4;if($5a2)a2=$4;if(aa2[nm]-a1)aa1[nm]=-a1;}END{for(nm in aa1) printf("%s\t%d\t%d\n",nm,-aa1[nm],aa2[nm]);}' > NM_chr.segment 516 | # reanalyze the nm 2 nm hits file and 
eliminate the lines with overlapping coordinates 517 | echo ZZZZZ > ZZZZZ 518 | cat NM_chr.segment ZZZZZ nm2nm.2genes.hits | gawk -F '\t' '/^ZZZZZ/{zz++;next;}{nm=$1;if(zz+0<1){aa1[nm]=$2;aa2[nm]=$3;split(nm,aa,":");chrom[nm]=aa[2];next;}}{split($1,aa,"|");nm1=aa[1];nm2=$11;ok=1;if (chrom[nm1]==chrom[nm2] && aa1[nm1] aa1[nm2])ok=0;if (ok==1)print}' > nm2nm.2genes.hits.no_doublons 519 | cat NM_chr.segment ZZZZZ nm2nm.2genes.hits | gawk -F '\t' '/^ZZZZZ/{zz++;next;}{nm=$1;if(zz+0<1){aa1[nm]=$2;aa2[nm]=$3;split(nm,aa,":");chrom[nm]=aa[2];next;}}{split($1,aa,"|");nm1=aa[1];nm2=$11;ok=1;if (chrom[nm1]==chrom[nm2] && aa1[nm1] aa1[nm2])ok=0;if (ok==0)print}' > nm2nm.2genes.hits.doublons 520 | # final count of the repeated NM : 291 NM have several mappings 521 | cat nm2nm.2genes.hits.no_doublons | gawk -F '\t' '{n[$1]++;}END{for(k in n)u[n[k]]++;for (v in u) {if(v>0)k+=u[v];kk+=v*u[v];print v, u[v];}print kk}' | sort -k 1n 522 | # so finally we have 291 NM have multiple mapping just by looking at the annotated NM themselves + (43 + 43) from the pseudo autosomal region with single NM and geneid total 291+86=379 523 | cat NM_chr.segment| gawk '{split($1,aa,":");n[aa[1]]++;chrom[aa[1]]=aa[2];}END{for (nm in n)if(n[nm]>1)print nm,n[nm],chrom[nm];}' > NM.pseudo_autosomal_region.mapping_twice 524 | wc NM.pseudo_autosomal_region.mapping_twice 525 | cat nm2nm.2genes.hits.no_doublons | gawk '{split($1,aa,"|");print aa[3] "="$9}' | sed -e 's/>/ gene_pairs 526 | ## construct a cig file for the refseq 527 | # Use NM_...:chrom... as NM identifiers because in the pseudo autosomal region, the same NM maps in 2 places: one NM_ 2 locus 528 | # whereas a usual palindromic exactly duplicated genes has 1 NM per locus, i.e. 
2 NM 2 locus 529 | # this raises the number of NM supported introns from 210357 to 210509 530 | 531 | zcat ../genome.gff.gz | gawk -F '\t' '{if ($3 != "exon") next;}{split($9,aa,"Genbank:");split(aa[2],bb,",");split(bb[1],cc,";");seq=cc[1];if(substr(seq,1,2)!="NM" && substr(seq,1,2)!="zNR")next;seq=seq ":" $1; chrom[seq]=$1;nx[seq]++;i=nx[seq];a1[seq,i]=$4;a2[seq,i]=$5;strand[seq]=$7;}END{for (seq in nx){n=nx[seq];printf("%s\t%s",seq,chrom[seq]);if(strand[seq]=="+"){printf("\t%d\t%d\t", a1[seq,1], a2[seq,n]) ;for(i = 1 ; i <=n ; i++){if(i>1){dx=a1[seq,i]-a2[seq,i-1]-1;printf("%dN",dx);}dx=a2[seq,i]-a1[seq,i]+1;printf("%dM",dx);}}else {printf("\t%d\t%d\t", a1[seq,n], a2[seq,1]) ;for(i = n ; i >=1 ; i--){if(i iRefSeq.cig.gz 532 | mv iRefSeq.cig.gz .. 533 | 534 | endif 535 | 536 | goto phaseLoop 537 | 538 | ############################################################################## 539 | ############################################################################## 540 | ## SAM 541 | ## download the precomputed SAM files from NCBI 542 | phase_Sam: 543 | 544 | set FTP="ftp://ftp.ncbi.nlm.nih.gov/blast/demo/magicblast_article/" 545 | foreach run ($runs) 546 | foreach method ($methods) 547 | 548 | ## the preferred methos is to download the aligned files from NCBI 549 | 550 | # For HISAT and STAR we have a special version of the code to align long runs 551 | # so in these cases we do not atempt to align the long runs with the short code 552 | if ($method == 30_STAR || $method == 32_STAR.2.6c) then 553 | if (-e Fasta/$run/isLongRun) continue 554 | endif 555 | # and vice versa 556 | if ( $method == 20_HISAT2_relaxed) then 557 | if (! -e Fasta/$run/isLongRun) continue 558 | endif 559 | 560 | if (! 
-e $method/$run/$method.$run.sam_sorted.gz) then 561 | mkdir -p $method/$run 562 | pushd $method/$run 563 | wget $FTP/SAM/$method.$run.sam_sorted.gz 564 | popd 565 | endif 566 | 567 | end 568 | end 569 | 570 | goto phaseLoop 571 | 572 | ############################################################################## 573 | ############################################################################## 574 | ## ALIGN Run all aligners on all runs 575 | 576 | phase_Align: 577 | 578 | foreach run ($runs) 579 | foreach method ($methods) 580 | 581 | if (! -e Aligners/$method/align.tcsh) then 582 | echo "missing script Aligners/$method/align.tcsh" 583 | continue 584 | endif 585 | 586 | # For HISAT and STAR we have a special version of the code to align long runs 587 | # so in these cases we do not atempt to align the long runs with the short code 588 | if ($method == 30_STAR || $method == 32_STAR.2.6c) then 589 | if (-e Fasta/$run/isLongRun) continue 590 | endif 591 | # and vice versa 592 | if ( $method == 20_HISAT2_relaxed) then 593 | if (! -e Fasta/$run/isLongRun) continue 594 | endif 595 | 596 | if (-e Aligners/$method/align.tcsh && ! 
-e $method/$run/$method.$run.sam_sorted.gz) then 597 | set read_1="x" 598 | set read_2="" 599 | if (-e Fasta/$run/$run'_1.fastq.gz') set read_1=Fasta/$run/$run'_1.fastq.gz' 600 | if (-e Fasta/$run/$run'_2.fastq.gz') set read_2=Fasta/$run/$run'_2.fastq.gz' 601 | if (-e Fasta/$run/$run'_1.fasta.gz') set read_1=Fasta/$run/$run'_1.fasta.gz' 602 | if (-e Fasta/$run/$run'_2.fasta.gz') set read_2=Fasta/$run/$run'_2.fasta.gz' 603 | if (-e Fasta/$run/$run.fasta.gz) set read_1=Fasta/$run/$run.fasta.gz 604 | if (-e Fasta/$run/$run.fastq.gz) set read_1=Fasta/$run/$run.fastq.gz 605 | if (-e Fasta/$run/$run.forward.fastq.gz) set read_1=Fasta/$run/$run.forward.fastq.gz 606 | if (-e Fasta/$run/$run.reverse.fastq.gz) set read_2=Fasta/$run/$run.reverse.fastq.gz 607 | if (-e Fasta/$run/$run.forward.fasta.gz) set read_1=Fasta/$run/$run.forward.fasta.gz 608 | if (-e Fasta/$run/$run.reverse.fasta.gz) set read_2=Fasta/$run/$run.reverse.fasta.gz 609 | 610 | if (! -e $read_1) then 611 | echo "Run $run Missing read file $read_1" 612 | ls -ls Fasta/$run/*fast* 613 | continue 614 | endif 615 | set target=`cat Fasta/$run/target` 616 | if (! 
-d $method/$run) mkdir -p $method/$run 617 | echo "align $method/$run" 618 | scripts/submit $method/$run "Aligners/$method/align.tcsh $method $run $target $read_1 $read_2" 619 | # scripts/submit $method/$run "Aligners/$method/align.tcsh $method $run $target $read_1 $read_2" 64G UGE4 620 | endif 621 | end 622 | end 623 | 624 | goto phaseLoop 625 | 626 | ############################################################################## 627 | ############################################################################## 628 | ## Intron, exon, insertion deletion comparison to the TRUTH 629 | ## Compare the alignment results, provided in BAM format 630 | ## to the GOLD standard truth from iRefSeq and Baruzzo given in .cig format 631 | ## The source C-code is part of the aceview/magic distribution www.aceview.org/Software 632 | ## The executable for LINUX 64 bits is in bin 633 | ## 634 | ## sam2gold produces several output files 635 | ## .qc a small self documented statistics table in tab delimited format 636 | ## .aliqc the same statistics in a more computer friendly tag-values tab delimited format 637 | ## .Intron a table giving the coordinates of all introns, with support in GOLD or BAM 638 | ## .Deletion a table giving the coordinates of all deletions, with support in GOLD or BAM 639 | ## .Insertion a table giving the coordinates of all insertions, with support in GOLD or BAM 640 | 641 | phase_Accuracy: 642 | foreach run ($runs) 643 | if (-e Fasta/iRefSeq/iRefSeq.cig.gz && ! -e Fasta/$run/$run.cig.gz) then 644 | pushd Fasta/$run 645 | ln -s ../iRefSeq/iRefSeq.cig.gz $run.cig.gz 646 | popd 647 | endif 648 | end 649 | # Illumina $HG19_runs $PFAL_runs 650 | #$main_runs $pacbio_runs $long_illumina_runs $HG19_runs $PFAL_runs $methods 651 | foreach run ($runs) 652 | foreach mm ($methods) 653 | if (-e Fasta/$run/$run.cig.gz && -e $mm/$run/$mm.$run.sam_sorted.gz && ! 
-e $mm/$run/$mm.$run.delins.tsv) then 654 | echo $mm/$run/$mm.$run.sam_sorted.gz 655 | set arp="" 656 | set arp=`echo $mm | gawk 'BEGIN{arp="";}{if(index($1,"STAR")>0) arp="-addReadPairSuffix"}END{printf("%s",arp);}'` 657 | \rm $mm/$run/$mm.$run.sam2gold.* 658 | scripts/submit $mm/$run/$mm.$run.sam2gold "bin/sam2gold $arp -g $run..GOLD:Fasta/$run/$run.cig.gz -i $run..$mm':'$mm/$run/$mm.$run.sam_sorted.gz -o $mm/$run/$mm.$run" 659 | endif 660 | end 661 | end 662 | 663 | goto phaseLoop 664 | 665 | ############################################################################## 666 | ############################################################################## 667 | ## Alignment quality control 668 | ## Evaluate in great details the intrinsic quality of the alignment results, provided in BAM format 669 | ## This analysis does not refer to the gold truth 670 | ## This is a python.2.7 scripts given in scripts/AliQC.py 671 | ## It was developped in collaboration with Joe Meehan, FDA, for the MAQC/SEQC project 672 | ## There is a dependency, one must fisrt install HTSeq as explained in the previous section 673 | ## 674 | ## aliqc produces a computer friendly tag-values tab delimited format .aliqc.tsv 675 | ## In the following section aliqc is used again to merge these file into a single table 676 | 677 | phase_aliqc: 678 | 679 | # create sam_sorted only once 680 | set ok=1 681 | foreach mm ($methods) 682 | foreach run ($runs) 683 | if (-e $mm/$run/$mm.$run.bam && ! 
-e $mm/$run/$mm.$run.sam_sorted.gz) then 684 | set ok=0 685 | echo "transformng $mm/$run/$mm.$run.bam into .sam_sorted.gz" 686 | scripts/submit $mm/$run/$mm.$run.samview "samtools view $mm/$run/$mm.$run.bam | sort -T $TMPDIR | gzip > $mm/$run/$mm.$run.sam_sorted.gz" 687 | endif 688 | end 689 | end 690 | if ($ok == 0) goto phaseLoop 691 | 692 | # use sam_sorted.gz rather than bam (aliqc --BAM also works, but it need to call sort which is very costly) 693 | # PacBio Roche iRefSeq SRR5189652 SRR5189667 HG19t1r1 HG19t1r2 HG19t2r1 HG19t2r2 HG19t3r1 PFALt1r1 PFALt1r2 PFALt1r3 PFALt2r1 PFALt2r2 PFALt2r3 PFALt3r1 PFALt3r2 PFALt3r3 SRR5438850 694 | foreach minAli (0 50 80) 695 | foreach run ($runs) 696 | foreach mm ($methods) 697 | if (-e $mm/$run/$mm.$run.sam_sorted.gz && -e Fasta/$run/genome.gz && ! -e $mm/$run/$mm.$run.minAli$minAli.aliqc.tsv) then 698 | echo "Running AliQC.py on $mm/$run/$mm.$run.sam_sorted.gz" 699 | set nreads=`cat Fasta/$run/*.count | gawk '/^Fragment_kept/{n+=$2}END{print n}'` 700 | if ($nreads == 0) then 701 | echo "missing file Fasta/$run/$run.count, please run phase 1" 702 | else 703 | echo "Running AliQC.py $mm $run" 704 | scripts/submit $mm/$run/aliqc.minali$minAli "python3 scripts/AliQC.py --SAMSORTEDGZ -i $mm/$run/$mm.$run.sam_sorted.gz -r $run..$mm.minAli$minAli -f Fasta/$run/genome.gz -o $mm/$run/$mm.$run.minAli$minAli --nreads $nreads --minAli $minAli" 705 | endif 706 | endif 707 | end 708 | end 709 | end 710 | 711 | goto phaseLoop 712 | 713 | ############################################################################## 714 | ############################################################################## 715 | ## Direct statistics of the error counts reported in the BAM files 716 | ## The AliQC.py code, above, parses the bam file and the genome 717 | ## and computed its own evaluation of the number of error per aligned read 718 | ## In the present script, we rely on the presence in the BAM files of the NM:i:x 719 | ## optional field, 
collate the X values and report the statistics 720 | ## Hopefully, the 2 methods should be compatible, but they do not have 721 | ## to agree exactly since aliqc counts a double or triple deletion as a single event 722 | ## and some aligners may report it as 2 or 3 errors 723 | 724 | phase_DirectErrorCount: 725 | echo phase_DirectErrorCount 726 | foreach mm ($methods) 727 | foreach run ($runs) 728 | echo "phase_DirectErrorCount $mm $run" 729 | if (-e $mm/$run/$mm.$run.sam_sorted.gz && ! -e $mm/$run/$mm.$run.nerrors) then 730 | scripts/submit $mm/$run/$mm.$run.nerrors "scripts/directErrorCount.tcsh $mm $run" 731 | endif 732 | end 733 | end 734 | 735 | # export the results in a single human readable table 736 | set toto=RESULTS/Error_counts_using_NM_optional_tag.txt 737 | echo -n "## $toto :" > $toto 738 | if (-e $toto.1) \rm $toto.1 739 | date >> $toto 740 | foreach mm ($methods) 741 | foreach run ($runs) 742 | if (-e $mm/$run/$mm.$run.nerrors) then 743 | echo -n "$mm\t$run\t" >> $toto.1 744 | cat $mm/$run/$mm.$run.nerrors | gawk '/^#/{next;}{n[$1]=$2;if($1>max)max=$1;}END{for(i=-5;i<=max;i++)printf("\t%d",n[i]);printf("\n")}' >> $toto.1 745 | endif 746 | end 747 | end 748 | cat $toto.1 | gawk '{if(NF>max)max=NF;}END{printf("\t\t");max-=6;for(i=-5;i<=max;i++)printf("\t%d",i);printf("\n")}' > $toto.2 749 | cat $toto.2 $toto.1 | gawk -F '\t' -f scripts/transpose.awk >> $toto 750 | \rm $toto.1 $toto.2 751 | echo "The table of errors using the optional NM tag of the BAM files is in" 752 | echo $toto 753 | 754 | goto phaseLoop 755 | 756 | ############################################################################## 757 | ############################################################################## 758 | ## Count the substitutions as declared in the Baruzzo Benchmark 759 | ## The statistics only measures the substitutions in the r3 reads 760 | ## fully and continuously aligned on the plus strand of chromosome 8 761 | ## This seems sufficient since it involves in each 
case at least 100,000 reads 762 | 763 | phase_Count_subtitutions_in_benchmark: 764 | 765 | # there are several phases in the calculation 766 | # 1: select the full reads, characterized by a cigar string 100M 767 | if (! -e SUBS) mkdir SUBS 768 | foreach sp (HG19 PFAL) 769 | foreach tt (t1 t2 t3) 770 | if (-e Fasta/$sp$tt'r3'/$sp$tt'r3'.cig.gz && ! -e SUBS/subs.$sp$tt) then 771 | zcat Fasta/$sp$tt'r3'/$sp$tt'r3'.cig.gz | grep chr8 | grep '+' | grep 100M | cut -f 1,2,3,4,8 > SUBS/subs.$sp$tt 772 | endif 773 | end 774 | end 775 | 776 | # 2: construct a 6 columns tab delimited shadow file, summarizing the coordinate of the alignments 777 | foreach sp (HG19 PFAL) 778 | foreach tt (t1 t2 t3) 779 | if (-e SUBS/subs.$sp$tt && ! -e SUBS/subs.$sp$tt.shadow ) then 780 | cat SUBS/subs.$sp$tt | gawk -F '\t' '{printf("%s\t1\t100\t%s\t%d\t%d\n",$1,$2,$3,$4);}' > SUBS/subs.$sp$tt.shadow 781 | endif 782 | end 783 | end 784 | 785 | # 3: isolate the genome of chromosome 8, using the dna2dna utility 786 | foreach sp (HG19 PFAL) 787 | if (-e Reference_genome/$sp.Baruzzo.genome.fasta.gz && ! -e Reference_genome/$sp.Baruzzo.chr8.fasta.gz) then 788 | bin/dna2dna -i Reference_genome/$sp.Baruzzo.genome.fasta.gz -I fasta -O fasta -keepName -o Reference_genome/$sp.Baruzzo.chr8 -gzo 789 | endif 790 | end 791 | 792 | # 4: Export the corresponding genomic segment in raw format 793 | # The raw format has just 2 tab delimited columns: atgcatgc identifier 794 | # Notice that dna2dna is very versatile, it can directly export messenger RNAs given a genome and a gff file. 795 | # try bin/dna2dna --help for a full list of functionalities 796 | foreach sp (HG19 PFAL) 797 | foreach tt (t1 t2 t3) 798 | if (-e SUBS/subs.$sp$tt.shadow && -e Reference_genome/$sp.Baruzzo.chr8.fasta.gz && ! 
-e SUBS/subs.$sp$tt.raw) then 799 | dna2dna -i Reference_genome/$sp.Baruzzo.chr8.fasta.gz -shadow SUBS/subs.$sp$tt.shadow -O raw -keepName > SUBS/subs.$sp$tt.raw 800 | endif 801 | end 802 | end 803 | 804 | # 5: the first subs file contains in column 1 and 5 the name and sequence of each read 805 | # the raw file contains in column 2 and 1 the name and sequence of each corresponding genomic segment 806 | # Both sequences are exactly 100 bases long, hence a simple awk script is sufficient to count the mismatching bases 807 | echo ZZZZZ > SUBS/ZZZZZ 808 | foreach sp (HG19 PFAL) 809 | foreach tt (t1 t2 t3) 810 | cat SUBS/subs.$sp$tt SUBS/ZZZZZ SUBS/subs.$sp$tt.raw | gawk -F '\t' '/^ZZZZZ/{zz=1;}{if(zz<1){seq[$1]=$5;next;}if (seq[$2]){s1=seq[$2];s2=toupper($1);n=0;for(i=1;i<=100;i++)if(substr(s1,i,1) != substr(s2,i,1))n++;print n}}' | tags | sort -k 1n > SUBS/subs.$sp$tt.txt & 811 | end 812 | end 813 | 814 | # 6: produce a final synthetic table 815 | set toto=RESULTS/mm_stats.Baruzzo.txt 816 | echo -n "### $toto : " > $toto 817 | date >> $toto 818 | foreach sp (HG19 PFAL) 819 | foreach tt (t1 t2 t3) 820 | cat SUBS/subs.$sp$tt.txt | gawk '{n1+=$2;n2+=$1*$2;printf ("%s\t%d\t%d\t%d\t%d\n",t,$1,$2,n1,n2);}' t=$sp$tt >> $toto 821 | end 822 | end 823 | echo "The distribution of substitutions in chromosome 8 in tbe Baruzzo benchmark" 824 | echo "have been exported in $toto" 825 | head -12 $toto 826 | 827 | goto phaseLoop 828 | 829 | ############################################################################## 830 | ############################################################################## 831 | ## Exportation of global, human readable, quality control tables 832 | ## These tables where used directly to prepare the plots and table of the 833 | ## Magic-BLAST paper 834 | 835 | phase_Export: 836 | 837 | if (! 
-d RESULTS) mkdir RESULTS 838 | 839 | # collate the aliqc.tsv tables from all runs using again the AliQC.py scripts with --table option 840 | # this will produce 3 output tables 841 | foreach minAli (0 50 80) 842 | if (-e toto) \rm toto 843 | foreach method ($methods) 844 | cat $method/*/*.minAli$minAli.aliqc.tsv >> toto 845 | end 846 | cat toto | python3 scripts/AliQC.py --view table --split --minAli $minAli -o RESULTS/magicblast.table.minAli$minAli 847 | end 848 | 849 | # reformat the 3 output tables 850 | foreach minAli (0 50 80) 851 | foreach type (mismatch_histo_and_types mismatch_per_cycle_per_kb_aligned aligned_reads_multiplicity_length_aligned) 852 | set toto=RESULTS/magicblast.table.minAli$minAli.$type 853 | cat $toto.tsv | head -12 | gawk -F '\t' '/^##/{next;}/^#/{print}' > $toto.txt 854 | cat $toto.tsv | gawk -F '\t' '/^###/{next;}/^##/{print}' >> $toto.txt 855 | cat $toto.tsv | gawk -F '\t' '/^#/{next;}{print}' | sort -k 1,1 -k 2,2 -k 3,3 | grep -v r2 | grep -v r3 | sed -e 's/r1//g' | gawk -F '\t' '{if ($2 != old2){printf("\n");old2=$2;old3="";}}{if ($3 != old3){printf("\n");old3=$3}}{print}' >> $toto.txt 856 | cat $toto.tsv | gawk -F '\t' '/^###/{print}' >> $toto.txt 857 | end 858 | end 859 | 860 | set toto=RESULTS/Mapping_accuracy.txt 861 | echo -n "## $toto :" > $toto 862 | date >> $toto 863 | cat scripts/mapping_accuracy.header >> $toto 864 | 865 | cat */*/*.introns.tsv | sed -e 's/\.\./\t/' | grep GoldMap | grep -v GOLD | sort -k 2,2 -k 3,3 | gawk -F '\t' -f scripts/introns_precision_recall.awk | sort -u | sed -e 's/t1r/T1R/g' -e 's/t2r/T2R/g' -e 's/t3r/T3R/g' | gawk -F '\t' '/^#/{print;next;}{if($2!=old)printf("\n");old=$2;print;}' | sed -e 's/PFAL/Malaria/g' -e 's/STARlongzz/STAR long/g' >> $toto 866 | 867 | set toto=RESULTS/Mapping_accuracy.light.txt 868 | echo -n "## $toto :" > $toto 869 | date >> $toto 870 | cat scripts/mapping_accuracy.header >> $toto 871 | 872 | cat */*/*.introns.tsv | grep -v R2 | grep -v R3 | sed -e 's/R1//g' | grep -v SRR 
| grep -v Simulated | sed -e 's/\.\./\t/' | grep GoldMap | grep -v GOLD | sort -k 2,2 -k 3,3 | gawk -F '\t' -f scripts/introns_precision_recall.awk | sort -u | sed -e 's/r1//g' | grep -v r2 | grep -v r3 | gawk -F '\t' '/^#/{print;next;}{if($2!=old)printf("\n");old=$2;print;}' >> $toto 873 | 874 | # goto phaseLoop 875 | 876 | ##### INTRONS report Insertion Deletion 877 | 878 | foreach type (Intron Insertion Deletion) 879 | 880 | set toto1="RESULTS/$type"_per_coverage_stats.txt 881 | set toto1L="RESULTS/$type"_per_coverage_stats.light.txt 882 | set toto3="RESULTS/$type"_per_coverage_stats.best.txt 883 | set toto4="RESULTS/$type"_per_coverage_stats.1support.txt 884 | set toto3L="RESULTS/$type"_per_coverage_stats.best.light.txt 885 | set toto4L="RESULTS/$type"_per_coverage_stats.1support.light.txt 886 | echo -n "### $toto1 :" > $toto1 887 | echo -n "### $toto3 :" > $toto3 888 | echo -n "### $toto4 :" > $toto4 889 | echo -n "### $toto1L :" > $toto1L 890 | echo -n "### $toto3L :" > $toto3L 891 | echo -n "### $toto4L :" > $toto4L 892 | date >> $toto1 893 | date >> $toto3 894 | date >> $toto4 895 | date >> $toto1L 896 | date >> $toto3L 897 | date >> $toto4L 898 | 899 | if ($type == Intron) then 900 | echo "## An alignment supporting an intron is defined by a line in the SAM/BAM file where the CIGAR contains an N with minimal accepted intron length 50 bases" > $toto1.1 901 | echo "## When a read is aligned at multiple sites, each of its alignments supporting an intron is counted" >> $toto1.1 902 | echo "## Some spliced genes are truly repeated, some are very similar. 
If one rejects all multiply aligned reads, the introns of these genes cannot be detected," >> $toto1.1 903 | echo "## Therefore, we keep the introns detected by multiply aligned reads, and do not artificially overestimate the specificity of methods unable to select the true positions" >> $toto1.1 904 | echo "## Note that in the benchmark, all reads are uniquely aligned, yet some support multiple neighboring introns" >> $toto1.1 905 | else 906 | echo "## An alignment supporting an indel is defined by a line in the SAM/BAM file where the CIGAR contains an I or o D" > $toto1.1 907 | echo "## When a read is aligned at multiple sites, each of its alignments supporting an indel is counted" >> $toto1.1 908 | endif 909 | 910 | echo -n "# Species\tRun\tMethod\tMinimal $type support" >> $toto1.1 911 | echo "\t$type in benchmark\t$type discovered in method\tFP: False positive $type\tTP: True positive $type\tFN: False negative $type\t$type discovery precision p=TP/(TP+FP)\t$type discovery recall r=TP/(TP+FN)\t$type discovery F-score 2pr/(p+r)" >> $toto1.1 912 | 913 | cat $toto1.1 >> $toto1 914 | cat $toto1.1 >> $toto1L 915 | 916 | if (-e $toto1.2) \rm $toto1.2 917 | 918 | foreach mm ($methods) 919 | cat $mm/*/*.delins.tsv | grep $type | sed -e 's/on_/on/' -e "s/$type//" | grep -v GOLD | sort -k 1,1n -k 2,2 | sed -e 's/\.\./\t/' -e 's/g_//' | gawk '/^#/{next;}/GOLD/{next;}{support=$1;species="Human";run=$2;if(support > 200 && run != "Illumina")next;if(substr(run,1,4)=="PFAL")species="Malaria";method=$3;printf("%s\t%s\t%s\t%d",species,run,method,support); printf("\t%d\t%d\t%d\t%d\t%d\t%s\t%s\t%s\n", $5, $6,$8,$7,$9,$10,$11,$12);}' >> $toto1.2 920 | end 921 | 922 | cat $toto1.2 | sort -k 1,1 -k 2,2 -k 3,3 -k 4,4n > $toto1.3 923 | cat $toto1.3 | gawk -F '\t' '/^#/{print;next;}{if($3!=old)printf("\n\n");old=$3;print;}' >> $toto1 924 | cat $toto1.3 | grep -v r2 | grep -v r3 | grep -v SRR | grep -v Simulated | gawk -F '\t' '/^#/{print;next;}{if($3!=old)printf("\n\n");old=$3;print;}' 
>> $toto1L 925 | 926 | echo "## Limited to best support-depth" >> $toto3 927 | cat $toto1.1 >> $toto3 928 | cat $toto1.3 | gawk -F '\t' 'BEGIN{best=0;}{z=$1 "\t" $2 "\t" $3; if (z != old) {old=z;if(bestScore)print best;bestScore=$12;best=$0;}if($12>bestScore){bestScore=$12;best=$0;}}END{if(bestScore)print best;}' | gawk -F '\t' '/^#/{print;next;}{if($2!=old)printf("\n");old=$2;print;}' >> $toto3 929 | 930 | echo "## Limited to best support-depth" >> $toto3L 931 | cat $toto1.1 >> $toto3L 932 | cat $toto1.3 | grep -v r2 | grep -v r3 | grep -v SRR | grep -v Simulated | gawk -F '\t' 'BEGIN{best=0;}{z=$1 "\t" $2 "\t" $3; if (z != old) {old=z;if(bestScore)print best;bestScore=$12;best=$0;}if($12>bestScore){bestScore=$12;best=$0;}}END{if(bestScore)print best;}' | gawk -F '\t' '/^#/{print;next;}{if($2!=old)printf("\n");old=$2;print;}' >> $toto3L 933 | 934 | echo "## Limited to 1 support" >> $toto4 935 | cat $toto1.1 >> $toto4 936 | cat $toto1.3 | gawk -F '\t' '{if($4==1)print}' | gawk -F '\t' '/^#/{print;next;}{if($2!=old)printf("\n");old=$2;print;}' >> $toto4 937 | 938 | echo "## Limited to 1 support" >> $toto4L 939 | cat $toto1.1 >> $toto4L 940 | cat $toto1.3 | grep -v r2 | grep -v r3 | grep -v SRR | grep -v Simulated | gawk -F '\t' '{if($4==1)print}' | gawk -F '\t' '/^#/{print;next;}{if($2!=old)printf("\n");old=$2;print;}' >> $toto4L 941 | 942 | 943 | #\rm $toto1.[123] 944 | end 945 | # goto phaseLoop # fall through to aliLn 946 | 947 | ############################################################################## 948 | ############################################################################## 949 | ####### aliLn 950 | 951 | phase_aliLn: 952 | 953 | set toto=RESULTS/Aligned_length.histo.txt 954 | echo -n "## $toto : " > $toto 955 | date >> $toto 956 | 957 | \rm $toto.* 958 | foreach run (iRefSeq Roche PacBio Illumina SRR5189652 SRR5189667 SRR5437876 SRR5438850) 959 | 960 | if (! 
-e Fasta/$run/$run.TM) then 961 | if (-e Fasta/$run/$run.fastq.gz) then 962 | bin/dna2dna -i Fasta/$run/$run.fastq.gz -I fastq -getTM > Fasta/$run/$run.TM 963 | endif 964 | endif 965 | if (! -e Fasta/$run/$run.TM) then 966 | if (-e Fasta/$run/$run.fasta.gz) then 967 | bin/dna2dna -i Fasta/$run/$run.fasta.gz -I fasta -getTM > Fasta/$run/$run.TM 968 | endif 969 | endif 970 | if (! -e Fasta/$run/$run.TM) then 971 | if (-e Fasta/$run/$run'_1'.fasta.gz) then 972 | zcat Fasta/$run/$run'_'?.fasta.gz | bin/dna2dna -I fasta -getTM > Fasta/$run/$run.TM 973 | endif 974 | endif 975 | 976 | set delta=1 977 | if ($run == Roche) set delta=10 978 | if ($run == PacBio) set delta=30 979 | if ($run == SRR5189652) set delta=30 980 | if ($run == SRR5189667) set delta=30 981 | if ($run == SRR5437876) set delta=30 982 | if ($run == SRR5438850) set delta=30 983 | if ($run == iRefSeq) set delta=100 984 | foreach mm ($methods) 985 | if (-e $mm/$run/$mm.$run.aliLn) then 986 | cat $mm/$run/$mm.$run.aliLn | gawk '{k=int(($1+delta - 1)/delta) ; if(k>900)k=900;nn[k] += $2;if(k>kMax)kMax =k;}END{for (i = 0 ; i <= kMax ; i++) printf ("%s\t%s\t%d\t%d\n", rr, mm, delta*i, nn[i]+0);}' delta=$delta mm=$mm rr=$run >> $toto.1 987 | endif 988 | end 989 | end 990 | 991 | echo "Illumina\ttruth\t101\t217498656" >> $toto.1 992 | echo "SRR5437876\ttruth\t300\t32935604" >> $toto.1 993 | 994 | cat Fasta/iRefSeq/iRefSeq.TM | gawk -F '\t' '/^#/{next;}{k=int(($2+delta - 1)/delta) ; if(k>900)k=900;nn[k]++;if(k>kMax)kMax =k;}END{for (i = 0 ; i <= kMax ; i++) printf ("%s\t%s\t%d\t%d\n", rr, mm, delta*i, nn[i]+0);}' delta=100 mm="truth" rr=iRefSeq > $toto.1a 995 | cat Fasta/PacBio/PacBio.TM | gawk -F '\t' '/^#/{next;}{k=int(($2+delta - 1)/delta) ; if(k>900)k=900;nn[k]++;if(k>kMax)kMax =k;}END{for (i = 0 ; i <= kMax ; i++) printf ("%s\t%s\t%d\t%d\n", rr, mm, delta*i, nn[i]+0);}' delta=30 mm="truth" rr=PacBio >> $toto.1a 996 | cat Fasta/SRR5189652/SRR5189652.TM | gawk -F '\t' '/^#/{next;}{if($2<19)next;k=int(($2+delta - 
1)/delta) ; if(k>900)k=900;nn[k]++;if(k>kMax)kMax =k;}END{for (i = 0 ; i <= kMax ; i++) printf ("%s\t%s\t%d\t%d\n", rr, mm, delta*i, nn[i]+0);}' delta=30 mm="truth" rr=SRR5189652 >> $toto.1a 997 | cat Fasta/SRR5189667/SRR5189667.TM | gawk -F '\t' '/^#/{next;}{if($2<19)next;k=int(($2+delta - 1)/delta) ; if(k>900)k=900;nn[k]++;if(k>kMax)kMax =k;}END{for (i = 0 ; i <= kMax ; i++) printf ("%s\t%s\t%d\t%d\n", rr, mm, delta*i, nn[i]+0);}' delta=30 mm="truth" rr=SRR5189667 >> $toto.1a 998 | cat Fasta/Roche/Roche.TM | gawk -F '\t' '/^#/{next;}{if($2<19)next;k=int(($2+delta - 1)/delta) ; if(k>900)k=900;nn[k]++;if(k>kMax)kMax =k;}END{for (i = 0 ; i <= kMax ; i++) printf ("%s\t%s\t%d\t%d\n", rr, mm, delta*i, nn[i]+0);}' delta=10 mm="truth" rr=Roche >> $toto.1a 999 | 1000 | 1001 | 1002 | echo "Human_T1\ttruth\t100\t20000000" > $toto.t0 1003 | echo "Human_T2\ttruth\t100\t20000000" >> $toto.t0 1004 | echo "Human_T3\ttruth\t100\t20000000" >> $toto.t0 1005 | echo "Malaria_T1\ttruth\t100\t20000000" >> $toto.t0 1006 | echo "Malaria_T2\ttruth\t100\t20000000" >> $toto.t0 1007 | echo "Malaria_T3\ttruth\t100\t20000000" >> $toto.t0 1008 | 1009 | set delta=1 1010 | foreach tt (1 2 3) 1011 | if (-e $toto.t$tt) \rm $toto.t$tt 1012 | foreach mm ($methods) 1013 | cat $mm/HG19t$tt'r1'/$mm.HG19t$tt'r1'.aliLn | gawk '{k=int(($1+delta - 1)/delta) ; if(k>900)k=900;nn[k] += $2;if(k>kMax)kMax =k;}END{for (i = 0 ; i <= kMax ; i++) printf ("%s\t%s\t%d\t%d\n", rr, mm, delta*i, nn[i]+0);}' delta=$delta mm=$mm rr=Human_T$tt >> $toto.t$tt 1014 | cat $mm/PFALt$tt'r1'/$mm.PFALt$tt'r1'.aliLn | gawk '{k=int(($1+delta - 1)/delta) ; if(k>900)k=900;nn[k] += $2;if(k>kMax)kMax =k;}END{for (i = 0 ; i <= kMax ; i++) printf ("%s\t%s\t%d\t%d\n", rr, mm, delta*i, nn[i]+0);}' delta=$delta mm=$mm rr=Malaria_T$tt >> $toto.t$tt 1015 | end 1016 | 1017 | end 1018 | 1019 | cat $toto.t[0123] >> $toto.1 1020 | cat $toto.1 | sort -k 1,1 -k 3,3n -k 2,2 > $toto.2 1021 | cat $toto.1a | sort -k 1,1 -k 3,3n -k 2,2 >> $toto.2 1022 | 
1023 | echo -n "## $toto :" > $toto 1024 | if (-e $toto.5) \rm $toto.5 1025 | date >> $toto 1026 | echo "## Histogram of length to be aligned in truth dataset,and aligned by each program. Each read is counted only once, at the location of its BAM primary alignment (excluding the secondaries with flag 256)" >> $toto 1027 | foreach rr (iRefSeq Roche PacBio Illumina $pacbio_runs $long_illumina_runs) 1028 | echo "# $rr\tTruth" > $toto.4 1029 | cat $toto.2 | grep $rr | gawk -F '\t' '{k=$3;n=$4;m=$2;if(k>kMax)kMax=k;kk[k]=1;nn[m,k]=n;}END{kk[0]=1;for (k=0;k<=kMax;k++)if(kk[k]>0)printf("%d\t%d\n", k,nn["truth",k]);}' >> $toto.4 1030 | echo "\n\n" >> $toto.4 1031 | cat $toto.4 | scripts/transpose >> $toto.5 1032 | echo "\n\n" >> $toto.5 1033 | end 1034 | # 1035 | foreach rr (iRefSeq Roche PacBio Illumina $pacbio_runs $long_illumina_runs) 1036 | echo -n "# $rr\tTruth" > $toto.4 1037 | foreach mm ($methods) 1038 | echo -n "\t$mm" >> $toto.4 1039 | end 1040 | cat $toto.2 | grep $rr | gawk -F '\t' '{k=$3;n=$4;m=$2;if(k>kMax)kMax=k;kk[k]=1;nn[m,k]=n;}END{kk[0]=1;for (k=0;k<=kMax;k++)if(kk[k]>0)printf("%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\n", k,nn["truth",k],nn["10_MagicBLAST",k],nn["20_HISAT2_relaxed",k],nn["21_HISAT2",k],nn["30_STAR",k],nn["31_STARlong",k],nn["32_STAR.2.6c",k],nn["40_TopHat2",k]);}' >> $toto.4 1041 | echo "\n\n" >> $toto.4 1042 | cat $toto.4 | scripts/transpose >> $toto.5 1043 | echo "\n\n" >> $toto.5 1044 | end 1045 | 1046 | foreach rr (Human_T1 Human_T2 Human_T3 Malaria_T1 Malaria_T2 Malaria_T3) 1047 | echo -n "# $rr\tTruth" > $toto.4 1048 | foreach mm ($methods) 1049 | echo -n "\t$mm" >> $toto.4 1050 | end 1051 | cat $toto.2 | grep $rr | gawk -F '\t' '{k=$3;n=$4;m=$2;if(k>kMax)kMax=k;kk[k]=1;nn[m,k]=n;}END{kk[0]=1;for (k=0;k<=kMax;k++)if(kk[k]>0)printf("%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\n", k,nn["truth",k],nn["10_MagicBLAST",k],nn["20_HISAT2_relaxed",k],nn["21_HISAT2",k],nn["30_STAR",k],nn["31_STARlong",k],nn["32_STAR.2.6c",k],nn["40_TopHat2",k]);}' >> 
$toto.4 1052 | echo "\n\n" >> $toto.4 1053 | cat $toto.4 | scripts/transpose >> $toto.5 1054 | echo "\n\n" >> $toto.5 1055 | end 1056 | 1057 | cat $toto.5 | scripts/transpose | sed -e 's/Truth/Actual reads/g' >> $toto 1058 | 1059 | cat RESULTS/Intron_per_coverage_stats.txt | gawk -F '\t' '/^#/{next}{z=$1 "\t" $2 "\t" $3;k=0+n[z];if($4>k)n[z]=$4;}END{for(k in n)printf("%s\t%d\n",k,n[k]);printf("toto\n");}' | sort | gawk -F '\t' 'BEGIN{printf("# Species\tRun\tMethods\tMaximal intron support\n")}{z=$1 "\t" $2;if (z != oldz){if(length(oldz)>3)printf("%s\t%s\t%s\n",oldz,substr(m[oldz],2),substr(n[oldz],2));m[z]="";n[z]="";}oldz=z;m[z]=m[z]","$3;n[z]=n[z]","$4;}' > RESULTS/Intron_per_coverage_stats.title.txt 1060 | \rm $toto.* 1061 | 1062 | goto phaseLoop 1063 | 1064 | ############################################################################## 1065 | ############################################################################## 1066 | ## Done 1067 | 1068 | phase6: 1069 | phaseLoop: 1070 | echo -n "$1 done : " 1071 | date 1072 | 1073 | ############################################################################## 1074 | ############################################################################## 1075 | ############################################################################## 1076 | ############################################################################## 1077 | 1078 | -------------------------------------------------------------------------------- /article/scripts/README: -------------------------------------------------------------------------------- 1 | # Aug 1st, 2018 2 | # Author: Jean Thierry-Mieg, NCBI/NLM/NIH 3 | # For questions, please email mieg@ncbi.nlm.nih.gov 4 | 5 | 6 | The present directory contains scripts used in the Magic-BLAST paper analysis. 7 | 8 | =========== 9 | 10 | MagicBlastPaperMasterScript.tcsh 11 | is the main scripts driving the whole analysis, it calls the other scripts and the C executables 12 | present in the bin directory. 
It is described in the supplementary material of the paper. 13 | 14 | How to use this script is described in the file ../README in the parent directory 15 | 16 | Please check the file ../bin/README and verify if the executables work correctly on your machine 17 | The main difficulty may come from the installation of HTSeq needed by AliQC.py. 18 | 19 | ========== 20 | 21 | AliQC.py 22 | is a python code developed in collaboration with Joe Meehan, FDA, for the 23 | MAQC/SEQC project led by Leming Shi several years ago. It was ported 24 | to python3 for the present project. The code has several functionalities; 25 | try 'python3 AliQC.py --help', and it is called with the proper parameters 26 | by the master script. The purpose is to scan the SAM files, which can appear 27 | indifferently as BAM, SAM, or SAM.gz files, and to extract detailed statistics 28 | on the alignments and error patterns. 29 | Notice that AliQC.py does not import a truth file, so all 30 | the measurements are intrinsic to the SAM file. 31 | Comparisons to the 'truth' of the benchmarks are delegated to the C code 'bin/sam2gold'. 32 | AliQC calls HTSeq to parse the SAM files. 33 | 34 | ========== 35 | 36 | HTSeq is described in the following publication: 37 | 38 | Simon Anders, Paul Theodor Pyl, Wolfgang Huber 39 | HTSeq — A Python framework to work with high-throughput sequencing data 40 | Bioinformatics 2015 Jan 15; 31(2): 166–169. 41 | 42 | It is a python library that is used by AliQC.py solely to parse the SAM files into 43 | an object-oriented format, the analysis itself being programmed in AliQC.py. 44 | It is very possible that HTSeq will not work on your machine because it directly 45 | imports precompiled python object modules (.so files).
If it does not work, please 46 | follow the instructions in scripts/HTSeq/README 47 | 48 | =========== 49 | 50 | transpose (which calls transpose.awk) 51 | is a simple utility to transpose any tab delimited file 52 | Usage 53 | cat f1 | scripts/transpose > f2 54 | It is used by the other scripts. 55 | 56 | =========== 57 | 58 | tags 59 | is a simple utility to sort and count entries in the first column of a file 60 | Usage 61 | tags f1 # in this case the first column ends on the first tab or space 62 | tags -t f1 # in this case the first column ends on the first tab 63 | It is used by the other scripts. 64 | 65 | =========== 66 | 67 | submit 68 | is a script to control the parallelization of large programs 69 | It is by default configured to allow 4 programs to run in parallel, 70 | see line 51, e.g. edit (NCORE = 4) into (NCORE = 8). 71 | By selecting 'farm = local' on line 16, the submissions would 72 | become purely sequential. If you have access to a SLURM compute-farm, 73 | you could submit on the farm by uncommenting line 23 74 | and editing your accounts around line 260. Other farms would be 75 | relatively easy to configure. 76 | 77 | ========== 78 | 79 | directErrorCount.awk, gff2cig.awk, introns_precision_recall.awk, transpose.awk 80 | are format changing scripts called by the other scripts 81 | 82 | =========== 83 | =========== 84 | 85 | -------------------------------------------------------------------------------- /article/scripts/directErrorCount.awk: -------------------------------------------------------------------------------- 1 | # =========================================================================== 2 | # 3 | # PUBLIC DOMAIN NOTICE 4 | # National Center for Biotechnology Information 5 | # 6 | # This software/database is a "United States Government Work" under the 7 | # terms of the United States Copyright Act. 
It was written as part of 8 | # the author's official duties as a United States Government employee and 9 | # thus cannot be copyrighted. This software/database is freely available 10 | # to the public for use. The National Library of Medicine and the U.S. 11 | # Government have not placed any restriction on its use or reproduction. 12 | # 13 | # Although all reasonable efforts have been taken to ensure the accuracy 14 | # and reliability of the software and data, the NLM and the U.S. 15 | # Government do not and cannot warrant the performance or results that 16 | # may be obtained by using this software or data. The NLM and the U.S. 17 | # Government disclaim all warranties, express or implied, including 18 | # warranties of performance, merchantability or fitness for any particular 19 | # purpose. 20 | # 21 | # Please cite the author in any work or product based on this material. 22 | # 23 | # =========================================================================== 24 | # 25 | # Author: Jean Thierry-Mieg mieg@ncbi.nlm.nih.gov 26 | # 27 | 28 | { 29 | flag=$2 ; 30 | if (int((flag % 512)/256) ==1) 31 | next ; 32 | 33 | i = index($0,"NM:i:") ; 34 | if (i < 1) 35 | i = index($0,"nM:i:") ; 36 | if (i < 1) 37 | next ; 38 | split(substr($0,i),aa," ") ; 39 | err = aa[1] ; 40 | gsub (/nM/,"NM",err) ; 41 | print err ; 42 | cigar=$6 ; 43 | if (cigar ~ /^([0-9]*[MN])*$/) 44 | { 45 | if (err=="NM:i:0") # complete no error 46 | print "NM:i:-1" ; 47 | else 48 | print "NM:i:-2"; # complete with error 49 | } 50 | else 51 | { 52 | if (err=="NM:i:0") # partial no error 53 | print "NM:i:-3" ; 54 | else # partial with error 55 | print "NM:i:-4"; 56 | } 57 | } 58 | 59 | 60 | -------------------------------------------------------------------------------- /article/scripts/directErrorCount.tcsh: -------------------------------------------------------------------------------- 1 | #!/bin/tcsh 2 | # =========================================================================== 3 | # 4 | # 
PUBLIC DOMAIN NOTICE 5 | # National Center for Biotechnology Information 6 | # 7 | # This software/database is a "United States Government Work" under the 8 | # terms of the United States Copyright Act. It was written as part of 9 | # the author's official duties as a United States Government employee and 10 | # thus cannot be copyrighted. This software/database is freely available 11 | # to the public for use. The National Library of Medicine and the U.S. 12 | # Government have not placed any restriction on its use or reproduction. 13 | # 14 | # Although all reasonable efforts have been taken to ensure the accuracy 15 | # and reliability of the software and data, the NLM and the U.S. 16 | # Government do not and cannot warrant the performance or results that 17 | # may be obtained by using this software or data. The NLM and the U.S. 18 | # Government disclaim all warranties, express or implied, including 19 | # warranties of performance, merchantability or fitness for any particular 20 | # purpose. 21 | # 22 | # Please cite the author in any work or product based on this material. 
23 | # 24 | # =========================================================================== 25 | # 26 | # Author: Jean Thierry-Mieg mieg@ncbi.nlm.nih.gov 27 | # 28 | ## Direct statistics of the error counts reported in the BAM files 29 | ## We rely on the presence in the BAM files of the NM:i:x 30 | ## optional filed, collate the X values and report the statistics 31 | 32 | set mm=$1 33 | set run=$2 34 | 35 | set nreads=`cat Fasta/$run/*.count | gawk '/^Fragment_kept/{n+=$2}END{print n}'` 36 | 37 | zcat $mm/$run/$mm.$run.sam_sorted.gz | gawk -F '\t' -f scripts/directErrorCount.awk | scripts/tags | gawk '{split ( $1,aa,":" ) ; x=aa[3] ; y = $2 ; if(x>=0) ali+= y ; printf ( "%d\t%d\n",x,y ) ; }END{printf ("-5\t%d\n",nreads - ali)}' nreads=$nreads | sort -k 1,1nr | gawk '{if (line<1)print "-6:"method": -5:unaligned -4:partial with error, -3:partial no error, -2: complete with error, -1: complete no errors, 0: no error, 1,2,3...:n errors, columns 3 and 4 are cumuls";line++;if($1>=0){n1 += $2 ; n2 += $1*$2 ;}printf ( "%d\t%d\t%d\t%d\n",$1,$2,n1,n2 ) ; }' method=$mm nreads=$nreads | sort -k 1,1n > $mm/$run/$mm.$run.nerrors 38 | 39 | -------------------------------------------------------------------------------- /article/scripts/gff2cig.awk: -------------------------------------------------------------------------------- 1 | # =========================================================================== 2 | # 3 | # PUBLIC DOMAIN NOTICE 4 | # National Center for Biotechnology Information 5 | # 6 | # This software/database is a "United States Government Work" under the 7 | # terms of the United States Copyright Act. It was written as part of 8 | # the author's official duties as a United States Government employee and 9 | # thus cannot be copyrighted. This software/database is freely available 10 | # to the public for use. The National Library of Medicine and the U.S. 11 | # Government have not placed any restriction on its use or reproduction. 
12 | # 13 | # Although all reasonable efforts have been taken to ensure the accuracy 14 | # and reliability of the software and data, the NLM and the U.S. 15 | # Government do not and cannot warrant the performance or results that 16 | # may be obtained by using this software or data. The NLM and the U.S. 17 | # Government disclaim all warranties, express or implied, including 18 | # warranties of performance, merchantability or fitness for any particular 19 | # purpose. 20 | # 21 | # Please cite the author in any work or product based on this material. 22 | # 23 | # =========================================================================== 24 | # 25 | # Author: Jean Thierry-Mieg 26 | # 27 | # 28 | 29 | { 30 | if ($3 != "exon") 31 | next ; 32 | } 33 | { 34 | split ($9,aa,"Genbank:") ; 35 | split(aa[2],bb,",") ; split(bb[1],cc,";") ; seq=cc[1] ; 36 | 37 | if(substr(seq,1,2) != "NM" && substr(seq,1,2) != "zNR") 38 | next ; 39 | 40 | seq=seq ":" $1 ; chrom[seq] = $1 ; 41 | nx[seq]++ ; i=nx[seq] ; 42 | a1[seq,i] = $4 ; a2[seq,i] = $5 ; strand[seq]=$7; 43 | } 44 | END { 45 | for (seq in nx) 46 | { 47 | n = nx[seq] ; 48 | printf ("%s\t%s",seq,chrom[seq]) ; 49 | if (strand[seq] == "+") 50 | { 51 | printf ("\t%d\t%d\t", a1[seq,1], a2[seq,n]) ; 52 | for(i = 1 ; i <=n ; i++) 53 | { 54 | if (i>1) 55 | { 56 | dx = a1[seq,i] -a2[seq,i-1] - 1 ; 57 | printf("%dN",dx); 58 | } 59 | dx = a2[seq,i] - a1[seq,i] + 1; 60 | printf ("%dM",dx); 61 | } 62 | } 63 | else 64 | { 65 | printf ("\t%d\t%d\t", a1[seq,n], a2[seq, 1]) ; 66 | for (i = n ; i >=1 ; i--) 67 | { 68 | if (i0) c = nu / (u+nu) ; 43 | printf ("\t%.4f", c) ; 44 | 45 | fp1 =$12 ; fp2 = $13 ; fp = fp1 + fp2 ; 46 | tp1 =$10 ; tp2 = $11 ; tp = tp1 + tp2 ; 47 | fn = $16 ; 48 | printf ("\t\t%d\t%d\t%d\t%d\t%d\t%d\t%d", fp,fp1,fp2, tp1,tp2, tp, fn) ; 49 | 50 | p = 0 ; r = 0 ; f = 0 ; c = 0 ; 51 | if (tp > 0) 52 | { 53 | p = tp / (tp + fp) ; 54 | r = tp / (tp + fn) ; 55 | f = 2 * p * r / (p+r) ; 56 | c = tp2 / tp ; 57 | } 58 | printf 
("\t%.4f\t%.4f\t%.4f\t%.4f",p,r,f,c) ; 59 | 60 | fp1 = $8 ; fp2 = $9 ; fp = fp1 + fp2 ; 61 | tp1 = $6 ; tp2 = $7 ; tp = tp1 + tp2 ; 62 | fn=$16; 63 | c = 0 ; if (tp > 0) c = tp1 / tp ; 64 | 65 | printf ("\t\t%d\t%d\t%d\t%d\t%d\t%d\t%d", fp, fp1, fp2, tp1, tp2, tp, fn) ; 66 | 67 | p = 0 ; r = 0 ; f = 0 ; c = 0 ; 68 | if (tp > 0) 69 | { 70 | p = tp / (tp+fp); 71 | r = tp / (tp+fn); 72 | f = 2 * p * r / (p+r) ; 73 | c = tp2 /tp ; 74 | } 75 | printf ("\t%.4f\t%.4f\t%.4f\t%.4f",p,r,f,c) ; 76 | 77 | printf("\n"); 78 | } 79 | -------------------------------------------------------------------------------- /article/scripts/mapping_accuracy.header: -------------------------------------------------------------------------------- 1 | ## This table evaluates the accuracy of the mapping by comparing in each method, read by read, the coordinates of the ends of its alignments to the unique reference alignment provided by the Baruzzo benchmark [Nature Methods 14.2 pp135-139, 2017] 2 | ## Each read is aligned uniquely in the benchmark. In each method, the aligner may recover the true chromosomal coordinates of the first and last base of the read (TP1), or provide a partial alignment included in the true segment (TP2), or overlap the true position but extend out of it (FP1), or align completely elsewhere (FP2), or fail to align (FN). 3 | ## If a read aligns at several sites, it will count as exact (or partial) if at least one of its alignments has the same coordinates as the benchmark unique alignment (or is included). This applies to the first half of the table, columns 10 to 20, which reports on reads. 4 | ## In contrast, in the second half of the table, columns 22 to 32, which reports on alignments, each alignment is counted in its category. 5 | ## The precision (p=(TP1+TP2)/(TP1+TP2+FP1+FP2)), recall (r=(TP1+TP2)/(TP1+TP2+FN)) and F score (f=2 p r/(p+r)) are computed for reads and alignments. 6 | ## The percentage of aligned at multiple sites is shown. 
The percentage of reads well mapped, at the correct position, but only partially aligned is also computed. 7 | # Species Run Method Reads in benchmark Reads aligned in method Reads uniquely aligned, U Reads multi-aligned, NU % reads multiply aligned NU/(U+NU) Misaligned reads some partial, FP=FP1+FP2 Overlapping the truth, but extending out, FP1 Mismapped reads not overlapping the truth, FP2 Reads at exact same site as benchmark, TP1 Read at correct site, but partial, TP2 Correct mapping, some partial, TP=TP1+TP2 Unaligned FN Read mapping precision TP/(TP+FP) Read mapping recall TP/(TP+FN) Read mapping F-score 2pr/(p+r) % Well mapped but partial TP2/(TP1+TP2) Misalignments some partial, FP=FP1+FP2 Overlapping the truth, but extending out, FP1 Mismappings not overlapping the truth, FP2 Exact alignments identical to benchmark, TP1 Exact but partial alignments, TP2 Correct alignments, some partial, TP=TP1+TP2 Unaligned, FN Alignment mapping precision TP/(TP+FP) Alignment mapping recall TP/(TP+FN) Alignment mapping F-score 2pr/(p+r) % Well mapped but partial TP2/(TP1+TP2) 8 | -------------------------------------------------------------------------------- /article/scripts/schtroumpf: -------------------------------------------------------------------------------- 1 | #!/bin/tcsh -f 2 | # =========================================================================== 3 | # 4 | # PUBLIC DOMAIN NOTICE 5 | # National Center for Biotechnology Information 6 | # 7 | # This software/database is a "United States Government Work" under the 8 | # terms of the United States Copyright Act. It was written as part of 9 | # the author's official duties as a United States Government employee and 10 | # thus cannot be copyrighted. This software/database is freely available 11 | # to the public for use. The National Library of Medicine and the U.S. 12 | # Government have not placed any restriction on its use or reproduction. 
13 | # 14 | # Although all reasonable efforts have been taken to ensure the accuracy 15 | # and reliability of the software and data, the NLM and the U.S. 16 | # Government do not and cannot warrant the performance or results that 17 | # may be obtained by using this software or data. The NLM and the U.S. 18 | # Government disclaim all warranties, express or implied, including 19 | # warranties of performance, merchantability or fitness for any particular 20 | # purpose. 21 | # 22 | # Please cite the author in any work or product based on this material. 23 | # 24 | # =========================================================================== 25 | # 26 | # Author Jean Thierry-Mieg 27 | # 28 | # 29 | 30 | set out_err="$1" 31 | set pgm="$2" 32 | 33 | # echo "schtroumpf start" 34 | (eval "/bin/time -p $pgm" > $out_err.out) >& $out_err.err 35 | # echo "schtroumpf end" 36 | 37 | ############################################## 38 | -------------------------------------------------------------------------------- /article/scripts/submit: -------------------------------------------------------------------------------- 1 | #!/bin/tcsh -f 2 | # =========================================================================== 3 | # 4 | # PUBLIC DOMAIN NOTICE 5 | # National Center for Biotechnology Information 6 | # 7 | # This software/database is a "United States Government Work" under the 8 | # terms of the United States Copyright Act. It was written as part of 9 | # the author's official duties as a United States Government employee and 10 | # thus cannot be copyrighted. This software/database is freely available 11 | # to the public for use. The National Library of Medicine and the U.S. 12 | # Government have not placed any restriction on its use or reproduction. 13 | # 14 | # Although all reasonable efforts have been taken to ensure the accuracy 15 | # and reliability of the software and data, the NLM and the U.S. 
16 | # Government do not and cannot warrant the performance or results that 17 | # may be obtained by using this software or data. The NLM and the U.S. 18 | # Government disclaim all warranties, express or implied, including 19 | # warranties of performance, merchantability or fitness for any particular 20 | # purpose. 21 | # 22 | # Please cite the author in any work or product based on this material. 23 | # 24 | # =========================================================================== 25 | # 26 | # Author Jean Thierry-Mieg 27 | # 28 | 29 | set reserve_unit="" 30 | set out_err="$1" 31 | set pgm="$2" 32 | set farm_requested="$3" 33 | # exit 0 34 | ############################################## 35 | ## Please select how you want to submit the job 36 | ## by removing the # in front of one of the options 37 | ## If you add a new way, or find a problem 38 | ## please mail the edited script to 39 | ## mieg@ncbi.nlm.nih.gov 40 | 41 | set farm=ZERO 42 | # set farm=local 43 | set farm=MULTICORE 44 | # set farm=UGE 45 | # set farm=BIOWULF 46 | # set farm=SGE 47 | # set farm=LSF 48 | # set farm=PARASOL 49 | # set farm=SLURM 50 | # set farm=TH2 51 | # set farm=FUDAN 52 | 53 | if ($?MAGIC_SUBMIT) then 54 | set farm=$MAGIC_SUBMIT 55 | endif 56 | 57 | if ($?MAGIC_FARM) then 58 | set farm=$MAGIC_FARM 59 | endif 60 | 61 | if ($farm == ZERO) then 62 | echo "FATAL ERROR: please edit the first few lines of script subnit as documented in that file to specify your hardware" 63 | endif 64 | if ($farm != local && $farm != MULTICORE && $farm != UGE && $farm != BIOWULF &&$farm != TH2 && $farm != SLURM && $farm != FUDAN) then 65 | echo 'FATAL ERROR: sorry, the only allowed value for $MAGIC_SUBMIT are' 66 | echo ' local : single threading on local machine' 67 | echo ' MULTICORE : mutithreading on local machine, please configure NCORE (default 8) in scripts/submit' 68 | echo ' UGE : unified grid engine, successor to SGE: Sun Grid Engine, please cofigure the queue name (default unified) in 
scripts/submit' 69 | echo ' SLURM : slurm clusters' 70 | echo ' BIOWULF : NIH Helix/Biowulf' 71 | echo ' FUDAN : ad-hoc for Fudan university' 72 | echo " currently you are requesting the unknown value: $farm" 73 | echo ' Please configure scripts/submit or define $MAGIC_SUBMIT properly, nay help on new config for additional hardware would be welcome, thank you' 74 | exit 1 75 | endif 76 | 77 | set NCORE=4 # number of jobs to run in parallel in MULTICORE case 78 | set mem_requested="16G" 79 | set reserve_mem=8 80 | set request_multicore="" 81 | if ($farm_requested == "32G") then 82 | set farm_requested="" 83 | set mem_requested="32G" 84 | set reserve_mem=32 85 | else if ($farm_requested == "64G") then 86 | set farm_requested="" 87 | set mem_requested="64G" 88 | set reserve_mem=64 89 | else if ($farm_requested == "128G") then 90 | set farm_requested="" 91 | set mem_requested="128G" 92 | set reserve_mem=128 93 | else if ($farm_requested == "32G4T") then 94 | set farm_requested="" 95 | set mem_requested="32G" 96 | set reserve_mem=32 97 | set request_multicore="-pe multicore 4 -R y" 98 | else if ($farm_requested == "1G") then 99 | set farm_requested="" 100 | set mem_requested="1G" 101 | set reserve_mem=1 102 | else if ($farm_requested == "UGE4") then 103 | set farm_requested="" 104 | set mem_requested="32G" 105 | set reserve_mem=32 106 | set farm=UGE4 107 | else 108 | if ($farm_requested != "") set farm=$3 109 | endif 110 | 111 | if (! $?MAGIC) setenv MAGIC XXX 112 | ############################################## 113 | ## Verify that the farm variable has been configured 114 | if ($farm == ZERO) then 115 | echo "# FATAL ERROR in file $0" 116 | echo "# You need to decide if the codes should run locally" 117 | echo "# or be submitted to a compute farm using SGE, LSF ..." 
118 | echo "# Please remove one of the # in lines 14 to 18 of file scripts/submit" 119 | echo "# Then test the configuration " 120 | echo " $0 test 'echo hello_world1 > test.txt'" 121 | echo " $0 wait" 122 | echo " $0 test 'echo hello_world2 >> test.txt'" 123 | echo " $0 wait" 124 | echo "# Then check the content of the files test.out, test.err, test.txt" 125 | echo "# test.err should contain timing info, test.out may be empty" 126 | echo "# test.txt should contain hello_world1 and 2" 127 | exit 1 128 | endif 129 | 130 | ############################################## 131 | ## Localize self 132 | if (! $?scripts) then 133 | set scripts=`echo $0 | gawk '{i=index($1,"submit");a="./";if (i>1)a=substr($1,1,i-1);printf("%s",a);}'` 134 | endif 135 | # echo "scripts=$scripts" 136 | 137 | ############################################## 138 | ## Execute the program on the local machine 139 | 140 | if ($farm == local) then 141 | 142 | if ("$out_err" == run || "$pgm" == run) exit 0 143 | 144 | if ("$out_err" == wait || "$pgm" == wait) exit 0 145 | if ("$out_err" == wait5 || "$pgm" == wait5) exit 0 146 | # no need to wait on a local machine 147 | 148 | echo "submit: pgm=###$pgm### stdout/err=###$out_err.out/err###" 149 | (eval "/bin/time -p $pgm" > $out_err.out) >& $out_err.err 150 | 151 | exit 0 152 | 153 | endif 154 | 155 | ############################################## 156 | ## submit a program locally on a multicore machine 157 | 158 | if ($farm == MULTICORE) then 159 | 160 | # We use twice the number of jobs that should exe in parallel 161 | # because for each we run a wrapping script and the real command 162 | 163 | @ NCORE2 = 2 * $NCORE 164 | 165 | count_jobs: 166 | # The objective is to count all jobs submitted by this system 167 | # i do not know how to cath the process id i just submitted 168 | # so i grep on schtroumpf that I include in all my summitted commands 169 | 170 | # on MAC, probably a Unix BSD legacy, ps -lf does not give the full line, 171 | # if $USER is 
too long, it may not be listed entirely, prefer $uid
/netopt/uge/ncbi/common/settings.csh 223 | unsetenv LD_RUN_PATH 224 | unsetenv LD_LIBRARY_PATH 225 | unsetenv DYLD_LIBRARY_PATH 226 | set reserve_unit=G 227 | endif 228 | 229 | if ($farm == SGE || $farm == UGE || $farm == UGE4) then 230 | 231 | setenv SGE_SUMMARY stderr 232 | setenv SGE_NOMAIL 233 | 234 | if ("$out_err" == run || "$pgm" == run) exit 0 235 | 236 | if ("$out_err" == wait || "$pgm" == wait || "$out_err" == wait5 || "$pgm" == wait5) then 237 | if ("$out_err" == wait5 || "$pgm" == wait5) then 238 | $scripts/qusage 5 239 | else 240 | $scripts/qusage 1 241 | endif 242 | else 243 | if (! $?queue) set queue=unified 244 | # set queue=low 245 | set xx="" 246 | if ($queue == test) set xx=',express=TRUE' 247 | 248 | set isBlastp="" 249 | if ("$pgm" =~ "*blastp*") set isBlastp="-l blastp_frosty" 250 | echo "SGE submit: pgm=###$pgm### stdout/err=###$out_err.out/err### queue=###$queue### isBlasp=###$isBlastp###" 251 | 252 | if ($queue == profiling) then 253 | set numero=`qsub -V -b y -j n -N $MAGIC -P unified $isBlastp -l h_rt=86400,h_vmem=64G,mem_free=64G,m_mem_free=32G,cputype="*E5-2650*",m_core=16,mem_total=125G -o $out_err.out -e $out_err.err "/bin/time -p $pgm"` 254 | else if ($farm == UGE4) then 255 | echo qsub -V -b y -j n -N $MAGIC -P $queue -pe multicore 4 -R y $isBlastp -l h_rt=86400,h_vmem=$mem_requested,mem_free=$mem_requested,m_mem_free=$reserve_mem$reserve_unit$xx -o $out_err.out -e $out_err.err \"/bin/time -p $pgm\" 256 | set numero=`qsub -V -b y -j n -N $MAGIC -P $queue -pe multicore 4 -R y $isBlastp -l h_rt=86400,h_vmem=$mem_requested,mem_free=$mem_requested,m_mem_free=$reserve_mem$reserve_unit$xx -o $out_err.out -e $out_err.err "/bin/time -p $pgm"` 257 | else 258 | # new syntax as of 2018 on UGE_ncbi qsub -l m_mem_free=4G,h_rt=14400,h_vmem=10G 259 | echo qsub -V -b y -j n -N $MAGIC -P $queue $isBlastp $request_multicore -l h_rt=86400,h_vmem=$mem_requested,m_mem_free=$reserve_mem$reserve_unit$xx -o $out_err.out -e $out_err.err \"/bin/time -p 
$pgm\" 260 | set numero=`qsub -V -b y -j n -N $MAGIC -P $queue $isBlastp $request_multicore -l h_rt=86400,h_vmem=$mem_requested,m_mem_free=$reserve_mem$reserve_unit$xx -o $out_err.out -e $out_err.err "/bin/time -p $pgm"` 261 | endif 262 | 263 | # multithreaded tasks 264 | # qsub -P unified -N $MAGIC -pe multicore 4 -R y job 265 | # see https://confluence.ncbi.nlm.nih.gov/pages/viewpage.action?pageId=13402915 266 | # "-R y" tells SGE to reserve slots on a host, so that a single core job doesn't run first and keep your job waiting forever 267 | 268 | # limit the speed of submissions to 1 per second, 14s is nice for ALIGN phase in SEQC_main 269 | if (1) sleep .1 270 | if ($?MAGIC_SUBMIT_DELAY) then 271 | if ($MAGIC_SUBMIT_DELAY > 0) sleep $MAGIC_SUBMIT_DELAY 272 | else 273 | echo sleep .1 274 | endif 275 | echo "####numero=$numero####" 276 | endif 277 | 278 | exit 0 279 | 280 | endif 281 | 282 | ############################################## 283 | ## Submit a program to the SLURM system 284 | 285 | # SLURM is used on helix.nih and in obsolete Fudan 286 | 287 | if ($farm == SLURM) then 288 | setenv farm SLURM 289 | set partition=CLUSTER 290 | if ("$out_err" == run || "$pgm" == run) exit 0 291 | 292 | if ("$out_err" == wait || "$pgm" == wait || "$out_err" == wait5 || "$pgm" == wait5) then 293 | if ("$out_err" == wait5 || "$pgm" == wait5) then 294 | $scripts/qusage 5 295 | else 296 | $scripts/qusage 1 297 | endif 298 | else 299 | set isShell=`echo $pgm | gawk '{if(substr($1,1,8) == "scripts/")n=1;}END{print n+0;}'` 300 | yhresubmit: 301 | if ($isShell == 1) then 302 | echo "sbatch -N 1 -J $MAGIC --share --mem=24000 --time-min=120 --distribution=cyclic -o $out_err.out -e $out_err.err $pgm" 303 | sbatch -N 1 -J $MAGIC --share --mem=24000 --time-min=120 --distribution=cyclic -o $out_err.out -e $out_err.err $pgm 304 | set Status=$status 305 | else 306 | echo "srun -J $MAGIC -o $out_err.out -e $out_err.err $pgm" 307 | srun -J $MAGIC -o $out_err.out -e $out_err.err $pgm 308 | 
set Status=$status 309 | endif 310 | if ($Status != 0) then 311 | sleep 5 312 | goto yhresubmit 313 | endif 314 | 315 | # limit the speed of submissions to 1 per second, 14s is nice for ALIGN phase in SEQC_main 316 | if (1) sleep .1 317 | if ($?MAGIC_SUBMIT_DELAY) then 318 | if ($MAGIC_SUBMIT_DELAY > 0) sleep $MAGIC_SUBMIT_DELAY 319 | else 320 | echo sleep .1 321 | endif 322 | 323 | endif 324 | 325 | exit 0 326 | 327 | endif 328 | 329 | ############################################## 330 | ## Submit a program to the BIOWULF cluster 331 | 332 | # BIOWULF 333 | 334 | if ($farm == BIOWULF) then 335 | setenv farm BIOWULF 336 | # setenv TMPDIR "/lscratch/magic.$MAGIC" 337 | # setenv TMPDIR "/lscratch/$SLURM_JOBID" 338 | # partition: norm (normal) quick (<4h) 339 | set partition=quick 340 | if ("$out_err" == run || "$pgm" == run) exit 0 341 | 342 | if ("$out_err" == wait || "$pgm" == wait || "$out_err" == wait5 || "$pgm" == wait5) then 343 | if ("$out_err" == wait5 || "$pgm" == wait5) then 344 | $scripts/qusage 5 345 | else 346 | $scripts/qusage 1 347 | endif 348 | else 349 | set isShell=`echo $pgm | gawk '{if(substr($1,1,8) == "scripts/" || index($1,".tcsh")>0)n=1;}END{print n+0;}'` 350 | yhresubmit: 351 | if ($isShell == 1) then 352 | echo "sbatch --ntasks 1 -J $MAGIC --share --gres=lscratch:100 --mem=24g --time-min=120 --distribution=cyclic --partition=$partition --output $out_err.out --error $out_err.err $pgm" 353 | sbatch --ntasks 1 -J $MAGIC --share --gres=lscratch:100 --mem=24g --time-min=120 --distribution=cyclic --partition=$partition --output $out_err.out --error $out_err.err $pgm 354 | set Status=$status 355 | else 356 | echo "srun -J $MAGIC -o $out_err.out -e $out_err.err $pgm" 357 | srun -J $MAGIC -o $out_err.out -e $out_err.err $pgm 358 | set Status=$status 359 | endif 360 | if ($Status != 0) then 361 | sleep 5 362 | goto yhresubmit 363 | endif 364 | 365 | # limit the speed of submissions to 1 per second, 14s is nice for ALIGN phase in SEQC_main 366 | if (1) 
sleep .1 367 | if ($?MAGIC_SUBMIT_DELAY) then 368 | if ($MAGIC_SUBMIT_DELAY > 0) sleep $MAGIC_SUBMIT_DELAY 369 | else 370 | echo sleep .1 371 | endif 372 | 373 | endif 374 | 375 | exit 0 376 | 377 | endif 378 | 379 | ############################################## 380 | ## Submit a program to the FUDAN system 381 | 382 | # FUDAN is used in Fudan 383 | 384 | if ($farm == FUDAN) then 385 | setenv farm FUDAN 386 | set partition=CLUSTER 387 | if ("$out_err" == run || "$pgm" == run) exit 0 388 | 389 | if ("$out_err" == wait || "$pgm" == wait || "$out_err" == wait5 || "$pgm" == wait5) then 390 | if ("$out_err" == wait5 || "$pgm" == wait5) then 391 | $scripts/qusage 5 392 | else 393 | $scripts/qusage 1 394 | endif 395 | else 396 | set isShell=`echo $pgm | gawk '{if(substr($1,1,8) == "scripts/")n=1;}END{print n+0;}'` 397 | fudanresubmit: 398 | if ($isShell == 1) then 399 | echo "qsub -N $MAGIC --share --mem=24000 --time-min=600 -o $out_err.out -e $out_err.err $pgm" 400 | qsub -N $MAGIC --share --mem=24000 --time-min=600 -o $out_err.out -e $out_err.err $pgm 401 | set Status=$status 402 | else 403 | echo "srun -N $MAGIC -o $out_err.out -e $out_err.err $pgm" 404 | srun -N $MAGIC -o $out_err.out -e $out_err.err $pgm 405 | set Status=$status 406 | endif 407 | if ($Status != 0) then 408 | sleep 5 409 | goto fudanresubmit 410 | endif 411 | 412 | # limit the speed of submissions to 1 per second, 14s is nice for ALIGN phase in SEQC_main 413 | if (1) sleep .1 414 | if ($?MAGIC_SUBMIT_DELAY) then 415 | if ($MAGIC_SUBMIT_DELAY > 0) sleep $MAGIC_SUBMIT_DELAY 416 | else 417 | echo sleep .1 418 | endif 419 | 420 | endif 421 | 422 | exit 0 423 | 424 | endif 425 | 426 | ############################################## 427 | ## Submit a program to the TH2 super computer 428 | 429 | # TH2 used on the chinese super computer 430 | 431 | if ($farm == TH2) then 432 | setenv farm TH2 433 | if ("$out_err" == run || "$pgm" == run) exit 0 434 | 435 | if ("$out_err" == wait || "$pgm" == wait || 
"$out_err" == wait5 || "$pgm" == wait5) then 436 | if ("$out_err" == wait5 || "$pgm" == wait5) then 437 | $scripts/qusage 5 438 | else 439 | $scripts/qusage 1 440 | endif 441 | else 442 | set isShell=`echo $pgm | gawk '{if(substr($1,1,8) == "scripts/")n=1;}END{print n+0;}'` 443 | yhresubmit: 444 | if ($isShell == 1) then 445 | echo "yhbatch -p nsfc1 -N 1 -J $MAGIC --share --mem=16000 --time-min=600 --distribution=cyclic -o $out_err.out -e $out_err.err $pgm" 446 | yhbatch -p nsfc1 -N 1 -J $MAGIC --share --mem=16000 --time-min=600 --distribution=cyclic -o $out_err.out -e $out_err.err $pgm 447 | set Status=$status 448 | else 449 | echo "yhrun -p nsfc1 -J $MAGIC -o $out_err.out -e $out_err.err $pgm" 450 | yhrun -p nsfc1 -J $MAGIC -o $out_err.out -e $out_err.err $pgm 451 | set Status=$status 452 | endif 453 | if ($Status != 0) then 454 | sleep 5 455 | goto yhresubmit 456 | endif 457 | 458 | # limit the speed of submissions to 1 per second, 14s is nice for ALIGN phase in SEQC_main 459 | if (1) sleep .1 460 | if ($?MAGIC_SUBMIT_DELAY) then 461 | if ($MAGIC_SUBMIT_DELAY > 0) sleep $MAGIC_SUBMIT_DELAY 462 | else 463 | echo sleep .1 464 | endif 465 | 466 | endif 467 | 468 | exit 0 469 | 470 | endif 471 | 472 | ############################################## 473 | ## Submit a program to the Parasol parallel batch system of Jim Kent 474 | 475 | if ($farm == PARASOL) then 476 | 477 | if ("$out_err" == wait || "$pgm" == wait) then 478 | if (-e tmp/parasol.job.list) then 479 | para problems 480 | mv tmp/parasol.job.list tmp/parasol.job.done.$$ 481 | endif 482 | else if ("$out_err" == run || "$pgm" == run) then 483 | parasol make tmp/Parasol/job.list 484 | else 485 | if (! 
-d $CaliRootDir/tmp/Parasol) mkdir $CaliRootDir/tmp/Parasol 486 | set toto=$CaliRootDir/tmp/Parasol/job.$$ 487 | echo '#!/bin/csh -e' > $toto 488 | echo -n "cd " >> $toto 489 | pwd >> $toto 490 | echo "(/bin/time -p $pgm > $out_err.out) >& $out_err.err " >> $toto 491 | echo "csh $toto" >> $CaliRootDir/tmp/Parasol/job.list 492 | endif 493 | 494 | exit 0 495 | 496 | endif 497 | 498 | ############################################## 499 | ## Submit a program to the LSF queue 500 | 501 | if ($farm == LSF) then 502 | 503 | if ("$pgm" == wait) then 504 | echo " i do not remember the LSF syntax to synchronize, maybe 'bjobs' please edit scripts/submit and mail me the fix, thanks" 505 | else if ("$pgm" != run) then 506 | if (! $?queue) set queue=unified 507 | echo " i do not remember the LSF syntax, please edit scripts/submit" 508 | echo "LSF submit: pgm=###$pgm### stdout/err=###$out_err.out/err### queue=###$queue###" 509 | bsub -q $queue -R "select[$linux] rusage[cpu=1:mem=8000:duration=30000]" -o $out_err.err -e $out_err.err "/bin/time -p $pgm" 510 | 511 | if ($?MAGIC_SUBMIT_DELAY) then 512 | if ($MAGIC_SUBMIT_DELAY > 0) sleep $MAGIC_SUBMIT_DELAY 513 | else 514 | sleep 1 515 | endif 516 | 517 | endif 518 | exit 1 519 | 520 | endif 521 | 522 | echo "ERROR : Unknown third parameter $farm in" 523 | echo "ERROR : $0 $*" 524 | echo "ERROR : The only acceptable values are: local, MULTICORE, SGE and LSF" 525 | echo "ERROR : Please edit the script or modify the call to the script" 526 | exit 1 527 | 528 | ############################################## 529 | ############################################## 530 | 531 | foreach ii (1 2 3 4 5 6 7 8 9 10 11 12 13 14) 532 | pushd B$ii 533 | ln -s tmp.$ii tmp 534 | if (! -d tmp.$ii) mkdir tmp.$ii 535 | ln -s ../TARGET 536 | ln -s tmp.$ii/Batch$ii Fastc 537 | ln -s ../LIMITS 538 | mkdir RESULTS 539 | mkdir MetaDB 540 | cd MetaDB 541 | ln -s ../../MetaDB/database 542 | ln -s ../../MetaDB/wspec 543 | cd .. 
544 | ln -s ../bin 545 | ln -s ../metaData 546 | ln -s ../scripts 547 | ln -s scripts/MAGIC 548 | cp ../ZZZZZ . 549 | cp ../ZZZZZ.gz . 550 | ln -s bin/xacembly bly 551 | ln -s ./bin/tacembly tbly 552 | popd 553 | end 554 | 555 | ############################################# 556 | 557 | # find suspended jobs 558 | 559 | qstat > _qs1 560 | qwhat > _qs2 561 | cat _qs1 ZZZZZ _qs2 | gawk '/^ZZZZZ/{zz++;next;}{if(zz<1){if($5=="S")ss[$1]=1;next;}if(ss[$1]==1)print}' > _qs3 562 | cat _qs3 | gawk '{if($4=="scripts/alignAndSelectBest.tcsh")printf("qdel %s\n",$1);}' > _qs4 563 | -------------------------------------------------------------------------------- /article/scripts/tags: -------------------------------------------------------------------------------- 1 | #!/bin/tcsh -f 2 | # =========================================================================== 3 | # 4 | # PUBLIC DOMAIN NOTICE 5 | # National Center for Biotechnology Information 6 | # 7 | # This software/database is a "United States Government Work" under the 8 | # terms of the United States Copyright Act. It was written as part of 9 | # the author's official duties as a United States Government employee and 10 | # thus cannot be copyrighted. This software/database is freely available 11 | # to the public for use. The National Library of Medicine and the U.S. 12 | # Government have not placed any restriction on its use or reproduction. 13 | # 14 | # Although all reasonable efforts have been taken to ensure the accuracy 15 | # and reliability of the software and data, the NLM and the U.S. 16 | # Government do not and cannot warrant the performance or results that 17 | # may be obtained by using this software or data. The NLM and the U.S. 18 | # Government disclaim all warranties, express or implied, including 19 | # warranties of performance, merchantability or fitness for any particular 20 | # purpose. 21 | # 22 | # Please cite the author in any work or product based on this material. 
23 | # 24 | # =========================================================================== 25 | # 26 | # Author: Jean Thierry-Mieg 27 | # 28 | 29 | if ("$1" == "-t") then 30 | gawk -F '\t' '{n[$1]++;}END{for (k in n)printf("%s\t%d\n",k,n[k]);}' $2 | sort 31 | else 32 | gawk '{n[$1]++;}END{for (k in n)printf("%s\t%d\n",k,n[k]);}' $1 | sort 33 | endif 34 | -------------------------------------------------------------------------------- /article/scripts/transpose: -------------------------------------------------------------------------------- 1 | gawk -F '\t' -f scripts/transpose.awk 2 | -------------------------------------------------------------------------------- /article/scripts/transpose.awk: -------------------------------------------------------------------------------- 1 | { j++ ; for (i = 1 ; i <= NF ; i++)a[i,j] = $i ; if(NF>iMax) iMax = NF ; jMax = j ;} 2 | END { 3 | for (i = 1 ; i <= iMax ; i++) 4 | { 5 | printf ("%s", a[i,1]) ; 6 | for (j = 2 ; j <= jMax ; j++) 7 | printf ("\t%s", a[i,j]) ; 8 | printf ("\n",i) ; 9 | } 10 | } 11 | 12 | -------------------------------------------------------------------------------- /docs/_config.yml: -------------------------------------------------------------------------------- 1 | # Site title and subtitle. This is used in _includes/header.html 2 | title: 'NCBI Magic-BLAST' 3 | subtitle: 'RNA-seq mapping tool' 4 | 5 | # if you wish to integrate disqus on pages set your shortname here 6 | disqus_shortname: '' 7 | 8 | # if you use google analytics, add your tracking id here 9 | google_analytics_id: '' 10 | 11 | # Enable/show navigation. 
There are three options:
2 | 13 | 14 | -------------------------------------------------------------------------------- /docs/_includes/footer.html: -------------------------------------------------------------------------------- 1 | Download {{ site.title }} 2 | -------------------------------------------------------------------------------- /docs/_includes/google_analytics.html: -------------------------------------------------------------------------------- 1 | 10 | -------------------------------------------------------------------------------- /docs/_includes/header.html: -------------------------------------------------------------------------------- 1 |

{{ site.title }} 2 | {% if site.subtitle %}{{ site.subtitle }}{% endif %} 3 |

4 | -------------------------------------------------------------------------------- /docs/_includes/navigation.html: -------------------------------------------------------------------------------- 1 | 17 | -------------------------------------------------------------------------------- /docs/_layouts/default.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | {{ site.title }}{% if page.title %} : {{ page.title }}{% endif %} 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 |
18 |
19 | 22 |
23 | 24 |
25 | {% assign post_count = site.posts|size %} 26 | {% if site.navigation != 0 and site.navigation == 1 or post_count > 0 %} 27 | 30 | 31 |
32 | {{ content }} 33 |
34 | {% else %} 35 |
36 | {{ content }} 37 |
38 | {% endif %} 39 |
40 | 41 |
42 | 45 |
46 |
47 | 48 | 97 | 98 | 99 | -------------------------------------------------------------------------------- /docs/_layouts/page.html: -------------------------------------------------------------------------------- 1 | --- 2 | layout: default 3 | --- 4 | 5 | 10 | 11 | {{ content }} 12 | -------------------------------------------------------------------------------- /docs/_posts/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ncbi/magicblast/63d24b02fe26d6360aa23de148521e3cdea1cdbd/docs/_posts/.gitkeep -------------------------------------------------------------------------------- /docs/_posts/2016-12-29-blastdb.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: page 3 | category: cook 4 | title: "Create a BLAST database" 5 | order: 1 6 | --- 7 | 8 | First you need to create a BLAST database for your genome or transcriptome. For your reference sequences in a FASTA file, use this command line: 9 | 10 | ``` 11 | makeblastdb -in -dbtype nucl -parse_seqids -out -title "Database title" 12 | ``` 13 | 14 | The ```-parse_seqids``` option is required to keep the original sequence identifiers. Otherwise makeblastdb will generate its own identifiers, ```-title``` is optional. 15 | 16 | For more information on makeblastdb see [NCBI BLAST+ Command Line User Manual](https://www.ncbi.nlm.nih.gov/books/NBK279688/). 17 | 18 | Magic-BLAST will work with a genome in a FASTA file, but will be very slow for anything larger than a bacterial genome (about 5 million bases), so we do not recommend it. 
19 | 20 | 21 |   22 | 23 | 24 | #### Example 25 | 26 | To create a BLAST database from the reference file ```my_reference.fa``` 27 | 28 | ``` 29 | $ cat my_reference.fa 30 | >sequence_1 Homo sapiens hemoglobin subunit alpha 2 (HBA2), mRNA 31 | CATAAACCCTGGCGCGCTCGCGGGCCGGCACTCTTCTGGTCCCCACAGACTCAGAGAGAACCCACCATGG 32 | TGCTGTCTCCTGCCGACAAGACCAACGTCAAGGCCGCCTGGGGTAAGGTCGGCGCGCACGCTGGCGAGTA 33 | TGGTGCGGAGGCCCTGGAGAGGATGTTCCTGTCCTTCCCCACCACCAAGACCTACTTCCCGCACTTCGAC 34 | CTGAGCCACGGCTCTGCCCAGGTTAAGGGCCACGGCAAGAAGGTGGCCGACGCGCTGACCAACGCCGTGG 35 | CGCACGTGGACGACATGCCCAACGCGCTGTCCGCCCTGAGCGACCTGCACGCGCACAAGCTTCGGGTGGA 36 | CCCGGTCAACTTCAAGCTCCTAAGCCACTGCCTGCTGGTGACCCTGGCCGCCCACCTCCCCGCCGAGTTC 37 | ACCCCTGCGGTGCACGCCTCCCTGGACAAGTTCCTGGCTTCTGTGAGCACCGTGCTGACCTCCAAATACC 38 | GTTAAGCTGGAGCCTCGGTAGCCGTTCCTCCTGCCCGCTGGGCCTCCCAACGGGCCCTCCTCCCCTCCTT 39 | GCACCGGCCCTTCCTGGTCTTTGAATAAAGTCTGAGTGGGCAGCAAAAAAAAAAAAAAAAAA 40 | >sequence_2 Homo sapiens alpha one globin (HBA1) mRNA, complete cds 41 | CAGACTCAGAGAGAACCCACCATGGTGCTGTCTCCTGCCGACAAGACCAACGTCAAGGCCGCCTGGGGTA 42 | AGGTCGGCGCGCACGCTGGCGAGTATGGTGCGGAGGCCCTGGAGAGGATGTTCCTGTCCTTCCCCACCAC 43 | CAAGACCTACTTCCCGCACTTCGACCTGAGCCACGGCTCTGCCCAGGTTAAGGGCCACGGCAAGAAGGTG 44 | GCCGACGCGCTGACCAACGCCGTGGCGCACGTGGACGACATGCCCAACGCGCTGTCCGCCCTGAGCGACC 45 | TGCACGCGCACAAGCTTCGGGTGGACCCGGTCAACTTCAAGCTCCTAAGCCACTGCCTGCTGGTGACCCT 46 | GGCCGCCCACCTCCCCGCCGAGTTCACCCCTGCGGTGCACGCCTCCCTGGACAAGTTCCTGGCTTCTGTG 47 | AGCACCGTGCTGACCTCCAAATACCGTTAAGCTGGAGCCTCGGTGGCCATGCTTCTTGCCCCTTGGGC 48 | ``` 49 | 50 | use this command line 51 | 52 | ``` 53 | makeblastdb -in my_reference.fa -out my_reference -parse_seqids -dbtype nucl 54 | ``` 55 | 56 | Note that the word following '>' is a sequence identifier that will be used in Magic-BLAST reports. The identifier should be unique. 57 | 58 | 59 | ### Download a genome 60 | 61 | There are several ways to download whole genomes, transcriptomes, or selected sequences from NCBI. 
62 | 63 | #### NCBI Datasets 64 | 65 | You can search and download genome and transcript sequences from [NCBI Datasets Genome](https://www.ncbi.nlm.nih.gov/datasets/genome/) page. Search for an organism, select an assembly, and you will see download options. 66 | 67 | For example, you can download the GRCh38 assembly of the human genome, from [https://www.ncbi.nlm.nih.gov/datasets/genome/GCF_000001405.40/](https://www.ncbi.nlm.nih.gov/datasets/genome/GCF_000001405.40/). 68 | 69 | #### NCBI EDirect tools 70 | 71 | To download human chromosome 1 using [NCBI EDirect tools](https://github.com/NCBI-Hackathons/EDirectCookbook) use: 72 | 73 | ``` 74 | search -db nucleotide -query NC_000001 | efetch -format fasta >NC_000001.fa 75 | ``` 76 | 77 | -------------------------------------------------------------------------------- /docs/_posts/2016-12-29-copyright.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: page 3 | category: dev 4 | title: "Copyright" 5 | order: 1 6 | --- 7 | With the exception of certain third-party files summarized below, this software is a "United States Government Work" under the terms of the United States Copyright Act. It was written as part of the authors' official duties as United States Government employees and thus cannot be copyrighted. This software is freely available to the public for use. The National Library of Medicine and the U.S. Government have not placed any restriction on its use or reproduction. 8 | -------------------------------------------------------------------------------- /docs/_posts/2016-12-29-exeptions.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: page 3 | category: dev 4 | title: "Exceptions" 5 | order: 2 6 | --- 7 | Location: configure 8 | 9 | Authors: Free Software Foundation, Inc. 
10 | 11 | License: Unrestricted; at top of file 12 | 13 | *** 14 | 15 | Location: config.guess, config.sub 16 | 17 | Authors: FSF 18 | 19 | License: Unrestricted when distributed with the Toolkit; standalone, GNU General Public License [gpl.txt] 20 | 21 | *** 22 | 23 | Location: {src,include}/dbapi/driver/ftds*/freetds 24 | 25 | Authors: See src/dbapi/driver/ftds*/freetds/AUTHORS 26 | 27 | License: GNU Library/Lesser General Public License 28 | [src/dbapi/driver/ftds*/freetds/COPYING.LIB] 29 | 30 | *** 31 | 32 | Location: include/dbapi/driver/odbc/unix_odbc 33 | 34 | Authors: Peter Harvey and Nick Gorham 35 | 36 | License: GNU LGPL 37 | 38 | *** 39 | 40 | Location: {src,include}/gui/widgets/FLU 41 | 42 | Authors: Jason Bryan 43 | 44 | License: GNU LGPL 45 | 46 | *** 47 | 48 | Location: {src,include}/gui/widgets/Fl_Table 49 | 50 | Authors: Greg Ercolano 51 | 52 | License: GNU LGPL 53 | 54 | *** 55 | 56 | Location: include/util/bitset 57 | 58 | Author: Anatoliy Kuznetsov 59 | 60 | License: MIT [include/util/bitset/license.txt] 61 | 62 | *** 63 | 64 | Location: {src,include}/util/compress/bzip2 65 | 66 | Author: Julian R Seward 67 | 68 | License: BSDish [src/util/compress/bzip2/LICENSE] 69 | 70 | *** 71 | 72 | Location: {src,include}/util/compress/zlib 73 | 74 | Authors: Jean-loup Gailly and Mark Adler 75 | 76 | License: BSDish [include/util/compress/zlib/zlib.h] 77 | 78 | *** 79 | 80 | Location: {src,include}/util/regexp 81 | 82 | Author: Philip Hazel 83 | 84 | License: BSDish [src/util/regexp/doc/LICENCE] 85 | 86 | *** 87 | 88 | Location: {src,include}/misc/xmlwrapp 89 | 90 | Author: Peter J Jones at al. 
[src/misc/xmlwrapp/AUTHORS] 91 | 92 | License: BSDish [src/misc/xmlwrapp/LICENSE] 93 | -------------------------------------------------------------------------------- /docs/_posts/2016-12-29-fasta.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: page 3 | category: cook 4 | title: "Reads in FASTA or FASTQ" 5 | order: 3 6 | --- 7 | 8 | If your reads are in a local FASTA file use this command line: 9 | 10 | ``` 11 | magicblast -query reads.fa -db my_reference 12 | ``` 13 | 14 | If your reads are in a local FASTQ file use this command line: 15 | 16 | ``` 17 | magicblast -query reads.fastq -db my_reference -infmt fastq 18 | ``` 19 | -------------------------------------------------------------------------------- /docs/_posts/2016-12-29-multithreading.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: page 3 | category: cook 4 | title: "Multi-threading" 5 | order: 6 6 | --- 7 | 8 | To use multiple CPUs, specify the maximal number of threads with the 9 | ```-num_threads``` parameter: 10 | 11 | ``` 12 | magicblast -query reads.fa -db genome -num_threads 10 13 | ``` 14 | -------------------------------------------------------------------------------- /docs/_posts/2016-12-29-output.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: page 3 | category: doc 4 | title: "Output" 5 | order: 1 6 | --- 7 | 8 | By default, results are provided to the standard output in the SAM format. 9 | Use ```-out ``` option to redirect output to a file. 10 | Use ```-outfmt``` option to specify the output format: 11 | 12 | ```-outfmt sam``` : SAM format (default) 13 | 14 | ```-outfmt tabular``` : exports a simple tab delimited format defined below. 
15 | 16 | The output can be also compressed, using the ```-gzo``` flag: 17 | 18 | ``` 19 | magicblast -query reads.fa -db genome -out output.gz -gzo 20 | ``` 21 | 22 | #### Unaligned reads 23 | 24 | By default Magic-BLAST reports unaligned reads, with unmapped bit (4) set in SAM flags or '*' in the second column of the tabular output. If you do not want unmapped reads reported, use ```-no_unaligned``` option: 25 | 26 | ``` 27 | magicblast -query reads -db reference -no_unaligned 28 | ``` 29 | 30 | 31 | #### Tabular output format 32 | 33 | The tabular output format shows one alignment per line with these tab 34 | delimited fields: 35 | 36 | 1. Query/read sequence identifier 37 | 2. Reference sequence identifier 38 | 3. Percent identity of the alignment 39 | 4. Not used 40 | 5. Not used 41 | 6. Not used 42 | 7. Alignment start position on the query sequence 43 | 8. Alignment stop position on the query sequence 44 | 9. Alignment start position on the reference sequence 45 | 10. Alignment stop position on the reference sequence 46 | 11. Not used 47 | 12. Not used 48 | 13. Alignment score 49 | 14. Query strand 50 | 15. Reference sequence strand 51 | 16. Query/read length 52 | 17. Alignment as extended BTOP string 53 | This is the same BTOP string as in BLAST tabular output with a 54 | few extensions: 55 | - a number represents this many matches, 56 | - two bases represent a mismatch and show query and reference base, 57 | - base and gap or gap and base, show a gap in query or reference, 58 | - ^\^ represents an intron of this number of bases, 59 | - \_\\_ represents an insertion (gap in reference) of this number of bases, 60 | - %\% represents a deletion (gap in read) of this number of bases, 61 | - (\) shows number of query bases that are shared between 62 | two parts of a spliced alignment; used when proper splice sites 63 | were not found 64 | 18. Number of different alignments reported for this query sequence 65 | 19. Not used 66 | 20. 
Compartment - a unique identifier for all alignments that belong to 67 | a single fragment. These can be two alignments for a pair of reads 68 | or alignments to exons that were not spliced. 69 | 21. Reverse complemented unaligned query sequence from the beginning 70 | of the query, or '-' if the query aligns to the left edge 71 | 22. Unaligned sequence at the end of the query, or '-' 72 | 23. Reference sequence identifier where the mate is aligned, if 73 | different from the identifier in column 2, otherwise '-' 74 | 24. Alignment start position on the reference sequence for the mate, or 75 | '-' if no alignment for the mate was found; a negative number 76 | denotes a divergent pair 77 | 25. Composite alignment score for all exons that belong to the fragment 78 | -------------------------------------------------------------------------------- /docs/_posts/2016-12-29-paired.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: page 3 | category: cook 4 | title: "Paired reads" 5 | order: 4 6 | --- 7 | 8 | For SRA accessions Magic-BLAST determines whether reads are paired and maps them appropriately. 9 | 10 | For reads in FASTA and FASTQ files paired reads can either be in a single file, or two files. 11 | 12 | ##### Single file 13 | 14 | For paired reads presented as successive entries in a single FASTA or FASTQ 15 | file, i.e. 
read 1 and 2 of fragment 1, then read 1 and 2 of fragment 2, 16 | etc., simply add the parameter ```-paired```: 17 | 18 | ``` 19 | magicblast -query reads.fa -db genome -paired 20 | ``` 21 | 22 | or 23 | 24 | ``` 25 | magicblast -query reads.fastq -db genome -paired -infmt fastq 26 | ``` 27 | 28 | ##### Two files 29 | 30 | For paired reads presented in two parallel files, use these options: 31 | 32 | ``` 33 | magicblast -query reads.fa -query_mate mates.fa -db genome 34 | ``` 35 | 36 | or 37 | 38 | ``` 39 | magicblast -query reads.fastq -query_mate mates.fastq -db genome -infmt fastq 40 | ``` 41 | -------------------------------------------------------------------------------- /docs/_posts/2016-12-29-rnavsdna.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: page 3 | category: cook 4 | title: "RNA vs DNA" 5 | order: 5 6 | --- 7 | 8 | ##### Splicing 9 | 10 | By default, Magic-BLAST aligns RNA reads to a genome and reports spliced 11 | alignments, possibly spanning several exons. To disable spliced alignments, 12 | use the ```-splice F``` option. 13 | 14 | For example, mapping RNA or DNA reads to a bacterial genome: 15 | 16 | ``` 17 | magicblast -sra SRR5647973 -db salmonella_enterica_genome -splice F 18 | ``` 19 | 20 |   21 | 22 | ##### Transcriptome 23 | 24 | Use the ```-reftype transcriptome``` option, to 25 | map reads to a transcriptome database. For example: 26 | 27 | ``` 28 | magicblast -query reads.fa -db my_transcripts -reftype transcriptome 29 | ``` 30 | 31 | The ```-reftype transcriptome``` option is a shorthand for ```-splice F -limit_lookup F```, so the above call is equivalent to: 32 | 33 | ```magicblast -query reads.fa -db my_transcripts -splice F -limit_lookup F``` 34 | 35 | Magic-Blast finds alignments between a read and a genome based on initial 36 | common word in both. Many genomes contain interspersed repeats that make mapping much more time consuming. 
To make mapping faster we disregard words that appear too often in the reference. This is not desirable when mapping to transcripts, because a transcript with many variants could be considered a repeat. The ```-limit_lookup F``` option turns this functionality off. 37 | -------------------------------------------------------------------------------- /docs/_posts/2016-12-29-sra.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: page 3 | category: cook 4 | title: "Use NCBI SRA repository" 5 | order: 2 6 | --- 7 | 8 | If you are mapping an experiment from [NCBI Sequence Read Archive](https://www.ncbi.nlm.nih.gov/sra), use ```-sra ``` option: 9 | 10 | ``` 11 | magicblast -sra -db 12 | ``` 13 | 14 |   15 | 16 | 17 | For example: 18 | 19 | ``` 20 | magicblast -sra SRR1237994 -db my_reference 21 | ``` 22 | 23 |   24 | 25 | 26 | To map several SRA runs use comma-separated list of accessions: 27 | 28 | ``` 29 | magicblast -sra SRR1237994,SRR1237993 -db my_reference 30 | ``` 31 | or a list of accessions in text file, one per line: 32 | ``` 33 | echo SRR1237994 >accessions 34 | echo SRR1237993 >>accessions 35 | magicblast -sra_batch accessions -db my_reference 36 | ``` 37 | 38 |   39 | 40 | See [Create BLAST database](../cook/blastdb.html) to see how to create a BLAST database. 41 | 42 | -------------------------------------------------------------------------------- /docs/_posts/2016-12-29-tutorial.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: page 3 | category: doc 4 | title: "How Magic-BLAST works" 5 | order: 0 6 | --- 7 | #### Index vs. BLAST database 8 | 9 | Unlike most mapping tools, Magic-BLAST does not build an index of a genome and instead it builds an index of a batch of reads and scans a BLAST database for potential matches. BLAST database can be created from a FASTA file in seconds or minutes instead of hours for most indices. 
It also allows for mapping to or searching arbitrarily large collections of sequences. 10 | 11 | Magic-BLAST can also work with a genome as a FASTA file. However it is not recommended for more than a few million bases, because mapping to a FASTA file is much slower than to a BLAST database. 12 | 13 | #### Seed and extend 14 | 15 | Magic-BLAST works similarly to other BLAST programs. First it finds a seed alignment, an exact 16-base match and extends alignment to the left and right. Shorter alignments are combined over introns if splice signals are found. For paired reads, the cumulative pair score is used to select the best mapping. 16 | 17 | #### Database word counts 18 | 19 | To avoid mapping to repeats Magic-BLAST scans the database and counts 16-base words. Those that appear more than 10 times are not extended. 20 | 21 | This functionality can be turned off with ```-limit_lookup F``` option and should not be used when mapping to transcripts. 22 | 23 | 24 | -------------------------------------------------------------------------------- /docs/_posts/2017-09-13-release.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: page 3 | category: release 4 | title: "Release Notes" 5 | order: 0 6 | --- 7 | 8 | #### Magic-BLAST 1.7.2: April 19, 2023 9 | 10 | ##### Bug fixes 11 | * Fix for missing libraries in MacOs 12 | 13 | 14 | --- 15 | 16 | 17 | #### Magic-BLAST 1.7.1: February 17, 2023 18 | 19 | ##### Bug fixes 20 | * Fix for the missing nghttp2.dll in Windows 21 | * Fix for incorrectly reported version number 22 | 23 | 24 | --- 25 | 26 | 27 | #### Magic-BLAST 1.7.0: October 14, 2022 28 | 29 | ##### New features 30 | * Magic-BLAST now reports read quality scores in SAM output if they were provided in the input. 31 | * Magic-BLAST now reports Mapping Quality (MAPQ) score in the SAM output. The scores are 60 for a single alignment, and *round*(-10 *log*_10(1 - 1 / num_alignments)) for more than one alignment. 
32 | * Magic-BLAST now reports splice signals in the tabular format BTOP string. To get the old BTOP format without splice signals, set the environment variable ```BTOP_NO_SPLICE_SIGNALS```. 33 | * New ```-tag ``` option to add a user label to each alignment, in the last column of the tabular report, and ```XU:Z:``` tag in the SAM format. 34 | 35 | 36 | ##### Bug fixes 37 | * Fixed incorrect SAM flags for unaligned paired reads 38 | * Only correct values (2 -- 255) are allowed for ```-max_db_word_count``` parameter. 39 | * Fixes for reporting unique paired alignments. 40 | 41 | 42 | --- 43 | 44 | 45 | #### Magic-BLAST 1.6.0: May 6, 2021 46 | 47 | ##### New features 48 | * Usage reporting - Help improve Magic-BLAST by sharing limited information about your search. [Details on the information collected, how it is used, and how to opt-out](https://www.ncbi.nlm.nih.gov/books/NBK563686/). 49 | * Reads from NCBI SRA repository are streamed from the cloud when ```-sra``` or ```-sra_batch``` option is used. [More details here]({{ site.baseurl }}{% link _posts/2021-05-06-cloud-sra.md %}). 50 | * NCBI taxonomy IDs are reported in SAM output if they are present in the target BLAST database. 51 | * Unaligned reads can be reported separately from the aligned ones, using ```-out_unaligned ``` option. One can also select the format with ```-unaligned_fmt``` option. Choices are SAM, tabular, and FASTA. The default format is the same as one for the main report (SAM or tabular, selected with ```-outfmt``` option). 52 | * A file with list of SRA accessions can be provided to Magic-BLAST via the ```-sra_batch``` option. 53 | 54 | 55 | ##### Bug fixes 56 | * Magic-BLAST correctly reports database sequence accessions for BLAST databases without gis. 57 | * Fixed discontinuity in adaptive score threshold function. 
Below are the new thresholds: 58 | 59 | Read length (r) | Score threshold 60 | --------------|:---------------: 61 | r <= 20 | r 62 | 20 < r <= 34 | 20 63 | 50 < r < 200 | 0.6 * r 64 | r >= 200 | 120 65 | 66 | 67 | --- 68 | 69 | 70 | #### Magic-BLAST 1.5.0: August 22, 2019 71 | 72 | ##### New features 73 | * Support for the new BLAST database version (BLASTDBv5) that allows for limiting search by taxonomy ([more information about database version 5 here](https://ftp.ncbi.nlm.nih.gov/blast/db/v5/blastdbv5.pdf)) 74 | * New option ```-md_tag```: SAM MD tag is no longer reported by default. To have it included in SAM report, use ```-md_tag``` option. 75 | * New symbol in tabular report BTOP string: ```%%``` that represents a deletion (gap in read) of this number of bases. 76 | * New adaptive alignment score threshold, calculated based on read length (score thresholds below). This is the default behavior. Users can change alignment score threshold with the ```-score``` option and set it either to a constant or a linear function of read length. 77 | 78 | Read length (r) | Score threshold 79 | --------------|:---------------: 80 | r <= 20 | r 81 | 20 < r <= 30 | 20 82 | 30 < r <= 50 | r - 10 83 | 50 < r < 200 | 0.6 * r 84 | r >= 200 | 120 85 | 86 | 87 | ##### Improvements 88 | * Improved multi-threading and run time. 89 | * Improved alignment heuristics that allow for larger error rates and better alignments for long reads. 90 | * Magic-BLAST aligns nanopore reads. 91 | * NCBI accessions instead of gis are reported in SAM and tabular reports. 92 | * Short, low-complexity alignments are no longer reported. 93 | * The default value for ```-max_db_word_count``` parameter was lowered from 60 to 30. 16-base words that appear in the genome more than this number of times will be filtered. 94 | * The maximum insert size for properly aligned pairs is 1,000,000 bases for spliced alignments (RNA-seq) and 100,000 bases for non-spliced alignments (genomic). 
The alignments for pairs with larger insert size are still reported, but SAM flag for properly aligned pair is not set. 95 | 96 | 97 | ##### Bug fixes 98 | * SAM MD tag reports correct number of matching bases around an intron. 99 | * Using ```-max_db_word_count``` option no longer requires explicit use of ```-limit_lookup``` option. 100 | * Magic-BLAST no longer crashes with an empty sequence in FASTQ file. 101 | 102 | 103 | 104 | --- 105 | 106 | 107 | 108 | #### Magic-BLAST 1.4.0: August 10, 2018 109 | 110 | ##### New features 111 | * New option: ```-no_discordant``` to report only concordant read pairs 112 | * Report strand-specific alignments with ```-fr``` and ```-rf``` flags for forward-reversed and reversed-forward 113 | * New option to control repeat filtering: ```-max_db_word_count```. 16-base words that appear in the genome more than this number of times will be filtered (default is 60). 114 | 115 | 116 | ##### Improvements 117 | * Improved sensitivity: count for frequent database words to be filtered was increased to 60 (used to be 10). This threshold can be changed with the ```-max_db_word_count``` option. 118 | * Non-canonical splice signals now require longer alignments on the exon, with score at least 50 on both sides of an intron. 
119 | * More informative error messages for SRA access 120 | * Much better handling of non-cannonical splice sites and compositionally biased genomes 121 | 122 | 123 | ##### Bug fixes 124 | * Alignments no longer stop prematurely 125 | * Fix for not returning unmapped reads when none is aligned 126 | * Magic-BLAST no longer reports zero-length introns 127 | * Parameters of the score threshold as a function of read length are no longer 100 smaller than specified by the user 128 | 129 | 130 | 131 | --- 132 | 133 | 134 | #### Magic-BLAST 1.3.0: September 15, 2017 135 | 136 | ##### New features 137 | * The alignment cutoff score (```-score``` option) can be expressed as either a constant or a function of read length in this format: L,b,a for a * length + b 138 | * Maximum edit distance cutoff for alignments can be specified with ```-max_edit_dist``` option 139 | * SRA caching is now turned off by default and can be turned on with the ```-sra_cache``` flag. When accessing sequences in NCBI SRA repository the data can be cached in a local file so that it is not downloaded over the network again when reused later. It may result in very large local files and is only needed if you use SRA accessions multiple times, have very limited network bandwidth and a large disk space. 
140 | 141 | 142 | ##### Improvements 143 | * Unmapped reads are reported in the SAM and Tabular reports, use ```-no_unaligned``` option to not have unaligned reads reported 144 | * The XS tag (used by transcript assemblers) is now reported in SAM output 145 | * The score threshold (```-score``` option) now applies to the whole spliced alignment 146 | * The query batch size (number of reads processed at a time) is now controlled with environment variable BATCH_SIZE expressed in cumulative number of bases 147 | * The default mismatch and gap extension penalties are now set to 4 148 | * Improved sensitivity and run time 149 | 150 | 151 | ##### Bug fixes 152 | * Read ids for pairs are printed properly in the SAM format (no ".1" ".2" or "/1" "/2") 153 | * The secondary alignment bit (256) is set in SAM flags 154 | * Maximum intron length option ```-max_intron_length``` works properly 155 | 156 | 157 | --- 158 | 159 | 160 | #### Magic-BLAST 1.2.0: February 17, 2017 161 | 162 | ##### Improvements 163 | 164 | * Improved multi-threading for larger genomes 165 | * Improved splice site detection 166 | 167 | ##### Bug fixes 168 | 169 | * Magic-BLAST now works with multiple SRA accessions 170 | * Fixed the macOS dmg installer that used to remove BLAST+ binaries 171 | * The -seqidlist option is no longer ignored 172 | 173 | 174 | --- 175 | 176 | 177 | #### Magic-BLAST 1.1.0: November 4, 2016 178 | 179 | ##### Improvements 180 | 181 | * -sra option connects to NCBI via HTTPS 182 | * Results are formatted with 'bare' accessions 183 | * Tabular output includes a header with column titles 184 | 185 | ##### Bug fixes: 186 | 187 | * Fixed SAM flag values 188 | 189 | 190 | --- 191 | 192 | 193 | #### Magic-BLAST 1.0.0: August 19, 2016 194 | 195 | * First release 196 | 197 | -------------------------------------------------------------------------------- /docs/_posts/2017-11-14-download.md: -------------------------------------------------------------------------------- 1 | --- 2 | 
layout: page 3 | category: doc 4 | title: "Download" 5 | order: 2 6 | --- 7 | 8 | #### Source code and Linux, MacOs, and Windows binaries 9 | 10 | Download source code and Linux, MacOs, and Windows binaries from [https://ftp.ncbi.nlm.nih.gov/blast/executables/magicblast/LATEST](https://ftp.ncbi.nlm.nih.gov/blast/executables/magicblast/LATEST) 11 | 12 |   13 | 14 | #### Bioconda 15 | 16 | You can also install Magic-BLAST from [Bioconda](https://anaconda.org/bioconda/magicblast): 17 | 18 | ``` 19 | conda install -c bioconda magicblast 20 | ``` 21 | 22 |   23 | 24 | #### Docker 25 | 26 | We also provide `ncbi/magicblast` docker image: 27 | 28 | ``` 29 | docker pull ncbi/magicblast 30 | ``` 31 | 32 | For more information about `ncbi/magicblast` image see [https://github.com/ncbi/docker/tree/master/magicblast](https://github.com/ncbi/docker/tree/master/magicblast) 33 | 34 | Magic-BLAST is also a part of `ncbi/blast-workbench` image that contains additional tools: [BLAST+ package](https://www.ncbi.nlm.nih.gov/books/NBK279690/) (including `makeblastdb` program to create a BLAST database), and [EDirect](https://dataguide.nlm.nih.gov/edirect/documentation.html): 35 | 36 | ``` 37 | docker pull ncbi/blast-workbench 38 | ``` 39 | 40 | For more information about `ncbi/blast-workbench` image see [https://github.com/ncbi/docker/tree/master/blast-workbench](https://github.com/ncbi/docker/tree/master/blast-workbench) 41 | 42 | -------------------------------------------------------------------------------- /docs/_posts/2020-05-15-license.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: page 3 | category: dev 4 | title: "License" 5 | order: 0 6 | --- 7 | 8 | Please refer to the [license file](https://www.ncbi.nlm.nih.gov/IEB/ToolBox/CPP_DOC/lxr/source/scripts/projects/magicblast/LICENSE) for license information. 
9 | -------------------------------------------------------------------------------- /docs/_posts/2020-07-14-feedback.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: page 3 | category: doc 4 | title: "Feedback" 5 | order: 3 6 | --- 7 | 8 | For feedback, bug reports, questions, suggestions, or feature requests, please 9 | - e-mail , or 10 | - open an issue in 11 | 12 | 13 | -------------------------------------------------------------------------------- /docs/_posts/2021-05-06-cloud-sra.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: page 3 | category: cook 4 | title: "Access SRA reads in the cloud" 5 | order: 7 6 | --- 7 | 8 | NCBI provides SRA data in the cloud to support large scale hyper parallel data analyses. Starting with version 1.6.0 Magic-BLAST downloads SRA data from the cloud providers rather than NCBI servers. Benefits include uninterrupted downloads, faster download speeds, and huge bandwidth for parallel downloads to multiple running processes or cloud instances. For more information see [SRA in the Cloud](https://www.ncbi.nlm.nih.gov/sra/docs/sra-cloud). 9 | 10 | To take advantage of the increased download bandwidth when running Magic-BLAST in the cloud, please configure SRA downloads, using ```vdb-config -i``` program from [SRA Toolkit](https://github.com/ncbi/sra-tools). 
11 | 12 | Here are webinars prepared by NCBI SRA group for how to set up a cloud instance and configure SRA downloads: 13 | * [Download SRA reads from Amazon Web Services (AWS)](https://youtu.be/rjjrHnZfymU?list=PLH-TjWpFfWrt5MNqU7Jvsk73QefO3ADwD) 14 | * [Download SRA reads from Google Cloud Platform (GCP)](https://youtu.be/RNmBINl0bxc?list=PLH-TjWpFfWrt5MNqU7Jvsk73QefO3ADwD) 15 | -------------------------------------------------------------------------------- /docs/css/main.css: -------------------------------------------------------------------------------- 1 | body { 2 | font-weight: 400; 3 | text-shadow: 0 1px 1px rgba(255, 255, 255, 0.7); 4 | } 5 | 6 | pre, code, pre code { 7 | border: none; 8 | border-radius: 0; 9 | background-color: #f9f9f9; 10 | font-size: 0.85em; 11 | } 12 | 13 | .highlight { 14 | background-color: #f9f9f9; 15 | } 16 | 17 | pre { 18 | font-size: 1em; 19 | } 20 | 21 | code { 22 | color: inherit; 23 | } 24 | 25 | #header { 26 | border-bottom: 1px solid #eee; 27 | margin-bottom: 20px; 28 | } 29 | 30 | #header a:hover { 31 | text-decoration: none; 32 | } 33 | 34 | #footer { 35 | margin: 20px 0; 36 | font-size: 0.85em; 37 | color: #999; 38 | text-align: center; 39 | } 40 | 41 | #content > .page-header:first-child { 42 | margin-top: 0; 43 | } 44 | 45 | #content > .page-header:first-child h2 { 46 | margin-top: 0; 47 | } 48 | 49 | 50 | #navigation { 51 | font-size: 0.9em; 52 | } 53 | 54 | #navigation li a { 55 | padding-left: 10px; 56 | padding-right: 10px; 57 | } 58 | 59 | #navigation .nav-header { 60 | padding-left: 0; 61 | padding-right: 0; 62 | } 63 | 64 | body.rtl { 65 | direction: rtl; 66 | } 67 | 68 | body.rtl #header .brand { 69 | float: right; 70 | margin-left: 5px; 71 | } 72 | body.rtl .row-fluid [class*="span"] { 73 | float: right !important; 74 | margin-left: 0; 75 | margin-right: 2.564102564102564%; 76 | } 77 | body.rtl .row-fluid [class*="span"]:first-child { 78 | margin-right: 0; 79 | } 80 | 81 | body.rtl ul, body.rtl ol { 82 | 
margin: 0 25px 10px 0; 83 | } 84 | 85 | table { 86 | margin-bottom: 1rem; 87 | border: 1px solid #e5e5e5; 88 | border-collapse: collapse; 89 | } 90 | 91 | td, th { 92 | padding: .25rem .5rem; 93 | border: 1px solid #e5e5e5; 94 | } 95 | -------------------------------------------------------------------------------- /docs/css/syntax.css: -------------------------------------------------------------------------------- 1 | .highlight .hll { background-color: #ffffcc } 2 | .highlight { background: #ffffff; } 3 | .highlight .c { color: #888888 } /* Comment */ 4 | .highlight .err { color: #a61717; background-color: #e3d2d2 } /* Error */ 5 | .highlight .k { color: #008800; font-weight: bold } /* Keyword */ 6 | .highlight .cm { color: #888888 } /* Comment.Multiline */ 7 | .highlight .cp { color: #cc0000; font-weight: bold } /* Comment.Preproc */ 8 | .highlight .c1 { color: #888888 } /* Comment.Single */ 9 | .highlight .cs { color: #cc0000; font-weight: bold; background-color: #fff0f0 } /* Comment.Special */ 10 | .highlight .gd { color: #000000; background-color: #ffdddd } /* Generic.Deleted */ 11 | .highlight .ge { font-style: italic } /* Generic.Emph */ 12 | .highlight .gr { color: #aa0000 } /* Generic.Error */ 13 | .highlight .gh { color: #333333 } /* Generic.Heading */ 14 | .highlight .gi { color: #000000; background-color: #ddffdd } /* Generic.Inserted */ 15 | .highlight .go { color: #888888 } /* Generic.Output */ 16 | .highlight .gp { color: #555555 } /* Generic.Prompt */ 17 | .highlight .gs { font-weight: bold } /* Generic.Strong */ 18 | .highlight .gu { color: #666666 } /* Generic.Subheading */ 19 | .highlight .gt { color: #aa0000 } /* Generic.Traceback */ 20 | .highlight .kc { color: #008800; font-weight: bold } /* Keyword.Constant */ 21 | .highlight .kd { color: #008800; font-weight: bold } /* Keyword.Declaration */ 22 | .highlight .kn { color: #008800; font-weight: bold } /* Keyword.Namespace */ 23 | .highlight .kp { color: #008800 } /* Keyword.Pseudo */ 24 | 
.highlight .kr { color: #008800; font-weight: bold } /* Keyword.Reserved */ 25 | .highlight .kt { color: #888888; font-weight: bold } /* Keyword.Type */ 26 | .highlight .m { color: #0000DD; font-weight: bold } /* Literal.Number */ 27 | .highlight .s { color: #dd2200; background-color: #fff0f0 } /* Literal.String */ 28 | .highlight .na { color: #336699 } /* Name.Attribute */ 29 | .highlight .nb { color: #003388 } /* Name.Builtin */ 30 | .highlight .nc { color: #bb0066; font-weight: bold } /* Name.Class */ 31 | .highlight .no { color: #003366; font-weight: bold } /* Name.Constant */ 32 | .highlight .nd { color: #555555 } /* Name.Decorator */ 33 | .highlight .ne { color: #bb0066; font-weight: bold } /* Name.Exception */ 34 | .highlight .nf { color: #0066bb; font-weight: bold } /* Name.Function */ 35 | .highlight .nl { color: #336699; font-style: italic } /* Name.Label */ 36 | .highlight .nn { color: #bb0066; font-weight: bold } /* Name.Namespace */ 37 | .highlight .py { color: #336699; font-weight: bold } /* Name.Property */ 38 | .highlight .nt { color: #bb0066; font-weight: bold } /* Name.Tag */ 39 | .highlight .nv { color: #336699 } /* Name.Variable */ 40 | .highlight .ow { color: #008800 } /* Operator.Word */ 41 | .highlight .w { color: #bbbbbb } /* Text.Whitespace */ 42 | .highlight .mf { color: #0000DD; font-weight: bold } /* Literal.Number.Float */ 43 | .highlight .mh { color: #0000DD; font-weight: bold } /* Literal.Number.Hex */ 44 | .highlight .mi { color: #0000DD; font-weight: bold } /* Literal.Number.Integer */ 45 | .highlight .mo { color: #0000DD; font-weight: bold } /* Literal.Number.Oct */ 46 | .highlight .sb { color: #dd2200; background-color: #fff0f0 } /* Literal.String.Backtick */ 47 | .highlight .sc { color: #dd2200; background-color: #fff0f0 } /* Literal.String.Char */ 48 | .highlight .sd { color: #dd2200; background-color: #fff0f0 } /* Literal.String.Doc */ 49 | .highlight .s2 { color: #dd2200; background-color: #fff0f0 } /* Literal.String.Double */ 
50 | .highlight .se { color: #0044dd; background-color: #fff0f0 } /* Literal.String.Escape */ 51 | .highlight .sh { color: #dd2200; background-color: #fff0f0 } /* Literal.String.Heredoc */ 52 | .highlight .si { color: #3333bb; background-color: #fff0f0 } /* Literal.String.Interpol */ 53 | .highlight .sx { color: #22bb22; background-color: #f0fff0 } /* Literal.String.Other */ 54 | .highlight .sr { color: #008800; background-color: #fff0ff } /* Literal.String.Regex */ 55 | .highlight .s1 { color: #dd2200; background-color: #fff0f0 } /* Literal.String.Single */ 56 | .highlight .ss { color: #aa6600; background-color: #fff0f0 } /* Literal.String.Symbol */ 57 | .highlight .bp { color: #003388 } /* Name.Builtin.Pseudo */ 58 | .highlight .vc { color: #336699 } /* Name.Variable.Class */ 59 | .highlight .vg { color: #dd7700 } /* Name.Variable.Global */ 60 | .highlight .vi { color: #3333bb } /* Name.Variable.Instance */ 61 | .highlight .il { color: #0000DD; font-weight: bold } /* Literal.Number.Integer.Long */ 62 | -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: page 3 | title: "NCBI Magic-BLAST Documentation" 4 | --- 5 | 6 | Magic-BLAST is a tool for mapping large next-generation RNA or DNA sequencing 7 | runs against a whole genome or transcriptome. Each alignment optimizes 8 | a composite score, taking into account simultaneously the two reads of 9 | a pair, and in case of RNA-seq, locating the candidate introns and adding 10 | up the score of all exons. This is very different from other versions of 11 | BLAST, where each exon is scored as a separate hit and read-pairing is 12 | ignored. 
13 | 14 | Magic-BLAST incorporates within the NCBI BLAST code framework ideas 15 | developed in the NCBI Magic pipeline, in particular hit extensions by 16 | local walk and jump [(http://www.ncbi.nlm.nih.gov/pubmed/26109056)](http://www.ncbi.nlm.nih.gov/pubmed/26109056), and recursive clipping of 17 | mismatches near the edges of the reads, which avoids accumulating 18 | artefactual mismatches near splice sites and is needed to distinguish 19 | short indels from substitutions near the edges. 20 | 21 | More details about the algorithm and comparison with other similar tools are published here: 22 | 23 | Boratyn GM, Thierry-Mieg J, Thierry-Mieg D, Busby B, Madden TL. (2019) **Magic-BLAST, an accurate RNA-seq aligner for long and short reads.** *BMC Bioinformatics* 20: 405. \[[article](https://bmcbioinformatics.biomedcentral.com/articles/10.1186/s12859-019-2996-x)\] 24 | 25 | We call the whole next generation run (from Illumina, Roche-454, ABI, or 26 | another sequencing platform excluding SOLiD), a query. The input reads may 27 | be provided as SRA accession or a file in a SRA, FASTA, and FASTQ format. 28 | Read pairs can be presented as parallel files, or as successive reads in a 29 | single file. 30 | 31 | The reference genome or transcriptome can be given as a BLAST database 32 | or a FASTA file. It is preferable to use BLAST database for large genomes, 33 | such as human, or transcript collections, such as all of RefSeq, Ensembl, 34 | or AceView. See here on [how to create a BLAST database](/magicblast/cook/blastdb.html). 35 | 36 | The full list of options is listed when you use ```-help``` option. 37 | 38 | Thank you for trying this tool and providing us with feedback. Please, 39 | let us know of any desired enhancement, problem or difficulty. 40 | 41 | E-mail blast-help@ncbi.nlm.nih.gov with questions or comments. 
42 | -------------------------------------------------------------------------------- /magicblast-tools/.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | *~ -------------------------------------------------------------------------------- /magicblast-tools/README.md: -------------------------------------------------------------------------------- 1 | Magic-BLAST tools 2 | === 3 | 4 | A few scripts to postprocess SAM/BAM files. 5 | 6 | 7 | Features 8 | -------- 9 | * get-introns.py - collect intron locations. 10 | * get-transcripts.py - assemble transcript sequences from a genome using a GFF or GTF annotation. 11 | * combine-genome-transcripts.py - iterate over read alignments to a genome and transcripts, select better scoring alignments, remap transcript alignments to the genome, and save them in a SAM or BAM file. 12 | 13 | 14 | ## Dependencies 15 | The programs in this directory work with Python 3.6+ and require these packages: 16 | * pysam 17 | * pyfaidx 18 | * pandas 19 | 20 | To install them run: 21 | ``` 22 | pip install -r requirements.txt 23 | ``` 24 | 25 | 26 | Usage 27 | ----- 28 | 29 | ### Get intron locations from a SAM/BAM file 30 | ``` 31 | get-introns.py --bam --gff --introns 32 | ``` 33 | 34 | The output is a tab-delimited file with intron locations marked as KNOWN or NEW. 35 | 36 | 37 | ### Assemble transcript sequences from a genome 38 | ``` 39 | get-transcripts.py --genome --gff 40 | ``` 41 | The transcripts will be written to the standard output.
42 | 43 | 44 | ### Combine alignments to genome and transcripts 45 | ``` 46 | combine-genome-transcripts.py --to-genome --to-transcripts --gff --out [-b] 47 | ``` 48 | -------------------------------------------------------------------------------- /magicblast-tools/base.py: -------------------------------------------------------------------------------- 1 | #============================================================================ 2 | # 3 | # PUBLIC DOMAIN NOTICE 4 | # National Center for Biotechnology Information 5 | # 6 | # This software/database is a "United States Government Work" under the 7 | # terms of the United States Copyright Act. It was written as part of 8 | # the author's official duties as a United States Government employee and 9 | # thus cannot be copyrighted. This software/database is freely available 10 | # to the public for use. The National Library of Medicine and the U.S. 11 | # Government have not placed any restriction on its use or reproduction. 12 | # 13 | # Although all reasonable efforts have been taken to ensure the accuracy 14 | # and reliability of the software and data, the NLM and the U.S. 15 | # Government do not and cannot warrant the performance or results that 16 | # may be obtained by using this software or data. The NLM and the U.S. 17 | # Government disclaim all warranties, express or implied, including 18 | # warranties of performance, merchantability or fitness for any particular 19 | # purpose. 20 | # 21 | # Please cite the author in any work or product based on this material. 
22 | # 23 | # =========================================================================== 24 | # 25 | # Author: Greg Boratyn boratyng@ncbi.nlm.nih.gov 26 | # 27 | # --------------------------------------------------------------------------- 28 | 29 | """Basic definitions used in other libraries""" 30 | 31 | from collections import namedtuple 32 | 33 | Intron = namedtuple('Intron', ['seqid', 'start', 'end', 'strand']) 34 | Exon = namedtuple('Exon', ['seqid', 'start', 'end', 'strand']) 35 | mRNA = namedtuple('mRNA', ['seqid', 'start', 'end', 'strand', 'exons', 'attributes']) 36 | -------------------------------------------------------------------------------- /magicblast-tools/combine-genome-transcripts.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | #============================================================================ 3 | # 4 | # PUBLIC DOMAIN NOTICE 5 | # National Center for Biotechnology Information 6 | # 7 | # This software/database is a "United States Government Work" under the 8 | # terms of the United States Copyright Act. It was written as part of 9 | # the author's official duties as a United States Government employee and 10 | # thus cannot be copyrighted. This software/database is freely available 11 | # to the public for use. The National Library of Medicine and the U.S. 12 | # Government have not placed any restriction on its use or reproduction. 13 | # 14 | # Although all reasonable efforts have been taken to ensure the accuracy 15 | # and reliability of the software and data, the NLM and the U.S. 16 | # Government do not and cannot warrant the performance or results that 17 | # may be obtained by using this software or data. The NLM and the U.S. 18 | # Government disclaim all warranties, express or implied, including 19 | # warranties of performance, merchantability or fitness for any particular 20 | # purpose. 
21 | # 22 | # Please cite the author in any work or product based on this material. 23 | # 24 | # =========================================================================== 25 | # 26 | # Author: Greg Boratyn boratyng@ncbi.nlm.nih.gov 27 | # 28 | # --------------------------------------------------------------------------- 29 | 30 | """Compare mapping to genome with mapping to transcripts""" 31 | 32 | import sam 33 | import pysam 34 | import gff 35 | import gtf 36 | import argparse 37 | import bisect 38 | import sys 39 | import contextlib 40 | 41 | 42 | def is_equal(query_name_1, query_name_2): 43 | """Determine whether two read ids represent the same read""" 44 | q1 = query_name_1 45 | q2 = query_name_2 46 | if q1[-1] in ['a', 'b']: 47 | q1 = q1[:-1] 48 | if q2[-1] in ['a', 'b']: 49 | q2 = q2[:-1] 50 | 51 | return q1 == q2 52 | 53 | def get_aligns(stream): 54 | """Get all alignments for a single reads. The SAM or BAM file must be 55 | sorted by read name""" 56 | 57 | result = [] 58 | for align in stream: 59 | if len(result) == 0 or \ 60 | is_equal(result[0].query_name, align.query_name): 61 | result.append(align) 62 | else: 63 | yield result 64 | result = [align] 65 | 66 | if len(result) > 0: 67 | yield result 68 | 69 | 70 | def index_aligns(aligns): 71 | """Index alignments by reference name and reference start position""" 72 | result = {} 73 | for i in aligns: 74 | 75 | k = (i.reference_name, i.reference_start) 76 | result[k] = i 77 | 78 | return result 79 | 80 | 81 | def get_score(aligns): 82 | """Get alignment score, composite for paired reads""" 83 | # for single reads return score of the first alignment 84 | # we assume that only top scoring alignments are reported 85 | if aligns[0].flag & 1 == 0: 86 | return aligns[0].get_tag('AS') if aligns[0].flag & 4 == 0 else 0 87 | 88 | # for paired reads report sum of scores for properly paired alignments 89 | # (bit 2 set), and single read score for other cases 90 | scores = [] 91 | forward = {} 92 | reverse = {} 93 
| for i in aligns: 94 | # if alignment is not properly paired, save score 95 | if i.flag & 2 == 0: 96 | scores.append(i.get_tag('AS') if i.flag & 4 == 0 else 0) 97 | continue 98 | 99 | # for properly paired alignments, find the mate alignment and add 100 | # scores 101 | d = None 102 | if i.flag & 64: 103 | d = forward 104 | elif i.flag & 128: 105 | d = reverse 106 | else: 107 | raise ValueError(f'Unrecognised paired flags for alignment: {i}') 108 | 109 | k = (i.reference_name, i.reference_start) 110 | d[k] = i 111 | 112 | for i in forward.values(): 113 | k = (i.next_reference_name, i.next_reference_start) 114 | if k not in reverse: 115 | raise ValueError(f'Missing mate for {i}') 116 | scores.append(i.get_tag('AS') + reverse[k].get_tag('AS')) 117 | 118 | return max(scores) 119 | 120 | 121 | def compare_alignments(align_1, align_2): 122 | """Compare alignmet scores. 123 | Return 1 if align_1 score is larger than align_2 score, zero if 124 | align_1 score == align_2 score, and -1 if align_1 score is smaller than 125 | align_2 score""" 126 | score_1 = get_score(align_1) 127 | score_2 = get_score(align_2) 128 | 129 | if score_1 == score_2: 130 | return 0 131 | elif score_1 > score_2: 132 | return 1 133 | return -1 134 | 135 | 136 | def get_alignment_end(align): 137 | """Find end position of a SAM alignment""" 138 | if align.flag & 4: 139 | raise ValueError(f'Read unaligned: {align}') 140 | 141 | pos = align.reference_start 142 | for op, num in align.cigartuples: 143 | if op in [sam.CIGAR_MATCH, sam.CIGAR_DELETION, sam.CIGAR_INTRON]: 144 | pos += num 145 | 146 | return pos 147 | 148 | 149 | def transcript2genome(positions, align, transcript): 150 | """Translate a position on a transcript to position on a genome""" 151 | 152 | # find cumulative exon lengths 153 | exon_lens = [e.end - e.start + 1 for e in transcript.exons] 154 | cum_exon_lens = [sum(exon_lens[:i + 1]) for i in range(0, len(exon_lens))] 155 | 156 | for pos in positions: 157 | 158 | # if transcript is 
annotated on the negative negative strand of the 159 | # genome, go from the end of the read alignment to the transcript: 160 | # find alignment position from sequence end and reverse CIGAR 161 | if transcript.strand == '-': 162 | align_start = sum(exon_lens) - get_alignment_end(align) + 1 163 | 164 | ind = bisect.bisect_left(cum_exon_lens, align_start) 165 | if ind > 0 and cum_exon_lens[ind] == align_start: 166 | ind += 1 167 | if ind == 0: 168 | yield transcript.exons[ind].start + align_start - 1 169 | else: 170 | yield transcript.exons[ind].start + align_start - 1 - \ 171 | cum_exon_lens[ind - 1] 172 | 173 | 174 | def reverse_complement(sequence): 175 | """Reverse complement a sequence""" 176 | complement = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A', 'N': 'N'} 177 | result = [] 178 | for i in reversed(sequence): 179 | result.append(complement.get(i, 'N')) 180 | return ''.join(result) 181 | 182 | 183 | def remap_to_genome(align, annot, mates = None): 184 | """Remap alignment to a transctript to alignment on a genome. 
Generates 185 | a string, single line SAM output""" 186 | # if a read is unaligned retrun the same SAM line 187 | if align.flag & 4: 188 | return align.to_string() 189 | 190 | if align.reference_name not in annot: 191 | raise ValueError(f'{align.reference_name} not present in annotation') 192 | 193 | transcript = annot[align.reference_name] 194 | 195 | flag = align.flag 196 | sequence = align.query_sequence 197 | if transcript.strand == '-': 198 | flag ^= 16 199 | sequence = reverse_complement(sequence) 200 | 201 | exon_lens = [e.end - e.start + 1 for e in transcript.exons] 202 | 203 | # alignment of read to transcript left-most position 204 | align_start = align.reference_start + 1 205 | cigartuples = align.cigartuples 206 | 207 | # if transcript is annotated on the negative negative strand of the genome 208 | # go from the end of the read alignment to the transcript: 209 | # find alignment position from sequence end and reverse CIGAR 210 | if transcript.strand == '-': 211 | align_start = sum(exon_lens) - get_alignment_end(align) + 1 212 | cigartuples = reversed(cigartuples) 213 | 214 | # find read alignment to the genome start position 215 | cum_exon_lens = [sum(exon_lens[:i + 1]) for i in range(0, len(exon_lens))] 216 | ind = bisect.bisect_left(cum_exon_lens, align_start) 217 | if ind > 0 and cum_exon_lens[ind] == align_start: 218 | ind += 1 219 | if ind == 0: 220 | start = transcript.exons[ind].start + align_start - 1 221 | else: 222 | start = transcript.exons[ind].start + align_start - 1 - \ 223 | cum_exon_lens[ind - 1] 224 | 225 | op2cigar = {sam.CIGAR_MATCH: 'M', sam.CIGAR_INSERTION: 'I', 226 | sam.CIGAR_DELETION: 'D', sam.CIGAR_SOFT_CLIP: 'S'} 227 | 228 | # generate new CIGAR 229 | cigar = '' 230 | g_pos = start 231 | for op, num in cigartuples: 232 | if op == sam.CIGAR_INTRON: 233 | raise RuntimeError('Bad alignment') 234 | 235 | if op in [sam.CIGAR_MATCH, sam.CIGAR_DELETION]: 236 | 237 | bases_left = num 238 | while bases_left > 0: 239 | 240 | # if rthe 
current alignment segment ends within current exon 241 | if g_pos + bases_left <= transcript.exons[ind].end: 242 | cigar += f'{bases_left}{op2cigar[op]}' 243 | g_pos += bases_left 244 | break 245 | else: 246 | 247 | # if the current alignment segment spans an intron 248 | exon_bases = transcript.exons[ind].end - g_pos + 1 249 | bases_left -= exon_bases 250 | g_pos += exon_bases 251 | cigar += f'{exon_bases}{op2cigar[op]}' 252 | if bases_left > 0: 253 | intron = transcript.exons[ind + 1].start - transcript.exons[ind].end - 1 254 | cigar += f'{intron}N' 255 | g_pos += intron 256 | ind += 1 257 | 258 | elif op in [sam.CIGAR_INSERTION, sam.CIGAR_SOFT_CLIP]: 259 | cigar += f'{num}{op2cigar[op]}' 260 | else: 261 | raise ValueError(f'Unsupported CIGAR operation: {op}') 262 | 263 | # compute mate start for paired reads 264 | mate_name = '*' 265 | mate_start = 0 266 | if align.flag & 1 and align.next_reference_name is not None : 267 | if align.next_reference_name == align.reference_name: 268 | mate_name = '=' 269 | mate_start = transcript2genome([align.next_reference_start], align, 270 | transcript) 271 | 272 | else: 273 | # get alignment start for a new transcript 274 | mate_transcript = annot[align.next_reference_name] 275 | mate_name = annot[align.next_reference_name].seqid 276 | 277 | mate_align = mates[(align.next_reference_name, 278 | align.next_reference_start)] 279 | 280 | mate_start = transcript2genome([align.next_reference_start], 281 | mate_align, 282 | mate_transcript) 283 | 284 | 285 | result = f'{align.query_name}\t{flag}\t{annot[align.reference_name].seqid}'\ 286 | f'\t{start}\t255\t{cigar}\t{mate_name}\t{mate_start}\t0' \ 287 | f'\t{sequence}\t*'\ 288 | f'\tNH:i:{align.get_tag("NH")}\tAS:i:{align.get_tag("AS")}'\ 289 | f'\tNM:i:{align.get_tag("NM")}' 290 | 291 | return result 292 | 293 | 294 | @contextlib.contextmanager 295 | def wopen(filename = None, mode = 'w', template = None): 296 | """Open stream for writing either to stdout or a file""" 297 | if 'b' 
in mode: 298 | ff = pysam.AlignmentFile(filename, mode, template = template) 299 | else: 300 | if filename and filename != '-' and filename != 'stdout': 301 | ff = open(filename, 'w') 302 | else: 303 | ff = sys.stdout 304 | print(template.text, file=ff) 305 | 306 | try: 307 | yield ff 308 | finally: 309 | if ff is not sys.stdout: 310 | ff.close() 311 | 312 | 313 | if __name__ == '__main__': 314 | 315 | parser = argparse.ArgumentParser(description='Add annotation to Magic-BLAST mapping') 316 | parser.add_argument('--to-genome', metavar='FILE', dest='genome', type=str, 317 | help='BAM file with mapping to a genome') 318 | parser.add_argument('--to-transcripts', metavar='FILE', dest='transcripts', 319 | type=str, help='BAM file with mapping to a transcripts') 320 | parser.add_argument('--gff', metavar='FILE', dest='gfffile', type=str, 321 | help='Genome annotations file') 322 | parser.add_argument('--out', metavar='FILE', dest='outfile', type=str, 323 | help='Output SAM file', default='-') 324 | parser.add_argument('-b', dest='isbam', action='store_true', 325 | help='Output BAM') 326 | 327 | 328 | args = parser.parse_args() 329 | 330 | # read annotation file 331 | with open(args.gfffile) as f: 332 | if args.gfffile.endswith('gff'): 333 | m = gff.get_mrnas(f) 334 | 335 | # index mRNAs by accession 336 | mrnas = {} 337 | for k in m.keys(): 338 | new_key = ':'.join([m[k].attributes['Name'], m[k].seqid]) 339 | if not new_key.startswith('NM_'): 340 | continue 341 | 342 | if new_key in mrnas: 343 | raise ValueError(f'{new_key} already present in mrnas') 344 | 345 | m[k].exons.sort(key = lambda x: x.start) 346 | mrnas[new_key] = m[k] 347 | else: 348 | m = gtf.get_transcripts(f) 349 | mrnas = m 350 | 351 | mode = 'w' 352 | isbam = args.isbam or args.outfile.endswith('bam') 353 | if isbam: 354 | mode = mode + 'b' 355 | 356 | # read and compare BAM files 357 | with sam.open_sam_or_bam(args.genome) as fg, sam.open_sam_or_bam(args.transcripts) as ft, wopen(args.outfile, mode = 
mode, template = fg) as out: 358 | for genome, transcript in zip(get_aligns(fg), get_aligns(ft)): 359 | 360 | # print(f'genome:\t{genome[0]}') 361 | # print(f'transcript\t{transcript[0]}') 362 | # print('') 363 | 364 | duplicates = {} 365 | if compare_alignments(genome, transcript) >= 0: 366 | for i in genome: 367 | if isbam: 368 | out.write(i) 369 | else: 370 | print(i.to_string(), file=out) 371 | else: 372 | 373 | if transcript[0].flag & 1: 374 | indexed = index_aligns(transcript) 375 | 376 | for i in transcript: 377 | remapped = remap_to_genome(i, mrnas, indexed) 378 | fields = remapped.split() 379 | key = (fields[2], fields[3]) 380 | # check if a similar alignment was already produced; 381 | # alignments to RNA variants may result in the 382 | # same alignment on the genome 383 | if key not in duplicates: 384 | duplicates[key] = 1 385 | if isbam: 386 | out.write(pysam.AlignedSegment.fromstring( 387 | remapped, fg.header)) 388 | else: 389 | print(remapped, file=out) 390 | 391 | -------------------------------------------------------------------------------- /magicblast-tools/get-introns.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | #============================================================================ 3 | # 4 | # PUBLIC DOMAIN NOTICE 5 | # National Center for Biotechnology Information 6 | # 7 | # This software/database is a "United States Government Work" under the 8 | # terms of the United States Copyright Act. It was written as part of 9 | # the author's official duties as a United States Government employee and 10 | # thus cannot be copyrighted. This software/database is freely available 11 | # to the public for use. The National Library of Medicine and the U.S. 12 | # Government have not placed any restriction on its use or reproduction. 
13 | # 14 | # Although all reasonable efforts have been taken to ensure the accuracy 15 | # and reliability of the software and data, the NLM and the U.S. 16 | # Government do not and cannot warrant the performance or results that 17 | # may be obtained by using this software or data. The NLM and the U.S. 18 | # Government disclaim all warranties, express or implied, including 19 | # warranties of performance, merchantability or fitness for any particular 20 | # purpose. 21 | # 22 | # Please cite the author in any work or product based on this material. 23 | # 24 | # =========================================================================== 25 | # 26 | # Author: Greg Boratyn boratyng@ncbi.nlm.nih.gov 27 | # 28 | # --------------------------------------------------------------------------- 29 | 30 | """Parse a SAM/BAM file and get an intron support table""" 31 | 32 | import sam 33 | import gff 34 | import gtf 35 | import txt 36 | import gzip 37 | import pickle 38 | import argparse 39 | import sys 40 | import numpy as np 41 | import pandas as pd 42 | from pyfaidx import Fasta 43 | from collections import Counter 44 | from collections import defaultdict 45 | 46 | def get_splice_signals(introns, fasta_filename): 47 | """Find splice signals for introns and return as dictionary indexed by introns""" 48 | sites = {} 49 | genome = Fasta(fasta_filename) 50 | for i in introns: 51 | if i.seqid not in genome: 52 | sites[i] = 'xxxx' 53 | else: 54 | # indices into pyfaidx sequences are zero-based 55 | sites[i] = genome[i.seqid][(i.start - 1):(i.start + 1)].seq.upper() + genome[i.seqid][(i.end - 2):i.end].seq.upper() 56 | 57 | return sites 58 | 59 | 60 | def print_splice_signal_histogram(introns, signals): 61 | """Find and print histogram of splice signals for a given set of introns""" 62 | hist = Counter() 63 | for i in introns: 64 | hist[signals[i]] += 1 65 | for s, c in sorted([(s, hist[s]) for s in hist], key=lambda x: x[1], reverse=True): 66 | print('{0}\t{1}'.format(s, c)) 
67 | print('') 68 | 69 | 70 | class EmptySpliceSignal: 71 | """No splice signal, class needed for defaultdict""" 72 | def __call__(self): 73 | """Return a constant representing empty splice signal""" 74 | return '---' 75 | 76 | 77 | if __name__ == '__main__': 78 | 79 | parser = argparse.ArgumentParser(description='Generate intron support table from a SAM/BAM file') 80 | parser.add_argument('--bam', metavar='FILE', dest='bamfile', type=str, 81 | help='BAM file') 82 | parser.add_argument('--gff', metavar='FILE', dest='gfffile', type=str, 83 | help='GFF file') 84 | parser.add_argument('--genome', metavar='FILE', dest='genome', type=str, 85 | help='Genome FASTA file') 86 | parser.add_argument('--filter-by', metavar='LIST', dest='filter_by', 87 | type=str, help='Filter reads') 88 | parser.add_argument('--filter-annot', metavar='STRING', 89 | dest='filter_annot', type=str, 90 | help='Filter annotated introns by transcript accession') 91 | parser.add_argument('--introns', metavar='FILE', dest='introns', type=str, 92 | help='Output file for the intron table' 93 | '(default: stdout)', default='-') 94 | parser.add_argument('--spec', dest='spec', action='store_true', 95 | help='Show results for sensitivity and specificity' 96 | ' analysis') 97 | parser.add_argument('--splice_histogram', dest='splice_histogram', 98 | action='store_true', 99 | help='Show splice signal histogram') 100 | parser.add_argument('--numbers', dest='numbers', action='store_true', 101 | help='Show numbers of annotated and unannotated ' 102 | 'introns') 103 | parser.add_argument('--sort', metavar='FILE', dest='sort', type=str, 104 | help='Output sort file for ROC score computation') 105 | parser.add_argument('--counts', metavar='FILE', dest='counts', type=str, 106 | help='Tab delimited file with numbers of read ' 107 | 'placements for weighting read contrinution.' 
108 | 'Format: read_id, mate_id (1, 2), number of placements') 109 | parser.add_argument('--max-count', metavar='FILE', dest='max_count', 110 | type=int, help='Maximum count for computation of ' 111 | 'weighted read alignment counts') 112 | 113 | 114 | args = parser.parse_args() 115 | 116 | # if not args.gfffile: 117 | # raise InputError('Annotation file not specified, use --gff option') 118 | 119 | gff_introns = {} 120 | if args.gfffile: 121 | print('Reading annotatiotns', file=sys.stderr) 122 | 123 | if args.gfffile.endswith('pickle'): 124 | f = open(args.gfffile, 'rb') 125 | gff_introns = pickle.load(f) 126 | f.close() 127 | else: 128 | if args.gfffile.endswith('.gz'): 129 | f = gzip.GzipFile(args.gfffile, 'r') 130 | else: 131 | f = open(args.gfffile) 132 | 133 | if args.gfffile.endswith('.gff') or args.gfffile.endswith('.gff.gz'): 134 | gff_introns = gff.get_splice_sites(f, 135 | accession=args.filter_annot) 136 | elif args.gfffile.endswith('.gtf') or args.gfffile.endswith('.gtf.gz'): 137 | gff_introns = gtf.get_splice_sites(f) 138 | elif args.gfffile.endswith('.sam') or args.gfffile.endswith('.sam.gz'): 139 | gff_introns = sam.get_introns_with_reads(args.gfffile) 140 | elif args.gfffile.endswith('.txt'): 141 | gff_introns = txt.get_introns(f) 142 | else: 143 | raise InputError('Unrecognized annotation file extension, '\ 144 | 'must be one of these: .gff, .gtf, .sam, '\ 145 | '.pickle') 146 | f.close() 147 | print('{0} introns in the annotation'.format(len(gff_introns))) 148 | print('done', file=sys.stderr) 149 | else: 150 | gff_introns = {} 151 | 152 | placements = None 153 | if args.counts: 154 | max_count = None 155 | if args.max_count: 156 | max_count = args.max_count 157 | placements = sam.read_placements(args.counts, max_count) 158 | 159 | # parse command line filtering arguments 160 | filter_by = None 161 | if args.filter_by: 162 | filter_by = {} 163 | a = args.filter_by.split() 164 | for w, i in zip(a[::2], a[1::2]): 165 | if w == 'read_id': 166 | f = 
open(i) 167 | filter_by[w] = {i: 1 for i in f.read().splitlines()} 168 | f.close() 169 | else: 170 | filter_by[w] = int(i) 171 | print(' filtering reads by {0}\t{1}'.format(w, i), file=sys.stderr) 172 | 173 | 174 | # get introns from a SAM/BAM file 175 | print('Reading SAM/BAM file', file=sys.stderr) 176 | introns = sam.get_introns_with_reads(args.bamfile, force_single = True, 177 | filter_by = filter_by, 178 | placements = placements) 179 | print('done', file=sys.stderr) 180 | 181 | splice_signals = defaultdict(EmptySpliceSignal()) 182 | if args.genome: 183 | splice_signals = get_splice_signals(introns, args.genome) 184 | 185 | if args.splice_histogram: 186 | print_splice_signal_histogram(introns, splice_signals) 187 | 188 | 189 | # sort introns by end position, start position, reference sequence id 190 | keys = sorted(introns.keys(), key=lambda x: x.end) 191 | keys.sort(key=lambda x: x.start) 192 | keys.sort(key=lambda x: x.seqid) 193 | 194 | # print intron coverage table 195 | f = sys.stdout 196 | if args.introns != '-': 197 | f = open(args.introns, 'w') 198 | for k in keys: 199 | known = 'NEW' 200 | if k in gff_introns: 201 | known = 'KNOWN' 202 | print('{0}\t{1}\t{2}\t{3}\t{4}\t{5}'\ 203 | .format(k.seqid, k.start, k.end, introns[k], known, 204 | splice_signals[k]), file=f) 205 | if args.introns != '-': 206 | f.close() 207 | 208 | 209 | # create a dataframe to compute sensitivity, precision, and a sort file 210 | if args.spec or args.sort or args.numbers: 211 | 212 | d = {'intron': [':'.join(str(i) for i in [k.seqid, k.start, k.end]) \ 213 | for k in keys], 214 | 'coverage': [introns[k] for k in keys], 215 | 'annotation': ['KNOWN' if k in gff_introns else 'NEW' \ 216 | for k in keys]} 217 | 218 | df = pd.DataFrame(data=d) 219 | 220 | num_annot = df['annotation'].apply(func=lambda x: 0 if x == 'KNOWN' \ 221 | else 1) 222 | df['key'] = 2 * df['coverage'] + num_annot 223 | df.sort_values(by='key', ascending=False, inplace=True) 224 | # df = 
df.reindex(columns=['intron', 'coverage', 'annotation', 'key']) 225 | 226 | # print sensitivity/specificity report 227 | if args.spec: 228 | num_known = len(gff_introns) 229 | print('Number of known introns: {0}'.format(num_known)) 230 | for i in [1, 2, 3, 5]: 231 | print('Coverage >= {0}'.format(i)) 232 | idx = (df['coverage'] >= i) 233 | num_tp = df[idx & (df['annotation'] == 'KNOWN')].shape[0] 234 | sensitivity = float(num_tp) / float(num_known) 235 | specificity = float(num_tp) / float(df[idx].shape[0]) 236 | print('Sensitivity: {0}'.format(sensitivity)) 237 | print('Precision: {0}'.format(specificity)) 238 | print('') 239 | 240 | # print number of annotated and unannotated introns 241 | if args.numbers: 242 | num_known = len(gff_introns) 243 | print('Number of known introns: {0}'.format(num_known)) 244 | for i in [1, 2, 3, 5]: 245 | print('Coverage >= {0}'.format(i)) 246 | idx = (df['coverage'] >= i) 247 | num_tp = df[idx & (df['annotation'] == 'KNOWN')].shape[0] 248 | num_fp = df[idx & (df['annotation'] != 'KNOWN')].shape[0] 249 | print('Number of annotated introns: {0}'.format(num_tp)) 250 | print('Number of unannotated introns: {0}'.format(num_fp)) 251 | print('') 252 | 253 | # print the sort file 254 | if args.sort: 255 | with open(args.sort, 'w') as f: 256 | n = 0 257 | for r in df.iterrows(): 258 | print('\t'.join(str(i) for i in [n, r[1]['coverage'], \ 259 | '+' if r[1]['annotation'] == 'KNOWN' else '-', \ 260 | r[1]['intron']]), file=f) 261 | n += 1 262 | 263 | 264 | 265 | -------------------------------------------------------------------------------- /magicblast-tools/get-transcripts.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | #============================================================================ 3 | # 4 | # PUBLIC DOMAIN NOTICE 5 | # National Center for Biotechnology Information 6 | # 7 | # This software/database is a "United States Government Work" under the 8 | # terms 
of the United States Copyright Act. It was written as part of 9 | # the author's official duties as a United States Government employee and 10 | # thus cannot be copyrighted. This software/database is freely available 11 | # to the public for use. The National Library of Medicine and the U.S. 12 | # Government have not placed any restriction on its use or reproduction. 13 | # 14 | # Although all reasonable efforts have been taken to ensure the accuracy 15 | # and reliability of the software and data, the NLM and the U.S. 16 | # Government do not and cannot warrant the performance or results that 17 | # may be obtained by using this software or data. The NLM and the U.S. 18 | # Government disclaim all warranties, express or implied, including 19 | # warranties of performance, merchantability or fitness for any particular 20 | # purpose. 21 | # 22 | # Please cite the author in any work or product based on this material. 23 | # 24 | # =========================================================================== 25 | # 26 | # Author: Greg Boratyn boratyng@ncbi.nlm.nih.gov 27 | # 28 | # --------------------------------------------------------------------------- 29 | 30 | """Get transcript sequences from a genome and a GTF file""" 31 | 32 | import gtf 33 | import gff 34 | from pyfaidx import Fasta 35 | import argparse 36 | 37 | if __name__ == '__main__': 38 | 39 | parser = argparse.ArgumentParser(description='Get transcript sequences from a genome and a GTF file') 40 | parser.add_argument('--genome', metavar='FILE', dest='genome', type=str, 41 | help='Reference sequence in FASTA format') 42 | parser.add_argument('--gff', metavar='FILE', dest='gff', type=str, 43 | help='GFF or GTF file') 44 | parser.add_argument('--select', metavar='STRING', dest='select', type=str, 45 | help='Print only sequences whose id contain provided ' 46 | 'string') 47 | 48 | args = parser.parse_args() 49 | 50 | f = open(args.gff) 51 | if args.gff.endswith('.gtf'): 52 | transcripts = 
gtf.get_transcripts(f) 53 | elif args.gff.endswith('.gff'): 54 | transcripts = gff.get_mrnas(f) 55 | else: 56 | raise ValueError('Unrecognized file extension for: {}. Only GFF or GTF' 57 | ' files are allowed'.format(args.gff)) 58 | f.close() 59 | 60 | genome = Fasta(args.genome) 61 | 62 | for i in transcripts: 63 | 64 | strand = transcripts[i].exons[0].strand 65 | sequence = '' 66 | 67 | exons = sorted(transcripts[i].exons, key=lambda x: x.start, 68 | reverse = (strand == '-')) 69 | 70 | seqid = transcripts[i].seqid 71 | 72 | for exon in exons: 73 | 74 | if exon.strand != strand: 75 | raise ValueError('Mismatched strands for transcript: {0}'.\ 76 | format(i)) 77 | 78 | if strand == '-': 79 | sequence += genome[seqid][(exon.start - 1):(exon.end)].\ 80 | reverse.complement.seq.upper() 81 | else: 82 | sequence += genome[seqid][(exon.start - 1):(exon.end)].\ 83 | seq.upper() 84 | 85 | seqid = i 86 | if 'Name' in transcripts[i].attributes: 87 | # A few mRNAs align to both X and Y chromosomes in slightly 88 | # different locations, so we are adding reference id to sequence 89 | # id to distinguish between the two alignments 90 | seqid = transcripts[i].attributes['Name'] + ':' + transcripts[i].seqid 91 | 92 | if args.select and args.select not in seqid: 93 | continue 94 | 95 | print('>{0}'.format(seqid)) 96 | for n in range(0, len(sequence), 80): 97 | print('{0}'.format(sequence[n:(n + 80)])) 98 | 99 | 100 | -------------------------------------------------------------------------------- /magicblast-tools/gff.py: -------------------------------------------------------------------------------- 1 | #============================================================================ 2 | # 3 | # PUBLIC DOMAIN NOTICE 4 | # National Center for Biotechnology Information 5 | # 6 | # This software/database is a "United States Government Work" under the 7 | # terms of the United States Copyright Act. 
# It was written as part of
# the author's official duties as a United States Government employee and
# thus cannot be copyrighted. This software/database is freely available
# to the public for use. The National Library of Medicine and the U.S.
# Government have not placed any restriction on its use or reproduction.
#
# Although all reasonable efforts have been taken to ensure the accuracy
# and reliability of the software and data, the NLM and the U.S.
# Government do not and cannot warrant the performance or results that
# may be obtained by using this software or data. The NLM and the U.S.
# Government disclaim all warranties, express or implied, including
# warranties of performance, merchantability or fitness for any particular
# purpose.
#
# Please cite the author in any work or product based on this material.
#
# ===========================================================================
#
# Author: Greg Boratyn boratyng@ncbi.nlm.nih.gov
#
# ---------------------------------------------------------------------------

"""GFF file parser"""

from collections import namedtuple
from base import Intron
from base import Exon
from base import mRNA
import sys
import re


# One tab-delimited GFF line; '.' columns are represented as None.
Record = namedtuple('Record', ['seqid', 'source', 'feature', 'start',
                               'end', 'score', 'strand', 'frame',
                               'attribute'])

def parse(line):
    """Parse a single GFF line and return a Record.

    '.' fields become None; start, end and frame are converted to int and
    score to float.  Raises ValueError on a malformed numeric field.
    """
    fields = line.rstrip().split('\t')
    try:
        r = Record(
            seqid = None if fields[0] == '.' else fields[0],
            source = None if fields[1] == '.' else fields[1],
            feature = None if fields[2] == '.' else fields[2],
            start = None if fields[3] == '.' else int(fields[3]),
            end = None if fields[4] == '.' else int(fields[4]),
            score = None if fields[5] == '.' else float(fields[5]),
            strand = None if fields[6] == '.' else fields[6],
            frame = None if fields[7] == '.' else int(fields[7]),
            attribute = None if fields[8] == '.' else fields[8]
        )
    except ValueError:
        # BUG FIX: the original printed the offending line and then fell
        # through to `return r` with r unbound, masking the real problem
        # with an UnboundLocalError.  Report the line and re-raise instead.
        print(line)
        raise

    return r


def get_introns(stream):
    """Collect introns from a gff stream and return as a dictionary.

    Only records whose feature column is 'intron' are used; each becomes
    a key (an Intron namedtuple) mapped to 1.
    """
    introns = {}
    for line in stream:
        if line.startswith('#'):
            continue

        r = parse(line)
        if r.feature != 'intron':
            continue

        introns[Intron(seqid = r.seqid, start = r.start, end = r.end,
                       strand = r.strand)] = 1

    return introns


def get_mrnas(stream, source = None):
    """Collect mRNA extents with exons.

    Returns a dictionary keyed by the feature's ID= attribute value,
    holding mRNA namedtuples.  Exon features are attached to the mRNA
    named by their Parent= attribute; 5' UTR and intron features are also
    turned into placeholder mRNA entries for downstream processing.

    stream -- iterable of GFF lines (str or bytes)
    source -- if given, only records with this source column are used
    """
    mrnas = {}

    for raw_line in stream:
        if isinstance(raw_line, str):
            line = raw_line
        elif isinstance(raw_line, bytes):
            line = raw_line.decode()
        else:
            # BUG FIX: the original raised InputError, a name that is not
            # defined anywhere, so this path itself crashed with NameError.
            raise TypeError('Unsupported stream data')

        if line.startswith('#'):
            continue

        f = parse(line)

        if source is not None and f.source != source:
            continue

        if f.feature == 'mRNA':
            m = re.search(r'ID=(\w[\w:]*)', f.attribute)
            if not m:
                raise ValueError('mRNA id could not be found')
            index = m.group(1)
            exons = []
            if index in mrnas:
                if mrnas[index].start is not None:
                    raise RuntimeError('mRNA with the same id is already present')
                # BUG FIX: the original referenced the undefined name
                # `mrna` here (NameError); keep exons collected before the
                # parent mRNA record was seen.
                exons = mrnas[index].exons

            attributes = {}
            for a in f.attribute.rstrip().split(';'):
                r = a.split('=')
                if len(r) == 2:
                    attributes[r[0]] = r[1]

            mrnas[index] = mRNA(seqid = f.seqid, start = f.start, end = f.end,
                                strand = f.strand, exons = exons,
                                attributes = attributes)

        if f.feature == 'exon':
            m = re.search(r'Parent=(\w[\w:]*)', f.attribute)
            if not m:
                # there seem to be exons not assigned to mRNAs
                print('WARNING: Exon without parent: {0}'.format(line))
                continue

            index = m.group(1)
            if index not in mrnas:
                # exons whose parent feature was not collected are skipped
                continue

            mrnas[index].exons.append(Exon(seqid = f.seqid, start = f.start,
                                           end = f.end, strand = f.strand))

        # these things appear in RNA-seq, but are not mRNA
        if f.feature == 'five_prime_UTR':
            m = re.search(r'ID=(id\d\d*);', f.attribute)
            if not m:
                raise ValueError("5'UTR id could not be found")

            index = m.group(1)
            if index not in mrnas:
                mrnas[index] = mRNA(seqid = f.seqid, start = None, end = None,
                                    strand = None, exons = [], attributes = {})

            mrnas[index].exons.append(Exon(seqid = f.seqid, start = f.start,
                                           end = f.end, strand = f.strand))

        # this is a hack: there are introns with no exons in the gff file;
        # we create fake two-base flanking exons for easier processing
        if f.feature == 'intron':
            m = re.search(r'ID=(id\d\d*);', f.attribute)
            if not m:
                raise ValueError('Intron id could not be found')

            index = m.group(1)
            if index in mrnas:
                raise RuntimeError('mRNA element already present')

            exon1 = Exon(seqid = f.seqid, start = f.start - 2,
                         end = f.start - 1, strand = f.strand)
            exon2 = Exon(seqid = f.seqid, start = f.end + 1, end = f.end + 2,
                         strand = f.strand)
            mrnas[index] = mRNA(seqid = f.seqid, start = None, end = None,
                                strand = None, exons = [exon1, exon2],
                                attributes = {})


    return mrnas
def get_splice_sites_from_exons(exons, use_strand):
    """Derive splice sites from a list of exons.

    Each pair of adjacent exons (ordered by start position) implies one
    intron between them.  Returns a dictionary keyed by Intron with all
    values set to 1.
    """
    result = {}
    ordered = sorted(exons, key=lambda e: e.start)
    for left, right in zip(ordered, ordered[1:]):
        strand = left.strand if use_strand else None
        intron = Intron(seqid = left.seqid, start = left.end + 1,
                        end = right.start - 1, strand = strand)
        result[intron] = 1

    return result

def get_splice_sites(stream, source = None, use_strand = False, accession = None):
    """Collect splice sites from the mRNAs in a GFF stream and return
    them as a dictionary of introns.

    accession -- if given, only mRNAs whose Name attribute starts with
                 this prefix contribute splice sites
    """
    sites = {}
    mrnas = get_mrnas(stream, source)
    for key in mrnas:
        rna = mrnas[key]
        if accession is not None and 'Name' in rna.attributes and \
                not rna.attributes['Name'].startswith(accession):
            continue

        for intron in get_splice_sites_from_exons(rna.exons, use_strand):
            sites[intron] = 1
    return sites



if __name__ == '__main__':

    import argparse

    parser = argparse.ArgumentParser(description='Generate a list of introns from a GFF file')
    parser.add_argument('gfffile', metavar='FILE', type=str, help='GFF file')

    args = parser.parse_args()

    with open(args.gfffile) as f:
        introns = get_splice_sites(f, use_strand = True)

    for k in introns:
        print(f'{k.seqid}\t{k.start}\t{k.end}\t{k.strand}')


# ---------------------------------------------------------------------------
# (next file in dump: magicblast-tools/gtf.py)
#============================================================================
#
# PUBLIC DOMAIN NOTICE
# National Center for Biotechnology Information
#
# This software/database is a "United States Government Work" under the
# terms of the United States Copyright Act.
# It was written as part of
# the author's official duties as a United States Government employee and
# thus cannot be copyrighted. This software/database is freely available
# to the public for use. The National Library of Medicine and the U.S.
# Government have not placed any restriction on its use or reproduction.
#
# Although all reasonable efforts have been taken to ensure the accuracy
# and reliability of the software and data, the NLM and the U.S.
# Government do not and cannot warrant the performance or results that
# may be obtained by using this software or data. The NLM and the U.S.
# Government disclaim all warranties, express or implied, including
# warranties of performance, merchantability or fitness for any particular
# purpose.
#
# Please cite the author in any work or product based on this material.
#
# ===========================================================================
#
# Author: Greg Boratyn boratyng@ncbi.nlm.nih.gov
#
# ---------------------------------------------------------------------------

"""GTF file parser"""

from collections import namedtuple
from base import Intron
from base import Exon
from base import mRNA
import sys
import re


# One tab-delimited GTF line; '.' columns are represented as None.
Record = namedtuple('Record', ['seqid', 'source', 'feature', 'start',
                               'end', 'score', 'strand', 'frame',
                               'attribute'])

def parse(line):
    """Parse a single GTF line and return a Record.

    '.' fields become None; start, end and frame are converted to int and
    score to float.  Raises ValueError on a malformed numeric field.
    """
    fields = line.rstrip().split('\t')
    r = Record(
        seqid = None if fields[0] == '.' else fields[0],
        source = None if fields[1] == '.' else fields[1],
        feature = None if fields[2] == '.' else fields[2],
        start = None if fields[3] == '.' else int(fields[3]),
        end = None if fields[4] == '.' else int(fields[4]),
        score = None if fields[5] == '.' else float(fields[5]),
        strand = None if fields[6] == '.' else fields[6],
        frame = None if fields[7] == '.' else int(fields[7]),
        attribute = None if fields[8] == '.' else fields[8]
    )
    return r



def get_transcripts(stream):
    """Collect transcripts as collections of CDS/exons.

    Returns a dictionary keyed by transcript_id, holding mRNA namedtuples
    whose exons lists are filled from 'exon' features.

    stream -- iterable of GTF lines (str or bytes)
    """
    transcripts = {}

    for raw_line in stream:
        if isinstance(raw_line, str):
            line = raw_line
        elif isinstance(raw_line, bytes):
            line = raw_line.decode()
        else:
            # BUG FIX: the original raised InputError, a name that is not
            # defined anywhere, so this path itself crashed with NameError.
            raise TypeError('Unsupported stream data')

        if line.startswith('#') or not line.strip():
            continue

        f = parse(line)
        if f.feature == 'exon':
            m = re.search(r'transcript_id "([a-zA-Z0-9\.]+)";', f.attribute)
            if not m:
                # BUG FIX: the original message said 'Gene id' although the
                # pattern looks for the transcript id
                raise ValueError('Transcript id could not be found')
            index = m.group(1)
            if index not in transcripts:
                transcripts[index] = mRNA(seqid = f.seqid, start = None,
                                          end = None, strand = f.strand,
                                          exons = [], attributes = '')

            transcripts[index].exons.append(Exon(seqid = f.seqid,
                                                 start = f.start, end = f.end,
                                                 strand = f.strand))

    return transcripts


def get_splice_sites_from_exons(exons, use_strand):
    """Collect splice sites from a list of exons and return them as a
    dictionary: each pair of adjacent exons implies one intron."""
    sites = {}
    sorted_exons = sorted(exons, key=lambda x: x.start)
    for f, s in zip(sorted_exons, sorted_exons[1:]):
        strand = None
        if use_strand:
            strand = f.strand
        sites[Intron(seqid = f.seqid, start = f.end + 1, end = s.start - 1,
                     strand = strand)] = 1

    return sites

def get_splice_sites(stream, use_strand = False):
    """Get splice sites from the transcripts in a GTF stream and return
    them as a dictionary of introns."""
    sites = {}
    transcripts = get_transcripts(stream)
    for r in transcripts:
        s = get_splice_sites_from_exons(transcripts[r].exons, use_strand)
        for i in s:
            sites[i] = 1
    return sites



if __name__ == '__main__':

    import argparse

    parser = argparse.ArgumentParser(description='Generate a list of introns from a GTF file')
    parser.add_argument('gtffile', metavar='FILE', type=str, help='GTF file')

    args = parser.parse_args()

    with open(args.gtffile) as f:
        introns = get_splice_sites(f, use_strand = True)

    for k in introns:
        # NOTE(review): gff.py prints k.start here without the -2 offset;
        # this looks like a deliberate coordinate adjustment for GTF input,
        # but the inconsistency should be confirmed
        print(f'{k.seqid}\t{k.start - 2}\t{k.end}\t{k.strand}')

# ---------------------------------------------------------------------------
# (next file in dump: magicblast-tools/requirements.txt)
# pysam
# pyfaidx
# pandas
# ---------------------------------------------------------------------------
# (next file in dump: magicblast-tools/sam.py)
#============================================================================
#
# PUBLIC DOMAIN NOTICE
# National Center for Biotechnology Information
#
# This software/database is a "United States Government Work" under the
# terms of the United States Copyright Act. It was written as part of
# the author's official duties as a United States Government employee and
# thus cannot be copyrighted. This software/database is freely available
# to the public for use. The National Library of Medicine and the U.S.
# Government have not placed any restriction on its use or reproduction.
#
# Although all reasonable efforts have been taken to ensure the accuracy
# and reliability of the software and data, the NLM and the U.S.
# Government do not and cannot warrant the performance or results that
# may be obtained by using this software or data. The NLM and the U.S.
# Government disclaim all warranties, express or implied, including
# warranties of performance, merchantability or fitness for any particular
# purpose.
#
# Please cite the author in any work or product based on this material.
#
# ===========================================================================
#
# Author: Greg Boratyn boratyng@ncbi.nlm.nih.gov
#
# ---------------------------------------------------------------------------

"""Useful functions for getting information from a SAM/BAM file that work on
top of pysam"""

import pysam
from collections import Counter
from collections import namedtuple
from collections import defaultdict
from base import Intron
import re


# pysam CIGAR operation codes
CIGAR_MATCH = 0
CIGAR_INSERTION = 1
CIGAR_DELETION = 2
CIGAR_INTRON = 3
CIGAR_SOFT_CLIP = 4


def open_sam_or_bam(filename):
    """Open a SAM or BAM file (selected by the .bam extension) and return
    a pysam.AlignmentFile handle."""
    is_bam = ""
    if filename.endswith('.bam'):
        is_bam = 'b'
    return pysam.AlignmentFile(filename, 'r' + is_bam)


def get_standard_read_name(r, force_single = False, trim = False):
    """Standardize read names. Some programs change read names.
    If force_single is true, .1 and .2 will be added to read names so that
    they can be treated as single. If trim is true, then the last 2
    characters of read name will be trimmed. This is to remove .1 and .2
    from a SAM or BAM file.

    NOTE(review): trim is applied after force_single appends a suffix, so
    setting both would strip the suffix just added — confirm callers never
    combine the two.
    """
    read_name = r.query_name
    # standardise read accessions
    # hisat puts .R. before read number and removes .1 and .2
    read_name = re.sub(r'\.R\.', '.', read_name)
    if force_single and r.flag & 1:
        if r.flag & 64:
            read_name += '.1'
        else:
            read_name += '.2'

    if trim:
        read_name = read_name[:-2]

    return read_name



def get_introns_from_cigar(position, cigar):
    """Get intron start and stop positions from a SAM alignment position
    and pysam cigar tuples.

    position -- alignment start on the reference (caller chooses the base)
    cigar    -- iterable of (op, length) pysam cigar tuples

    Returns a list of (start, end) pairs, both ends inclusive, in the
    same coordinate system as position.
    """
    introns = []

    # ops that consume the reference: M (0), = (7), X (8), D (2);
    # N (3) is an intron; I (1) and S (4) consume only the query
    s_offset = position

    for op, num in cigar:
        if op in [0, 7, 8, 2]:
            s_offset += num
        elif op == 3:
            introns.append((s_offset, s_offset + num - 1))
            s_offset += num

    return introns


def get_exons_from_cigar(position, cigar):
    """Get exon start and stop positions from a SAM alignment position and
    pysam cigar tuples.  Exons are the reference spans between N (intron)
    operations; starts are reported shifted by one from position's base."""
    exons = []
    s_offset = position
    start = s_offset
    for op, num in cigar:
        # M (0), D (2), = (7), X (8) consume the reference within an exon
        if op in [0, 2, 7, 8]:
            s_offset += num
        elif op == 3:
            exons.append((start + 1, s_offset))
            s_offset += num
            start = s_offset

    exons.append((start + 1, s_offset))

    return exons


def get_exons(line):
    """Get a list of exons from a single SAM alignment"""
    return get_exons_from_cigar(line.reference_start, line.cigartuples)


def do_filter(r, filter_by):
    """Apply filter to an alignment, return True if alignment passes.

    filter_by -- dictionary mapping filter name to threshold/value:
      edit_distance      -- maximum NM tag value
      score              -- minimum score as computed by get_score()
      count              -- maximum NH tag value (number of placements)
      edit_distance_clip -- maximum NM plus soft-clipped bases
      read_id            -- container of allowed (normalized) read names

    Raises ValueError for an unknown filter name or a non-dict argument.
    """

    if not isinstance(filter_by, dict):
        raise ValueError('filter_by argument must be a dictionary')

    for k in filter_by:
        if k == 'edit_distance':
            edist = r.get_tag('NM')
            if edist > filter_by[k]:
                return False

        elif k == 'score':
            score = get_score(r)
            if score < filter_by[k]:
                return False

        elif k == 'count':
            count = r.get_tag('NH')
            if count > filter_by[k]:
                return False

        elif k == 'edit_distance_clip':
            # soft-clipped bases count toward the edit distance here
            edist = 0
            for op, num in r.cigartuples:
                if op == CIGAR_SOFT_CLIP:
                    edist += num
            edist += r.get_tag('NM')
            if edist > filter_by[k]:
                return False

        elif k == 'read_id':
            # same .R. normalization as get_standard_read_name
            read_name = r.query_name.replace('.R.', '.')
            if read_name not in filter_by[k]:
                return False

        else:
            raise ValueError('Unrecognised filter name: {0}'.format(k))


    return True


def get_introns_(stream, force_single, filter_by):
    """Get intron positions from a SAM stream.

    Returns a Counter keyed by Intron (strand is always None) holding the
    number of supporting alignments.  force_single is accepted for
    interface symmetry with get_introns_with_reads_ but is not used here.
    """
    introns = Counter()

    for read in stream:

        # skip unaligned reads
        if read.flag & 4:
            continue

        # apply alignment filters
        if filter_by is not None:
            # BUG FIX: the original called do_filter(r, filter_by); `r` is
            # undefined in this scope, so any use of filter_by crashed
            # with NameError.
            if not do_filter(read, filter_by):
                continue

        # reference name is per-read, not per-intron: look it up once
        subject = stream.getrname(read.reference_id)

        # read.reference_start is zero based
        for (f, t) in get_introns_from_cigar(read.reference_start + 1,
                                             read.cigartuples):
            introns[Intron(seqid = subject, start = f, end = t,
                           strand = None)] += 1

    return introns


def get_introns(filename, force_single, filter_by, trim):
    """Get intron positions from a SAM/BAM file.

    NOTE(review): trim is accepted but not forwarded to get_introns_ —
    confirm whether it should affect read-name handling here.
    """
    f = open_sam_or_bam(filename)
    introns = get_introns_(f, force_single, filter_by)
    f.close()
    return introns


def get_introns_with_reads_(stream, force_single, filter_by, trim, with_reads,
                            placements):
    """Get intron positions from a SAM stream.

    Depending on the arguments, the result maps each Intron to either a
    set of supporting read names (with_reads), a raw alignment count
    (placements is None), or a count weighted by 1/number-of-placements
    of the read.
    """
    introns = None
    if with_reads:
        introns = defaultdict(set)
    else:
        if placements is None:
            introns = defaultdict(int)
        else:
            introns = defaultdict(float)

    for r in stream:

        # skip unaligned reads (no CIGAR)
        if r.cigartuples is None:
            continue

        # apply alignment filters
        if filter_by is not None:
            if not do_filter(r, filter_by):
                continue

        # per-read invariants, hoisted out of the per-intron loop
        # (dead computations of strand and clipped-base count removed)
        subject = stream.getrname(r.reference_id)
        # standardise read accessions
        # hisat puts .R. before read number and removes .1 and .2
        read_name = get_standard_read_name(r, force_single = force_single,
                                           trim = trim)

        # r.reference_start is zero based
        for (f, t) in get_introns_from_cigar(r.reference_start + 1,
                                             r.cigartuples):
            i = Intron(seqid = subject, start = f, end = t, strand = None)
            if with_reads:
                introns[i].add(read_name)
            else:
                if placements is None:
                    introns[i] += 1
                else:
                    introns[i] += 1.0 / placements[read_name]

    return introns


def get_introns_with_reads(filename, force_single = False, filter_by = None,
                           trim = False, with_reads = False, placements = None):
    """Get intron positions with supporting reads from a SAM/BAM file;
    see get_introns_with_reads_ for the meaning of the arguments."""
    f = open_sam_or_bam(filename)
    introns = get_introns_with_reads_(f, force_single = force_single,
                                      filter_by = filter_by, trim = trim,
                                      with_reads = with_reads,
                                      placements = placements)
    f.close()
    return introns


def get_score(line):
    """Compute alignment score from CIGAR and edit distance for a single
    line of a SAM/BAM file.  This is preferred to relying on the AS tag so
    that mappers with different scoring schemes can be compared.

    Scoring: +1 per aligned base, -8 per gap base; mismatches (edit
    distance minus gap bases) cost the penalty plus the loss of the
    match bonus.
    """
    penalty = 8
    score = 0

    # get edit distance
    edit_dist = line.get_tag('NM')

    # cigartuples is the modern name for the deprecated .cigar alias
    for op, num in line.cigartuples:
        # score matches (and mismatches as matches)
        if op == CIGAR_MATCH:
            score += num
        # score gaps; NM counts inserted/deleted bases, so subtract them
        # from edit_dist to leave only mismatches
        elif op in [CIGAR_INSERTION, CIGAR_DELETION]:
            score -= num * penalty
            edit_dist -= num

    # add penalty for mismatches and subtract match scores for them
    score -= (edit_dist * penalty) + edit_dist
    return score

# ---------------------------------------------------------------------------
# (next file in dump: magicblast-tools/txt.py)
#============================================================================
#
# PUBLIC DOMAIN NOTICE
# National Center for Biotechnology Information
#
# This software/database is a "United States Government Work" under the
# terms of the United States Copyright Act. It was written as part of
# the author's official duties as a United States Government employee and
# thus cannot be copyrighted. This software/database is freely available
# to the public for use. The National Library of Medicine and the U.S.
# Government have not placed any restriction on its use or reproduction.
#
# Although all reasonable efforts have been taken to ensure the accuracy
# and reliability of the software and data, the NLM and the U.S.
# Government do not and cannot warrant the performance or results that
# may be obtained by using this software or data. The NLM and the U.S.
# Government disclaim all warranties, express or implied, including
# warranties of performance, merchantability or fitness for any particular
# purpose.
#
# Please cite the author in any work or product based on this material.
#
# ===========================================================================
#
# Author: Greg Boratyn boratyng@ncbi.nlm.nih.gov
#
# ---------------------------------------------------------------------------

"""Parse a text junctions file"""

from base import Intron

def get_introns(stream):
    """Get introns locations and return them as a dictionary.

    Each input line carries a location in its second whitespace-separated
    column, formatted as seqid:start-end; the reported intron is the span
    strictly between those coordinates (start + 1 .. end - 1).
    """
    result = {}
    for line in stream:
        location = line.rstrip().split()[1].split(':')
        seqid = location[0]
        bounds = location[1].split('-')
        key = Intron(seqid = seqid, start = int(bounds[0]) + 1,
                     end = int(bounds[1]) - 1, strand = None)
        result[key] = 1

    return result