├── bin ├── __init__.py ├── rename_gtf.py ├── fix_gtf_ids.py ├── get_longest_isoform.py ├── evidence.py ├── features.py ├── get_overlapping_genes.py ├── LICENSE.txt ├── tsebra.py ├── compleasm-LICENSE.txt ├── overlap_graph.py └── genome_anno.py ├── docs ├── .gitkeep └── TSEBRA_Logo.png ├── tests ├── __init__.py ├── graph │ ├── ex2_anno1.gtf │ ├── ex2_anno2.gtf │ ├── ex4_anno1.gtf │ ├── ex4_anno2.gtf │ ├── ex3_anno2.gtf │ ├── ex3_anno1.gtf │ ├── ex_feature_hint1.gff │ ├── ex1_anno2.gtf │ ├── ex1_anno1.gtf │ ├── ex_feature_hint2.gff │ ├── ex_feature_anno2.gtf │ └── ex_feature_anno1.gtf ├── evidence │ ├── hint1.gff │ ├── hint3.gff │ └── hint2.gff ├── genome_anno │ ├── tx1.gtf │ ├── missing_gid.gtf │ ├── format_error.gtf │ └── anno1.gtf ├── test_evidence.py ├── test_graph.py ├── test_genome_anno.py ├── combined.gtf └── prep_files.py ├── config ├── braker3.cfg ├── default.cfg ├── keep_ab_initio.cfg └── pref_braker1.cfg ├── example ├── run_tsebra_example.sh └── braker1_results │ └── hintsfile.gff └── README.md /bin/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /docs/.gitkeep: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /docs/TSEBRA_Logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Gaius-Augustus/TSEBRA/HEAD/docs/TSEBRA_Logo.png -------------------------------------------------------------------------------- /tests/graph/ex2_anno1.gtf: -------------------------------------------------------------------------------- 1 | 3R AUGUSTUS exon 200 300 0 + 0 
transcript_id "t1"; gene_id "t1_g"; 2 | 3R AUGUSTUS CDS 200 300 0 + 0 transcript_id "t1"; gene_id "t1_g"; 3 | 3R AUGUSTUS transcript 200 300 0 + 0 t1 -------------------------------------------------------------------------------- /tests/evidence/hint1.gff: -------------------------------------------------------------------------------- 1 | 3L ProtHint intron 5812862 5812941 24 - . src=M;mult=24;pri=4 2 | 3L ProtHint intron 12291242 12291299 8 - . transcript_id="t1" 3 | 3L ProtHint intron 12291242 12291299 8 - . src=M;pri=4 4 | 3L ProtHint intron 12291242 -------------------------------------------------------------------------------- /tests/graph/ex2_anno2.gtf: -------------------------------------------------------------------------------- 1 | 3R AUGUSTUS exon 100 200 0 + 0 transcript_id "t1"; gene_id "t1_g"; 2 | 3R AUGUSTUS CDS 100 200 0 + 0 transcript_id "t1"; gene_id "t1_g"; 3 | 3R AUGUSTUS transcript 100 200 0 + 0 t1 4 | 3R AUGUSTUS exon 301 400 0 + 0 transcript_id "t2"; gene_id "t2_g"; 5 | 3R AUGUSTUS CDS 301 400 0 + 0 transcript_id "t2"; gene_id "t2_g"; 6 | 3R AUGUSTUS transcript 301 400 0 + 0 t2 -------------------------------------------------------------------------------- /config/braker3.cfg: -------------------------------------------------------------------------------- 1 | # Weight for each hint source 2 | # Values have to be >= 0 3 | P 1 4 | E 20 5 | C 1 6 | M 1 7 | # Required fraction of supported introns or supported start/stop-codons for a transcript 8 | # Values have to be in [0,1] 9 | intron_support 1.0 10 | stasto_support 2 11 | # Allowed difference for each feature 12 | # Values have to be in [0,1] 13 | e_1 0.1 14 | e_2 0.5 15 | e_3 0.05 16 | e_4 0.2 -------------------------------------------------------------------------------- /config/default.cfg: -------------------------------------------------------------------------------- 1 | # Weight for each hint source 2 | # Values have to be >= 0 3 | P 1 4 | E 20 5 | C 1 6 | M 1 7 | # Required 
fraction of supported introns or supported start/stop-codons for a transcript 8 | # Values have to be in [0,1] 9 | intron_support 1.0 10 | stasto_support 2 11 | # Allowed difference for each feature 12 | # Values have to be in [0,1] 13 | e_1 0.1 14 | e_2 0.5 15 | e_3 0.05 16 | e_4 0.18 -------------------------------------------------------------------------------- /tests/graph/ex4_anno1.gtf: -------------------------------------------------------------------------------- 1 | 3R AUGUSTUS exon 100 200 0 + 0 transcript_id "t1"; gene_id "t1_g"; 2 | 3R AUGUSTUS CDS 100 200 0 + 0 transcript_id "t1"; gene_id "t1_g"; 3 | 3R AUGUSTUS intron 201 299 0 + 0 transcript_id "t1"; gene_id "t1_g"; 4 | 3R AUGUSTUS exon 300 400 0 + 0 transcript_id "t1"; gene_id "t1_g"; 5 | 3R AUGUSTUS CDS 300 400 0 + 0 transcript_id "t1"; gene_id "t1_g"; 6 | 3R AUGUSTUS transcript 100 400 0 + 0 t1 -------------------------------------------------------------------------------- /tests/graph/ex4_anno2.gtf: -------------------------------------------------------------------------------- 1 | 3R AUGUSTUS exon 101 201 1 + 0 transcript_id "t1"; gene_id "t1_g"; 2 | 3R AUGUSTUS CDS 101 201 1 + 0 transcript_id "t1"; gene_id "t1_g"; 3 | 3R AUGUSTUS intron 202 300 1 + 0 transcript_id "t1"; gene_id "t1_g"; 4 | 3R AUGUSTUS exon 301 401 1 + 0 transcript_id "t1"; gene_id "t1_g"; 5 | 3R AUGUSTUS CDS 301 401 1 + 0 transcript_id "t1"; gene_id "t1_g"; 6 | 3R AUGUSTUS transcript 101 401 1 + 0 t1 -------------------------------------------------------------------------------- /config/keep_ab_initio.cfg: -------------------------------------------------------------------------------- 1 | # Weight for each hint source 2 | # Values have to be >= 0 3 | P 0.1 4 | E 10 5 | C 5 6 | M 1 7 | # Required fraction of supported introns or supported start/stop-codons for a transcript 8 | # Values have to be in [0,1] 9 | intron_support 0 10 | stasto_support 1 11 | # Allowed difference for each feature 12 | # Values have to be in [0,1] 
13 | e_1 0.1 14 | e_2 0.5 15 | # Values have to be >0 16 | e_3 0.05 17 | e_4 0.18 -------------------------------------------------------------------------------- /config/pref_braker1.cfg: -------------------------------------------------------------------------------- 1 | # Weight for each hint source 2 | # Values have to be >= 0 3 | P 0.1 4 | E 10000 5 | C 5 6 | M 1 7 | # Required fraction of supported introns or supported start/stop-codons for a transcript 8 | # Values have to be in [0,1] 9 | intron_support 0.25 10 | stasto_support 2 11 | # Allowed difference for each feature 12 | # Values have to be in [0,1] 13 | e_1 0.25 14 | e_2 1 15 | # Values have to be >0 16 | e_3 0.05 17 | e_4 0.18 18 | -------------------------------------------------------------------------------- /tests/graph/ex3_anno2.gtf: -------------------------------------------------------------------------------- 1 | 3R AUGUSTUS exon 110 200 0 + 0 transcript_id "t1"; gene_id "t1_g"; 2 | 3R AUGUSTUS CDS 110 200 0 + 0 transcript_id "t1"; gene_id "t1_g"; 3 | 3R AUGUSTUS intron 201 799 0 + 0 transcript_id "t1"; gene_id "t1_g"; 4 | 3R AUGUSTUS exon 800 1000 0 + 0 transcript_id "t1"; gene_id "t1_g"; 5 | 3R AUGUSTUS CDS 800 1000 0 + 0 transcript_id "t1"; gene_id "t1_g"; 6 | 3R AUGUSTUS transcript 110 1000 0 + 0 t1 7 | 3R AUGUSTUS exon 350 450 0 + 0 transcript_id "t2"; gene_id "t2_g"; 8 | 3R AUGUSTUS CDS 350 450 0 + 0 transcript_id "t2"; gene_id "t2_g"; 9 | 3R AUGUSTUS transcript 350 450 0 + 0 t2 -------------------------------------------------------------------------------- /tests/graph/ex3_anno1.gtf: -------------------------------------------------------------------------------- 1 | 3R AUGUSTUS exon 100 300 0 + 0 transcript_id "t1"; gene_id "t1_g"; 2 | 3R AUGUSTUS CDS 100 300 0 + 0 transcript_id "t1"; gene_id "t1_g"; 3 | 3R AUGUSTUS intron 301 499 0 + 0 transcript_id "t1"; gene_id "t1_g"; 4 | 3R AUGUSTUS exon 500 700 0 + 0 transcript_id "t1"; gene_id "t1_g"; 5 | 3R AUGUSTUS CDS 500 700 0 + 0 
transcript_id "t1"; gene_id "t1_g"; 6 | 3R AUGUSTUS intron 701 899 0 + 0 transcript_id "t1"; gene_id "t1_g"; 7 | 3R AUGUSTUS exon 900 1100 0 + 0 transcript_id "t1"; gene_id "t1_g"; 8 | 3R AUGUSTUS CDS 900 1100 0 + 0 transcript_id "t1"; gene_id "t1_g"; 9 | 3R AUGUSTUS transcript 100 1100 0 + 0 t1 -------------------------------------------------------------------------------- /tests/genome_anno/tx1.gtf: -------------------------------------------------------------------------------- 1 | 3L GeneMark.hmm stop_codon 18462228 18462230 . - 0 gene_id "7789_g"; transcript_id "7789_t"; count "1_1"; 2 | 3L GeneMark.hmm CDS 18462228 18462540 . - 1 gene_id "7789_g"; transcript_id "7789_t"; evidence "0_1"; cds_type "Terminal"; count "2_2"; 3 | 3L GeneMark.hmm exon 18462228 18462540 0 - . gene_id "7789_g"; transcript_id "7789_t"; evidence "0_1"; cds_type "Terminal"; count "2_2"; 4 | 3L GeneMark.hmm CDS 18462719 18463068 . - 0 gene_id "7789_g"; transcript_id "7789_t"; evidence "1_0"; cds_type "Initial"; count "1_2"; 5 | 3L GeneMark.hmm exon 18462719 18463068 0 - . gene_id "7789_g"; transcript_id "7789_t"; evidence "1_0"; cds_type "Initial"; count "1_2"; 6 | -------------------------------------------------------------------------------- /tests/evidence/hint3.gff: -------------------------------------------------------------------------------- 1 | 3R AUGUSTUS start_codon 100 102 10 + . src=E;mult=2;pri=4 2 | 3R AUGUSTUS intron 501 599 10 + . src=E;mult=2;pri=4 3 | 3R AUGUSTUS intron 501 599 10 + . src=P;mult=14;pri=4 4 | 3R AUGUSTUS stop_codon 698 700 10 + . src=E;mult=2;pri=4 5 | 3R AUGUSTUS intron 801 899 10 + . src=E;mult=2;pri=4 6 | 2L AUGUSTUS intron 801 899 10 + . src=E;mult=2;pri=4 7 | 3R AUGUSTUS intron 801 899 10 + . src=P;mult=24;pri=4 8 | 3R AUGUSTUS intron 801 949 10 + . src=E;mult=2;pri=4 9 | 3R AUGUSTUS intron 801 899 10 + . src=E;mult=2;pri=4 10 | 3R AUGUSTUS intron 1001 1099 10 + . src=E;mult=2;pri=4 11 | 3R AUGUSTUS stop_codon 1198 1200 10 + . 
src=E;mult=2;pri=4 12 | 3R AUGUSTUS intron 1601 1699 10 + . src=E;mult=2;pri=4 -------------------------------------------------------------------------------- /tests/graph/ex_feature_hint1.gff: -------------------------------------------------------------------------------- 1 | 3R b2h intron 21737122 21737185 6 - . mult=6;pri=4;src=E 2 | 3R b2h intron 21738629 21738695 42 - . mult=42;pri=4;src=E 3 | 3R b2h intron 21738939 21739000 30 - . mult=30;pri=4;src=E 4 | 3R b2h intron 21740644 21741666 4 + . mult=4;pri=4;src=E 5 | 3R b2h intron 21741826 21741884 12 + . mult=12;pri=4;src=E 6 | 3R b2h intron 21742360 21742427 2 + . mult=2;pri=4;src=E 7 | 3R b2h intron 21743988 21744047 2 + . mult=2;pri=4;src=E 8 | 3R b2h intron 21745856 21746185 166 + . mult=166;pri=4;src=E 9 | 3R b2h intron 21746342 21746473 196 + . mult=196;pri=4;src=E 10 | 3R b2h intron 21747188 21747389 200 + . mult=200;pri=4;src=E 11 | 3R b2h intron 21748618 21748687 340 + . mult=340;pri=4;src=E 12 | -------------------------------------------------------------------------------- /tests/graph/ex1_anno2.gtf: -------------------------------------------------------------------------------- 1 | 3R AUGUSTUS exon 250 500 0 + 0 transcript_id "t1"; gene_id "t1_g"; 2 | 3R AUGUSTUS CDS 250 500 0 + 0 transcript_id "t1"; gene_id "t1_g"; 3 | 3R AUGUSTUS intron 501 599 0 + 0 transcript_id "t1"; gene_id "t1_g"; 4 | 3R AUGUSTUS exon 600 750 0 + 0 transcript_id "t1"; gene_id "t1_g"; 5 | 3R AUGUSTUS CDS 600 750 0 + 0 transcript_id "t1"; gene_id "t1_g"; 6 | 3R AUGUSTUS transcript 250 750 0 + 0 t1 7 | 3R AUGUSTUS exon 1050 1250 0 + 0 transcript_id "t2"; gene_id "t2_g"; 8 | 3R AUGUSTUS CDS 1050 1250 0 + 0 transcript_id "t2"; gene_id "t2_g"; 9 | 3R AUGUSTUS transcript 1050 1250 0 + 0 t2 10 | 3R AUGUSTUS exon 1700 1800 0 + 0 transcript_id "t3"; gene_id "t3_g"; 11 | 3R AUGUSTUS CDS 1700 1800 0 + 0 transcript_id "t3"; gene_id "t3_g"; 12 | 3R AUGUSTUS transcript 1700 1800 0 + 0 t3 
#!/bin/sh
# If this file is not executable, run: chmod +x run_tsebra_example.sh

# Abort at the first failing step so that TSEBRA is never started on
# input that a previous step failed to produce.
set -e

# Directory that contains this script. dirname also works when $0 holds
# no slash (e.g. `sh run_tsebra_example.sh`), where "${0%/*}" would
# return the script name instead of a directory.
c=$(dirname "$0")

# Prediction and hint files that are included in the standard output of a BRAKER run
b1="$c/braker1_results/braker.gtf"
b2="$c/braker2_results/braker.gtf"
h1="$c/braker1_results/hintsfile.gff"
h2="$c/braker2_results/hintsfile.gff"

# Create working directory
d="$c/tsebra_workdir"
mkdir -p "$d"

# Make sure that the transcript IDs of the BRAKER predictions are in order.
# This step is OPTIONAL and not necessary for a successful combination.

# printf is used instead of echo because the handling of "\n" by echo
# differs between shells.
printf '\n*** Fix possible ID errors in *.gtf files ***\n\n'

new_b1="$d/braker1.gtf"
new_b2="$d/braker2.gtf"
"$c/../bin/fix_gtf_ids.py" --gtf "$b1" --out "$new_b1"
"$c/../bin/fix_gtf_ids.py" --gtf "$b2" --out "$new_b2"
b1="$new_b1"
b2="$new_b2"

# Combine BRAKER1 and BRAKER2 predictions

o="$d/braker1+2.gtf"

printf '*** Running TSEBRA ***\n\n'

"$c/../bin/tsebra.py" -g "$b1,$b2" -c "$c/../config/default.cfg" -e "$h1,$h2" -o "$o"

printf '\n*** Finished. Result at: %s ***\n\n' "$o"
3R AUGUSTUS intron 801 899 0 + 0 transcript_id "t2"; gene_id "t2_g"; 10 | 3R AUGUSTUS exon 900 1000 0 + 0 transcript_id "t2"; gene_id "t2_g"; 11 | 3R AUGUSTUS CDS 900 1000 0 + 0 transcript_id "t2"; gene_id "t2_g"; 12 | 3R AUGUSTUS intron 1001 1099 0 + 0 transcript_id "t2"; gene_id "t2_g"; 13 | 3R AUGUSTUS exon 1100 1200 0 + 0 transcript_id "t2"; gene_id "t2_g"; 14 | 3R AUGUSTUS CDS 1100 1200 0 + 0 transcript_id "t2"; gene_id "t2_g"; 15 | 3R AUGUSTUS transcript 700 1200 0 + 0 t2 16 | 3R AUGUSTUS exon 1500 1600 0 + 0 transcript_id "t3"; gene_id "t3_g"; 17 | 3R AUGUSTUS CDS 1500 1600 0 + 0 transcript_id "t3"; gene_id "t3_g"; 18 | 3R AUGUSTUS transcript 1500 1600 0 + 0 t3 -------------------------------------------------------------------------------- /tests/evidence/hint2.gff: -------------------------------------------------------------------------------- 1 | 3L ProtHint intron 5812862 5812941 24 - . src=M;mult=24;pri=4 2 | 3L ProtHint intron 12291242 12291299 8 - . src=M;mult=8;pri=4 3 | 3R ProtHint intron 17440148 17440207 25 - . src=M;mult=25;pri=4 4 | 2R ProtHint intron 5760114 5760177 23 - . src=M;mult=23;pri=4 5 | 2R ProtHint intron 6210484 6210546 21 - . src=M;mult=21;pri=4 6 | 3L ProtHint intron 20527281 20527592 25 + . src=M;mult=25;pri=4 7 | 2L ProtHint intron 12400752 12400814 24 + . src=M;mult=24;pri=4 8 | 2R ProtHint intron 14988084 14988142 25 - . src=M;mult=25;pri=4 9 | 2L ProtHint intron 6667531 6667670 5 - . src=M;mult=5;pri=4 10 | 3R ProtHint intron 5537551 5537605 22 + . src=M;mult=22;pri=4 11 | 3R ProtHint intron 20813612 20813665 12 - . src=M;mult=12;pri=4 12 | X ProtHint intron 2145714 2147174 25 + . src=M;mult=25;pri=4 13 | 3L ProtHint intron 8114197 8114256 25 - . src=M;mult=25;pri=4 14 | X ProtHint intron 11048602 11048941 25 + . src=M;mult=25;pri=4 15 | 2L ProtHint intron 3807462 3807524 18 + . src=M;mult=18;pri=4 16 | 3R ProtHint intron 27059120 27059364 19 - . src=M;mult=19;pri=4 17 | 2R ProtHint intron 13821370 13821432 24 - . 
src=M;mult=24;pri=4 18 | X ProtHint intron 8173462 8173860 6 - . src=M;mult=6;pri=4 19 | X ProtHint intron 13270643 13271481 16 - . src=M;mult=16;pri=4 20 | X ProtHint intron 2079645 2079714 25 - . src=M;mult=25;pri=4 21 | -------------------------------------------------------------------------------- /tests/graph/ex_feature_hint2.gff: -------------------------------------------------------------------------------- 1 | 3R ProtHint intron 21747188 21747389 16 + . src=M;mult=16;pri=4 2 | 3R ProtHint intron 21742667 21742741 9 + . src=M;mult=9;pri=4 3 | 3R ProtHint intron 21742360 21742427 10 + . src=M;mult=10;pri=4 4 | 3R ProtHint intron 21745856 21746185 18 + . src=M;mult=18;pri=4 5 | 3R ProtHint intron 21740644 21741666 8 + . src=M;mult=8;pri=4 6 | 3R ProtHint intron 21740644 21741666 1 + . grp=7375_0:000e30_g7706;src=C;pri=4; 7 | 3R ProtHint intron 21741826 21741884 1 + . grp=7375_0:000e30_g7706;src=C;pri=4; 8 | 3R ProtHint intron 21742360 21742427 1 + . grp=7375_0:000e30_g7706;src=C;pri=4; 9 | 3R ProtHint intron 21742667 21742741 1 + . grp=7375_0:000e30_g7706;src=C;pri=4; 10 | 3R ProtHint intron 21743988 21744047 1 + . grp=7375_0:000e30_g7706;src=C;pri=4; 11 | 3R ProtHint intron 21745856 21746185 1 + . grp=7375_0:000e30_g7706;src=C;pri=4; 12 | 3R ProtHint intron 21746342 21746473 1 + . grp=7375_0:000e30_g7706;src=C;pri=4; 13 | 3R ProtHint intron 21747188 21747389 1 + . grp=7375_0:000e30_g7706;src=C;pri=4; 14 | 3R ProtHint intron 21748618 21748687 1 + . grp=7375_0:000e30_g7706;src=C;pri=4; 15 | 3R ProtHint intron 21743988 21744047 0 + . src=P;mult=2;pri=4; 16 | 3R ProtHint intron 21746342 21746473 2 + . src=P;mult=14;pri=4; 17 | 3R ProtHint intron 21741826 21741884 0 + . src=P;mult=3;pri=4; 18 | 3R ProtHint intron 21747188 21747389 2 + . src=P;mult=16;pri=4; 19 | 3R ProtHint intron 21742667 21742741 2 + . src=P;mult=9;pri=4; 20 | 3R ProtHint intron 21742360 21742427 2 + . src=P;mult=10;pri=4; 21 | 3R ProtHint intron 21745856 21746185 2 + . 
def compare_lists(list1, list2):
    """Assert that two lists of components are equal, ignoring all ordering.

    Each component is an iterable of hashable items. Neither the order of
    the components nor the order of the items inside a component matters.

    Args:
        list1: First list of components.
        list2: Second list of components.

    Raises:
        AssertionError: If the lists differ in length or do not contain the
            same components the same number of times.
    """
    # Local import keeps this helper self-contained.
    from collections import Counter

    assert len(list1) == len(list2)
    # Compare as multisets of frozensets. A one-directional membership test
    # would accept e.g. [[a], [a]] against [[a], [b]].
    components1 = Counter(frozenset(component) for component in list1)
    components2 = Counter(frozenset(component) for component in list2)
    assert components1 == components2
def main():
    """Rename the transcripts and genes of a GTF file.

    Reads the command line via ``parseCmd``, loads ``--gtf`` into an
    ``Anno`` object, renames the transcript IDs using the optional
    ``--prefix`` and writes the annotation to ``--out``. If
    ``--translation_tab`` is given, the rows returned by
    ``Anno.rename_tx_ids`` are written to that path as a tab-separated
    table.

    Raises:
        FileNotFound: If the path passed as ``--gtf`` does not exist.
    """
    # Parse the command line exactly once, and before the project import,
    # so that ``--help`` and usage errors do not depend on genome_anno.
    args = parseCmd()
    from genome_anno import Anno

    if not os.path.exists(args.gtf):
        raise FileNotFound('File not found: {}'.format(args.gtf))
    # An omitted --prefix is treated as the empty prefix.
    prefix = args.prefix if args.prefix else ''

    anno = Anno(args.gtf, id='')
    anno.addGtf()
    anno.norm_tx_format()
    anno.find_genes()
    tx_tab = anno.rename_tx_ids(prefix)
    anno.write_anno(args.out)
    if args.translation_tab:
        with open(args.translation_tab, 'w+') as file:
            out_writer = csv.writer(file, delimiter='\t', quotechar = "|", lineterminator = '\n')
            out_writer.writerows(tx_tab)
def start2int(line):
    """Convert the field at index 3 of a split line to an integer.

    The list is modified in place and the same list object is returned.
    Judging by the function name, index 3 presumably holds the start
    coordinate of a GTF line.

    Args:
        line: List of fields with at least four entries.

    Returns:
        The same list with ``line[3]`` replaced by its ``int`` value.
    """
    start = int(line[3])
    line[3] = start
    return line
def parseCmd():
    """Parse command line arguments

    Both ``--gtf`` and ``--out`` are required, so a missing option is
    reported as a usage error instead of failing later when the path is
    opened.

    Returns:
        argparse.Namespace: Parsed arguments with the attributes ``gtf``
        and ``out``.
    """
    parser = argparse.ArgumentParser(
        description='Fixes a transcript and gene ID error, where transcripts/genes '
                    'have the same ID on different chromosomes or strands.')
    parser.add_argument('--gtf', type=str, required=True,
        help='Path to the input gene prediction file in GTF format.')
    parser.add_argument('--out', type=str, required=True,
        help='Path to the output file.')
    return parser.parse_args()
transcript_id "g7701.t1"; gene_id "g7701"; 14 | 3R AUGUSTUS CDS 21742742 21743987 1 + 1 transcript_id "g7701.t1"; gene_id "g7701"; 15 | 3R AUGUSTUS exon 21742742 21743987 . + . transcript_id "g7701.t1"; gene_id "g7701"; 16 | 3R AUGUSTUS intron 21743988 21744047 1 + . transcript_id "g7701.t1"; gene_id "g7701"; 17 | 3R AUGUSTUS CDS 21744048 21744355 0.52 + 0 transcript_id "g7701.t1"; gene_id "g7701"; 18 | 3R AUGUSTUS exon 21744048 21744355 . + . transcript_id "g7701.t1"; gene_id "g7701"; 19 | 3R AUGUSTUS intron 21744356 21745282 0.52 + . transcript_id "g7701.t1"; gene_id "g7701"; 20 | 3R AUGUSTUS CDS 21745283 21745855 0.53 + 1 transcript_id "g7701.t1"; gene_id "g7701"; 21 | 3R AUGUSTUS exon 21745283 21745855 . + . transcript_id "g7701.t1"; gene_id "g7701"; 22 | 3R AUGUSTUS intron 21745856 21746185 1 + . transcript_id "g7701.t1"; gene_id "g7701"; 23 | 3R AUGUSTUS CDS 21746186 21746341 1 + 1 transcript_id "g7701.t1"; gene_id "g7701"; 24 | 3R AUGUSTUS exon 21746186 21746341 . + . transcript_id "g7701.t1"; gene_id "g7701"; 25 | 3R AUGUSTUS intron 21746342 21746473 1 + . transcript_id "g7701.t1"; gene_id "g7701"; 26 | 3R AUGUSTUS CDS 21746474 21747187 1 + 1 transcript_id "g7701.t1"; gene_id "g7701"; 27 | 3R AUGUSTUS exon 21746474 21747187 . + . transcript_id "g7701.t1"; gene_id "g7701"; 28 | 3R AUGUSTUS intron 21747188 21747389 1 + . transcript_id "g7701.t1"; gene_id "g7701"; 29 | 3R AUGUSTUS CDS 21747390 21748617 1 + 1 transcript_id "g7701.t1"; gene_id "g7701"; 30 | 3R AUGUSTUS exon 21747390 21748617 . + . transcript_id "g7701.t1"; gene_id "g7701"; 31 | 3R AUGUSTUS intron 21748618 21748687 1 + . transcript_id "g7701.t1"; gene_id "g7701"; 32 | 3R AUGUSTUS CDS 21748688 21748924 1 + 0 transcript_id "g7701.t1"; gene_id "g7701"; 33 | 3R AUGUSTUS transcript 21740168 21748924 0.52 + . g7701.t1 34 | 3R AUGUSTUS exon 21748688 21748924 . + . transcript_id "g7701.t1"; gene_id "g7701"; 35 | 3R AUGUSTUS stop_codon 21748922 21748924 . 
@pytest.fixture
def file_tx1():
    """Provide the rows of the file at ``tx1`` as lists of tab-separated fields."""
    with open(tx1, 'r') as handle:
        # Materialize all rows while the file is still open.
        rows = list(csv.reader(handle, delimiter='\t'))
    return rows
def test_transcript_add_lines(transcript_tx1, file_tx1):
    """Check the lines stored in a transcript against the lines of its file.

    The transcript must hold as many lines as the file contains, and every
    stored line must occur in the file.
    """
    # Flatten the per-key line lists; the name avoids shadowing the builtin
    # ``list``.
    stored_lines = [
        line
        for lines in transcript_tx1.transcript_lines.values()
        for line in lines
    ]
    assert len(stored_lines) == len(file_tx1)
    for line in stored_lines:
        assert line in file_tx1
class ConfigFileError(Exception):
    """Exception type for configuration file problems."""
    pass


class GeneSetMissing(Exception):
    """Exception type for a missing gene set."""
    pass


# Module-level state: init() fills gtf/out/v/quiet from the command line,
# main() fills anno and graph.
gtf = []
anno = []
hintfiles = []
graph = None
out = ''
v = 0
quiet = False
parameter = {'intron_support' : 0, 'stasto_support' : 0, \
    'e_1' : 0, 'e_2' : 0, 'e_3' : 0, 'e_4' : 0}


def main():
    """
        Read all gene prediction files, build the overlap graph and write,
        for every connected component, the transcript with the longest
        coding sequence to the output file.
    """
    from genome_anno import Anno
    from overlap_graph import Graph

    global anno, graph, parameter

    args = parseCmd()
    init(args)

    if v > 0:
        print(gtf)

    # read gene prediction files
    for c, g in enumerate(gtf):
        if not quiet:
            sys.stderr.write(f'### READING GENE PREDICTION: [{g}]\n')
        anno.append(Anno(g, f'anno{c+1}'))
        anno[-1].addGtf()
        anno[-1].norm_tx_format()

    # create graph with a node for each unique transcript
    # and an edge if two transcripts overlap
    # two transcripts overlap if they share at least 3 adjacent protein coding nucleotides
    graph = Graph(anno, para=parameter, verbose=v)
    if not quiet:
        sys.stderr.write('### BUILD OVERLAP GRAPH\n')
    graph.build()

    combined_anno = Anno('', 'combined_annotation')
    # for each gene locus, choose the transcript with the longest coding sequence
    if not quiet:
        sys.stderr.write('### CHOOSE LONGEST ISOFORM FOR EACH GENE\n')
    for i, comp in enumerate(graph.connected_components()):
        # sorted(...)[-1] (not max()) keeps the original choice among
        # transcripts with equal coding sequence length.
        tx_longest = sorted([graph.__tx_from_key__(n) for n in comp],
            key=lambda t: t.get_cds_len())[-1]
        tx_longest.set_gene_id(f'g_{i+1}')
        # prefix the transcript id with the name of the annotation it came from
        tx_longest.id = f'{tx_longest.source_anno}.{tx_longest.id}'
        combined_anno.transcripts.update({tx_longest.id : tx_longest})
    combined_anno.find_genes()
    combined_anno.write_anno(out)

    if not quiet:
        sys.stderr.write('### FINISHED\n\n')
        sys.stderr.write('### The longest isoforms are located at {}.\n'.format(\
            out))


def init(args):
    """
        Copy the parsed command line arguments into the module-level settings.

        Args:
            args (argparse.Namespace): Result of parseCmd().
    """
    global gtf, out, v, quiet
    if args.gtf:
        # drop empty entries caused by doubled or trailing commas, they
        # would otherwise be treated as a file with an empty path
        gtf = [path for path in args.gtf.split(',') if path]
    if args.out:
        out = args.out
    if args.verbose:
        v = args.verbose
    if args.quiet:
        quiet = True


def parseCmd():
    """Parse command line arguments

    Returns:
        argparse.Namespace: Namespace with the attributes gtf, out,
            quiet and verbose.
    """
    parser = argparse.ArgumentParser(description='Combine gene sets by choosing ' \
        'the isoform with the longest coding sequence for each gene locus.')
    parser.add_argument('-g', '--gtf', type=str, required=True,
        help='List (separated by commas) of gene prediction files in gtf.\n' \
            + '(e.g. gene_pred1.gtf,gene_pred2.gtf,gene_pred3.gtf)')
    parser.add_argument('-o', '--out', type=str, required=True,
        help='Outputfile for the combined gene prediction in gtf.')
    parser.add_argument('-q', '--quiet', action='store_true',
        help='Quiet mode.')
    parser.add_argument('-v', '--verbose', type=int, default=0,
        help='Verbosity level (integer, default: 0). Values greater than 0 ' \
            'print additional information.')
    return parser.parse_args()


if __name__ == '__main__':
    main()
transcript_id "g7604.t1"; gene_id "g7604"; 17 | 3R AUGUSTUS CDS 21741667 21741825 1 + 1 transcript_id "g7604.t1"; gene_id "g7604"; 18 | 3R AUGUSTUS exon 21741667 21741825 . + . transcript_id "g7604.t1"; gene_id "g7604"; 19 | 3R AUGUSTUS intron 21741826 21741884 1 + . transcript_id "g7604.t1"; gene_id "g7604"; 20 | 3R AUGUSTUS CDS 21741885 21742359 1 + 1 transcript_id "g7604.t1"; gene_id "g7604"; 21 | 3R AUGUSTUS exon 21741885 21742359 . + . transcript_id "g7604.t1"; gene_id "g7604"; 22 | 3R AUGUSTUS intron 21742360 21742427 1 + . transcript_id "g7604.t1"; gene_id "g7604"; 23 | 3R AUGUSTUS CDS 21742428 21742666 1 + 0 transcript_id "g7604.t1"; gene_id "g7604"; 24 | 3R AUGUSTUS exon 21742428 21742666 . + . transcript_id "g7604.t1"; gene_id "g7604"; 25 | 3R AUGUSTUS intron 21742667 21742741 0.84 + . transcript_id "g7604.t1"; gene_id "g7604"; 26 | 3R AUGUSTUS CDS 21742742 21743987 0.79 + 1 transcript_id "g7604.t1"; gene_id "g7604"; 27 | 3R AUGUSTUS exon 21742742 21743987 . + . transcript_id "g7604.t1"; gene_id "g7604"; 28 | 3R AUGUSTUS intron 21743988 21744047 1 + . transcript_id "g7604.t1"; gene_id "g7604"; 29 | 3R AUGUSTUS CDS 21744048 21744359 0.68 + 0 transcript_id "g7604.t1"; gene_id "g7604"; 30 | 3R AUGUSTUS exon 21744048 21744359 . + . transcript_id "g7604.t1"; gene_id "g7604"; 31 | 3R AUGUSTUS stop_codon 21744357 21744359 . + 0 transcript_id "g7604.t1"; gene_id "g7604"; 32 | 3R AUGUSTUS start_codon 21745305 21745307 . + 0 transcript_id "g7605.t1"; gene_id "g7605"; 33 | 3R AUGUSTUS CDS 21745305 21745855 0.69 + 0 transcript_id "g7605.t1"; gene_id "g7605"; 34 | 3R AUGUSTUS exon 21745305 21745855 . + . transcript_id "g7605.t1"; gene_id "g7605"; 35 | 3R AUGUSTUS transcript 21745305 21748924 0.49 + . g7605.t1 36 | 3R AUGUSTUS intron 21745856 21746185 1 + . transcript_id "g7605.t1"; gene_id "g7605"; 37 | 3R AUGUSTUS CDS 21746186 21746341 1 + 1 transcript_id "g7605.t1"; gene_id "g7605"; 38 | 3R AUGUSTUS exon 21746186 21746341 . + . 
transcript_id "g7605.t1"; gene_id "g7605"; 39 | 3R AUGUSTUS intron 21746342 21746473 1 + . transcript_id "g7605.t1"; gene_id "g7605"; 40 | 3R AUGUSTUS CDS 21746474 21747187 1 + 1 transcript_id "g7605.t1"; gene_id "g7605"; 41 | 3R AUGUSTUS exon 21746474 21747187 . + . transcript_id "g7605.t1"; gene_id "g7605"; 42 | 3R AUGUSTUS intron 21747188 21747389 1 + . transcript_id "g7605.t1"; gene_id "g7605"; 43 | 3R AUGUSTUS CDS 21747390 21748617 1 + 1 transcript_id "g7605.t1"; gene_id "g7605"; 44 | 3R AUGUSTUS exon 21747390 21748617 . + . transcript_id "g7605.t1"; gene_id "g7605"; 45 | 3R AUGUSTUS intron 21748618 21748687 1 + . transcript_id "g7605.t1"; gene_id "g7605"; 46 | 3R AUGUSTUS CDS 21748688 21748924 0.71 + 0 transcript_id "g7605.t1"; gene_id "g7605"; 47 | 3R AUGUSTUS exon 21748688 21748924 . + . transcript_id "g7605.t1"; gene_id "g7605"; 48 | 3R AUGUSTUS stop_codon 21748922 21748924 . + 0 transcript_id "g7605.t1"; gene_id "g7605"; -------------------------------------------------------------------------------- /tests/graph/ex_feature_anno1.gtf: -------------------------------------------------------------------------------- 1 | 3R AUGUSTUS stop_codon 21737497 21737499 . - 0 transcript_id "g7603.t1"; gene_id "g7603"; 2 | 3R AUGUSTUS CDS 21737497 21737706 0.99 - 0 transcript_id "g7603.t1"; gene_id "g7603"; 3 | 3R AUGUSTUS exon 21737497 21737706 . - . transcript_id "g7603.t1"; gene_id "g7603"; 4 | 3R AUGUSTUS transcript 21737497 21738709 0.98 - . g7603.t1 5 | 3R AUGUSTUS intron 21737707 21738606 0.99 - . transcript_id "g7603.t1"; gene_id "g7603"; 6 | 3R AUGUSTUS CDS 21738607 21738628 0.99 - 1 transcript_id "g7603.t1"; gene_id "g7603"; 7 | 3R AUGUSTUS exon 21738607 21738628 . - . transcript_id "g7603.t1"; gene_id "g7603"; 8 | 3R AUGUSTUS intron 21738629 21738695 1 - . transcript_id "g7603.t1"; gene_id "g7603"; 9 | 3R AUGUSTUS CDS 21738696 21738709 1 - 0 transcript_id "g7603.t1"; gene_id "g7603"; 10 | 3R AUGUSTUS exon 21738696 21738709 . - . 
transcript_id "g7603.t1"; gene_id "g7603"; 11 | 3R AUGUSTUS start_codon 21738707 21738709 . - 0 transcript_id "g7603.t1"; gene_id "g7603"; 12 | 3R AUGUSTUS start_codon 21740168 21740170 . + 0 transcript_id "g7604.t1"; gene_id "g7604"; 13 | 3R AUGUSTUS CDS 21740168 21740643 1 + 0 transcript_id "g7604.t1"; gene_id "g7604"; 14 | 3R AUGUSTUS exon 21740168 21740643 . + . transcript_id "g7604.t1"; gene_id "g7604"; 15 | 3R AUGUSTUS transcript 21740168 21744359 0.53 + . g7604.t1 16 | 3R AUGUSTUS intron 21740644 21741666 1 + . transcript_id "g7604.t1"; gene_id "g7604"; 17 | 3R AUGUSTUS CDS 21741667 21741825 1 + 1 transcript_id "g7604.t1"; gene_id "g7604"; 18 | 3R AUGUSTUS exon 21741667 21741825 . + . transcript_id "g7604.t1"; gene_id "g7604"; 19 | 3R AUGUSTUS intron 21741826 21741884 1 + . transcript_id "g7604.t1"; gene_id "g7604"; 20 | 3R AUGUSTUS CDS 21741885 21742359 1 + 1 transcript_id "g7604.t1"; gene_id "g7604"; 21 | 3R AUGUSTUS exon 21741885 21742359 . + . transcript_id "g7604.t1"; gene_id "g7604"; 22 | 3R AUGUSTUS intron 21742360 21742427 1 + . transcript_id "g7604.t1"; gene_id "g7604"; 23 | 3R AUGUSTUS CDS 21742428 21742666 1 + 0 transcript_id "g7604.t1"; gene_id "g7604"; 24 | 3R AUGUSTUS exon 21742428 21742666 . + . transcript_id "g7604.t1"; gene_id "g7604"; 25 | 3R AUGUSTUS intron 21742667 21742741 0.84 + . transcript_id "g7604.t1"; gene_id "g7604"; 26 | 3R AUGUSTUS CDS 21742742 21743987 0.79 + 1 transcript_id "g7604.t1"; gene_id "g7604"; 27 | 3R AUGUSTUS exon 21742742 21743987 . + . transcript_id "g7604.t1"; gene_id "g7604"; 28 | 3R AUGUSTUS intron 21743988 21744047 1 + . transcript_id "g7604.t1"; gene_id "g7604"; 29 | 3R AUGUSTUS CDS 21744048 21744359 0.68 + 0 transcript_id "g7604.t1"; gene_id "g7604"; 30 | 3R AUGUSTUS exon 21744048 21744359 . + . transcript_id "g7604.t1"; gene_id "g7604"; 31 | 3R AUGUSTUS stop_codon 21744357 21744359 . + 0 transcript_id "g7604.t1"; gene_id "g7604"; 32 | 3R AUGUSTUS start_codon 21745305 21745307 . 
+ 0 transcript_id "g7605.t1"; gene_id "g7605"; 33 | 3R AUGUSTUS CDS 21745305 21745855 0.69 + 0 transcript_id "g7605.t1"; gene_id "g7605"; 34 | 3R AUGUSTUS exon 21745305 21745855 . + . transcript_id "g7605.t1"; gene_id "g7605"; 35 | 3R AUGUSTUS transcript 21745305 21748924 0.49 + . g7605.t1 36 | 3R AUGUSTUS intron 21745856 21746185 1 + . transcript_id "g7605.t1"; gene_id "g7605"; 37 | 3R AUGUSTUS CDS 21746186 21746341 1 + 1 transcript_id "g7605.t1"; gene_id "g7605"; 38 | 3R AUGUSTUS exon 21746186 21746341 . + . transcript_id "g7605.t1"; gene_id "g7605"; 39 | 3R AUGUSTUS intron 21746342 21746473 1 + . transcript_id "g7605.t1"; gene_id "g7605"; 40 | 3R AUGUSTUS CDS 21746474 21747187 1 + 1 transcript_id "g7605.t1"; gene_id "g7605"; 41 | 3R AUGUSTUS exon 21746474 21747187 . + . transcript_id "g7605.t1"; gene_id "g7605"; 42 | 3R AUGUSTUS intron 21747188 21747389 1 + . transcript_id "g7605.t1"; gene_id "g7605"; 43 | 3R AUGUSTUS CDS 21747390 21748617 1 + 1 transcript_id "g7605.t1"; gene_id "g7605"; 44 | 3R AUGUSTUS exon 21747390 21748617 . + . transcript_id "g7605.t1"; gene_id "g7605"; 45 | 3R AUGUSTUS intron 21748618 21748687 1 + . transcript_id "g7605.t1"; gene_id "g7605"; 46 | 3R AUGUSTUS CDS 21748688 21748924 0.71 + 0 transcript_id "g7605.t1"; gene_id "g7605"; 47 | 3R AUGUSTUS exon 21748688 21748924 . + . transcript_id "g7605.t1"; gene_id "g7605"; 48 | 3R AUGUSTUS stop_codon 21748922 21748924 . 
class NotGtfFormat(Exception):
    """Raised by Hint when a line does not consist of exactly nine columns."""
    pass


class AttributeMissing(Exception):
    """Raised by Hint when the attribute column contains no 'src='."""
    pass


class Hint:
    """
        Class handling the data structures and methods for a hint
    """
    def __init__(self, line):
        """
            Create a hint from a gff line. The line has to include 'src=' as
            an attribute in the last column. The types 'start_codon' and
            'stop_codon' are stored as 'start' and 'stop'.

            Args:
                line (list(str)): GFF line for one hint from extrinsic evidence.

            Raises:
                NotGtfFormat: If the line does not have nine columns.
                AttributeMissing: If the last column has no 'src=' attribute.
        """
        if len(line) != 9:
            raise NotGtfFormat('File not in gtf Format. Error at line: {}'.format(line))
        self.chr, self.source_program, self.type, self.start, self.end, \
            self.score, self.strand, self.phase, attribute = line
        self.start = int(self.start)
        self.end = int(self.end)

        try:
            self.src = attribute.split('src=')[1].split(';')[0]
        except IndexError:
            raise AttributeMissing('Source of Hint is missing in line {}.'.format(line))
        self.score = float(self.score)

        # multiplicity defaults to 1 if no 'mult=' attribute is given
        self.mult = 1
        if 'mult=' in attribute:
            self.mult = int(attribute.split('mult=')[1].split(';')[0])

        # priority is kept as a string, empty if no 'pri=' attribute is given
        self.pri = ''
        if 'pri=' in attribute:
            self.pri = attribute.split('pri=')[1].split(';')[0]

        if self.type == 'stop_codon':
            self.type = 'stop'
        elif self.type == 'start_codon':
            self.type = 'start'

    def hint2list(self):
        """
            Returns:
                line (list): GFF line for the hint. Start, end and score are
                    numbers; 'mult' is only listed if it is greater than 1
                    and 'pri' only if it is not empty.
        """
        attribute = ['src=' + self.src]
        if int(self.mult) > 1:
            attribute.append('mult={}'.format(self.mult))
        if self.pri:
            attribute.append('pri={}'.format(self.pri))
        return [self.chr, self.source_program, self.type, self.start, self.end, \
            self.score, self.strand, self.phase, ';'.join(attribute)]


class Hintfile:
    """
        Class handling the data structures and methods for a hintfile
    """
    def __init__(self, path):
        """
            Args:
                path (str): Path to the hintfile.
        """
        # dictionary containing evidence
        # self.hints[chromosome_id] = [Hints()]
        self.hints = {}
        # dictionary with self.src[src] = sum_of_all_mults_of_hints_from_src
        self.src = {}
        self.read_file(path)

    def read_file(self, path):
        """
            Read a tab-separated gff file with hints and add one Hint per
            line to self.hints. Blank lines and lines starting with '#'
            are skipped.

            Args:
                path (str): Path to the hintfile.
        """
        with open(path, 'r') as file:
            hints_csv = csv.reader(file, delimiter='\t')
            for line in hints_csv:
                # csv.reader returns an empty list for a blank line, so the
                # line has to be checked before its first column is read
                if not line or line[0].startswith('#'):
                    continue
                new_hint = Hint(line)
                self.hints.setdefault(new_hint.chr, []).append(new_hint)
                self.src[new_hint.src] = self.src.get(new_hint.src, 0) \
                    + new_hint.mult


class Evidence:
    """
        Class handling the data structures and methods for extrinsic evidence
        from one or more hintfiles.
    """
    def __init__(self):
        # hint_keys[chr][start_end_type_strand][src] = multiplicity
        self.hint_keys = {}
        # src[src] = sum of the multiplicities of all hints from src
        self.src = {}

    @staticmethod
    def _hint_key(start, end, type, strand):
        """Return the key 'start_end_type_strand' used in self.hint_keys[chr]."""
        return '{}_{}_{}_{}'.format(start, end, type, strand)

    def add_hintfile(self, path_to_hintfile):
        """
            Read a hintfile and add the multiplicities of its hints to
            self.src and self.hint_keys.

            Args:
                path_to_hintfile (str): Path to the hintfile.
        """
        hintfile = Hintfile(path_to_hintfile)
        for s, mult_sum in hintfile.src.items():
            self.src[s] = self.src.get(s, 0) + mult_sum
        for chr, hints in hintfile.hints.items():
            chr_keys = self.hint_keys.setdefault(chr, {})
            for hint in hints:
                new_key = self._hint_key(hint.start, hint.end, \
                    hint.type, hint.strand)
                src_mult = chr_keys.setdefault(new_key, {})
                src_mult[hint.src] = src_mult.get(hint.src, 0) + int(hint.mult)

    def get_hint(self, chr, start, end, type, strand):
        """
            Look up the evidence for one feature.

            Args:
                chr (str): Sequence name.
                start (int): Start coordinate.
                end (int): End coordinate.
                type (str): Feature type; 'start_codon' and 'stop_codon'
                    are mapped to 'start' and 'stop'.
                strand (str): Strand of the feature.

            Returns:
                (dict): Multiplicity for each hint source, empty if there
                    is no matching hint.
        """
        if type == 'start_codon':
            type = 'start'
        elif type == 'stop_codon':
            type = 'stop'
        key = self._hint_key(start, end, type, strand)
        if chr in self.hint_keys and key in self.hint_keys[chr]:
            return self.hint_keys[chr][key]
        return {}
class Node_features:
    """
        Class handling the features for a transcript.
        Features are scores that characterize the support of the transcript
        by extrinsic evidence in different ways.
    """
    def __init__(self, tx, evi, hint_source_weight=None):
        """
            Args:
                tx (Transcript): Transcript class object containing a transcript.
                evi (Evidence): Evidence class object containing all extrinsic evidence.
                hint_source_weight (dict(int)): Weights for each evidence source.
                    Defaults to {'P' : 1, 'E' : 20, 'C' : 1, 'M' : 1}.
        """
        # build the default per instance, a dict literal as default argument
        # would be one object shared by all instances
        if hint_source_weight is None:
            hint_source_weight = {'P' : 1, 'E' : 20, 'C' : 1, 'M' : 1}
        self.sw = hint_source_weight
        self.scores = []
        # small constant added before taking the logarithm, avoids log(0)
        self.epsi = 1e-5
        # evi_list[type] = list of hint dicts (source -> multiplicity), one
        # entry for each line of that type in tx that has a hint
        self.evi_list = {'intron' : [], 'start_codon' : [], 'stop_codon': []}
        self.numb_introns = 0
        self.__init_hints__(tx, evi)
        # feature vector specifies the support of
        # introns, start/stop codons for a transcript
        # self.feature_vector[0] : (supported introns by evidence of tx) / (number of introns in tx)
        # self.feature_vector[1] : (supported start/stop codons by evidence of tx) / 2
        # self.feature_vector[2] : log of the weighted sum of multiplicities of intron evidence for tx
        # self.feature_vector[3] : log of the weighted sum of multiplicities of start/stop codon evidence for tx
        self.feature_vector = self.create_feature_vec()

    def __init_hints__(self, tx, evi):
        """
            Collect hints from evi that support tx.

            Args:
                tx (Transcript): Transcript class object containing a transcript.
                evi (Evidence): Evidence class object containing all extrinsic evidence.
        """
        for feature_type in ['intron', 'start_codon', 'stop_codon']:
            for line in tx.transcript_lines[feature_type]:
                # line columns: 0 = chr, 2 = type, 3 = start, 4 = end, 6 = strand
                hint = evi.get_hint(line[0], line[3], line[4], line[2], \
                    line[6])
                if hint:
                    self.evi_list[feature_type].append(hint)
        self.numb_introns = len(tx.transcript_lines['intron'])

    def create_feature_vec(self):
        """
            Compute all features.

            Returns:
                (list(float)): List of the four feature scores.
        """
        return [self.relative_support(['intron'], self.numb_introns), \
            self.relative_support(['start_codon', 'stop_codon'], 2.0),
            self.absolute_support(['intron']), \
            self.absolute_support(['start_codon', 'stop_codon'])]

    def _weighted_sum(self, gene_feature_types):
        """
            Sum weight * multiplicity over all sources of all hints that
            were collected for gene_feature_types.

            Raises:
                KeyError: If a hint source has no weight in self.sw.
        """
        score = 0.0
        for feature_type in gene_feature_types:
            for hint in self.evi_list[feature_type]:
                for src in hint:
                    score += self.sw[src] * hint[src]
        return score

    def relative_support(self, gene_feature_types, abs_numb):
        """
            Compute relative support of introns or start/stop-codons.

            Args:
                gene_feature_types (list(str)): Either introns or start/stop-codons
                abs_numb (int): absolute number of gene_feature_type in tx
                                (e.g. number of introns in tx)

            Returns:
                (float): Number of supported features divided by abs_numb,
                    1 if abs_numb is not greater than 0.
        """
        if abs_numb > 0:
            hint_numb = 0
            for feature_type in gene_feature_types:
                hint_numb += len(self.evi_list[feature_type])
            return hint_numb / abs_numb
        return 1

    def absolute_support(self, gene_feature_types):
        """
            Compute absolute support of introns or start/stop-codons.

            Args:
                gene_feature_types (list(str)): Either introns or start/stop-codons

            Returns:
                (float): Logarithm of the sum of multiplicity*weight of the
                    supporting hints for gene_feature_types (plus self.epsi).
        """
        return np.log(self._weighted_sum(gene_feature_types) + self.epsi)

    # currently not used
    def mean_support(self, gene_feature_types, abs_numb):
        """
            Compute mean support of introns or start/stop-codons.

            Args:
                gene_feature_types (list(str)): Either introns or start/stop-codons
                abs_numb (int): absolute number of gene_feature_type in tx

            Returns:
                (float): Logarithm of the sum of multiplicity*weight of the
                    supporting hints divided by abs_numb (plus self.epsi),
                    log(self.epsi) if abs_numb is not greater than 0.
        """
        if abs_numb > 0:
            score = self._weighted_sum(gene_feature_types)
            return np.log((score / abs_numb)+self.epsi)
        return np.log(self.epsi)

    # currently not used
    def min_support(self, gene_feature_types, abs_numb):
        """
            Compute minimal support of introns or start/stop-codons.

            Args:
                gene_feature_types (list(str)): Either introns or start/stop-codons
                abs_numb (int): absolute number of gene_feature_type in tx

            Returns:
                (float): Logarithm of the smallest multiplicity*weight sum of
                    a single supporting hint (plus self.epsi). log(self.epsi)
                    if a type has fewer than abs_numb hints or if abs_numb is
                    not greater than 0.
        """
        for feature_type in gene_feature_types:
            if len(self.evi_list[feature_type]) < abs_numb:
                return np.log(self.epsi)
        if abs_numb > 0:
            # start at infinity so the true minimum is found for any magnitude
            score = float('inf')
            for feature_type in gene_feature_types:
                for hint in self.evi_list[feature_type]:
                    new_score = 0
                    for src in hint:
                        new_score += self.sw[src] * hint[src]
                    score = min(score, new_score)
            return np.log(score+self.epsi)
        return np.log(self.epsi)

    def get_features(self):
        """
            Returns:
                (list(float)): List of feature scores.
        """
        return self.feature_vector
gene_id "7789_g"; transcript_id "7789_t"; evidence "1_0"; cds_type "Initial"; count "1_2"; 6 | 3L GeneMark.hmm start_codon 18463066 18463068 . - 0 gene_id "7789_g"; transcript_id "7789_t"; count "1_1"; 7 | 3R AUGUSTUS start_codon 7686444 7686446 . + 0 transcript_id "g5980.t1"; 8 | 3R AUGUSTUS CDS 7686444 7686623 1 + 0 transcript_id "g5980.t1"; gene_id "g5980"; 9 | 3R AUGUSTUS exon 7686444 7686623 . + . transcript_id "g5980.t1"; gene_id "g5980"; 10 | 3R AUGUSTUS intron 7686624 7690691 1 + . transcript_id "g5980.t1"; gene_id "g5980"; 11 | 3R AUGUSTUS CDS 7690692 7690843 1 + 0 transcript_id "g5980.t1"; gene_id "g5980"; 12 | 3R AUGUSTUS exon 7690692 7690843 . + . transcript_id "g5980.t1"; gene_id "g5980"; 13 | 3R AUGUSTUS intron 7690844 7691514 1 + . transcript_id "g5980.t1"; gene_id "g5980"; 14 | 3R AUGUSTUS CDS 7691515 7691630 1 + 1 transcript_id "g5980.t1"; gene_id "g5980"; 15 | 3R AUGUSTUS exon 7691515 7691630 . + . transcript_id "g5980.t1"; gene_id "g5980"; 16 | 3R AUGUSTUS intron 7691631 7691712 1 + . transcript_id "g5980.t1"; gene_id "g5980"; 17 | 3R AUGUSTUS CDS 7691713 7693700 1 + 2 transcript_id "g5980.t1"; gene_id "g5980"; 18 | 3R AUGUSTUS gene 7686444 7693700 1 + . g5980 19 | 3R AUGUSTUS transcript 7686444 7693700 1 + . g5980.t1 20 | 3R AUGUSTUS exon 7691713 7693700 . + . transcript_id "g5980.t1"; gene_id "g5980"; 21 | 3R AUGUSTUS stop_codon 7693698 7693700 . + 0 transcript_id "g5980.t1"; gene_id "g5980"; 22 | X AUGUSTUS stop_codon 2065454 2065456 . - 0 transcript_id "g12130.t1"; gene_id "g12130"; 23 | X AUGUSTUS CDS 2065454 2065891 0.75 - 0 transcript_id "g12130.t1"; gene_id "g12130"; 24 | X AUGUSTUS exon 2065454 2065891 . - . transcript_id "g12130.t1"; gene_id "g12130"; 25 | X AUGUSTUS intron 2065892 2065944 0.98 - . transcript_id "g12130.t1"; gene_id "g12130"; 26 | X AUGUSTUS CDS 2065945 2066088 0.93 - 0 transcript_id "g12130.t1"; gene_id "g12130"; 27 | X AUGUSTUS exon 2065945 2066088 . - . 
transcript_id "g12130.t1"; gene_id "g12130"; 28 | X AUGUSTUS intron 2066089 2066148 0.92 - . transcript_id "g12130.t1"; gene_id "g12130"; 29 | X AUGUSTUS CDS 2066149 2066238 0.92 - 0 transcript_id "g12130.t1"; gene_id "g12130"; 30 | X AUGUSTUS gene 2065454 2066238 0.7 - . g12130 31 | X AUGUSTUS transcript 2065454 2066238 0.7 - . g12130.t1 32 | X AUGUSTUS exon 2066149 2066238 . - . transcript_id "g12130.t1"; gene_id "g12130"; 33 | X AUGUSTUS start_codon 2066236 2066238 . - 0 transcript_id "g12130.t1"; gene_id "g12130"; 34 | 2R AUGUSTUS stop_codon 16433896 16433898 . - 0 transcript_id "g10583.t1"; gene_id "g10583"; 35 | 2R AUGUSTUS CDS 16433896 16435797 1 - 0 transcript_id "g10583.t1"; gene_id "g10583"; 36 | 2R AUGUSTUS exon 16433896 16435797 . - . transcript_id "g10583.t1"; gene_id "g10583"; 37 | 2R AUGUSTUS start_codon 16435795 16435797 . - 0 transcript_id "g10583.t1"; gene_id "g10583"; 38 | 2R AUGUSTUS gene 16433896 16435797 1 - . g10583 39 | 2R AUGUSTUS transcript 16433896 16435797 1 - . g10583.t1 40 | 2R AUGUSTUS stop_codon 24640803 24640805 . - 0 transcript_id "g11793.t1"; gene_id "g11793"; 41 | 2R AUGUSTUS CDS 24640803 24642212 1 - 0 transcript_id "g11793.t1"; gene_id "g11793"; 42 | 2R AUGUSTUS exon 24640803 24642212 . - . transcript_id "g11793.t1"; gene_id "g11793"; 43 | 2R AUGUSTUS start_codon 24642210 24642212 . - 0 transcript_id "g11793.t1"; gene_id "g11793"; 44 | 2R AUGUSTUS gene 24640803 24642212 1 - . g11793 45 | 2R AUGUSTUS transcript 24640803 24642212 1 - . g11793.t1 46 | 2L AUGUSTUS stop_codon 11989063 11989065 . - 0 transcript_id "g1539.t1"; gene_id "g1539"; 47 | 2L AUGUSTUS CDS 11989063 11989803 0.73 - 0 transcript_id "g1539.t1"; gene_id "g1539"; 48 | 2L AUGUSTUS exon 11989063 11989803 . - . transcript_id "g1539.t1"; gene_id "g1539"; 49 | 2L AUGUSTUS start_codon 11989801 11989803 . - 0 transcript_id "g1539.t1"; gene_id "g1539"; 50 | 2L AUGUSTUS gene 11989063 11989803 0.73 - . g1539 51 | 2L AUGUSTUS transcript 11989063 11989803 0.73 - . 
g1539.t1 52 | 2L AUGUSTUS start_codon 4686242 4686244 . + 0 transcript_id "g562.t1"; gene_id "g562"; 53 | 2L AUGUSTUS CDS 4686242 4687105 1 + 0 transcript_id "g562.t1"; gene_id "g562"; 54 | 2L AUGUSTUS exon 4686242 4687105 . + . transcript_id "g562.t1"; gene_id "g562"; 55 | 2L AUGUSTUS stop_codon 4687103 4687105 . + 0 transcript_id "g562.t1"; gene_id "g562"; 56 | 2L AUGUSTUS gene 4686242 4687105 1 + . g562 57 | 2L AUGUSTUS transcript 4686242 4687105 1 + . g562.t1 58 | 3L AUGUSTUS stop_codon 11362605 11362607 . - 0 transcript_id "g3988.t1"; gene_id "g3988"; 59 | 3L AUGUSTUS CDS 11362605 11363086 1 - 2 transcript_id "g3988.t1"; gene_id "g3988"; 60 | 3L AUGUSTUS exon 11362605 11363086 . - . transcript_id "g3988.t1"; gene_id "g3988"; 61 | 3L AUGUSTUS intron 11363087 11363276 1 - . transcript_id "g3988.t1"; gene_id "g3988"; 62 | 3L AUGUSTUS CDS 11363277 11363918 1 - 2 transcript_id "g3988.t1"; gene_id "g3988"; 63 | 3L AUGUSTUS exon 11363277 11363918 . - . transcript_id "g3988.t1"; gene_id "g3988"; 64 | 3L AUGUSTUS intron 11363919 11364608 1 - . transcript_id "g3988.t1"; gene_id "g3988"; 65 | 3L AUGUSTUS CDS 11364609 11364771 1 - 0 transcript_id "g3988.t1"; gene_id "g3988"; 66 | 3L AUGUSTUS gene 11362605 11364771 1 - . g3988 67 | 3L AUGUSTUS transcript 11362605 11364771 1 - . g3988.t1 68 | 3L AUGUSTUS exon 11364609 11364771 . - . transcript_id "g3988.t1"; gene_id "g3988"; 69 | 3L AUGUSTUS start_codon 11364769 11364771 . - 0 transcript_id "g3988.t1"; gene_id "g3988"; 70 | 3R AUGUSTUS start_codon 12691822 12691824 . + 0 transcript_id "g6660.t1"; gene_id "g6660"; 71 | 3R AUGUSTUS CDS 12691822 12691869 1 + 0 transcript_id "g6660.t1"; gene_id "g6660"; 72 | 3R AUGUSTUS exon 12691822 12691869 . + . transcript_id "g6660.t1"; gene_id "g6660"; 73 | 3R AUGUSTUS intron 12691870 12692642 1 + . transcript_id "g6660.t1"; gene_id "g6660"; 74 | 3R AUGUSTUS CDS 12692643 12692707 1 + 0 transcript_id "g6660.t1"; gene_id "g6660"; 75 | 3R AUGUSTUS exon 12692643 12692707 . + . 
transcript_id "g6660.t1"; gene_id "g6660"; 76 | 3R AUGUSTUS intron 12692708 12692769 1 + . transcript_id "g6660.t1"; gene_id "g6660"; 77 | 3R AUGUSTUS CDS 12692770 12692944 1 + 1 transcript_id "g6660.t1"; gene_id "g6660"; 78 | 3R AUGUSTUS exon 12692770 12692944 . + . transcript_id "g6660.t1"; gene_id "g6660"; 79 | 3R AUGUSTUS intron 12692945 12693003 1 + . transcript_id "g6660.t1"; gene_id "g6660"; 80 | 3R AUGUSTUS CDS 12693004 12693155 1 + 0 transcript_id "g6660.t1"; gene_id "g6660"; 81 | 3R AUGUSTUS exon 12693004 12693155 . + . transcript_id "g6660.t1"; gene_id "g6660"; 82 | 3R AUGUSTUS intron 12693156 12693214 1 + . transcript_id "g6660.t1"; gene_id "g6660"; 83 | 3R AUGUSTUS CDS 12693215 12693761 1 + 1 transcript_id "g6660.t1"; gene_id "g6660"; 84 | 3R AUGUSTUS exon 12693215 12693761 . + . transcript_id "g6660.t1"; gene_id "g6660"; 85 | 3R AUGUSTUS intron 12693762 12693829 1 + . transcript_id "g6660.t1"; gene_id "g6660"; 86 | 3R AUGUSTUS CDS 12693830 12693973 1 + 0 transcript_id "g6660.t1"; gene_id "g6660"; 87 | 3R AUGUSTUS gene 12691822 12693973 1 + . g6660 88 | 3R AUGUSTUS transcript 12691822 12693973 1 + . g6660.t1 89 | 3R AUGUSTUS exon 12693830 12693973 . + . transcript_id "g6660.t1"; gene_id "g6660"; 90 | 3R AUGUSTUS stop_codon 12693971 12693973 . + 0 transcript_id "g6660.t1"; gene_id "g6660"; 91 | 2R AUGUSTUS stop_codon 20354214 20354216 . - 0 transcript_id "g11080.t1"; 92 | 2R AUGUSTUS CDS 20354214 20355053 1 - 0 transcript_id "g11080.t1"; 93 | 2R AUGUSTUS exon 20354214 20355053 . - . transcript_id "g11080.t1"; 94 | 2R AUGUSTUS start_codon 20355051 20355053 . - 0 transcript_id "g11080.t1"; 95 | 2R AUGUSTUS gene 20354214 20355053 1 - . g11080 96 | 2R AUGUSTUS transcript 20354214 20355053 1 - . g11080.t1 -------------------------------------------------------------------------------- /tests/genome_anno/format_error.gtf: -------------------------------------------------------------------------------- 1 | 3L GeneMark.hmm stop_codon 18462228 18462230 . 
- 0 gene_id "7789_g"; transcript_id "7789_t"; count "1_1"; 2 | 3L GeneMark.hmm CDS 18462228 18462540 . - 1 gene_id "7789_g"; 3 | 3L GeneMark.hmm exon 18462228 18462540 0 - . gene_id "7789_g"; transcript_id "7789_t"; evidence "0_1"; cds_type "Terminal"; count "2_2"; 4 | 3L GeneMark.hmm CDS 18462719 18463068 . - 0 gene_id "7789_g"; transcript_id "7789_t"; evidence "1_0"; cds_type "Initial"; count "1_2"; 5 | 3L GeneMark.hmm exon 18462719 18463068 0 - . gene_id "7789_g"; transcript_id "7789_t"; evidence "1_0"; cds_type "Initial"; count "1_2"; 6 | 3L GeneMark.hmm start_codon 18463066 18463068 . - 0 gene_id "7789_g"; transcript_id "7789_t"; count "1_1"; 7 | 3R AUGUSTUS start_codon 7686444 7686446 . + 0 transcript_id "g5980.t1"; gene_id "g5980"; 8 | 3R AUGUSTUS CDS 7686444 7686623 1 + 0 transcript_id "g5980.t1"; gene_id "g5980"; 9 | 3R AUGUSTUS exon 7686444 7686623 . + . transcript_id "g5980.t1"; gene_id "g5980"; 10 | 3R AUGUSTUS intron 7686624 7690691 1 + . transcript_id "g5980.t1"; gene_id "g5980"; 11 | 3R AUGUSTUS CDS 7690692 7690843 1 + 0 transcript_id "g5980.t1"; gene_id "g5980"; 12 | 3R AUGUSTUS exon 7690692 7690843 . + . transcript_id "g5980.t1"; gene_id "g5980"; 13 | 3R AUGUSTUS intron 7690844 7691514 1 + . transcript_id "g5980.t1"; gene_id "g5980"; 14 | 3R AUGUSTUS CDS 7691515 7691630 1 + 1 transcript_id "g5980.t1"; gene_id "g5980"; 15 | 3R AUGUSTUS exon 7691515 7691630 . + . transcript_id "g5980.t1"; gene_id "g5980"; 16 | 3R AUGUSTUS intron 7691631 7691712 1 + . transcript_id "g5980.t1"; gene_id "g5980"; 17 | 3R AUGUSTUS CDS 7691713 7693700 1 + 2 transcript_id "g5980.t1"; gene_id "g5980"; 18 | 3R AUGUSTUS gene 7686444 7693700 1 + . g5980 19 | 3R AUGUSTUS transcript 7686444 7693700 1 + . g5980.t1 20 | 3R AUGUSTUS exon 7691713 7693700 . + . transcript_id "g5980.t1"; gene_id "g5980"; 21 | 3R AUGUSTUS stop_codon 7693698 7693700 . + 0 transcript_id "g5980.t1"; gene_id "g5980"; 22 | X AUGUSTUS stop_codon 2065454 2065456 . 
- 0 transcript_id "g12130.t1"; gene_id "g12130"; 23 | X AUGUSTUS CDS 2065454 2065891 0.75 - 0 transcript_id "g12130.t1"; gene_id "g12130"; 24 | X AUGUSTUS exon 2065454 2065891 . - . transcript_id "g12130.t1"; gene_id "g12130"; 25 | X AUGUSTUS intron 2065892 2065944 0.98 - . transcript_id "g12130.t1"; gene_id "g12130"; 26 | X AUGUSTUS CDS 2065945 2066088 0.93 - 0 transcript_id "g12130.t1"; gene_id "g12130"; 27 | X AUGUSTUS exon 2065945 2066088 . - . transcript_id "g12130.t1"; gene_id "g12130"; 28 | X AUGUSTUS intron 2066089 2066148 0.92 - . transcript_id "g12130.t1"; gene_id "g12130"; 29 | X AUGUSTUS CDS 2066149 2066238 0.92 - 0 transcript_id "g12130.t1"; gene_id "g12130"; 30 | X AUGUSTUS gene 2065454 2066238 0.7 - . g12130 31 | X AUGUSTUS transcript 2065454 2066238 0.7 - . g12130.t1 32 | X AUGUSTUS exon 2066149 2066238 . - . transcript_id "g12130.t1"; gene_id "g12130"; 33 | X AUGUSTUS start_codon 2066236 2066238 . - 0 transcript_id "g12130.t1"; gene_id "g12130"; 34 | 2R AUGUSTUS stop_codon 16433896 16433898 . - 0 transcript_id "g10583.t1"; gene_id "g10583"; 35 | 2R AUGUSTUS CDS 16433896 16435797 1 - 0 transcript_id "g10583.t1"; gene_id "g10583"; 36 | 2R AUGUSTUS exon 16433896 16435797 . - . transcript_id "g10583.t1"; gene_id "g10583"; 37 | 2R AUGUSTUS start_codon 16435795 16435797 . - 0 transcript_id "g10583.t1"; gene_id "g10583"; 38 | 2R AUGUSTUS gene 16433896 16435797 1 - . g10583 39 | 2R AUGUSTUS transcript 16433896 16435797 1 - . g10583.t1 40 | 2R AUGUSTUS stop_codon 24640803 24640805 . - 0 transcript_id "g11793.t1"; gene_id "g11793"; 41 | 2R AUGUSTUS CDS 24640803 24642212 1 - 0 transcript_id "g11793.t1"; gene_id "g11793"; 42 | 2R AUGUSTUS exon 24640803 24642212 . - . transcript_id "g11793.t1"; gene_id "g11793"; 43 | 2R AUGUSTUS start_codon 24642210 24642212 . - 0 transcript_id "g11793.t1"; gene_id "g11793"; 44 | 2R AUGUSTUS gene 24640803 24642212 1 - . g11793 45 | 2R AUGUSTUS transcript 24640803 24642212 1 - . 
g11793.t1 46 | 2L AUGUSTUS stop_codon 11989063 11989065 . - 0 transcript_id "g1539.t1"; gene_id "g1539"; 47 | 2L AUGUSTUS CDS 11989063 11989803 0.73 - 0 transcript_id "g1539.t1"; gene_id "g1539"; 48 | 2L AUGUSTUS exon 11989063 11989803 . - . transcript_id "g1539.t1"; gene_id "g1539"; 49 | 2L AUGUSTUS start_codon 11989801 11989803 . - 0 transcript_id "g1539.t1"; gene_id "g1539"; 50 | 2L AUGUSTUS gene 11989063 11989803 0.73 - . g1539 51 | 2L AUGUSTUS transcript 11989063 11989803 0.73 - . g1539.t1 52 | 2L AUGUSTUS start_codon 4686242 4686244 . + 0 transcript_id "g562.t1"; gene_id "g562"; 53 | 2L AUGUSTUS CDS 4686242 4687105 1 + 0 transcript_id "g562.t1"; gene_id "g562"; 54 | 2L AUGUSTUS exon 4686242 4687105 . + . transcript_id "g562.t1"; gene_id "g562"; 55 | 2L AUGUSTUS stop_codon 4687103 4687105 . + 0 transcript_id "g562.t1"; gene_id "g562"; 56 | 2L AUGUSTUS gene 4686242 4687105 1 + . g562 57 | 2L AUGUSTUS transcript 4686242 4687105 1 + . g562.t1 58 | 3L AUGUSTUS stop_codon 11362605 11362607 . - 0 transcript_id "g3988.t1"; gene_id "g3988"; 59 | 3L AUGUSTUS CDS 11362605 11363086 1 - 2 transcript_id "g3988.t1"; gene_id "g3988"; 60 | 3L AUGUSTUS exon 11362605 11363086 . - . transcript_id "g3988.t1"; gene_id "g3988"; 61 | 3L AUGUSTUS intron 11363087 11363276 1 - . transcript_id "g3988.t1"; gene_id "g3988"; 62 | 3L AUGUSTUS CDS 11363277 11363918 1 - 2 transcript_id "g3988.t1"; gene_id "g3988"; 63 | 3L AUGUSTUS exon 11363277 11363918 . - . transcript_id "g3988.t1"; gene_id "g3988"; 64 | 3L AUGUSTUS intron 11363919 11364608 1 - . transcript_id "g3988.t1"; gene_id "g3988"; 65 | 3L AUGUSTUS CDS 11364609 11364771 1 - 0 transcript_id "g3988.t1"; gene_id "g3988"; 66 | 3L AUGUSTUS gene 11362605 11364771 1 - . g3988 67 | 3L AUGUSTUS transcript 11362605 11364771 1 - . g3988.t1 68 | 3L AUGUSTUS exon 11364609 11364771 . - . transcript_id "g3988.t1"; gene_id "g3988"; 69 | 3L AUGUSTUS start_codon 11364769 11364771 . 
- 0 transcript_id "g3988.t1"; gene_id "g3988"; 70 | 3R AUGUSTUS start_codon 12691822 12691824 . + 0 transcript_id "g6660.t1"; gene_id "g6660"; 71 | 3R AUGUSTUS CDS 12691822 12691869 1 + 0 transcript_id "g6660.t1"; gene_id "g6660"; 72 | 3R AUGUSTUS exon 12691822 12691869 . + . transcript_id "g6660.t1"; gene_id "g6660"; 73 | 3R AUGUSTUS intron 12691870 12692642 1 + . transcript_id "g6660.t1"; gene_id "g6660"; 74 | 3R AUGUSTUS CDS 12692643 12692707 1 + 0 transcript_id "g6660.t1"; gene_id "g6660"; 75 | 3R AUGUSTUS exon 12692643 12692707 . + . transcript_id "g6660.t1"; gene_id "g6660"; 76 | 3R AUGUSTUS intron 12692708 12692769 1 + . transcript_id "g6660.t1"; gene_id "g6660"; 77 | 3R AUGUSTUS CDS 12692770 12692944 1 + 1 transcript_id "g6660.t1"; gene_id "g6660"; 78 | 3R AUGUSTUS exon 12692770 12692944 . + . transcript_id "g6660.t1"; gene_id "g6660"; 79 | 3R AUGUSTUS intron 12692945 12693003 1 + . transcript_id "g6660.t1"; gene_id "g6660"; 80 | 3R AUGUSTUS CDS 12693004 12693155 1 + 0 transcript_id "g6660.t1"; gene_id "g6660"; 81 | 3R AUGUSTUS exon 12693004 12693155 . + . transcript_id "g6660.t1"; gene_id "g6660"; 82 | 3R AUGUSTUS intron 12693156 12693214 1 + . transcript_id "g6660.t1"; gene_id "g6660"; 83 | 3R AUGUSTUS CDS 12693215 12693761 1 + 1 transcript_id "g6660.t1"; gene_id "g6660"; 84 | 3R AUGUSTUS exon 12693215 12693761 . + . transcript_id "g6660.t1"; gene_id "g6660"; 85 | 3R AUGUSTUS intron 12693762 12693829 1 + . transcript_id "g6660.t1"; gene_id "g6660"; 86 | 3R AUGUSTUS CDS 12693830 12693973 1 + 0 transcript_id "g6660.t1"; gene_id "g6660"; 87 | 3R AUGUSTUS gene 12691822 12693973 1 + . g6660 88 | 3R AUGUSTUS transcript 12691822 12693973 1 + . g6660.t1 89 | 3R AUGUSTUS exon 12693830 12693973 . + . transcript_id "g6660.t1"; gene_id "g6660"; 90 | 3R AUGUSTUS stop_codon 12693971 12693973 . + 0 transcript_id "g6660.t1"; gene_id "g6660"; 91 | 2R AUGUSTUS stop_codon 20354214 20354216 . 
- 0 transcript_id "g11080.t1"; gene_id "g11080"; 92 | 2R AUGUSTUS CDS 20354214 20355053 1 - 0 transcript_id "g11080.t1"; gene_id "g11080"; 93 | 2R AUGUSTUS exon 20354214 20355053 . - . transcript_id "g11080.t1"; gene_id "g11080"; 94 | 2R AUGUSTUS start_codon 20355051 20355053 . - 0 transcript_id "g11080.t1"; gene_id "g11080"; 95 | 2R AUGUSTUS gene 20354214 20355053 1 - . g11080 96 | 2R AUGUSTUS transcript 20354214 20355053 1 - . g11080.t1 -------------------------------------------------------------------------------- /tests/genome_anno/anno1.gtf: -------------------------------------------------------------------------------- 1 | 3L GeneMark.hmm stop_codon 18462228 18462230 . - 0 gene_id "7789_g"; transcript_id "7789_t"; count "1_1"; 2 | 3L GeneMark.hmm CDS 18462228 18462540 . - 1 gene_id "7789_g"; transcript_id "7789_t"; evidence "0_1"; cds_type "Terminal"; count "2_2"; 3 | 3L GeneMark.hmm exon 18462228 18462540 0 - . gene_id "7789_g"; transcript_id "7789_t"; evidence "0_1"; cds_type "Terminal"; count "2_2"; 4 | 3L GeneMark.hmm CDS 18462719 18463068 . - 0 gene_id "7789_g"; transcript_id "7789_t"; evidence "1_0"; cds_type "Initial"; count "1_2"; 5 | 3L GeneMark.hmm exon 18462719 18463068 0 - . gene_id "7789_g"; transcript_id "7789_t"; evidence "1_0"; cds_type "Initial"; count "1_2"; 6 | 3L GeneMark.hmm start_codon 18463066 18463068 . - 0 gene_id "7789_g"; transcript_id "7789_t"; count "1_1"; 7 | 3R AUGUSTUS start_codon 7686444 7686446 . + 0 transcript_id "g5980.t1"; gene_id "g5980"; 8 | 3R AUGUSTUS CDS 7686444 7686623 1 + 0 transcript_id "g5980.t1"; gene_id "g5980"; 9 | 3R AUGUSTUS exon 7686444 7686623 . + . transcript_id "g5980.t1"; gene_id "g5980"; 10 | 3R AUGUSTUS intron 7686624 7690691 1 + . transcript_id "g5980.t1"; gene_id "g5980"; 11 | 3R AUGUSTUS CDS 7690692 7690843 1 + 0 transcript_id "g5980.t1"; gene_id "g5980"; 12 | 3R AUGUSTUS exon 7690692 7690843 . + . transcript_id "g5980.t1"; gene_id "g5980"; 13 | 3R AUGUSTUS intron 7690844 7691514 1 + . 
transcript_id "g5980.t1"; gene_id "g5980"; 14 | 3R AUGUSTUS CDS 7691515 7691630 1 + 1 transcript_id "g5980.t1"; gene_id "g5980"; 15 | 3R AUGUSTUS exon 7691515 7691630 . + . transcript_id "g5980.t1"; gene_id "g5980"; 16 | 3R AUGUSTUS intron 7691631 7691712 1 + . transcript_id "g5980.t1"; gene_id "g5980"; 17 | 3R AUGUSTUS CDS 7691713 7693700 1 + 2 transcript_id "g5980.t1"; gene_id "g5980"; 18 | 3R AUGUSTUS gene 7686444 7693700 1 + . g5980 19 | 3R AUGUSTUS transcript 7686444 7693700 1 + . g5980.t1 20 | 3R AUGUSTUS exon 7691713 7693700 . + . transcript_id "g5980.t1"; gene_id "g5980"; 21 | 3R AUGUSTUS stop_codon 7693698 7693700 . + 0 transcript_id "g5980.t1"; gene_id "g5980"; 22 | X AUGUSTUS stop_codon 2065454 2065456 . - 0 transcript_id "g12130.t1"; gene_id "g12130"; 23 | X AUGUSTUS CDS 2065454 2065891 0.75 - 0 transcript_id "g12130.t1"; gene_id "g12130"; 24 | X AUGUSTUS exon 2065454 2065891 . - . transcript_id "g12130.t1"; gene_id "g12130"; 25 | X AUGUSTUS intron 2065892 2065944 0.98 - . transcript_id "g12130.t1"; gene_id "g12130"; 26 | X AUGUSTUS CDS 2065945 2066088 0.93 - 0 transcript_id "g12130.t1"; gene_id "g12130"; 27 | X AUGUSTUS exon 2065945 2066088 . - . transcript_id "g12130.t1"; gene_id "g12130"; 28 | X AUGUSTUS intron 2066089 2066148 0.92 - . transcript_id "g12130.t1"; gene_id "g12130"; 29 | X AUGUSTUS CDS 2066149 2066238 0.92 - 0 transcript_id "g12130.t1"; gene_id "g12130"; 30 | X AUGUSTUS gene 2065454 2066238 0.7 - . g12130 31 | X AUGUSTUS transcript 2065454 2066238 0.7 - . g12130.t1 32 | X AUGUSTUS exon 2066149 2066238 . - . transcript_id "g12130.t1"; gene_id "g12130"; 33 | X AUGUSTUS start_codon 2066236 2066238 . - 0 transcript_id "g12130.t1"; gene_id "g12130"; 34 | 2R AUGUSTUS stop_codon 16433896 16433898 . - 0 transcript_id "g10583.t1"; gene_id "g10583"; 35 | 2R AUGUSTUS CDS 16433896 16435797 1 - 0 transcript_id "g10583.t1"; gene_id "g10583"; 36 | 2R AUGUSTUS exon 16433896 16435797 . - . 
transcript_id "g10583.t1"; gene_id "g10583"; 37 | 2R AUGUSTUS start_codon 16435795 16435797 . - 0 transcript_id "g10583.t1"; gene_id "g10583"; 38 | 2R AUGUSTUS gene 16433896 16435797 1 - . g10583 39 | 2R AUGUSTUS transcript 16433896 16435797 1 - . g10583.t1 40 | 2R AUGUSTUS stop_codon 24640803 24640805 . - 0 transcript_id "g11793.t1"; gene_id "g11793"; 41 | 2R AUGUSTUS CDS 24640803 24642212 1 - 0 transcript_id "g11793.t1"; gene_id "g11793"; 42 | 2R AUGUSTUS exon 24640803 24642212 . - . transcript_id "g11793.t1"; gene_id "g11793"; 43 | 2R AUGUSTUS start_codon 24642210 24642212 . - 0 transcript_id "g11793.t1"; gene_id "g11793"; 44 | 2R AUGUSTUS gene 24640803 24642212 1 - . g11793 45 | 2R AUGUSTUS transcript 24640803 24642212 1 - . g11793.t1 46 | 2L AUGUSTUS stop_codon 11989063 11989065 . - 0 transcript_id "g1539.t1"; gene_id "g1539"; 47 | 2L AUGUSTUS CDS 11989063 11989803 0.73 - 0 transcript_id "g1539.t1"; gene_id "g1539"; 48 | 2L AUGUSTUS exon 11989063 11989803 . - . transcript_id "g1539.t1"; gene_id "g1539"; 49 | 2L AUGUSTUS start_codon 11989801 11989803 . - 0 transcript_id "g1539.t1"; gene_id "g1539"; 50 | 2L AUGUSTUS gene 11989063 11989803 0.73 - . g1539 51 | 2L AUGUSTUS transcript 11989063 11989803 0.73 - . g1539.t1 52 | 2L AUGUSTUS start_codon 4686242 4686244 . + 0 transcript_id "g562.t1"; gene_id "g562"; 53 | 2L AUGUSTUS CDS 4686242 4687105 1 + 0 transcript_id "g562.t1"; gene_id "g562"; 54 | 2L AUGUSTUS exon 4686242 4687105 . + . transcript_id "g562.t1"; gene_id "g562"; 55 | 2L AUGUSTUS stop_codon 4687103 4687105 . + 0 transcript_id "g562.t1"; gene_id "g562"; 56 | 2L AUGUSTUS gene 4686242 4687105 1 + . g562 57 | 2L AUGUSTUS transcript 4686242 4687105 1 + . g562.t1 58 | 3L AUGUSTUS stop_codon 11362605 11362607 . - 0 transcript_id "g3988.t1"; gene_id "g3988"; 59 | 3L AUGUSTUS CDS 11362605 11363086 1 - 2 transcript_id "g3988.t1"; gene_id "g3988"; 60 | 3L AUGUSTUS exon 11362605 11363086 . - . 
transcript_id "g3988.t1"; gene_id "g3988"; 61 | 3L AUGUSTUS intron 11363087 11363276 1 - . transcript_id "g3988.t1"; gene_id "g3988"; 62 | 3L AUGUSTUS CDS 11363277 11363918 1 - 2 transcript_id "g3988.t1"; gene_id "g3988"; 63 | 3L AUGUSTUS exon 11363277 11363918 . - . transcript_id "g3988.t1"; gene_id "g3988"; 64 | 3L AUGUSTUS intron 11363919 11364608 1 - . transcript_id "g3988.t1"; gene_id "g3988"; 65 | 3L AUGUSTUS CDS 11364609 11364771 1 - 0 transcript_id "g3988.t1"; gene_id "g3988"; 66 | 3L AUGUSTUS gene 11362605 11364771 1 - . g3988 67 | 3L AUGUSTUS transcript 11362605 11364771 1 - . g3988.t1 68 | 3L AUGUSTUS exon 11364609 11364771 . - . transcript_id "g3988.t1"; gene_id "g3988"; 69 | 3L AUGUSTUS start_codon 11364769 11364771 . - 0 transcript_id "g3988.t1"; gene_id "g3988"; 70 | 3R AUGUSTUS start_codon 12691822 12691824 . + 0 transcript_id "g6660.t1"; gene_id "g6660"; 71 | 3R AUGUSTUS CDS 12691822 12691869 1 + 0 transcript_id "g6660.t1"; gene_id "g6660"; 72 | 3R AUGUSTUS exon 12691822 12691869 . + . transcript_id "g6660.t1"; gene_id "g6660"; 73 | 3R AUGUSTUS intron 12691870 12692642 1 + . transcript_id "g6660.t1"; gene_id "g6660"; 74 | 3R AUGUSTUS CDS 12692643 12692707 1 + 0 transcript_id "g6660.t1"; gene_id "g6660"; 75 | 3R AUGUSTUS exon 12692643 12692707 . + . transcript_id "g6660.t1"; gene_id "g6660"; 76 | 3R AUGUSTUS intron 12692708 12692769 1 + . transcript_id "g6660.t1"; gene_id "g6660"; 77 | 3R AUGUSTUS CDS 12692770 12692944 1 + 1 transcript_id "g6660.t1"; gene_id "g6660"; 78 | 3R AUGUSTUS exon 12692770 12692944 . + . transcript_id "g6660.t1"; gene_id "g6660"; 79 | 3R AUGUSTUS intron 12692945 12693003 1 + . transcript_id "g6660.t1"; gene_id "g6660"; 80 | 3R AUGUSTUS CDS 12693004 12693155 1 + 0 transcript_id "g6660.t1"; gene_id "g6660"; 81 | 3R AUGUSTUS exon 12693004 12693155 . + . transcript_id "g6660.t1"; gene_id "g6660"; 82 | 3R AUGUSTUS intron 12693156 12693214 1 + . 
transcript_id "g6660.t1"; gene_id "g6660"; 83 | 3R AUGUSTUS CDS 12693215 12693761 1 + 1 transcript_id "g6660.t1"; gene_id "g6660"; 84 | 3R AUGUSTUS exon 12693215 12693761 . + . transcript_id "g6660.t1"; gene_id "g6660"; 85 | 3R AUGUSTUS intron 12693762 12693829 1 + . transcript_id "g6660.t1"; gene_id "g6660"; 86 | 3R AUGUSTUS CDS 12693830 12693973 1 + 0 transcript_id "g6660.t1"; gene_id "g6660"; 87 | 3R AUGUSTUS gene 12691822 12693973 1 + . g6660 88 | 3R AUGUSTUS transcript 12691822 12693973 1 + . g6660.t1 89 | 3R AUGUSTUS exon 12693830 12693973 . + . transcript_id "g6660.t1"; gene_id "g6660"; 90 | 3R AUGUSTUS stop_codon 12693971 12693973 . + 0 transcript_id "g6660.t1"; gene_id "g6660"; 91 | 2R AUGUSTUS stop_codon 20354214 20354216 . - 0 transcript_id "g11080.t1"; gene_id "g11080"; 92 | 2R AUGUSTUS CDS 20354214 20355053 1 - 0 transcript_id "g11080.t1"; gene_id "g11080"; 93 | 2R AUGUSTUS exon 20354214 20355053 . - . transcript_id "g11080.t1"; gene_id "g11080"; 94 | 2R AUGUSTUS start_codon 20355051 20355053 . - 0 transcript_id "g11080.t1"; gene_id "g11080"; 95 | 2R AUGUSTUS gene 20354214 20355053 1 - . g11080 96 | 2R AUGUSTUS transcript 20354214 20355053 1 - . 
#!/usr/bin/env python3
# ==============================================================
# author: Lars Gabriel
#
# TSEBRA: Transcript Selector for BRAKER
# ==============================================================
import argparse
import sys
import os
import csv


class ConfigFileError(Exception):
    """Raised when the configuration file contains a malformed entry."""


class GeneSetMissing(Exception):
    """Raised when no gene set was provided."""


# Module-level state shared by the functions below.
gtf = []
enforce_tx = []
anno = []
hintfiles = []
graph = None
out = ''
v = 0
quiet = False
parameter = {'intron_support' : 0, 'stasto_support' : 0, \
    'e_1' : 0, 'e_2' : 0, 'e_3' : 0, 'e_4' : 0}
cfg_file = os.path.dirname(os.path.realpath(__file__)) + '/../config/braker3.cfg'


def _cds_key(tx):
    """Return a hashable key built from the CDS coordinates of a transcript.

    NOTE(review): assumes ``tx.get_type_coords('CDS', False)`` yields an
    iterable of coordinate sequences (e.g. ``[start, end]``) -- confirm
    against genome_anno.py.
    """
    return tuple('_'.join(map(str, coords))
                 for coords in tx.get_type_coords('CDS', False))


def main():
    """
        Split the genes of two gene sets into shared and set-specific genes.

        Overview:

        1. Read the two gene predictions given with -g1 and -g2.
        2. Record the CDS structure of every transcript of each gene set.
        3. Build the overlap graph (without extrinsic evidence) and let it
           produce the combined gene prediction.
        4. For every combined gene, look up which input gene set(s) contain
           a transcript with an identical CDS structure.
        5. Write four files with the prefix given by --out:
           <out>_only_g1, <out>_only_g2, <out>_overlap_g1, <out>_overlap_g2.
    """

    from genome_anno import Anno
    from overlap_graph import Graph
    from evidence import Evidence

    global anno, graph, parameter, v, quiet

    args = parseCmd()
    # Apply the verbosity options; previously they were parsed but ignored.
    quiet = bool(args.quiet)
    v = args.verbose or 0
    set_parameter(cfg_file)
    if v > 0:
        sys.stderr.write(f'Gene sets: {[args.geneset1, args.geneset2]}\n')

    # read gene prediction files
    # tx_keys[i] holds the CDS keys of all transcripts of gene set i; a set
    # makes the membership tests below O(1).
    tx_keys = []
    keep = []
    for c, g in enumerate([args.geneset1, args.geneset2], start=1):
        if not quiet:
            sys.stderr.write(f'### READING GENE PREDICTION: [{g}]\n')
        anno.append(Anno(g, f'anno{c}'))
        anno[-1].addGtf()
        anno[-1].norm_tx_format()
        keep.append(f'anno{c}')
        tx_keys.append({_cds_key(tx) for tx in anno[-1].transcripts.values()})

    # no hint files are read by this script, the evidence stays empty
    evi = Evidence()

    # create graph with a node for each unique transcript
    # and an edge if two transcripts overlap
    graph = Graph(anno, para=parameter, keep_tx=keep, verbose=v)
    if not quiet:
        sys.stderr.write('### BUILD OVERLAP GRAPH\n')
    graph.build()

    graph.add_node_features(evi)
    # apply decision rule to exclude a set of transcripts
    if not quiet:
        sys.stderr.write('### SELECT TRANSCRIPTS\n')
    combined_prediction = graph.get_decided_graph()

    if v > 0:
        sys.stderr.write(str(combined_prediction.keys()) + '\n')
        for a in anno:
            sys.stderr.write('Numb_tx in {}: {}\n'.format(a.id, len(combined_prediction[a.id])))

    # build the combined annotation
    if not quiet:
        sys.stderr.write('### WRITE COMBINED GENE PREDICTION\n')
    combined_anno = Anno('', 'combined_annotation')
    for a in anno:
        txs = a.get_subset([t[0] for t in combined_prediction[a.id]])
        for tx_id, new_gene_id in combined_prediction[a.id]:
            txs[tx_id].set_gene_id(new_gene_id)
        combined_anno.add_transcripts(txs, a.id + '.')
    combined_anno.find_genes()

    out_only = [[], []]
    out_overlap = [[], []]

    gene_gtf = sorted(combined_anno.gene_gtf.values(), key=lambda g: (g[0], g[3], g[4]))
    for gene in gene_gtf:
        # GTF lines of this gene, separately for gene set 1 and gene set 2
        gtf_gene = [[], []]
        for tx_id in combined_anno.genes[gene[8]]:
            tx = combined_anno.transcripts[tx_id]
            key = _cds_key(tx)
            for i, keys in enumerate(tx_keys):
                if key in keys:
                    # write the gene line only once per gene, not once per
                    # transcript of the gene
                    if not gtf_gene[i]:
                        gtf_gene[i].append(gene)
                    gtf_gene[i] += tx.get_gtf()

        if gtf_gene[0] and gtf_gene[1]:
            out_overlap[0] += gtf_gene[0]
            out_overlap[1] += gtf_gene[1]
        elif gtf_gene[0]:
            out_only[0] += gtf_gene[0]
        elif gtf_gene[1]:
            out_only[1] += gtf_gene[1]
        elif v > 0:
            sys.stderr.write(f'Gene {gene[8]} matches no input gene set\n')

    outputs = (
        (f'{args.out}_only_g1', out_only[0]),
        (f'{args.out}_only_g2', out_only[1]),
        (f'{args.out}_overlap_g1', out_overlap[0]),
        (f'{args.out}_overlap_g2', out_overlap[1]),
    )
    for path, lines in outputs:
        with open(path, 'w+') as file:
            out_writer = csv.writer(file, delimiter='\t', quotechar = "|", lineterminator = '\n')
            out_writer.writerows(lines)


def set_parameter(cfg_file):
    """
        Read parameters from the cfg file and store them in parameter.

        Each non-comment line has the form '<name> <value>'. Blank lines and
        lines whose first token starts with '#' are skipped. Fields may be
        separated by any amount of whitespace.

        Args:
            cfg_file (str): Path to configuration file.

        Raises:
            ConfigFileError: If a line has no value or a non-numeric value.
    """
    global parameter
    with open(cfg_file, 'r') as file:
        for line_no, raw_line in enumerate(file, start=1):
            fields = raw_line.split()
            # blank lines used to crash with an IndexError
            if not fields or fields[0].startswith('#'):
                continue
            if len(fields) < 2:
                raise ConfigFileError(
                    f'{cfg_file}, line {line_no}: missing value for "{fields[0]}"')
            try:
                parameter[fields[0]] = float(fields[1])
            except ValueError as err:
                raise ConfigFileError(
                    f'{cfg_file}, line {line_no}: value "{fields[1]}" of '
                    f'"{fields[0]}" is not a number') from err


def init(args):
    """
        Initialise the module-level settings from parsed arguments.

        Legacy helper shared with tsebra.py; main() of this script does not
        call it. Attributes that the given namespace does not define are
        treated as not set instead of raising an AttributeError.

        Args:
            args (argparse.Namespace): Parsed command line arguments.

        Raises:
            GeneSetMissing: If neither 'gtf' nor 'keep_gtf' is set.
    """
    global gtf, hintfiles, out, enforce_tx, v, quiet
    gtf_arg = getattr(args, 'gtf', None)
    keep_arg = getattr(args, 'keep_gtf', None)
    if gtf_arg:
        gtf = gtf_arg.split(',')
    if keep_arg:
        enforce_tx = keep_arg.split(',')
    if not keep_arg and not gtf_arg:
        raise GeneSetMissing('At least one gene set has to be provided '\
            + 'either with --gtf or --keep_gtf!')
    hint_arg = getattr(args, 'hintfiles', None)
    if hint_arg:
        hintfiles = hint_arg.split(',')
    # fall back to the default configuration shipped with the package
    cfg_path = getattr(args, 'cfg', None) or cfg_file
    set_parameter(cfg_path)
    if getattr(args, 'out', None):
        out = args.out
    if getattr(args, 'verbose', None):
        v = args.verbose
    if getattr(args, 'quiet', False):
        quiet = True


def parseCmd():
    """Parse command line arguments

    Returns:
        argparse.Namespace: Namespace with the parsed arguments
    """
    parser = argparse.ArgumentParser(description='Input: Two gtf files; Output: 4 GTF files ' \
        + 'with overlapping/not overlapping genes of each input gene set.')
    parser.add_argument('-g1', '--geneset1', type=str, required=True,
        help='First gene set in GTF format.')
    parser.add_argument('-g2', '--geneset2', type=str, required=True,
        help='Second gene set in GTF format.')
    parser.add_argument('-o', '--out', type=str, required=True,
        help='Prefix of the output files.')
    parser.add_argument('-q', '--quiet', action='store_true',
        help='Quiet mode.')
    parser.add_argument('-v', '--verbose', type=int, default=0,
        help='Verbosity level (default: 0).')
    return parser.parse_args()


if __name__ == '__main__':
    main()
Artistic License 2.0 2 | 3 | Copyright (c) 2000-2006, The Perl Foundation. 4 | 5 | Everyone is permitted to copy and distribute verbatim copies of this license 6 | document, but changing it is not allowed. 7 | 8 | Preamble 9 | 10 | This license establishes the terms under which a given free software Package 11 | may be copied, modified, distributed, and/or redistributed. The intent is that 12 | the Copyright Holder maintains some artistic control over the development of 13 | that Package while still keeping the Package available as open source and free 14 | software. 15 | 16 | You are always permitted to make arrangements wholly outside of this license 17 | directly with the Copyright Holder of a given Package. If the terms of this 18 | license do not permit the full use that you propose to make of the Package, 19 | you should contact the Copyright Holder and seek a different licensing 20 | arrangement. 21 | 22 | Definitions 23 | 24 | "Copyright Holder" means the individual(s) or organization(s) named in the 25 | copyright notice for the entire Package. 26 | 27 | "Contributor" means any party that has contributed code or other material to 28 | the Package, in accordance with the Copyright Holder's procedures. 29 | 30 | "You" and "your" means any person who would like to copy, distribute, or 31 | modify the Package. 32 | 33 | "Package" means the collection of files distributed by the Copyright Holder, 34 | and derivatives of that collection and/or of those files. A given Package may 35 | consist of either the Standard Version, or a Modified Version. 36 | 37 | "Distribute" means providing a copy of the Package or making it accessible to 38 | anyone else, or in the case of a company or organization, to others outside of 39 | your company or organization. 40 | 41 | "Distributor Fee" means any fee that you charge for Distributing this Package 42 | or providing support for this Package to another party. It does not mean 43 | licensing fees. 
44 | 45 | "Standard Version" refers to the Package if it has not been modified, or has 46 | been modified only in ways explicitly requested by the Copyright Holder. 47 | 48 | "Modified Version" means the Package, if it has been changed, and such changes 49 | were not explicitly requested by the Copyright Holder. 50 | 51 | "Original License" means this Artistic License as Distributed with the 52 | Standard Version of the Package, in its current version or as it may be 53 | modified by The Perl Foundation in the future. 54 | 55 | "Source" form means the source code, documentation source, and configuration 56 | files for the Package. 57 | 58 | "Compiled" form means the compiled bytecode, object code, binary, or any other 59 | form resulting from mechanical transformation or translation of the Source 60 | form. 61 | 62 | Permission for Use and Modification Without Distribution 63 | 64 | (1) You are permitted to use the Standard Version and create and use Modified 65 | Versions for any purpose without restriction, provided that you do not 66 | Distribute the Modified Version. 67 | 68 | Permissions for Redistribution of the Standard Version 69 | 70 | (2) You may Distribute verbatim copies of the Source form of the Standard 71 | Version of this Package in any medium without restriction, either gratis or 72 | for a Distributor Fee, provided that you duplicate all of the original 73 | copyright notices and associated disclaimers. At your discretion, such 74 | verbatim copies may or may not include a Compiled form of the Package. 75 | 76 | (3) You may apply any bug fixes, portability changes, and other modifications 77 | made available from the Copyright Holder. The resulting Package will still be 78 | considered the Standard Version, and as such will be subject to the Original 79 | License. 
80 | 81 | Distribution of Modified Versions of the Package as Source 82 | 83 | (4) You may Distribute your Modified Version as Source (either gratis or for a 84 | Distributor Fee, and with or without a Compiled form of the Modified Version) 85 | provided that you clearly document how it differs from the Standard Version, 86 | including, but not limited to, documenting any non-standard features, 87 | executables, or modules, and provided that you do at least ONE of the 88 | following: 89 | 90 | (a) make the Modified Version available to the Copyright Holder of the 91 | Standard Version, under the Original License, so that the Copyright Holder may 92 | include your modifications in the Standard Version. 93 | 94 | (b) ensure that installation of your Modified Version does not prevent the 95 | user installing or running the Standard Version. In addition, the Modified 96 | Version must bear a name that is different from the name of the Standard 97 | Version. 98 | 99 | (c) allow anyone who receives a copy of the Modified Version to make the 100 | Source form of the Modified Version available to others under 101 | 102 | (i) the Original License or 103 | 104 | (ii) a license that permits the licensee to freely copy, modify and 105 | redistribute the Modified Version using the same licensing terms that apply to 106 | the copy that the licensee received, and requires that the Source form of the 107 | Modified Version, and of any works derived from it, be made freely available 108 | in that license fees are prohibited but Distributor Fees are allowed. 109 | 110 | Distribution of Compiled Forms of the Standard Version or Modified Versions 111 | without the Source 112 | 113 | (5) You may Distribute Compiled forms of the Standard Version without the 114 | Source, provided that you include complete instructions on how to get the 115 | Source of the Standard Version. Such instructions must be valid at the time of 116 | your distribution. 
If these instructions, at any time while you are carrying 117 | out such distribution, become invalid, you must provide new instructions on 118 | demand or cease further distribution. If you provide valid instructions or 119 | cease distribution within thirty days after you become aware that the 120 | instructions are invalid, then you do not forfeit any of your rights under 121 | this license. 122 | 123 | (6) You may Distribute a Modified Version in Compiled form without the Source, 124 | provided that you comply with Section 4 with respect to the Source of the 125 | Modified Version. 126 | 127 | Aggregating or Linking the Package 128 | 129 | (7) You may aggregate the Package (either the Standard Version or Modified 130 | Version) with other packages and Distribute the resulting aggregation provided 131 | that you do not charge a licensing fee for the Package. Distributor Fees are 132 | permitted, and licensing fees for other components in the aggregation are 133 | permitted. The terms of this license apply to the use and Distribution of the 134 | Standard or Modified Versions as included in the aggregation. 135 | 136 | (8) You are permitted to link Modified and Standard Versions with other works, 137 | to embed the Package in a larger work of your own, or to build stand-alone 138 | binary or bytecode versions of applications that include the Package, and 139 | Distribute the result without restriction, provided the result does not expose 140 | a direct interface to the Package. 141 | 142 | Items That are Not Considered Part of a Modified Version 143 | 144 | (9) Works (including, but not limited to, modules and scripts) that merely 145 | extend or make use of the Package, do not, by themselves, cause the Package to 146 | be a Modified Version. In addition, such works are not considered parts of the 147 | Package itself, and are not subject to the terms of this license. 
148 | 149 | General Provisions 150 | 151 | (10) Any use, modification, and distribution of the Standard or Modified 152 | Versions is governed by this Artistic License. By using, modifying or 153 | distributing the Package, you accept this license. Do not use, modify, or 154 | distribute the Package, if you do not accept this license. 155 | 156 | (11) If your Modified Version has been derived from a Modified Version made by 157 | someone other than you, you are nevertheless required to ensure that your 158 | Modified Version complies with the requirements of this license. 159 | 160 | (12) This license does not grant you the right to use any trademark, service 161 | mark, tradename, or logo of the Copyright Holder. 162 | 163 | (13) This license includes the non-exclusive, worldwide, free-of-charge patent 164 | license to make, have made, use, offer to sell, sell, import and otherwise 165 | transfer the Package with respect to any patent claims licensable by the 166 | Copyright Holder that are necessarily infringed by the Package. If you 167 | institute patent litigation (including a cross-claim or counterclaim) against 168 | any party alleging that the Package constitutes direct or contributory patent 169 | infringement, then this Artistic License to you shall terminate on the date 170 | that such litigation is filed. 171 | 172 | (14) Disclaimer of Warranty: 173 | 174 | THE PACKAGE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS "AS IS' 175 | AND WITHOUT ANY EXPRESS OR IMPLIED WARRANTIES. THE IMPLIED WARRANTIES OF 176 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT ARE 177 | DISCLAIMED TO THE EXTENT PERMITTED BY YOUR LOCAL LAW. UNLESS REQUIRED BY LAW, 178 | NO COPYRIGHT HOLDER OR CONTRIBUTOR WILL BE LIABLE FOR ANY DIRECT, INDIRECT, 179 | INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING IN ANY WAY OUT OF THE USE OF THE 180 | PACKAGE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
# --------------------------------------------------------------------------------
# /bin/tsebra.py:
# --------------------------------------------------------------------------------
#!/usr/bin/env python3
# ==============================================================
# author: Lars Gabriel
#
# TSEBRA: Transcript Selector for BRAKER
# ==============================================================
import argparse
import sys
import os
import csv


class ConfigFileError(Exception):
    """Raised by set_parameter() when a configuration line cannot be parsed."""


class GeneSetMissing(Exception):
    """Raised by init() when neither --gtf nor --keep_gtf was given."""


# Module-level run configuration: filled in by init(), read by main().
gtf = []                  # paths given with --gtf
enforce_tx = []           # paths given with --keep_gtf
anno = []                 # one Anno object per gene set read in main()
hintfiles = []            # paths given with --hintfiles
graph = None              # overlap graph built in main()
out = ''                  # output path given with --out
v = 0                     # verbosity level given with --verbose
quiet = False
filter_sing_exon = False
ignore_tx_phase = False
scores_tab = ''           # optional path for the transcript score table
parameter = {'intron_support': 0, 'stasto_support': 0,
             'e_1': 0, 'e_2': 0, 'e_3': 0, 'e_4': 0}


def main():
    """
        Overview:

        1. Read gene predictions from .gtf files.
        2. Read Evidence from .gff files.
        3. Detect overlapping transcripts.
        4. Create feature vector (for a list of all features see features.py)
            for all transcripts.
        5. Compare the feature vectors of all pairs of overlapping transcripts.
        6. Exclude transcripts based on the 'transcript comparison rule' and 5.
        7. Remove Transcripts with low evidence support.
        8. Create combined gene predictions (all transcripts that weren't excluded).
    """

    # Project-local modules are imported lazily so that importing this module
    # (e.g. for parseCmd/set_parameter) does not require them.
    from genome_anno import Anno
    from overlap_graph import Graph
    from evidence import Evidence

    global anno, graph, parameter

    args = parseCmd()
    init(args)

    if v > 0:
        print(gtf)

    # read gene prediction files; gene sets from --keep_gtf are read after the
    # --gtf ones, share the same running 'anno<N>' numbering and are
    # additionally remembered in `keep`
    keep = []
    gene_sets = [(path, False) for path in gtf] \
        + [(path, True) for path in enforce_tx]
    for c, (g, enforced) in enumerate(gene_sets, start=1):
        if not quiet:
            sys.stderr.write(f'### READING GENE PREDICTION: [{g}]\n')
        anno_id = f'anno{c}'
        anno.append(Anno(g, anno_id))
        anno[-1].addGtf()
        anno[-1].norm_tx_format()
        if enforced:
            keep.append(anno_id)

    # read hintfiles
    evi = Evidence()
    for h in hintfiles:
        if not quiet:
            sys.stderr.write(f'### READING EXTRINSIC EVIDENCE: [{h}]\n')
        evi.add_hintfile(h)
    # every hint source needs a weight; sources missing in the cfg default to 1
    for src in evi.src:
        if src not in parameter:
            sys.stderr.write(f'ConfigError: No weight for src={src}, it is set to 1\n')
            parameter.update({src: 1})

    # create graph with a node for each unique transcript
    # and an edge if two transcripts overlap
    # two transcripts overlap if they share at least 3 adjacent protein coding nucleotides
    graph = Graph(anno, para=parameter, keep_tx=keep,
                  filter_single=filter_sing_exon,
                  ignore_phase=ignore_tx_phase, verbose=v)
    if not quiet:
        sys.stderr.write('### BUILD OVERLAP GRAPH\n')
    graph.build()

    # add features
    if not quiet:
        sys.stderr.write('### ADD FEATURES TO TRANSCRIPTS\n')
    graph.add_node_features(evi)

    # apply decision rule to exclude a set of transcripts
    if not quiet:
        sys.stderr.write('### SELECT TRANSCRIPTS\n')
    combined_prediction = graph.get_decided_graph()

    if v > 0:
        sys.stderr.write(str(combined_prediction.keys()) + '\n')
        for a in anno:
            sys.stderr.write('Numb_tx in {}: {}\n'.format(
                a.id, len(combined_prediction[a.id])))

    # write result to output file
    if not quiet:
        sys.stderr.write('### WRITE COMBINED GENE PREDICTION\n')
    combined_anno = Anno('', 'combined_annotation')
    for a in anno:
        # combined_prediction[a.id] holds (transcript id, new gene id) pairs
        selected = combined_prediction[a.id]
        txs = a.get_subset([t[0] for t in selected])
        for tx_id, new_gene_id in selected:
            txs[tx_id].set_gene_id(new_gene_id)
        combined_anno.add_transcripts(txs, a.id + '.')
    combined_anno.find_genes()
    combined_anno.write_anno(out)

    if scores_tab:
        if not quiet:
            sys.stderr.write('### WRITE TRANSCRIPT SCORES\n')
        tab_out = [['### TX_ID', 'intron_support', 'stasto_support',
                    's1', 's2', 's3', 's4']]
        for node in graph.nodes.values():
            tab_out.append([node.id] + list(node.feature_vector))
        write_csv(scores_tab, tab_out)

    if not quiet:
        sys.stderr.write('### FINISHED\n\n')
        sys.stderr.write('### The combined gene prediciton is located at {}.\n'.format(out))


def set_parameter(cfg_file):
    """
        Read parameters from the cfg file and store them in parameter.

        Each non-comment line is '<name> <value>' separated by spaces; the
        value is stored as float under <name>, overriding any existing entry.
        Blank lines, lines whose first field starts with '#', and repeated
        separators are ignored. Fields after the value are ignored.

        Args:
            cfg_file (str): Path to configuration file.

        Raises:
            ConfigFileError: If a line has no value or the value is not a number.
    """
    global parameter
    with open(cfg_file, 'r') as file:
        for line_no, line in enumerate(csv.reader(file, delimiter=' '), start=1):
            # consecutive or leading spaces produce empty fields
            fields = [field for field in line if field]
            if not fields or fields[0].startswith('#'):
                continue
            try:
                parameter[fields[0]] = float(fields[1])
            except (IndexError, ValueError) as err:
                raise ConfigFileError(
                    f'{cfg_file}, line {line_no}: expected "<name> <number>", '
                    f'got "{" ".join(line)}"') from err


def write_csv(out_path, tab):
    """
        Write table to out_path as tab separated text, one row per line.

        Args:
            out_path (str) : path to the output file
            tab (list) : table, a list of rows (each row a list of cells)
    """
    # newline='' keeps the writer's '\n' line terminator untranslated
    with open(out_path, 'w', newline='') as file:
        out_writer = csv.writer(file, delimiter='\t', quotechar="|",
                                lineterminator='\n')
        out_writer.writerows(tab)


def init(args):
    """
        Copy the parsed command line arguments into the module-level settings
        and load the configuration file.

        Args:
            args (argparse.Namespace): Result of parseCmd().

        Raises:
            GeneSetMissing: If neither --gtf nor --keep_gtf was provided.
            ConfigFileError: If the configuration file cannot be parsed.
    """
    global gtf, hintfiles, out, enforce_tx, v, scores_tab, \
        filter_sing_exon, ignore_tx_phase, quiet
    if args.gtf:
        gtf = args.gtf.split(',')
    if args.keep_gtf:
        enforce_tx = args.keep_gtf.split(',')
    if not args.keep_gtf and not args.gtf:
        raise GeneSetMissing('At least one gene set has to be provided '
                             'either with --gtf or --keep_gtf!')
    if args.hintfiles:
        hintfiles = args.hintfiles.split(',')
    if args.cfg:
        cfg_file = args.cfg
    else:
        # default configuration shipped next to the bin/ directory
        cfg_file = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                                '..', 'config', 'default.cfg')
    set_parameter(cfg_file)
    if args.score_tab:
        scores_tab = args.score_tab
    if args.filter_single_exon_genes:
        filter_sing_exon = args.filter_single_exon_genes
    if args.ignore_tx_phase:
        ignore_tx_phase = args.ignore_tx_phase
    if args.out:
        out = args.out
    if args.verbose:
        v = args.verbose
    if args.quiet:
        quiet = True


def parseCmd(argv=None):
    """Parse command line arguments

    Args:
        argv (list of str, optional): Arguments to parse instead of the
            process command line; None (default) parses sys.argv[1:].

    Returns:
        argparse.Namespace: Namespace with the parsed arguments
    """
    parser = argparse.ArgumentParser(
        description='TSEBRA: Transcript Selector for BRAKER\n\n'
                    'TSEBRA combines gene predictions by selecing '
                    'transcripts based on their extrisic evidence support.')
    parser.add_argument('-g', '--gtf', type=str,
        help='List (separated by commas) of gene prediciton files in gtf.\n'
             '(e.g. gene_pred1.gtf,gene_pred2.gtf,gene_pred3.gtf)')
    parser.add_argument('-k', '--keep_gtf', type=str,
        help='List (separated by commas) of gene prediciton files in gtf.\n'
             'These gene sets are used the same way as other inputs, but TSEBRA '
             'ensures that all transcripts from these gene sets are included in the output.')
    parser.add_argument('-e', '--hintfiles', type=str,
        help='List (separated by commas) of files containing extrinsic evidence in gff.\n'
             '(e.g. hintsfile1.gff,hintsfile2.gtf,3.gtf)')
    parser.add_argument('-c', '--cfg', type=str,
        help='Configuration file that sets the parameter for TSEBRA. '
             'You can find the recommended parameter at config/default.cfg.')
    parser.add_argument('--filter_single_exon_genes', action='store_true',
        help='Filter out all single-exon genes out that are not'
             ' supported by at least one start- or stop-codon hint.')
    parser.add_argument('--ignore_tx_phase', action='store_true',
        help='Ignore the phase of transcripts while detecting clusters '
             'of overlapping transcripts.')
    parser.add_argument('-s', '--score_tab', type=str,
        help='Prints the transcript scores as a table to the specified file.')
    parser.add_argument('-o', '--out', type=str, required=True,
        help='Outputfile for the combined gene prediciton in gtf.')
    parser.add_argument('-q', '--quiet', action='store_true',
        help='Quiet mode.')
    parser.add_argument('-v', '--verbose', type=int,
        help='')
    return parser.parse_args(argv)


if __name__ == '__main__':
    main()

# --------------------------------------------------------------------------------
# /tests/prep_files.py:
# --------------------------------------------------------------------------------
#!/usr/bin/env python3
# ==============================================================
# author: Lars Gabriel
#
# prep_files.py: create example data
# for pytests  (continuation of the prep_files.py header comment)
# ==============================================================
import os
testDir = os.path.abspath(os.path.dirname(__file__))


def genome_anno():
    """Derive two deliberately broken annotations from genome_anno/anno1.gtf.

    Writes format_error.gtf (row 1 carries only a gene_id attribute) and
    missing_gid.gtf (rows 1 and 6 and every row of transcript g11080.t1 carry
    only a transcript_id attribute) into tests/genome_anno/.
    """
    with open(testDir + '/genome_anno/anno1.gtf', 'r') as file:
        rows = [line.strip('\n').split('\t') for line in file]

    rows[1][8] = 'gene_id "7789_g";'
    with open(testDir + '/genome_anno/format_error.gtf', 'w+') as file:
        file.write(list2string(rows))

    # The second variant keeps working on the same rows; the attribute of
    # row 1 changed above is overwritten here, so nothing of the first
    # variant leaks into the second file.
    rows[1][8] = 'transcript_id "7789_t";'
    rows[6][8] = 'transcript_id "g5980.t1";'
    for row in rows:
        if 'transcript_id "g11080.t1";' in row[8]:
            row[8] = 'transcript_id "g11080.t1";'
    with open(testDir + '/genome_anno/missing_gid.gtf', 'w+') as file:
        file.write(list2string(rows))


def get_anno(tx_dict, phase):
    """Build gtf rows for synthetic transcripts on '3R', '+' strand.

    Args:
        tx_dict (dict): transcript id -> [start, len_1, len_2, ...]; the
            lengths alternate exon, intron, exon, ... starting with an exon.
        phase (str): value written to the sixth column of every row.

    Returns:
        list: one list of 9 fields per row. Per transcript: every exon
        followed by an identical CDS row, every intron (shrunk by one base on
        both sides), and finally one 'transcript' row whose last field is the
        bare transcript id.
    """
    # NOTE(review): `phase` is written to column 6 while column 8 is always
    # '0'; in GTF the frame/phase is usually column 8 -- confirm against the
    # parser and the committed fixtures before changing.
    anno = []
    for key, coord in tx_dict.items():
        attributes = 'transcript_id "{}"; gene_id "{}";'.format(key, key + '_g')
        feature = 'exon'
        pos = coord[0]
        for length in coord[1:]:
            start = pos
            pos += length
            end = pos
            if feature == 'intron':
                start += 1
                end -= 1
            row = ['3R', 'AUGUSTUS', feature, start, end, phase, '+', '0', attributes]
            anno.append(row)
            if feature == 'exon':
                cds = row.copy()
                cds[2] = 'CDS'
                anno.append(cds)
                feature = 'intron'
            else:
                feature = 'exon'
        anno.append(['3R', 'AUGUSTUS', 'transcript', str(coord[0]), str(pos),
                     phase, '+', '0', key])
    return anno


def list2string(gtf):
    """Join rows with tabs and lines with newlines (no trailing newline)."""
    return '\n'.join('\t'.join(map(str, row)) for row in gtf)


def graph():
    """Write the synthetic annotation pairs ex1..ex4 into tests/graph/."""
    out_dir = testDir + '/graph/'
    # (file name, transcripts, phase)
    examples = [
        # example 1
        ('ex1_anno1.gtf', {'t1': [100, 100, 100, 100],
                           't2': [700, 100, 100, 100, 100, 100],
                           't3': [1500, 100]}, '0'),
        ('ex1_anno2.gtf', {'t1': [250, 250, 100, 150],
                           't2': [1050, 200],
                           't3': [1700, 100]}, '0'),
        # example 2
        ('ex2_anno1.gtf', {'t1': [200, 100]}, '0'),
        ('ex2_anno2.gtf', {'t1': [100, 100],
                           't2': [301, 99]}, '0'),
        # example 3
        ('ex3_anno1.gtf', {'t1': [100, 200, 200, 200, 200, 200]}, '0'),
        ('ex3_anno2.gtf', {'t1': [110, 90, 600, 200],
                           't2': [350, 100]}, '0'),
        # example 4
        ('ex4_anno1.gtf', {'t1': [100, 100, 100, 100]}, '0'),
        ('ex4_anno2.gtf', {'t1': [101, 100, 100, 100]}, '1'),
    ]
    for file_name, txs, phase in examples:
        with open(out_dir + file_name, 'w+') as file:
            file.write(list2string(get_anno(txs, phase)))


def evidence():
    """Write the hint files hint1.gff, hint2.gff and hint3.gff into tests/evidence/."""
    out_dir = testDir + '/evidence/'
    # hint1 contains a row without src attribute and a truncated last row
    hint_test_file1 = [
        '3L\tProtHint\tintron\t5812862\t5812941\t24\t-\t.\tsrc=M;mult=24;pri=4\n',
        '3L\tProtHint\tintron\t12291242\t12291299\t8\t-\t.\ttranscript_id="t1"\n',
        '3L\tProtHint\tintron\t12291242\t12291299\t8\t-\t.\tsrc=M;pri=4\n',
        '3L\tProtHint\tintron\t12291242\t']
    with open(out_dir + 'hint1.gff', 'w+') as file:
        file.write(''.join(hint_test_file1))

    hint_test_file2 = [
        '3L\tProtHint\tintron\t5812862\t5812941\t24\t-\t.\tsrc=M;mult=24;pri=4\n',
        '3L\tProtHint\tintron\t12291242\t12291299\t8\t-\t.\tsrc=M;mult=8;pri=4\n',
        '3R\tProtHint\tintron\t17440148\t17440207\t25\t-\t.\tsrc=M;mult=25;pri=4\n',
        '2R\tProtHint\tintron\t5760114\t5760177\t23\t-\t.\tsrc=M;mult=23;pri=4\n',
        '2R\tProtHint\tintron\t6210484\t6210546\t21\t-\t.\tsrc=M;mult=21;pri=4\n',
        '3L\tProtHint\tintron\t20527281\t20527592\t25\t+\t.\tsrc=M;mult=25;pri=4\n',
        '2L\tProtHint\tintron\t12400752\t12400814\t24\t+\t.\tsrc=M;mult=24;pri=4\n',
        '2R\tProtHint\tintron\t14988084\t14988142\t25\t-\t.\tsrc=M;mult=25;pri=4\n',
        '2L\tProtHint\tintron\t6667531\t6667670\t5\t-\t.\tsrc=M;mult=5;pri=4\n',
        '3R\tProtHint\tintron\t5537551\t5537605\t22\t+\t.\tsrc=M;mult=22;pri=4\n',
        '3R\tProtHint\tintron\t20813612\t20813665\t12\t-\t.\tsrc=M;mult=12;pri=4\n',
        'X\tProtHint\tintron\t2145714\t2147174\t25\t+\t.\tsrc=M;mult=25;pri=4\n',
        '3L\tProtHint\tintron\t8114197\t8114256\t25\t-\t.\tsrc=M;mult=25;pri=4\n',
        'X\tProtHint\tintron\t11048602\t11048941\t25\t+\t.\tsrc=M;mult=25;pri=4\n',
        '2L\tProtHint\tintron\t3807462\t3807524\t18\t+\t.\tsrc=M;mult=18;pri=4\n',
        '3R\tProtHint\tintron\t27059120\t27059364\t19\t-\t.\tsrc=M;mult=19;pri=4\n',
        '2R\tProtHint\tintron\t13821370\t13821432\t24\t-\t.\tsrc=M;mult=24;pri=4\n',
        'X\tProtHint\tintron\t8173462\t8173860\t6\t-\t.\tsrc=M;mult=6;pri=4\n',
        'X\tProtHint\tintron\t13270643\t13271481\t16\t-\t.\tsrc=M;mult=16;pri=4\n',
        'X\tProtHint\tintron\t2079645\t2079714\t25\t-\t.\tsrc=M;mult=25;pri=4\n']
    with open(out_dir + 'hint2.gff', 'w+') as file:
        file.write(''.join(hint_test_file2))

    hint_test_file3 = [
        get_hint(100, 102, 'start_codon'),
        get_hint(501, 599, 'intron'),
        get_hint(501, 599, 'intron', src='P', mult=14),
        get_hint(698, 700, 'stop_codon'),
        get_hint(801, 899, 'intron'),
        get_hint(801, 899, 'intron', chr='2L'),
        get_hint(801, 899, 'intron', src='P', mult=24),
        get_hint(801, 949, 'intron'),
        get_hint(801, 899, 'intron', strand='-'),
        get_hint(1001, 1099, 'intron'),
        get_hint(1198, 1200, 'stop_codon'),
        get_hint(1601, 1699, 'intron'),
    ]
    with open(out_dir + 'hint3.gff', 'w+') as file:
        file.write('\n'.join(hint_test_file3))


def get_hint(start, end, type, strand='+', chr='3R', score=10, mult=2, pri=4, src='E'):
    """Return one tab separated hint row with source 'AUGUSTUS'.

    Args:
        start, end: coordinates written to columns 4 and 5.
        type (str): feature type (column 3).
        strand (str): strand written to column 7.
        chr (str): sequence name (column 1).
        score: value for column 6.
        mult, pri, src: values of the 'src=..;mult=..;pri=..' attribute.
    """
    # NOTE: `strand` used to be ignored ('+' was hard-coded). hint3.gff
    # regenerated by evidence() therefore now contains its strand='-' intron
    # on the minus strand; re-check expectations derived from the old file.
    att = 'src={};mult={};pri={}'.format(src, mult, pri)
    fields = [chr, 'AUGUSTUS', type, start, end, score, strand, '.', att]
    return '\t'.join(map(str, fields))


def _copy_matching_lines(src_path, dst_path, keep):
    """Write every line of src_path for which keep(line) is true to dst_path."""
    with open(src_path, 'r') as file:
        selected = [line for line in file if keep(line)]
    with open(dst_path, 'w+') as file:
        file.write(''.join(selected))


def _is_feature_hint(line):
    """True for complete 3R hint rows inside 21737000-21750000 that are not CDSpart."""
    fields = line.split('\t')
    return len(fields) > 8 \
        and int(fields[3]) >= 21737000 and int(fields[4]) <= 21750000 \
        and fields[0] == '3R' and not fields[2] == 'CDSpart'


def get_feature(example_dir='/home/lars/work/combiner/example'):
    """Extract the ex_feature_* fixtures in tests/graph/ from BRAKER results.

    Args:
        example_dir (str): directory containing braker1/ and braker2/; the
            default is the location the fixtures were originally created from.
    """
    out_dir = testDir + '/graph/'

    def mentions(*tx_ids):
        return lambda line: any(tx_id in line for tx_id in tx_ids)

    _copy_matching_lines(example_dir + '/braker1/braker_fixed.gtf',
                         out_dir + 'ex_feature_anno1.gtf',
                         mentions('g7604.t1', 'g7603.t1', 'g7605.t1'))
    _copy_matching_lines(example_dir + '/braker2/braker.gtf',
                         out_dir + 'ex_feature_anno2.gtf',
                         mentions('g7700.t1', 'g7701.t1'))
    _copy_matching_lines(example_dir + '/braker1/hintsfile.gff',
                         out_dir + 'ex_feature_hint1.gff', _is_feature_hint)
    _copy_matching_lines(example_dir + '/braker2/hintsfile.gff',
                         out_dir + 'ex_feature_hint2.gff', _is_feature_hint)


if __name__ == '__main__':
    #genome_anno()
    #graph()
    #evidence()
    get_feature()

# --------------------------------------------------------------------------------
# /bin/compleasm-LICENSE.txt:
# --------------------------------------------------------------------------------
# 1 | Apache License
# 2 | Version 2.0, January 2004
# 3 | http://www.apache.org/licenses/
# 4 |
# 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
# 6 |
# 7 | 1. Definitions.
# 8 |
# 9 | "License" shall mean the terms and conditions for use, reproduction,
# 10 | and distribution as defined by Sections 1 through 9 of this document.
# 11 |
# 12 | "Licensor" shall mean the copyright owner or entity authorized by
# 13 | the copyright owner that is granting the License.
# 14 |
# 15 | "Legal Entity" shall mean the union of the acting entity and all
# 16 | other entities that control, are controlled by, or are under common
# 17 | control with that entity.
For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [install with bioconda](http://bioconda.github.io/recipes/tsebra/README.html) 2 | [run on the European Galaxy server](https://usegalaxy.eu/root?tool_id=tsebra) 3 | 4 | # TSEBRA: Transcript Selector for BRAKER 5 | 6 |
7 |
8 |