├── bin ├── __init__.py ├── rename_gtf.py ├── fix_gtf_ids.py ├── get_longest_isoform.py ├── evidence.py ├── features.py ├── get_overlapping_genes.py ├── LICENSE.txt ├── tsebra.py ├── compleasm-LICENSE.txt ├── overlap_graph.py └── genome_anno.py ├── docs ├── .gitkeep └── TSEBRA_Logo.png ├── tests ├── __init__.py ├── graph │ ├── ex2_anno1.gtf │ ├── ex2_anno2.gtf │ ├── ex4_anno1.gtf │ ├── ex4_anno2.gtf │ ├── ex3_anno2.gtf │ ├── ex3_anno1.gtf │ ├── ex_feature_hint1.gff │ ├── ex1_anno2.gtf │ ├── ex1_anno1.gtf │ ├── ex_feature_hint2.gff │ ├── ex_feature_anno2.gtf │ └── ex_feature_anno1.gtf ├── evidence │ ├── hint1.gff │ ├── hint3.gff │ └── hint2.gff ├── genome_anno │ ├── tx1.gtf │ ├── missing_gid.gtf │ ├── format_error.gtf │ └── anno1.gtf ├── test_evidence.py ├── test_graph.py ├── test_genome_anno.py ├── combined.gtf └── prep_files.py ├── config ├── braker3.cfg ├── default.cfg ├── keep_ab_initio.cfg └── pref_braker1.cfg ├── example ├── run_tsebra_example.sh └── braker1_results │ └── hintsfile.gff └── README.md /bin/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /docs/.gitkeep: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /docs/TSEBRA_Logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Gaius-Augustus/TSEBRA/HEAD/docs/TSEBRA_Logo.png -------------------------------------------------------------------------------- /tests/graph/ex2_anno1.gtf: -------------------------------------------------------------------------------- 1 | 3R AUGUSTUS exon 200 300 0 + 0 
transcript_id "t1"; gene_id "t1_g"; 2 | 3R AUGUSTUS CDS 200 300 0 + 0 transcript_id "t1"; gene_id "t1_g"; 3 | 3R AUGUSTUS transcript 200 300 0 + 0 t1 -------------------------------------------------------------------------------- /tests/evidence/hint1.gff: -------------------------------------------------------------------------------- 1 | 3L ProtHint intron 5812862 5812941 24 - . src=M;mult=24;pri=4 2 | 3L ProtHint intron 12291242 12291299 8 - . transcript_id="t1" 3 | 3L ProtHint intron 12291242 12291299 8 - . src=M;pri=4 4 | 3L ProtHint intron 12291242 -------------------------------------------------------------------------------- /tests/graph/ex2_anno2.gtf: -------------------------------------------------------------------------------- 1 | 3R AUGUSTUS exon 100 200 0 + 0 transcript_id "t1"; gene_id "t1_g"; 2 | 3R AUGUSTUS CDS 100 200 0 + 0 transcript_id "t1"; gene_id "t1_g"; 3 | 3R AUGUSTUS transcript 100 200 0 + 0 t1 4 | 3R AUGUSTUS exon 301 400 0 + 0 transcript_id "t2"; gene_id "t2_g"; 5 | 3R AUGUSTUS CDS 301 400 0 + 0 transcript_id "t2"; gene_id "t2_g"; 6 | 3R AUGUSTUS transcript 301 400 0 + 0 t2 -------------------------------------------------------------------------------- /config/braker3.cfg: -------------------------------------------------------------------------------- 1 | # Weight for each hint source 2 | # Values have to be >= 0 3 | P 1 4 | E 20 5 | C 1 6 | M 1 7 | # Required fraction of supported introns or supported start/stop-codons for a transcript 8 | # Values have to be in [0,1] 9 | intron_support 1.0 10 | stasto_support 2 11 | # Allowed difference for each feature 12 | # Values have to be in [0,1] 13 | e_1 0.1 14 | e_2 0.5 15 | e_3 0.05 16 | e_4 0.2 -------------------------------------------------------------------------------- /config/default.cfg: -------------------------------------------------------------------------------- 1 | # Weight for each hint source 2 | # Values have to be >= 0 3 | P 1 4 | E 20 5 | C 1 6 | M 1 7 | # Required 
fraction of supported introns or supported start/stop-codons for a transcript 8 | # Values have to be in [0,1] 9 | intron_support 1.0 10 | stasto_support 2 11 | # Allowed difference for each feature 12 | # Values have to be in [0,1] 13 | e_1 0.1 14 | e_2 0.5 15 | e_3 0.05 16 | e_4 0.18 -------------------------------------------------------------------------------- /tests/graph/ex4_anno1.gtf: -------------------------------------------------------------------------------- 1 | 3R AUGUSTUS exon 100 200 0 + 0 transcript_id "t1"; gene_id "t1_g"; 2 | 3R AUGUSTUS CDS 100 200 0 + 0 transcript_id "t1"; gene_id "t1_g"; 3 | 3R AUGUSTUS intron 201 299 0 + 0 transcript_id "t1"; gene_id "t1_g"; 4 | 3R AUGUSTUS exon 300 400 0 + 0 transcript_id "t1"; gene_id "t1_g"; 5 | 3R AUGUSTUS CDS 300 400 0 + 0 transcript_id "t1"; gene_id "t1_g"; 6 | 3R AUGUSTUS transcript 100 400 0 + 0 t1 -------------------------------------------------------------------------------- /tests/graph/ex4_anno2.gtf: -------------------------------------------------------------------------------- 1 | 3R AUGUSTUS exon 101 201 1 + 0 transcript_id "t1"; gene_id "t1_g"; 2 | 3R AUGUSTUS CDS 101 201 1 + 0 transcript_id "t1"; gene_id "t1_g"; 3 | 3R AUGUSTUS intron 202 300 1 + 0 transcript_id "t1"; gene_id "t1_g"; 4 | 3R AUGUSTUS exon 301 401 1 + 0 transcript_id "t1"; gene_id "t1_g"; 5 | 3R AUGUSTUS CDS 301 401 1 + 0 transcript_id "t1"; gene_id "t1_g"; 6 | 3R AUGUSTUS transcript 101 401 1 + 0 t1 -------------------------------------------------------------------------------- /config/keep_ab_initio.cfg: -------------------------------------------------------------------------------- 1 | # Weight for each hint source 2 | # Values have to be >= 0 3 | P 0.1 4 | E 10 5 | C 5 6 | M 1 7 | # Required fraction of supported introns or supported start/stop-codons for a transcript 8 | # Values have to be in [0,1] 9 | intron_support 0 10 | stasto_support 1 11 | # Allowed difference for each feature 12 | # Values have to be in [0,1] 
13 | e_1 0.1 14 | e_2 0.5 15 | # Values have to be >0 16 | e_3 0.05 17 | e_4 0.18 -------------------------------------------------------------------------------- /config/pref_braker1.cfg: -------------------------------------------------------------------------------- 1 | # Weight for each hint source 2 | # Values have to be >= 0 3 | P 0.1 4 | E 10000 5 | C 5 6 | M 1 7 | # Required fraction of supported introns or supported start/stop-codons for a transcript 8 | # Values have to be in [0,1] 9 | intron_support 0.25 10 | stasto_support 2 11 | # Allowed difference for each feature 12 | # Values have to be in [0,1] 13 | e_1 0.25 14 | e_2 1 15 | # Values have to be >0 16 | e_3 0.05 17 | e_4 0.18 18 | -------------------------------------------------------------------------------- /tests/graph/ex3_anno2.gtf: -------------------------------------------------------------------------------- 1 | 3R AUGUSTUS exon 110 200 0 + 0 transcript_id "t1"; gene_id "t1_g"; 2 | 3R AUGUSTUS CDS 110 200 0 + 0 transcript_id "t1"; gene_id "t1_g"; 3 | 3R AUGUSTUS intron 201 799 0 + 0 transcript_id "t1"; gene_id "t1_g"; 4 | 3R AUGUSTUS exon 800 1000 0 + 0 transcript_id "t1"; gene_id "t1_g"; 5 | 3R AUGUSTUS CDS 800 1000 0 + 0 transcript_id "t1"; gene_id "t1_g"; 6 | 3R AUGUSTUS transcript 110 1000 0 + 0 t1 7 | 3R AUGUSTUS exon 350 450 0 + 0 transcript_id "t2"; gene_id "t2_g"; 8 | 3R AUGUSTUS CDS 350 450 0 + 0 transcript_id "t2"; gene_id "t2_g"; 9 | 3R AUGUSTUS transcript 350 450 0 + 0 t2 -------------------------------------------------------------------------------- /tests/graph/ex3_anno1.gtf: -------------------------------------------------------------------------------- 1 | 3R AUGUSTUS exon 100 300 0 + 0 transcript_id "t1"; gene_id "t1_g"; 2 | 3R AUGUSTUS CDS 100 300 0 + 0 transcript_id "t1"; gene_id "t1_g"; 3 | 3R AUGUSTUS intron 301 499 0 + 0 transcript_id "t1"; gene_id "t1_g"; 4 | 3R AUGUSTUS exon 500 700 0 + 0 transcript_id "t1"; gene_id "t1_g"; 5 | 3R AUGUSTUS CDS 500 700 0 + 0 
transcript_id "t1"; gene_id "t1_g"; 6 | 3R AUGUSTUS intron 701 899 0 + 0 transcript_id "t1"; gene_id "t1_g"; 7 | 3R AUGUSTUS exon 900 1100 0 + 0 transcript_id "t1"; gene_id "t1_g"; 8 | 3R AUGUSTUS CDS 900 1100 0 + 0 transcript_id "t1"; gene_id "t1_g"; 9 | 3R AUGUSTUS transcript 100 1100 0 + 0 t1 -------------------------------------------------------------------------------- /tests/genome_anno/tx1.gtf: -------------------------------------------------------------------------------- 1 | 3L GeneMark.hmm stop_codon 18462228 18462230 . - 0 gene_id "7789_g"; transcript_id "7789_t"; count "1_1"; 2 | 3L GeneMark.hmm CDS 18462228 18462540 . - 1 gene_id "7789_g"; transcript_id "7789_t"; evidence "0_1"; cds_type "Terminal"; count "2_2"; 3 | 3L GeneMark.hmm exon 18462228 18462540 0 - . gene_id "7789_g"; transcript_id "7789_t"; evidence "0_1"; cds_type "Terminal"; count "2_2"; 4 | 3L GeneMark.hmm CDS 18462719 18463068 . - 0 gene_id "7789_g"; transcript_id "7789_t"; evidence "1_0"; cds_type "Initial"; count "1_2"; 5 | 3L GeneMark.hmm exon 18462719 18463068 0 - . gene_id "7789_g"; transcript_id "7789_t"; evidence "1_0"; cds_type "Initial"; count "1_2"; 6 | -------------------------------------------------------------------------------- /tests/evidence/hint3.gff: -------------------------------------------------------------------------------- 1 | 3R AUGUSTUS start_codon 100 102 10 + . src=E;mult=2;pri=4 2 | 3R AUGUSTUS intron 501 599 10 + . src=E;mult=2;pri=4 3 | 3R AUGUSTUS intron 501 599 10 + . src=P;mult=14;pri=4 4 | 3R AUGUSTUS stop_codon 698 700 10 + . src=E;mult=2;pri=4 5 | 3R AUGUSTUS intron 801 899 10 + . src=E;mult=2;pri=4 6 | 2L AUGUSTUS intron 801 899 10 + . src=E;mult=2;pri=4 7 | 3R AUGUSTUS intron 801 899 10 + . src=P;mult=24;pri=4 8 | 3R AUGUSTUS intron 801 949 10 + . src=E;mult=2;pri=4 9 | 3R AUGUSTUS intron 801 899 10 + . src=E;mult=2;pri=4 10 | 3R AUGUSTUS intron 1001 1099 10 + . src=E;mult=2;pri=4 11 | 3R AUGUSTUS stop_codon 1198 1200 10 + . 
src=E;mult=2;pri=4 12 | 3R AUGUSTUS intron 1601 1699 10 + . src=E;mult=2;pri=4 -------------------------------------------------------------------------------- /tests/graph/ex_feature_hint1.gff: -------------------------------------------------------------------------------- 1 | 3R b2h intron 21737122 21737185 6 - . mult=6;pri=4;src=E 2 | 3R b2h intron 21738629 21738695 42 - . mult=42;pri=4;src=E 3 | 3R b2h intron 21738939 21739000 30 - . mult=30;pri=4;src=E 4 | 3R b2h intron 21740644 21741666 4 + . mult=4;pri=4;src=E 5 | 3R b2h intron 21741826 21741884 12 + . mult=12;pri=4;src=E 6 | 3R b2h intron 21742360 21742427 2 + . mult=2;pri=4;src=E 7 | 3R b2h intron 21743988 21744047 2 + . mult=2;pri=4;src=E 8 | 3R b2h intron 21745856 21746185 166 + . mult=166;pri=4;src=E 9 | 3R b2h intron 21746342 21746473 196 + . mult=196;pri=4;src=E 10 | 3R b2h intron 21747188 21747389 200 + . mult=200;pri=4;src=E 11 | 3R b2h intron 21748618 21748687 340 + . mult=340;pri=4;src=E 12 | -------------------------------------------------------------------------------- /tests/graph/ex1_anno2.gtf: -------------------------------------------------------------------------------- 1 | 3R AUGUSTUS exon 250 500 0 + 0 transcript_id "t1"; gene_id "t1_g"; 2 | 3R AUGUSTUS CDS 250 500 0 + 0 transcript_id "t1"; gene_id "t1_g"; 3 | 3R AUGUSTUS intron 501 599 0 + 0 transcript_id "t1"; gene_id "t1_g"; 4 | 3R AUGUSTUS exon 600 750 0 + 0 transcript_id "t1"; gene_id "t1_g"; 5 | 3R AUGUSTUS CDS 600 750 0 + 0 transcript_id "t1"; gene_id "t1_g"; 6 | 3R AUGUSTUS transcript 250 750 0 + 0 t1 7 | 3R AUGUSTUS exon 1050 1250 0 + 0 transcript_id "t2"; gene_id "t2_g"; 8 | 3R AUGUSTUS CDS 1050 1250 0 + 0 transcript_id "t2"; gene_id "t2_g"; 9 | 3R AUGUSTUS transcript 1050 1250 0 + 0 t2 10 | 3R AUGUSTUS exon 1700 1800 0 + 0 transcript_id "t3"; gene_id "t3_g"; 11 | 3R AUGUSTUS CDS 1700 1800 0 + 0 transcript_id "t3"; gene_id "t3_g"; 12 | 3R AUGUSTUS transcript 1700 1800 0 + 0 t3 
#!/bin/sh
# Example run of TSEBRA: combines the BRAKER1 and BRAKER2 predictions that
# are expected next to this script and writes the result to tsebra_workdir/.
#
# If this file is not executable run: chmod +x run_tsebra_example.sh

# Abort on the first failing command so TSEBRA is never run on the output
# of a failed ID-fixing step.
set -e

# Directory containing this script. dirname also yields "." when the script
# is started as "sh run_tsebra_example.sh" (no slash in $0), a case in which
# "${0%/*}" would return the script name instead of a directory.
c=$(dirname -- "$0")

# Prediction and hint files that are included in the standard output of a BRAKER run
b1="$c/braker1_results/braker.gtf"
b2="$c/braker2_results/braker.gtf"
h1="$c/braker1_results/hintsfile.gff"
h2="$c/braker2_results/hintsfile.gff"

# Create working directory
d="$c/tsebra_workdir"
mkdir -p "$d"

# Make sure that the transcript IDs of the BRAKER predictions are in order.
# This step is OPTIONAL and not necessary for a successful combination.

# printf is used instead of echo because the handling of "\n" by echo
# differs between /bin/sh implementations.
printf '\n*** Fix possible ID errors in *.gtf files ***\n\n'

new_b1="$d/braker1.gtf"
new_b2="$d/braker2.gtf"
"$c/../bin/fix_gtf_ids.py" --gtf "$b1" --out "$new_b1"
"$c/../bin/fix_gtf_ids.py" --gtf "$b2" --out "$new_b2"
b1="$new_b1"
b2="$new_b2"

# Combine BRAKER1 and BRAKER2 predictions

o="$d/braker1+2.gtf"

printf '*** Running TSEBRA ***\n\n'

"$c/../bin/tsebra.py" -g "$b1,$b2" -c "$c/../config/default.cfg" -e "$h1,$h2" -o "$o"

printf '\n*** Finished. Result at: %s ***\n\n' "$o"
3R AUGUSTUS intron 801 899 0 + 0 transcript_id "t2"; gene_id "t2_g"; 10 | 3R AUGUSTUS exon 900 1000 0 + 0 transcript_id "t2"; gene_id "t2_g"; 11 | 3R AUGUSTUS CDS 900 1000 0 + 0 transcript_id "t2"; gene_id "t2_g"; 12 | 3R AUGUSTUS intron 1001 1099 0 + 0 transcript_id "t2"; gene_id "t2_g"; 13 | 3R AUGUSTUS exon 1100 1200 0 + 0 transcript_id "t2"; gene_id "t2_g"; 14 | 3R AUGUSTUS CDS 1100 1200 0 + 0 transcript_id "t2"; gene_id "t2_g"; 15 | 3R AUGUSTUS transcript 700 1200 0 + 0 t2 16 | 3R AUGUSTUS exon 1500 1600 0 + 0 transcript_id "t3"; gene_id "t3_g"; 17 | 3R AUGUSTUS CDS 1500 1600 0 + 0 transcript_id "t3"; gene_id "t3_g"; 18 | 3R AUGUSTUS transcript 1500 1600 0 + 0 t3 -------------------------------------------------------------------------------- /tests/evidence/hint2.gff: -------------------------------------------------------------------------------- 1 | 3L ProtHint intron 5812862 5812941 24 - . src=M;mult=24;pri=4 2 | 3L ProtHint intron 12291242 12291299 8 - . src=M;mult=8;pri=4 3 | 3R ProtHint intron 17440148 17440207 25 - . src=M;mult=25;pri=4 4 | 2R ProtHint intron 5760114 5760177 23 - . src=M;mult=23;pri=4 5 | 2R ProtHint intron 6210484 6210546 21 - . src=M;mult=21;pri=4 6 | 3L ProtHint intron 20527281 20527592 25 + . src=M;mult=25;pri=4 7 | 2L ProtHint intron 12400752 12400814 24 + . src=M;mult=24;pri=4 8 | 2R ProtHint intron 14988084 14988142 25 - . src=M;mult=25;pri=4 9 | 2L ProtHint intron 6667531 6667670 5 - . src=M;mult=5;pri=4 10 | 3R ProtHint intron 5537551 5537605 22 + . src=M;mult=22;pri=4 11 | 3R ProtHint intron 20813612 20813665 12 - . src=M;mult=12;pri=4 12 | X ProtHint intron 2145714 2147174 25 + . src=M;mult=25;pri=4 13 | 3L ProtHint intron 8114197 8114256 25 - . src=M;mult=25;pri=4 14 | X ProtHint intron 11048602 11048941 25 + . src=M;mult=25;pri=4 15 | 2L ProtHint intron 3807462 3807524 18 + . src=M;mult=18;pri=4 16 | 3R ProtHint intron 27059120 27059364 19 - . src=M;mult=19;pri=4 17 | 2R ProtHint intron 13821370 13821432 24 - . 
src=M;mult=24;pri=4 18 | X ProtHint intron 8173462 8173860 6 - . src=M;mult=6;pri=4 19 | X ProtHint intron 13270643 13271481 16 - . src=M;mult=16;pri=4 20 | X ProtHint intron 2079645 2079714 25 - . src=M;mult=25;pri=4 21 | -------------------------------------------------------------------------------- /tests/graph/ex_feature_hint2.gff: -------------------------------------------------------------------------------- 1 | 3R ProtHint intron 21747188 21747389 16 + . src=M;mult=16;pri=4 2 | 3R ProtHint intron 21742667 21742741 9 + . src=M;mult=9;pri=4 3 | 3R ProtHint intron 21742360 21742427 10 + . src=M;mult=10;pri=4 4 | 3R ProtHint intron 21745856 21746185 18 + . src=M;mult=18;pri=4 5 | 3R ProtHint intron 21740644 21741666 8 + . src=M;mult=8;pri=4 6 | 3R ProtHint intron 21740644 21741666 1 + . grp=7375_0:000e30_g7706;src=C;pri=4; 7 | 3R ProtHint intron 21741826 21741884 1 + . grp=7375_0:000e30_g7706;src=C;pri=4; 8 | 3R ProtHint intron 21742360 21742427 1 + . grp=7375_0:000e30_g7706;src=C;pri=4; 9 | 3R ProtHint intron 21742667 21742741 1 + . grp=7375_0:000e30_g7706;src=C;pri=4; 10 | 3R ProtHint intron 21743988 21744047 1 + . grp=7375_0:000e30_g7706;src=C;pri=4; 11 | 3R ProtHint intron 21745856 21746185 1 + . grp=7375_0:000e30_g7706;src=C;pri=4; 12 | 3R ProtHint intron 21746342 21746473 1 + . grp=7375_0:000e30_g7706;src=C;pri=4; 13 | 3R ProtHint intron 21747188 21747389 1 + . grp=7375_0:000e30_g7706;src=C;pri=4; 14 | 3R ProtHint intron 21748618 21748687 1 + . grp=7375_0:000e30_g7706;src=C;pri=4; 15 | 3R ProtHint intron 21743988 21744047 0 + . src=P;mult=2;pri=4; 16 | 3R ProtHint intron 21746342 21746473 2 + . src=P;mult=14;pri=4; 17 | 3R ProtHint intron 21741826 21741884 0 + . src=P;mult=3;pri=4; 18 | 3R ProtHint intron 21747188 21747389 2 + . src=P;mult=16;pri=4; 19 | 3R ProtHint intron 21742667 21742741 2 + . src=P;mult=9;pri=4; 20 | 3R ProtHint intron 21742360 21742427 2 + . src=P;mult=10;pri=4; 21 | 3R ProtHint intron 21745856 21746185 2 + . 
def compare_lists(list1, list2):
    """Assert that two lists of components are equal up to ordering.

    Both the order of the components and the order of the elements inside
    a component are ignored; every component is compared as a set.

    Args:
        list1: Iterable-of-iterables with the expected components.
        list2: Iterable-of-iterables with the components to check.

    Raises:
        AssertionError: If the lists differ in length or a component of
            ``list1`` has no (still unmatched) counterpart in ``list2``.
    """
    assert len(list1) == len(list2)
    unmatched = [set(component) for component in list2]
    for component in list1:
        component = set(component)
        assert component in unmatched
        # Consume the match: a plain membership test would let a repeated
        # component in list1 match the same component of list2 twice and
        # hide a component that only exists in list2.
        unmatched.remove(component)
def main():
    """Rename the transcripts and genes of a GTF file.

    Loads the annotation given by ``--gtf``, renames its transcript IDs
    with the optional ``--prefix`` and writes the annotation to ``--out``.
    If ``--translation_tab`` is set, the table returned by
    ``Anno.rename_tx_ids`` is written to that path as tab-separated rows.

    Raises:
        FileNotFound: If the path passed via ``--gtf`` does not exist.
    """
    # Parse the command line exactly once, and before the project import,
    # so that argument errors and --help do not depend on genome_anno.
    args = parseCmd()
    from genome_anno import Anno

    if not os.path.exists(args.gtf):
        raise FileNotFound('File not found: {}'.format(args.gtf))
    # --prefix is optional; fall back to an empty prefix.
    prefix = args.prefix or ''

    anno = Anno(args.gtf, id='')
    anno.addGtf()
    anno.norm_tx_format()
    anno.find_genes()
    tx_tab = anno.rename_tx_ids(prefix)
    anno.write_anno(args.out)
    if args.translation_tab:
        with open(args.translation_tab, 'w') as file:
            out_writer = csv.writer(file, delimiter='\t', quotechar="|",
                                    lineterminator='\n')
            out_writer.writerows(tx_tab)
def _prefix_attribute_id(attributes, key, prefix):
    """Return *attributes* with ``prefix_`` put in front of the ID of *key*.

    Only the first ``key "<id>";`` entry is rewritten; everything before and
    after it is kept unchanged.

    Raises:
        FormatError: If *attributes* contains no ``key "`` entry.
    """
    marker = '{} "'.format(key)
    head, found, tail = attributes.partition(marker)
    if not found:
        raise FormatError('Attribute {} is missing'.format(key))
    # partition keeps the complete remainder of the column, so further
    # attributes whose name ends in the same key (and the trailing newline)
    # are preserved instead of being cut off.
    old_id, _, rest = tail.partition('";')
    return '{}{}{}_{}";{}'.format(head, marker, prefix, old_id, rest)


def main():
    """Make transcript and gene IDs unique per chromosome and strand.

    Every 9-column line of the GTF file given by ``--gtf`` gets its
    ``transcript_id`` and ``gene_id`` replaced by ``<chr><strand>_<oldID>``
    (spaces removed from the prefix). ``gene`` and ``transcript`` lines and
    lines that do not have 9 tab-separated columns are not written to the
    output file ``--out``.

    Raises:
        FormatError: If a feature line has no ``transcript_id`` or
            ``gene_id`` attribute.
    """
    args = parseCmd()
    fixed_lines = []
    with open(args.gtf, 'r') as file:
        # Iterate the file lazily instead of loading it with readlines().
        for line_no, raw_line in enumerate(file, start=1):
            line = raw_line.split('\t')
            if len(line) != 9 or line[2] in ('gene', 'transcript'):
                continue
            id_prefix = (line[0] + line[6]).replace(' ', '')
            try:
                for key in ('transcript_id', 'gene_id'):
                    line[8] = _prefix_attribute_id(line[8], key, id_prefix)
            except FormatError as err:
                raise FormatError('{} in line {} of {}'.format(
                    err, line_no, args.gtf)) from None
            fixed_lines.append('\t'.join(line))
    # Collecting the lines in a list avoids quadratic string concatenation.
    with open(args.out, 'w') as file:
        file.writelines(fixed_lines)
transcript_id "g7701.t1"; gene_id "g7701"; 14 | 3R AUGUSTUS CDS 21742742 21743987 1 + 1 transcript_id "g7701.t1"; gene_id "g7701"; 15 | 3R AUGUSTUS exon 21742742 21743987 . + . transcript_id "g7701.t1"; gene_id "g7701"; 16 | 3R AUGUSTUS intron 21743988 21744047 1 + . transcript_id "g7701.t1"; gene_id "g7701"; 17 | 3R AUGUSTUS CDS 21744048 21744355 0.52 + 0 transcript_id "g7701.t1"; gene_id "g7701"; 18 | 3R AUGUSTUS exon 21744048 21744355 . + . transcript_id "g7701.t1"; gene_id "g7701"; 19 | 3R AUGUSTUS intron 21744356 21745282 0.52 + . transcript_id "g7701.t1"; gene_id "g7701"; 20 | 3R AUGUSTUS CDS 21745283 21745855 0.53 + 1 transcript_id "g7701.t1"; gene_id "g7701"; 21 | 3R AUGUSTUS exon 21745283 21745855 . + . transcript_id "g7701.t1"; gene_id "g7701"; 22 | 3R AUGUSTUS intron 21745856 21746185 1 + . transcript_id "g7701.t1"; gene_id "g7701"; 23 | 3R AUGUSTUS CDS 21746186 21746341 1 + 1 transcript_id "g7701.t1"; gene_id "g7701"; 24 | 3R AUGUSTUS exon 21746186 21746341 . + . transcript_id "g7701.t1"; gene_id "g7701"; 25 | 3R AUGUSTUS intron 21746342 21746473 1 + . transcript_id "g7701.t1"; gene_id "g7701"; 26 | 3R AUGUSTUS CDS 21746474 21747187 1 + 1 transcript_id "g7701.t1"; gene_id "g7701"; 27 | 3R AUGUSTUS exon 21746474 21747187 . + . transcript_id "g7701.t1"; gene_id "g7701"; 28 | 3R AUGUSTUS intron 21747188 21747389 1 + . transcript_id "g7701.t1"; gene_id "g7701"; 29 | 3R AUGUSTUS CDS 21747390 21748617 1 + 1 transcript_id "g7701.t1"; gene_id "g7701"; 30 | 3R AUGUSTUS exon 21747390 21748617 . + . transcript_id "g7701.t1"; gene_id "g7701"; 31 | 3R AUGUSTUS intron 21748618 21748687 1 + . transcript_id "g7701.t1"; gene_id "g7701"; 32 | 3R AUGUSTUS CDS 21748688 21748924 1 + 0 transcript_id "g7701.t1"; gene_id "g7701"; 33 | 3R AUGUSTUS transcript 21740168 21748924 0.52 + . g7701.t1 34 | 3R AUGUSTUS exon 21748688 21748924 . + . transcript_id "g7701.t1"; gene_id "g7701"; 35 | 3R AUGUSTUS stop_codon 21748922 21748924 . 
@pytest.fixture
def file_tx1():
    """Return all rows of the tab-separated file ``tx1`` as lists of strings."""
    with open(tx1, 'r') as handle:
        return list(csv.reader(handle, delimiter='\t'))
def test_anno_read_file(anno_anno1, file_anno1):
    """Check that the loaded annotation reproduces the lines of its input file.

    Only the first eight columns are compared, with the values returned by
    ``get_gtf()`` converted to strings; the attribute column is ignored.
    """
    gtf_anno = [list(map(str, g[:8])) for g in anno_anno1.get_gtf()]
    file_anno1 = [f[:8] for f in file_anno1]
    assert len(gtf_anno) == len(file_anno1)
    # No debug printing here: pytest already reports the offending line and
    # the compared list when the assertion fails.
    for line in file_anno1:
        assert line in gtf_anno
if __name__ == '__main__':
    # Running this module as a script hands it to pytest and propagates
    # pytest's exit status. The previous body called os.mkdir() on a name
    # that is never defined in this module, so it always raised NameError.
    sys.exit(pytest.main([__file__]))
def parseCmd(argv=None):
    """Parse command line arguments

    Args:
        argv (list(str)): Arguments to parse. With the default None,
            argparse reads the arguments of the running process.

    Returns:
        argparse.Namespace: Parsed arguments with the attributes
            gtf, out, quiet and verbose.
    """
    parser = argparse.ArgumentParser(description='Combine gene sets by choosing ' \
        'the isoform with the longest coding sequence for each gene locus.')
    parser.add_argument('-g', '--gtf', type=str, required=True,
        help='List (separated by commas) of gene prediction files in gtf.\n' \
            + '(e.g. gene_pred1.gtf,gene_pred2.gtf,gene_pred3.gtf)')
    parser.add_argument('-o', '--out', type=str, required=True,
        help='Output file for the combined gene prediction in gtf.')
    parser.add_argument('-q', '--quiet', action='store_true',
        help='Quiet mode.')
    # default=0 keeps the attribute an int even when -v is not given.
    parser.add_argument('-v', '--verbose', type=int, default=0,
        help='Verbosity level (integer, default: 0).')
    return parser.parse_args(argv)
transcript_id "g7604.t1"; gene_id "g7604"; 17 | 3R AUGUSTUS CDS 21741667 21741825 1 + 1 transcript_id "g7604.t1"; gene_id "g7604"; 18 | 3R AUGUSTUS exon 21741667 21741825 . + . transcript_id "g7604.t1"; gene_id "g7604"; 19 | 3R AUGUSTUS intron 21741826 21741884 1 + . transcript_id "g7604.t1"; gene_id "g7604"; 20 | 3R AUGUSTUS CDS 21741885 21742359 1 + 1 transcript_id "g7604.t1"; gene_id "g7604"; 21 | 3R AUGUSTUS exon 21741885 21742359 . + . transcript_id "g7604.t1"; gene_id "g7604"; 22 | 3R AUGUSTUS intron 21742360 21742427 1 + . transcript_id "g7604.t1"; gene_id "g7604"; 23 | 3R AUGUSTUS CDS 21742428 21742666 1 + 0 transcript_id "g7604.t1"; gene_id "g7604"; 24 | 3R AUGUSTUS exon 21742428 21742666 . + . transcript_id "g7604.t1"; gene_id "g7604"; 25 | 3R AUGUSTUS intron 21742667 21742741 0.84 + . transcript_id "g7604.t1"; gene_id "g7604"; 26 | 3R AUGUSTUS CDS 21742742 21743987 0.79 + 1 transcript_id "g7604.t1"; gene_id "g7604"; 27 | 3R AUGUSTUS exon 21742742 21743987 . + . transcript_id "g7604.t1"; gene_id "g7604"; 28 | 3R AUGUSTUS intron 21743988 21744047 1 + . transcript_id "g7604.t1"; gene_id "g7604"; 29 | 3R AUGUSTUS CDS 21744048 21744359 0.68 + 0 transcript_id "g7604.t1"; gene_id "g7604"; 30 | 3R AUGUSTUS exon 21744048 21744359 . + . transcript_id "g7604.t1"; gene_id "g7604"; 31 | 3R AUGUSTUS stop_codon 21744357 21744359 . + 0 transcript_id "g7604.t1"; gene_id "g7604"; 32 | 3R AUGUSTUS start_codon 21745305 21745307 . + 0 transcript_id "g7605.t1"; gene_id "g7605"; 33 | 3R AUGUSTUS CDS 21745305 21745855 0.69 + 0 transcript_id "g7605.t1"; gene_id "g7605"; 34 | 3R AUGUSTUS exon 21745305 21745855 . + . transcript_id "g7605.t1"; gene_id "g7605"; 35 | 3R AUGUSTUS transcript 21745305 21748924 0.49 + . g7605.t1 36 | 3R AUGUSTUS intron 21745856 21746185 1 + . transcript_id "g7605.t1"; gene_id "g7605"; 37 | 3R AUGUSTUS CDS 21746186 21746341 1 + 1 transcript_id "g7605.t1"; gene_id "g7605"; 38 | 3R AUGUSTUS exon 21746186 21746341 . + . 
transcript_id "g7605.t1"; gene_id "g7605"; 39 | 3R AUGUSTUS intron 21746342 21746473 1 + . transcript_id "g7605.t1"; gene_id "g7605"; 40 | 3R AUGUSTUS CDS 21746474 21747187 1 + 1 transcript_id "g7605.t1"; gene_id "g7605"; 41 | 3R AUGUSTUS exon 21746474 21747187 . + . transcript_id "g7605.t1"; gene_id "g7605"; 42 | 3R AUGUSTUS intron 21747188 21747389 1 + . transcript_id "g7605.t1"; gene_id "g7605"; 43 | 3R AUGUSTUS CDS 21747390 21748617 1 + 1 transcript_id "g7605.t1"; gene_id "g7605"; 44 | 3R AUGUSTUS exon 21747390 21748617 . + . transcript_id "g7605.t1"; gene_id "g7605"; 45 | 3R AUGUSTUS intron 21748618 21748687 1 + . transcript_id "g7605.t1"; gene_id "g7605"; 46 | 3R AUGUSTUS CDS 21748688 21748924 0.71 + 0 transcript_id "g7605.t1"; gene_id "g7605"; 47 | 3R AUGUSTUS exon 21748688 21748924 . + . transcript_id "g7605.t1"; gene_id "g7605"; 48 | 3R AUGUSTUS stop_codon 21748922 21748924 . + 0 transcript_id "g7605.t1"; gene_id "g7605"; -------------------------------------------------------------------------------- /tests/graph/ex_feature_anno1.gtf: -------------------------------------------------------------------------------- 1 | 3R AUGUSTUS stop_codon 21737497 21737499 . - 0 transcript_id "g7603.t1"; gene_id "g7603"; 2 | 3R AUGUSTUS CDS 21737497 21737706 0.99 - 0 transcript_id "g7603.t1"; gene_id "g7603"; 3 | 3R AUGUSTUS exon 21737497 21737706 . - . transcript_id "g7603.t1"; gene_id "g7603"; 4 | 3R AUGUSTUS transcript 21737497 21738709 0.98 - . g7603.t1 5 | 3R AUGUSTUS intron 21737707 21738606 0.99 - . transcript_id "g7603.t1"; gene_id "g7603"; 6 | 3R AUGUSTUS CDS 21738607 21738628 0.99 - 1 transcript_id "g7603.t1"; gene_id "g7603"; 7 | 3R AUGUSTUS exon 21738607 21738628 . - . transcript_id "g7603.t1"; gene_id "g7603"; 8 | 3R AUGUSTUS intron 21738629 21738695 1 - . transcript_id "g7603.t1"; gene_id "g7603"; 9 | 3R AUGUSTUS CDS 21738696 21738709 1 - 0 transcript_id "g7603.t1"; gene_id "g7603"; 10 | 3R AUGUSTUS exon 21738696 21738709 . - . 
transcript_id "g7603.t1"; gene_id "g7603"; 11 | 3R AUGUSTUS start_codon 21738707 21738709 . - 0 transcript_id "g7603.t1"; gene_id "g7603"; 12 | 3R AUGUSTUS start_codon 21740168 21740170 . + 0 transcript_id "g7604.t1"; gene_id "g7604"; 13 | 3R AUGUSTUS CDS 21740168 21740643 1 + 0 transcript_id "g7604.t1"; gene_id "g7604"; 14 | 3R AUGUSTUS exon 21740168 21740643 . + . transcript_id "g7604.t1"; gene_id "g7604"; 15 | 3R AUGUSTUS transcript 21740168 21744359 0.53 + . g7604.t1 16 | 3R AUGUSTUS intron 21740644 21741666 1 + . transcript_id "g7604.t1"; gene_id "g7604"; 17 | 3R AUGUSTUS CDS 21741667 21741825 1 + 1 transcript_id "g7604.t1"; gene_id "g7604"; 18 | 3R AUGUSTUS exon 21741667 21741825 . + . transcript_id "g7604.t1"; gene_id "g7604"; 19 | 3R AUGUSTUS intron 21741826 21741884 1 + . transcript_id "g7604.t1"; gene_id "g7604"; 20 | 3R AUGUSTUS CDS 21741885 21742359 1 + 1 transcript_id "g7604.t1"; gene_id "g7604"; 21 | 3R AUGUSTUS exon 21741885 21742359 . + . transcript_id "g7604.t1"; gene_id "g7604"; 22 | 3R AUGUSTUS intron 21742360 21742427 1 + . transcript_id "g7604.t1"; gene_id "g7604"; 23 | 3R AUGUSTUS CDS 21742428 21742666 1 + 0 transcript_id "g7604.t1"; gene_id "g7604"; 24 | 3R AUGUSTUS exon 21742428 21742666 . + . transcript_id "g7604.t1"; gene_id "g7604"; 25 | 3R AUGUSTUS intron 21742667 21742741 0.84 + . transcript_id "g7604.t1"; gene_id "g7604"; 26 | 3R AUGUSTUS CDS 21742742 21743987 0.79 + 1 transcript_id "g7604.t1"; gene_id "g7604"; 27 | 3R AUGUSTUS exon 21742742 21743987 . + . transcript_id "g7604.t1"; gene_id "g7604"; 28 | 3R AUGUSTUS intron 21743988 21744047 1 + . transcript_id "g7604.t1"; gene_id "g7604"; 29 | 3R AUGUSTUS CDS 21744048 21744359 0.68 + 0 transcript_id "g7604.t1"; gene_id "g7604"; 30 | 3R AUGUSTUS exon 21744048 21744359 . + . transcript_id "g7604.t1"; gene_id "g7604"; 31 | 3R AUGUSTUS stop_codon 21744357 21744359 . + 0 transcript_id "g7604.t1"; gene_id "g7604"; 32 | 3R AUGUSTUS start_codon 21745305 21745307 . 
+ 0 transcript_id "g7605.t1"; gene_id "g7605"; 33 | 3R AUGUSTUS CDS 21745305 21745855 0.69 + 0 transcript_id "g7605.t1"; gene_id "g7605"; 34 | 3R AUGUSTUS exon 21745305 21745855 . + . transcript_id "g7605.t1"; gene_id "g7605"; 35 | 3R AUGUSTUS transcript 21745305 21748924 0.49 + . g7605.t1 36 | 3R AUGUSTUS intron 21745856 21746185 1 + . transcript_id "g7605.t1"; gene_id "g7605"; 37 | 3R AUGUSTUS CDS 21746186 21746341 1 + 1 transcript_id "g7605.t1"; gene_id "g7605"; 38 | 3R AUGUSTUS exon 21746186 21746341 . + . transcript_id "g7605.t1"; gene_id "g7605"; 39 | 3R AUGUSTUS intron 21746342 21746473 1 + . transcript_id "g7605.t1"; gene_id "g7605"; 40 | 3R AUGUSTUS CDS 21746474 21747187 1 + 1 transcript_id "g7605.t1"; gene_id "g7605"; 41 | 3R AUGUSTUS exon 21746474 21747187 . + . transcript_id "g7605.t1"; gene_id "g7605"; 42 | 3R AUGUSTUS intron 21747188 21747389 1 + . transcript_id "g7605.t1"; gene_id "g7605"; 43 | 3R AUGUSTUS CDS 21747390 21748617 1 + 1 transcript_id "g7605.t1"; gene_id "g7605"; 44 | 3R AUGUSTUS exon 21747390 21748617 . + . transcript_id "g7605.t1"; gene_id "g7605"; 45 | 3R AUGUSTUS intron 21748618 21748687 1 + . transcript_id "g7605.t1"; gene_id "g7605"; 46 | 3R AUGUSTUS CDS 21748688 21748924 0.71 + 0 transcript_id "g7605.t1"; gene_id "g7605"; 47 | 3R AUGUSTUS exon 21748688 21748924 . + . transcript_id "g7605.t1"; gene_id "g7605"; 48 | 3R AUGUSTUS stop_codon 21748922 21748924 . 
+ 0 transcript_id "g7605.t1"; gene_id "g7605"; 49 | -------------------------------------------------------------------------------- /bin/evidence.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # ============================================================== 3 | # author: Lars Gabriel 4 | # 5 | # evdence.py: Handles the extrinsic evidence from the hintfiles 6 | # ============================================================== 7 | import csv 8 | 9 | class NotGtfFormat(Exception): 10 | pass 11 | 12 | class AttributeMissing(Exception): 13 | pass 14 | 15 | class Hint: 16 | """ 17 | Class handling the data structures and methods for a hint 18 | """ 19 | def __init__(self, line): 20 | """ 21 | Create a hint from a gff line. The line has to include 'src=' as 22 | an attribute in the last column. Only introns, start/stop codons 23 | are used. 24 | 25 | Args: 26 | line (list(str)): GFF line for one hint from extrinsic evidence. 27 | """ 28 | if not len(line) == 9: 29 | raise NotGtfFormat('File not in gtf Format. Error at line: {}'.format(line)) 30 | self.chr, self.source_program, self.type, self.start, self.end, \ 31 | self.score, self.strand, self.phase, attribute = line 32 | self.start = int(self.start) 33 | self.end = int(self.end) 34 | 35 | try: 36 | self.src = attribute.split('src=')[1].split(';')[0] 37 | except IndexError: 38 | raise AttributeMissing('Source of Hint is missing in line {}.'.format(line)) 39 | self.score = float(self.score) 40 | self.mult = 1 41 | if 'mult=' in attribute: 42 | self.mult = int(attribute.split('mult=')[1].split(';')[0]) 43 | 44 | self.pri = '' 45 | if 'pri=' in attribute: 46 | self.pri = attribute.split('pri=')[1].split(';')[0] 47 | 48 | if self.type == 'stop_codon': 49 | self.type = 'stop' 50 | elif self.type == 'start_codon': 51 | self.type = 'start' 52 | 53 | def hint2list(self): 54 | """ 55 | Returns: 56 | line (list(str)): GFF line for the hint. 
class Hintfile:
    """
        Class handling the data structures and methods for a hintfile
    """
    def __init__(self, path):
        """
            Read the hintfile at path into self.hints and self.src.

            Args:
                path (str): Path to the hintfile.
        """
        # dictionary containing evidence
        # self.hints[chromosome_id] = [Hints()]
        self.hints = {}
        # dictionary with self.src[src] = sum_of_all_mults_of_hints_from_src
        self.src = {}
        self.read_file(path)

    def read_file(self, path):
        """
            Read a tab separated gff file with hints and add one Hint
            per data line to self.hints, grouped by chromosome.
            The multiplicity of each hint is added to self.src[hint.src].
            Blank lines and lines starting with '#' are skipped.

            Args:
                path (str): Path to the hintfile.
        """
        with open(path, 'r') as file:
            hints_csv = csv.reader(file, delimiter='\t')
            for line in hints_csv:
                # csv.reader yields an empty list for a blank line, so the
                # emptiness check has to come before looking at line[0].
                if not line or line[0].startswith('#'):
                    continue
                new_hint = Hint(line)
                self.hints.setdefault(new_hint.chr, []).append(new_hint)
                self.src[new_hint.src] = self.src.get(new_hint.src, 0) \
                    + new_hint.mult
class Node_features:
    """
        Class handling the features for a transcript.
        Features are scores that characterize the support of the transcript
        by extrinsic evidence in different ways.
    """
    def __init__(self, tx, evi, hint_source_weight=None):
        """
            Args:
                tx (Transcript): Transcript class object containing a transcript.
                evi (Evidence): Evidence class object containing all extrinsic evidence.
                hint_source_weight (dict(int)): Weights for each evidence source.
                    Defaults to {'P' : 1, 'E' : 20, 'C' : 1, 'M' : 1}.
        """
        # The default is created per instance. A dict literal in the
        # signature would be one object shared by every instance, so a
        # change to self.sw of one node would leak into all later nodes.
        if hint_source_weight is None:
            hint_source_weight = {'P' : 1, 'E' : 20, 'C' : 1, 'M' : 1}
        self.sw = hint_source_weight
        self.scores = []
        # small constant that keeps the argument of the logarithm positive
        self.epsi = 1e-5
        # self.evi_list[feature_type] = [{src : multiplicity}, ...]
        self.evi_list = {'intron' : [], 'start_codon' : [], 'stop_codon': []}
        self.numb_introns = 0
        self.__init_hints__(tx, evi)
        # feature vector specifies the support of
        # introns, start/stop codons for a transcript
        # self.feature_vector[0] : (supported introns by evidence of tx) / (number of introns in tx)
        # self.feature_vector[1] : (supported start/stop codons by evidence of tx) / 2
        # self.feature_vector[2] : log of the weighted multiplicities of intron evidence for tx
        # self.feature_vector[3] : log of the weighted multiplicities of start/stop codon evidence for tx
        self.feature_vector = self.create_feature_vec()

    def __init_hints__(self, tx, evi):
        """
            Collect hints from evi that support tx.

            Args:
                tx (Transcript): Transcript class object containing a transcript.
                evi (Evidence): Evidence class object containing all extrinsic evidence.
        """
        for feature_type in ['intron', 'start_codon', 'stop_codon']:
            for line in tx.transcript_lines[feature_type]:
                # columns: 0 = chromosome, 3 = start, 4 = end, 2 = type, 6 = strand
                hint = evi.get_hint(line[0], line[3], line[4], line[2], \
                    line[6])
                # an empty result means there is no evidence for this line
                if hint:
                    self.evi_list[feature_type].append(hint)
        self.numb_introns = len(tx.transcript_lines['intron'])

    def create_feature_vec(self):
        """
            Compute all features.

            Returns:
                (list(float)): List of the four feature scores.
        """
        return [self.relative_support(['intron'], self.numb_introns), \
            self.relative_support(['start_codon', 'stop_codon'], 2.0),
            self.absolute_support(['intron']), \
            self.absolute_support(['start_codon', 'stop_codon'])]

    def _weighted_multiplicity(self, gene_feature_types):
        """
            Sum weight * multiplicity over all sources of all collected
            hints of the given feature types.

            Args:
                gene_feature_types (list(str)): Keys of self.evi_list.

            Returns:
                (float): Weighted sum, 0.0 if there are no hints.

            Raises:
                KeyError: If a hint source has no weight in self.sw.
        """
        score = 0.0
        for feature_type in gene_feature_types:
            for hint in self.evi_list[feature_type]:
                for src, mult in hint.items():
                    score += self.sw[src] * mult
        return score

    def relative_support(self, gene_feature_types, abs_numb):
        """
            Compute relative support of introns or start/stop-codons.

            Args:
                gene_feature_types (list(str)): Either introns or start/stop-codons
                abs_numb (int): absolute number of gene_feature_type in tx
                                (e.g. number of introns in tx)

            Returns:
                (float): Number of supporting hints divided by abs_numb,
                    or 1 if abs_numb is not positive.
        """
        if abs_numb > 0:
            hint_numb = 0
            for feature_type in gene_feature_types:
                hint_numb += len(self.evi_list[feature_type])
            return hint_numb / abs_numb
        return 1

    def absolute_support(self, gene_feature_types):
        """
            Compute absolute support of introns or start/stop-codons.

            Args:
                gene_feature_types (list(str)): Either introns or start/stop-codons

            Returns:
                (float): Logarithm of the summed multiplicity*weight of the
                    supporting hints for gene_feature_types (plus self.epsi).
        """
        return np.log(self._weighted_multiplicity(gene_feature_types) + self.epsi)

    # currently not used
    def mean_support(self, gene_feature_types, abs_numb):
        """
            Compute mean support of introns or start/stop-codons.

            Args:
                gene_feature_types (list(str)): Either introns or start/stop-codons
                abs_numb (int): absolute number of gene_feature_type in tx

            Returns:
                (float): Logarithm of the summed multiplicity*weight of the
                    supporting hints divided by abs_numb (plus self.epsi),
                    or log(self.epsi) if abs_numb is not positive.
        """
        if abs_numb > 0:
            score = self._weighted_multiplicity(gene_feature_types)
            return np.log((score / abs_numb)+self.epsi)
        else:
            return np.log(self.epsi)

    # currently not used
    def min_support(self, gene_feature_types, abs_numb):
        """
            Compute minimal support of introns or start/stop-codons.

            Args:
                gene_feature_types (list(str)): Either introns or start/stop-codons
                abs_numb (int): absolute number of gene_feature_type in tx

            Returns:
                (float): Logarithm of the smallest multiplicity*weight of a
                    single supporting hint (plus self.epsi). log(self.epsi)
                    if any feature type has fewer than abs_numb hints or
                    abs_numb is not positive.
        """
        for feature_type in gene_feature_types:
            if len(self.evi_list[feature_type]) < abs_numb:
                return np.log(self.epsi)
        if abs_numb > 0:
            # start above any expected score so the first hint replaces it
            score = 10000000000000000000.0
            for feature_type in gene_feature_types:
                for hint in self.evi_list[feature_type]:
                    new_score = 0
                    for src, mult in hint.items():
                        new_score += self.sw[src] * mult
                    score = np.minimum(score, new_score)
            return np.log(score+self.epsi)
        else:
            return np.log(self.epsi)

    def get_features(self):
        """
            Returns:
                (list(float)): List of feature scores.
        """
        return self.feature_vector
gene_id "7789_g"; transcript_id "7789_t"; evidence "1_0"; cds_type "Initial"; count "1_2"; 6 | 3L GeneMark.hmm start_codon 18463066 18463068 . - 0 gene_id "7789_g"; transcript_id "7789_t"; count "1_1"; 7 | 3R AUGUSTUS start_codon 7686444 7686446 . + 0 transcript_id "g5980.t1"; 8 | 3R AUGUSTUS CDS 7686444 7686623 1 + 0 transcript_id "g5980.t1"; gene_id "g5980"; 9 | 3R AUGUSTUS exon 7686444 7686623 . + . transcript_id "g5980.t1"; gene_id "g5980"; 10 | 3R AUGUSTUS intron 7686624 7690691 1 + . transcript_id "g5980.t1"; gene_id "g5980"; 11 | 3R AUGUSTUS CDS 7690692 7690843 1 + 0 transcript_id "g5980.t1"; gene_id "g5980"; 12 | 3R AUGUSTUS exon 7690692 7690843 . + . transcript_id "g5980.t1"; gene_id "g5980"; 13 | 3R AUGUSTUS intron 7690844 7691514 1 + . transcript_id "g5980.t1"; gene_id "g5980"; 14 | 3R AUGUSTUS CDS 7691515 7691630 1 + 1 transcript_id "g5980.t1"; gene_id "g5980"; 15 | 3R AUGUSTUS exon 7691515 7691630 . + . transcript_id "g5980.t1"; gene_id "g5980"; 16 | 3R AUGUSTUS intron 7691631 7691712 1 + . transcript_id "g5980.t1"; gene_id "g5980"; 17 | 3R AUGUSTUS CDS 7691713 7693700 1 + 2 transcript_id "g5980.t1"; gene_id "g5980"; 18 | 3R AUGUSTUS gene 7686444 7693700 1 + . g5980 19 | 3R AUGUSTUS transcript 7686444 7693700 1 + . g5980.t1 20 | 3R AUGUSTUS exon 7691713 7693700 . + . transcript_id "g5980.t1"; gene_id "g5980"; 21 | 3R AUGUSTUS stop_codon 7693698 7693700 . + 0 transcript_id "g5980.t1"; gene_id "g5980"; 22 | X AUGUSTUS stop_codon 2065454 2065456 . - 0 transcript_id "g12130.t1"; gene_id "g12130"; 23 | X AUGUSTUS CDS 2065454 2065891 0.75 - 0 transcript_id "g12130.t1"; gene_id "g12130"; 24 | X AUGUSTUS exon 2065454 2065891 . - . transcript_id "g12130.t1"; gene_id "g12130"; 25 | X AUGUSTUS intron 2065892 2065944 0.98 - . transcript_id "g12130.t1"; gene_id "g12130"; 26 | X AUGUSTUS CDS 2065945 2066088 0.93 - 0 transcript_id "g12130.t1"; gene_id "g12130"; 27 | X AUGUSTUS exon 2065945 2066088 . - . 
transcript_id "g12130.t1"; gene_id "g12130"; 28 | X AUGUSTUS intron 2066089 2066148 0.92 - . transcript_id "g12130.t1"; gene_id "g12130"; 29 | X AUGUSTUS CDS 2066149 2066238 0.92 - 0 transcript_id "g12130.t1"; gene_id "g12130"; 30 | X AUGUSTUS gene 2065454 2066238 0.7 - . g12130 31 | X AUGUSTUS transcript 2065454 2066238 0.7 - . g12130.t1 32 | X AUGUSTUS exon 2066149 2066238 . - . transcript_id "g12130.t1"; gene_id "g12130"; 33 | X AUGUSTUS start_codon 2066236 2066238 . - 0 transcript_id "g12130.t1"; gene_id "g12130"; 34 | 2R AUGUSTUS stop_codon 16433896 16433898 . - 0 transcript_id "g10583.t1"; gene_id "g10583"; 35 | 2R AUGUSTUS CDS 16433896 16435797 1 - 0 transcript_id "g10583.t1"; gene_id "g10583"; 36 | 2R AUGUSTUS exon 16433896 16435797 . - . transcript_id "g10583.t1"; gene_id "g10583"; 37 | 2R AUGUSTUS start_codon 16435795 16435797 . - 0 transcript_id "g10583.t1"; gene_id "g10583"; 38 | 2R AUGUSTUS gene 16433896 16435797 1 - . g10583 39 | 2R AUGUSTUS transcript 16433896 16435797 1 - . g10583.t1 40 | 2R AUGUSTUS stop_codon 24640803 24640805 . - 0 transcript_id "g11793.t1"; gene_id "g11793"; 41 | 2R AUGUSTUS CDS 24640803 24642212 1 - 0 transcript_id "g11793.t1"; gene_id "g11793"; 42 | 2R AUGUSTUS exon 24640803 24642212 . - . transcript_id "g11793.t1"; gene_id "g11793"; 43 | 2R AUGUSTUS start_codon 24642210 24642212 . - 0 transcript_id "g11793.t1"; gene_id "g11793"; 44 | 2R AUGUSTUS gene 24640803 24642212 1 - . g11793 45 | 2R AUGUSTUS transcript 24640803 24642212 1 - . g11793.t1 46 | 2L AUGUSTUS stop_codon 11989063 11989065 . - 0 transcript_id "g1539.t1"; gene_id "g1539"; 47 | 2L AUGUSTUS CDS 11989063 11989803 0.73 - 0 transcript_id "g1539.t1"; gene_id "g1539"; 48 | 2L AUGUSTUS exon 11989063 11989803 . - . transcript_id "g1539.t1"; gene_id "g1539"; 49 | 2L AUGUSTUS start_codon 11989801 11989803 . - 0 transcript_id "g1539.t1"; gene_id "g1539"; 50 | 2L AUGUSTUS gene 11989063 11989803 0.73 - . g1539 51 | 2L AUGUSTUS transcript 11989063 11989803 0.73 - . 
g1539.t1 52 | 2L AUGUSTUS start_codon 4686242 4686244 . + 0 transcript_id "g562.t1"; gene_id "g562"; 53 | 2L AUGUSTUS CDS 4686242 4687105 1 + 0 transcript_id "g562.t1"; gene_id "g562"; 54 | 2L AUGUSTUS exon 4686242 4687105 . + . transcript_id "g562.t1"; gene_id "g562"; 55 | 2L AUGUSTUS stop_codon 4687103 4687105 . + 0 transcript_id "g562.t1"; gene_id "g562"; 56 | 2L AUGUSTUS gene 4686242 4687105 1 + . g562 57 | 2L AUGUSTUS transcript 4686242 4687105 1 + . g562.t1 58 | 3L AUGUSTUS stop_codon 11362605 11362607 . - 0 transcript_id "g3988.t1"; gene_id "g3988"; 59 | 3L AUGUSTUS CDS 11362605 11363086 1 - 2 transcript_id "g3988.t1"; gene_id "g3988"; 60 | 3L AUGUSTUS exon 11362605 11363086 . - . transcript_id "g3988.t1"; gene_id "g3988"; 61 | 3L AUGUSTUS intron 11363087 11363276 1 - . transcript_id "g3988.t1"; gene_id "g3988"; 62 | 3L AUGUSTUS CDS 11363277 11363918 1 - 2 transcript_id "g3988.t1"; gene_id "g3988"; 63 | 3L AUGUSTUS exon 11363277 11363918 . - . transcript_id "g3988.t1"; gene_id "g3988"; 64 | 3L AUGUSTUS intron 11363919 11364608 1 - . transcript_id "g3988.t1"; gene_id "g3988"; 65 | 3L AUGUSTUS CDS 11364609 11364771 1 - 0 transcript_id "g3988.t1"; gene_id "g3988"; 66 | 3L AUGUSTUS gene 11362605 11364771 1 - . g3988 67 | 3L AUGUSTUS transcript 11362605 11364771 1 - . g3988.t1 68 | 3L AUGUSTUS exon 11364609 11364771 . - . transcript_id "g3988.t1"; gene_id "g3988"; 69 | 3L AUGUSTUS start_codon 11364769 11364771 . - 0 transcript_id "g3988.t1"; gene_id "g3988"; 70 | 3R AUGUSTUS start_codon 12691822 12691824 . + 0 transcript_id "g6660.t1"; gene_id "g6660"; 71 | 3R AUGUSTUS CDS 12691822 12691869 1 + 0 transcript_id "g6660.t1"; gene_id "g6660"; 72 | 3R AUGUSTUS exon 12691822 12691869 . + . transcript_id "g6660.t1"; gene_id "g6660"; 73 | 3R AUGUSTUS intron 12691870 12692642 1 + . transcript_id "g6660.t1"; gene_id "g6660"; 74 | 3R AUGUSTUS CDS 12692643 12692707 1 + 0 transcript_id "g6660.t1"; gene_id "g6660"; 75 | 3R AUGUSTUS exon 12692643 12692707 . + . 
transcript_id "g6660.t1"; gene_id "g6660"; 76 | 3R AUGUSTUS intron 12692708 12692769 1 + . transcript_id "g6660.t1"; gene_id "g6660"; 77 | 3R AUGUSTUS CDS 12692770 12692944 1 + 1 transcript_id "g6660.t1"; gene_id "g6660"; 78 | 3R AUGUSTUS exon 12692770 12692944 . + . transcript_id "g6660.t1"; gene_id "g6660"; 79 | 3R AUGUSTUS intron 12692945 12693003 1 + . transcript_id "g6660.t1"; gene_id "g6660"; 80 | 3R AUGUSTUS CDS 12693004 12693155 1 + 0 transcript_id "g6660.t1"; gene_id "g6660"; 81 | 3R AUGUSTUS exon 12693004 12693155 . + . transcript_id "g6660.t1"; gene_id "g6660"; 82 | 3R AUGUSTUS intron 12693156 12693214 1 + . transcript_id "g6660.t1"; gene_id "g6660"; 83 | 3R AUGUSTUS CDS 12693215 12693761 1 + 1 transcript_id "g6660.t1"; gene_id "g6660"; 84 | 3R AUGUSTUS exon 12693215 12693761 . + . transcript_id "g6660.t1"; gene_id "g6660"; 85 | 3R AUGUSTUS intron 12693762 12693829 1 + . transcript_id "g6660.t1"; gene_id "g6660"; 86 | 3R AUGUSTUS CDS 12693830 12693973 1 + 0 transcript_id "g6660.t1"; gene_id "g6660"; 87 | 3R AUGUSTUS gene 12691822 12693973 1 + . g6660 88 | 3R AUGUSTUS transcript 12691822 12693973 1 + . g6660.t1 89 | 3R AUGUSTUS exon 12693830 12693973 . + . transcript_id "g6660.t1"; gene_id "g6660"; 90 | 3R AUGUSTUS stop_codon 12693971 12693973 . + 0 transcript_id "g6660.t1"; gene_id "g6660"; 91 | 2R AUGUSTUS stop_codon 20354214 20354216 . - 0 transcript_id "g11080.t1"; 92 | 2R AUGUSTUS CDS 20354214 20355053 1 - 0 transcript_id "g11080.t1"; 93 | 2R AUGUSTUS exon 20354214 20355053 . - . transcript_id "g11080.t1"; 94 | 2R AUGUSTUS start_codon 20355051 20355053 . - 0 transcript_id "g11080.t1"; 95 | 2R AUGUSTUS gene 20354214 20355053 1 - . g11080 96 | 2R AUGUSTUS transcript 20354214 20355053 1 - . g11080.t1 -------------------------------------------------------------------------------- /tests/genome_anno/format_error.gtf: -------------------------------------------------------------------------------- 1 | 3L GeneMark.hmm stop_codon 18462228 18462230 . 
- 0 gene_id "7789_g"; transcript_id "7789_t"; count "1_1"; 2 | 3L GeneMark.hmm CDS 18462228 18462540 . - 1 gene_id "7789_g"; 3 | 3L GeneMark.hmm exon 18462228 18462540 0 - . gene_id "7789_g"; transcript_id "7789_t"; evidence "0_1"; cds_type "Terminal"; count "2_2"; 4 | 3L GeneMark.hmm CDS 18462719 18463068 . - 0 gene_id "7789_g"; transcript_id "7789_t"; evidence "1_0"; cds_type "Initial"; count "1_2"; 5 | 3L GeneMark.hmm exon 18462719 18463068 0 - . gene_id "7789_g"; transcript_id "7789_t"; evidence "1_0"; cds_type "Initial"; count "1_2"; 6 | 3L GeneMark.hmm start_codon 18463066 18463068 . - 0 gene_id "7789_g"; transcript_id "7789_t"; count "1_1"; 7 | 3R AUGUSTUS start_codon 7686444 7686446 . + 0 transcript_id "g5980.t1"; gene_id "g5980"; 8 | 3R AUGUSTUS CDS 7686444 7686623 1 + 0 transcript_id "g5980.t1"; gene_id "g5980"; 9 | 3R AUGUSTUS exon 7686444 7686623 . + . transcript_id "g5980.t1"; gene_id "g5980"; 10 | 3R AUGUSTUS intron 7686624 7690691 1 + . transcript_id "g5980.t1"; gene_id "g5980"; 11 | 3R AUGUSTUS CDS 7690692 7690843 1 + 0 transcript_id "g5980.t1"; gene_id "g5980"; 12 | 3R AUGUSTUS exon 7690692 7690843 . + . transcript_id "g5980.t1"; gene_id "g5980"; 13 | 3R AUGUSTUS intron 7690844 7691514 1 + . transcript_id "g5980.t1"; gene_id "g5980"; 14 | 3R AUGUSTUS CDS 7691515 7691630 1 + 1 transcript_id "g5980.t1"; gene_id "g5980"; 15 | 3R AUGUSTUS exon 7691515 7691630 . + . transcript_id "g5980.t1"; gene_id "g5980"; 16 | 3R AUGUSTUS intron 7691631 7691712 1 + . transcript_id "g5980.t1"; gene_id "g5980"; 17 | 3R AUGUSTUS CDS 7691713 7693700 1 + 2 transcript_id "g5980.t1"; gene_id "g5980"; 18 | 3R AUGUSTUS gene 7686444 7693700 1 + . g5980 19 | 3R AUGUSTUS transcript 7686444 7693700 1 + . g5980.t1 20 | 3R AUGUSTUS exon 7691713 7693700 . + . transcript_id "g5980.t1"; gene_id "g5980"; 21 | 3R AUGUSTUS stop_codon 7693698 7693700 . + 0 transcript_id "g5980.t1"; gene_id "g5980"; 22 | X AUGUSTUS stop_codon 2065454 2065456 . 
- 0 transcript_id "g12130.t1"; gene_id "g12130"; 23 | X AUGUSTUS CDS 2065454 2065891 0.75 - 0 transcript_id "g12130.t1"; gene_id "g12130"; 24 | X AUGUSTUS exon 2065454 2065891 . - . transcript_id "g12130.t1"; gene_id "g12130"; 25 | X AUGUSTUS intron 2065892 2065944 0.98 - . transcript_id "g12130.t1"; gene_id "g12130"; 26 | X AUGUSTUS CDS 2065945 2066088 0.93 - 0 transcript_id "g12130.t1"; gene_id "g12130"; 27 | X AUGUSTUS exon 2065945 2066088 . - . transcript_id "g12130.t1"; gene_id "g12130"; 28 | X AUGUSTUS intron 2066089 2066148 0.92 - . transcript_id "g12130.t1"; gene_id "g12130"; 29 | X AUGUSTUS CDS 2066149 2066238 0.92 - 0 transcript_id "g12130.t1"; gene_id "g12130"; 30 | X AUGUSTUS gene 2065454 2066238 0.7 - . g12130 31 | X AUGUSTUS transcript 2065454 2066238 0.7 - . g12130.t1 32 | X AUGUSTUS exon 2066149 2066238 . - . transcript_id "g12130.t1"; gene_id "g12130"; 33 | X AUGUSTUS start_codon 2066236 2066238 . - 0 transcript_id "g12130.t1"; gene_id "g12130"; 34 | 2R AUGUSTUS stop_codon 16433896 16433898 . - 0 transcript_id "g10583.t1"; gene_id "g10583"; 35 | 2R AUGUSTUS CDS 16433896 16435797 1 - 0 transcript_id "g10583.t1"; gene_id "g10583"; 36 | 2R AUGUSTUS exon 16433896 16435797 . - . transcript_id "g10583.t1"; gene_id "g10583"; 37 | 2R AUGUSTUS start_codon 16435795 16435797 . - 0 transcript_id "g10583.t1"; gene_id "g10583"; 38 | 2R AUGUSTUS gene 16433896 16435797 1 - . g10583 39 | 2R AUGUSTUS transcript 16433896 16435797 1 - . g10583.t1 40 | 2R AUGUSTUS stop_codon 24640803 24640805 . - 0 transcript_id "g11793.t1"; gene_id "g11793"; 41 | 2R AUGUSTUS CDS 24640803 24642212 1 - 0 transcript_id "g11793.t1"; gene_id "g11793"; 42 | 2R AUGUSTUS exon 24640803 24642212 . - . transcript_id "g11793.t1"; gene_id "g11793"; 43 | 2R AUGUSTUS start_codon 24642210 24642212 . - 0 transcript_id "g11793.t1"; gene_id "g11793"; 44 | 2R AUGUSTUS gene 24640803 24642212 1 - . g11793 45 | 2R AUGUSTUS transcript 24640803 24642212 1 - . 
g11793.t1 46 | 2L AUGUSTUS stop_codon 11989063 11989065 . - 0 transcript_id "g1539.t1"; gene_id "g1539"; 47 | 2L AUGUSTUS CDS 11989063 11989803 0.73 - 0 transcript_id "g1539.t1"; gene_id "g1539"; 48 | 2L AUGUSTUS exon 11989063 11989803 . - . transcript_id "g1539.t1"; gene_id "g1539"; 49 | 2L AUGUSTUS start_codon 11989801 11989803 . - 0 transcript_id "g1539.t1"; gene_id "g1539"; 50 | 2L AUGUSTUS gene 11989063 11989803 0.73 - . g1539 51 | 2L AUGUSTUS transcript 11989063 11989803 0.73 - . g1539.t1 52 | 2L AUGUSTUS start_codon 4686242 4686244 . + 0 transcript_id "g562.t1"; gene_id "g562"; 53 | 2L AUGUSTUS CDS 4686242 4687105 1 + 0 transcript_id "g562.t1"; gene_id "g562"; 54 | 2L AUGUSTUS exon 4686242 4687105 . + . transcript_id "g562.t1"; gene_id "g562"; 55 | 2L AUGUSTUS stop_codon 4687103 4687105 . + 0 transcript_id "g562.t1"; gene_id "g562"; 56 | 2L AUGUSTUS gene 4686242 4687105 1 + . g562 57 | 2L AUGUSTUS transcript 4686242 4687105 1 + . g562.t1 58 | 3L AUGUSTUS stop_codon 11362605 11362607 . - 0 transcript_id "g3988.t1"; gene_id "g3988"; 59 | 3L AUGUSTUS CDS 11362605 11363086 1 - 2 transcript_id "g3988.t1"; gene_id "g3988"; 60 | 3L AUGUSTUS exon 11362605 11363086 . - . transcript_id "g3988.t1"; gene_id "g3988"; 61 | 3L AUGUSTUS intron 11363087 11363276 1 - . transcript_id "g3988.t1"; gene_id "g3988"; 62 | 3L AUGUSTUS CDS 11363277 11363918 1 - 2 transcript_id "g3988.t1"; gene_id "g3988"; 63 | 3L AUGUSTUS exon 11363277 11363918 . - . transcript_id "g3988.t1"; gene_id "g3988"; 64 | 3L AUGUSTUS intron 11363919 11364608 1 - . transcript_id "g3988.t1"; gene_id "g3988"; 65 | 3L AUGUSTUS CDS 11364609 11364771 1 - 0 transcript_id "g3988.t1"; gene_id "g3988"; 66 | 3L AUGUSTUS gene 11362605 11364771 1 - . g3988 67 | 3L AUGUSTUS transcript 11362605 11364771 1 - . g3988.t1 68 | 3L AUGUSTUS exon 11364609 11364771 . - . transcript_id "g3988.t1"; gene_id "g3988"; 69 | 3L AUGUSTUS start_codon 11364769 11364771 . 
- 0 transcript_id "g3988.t1"; gene_id "g3988"; 70 | 3R AUGUSTUS start_codon 12691822 12691824 . + 0 transcript_id "g6660.t1"; gene_id "g6660"; 71 | 3R AUGUSTUS CDS 12691822 12691869 1 + 0 transcript_id "g6660.t1"; gene_id "g6660"; 72 | 3R AUGUSTUS exon 12691822 12691869 . + . transcript_id "g6660.t1"; gene_id "g6660"; 73 | 3R AUGUSTUS intron 12691870 12692642 1 + . transcript_id "g6660.t1"; gene_id "g6660"; 74 | 3R AUGUSTUS CDS 12692643 12692707 1 + 0 transcript_id "g6660.t1"; gene_id "g6660"; 75 | 3R AUGUSTUS exon 12692643 12692707 . + . transcript_id "g6660.t1"; gene_id "g6660"; 76 | 3R AUGUSTUS intron 12692708 12692769 1 + . transcript_id "g6660.t1"; gene_id "g6660"; 77 | 3R AUGUSTUS CDS 12692770 12692944 1 + 1 transcript_id "g6660.t1"; gene_id "g6660"; 78 | 3R AUGUSTUS exon 12692770 12692944 . + . transcript_id "g6660.t1"; gene_id "g6660"; 79 | 3R AUGUSTUS intron 12692945 12693003 1 + . transcript_id "g6660.t1"; gene_id "g6660"; 80 | 3R AUGUSTUS CDS 12693004 12693155 1 + 0 transcript_id "g6660.t1"; gene_id "g6660"; 81 | 3R AUGUSTUS exon 12693004 12693155 . + . transcript_id "g6660.t1"; gene_id "g6660"; 82 | 3R AUGUSTUS intron 12693156 12693214 1 + . transcript_id "g6660.t1"; gene_id "g6660"; 83 | 3R AUGUSTUS CDS 12693215 12693761 1 + 1 transcript_id "g6660.t1"; gene_id "g6660"; 84 | 3R AUGUSTUS exon 12693215 12693761 . + . transcript_id "g6660.t1"; gene_id "g6660"; 85 | 3R AUGUSTUS intron 12693762 12693829 1 + . transcript_id "g6660.t1"; gene_id "g6660"; 86 | 3R AUGUSTUS CDS 12693830 12693973 1 + 0 transcript_id "g6660.t1"; gene_id "g6660"; 87 | 3R AUGUSTUS gene 12691822 12693973 1 + . g6660 88 | 3R AUGUSTUS transcript 12691822 12693973 1 + . g6660.t1 89 | 3R AUGUSTUS exon 12693830 12693973 . + . transcript_id "g6660.t1"; gene_id "g6660"; 90 | 3R AUGUSTUS stop_codon 12693971 12693973 . + 0 transcript_id "g6660.t1"; gene_id "g6660"; 91 | 2R AUGUSTUS stop_codon 20354214 20354216 . 
- 0 transcript_id "g11080.t1"; gene_id "g11080"; 92 | 2R AUGUSTUS CDS 20354214 20355053 1 - 0 transcript_id "g11080.t1"; gene_id "g11080"; 93 | 2R AUGUSTUS exon 20354214 20355053 . - . transcript_id "g11080.t1"; gene_id "g11080"; 94 | 2R AUGUSTUS start_codon 20355051 20355053 . - 0 transcript_id "g11080.t1"; gene_id "g11080"; 95 | 2R AUGUSTUS gene 20354214 20355053 1 - . g11080 96 | 2R AUGUSTUS transcript 20354214 20355053 1 - . g11080.t1 -------------------------------------------------------------------------------- /tests/genome_anno/anno1.gtf: -------------------------------------------------------------------------------- 1 | 3L GeneMark.hmm stop_codon 18462228 18462230 . - 0 gene_id "7789_g"; transcript_id "7789_t"; count "1_1"; 2 | 3L GeneMark.hmm CDS 18462228 18462540 . - 1 gene_id "7789_g"; transcript_id "7789_t"; evidence "0_1"; cds_type "Terminal"; count "2_2"; 3 | 3L GeneMark.hmm exon 18462228 18462540 0 - . gene_id "7789_g"; transcript_id "7789_t"; evidence "0_1"; cds_type "Terminal"; count "2_2"; 4 | 3L GeneMark.hmm CDS 18462719 18463068 . - 0 gene_id "7789_g"; transcript_id "7789_t"; evidence "1_0"; cds_type "Initial"; count "1_2"; 5 | 3L GeneMark.hmm exon 18462719 18463068 0 - . gene_id "7789_g"; transcript_id "7789_t"; evidence "1_0"; cds_type "Initial"; count "1_2"; 6 | 3L GeneMark.hmm start_codon 18463066 18463068 . - 0 gene_id "7789_g"; transcript_id "7789_t"; count "1_1"; 7 | 3R AUGUSTUS start_codon 7686444 7686446 . + 0 transcript_id "g5980.t1"; gene_id "g5980"; 8 | 3R AUGUSTUS CDS 7686444 7686623 1 + 0 transcript_id "g5980.t1"; gene_id "g5980"; 9 | 3R AUGUSTUS exon 7686444 7686623 . + . transcript_id "g5980.t1"; gene_id "g5980"; 10 | 3R AUGUSTUS intron 7686624 7690691 1 + . transcript_id "g5980.t1"; gene_id "g5980"; 11 | 3R AUGUSTUS CDS 7690692 7690843 1 + 0 transcript_id "g5980.t1"; gene_id "g5980"; 12 | 3R AUGUSTUS exon 7690692 7690843 . + . transcript_id "g5980.t1"; gene_id "g5980"; 13 | 3R AUGUSTUS intron 7690844 7691514 1 + . 
transcript_id "g5980.t1"; gene_id "g5980"; 14 | 3R AUGUSTUS CDS 7691515 7691630 1 + 1 transcript_id "g5980.t1"; gene_id "g5980"; 15 | 3R AUGUSTUS exon 7691515 7691630 . + . transcript_id "g5980.t1"; gene_id "g5980"; 16 | 3R AUGUSTUS intron 7691631 7691712 1 + . transcript_id "g5980.t1"; gene_id "g5980"; 17 | 3R AUGUSTUS CDS 7691713 7693700 1 + 2 transcript_id "g5980.t1"; gene_id "g5980"; 18 | 3R AUGUSTUS gene 7686444 7693700 1 + . g5980 19 | 3R AUGUSTUS transcript 7686444 7693700 1 + . g5980.t1 20 | 3R AUGUSTUS exon 7691713 7693700 . + . transcript_id "g5980.t1"; gene_id "g5980"; 21 | 3R AUGUSTUS stop_codon 7693698 7693700 . + 0 transcript_id "g5980.t1"; gene_id "g5980"; 22 | X AUGUSTUS stop_codon 2065454 2065456 . - 0 transcript_id "g12130.t1"; gene_id "g12130"; 23 | X AUGUSTUS CDS 2065454 2065891 0.75 - 0 transcript_id "g12130.t1"; gene_id "g12130"; 24 | X AUGUSTUS exon 2065454 2065891 . - . transcript_id "g12130.t1"; gene_id "g12130"; 25 | X AUGUSTUS intron 2065892 2065944 0.98 - . transcript_id "g12130.t1"; gene_id "g12130"; 26 | X AUGUSTUS CDS 2065945 2066088 0.93 - 0 transcript_id "g12130.t1"; gene_id "g12130"; 27 | X AUGUSTUS exon 2065945 2066088 . - . transcript_id "g12130.t1"; gene_id "g12130"; 28 | X AUGUSTUS intron 2066089 2066148 0.92 - . transcript_id "g12130.t1"; gene_id "g12130"; 29 | X AUGUSTUS CDS 2066149 2066238 0.92 - 0 transcript_id "g12130.t1"; gene_id "g12130"; 30 | X AUGUSTUS gene 2065454 2066238 0.7 - . g12130 31 | X AUGUSTUS transcript 2065454 2066238 0.7 - . g12130.t1 32 | X AUGUSTUS exon 2066149 2066238 . - . transcript_id "g12130.t1"; gene_id "g12130"; 33 | X AUGUSTUS start_codon 2066236 2066238 . - 0 transcript_id "g12130.t1"; gene_id "g12130"; 34 | 2R AUGUSTUS stop_codon 16433896 16433898 . - 0 transcript_id "g10583.t1"; gene_id "g10583"; 35 | 2R AUGUSTUS CDS 16433896 16435797 1 - 0 transcript_id "g10583.t1"; gene_id "g10583"; 36 | 2R AUGUSTUS exon 16433896 16435797 . - . 
transcript_id "g10583.t1"; gene_id "g10583"; 37 | 2R AUGUSTUS start_codon 16435795 16435797 . - 0 transcript_id "g10583.t1"; gene_id "g10583"; 38 | 2R AUGUSTUS gene 16433896 16435797 1 - . g10583 39 | 2R AUGUSTUS transcript 16433896 16435797 1 - . g10583.t1 40 | 2R AUGUSTUS stop_codon 24640803 24640805 . - 0 transcript_id "g11793.t1"; gene_id "g11793"; 41 | 2R AUGUSTUS CDS 24640803 24642212 1 - 0 transcript_id "g11793.t1"; gene_id "g11793"; 42 | 2R AUGUSTUS exon 24640803 24642212 . - . transcript_id "g11793.t1"; gene_id "g11793"; 43 | 2R AUGUSTUS start_codon 24642210 24642212 . - 0 transcript_id "g11793.t1"; gene_id "g11793"; 44 | 2R AUGUSTUS gene 24640803 24642212 1 - . g11793 45 | 2R AUGUSTUS transcript 24640803 24642212 1 - . g11793.t1 46 | 2L AUGUSTUS stop_codon 11989063 11989065 . - 0 transcript_id "g1539.t1"; gene_id "g1539"; 47 | 2L AUGUSTUS CDS 11989063 11989803 0.73 - 0 transcript_id "g1539.t1"; gene_id "g1539"; 48 | 2L AUGUSTUS exon 11989063 11989803 . - . transcript_id "g1539.t1"; gene_id "g1539"; 49 | 2L AUGUSTUS start_codon 11989801 11989803 . - 0 transcript_id "g1539.t1"; gene_id "g1539"; 50 | 2L AUGUSTUS gene 11989063 11989803 0.73 - . g1539 51 | 2L AUGUSTUS transcript 11989063 11989803 0.73 - . g1539.t1 52 | 2L AUGUSTUS start_codon 4686242 4686244 . + 0 transcript_id "g562.t1"; gene_id "g562"; 53 | 2L AUGUSTUS CDS 4686242 4687105 1 + 0 transcript_id "g562.t1"; gene_id "g562"; 54 | 2L AUGUSTUS exon 4686242 4687105 . + . transcript_id "g562.t1"; gene_id "g562"; 55 | 2L AUGUSTUS stop_codon 4687103 4687105 . + 0 transcript_id "g562.t1"; gene_id "g562"; 56 | 2L AUGUSTUS gene 4686242 4687105 1 + . g562 57 | 2L AUGUSTUS transcript 4686242 4687105 1 + . g562.t1 58 | 3L AUGUSTUS stop_codon 11362605 11362607 . - 0 transcript_id "g3988.t1"; gene_id "g3988"; 59 | 3L AUGUSTUS CDS 11362605 11363086 1 - 2 transcript_id "g3988.t1"; gene_id "g3988"; 60 | 3L AUGUSTUS exon 11362605 11363086 . - . 
transcript_id "g3988.t1"; gene_id "g3988"; 61 | 3L AUGUSTUS intron 11363087 11363276 1 - . transcript_id "g3988.t1"; gene_id "g3988"; 62 | 3L AUGUSTUS CDS 11363277 11363918 1 - 2 transcript_id "g3988.t1"; gene_id "g3988"; 63 | 3L AUGUSTUS exon 11363277 11363918 . - . transcript_id "g3988.t1"; gene_id "g3988"; 64 | 3L AUGUSTUS intron 11363919 11364608 1 - . transcript_id "g3988.t1"; gene_id "g3988"; 65 | 3L AUGUSTUS CDS 11364609 11364771 1 - 0 transcript_id "g3988.t1"; gene_id "g3988"; 66 | 3L AUGUSTUS gene 11362605 11364771 1 - . g3988 67 | 3L AUGUSTUS transcript 11362605 11364771 1 - . g3988.t1 68 | 3L AUGUSTUS exon 11364609 11364771 . - . transcript_id "g3988.t1"; gene_id "g3988"; 69 | 3L AUGUSTUS start_codon 11364769 11364771 . - 0 transcript_id "g3988.t1"; gene_id "g3988"; 70 | 3R AUGUSTUS start_codon 12691822 12691824 . + 0 transcript_id "g6660.t1"; gene_id "g6660"; 71 | 3R AUGUSTUS CDS 12691822 12691869 1 + 0 transcript_id "g6660.t1"; gene_id "g6660"; 72 | 3R AUGUSTUS exon 12691822 12691869 . + . transcript_id "g6660.t1"; gene_id "g6660"; 73 | 3R AUGUSTUS intron 12691870 12692642 1 + . transcript_id "g6660.t1"; gene_id "g6660"; 74 | 3R AUGUSTUS CDS 12692643 12692707 1 + 0 transcript_id "g6660.t1"; gene_id "g6660"; 75 | 3R AUGUSTUS exon 12692643 12692707 . + . transcript_id "g6660.t1"; gene_id "g6660"; 76 | 3R AUGUSTUS intron 12692708 12692769 1 + . transcript_id "g6660.t1"; gene_id "g6660"; 77 | 3R AUGUSTUS CDS 12692770 12692944 1 + 1 transcript_id "g6660.t1"; gene_id "g6660"; 78 | 3R AUGUSTUS exon 12692770 12692944 . + . transcript_id "g6660.t1"; gene_id "g6660"; 79 | 3R AUGUSTUS intron 12692945 12693003 1 + . transcript_id "g6660.t1"; gene_id "g6660"; 80 | 3R AUGUSTUS CDS 12693004 12693155 1 + 0 transcript_id "g6660.t1"; gene_id "g6660"; 81 | 3R AUGUSTUS exon 12693004 12693155 . + . transcript_id "g6660.t1"; gene_id "g6660"; 82 | 3R AUGUSTUS intron 12693156 12693214 1 + . 
transcript_id "g6660.t1"; gene_id "g6660"; 83 | 3R AUGUSTUS CDS 12693215 12693761 1 + 1 transcript_id "g6660.t1"; gene_id "g6660"; 84 | 3R AUGUSTUS exon 12693215 12693761 . + . transcript_id "g6660.t1"; gene_id "g6660"; 85 | 3R AUGUSTUS intron 12693762 12693829 1 + . transcript_id "g6660.t1"; gene_id "g6660"; 86 | 3R AUGUSTUS CDS 12693830 12693973 1 + 0 transcript_id "g6660.t1"; gene_id "g6660"; 87 | 3R AUGUSTUS gene 12691822 12693973 1 + . g6660 88 | 3R AUGUSTUS transcript 12691822 12693973 1 + . g6660.t1 89 | 3R AUGUSTUS exon 12693830 12693973 . + . transcript_id "g6660.t1"; gene_id "g6660"; 90 | 3R AUGUSTUS stop_codon 12693971 12693973 . + 0 transcript_id "g6660.t1"; gene_id "g6660"; 91 | 2R AUGUSTUS stop_codon 20354214 20354216 . - 0 transcript_id "g11080.t1"; gene_id "g11080"; 92 | 2R AUGUSTUS CDS 20354214 20355053 1 - 0 transcript_id "g11080.t1"; gene_id "g11080"; 93 | 2R AUGUSTUS exon 20354214 20355053 . - . transcript_id "g11080.t1"; gene_id "g11080"; 94 | 2R AUGUSTUS start_codon 20355051 20355053 . - 0 transcript_id "g11080.t1"; gene_id "g11080"; 95 | 2R AUGUSTUS gene 20354214 20355053 1 - . g11080 96 | 2R AUGUSTUS transcript 20354214 20355053 1 - . 
#!/usr/bin/env python3
# ==============================================================
# author: Lars Gabriel
#
# TSEBRA: Transcript Selector for BRAKER
# ==============================================================
import argparse
import sys
import os
import csv


class ConfigFileError(Exception):
    """Error type for configuration file problems (not raised in this script)."""


class GeneSetMissing(Exception):
    """Raised by init() when neither a --gtf nor a --keep_gtf gene set is given."""


# Module level state shared between main() and init().
gtf = []
enforce_tx = []
anno = []
hintfiles = []
graph = None
out = ''
v = 0
quiet = False
parameter = {'intron_support': 0, 'stasto_support': 0,
             'e_1': 0, 'e_2': 0, 'e_3': 0, 'e_4': 0}
# Default configuration, located relative to this script.
cfg_file = os.path.dirname(os.path.realpath(__file__)) + '/../config/braker3.cfg'


def _cds_key(tx):
    """Return a hashable key built from the CDS coordinates of transcript tx."""
    return tuple('_'.join(map(str, coords))
                 for coords in tx.get_type_coords('CDS', False))


def main():
    """
        Split two gene sets into overlapping and not overlapping genes.

        Overview:

        1. Read the two gene predictions (--geneset1, --geneset2) from .gtf files.
        2. Build the overlap graph of all transcripts. No hint files are read,
            an empty Evidence object is used for the node features.
        3. Let the graph select the transcripts and set their new gene ids.
        4. For each resulting gene, check via the CDS coordinates of its
            transcripts in which of the two input gene sets they occur.
        5. Write four tab separated files: <out>_only_g1, <out>_only_g2,
            <out>_overlap_g1 and <out>_overlap_g2.
    """

    from genome_anno import Anno
    from overlap_graph import Graph
    from evidence import Evidence

    global anno, graph, parameter, v, quiet

    args = parseCmd()
    # init(args) is not used here: it expects options (--gtf, --keep_gtf, ...)
    # that parseCmd() of this script does not define. The verbosity flags are
    # therefore applied directly, otherwise -q and -v would have no effect.
    v = args.verbose or 0
    quiet = bool(args.quiet)
    set_parameter(cfg_file)

    gene_sets = [args.geneset1, args.geneset2]
    if v > 0:
        print(gene_sets)

    # read gene prediction files and collect one set of CDS keys per gene set
    tx_keys = []
    keep = []
    for c, g in enumerate(gene_sets, start=1):
        if not quiet:
            sys.stderr.write(f'### READING GENE PREDICTION: [{g}]\n')
        anno.append(Anno(g, f'anno{c}'))
        anno[-1].addGtf()
        anno[-1].norm_tx_format()
        keep.append(f'anno{c}')
        tx_keys.append({_cds_key(tx) for tx in anno[-1].transcripts.values()})

    # no hint files are read in this script, the evidence stays empty
    evi = Evidence()

    # create the overlap graph of all transcripts
    # (original note: two transcripts overlap if they share at least
    # 3 adjacent protein coding nucleotides)
    # both annotations are passed as keep_tx; its effect is defined in
    # overlap_graph.py
    graph = Graph(anno, para=parameter, keep_tx=keep, verbose=v)
    if not quiet:
        sys.stderr.write('### BUILD OVERLAP GRAPH\n')
    graph.build()

    graph.add_node_features(evi)
    # apply decision rule to exclude a set of transcripts
    if not quiet:
        sys.stderr.write('### SELECT TRANSCRIPTS\n')
    combined_prediction = graph.get_decided_graph()

    if v > 0:
        sys.stderr.write(str(combined_prediction.keys()) + '\n')
        for a in anno:
            sys.stderr.write('Numb_tx in {}: {}\n'.format(a.id, len(combined_prediction[a.id])))

    # collect the selected transcripts with their new gene ids
    if not quiet:
        sys.stderr.write('### WRITE COMBINED GENE PREDICTION\n')
    combined_anno = Anno('', 'combined_annotation')
    for a in anno:
        txs = a.get_subset([t[0] for t in combined_prediction[a.id]])
        for tx_id, new_gene_id in combined_prediction[a.id]:
            txs[tx_id].set_gene_id(new_gene_id)
        combined_anno.add_transcripts(txs, a.id + '.')
    combined_anno.find_genes()

    out_only_g1 = []
    out_only_g2 = []
    out_overlap_g1 = []
    out_overlap_g2 = []

    gene_gtf = sorted(combined_anno.gene_gtf.values(), key=lambda g: (g[0], g[3], g[4]))
    for gene in gene_gtf:
        # gtf lines of this gene for gene set 1 and gene set 2
        gtf_gene = [[], []]
        for tx_id in combined_anno.genes[gene[8]]:
            tx = combined_anno.transcripts[tx_id]
            key = _cds_key(tx)
            for i, keys in enumerate(tx_keys):
                if key in keys:
                    # the gene line is added only once per gene, not once
                    # for every matching transcript
                    if not gtf_gene[i]:
                        gtf_gene[i].append(gene)
                    gtf_gene[i] += tx.get_gtf()

        if gtf_gene[0] and gtf_gene[1]:
            out_overlap_g1 += gtf_gene[0]
            out_overlap_g2 += gtf_gene[1]
        elif gtf_gene[0]:
            out_only_g1 += gtf_gene[0]
        elif gtf_gene[1]:
            out_only_g2 += gtf_gene[1]

    # write the four result files
    results = [(f'{args.out}_only_g1', out_only_g1),
               (f'{args.out}_only_g2', out_only_g2),
               (f'{args.out}_overlap_g1', out_overlap_g1),
               (f'{args.out}_overlap_g2', out_overlap_g2)]
    for path, rows in results:
        with open(path, 'w') as file:
            out_writer = csv.writer(file, delimiter='\t', quotechar="|", lineterminator='\n')
            out_writer.writerows(rows)


def set_parameter(cfg_file):
    """
        Read parameters from the cfg file and store them in parameter.

        Each used line has the form '<name> <value>' separated by spaces,
        the value is stored as float. Empty lines and lines starting
        with '#' are skipped.

        Args:
            cfg_file (str): Path to configuration file.
    """
    global parameter
    with open(cfg_file, 'r') as file:
        cfg = csv.reader(file, delimiter=' ')
        for line in cfg:
            # drop empty fields, so that blank lines and repeated spaces
            # do not raise an IndexError
            fields = [field for field in line if field]
            if not fields or fields[0][0] == '#':
                continue
            parameter[fields[0]] = float(fields[1])


def init(args):
    """
        Set the module level options from parsed command line arguments.

        This function expects the options gtf, keep_gtf, hintfiles, cfg, out,
        verbose and quiet. Options that are missing in args are treated
        as not set.

        Args:
            args (argparse.Namespace): Parsed command line arguments.

        Raises:
            GeneSetMissing: If neither gtf nor keep_gtf is set.
    """
    global gtf, hintfiles, out, enforce_tx, v, quiet
    gtf_arg = getattr(args, 'gtf', None)
    keep_arg = getattr(args, 'keep_gtf', None)
    if gtf_arg:
        gtf = gtf_arg.split(',')
    if keep_arg:
        enforce_tx = keep_arg.split(',')
    if not keep_arg and not gtf_arg:
        raise GeneSetMissing('At least one gene set has to be provided '
            + 'either with --gtf or --keep_gtf!')
    hint_arg = getattr(args, 'hintfiles', None)
    if hint_arg:
        hintfiles = hint_arg.split(',')
    # fall back to the default configuration next to this script
    cfg_arg = getattr(args, 'cfg', None)
    if cfg_arg:
        set_parameter(cfg_arg)
    else:
        set_parameter(os.path.dirname(os.path.realpath(__file__)) + '/../config/braker3.cfg')
    if getattr(args, 'out', None):
        out = args.out
    if getattr(args, 'verbose', None):
        v = args.verbose
    if getattr(args, 'quiet', False):
        quiet = True


def parseCmd():
    """Parse command line arguments

    Returns:
        argparse.Namespace: Namespace with the attributes geneset1, geneset2,
            out, quiet and verbose.
    """
    parser = argparse.ArgumentParser(description='Input: Two gtf files; '
        + 'Output: 4 GTF files with overlapping/not overlapping genes.')
    parser.add_argument('-g1', '--geneset1', type=str, required=True,
        help='First gene set in gtf format.')
    parser.add_argument('-g2', '--geneset2', type=str, required=True,
        help='Second gene set in gtf format.')
    parser.add_argument('-o', '--out', type=str, required=True,
        help='Prefix of the output files (<out>_only_g1, <out>_only_g2, '
            + '<out>_overlap_g1, <out>_overlap_g2).')
    parser.add_argument('-q', '--quiet', action='store_true',
        help='Quiet mode.')
    parser.add_argument('-v', '--verbose', type=int, default=0,
        help='Verbosity level, values > 0 print additional information.')
    return parser.parse_args()


if __name__ == '__main__':
    main()
Artistic License 2.0 2 | 3 | Copyright (c) 2000-2006, The Perl Foundation. 4 | 5 | Everyone is permitted to copy and distribute verbatim copies of this license 6 | document, but changing it is not allowed. 7 | 8 | Preamble 9 | 10 | This license establishes the terms under which a given free software Package 11 | may be copied, modified, distributed, and/or redistributed. The intent is that 12 | the Copyright Holder maintains some artistic control over the development of 13 | that Package while still keeping the Package available as open source and free 14 | software. 15 | 16 | You are always permitted to make arrangements wholly outside of this license 17 | directly with the Copyright Holder of a given Package. If the terms of this 18 | license do not permit the full use that you propose to make of the Package, 19 | you should contact the Copyright Holder and seek a different licensing 20 | arrangement. 21 | 22 | Definitions 23 | 24 | "Copyright Holder" means the individual(s) or organization(s) named in the 25 | copyright notice for the entire Package. 26 | 27 | "Contributor" means any party that has contributed code or other material to 28 | the Package, in accordance with the Copyright Holder's procedures. 29 | 30 | "You" and "your" means any person who would like to copy, distribute, or 31 | modify the Package. 32 | 33 | "Package" means the collection of files distributed by the Copyright Holder, 34 | and derivatives of that collection and/or of those files. A given Package may 35 | consist of either the Standard Version, or a Modified Version. 36 | 37 | "Distribute" means providing a copy of the Package or making it accessible to 38 | anyone else, or in the case of a company or organization, to others outside of 39 | your company or organization. 40 | 41 | "Distributor Fee" means any fee that you charge for Distributing this Package 42 | or providing support for this Package to another party. It does not mean 43 | licensing fees. 
44 | 45 | "Standard Version" refers to the Package if it has not been modified, or has 46 | been modified only in ways explicitly requested by the Copyright Holder. 47 | 48 | "Modified Version" means the Package, if it has been changed, and such changes 49 | were not explicitly requested by the Copyright Holder. 50 | 51 | "Original License" means this Artistic License as Distributed with the 52 | Standard Version of the Package, in its current version or as it may be 53 | modified by The Perl Foundation in the future. 54 | 55 | "Source" form means the source code, documentation source, and configuration 56 | files for the Package. 57 | 58 | "Compiled" form means the compiled bytecode, object code, binary, or any other 59 | form resulting from mechanical transformation or translation of the Source 60 | form. 61 | 62 | Permission for Use and Modification Without Distribution 63 | 64 | (1) You are permitted to use the Standard Version and create and use Modified 65 | Versions for any purpose without restriction, provided that you do not 66 | Distribute the Modified Version. 67 | 68 | Permissions for Redistribution of the Standard Version 69 | 70 | (2) You may Distribute verbatim copies of the Source form of the Standard 71 | Version of this Package in any medium without restriction, either gratis or 72 | for a Distributor Fee, provided that you duplicate all of the original 73 | copyright notices and associated disclaimers. At your discretion, such 74 | verbatim copies may or may not include a Compiled form of the Package. 75 | 76 | (3) You may apply any bug fixes, portability changes, and other modifications 77 | made available from the Copyright Holder. The resulting Package will still be 78 | considered the Standard Version, and as such will be subject to the Original 79 | License. 
80 | 81 | Distribution of Modified Versions of the Package as Source 82 | 83 | (4) You may Distribute your Modified Version as Source (either gratis or for a 84 | Distributor Fee, and with or without a Compiled form of the Modified Version) 85 | provided that you clearly document how it differs from the Standard Version, 86 | including, but not limited to, documenting any non-standard features, 87 | executables, or modules, and provided that you do at least ONE of the 88 | following: 89 | 90 | (a) make the Modified Version available to the Copyright Holder of the 91 | Standard Version, under the Original License, so that the Copyright Holder may 92 | include your modifications in the Standard Version. 93 | 94 | (b) ensure that installation of your Modified Version does not prevent the 95 | user installing or running the Standard Version. In addition, the Modified 96 | Version must bear a name that is different from the name of the Standard 97 | Version. 98 | 99 | (c) allow anyone who receives a copy of the Modified Version to make the 100 | Source form of the Modified Version available to others under 101 | 102 | (i) the Original License or 103 | 104 | (ii) a license that permits the licensee to freely copy, modify and 105 | redistribute the Modified Version using the same licensing terms that apply to 106 | the copy that the licensee received, and requires that the Source form of the 107 | Modified Version, and of any works derived from it, be made freely available 108 | in that license fees are prohibited but Distributor Fees are allowed. 109 | 110 | Distribution of Compiled Forms of the Standard Version or Modified Versions 111 | without the Source 112 | 113 | (5) You may Distribute Compiled forms of the Standard Version without the 114 | Source, provided that you include complete instructions on how to get the 115 | Source of the Standard Version. Such instructions must be valid at the time of 116 | your distribution. 
If these instructions, at any time while you are carrying 117 | out such distribution, become invalid, you must provide new instructions on 118 | demand or cease further distribution. If you provide valid instructions or 119 | cease distribution within thirty days after you become aware that the 120 | instructions are invalid, then you do not forfeit any of your rights under 121 | this license. 122 | 123 | (6) You may Distribute a Modified Version in Compiled form without the Source, 124 | provided that you comply with Section 4 with respect to the Source of the 125 | Modified Version. 126 | 127 | Aggregating or Linking the Package 128 | 129 | (7) You may aggregate the Package (either the Standard Version or Modified 130 | Version) with other packages and Distribute the resulting aggregation provided 131 | that you do not charge a licensing fee for the Package. Distributor Fees are 132 | permitted, and licensing fees for other components in the aggregation are 133 | permitted. The terms of this license apply to the use and Distribution of the 134 | Standard or Modified Versions as included in the aggregation. 135 | 136 | (8) You are permitted to link Modified and Standard Versions with other works, 137 | to embed the Package in a larger work of your own, or to build stand-alone 138 | binary or bytecode versions of applications that include the Package, and 139 | Distribute the result without restriction, provided the result does not expose 140 | a direct interface to the Package. 141 | 142 | Items That are Not Considered Part of a Modified Version 143 | 144 | (9) Works (including, but not limited to, modules and scripts) that merely 145 | extend or make use of the Package, do not, by themselves, cause the Package to 146 | be a Modified Version. In addition, such works are not considered parts of the 147 | Package itself, and are not subject to the terms of this license. 
148 | 149 | General Provisions 150 | 151 | (10) Any use, modification, and distribution of the Standard or Modified 152 | Versions is governed by this Artistic License. By using, modifying or 153 | distributing the Package, you accept this license. Do not use, modify, or 154 | distribute the Package, if you do not accept this license. 155 | 156 | (11) If your Modified Version has been derived from a Modified Version made by 157 | someone other than you, you are nevertheless required to ensure that your 158 | Modified Version complies with the requirements of this license. 159 | 160 | (12) This license does not grant you the right to use any trademark, service 161 | mark, tradename, or logo of the Copyright Holder. 162 | 163 | (13) This license includes the non-exclusive, worldwide, free-of-charge patent 164 | license to make, have made, use, offer to sell, sell, import and otherwise 165 | transfer the Package with respect to any patent claims licensable by the 166 | Copyright Holder that are necessarily infringed by the Package. If you 167 | institute patent litigation (including a cross-claim or counterclaim) against 168 | any party alleging that the Package constitutes direct or contributory patent 169 | infringement, then this Artistic License to you shall terminate on the date 170 | that such litigation is filed. 171 | 172 | (14) Disclaimer of Warranty: 173 | 174 | THE PACKAGE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS "AS IS' 175 | AND WITHOUT ANY EXPRESS OR IMPLIED WARRANTIES. THE IMPLIED WARRANTIES OF 176 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT ARE 177 | DISCLAIMED TO THE EXTENT PERMITTED BY YOUR LOCAL LAW. UNLESS REQUIRED BY LAW, 178 | NO COPYRIGHT HOLDER OR CONTRIBUTOR WILL BE LIABLE FOR ANY DIRECT, INDIRECT, 179 | INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING IN ANY WAY OUT OF THE USE OF THE 180 | PACKAGE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
181 | -------------------------------------------------------------------------------- /bin/tsebra.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # ============================================================== 3 | # author: Lars Gabriel 4 | # 5 | # TSEBRA: Transcript Selector for BRAKER 6 | # ============================================================== 7 | import argparse 8 | import sys 9 | import os 10 | import csv 11 | 12 | class ConfigFileError(Exception): 13 | pass 14 | 15 | class GeneSetMissing(Exception): 16 | pass 17 | 18 | gtf = [] 19 | enforce_tx = [] 20 | anno = [] 21 | hintfiles = [] 22 | graph = None 23 | out = '' 24 | v = 0 25 | quiet = False 26 | filter_sing_exon = False 27 | ignore_tx_phase = False 28 | scores_tab = '' 29 | parameter = {'intron_support' : 0, 'stasto_support' : 0, \ 30 | 'e_1' : 0, 'e_2' : 0, 'e_3' : 0, 'e_4' : 0} 31 | 32 | def main(): 33 | """ 34 | Overview: 35 | 36 | 1. Read gene predicitions from .gtf files. 37 | 2. Read Evidence from .gff files. 38 | 3. Detect overlapping transcripts. 39 | 4. Create feature vector (for a list of all features see features.py) 40 | for all transcripts. 41 | 5. Compare the feature vectors of all pairs of overlapping transcripts. 42 | 6. Exclude transcripts based on the 'transcript comparison rule' and 5. 43 | 7. Remove Transcripts with low evidence support. 44 | 8. Create combined gene predicitions (all transcripts that weren't excluded). 
45 | """ 46 | 47 | from genome_anno import Anno 48 | from overlap_graph import Graph 49 | from evidence import Evidence 50 | 51 | global anno, graph, parameter 52 | 53 | args = parseCmd() 54 | init(args) 55 | 56 | if v > 0: 57 | print(gtf) 58 | 59 | # read gene prediciton files 60 | c = 1 61 | keep = [] 62 | 63 | for g in gtf: 64 | if not quiet: 65 | sys.stderr.write(f'### READING GENE PREDICTION: [{g}]\n') 66 | anno.append(Anno(g, f'anno{c}')) 67 | anno[-1].addGtf() 68 | anno[-1].norm_tx_format() 69 | c += 1 70 | for g in enforce_tx: 71 | if not quiet: 72 | sys.stderr.write(f'### READING GENE PREDICTION: [{g}]\n') 73 | anno.append(Anno(g, f'anno{c}')) 74 | anno[-1].addGtf() 75 | anno[-1].norm_tx_format() 76 | keep.append(f'anno{c}') 77 | c += 1 78 | 79 | # read hintfiles 80 | evi = Evidence() 81 | for h in hintfiles: 82 | if not quiet: 83 | sys.stderr.write(f'### READING EXTRINSIC EVIDENCE: [{h}]\n') 84 | evi.add_hintfile(h) 85 | for src in evi.src: 86 | if src not in parameter.keys(): 87 | sys.stderr.write(f'ConfigError: No weight for src={src}, it is set to 1\n') 88 | parameter.update({src : 1}) 89 | 90 | # create graph with an edge for each unique transcript 91 | # and an edge if two transcripts overlap 92 | # two transcripts overlap if they share at least 3 adjacent protein coding nucleotides 93 | 94 | graph = Graph(anno, para=parameter, keep_tx=keep, filter_single=filter_sing_exon, ignore_phase=ignore_tx_phase, verbose=v) 95 | if not quiet: 96 | sys.stderr.write('### BUILD OVERLAP GRAPH\n') 97 | graph.build() 98 | 99 | # add features 100 | if not quiet: 101 | sys.stderr.write('### ADD FEATURES TO TRANSCRIPTS\n') 102 | graph.add_node_features(evi) 103 | 104 | # apply decision rule to exclude a set of transcripts 105 | if not quiet: 106 | sys.stderr.write('### SELECT TRANSCRIPTS\n') 107 | combined_prediction = graph.get_decided_graph() 108 | 109 | if v > 0: 110 | sys.stderr.write(str(combined_prediction.keys()) + '\n') 111 | for a in anno: 112 | 
sys.stderr.write('Numb_tx in {}: {}\n'.format(a.id, len(combined_prediction[a.id]))) 113 | 114 | # write result to output file 115 | if not quiet: 116 | sys.stderr.write('### WRITE COMBINED GENE PREDICTION\n') 117 | combined_anno = Anno('', 'combined_annotation') 118 | for a in anno: 119 | txs = a.get_subset([t[0] for t in combined_prediction[a.id]]) 120 | for id, new_gene_id in combined_prediction[a.id]: 121 | txs[id].set_gene_id(new_gene_id) 122 | combined_anno.add_transcripts(txs, a.id + '.') 123 | combined_anno.find_genes() 124 | combined_anno.write_anno(out) 125 | 126 | if scores_tab: 127 | if not quiet: 128 | sys.stderr.write('### WRITE TRANSCRIPT SCORES\n') 129 | tab_out = [['### TX_ID','intron_support', 'stasto_support', 's1', 's2', 's3', 's4']] 130 | for node in graph.nodes.values(): 131 | tab_out += [[node.id] + list(node.feature_vector)] 132 | write_csv(scores_tab, tab_out) 133 | 134 | if not quiet: 135 | sys.stderr.write('### FINISHED\n\n') 136 | sys.stderr.write('### The combined gene prediciton is located at {}.\n'.format(\ 137 | out)) 138 | 139 | def set_parameter(cfg_file): 140 | """ 141 | Read parameters from the cfg file and store them in parameter. 142 | 143 | Args: 144 | cfg_file (str): Path to configuration file. 145 | """ 146 | global parameter 147 | with open(cfg_file, 'r') as file: 148 | cfg = csv.reader(file, delimiter=' ') 149 | for line in cfg: 150 | if not line[0][0] == '#': 151 | if line[0] not in parameter.keys(): 152 | parameter.update({line[0] : None}) 153 | parameter[line[0]] = float(line[1]) 154 | 155 | def write_csv(out_path, tab): 156 | """ 157 | Write table to out_path. 
158 | Args: 159 | (str) : path to the output file 160 | (list) : table 161 | """ 162 | with open(out_path, 'w+') as file: 163 | out_writer = csv.writer(file, delimiter='\t', quotechar = "|", lineterminator = '\n') 164 | for line in tab: 165 | out_writer.writerow(line) 166 | 167 | def init(args): 168 | global gtf, hintfiles, threads, hint_source_weight, out, enforce_tx, v, scores_tab, filter_sing_exon, ignore_tx_phase, quiet 169 | if args.gtf: 170 | gtf = args.gtf.split(',') 171 | if args.keep_gtf: 172 | enforce_tx = args.keep_gtf.split(',') 173 | if not args.keep_gtf and not args.gtf: 174 | raise GeneSetMissing('At least one gene set has to be provided '\ 175 | + 'either with --gtf or --kepp_all!') 176 | if args.hintfiles: 177 | hintfiles = args.hintfiles.split(',') 178 | if args.cfg: 179 | cfg_file = args.cfg 180 | else: 181 | cfg_file = os.path.dirname(os.path.realpath(__file__)) + '/../config/default.cfg' 182 | set_parameter(cfg_file) 183 | if args.score_tab: 184 | scores_tab = args.score_tab 185 | if args.filter_single_exon_genes: 186 | filter_sing_exon = args.filter_single_exon_genes 187 | if args.ignore_tx_phase: 188 | ignore_tx_phase = args.ignore_tx_phase 189 | if args.out: 190 | out = args.out 191 | if args.verbose: 192 | v = args.verbose 193 | if args.quiet: 194 | quiet = True 195 | 196 | def parseCmd(): 197 | """Parse command line arguments 198 | 199 | Returns: 200 | dictionary: Dictionary with arguments 201 | """ 202 | parser = argparse.ArgumentParser(description='TSEBRA: Transcript Selector for BRAKER\n\n' \ 203 | + 'TSEBRA combines gene predictions by selecing ' \ 204 | + 'transcripts based on their extrisic evidence support.') 205 | parser.add_argument('-g', '--gtf', type=str, 206 | help='List (separated by commas) of gene prediciton files in gtf.\n' \ 207 | + '(e.g. 
gene_pred1.gtf,gene_pred2.gtf,gene_pred3.gtf)') 208 | parser.add_argument('-k', '--keep_gtf', type=str, 209 | help='List (separated by commas) of gene prediciton files in gtf.\n' \ 210 | + 'These gene sets are used the same way as other inputs, but TSEBRA '\ 211 | + 'ensures that all transcripts from these gene sets are included in the output.') 212 | parser.add_argument('-e', '--hintfiles', type=str, 213 | help='List (separated by commas) of files containing extrinsic evidence in gff.\n' \ 214 | + '(e.g. hintsfile1.gff,hintsfile2.gtf,3.gtf)') 215 | parser.add_argument('-c', '--cfg', type=str, 216 | help='Configuration file that sets the parameter for TSEBRA. ' \ 217 | + 'You can find the recommended parameter at config/default.cfg.') 218 | parser.add_argument('--filter_single_exon_genes', action='store_true', 219 | help='Filter out all single-exon genes out that are not' \ 220 | + ' supported by at least one start- or stop-codon hint.') 221 | parser.add_argument('--ignore_tx_phase', action='store_true', 222 | help='Ignore the phase of transcripts while detecting clusters ' \ 223 | + 'of overlapping transcripts.') 224 | parser.add_argument('-s', '--score_tab', type=str, 225 | help='Prints the transcript scores as a table to the specified file.') 226 | parser.add_argument('-o', '--out', type=str, required=True, 227 | help='Outputfile for the combined gene prediciton in gtf.') 228 | parser.add_argument('-q', '--quiet', action='store_true', 229 | help='Quiet mode.') 230 | parser.add_argument('-v', '--verbose', type=int, 231 | help='') 232 | return parser.parse_args() 233 | 234 | if __name__ == '__main__': 235 | main() -------------------------------------------------------------------------------- /tests/prep_files.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # ============================================================== 3 | # author: Lars Gabriel 4 | # 5 | # prep_files.py: create example data 
for pytests 6 | # ============================================================== 7 | import os 8 | testDir = os.path.abspath(os.path.dirname(__file__)) 9 | 10 | def genome_anno(): 11 | anno1 = testDir + '/genome_anno/anno1.gtf' 12 | orig = [] 13 | with open(anno1, 'r') as file: 14 | for line in file.readlines(): 15 | line = line.strip('\n') 16 | orig.append(line) 17 | orig = [f.split('\t') for f in orig] 18 | 19 | anno = orig 20 | anno[1][8] = 'gene_id "7789_g";' 21 | anno = ['\t'.join(map(str, line)) for line in anno] 22 | with open(testDir + '/genome_anno/format_error.gtf', 'w+') as file: 23 | file.write('\n'.join(anno)) 24 | 25 | anno = orig 26 | anno[1][8] = 'transcript_id "7789_t";' 27 | anno[6][8] = 'transcript_id "g5980.t1";' 28 | for line in anno: 29 | if 'transcript_id "g11080.t1";' in line[8]: 30 | line[8] = 'transcript_id "g11080.t1";' 31 | anno = ['\t'.join(map(str, line)) for line in anno] 32 | with open(testDir + '/genome_anno/missing_gid.gtf', 'w+') as file: 33 | file.write('\n'.join(anno)) 34 | 35 | def get_anno(tx_dict, phase): 36 | template = ['3R', 'AUGUSTUS', '', '', '', phase, '+', '0', ''] 37 | anno = [] 38 | for key in tx_dict: 39 | coord = tx_dict[key] 40 | template[8] = 'transcript_id "{}"; gene_id "{}";'.format(key, key + '_g') 41 | type = 'exon' 42 | pos = coord[0] 43 | for c in coord[1:]: 44 | line = template.copy() 45 | line[2] = type 46 | line[3] = pos 47 | pos += c 48 | line[4] = pos 49 | if type == 'intron': 50 | line[3] += 1 51 | line[4] -= 1 52 | anno.append(line) 53 | if type == 'exon': 54 | line = line.copy() 55 | line[2] = 'CDS' 56 | anno.append(line) 57 | type = 'intron' 58 | else: 59 | type = 'exon' 60 | line = template.copy() 61 | line[2] = 'transcript' 62 | line[3] = str(coord[0]) 63 | line[4] = str(pos) 64 | line[8] = key 65 | anno.append(line) 66 | return anno 67 | 68 | def list2string(gtf): 69 | gtf = ['\t'.join(map(str, g)) for g in gtf] 70 | return '\n'.join(gtf) 71 | 72 | def graph(): 73 | dir = testDir + '/graph/' 74 
| #example 1 75 | anno1_txs = { 't1' : [100, 100, 100, 100], \ 76 | 't2' : [700, 100, 100, 100, 100, 100], \ 77 | 't3' : [1500, 100]} 78 | anno1 = get_anno(anno1_txs, '0') 79 | with open(dir + 'ex1_anno1.gtf', 'w+') as file: 80 | file.write(list2string(anno1)) 81 | 82 | anno2_txs = { 't1' : [250, 250, 100, 150], 83 | 't2' : [1050, 200], 84 | 't3' : [1700, 100]} 85 | anno2 = get_anno(anno2_txs, '0') 86 | with open(dir + 'ex1_anno2.gtf', 'w+') as file: 87 | file.write(list2string(anno2)) 88 | 89 | #example 2 90 | anno1_txs = { 't1' : [200, 100]} 91 | anno1 = get_anno(anno1_txs, '0') 92 | with open(dir + 'ex2_anno1.gtf', 'w+') as file: 93 | file.write(list2string(anno1)) 94 | 95 | anno2_txs = { 't1' : [100, 100], \ 96 | 't2' : [301, 99]} 97 | anno2 = get_anno(anno2_txs, '0') 98 | with open(dir + 'ex2_anno2.gtf', 'w+') as file: 99 | file.write(list2string(anno2)) 100 | 101 | #example 3 102 | anno1_txs = { 't1' : [100, 200, 200, 200, 200, 200]} 103 | anno1 = get_anno(anno1_txs, '0') 104 | with open(dir + 'ex3_anno1.gtf', 'w+') as file: 105 | file.write(list2string(anno1)) 106 | 107 | anno2_txs = { 't1' : [110, 90, 600, 200], \ 108 | 't2' : [350, 100]} 109 | anno2 = get_anno(anno2_txs, '0') 110 | with open(dir + 'ex3_anno2.gtf', 'w+') as file: 111 | file.write(list2string(anno2)) 112 | 113 | #example 4 114 | anno1_txs = { 't1' : [100, 100, 100, 100]} 115 | anno1 = get_anno(anno1_txs, '0') 116 | with open(dir + 'ex4_anno1.gtf', 'w+') as file: 117 | file.write(list2string(anno1)) 118 | 119 | anno2_txs = { 't1' : [101, 100, 100, 100]} 120 | anno2 = get_anno(anno2_txs, '1') 121 | with open(dir + 'ex4_anno2.gtf', 'w+') as file: 122 | file.write(list2string(anno2)) 123 | 124 | def evidence(): 125 | dir = testDir + '/evidence/' 126 | hint_test_file1 = ['3L\tProtHint\tintron\t5812862\t5812941\t24\t-\t.\tsrc=M;mult=24;pri=4\n', \ 127 | '3L\tProtHint\tintron\t12291242\t12291299\t8\t-\t.\ttranscript_id="t1"\n', \ 128 | 
'3L\tProtHint\tintron\t12291242\t12291299\t8\t-\t.\tsrc=M;pri=4\n', 129 | '3L\tProtHint\tintron\t12291242\t'] 130 | with open(dir + 'hint1.gff', 'w+') as file: 131 | file.write(''.join(hint_test_file1)) 132 | 133 | hint_test_file2 = ['3L\tProtHint\tintron\t5812862\t5812941\t24\t-\t.\tsrc=M;mult=24;pri=4\n', \ 134 | '3L\tProtHint\tintron\t12291242\t12291299\t8\t-\t.\tsrc=M;mult=8;pri=4\n', \ 135 | '3R\tProtHint\tintron\t17440148\t17440207\t25\t-\t.\tsrc=M;mult=25;pri=4\n', \ 136 | '2R\tProtHint\tintron\t5760114\t5760177\t23\t-\t.\tsrc=M;mult=23;pri=4\n', \ 137 | '2R\tProtHint\tintron\t6210484\t6210546\t21\t-\t.\tsrc=M;mult=21;pri=4\n', \ 138 | '3L\tProtHint\tintron\t20527281\t20527592\t25\t+\t.\tsrc=M;mult=25;pri=4\n', \ 139 | '2L\tProtHint\tintron\t12400752\t12400814\t24\t+\t.\tsrc=M;mult=24;pri=4\n', \ 140 | '2R\tProtHint\tintron\t14988084\t14988142\t25\t-\t.\tsrc=M;mult=25;pri=4\n', \ 141 | '2L\tProtHint\tintron\t6667531\t6667670\t5\t-\t.\tsrc=M;mult=5;pri=4\n', \ 142 | '3R\tProtHint\tintron\t5537551\t5537605\t22\t+\t.\tsrc=M;mult=22;pri=4\n', \ 143 | '3R\tProtHint\tintron\t20813612\t20813665\t12\t-\t.\tsrc=M;mult=12;pri=4\n', \ 144 | 'X\tProtHint\tintron\t2145714\t2147174\t25\t+\t.\tsrc=M;mult=25;pri=4\n', \ 145 | '3L\tProtHint\tintron\t8114197\t8114256\t25\t-\t.\tsrc=M;mult=25;pri=4\n', \ 146 | 'X\tProtHint\tintron\t11048602\t11048941\t25\t+\t.\tsrc=M;mult=25;pri=4\n', \ 147 | '2L\tProtHint\tintron\t3807462\t3807524\t18\t+\t.\tsrc=M;mult=18;pri=4\n', \ 148 | '3R\tProtHint\tintron\t27059120\t27059364\t19\t-\t.\tsrc=M;mult=19;pri=4\n', \ 149 | '2R\tProtHint\tintron\t13821370\t13821432\t24\t-\t.\tsrc=M;mult=24;pri=4\n', \ 150 | 'X\tProtHint\tintron\t8173462\t8173860\t6\t-\t.\tsrc=M;mult=6;pri=4\n', \ 151 | 'X\tProtHint\tintron\t13270643\t13271481\t16\t-\t.\tsrc=M;mult=16;pri=4\n', \ 152 | 'X\tProtHint\tintron\t2079645\t2079714\t25\t-\t.\tsrc=M;mult=25;pri=4\n'] 153 | with open(dir + 'hint2.gff', 'w+') as file: 154 | file.write(''.join(hint_test_file2)) 155 | 156 | 
hint_test_file3 = [] 157 | hint_test_file3.append(get_hint(100, 102, 'start_codon')) 158 | hint_test_file3.append(get_hint(501, 599, 'intron')) 159 | hint_test_file3.append(get_hint(501, 599, 'intron', src='P', mult=14)) 160 | hint_test_file3.append(get_hint(698, 700, 'stop_codon')) 161 | hint_test_file3.append(get_hint(801, 899, 'intron')) 162 | hint_test_file3.append(get_hint(801, 899, 'intron', chr='2L')) 163 | hint_test_file3.append(get_hint(801, 899, 'intron', src='P', mult=24)) 164 | hint_test_file3.append(get_hint(801, 949, 'intron')) 165 | hint_test_file3.append(get_hint(801, 899, 'intron', strand='-')) 166 | hint_test_file3.append(get_hint(1001, 1099, 'intron')) 167 | hint_test_file3.append(get_hint(1198, 1200, 'stop_codon')) 168 | hint_test_file3.append(get_hint(1601, 1699, 'intron')) 169 | with open(dir + 'hint3.gff', 'w+') as file: 170 | file.write('\n'.join(hint_test_file3)) 171 | 172 | 173 | def get_hint(start, end, type, strand='+', chr='3R', score=10, mult=2, pri=4, src='E'): 174 | att = 'src={};mult={};pri={}'.format(src,mult,pri) 175 | template = [chr, 'AUGUSTUS', type, start, end, score, '+', '.', att] 176 | return '\t'.join(map(str, template)) 177 | 178 | def get_feature(): 179 | dir = testDir + '/graph/' 180 | result = [] 181 | with open('/home/lars/work/combiner/example/braker1/braker_fixed.gtf', 'r') as file: 182 | for line in file.readlines(): 183 | if 'g7604.t1' in line or 'g7603.t1' in line or 'g7605.t1' in line: 184 | result.append(line) 185 | with open(dir + 'ex_feature_anno1.gtf', 'w+') as file: 186 | file.write(''.join(result)) 187 | 188 | result = [] 189 | with open('/home/lars/work/combiner/example/braker2/braker.gtf', 'r') as file: 190 | for line in file.readlines(): 191 | if 'g7700.t1' in line or 'g7701.t1' in line: 192 | result.append(line) 193 | with open(dir + 'ex_feature_anno2.gtf', 'w+') as file: 194 | file.write(''.join(result)) 195 | 196 | result = [] 197 | with open('/home/lars/work/combiner/example/braker1/hintsfile.gff', 
'r') as file: 198 | for line in file.readlines(): 199 | line = line.split('\t') 200 | if len(line) > 8: 201 | if int(line[3]) >= 21737000 and int(line[4]) <= 21750000 \ 202 | and line[0] == '3R' and not line[2] == 'CDSpart': 203 | result.append(line) 204 | result = ['\t'.join(r) for r in result] 205 | with open(dir + 'ex_feature_hint1.gff', 'w+') as file: 206 | file.write(''.join(result)) 207 | 208 | result = [] 209 | with open('/home/lars/work/combiner/example/braker2/hintsfile.gff', 'r') as file: 210 | for line in file.readlines(): 211 | line = line.split('\t') 212 | if len(line) > 8: 213 | if int(line[3]) >= 21737000 and int(line[4]) <= 21750000 \ 214 | and line[0] == '3R' and not line[2] == 'CDSpart': 215 | result.append(line) 216 | result = ['\t'.join(r) for r in result] 217 | with open(dir + 'ex_feature_hint2.gff', 'w+') as file: 218 | file.write(''.join(result)) 219 | 220 | if __name__ == '__main__': 221 | #genome_anno() 222 | #graph() 223 | #evidence() 224 | get_feature() 225 | -------------------------------------------------------------------------------- /bin/compleasm-LICENSE.txt: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. 
For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![install with bioconda](https://img.shields.io/badge/install%20with-bioconda-brightgreen.svg?style=flat)](http://bioconda.github.io/recipes/tsebra/README.html) 2 | [![European Galaxy server](https://img.shields.io/badge/usegalaxy-.eu-brightgreen?logo=)](https://usegalaxy.eu/root?tool_id=tsebra) 3 | 4 | # TSEBRA: Transcript Selector for BRAKER 5 | 6 |

7 | drawing 8 |

9 | 10 | ### Introduction 11 | [TSEBRA](https://doi.org/10.1186/s12859-021-04482-0) is a combiner tool that selects transcripts from gene predictions based on the support by extrinsic evidence in the form of introns and start/stop codons. It was developed to combine BRAKER1[1](#ref1) and BRAKER2[2](#ref2) predictions to increase their accuracies. 12 | 13 | ## Prerequisites 14 | TSEBRA itself requires Python 3.5.2 or higher. 15 | 16 | `best_by_compleasm.py`, a script that may re-run TSEBRA on a BRAKER output folder to maximize BUSCO presence in the output gene set, requires compleasm v0.2.4 or newer (https://github.com/huangnengCSU/compleasm), and the Python module pandas. 17 | 18 | ## Installation 19 | Download TSEBRA: 20 | ``` 21 | git clone https://github.com/Gaius-Augustus/TSEBRA 22 | ``` 23 | 24 | If desired, download compleasm: 25 | 26 | ``` 27 | wget https://github.com/huangnengCSU/compleasm/releases/download/v0.2.4/compleasm-0.2.4_x64-linux.tar.bz2 28 | tar -xvjf compleasm-0.2.4_x64-linux.tar.bz2 29 | ``` 30 | 31 | Add the resulting folder compleasm_kit to your `$PATH` variable, e.g.: 32 | ``` 33 | export PATH=$PATH:/your/path/to/compleasm_kit 34 | ``` 35 | 36 | Compleasm requires pandas, which can be installed with: 37 | 38 | ``` 39 | pip install pandas 40 | ``` 41 | 42 | ## Usage 43 | The main script is ```./bin/tsebra.py```. For usage information run ```./bin/tsebra.py --help```. 44 | 45 | ## Input Files 46 | TSEBRA takes a list of gene prediction files, a list of hintfiles and a configuration file as mandatory input. 47 | 48 | #### Gene Predictions 49 | The gene prediction files have to be in gtf format. This is the standard output format of a BRAKER or AUGUSTUS[3,](#ref3)[4](#ref4) gene prediction. 50 | 51 | Example: 52 | ```console 53 | 2L AUGUSTUS gene 83268 87026 0.88 - . g5332 54 | 2L AUGUSTUS transcript 83268 87026 0.88 - . g5332.t1 55 | 2L AUGUSTUS intron 84278 87019 1 - . 
transcript_id "file_1_file_1_g5332.t1"; gene_id "file_1_file_1_g5332"; 56 | 2L AUGUSTUS CDS 87020 87026 0.88 - 0 transcript_id "file_1_file_1_g5332.t1"; gene_id "file_1_file_1_g5332"; 57 | 2L AUGUSTUS exon 87020 87026 . - . transcript_id "file_1_file_1_g5332.t1"; gene_id "file_1_file_1_g5332"; 58 | ``` 59 | 60 | #### Hint Files 61 | The hint files have to be in gff format. The last column must include an attribute for the source of the hint with 'src=' and can include the number of hints supporting the gene structure segment with 'mult='. This is the standard file format of the ```hintsfile.gff``` in a BRAKER working directory. 62 | 63 | Example: 64 | ```console 65 | 2L ProtHint intron 279806 279869 2 + . src=P;mult=25;pri=4;al_score=0.437399; 66 | 2L ProtHint intron 275252 275318 2 - . src=P;mult=19;pri=4;al_score=0.430006; 67 | 2L ProtHint stop 293000 293002 1 + 0 grp=7220_0:002b08_g42;src=C;pri=4; 68 | 2L ProtHint intron 207632 207710 1 + . grp=7220_0:002afa_g26;src=C;pri=4; 69 | 2L ProtHint start 207512 207514 1 + 0 grp=7220_0:002afa_g26;src=C;pri=4; 70 | ``` 71 | 72 | #### Configuration File 73 | The configuration file has to include three different sets of parameters: 74 | 1. Weights for all sources of hints. The source of a hint is specified by the mandatory 'src=' attribute in the last column of the ```hintsfile.gff``` (see section 'Hint Files'). See section 'Transcript scores' in [TSEBRA](https://doi.org/10.1101/2021.06.07.447316) for more information on how these weights are used. 75 | A weight is set to 1 if the weight for a hint source is not specified in the configuration file. 76 | 77 | * *Notes on adjusting these parameters: Increase the weight of the hint sources that have the highest quality. For example, if the protein database includes only species that are remotely related to the target species, the hints produced by BRAKER2 might be less accurate than the RNA-seq evidence. 
Then, you should increase the weight of the source related to the RNA-seq hints.* 78 | 79 | 80 | 2. Required fractions of supported introns or supported start/stop-codons for a transcript. A transcript is not included in the TSEBRA result if the fractions of introns and start/stop codons supported by extrinsic evidence are lower than the thresholds. 81 | 82 | * *Notes on adjusting these parameters: The thresholds for low evidence support are quite strict in the default configuration file. In this configuration, only transcripts with very high evidence support are allowed in the TSEBRA result. In some cases, the default setting might be too strict, so that too many transcripts are filtered out. In this case, you should reduce the threshold of 'intron_support' (e.g., to 0.2).* 83 | 84 | 85 | 3. Allowed difference between two overlapping transcripts for the six transcript scores. TSEBRA compares transcripts via their transcript scores and removes the one with the lower score if their difference exceeds the respective threshold. 86 | Note that it is recommended to choose thresholds between [0,2], since the transcript scores are normalized to [-1,1]. 87 | 88 | * *Notes on adjusting these parameters: The higher the thresholds are set the fewer transcripts are filtered by the respective rule. With these thresholds one can adjust the effect of each filtering rule of TSEBRA. As these thresholds are increased, more transcripts are included in the TSEBRA result, in particular, more alternatively spliced isoforms per gene are contained in the result.* 89 | 90 | 91 | 92 | The name and the value of a parameter are separated by a space, and each parameter is listed in a different line. 
93 | Example: 94 | ```console 95 | # Weight for each hint source 96 | # Values have to be >= 0 97 | P 1 98 | E 1 99 | C 1 100 | M 1 101 | # Required fraction of supported introns 102 | # or supported start/stop-codons for a transcript 103 | # Values have to be in [0,1] 104 | intron_support 0.8 105 | stasto_support 1 106 | # Allowed difference for each feature 107 | # Values have to be in [0,2] 108 | e_1 0.0 109 | e_2 0.5 110 | e_3 0.096 111 | e_4 0.02 112 | e_5 0.18 113 | e_6 0.18 114 | ``` 115 | Description of evidence sources in default BRAKER1 and BRAKER2 outputs: 116 | ``` 117 | E = RNA-seq hints 118 | M = manual hints, these are hints that are enforced during the prediction step of BRAKER, 119 | C = protein hints from proteins with a 'high' spliced alignment score. 120 | P = protein hints from proteins that have a 'good' spliced alignment score, 121 | but that is lower than the score from the ones in 'C'. 122 | ``` 123 | 124 | ## Use Case 125 | The recommended and most common usage for TSEBRA is to combine the resulting ```augustus.hints.gtf``` files of a BRAKER1 and a BRAKER2 run using the hintsfile.gff from both working directories. However, TSEBRA can be applied to any number (>1) of gene predictions and hint files as long as they are in the correct format. 126 | 127 | A common case might be that a user wants to annotate a novel genome with BRAKER and has: 128 | * a novel genome with repeats masked: ```genome.fasta.masked```, 129 | * hints for intron positions from RNA-seq reads```rna_seq_hints.gff```, 130 | * database of homologous proteins: ```proteins.fa```. 131 | 132 | 1. 
Run BRAKER1 and BRAKER2 for example with 133 | ```console 134 | ### BRAKER1 135 | braker.pl --genome=genome.fasta.masked --hints=rna_seq_hints.gff \ 136 | --softmasking --species=species_name --workingdir=braker1_out 137 | ### BRAKER2 138 | braker.pl --genome=genome.fasta.masked --prot_seq=proteins.fa \ 139 | --softmasking --species=species_name --epmode \ 140 | --workingdir=braker2_out 141 | ``` 142 | 2. Combine predictions with TSEBRA 143 | ```console 144 | ./bin/tsebra.py -g braker1_out/augustus.hints.gtf,braker2_out/augustus.hints.gtf -c default.cfg \ 145 | -e braker1_out/hintsfile.gff,braker2_out/hintsfile.gff \ 146 | -o braker1+2_combined.gtf 147 | ``` 148 | The combined gene prediction is ```braker1+2_combined.gtf```. 149 | 150 | ## Example 151 | A small example is located at ```example/```. Run ```./example/run_tsebra_example.sh``` to execute the example and to check if TSEBRA runs properly. 152 | 153 | ## Enforcing a gene set 154 | A gene set can be enforced in the TSEBRA output, i.e. all transcripts are guaranteed to be included in the output, with the `--keep_gtf` option. The transcripts of enforced gene sets are still compared to all gene sets and used to evaluate them. 155 | Example: 156 | ```console 157 | ./bin/tsebra.py -g gene_set1,gene_set2 -c default.cfg \ 158 | -k enforced_set1,enforced_set2 -e hintsfile1.gff,braker2_out/hintsfile2.gff \ 159 | -o tsebra.gtf 160 | ``` 161 | To merge two gene sets, simply omit the `-g` option. 162 | 163 | 164 | ## Filter single-exon genes out 165 | In default mode, TSEBRA is conservative in filtering single exon genes out. In some cases BRAKER predicts a lot of false positive single exon genes. In these cases, it is recommended to run TSEBRA using the `--filter_single_exon_genes` option. In this mode, TSEBRA additionally filters out all single-exon genes that have no support by a start or stop codon hint. 166 | 167 | ## Print transcript scores 168 | The transcript scores play a very important role in TSEBRA. 
These are used for pairwise comparison of all transcript isoforms that have overlapping coding regions. You can print the scores as a table to a file with the option `--score_tab /path/to/output/file.tab`. 169 | 170 | ## Ignore Frame 171 | By default, TSEBRA groups all transcript isoforms that have overlapping coding regions in the same open reading frame (phase column in gtf) to candidates of the same gene. However, in some cases, it might be desired to consider already all transcripts with overlapping coding regions (regardless of the reading frame) as candidates for a gene. In this case add the `--ignore_tx_phase` option to the TSEBRA command. 172 | 173 | ## Other scripts in the TSEBRA repository 174 | 175 | ### Renaming transcripts from a TSEBRA output 176 | The IDs of the transcripts and genes in the TSEBRA output can be renamed such that the gene and transcript ID match. 177 | Genes and transcripts are numbered consecutively and for example, the second transcript of gene "g12" has the ID "g12.t2". 178 | If a prefix is set then it will be added before all IDs, for example, the transcript ID is "dmel_g12.t2" if the prefix is set to "dmel". 179 | Additionally, a translation table can be produced that provides the mapping from old to new transcript IDs. 180 | 181 | Example for renaming ```tsebra_result.gtf```: 182 | ```console 183 | ./bin/rename_gtf.py --gtf tsebra_result.gtf --prefix dmel --translation_tab translation.tab --out tsebra_result_renamed.gtf 184 | ``` 185 | The arguments ```--prefix``` and ```--translation_tab``` are optional. 186 | 187 | ### Fixing the formatting issue of `braker.gtf` 188 | A BRAKER run produces a second complete gene set named `braker.gtf`, besides the official output `augustus.hints.gtf`. The `braker.gtf` is the result of merging `augustus.hints.gtf` with some 'high-confidence' genes from the GeneMark prediction. However, the merging process leads to a formatting issue in `braker.gtf`. 
189 | A quick fix for this formatting issue is the script `fix_gtf_ids.py`, e.g.: 190 | ```console 191 | ./bin/fix_gtf_ids.py --gtf braker_out/braker.gtf --out braker1_fixed.gtf 192 | ``` 193 | Take note that the `braker.gtf` and `fix_gtf_ids.py` haven't been tested sufficiently and there is no guarantee that this gene set is superior to `augustus.hints.gtf`. 194 | 195 | ### Getting the longest isoform of each gene locus from different gene sets 196 | Combines multiple gene sets and reports the transcript with the longest coding region for each cluster of overlapping transcripts (one transcript per gene locus), e.g. 197 | ```console 198 | ./bin/get_longest_isoform.py --gtf gene_set1.gtf,gene_set2.gtf --out longest_isoforms.gtf 199 | ``` 200 | 201 | ## Licence 202 | All source code, i.e. `bin/*.py`, is under the [Artistic License](bin/LICENSE.txt). 203 | 204 | ## Citing TSEBRA 205 | Gabriel, L., Hoff, K.J., Brůna, T. *et al.* TSEBRA: transcript selector for BRAKER. *BMC Bioinformatics* **22**, 566 (2021). https://doi.org/10.1186/s12859-021-04482-0 206 | 207 | ## References 208 | [1] Hoff, Katharina J, Simone Lange, Alexandre Lomsadze, Mark Borodovsky, and Mario Stanke. 2015. “BRAKER1: Unsupervised Rna-Seq-Based Genome Annotation with Genemark-et and Augustus.” *Bioinformatics* 32 (5). Oxford University Press: 767--69.[↑](#a1) 209 | 210 | [2] Tomas Bruna, Katharina J. Hoff, Alexandre Lomsadze, Mario Stanke and Mark Borodovsky. 2021. “BRAKER2: automatic eukaryotic genome annotation with GeneMark-EP+ and AUGUSTUS supported by a protein database.” *NAR Genomics and Bioinformatics* 3(1):lqaa108.[↑](#a2) 211 | 212 | [3] Stanke, Mario, Mark Diekhans, Robert Baertsch, and David Haussler. 2008. “Using Native and Syntenically Mapped cDNA Alignments to Improve de Novo Gene Finding.” *Bioinformatics* 24 (5). Oxford University Press: 637--44.[↑](#a3) 213 | 214 | [4] Stanke, Mario, Oliver Schöffmann, Burkhard Morgenstern, and Stephan Waack. 2006. 
#!/usr/bin/env python3
# ==============================================================
# Lars Gabriel
#
# Graph for transcripts of multiple genome annotations.
# It can detect overlapping transcripts.
# Add a feature vector to each node.
# Compare nodes with the 'decision rule'.
# ==============================================================
import numpy as np


class Edge:
    """
        Class handling an edge in the overlap graph.
    """
    def __init__(self, n1_id, n2_id):
        """
            Args:
                n1_id (str): Node ID from overlap graph
                n2_id (str): Node ID from overlap graph
        """
        self.node1 = n1_id
        self.node2 = n2_id
        # node ID chosen for removal by Graph.decide_edge, or None if undecided
        self.node_to_remove = None


class Node:
    """
        Class handling a node that represents a transcript in the overlap graph.
    """
    def __init__(self, a_id, t_id):
        """
            Args:
                a_id (str): Annotation ID of the transcript from Anno object
                t_id (str): Transcript ID from Transcript object
        """
        # node key used throughout the graph: 'anno_id;tx_id'
        self.id = '{};{}'.format(a_id, t_id)
        self.transcript_id = t_id
        # set to 1.0 by Graph.add_reference_anno_label on a CDS match
        self.is_in_ref_anno = 0.0
        # ID of original annotation/gene prediction
        self.anno_id = a_id
        # unique ID for a cluster of overlapping transcripts
        self.component_id = None

        # dict of edge_ids of edges that are incident
        # self.edge_to[id of incident Node] = edge_id
        self.edge_to = {}
        self.feature_vector = [None] * 4
        self.evi_support = False
        self.enforce = False
        self.gene_sets = set()


class Graph:
    """
        Overlap graph that can detect and filter overlapping transcripts.
    """
    def __init__(self, genome_anno_lst, para, keep_tx=None, filter_single=False,
                 ignore_phase=False, verbose=0):
        """
            Args:
                genome_anno_lst (list(Anno)): List of Anno class objects
                                              containing genome annotations.
                para (dict(float)): Dictionary for parameter used for filtering of transcripts.
                keep_tx (list(str)): IDs of the annotations whose transcripts
                                     are enforced in the result (default: none).
                filter_single (bool): If set, transcripts without introns and
                                      without start/stop codon support lose
                                      their evidence support.
                ignore_phase (bool): If set, overlapping CDS are matched
                                     regardless of their reading frame.
                verbose (int): Verbose mode if verbose >0 .
        """
        # self.nodes['anno;txid'] = Node(anno, txid)
        self.nodes = {}

        # self.edges['ei'] = Edge()
        self.edges = {}

        # self.anno[annoid] = Anno()
        self.anno = {}

        # list of connected graph components
        self.component_index = 0
        self.component_list = []

        # subset of all transcripts that weren't removed by the transcript comparison rule
        self.decided_graph = []

        # dict of duplicate genome annotation ids to new ids
        self.duplicates = {}

        # variables for verbose mode
        self.v = verbose
        self.f = [[], [], [], []]
        self.ties = 0

        # parameters for decision rule
        self.para = para

        # list of transcript set names that are enforced
        # (a fresh list per instance; a literal [] default would be shared
        # by every Graph created without the argument)
        self.keep_tx = [] if keep_tx is None else keep_tx

        # init annotations, check for duplicate ids
        self.init_anno(genome_anno_lst)

        # filter single exon genes
        self.filter_single = filter_single
        self.ignore_phase = ignore_phase

    def init_anno(self, genome_anno_lst):
        """
            Registers all annotations under a unique ID. An annotation whose
            ID is already taken is renamed to 'duplicate.anno.<n>' and the
            mapping new ID -> original ID is stored in self.duplicates.

            Args:
                genome_anno_lst (list(Anno)): List of Anno class objects
        """
        counter = 0
        for ga in genome_anno_lst:
            if ga.id in self.anno:
                counter += 1
                new_id = "duplicate.anno.{}".format(counter)
                self.duplicates[new_id] = ga.id
                ga.change_id(new_id)
            self.anno[ga.id] = ga

    def __tx_from_key__(self, key):
        """
            Gets a transcript of a node.

            Args:
                key (str): ID of a node as 'anno_id;tx_id'

            Returns:
                (Transcript): Transcript class object with id = tx_id
                              from Anno() with id = anno_id
        """
        anno_id, tx_id = key.split(';')
        return self.anno[anno_id].transcripts[tx_id]

    def build(self):
        """
            Builds the overlap graph for >=1 Anno() objects.
            Each node of the graph represents a unique transcript from any annotation.
            Two nodes have an edge if their transcripts overlap
            (see compare_tx_cds for the overlap criterion).
        """
        # tx_start_end[chr] = [tx_id, coord, id for start or end]
        # for every tx one element for start and one for end
        # this dict is used to check for overlapping transcripts
        tx_start_end = {}
        # check for duplicate txs:
        # unique_tx_keys[chr]['start_end_strand'] = [Transcript, ...]
        unique_tx_keys = {}

        for anno in self.anno.values():
            for tx in anno.get_transcript_list():
                key = f'{tx.source_anno};{tx.id}'
                if tx.chr not in tx_start_end:
                    tx_start_end[tx.chr] = []
                    unique_tx_keys[tx.chr] = {}
                unique_key = '{}_{}_{}'.format(tx.start, tx.end, tx.strand)
                same_span = unique_tx_keys[tx.chr].setdefault(unique_key, [])

                # a transcript is a duplicate if an already registered
                # transcript with the same span has identical CDS coordinates
                duplicate_of = None
                if same_span:
                    coords = tx.get_type_coords('CDS')
                    duplicate_of = next(
                        (t for t in same_span
                         if coords == t.get_type_coords('CDS')), None)
                if duplicate_of is not None:
                    dup_key = f'{duplicate_of.source_anno};{duplicate_of.id}'
                    if tx.source_anno in self.keep_tx:
                        self.nodes[dup_key].enforce = True
                    # NOTE(review): this re-adds the source of the transcript
                    # that already owns the node, so it never changes the set;
                    # possibly tx.source_anno was intended - confirm.
                    self.nodes[dup_key].gene_sets.add(duplicate_of.source_anno)
                    continue

                same_span.append(tx)
                self.nodes[key] = Node(tx.source_anno, tx.id)
                self.nodes[key].gene_sets.add(tx.source_anno)
                if tx.source_anno in self.keep_tx:
                    self.nodes[key].enforce = True
                tx_start_end[tx.chr].append([key, tx.start, 0])
                tx_start_end[tx.chr].append([key, tx.end, 1])

        # detect overlapping nodes with a sweep over the sorted start/end
        # events; at equal coordinates start events (0) come before end
        # events (1)
        edge_count = 0
        for seq_id in tx_start_end:
            tx_start_end[seq_id] = sorted(tx_start_end[seq_id],
                                          key=lambda t: (t[1], t[2]))
            open_intervals = []
            for node_key, _, is_end in tx_start_end[seq_id]:
                if is_end == 0:
                    open_intervals.append(node_key)
                    continue
                # the transcript ends here: compare it to every transcript
                # that is still open at this position
                open_intervals.remove(node_key)
                tx1 = self.__tx_from_key__(node_key)
                for match in open_intervals:
                    tx2 = self.__tx_from_key__(match)
                    if self.compare_tx_cds(tx1, tx2):
                        new_edge_key = 'e{}'.format(edge_count)
                        edge_count += 1
                        self.edges[new_edge_key] = Edge(node_key, match)
                        self.nodes[node_key].edge_to[match] = new_edge_key
                        self.nodes[match].edge_to[node_key] = new_edge_key

    def compare_tx_cds(self, tx1, tx2):
        """
            Check if two transcripts have overlapping protein coding regions
            on the same strand and, unless ignore_phase is set, in the same
            reading frame. Only CDS segments that are adjacent after sorting
            all segments of both transcripts by start are compared.

            Args:
                tx1 (Transcript): Transcript class object of first transcript
                tx2 (Transcript): Transcript class object of second transcript

            Returns:
                (boolean): TRUE if they overlap and FALSE otherwise
        """
        if tx1.strand != tx2.strand:
            return False
        # every entry is [start, end, phase]
        coords = [c + [int(phase)]
                  for tx in (tx1, tx2)
                  for phase, coord_phase in tx.get_type_coords('CDS').items()
                  for c in coord_phase]
        coords.sort(key=lambda x: x[0])

        for prev, cur in zip(coords, coords[1:]):
            if prev[1] - cur[0] > 0:
                if self.ignore_phase:
                    return True
                elif tx1.strand == '+' and \
                        abs(prev[0] - prev[2] - cur[0] + cur[2]) % 3 == 0:
                    return True
                # NOTE(review): also reached for '+' strand pairs that failed
                # the start-based frame test above - confirm this is intended.
                elif abs(prev[1] + prev[2] - cur[1] - cur[2]) % 3 == 0:
                    return True
        return False

    def add_reference_anno_label(self, ref_anno):
        """
            Sets the value of is_in_ref_anno for each node to 1
            if the coding sequence of the corresponding transcript matches the
            coding sequence of a transcript in the reference anno,
            and to 0 otherwise.

            Args:
                ref_anno (Anno): Anno() object of reference annotation
        """
        def get_cds_key(tx):
            # chromosome, strand and all CDS segments joined into one string
            return '_'.join(
                [tx.chr, tx.strand]
                + [str(c[0]) + '_' + str(c[1])
                   for c in tx.get_type_coords('CDS', frame=False)])

        ref_anno_keys = {get_cds_key(tx) for tx in ref_anno.transcripts.values()}
        for key, node in self.nodes.items():
            if get_cds_key(self.__tx_from_key__(key)) in ref_anno_keys:
                node.is_in_ref_anno = 1.0
            else:
                node.is_in_ref_anno = 0.0

    def print_nodes(self):
        # prints all nodes of the graph (only used for development)
        for node in self.nodes.values():
            print(node.id)
            print(node.transcript_id)
            print(node.anno_id)
            print(node.edge_to.keys())
            print('\n')

    def connected_components(self):
        """
            Compute all clusters of connected transcripts.
            A cluster is connected component of the graph.
            Adds component IDs ('g_<index>') to nodes.

            Returns:
                (list(list(str))): Lists of list of all node IDs of a component.
        """
        # every node that was already assigned to a component; a set keeps
        # the membership tests constant time
        assigned = set()
        self.component_list = []
        self.component_index = 0
        for key in list(self.nodes):
            if key in assigned:
                continue
            component = [key]
            assigned.add(key)
            not_visited = [n for n in self.nodes[key].edge_to
                           if n not in assigned]
            component += not_visited
            assigned.update(not_visited)
            while not_visited:
                next_node = not_visited.pop()
                new_nodes = [n for n in self.nodes[next_node].edge_to
                             if n not in assigned]
                not_visited += new_nodes
                component += new_nodes
                assigned.update(new_nodes)
            self.component_list.append(component)
            self.component_index += 1
            for node in component:
                self.nodes[node].component_id = 'g_{}'.format(self.component_index)
        return self.component_list

    @staticmethod
    def _column_stats(matrix):
        """
            Column-wise mean and standard deviation of a 2D array.
            A standard deviation of 0 (constant column) is replaced by 1 so
            that dividing by it leaves centred values at 0 instead of
            producing NaN.

            Args:
                matrix (numpy.ndarray): 2D array, one row per node

            Returns:
                (tuple(numpy.ndarray, numpy.ndarray)): mean and std per column
        """
        mean = np.mean(matrix, axis=0)
        std = np.std(matrix, axis=0)
        return mean, np.where(std == 0, 1.0, std)

    def add_node_features(self, evi):
        """
            Compute for all nodes the feature vector based on the evidence support by evi.
            The features from index 2 on are standardized over all nodes
            (mean subtracted, divided by the standard deviation).
            Sets evi_support of a node if feature 0 reaches 'intron_support'
            or feature 1 reaches 'stasto_support'.

            Args:
                evi (Evidence): Evidence class object with all hints from any source.
        """
        # imported here so that the graph classes can be imported and used
        # without the feature module being importable
        from features import Node_features

        if not self.nodes:
            # nothing to standardize; slicing an empty matrix would fail
            return

        for key, node in self.nodes.items():
            tx = self.__tx_from_key__(key)
            node.feature_vector = np.array(
                Node_features(tx, evi, self.para).get_features())

        all_features = np.array([n.feature_vector for n in self.nodes.values()])
        mean, std = self._column_stats(all_features[:, 2:])

        for key, node in self.nodes.items():
            tx = self.__tx_from_key__(key)
            node.feature_vector[2:] -= mean
            node.feature_vector[2:] /= std
            if node.feature_vector[0] >= self.para['intron_support'] \
                    or node.feature_vector[1] >= self.para['stasto_support']:
                node.evi_support = True
            if self.filter_single:
                if len(tx.transcript_lines['intron']) == 0 and \
                        node.feature_vector[1] == 0:
                    node.evi_support = False

    def decide_edge(self, edge, iter_range = range(0,6)):
        """Apply transcript comparison rule to two overlapping transcripts

            The features are compared in order; the first feature whose
            difference exceeds its threshold para['e_<i+1>'] decides.
            Only pairs in which both transcripts have evidence support are
            compared.

            Args:
                edge (Edge): edge between two transcripts
                iter_range (iterable(int)): kept for backward compatibility;
                    the compared feature indices are always chosen here.

            Returns:
                (str): node ID of the transcript that is marked for removal,
                       None if no feature decides
        """
        n1 = self.nodes[edge.node1]
        n2 = self.nodes[edge.node2]
        if not (n1.evi_support and n2.evi_support):
            return None

        tx1 = self.__tx_from_key__(n1.id)
        tx2 = self.__tx_from_key__(n2.id)
        if len(tx1.transcript_lines['intron']) == 0 or \
                len(tx2.transcript_lines['intron']) == 0:
            # a transcript without introns is only compared on features 1 and 3
            iter_range = [1, 3]
        else:
            iter_range = range(4)

        for i in iter_range:
            diff = n1.feature_vector[i] - n2.feature_vector[i]
            if diff > self.para[f'e_{i+1}']:
                return n2.id
            elif diff < (-1 * self.para[f'e_{i+1}']):
                return n1.id
        return None

    def _is_filtered(self, node_id):
        """
            Returns True if a node has to be dropped from its component:
            it is not enforced and either lacks evidence support or was
            marked for removal by one of its incident edges.
        """
        node = self.nodes[node_id]
        if node.enforce:
            return False
        if not node.evi_support:
            return True
        return any(self.edges[e_id].node_to_remove == node_id
                   for e_id in node.edge_to.values())

    def decide_component(self, component):
        """Applies transcript comparison rule to all transcripts of one component
        and returns the node IDs of all transcripts that are not removed by
        a comparison.

        If the remaining transcripts are no longer connected, the cluster
        containing the first remaining transcript keeps the component ID and
        every further cluster gets a new one.

            Args:
                component (list(str)): List of node IDs

            Returns:
                (list(str)): Filtered subset of component list.
        """
        # all ids of vertices of the component, that weren't excluded by the
        # decision rule or by missing evidence support (original order kept)
        result = [node_id for node_id in component
                  if not self._is_filtered(node_id)]

        kept = set(result)
        # nodes that were already reached by a traversal
        seen = set()
        for k, start_id in enumerate(result):
            if start_id in seen:
                continue
            if k > 0:
                self.component_index += 1
            seen.add(start_id)
            not_visited = [start_id]
            while not_visited:
                n_id = not_visited.pop()
                if k > 0:
                    self.nodes[n_id].component_id = f'g_{self.component_index}'
                for neighbour in self.nodes[n_id].edge_to:
                    if neighbour in kept and neighbour not in seen:
                        seen.add(neighbour)
                        not_visited.append(neighbour)
        return result

    def decide_graph(self):
        """
            Create list of connected components of the graph and apply the
            transcript comparison rule to all components.
            The node IDs of all remaining transcripts are stored in
            self.decided_graph.
        """
        for edge in self.edges.values():
            edge.node_to_remove = self.decide_edge(edge)
        self.decided_graph = []
        if not self.component_list:
            self.connected_components()
        for component in self.component_list:
            if len(component) > 1:
                self.decided_graph += self.decide_component(component)
            elif self.nodes[component[0]].evi_support \
                    or self.nodes[component[0]].enforce:
                self.decided_graph += component

    def get_decided_graph(self):
        """
            Filter graph with the transcript comparison rule.
            Then, remove all transcripts with low evidence support and
            compute the subset of transcripts that are included in the
            combined gene prediction.

            Returns:
                (dict(list(list(str))): Dictionary with transcript IDs and new
                gene IDs of all transcripts included in the combined gene prediction
                for all input annotations
        """
        if not self.decided_graph:
            self.decide_graph()
        # result[anno_id] = [[tx_ids, new_gene_id]]
        result = {key: [] for key in self.anno}
        for node in self.decided_graph:
            anno_id, tx_id = node.split(';')
            result[anno_id].append([tx_id, self.nodes[node].component_id])

        if self.v > 0:
            print('NODES: {}'.format(len(self.nodes.keys())))
            f = list(map(set, self.f))
            print('f1: {}'.format(len(f[0])))
            u = f[0]
            print('f2: {}'.format(len(f[1])))
            print('f2/f1: {}'.format(len(f[1].difference(u))))
            u = u.union(f[1])
            print('f3: {}'.format(len(f[2])))
            print('f3/f2/f1: {}'.format(len(f[2].difference(u))))
            u = u.union(f[2])
            print('f4: {}'.format(len(f[3])))
            print('f4/f3/f2/f1: {}'.format(len(f[3].difference(u))))

        return result
mult=63;pri=4;src=E 10 | 2L b2h intron 26965 27052 56 - . mult=56;pri=4;src=E 11 | 2L b2h intron 27491 28014 42 - . mult=42;pri=4;src=E 12 | 2L b2h intron 27491 28410 0 - . pri=4;src=E 13 | 2L b2h intron 28241 28410 2 - . mult=2;pri=4;src=E 14 | 2L b2h intron 28241 28732 53 - . mult=53;pri=4;src=E 15 | 2L b2h intron 28927 28981 43 - . mult=43;pri=4;src=E 16 | 2L b2h intron 29069 30393 21 - . mult=21;pri=4;src=E 17 | 2L b2h intron 33271 33728 0 - . pri=4;src=E 18 | 2L b2h intron 29069 33844 21 - . mult=21;pri=4;src=E 19 | 2L b2h intron 30587 33844 0 - . pri=4;src=E 20 | 2L b2h intron 33271 33844 13 - . mult=13;pri=4;src=E 21 | 2L b2h intron 29069 34557 29 - . mult=29;pri=4;src=E 22 | 2L b2h intron 33271 34557 6 - . mult=6;pri=4;src=E 23 | 2L b2h intron 34289 34557 27 - . mult=27;pri=4;src=E 24 | 2L b2h intron 34605 34719 60 - . mult=60;pri=4;src=E 25 | 2L b2h intron 37812 37915 0 - . pri=4;src=E 26 | 2L b2h intron 34913 38534 9 - . mult=9;pri=4;src=E 27 | 2L b2h intron 35213 38534 36 - . mult=36;pri=4;src=E 28 | 2L b2h intron 38299 38534 8 - . mult=8;pri=4;src=E 29 | 2L b2h intron 38732 39300 40 - . mult=40;pri=4;src=E 30 | 2L b2h intron 25177 54176 0 + . pri=4;src=E 31 | 2L b2h intron 39858 58080 48 - . mult=48;pri=4;src=E 32 | 2L b2h intron 58183 58958 5 - . mult=5;pri=4;src=E 33 | 2L b2h intron 58183 59189 21 - . mult=21;pri=4;src=E 34 | 2L b2h intron 58732 59189 0 - . pri=4;src=E 35 | 2L b2h intron 21371 64891 0 + . pri=4;src=E 36 | 2L b2h intron 58183 65345 0 - . pri=4;src=E 37 | 2L b2h intron 58183 66575 4 - . mult=4;pri=4;src=E 38 | 2L b2h intron 66613 66675 0 + . pri=4;src=E 39 | 2L b2h intron 65411 66681 0 - . pri=4;src=E 40 | 2L b2h intron 66854 67388 0 + . pri=4;src=E 41 | 2L b2h intron 67112 67388 0 + . pri=4;src=E 42 | 2L b2h intron 67138 67388 3 + . mult=3;pri=4;src=E 43 | 2L b2h intron 23423 67568 0 + . pri=4;src=E 44 | 2L b2h intron 67508 67568 30 + . mult=30;pri=4;src=E 45 | 2L b2h intron 67763 67891 1070 + . 
mult=1070;pri=4;src=E 46 | 2L b2h intron 68024 68084 862 + . mult=862;pri=4;src=E 47 | 2L b2h intron 67996 70268 0 + . pri=4;src=E 48 | 2L b2h intron 70550 70606 81 + . mult=81;pri=4;src=E 49 | 2L b2h intron 70550 70613 0 + . pri=4;src=E 50 | 2L b2h intron 71805 71924 0 + . pri=4;src=E 51 | 2L b2h intron 71805 71949 36 + . mult=36;pri=4;src=E 52 | 2L b2h intron 58732 73110 0 - . pri=4;src=E 53 | 2L b2h intron 73448 73557 0 - . pri=4;src=E 54 | 2L b2h intron 72954 73668 0 + . pri=4;src=E 55 | 2L b2h intron 39858 73754 0 - . pri=4;src=E 56 | 2L b2h intron 73693 73819 5 + . mult=5;pri=4;src=E 57 | 2L b2h intron 72082 74902 13 + . mult=13;pri=4;src=E 58 | 2L b2h intron 72954 74902 0 + . mult=2;pri=4;src=E 59 | 2L b2h intron 72978 74902 188 + . mult=188;pri=4;src=E 60 | 2L b2h intron 73546 74902 0 + . pri=4;src=E 61 | 2L b2h intron 73586 74902 3 + . mult=3;pri=4;src=E 62 | 2L b2h intron 73693 74902 77 + . mult=77;pri=4;src=E 63 | 2L b2h intron 73898 74902 14 + . mult=14;pri=4;src=E 64 | 2L b2h intron 74573 74902 46 + . mult=46;pri=4;src=E 65 | 2L b2h intron 72978 75077 0 + . mult=2;pri=4;src=E 66 | 2L b2h intron 73693 75077 3 + . mult=3;pri=4;src=E 67 | 2L b2h intron 74573 75077 0 + . mult=3;pri=4;src=E 68 | 2L b2h intron 75019 75077 435 + . mult=435;pri=4;src=E 69 | 2L b2h intron 75367 75426 6 + . mult=6;pri=4;src=E 70 | 2L b2h intron 75367 77480 2 + . mult=2;pri=4;src=E 71 | 2L b2h intron 77584 77641 4 + . mult=4;pri=4;src=E 72 | 2L b2h intron 78870 78938 0 - . mult=2;pri=4;src=E 73 | 2L b2h intron 78231 81244 2 - . mult=2;pri=4;src=E 74 | 2L b2h intron 64019 82446 0 - . pri=4;src=E 75 | 2L b2h intron 75367 85176 0 + . pri=4;src=E 76 | 2L b2h intron 85177 85243 2 + . mult=2;pri=4;src=E 77 | 2L b2h intron 68160 85470 0 + . pri=4;src=E 78 | 2L b2h intron 84278 87019 9 - . mult=9;pri=4;src=E 79 | 2L b2h intron 94893 94988 37 + . mult=37;pri=4;src=E 80 | 2L b2h intron 95071 95131 50 + . mult=50;pri=4;src=E 81 | 2L b2h intron 94893 95144 0 + . 
pri=4;src=E 82 | 2L b2h intron 95302 95353 73 + . mult=73;pri=4;src=E 83 | 2L b2h intron 97834 97883 26 + . mult=26;pri=4;src=E 84 | 2L b2h intron 97834 97895 0 + . pri=4;src=E 85 | 2L b2h intron 98471 98526 74 + . mult=74;pri=4;src=E 86 | 2L b2h intron 99347 99400 0 + . pri=4;src=E 87 | 2L b2h intron 99347 99658 0 + . pri=4;src=E 88 | 2L b2h intron 99724 99784 99 + . mult=99;pri=4;src=E 89 | 2L b2h intron 100517 100571 81 + . mult=81;pri=4;src=E 90 | 2L b2h intron 100704 100761 93 + . mult=93;pri=4;src=E 91 | 2L b2h intron 100704 100810 0 + . pri=4;src=E 92 | 2L b2h intron 100943 101015 88 + . mult=88;pri=4;src=E 93 | 2L b2h intron 101195 101248 90 + . mult=90;pri=4;src=E 94 | 2L b2h intron 101620 101875 62 + . mult=62;pri=4;src=E 95 | 2L b2h intron 101915 101978 6 + . mult=6;pri=4;src=E 96 | 2L b2h intron 102907 102964 4 + . mult=4;pri=4;src=E 97 | 2L b2h intron 102904 102991 6 + . mult=6;pri=4;src=E 98 | 2L b2h intron 102907 102993 0 - . pri=4;src=E 99 | 2L b2h intron 102907 103005 3610 + . mult=3610;pri=4;src=E 100 | 2L b2h intron 103435 103515 203 + . mult=203;pri=4;src=E 101 | 2L b2h intron 103435 103877 9 + . mult=9;pri=4;src=E 102 | 2L b2h intron 94762 104804 0 - . mult=2;pri=4;src=E 103 | 2L b2h intron 104919 105004 0 - . pri=4;src=E 104 | 2L b2h intron 104948 105004 127 - . mult=127;pri=4;src=E 105 | 2L b2h intron 105337 105390 113 - . mult=113;pri=4;src=E 106 | 2L b2h intron 105456 105511 177 - . mult=177;pri=4;src=E 107 | 2L b2h intron 105456 105514 0 - . pri=4;src=E 108 | 2L b2h intron 105916 105968 116 - . mult=116;pri=4;src=E 109 | 2L b2h intron 105456 106437 0 - . pri=4;src=E 110 | 2L b2h intron 33980 106454 0 + . pri=4;src=E 111 | 2L b2h intron 107839 108147 4 + . mult=4;pri=4;src=E 112 | 2L b2h intron 107485 108439 0 - . mult=2;pri=4;src=E 113 | 2L b2h intron 106603 108587 0 + . pri=4;src=E 114 | 2L b2h intron 107839 108587 175 + . mult=175;pri=4;src=E 115 | 2L b2h intron 108102 108587 134 + . 
mult=134;pri=4;src=E 116 | 2L b2h intron 108211 108587 2 + . mult=2;pri=4;src=E 117 | 2L b2h intron 108227 108587 649 + . mult=649;pri=4;src=E 118 | 2L b2h intron 108347 108587 42 + . mult=42;pri=4;src=E 119 | 2L b2h intron 105456 109079 0 - . mult=2;pri=4;src=E 120 | 2L b2h intron 108810 109703 0 + . pri=4;src=E 121 | 2L b2h intron 107839 110405 0 + . pri=4;src=E 122 | 2L b2h intron 108810 110405 1039 + . mult=1039;pri=4;src=E 123 | 2L b2h intron 110484 110754 1082 + . mult=1082;pri=4;src=E 124 | 2L b2h intron 110878 111004 258 + . mult=258;pri=4;src=E 125 | 2L b2h intron 110878 111037 0 + . pri=4;src=E 126 | 2L b2h intron 110878 111368 0 + . pri=4;src=E 127 | 2L b2h intron 111118 111368 0 + . mult=2;pri=4;src=E 128 | 2L b2h intron 110878 111906 1090 + . mult=1090;pri=4;src=E 129 | 2L b2h intron 111118 111906 11 + . mult=11;pri=4;src=E 130 | 2L b2h intron 111100 112001 2 + . mult=2;pri=4;src=E 131 | 2L b2h intron 110878 112670 0 + . mult=2;pri=4;src=E 132 | 2L b2h intron 110878 112689 11 + . mult=11;pri=4;src=E 133 | 2L b2h intron 111118 112689 237 + . mult=237;pri=4;src=E 134 | 2L b2h intron 112020 112689 966 + . mult=966;pri=4;src=E 135 | 2L b2h intron 112022 112689 0 + . pri=4;src=E 136 | 2L b2h intron 112913 113097 0 - . pri=4;src=E 137 | 2L b2h intron 106792 113176 0 + . pri=4;src=E 138 | 2L b2h intron 113370 113433 1447 + . mult=1447;pri=4;src=E 139 | 2L b2h intron 117760 117819 198 - . mult=198;pri=4;src=E 140 | 2L b2h intron 117776 117819 3 - . mult=3;pri=4;src=E 141 | 2L b2h intron 118077 118135 215 - . mult=215;pri=4;src=E 142 | 2L b2h intron 100082 118360 0 - . pri=4;src=E 143 | 2L b2h intron 118305 118360 159 - . mult=159;pri=4;src=E 144 | 2L b2h intron 118875 118930 291 - . mult=291;pri=4;src=E 145 | 2L b2h intron 119077 119133 181 - . mult=181;pri=4;src=E 146 | 2L b2h intron 119080 119133 2 - . mult=2;pri=4;src=E 147 | 2L b2h intron 119215 119287 0 - . pri=4;src=E 148 | 2L b2h intron 119236 119287 172 - . 
mult=172;pri=4;src=E 149 | 2L b2h intron 119238 119287 5 + . mult=5;pri=4;src=E 150 | 2L b2h intron 119236 119291 3 - . mult=3;pri=4;src=E 151 | 2L b2h intron 119236 119430 7 - . mult=7;pri=4;src=E 152 | 2L b2h intron 119376 119430 143 - . mult=143;pri=4;src=E 153 | 2L b2h intron 119555 119827 216 - . mult=216;pri=4;src=E 154 | 2L b2h intron 120081 120167 29 - . mult=29;pri=4;src=E 155 | 2L b2h intron 120081 120199 2 - . mult=2;pri=4;src=E 156 | 2L b2h intron 120048 120420 0 - . pri=4;src=E 157 | 2L b2h intron 120078 120420 2 - . mult=2;pri=4;src=E 158 | 2L b2h intron 120081 120420 76 - . mult=76;pri=4;src=E 159 | 2L b2h intron 120266 120420 6 - . mult=6;pri=4;src=E 160 | 2L b2h intron 120361 120420 2 - . mult=2;pri=4;src=E 161 | 2L b2h intron 120365 120420 26 - . mult=26;pri=4;src=E 162 | 2L b2h intron 120458 120510 5 - . mult=5;pri=4;src=E 163 | 2L b2h intron 121032 121090 0 + . pri=4;src=E 164 | 2L b2h intron 121302 121354 3 + . mult=3;pri=4;src=E 165 | 2L b2h intron 121302 121386 0 + . mult=3;pri=4;src=E 166 | 2L b2h intron 120081 121632 32 - . mult=32;pri=4;src=E 167 | 2L b2h intron 122995 123080 148 - . mult=148;pri=4;src=E 168 | 2L b2h intron 123073 123139 20 + . mult=20;pri=4;src=E 169 | 2L b2h intron 123073 123146 0 + . mult=2;pri=4;src=E 170 | 2L b2h intron 123259 123324 5 + . mult=5;pri=4;src=E 171 | 2L b2h intron 123630 123693 854 - . mult=854;pri=4;src=E 172 | 2L b2h intron 123795 123855 843 - . mult=843;pri=4;src=E 173 | 2L b2h intron 123988 124086 0 - . pri=4;src=E 174 | 2L b2h intron 124025 124086 1061 - . mult=1061;pri=4;src=E 175 | 2L b2h intron 124027 124086 6 + . mult=6;pri=4;src=E 176 | 2L b2h intron 124921 125076 133 - . mult=133;pri=4;src=E 177 | 2L b2h intron 125267 126109 666 - . mult=666;pri=4;src=E 178 | 2L b2h intron 126228 126309 7 - . mult=7;pri=4;src=E 179 | 2L b2h intron 126228 127142 0 - . pri=4;src=E 180 | 2L b2h intron 125267 127380 0 - . pri=4;src=E 181 | 2L b2h intron 127295 127380 4 - . 
mult=4;pri=4;src=E 182 | 2L b2h intron 126228 128043 0 - . pri=4;src=E 183 | 2L b2h intron 124180 128437 0 - . pri=4;src=E 184 | 2L b2h intron 127620 128495 0 - . pri=4;src=E 185 | 2L b2h intron 126412 128554 3 - . mult=3;pri=4;src=E 186 | 2L b2h intron 126228 128799 681 - . mult=681;pri=4;src=E 187 | 2L b2h intron 126228 130507 2 - . mult=2;pri=4;src=E 188 | 2L b2h intron 128943 130507 10 - . mult=10;pri=4;src=E 189 | 2L b2h intron 128883 131979 0 + . pri=4;src=E 190 | 2L b2h intron 132256 132475 621 + . mult=621;pri=4;src=E 191 | 2L b2h intron 132746 132790 0 - . mult=2;pri=4;src=E 192 | 2L b2h intron 133182 133268 10 + . mult=10;pri=4;src=E 193 | 2L b2h intron 121750 133663 0 - . pri=4;src=E 194 | 2L b2h intron 139199 139255 0 - . pri=4;src=E 195 | 2L b2h intron 139688 139740 23 - . mult=23;pri=4;src=E 196 | 2L b2h intron 139688 139757 0 - . mult=2;pri=4;src=E 197 | 2L b2h intron 139955 140113 4 - . mult=4;pri=4;src=E 198 | 2L b2h intron 140697 140767 0 - . pri=4;src=E 199 | 2L b2h intron 141341 141395 212 - . mult=212;pri=4;src=E 200 | 2L b2h intron 141557 141609 0 - . pri=4;src=E 201 | 2L b2h intron 141610 141661 0 - . mult=2;pri=4;src=E 202 | 2L b2h intron 141610 141670 290 - . mult=290;pri=4;src=E 203 | 2L b2h intron 141621 141670 0 - . pri=4;src=E 204 | 2L b2h intron 121846 144080 0 - . pri=4;src=E 205 | 2L b2h intron 145911 145974 13 - . mult=13;pri=4;src=E 206 | 2L b2h intron 145911 146592 0 - . mult=2;pri=4;src=E 207 | 2L b2h intron 139688 147348 0 - . pri=4;src=E 208 | 2L b2h intron 145911 147348 15 - . mult=15;pri=4;src=E 209 | 2L b2h intron 147455 147510 10 - . mult=10;pri=4;src=E 210 | 2L b2h intron 147711 147764 9 - . mult=9;pri=4;src=E 211 | 2L b2h intron 147937 147994 5 - . mult=5;pri=4;src=E 212 | 2L b2h intron 148038 148091 4 - . mult=4;pri=4;src=E 213 | 2L b2h intron 148174 148337 9 - . mult=9;pri=4;src=E 214 | 2L b2h intron 148826 148878 9 - . mult=9;pri=4;src=E 215 | 2L b2h intron 132587 149154 0 - . 
pri=4;src=E 216 | 2L b2h intron 149227 151114 5 - . mult=5;pri=4;src=E 217 | 2L b2h intron 155060 155127 45 - . mult=45;pri=4;src=E 218 | 2L b2h intron 155179 155249 5 + . mult=5;pri=4;src=E 219 | 2L b2h intron 155411 155465 0 + . pri=4;src=E 220 | 2L b2h intron 155430 155493 6 + . mult=6;pri=4;src=E 221 | 2L b2h intron 114992 155545 2 + . mult=2;pri=4;src=E 222 | 2L b2h intron 141191 155545 3 + . mult=3;pri=4;src=E 223 | 2L b2h intron 155430 155545 216 + . mult=216;pri=4;src=E 224 | 2L b2h intron 114992 155566 15 + . mult=15;pri=4;src=E 225 | 2L b2h intron 155179 155566 0 + . pri=4;src=E 226 | 2L b2h intron 155430 155566 552 + . mult=552;pri=4;src=E 227 | 2L b2h intron 114992 155637 3 + . mult=3;pri=4;src=E 228 | 2L b2h intron 155179 155637 2 + . mult=2;pri=4;src=E 229 | 2L b2h intron 155430 155637 19 + . mult=19;pri=4;src=E 230 | 2L b2h intron 155785 155857 1373 + . mult=1373;pri=4;src=E 231 | 2L b2h intron 155797 155857 0 + . mult=2;pri=4;src=E 232 | 2L b2h intron 77435 155873 0 - . pri=4;src=E 233 | 2L b2h intron 155971 156044 9 + . mult=9;pri=4;src=E 234 | 2L b2h intron 155971 156175 0 + . pri=4;src=E 235 | 2L b2h intron 156349 156422 230 + . mult=230;pri=4;src=E 236 | 2L b2h intron 114992 156735 0 + . mult=3;pri=4;src=E 237 | 2L b2h intron 156553 156735 307 + . mult=307;pri=4;src=E 238 | 2L b2h intron 158376 158435 79 - . mult=79;pri=4;src=E 239 | 2L b2h intron 84105 159306 0 - . pri=4;src=E 240 | 2L b2h intron 116076 160008 0 + . pri=4;src=E 241 | 2L b2h intron 159187 160008 0 + . pri=4;src=E 242 | 2L b2h intron 159818 160008 0 + . pri=4;src=E 243 | 2L b2h intron 159820 160008 95 + . mult=95;pri=4;src=E 244 | 2L b2h intron 114992 162591 0 + . pri=4;src=E 245 | 2L b2h intron 160024 162591 53 + . mult=53;pri=4;src=E 246 | 2L b2h intron 160129 162591 4 + . mult=4;pri=4;src=E 247 | 2L b2h intron 163002 164158 3 + . mult=3;pri=4;src=E 248 | 2L b2h intron 161449 172975 0 - . pri=4;src=E 249 | 2L b2h intron 160024 175360 0 + . 
pri=4;src=E 250 | 2L b2h intron 163002 175360 13 + . mult=13;pri=4;src=E 251 | 2L b2h intron 164235 175360 0 + . pri=4;src=E 252 | 2L b2h intron 164279 175360 0 + . pri=4;src=E 253 | 2L b2h intron 180732 180799 0 - . mult=2;pri=4;src=E 254 | 2L b2h intron 181361 181469 0 - . pri=4;src=E 255 | 2L b2h intron 160024 183760 0 + . pri=4;src=E 256 | 2L b2h intron 163002 183760 10 + . mult=10;pri=4;src=E 257 | 2L b2h intron 175448 183760 10 + . mult=10;pri=4;src=E 258 | 2L b2h intron 183420 183760 12 + . mult=12;pri=4;src=E 259 | 2L b2h intron 185083 185151 42 + . mult=42;pri=4;src=E 260 | 2L b2h intron 183788 185507 0 - . pri=4;src=E 261 | 2L b2h intron 186124 186192 56 + . mult=56;pri=4;src=E 262 | 2L b2h intron 186856 186909 38 + . mult=38;pri=4;src=E 263 | 2L b2h intron 187017 187481 72 + . mult=72;pri=4;src=E 264 | 2L b2h intron 187184 187481 2 + . mult=2;pri=4;src=E 265 | 2L b2h intron 184508 189378 0 - . pri=4;src=E 266 | 2L b2h intron 141702 193142 0 + . pri=4;src=E 267 | 2L b2h intron 197637 197736 0 + . pri=4;src=E 268 | 2L b2h intron 155547 198232 0 + . pri=4;src=E 269 | 2L b2h intron 199576 199636 160 + . mult=160;pri=4;src=E 270 | 2L b2h intron 199744 199902 45 + . mult=45;pri=4;src=E 271 | 2L b2h intron 199744 200119 98 + . mult=98;pri=4;src=E 272 | 2L b2h intron 199984 200119 51 + . mult=51;pri=4;src=E 273 | 2L b2h intron 200316 200375 167 + . mult=167;pri=4;src=E 274 | 2L b2h intron 200676 200741 149 + . mult=149;pri=4;src=E 275 | 2L b2h intron 200676 200752 0 + . pri=4;src=E 276 | 2L b2h intron 201062 201126 223 + . mult=223;pri=4;src=E 277 | 2L b2h intron 194140 203534 0 - . pri=4;src=E 278 | 2L b2h intron 203466 203534 0 - . pri=4;src=E 279 | 2L b2h intron 203892 203992 290 + . mult=290;pri=4;src=E 280 | 2L b2h intron 195421 204324 0 - . pri=4;src=E 281 | 2L b2h intron 204919 204989 0 + . pri=4;src=E 282 | 2L b2h intron 205209 205271 3 - . mult=3;pri=4;src=E 283 | 2L b2h intron 205213 205271 43 - . 
mult=43;pri=4;src=E 284 | 2L b2h intron 205365 205434 0 - . pri=4;src=E 285 | 2L b2h intron 205477 205536 0 - . mult=4;pri=4;src=E 286 | 2L b2h intron 205944 206000 23 - . mult=23;pri=4;src=E 287 | 2L b2h intron 206051 206699 20 - . mult=20;pri=4;src=E 288 | 2L b2h intron 207632 207710 94 + . mult=94;pri=4;src=E 289 | 2L b2h intron 208609 208728 95 + . mult=95;pri=4;src=E 290 | 2L b2h intron 193216 209411 0 - . pri=4;src=E 291 | 2L b2h intron 214179 214277 2 - . mult=2;pri=4;src=E 292 | 2L b2h intron 206282 214280 0 + . pri=4;src=E 293 | 2L b2h intron 214943 215033 296 - . mult=296;pri=4;src=E 294 | 2L b2h intron 215669 215772 0 - . mult=4;pri=4;src=E 295 | 2L b2h intron 215677 215772 96 - . mult=96;pri=4;src=E 296 | 2L b2h intron 215677 215985 0 - . pri=4;src=E 297 | 2L b2h intron 216008 216076 219 - . mult=219;pri=4;src=E 298 | 2L b2h intron 216276 216344 258 - . mult=258;pri=4;src=E 299 | 2L b2h intron 216574 216631 231 - . mult=231;pri=4;src=E 300 | 2L b2h intron 218886 221505 11 - . mult=11;pri=4;src=E 301 | 2L b2h intron 218886 226115 67 - . mult=67;pri=4;src=E 302 | 2L b2h intron 218886 227371 13 - . mult=13;pri=4;src=E 303 | 2L b2h intron 226509 227371 67 - . mult=67;pri=4;src=E 304 | 2L b2h intron 227548 228126 0 - . pri=4;src=E 305 | 2L b2h intron 227548 228132 64 - . mult=64;pri=4;src=E 306 | 2L b2h intron 216893 228135 0 + . pri=4;src=E 307 | 2L b2h intron 228282 229172 123 - . mult=123;pri=4;src=E 308 | 2L b2h intron 228282 229929 0 - . pri=4;src=E 309 | 2L b2h intron 229383 229929 12 - . mult=12;pri=4;src=E 310 | 2L b2h intron 228282 230537 8 - . mult=8;pri=4;src=E 311 | 2L b2h intron 229351 230537 0 - . pri=4;src=E 312 | 2L b2h intron 229383 230537 76 - . mult=76;pri=4;src=E 313 | 2L b2h intron 229978 230537 7 - . mult=7;pri=4;src=E 314 | 2L b2h intron 230628 230757 90 - . mult=90;pri=4;src=E 315 | 2L b2h intron 231035 231367 0 - . pri=4;src=E 316 | 2L b2h intron 231035 231947 32 - . mult=32;pri=4;src=E 317 | 2L b2h intron 231035 231962 62 - . 
mult=62;pri=4;src=E 318 | 2L b2h intron 232102 232291 234 - . mult=234;pri=4;src=E 319 | 2L b2h intron 233155 233216 43 + . mult=43;pri=4;src=E 320 | 2L b2h intron 232380 233996 124 - . mult=124;pri=4;src=E 321 | 2L b2h intron 234175 234236 115 - . mult=115;pri=4;src=E 322 | 2L b2h intron 240406 240489 0 - . pri=4;src=E 323 | 2L b2h intron 240076 241608 0 - . pri=4;src=E 324 | 2L b2h intron 242144 242243 100 - . mult=100;pri=4;src=E 325 | 2L b2h intron 242429 242507 7 - . mult=7;pri=4;src=E 326 | 2L b2h intron 242429 242519 0 - . pri=4;src=E 327 | 2L b2h intron 242745 242800 3 - . mult=3;pri=4;src=E 328 | 2L b2h intron 242926 243198 0 - . pri=4;src=E 329 | 2L b2h intron 201826 245768 0 - . pri=4;src=E 330 | 2L b2h intron 244568 246393 0 - . pri=4;src=E 331 | 2L b2h intron 242926 248604 5 - . mult=5;pri=4;src=E 332 | 2L b2h intron 242745 249005 0 - . pri=4;src=E 333 | 2L b2h intron 242745 249036 3 - . mult=3;pri=4;src=E 334 | 2L b2h intron 242745 250756 67 - . mult=67;pri=4;src=E 335 | 2L b2h intron 242926 250756 2 - . mult=2;pri=4;src=E 336 | 2L b2h intron 248855 250756 2 - . mult=2;pri=4;src=E 337 | 2L b2h intron 242745 250806 9 - . mult=9;pri=4;src=E 338 | 2L b2h intron 251442 251526 4 - . mult=4;pri=4;src=E 339 | 2L b2h intron 251465 251526 109 - . mult=109;pri=4;src=E 340 | 2L b2h intron 251802 251855 67 - . mult=67;pri=4;src=E 341 | 2L b2h intron 251925 251978 66 - . mult=66;pri=4;src=E 342 | 2L b2h intron 252167 252226 84 - . mult=84;pri=4;src=E 343 | 2L b2h intron 253100 253182 0 + . mult=2;pri=4;src=E 344 | 2L b2h intron 253083 253219 0 - . pri=4;src=E 345 | 2L b2h intron 253146 253219 80 - . mult=80;pri=4;src=E 346 | 2L b2h intron 253386 253445 90 - . mult=90;pri=4;src=E 347 | 2L b2h intron 253891 253955 73 - . mult=73;pri=4;src=E 348 | 2L b2h intron 254398 255617 0 - . pri=4;src=E 349 | 2L b2h intron 254398 258452 0 - . pri=4;src=E 350 | 2L b2h intron 197125 261850 0 - . pri=4;src=E 351 | 2L b2h intron 231424 263049 0 + . 
pri=4;src=E 352 | 2L b2h intron 254398 263752 0 - . pri=4;src=E 353 | 2L b2h intron 268031 268331 0 + . pri=4;src=E 354 | 2L b2h intron 195330 269137 0 + . pri=4;src=E 355 | 2L b2h intron 269390 269457 113 + . mult=113;pri=4;src=E 356 | 2L b2h intron 270607 270660 107 + . mult=107;pri=4;src=E 357 | 2L b2h intron 195330 271024 0 + . pri=4;src=E 358 | 2L b2h intron 254398 271628 25 - . mult=25;pri=4;src=E 359 | 2L b2h intron 254425 271628 0 - . mult=2;pri=4;src=E 360 | 2L b2h intron 272507 272554 101 - . mult=101;pri=4;src=E 361 | 2L b2h intron 275252 275318 108 - . mult=108;pri=4;src=E 362 | 2L b2h intron 276498 276738 159 - . mult=159;pri=4;src=E 363 | 2L b2h intron 276904 276958 111 - . mult=111;pri=4;src=E 364 | 2L b2h intron 276904 276973 12 - . mult=12;pri=4;src=E 365 | 2L b2h intron 277864 277929 32 + . mult=32;pri=4;src=E 366 | 2L b2h intron 278324 278642 32 + . mult=32;pri=4;src=E 367 | 2L b2h intron 279125 279184 43 + . mult=43;pri=4;src=E 368 | 2L b2h intron 215420 279383 0 - . pri=4;src=E 369 | 2L b2h intron 279806 279869 52 + . mult=52;pri=4;src=E 370 | 2L b2h intron 280041 280109 59 + . mult=59;pri=4;src=E 371 | 2L b2h intron 280423 280476 0 + . pri=4;src=E 372 | 2L b2h intron 282506 282653 50 - . mult=50;pri=4;src=E 373 | 2L b2h intron 261097 283606 0 + . pri=4;src=E 374 | 2L b2h intron 283653 283705 0 - . pri=4;src=E 375 | 2L b2h intron 284211 284321 3 - . mult=3;pri=4;src=E 376 | 2L b2h intron 284211 284457 0 - . mult=2;pri=4;src=E 377 | 2L b2h intron 282506 284747 2 - . mult=2;pri=4;src=E 378 | 2L b2h intron 284211 284747 1248 - . mult=1248;pri=4;src=E 379 | 2L b2h intron 284457 284747 0 - . mult=2;pri=4;src=E 380 | 2L b2h intron 284969 285366 5 - . mult=5;pri=4;src=E 381 | 2L b2h intron 284211 285432 0 - . pri=4;src=E 382 | 2L b2h intron 284969 285432 356 - . mult=356;pri=4;src=E 383 | 2L b2h intron 284969 286306 81 - . mult=81;pri=4;src=E 384 | 2L b2h intron 284969 290812 545 - . mult=545;pri=4;src=E 385 | 2L b2h intron 287094 290815 6 + . 
#!/usr/bin/env python3
# ==============================================================
# Lars Gabriel
#
# genome_anno.py: Handles the data structure for a genome annotation file
# ==============================================================

import os
import sys
import csv


class NotGtfFormat(Exception):
    """Raised when a line of an annotation file violates the expected gtf layout."""
    pass


def _attribute_value(attributes, name):
    """
        Extract the quoted value of one attribute from a gtf attribute column.

        Args:
            attributes (str): Content of the 9th gtf column
            name (str): Attribute name, e.g. 'transcript_id' or 'gene_id'

        Returns:
            (str): The value between the double quotes following the name,
                   or None if the attribute is not present
    """
    parts = attributes.split(name + ' "')
    if len(parts) < 2:
        return None
    # Cut at the closing quote so a missing trailing ';' does not leak into the ID.
    return parts[1].split('"')[0]


class Transcript:
    """
        Class handling the data structures and methods for a transcript
    """
    def __init__(self, id, gene_id, chr, source_anno, strand):
        """
            Args:
                id (str): Transcript ID
                gene_id (str): Gene ID
                chr (str): Chromosome/Sequence name where the transcript is located
                source_anno (str): Anno ID
                strand (str): Strand (+/-) on which the transcript is located
        """
        self.id = id
        self.chr = chr
        self.gene_id = gene_id
        # self.transcript_lines[segment_type] = [lines of segment type]
        self.transcript_lines = {}
        self.gtf = []
        self.source_anno = source_anno
        # -1 marks "no line seen yet" for start and end
        self.start = -1
        self.end = -1
        self.cds_len = -1
        # cache filled by get_cds_coords()
        self.cds_coords = {}
        self.strand = strand
        # gtf source column (column 2) of the most recently added line
        self.source_method = ''

    def add_line(self, line):
        """
            Add a single line from the gtf file to the transcript data structure.

            The start/end columns are converted to int in place and the
            transcript boundaries are widened to include the new line.

            Args:
                line (list): List of all elements of a line from a gtf file

            Raises:
                NotGtfFormat: If both the sequence name and the strand of the
                              line differ from those of the transcript
        """
        # NOTE(review): this only rejects a line when chromosome AND strand
        # both differ; a mismatch in just one of them is accepted. 'and' may
        # have been intended, but that would reject files accepted today.
        if not (line[0] == self.chr or line[6] == self.strand):
            raise NotGtfFormat('File is not in gtf format. '
                + 'Error in line {}\n'.format('\t'.join(map(str, line)))
                + 'Transcript ID is not unique')

        feature_lines = self.transcript_lines.setdefault(line[2], [])

        self.source_method = line[1]

        line[3] = int(line[3])
        line[4] = int(line[4])
        if self.start < 0 or line[3] < self.start:
            self.start = line[3]
        if self.end < 0 or line[4] > self.end:
            self.end = line[4]
        if self.gene_id == '' and not line[2] == 'transcript':
            # A feature line may lack a gene_id; keep the ID empty in that
            # case instead of failing on the missing attribute.
            parsed_gene_id = _attribute_value(line[8], 'gene_id')
            if parsed_gene_id is not None:
                self.gene_id = parsed_gene_id
        feature_lines.append(line)

    def set_gene_id(self, new_gene_id):
        """
            Set the gene ID of the transcript.

            Args:
                new_gene_id (str): New gene ID
        """
        self.gene_id = new_gene_id

    def get_cds_len(self):
        """
            Returns:
                (int): Summed length of all CDS segments (0 without CDS lines)
        """
        cds = self.get_type_coords('CDS', False)
        return sum(c[1] - c[0] + 1 for c in cds)

    def get_type_coords(self, type, frame=True):
        """
            Get the coordinates of all lines of one feature type.

            Args:
                type (str): Feature type (3rd gtf column), e.g. 'CDS', 'intron'
                frame (bool): Group the coordinates by the phase column if True

            Returns:
                (dict(list(list(int)))): If frame is True, sorted [start, end]
                    pairs for each phase key ('0', '1', '2', '.'). For 'CDS'
                    lines the '.' entries are appended to phase '0' and the
                    '.' key is removed.
                (list(list(int))): If frame is False, one sorted list of
                    [start, end] pairs.
        """
        if frame:
            coords = {'0' : [], '1' : [], '2' : [], '.' : []}
        else:
            coords = []
        if type not in self.transcript_lines:
            return coords
        for line in self.transcript_lines[type]:
            if frame:
                coords[line[7]].append([line[3], line[4]])
            else:
                coords.append([line[3], line[4]])
        if frame:
            for phase_coords in coords.values():
                phase_coords.sort(key=lambda c: (c[0], c[1]))
            if type == 'CDS':
                coords['0'] += coords['.']
                del coords['.']
        else:
            coords.sort(key=lambda c: (c[0], c[1]))
        return coords

    def get_cds_coords(self):
        """
            Get the coordinates and reading frame of the coding regions.
            Exon lines are used if the transcript has no CDS lines. The result
            is computed once and cached.

            Returns:
                (dict(list(list(int)))): Dictionary with list of CDS coords for
                                        each frame phase (0,1,2)
        """
        if not self.cds_coords:
            coords = {'0' : [], '1' : [], '2' : []}
            key = 'CDS' if 'CDS' in self.transcript_lines else 'exon'
            for line in self.transcript_lines.get(key, []):
                # Lines without a phase ('.', typical for exons) are counted
                # as phase 0, matching get_type_coords().
                phase = line[7] if line[7] in coords else '0'
                coords[phase].append([line[3], line[4]])
            for phase_coords in coords.values():
                phase_coords.sort(key=lambda c: (c[0], c[1]))
            self.cds_coords = coords
        return self.cds_coords

    def add_missing_lines(self):
        """
            Add intron and transcript lines if they were not included in the
            gtf file and make sure start/stop codon entries exist.

            Returns:
                (boolean): FALSE if the tx has neither CDS nor exons, TRUE otherwise
        """
        # add intron lines
        self.find_introns()
        # check if tx has cds or exon
        if not self.check_cds_exons():
            return False
        # add transcript line
        self.find_transcript()
        # make sure start/stop codon entries exist
        self.find_start_stop_codon()
        return True

    def check_cds_exons(self):
        """
            Check if tx has CDS or exons. Writes a message to stderr if not.

            Returns:
                (boolean): TRUE if the tx has CDS or exon lines
        """
        if 'CDS' not in self.transcript_lines and 'exon' not in self.transcript_lines:
            sys.stderr.write('Skipping transcript {}, no CDS nor exons in {}\n'.format(self.id, self.id))
            return False
        return True

    def find_introns(self):
        """
            Add intron lines derived from the gaps between consecutive CDS
            segments (or exons, if there is no CDS). Nothing is derived if the
            transcript already has intron lines.
        """
        if 'intron' in self.transcript_lines:
            return
        self.transcript_lines['intron'] = []
        if 'CDS' in self.transcript_lines:
            key = 'CDS'
        elif 'exon' in self.transcript_lines:
            key = 'exon'
        else:
            return
        segments = sorted(self.transcript_lines[key], key=lambda e: e[3])
        attributes = "gene_id \"{}\"; transcript_id \"{}\";".format(
            self.gene_id, self.id)
        for upstream, downstream in zip(segments, segments[1:]):
            # seqname/source and score/strand/phase are copied from the
            # segment that follows the intron
            intron = downstream[0:2] + ['intron', upstream[4] + 1, downstream[3] - 1] \
                + downstream[5:8] + [attributes]
            self.transcript_lines['intron'].append(intron)

    def find_transcript(self):
        """
            Add a transcript line spanning all lines of the transcript if the
            gtf file did not include one.
        """
        if 'transcript' in self.transcript_lines:
            return
        last_line = None
        for lines in self.transcript_lines.values():
            for line in lines:
                if line[3] < self.start or self.start < 0:
                    self.start = line[3]
                if line[4] > self.end:
                    self.end = line[4]
                last_line = line
        if last_line is None:
            # no line to take source and strand from
            return
        # source and strand are taken from the last stored line
        tx_line = [self.chr, last_line[1], 'transcript', self.start, self.end,
            '.', last_line[6], '.', self.id]
        self.add_line(tx_line)

    def find_start_stop_codon(self):
        """
            Make sure the transcript has 'start_codon' and 'stop_codon' entries
            (empty lists if the gtf file contained none) and sort the CDS
            lines (or the exon lines, if there is no CDS) by start coordinate.

            NOTE(review): no codon lines are inferred from the CDS boundaries.
            The previous implementation contained such an inference, but it
            could never run, so this is the behaviour it actually had.
        """
        self.transcript_lines.setdefault('start_codon', [])
        self.transcript_lines.setdefault('stop_codon', [])

        for key in ('CDS', 'exon'):
            if key in self.transcript_lines:
                self.transcript_lines[key].sort(key=lambda x: x[3])
                break

    def get_gtf(self, prefix=''):
        """
            Creates gtf output for the transcript. The attribute column of the
            stored lines is rewritten in place. Exon lines are copied from the
            CDS lines if the transcript has no exon lines.

            Args:
                prefix (str): String prepended (with a '.') to the transcript ID

            Returns:
                (list(list(str))): List of lines in gtf format as lists, the
                    transcript line first, then all others sorted by coordinates
        """
        gtf = []
        if prefix:
            prefix += '.'
        tx_id = prefix + self.id
        attributes = f'transcript_id "{tx_id}"; gene_id "{self.gene_id}";'
        tx_line = []
        for feature_type, lines in self.transcript_lines.items():
            for g in lines:
                if feature_type == 'transcript':
                    tx_line = g
                    tx_line[8] = tx_id
                else:
                    g[8] = attributes
                    gtf.append(g)

        if 'exon' not in self.transcript_lines:
            for g in self.transcript_lines.get('CDS', []):
                gtf.append(g[:2] + ['exon'] + g[3:])

        gtf = sorted(gtf, key=lambda g: (g[3], g[4]))
        if tx_line:
            gtf = [tx_line] + gtf
        return gtf


class Anno:
    """
        Class handling the data structures and methods for one genome annotation file
    """
    def __init__(self, path, id):
        """
            Args:
                path (str): Path to the annotation/gene prediction file in gtf format.
                id (str): Annotation ID
        """
        self.id = id
        # self.genes[gene_id] = [transcript IDs]; 'None' collects transcripts
        # for which no gene ID is known yet
        self.genes = {'None' : []}
        self.gene_gtf = {}
        self.transcripts = {}
        self.path = path
        self.translation_tab = []

    def addGtf(self):
        """
            Read a gtf file and create a dictionary of Transcript objects for
            all transcripts in the file. Blank lines and lines starting with
            '#' are skipped. Transcripts without any gene ID are assigned to a
            new gene named '<transcript ID>_g'.

            Raises:
                NotGtfFormat: If a feature line has no transcript_id attribute
        """
        with open(self.path, 'r') as file:
            file_lines = csv.reader(file, delimiter='\t')
            for line in file_lines:
                line = [l.strip(' ') for l in line]
                if not any(line) or line[0].startswith('#'):
                    continue
                line[3] = int(line[3])
                line[4] = int(line[4])
                if line[2] == 'gene':
                    gene_id = line[8]
                    self.genes_update(gene_id)
                    if gene_id not in self.gene_gtf:
                        self.gene_gtf[gene_id] = line
                    else:
                        sys.stderr.write('ERROR, gene_id not unique: {}\n'.format(gene_id))
                elif line[2] == 'transcript':
                    transcript_id = line[8]
                    gene_id = ''
                    self.transcript_update(transcript_id, gene_id, line[0], line[6])
                    self.transcripts[transcript_id].add_line(line)
                else:
                    transcript_id = _attribute_value(line[8], 'transcript_id')
                    if transcript_id is None:
                        raise NotGtfFormat('File: "{}" is not in gtf format. \n'.format(
                            self.path) + 'Error in line {}\n'.format('\t'.join(map(str, line))))

                    gene_id = _attribute_value(line[8], 'gene_id')
                    if gene_id is None:
                        gene_id = 'None'
                        # reuse the gene an earlier line assigned this tx to
                        for key, tx_ids in self.genes.items():
                            if key != 'None' and transcript_id in tx_ids:
                                gene_id = key
                                break

                    self.transcript_update(transcript_id, gene_id, line[0], line[6])
                    self.genes_update(gene_id, transcript_id)
                    self.transcripts[transcript_id].add_line(line)

        # genes_update() removes entries from self.genes['None'], so iterate
        # over a copy to visit every unassigned transcript.
        for tx_id in list(self.genes['None']):
            gene_id = tx_id + '_g'
            self.genes_update(gene_id, tx_id)

    def norm_tx_format(self):
        """
            Add missing intron and transcript lines to all Transcript objects.
            Delete all transcripts that have no exons or CDS.
        """
        tx_no_cds = [tx_id for tx_id, tx in self.transcripts.items()
                     if not tx.add_missing_lines()]
        for tx_id in tx_no_cds:
            del self.transcripts[tx_id]

    def genes_update(self, gene_id, transcript_id=''):
        """
            Update gene ID dict. A transcript that is moved out of the 'None'
            gene also gets its Transcript.gene_id set to the new gene.

            Args:
                gene_id (str): Gene ID
                transcript_id (str): Transcript ID
        """
        if gene_id not in self.genes:
            self.genes[gene_id] = []
        if transcript_id and transcript_id not in self.genes[gene_id]:
            self.genes[gene_id].append(transcript_id)
        # the 'None' entry does not exist anymore after find_genes()
        unassigned = self.genes.get('None', [])
        if transcript_id in unassigned and not gene_id == 'None':
            unassigned.remove(transcript_id)
            self.transcripts[transcript_id].gene_id = gene_id

    def transcript_update(self, t_id, g_id, chr, strand):
        """
            Create a Transcript object for t_id if none exists yet.

            Args:
                t_id (str): Transcript ID
                g_id (str): Gene ID
                chr (str): Chromosome name
                strand (str): Strand (+/-)
        """
        if t_id not in self.transcripts:
            self.transcripts[t_id] = Transcript(t_id, g_id, chr, self.id, strand)

    def find_genes(self):
        """
            Find all genes in the annotation and find the transcripts that
            belong to each gene. Also, create a dict with the gtf lines for each gene.

            A transcript whose gene ID is already used by a gene on another
            sequence or strand is moved to the gene '<gene ID>.<chr>.<strand>'.
        """
        self.gene_gtf = {}
        self.genes = {}
        for tx in self.transcripts.values():
            if tx.gene_id in self.genes:
                gene_line = self.gene_gtf[tx.gene_id]
                if not (tx.chr == gene_line[0] and tx.strand == gene_line[6]):
                    sys.stderr.write('ERROR, gene_id not unique: {}.'.format(tx.gene_id))
                    tx.gene_id = tx.gene_id + '.' + tx.chr + '.' + tx.strand
                    sys.stderr.write(' Adding new gene: {}\n'.format(tx.gene_id))
            # Checked again after a possible rename: the renamed gene may
            # already exist and must be extended, not replaced.
            if tx.gene_id in self.genes:
                gene_line = self.gene_gtf[tx.gene_id]
                self.genes[tx.gene_id].append(tx.id)
                gene_line[3] = min(gene_line[3], tx.start)
                gene_line[4] = max(gene_line[4], tx.end)
            else:
                self.genes[tx.gene_id] = [tx.id]
                self.gene_gtf[tx.gene_id] = [tx.chr, tx.source_method, 'gene',
                    tx.start, tx.end, '.', tx.strand, '.', tx.gene_id]

    def get_gtf(self):
        """
            Get annotation file as gtf list. Genes are sorted by sequence name
            and coordinates; each gene line is followed by its transcripts.

            Returns:
                list(list(str)): Gtf file as list of lists
        """
        gtf = []
        gene_gtf = sorted(self.gene_gtf.values(), key=lambda g: (g[0], g[3], g[4]))
        for gene in gene_gtf:
            gtf.append(gene)
            for tx_id in self.genes[gene[8]]:
                gtf += self.transcripts[tx_id].get_gtf()
        return gtf

    def add_transcripts(self, txs, id_prefix=''):
        """
            Adds a dict of transcripts to the transcripts of the annotation.

            Args:
                txs (dict(Transcript())): Transcripts added to the annotation,
                                          keyed by transcript ID
                id_prefix (str): If given, it is prepended to the ID of each
                                 added Transcript object (modified in place)
        """
        if not id_prefix:
            self.transcripts.update(txs)
        else:
            for tx in txs.values():
                tx.id = id_prefix + tx.id
                self.transcripts[tx.id] = tx

    def get_subset(self, tx_list):
        """
            Get the Transcript objects for a subset of transcript IDs.

            Args:
                tx_list (list(str)): List of transcript IDs

            Returns:
                dict(Transcript()): Transcript objects keyed by transcript ID
        """
        return {tx_id : self.transcripts[tx_id] for tx_id in tx_list}

    def change_id(self, new_id):
        """
            Change annotation file ID, also for all transcripts of the annotation.

            Args:
                new_id (str): New annotation ID
        """
        self.id = new_id
        for tx in self.transcripts.values():
            tx.source_anno = self.id

    def get_transcript_list(self):
        """
            Returns:
                (List(Transcript)): List of all transcripts.
        """
        return list(self.transcripts.values())

    def rename_tx_ids(self, prefix=''):
        """
            Renames all tx and genes and returns translation table for old tx id to new tx id.
            Genes are numbered in the order of their sequence name and
            coordinates ('g1', 'g2', ...), transcripts within a gene as
            '<gene>.t1', '<gene>.t2', ...

            Args:
                prefix (string): String added (with a '_') before each tx and gene ID.

            Returns:
                translation_tab (list(str, str)): List of [new tx id, old tx id] pairs.
        """
        self.translation_tab = []
        old_gene_gtf = sorted(self.gene_gtf.values(), key=lambda g: (g[0], g[3], g[4]))
        old_genes = self.genes
        old_txs = self.transcripts
        self.gene_gtf = {}
        self.genes = {}
        self.transcripts = {}
        if prefix:
            prefix += '_'
        for gene_numb, gene in enumerate(old_gene_gtf, start=1):
            old_gene_id = gene[8]
            new_gene_id = "{}g{}".format(prefix, gene_numb)
            gene[8] = new_gene_id
            self.genes[new_gene_id] = []
            self.gene_gtf[new_gene_id] = gene
            for tx_numb, old_tx_id in enumerate(old_genes[old_gene_id], start=1):
                new_tx_id = "{}g{}.t{}".format(prefix, gene_numb, tx_numb)
                tx = old_txs[old_tx_id]
                tx.id = new_tx_id
                tx.gene_id = new_gene_id
                self.transcripts[new_tx_id] = tx
                self.genes[new_gene_id].append(new_tx_id)
                self.translation_tab.append([new_tx_id, old_tx_id])
        return self.translation_tab

    def write_anno(self, out_path):
        """
            Write Annotation in gtf format to out_path.

            Args:
                out_path (str): Path to the output file
        """
        with open(out_path, 'w') as file:
            # '|' as quote character keeps the double quotes of the attribute
            # column unescaped in the output
            out_writer = csv.writer(file, delimiter='\t', quotechar = "|", lineterminator = '\n')
            for line in self.get_gtf():
                out_writer.writerow(line)