├── bin
    ├── __init__.py
    ├── rename_gtf.py
    ├── fix_gtf_ids.py
    ├── get_longest_isoform.py
    ├── evidence.py
    ├── features.py
    ├── get_overlapping_genes.py
    ├── LICENSE.txt
    ├── tsebra.py
    ├── compleasm-LICENSE.txt
    ├── overlap_graph.py
    └── genome_anno.py
├── docs
    ├── .gitkeep
    └── TSEBRA_Logo.png
├── tests
    ├── __init__.py
    ├── graph
    │   ├── ex2_anno1.gtf
    │   ├── ex2_anno2.gtf
    │   ├── ex4_anno1.gtf
    │   ├── ex4_anno2.gtf
    │   ├── ex3_anno2.gtf
    │   ├── ex3_anno1.gtf
    │   ├── ex_feature_hint1.gff
    │   ├── ex1_anno2.gtf
    │   ├── ex1_anno1.gtf
    │   ├── ex_feature_hint2.gff
    │   ├── ex_feature_anno2.gtf
    │   └── ex_feature_anno1.gtf
    ├── evidence
    │   ├── hint1.gff
    │   ├── hint3.gff
    │   └── hint2.gff
    ├── genome_anno
    │   ├── tx1.gtf
    │   ├── missing_gid.gtf
    │   ├── format_error.gtf
    │   └── anno1.gtf
    ├── test_evidence.py
    ├── test_graph.py
    ├── test_genome_anno.py
    ├── combined.gtf
    └── prep_files.py
├── config
    ├── braker3.cfg
    ├── default.cfg
    ├── keep_ab_initio.cfg
    └── pref_braker1.cfg
├── example
    ├── run_tsebra_example.sh
    └── braker1_results
    │   └── hintsfile.gff
└── README.md


/bin/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/docs/.gitkeep:
--------------------------------------------------------------------------------
1 | 
2 | 


--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/docs/TSEBRA_Logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Gaius-Augustus/TSEBRA/HEAD/docs/TSEBRA_Logo.png


--------------------------------------------------------------------------------
/tests/graph/ex2_anno1.gtf:
--------------------------------------------------------------------------------
1 | 3R	AUGUSTUS	exon	200	300	0	+	0	transcript_id "t1"; gene_id "t1_g";
2 | 3R	AUGUSTUS	CDS	200	300	0	+	0	transcript_id "t1"; gene_id "t1_g";
3 | 3R	AUGUSTUS	transcript	200	300	0	+	0	t1


--------------------------------------------------------------------------------
/tests/evidence/hint1.gff:
--------------------------------------------------------------------------------
1 | 3L	ProtHint	intron	5812862	5812941	24	-	.	src=M;mult=24;pri=4
2 | 3L	ProtHint	intron	12291242	12291299	8	-	.	transcript_id="t1"
3 | 3L	ProtHint	intron	12291242	12291299	8	-	.	src=M;pri=4
4 | 3L	ProtHint	intron	12291242	


--------------------------------------------------------------------------------
/tests/graph/ex2_anno2.gtf:
--------------------------------------------------------------------------------
1 | 3R	AUGUSTUS	exon	100	200	0	+	0	transcript_id "t1"; gene_id "t1_g";
2 | 3R	AUGUSTUS	CDS	100	200	0	+	0	transcript_id "t1"; gene_id "t1_g";
3 | 3R	AUGUSTUS	transcript	100	200	0	+	0	t1
4 | 3R	AUGUSTUS	exon	301	400	0	+	0	transcript_id "t2"; gene_id "t2_g";
5 | 3R	AUGUSTUS	CDS	301	400	0	+	0	transcript_id "t2"; gene_id "t2_g";
6 | 3R	AUGUSTUS	transcript	301	400	0	+	0	t2


--------------------------------------------------------------------------------
/config/braker3.cfg:
--------------------------------------------------------------------------------
 1 | # Weight for each hint source
 2 | # Values have to be >= 0
 3 | P 1
 4 | E 20
 5 | C 1
 6 | M 1
 7 | # Required fraction of supported introns or supported start/stop-codons for a transcript
 8 | # Values have to be in [0,1]
 9 | intron_support 1.0
10 | stasto_support 2
11 | # Allowed difference for each feature 
12 | # Values have to be in [0,1]
13 | e_1 0.1
14 | e_2 0.5
15 | e_3 0.05
16 | e_4 0.2


--------------------------------------------------------------------------------
/config/default.cfg:
--------------------------------------------------------------------------------
 1 | # Weight for each hint source
 2 | # Values have to be >= 0
 3 | P 1
 4 | E 20
 5 | C 1
 6 | M 1
 7 | # Required fraction of supported introns or supported start/stop-codons for a transcript
 8 | # Values have to be in [0,1]
 9 | intron_support 1.0
10 | stasto_support 2
11 | # Allowed difference for each feature 
12 | # Values have to be in [0,1]
13 | e_1 0.1
14 | e_2 0.5
15 | e_3 0.05
16 | e_4 0.18


--------------------------------------------------------------------------------
/tests/graph/ex4_anno1.gtf:
--------------------------------------------------------------------------------
1 | 3R	AUGUSTUS	exon	100	200	0	+	0	transcript_id "t1"; gene_id "t1_g";
2 | 3R	AUGUSTUS	CDS	100	200	0	+	0	transcript_id "t1"; gene_id "t1_g";
3 | 3R	AUGUSTUS	intron	201	299	0	+	0	transcript_id "t1"; gene_id "t1_g";
4 | 3R	AUGUSTUS	exon	300	400	0	+	0	transcript_id "t1"; gene_id "t1_g";
5 | 3R	AUGUSTUS	CDS	300	400	0	+	0	transcript_id "t1"; gene_id "t1_g";
6 | 3R	AUGUSTUS	transcript	100	400	0	+	0	t1


--------------------------------------------------------------------------------
/tests/graph/ex4_anno2.gtf:
--------------------------------------------------------------------------------
1 | 3R	AUGUSTUS	exon	101	201	1	+	0	transcript_id "t1"; gene_id "t1_g";
2 | 3R	AUGUSTUS	CDS	101	201	1	+	0	transcript_id "t1"; gene_id "t1_g";
3 | 3R	AUGUSTUS	intron	202	300	1	+	0	transcript_id "t1"; gene_id "t1_g";
4 | 3R	AUGUSTUS	exon	301	401	1	+	0	transcript_id "t1"; gene_id "t1_g";
5 | 3R	AUGUSTUS	CDS	301	401	1	+	0	transcript_id "t1"; gene_id "t1_g";
6 | 3R	AUGUSTUS	transcript	101	401	1	+	0	t1


--------------------------------------------------------------------------------
/config/keep_ab_initio.cfg:
--------------------------------------------------------------------------------
 1 | # Weight for each hint source
 2 | # Values have to be >= 0
 3 | P 0.1
 4 | E 10
 5 | C 5
 6 | M 1
 7 | # Required fraction of supported introns or supported start/stop-codons for a transcript
 8 | # Values have to be in [0,1]
 9 | intron_support 0
10 | stasto_support 1
11 | # Allowed difference for each feature 
12 | # Values have to be in [0,1]
13 | e_1 0.1
14 | e_2 0.5
15 | # Values have to be >0
16 | e_3 0.05
17 | e_4 0.18


--------------------------------------------------------------------------------
/config/pref_braker1.cfg:
--------------------------------------------------------------------------------
 1 | # Weight for each hint source
 2 | # Values have to be >= 0
 3 | P 0.1
 4 | E 10000
 5 | C 5
 6 | M 1
 7 | # Required fraction of supported introns or supported start/stop-codons for a transcript
 8 | # Values have to be in [0,1]
 9 | intron_support 0.25
10 | stasto_support 2
11 | # Allowed difference for each feature 
12 | # Values have to be in [0,1]
13 | e_1 0.25
14 | e_2 1
15 | # Values have to be >0
16 | e_3 0.05
17 | e_4 0.18
18 | 


--------------------------------------------------------------------------------
/tests/graph/ex3_anno2.gtf:
--------------------------------------------------------------------------------
1 | 3R	AUGUSTUS	exon	110	200	0	+	0	transcript_id "t1"; gene_id "t1_g";
2 | 3R	AUGUSTUS	CDS	110	200	0	+	0	transcript_id "t1"; gene_id "t1_g";
3 | 3R	AUGUSTUS	intron	201	799	0	+	0	transcript_id "t1"; gene_id "t1_g";
4 | 3R	AUGUSTUS	exon	800	1000	0	+	0	transcript_id "t1"; gene_id "t1_g";
5 | 3R	AUGUSTUS	CDS	800	1000	0	+	0	transcript_id "t1"; gene_id "t1_g";
6 | 3R	AUGUSTUS	transcript	110	1000	0	+	0	t1
7 | 3R	AUGUSTUS	exon	350	450	0	+	0	transcript_id "t2"; gene_id "t2_g";
8 | 3R	AUGUSTUS	CDS	350	450	0	+	0	transcript_id "t2"; gene_id "t2_g";
9 | 3R	AUGUSTUS	transcript	350	450	0	+	0	t2


--------------------------------------------------------------------------------
/tests/graph/ex3_anno1.gtf:
--------------------------------------------------------------------------------
1 | 3R	AUGUSTUS	exon	100	300	0	+	0	transcript_id "t1"; gene_id "t1_g";
2 | 3R	AUGUSTUS	CDS	100	300	0	+	0	transcript_id "t1"; gene_id "t1_g";
3 | 3R	AUGUSTUS	intron	301	499	0	+	0	transcript_id "t1"; gene_id "t1_g";
4 | 3R	AUGUSTUS	exon	500	700	0	+	0	transcript_id "t1"; gene_id "t1_g";
5 | 3R	AUGUSTUS	CDS	500	700	0	+	0	transcript_id "t1"; gene_id "t1_g";
6 | 3R	AUGUSTUS	intron	701	899	0	+	0	transcript_id "t1"; gene_id "t1_g";
7 | 3R	AUGUSTUS	exon	900	1100	0	+	0	transcript_id "t1"; gene_id "t1_g";
8 | 3R	AUGUSTUS	CDS	900	1100	0	+	0	transcript_id "t1"; gene_id "t1_g";
9 | 3R	AUGUSTUS	transcript	100	1100	0	+	0	t1


--------------------------------------------------------------------------------
/tests/genome_anno/tx1.gtf:
--------------------------------------------------------------------------------
1 | 3L	GeneMark.hmm	stop_codon	18462228	18462230	.	-	0	gene_id "7789_g"; transcript_id "7789_t"; count "1_1";
2 | 3L	GeneMark.hmm	CDS	18462228	18462540	.	-	1	gene_id "7789_g"; transcript_id "7789_t"; evidence "0_1"; cds_type "Terminal"; count "2_2";
3 | 3L	GeneMark.hmm	exon	18462228	18462540	0	-	.	gene_id "7789_g"; transcript_id "7789_t"; evidence "0_1"; cds_type "Terminal"; count "2_2";
4 | 3L	GeneMark.hmm	CDS	18462719	18463068	.	-	0	gene_id "7789_g"; transcript_id "7789_t"; evidence "1_0"; cds_type "Initial"; count "1_2";
5 | 3L	GeneMark.hmm	exon	18462719	18463068	0	-	.	gene_id "7789_g"; transcript_id "7789_t"; evidence "1_0"; cds_type "Initial"; count "1_2";
6 | 


--------------------------------------------------------------------------------
/tests/evidence/hint3.gff:
--------------------------------------------------------------------------------
 1 | 3R	AUGUSTUS	start_codon	100	102	10	+	.	src=E;mult=2;pri=4
 2 | 3R	AUGUSTUS	intron	501	599	10	+	.	src=E;mult=2;pri=4
 3 | 3R	AUGUSTUS	intron	501	599	10	+	.	src=P;mult=14;pri=4
 4 | 3R	AUGUSTUS	stop_codon	698	700	10	+	.	src=E;mult=2;pri=4
 5 | 3R	AUGUSTUS	intron	801	899	10	+	.	src=E;mult=2;pri=4
 6 | 2L	AUGUSTUS	intron	801	899	10	+	.	src=E;mult=2;pri=4
 7 | 3R	AUGUSTUS	intron	801	899	10	+	.	src=P;mult=24;pri=4
 8 | 3R	AUGUSTUS	intron	801	949	10	+	.	src=E;mult=2;pri=4
 9 | 3R	AUGUSTUS	intron	801	899	10	+	.	src=E;mult=2;pri=4
10 | 3R	AUGUSTUS	intron	1001	1099	10	+	.	src=E;mult=2;pri=4
11 | 3R	AUGUSTUS	stop_codon	1198	1200	10	+	.	src=E;mult=2;pri=4
12 | 3R	AUGUSTUS	intron	1601	1699	10	+	.	src=E;mult=2;pri=4


--------------------------------------------------------------------------------
/tests/graph/ex_feature_hint1.gff:
--------------------------------------------------------------------------------
 1 | 3R	b2h	intron	21737122	21737185	6	-	.	mult=6;pri=4;src=E
 2 | 3R	b2h	intron	21738629	21738695	42	-	.	mult=42;pri=4;src=E
 3 | 3R	b2h	intron	21738939	21739000	30	-	.	mult=30;pri=4;src=E
 4 | 3R	b2h	intron	21740644	21741666	4	+	.	mult=4;pri=4;src=E
 5 | 3R	b2h	intron	21741826	21741884	12	+	.	mult=12;pri=4;src=E
 6 | 3R	b2h	intron	21742360	21742427	2	+	.	mult=2;pri=4;src=E
 7 | 3R	b2h	intron	21743988	21744047	2	+	.	mult=2;pri=4;src=E
 8 | 3R	b2h	intron	21745856	21746185	166	+	.	mult=166;pri=4;src=E
 9 | 3R	b2h	intron	21746342	21746473	196	+	.	mult=196;pri=4;src=E
10 | 3R	b2h	intron	21747188	21747389	200	+	.	mult=200;pri=4;src=E
11 | 3R	b2h	intron	21748618	21748687	340	+	.	mult=340;pri=4;src=E
12 | 


--------------------------------------------------------------------------------
/tests/graph/ex1_anno2.gtf:
--------------------------------------------------------------------------------
 1 | 3R	AUGUSTUS	exon	250	500	0	+	0	transcript_id "t1"; gene_id "t1_g";
 2 | 3R	AUGUSTUS	CDS	250	500	0	+	0	transcript_id "t1"; gene_id "t1_g";
 3 | 3R	AUGUSTUS	intron	501	599	0	+	0	transcript_id "t1"; gene_id "t1_g";
 4 | 3R	AUGUSTUS	exon	600	750	0	+	0	transcript_id "t1"; gene_id "t1_g";
 5 | 3R	AUGUSTUS	CDS	600	750	0	+	0	transcript_id "t1"; gene_id "t1_g";
 6 | 3R	AUGUSTUS	transcript	250	750	0	+	0	t1
 7 | 3R	AUGUSTUS	exon	1050	1250	0	+	0	transcript_id "t2"; gene_id "t2_g";
 8 | 3R	AUGUSTUS	CDS	1050	1250	0	+	0	transcript_id "t2"; gene_id "t2_g";
 9 | 3R	AUGUSTUS	transcript	1050	1250	0	+	0	t2
10 | 3R	AUGUSTUS	exon	1700	1800	0	+	0	transcript_id "t3"; gene_id "t3_g";
11 | 3R	AUGUSTUS	CDS	1700	1800	0	+	0	transcript_id "t3"; gene_id "t3_g";
12 | 3R	AUGUSTUS	transcript	1700	1800	0	+	0	t3


--------------------------------------------------------------------------------
/example/run_tsebra_example.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | # if this file is not executable run: chmod +x run_prevco_example.sh
 3 | 
 4 | c="${0%/*}"
 5 | # prediciton and hint files that are included in the standard output of a BRAKER run
 6 | b1=$c/braker1_results/braker.gtf
 7 | b2=$c/braker2_results/braker.gtf
 8 | h1=$c/braker1_results/hintsfile.gff
 9 | h2=$c/braker2_results/hintsfile.gff
10 | 
11 | # create working directory
12 | d=$c/tsebra_workdir/
13 | mkdir -p $d
14 | 
15 | # Make sure that the transcript IDs of the BRAKER predicitons are in order
16 | # This step is OPTIONAL and not necassary for a succefull combination
17 | 
18 | echo "\n*** Fix possible ID errors in *.gtf files ***\n"
19 | 
20 | new_b1=$d/braker1.gtf
21 | new_b2=$d/braker2.gtf
22 | $c/../bin/fix_gtf_ids.py --gtf $b1 --out $new_b1
23 | $c/../bin/fix_gtf_ids.py --gtf $b2 --out $new_b2
24 | b1=$new_b1
25 | b2=$new_b2
26 | 
27 | # Combine BRAKER1 and BRAKER2 predicitons
28 | 
29 | o=$d/braker1+2.gtf
30 | 
31 | echo "*** Running TSEBRA ***\n"
32 | 
33 | $c/../bin/tsebra.py -g $b1,$b2 -c $c/../config/default.cfg -e $h1,$h2 -o $o
34 | 
35 | echo "\n*** Finished. Result at: $o ***\n"
36 | 


--------------------------------------------------------------------------------
/tests/test_evidence.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | import os
 3 | import pytest
 4 | import csv
 5 | 
 6 | testDir = os.path.abspath(os.path.dirname(__file__))
 7 | sys.path.append(testDir + '/../bin/')
 8 | 
 9 | from evidence import NotGtfFormat, AttributeMissing, Hint, Evidence
10 | 
@pytest.fixture
def hints1():
    """Provide the rows of evidence/hint1.gff, split on tabs, as a list of lists."""
    with open(testDir + '/evidence/hint1.gff') as handle:
        return list(csv.reader(handle, delimiter='\t'))
19 | 
def test_hint(hints1):
    """Check that Hint.hint2list() reproduces rows 0 and 2 of hint1.gff as strings."""
    for row in (hints1[0], hints1[2]):
        parsed = Hint(row)
        assert [str(field) for field in parsed.hint2list()] == row
25 | 
def test_hint_error(hints1):
    """Check which exception Hint raises for rows 1 and 3 of hint1.gff."""
    failing_rows = (
        (AttributeMissing, hints1[1]),
        (NotGtfFormat, hints1[3]),
    )
    for expected_error, row in failing_rows:
        with pytest.raises(expected_error):
            Hint(row)
31 | 
def test_get_hint():
    """Check that the values returned by Evidence.get_hint for the
    3R:801-899 '+' intron of hint3.gff sum to 28."""
    evidence = Evidence()
    evidence.add_hintfile(testDir + '/evidence/hint3.gff')
    found = evidence.get_hint('3R', '801', '899', 'intron', '+')
    total = sum(found.values())
    assert total == 28
37 | 


--------------------------------------------------------------------------------
/tests/graph/ex1_anno1.gtf:
--------------------------------------------------------------------------------
 1 | 3R	AUGUSTUS	exon	100	200	0	+	0	transcript_id "t1"; gene_id "t1_g";
 2 | 3R	AUGUSTUS	CDS	100	200	0	+	0	transcript_id "t1"; gene_id "t1_g";
 3 | 3R	AUGUSTUS	intron	201	299	0	+	0	transcript_id "t1"; gene_id "t1_g";
 4 | 3R	AUGUSTUS	exon	300	400	0	+	0	transcript_id "t1"; gene_id "t1_g";
 5 | 3R	AUGUSTUS	CDS	300	400	0	+	0	transcript_id "t1"; gene_id "t1_g";
 6 | 3R	AUGUSTUS	transcript	100	400	0	+	0	t1
 7 | 3R	AUGUSTUS	exon	700	800	0	+	0	transcript_id "t2"; gene_id "t2_g";
 8 | 3R	AUGUSTUS	CDS	700	800	0	+	0	transcript_id "t2"; gene_id "t2_g";
 9 | 3R	AUGUSTUS	intron	801	899	0	+	0	transcript_id "t2"; gene_id "t2_g";
10 | 3R	AUGUSTUS	exon	900	1000	0	+	0	transcript_id "t2"; gene_id "t2_g";
11 | 3R	AUGUSTUS	CDS	900	1000	0	+	0	transcript_id "t2"; gene_id "t2_g";
12 | 3R	AUGUSTUS	intron	1001	1099	0	+	0	transcript_id "t2"; gene_id "t2_g";
13 | 3R	AUGUSTUS	exon	1100	1200	0	+	0	transcript_id "t2"; gene_id "t2_g";
14 | 3R	AUGUSTUS	CDS	1100	1200	0	+	0	transcript_id "t2"; gene_id "t2_g";
15 | 3R	AUGUSTUS	transcript	700	1200	0	+	0	t2
16 | 3R	AUGUSTUS	exon	1500	1600	0	+	0	transcript_id "t3"; gene_id "t3_g";
17 | 3R	AUGUSTUS	CDS	1500	1600	0	+	0	transcript_id "t3"; gene_id "t3_g";
18 | 3R	AUGUSTUS	transcript	1500	1600	0	+	0	t3


--------------------------------------------------------------------------------
/tests/evidence/hint2.gff:
--------------------------------------------------------------------------------
 1 | 3L	ProtHint	intron	5812862	5812941	24	-	.	src=M;mult=24;pri=4
 2 | 3L	ProtHint	intron	12291242	12291299	8	-	.	src=M;mult=8;pri=4
 3 | 3R	ProtHint	intron	17440148	17440207	25	-	.	src=M;mult=25;pri=4
 4 | 2R	ProtHint	intron	5760114	5760177	23	-	.	src=M;mult=23;pri=4
 5 | 2R	ProtHint	intron	6210484	6210546	21	-	.	src=M;mult=21;pri=4
 6 | 3L	ProtHint	intron	20527281	20527592	25	+	.	src=M;mult=25;pri=4
 7 | 2L	ProtHint	intron	12400752	12400814	24	+	.	src=M;mult=24;pri=4
 8 | 2R	ProtHint	intron	14988084	14988142	25	-	.	src=M;mult=25;pri=4
 9 | 2L	ProtHint	intron	6667531	6667670	5	-	.	src=M;mult=5;pri=4
10 | 3R	ProtHint	intron	5537551	5537605	22	+	.	src=M;mult=22;pri=4
11 | 3R	ProtHint	intron	20813612	20813665	12	-	.	src=M;mult=12;pri=4
12 | X	ProtHint	intron	2145714	2147174	25	+	.	src=M;mult=25;pri=4
13 | 3L	ProtHint	intron	8114197	8114256	25	-	.	src=M;mult=25;pri=4
14 | X	ProtHint	intron	11048602	11048941	25	+	.	src=M;mult=25;pri=4
15 | 2L	ProtHint	intron	3807462	3807524	18	+	.	src=M;mult=18;pri=4
16 | 3R	ProtHint	intron	27059120	27059364	19	-	.	src=M;mult=19;pri=4
17 | 2R	ProtHint	intron	13821370	13821432	24	-	.	src=M;mult=24;pri=4
18 | X	ProtHint	intron	8173462	8173860	6	-	.	src=M;mult=6;pri=4
19 | X	ProtHint	intron	13270643	13271481	16	-	.	src=M;mult=16;pri=4
20 | X	ProtHint	intron	2079645	2079714	25	-	.	src=M;mult=25;pri=4
21 | 


--------------------------------------------------------------------------------
/tests/graph/ex_feature_hint2.gff:
--------------------------------------------------------------------------------
 1 | 3R	ProtHint	intron	21747188	21747389	16	+	.	src=M;mult=16;pri=4
 2 | 3R	ProtHint	intron	21742667	21742741	9	+	.	src=M;mult=9;pri=4
 3 | 3R	ProtHint	intron	21742360	21742427	10	+	.	src=M;mult=10;pri=4
 4 | 3R	ProtHint	intron	21745856	21746185	18	+	.	src=M;mult=18;pri=4
 5 | 3R	ProtHint	intron	21740644	21741666	8	+	.	src=M;mult=8;pri=4
 6 | 3R	ProtHint	intron	21740644	21741666	1	+	.	grp=7375_0:000e30_g7706;src=C;pri=4;
 7 | 3R	ProtHint	intron	21741826	21741884	1	+	.	grp=7375_0:000e30_g7706;src=C;pri=4;
 8 | 3R	ProtHint	intron	21742360	21742427	1	+	.	grp=7375_0:000e30_g7706;src=C;pri=4;
 9 | 3R	ProtHint	intron	21742667	21742741	1	+	.	grp=7375_0:000e30_g7706;src=C;pri=4;
10 | 3R	ProtHint	intron	21743988	21744047	1	+	.	grp=7375_0:000e30_g7706;src=C;pri=4;
11 | 3R	ProtHint	intron	21745856	21746185	1	+	.	grp=7375_0:000e30_g7706;src=C;pri=4;
12 | 3R	ProtHint	intron	21746342	21746473	1	+	.	grp=7375_0:000e30_g7706;src=C;pri=4;
13 | 3R	ProtHint	intron	21747188	21747389	1	+	.	grp=7375_0:000e30_g7706;src=C;pri=4;
14 | 3R	ProtHint	intron	21748618	21748687	1	+	.	grp=7375_0:000e30_g7706;src=C;pri=4;
15 | 3R	ProtHint	intron	21743988	21744047	0	+	.	src=P;mult=2;pri=4;
16 | 3R	ProtHint	intron	21746342	21746473	2	+	.	src=P;mult=14;pri=4;
17 | 3R	ProtHint	intron	21741826	21741884	0	+	.	src=P;mult=3;pri=4;
18 | 3R	ProtHint	intron	21747188	21747389	2	+	.	src=P;mult=16;pri=4;
19 | 3R	ProtHint	intron	21742667	21742741	2	+	.	src=P;mult=9;pri=4;
20 | 3R	ProtHint	intron	21742360	21742427	2	+	.	src=P;mult=10;pri=4;
21 | 3R	ProtHint	intron	21745856	21746185	2	+	.	src=P;mult=18;pri=4;
22 | 3R	ProtHint	intron	21748618	21748687	2	+	.	src=P;mult=17;pri=4;
23 | 3R	ProtHint	intron	21740644	21741666	2	+	.	src=P;mult=8;pri=4;
24 | 3R	ProtHint	stop	21748922	21748924	0	+	0	src=P;mult=1;pri=4;
25 | 


--------------------------------------------------------------------------------
/tests/test_graph.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | import os
 3 | import sys
 4 | import pytest
 5 | 
 6 | testDir = os.path.abspath(os.path.dirname(__file__))
 7 | sys.path.append(testDir + '/../bin/')
 8 | 
 9 | from genome_anno import Anno
10 | from overlap_graph import Graph, Node
11 | from evidence import Hintfile
12 | 
# Directory with the GTF example files loaded by the tests in this module.
example_files = testDir + '/graph/'
14 | 
def compare_lists(list1, list2):
    """Assert that two lists of components are equal regardless of order.

    Each component is compared as a set, so the order of (and duplicates
    among) the IDs inside a component is ignored, and the order of the
    components inside the outer lists is ignored as well.

    Args:
        list1: Iterable-of-iterables with the expected components.
        list2: Iterable-of-iterables with the components to check.

    Raises:
        AssertionError: If the lists differ in length or if a component of
            list1 has no unmatched counterpart in list2.
    """
    assert len(list1) == len(list2)
    unmatched = [set(component) for component in list2]
    for element in (set(component) for component in list1):
        assert element in unmatched
        # Consume the match so that a repeated component in list1 cannot be
        # satisfied by one and the same component of list2.
        unmatched.remove(element)
21 | 
def test_example_1():
    """Check the connected components of the graph built from the ex1 annotations.

    Only the first annotation is passed through norm_tx_format().
    """
    expected = [['anno1;t1', 'anno2;t1', 'anno1;t2', 'anno2;t2'], ['anno1;t3'], ['anno2;t3']]
    first = Anno(example_files + '/ex1_anno1.gtf', 'anno1')
    first.addGtf()
    first.norm_tx_format()
    second = Anno(example_files + '/ex1_anno2.gtf', 'anno2')
    second.addGtf()
    overlap = Graph([first, second], {})
    overlap.build()
    compare_lists(expected, overlap.connected_components())
33 | 
def test_example_2():
    """Check the connected components of the graph built from the ex2 annotations.

    Both annotations are passed through norm_tx_format().
    """
    expected = [['anno2;t1'], ['anno1;t1'], ['anno2;t2']]
    first = Anno(example_files + '/ex2_anno1.gtf', 'anno1')
    first.addGtf()
    first.norm_tx_format()
    second = Anno(example_files + '/ex2_anno2.gtf', 'anno2')
    second.addGtf()
    second.norm_tx_format()
    overlap = Graph([first, second], {})
    overlap.build()
    compare_lists(expected, overlap.connected_components())
46 | 
def test_example_3():
    """Check the connected components of the graph built from the ex3 annotations.

    Only the first annotation is passed through norm_tx_format().
    """
    expected = [['anno1;t1', 'anno2;t1'], ['anno2;t2']]
    first = Anno(example_files + '/ex3_anno1.gtf', 'anno1')
    first.addGtf()
    first.norm_tx_format()
    second = Anno(example_files + '/ex3_anno2.gtf', 'anno2')
    second.addGtf()
    overlap = Graph([first, second], {})
    overlap.build()
    compare_lists(expected, overlap.connected_components())
58 | 


--------------------------------------------------------------------------------
/bin/rename_gtf.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | # ==============================================================
 3 | # author: Lars Gabriel
 4 | #
 5 | # Rename the transcripts and genes of a GTF file.
 6 | # ==============================================================
 7 | import argparse
 8 | import os
 9 | import csv
class FileNotFound(Exception):
    """Raised by main() when the path given with --gtf does not exist."""
12 | 
def main():
    """Rename the transcripts and genes of a GTF file.

    Loads the GTF given by ``--gtf`` into an ``Anno`` object, renames its
    transcript IDs (using ``--prefix`` if given, otherwise an empty prefix)
    and writes the annotation to ``--out``. If ``--translation_tab`` is set,
    the rows returned by ``Anno.rename_tx_ids`` are written to that path as
    a tab-separated table.

    Raises:
        FileNotFound: If the path given by ``--gtf`` does not exist.
    """
    # Parse exactly once, and before the project import, so that usage
    # errors and --help do not depend on genome_anno being importable.
    args = parseCmd()
    from genome_anno import Anno

    if not os.path.exists(args.gtf):
        raise FileNotFound('File not found: {}'.format(args.gtf))
    # --prefix is optional; fall back to an empty prefix when it is unset.
    prefix = args.prefix or ''

    anno = Anno(args.gtf, id='')
    anno.addGtf()
    anno.norm_tx_format()
    anno.find_genes()
    tx_tab = anno.rename_tx_ids(prefix)
    anno.write_anno(args.out)
    if args.translation_tab:
        with open(args.translation_tab, 'w+') as file:
            out_writer = csv.writer(file, delimiter='\t', quotechar="|", lineterminator='\n')
            out_writer.writerows(tx_tab)
37 | 
def parseCmd():
    """Parse the command line arguments of rename_gtf.py.

    Returns:
        argparse.Namespace: Parsed arguments with the attributes ``gtf``
            and ``out`` (both required) and ``prefix`` and
            ``translation_tab`` (both ``None`` when not given).
    """
    parser = argparse.ArgumentParser(description='Renames the transcripts and genes of a GTF file.')
    parser.add_argument('--gtf', type=str, required=True,
        help='Path to a gene prediction file in GTF format, for example the output of TSEBRA.')
    parser.add_argument('--prefix', type=str,
        help='The string is added as a prefix to all transcript and gene IDs.')
    parser.add_argument('--translation_tab', type=str,
        help='Writes the translation table for old transcript IDs to new transcript IDs to the given file path.')
    parser.add_argument('--out', type=str, required=True,
        help='Path to the output file.')
    return parser.parse_args()
54 | 
55 | if __name__ == '__main__':
56 |     main()
57 | 


--------------------------------------------------------------------------------
/bin/fix_gtf_ids.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | # ==============================================================
 3 | # Lars Gabriel
 4 | #
 5 | # Fixes a transcript and gene ID error, where transcripts/genes have
 6 | # the same ID on different chromosomes or strands.
 7 | # ==============================================================
 8 | import sys
 9 | import os
10 | import argparse
11 | 
class FormatError(Exception):
    """Exception type for format problems.

    NOTE(review): not raised anywhere in the visible part of this script.
    """
14 | 
class Chr:
    """Container with two initially empty dicts, ``genes`` and ``txs``.

    NOTE(review): not used anywhere in the visible part of this script.
    """

    def __init__(self):
        # Two independent dict objects, one per attribute.
        self.genes, self.txs = {}, {}
19 | 
def start2int(line):
    """Convert the fourth field (index 3) of *line* to int in place.

    Args:
        line: Mutable sequence whose element at index 3 is accepted by int().

    Returns:
        The same *line* object, after modification.
    """
    start = int(line[3])
    line[3] = start
    return line
23 | 
def main():
    """Prefix every transcript and gene ID with '<seqname><strand>_'.

    Reads the file given by --gtf line by line. Lines that do not split
    into exactly nine tab-separated fields, and lines whose feature type is
    'gene' or 'transcript', are dropped. In all remaining lines the values
    of transcript_id and gene_id are replaced by chr_strand_oldID (spaces
    removed from the prefix). The result is written to the file given by
    --out.
    """
    args = parseCmd()
    # (attribute marker, template used to rebuild the attribute column)
    id_rules = (
        ('transcript_id "', '{}transcript_id "{}_{}";{}'),
        ('gene_id "', '{}gene_id "{}_{}";{}'),
    )
    fixed_lines = []
    with open(args.gtf, 'r') as gtf_file:
        for raw_line in gtf_file:
            fields = raw_line.split('\t')
            if len(fields) != 9:
                continue
            if fields[2] in ['gene', 'transcript']:
                continue
            id_prefix = (fields[0] + fields[6]).replace(' ', '')
            for marker, template in id_rules:
                before, after = fields[8].split(marker)[0], fields[8].split(marker)[1]
                pieces = after.split('";')
                old_id = pieces[0]
                # Everything following the closing '";' of the old ID.
                remainder = '";'.join(pieces[1:])
                fields[8] = template.format(before, id_prefix, old_id, remainder)
            fixed_lines.append('\t'.join(fields))
    with open(args.out, 'w+') as out_file:
        out_file.write(''.join(fixed_lines))
45 | 
def parseCmd():
    """Parse the command line arguments of fix_gtf_ids.py.

    Both options are required: main() opens each path unconditionally, so a
    missing option is reported here as a usage error instead of failing
    later inside open().

    Returns:
        argparse.Namespace: Parsed arguments with the attributes ``gtf``
            and ``out``.
    """
    parser = argparse.ArgumentParser(
        description='Prefixes all transcript and gene IDs of a GTF file with their '
            'sequence name and strand, so that IDs are unique across chromosomes and strands.')
    parser.add_argument('--gtf', type=str, required=True,
        help='Path to the input file in GTF format.')
    parser.add_argument('--out', type=str, required=True,
        help='Path to the output file.')
    return parser.parse_args()
58 | 
59 | if __name__ == '__main__':
60 |     main()
61 | 


--------------------------------------------------------------------------------
/tests/graph/ex_feature_anno2.gtf:
--------------------------------------------------------------------------------
 1 | 3R	AUGUSTUS	start_codon	21740168	21740170	.	+	0	transcript_id "g7701.t1"; gene_id "g7701";
 2 | 3R	AUGUSTUS	CDS	21740168	21740643	1	+	0	transcript_id "g7701.t1"; gene_id "g7701";
 3 | 3R	AUGUSTUS	exon	21740168	21740643	.	+	.	transcript_id "g7701.t1"; gene_id "g7701";
 4 | 3R	AUGUSTUS	intron	21740644	21741666	1	+	.	transcript_id "g7701.t1"; gene_id "g7701";
 5 | 3R	AUGUSTUS	CDS	21741667	21741825	1	+	1	transcript_id "g7701.t1"; gene_id "g7701";
 6 | 3R	AUGUSTUS	exon	21741667	21741825	.	+	.	transcript_id "g7701.t1"; gene_id "g7701";
 7 | 3R	AUGUSTUS	intron	21741826	21741884	1	+	.	transcript_id "g7701.t1"; gene_id "g7701";
 8 | 3R	AUGUSTUS	CDS	21741885	21742359	1	+	1	transcript_id "g7701.t1"; gene_id "g7701";
 9 | 3R	AUGUSTUS	exon	21741885	21742359	.	+	.	transcript_id "g7701.t1"; gene_id "g7701";
10 | 3R	AUGUSTUS	intron	21742360	21742427	1	+	.	transcript_id "g7701.t1"; gene_id "g7701";
11 | 3R	AUGUSTUS	CDS	21742428	21742666	1	+	0	transcript_id "g7701.t1"; gene_id "g7701";
12 | 3R	AUGUSTUS	exon	21742428	21742666	.	+	.	transcript_id "g7701.t1"; gene_id "g7701";
13 | 3R	AUGUSTUS	intron	21742667	21742741	1	+	.	transcript_id "g7701.t1"; gene_id "g7701";
14 | 3R	AUGUSTUS	CDS	21742742	21743987	1	+	1	transcript_id "g7701.t1"; gene_id "g7701";
15 | 3R	AUGUSTUS	exon	21742742	21743987	.	+	.	transcript_id "g7701.t1"; gene_id "g7701";
16 | 3R	AUGUSTUS	intron	21743988	21744047	1	+	.	transcript_id "g7701.t1"; gene_id "g7701";
17 | 3R	AUGUSTUS	CDS	21744048	21744355	0.52	+	0	transcript_id "g7701.t1"; gene_id "g7701";
18 | 3R	AUGUSTUS	exon	21744048	21744355	.	+	.	transcript_id "g7701.t1"; gene_id "g7701";
19 | 3R	AUGUSTUS	intron	21744356	21745282	0.52	+	.	transcript_id "g7701.t1"; gene_id "g7701";
20 | 3R	AUGUSTUS	CDS	21745283	21745855	0.53	+	1	transcript_id "g7701.t1"; gene_id "g7701";
21 | 3R	AUGUSTUS	exon	21745283	21745855	.	+	.	transcript_id "g7701.t1"; gene_id "g7701";
22 | 3R	AUGUSTUS	intron	21745856	21746185	1	+	.	transcript_id "g7701.t1"; gene_id "g7701";
23 | 3R	AUGUSTUS	CDS	21746186	21746341	1	+	1	transcript_id "g7701.t1"; gene_id "g7701";
24 | 3R	AUGUSTUS	exon	21746186	21746341	.	+	.	transcript_id "g7701.t1"; gene_id "g7701";
25 | 3R	AUGUSTUS	intron	21746342	21746473	1	+	.	transcript_id "g7701.t1"; gene_id "g7701";
26 | 3R	AUGUSTUS	CDS	21746474	21747187	1	+	1	transcript_id "g7701.t1"; gene_id "g7701";
27 | 3R	AUGUSTUS	exon	21746474	21747187	.	+	.	transcript_id "g7701.t1"; gene_id "g7701";
28 | 3R	AUGUSTUS	intron	21747188	21747389	1	+	.	transcript_id "g7701.t1"; gene_id "g7701";
29 | 3R	AUGUSTUS	CDS	21747390	21748617	1	+	1	transcript_id "g7701.t1"; gene_id "g7701";
30 | 3R	AUGUSTUS	exon	21747390	21748617	.	+	.	transcript_id "g7701.t1"; gene_id "g7701";
31 | 3R	AUGUSTUS	intron	21748618	21748687	1	+	.	transcript_id "g7701.t1"; gene_id "g7701";
32 | 3R	AUGUSTUS	CDS	21748688	21748924	1	+	0	transcript_id "g7701.t1"; gene_id "g7701";
33 | 3R	AUGUSTUS	transcript	21740168	21748924	0.52	+	.	g7701.t1
34 | 3R	AUGUSTUS	exon	21748688	21748924	.	+	.	transcript_id "g7701.t1"; gene_id "g7701";
35 | 3R	AUGUSTUS	stop_codon	21748922	21748924	.	+	0	transcript_id "g7701.t1"; gene_id "g7701";
36 | 3R	AUGUSTUS	stop_codon	21737497	21737499	.	-	0	transcript_id "g7700.t1"; gene_id "g7700";
37 | 3R	AUGUSTUS	CDS	21737497	21737706	0.84	-	0	transcript_id "g7700.t1"; gene_id "g7700";
38 | 3R	AUGUSTUS	exon	21737497	21737706	.	-	.	transcript_id "g7700.t1"; gene_id "g7700";
39 | 3R	AUGUSTUS	intron	21737707	21739000	0.76	-	.	transcript_id "g7700.t1"; gene_id "g7700";
40 | 3R	AUGUSTUS	CDS	21739001	21739099	0.75	-	0	transcript_id "g7700.t1"; gene_id "g7700";
41 | 3R	AUGUSTUS	transcript	21737497	21739099	0.75	-	.	g7700.t1
42 | 3R	AUGUSTUS	exon	21739001	21739099	.	-	.	transcript_id "g7700.t1"; gene_id "g7700";
43 | 3R	AUGUSTUS	start_codon	21739097	21739099	.	-	0	transcript_id "g7700.t1"; gene_id "g7700";
44 | 


--------------------------------------------------------------------------------
/tests/test_genome_anno.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | import os
  3 | import sys
  4 | import pytest
  5 | import csv
  6 | 
  7 | testDir = os.path.abspath(os.path.dirname(__file__))
  8 | sys.path.append(testDir + '/../bin/')
  9 | 
 10 | from genome_anno import Transcript, Anno, NotGtfFormat
 11 | 
# Paths to the GTF files in tests/genome_anno/ used by this module.
anno1 = testDir + '/genome_anno/anno1.gtf'
anno_format_error = testDir + '/genome_anno/format_error.gtf'
anno_missing_gid = testDir + '/genome_anno/missing_gid.gtf'
tx1 = testDir + '/genome_anno/tx1.gtf'
# Positional arguments unpacked into Transcript(...); the first four are
# compared against .id, .gene_id, .chr and .source_anno in the tests.
tx1_args = ('tx1', 'g.tx1', '3L', 'GeneMark.hmm', '-')
 17 | 
 18 | @pytest.fixture
 19 | def transcript():
 20 |     return Transcript(*tx1_args)
 21 | 
 22 | @pytest.fixture
 23 | def file_tx1():
 24 |     result = []
 25 |     with open(tx1, 'r') as file:
 26 |         file_tab = csv.reader(file, delimiter='\t')
 27 |         for line in file_tab:
 28 |             result.append(line)
 29 |     return result
 30 | 
 31 | @pytest.fixture
 32 | def file_anno1():
 33 |     result = []
 34 |     with open(anno1, 'r') as file:
 35 |         file_tab = csv.reader(file, delimiter='\t')
 36 |         for line in file_tab:
 37 |             result.append(line)
 38 |     return result
 39 | 
 40 | @pytest.fixture
 41 | def transcript_tx1(file_tx1):
 42 |     t = Transcript(*tx1_args)
 43 |     for line in file_tx1:
 44 |         t.add_line(line)
 45 |     return t
 46 | 
 47 | @pytest.fixture
 48 | def anno_anno1():
 49 |     anno = Anno(anno1, 'anno1')
 50 |     anno.addGtf()
 51 |     return anno
 52 | 
 53 | def test_transcript_defaults(transcript):
 54 |     assert transcript.id == tx1_args[0]
 55 |     assert transcript.gene_id == tx1_args[1]
 56 |     assert transcript.chr == tx1_args[2]
 57 |     assert transcript.source_anno == tx1_args[3]
 58 | 
 59 | def test_transcript_add_lines(transcript_tx1, file_tx1):
 60 |     list = []
 61 |     for key in transcript_tx1.transcript_lines.keys():
 62 |         list += transcript_tx1.transcript_lines[key]
 63 |     assert len(list) == len(file_tx1)
 64 |     for line in list:
 65 |         assert line in file_tx1
 66 | 
 67 | def test_transcript_find_lines(transcript_tx1):
 68 |     missing = {"intron" : [['3L', 'GeneMark.hmm', 'intron', 18462541, 18462718, \
 69 |             '.', '-', '0', \
 70 |             'gene_id "g.tx1"; transcript_id "tx1";']], \
 71 |         "start_codon" : [['3L', 'GeneMark.hmm', 'start_codon', 18463066, 18463068, \
 72 |             '.', '-', '.', 'gene_id "g.tx1"; transcript_id "tx1";']], \
 73 |         "transcript" : [['3L', 'GeneMark.hmm', 'transcript', 18462228, 18463068, \
 74 |             '.', '-', '.', 'tx1']]}
 75 |     transcript_tx1.add_missing_lines()
 76 |     for key in missing.keys():
 77 |         for line in missing[key]:
 78 |             assert line in transcript_tx1.transcript_lines[key]
 79 | 
 80 | def test_anno_read_file(anno_anno1, file_anno1):
 81 |     gtf_anno = anno_anno1.get_gtf()
 82 |     gtf_anno = [list(map(str, g[:8])) for g in gtf_anno]
 83 |     file_anno1 = [f[:8] for f in file_anno1]
 84 |     assert len(gtf_anno) == len(file_anno1)
 85 |     for line in file_anno1:
 86 |         print(line)
 87 |         print(gtf_anno)
 88 |         assert line in gtf_anno
 89 | 
 90 | def test_format_error():
 91 |     anno = Anno(anno_format_error, 'error_anno')
 92 |     with pytest.raises(NotGtfFormat):
 93 |         anno.addGtf()
 94 | 
 95 | def test_missing_gid(file_anno1):
 96 |     anno = Anno(anno_missing_gid, 'anno1')
 97 |     anno.addGtf()
 98 |     gtf_anno = anno.get_gtf()
 99 |     gtf_anno = [list(map(str, g[:8])) for g in gtf_anno]
100 |     file_anno1 = [f[:8] for f in file_anno1]
101 |     assert len(gtf_anno) == len(file_anno1)
102 |     for line in gtf_anno:
103 |         assert line in file_anno1
104 | 
105 | 
106 | 
107 | 
if __name__ == '__main__':
    # allow 'python test_genome_anno.py' to run the tests of this module
    # and report their result as the exit status
    sys.exit(pytest.main([__file__]))
111 | 


--------------------------------------------------------------------------------
/bin/get_longest_isoform.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | # ==============================================================
  3 | # author: Lars Gabriel
  4 | #
  5 | # get_longest_isoform.py: combines gene sets into one that 
  6 | #      consists only of the longest isoform from each gene locus
  7 | # ==============================================================
  8 | import argparse
  9 | import sys
 10 | import os
 11 | import csv
 12 | 
 13 | class ConfigFileError(Exception):
 14 |     pass
 15 | 
 16 | class GeneSetMissing(Exception):
 17 |     pass
 18 | 
 19 | gtf = []
 20 | anno = []
 21 | hintfiles = []
 22 | graph = None
 23 | out = ''
 24 | v = 0
 25 | quiet = False
 26 | parameter = {'intron_support' : 0, 'stasto_support' : 0, \
 27 |     'e_1' : 0, 'e_2' : 0, 'e_3' : 0, 'e_4' : 0}
 28 | 
 29 | def main():
 30 |     from genome_anno import Anno
 31 |     from overlap_graph import Graph
 32 | 
 33 |     global anno, graph, parameter
 34 | 
 35 |     args = parseCmd()
 36 |     init(args)
 37 | 
 38 |     if v > 0:
 39 |         print(gtf)
 40 | 
 41 |     # read gene prediciton files
 42 |     c = 1
 43 |     for c, g in enumerate(gtf):
 44 |         if not quiet:
 45 |             sys.stderr.write(f'### READING GENE PREDICTION: [{g}]\n')
 46 |         anno.append(Anno(g, f'anno{c+1}'))
 47 |         anno[-1].addGtf()
 48 |         anno[-1].norm_tx_format()
 49 | 
 50 |     # create graph with an edge for each unique transcript
 51 |     # and an edge if two transcripts overlap
 52 |     # two transcripts overlap if they share at least 3 adjacent protein coding nucleotides
 53 |     graph = Graph(anno, para=parameter, verbose=v)
 54 |     if not quiet:
 55 |         sys.stderr.write('### BUILD OVERLAP GRAPH\n')
 56 |     graph.build()
 57 |     
 58 |     combined_anno = Anno('', 'combined_annotation')
 59 |     # for each gene locus, choose the transcript with longes coding sequence
 60 |     if not quiet:
 61 |         sys.stderr.write('### CHOOSE LONGEST ISOFORM FOR EACH GENE\n')
 62 |     for i, comp in enumerate(graph.connected_components()):
 63 |         tx_longest = sorted([graph.__tx_from_key__(n) for \
 64 |                    n in comp], key=lambda t:t.get_cds_len())[-1]
 65 |         tx_longest.set_gene_id(f'g_{i+1}')
 66 |         tx_longest.id = f'{tx_longest.source_anno}.{tx_longest.id}'
 67 |         combined_anno.transcripts.update({tx_longest.id : tx_longest})    
 68 |     combined_anno.find_genes()
 69 |     combined_anno.write_anno(out)
 70 | 
 71 |     if not quiet:
 72 |         sys.stderr.write('### FINISHED\n\n')
 73 |         sys.stderr.write('### The longest isoforms are located at {}.\n'.format(\
 74 |             out))
 75 | 
 76 | def init(args):
 77 |     global gtf, out, v, quiet
 78 |     if args.gtf:
 79 |         gtf = args.gtf.split(',')    
 80 |     if args.out:
 81 |         out = args.out
 82 |     if args.verbose:
 83 |         v = args.verbose
 84 |     if args.quiet:
 85 |         quiet = True
 86 | 
 87 | def parseCmd():
 88 |     """Parse command line arguments
 89 | 
 90 |     Returns:
 91 |         dictionary: Dictionary with arguments
 92 |     """
 93 |     parser = argparse.ArgumentParser(description='Combine gene sets by choosing ' \
 94 |                  'the isoform with the longes coding sequence for each gene locus.')
 95 |     parser.add_argument('-g', '--gtf', type=str, required=True,
 96 |         help='List (separated by commas) of gene prediciton files in gtf.\n' \
 97 |             + '(e.g. gene_pred1.gtf,gene_pred2.gtf,gene_pred3.gtf)')
 98 |     parser.add_argument('-o', '--out', type=str, required=True,
 99 |         help='Outputfile for the combined gene prediciton in gtf.')
100 |     parser.add_argument('-q', '--quiet', action='store_true',
101 |         help='Quiet mode.')
102 |     parser.add_argument('-v', '--verbose', type=int,
103 |         help='')
104 |     return parser.parse_args()
105 | 
106 | if __name__ == '__main__':
107 |     main()
108 | 


--------------------------------------------------------------------------------
/tests/combined.gtf:
--------------------------------------------------------------------------------
 1 | 3R	AUGUSTUS	stop_codon	21737497	21737499	.	-	0	transcript_id "g7603.t1"; gene_id "g7603";
 2 | 3R	AUGUSTUS	CDS	21737497	21737706	0.99	-	0	transcript_id "g7603.t1"; gene_id "g7603";
 3 | 3R	AUGUSTUS	exon	21737497	21737706	.	-	.	transcript_id "g7603.t1"; gene_id "g7603";
 4 | 3R	AUGUSTUS	transcript	21737497	21738709	0.98	-	.	g7603.t1
 5 | 3R	AUGUSTUS	intron	21737707	21738606	0.99	-	.	transcript_id "g7603.t1"; gene_id "g7603";
 6 | 3R	AUGUSTUS	CDS	21738607	21738628	0.99	-	1	transcript_id "g7603.t1"; gene_id "g7603";
 7 | 3R	AUGUSTUS	exon	21738607	21738628	.	-	.	transcript_id "g7603.t1"; gene_id "g7603";
 8 | 3R	AUGUSTUS	intron	21738629	21738695	1	-	.	transcript_id "g7603.t1"; gene_id "g7603";
 9 | 3R	AUGUSTUS	CDS	21738696	21738709	1	-	0	transcript_id "g7603.t1"; gene_id "g7603";
10 | 3R	AUGUSTUS	exon	21738696	21738709	.	-	.	transcript_id "g7603.t1"; gene_id "g7603";
11 | 3R	AUGUSTUS	start_codon	21738707	21738709	.	-	0	transcript_id "g7603.t1"; gene_id "g7603";
12 | 3R	AUGUSTUS	start_codon	21740168	21740170	.	+	0	transcript_id "g7604.t1"; gene_id "g7604";
13 | 3R	AUGUSTUS	CDS	21740168	21740643	1	+	0	transcript_id "g7604.t1"; gene_id "g7604";
14 | 3R	AUGUSTUS	exon	21740168	21740643	.	+	.	transcript_id "g7604.t1"; gene_id "g7604";
15 | 3R	AUGUSTUS	transcript	21740168	21744359	0.53	+	.	g7604.t1
16 | 3R	AUGUSTUS	intron	21740644	21741666	1	+	.	transcript_id "g7604.t1"; gene_id "g7604";
17 | 3R	AUGUSTUS	CDS	21741667	21741825	1	+	1	transcript_id "g7604.t1"; gene_id "g7604";
18 | 3R	AUGUSTUS	exon	21741667	21741825	.	+	.	transcript_id "g7604.t1"; gene_id "g7604";
19 | 3R	AUGUSTUS	intron	21741826	21741884	1	+	.	transcript_id "g7604.t1"; gene_id "g7604";
20 | 3R	AUGUSTUS	CDS	21741885	21742359	1	+	1	transcript_id "g7604.t1"; gene_id "g7604";
21 | 3R	AUGUSTUS	exon	21741885	21742359	.	+	.	transcript_id "g7604.t1"; gene_id "g7604";
22 | 3R	AUGUSTUS	intron	21742360	21742427	1	+	.	transcript_id "g7604.t1"; gene_id "g7604";
23 | 3R	AUGUSTUS	CDS	21742428	21742666	1	+	0	transcript_id "g7604.t1"; gene_id "g7604";
24 | 3R	AUGUSTUS	exon	21742428	21742666	.	+	.	transcript_id "g7604.t1"; gene_id "g7604";
25 | 3R	AUGUSTUS	intron	21742667	21742741	0.84	+	.	transcript_id "g7604.t1"; gene_id "g7604";
26 | 3R	AUGUSTUS	CDS	21742742	21743987	0.79	+	1	transcript_id "g7604.t1"; gene_id "g7604";
27 | 3R	AUGUSTUS	exon	21742742	21743987	.	+	.	transcript_id "g7604.t1"; gene_id "g7604";
28 | 3R	AUGUSTUS	intron	21743988	21744047	1	+	.	transcript_id "g7604.t1"; gene_id "g7604";
29 | 3R	AUGUSTUS	CDS	21744048	21744359	0.68	+	0	transcript_id "g7604.t1"; gene_id "g7604";
30 | 3R	AUGUSTUS	exon	21744048	21744359	.	+	.	transcript_id "g7604.t1"; gene_id "g7604";
31 | 3R	AUGUSTUS	stop_codon	21744357	21744359	.	+	0	transcript_id "g7604.t1"; gene_id "g7604";
32 | 3R	AUGUSTUS	start_codon	21745305	21745307	.	+	0	transcript_id "g7605.t1"; gene_id "g7605";
33 | 3R	AUGUSTUS	CDS	21745305	21745855	0.69	+	0	transcript_id "g7605.t1"; gene_id "g7605";
34 | 3R	AUGUSTUS	exon	21745305	21745855	.	+	.	transcript_id "g7605.t1"; gene_id "g7605";
35 | 3R	AUGUSTUS	transcript	21745305	21748924	0.49	+	.	g7605.t1
36 | 3R	AUGUSTUS	intron	21745856	21746185	1	+	.	transcript_id "g7605.t1"; gene_id "g7605";
37 | 3R	AUGUSTUS	CDS	21746186	21746341	1	+	1	transcript_id "g7605.t1"; gene_id "g7605";
38 | 3R	AUGUSTUS	exon	21746186	21746341	.	+	.	transcript_id "g7605.t1"; gene_id "g7605";
39 | 3R	AUGUSTUS	intron	21746342	21746473	1	+	.	transcript_id "g7605.t1"; gene_id "g7605";
40 | 3R	AUGUSTUS	CDS	21746474	21747187	1	+	1	transcript_id "g7605.t1"; gene_id "g7605";
41 | 3R	AUGUSTUS	exon	21746474	21747187	.	+	.	transcript_id "g7605.t1"; gene_id "g7605";
42 | 3R	AUGUSTUS	intron	21747188	21747389	1	+	.	transcript_id "g7605.t1"; gene_id "g7605";
43 | 3R	AUGUSTUS	CDS	21747390	21748617	1	+	1	transcript_id "g7605.t1"; gene_id "g7605";
44 | 3R	AUGUSTUS	exon	21747390	21748617	.	+	.	transcript_id "g7605.t1"; gene_id "g7605";
45 | 3R	AUGUSTUS	intron	21748618	21748687	1	+	.	transcript_id "g7605.t1"; gene_id "g7605";
46 | 3R	AUGUSTUS	CDS	21748688	21748924	0.71	+	0	transcript_id "g7605.t1"; gene_id "g7605";
47 | 3R	AUGUSTUS	exon	21748688	21748924	.	+	.	transcript_id "g7605.t1"; gene_id "g7605";
48 | 3R	AUGUSTUS	stop_codon	21748922	21748924	.	+	0	transcript_id "g7605.t1"; gene_id "g7605";


--------------------------------------------------------------------------------
/tests/graph/ex_feature_anno1.gtf:
--------------------------------------------------------------------------------
 1 | 3R	AUGUSTUS	stop_codon	21737497	21737499	.	-	0	transcript_id "g7603.t1"; gene_id "g7603";
 2 | 3R	AUGUSTUS	CDS	21737497	21737706	0.99	-	0	transcript_id "g7603.t1"; gene_id "g7603";
 3 | 3R	AUGUSTUS	exon	21737497	21737706	.	-	.	transcript_id "g7603.t1"; gene_id "g7603";
 4 | 3R	AUGUSTUS	transcript	21737497	21738709	0.98	-	.	g7603.t1
 5 | 3R	AUGUSTUS	intron	21737707	21738606	0.99	-	.	transcript_id "g7603.t1"; gene_id "g7603";
 6 | 3R	AUGUSTUS	CDS	21738607	21738628	0.99	-	1	transcript_id "g7603.t1"; gene_id "g7603";
 7 | 3R	AUGUSTUS	exon	21738607	21738628	.	-	.	transcript_id "g7603.t1"; gene_id "g7603";
 8 | 3R	AUGUSTUS	intron	21738629	21738695	1	-	.	transcript_id "g7603.t1"; gene_id "g7603";
 9 | 3R	AUGUSTUS	CDS	21738696	21738709	1	-	0	transcript_id "g7603.t1"; gene_id "g7603";
10 | 3R	AUGUSTUS	exon	21738696	21738709	.	-	.	transcript_id "g7603.t1"; gene_id "g7603";
11 | 3R	AUGUSTUS	start_codon	21738707	21738709	.	-	0	transcript_id "g7603.t1"; gene_id "g7603";
12 | 3R	AUGUSTUS	start_codon	21740168	21740170	.	+	0	transcript_id "g7604.t1"; gene_id "g7604";
13 | 3R	AUGUSTUS	CDS	21740168	21740643	1	+	0	transcript_id "g7604.t1"; gene_id "g7604";
14 | 3R	AUGUSTUS	exon	21740168	21740643	.	+	.	transcript_id "g7604.t1"; gene_id "g7604";
15 | 3R	AUGUSTUS	transcript	21740168	21744359	0.53	+	.	g7604.t1
16 | 3R	AUGUSTUS	intron	21740644	21741666	1	+	.	transcript_id "g7604.t1"; gene_id "g7604";
17 | 3R	AUGUSTUS	CDS	21741667	21741825	1	+	1	transcript_id "g7604.t1"; gene_id "g7604";
18 | 3R	AUGUSTUS	exon	21741667	21741825	.	+	.	transcript_id "g7604.t1"; gene_id "g7604";
19 | 3R	AUGUSTUS	intron	21741826	21741884	1	+	.	transcript_id "g7604.t1"; gene_id "g7604";
20 | 3R	AUGUSTUS	CDS	21741885	21742359	1	+	1	transcript_id "g7604.t1"; gene_id "g7604";
21 | 3R	AUGUSTUS	exon	21741885	21742359	.	+	.	transcript_id "g7604.t1"; gene_id "g7604";
22 | 3R	AUGUSTUS	intron	21742360	21742427	1	+	.	transcript_id "g7604.t1"; gene_id "g7604";
23 | 3R	AUGUSTUS	CDS	21742428	21742666	1	+	0	transcript_id "g7604.t1"; gene_id "g7604";
24 | 3R	AUGUSTUS	exon	21742428	21742666	.	+	.	transcript_id "g7604.t1"; gene_id "g7604";
25 | 3R	AUGUSTUS	intron	21742667	21742741	0.84	+	.	transcript_id "g7604.t1"; gene_id "g7604";
26 | 3R	AUGUSTUS	CDS	21742742	21743987	0.79	+	1	transcript_id "g7604.t1"; gene_id "g7604";
27 | 3R	AUGUSTUS	exon	21742742	21743987	.	+	.	transcript_id "g7604.t1"; gene_id "g7604";
28 | 3R	AUGUSTUS	intron	21743988	21744047	1	+	.	transcript_id "g7604.t1"; gene_id "g7604";
29 | 3R	AUGUSTUS	CDS	21744048	21744359	0.68	+	0	transcript_id "g7604.t1"; gene_id "g7604";
30 | 3R	AUGUSTUS	exon	21744048	21744359	.	+	.	transcript_id "g7604.t1"; gene_id "g7604";
31 | 3R	AUGUSTUS	stop_codon	21744357	21744359	.	+	0	transcript_id "g7604.t1"; gene_id "g7604";
32 | 3R	AUGUSTUS	start_codon	21745305	21745307	.	+	0	transcript_id "g7605.t1"; gene_id "g7605";
33 | 3R	AUGUSTUS	CDS	21745305	21745855	0.69	+	0	transcript_id "g7605.t1"; gene_id "g7605";
34 | 3R	AUGUSTUS	exon	21745305	21745855	.	+	.	transcript_id "g7605.t1"; gene_id "g7605";
35 | 3R	AUGUSTUS	transcript	21745305	21748924	0.49	+	.	g7605.t1
36 | 3R	AUGUSTUS	intron	21745856	21746185	1	+	.	transcript_id "g7605.t1"; gene_id "g7605";
37 | 3R	AUGUSTUS	CDS	21746186	21746341	1	+	1	transcript_id "g7605.t1"; gene_id "g7605";
38 | 3R	AUGUSTUS	exon	21746186	21746341	.	+	.	transcript_id "g7605.t1"; gene_id "g7605";
39 | 3R	AUGUSTUS	intron	21746342	21746473	1	+	.	transcript_id "g7605.t1"; gene_id "g7605";
40 | 3R	AUGUSTUS	CDS	21746474	21747187	1	+	1	transcript_id "g7605.t1"; gene_id "g7605";
41 | 3R	AUGUSTUS	exon	21746474	21747187	.	+	.	transcript_id "g7605.t1"; gene_id "g7605";
42 | 3R	AUGUSTUS	intron	21747188	21747389	1	+	.	transcript_id "g7605.t1"; gene_id "g7605";
43 | 3R	AUGUSTUS	CDS	21747390	21748617	1	+	1	transcript_id "g7605.t1"; gene_id "g7605";
44 | 3R	AUGUSTUS	exon	21747390	21748617	.	+	.	transcript_id "g7605.t1"; gene_id "g7605";
45 | 3R	AUGUSTUS	intron	21748618	21748687	1	+	.	transcript_id "g7605.t1"; gene_id "g7605";
46 | 3R	AUGUSTUS	CDS	21748688	21748924	0.71	+	0	transcript_id "g7605.t1"; gene_id "g7605";
47 | 3R	AUGUSTUS	exon	21748688	21748924	.	+	.	transcript_id "g7605.t1"; gene_id "g7605";
48 | 3R	AUGUSTUS	stop_codon	21748922	21748924	.	+	0	transcript_id "g7605.t1"; gene_id "g7605";
49 | 


--------------------------------------------------------------------------------
/bin/evidence.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | # ==============================================================
  3 | # author: Lars Gabriel
  4 | #
  5 | # evidence.py: Handles the extrinsic evidence from the hintfiles
  6 | # ==============================================================
  7 | import csv
  8 | 
  9 | class NotGtfFormat(Exception):
 10 |     pass
 11 | 
 12 | class AttributeMissing(Exception):
 13 |     pass
 14 | 
 15 | class Hint:
 16 |     """
 17 |         Class handling the data structures and methods for a hint
 18 |     """
 19 |     def __init__(self, line):
 20 |         """
 21 |             Create a hint from a gff line. The line has to include 'src=' as
 22 |             an attribute in the last column. Only introns, start/stop codons
 23 |             are used.
 24 | 
 25 |             Args:
 26 |                 line (list(str)): GFF line for one hint from extrinsic evidence.
 27 |         """
 28 |         if not len(line) == 9:
 29 |             raise NotGtfFormat('File not in gtf Format. Error at line: {}'.format(line))
 30 |         self.chr, self.source_program, self.type, self.start, self.end, \
 31 |             self.score, self.strand, self.phase, attribute = line
 32 |         self.start = int(self.start)
 33 |         self.end = int(self.end)
 34 | 
 35 |         try:
 36 |             self.src = attribute.split('src=')[1].split(';')[0]
 37 |         except IndexError:
 38 |             raise AttributeMissing('Source of Hint is missing in line {}.'.format(line))
 39 |         self.score = float(self.score)
 40 |         self.mult = 1
 41 |         if 'mult=' in attribute:
 42 |             self.mult = int(attribute.split('mult=')[1].split(';')[0])       
 43 | 
 44 |         self.pri = ''
 45 |         if 'pri=' in attribute:
 46 |             self.pri = attribute.split('pri=')[1].split(';')[0]
 47 | 
 48 |         if self.type == 'stop_codon':
 49 |             self.type = 'stop'
 50 |         elif self.type == 'start_codon':
 51 |             self.type = 'start'
 52 | 
 53 |     def hint2list(self):
 54 |         """
 55 |             Returns:
 56 |                 line (list(str)): GFF line for the hint.
 57 |         """
 58 |         attribute = ['src=' + self.src]
 59 |         if int(self.mult) > 1:
 60 |             attribute.append('mult={}'.format(self.mult))
 61 |         if self.pri:
 62 |             attribute.append('pri={}'.format(self.pri))
 63 |         return [self.chr, self.source_program, self.type, self.start, self.end, \
 64 |             self.score, self.strand, self.phase, ';'.join(attribute)]
 65 | 
 66 | class Hintfile:
 67 |     """
 68 |         Class handling the data structures and methods for a hintfile
 69 |     """
 70 |     def __init__(self, path):
 71 |         """
 72 |             Args:
 73 |                 path (str): Path to the hintfile.
 74 |         """
 75 |         # dictonary containing evidence
 76 |         # self.hints[chromosom_id] = [Hints()]
 77 |         self.hints = {}
 78 |         # dictionary with self.src[src] = sum_of_all_mults_of_hints_from_src
 79 |         self.src = {}
 80 |         self.read_file(path)
 81 | 
 82 |     def read_file(self, path):
 83 |         """
 84 |             Read a gff file with intron or start/stop codon hints
 85 |             and create a dict of Hints.
 86 |         """
 87 |         #
 88 |         with open(path, 'r') as file:
 89 |             hints_csv = csv.reader(file, delimiter='\t')
 90 |             for line in hints_csv:
 91 |                 if line[0][0] == '#':
 92 |                     continue
 93 |                 new_hint = Hint(line)
 94 |                 if not new_hint.chr in self.hints.keys():
 95 |                     self.hints.update({new_hint.chr : []})
 96 |                 self.hints[new_hint.chr].append(new_hint)
 97 |                 if new_hint.src not in self.src:
 98 |                     self.src.update({new_hint.src : 0})
 99 |                 self.src[new_hint.src] += new_hint.mult
100 | 
class Evidence:
    """
        Collection of the extrinsic evidence of one or more hintfiles,
        indexed by sequence and by position, type and strand of the hint.
    """
    def __init__(self):
        # hint_keys[chr][start_end_type_strand][src] = multiplicity
        self.hint_keys = {}
        # src[src] = sum of the multiplicities of all hints from src
        self.src = {}

    def add_hintfile(self, path_to_hintfile):
        """
            Read a hintfile and add the multiplicities of its hints
            to self.src and self.hint_keys.

            Args:
                path_to_hintfile (str): Path to the hintfile.
        """
        parsed = Hintfile(path_to_hintfile)
        for source, total in parsed.src.items():
            self.src[source] = self.src.get(source, 0) + total
        for seq_id, hints in parsed.hints.items():
            per_seq = self.hint_keys.setdefault(seq_id, {})
            for hint in hints:
                new_key = '{}_{}_{}_{}'.format(hint.start, hint.end, \
                    hint.type, hint.strand)
                per_source = per_seq.setdefault(new_key, {})
                per_source[hint.src] = per_source.get(hint.src, 0) + int(hint.mult)

    def get_hint(self, chr, start, end, type, strand):
        """
            Look up the evidence for one feature.

            Args:
                chr (str): Sequence the feature is located on.
                start (int): Start coordinate of the feature.
                end (int): End coordinate of the feature.
                type (str): Feature type; 'start_codon' and 'stop_codon'
                            are looked up as 'start' and 'stop'.
                strand (str): Strand of the feature.

            Returns:
                (dict): Multiplicity per source for the feature,
                        an empty dict if there is no hint for it.
        """
        type = {'start_codon' : 'start', 'stop_codon' : 'stop'}.get(type, type)
        key = '{}_{}_{}_{}'.format(start, end, type, strand)
        return self.hint_keys.get(chr, {}).get(key, {})
143 | 


--------------------------------------------------------------------------------
/bin/features.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | # ==============================================================
  3 | # author: Lars Gabriel
  4 | #
  5 | # features.py: Handles the features for a transcript
  6 | # ==============================================================
  7 | import numpy as np
  8 | 
  9 | class Node_features:
 10 |     """
 11 |         Class handling the features for a transcripts.
 12 |         Features are scores that characterize the support of the transcript
 13 |         by extrinsic evidence in different ways.
 14 |     """
 15 |     def __init__(self, tx, evi, hint_source_weight={'P' : 1, 'E' : 20, 'C' : 1,  'M' : 1}):
 16 |         """
 17 |             Args:
 18 |                 tx (Transcript): Transcript class object containing a transcript.
 19 |                 evi (Evidence): Evidence class object containing all extrinsic evidence.
 20 |                 hint_source_weight (dict(int)): Weights for each evidence source.
 21 |         """
 22 |         self.sw = hint_source_weight
 23 |         self.scores = []
 24 |         self.epsi = 1e-5
 25 |         self.evi_list = {'intron' : [], 'start_codon' : [], 'stop_codon': []}
 26 |         self.numb_introns = 0
 27 |         self.__init_hints__(tx, evi)
 28 |         # feature vector specifies the support of
 29 |         # introns, start/stop codons for a transcript
 30 |         # self.feature_vector[0] : (supported introns by evidence of tx) / (number of introns in tx)
 31 |         # self.feature_vector[1] : (supported start/stop codons by evidence of tx) / 2
 32 |         # self.feature_vector[2] : sum of multiplicities of intron evidence for tx
 33 |         # self.feature_vector[3] : sum of multiplicities of start/stop codon evidence for tx
 34 |         # self.feature_vector[4] : 1 if tx is from anno_pref, 0 otherwise
 35 |         self.feature_vector = self.create_feature_vec()        
 36 | 
 37 |     def __init_hints__(self, tx, evi):
 38 |         """
 39 |             Collect hints from evi that support tx.
 40 | 
 41 |             Args:
 42 |                 tx (Transcript): Transcript class object containing a transcript.
 43 |                 evi (Evidence): Evidence class object containing all extrinsic evidence.
 44 |         """
 45 |         cds_len = 0
 46 |         for type in ['intron', 'start_codon', 'stop_codon']:
 47 |             for line in tx.transcript_lines[type]:
 48 |                 hint = evi.get_hint(line[0], line[3], line[4], line[2], \
 49 |                     line[6])
 50 |                 if hint:
 51 |                     self.evi_list[type].append(hint)
 52 |         if tx.transcript_lines['intron']:
 53 |             self.numb_introns = len(tx.transcript_lines['intron'])
 54 | 
 55 |     def create_feature_vec(self):
 56 |         """
 57 |             Compute all features.
 58 | 
 59 |             Returns:
 60 |                 (list(float)): List of feature scores.
 61 |         """
 62 |         return [self.relative_support(['intron'], self.numb_introns), \
 63 |                 self.relative_support(['start_codon', 'stop_codon'], 2.0), 
 64 |                 self.absolute_support(['intron']), \
 65 |                 self.absolute_support(['start_codon', 'stop_codon'])]
 66 |         
 67 |     def relative_support(self, gene_feature_types, abs_numb):
 68 |         """
 69 |             Compute relative support of introns or start/stop-codons.
 70 | 
 71 |             Args:
 72 |                 gene_feature_types (str): Either introns or start/stop-codons
 73 |                 abs_numb (int): absolute number of gene_feature_type in tx
 74 |                                 (e.g. number of introns in tx)
 75 | 
 76 |             Returns:
 77 |                 (float): Relative support in [0,1].
 78 |         """
 79 |         if abs_numb > 0:
 80 |             hint_numb = 0
 81 |             for type in gene_feature_types:
 82 |                 hint_numb += len(self.evi_list[type])
 83 |             return hint_numb / abs_numb
 84 |         return 1
 85 | 
 86 |     def absolute_support(self, gene_feature_types):
 87 |         """
 88 |             Compute absolute support of introns or start/stop-codons.
 89 | 
 90 |             Args:
 91 |                 gene_feature_types (str): Either introns or start/stop-codons
 92 | 
 93 |             Returns:
 94 |                 (float): Multiplicity*weight of supporting hints for gene_feature_types.
 95 |         """
 96 |         score = 0.0
 97 |         for type in gene_feature_types:
 98 |             for hint in self.evi_list[type]:
 99 |                 for src in hint.keys():
100 |                     score += self.sw[src] * hint[src]
101 |         #print(score)
102 |         return np.log(score + self.epsi)
103 | 
104 |     # currently not used 
105 |     def mean_support(self, gene_feature_types, abs_numb):
106 |         """
107 |             Compute absolute support of introns or start/stop-codons.
108 | 
109 |             Args:
110 |                 gene_feature_types (str): Either introns or start/stop-codons
111 | 
112 |             Returns:
113 |                 (float): Multiplicity*weight of supporting hints for gene_feature_types.
114 |         """
115 |         score = 0.0
116 |         if abs_numb > 0:
117 |             for type in gene_feature_types:
118 |                 for hint in self.evi_list[type]:
119 |                     for src in hint.keys():
120 |                         score += self.sw[src] * hint[src]
121 |             return np.log((score / abs_numb)+self.epsi)
122 |         else:
123 |             return np.log(self.epsi)
124 |         
125 |     # currently not used 
126 |     def min_support(self, gene_feature_types, abs_numb):
127 |         """
128 |             Compute absolute support of introns or start/stop-codons.
129 | 
130 |             Args:
131 |                 gene_feature_types (str): Either introns or start/stop-codons
132 | 
133 |             Returns:
134 |                 (float): Multiplicity*weight of supporting hints for gene_feature_types.
135 |         """
136 |         score = 0.0
137 |         for type in gene_feature_types:  
138 |             if len(self.evi_list[type]) < abs_numb:
139 |                 return np.log(self.epsi)
140 |         if abs_numb > 0:
141 |             score = 10000000000000000000.0
142 |             for type in gene_feature_types:                
143 |                 for hint in self.evi_list[type]: 
144 |                     new_score = 0
145 |                     for src in hint.keys():
146 |                         new_score += self.sw[src] * hint[src]
147 |                     score = np.minimum(score, new_score)            
148 |             return np.log(score+self.epsi)
149 |         else:
150 |             return np.log(self.epsi)
151 |     
152 |     def get_features(self):
153 |         """
154 |             Returns:
155 |                 (list(float)): List of feature scores.
156 |         """
157 |         return self.feature_vector


--------------------------------------------------------------------------------
/tests/genome_anno/missing_gid.gtf:
--------------------------------------------------------------------------------
 1 | 3L	GeneMark.hmm	stop_codon	18462228	18462230	.	-	0	gene_id "7789_g"; transcript_id "7789_t"; count "1_1";
 2 | 3L	GeneMark.hmm	CDS	18462228	18462540	.	-	1	transcript_id "7789_t";
 3 | 3L	GeneMark.hmm	exon	18462228	18462540	0	-	.	gene_id "7789_g"; transcript_id "7789_t"; evidence "0_1"; cds_type "Terminal"; count "2_2";
 4 | 3L	GeneMark.hmm	CDS	18462719	18463068	.	-	0	gene_id "7789_g"; transcript_id "7789_t"; evidence "1_0"; cds_type "Initial"; count "1_2";
 5 | 3L	GeneMark.hmm	exon	18462719	18463068	0	-	.	gene_id "7789_g"; transcript_id "7789_t"; evidence "1_0"; cds_type "Initial"; count "1_2";
 6 | 3L	GeneMark.hmm	start_codon	18463066	18463068	.	-	0	gene_id "7789_g"; transcript_id "7789_t"; count "1_1";
 7 | 3R	AUGUSTUS	start_codon	7686444	7686446	.	+	0	transcript_id "g5980.t1";
 8 | 3R	AUGUSTUS	CDS	7686444	7686623	1	+	0	transcript_id "g5980.t1"; gene_id "g5980";
 9 | 3R	AUGUSTUS	exon	7686444	7686623	.	+	.	transcript_id "g5980.t1"; gene_id "g5980";
10 | 3R	AUGUSTUS	intron	7686624	7690691	1	+	.	transcript_id "g5980.t1"; gene_id "g5980";
11 | 3R	AUGUSTUS	CDS	7690692	7690843	1	+	0	transcript_id "g5980.t1"; gene_id "g5980";
12 | 3R	AUGUSTUS	exon	7690692	7690843	.	+	.	transcript_id "g5980.t1"; gene_id "g5980";
13 | 3R	AUGUSTUS	intron	7690844	7691514	1	+	.	transcript_id "g5980.t1"; gene_id "g5980";
14 | 3R	AUGUSTUS	CDS	7691515	7691630	1	+	1	transcript_id "g5980.t1"; gene_id "g5980";
15 | 3R	AUGUSTUS	exon	7691515	7691630	.	+	.	transcript_id "g5980.t1"; gene_id "g5980";
16 | 3R	AUGUSTUS	intron	7691631	7691712	1	+	.	transcript_id "g5980.t1"; gene_id "g5980";
17 | 3R	AUGUSTUS	CDS	7691713	7693700	1	+	2	transcript_id "g5980.t1"; gene_id "g5980";
18 | 3R	AUGUSTUS	gene	7686444	7693700	1	+	.	g5980
19 | 3R	AUGUSTUS	transcript	7686444	7693700	1	+	.	g5980.t1
20 | 3R	AUGUSTUS	exon	7691713	7693700	.	+	.	transcript_id "g5980.t1"; gene_id "g5980";
21 | 3R	AUGUSTUS	stop_codon	7693698	7693700	.	+	0	transcript_id "g5980.t1"; gene_id "g5980";
22 | X	AUGUSTUS	stop_codon	2065454	2065456	.	-	0	transcript_id "g12130.t1"; gene_id "g12130";
23 | X	AUGUSTUS	CDS	2065454	2065891	0.75	-	0	transcript_id "g12130.t1"; gene_id "g12130";
24 | X	AUGUSTUS	exon	2065454	2065891	.	-	.	transcript_id "g12130.t1"; gene_id "g12130";
25 | X	AUGUSTUS	intron	2065892	2065944	0.98	-	.	transcript_id "g12130.t1"; gene_id "g12130";
26 | X	AUGUSTUS	CDS	2065945	2066088	0.93	-	0	transcript_id "g12130.t1"; gene_id "g12130";
27 | X	AUGUSTUS	exon	2065945	2066088	.	-	.	transcript_id "g12130.t1"; gene_id "g12130";
28 | X	AUGUSTUS	intron	2066089	2066148	0.92	-	.	transcript_id "g12130.t1"; gene_id "g12130";
29 | X	AUGUSTUS	CDS	2066149	2066238	0.92	-	0	transcript_id "g12130.t1"; gene_id "g12130";
30 | X	AUGUSTUS	gene	2065454	2066238	0.7	-	.	g12130
31 | X	AUGUSTUS	transcript	2065454	2066238	0.7	-	.	g12130.t1
32 | X	AUGUSTUS	exon	2066149	2066238	.	-	.	transcript_id "g12130.t1"; gene_id "g12130";
33 | X	AUGUSTUS	start_codon	2066236	2066238	.	-	0	transcript_id "g12130.t1"; gene_id "g12130";
34 | 2R	AUGUSTUS	stop_codon	16433896	16433898	.	-	0	transcript_id "g10583.t1"; gene_id "g10583";
35 | 2R	AUGUSTUS	CDS	16433896	16435797	1	-	0	transcript_id "g10583.t1"; gene_id "g10583";
36 | 2R	AUGUSTUS	exon	16433896	16435797	.	-	.	transcript_id "g10583.t1"; gene_id "g10583";
37 | 2R	AUGUSTUS	start_codon	16435795	16435797	.	-	0	transcript_id "g10583.t1"; gene_id "g10583";
38 | 2R	AUGUSTUS	gene	16433896	16435797	1	-	.	g10583
39 | 2R	AUGUSTUS	transcript	16433896	16435797	1	-	.	g10583.t1
40 | 2R	AUGUSTUS	stop_codon	24640803	24640805	.	-	0	transcript_id "g11793.t1"; gene_id "g11793";
41 | 2R	AUGUSTUS	CDS	24640803	24642212	1	-	0	transcript_id "g11793.t1"; gene_id "g11793";
42 | 2R	AUGUSTUS	exon	24640803	24642212	.	-	.	transcript_id "g11793.t1"; gene_id "g11793";
43 | 2R	AUGUSTUS	start_codon	24642210	24642212	.	-	0	transcript_id "g11793.t1"; gene_id "g11793";
44 | 2R	AUGUSTUS	gene	24640803	24642212	1	-	.	g11793
45 | 2R	AUGUSTUS	transcript	24640803	24642212	1	-	.	g11793.t1
46 | 2L	AUGUSTUS	stop_codon	11989063	11989065	.	-	0	transcript_id "g1539.t1"; gene_id "g1539";
47 | 2L	AUGUSTUS	CDS	11989063	11989803	0.73	-	0	transcript_id "g1539.t1"; gene_id "g1539";
48 | 2L	AUGUSTUS	exon	11989063	11989803	.	-	.	transcript_id "g1539.t1"; gene_id "g1539";
49 | 2L	AUGUSTUS	start_codon	11989801	11989803	.	-	0	transcript_id "g1539.t1"; gene_id "g1539";
50 | 2L	AUGUSTUS	gene	11989063	11989803	0.73	-	.	g1539
51 | 2L	AUGUSTUS	transcript	11989063	11989803	0.73	-	.	g1539.t1
52 | 2L	AUGUSTUS	start_codon	4686242	4686244	.	+	0	transcript_id "g562.t1"; gene_id "g562";
53 | 2L	AUGUSTUS	CDS	4686242	4687105	1	+	0	transcript_id "g562.t1"; gene_id "g562";
54 | 2L	AUGUSTUS	exon	4686242	4687105	.	+	.	transcript_id "g562.t1"; gene_id "g562";
55 | 2L	AUGUSTUS	stop_codon	4687103	4687105	.	+	0	transcript_id "g562.t1"; gene_id "g562";
56 | 2L	AUGUSTUS	gene	4686242	4687105	1	+	.	g562
57 | 2L	AUGUSTUS	transcript	4686242	4687105	1	+	.	g562.t1
58 | 3L	AUGUSTUS	stop_codon	11362605	11362607	.	-	0	transcript_id "g3988.t1"; gene_id "g3988";
59 | 3L	AUGUSTUS	CDS	11362605	11363086	1	-	2	transcript_id "g3988.t1"; gene_id "g3988";
60 | 3L	AUGUSTUS	exon	11362605	11363086	.	-	.	transcript_id "g3988.t1"; gene_id "g3988";
61 | 3L	AUGUSTUS	intron	11363087	11363276	1	-	.	transcript_id "g3988.t1"; gene_id "g3988";
62 | 3L	AUGUSTUS	CDS	11363277	11363918	1	-	2	transcript_id "g3988.t1"; gene_id "g3988";
63 | 3L	AUGUSTUS	exon	11363277	11363918	.	-	.	transcript_id "g3988.t1"; gene_id "g3988";
64 | 3L	AUGUSTUS	intron	11363919	11364608	1	-	.	transcript_id "g3988.t1"; gene_id "g3988";
65 | 3L	AUGUSTUS	CDS	11364609	11364771	1	-	0	transcript_id "g3988.t1"; gene_id "g3988";
66 | 3L	AUGUSTUS	gene	11362605	11364771	1	-	.	g3988
67 | 3L	AUGUSTUS	transcript	11362605	11364771	1	-	.	g3988.t1
68 | 3L	AUGUSTUS	exon	11364609	11364771	.	-	.	transcript_id "g3988.t1"; gene_id "g3988";
69 | 3L	AUGUSTUS	start_codon	11364769	11364771	.	-	0	transcript_id "g3988.t1"; gene_id "g3988";
70 | 3R	AUGUSTUS	start_codon	12691822	12691824	.	+	0	transcript_id "g6660.t1"; gene_id "g6660";
71 | 3R	AUGUSTUS	CDS	12691822	12691869	1	+	0	transcript_id "g6660.t1"; gene_id "g6660";
72 | 3R	AUGUSTUS	exon	12691822	12691869	.	+	.	transcript_id "g6660.t1"; gene_id "g6660";
73 | 3R	AUGUSTUS	intron	12691870	12692642	1	+	.	transcript_id "g6660.t1"; gene_id "g6660";
74 | 3R	AUGUSTUS	CDS	12692643	12692707	1	+	0	transcript_id "g6660.t1"; gene_id "g6660";
75 | 3R	AUGUSTUS	exon	12692643	12692707	.	+	.	transcript_id "g6660.t1"; gene_id "g6660";
76 | 3R	AUGUSTUS	intron	12692708	12692769	1	+	.	transcript_id "g6660.t1"; gene_id "g6660";
77 | 3R	AUGUSTUS	CDS	12692770	12692944	1	+	1	transcript_id "g6660.t1"; gene_id "g6660";
78 | 3R	AUGUSTUS	exon	12692770	12692944	.	+	.	transcript_id "g6660.t1"; gene_id "g6660";
79 | 3R	AUGUSTUS	intron	12692945	12693003	1	+	.	transcript_id "g6660.t1"; gene_id "g6660";
80 | 3R	AUGUSTUS	CDS	12693004	12693155	1	+	0	transcript_id "g6660.t1"; gene_id "g6660";
81 | 3R	AUGUSTUS	exon	12693004	12693155	.	+	.	transcript_id "g6660.t1"; gene_id "g6660";
82 | 3R	AUGUSTUS	intron	12693156	12693214	1	+	.	transcript_id "g6660.t1"; gene_id "g6660";
83 | 3R	AUGUSTUS	CDS	12693215	12693761	1	+	1	transcript_id "g6660.t1"; gene_id "g6660";
84 | 3R	AUGUSTUS	exon	12693215	12693761	.	+	.	transcript_id "g6660.t1"; gene_id "g6660";
85 | 3R	AUGUSTUS	intron	12693762	12693829	1	+	.	transcript_id "g6660.t1"; gene_id "g6660";
86 | 3R	AUGUSTUS	CDS	12693830	12693973	1	+	0	transcript_id "g6660.t1"; gene_id "g6660";
87 | 3R	AUGUSTUS	gene	12691822	12693973	1	+	.	g6660
88 | 3R	AUGUSTUS	transcript	12691822	12693973	1	+	.	g6660.t1
89 | 3R	AUGUSTUS	exon	12693830	12693973	.	+	.	transcript_id "g6660.t1"; gene_id "g6660";
90 | 3R	AUGUSTUS	stop_codon	12693971	12693973	.	+	0	transcript_id "g6660.t1"; gene_id "g6660";
91 | 2R	AUGUSTUS	stop_codon	20354214	20354216	.	-	0	transcript_id "g11080.t1";
92 | 2R	AUGUSTUS	CDS	20354214	20355053	1	-	0	transcript_id "g11080.t1";
93 | 2R	AUGUSTUS	exon	20354214	20355053	.	-	.	transcript_id "g11080.t1";
94 | 2R	AUGUSTUS	start_codon	20355051	20355053	.	-	0	transcript_id "g11080.t1";
95 | 2R	AUGUSTUS	gene	20354214	20355053	1	-	.	g11080
96 | 2R	AUGUSTUS	transcript	20354214	20355053	1	-	.	g11080.t1


--------------------------------------------------------------------------------
/tests/genome_anno/format_error.gtf:
--------------------------------------------------------------------------------
 1 | 3L	GeneMark.hmm	stop_codon	18462228	18462230	.	-	0	gene_id "7789_g"; transcript_id "7789_t"; count "1_1";
 2 | 3L	GeneMark.hmm	CDS	18462228	18462540	.	-	1	gene_id "7789_g";
 3 | 3L	GeneMark.hmm	exon	18462228	18462540	0	-	.	gene_id "7789_g"; transcript_id "7789_t"; evidence "0_1"; cds_type "Terminal"; count "2_2";
 4 | 3L	GeneMark.hmm	CDS	18462719	18463068	.	-	0	gene_id "7789_g"; transcript_id "7789_t"; evidence "1_0"; cds_type "Initial"; count "1_2";
 5 | 3L	GeneMark.hmm	exon	18462719	18463068	0	-	.	gene_id "7789_g"; transcript_id "7789_t"; evidence "1_0"; cds_type "Initial"; count "1_2";
 6 | 3L	GeneMark.hmm	start_codon	18463066	18463068	.	-	0	gene_id "7789_g"; transcript_id "7789_t"; count "1_1";
 7 | 3R	AUGUSTUS	start_codon	7686444	7686446	.	+	0	transcript_id "g5980.t1"; gene_id "g5980";
 8 | 3R	AUGUSTUS	CDS	7686444	7686623	1	+	0	transcript_id "g5980.t1"; gene_id "g5980";
 9 | 3R	AUGUSTUS	exon	7686444	7686623	.	+	.	transcript_id "g5980.t1"; gene_id "g5980";
10 | 3R	AUGUSTUS	intron	7686624	7690691	1	+	.	transcript_id "g5980.t1"; gene_id "g5980";
11 | 3R	AUGUSTUS	CDS	7690692	7690843	1	+	0	transcript_id "g5980.t1"; gene_id "g5980";
12 | 3R	AUGUSTUS	exon	7690692	7690843	.	+	.	transcript_id "g5980.t1"; gene_id "g5980";
13 | 3R	AUGUSTUS	intron	7690844	7691514	1	+	.	transcript_id "g5980.t1"; gene_id "g5980";
14 | 3R	AUGUSTUS	CDS	7691515	7691630	1	+	1	transcript_id "g5980.t1"; gene_id "g5980";
15 | 3R	AUGUSTUS	exon	7691515	7691630	.	+	.	transcript_id "g5980.t1"; gene_id "g5980";
16 | 3R	AUGUSTUS	intron	7691631	7691712	1	+	.	transcript_id "g5980.t1"; gene_id "g5980";
17 | 3R	AUGUSTUS	CDS	7691713	7693700	1	+	2	transcript_id "g5980.t1"; gene_id "g5980";
18 | 3R	AUGUSTUS	gene	7686444	7693700	1	+	.	g5980
19 | 3R	AUGUSTUS	transcript	7686444	7693700	1	+	.	g5980.t1
20 | 3R	AUGUSTUS	exon	7691713	7693700	.	+	.	transcript_id "g5980.t1"; gene_id "g5980";
21 | 3R	AUGUSTUS	stop_codon	7693698	7693700	.	+	0	transcript_id "g5980.t1"; gene_id "g5980";
22 | X	AUGUSTUS	stop_codon	2065454	2065456	.	-	0	transcript_id "g12130.t1"; gene_id "g12130";
23 | X	AUGUSTUS	CDS	2065454	2065891	0.75	-	0	transcript_id "g12130.t1"; gene_id "g12130";
24 | X	AUGUSTUS	exon	2065454	2065891	.	-	.	transcript_id "g12130.t1"; gene_id "g12130";
25 | X	AUGUSTUS	intron	2065892	2065944	0.98	-	.	transcript_id "g12130.t1"; gene_id "g12130";
26 | X	AUGUSTUS	CDS	2065945	2066088	0.93	-	0	transcript_id "g12130.t1"; gene_id "g12130";
27 | X	AUGUSTUS	exon	2065945	2066088	.	-	.	transcript_id "g12130.t1"; gene_id "g12130";
28 | X	AUGUSTUS	intron	2066089	2066148	0.92	-	.	transcript_id "g12130.t1"; gene_id "g12130";
29 | X	AUGUSTUS	CDS	2066149	2066238	0.92	-	0	transcript_id "g12130.t1"; gene_id "g12130";
30 | X	AUGUSTUS	gene	2065454	2066238	0.7	-	.	g12130
31 | X	AUGUSTUS	transcript	2065454	2066238	0.7	-	.	g12130.t1
32 | X	AUGUSTUS	exon	2066149	2066238	.	-	.	transcript_id "g12130.t1"; gene_id "g12130";
33 | X	AUGUSTUS	start_codon	2066236	2066238	.	-	0	transcript_id "g12130.t1"; gene_id "g12130";
34 | 2R	AUGUSTUS	stop_codon	16433896	16433898	.	-	0	transcript_id "g10583.t1"; gene_id "g10583";
35 | 2R	AUGUSTUS	CDS	16433896	16435797	1	-	0	transcript_id "g10583.t1"; gene_id "g10583";
36 | 2R	AUGUSTUS	exon	16433896	16435797	.	-	.	transcript_id "g10583.t1"; gene_id "g10583";
37 | 2R	AUGUSTUS	start_codon	16435795	16435797	.	-	0	transcript_id "g10583.t1"; gene_id "g10583";
38 | 2R	AUGUSTUS	gene	16433896	16435797	1	-	.	g10583
39 | 2R	AUGUSTUS	transcript	16433896	16435797	1	-	.	g10583.t1
40 | 2R	AUGUSTUS	stop_codon	24640803	24640805	.	-	0	transcript_id "g11793.t1"; gene_id "g11793";
41 | 2R	AUGUSTUS	CDS	24640803	24642212	1	-	0	transcript_id "g11793.t1"; gene_id "g11793";
42 | 2R	AUGUSTUS	exon	24640803	24642212	.	-	.	transcript_id "g11793.t1"; gene_id "g11793";
43 | 2R	AUGUSTUS	start_codon	24642210	24642212	.	-	0	transcript_id "g11793.t1"; gene_id "g11793";
44 | 2R	AUGUSTUS	gene	24640803	24642212	1	-	.	g11793
45 | 2R	AUGUSTUS	transcript	24640803	24642212	1	-	.	g11793.t1
46 | 2L	AUGUSTUS	stop_codon	11989063	11989065	.	-	0	transcript_id "g1539.t1"; gene_id "g1539";
47 | 2L	AUGUSTUS	CDS	11989063	11989803	0.73	-	0	transcript_id "g1539.t1"; gene_id "g1539";
48 | 2L	AUGUSTUS	exon	11989063	11989803	.	-	.	transcript_id "g1539.t1"; gene_id "g1539";
49 | 2L	AUGUSTUS	start_codon	11989801	11989803	.	-	0	transcript_id "g1539.t1"; gene_id "g1539";
50 | 2L	AUGUSTUS	gene	11989063	11989803	0.73	-	.	g1539
51 | 2L	AUGUSTUS	transcript	11989063	11989803	0.73	-	.	g1539.t1
52 | 2L	AUGUSTUS	start_codon	4686242	4686244	.	+	0	transcript_id "g562.t1"; gene_id "g562";
53 | 2L	AUGUSTUS	CDS	4686242	4687105	1	+	0	transcript_id "g562.t1"; gene_id "g562";
54 | 2L	AUGUSTUS	exon	4686242	4687105	.	+	.	transcript_id "g562.t1"; gene_id "g562";
55 | 2L	AUGUSTUS	stop_codon	4687103	4687105	.	+	0	transcript_id "g562.t1"; gene_id "g562";
56 | 2L	AUGUSTUS	gene	4686242	4687105	1	+	.	g562
57 | 2L	AUGUSTUS	transcript	4686242	4687105	1	+	.	g562.t1
58 | 3L	AUGUSTUS	stop_codon	11362605	11362607	.	-	0	transcript_id "g3988.t1"; gene_id "g3988";
59 | 3L	AUGUSTUS	CDS	11362605	11363086	1	-	2	transcript_id "g3988.t1"; gene_id "g3988";
60 | 3L	AUGUSTUS	exon	11362605	11363086	.	-	.	transcript_id "g3988.t1"; gene_id "g3988";
61 | 3L	AUGUSTUS	intron	11363087	11363276	1	-	.	transcript_id "g3988.t1"; gene_id "g3988";
62 | 3L	AUGUSTUS	CDS	11363277	11363918	1	-	2	transcript_id "g3988.t1"; gene_id "g3988";
63 | 3L	AUGUSTUS	exon	11363277	11363918	.	-	.	transcript_id "g3988.t1"; gene_id "g3988";
64 | 3L	AUGUSTUS	intron	11363919	11364608	1	-	.	transcript_id "g3988.t1"; gene_id "g3988";
65 | 3L	AUGUSTUS	CDS	11364609	11364771	1	-	0	transcript_id "g3988.t1"; gene_id "g3988";
66 | 3L	AUGUSTUS	gene	11362605	11364771	1	-	.	g3988
67 | 3L	AUGUSTUS	transcript	11362605	11364771	1	-	.	g3988.t1
68 | 3L	AUGUSTUS	exon	11364609	11364771	.	-	.	transcript_id "g3988.t1"; gene_id "g3988";
69 | 3L	AUGUSTUS	start_codon	11364769	11364771	.	-	0	transcript_id "g3988.t1"; gene_id "g3988";
70 | 3R	AUGUSTUS	start_codon	12691822	12691824	.	+	0	transcript_id "g6660.t1"; gene_id "g6660";
71 | 3R	AUGUSTUS	CDS	12691822	12691869	1	+	0	transcript_id "g6660.t1"; gene_id "g6660";
72 | 3R	AUGUSTUS	exon	12691822	12691869	.	+	.	transcript_id "g6660.t1"; gene_id "g6660";
73 | 3R	AUGUSTUS	intron	12691870	12692642	1	+	.	transcript_id "g6660.t1"; gene_id "g6660";
74 | 3R	AUGUSTUS	CDS	12692643	12692707	1	+	0	transcript_id "g6660.t1"; gene_id "g6660";
75 | 3R	AUGUSTUS	exon	12692643	12692707	.	+	.	transcript_id "g6660.t1"; gene_id "g6660";
76 | 3R	AUGUSTUS	intron	12692708	12692769	1	+	.	transcript_id "g6660.t1"; gene_id "g6660";
77 | 3R	AUGUSTUS	CDS	12692770	12692944	1	+	1	transcript_id "g6660.t1"; gene_id "g6660";
78 | 3R	AUGUSTUS	exon	12692770	12692944	.	+	.	transcript_id "g6660.t1"; gene_id "g6660";
79 | 3R	AUGUSTUS	intron	12692945	12693003	1	+	.	transcript_id "g6660.t1"; gene_id "g6660";
80 | 3R	AUGUSTUS	CDS	12693004	12693155	1	+	0	transcript_id "g6660.t1"; gene_id "g6660";
81 | 3R	AUGUSTUS	exon	12693004	12693155	.	+	.	transcript_id "g6660.t1"; gene_id "g6660";
82 | 3R	AUGUSTUS	intron	12693156	12693214	1	+	.	transcript_id "g6660.t1"; gene_id "g6660";
83 | 3R	AUGUSTUS	CDS	12693215	12693761	1	+	1	transcript_id "g6660.t1"; gene_id "g6660";
84 | 3R	AUGUSTUS	exon	12693215	12693761	.	+	.	transcript_id "g6660.t1"; gene_id "g6660";
85 | 3R	AUGUSTUS	intron	12693762	12693829	1	+	.	transcript_id "g6660.t1"; gene_id "g6660";
86 | 3R	AUGUSTUS	CDS	12693830	12693973	1	+	0	transcript_id "g6660.t1"; gene_id "g6660";
87 | 3R	AUGUSTUS	gene	12691822	12693973	1	+	.	g6660
88 | 3R	AUGUSTUS	transcript	12691822	12693973	1	+	.	g6660.t1
89 | 3R	AUGUSTUS	exon	12693830	12693973	.	+	.	transcript_id "g6660.t1"; gene_id "g6660";
90 | 3R	AUGUSTUS	stop_codon	12693971	12693973	.	+	0	transcript_id "g6660.t1"; gene_id "g6660";
91 | 2R	AUGUSTUS	stop_codon	20354214	20354216	.	-	0	transcript_id "g11080.t1"; gene_id "g11080";
92 | 2R	AUGUSTUS	CDS	20354214	20355053	1	-	0	transcript_id "g11080.t1"; gene_id "g11080";
93 | 2R	AUGUSTUS	exon	20354214	20355053	.	-	.	transcript_id "g11080.t1"; gene_id "g11080";
94 | 2R	AUGUSTUS	start_codon	20355051	20355053	.	-	0	transcript_id "g11080.t1"; gene_id "g11080";
95 | 2R	AUGUSTUS	gene	20354214	20355053	1	-	.	g11080
96 | 2R	AUGUSTUS	transcript	20354214	20355053	1	-	.	g11080.t1


--------------------------------------------------------------------------------
/tests/genome_anno/anno1.gtf:
--------------------------------------------------------------------------------
 1 | 3L	GeneMark.hmm	stop_codon	18462228	18462230	.	-	0	gene_id "7789_g"; transcript_id "7789_t"; count "1_1";
 2 | 3L	GeneMark.hmm	CDS	18462228	18462540	.	-	1	gene_id "7789_g"; transcript_id "7789_t"; evidence "0_1"; cds_type "Terminal"; count "2_2";
 3 | 3L	GeneMark.hmm	exon	18462228	18462540	0	-	.	gene_id "7789_g"; transcript_id "7789_t"; evidence "0_1"; cds_type "Terminal"; count "2_2";
 4 | 3L	GeneMark.hmm	CDS	18462719	18463068	.	-	0	gene_id "7789_g"; transcript_id "7789_t"; evidence "1_0"; cds_type "Initial"; count "1_2";
 5 | 3L	GeneMark.hmm	exon	18462719	18463068	0	-	.	gene_id "7789_g"; transcript_id "7789_t"; evidence "1_0"; cds_type "Initial"; count "1_2";
 6 | 3L	GeneMark.hmm	start_codon	18463066	18463068	.	-	0	gene_id "7789_g"; transcript_id "7789_t"; count "1_1";
 7 | 3R	AUGUSTUS	start_codon	7686444	7686446	.	+	0	transcript_id "g5980.t1"; gene_id "g5980";
 8 | 3R	AUGUSTUS	CDS	7686444	7686623	1	+	0	transcript_id "g5980.t1"; gene_id "g5980";
 9 | 3R	AUGUSTUS	exon	7686444	7686623	.	+	.	transcript_id "g5980.t1"; gene_id "g5980";
10 | 3R	AUGUSTUS	intron	7686624	7690691	1	+	.	transcript_id "g5980.t1"; gene_id "g5980";
11 | 3R	AUGUSTUS	CDS	7690692	7690843	1	+	0	transcript_id "g5980.t1"; gene_id "g5980";
12 | 3R	AUGUSTUS	exon	7690692	7690843	.	+	.	transcript_id "g5980.t1"; gene_id "g5980";
13 | 3R	AUGUSTUS	intron	7690844	7691514	1	+	.	transcript_id "g5980.t1"; gene_id "g5980";
14 | 3R	AUGUSTUS	CDS	7691515	7691630	1	+	1	transcript_id "g5980.t1"; gene_id "g5980";
15 | 3R	AUGUSTUS	exon	7691515	7691630	.	+	.	transcript_id "g5980.t1"; gene_id "g5980";
16 | 3R	AUGUSTUS	intron	7691631	7691712	1	+	.	transcript_id "g5980.t1"; gene_id "g5980";
17 | 3R	AUGUSTUS	CDS	7691713	7693700	1	+	2	transcript_id "g5980.t1"; gene_id "g5980";
18 | 3R	AUGUSTUS	gene	7686444	7693700	1	+	.	g5980
19 | 3R	AUGUSTUS	transcript	7686444	7693700	1	+	.	g5980.t1
20 | 3R	AUGUSTUS	exon	7691713	7693700	.	+	.	transcript_id "g5980.t1"; gene_id "g5980";
21 | 3R	AUGUSTUS	stop_codon	7693698	7693700	.	+	0	transcript_id "g5980.t1"; gene_id "g5980";
22 | X	AUGUSTUS	stop_codon	2065454	2065456	.	-	0	transcript_id "g12130.t1"; gene_id "g12130";
23 | X	AUGUSTUS	CDS	2065454	2065891	0.75	-	0	transcript_id "g12130.t1"; gene_id "g12130";
24 | X	AUGUSTUS	exon	2065454	2065891	.	-	.	transcript_id "g12130.t1"; gene_id "g12130";
25 | X	AUGUSTUS	intron	2065892	2065944	0.98	-	.	transcript_id "g12130.t1"; gene_id "g12130";
26 | X	AUGUSTUS	CDS	2065945	2066088	0.93	-	0	transcript_id "g12130.t1"; gene_id "g12130";
27 | X	AUGUSTUS	exon	2065945	2066088	.	-	.	transcript_id "g12130.t1"; gene_id "g12130";
28 | X	AUGUSTUS	intron	2066089	2066148	0.92	-	.	transcript_id "g12130.t1"; gene_id "g12130";
29 | X	AUGUSTUS	CDS	2066149	2066238	0.92	-	0	transcript_id "g12130.t1"; gene_id "g12130";
30 | X	AUGUSTUS	gene	2065454	2066238	0.7	-	.	g12130
31 | X	AUGUSTUS	transcript	2065454	2066238	0.7	-	.	g12130.t1
32 | X	AUGUSTUS	exon	2066149	2066238	.	-	.	transcript_id "g12130.t1"; gene_id "g12130";
33 | X	AUGUSTUS	start_codon	2066236	2066238	.	-	0	transcript_id "g12130.t1"; gene_id "g12130";
34 | 2R	AUGUSTUS	stop_codon	16433896	16433898	.	-	0	transcript_id "g10583.t1"; gene_id "g10583";
35 | 2R	AUGUSTUS	CDS	16433896	16435797	1	-	0	transcript_id "g10583.t1"; gene_id "g10583";
36 | 2R	AUGUSTUS	exon	16433896	16435797	.	-	.	transcript_id "g10583.t1"; gene_id "g10583";
37 | 2R	AUGUSTUS	start_codon	16435795	16435797	.	-	0	transcript_id "g10583.t1"; gene_id "g10583";
38 | 2R	AUGUSTUS	gene	16433896	16435797	1	-	.	g10583
39 | 2R	AUGUSTUS	transcript	16433896	16435797	1	-	.	g10583.t1
40 | 2R	AUGUSTUS	stop_codon	24640803	24640805	.	-	0	transcript_id "g11793.t1"; gene_id "g11793";
41 | 2R	AUGUSTUS	CDS	24640803	24642212	1	-	0	transcript_id "g11793.t1"; gene_id "g11793";
42 | 2R	AUGUSTUS	exon	24640803	24642212	.	-	.	transcript_id "g11793.t1"; gene_id "g11793";
43 | 2R	AUGUSTUS	start_codon	24642210	24642212	.	-	0	transcript_id "g11793.t1"; gene_id "g11793";
44 | 2R	AUGUSTUS	gene	24640803	24642212	1	-	.	g11793
45 | 2R	AUGUSTUS	transcript	24640803	24642212	1	-	.	g11793.t1
46 | 2L	AUGUSTUS	stop_codon	11989063	11989065	.	-	0	transcript_id "g1539.t1"; gene_id "g1539";
47 | 2L	AUGUSTUS	CDS	11989063	11989803	0.73	-	0	transcript_id "g1539.t1"; gene_id "g1539";
48 | 2L	AUGUSTUS	exon	11989063	11989803	.	-	.	transcript_id "g1539.t1"; gene_id "g1539";
49 | 2L	AUGUSTUS	start_codon	11989801	11989803	.	-	0	transcript_id "g1539.t1"; gene_id "g1539";
50 | 2L	AUGUSTUS	gene	11989063	11989803	0.73	-	.	g1539
51 | 2L	AUGUSTUS	transcript	11989063	11989803	0.73	-	.	g1539.t1
52 | 2L	AUGUSTUS	start_codon	4686242	4686244	.	+	0	transcript_id "g562.t1"; gene_id "g562";
53 | 2L	AUGUSTUS	CDS	4686242	4687105	1	+	0	transcript_id "g562.t1"; gene_id "g562";
54 | 2L	AUGUSTUS	exon	4686242	4687105	.	+	.	transcript_id "g562.t1"; gene_id "g562";
55 | 2L	AUGUSTUS	stop_codon	4687103	4687105	.	+	0	transcript_id "g562.t1"; gene_id "g562";
56 | 2L	AUGUSTUS	gene	4686242	4687105	1	+	.	g562
57 | 2L	AUGUSTUS	transcript	4686242	4687105	1	+	.	g562.t1
58 | 3L	AUGUSTUS	stop_codon	11362605	11362607	.	-	0	transcript_id "g3988.t1"; gene_id "g3988";
59 | 3L	AUGUSTUS	CDS	11362605	11363086	1	-	2	transcript_id "g3988.t1"; gene_id "g3988";
60 | 3L	AUGUSTUS	exon	11362605	11363086	.	-	.	transcript_id "g3988.t1"; gene_id "g3988";
61 | 3L	AUGUSTUS	intron	11363087	11363276	1	-	.	transcript_id "g3988.t1"; gene_id "g3988";
62 | 3L	AUGUSTUS	CDS	11363277	11363918	1	-	2	transcript_id "g3988.t1"; gene_id "g3988";
63 | 3L	AUGUSTUS	exon	11363277	11363918	.	-	.	transcript_id "g3988.t1"; gene_id "g3988";
64 | 3L	AUGUSTUS	intron	11363919	11364608	1	-	.	transcript_id "g3988.t1"; gene_id "g3988";
65 | 3L	AUGUSTUS	CDS	11364609	11364771	1	-	0	transcript_id "g3988.t1"; gene_id "g3988";
66 | 3L	AUGUSTUS	gene	11362605	11364771	1	-	.	g3988
67 | 3L	AUGUSTUS	transcript	11362605	11364771	1	-	.	g3988.t1
68 | 3L	AUGUSTUS	exon	11364609	11364771	.	-	.	transcript_id "g3988.t1"; gene_id "g3988";
69 | 3L	AUGUSTUS	start_codon	11364769	11364771	.	-	0	transcript_id "g3988.t1"; gene_id "g3988";
70 | 3R	AUGUSTUS	start_codon	12691822	12691824	.	+	0	transcript_id "g6660.t1"; gene_id "g6660";
71 | 3R	AUGUSTUS	CDS	12691822	12691869	1	+	0	transcript_id "g6660.t1"; gene_id "g6660";
72 | 3R	AUGUSTUS	exon	12691822	12691869	.	+	.	transcript_id "g6660.t1"; gene_id "g6660";
73 | 3R	AUGUSTUS	intron	12691870	12692642	1	+	.	transcript_id "g6660.t1"; gene_id "g6660";
74 | 3R	AUGUSTUS	CDS	12692643	12692707	1	+	0	transcript_id "g6660.t1"; gene_id "g6660";
75 | 3R	AUGUSTUS	exon	12692643	12692707	.	+	.	transcript_id "g6660.t1"; gene_id "g6660";
76 | 3R	AUGUSTUS	intron	12692708	12692769	1	+	.	transcript_id "g6660.t1"; gene_id "g6660";
77 | 3R	AUGUSTUS	CDS	12692770	12692944	1	+	1	transcript_id "g6660.t1"; gene_id "g6660";
78 | 3R	AUGUSTUS	exon	12692770	12692944	.	+	.	transcript_id "g6660.t1"; gene_id "g6660";
79 | 3R	AUGUSTUS	intron	12692945	12693003	1	+	.	transcript_id "g6660.t1"; gene_id "g6660";
80 | 3R	AUGUSTUS	CDS	12693004	12693155	1	+	0	transcript_id "g6660.t1"; gene_id "g6660";
81 | 3R	AUGUSTUS	exon	12693004	12693155	.	+	.	transcript_id "g6660.t1"; gene_id "g6660";
82 | 3R	AUGUSTUS	intron	12693156	12693214	1	+	.	transcript_id "g6660.t1"; gene_id "g6660";
83 | 3R	AUGUSTUS	CDS	12693215	12693761	1	+	1	transcript_id "g6660.t1"; gene_id "g6660";
84 | 3R	AUGUSTUS	exon	12693215	12693761	.	+	.	transcript_id "g6660.t1"; gene_id "g6660";
85 | 3R	AUGUSTUS	intron	12693762	12693829	1	+	.	transcript_id "g6660.t1"; gene_id "g6660";
86 | 3R	AUGUSTUS	CDS	12693830	12693973	1	+	0	transcript_id "g6660.t1"; gene_id "g6660";
87 | 3R	AUGUSTUS	gene	12691822	12693973	1	+	.	g6660
88 | 3R	AUGUSTUS	transcript	12691822	12693973	1	+	.	g6660.t1
89 | 3R	AUGUSTUS	exon	12693830	12693973	.	+	.	transcript_id "g6660.t1"; gene_id "g6660";
90 | 3R	AUGUSTUS	stop_codon	12693971	12693973	.	+	0	transcript_id "g6660.t1"; gene_id "g6660";
91 | 2R	AUGUSTUS	stop_codon	20354214	20354216	.	-	0	transcript_id "g11080.t1"; gene_id "g11080";
92 | 2R	AUGUSTUS	CDS	20354214	20355053	1	-	0	transcript_id "g11080.t1"; gene_id "g11080";
93 | 2R	AUGUSTUS	exon	20354214	20355053	.	-	.	transcript_id "g11080.t1"; gene_id "g11080";
94 | 2R	AUGUSTUS	start_codon	20355051	20355053	.	-	0	transcript_id "g11080.t1"; gene_id "g11080";
95 | 2R	AUGUSTUS	gene	20354214	20355053	1	-	.	g11080
96 | 2R	AUGUSTUS	transcript	20354214	20355053	1	-	.	g11080.t1
97 | 


--------------------------------------------------------------------------------
/bin/get_overlapping_genes.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | # ==============================================================
  3 | # author: Lars Gabriel
  4 | #
  5 | # TSEBRA: Transcript Selector for BRAKER
  6 | # ==============================================================
  7 | import argparse
  8 | import sys
  9 | import os
 10 | import csv
 11 | 
 12 | class ConfigFileError(Exception):
 13 |     pass
 14 | 
 15 | class GeneSetMissing(Exception):
 16 |     pass
 17 | 
 18 | gtf = []
 19 | enforce_tx = []
 20 | anno = []
 21 | hintfiles = []
 22 | graph = None
 23 | out = ''
 24 | v = 0
 25 | quiet = False
 26 | parameter = {'intron_support' : 0, 'stasto_support' : 0, \
 27 |     'e_1' : 0, 'e_2' : 0, 'e_3' : 0, 'e_4' : 0}
 28 | cfg_file = os.path.dirname(os.path.realpath(__file__)) + '/../config/braker3.cfg'
 29 | def main():
 30 |     """
 31 |         Overview:
 32 | 
 33 |         1. Read gene predicitions from .gtf files.
 34 |         2. Read Evidence from .gff files.
 35 |         3. Detect overlapping transcripts.
 36 |         4. Create feature vector (for a list of all features see features.py)
 37 |            for all transcripts.
 38 |         5. Compare the feature vectors of all pairs of overlapping transcripts.
 39 |         6. Exclude transcripts based on the 'transcript comparison rule' and 5.
 40 |         7. Remove Transcripts with low evidence support.
 41 |         8. Create combined gene predicitions (all transcripts that weren't excluded).
 42 |     """
 43 | 
 44 |     from genome_anno import Anno
 45 |     from overlap_graph import Graph
 46 |     from evidence import Evidence
 47 | 
 48 |     global anno, graph, parameter
 49 | 
 50 |     args = parseCmd()
 51 | #     init(args)
 52 |     set_parameter(cfg_file)
 53 |     if v > 0:
 54 |         print(gtf)
 55 |     tx_keys = []
 56 |     # read gene prediciton files
 57 |     c = 1
 58 |     keep = []
 59 |     for g in [args.geneset1, args.geneset2]:
 60 |         tx_keys.append([])
 61 |         if not quiet:
 62 |             sys.stderr.write(f'### READING GENE PREDICTION: [{g}]\n')
 63 |         anno.append(Anno(g, f'anno{c}'))
 64 |         anno[-1].addGtf()
 65 |         anno[-1].norm_tx_format()
 66 |         keep.append(f'anno{c}')
 67 |         for tx in anno[-1].transcripts.values():
 68 |             cds = tx.get_type_coords('CDS', False)
 69 |             key = ['_'.join(list(map(str, c_1))) for c_1 in cds]
 70 |             tx_keys[-1].append(key)
 71 |         c+=1
 72 |         
 73 |     
 74 | 
 75 |     # read hintfiles
 76 |     evi = Evidence()
 77 | 
 78 |     # create graph with an edge for each unique transcript
 79 |     # and an edge if two transcripts overlap
 80 |     # two transcripts overlap if they share at least 3 adjacent protein coding nucleotides
 81 |     graph = Graph(anno, para=parameter, keep_tx=keep, verbose=v)
 82 |     if not quiet:
 83 |         sys.stderr.write('### BUILD OVERLAP GRAPH\n')
 84 |     graph.build()
 85 | 
 86 |     graph.add_node_features(evi)
 87 |     # apply decision rule to exclude a set of transcripts
 88 |     if not quiet:
 89 |         sys.stderr.write('### SELECT TRANSCRIPTS\n')
 90 |     combined_prediction = graph.get_decided_graph()
 91 | 
 92 |     if v > 0:
 93 |         sys.stderr.write(str(combined_prediction.keys()) + '\n')
 94 |         for a in anno:
 95 |             sys.stderr.write('Numb_tx in {}: {}\n'.format(a.id, len(combined_prediction[a.id])))
 96 | 
 97 |     # write result to output file
 98 |     if not quiet:
 99 |         sys.stderr.write('### WRITE COMBINED GENE PREDICTION\n')
100 |     combined_anno = Anno('', 'combined_annotation')
101 |     for a in anno:
102 |         txs = a.get_subset([t[0] for t in combined_prediction[a.id]])
103 |         for id, new_gene_id in combined_prediction[a.id]:
104 |             txs[id].set_gene_id(new_gene_id)
105 |         combined_anno.add_transcripts(txs, a.id + '.')
106 |     combined_anno.find_genes()
107 |     
108 |     out_only_g1 = []
109 |     out_only_g2 = []
110 |     out_overlap_g1 = []
111 |     out_overlap_g2 = []
112 |     
113 |     gene_gtf = sorted(combined_anno.gene_gtf.values(), key=lambda g: (g[0],g[3],g[4]))
114 |     for gene in gene_gtf:
115 |         gtf_gene = [[],[]]
116 |         current_anno_sources = set([])
117 | #         gtf_gene.append(gene)
118 |         for tx_id in combined_anno.genes[gene[8]]:
119 |             n_id = f'{combined_anno.transcripts[tx_id].source_anno};{".".join(tx_id.split(".")[1:])}'
120 | #             gtf_gene += combined_anno.transcripts[tx_id].get_gtf()
121 | #             current_anno_sources = current_anno_sources.union(graph.nodes[n_id].gene_sets)
122 |             cds = combined_anno.transcripts[tx_id].get_type_coords('CDS', False)
123 |             key = ['_'.join(list(map(str, c_1))) for c_1 in cds]
124 |         
125 |             for i, k in enumerate(tx_keys):
126 |                 if key in k:
127 |                     gtf_gene[i].append(gene)
128 |                     gtf_gene[i] += combined_anno.transcripts[tx_id].get_gtf()                    
129 |             
130 | #         print(current_anno_sources)
131 | #         print(gtf_gene)
132 |         if gtf_gene[0] and gtf_gene[1]:
133 |             print(current_anno_sources, 'A')
134 |             out_overlap_g1 += gtf_gene[0]
135 |             out_overlap_g2 += gtf_gene[1]
136 |         elif gtf_gene[0]:
137 |             out_only_g1 += gtf_gene[0]
138 |         elif gtf_gene[1]:
139 |             out_only_g2 += gtf_gene[1]
140 |         else:
141 |             print(current_anno_sources)
142 |     
143 |     
144 |     for i,j in zip([out_only_g1,out_only_g2,out_overlap_g1,out_overlap_g2], 
145 |                    [f'{args.out}_only_g1', f'{args.out}_only_g2', f'{args.out}_overlap_g1',f'{args.out}_overlap_g2']):
146 |         with open(j, 'w+') as file:
147 |             out_writer = csv.writer(file, delimiter='\t', quotechar = "|", lineterminator = '\n')
148 |             for line in i:
149 |                 out_writer.writerow(line)
150 |     
151 | 
def set_parameter(cfg_file):
    """
        Read parameters from the cfg file and store them in parameter.

        Each non-comment line is expected to hold a parameter name and a
        numeric value separated by spaces. Blank lines and lines whose first
        non-empty field starts with '#' are skipped. Existing entries of
        parameter are overwritten, unknown names are added.

        Args:
            cfg_file (str): Path to configuration file.

        Raises:
            IndexError: If a line contains a name but no value.
            ValueError: If a value cannot be converted to float.
    """
    global parameter
    with open(cfg_file, 'r') as file:
        for line in csv.reader(file, delimiter=' '):
            # drop empty fields caused by blank lines or repeated spaces
            fields = [field for field in line if field]
            if not fields or fields[0].startswith('#'):
                continue
            parameter[fields[0]] = float(fields[1])
167 | 
def init(args):
    """
        Copy command line arguments into the module-level settings and load
        the configuration file.

        The call to this function in main() is commented out, and the
        parser built by parseCmd() does not define gtf, keep_gtf, hintfiles
        or cfg, so this function expects a different argument set.

        Args:
            args: Parsed arguments providing the attributes gtf, keep_gtf,
                  hintfiles, cfg, out, verbose and quiet.

        Raises:
            GeneSetMissing: If neither args.gtf nor args.keep_gtf is set.
    """
    global gtf, hintfiles, out, enforce_tx, v, quiet
    if args.gtf:
        gtf = args.gtf.split(',')
    if args.keep_gtf:
        enforce_tx = args.keep_gtf.split(',')
    if not args.keep_gtf and not args.gtf:
        raise GeneSetMissing('At least one gene set has to be provided '
            + 'either with --gtf or --keep_gtf!')
    if args.hintfiles:
        hintfiles = args.hintfiles.split(',')
    # cfg_file is local here; the module-level cfg_file is left untouched
    if args.cfg:
        cfg_file = args.cfg
    else:
        cfg_file = os.path.dirname(os.path.realpath(__file__)) + '/../config/braker3.cfg'
    set_parameter(cfg_file)
    if args.out:
        out = args.out
    if args.verbose:
        v = args.verbose
    if args.quiet:
        quiet = True
190 | 
def parseCmd():
    """Parse command line arguments

    Returns:
        argparse.Namespace: Parsed arguments with the attributes
            geneset1, geneset2, out, quiet and verbose.
    """
    parser = argparse.ArgumentParser(
        description='Input: Two gtf files; Output: 4 GTF files with '
                    'overlapping/not overlapping genes.')
    # both gene sets are used unconditionally by main(), so require them here
    parser.add_argument('-g1', '--geneset1', type=str, required=True,
        help='First gene set in GTF format.')
    parser.add_argument('-g2', '--geneset2', type=str, required=True,
        help='Second gene set in GTF format.')
    parser.add_argument('-o', '--out', type=str, required=True,
        help='Prefix of the output files <out>_only_g1, <out>_only_g2, '
             '<out>_overlap_g1 and <out>_overlap_g2.')
    parser.add_argument('-q', '--quiet', action='store_true',
        help='Quiet mode.')
    parser.add_argument('-v', '--verbose', type=int,
        help='Verbosity level (integer).')
    return parser.parse_args()
209 | 
# Entry point: run main() only when this file is executed as a script.
if __name__ == '__main__':
    main()
212 | 


--------------------------------------------------------------------------------
/bin/LICENSE.txt:
--------------------------------------------------------------------------------
  1 | The Artistic License 2.0
  2 | 
  3 | Copyright (c) 2000-2006, The Perl Foundation.
  4 | 
  5 | Everyone is permitted to copy and distribute verbatim copies of this license
  6 | document, but changing it is not allowed.
  7 | 
  8 | Preamble
  9 | 
 10 | This license establishes the terms under which a given free software Package
 11 | may be copied, modified, distributed, and/or redistributed. The intent is that
 12 | the Copyright Holder maintains some artistic control over the development of
 13 | that Package while still keeping the Package available as open source and free
 14 | software.
 15 | 
 16 | You are always permitted to make arrangements wholly outside of this license
 17 | directly with the Copyright Holder of a given Package. If the terms of this
 18 | license do not permit the full use that you propose to make of the Package,
 19 | you should contact the Copyright Holder and seek a different licensing
 20 | arrangement.
 21 | 
 22 | Definitions
 23 | 
 24 | "Copyright Holder" means the individual(s) or organization(s) named in the
 25 | copyright notice for the entire Package.
 26 | 
 27 | "Contributor" means any party that has contributed code or other material to
 28 | the Package, in accordance with the Copyright Holder's procedures.
 29 | 
 30 | "You" and "your" means any person who would like to copy, distribute, or
 31 | modify the Package.
 32 | 
 33 | "Package" means the collection of files distributed by the Copyright Holder,
 34 | and derivatives of that collection and/or of those files. A given Package may
 35 | consist of either the Standard Version, or a Modified Version.
 36 | 
 37 | "Distribute" means providing a copy of the Package or making it accessible to
 38 | anyone else, or in the case of a company or organization, to others outside of
 39 | your company or organization.
 40 | 
 41 | "Distributor Fee" means any fee that you charge for Distributing this Package
 42 | or providing support for this Package to another party. It does not mean
 43 | licensing fees.
 44 | 
 45 | "Standard Version" refers to the Package if it has not been modified, or has
 46 | been modified only in ways explicitly requested by the Copyright Holder.
 47 | 
 48 | "Modified Version" means the Package, if it has been changed, and such changes
 49 | were not explicitly requested by the Copyright Holder.
 50 | 
 51 | "Original License" means this Artistic License as Distributed with the
 52 | Standard Version of the Package, in its current version or as it may be
 53 | modified by The Perl Foundation in the future.
 54 | 
 55 | "Source" form means the source code, documentation source, and configuration
 56 | files for the Package.
 57 | 
 58 | "Compiled" form means the compiled bytecode, object code, binary, or any other
 59 | form resulting from mechanical transformation or translation of the Source
 60 | form.
 61 | 
 62 | Permission for Use and Modification Without Distribution
 63 | 
 64 | (1) You are permitted to use the Standard Version and create and use Modified
 65 | Versions for any purpose without restriction, provided that you do not
 66 | Distribute the Modified Version.
 67 | 
 68 | Permissions for Redistribution of the Standard Version
 69 | 
 70 | (2) You may Distribute verbatim copies of the Source form of the Standard
 71 | Version of this Package in any medium without restriction, either gratis or
 72 | for a Distributor Fee, provided that you duplicate all of the original
 73 | copyright notices and associated disclaimers. At your discretion, such
 74 | verbatim copies may or may not include a Compiled form of the Package.
 75 | 
 76 | (3) You may apply any bug fixes, portability changes, and other modifications
 77 | made available from the Copyright Holder. The resulting Package will still be
 78 | considered the Standard Version, and as such will be subject to the Original
 79 | License.
 80 | 
 81 | Distribution of Modified Versions of the Package as Source
 82 | 
 83 | (4) You may Distribute your Modified Version as Source (either gratis or for a
 84 | Distributor Fee, and with or without a Compiled form of the Modified Version)
 85 | provided that you clearly document how it differs from the Standard Version,
 86 | including, but not limited to, documenting any non-standard features,
 87 | executables, or modules, and provided that you do at least ONE of the
 88 | following:
 89 | 
 90 | (a) make the Modified Version available to the Copyright Holder of the
 91 | Standard Version, under the Original License, so that the Copyright Holder may
 92 | include your modifications in the Standard Version.
 93 | 
 94 | (b) ensure that installation of your Modified Version does not prevent the
 95 | user installing or running the Standard Version. In addition, the Modified
 96 | Version must bear a name that is different from the name of the Standard
 97 | Version.
 98 | 
 99 | (c) allow anyone who receives a copy of the Modified Version to make the
100 | Source form of the Modified Version available to others under
101 | 
102 | (i) the Original License or
103 | 
104 | (ii) a license that permits the licensee to freely copy, modify and
105 | redistribute the Modified Version using the same licensing terms that apply to
106 | the copy that the licensee received, and requires that the Source form of the
107 | Modified Version, and of any works derived from it, be made freely available
108 | in that license fees are prohibited but Distributor Fees are allowed.
109 | 
110 | Distribution of Compiled Forms of the Standard Version or Modified Versions
111 | without the Source
112 | 
113 | (5) You may Distribute Compiled forms of the Standard Version without the
114 | Source, provided that you include complete instructions on how to get the
115 | Source of the Standard Version. Such instructions must be valid at the time of
116 | your distribution. If these instructions, at any time while you are carrying
117 | out such distribution, become invalid, you must provide new instructions on
118 | demand or cease further distribution. If you provide valid instructions or
119 | cease distribution within thirty days after you become aware that the
120 | instructions are invalid, then you do not forfeit any of your rights under
121 | this license.
122 | 
123 | (6) You may Distribute a Modified Version in Compiled form without the Source,
124 | provided that you comply with Section 4 with respect to the Source of the
125 | Modified Version.
126 | 
127 | Aggregating or Linking the Package
128 | 
129 | (7) You may aggregate the Package (either the Standard Version or Modified
130 | Version) with other packages and Distribute the resulting aggregation provided
131 | that you do not charge a licensing fee for the Package. Distributor Fees are
132 | permitted, and licensing fees for other components in the aggregation are
133 | permitted. The terms of this license apply to the use and Distribution of the
134 | Standard or Modified Versions as included in the aggregation.
135 | 
136 | (8) You are permitted to link Modified and Standard Versions with other works,
137 | to embed the Package in a larger work of your own, or to build stand-alone
138 | binary or bytecode versions of applications that include the Package, and
139 | Distribute the result without restriction, provided the result does not expose
140 | a direct interface to the Package.
141 | 
142 | Items That are Not Considered Part of a Modified Version
143 | 
144 | (9) Works (including, but not limited to, modules and scripts) that merely
145 | extend or make use of the Package, do not, by themselves, cause the Package to
146 | be a Modified Version. In addition, such works are not considered parts of the
147 | Package itself, and are not subject to the terms of this license.
148 | 
149 | General Provisions
150 | 
151 | (10) Any use, modification, and distribution of the Standard or Modified
152 | Versions is governed by this Artistic License. By using, modifying or
153 | distributing the Package, you accept this license. Do not use, modify, or
154 | distribute the Package, if you do not accept this license.
155 | 
156 | (11) If your Modified Version has been derived from a Modified Version made by
157 | someone other than you, you are nevertheless required to ensure that your
158 | Modified Version complies with the requirements of this license.
159 | 
160 | (12) This license does not grant you the right to use any trademark, service
161 | mark, tradename, or logo of the Copyright Holder.
162 | 
163 | (13) This license includes the non-exclusive, worldwide, free-of-charge patent
164 | license to make, have made, use, offer to sell, sell, import and otherwise
165 | transfer the Package with respect to any patent claims licensable by the
166 | Copyright Holder that are necessarily infringed by the Package. If you
167 | institute patent litigation (including a cross-claim or counterclaim) against
168 | any party alleging that the Package constitutes direct or contributory patent
169 | infringement, then this Artistic License to you shall terminate on the date
170 | that such litigation is filed.
171 | 
172 | (14) Disclaimer of Warranty:
173 | 
174 | THE PACKAGE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS "AS IS'
175 | AND WITHOUT ANY EXPRESS OR IMPLIED WARRANTIES. THE IMPLIED WARRANTIES OF
176 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT ARE
177 | DISCLAIMED TO THE EXTENT PERMITTED BY YOUR LOCAL LAW. UNLESS REQUIRED BY LAW,
178 | NO COPYRIGHT HOLDER OR CONTRIBUTOR WILL BE LIABLE FOR ANY DIRECT, INDIRECT,
179 | INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING IN ANY WAY OUT OF THE USE OF THE
180 | PACKAGE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
181 | 


--------------------------------------------------------------------------------
/bin/tsebra.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | # ==============================================================
  3 | # author: Lars Gabriel
  4 | #
  5 | # TSEBRA: Transcript Selector for BRAKER
  6 | # ==============================================================
  7 | import argparse
  8 | import sys
  9 | import os
 10 | import csv
 11 | 
 12 | class ConfigFileError(Exception):
 13 |     pass
 14 | 
 15 | class GeneSetMissing(Exception):
 16 |     pass
 17 | 
 18 | gtf = []
 19 | enforce_tx = []
 20 | anno = []
 21 | hintfiles = []
 22 | graph = None
 23 | out = ''
 24 | v = 0
 25 | quiet = False
 26 | filter_sing_exon = False
 27 | ignore_tx_phase = False
 28 | scores_tab = ''
 29 | parameter = {'intron_support' : 0, 'stasto_support' : 0, \
 30 |     'e_1' : 0, 'e_2' : 0, 'e_3' : 0, 'e_4' : 0}
 31 | 
 32 | def main():
 33 |     """
 34 |         Overview:
 35 | 
 36 |         1. Read gene predicitions from .gtf files.
 37 |         2. Read Evidence from .gff files.
 38 |         3. Detect overlapping transcripts.
 39 |         4. Create feature vector (for a list of all features see features.py)
 40 |            for all transcripts.
 41 |         5. Compare the feature vectors of all pairs of overlapping transcripts.
 42 |         6. Exclude transcripts based on the 'transcript comparison rule' and 5.
 43 |         7. Remove Transcripts with low evidence support.
 44 |         8. Create combined gene predicitions (all transcripts that weren't excluded).
 45 |     """
 46 | 
 47 |     from genome_anno import Anno
 48 |     from overlap_graph import Graph
 49 |     from evidence import Evidence
 50 | 
 51 |     global anno, graph, parameter
 52 | 
 53 |     args = parseCmd()
 54 |     init(args)
 55 | 
 56 |     if v > 0:
 57 |         print(gtf)
 58 | 
 59 |     # read gene prediciton files
 60 |     c = 1
 61 |     keep = []
 62 |     
 63 |     for g in gtf:
 64 |         if not quiet:
 65 |             sys.stderr.write(f'### READING GENE PREDICTION: [{g}]\n')
 66 |         anno.append(Anno(g, f'anno{c}'))
 67 |         anno[-1].addGtf()
 68 |         anno[-1].norm_tx_format()
 69 |         c += 1
 70 |     for g in enforce_tx:
 71 |         if not quiet:
 72 |             sys.stderr.write(f'### READING GENE PREDICTION: [{g}]\n')
 73 |         anno.append(Anno(g, f'anno{c}'))
 74 |         anno[-1].addGtf()
 75 |         anno[-1].norm_tx_format()
 76 |         keep.append(f'anno{c}')
 77 |         c += 1
 78 |     
 79 |     # read hintfiles
 80 |     evi = Evidence()
 81 |     for h in hintfiles:
 82 |         if not quiet:
 83 |             sys.stderr.write(f'### READING EXTRINSIC EVIDENCE: [{h}]\n')
 84 |         evi.add_hintfile(h)
 85 |     for src in evi.src:
 86 |         if src not in parameter.keys():
 87 |             sys.stderr.write(f'ConfigError: No weight for src={src}, it is set to 1\n')
 88 |             parameter.update({src : 1})
 89 | 
 90 |     # create graph with an edge for each unique transcript
 91 |     # and an edge if two transcripts overlap
 92 |     # two transcripts overlap if they share at least 3 adjacent protein coding nucleotides
 93 |     
 94 |     graph = Graph(anno, para=parameter, keep_tx=keep, filter_single=filter_sing_exon, ignore_phase=ignore_tx_phase, verbose=v)
 95 |     if not quiet:
 96 |         sys.stderr.write('### BUILD OVERLAP GRAPH\n')
 97 |     graph.build()
 98 | 
 99 |     # add features
100 |     if not quiet:
101 |         sys.stderr.write('### ADD FEATURES TO TRANSCRIPTS\n')
102 |     graph.add_node_features(evi)
103 | 
104 |     # apply decision rule to exclude a set of transcripts
105 |     if not quiet:
106 |         sys.stderr.write('### SELECT TRANSCRIPTS\n')
107 |     combined_prediction = graph.get_decided_graph()
108 | 
109 |     if v > 0:
110 |         sys.stderr.write(str(combined_prediction.keys()) + '\n')
111 |         for a in anno:
112 |             sys.stderr.write('Numb_tx in {}: {}\n'.format(a.id, len(combined_prediction[a.id])))
113 | 
114 |     # write result to output file
115 |     if not quiet:
116 |         sys.stderr.write('### WRITE COMBINED GENE PREDICTION\n')
117 |     combined_anno = Anno('', 'combined_annotation')
118 |     for a in anno:
119 |         txs = a.get_subset([t[0] for t in combined_prediction[a.id]])
120 |         for id, new_gene_id in combined_prediction[a.id]:
121 |             txs[id].set_gene_id(new_gene_id)
122 |         combined_anno.add_transcripts(txs, a.id + '.')
123 |     combined_anno.find_genes()
124 |     combined_anno.write_anno(out)
125 | 
126 |     if scores_tab:
127 |         if not quiet:
128 |             sys.stderr.write('### WRITE TRANSCRIPT SCORES\n')
129 |         tab_out = [['### TX_ID','intron_support', 'stasto_support', 's1', 's2', 's3', 's4']]
130 |         for node in graph.nodes.values():
131 |             tab_out += [[node.id] + list(node.feature_vector)]
132 |         write_csv(scores_tab, tab_out)
133 | 
134 |     if not quiet:
135 |         sys.stderr.write('### FINISHED\n\n')
136 |         sys.stderr.write('### The combined gene prediciton is located at {}.\n'.format(\
137 |             out))
138 | 
def set_parameter(cfg_file):
    """
        Read parameters from the cfg file and store them in parameter.

        Every entry has the form '<name> <number>', separated by spaces.
        Empty lines and lines whose first non-blank character is '#'
        are skipped. Fields after the number are ignored.

        Args:
            cfg_file (str): Path to configuration file.

        Raises:
            ConfigFileError: If an entry has no value or the value is
                not a number.
    """
    global parameter
    with open(cfg_file, 'r') as file:
        cfg = csv.reader(file, delimiter=' ')
        for row in cfg:
            # leading or repeated spaces yield empty fields, drop them
            fields = [field for field in row if field]
            if not fields or fields[0].startswith('#'):
                continue
            try:
                parameter[fields[0]] = float(fields[1])
            except (IndexError, ValueError) as err:
                raise ConfigFileError(
                    'Invalid entry in line {} of {}: expected '
                    '"<parameter> <number>".'.format(cfg.line_num, cfg_file)) from err
154 | 
def write_csv(out_path, tab):
    """
        Write a table as tab separated file.

        Fields are quoted with '|' where the csv module requires quoting
        and every row is terminated by a single newline.

        Args:
            out_path (str): Path to the output file.
            tab (list): Table as list of rows.
    """
    with open(out_path, 'w+') as handle:
        writer = csv.writer(handle, delimiter='\t', quotechar="|", lineterminator='\n')
        writer.writerows(tab)
166 | 
def init(args):
    """
        Copy the command line arguments into the module-level settings
        and load the parameters from the configuration file.

        Args:
            args (argparse.Namespace): Arguments as returned by parseCmd().

        Raises:
            GeneSetMissing: If neither --gtf nor --keep_gtf is set.
    """
    global gtf, hintfiles, out, enforce_tx, v, scores_tab, filter_sing_exon, ignore_tx_phase, quiet
    if not args.gtf and not args.keep_gtf:
        raise GeneSetMissing('At least one gene set has to be provided '
            'either with --gtf or --keep_gtf!')
    # all file lists are passed as comma separated strings
    if args.gtf:
        gtf = args.gtf.split(',')
    if args.keep_gtf:
        enforce_tx = args.keep_gtf.split(',')
    if args.hintfiles:
        hintfiles = args.hintfiles.split(',')
    if args.cfg:
        cfg_file = args.cfg
    else:
        # fall back to the configuration located relative to this script
        cfg_file = os.path.join(os.path.dirname(os.path.realpath(__file__)),
            '..', 'config', 'default.cfg')
    set_parameter(cfg_file)
    if args.score_tab:
        scores_tab = args.score_tab
    if args.filter_single_exon_genes:
        filter_sing_exon = args.filter_single_exon_genes
    if args.ignore_tx_phase:
        ignore_tx_phase = args.ignore_tx_phase
    if args.out:
        out = args.out
    if args.verbose:
        v = args.verbose
    if args.quiet:
        quiet = True
195 | 
def parseCmd():
    """Define the command line interface of TSEBRA and parse ``sys.argv``.

    Returns:
        argparse.Namespace: Parsed arguments with the attributes gtf,
            keep_gtf, hintfiles, cfg, filter_single_exon_genes,
            ignore_tx_phase, score_tab, out, quiet and verbose.
    """
    cli = argparse.ArgumentParser(
        description=(
            'TSEBRA: Transcript Selector for BRAKER\n\n'
            'TSEBRA combines gene predictions by selecing '
            'transcripts based on their extrisic evidence support.'
        )
    )
    add = cli.add_argument

    # input files, each given as a comma separated list
    add('-g', '--gtf', type=str,
        help=('List (separated by commas) of gene prediciton files in gtf.\n'
              '(e.g. gene_pred1.gtf,gene_pred2.gtf,gene_pred3.gtf)'))
    add('-k', '--keep_gtf', type=str,
        help=('List (separated by commas) of gene prediciton files in gtf.\n'
              'These gene sets are used the same way as other inputs, but TSEBRA '
              'ensures that all transcripts from these gene sets are included in the output.'))
    add('-e', '--hintfiles', type=str,
        help=('List (separated by commas) of files containing extrinsic evidence in gff.\n'
              '(e.g. hintsfile1.gff,hintsfile2.gtf,3.gtf)'))

    # settings of the transcript selection
    add('-c', '--cfg', type=str,
        help=('Configuration file that sets the parameter for TSEBRA. '
              'You can find the recommended parameter at config/default.cfg.'))
    add('--filter_single_exon_genes', action='store_true',
        help=('Filter out all single-exon genes out that are not'
              ' supported by at least one start- or stop-codon hint.'))
    add('--ignore_tx_phase', action='store_true',
        help=('Ignore the phase of transcripts while detecting clusters '
              'of overlapping transcripts.'))

    # output and logging
    add('-s', '--score_tab', type=str,
        help='Prints the transcript scores as a table to the specified file.')
    add('-o', '--out', type=str, required=True,
        help='Outputfile for the combined gene prediciton in gtf.')
    add('-q', '--quiet', action='store_true',
        help='Quiet mode.')
    add('-v', '--verbose', type=int,
        help='')

    return cli.parse_args()
233 | 
# Run main() only when this file is executed as a script, not when it is imported.
if __name__ == '__main__':
    main()


--------------------------------------------------------------------------------
/tests/prep_files.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | # ==============================================================
  3 | # author: Lars Gabriel
  4 | #
  5 | # prep_files.py: create example data for pytests
  6 | # ==============================================================
  7 | import os
# Absolute path of the directory that contains this script; the fixture
# paths used by the functions below are built relative to it.
testDir = os.path.abspath(os.path.dirname(__file__))
  9 | 
 10 | def genome_anno():
 11 |     anno1 = testDir + '/genome_anno/anno1.gtf'
 12 |     orig = []
 13 |     with open(anno1, 'r') as file:
 14 |         for line in file.readlines():
 15 |             line = line.strip('\n')
 16 |             orig.append(line)
 17 |     orig = [f.split('\t') for f in orig]
 18 | 
 19 |     anno = orig
 20 |     anno[1][8] = 'gene_id "7789_g";'
 21 |     anno = ['\t'.join(map(str, line)) for line in anno]
 22 |     with open(testDir + '/genome_anno/format_error.gtf', 'w+') as file:
 23 |         file.write('\n'.join(anno))
 24 | 
 25 |     anno = orig
 26 |     anno[1][8] = 'transcript_id "7789_t";'
 27 |     anno[6][8] = 'transcript_id "g5980.t1";'
 28 |     for line in anno:
 29 |         if 'transcript_id "g11080.t1";' in line[8]:
 30 |             line[8] = 'transcript_id "g11080.t1";'
 31 |     anno = ['\t'.join(map(str, line)) for line in anno]
 32 |     with open(testDir + '/genome_anno/missing_gid.gtf', 'w+') as file:
 33 |         file.write('\n'.join(anno))
 34 | 
 35 | def get_anno(tx_dict, phase):
 36 |         template = ['3R', 'AUGUSTUS', '', '', '', phase, '+', '0', '']
 37 |         anno = []
 38 |         for key in tx_dict:
 39 |             coord = tx_dict[key]
 40 |             template[8] = 'transcript_id "{}"; gene_id "{}";'.format(key, key + '_g')
 41 |             type = 'exon'
 42 |             pos = coord[0]
 43 |             for c in coord[1:]:
 44 |                 line = template.copy()
 45 |                 line[2] = type
 46 |                 line[3] = pos
 47 |                 pos += c
 48 |                 line[4] = pos
 49 |                 if type == 'intron':
 50 |                     line[3] += 1
 51 |                     line[4] -= 1
 52 |                 anno.append(line)
 53 |                 if type == 'exon':
 54 |                     line = line.copy()
 55 |                     line[2] = 'CDS'
 56 |                     anno.append(line)
 57 |                     type = 'intron'
 58 |                 else:
 59 |                     type = 'exon'
 60 |             line = template.copy()
 61 |             line[2] = 'transcript'
 62 |             line[3] = str(coord[0])
 63 |             line[4] = str(pos)
 64 |             line[8] = key
 65 |             anno.append(line)
 66 |         return anno
 67 | 
 68 | def list2string(gtf):
 69 |     gtf = ['\t'.join(map(str, g)) for g in gtf]
 70 |     return '\n'.join(gtf)
 71 | 
 72 | def graph():
 73 |     dir = testDir + '/graph/'
 74 |     #example 1
 75 |     anno1_txs = { 't1' : [100, 100, 100, 100], \
 76 |             't2' : [700, 100, 100, 100, 100, 100], \
 77 |             't3' : [1500, 100]}
 78 |     anno1 = get_anno(anno1_txs, '0')
 79 |     with open(dir + 'ex1_anno1.gtf', 'w+') as file:
 80 |         file.write(list2string(anno1))
 81 | 
 82 |     anno2_txs = {   't1' : [250, 250, 100, 150],
 83 |                     't2' : [1050, 200],
 84 |                     't3' : [1700, 100]}
 85 |     anno2 = get_anno(anno2_txs, '0')
 86 |     with open(dir + 'ex1_anno2.gtf', 'w+') as file:
 87 |         file.write(list2string(anno2))
 88 | 
 89 |     #example 2
 90 |     anno1_txs = { 't1' : [200, 100]}
 91 |     anno1 = get_anno(anno1_txs, '0')
 92 |     with open(dir + 'ex2_anno1.gtf', 'w+') as file:
 93 |         file.write(list2string(anno1))
 94 | 
 95 |     anno2_txs = {   't1' : [100, 100], \
 96 |                     't2' : [301, 99]}
 97 |     anno2 = get_anno(anno2_txs, '0')
 98 |     with open(dir + 'ex2_anno2.gtf', 'w+') as file:
 99 |         file.write(list2string(anno2))
100 | 
101 |     #example 3
102 |     anno1_txs = { 't1' : [100, 200, 200, 200, 200, 200]}
103 |     anno1 = get_anno(anno1_txs, '0')
104 |     with open(dir + 'ex3_anno1.gtf', 'w+') as file:
105 |         file.write(list2string(anno1))
106 | 
107 |     anno2_txs = {   't1' : [110, 90, 600, 200], \
108 |                     't2' : [350, 100]}
109 |     anno2 = get_anno(anno2_txs, '0')
110 |     with open(dir + 'ex3_anno2.gtf', 'w+') as file:
111 |         file.write(list2string(anno2))
112 | 
113 |     #example 4
114 |     anno1_txs = { 't1' : [100, 100, 100, 100]}
115 |     anno1 = get_anno(anno1_txs, '0')
116 |     with open(dir + 'ex4_anno1.gtf', 'w+') as file:
117 |         file.write(list2string(anno1))
118 | 
119 |     anno2_txs = { 't1' : [101, 100, 100, 100]}
120 |     anno2 = get_anno(anno2_txs, '1')
121 |     with open(dir + 'ex4_anno2.gtf', 'w+') as file:
122 |         file.write(list2string(anno2))
123 | 
def evidence():
    """Write the hint files hint1.gff, hint2.gff and hint3.gff.

    hint1.gff and hint2.gff consist of fixed lines; the last line of
    hint1.gff is incomplete and has no trailing newline. The lines of
    hint3.gff are generated with get_hint() and joined by newlines.
    """
    out_dir = testDir + '/evidence/'

    first_hints = (
        '3L\tProtHint\tintron\t5812862\t5812941\t24\t-\t.\tsrc=M;mult=24;pri=4\n',
        '3L\tProtHint\tintron\t12291242\t12291299\t8\t-\t.\ttranscript_id="t1"\n',
        '3L\tProtHint\tintron\t12291242\t12291299\t8\t-\t.\tsrc=M;pri=4\n',
        '3L\tProtHint\tintron\t12291242\t',
    )
    with open(out_dir + 'hint1.gff', 'w+') as handle:
        handle.writelines(first_hints)

    second_hints = (
        '3L\tProtHint\tintron\t5812862\t5812941\t24\t-\t.\tsrc=M;mult=24;pri=4\n',
        '3L\tProtHint\tintron\t12291242\t12291299\t8\t-\t.\tsrc=M;mult=8;pri=4\n',
        '3R\tProtHint\tintron\t17440148\t17440207\t25\t-\t.\tsrc=M;mult=25;pri=4\n',
        '2R\tProtHint\tintron\t5760114\t5760177\t23\t-\t.\tsrc=M;mult=23;pri=4\n',
        '2R\tProtHint\tintron\t6210484\t6210546\t21\t-\t.\tsrc=M;mult=21;pri=4\n',
        '3L\tProtHint\tintron\t20527281\t20527592\t25\t+\t.\tsrc=M;mult=25;pri=4\n',
        '2L\tProtHint\tintron\t12400752\t12400814\t24\t+\t.\tsrc=M;mult=24;pri=4\n',
        '2R\tProtHint\tintron\t14988084\t14988142\t25\t-\t.\tsrc=M;mult=25;pri=4\n',
        '2L\tProtHint\tintron\t6667531\t6667670\t5\t-\t.\tsrc=M;mult=5;pri=4\n',
        '3R\tProtHint\tintron\t5537551\t5537605\t22\t+\t.\tsrc=M;mult=22;pri=4\n',
        '3R\tProtHint\tintron\t20813612\t20813665\t12\t-\t.\tsrc=M;mult=12;pri=4\n',
        'X\tProtHint\tintron\t2145714\t2147174\t25\t+\t.\tsrc=M;mult=25;pri=4\n',
        '3L\tProtHint\tintron\t8114197\t8114256\t25\t-\t.\tsrc=M;mult=25;pri=4\n',
        'X\tProtHint\tintron\t11048602\t11048941\t25\t+\t.\tsrc=M;mult=25;pri=4\n',
        '2L\tProtHint\tintron\t3807462\t3807524\t18\t+\t.\tsrc=M;mult=18;pri=4\n',
        '3R\tProtHint\tintron\t27059120\t27059364\t19\t-\t.\tsrc=M;mult=19;pri=4\n',
        '2R\tProtHint\tintron\t13821370\t13821432\t24\t-\t.\tsrc=M;mult=24;pri=4\n',
        'X\tProtHint\tintron\t8173462\t8173860\t6\t-\t.\tsrc=M;mult=6;pri=4\n',
        'X\tProtHint\tintron\t13270643\t13271481\t16\t-\t.\tsrc=M;mult=16;pri=4\n',
        'X\tProtHint\tintron\t2079645\t2079714\t25\t-\t.\tsrc=M;mult=25;pri=4\n',
    )
    with open(out_dir + 'hint2.gff', 'w+') as handle:
        handle.writelines(second_hints)

    third_hints = [
        get_hint(100, 102, 'start_codon'),
        get_hint(501, 599, 'intron'),
        get_hint(501, 599, 'intron', src='P', mult=14),
        get_hint(698, 700, 'stop_codon'),
        get_hint(801, 899, 'intron'),
        get_hint(801, 899, 'intron', chr='2L'),
        get_hint(801, 899, 'intron', src='P', mult=24),
        get_hint(801, 949, 'intron'),
        get_hint(801, 899, 'intron', strand='-'),
        get_hint(1001, 1099, 'intron'),
        get_hint(1198, 1200, 'stop_codon'),
        get_hint(1601, 1699, 'intron'),
    ]
    with open(out_dir + 'hint3.gff', 'w+') as handle:
        handle.write('\n'.join(third_hints))
171 | 
172 | 
def get_hint(start, end, type, strand='+', chr='3R', score=10, mult=2, pri=4, src='E'):
    """Create one tab separated hint line.

    The columns are: chr, 'AUGUSTUS', type, start, end, score, strand,
    '.' and the attribute 'src=<src>;mult=<mult>;pri=<pri>'.

    Args:
        start (int): Start coordinate.
        end (int): End coordinate.
        type (str): Value of the third column, e.g. 'intron'.
        strand (str): Value of the strand column.
        chr (str): Value of the first column.
        score: Value of the score column.
        mult: Value of the mult attribute.
        pri: Value of the pri attribute.
        src (str): Value of the src attribute.

    Returns:
        str: The hint line without a trailing newline.
    """
    att = 'src={};mult={};pri={}'.format(src, mult, pri)
    # NOTE: the strand column used to be hard-coded to '+', so a strand
    # passed by the caller was lost. Files generated with strand='-'
    # (see hint3.gff) differ from previously generated ones.
    columns = [chr, 'AUGUSTUS', type, start, end, score, strand, '.', att]
    return '\t'.join(map(str, columns))
177 | 
def _copy_transcript_lines(src_path, dst_path, tx_ids):
    """Copy all lines of src_path that contain one of tx_ids to dst_path."""
    with open(src_path, 'r') as source:
        selected = [line for line in source if any(tx in line for tx in tx_ids)]
    with open(dst_path, 'w+') as target:
        target.writelines(selected)


def _copy_region_hints(src_path, dst_path, chrom='3R', start=21737000, end=21750000):
    """Copy the hints of one region from src_path to dst_path.

    A line is copied if it has at least nine tab separated columns,
    lies on chrom completely inside [start, end] and is not of type
    'CDSpart'.
    """
    selected = []
    with open(src_path, 'r') as source:
        for line in source:
            fields = line.split('\t')
            if len(fields) <= 8:
                continue
            if int(fields[3]) >= start and int(fields[4]) <= end \
                    and fields[0] == chrom and fields[2] != 'CDSpart':
                selected.append(line)
    with open(dst_path, 'w+') as target:
        target.writelines(selected)


def get_feature(braker1_dir='/home/lars/work/combiner/example/braker1',
                braker2_dir='/home/lars/work/combiner/example/braker2'):
    """Extract the ex_feature_* files of the graph directory from two BRAKER runs.

    From <braker1_dir>/braker_fixed.gtf and <braker2_dir>/braker.gtf the
    lines of a fixed set of transcripts are copied, from the
    hintsfile.gff of both directories the hints of one region on 3R.

    Args:
        braker1_dir (str): Directory with braker_fixed.gtf and
            hintsfile.gff. Defaults to the path that used to be
            hard-coded in this function.
        braker2_dir (str): Directory with braker.gtf and hintsfile.gff.
            Defaults to the path that used to be hard-coded in this
            function.
    """
    out_dir = testDir + '/graph/'
    _copy_transcript_lines(os.path.join(braker1_dir, 'braker_fixed.gtf'),
                           out_dir + 'ex_feature_anno1.gtf',
                           ('g7604.t1', 'g7603.t1', 'g7605.t1'))
    _copy_transcript_lines(os.path.join(braker2_dir, 'braker.gtf'),
                           out_dir + 'ex_feature_anno2.gtf',
                           ('g7700.t1', 'g7701.t1'))
    _copy_region_hints(os.path.join(braker1_dir, 'hintsfile.gff'),
                       out_dir + 'ex_feature_hint1.gff')
    _copy_region_hints(os.path.join(braker2_dir, 'hintsfile.gff'),
                       out_dir + 'ex_feature_hint2.gff')
219 | 
# When run as a script only the generator that is not commented out is
# executed; the other generators are currently disabled.
if __name__ == '__main__':
    #genome_anno()
    #graph()
    #evidence()
    get_feature()
225 | 


--------------------------------------------------------------------------------
/bin/compleasm-LICENSE.txt:
--------------------------------------------------------------------------------
  1 |                                  Apache License
  2 |                            Version 2.0, January 2004
  3 |                         http://www.apache.org/licenses/
  4 | 
  5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  6 | 
  7 |    1. Definitions.
  8 | 
  9 |       "License" shall mean the terms and conditions for use, reproduction,
 10 |       and distribution as defined by Sections 1 through 9 of this document.
 11 | 
 12 |       "Licensor" shall mean the copyright owner or entity authorized by
 13 |       the copyright owner that is granting the License.
 14 | 
 15 |       "Legal Entity" shall mean the union of the acting entity and all
 16 |       other entities that control, are controlled by, or are under common
 17 |       control with that entity. For the purposes of this definition,
 18 |       "control" means (i) the power, direct or indirect, to cause the
 19 |       direction or management of such entity, whether by contract or
 20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 21 |       outstanding shares, or (iii) beneficial ownership of such entity.
 22 | 
 23 |       "You" (or "Your") shall mean an individual or Legal Entity
 24 |       exercising permissions granted by this License.
 25 | 
 26 |       "Source" form shall mean the preferred form for making modifications,
 27 |       including but not limited to software source code, documentation
 28 |       source, and configuration files.
 29 | 
 30 |       "Object" form shall mean any form resulting from mechanical
 31 |       transformation or translation of a Source form, including but
 32 |       not limited to compiled object code, generated documentation,
 33 |       and conversions to other media types.
 34 | 
 35 |       "Work" shall mean the work of authorship, whether in Source or
 36 |       Object form, made available under the License, as indicated by a
 37 |       copyright notice that is included in or attached to the work
 38 |       (an example is provided in the Appendix below).
 39 | 
 40 |       "Derivative Works" shall mean any work, whether in Source or Object
 41 |       form, that is based on (or derived from) the Work and for which the
 42 |       editorial revisions, annotations, elaborations, or other modifications
 43 |       represent, as a whole, an original work of authorship. For the purposes
 44 |       of this License, Derivative Works shall not include works that remain
 45 |       separable from, or merely link (or bind by name) to the interfaces of,
 46 |       the Work and Derivative Works thereof.
 47 | 
 48 |       "Contribution" shall mean any work of authorship, including
 49 |       the original version of the Work and any modifications or additions
 50 |       to that Work or Derivative Works thereof, that is intentionally
 51 |       submitted to Licensor for inclusion in the Work by the copyright owner
 52 |       or by an individual or Legal Entity authorized to submit on behalf of
 53 |       the copyright owner. For the purposes of this definition, "submitted"
 54 |       means any form of electronic, verbal, or written communication sent
 55 |       to the Licensor or its representatives, including but not limited to
 56 |       communication on electronic mailing lists, source code control systems,
 57 |       and issue tracking systems that are managed by, or on behalf of, the
 58 |       Licensor for the purpose of discussing and improving the Work, but
 59 |       excluding communication that is conspicuously marked or otherwise
 60 |       designated in writing by the copyright owner as "Not a Contribution."
 61 | 
 62 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 63 |       on behalf of whom a Contribution has been received by Licensor and
 64 |       subsequently incorporated within the Work.
 65 | 
 66 |    2. Grant of Copyright License. Subject to the terms and conditions of
 67 |       this License, each Contributor hereby grants to You a perpetual,
 68 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 69 |       copyright license to reproduce, prepare Derivative Works of,
 70 |       publicly display, publicly perform, sublicense, and distribute the
 71 |       Work and such Derivative Works in Source or Object form.
 72 | 
 73 |    3. Grant of Patent License. Subject to the terms and conditions of
 74 |       this License, each Contributor hereby grants to You a perpetual,
 75 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 76 |       (except as stated in this section) patent license to make, have made,
 77 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 78 |       where such license applies only to those patent claims licensable
 79 |       by such Contributor that are necessarily infringed by their
 80 |       Contribution(s) alone or by combination of their Contribution(s)
 81 |       with the Work to which such Contribution(s) was submitted. If You
 82 |       institute patent litigation against any entity (including a
 83 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 84 |       or a Contribution incorporated within the Work constitutes direct
 85 |       or contributory patent infringement, then any patent licenses
 86 |       granted to You under this License for that Work shall terminate
 87 |       as of the date such litigation is filed.
 88 | 
 89 |    4. Redistribution. You may reproduce and distribute copies of the
 90 |       Work or Derivative Works thereof in any medium, with or without
 91 |       modifications, and in Source or Object form, provided that You
 92 |       meet the following conditions:
 93 | 
 94 |       (a) You must give any other recipients of the Work or
 95 |           Derivative Works a copy of this License; and
 96 | 
 97 |       (b) You must cause any modified files to carry prominent notices
 98 |           stating that You changed the files; and
 99 | 
100 |       (c) You must retain, in the Source form of any Derivative Works
101 |           that You distribute, all copyright, patent, trademark, and
102 |           attribution notices from the Source form of the Work,
103 |           excluding those notices that do not pertain to any part of
104 |           the Derivative Works; and
105 | 
106 |       (d) If the Work includes a "NOTICE" text file as part of its
107 |           distribution, then any Derivative Works that You distribute must
108 |           include a readable copy of the attribution notices contained
109 |           within such NOTICE file, excluding those notices that do not
110 |           pertain to any part of the Derivative Works, in at least one
111 |           of the following places: within a NOTICE text file distributed
112 |           as part of the Derivative Works; within the Source form or
113 |           documentation, if provided along with the Derivative Works; or,
114 |           within a display generated by the Derivative Works, if and
115 |           wherever such third-party notices normally appear. The contents
116 |           of the NOTICE file are for informational purposes only and
117 |           do not modify the License. You may add Your own attribution
118 |           notices within Derivative Works that You distribute, alongside
119 |           or as an addendum to the NOTICE text from the Work, provided
120 |           that such additional attribution notices cannot be construed
121 |           as modifying the License.
122 | 
123 |       You may add Your own copyright statement to Your modifications and
124 |       may provide additional or different license terms and conditions
125 |       for use, reproduction, or distribution of Your modifications, or
126 |       for any such Derivative Works as a whole, provided Your use,
127 |       reproduction, and distribution of the Work otherwise complies with
128 |       the conditions stated in this License.
129 | 
130 |    5. Submission of Contributions. Unless You explicitly state otherwise,
131 |       any Contribution intentionally submitted for inclusion in the Work
132 |       by You to the Licensor shall be under the terms and conditions of
133 |       this License, without any additional terms or conditions.
134 |       Notwithstanding the above, nothing herein shall supersede or modify
135 |       the terms of any separate license agreement you may have executed
136 |       with Licensor regarding such Contributions.
137 | 
138 |    6. Trademarks. This License does not grant permission to use the trade
139 |       names, trademarks, service marks, or product names of the Licensor,
140 |       except as required for reasonable and customary use in describing the
141 |       origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 |    7. Disclaimer of Warranty. Unless required by applicable law or
144 |       agreed to in writing, Licensor provides the Work (and each
145 |       Contributor provides its Contributions) on an "AS IS" BASIS,
146 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 |       implied, including, without limitation, any warranties or conditions
148 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 |       PARTICULAR PURPOSE. You are solely responsible for determining the
150 |       appropriateness of using or redistributing the Work and assume any
151 |       risks associated with Your exercise of permissions under this License.
152 | 
153 |    8. Limitation of Liability. In no event and under no legal theory,
154 |       whether in tort (including negligence), contract, or otherwise,
155 |       unless required by applicable law (such as deliberate and grossly
156 |       negligent acts) or agreed to in writing, shall any Contributor be
157 |       liable to You for damages, including any direct, indirect, special,
158 |       incidental, or consequential damages of any character arising as a
159 |       result of this License or out of the use or inability to use the
160 |       Work (including but not limited to damages for loss of goodwill,
161 |       work stoppage, computer failure or malfunction, or any and all
162 |       other commercial damages or losses), even if such Contributor
163 |       has been advised of the possibility of such damages.
164 | 
165 |    9. Accepting Warranty or Additional Liability. While redistributing
166 |       the Work or Derivative Works thereof, You may choose to offer,
167 |       and charge a fee for, acceptance of support, warranty, indemnity,
168 |       or other liability obligations and/or rights consistent with this
169 |       License. However, in accepting such obligations, You may act only
170 |       on Your own behalf and on Your sole responsibility, not on behalf
171 |       of any other Contributor, and only if You agree to indemnify,
172 |       defend, and hold each Contributor harmless for any liability
173 |       incurred by, or claims asserted against, such Contributor by reason
174 |       of your accepting any such warranty or additional liability.
175 | 
176 |    END OF TERMS AND CONDITIONS
177 | 
178 |    APPENDIX: How to apply the Apache License to your work.
179 | 
180 |       To apply the Apache License to your work, attach the following
181 |       boilerplate notice, with the fields enclosed by brackets "[]"
182 |       replaced with your own identifying information. (Don't include
183 |       the brackets!)  The text should be enclosed in the appropriate
184 |       comment syntax for the file format. We also recommend that a
185 |       file or class name and description of purpose be included on the
186 |       same "printed page" as the copyright notice for easier
187 |       identification within third-party archives.
188 | 
189 |    Copyright [yyyy] [name of copyright owner]
190 | 
191 |    Licensed under the Apache License, Version 2.0 (the "License");
192 |    you may not use this file except in compliance with the License.
193 |    You may obtain a copy of the License at
194 | 
195 |        http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 |    Unless required by applicable law or agreed to in writing, software
198 |    distributed under the License is distributed on an "AS IS" BASIS,
199 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 |    See the License for the specific language governing permissions and
201 |    limitations under the License.


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | [![install with bioconda](https://img.shields.io/badge/install%20with-bioconda-brightgreen.svg?style=flat)](http://bioconda.github.io/recipes/tsebra/README.html)
  2 | [![European Galaxy server](https://img.shields.io/badge/usegalaxy-.eu-brightgreen?logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABgAAAASCAYAAABB7B6eAAAABGdBTUEAALGPC/xhBQAAACBjSFJNAAB6JgAAgIQAAPoAAACA6AAAdTAAAOpgAAA6mAAAF3CculE8AAAACXBIWXMAAAsTAAALEwEAmpwYAAACC2lUWHRYTUw6Y29tLmFkb2JlLnhtcAAAAAAAPHg6eG1wbWV0YSB4bWxuczp4PSJhZG9iZTpuczptZXRhLyIgeDp4bXB0az0iWE1QIENvcmUgNS40LjAiPgogICA8cmRmOlJERiB4bWxuczpyZGY9Imh0dHA6Ly93d3cudzMub3JnLzE5OTkvMDIvMjItcmRmLXN5bnRheC1ucyMiPgogICAgICA8cmRmOkRlc2NyaXB0aW9uIHJkZjphYm91dD0iIgogICAgICAgICAgICB4bWxuczp0aWZmPSJodHRwOi8vbnMuYWRvYmUuY29tL3RpZmYvMS4wLyI+CiAgICAgICAgIDx0aWZmOlJlc29sdXRpb25Vbml0PjI8L3RpZmY6UmVzb2x1dGlvblVuaXQ+CiAgICAgICAgIDx0aWZmOkNvbXByZXNzaW9uPjE8L3RpZmY6Q29tcHJlc3Npb24+CiAgICAgICAgIDx0aWZmOk9yaWVudGF0aW9uPjE8L3RpZmY6T3JpZW50YXRpb24+CiAgICAgICAgIDx0aWZmOlBob3RvbWV0cmljSW50ZXJwcmV0YXRpb24+MjwvdGlmZjpQaG90b21ldHJpY0ludGVycHJldGF0aW9uPgogICAgICA8L3JkZjpEZXNjcmlwdGlvbj4KICAgPC9yZGY6UkRGPgo8L3g6eG1wbWV0YT4KD0UqkwAAAn9JREFUOBGlVEuLE0EQruqZiftwDz4QYT1IYM8eFkHFw/4HYX+GB3/B4l/YP+CP8OBNTwpCwFMQXAQPKtnsg5nJZpKdni6/6kzHvAYDFtRUT71f3UwAEbkLch9ogQxcBwRKMfAnM1/CBwgrbxkgPAYqlBOy1jfovlaPsEiWPROZmqmZKKzOYCJb/AbdYLso9/9B6GppBRqCrjSYYaquZq20EUKAzVpjo1FzWRDVrNay6C/HDxT92wXrAVCH3ASqq5VqEtv1WZ13Mdwf8LFyyKECNbgHHAObWhScf4Wnj9CbQpPzWYU3UFoX3qkhlG8AY2BTQt5/EA7qaEPQsgGLWied0A8VKrHAsCC1eJ6EFoUd1v6GoPOaRAtDPViUr/wPzkIFV9AaAZGtYB568VyJfijV+ZBzlVZJ3W7XHB2RESGe4opXIGzRTdjcAupOK09RA6kzr1NTrTj7V1ugM4VgPGWEw+e39CxO6JUw5XhhKihmaDacU2GiR0Ohcc4cZ+Kq3AjlEnEeRSazLs6/9b/kh4eTC+hngE3QQD7Yyclxsrf3cpxsPXn+cFdenF9aqlBXMXaDiEyfyfawBz2RqC/O9WF1ysacOpytlUSoqNrtfbS642+4D4CS9V3xb4u8P/ACI4O810efRu6KsC0QnjHJGaq4IOGUjWTo/YDZDB3xSIxcGyNlWcTucb4T3in/3IaueNrZyX0lGOrWndstOr+w21UlVFokILjJLFhPukbVY8OmwNQ3nZgNJNmKDccusSb4UIe+gtkI+9/bSLJDjqn763f5CQ5TLApmICkqwR0QnUPKZFIUnoozWcQuRbC0Km02knj0tPYx63furGs3x/iPnz83zJDVNtdP3QAAAABJRU5ErkJggg==)](https://usegalaxy.eu/root?tool_id=tsebra)
  3 | 
  4 | # TSEBRA: Transcript Selector for BRAKER
  5 | 
  6 | <p align="center">
  7 | <img src="docs/TSEBRA_Logo.png" alt="drawing" width="700"/>
  8 | </p>
  9 | 
 10 | ### Introduction
 11 | [TSEBRA](https://doi.org/10.1186/s12859-021-04482-0) is a combiner tool that selects transcripts from gene predictions based on the support by extrinsic evidence in the form of introns and start/stop codons. It was developed to combine BRAKER1<sup name="a1">[1](#ref1)</sup> and BRAKER2<sup name="a2">[2](#ref2)</sup> predictions to increase their accuracies.
 12 | 
 13 | ## Prerequisites
 14 | TSEBRA itself requires Python 3.5.2 or higher.
 15 | 
 16 | `best_by_compleasm.py`, a script that may re-run TSEBRA on a BRAKER output folder to maximize BUSCO presence in the output gene set, requires compleasm v0.2.4 or newer (https://github.com/huangnengCSU/compleasm), and the python module pandas.
 17 | 
 18 | ## Installation
 19 | Download TSEBRA:
 20 | ```
 21 | git clone https://github.com/Gaius-Augustus/TSEBRA
 22 | ```
 23 | 
 24 | If desired, download compleasm:
 25 | 
 26 | ```
 27 | wget https://github.com/huangnengCSU/compleasm/releases/download/v0.2.4/compleasm-0.2.4_x64-linux.tar.bz2
 28 | tar -xvjf compleasm-0.2.4_x64-linux.tar.bz2
 29 | ```
 30 | 
 31 | Add the resulting folder compleasm_kit to your `$PATH` variable, e.g.:
 32 | ```
 33 | export PATH=$PATH:/your/path/to/compleasm_kit
 34 | ```
 35 | 
 36 | Compleasm requires pandas, which can be installed with:
 37 | 
 38 | ```
 39 | pip install pandas
 40 | ```
 41 | 
 42 | ## Usage
 43 | The main script is ```./bin/tsebra.py```. For usage information run ```./bin/tsebra.py --help```.
 44 | 
 45 | ## Input Files
 46 | TSEBRA takes a list of gene prediction files, a list of hint files and a configuration file as mandatory input.
 47 | 
 48 | #### Gene Predictions
 49 | The gene prediction files have to be in gtf format. This is the standard output format of a BRAKER or AUGUSTUS<sup name="a3">[3,](#ref3)</sup><sup name="a4">[4](#ref4)</sup> gene prediction.
 50 | 
 51 | Example:
 52 | ```console
 53 | 2L      AUGUSTUS        gene    83268   87026   0.88    -       .       g5332
 54 | 2L      AUGUSTUS        transcript      83268   87026   0.88    -       .       g5332.t1
 55 | 2L      AUGUSTUS        intron  84278   87019   1       -       .       transcript_id "file_1_file_1_g5332.t1"; gene_id "file_1_file_1_g5332";
 56 | 2L      AUGUSTUS        CDS     87020   87026   0.88    -       0       transcript_id "file_1_file_1_g5332.t1"; gene_id "file_1_file_1_g5332";
 57 | 2L      AUGUSTUS        exon    87020   87026   .       -       .       transcript_id "file_1_file_1_g5332.t1"; gene_id "file_1_file_1_g5332";
 58 | ```
 59 | 
 60 | #### Hint Files
 61 | The hints files have to be in gff format, the last column must include an attribute for the source for the hint with 'src=' and can include the number of hints supporting the gene structure segment with 'mult='. This is the standard file format of the ```hintsfile.gff``` in a BRAKER working directory.
 62 | 
 63 | Example:
 64 | ```console
 65 | 2L      ProtHint        intron  279806  279869  2       +       .       src=P;mult=25;pri=4;al_score=0.437399;
 66 | 2L      ProtHint        intron  275252  275318  2       -       .       src=P;mult=19;pri=4;al_score=0.430006;
 67 | 2L      ProtHint        stop    293000  293002  1       +       0       grp=7220_0:002b08_g42;src=C;pri=4;
 68 | 2L      ProtHint        intron  207632  207710  1       +       .       grp=7220_0:002afa_g26;src=C;pri=4;
 69 | 2L      ProtHint        start   207512  207514  1       +       0       grp=7220_0:002afa_g26;src=C;pri=4;
 70 | ```
 71 | 
 72 | #### Configuration File
 73 | The configuration file has to include three different sets of parameters:
 74 | 1. Weights for all sources of hints. The source of a hint is specified by the mandatory 'src=' attribute in the last column of the ```hintsfile.gff``` (see section 'Hint Files'). See section 'Transcript scores' in [TSEBRA](https://doi.org/10.1101/2021.06.07.447316) for more information on how these weights are used.  
 75 | A weight is set to 1, if the weight for a hint source is not specified in the configuration file.  
 76 | 
 77 |    * *Notes on adjusting these parameters: Increase the weight of the hint sources that have the highest quality. For example, if the protein database includes only species that are remotely related to the target species, the hints produced by BRAKER2 might be less accurate than the RNA-seq evidence. Then, you should increase the weight of the source related to the RNA-seq hints.*    
 78 | 
 79 | 
 80 | 2. Required fractions of supported introns or supported start/stop-codons for a transcript. A transcript is not included in the TSEBRA result if the fractions of introns and start/stop codons supported by extrinsic evidence are lower than the thresholds.  
 81 | 
 82 |    * *Notes on adjusting these parameters: The thresholds for low evidence support are quite strict in the default configuration file. In this configuration, only transcripts with very high evidence support are allowed in the TSEBRA result. In some cases, the default setting might be too strict, so that too many transcripts are filtered out. In this case, you should reduce the threshold of 'intron_support' (e.g., to 0.2).*  
 83 | 
 84 | 
 85 | 3. Allowed difference between two overlapping transcripts for the six transcript scores. TSEBRA compares transcripts via their transcript scores and removes the one with the lower score if their difference exceeds the respective threshold.  
 86 | Note that it is recommended to choose thresholds between [0,2], since the transcript scores are normalized to [-1,1]. 
 87 | 
 88 |    * *Notes on adjusting these parameters: The higher the thresholds are set the fewer transcripts are filtered by the respective rule. With these thresholds one can adjust the effect of each filtering rule of TSEBRA. As these thresholds are increased, more transcripts are included in the TSEBRA result, in particular, more alternatively spliced isoforms per gene are contained in the result.*  
 89 | 
 90 | 
 91 | 
 92 | The name and the value of a parameter are separated by a space, and each parameter is listed in a different line.  
 93 | Example:
 94 | ```console
 95 | # Weight for each hint source
 96 | # Values have to be >= 0
 97 | P 1
 98 | E 1
 99 | C 1
100 | M 1
101 | # Required fraction of supported introns 
102 | # or supported start/stop-codons for a transcript
103 | # Values have to be in [0,1]
104 | intron_support 0.8
105 | stasto_support 1
106 | # Allowed difference for each feature 
107 | # Values have to be in [0,2]
108 | e_1 0.0
109 | e_2 0.5
110 | e_3 0.096
111 | e_4 0.02
112 | e_5 0.18
113 | e_6 0.18
114 | ```
115 | Description of evidence sources in default BRAKER1 and BRAKER2 outputs:
116 | ```
117 | E = RNA-seq hints
118 | M = manual hints, these are hints that are enforced during the prediction step of BRAKER,
119 | C = protein hints from proteins with a 'high' spliced alignment score.
120 | P = protein hints from proteins that have a 'good' spliced alignment score, 
121 |      but that is lower than the score from the ones in 'C'. 
122 | ```
123 | 
124 | ## Use Case
125 | The recommended and most common usage for TSEBRA is to combine the resulting ```augustus.hints.gtf``` files of a BRAKER1 and a BRAKER2 run using the hintsfile.gff from both working directories. However, TSEBRA can be applied to any number (>1) of gene predictions and hint files as long as they are in the correct format.
126 | 
127 | A common case might be that a user wants to annotate a novel genome with BRAKER and has:
128 | * a novel genome with repeats masked: ```genome.fasta.masked```,
129 | * hints for intron positions from RNA-seq reads: ```rna_seq_hints.gff```,
130 | * database of homologous proteins: ```proteins.fa```.
131 | 
132 | 1. Run BRAKER1 and BRAKER2 for example with
133 | ```console
134 | ### BRAKER1
135 | braker.pl --genome=genome.fasta.masked --hints=rna_seq_hints.gff \
136 |             --softmasking --species=species_name --workingdir=braker1_out
137 | ### BRAKER2
138 | braker.pl --genome=genome.fasta.masked --prot_seq=proteins.fa \
139 |     --softmasking --species=species_name --epmode \
140 |     --workingdir=braker2_out
141 | ```
142 | 2. Combine predictions with TSEBRA
143 | ```console
144 | ./bin/tsebra.py -g braker1_out/augustus.hints.gtf,braker2_out/augustus.hints.gtf -c default.cfg \
145 |     -e braker1_out/hintsfile.gff,braker2_out/hintsfile.gff \
146 |     -o braker1+2_combined.gtf
147 | ```
148 | The combined gene prediction is ```braker1+2_combined.gtf```.
149 | 
150 | ## Example
151 | A small example is located at ```example/```. Run ```./example/run_tsebra_example.sh``` to execute the example and to check if TSEBRA runs properly.
152 | 
153 | ## Enforcing a gene set
154 | A gene set can be enforced in the TSEBRA output, i.e. all of its transcripts are guaranteed to be included in the output, with the `--keep_gtf` option. The transcripts of enforced gene sets are still compared to all gene sets and used to evaluate them.
155 | Example:
156 |  ```console
157 | ./bin/tsebra.py -g gene_set1,gene_set2 -c default.cfg \
158 |     -k enforced_set1,enforced_set2 -e hintsfile1.gff,braker2_out/hintsfile2.gff \
159 |     -o tsebra.gtf
160 | ```
161 | To merge two gene sets, simply omit the `-g` option.
162 | 
163 | 
164 | ## Filter single-exon genes out
165 | In default mode, TSEBRA is conservative in filtering single exon genes out. In some cases BRAKER predicts a lot of false positive single exon genes. In these cases, it is recommended to run TSEBRA with the `--filter_single_exon_genes` option. In this mode, TSEBRA additionally filters out all single-exon genes that have no support by a start or stop codon hint. 
166 | 
167 | ## Print transcript scores
168 | The transcript scores play a very important role in TSEBRA. These are used for pairwise comparison of all transcript isoforms that have overlapping coding regions. You can print the scores as a table to a file with the option `--score_tab /path/to/output/file.tab`.
169 | 
170 | ## Ignore Frame
171 | By default, TSEBRA groups all transcript isoforms that have overlapping coding regions in the same open reading frame (phase column in gtf) to candidates of the same gene. However, in some cases, it might be desired to consider all transcripts with overlapping coding regions (regardless of the reading frame) as candidates for a gene. In this case add the `--ignore_tx_phase` option to the TSEBRA command. 
172 | 
173 | ## Other scripts in the TSEBRA repository
174 | 
175 | ### Renaming transcripts from a TSEBRA output
176 | The IDs of the transcripts and genes in the TSEBRA output can be renamed such that the gene and transcript ID match.
177 | Genes and transcripts are numbered consecutively and for example, the second transcript of gene "g12" has the ID "g12.t2".
178 | If a prefix is set then it will be added before all IDs, for example, the transcript ID is "dmel_g12.t2" if the prefix is set to "dmel".
179 | Additionally, a translation table can be produced that provides the mapping from old to new transcript IDs.
180 | 
181 | Example for renaming ```tsebra_result.gtf```:
182 | ```console
183 | ./bin/rename_gtf.py --gtf tsebra_result.gtf --prefix dmel --translation_tab translation.tab --out tsebra_result_renamed.gtf
184 | ```
185 | The arguments ```--prefix``` and ```--translation_tab``` are optional.
186 | 
187 | ### Fixing the formatting issue of `braker.gtf`
188 | A BRAKER run produces a second complete gene set named `braker.gtf`, besides the official output `augustus.hints.gtf`. The `braker.gtf` is the result of merging `augustus.hints.gtf` with some 'high-confidence' genes from the GeneMark prediction. However, the merging process leads to a formatting issue in `braker.gtf`. 
189 | A quick fix for this formatting issue is the script `fix_gtf_ids.py`, e.g.:
190 | ```console
191 | ./bin/fix_gtf_ids.py --gtf braker_out/braker.gtf --out braker1_fixed.gtf
192 | ```
193 | Take note that the `braker.gtf` and `fix_gtf_ids.py` haven't been tested sufficiently and there is no guarantee that this gene set is superior to `augustus.hints.gtf`.
194 | 
195 | ### Getting the longest isoform of each gene locus from different gene sets
196 | Combines multiple gene sets and reports the transcript with the longest coding region for each cluster of overlapping transcripts (one transcript per gene locus), e.g.
197 | ```console
198 | ./bin/get_longest_isoform.py --gtf gene_set1.gtf,gene_set2.gtf --out longest_insoforms.gtf
199 | ```
200 | 
201 | ## Licence
202 | All source code, i.e. `bin/*.py` are under the [Artistic License](bin/LICENSE.txt) (see <https://opensource.org/licenses/Artistic-2.0>).
203 | 
204 | ## Citing TSEBRA
205 | Gabriel, L., Hoff, K.J., Brůna, T. *et al.* TSEBRA: transcript selector for BRAKER. *BMC Bioinformatics* **22**, 566 (2021). https://doi.org/10.1186/s12859-021-04482-0
206 | 
207 | ## References
208 | <b id="ref1">[1]</b> Hoff, Katharina J, Simone Lange, Alexandre Lomsadze, Mark Borodovsky, and Mario Stanke. 2015. “BRAKER1: Unsupervised Rna-Seq-Based Genome Annotation with Genemark-et and Augustus.” *Bioinformatics* 32 (5). Oxford University Press: 767--69.[↑](#a1)
209 | 
210 | <b id="ref2">[2]</b> Tomas Bruna, Katharina J. Hoff, Alexandre Lomsadze, Mario Stanke and Mark Borodovsky. 2021. “BRAKER2: automatic eukaryotic genome annotation with GeneMark-EP+ and AUGUSTUS supported by a protein database.” *NAR Genomics and Bioinformatics* 3(1):lqaa108.[↑](#a2)
211 | 
212 | <b id="ref3">[3]</b> Stanke, Mario, Mark Diekhans, Robert Baertsch, and David Haussler. 2008. “Using Native and Syntenically Mapped cDNA Alignments to Improve de Novo Gene Finding.” *Bioinformatics* 24 (5). Oxford University Press: 637--44.[↑](#a3)
213 | 
214 | <b id="ref4">[4]</b> Stanke, Mario, Oliver Schöffmann, Burkhard Morgenstern, and Stephan Waack. 2006. “Gene Prediction in Eukaryotes with a Generalized Hidden Markov Model That Uses Hints from External Sources.” *BMC Bioinformatics* 7 (1). BioMed Central: 62.[↑](#a4)
215 | 


--------------------------------------------------------------------------------
/bin/overlap_graph.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | # ==============================================================
  3 | # Lars Gabriel
  4 | #
  5 | # Graph for transcripts of multiple genome annotations.
  6 | # It can detect overlapping transcripts.
  7 | # Add a feature vector to each node.
  8 | # Compare nodes with the 'decision rule'.
  9 | # ==============================================================
 10 | from features import Node_features
 11 | import numpy as np
 12 | 
 13 | class Edge:
 14 |     """
 15 |         Class handling an edge in the overlap graph.
 16 |     """
 17 |     def __init__(self, n1_id, n2_id):
 18 |         """
 19 |             Args:
 20 |                 n1_id (str): Node ID from overlap graph
 21 |                 n2_id (str): Node ID from overlap graph
 22 |         """
 23 |         self.node1 = n1_id
 24 |         self.node2 = n2_id
 25 |         self.node_to_remove = None
 26 | 
 27 | class Node:
 28 |     """
 29 |         Class handling a node that represents a transcript in the overlap graph.
 30 |     """
 31 |     def __init__(self, a_id, t_id):
 32 |         """
 33 |             Args:
 34 |                 a_id (str): Annotation ID of the transcript from Anno object
 35 |                 t_id (str): Transcript ID from Transcrpt object
 36 |         """
 37 |         self.id = '{};{}'.format(a_id, t_id)
 38 |         self.transcript_id = t_id
 39 |         self.is_in_ref_anno = 0.0
 40 |         # ID of original annotation/gene prediction
 41 |         self.anno_id = a_id
 42 |         # unique ID for a cluster of overlapping transcripts
 43 |         self.component_id = None
 44 | 
 45 |         # dict of edge_ids of edges that are incident
 46 |         # self.edge_to[id of incident Node] = edge_id
 47 |         self.edge_to = {}
 48 |         self.feature_vector = [None] * 4
 49 |         self.evi_support = False
 50 |         self.enforce = False
 51 |         self.gene_sets = set()
 52 | 
 53 | class Graph:
 54 |     """
 55 |         Overlap graph that can detect and filter overlapping transcripts.
 56 |     """
 57 |     def __init__(self, genome_anno_lst, para, keep_tx=[], filter_single=False, ignore_phase=False, verbose=0):
 58 |         """
 59 |             Args:
 60 |                 genome_anno_lst (list(Anno)): List of Anno class objects
 61 |                                               containing genome annotations.
 62 |                 para (dict(float)): Dictionary for parameter used for filtering of transcripts.
 63 |                 verbose (int): Verbose mode if verbose >0 .
 64 |         """
 65 |         # self.nodes['anno;txid'] = Node(anno, txid)
 66 |         self.nodes = {}
 67 | 
 68 |         # self.edges['ei'] = Edge()
 69 |         self.edges = {}
 70 | 
 71 |         # self.anno[annoid] = Anno()
 72 |         self.anno = {}
 73 |         
 74 |         # list of connected graph components
 75 |         self.component_index = 0
 76 |         self.component_list = []
 77 |         
 78 |         # subset of all transcripts that weren't removed by the transcript comparison rule
 79 |         self.decided_graph = []
 80 | 
 81 |         # dict of duplicate genome annotation ids to new ids
 82 |         self.duplicates = {}
 83 | 
 84 |         # variables for verbose mode
 85 |         self.v = verbose
 86 |         self.f = [[],[],[],[]]
 87 |         self.ties = 0
 88 | 
 89 |         # parameters for decision rule
 90 |         self.para = para
 91 | 
 92 |         # list of transcript set names that are enforced
 93 |         self.keep_tx = keep_tx
 94 | 
 95 |         # init annotations, check for duplicate ids
 96 |         self.init_anno(genome_anno_lst)
 97 |         
 98 |         # filter single exon genes
 99 |         self.filter_single = filter_single
100 |         self.ignore_phase = ignore_phase
101 | 
102 |     def init_anno(self, genome_anno_lst):
103 |         # make sure that the genome_anno ids are unique
104 |         counter = 0
105 |         for ga in genome_anno_lst:
106 |             if ga.id in self.anno.keys():
107 |                 counter += 1
108 |                 new_id = "duplicate.anno.{}".format(counter)
109 |                 self.duplicates.update({new_id : ga.id})
110 |                 ga.change_id(new_id)
111 |             self.anno.update({ga.id : ga})
112 | 
113 |     def __tx_from_key__(self, key):
114 |         """
115 |             Gets a transcript of a node.
116 | 
117 |             Args:
118 |                 key (str): ID of a node as 'anno_id;tx_id'
119 | 
120 |             Returns:
121 |                 (Transcript): Transcript class object with id = tx_id
122 |                               from Anno() with id = anno_id
123 |         """
124 |         anno_id, tx_id = key.split(';')
125 |         return self.anno[anno_id].transcripts[tx_id]
126 | 
127 |     def build(self):
128 |         """
129 |             Builds the overlap graph for >=1 Anno() objects.
130 |             Each node of the graph represents a unique transcript from any annotation.
131 |             Two nodes have an edge if their transcripts overlap.
132 |             Two transcripts overlap if they share at least 3 adjacent protein coding nucleotides.
133 |         """
134 | 
135 |         # tx_start_end[chr] = [tx_id, coord, id for start or end]
136 |         # for every tx one element for start and one for end
137 |         # this dict is used to check for overlapping transcripts
138 |         tx_start_end = {}
139 |         # check for duplicate txs, list of ['start_end_strand']
140 |         unique_tx_keys = {}
141 | 
142 |         for k in self.anno.keys():
143 |             for tx in self.anno[k].get_transcript_list():
144 |                 key = f'{tx.source_anno};{tx.id}'
145 |                 if tx.chr not in tx_start_end.keys():
146 |                     tx_start_end.update({tx.chr : []})
147 |                     unique_tx_keys.update({tx.chr : {}})
148 |                 unique_key = '{}_{}_{}'.format(tx.start, tx.end, tx.strand)
149 |                 if unique_key in unique_tx_keys[tx.chr].keys():
150 |                     check = False
151 |                     coords = tx.get_type_coords('CDS')
152 |                     for t in unique_tx_keys[tx.chr][unique_key]:
153 |                         if coords == t.get_type_coords('CDS'):
154 |                             check = True
155 |                             break
156 |                     if check:
157 |                         if tx.source_anno in self.keep_tx:
158 |                             self.nodes[f'{t.source_anno};{t.id}'].enforce = True
159 |                         self.nodes[f'{t.source_anno};{t.id}'].gene_sets.add(t.source_anno)
160 |                         continue
161 |                 else:
162 |                     unique_tx_keys[tx.chr].update({unique_key : []})
163 |                 unique_tx_keys[tx.chr][unique_key].append(tx)
164 |                 self.nodes.update({key : Node(tx.source_anno, \
165 |                     tx.id)})
166 |                 self.nodes[f'{tx.source_anno};{tx.id}'].gene_sets.add(tx.source_anno)
167 |                 if tx.source_anno in self.keep_tx:
168 |                     self.nodes[key].enforce = True
169 |                 tx_start_end[tx.chr].append([key, tx.start, 0])
170 |                 tx_start_end[tx.chr].append([key, tx.end, 1])
171 | 
172 |         # detect overlapping nodes
173 |         edge_count = 0
174 |         for chr in tx_start_end.keys():
175 |             tx_start_end[chr] = sorted(tx_start_end[chr], key=lambda t:(t[1], t[2]))
176 |             open_intervals = []
177 |             for interval in tx_start_end[chr]:
178 |                 if interval[2] == 0:
179 |                     open_intervals.append(interval[0])
180 |                 else:
181 |                     open_intervals.remove(interval[0])
182 |                     for match in open_intervals:
183 |                         tx1 = self.__tx_from_key__(interval[0])
184 |                         tx2 = self.__tx_from_key__(match)
185 |                         if self.compare_tx_cds(tx1, tx2):
186 |                             new_edge_key = 'e{}'.format(edge_count)
187 |                             edge_count += 1
188 |                             self.edges.update({new_edge_key : Edge(interval[0], match)})
189 |                             self.nodes[interval[0]].edge_to.update({match : new_edge_key})
190 |                             self.nodes[match].edge_to.update({interval[0] : new_edge_key})
191 | 
192 |     def compare_tx_cds(self, tx1, tx2):
193 |         """
194 |             Check if two transcripts share at least 3 adjacent protein
195 |             coding nucleotides on the same strand and reading frame.
196 | 
197 |             Args:
198 |                 tx1 (Transcript): Transcript class object of first transcript
199 |                 tx2 (Transcript): Transcript class object of second transcript
200 | 
201 |             Returns:
202 |                 (boolean): TRUE if they overlap and FALSE otherwise
203 |         """
204 |         if not tx1.strand == tx2.strand:
205 |             return False
206 |         coords = []
207 |         coords += [c + [int(phase)] for phase, coord_phase in tx1.get_type_coords('CDS').items() for c in coord_phase]
208 |         coords += [c + [int(phase)] for phase, coord_phase in tx2.get_type_coords('CDS').items() for c in coord_phase]
209 |         coords = sorted(coords, key = lambda x: x[0])
210 |         
211 |         for i in range(1, len(coords)):
212 |             if coords[i-1][1] - coords[i][0] > 0:       
213 |                 if self.ignore_phase:
214 |                     return True
215 |                 elif tx1.strand == '+' and \
216 |                     abs(coords[i-1][0]-coords[i-1][2]-coords[i][0]+coords[i][2])%3 == 0:
217 |                     return True
218 |                 elif abs(coords[i-1][1]+coords[i-1][2]-coords[i][1]-coords[i][2])%3 == 0:
219 |                     return True
220 |         return False
221 | 
222 |     def add_reference_anno_label(self, ref_anno):
223 |         """
224 |             Sets the value of is_in_ref_anno for each node to 1
225 |             if the coding sequence of the corresponding transcript matches the
226 |             coding sequence of a transcript in the reference anno
227 | 
228 |             Args:
229 |                 ref_anno (Anno): Anno() obeject of reference annotation
230 |         """
231 |         def get_cds_keys(tx):
232 |             keys = [tx.chr, tx.strand] + [str(c[0]) + '_' + str(c[1]) \
233 |                 for c in tx.get_type_coords('CDS', frame=False)]
234 |             return keys
235 |         ref_anno_keys = []
236 |         ref_anno_cds = []
237 |         for tx in ref_anno.transcripts.values():
238 |             cds_keys = get_cds_keys(tx)
239 |             ref_anno_cds += cds_keys
240 |             ref_anno_keys.append('_'.join(cds_keys))
241 |         ref_anno_cds = set(ref_anno_cds)
242 |         ref_anno_keys = set(ref_anno_keys)        
243 |         false_cds_keys = set([])
244 |         correct_cds_keys = set([])
245 |         numb_correct_tx = 0
246 |         for n in self.nodes:
247 |             self.nodes[n].is_in_ref_anno = 0.0
248 |             c_keys = get_cds_keys(self.__tx_from_key__(n))
249 |             if '_'.join(c_keys) in ref_anno_keys:
250 |                 self.nodes[n].is_in_ref_anno = 1.0
251 | 
252 |     def print_nodes(self):
253 |         # prints all nodes of the graph (only used for development)
254 |         for k in self.nodes.keys():
255 |             print(self.nodes[k].id)
256 |             print(self.nodes[k].transcript_id)
257 |             print(self.nodes[k].anno_id)
258 |             print(self.nodes[k].edge_to.keys())
259 |             print('\n')
260 | 
261 |     def connected_components(self):
262 |         """
263 |             Compute all clusters of connected transcripts.
264 |             A cluster is connected component of the graph.
265 |             Adds component IDs to nodes.
266 | 
267 |             Returns:
268 |                 (list(list(str))): Lists of list of all node IDs of a component.
269 |         """
270 |         visited = []
271 |         self.component_list = []
272 |         self.component_index = 0
273 |         for key in list(self.nodes.keys()):
274 |             component = [key]
275 |             if key in visited:
276 |                 continue
277 |             visited.append(key)
278 |             not_visited = list(self.nodes[key].edge_to.keys())
279 |             component += not_visited
280 |             while not_visited:
281 |                 next_node = not_visited.pop()
282 |                 visited.append(next_node)
283 |                 new_nodes = [n for n in self.nodes[next_node].edge_to.keys() if n not in component]
284 |                 not_visited += new_nodes
285 |                 component += new_nodes
286 |             self.component_list.append(component)
287 |             self.component_index += 1
288 |             for node in component:
289 |                 self.nodes[node].component_id = 'g_{}'.format(self.component_index)
290 |         return self.component_list
291 | 
292 |     def add_node_features(self, evi):
293 |         """
294 |             Compute for all nodes the feature vector based on the evidence support by evi.
295 | 
296 |             Args:
297 |                 evi (Evidence): Evidence class object with all hints from any source.
298 |         """
299 |         all_features = []
300 |         for key in self.nodes.keys():
301 |             tx = self.__tx_from_key__(key)
302 |             new_node_feature = Node_features(tx, evi, self.para)
303 |             self.nodes[key].feature_vector = np.array(new_node_feature.get_features())
304 |             all_features.append(self.nodes[key].feature_vector)            
305 |         std = np.std(np.array(all_features)[:,2:], axis=0)
306 |         mean = np.mean(np.array(all_features)[:,2:], axis=0)
307 |         for key in self.nodes.keys():
308 |             tx = self.__tx_from_key__(key)
309 |             self.nodes[key].feature_vector[2:] -= mean
310 |             self.nodes[key].feature_vector[2:] /= std
311 |             if self.nodes[key].feature_vector[0] >= self.para['intron_support'] \
312 |                 or self.nodes[key].feature_vector[1] >= self.para['stasto_support']:
313 |                 self.nodes[key].evi_support = True
314 |             if self.filter_single:
315 |                 if len(tx.transcript_lines['intron']) == 0 and \
316 |                     self.nodes[key].feature_vector[1] == 0:
317 |                     self.nodes[key].evi_support = False
318 |         
319 |     def decide_edge(self, edge, iter_range = range(0,6)):
320 |         """Apply transcript comparison rule to two overlapping transcripts
321 | 
322 |             Args:
323 |                 edge (Edge): edge between two transcripts
324 | 
325 |             Returns:
326 |                 (str): node ID of the transcript that is marked for removal
327 |         """
328 |         
329 |         n1 = self.nodes[edge.node1]
330 |         n2 = self.nodes[edge.node2]
331 |         if n1.evi_support and n2.evi_support:
332 |             tx1 = self.__tx_from_key__(n1.id)
333 |             tx2 = self.__tx_from_key__(n2.id)
334 |             iter_range = range(4)
335 |             if len(tx1.transcript_lines['intron']) == 0 or \
336 |                 len(tx2.transcript_lines['intron']) == 0:
337 |                 iter_range = [1,3]
338 |                 
339 |             for i in iter_range:
340 |                 diff = n1.feature_vector[i] - n2.feature_vector[i]
341 |                 if diff > self.para[f'e_{i+1}']:
342 |                     return n2.id
343 |                 elif diff < (-1 * self.para[f'e_{i+1}']):
344 |                     return n1.id       
345 |         return None
346 | 
347 |     def decide_component(self, component):
348 |         """Applies transcript comparison rule to all transcripts of one component
349 |             and returns the node IDs of all transcripts that are not removed by
350 |             a comparison.
351 | 
352 |             Args:
353 |                 component (list(str)): List of node IDs
354 | 
355 |             Returns:
356 |                 (list(str)): Filtered subset of component list.
357 |         """
358 |         # return all ids of vertices of a graph component, that weren't excluded by the decision rule
359 |         result = component.copy()
360 |         for node_id in component:
361 |             for e_id in self.nodes[node_id].edge_to.values():
362 |                 node_to_remove = self.edges[e_id].node_to_remove
363 |                 if node_to_remove:
364 |                     if node_to_remove in result and \
365 |                         not self.nodes[node_to_remove].enforce:
366 |                         result.remove(node_to_remove)
367 |             if node_id in result and not self.nodes[node_id].evi_support and \
368 |                 not self.nodes[node_id].enforce:
369 |                 result.remove(node_id)
370 |         new_components = [[]]
371 |         visited = []
372 |         for k, n_id in enumerate(result):
373 |             if n_id not in visited:
374 |                 if k > 0:
375 |                     self.component_index += 1
376 |                 not_visited = [n_id]
377 |                 while not_visited:
378 |                     n2_id = not_visited.pop()
379 |                     visited.append(n2_id)
380 |                     new_components[-1].append(n2_id)
381 |                     not_visited += [n for n in self.nodes[n2_id].edge_to \
382 |                         if n in result and n not in not_visited + visited]
383 |                     if k > 0:
384 |                         self.nodes[n2_id].component_id = f'g_{self.component_index}'
385 |         return result
386 | 
387 |     def decide_graph(self):
388 |         """
389 |             Create list of connected components of the graph and apply the
390 |             transcript comparison rule to all components.
391 |         """
392 |         for key in self.edges.keys():
393 |             self.edges[key].node_to_remove = self.decide_edge(self.edges[key])
394 |         self.decided_graph = []
395 |         if not self.component_list:
396 |             self.connected_components()
397 |         for component in self.component_list:
398 |             if len(component) > 1:
399 |                 self.decided_graph += self.decide_component(component)
400 |             elif self.nodes[component[0]].evi_support \
401 |                 or self.nodes[component[0]].enforce:
402 |                 self.decided_graph += component
403 | 
404 |     def get_decided_graph(self):
405 |         """
406 |             Filter graph with the transcript comparison rule.
407 |             Then, remove all transcripts with low evidence support and
408 |             compute the subset of transcripts that are included in the
409 |             combined gene prediciton.
410 | 
411 |             Returns:
412 |                 (dict(list(list(str))): Dictionary with transcript IDs and new
413 |                 gene IDs of all transcripts included in the combined gene prediciton
414 |                 for all input annotations
415 |         """
416 |         if not self.decided_graph:
417 |             self.decide_graph()
418 |         # result[anno_id] = [[tx_ids, new_gene_id]]
419 |         result = {}
420 |         for key in self.anno.keys():
421 |             result.update({key : []})
422 |         for node in self.decided_graph:
423 |             # if self.nodes[node].evi_support or self.nodes[node].enforce:
424 |             anno_id, tx_id = node.split(';')
425 |             result[anno_id].append([tx_id, self.nodes[node].component_id])
426 | 
427 |         if self.v > 0:
428 |             print('NODES: {}'.format(len(self.nodes.keys())))
429 |             f = list(map(set, self.f))
430 |             print('f1: {}'.format(len(f[0])))
431 |             u = f[0]
432 |             print('f2: {}'.format(len(f[1])))
433 |             print('f2/f1: {}'.format(len(f[1].difference(u))))
434 |             u = u.union(f[1])
435 |             print('f3: {}'.format(len(f[2])))
436 |             print('f3/f2/f1: {}'.format(len(f[2].difference(u))))
437 |             u = u.union(f[2])
438 |             print('f4: {}'.format(len(f[3])))
439 |             print('f4/f3/f2/f1: {}'.format(len(f[3].difference(u))))
440 | 
441 |         return result
442 | 


--------------------------------------------------------------------------------
/example/braker1_results/hintsfile.gff:
--------------------------------------------------------------------------------
  1 | 2L	b2h	intron	20750	20830	0	-	.	mult=2;pri=4;src=E
  2 | 2L	b2h	intron	20969	21065	0	-	.	mult=2;pri=4;src=E
  3 | 2L	b2h	intron	20974	21065	0	-	.	pri=4;src=E
  4 | 2L	b2h	intron	20999	21065	9	-	.	mult=9;pri=4;src=E
  5 | 2L	b2h	intron	21201	21346	6	-	.	mult=6;pri=4;src=E
  6 | 2L	b2h	intron	21201	21426	0	-	.	pri=4;src=E
  7 | 2L	b2h	intron	22942	22997	0	+	.	pri=4;src=E
  8 | 2L	b2h	intron	23874	23928	0	-	.	pri=4;src=E
  9 | 2L	b2h	intron	26689	26765	63	-	.	mult=63;pri=4;src=E
 10 | 2L	b2h	intron	26965	27052	56	-	.	mult=56;pri=4;src=E
 11 | 2L	b2h	intron	27491	28014	42	-	.	mult=42;pri=4;src=E
 12 | 2L	b2h	intron	27491	28410	0	-	.	pri=4;src=E
 13 | 2L	b2h	intron	28241	28410	2	-	.	mult=2;pri=4;src=E
 14 | 2L	b2h	intron	28241	28732	53	-	.	mult=53;pri=4;src=E
 15 | 2L	b2h	intron	28927	28981	43	-	.	mult=43;pri=4;src=E
 16 | 2L	b2h	intron	29069	30393	21	-	.	mult=21;pri=4;src=E
 17 | 2L	b2h	intron	33271	33728	0	-	.	pri=4;src=E
 18 | 2L	b2h	intron	29069	33844	21	-	.	mult=21;pri=4;src=E
 19 | 2L	b2h	intron	30587	33844	0	-	.	pri=4;src=E
 20 | 2L	b2h	intron	33271	33844	13	-	.	mult=13;pri=4;src=E
 21 | 2L	b2h	intron	29069	34557	29	-	.	mult=29;pri=4;src=E
 22 | 2L	b2h	intron	33271	34557	6	-	.	mult=6;pri=4;src=E
 23 | 2L	b2h	intron	34289	34557	27	-	.	mult=27;pri=4;src=E
 24 | 2L	b2h	intron	34605	34719	60	-	.	mult=60;pri=4;src=E
 25 | 2L	b2h	intron	37812	37915	0	-	.	pri=4;src=E
 26 | 2L	b2h	intron	34913	38534	9	-	.	mult=9;pri=4;src=E
 27 | 2L	b2h	intron	35213	38534	36	-	.	mult=36;pri=4;src=E
 28 | 2L	b2h	intron	38299	38534	8	-	.	mult=8;pri=4;src=E
 29 | 2L	b2h	intron	38732	39300	40	-	.	mult=40;pri=4;src=E
 30 | 2L	b2h	intron	25177	54176	0	+	.	pri=4;src=E
 31 | 2L	b2h	intron	39858	58080	48	-	.	mult=48;pri=4;src=E
 32 | 2L	b2h	intron	58183	58958	5	-	.	mult=5;pri=4;src=E
 33 | 2L	b2h	intron	58183	59189	21	-	.	mult=21;pri=4;src=E
 34 | 2L	b2h	intron	58732	59189	0	-	.	pri=4;src=E
 35 | 2L	b2h	intron	21371	64891	0	+	.	pri=4;src=E
 36 | 2L	b2h	intron	58183	65345	0	-	.	pri=4;src=E
 37 | 2L	b2h	intron	58183	66575	4	-	.	mult=4;pri=4;src=E
 38 | 2L	b2h	intron	66613	66675	0	+	.	pri=4;src=E
 39 | 2L	b2h	intron	65411	66681	0	-	.	pri=4;src=E
 40 | 2L	b2h	intron	66854	67388	0	+	.	pri=4;src=E
 41 | 2L	b2h	intron	67112	67388	0	+	.	pri=4;src=E
 42 | 2L	b2h	intron	67138	67388	3	+	.	mult=3;pri=4;src=E
 43 | 2L	b2h	intron	23423	67568	0	+	.	pri=4;src=E
 44 | 2L	b2h	intron	67508	67568	30	+	.	mult=30;pri=4;src=E
 45 | 2L	b2h	intron	67763	67891	1070	+	.	mult=1070;pri=4;src=E
 46 | 2L	b2h	intron	68024	68084	862	+	.	mult=862;pri=4;src=E
 47 | 2L	b2h	intron	67996	70268	0	+	.	pri=4;src=E
 48 | 2L	b2h	intron	70550	70606	81	+	.	mult=81;pri=4;src=E
 49 | 2L	b2h	intron	70550	70613	0	+	.	pri=4;src=E
 50 | 2L	b2h	intron	71805	71924	0	+	.	pri=4;src=E
 51 | 2L	b2h	intron	71805	71949	36	+	.	mult=36;pri=4;src=E
 52 | 2L	b2h	intron	58732	73110	0	-	.	pri=4;src=E
 53 | 2L	b2h	intron	73448	73557	0	-	.	pri=4;src=E
 54 | 2L	b2h	intron	72954	73668	0	+	.	pri=4;src=E
 55 | 2L	b2h	intron	39858	73754	0	-	.	pri=4;src=E
 56 | 2L	b2h	intron	73693	73819	5	+	.	mult=5;pri=4;src=E
 57 | 2L	b2h	intron	72082	74902	13	+	.	mult=13;pri=4;src=E
 58 | 2L	b2h	intron	72954	74902	0	+	.	mult=2;pri=4;src=E
 59 | 2L	b2h	intron	72978	74902	188	+	.	mult=188;pri=4;src=E
 60 | 2L	b2h	intron	73546	74902	0	+	.	pri=4;src=E
 61 | 2L	b2h	intron	73586	74902	3	+	.	mult=3;pri=4;src=E
 62 | 2L	b2h	intron	73693	74902	77	+	.	mult=77;pri=4;src=E
 63 | 2L	b2h	intron	73898	74902	14	+	.	mult=14;pri=4;src=E
 64 | 2L	b2h	intron	74573	74902	46	+	.	mult=46;pri=4;src=E
 65 | 2L	b2h	intron	72978	75077	0	+	.	mult=2;pri=4;src=E
 66 | 2L	b2h	intron	73693	75077	3	+	.	mult=3;pri=4;src=E
 67 | 2L	b2h	intron	74573	75077	0	+	.	mult=3;pri=4;src=E
 68 | 2L	b2h	intron	75019	75077	435	+	.	mult=435;pri=4;src=E
 69 | 2L	b2h	intron	75367	75426	6	+	.	mult=6;pri=4;src=E
 70 | 2L	b2h	intron	75367	77480	2	+	.	mult=2;pri=4;src=E
 71 | 2L	b2h	intron	77584	77641	4	+	.	mult=4;pri=4;src=E
 72 | 2L	b2h	intron	78870	78938	0	-	.	mult=2;pri=4;src=E
 73 | 2L	b2h	intron	78231	81244	2	-	.	mult=2;pri=4;src=E
 74 | 2L	b2h	intron	64019	82446	0	-	.	pri=4;src=E
 75 | 2L	b2h	intron	75367	85176	0	+	.	pri=4;src=E
 76 | 2L	b2h	intron	85177	85243	2	+	.	mult=2;pri=4;src=E
 77 | 2L	b2h	intron	68160	85470	0	+	.	pri=4;src=E
 78 | 2L	b2h	intron	84278	87019	9	-	.	mult=9;pri=4;src=E
 79 | 2L	b2h	intron	94893	94988	37	+	.	mult=37;pri=4;src=E
 80 | 2L	b2h	intron	95071	95131	50	+	.	mult=50;pri=4;src=E
 81 | 2L	b2h	intron	94893	95144	0	+	.	pri=4;src=E
 82 | 2L	b2h	intron	95302	95353	73	+	.	mult=73;pri=4;src=E
 83 | 2L	b2h	intron	97834	97883	26	+	.	mult=26;pri=4;src=E
 84 | 2L	b2h	intron	97834	97895	0	+	.	pri=4;src=E
 85 | 2L	b2h	intron	98471	98526	74	+	.	mult=74;pri=4;src=E
 86 | 2L	b2h	intron	99347	99400	0	+	.	pri=4;src=E
 87 | 2L	b2h	intron	99347	99658	0	+	.	pri=4;src=E
 88 | 2L	b2h	intron	99724	99784	99	+	.	mult=99;pri=4;src=E
 89 | 2L	b2h	intron	100517	100571	81	+	.	mult=81;pri=4;src=E
 90 | 2L	b2h	intron	100704	100761	93	+	.	mult=93;pri=4;src=E
 91 | 2L	b2h	intron	100704	100810	0	+	.	pri=4;src=E
 92 | 2L	b2h	intron	100943	101015	88	+	.	mult=88;pri=4;src=E
 93 | 2L	b2h	intron	101195	101248	90	+	.	mult=90;pri=4;src=E
 94 | 2L	b2h	intron	101620	101875	62	+	.	mult=62;pri=4;src=E
 95 | 2L	b2h	intron	101915	101978	6	+	.	mult=6;pri=4;src=E
 96 | 2L	b2h	intron	102907	102964	4	+	.	mult=4;pri=4;src=E
 97 | 2L	b2h	intron	102904	102991	6	+	.	mult=6;pri=4;src=E
 98 | 2L	b2h	intron	102907	102993	0	-	.	pri=4;src=E
 99 | 2L	b2h	intron	102907	103005	3610	+	.	mult=3610;pri=4;src=E
100 | 2L	b2h	intron	103435	103515	203	+	.	mult=203;pri=4;src=E
101 | 2L	b2h	intron	103435	103877	9	+	.	mult=9;pri=4;src=E
102 | 2L	b2h	intron	94762	104804	0	-	.	mult=2;pri=4;src=E
103 | 2L	b2h	intron	104919	105004	0	-	.	pri=4;src=E
104 | 2L	b2h	intron	104948	105004	127	-	.	mult=127;pri=4;src=E
105 | 2L	b2h	intron	105337	105390	113	-	.	mult=113;pri=4;src=E
106 | 2L	b2h	intron	105456	105511	177	-	.	mult=177;pri=4;src=E
107 | 2L	b2h	intron	105456	105514	0	-	.	pri=4;src=E
108 | 2L	b2h	intron	105916	105968	116	-	.	mult=116;pri=4;src=E
109 | 2L	b2h	intron	105456	106437	0	-	.	pri=4;src=E
110 | 2L	b2h	intron	33980	106454	0	+	.	pri=4;src=E
111 | 2L	b2h	intron	107839	108147	4	+	.	mult=4;pri=4;src=E
112 | 2L	b2h	intron	107485	108439	0	-	.	mult=2;pri=4;src=E
113 | 2L	b2h	intron	106603	108587	0	+	.	pri=4;src=E
114 | 2L	b2h	intron	107839	108587	175	+	.	mult=175;pri=4;src=E
115 | 2L	b2h	intron	108102	108587	134	+	.	mult=134;pri=4;src=E
116 | 2L	b2h	intron	108211	108587	2	+	.	mult=2;pri=4;src=E
117 | 2L	b2h	intron	108227	108587	649	+	.	mult=649;pri=4;src=E
118 | 2L	b2h	intron	108347	108587	42	+	.	mult=42;pri=4;src=E
119 | 2L	b2h	intron	105456	109079	0	-	.	mult=2;pri=4;src=E
120 | 2L	b2h	intron	108810	109703	0	+	.	pri=4;src=E
121 | 2L	b2h	intron	107839	110405	0	+	.	pri=4;src=E
122 | 2L	b2h	intron	108810	110405	1039	+	.	mult=1039;pri=4;src=E
123 | 2L	b2h	intron	110484	110754	1082	+	.	mult=1082;pri=4;src=E
124 | 2L	b2h	intron	110878	111004	258	+	.	mult=258;pri=4;src=E
125 | 2L	b2h	intron	110878	111037	0	+	.	pri=4;src=E
126 | 2L	b2h	intron	110878	111368	0	+	.	pri=4;src=E
127 | 2L	b2h	intron	111118	111368	0	+	.	mult=2;pri=4;src=E
128 | 2L	b2h	intron	110878	111906	1090	+	.	mult=1090;pri=4;src=E
129 | 2L	b2h	intron	111118	111906	11	+	.	mult=11;pri=4;src=E
130 | 2L	b2h	intron	111100	112001	2	+	.	mult=2;pri=4;src=E
131 | 2L	b2h	intron	110878	112670	0	+	.	mult=2;pri=4;src=E
132 | 2L	b2h	intron	110878	112689	11	+	.	mult=11;pri=4;src=E
133 | 2L	b2h	intron	111118	112689	237	+	.	mult=237;pri=4;src=E
134 | 2L	b2h	intron	112020	112689	966	+	.	mult=966;pri=4;src=E
135 | 2L	b2h	intron	112022	112689	0	+	.	pri=4;src=E
136 | 2L	b2h	intron	112913	113097	0	-	.	pri=4;src=E
137 | 2L	b2h	intron	106792	113176	0	+	.	pri=4;src=E
138 | 2L	b2h	intron	113370	113433	1447	+	.	mult=1447;pri=4;src=E
139 | 2L	b2h	intron	117760	117819	198	-	.	mult=198;pri=4;src=E
140 | 2L	b2h	intron	117776	117819	3	-	.	mult=3;pri=4;src=E
141 | 2L	b2h	intron	118077	118135	215	-	.	mult=215;pri=4;src=E
142 | 2L	b2h	intron	100082	118360	0	-	.	pri=4;src=E
143 | 2L	b2h	intron	118305	118360	159	-	.	mult=159;pri=4;src=E
144 | 2L	b2h	intron	118875	118930	291	-	.	mult=291;pri=4;src=E
145 | 2L	b2h	intron	119077	119133	181	-	.	mult=181;pri=4;src=E
146 | 2L	b2h	intron	119080	119133	2	-	.	mult=2;pri=4;src=E
147 | 2L	b2h	intron	119215	119287	0	-	.	pri=4;src=E
148 | 2L	b2h	intron	119236	119287	172	-	.	mult=172;pri=4;src=E
149 | 2L	b2h	intron	119238	119287	5	+	.	mult=5;pri=4;src=E
150 | 2L	b2h	intron	119236	119291	3	-	.	mult=3;pri=4;src=E
151 | 2L	b2h	intron	119236	119430	7	-	.	mult=7;pri=4;src=E
152 | 2L	b2h	intron	119376	119430	143	-	.	mult=143;pri=4;src=E
153 | 2L	b2h	intron	119555	119827	216	-	.	mult=216;pri=4;src=E
154 | 2L	b2h	intron	120081	120167	29	-	.	mult=29;pri=4;src=E
155 | 2L	b2h	intron	120081	120199	2	-	.	mult=2;pri=4;src=E
156 | 2L	b2h	intron	120048	120420	0	-	.	pri=4;src=E
157 | 2L	b2h	intron	120078	120420	2	-	.	mult=2;pri=4;src=E
158 | 2L	b2h	intron	120081	120420	76	-	.	mult=76;pri=4;src=E
159 | 2L	b2h	intron	120266	120420	6	-	.	mult=6;pri=4;src=E
160 | 2L	b2h	intron	120361	120420	2	-	.	mult=2;pri=4;src=E
161 | 2L	b2h	intron	120365	120420	26	-	.	mult=26;pri=4;src=E
162 | 2L	b2h	intron	120458	120510	5	-	.	mult=5;pri=4;src=E
163 | 2L	b2h	intron	121032	121090	0	+	.	pri=4;src=E
164 | 2L	b2h	intron	121302	121354	3	+	.	mult=3;pri=4;src=E
165 | 2L	b2h	intron	121302	121386	0	+	.	mult=3;pri=4;src=E
166 | 2L	b2h	intron	120081	121632	32	-	.	mult=32;pri=4;src=E
167 | 2L	b2h	intron	122995	123080	148	-	.	mult=148;pri=4;src=E
168 | 2L	b2h	intron	123073	123139	20	+	.	mult=20;pri=4;src=E
169 | 2L	b2h	intron	123073	123146	0	+	.	mult=2;pri=4;src=E
170 | 2L	b2h	intron	123259	123324	5	+	.	mult=5;pri=4;src=E
171 | 2L	b2h	intron	123630	123693	854	-	.	mult=854;pri=4;src=E
172 | 2L	b2h	intron	123795	123855	843	-	.	mult=843;pri=4;src=E
173 | 2L	b2h	intron	123988	124086	0	-	.	pri=4;src=E
174 | 2L	b2h	intron	124025	124086	1061	-	.	mult=1061;pri=4;src=E
175 | 2L	b2h	intron	124027	124086	6	+	.	mult=6;pri=4;src=E
176 | 2L	b2h	intron	124921	125076	133	-	.	mult=133;pri=4;src=E
177 | 2L	b2h	intron	125267	126109	666	-	.	mult=666;pri=4;src=E
178 | 2L	b2h	intron	126228	126309	7	-	.	mult=7;pri=4;src=E
179 | 2L	b2h	intron	126228	127142	0	-	.	pri=4;src=E
180 | 2L	b2h	intron	125267	127380	0	-	.	pri=4;src=E
181 | 2L	b2h	intron	127295	127380	4	-	.	mult=4;pri=4;src=E
182 | 2L	b2h	intron	126228	128043	0	-	.	pri=4;src=E
183 | 2L	b2h	intron	124180	128437	0	-	.	pri=4;src=E
184 | 2L	b2h	intron	127620	128495	0	-	.	pri=4;src=E
185 | 2L	b2h	intron	126412	128554	3	-	.	mult=3;pri=4;src=E
186 | 2L	b2h	intron	126228	128799	681	-	.	mult=681;pri=4;src=E
187 | 2L	b2h	intron	126228	130507	2	-	.	mult=2;pri=4;src=E
188 | 2L	b2h	intron	128943	130507	10	-	.	mult=10;pri=4;src=E
189 | 2L	b2h	intron	128883	131979	0	+	.	pri=4;src=E
190 | 2L	b2h	intron	132256	132475	621	+	.	mult=621;pri=4;src=E
191 | 2L	b2h	intron	132746	132790	0	-	.	mult=2;pri=4;src=E
192 | 2L	b2h	intron	133182	133268	10	+	.	mult=10;pri=4;src=E
193 | 2L	b2h	intron	121750	133663	0	-	.	pri=4;src=E
194 | 2L	b2h	intron	139199	139255	0	-	.	pri=4;src=E
195 | 2L	b2h	intron	139688	139740	23	-	.	mult=23;pri=4;src=E
196 | 2L	b2h	intron	139688	139757	0	-	.	mult=2;pri=4;src=E
197 | 2L	b2h	intron	139955	140113	4	-	.	mult=4;pri=4;src=E
198 | 2L	b2h	intron	140697	140767	0	-	.	pri=4;src=E
199 | 2L	b2h	intron	141341	141395	212	-	.	mult=212;pri=4;src=E
200 | 2L	b2h	intron	141557	141609	0	-	.	pri=4;src=E
201 | 2L	b2h	intron	141610	141661	0	-	.	mult=2;pri=4;src=E
202 | 2L	b2h	intron	141610	141670	290	-	.	mult=290;pri=4;src=E
203 | 2L	b2h	intron	141621	141670	0	-	.	pri=4;src=E
204 | 2L	b2h	intron	121846	144080	0	-	.	pri=4;src=E
205 | 2L	b2h	intron	145911	145974	13	-	.	mult=13;pri=4;src=E
206 | 2L	b2h	intron	145911	146592	0	-	.	mult=2;pri=4;src=E
207 | 2L	b2h	intron	139688	147348	0	-	.	pri=4;src=E
208 | 2L	b2h	intron	145911	147348	15	-	.	mult=15;pri=4;src=E
209 | 2L	b2h	intron	147455	147510	10	-	.	mult=10;pri=4;src=E
210 | 2L	b2h	intron	147711	147764	9	-	.	mult=9;pri=4;src=E
211 | 2L	b2h	intron	147937	147994	5	-	.	mult=5;pri=4;src=E
212 | 2L	b2h	intron	148038	148091	4	-	.	mult=4;pri=4;src=E
213 | 2L	b2h	intron	148174	148337	9	-	.	mult=9;pri=4;src=E
214 | 2L	b2h	intron	148826	148878	9	-	.	mult=9;pri=4;src=E
215 | 2L	b2h	intron	132587	149154	0	-	.	pri=4;src=E
216 | 2L	b2h	intron	149227	151114	5	-	.	mult=5;pri=4;src=E
217 | 2L	b2h	intron	155060	155127	45	-	.	mult=45;pri=4;src=E
218 | 2L	b2h	intron	155179	155249	5	+	.	mult=5;pri=4;src=E
219 | 2L	b2h	intron	155411	155465	0	+	.	pri=4;src=E
220 | 2L	b2h	intron	155430	155493	6	+	.	mult=6;pri=4;src=E
221 | 2L	b2h	intron	114992	155545	2	+	.	mult=2;pri=4;src=E
222 | 2L	b2h	intron	141191	155545	3	+	.	mult=3;pri=4;src=E
223 | 2L	b2h	intron	155430	155545	216	+	.	mult=216;pri=4;src=E
224 | 2L	b2h	intron	114992	155566	15	+	.	mult=15;pri=4;src=E
225 | 2L	b2h	intron	155179	155566	0	+	.	pri=4;src=E
226 | 2L	b2h	intron	155430	155566	552	+	.	mult=552;pri=4;src=E
227 | 2L	b2h	intron	114992	155637	3	+	.	mult=3;pri=4;src=E
228 | 2L	b2h	intron	155179	155637	2	+	.	mult=2;pri=4;src=E
229 | 2L	b2h	intron	155430	155637	19	+	.	mult=19;pri=4;src=E
230 | 2L	b2h	intron	155785	155857	1373	+	.	mult=1373;pri=4;src=E
231 | 2L	b2h	intron	155797	155857	0	+	.	mult=2;pri=4;src=E
232 | 2L	b2h	intron	77435	155873	0	-	.	pri=4;src=E
233 | 2L	b2h	intron	155971	156044	9	+	.	mult=9;pri=4;src=E
234 | 2L	b2h	intron	155971	156175	0	+	.	pri=4;src=E
235 | 2L	b2h	intron	156349	156422	230	+	.	mult=230;pri=4;src=E
236 | 2L	b2h	intron	114992	156735	0	+	.	mult=3;pri=4;src=E
237 | 2L	b2h	intron	156553	156735	307	+	.	mult=307;pri=4;src=E
238 | 2L	b2h	intron	158376	158435	79	-	.	mult=79;pri=4;src=E
239 | 2L	b2h	intron	84105	159306	0	-	.	pri=4;src=E
240 | 2L	b2h	intron	116076	160008	0	+	.	pri=4;src=E
241 | 2L	b2h	intron	159187	160008	0	+	.	pri=4;src=E
242 | 2L	b2h	intron	159818	160008	0	+	.	pri=4;src=E
243 | 2L	b2h	intron	159820	160008	95	+	.	mult=95;pri=4;src=E
244 | 2L	b2h	intron	114992	162591	0	+	.	pri=4;src=E
245 | 2L	b2h	intron	160024	162591	53	+	.	mult=53;pri=4;src=E
246 | 2L	b2h	intron	160129	162591	4	+	.	mult=4;pri=4;src=E
247 | 2L	b2h	intron	163002	164158	3	+	.	mult=3;pri=4;src=E
248 | 2L	b2h	intron	161449	172975	0	-	.	pri=4;src=E
249 | 2L	b2h	intron	160024	175360	0	+	.	pri=4;src=E
250 | 2L	b2h	intron	163002	175360	13	+	.	mult=13;pri=4;src=E
251 | 2L	b2h	intron	164235	175360	0	+	.	pri=4;src=E
252 | 2L	b2h	intron	164279	175360	0	+	.	pri=4;src=E
253 | 2L	b2h	intron	180732	180799	0	-	.	mult=2;pri=4;src=E
254 | 2L	b2h	intron	181361	181469	0	-	.	pri=4;src=E
255 | 2L	b2h	intron	160024	183760	0	+	.	pri=4;src=E
256 | 2L	b2h	intron	163002	183760	10	+	.	mult=10;pri=4;src=E
257 | 2L	b2h	intron	175448	183760	10	+	.	mult=10;pri=4;src=E
258 | 2L	b2h	intron	183420	183760	12	+	.	mult=12;pri=4;src=E
259 | 2L	b2h	intron	185083	185151	42	+	.	mult=42;pri=4;src=E
260 | 2L	b2h	intron	183788	185507	0	-	.	pri=4;src=E
261 | 2L	b2h	intron	186124	186192	56	+	.	mult=56;pri=4;src=E
262 | 2L	b2h	intron	186856	186909	38	+	.	mult=38;pri=4;src=E
263 | 2L	b2h	intron	187017	187481	72	+	.	mult=72;pri=4;src=E
264 | 2L	b2h	intron	187184	187481	2	+	.	mult=2;pri=4;src=E
265 | 2L	b2h	intron	184508	189378	0	-	.	pri=4;src=E
266 | 2L	b2h	intron	141702	193142	0	+	.	pri=4;src=E
267 | 2L	b2h	intron	197637	197736	0	+	.	pri=4;src=E
268 | 2L	b2h	intron	155547	198232	0	+	.	pri=4;src=E
269 | 2L	b2h	intron	199576	199636	160	+	.	mult=160;pri=4;src=E
270 | 2L	b2h	intron	199744	199902	45	+	.	mult=45;pri=4;src=E
271 | 2L	b2h	intron	199744	200119	98	+	.	mult=98;pri=4;src=E
272 | 2L	b2h	intron	199984	200119	51	+	.	mult=51;pri=4;src=E
273 | 2L	b2h	intron	200316	200375	167	+	.	mult=167;pri=4;src=E
274 | 2L	b2h	intron	200676	200741	149	+	.	mult=149;pri=4;src=E
275 | 2L	b2h	intron	200676	200752	0	+	.	pri=4;src=E
276 | 2L	b2h	intron	201062	201126	223	+	.	mult=223;pri=4;src=E
277 | 2L	b2h	intron	194140	203534	0	-	.	pri=4;src=E
278 | 2L	b2h	intron	203466	203534	0	-	.	pri=4;src=E
279 | 2L	b2h	intron	203892	203992	290	+	.	mult=290;pri=4;src=E
280 | 2L	b2h	intron	195421	204324	0	-	.	pri=4;src=E
281 | 2L	b2h	intron	204919	204989	0	+	.	pri=4;src=E
282 | 2L	b2h	intron	205209	205271	3	-	.	mult=3;pri=4;src=E
283 | 2L	b2h	intron	205213	205271	43	-	.	mult=43;pri=4;src=E
284 | 2L	b2h	intron	205365	205434	0	-	.	pri=4;src=E
285 | 2L	b2h	intron	205477	205536	0	-	.	mult=4;pri=4;src=E
286 | 2L	b2h	intron	205944	206000	23	-	.	mult=23;pri=4;src=E
287 | 2L	b2h	intron	206051	206699	20	-	.	mult=20;pri=4;src=E
288 | 2L	b2h	intron	207632	207710	94	+	.	mult=94;pri=4;src=E
289 | 2L	b2h	intron	208609	208728	95	+	.	mult=95;pri=4;src=E
290 | 2L	b2h	intron	193216	209411	0	-	.	pri=4;src=E
291 | 2L	b2h	intron	214179	214277	2	-	.	mult=2;pri=4;src=E
292 | 2L	b2h	intron	206282	214280	0	+	.	pri=4;src=E
293 | 2L	b2h	intron	214943	215033	296	-	.	mult=296;pri=4;src=E
294 | 2L	b2h	intron	215669	215772	0	-	.	mult=4;pri=4;src=E
295 | 2L	b2h	intron	215677	215772	96	-	.	mult=96;pri=4;src=E
296 | 2L	b2h	intron	215677	215985	0	-	.	pri=4;src=E
297 | 2L	b2h	intron	216008	216076	219	-	.	mult=219;pri=4;src=E
298 | 2L	b2h	intron	216276	216344	258	-	.	mult=258;pri=4;src=E
299 | 2L	b2h	intron	216574	216631	231	-	.	mult=231;pri=4;src=E
300 | 2L	b2h	intron	218886	221505	11	-	.	mult=11;pri=4;src=E
301 | 2L	b2h	intron	218886	226115	67	-	.	mult=67;pri=4;src=E
302 | 2L	b2h	intron	218886	227371	13	-	.	mult=13;pri=4;src=E
303 | 2L	b2h	intron	226509	227371	67	-	.	mult=67;pri=4;src=E
304 | 2L	b2h	intron	227548	228126	0	-	.	pri=4;src=E
305 | 2L	b2h	intron	227548	228132	64	-	.	mult=64;pri=4;src=E
306 | 2L	b2h	intron	216893	228135	0	+	.	pri=4;src=E
307 | 2L	b2h	intron	228282	229172	123	-	.	mult=123;pri=4;src=E
308 | 2L	b2h	intron	228282	229929	0	-	.	pri=4;src=E
309 | 2L	b2h	intron	229383	229929	12	-	.	mult=12;pri=4;src=E
310 | 2L	b2h	intron	228282	230537	8	-	.	mult=8;pri=4;src=E
311 | 2L	b2h	intron	229351	230537	0	-	.	pri=4;src=E
312 | 2L	b2h	intron	229383	230537	76	-	.	mult=76;pri=4;src=E
313 | 2L	b2h	intron	229978	230537	7	-	.	mult=7;pri=4;src=E
314 | 2L	b2h	intron	230628	230757	90	-	.	mult=90;pri=4;src=E
315 | 2L	b2h	intron	231035	231367	0	-	.	pri=4;src=E
316 | 2L	b2h	intron	231035	231947	32	-	.	mult=32;pri=4;src=E
317 | 2L	b2h	intron	231035	231962	62	-	.	mult=62;pri=4;src=E
318 | 2L	b2h	intron	232102	232291	234	-	.	mult=234;pri=4;src=E
319 | 2L	b2h	intron	233155	233216	43	+	.	mult=43;pri=4;src=E
320 | 2L	b2h	intron	232380	233996	124	-	.	mult=124;pri=4;src=E
321 | 2L	b2h	intron	234175	234236	115	-	.	mult=115;pri=4;src=E
322 | 2L	b2h	intron	240406	240489	0	-	.	pri=4;src=E
323 | 2L	b2h	intron	240076	241608	0	-	.	pri=4;src=E
324 | 2L	b2h	intron	242144	242243	100	-	.	mult=100;pri=4;src=E
325 | 2L	b2h	intron	242429	242507	7	-	.	mult=7;pri=4;src=E
326 | 2L	b2h	intron	242429	242519	0	-	.	pri=4;src=E
327 | 2L	b2h	intron	242745	242800	3	-	.	mult=3;pri=4;src=E
328 | 2L	b2h	intron	242926	243198	0	-	.	pri=4;src=E
329 | 2L	b2h	intron	201826	245768	0	-	.	pri=4;src=E
330 | 2L	b2h	intron	244568	246393	0	-	.	pri=4;src=E
331 | 2L	b2h	intron	242926	248604	5	-	.	mult=5;pri=4;src=E
332 | 2L	b2h	intron	242745	249005	0	-	.	pri=4;src=E
333 | 2L	b2h	intron	242745	249036	3	-	.	mult=3;pri=4;src=E
334 | 2L	b2h	intron	242745	250756	67	-	.	mult=67;pri=4;src=E
335 | 2L	b2h	intron	242926	250756	2	-	.	mult=2;pri=4;src=E
336 | 2L	b2h	intron	248855	250756	2	-	.	mult=2;pri=4;src=E
337 | 2L	b2h	intron	242745	250806	9	-	.	mult=9;pri=4;src=E
338 | 2L	b2h	intron	251442	251526	4	-	.	mult=4;pri=4;src=E
339 | 2L	b2h	intron	251465	251526	109	-	.	mult=109;pri=4;src=E
340 | 2L	b2h	intron	251802	251855	67	-	.	mult=67;pri=4;src=E
341 | 2L	b2h	intron	251925	251978	66	-	.	mult=66;pri=4;src=E
342 | 2L	b2h	intron	252167	252226	84	-	.	mult=84;pri=4;src=E
343 | 2L	b2h	intron	253100	253182	0	+	.	mult=2;pri=4;src=E
344 | 2L	b2h	intron	253083	253219	0	-	.	pri=4;src=E
345 | 2L	b2h	intron	253146	253219	80	-	.	mult=80;pri=4;src=E
346 | 2L	b2h	intron	253386	253445	90	-	.	mult=90;pri=4;src=E
347 | 2L	b2h	intron	253891	253955	73	-	.	mult=73;pri=4;src=E
348 | 2L	b2h	intron	254398	255617	0	-	.	pri=4;src=E
349 | 2L	b2h	intron	254398	258452	0	-	.	pri=4;src=E
350 | 2L	b2h	intron	197125	261850	0	-	.	pri=4;src=E
351 | 2L	b2h	intron	231424	263049	0	+	.	pri=4;src=E
352 | 2L	b2h	intron	254398	263752	0	-	.	pri=4;src=E
353 | 2L	b2h	intron	268031	268331	0	+	.	pri=4;src=E
354 | 2L	b2h	intron	195330	269137	0	+	.	pri=4;src=E
355 | 2L	b2h	intron	269390	269457	113	+	.	mult=113;pri=4;src=E
356 | 2L	b2h	intron	270607	270660	107	+	.	mult=107;pri=4;src=E
357 | 2L	b2h	intron	195330	271024	0	+	.	pri=4;src=E
358 | 2L	b2h	intron	254398	271628	25	-	.	mult=25;pri=4;src=E
359 | 2L	b2h	intron	254425	271628	0	-	.	mult=2;pri=4;src=E
360 | 2L	b2h	intron	272507	272554	101	-	.	mult=101;pri=4;src=E
361 | 2L	b2h	intron	275252	275318	108	-	.	mult=108;pri=4;src=E
362 | 2L	b2h	intron	276498	276738	159	-	.	mult=159;pri=4;src=E
363 | 2L	b2h	intron	276904	276958	111	-	.	mult=111;pri=4;src=E
364 | 2L	b2h	intron	276904	276973	12	-	.	mult=12;pri=4;src=E
365 | 2L	b2h	intron	277864	277929	32	+	.	mult=32;pri=4;src=E
366 | 2L	b2h	intron	278324	278642	32	+	.	mult=32;pri=4;src=E
367 | 2L	b2h	intron	279125	279184	43	+	.	mult=43;pri=4;src=E
368 | 2L	b2h	intron	215420	279383	0	-	.	pri=4;src=E
369 | 2L	b2h	intron	279806	279869	52	+	.	mult=52;pri=4;src=E
370 | 2L	b2h	intron	280041	280109	59	+	.	mult=59;pri=4;src=E
371 | 2L	b2h	intron	280423	280476	0	+	.	pri=4;src=E
372 | 2L	b2h	intron	282506	282653	50	-	.	mult=50;pri=4;src=E
373 | 2L	b2h	intron	261097	283606	0	+	.	pri=4;src=E
374 | 2L	b2h	intron	283653	283705	0	-	.	pri=4;src=E
375 | 2L	b2h	intron	284211	284321	3	-	.	mult=3;pri=4;src=E
376 | 2L	b2h	intron	284211	284457	0	-	.	mult=2;pri=4;src=E
377 | 2L	b2h	intron	282506	284747	2	-	.	mult=2;pri=4;src=E
378 | 2L	b2h	intron	284211	284747	1248	-	.	mult=1248;pri=4;src=E
379 | 2L	b2h	intron	284457	284747	0	-	.	mult=2;pri=4;src=E
380 | 2L	b2h	intron	284969	285366	5	-	.	mult=5;pri=4;src=E
381 | 2L	b2h	intron	284211	285432	0	-	.	pri=4;src=E
382 | 2L	b2h	intron	284969	285432	356	-	.	mult=356;pri=4;src=E
383 | 2L	b2h	intron	284969	286306	81	-	.	mult=81;pri=4;src=E
384 | 2L	b2h	intron	284969	290812	545	-	.	mult=545;pri=4;src=E
385 | 2L	b2h	intron	287094	290815	6	+	.	mult=6;pri=4;src=E
386 | 2L	b2h	intron	291032	291566	0	-	.	pri=4;src=E
387 | 2L	b2h	intron	291032	291606	217	-	.	mult=217;pri=4;src=E
388 | 2L	b2h	intron	291032	291626	0	-	.	pri=4;src=E
389 | 2L	b2h	intron	291032	291660	17	-	.	mult=17;pri=4;src=E
390 | 2L	b2h	intron	292661	292716	172	+	.	mult=172;pri=4;src=E
391 | 2L	b2h	intron	292706	293993	0	-	.	pri=4;src=E
392 | 2L	b2h	intron	294373	294440	251	-	.	mult=251;pri=4;src=E
393 | 


--------------------------------------------------------------------------------
/bin/genome_anno.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | # ==============================================================
  3 | # Lars Gabriel
  4 | #
  5 | # genome_anno.py: Handles the data structure for a genome annotation file
  6 | # ==============================================================
  7 | 
  8 | import os
  9 | import sys
 10 | import csv
 11 | 
 12 | class NotGtfFormat(Exception):
 13 |     pass
 14 | 
 15 | class Transcript:
 16 |     """
 17 |         Class handling the data structures and methods for a transcript
 18 |     """
 19 |     def __init__(self, id, gene_id, chr, source_anno, strand):
 20 |         """
 21 |             Args:
 22 |                 id (str): Transcript ID
 23 |                 gene_id (str): Gene ID
 24 |                 chr (str): Chromosome/Sequence name where the transcript is located
 25 |                 source_anno (str): Anno ID
 26 |                 strand (str): Strand (+/-) on which the transctipt is located
 27 |         """
 28 |         self.id = id
 29 |         self.chr = chr
 30 |         self.gene_id = gene_id
 31 |         # self.transcript_lines[segment_type] = [lines of segment type]
 32 |         self.transcript_lines = {}
 33 |         self.gtf = []
 34 |         self.source_anno = source_anno
 35 |         self.start = -1
 36 |         self.end = -1
 37 |         self.cds_len = -1
 38 |         self.cds_coords = {}
 39 |         self.strand = strand
 40 |         self.source_method = ''
 41 | 
 42 |     def add_line(self, line):
 43 |         """
 44 |             Add a single line from the gtf file to the transcript data structure.
 45 | 
 46 |             Args:
 47 |                 line (list): List of all elements of a line from a gtf file
 48 |         """
 49 |         if not (line[0] == self.chr or line[6] == self.strand):
 50 |             raise NotGtfFormat('File is not in gtf format. ' \
 51 |                 + 'Error in line {}\n'.format('\t'.join(map(str, line)))
 52 |                 + 'Transcript ID is not unique')
 53 | 
 54 |         if line[2] not in self.transcript_lines.keys():
 55 |             self.transcript_lines.update({line[2] : []})
 56 | 
 57 |         self.source_method = line[1]
 58 | 
 59 |         line[3] = int(line[3])
 60 |         line[4] = int(line[4])
 61 |         if self.start < 0 or line[3] < self.start:
 62 |             self.start = line[3]
 63 |         if self.end < 0 or line[4] > self.end:
 64 |             self.end = line[4]
 65 |         if self.gene_id == '' and not line[2] == 'transcript':
 66 |             self.gene_id = line[8].split('gene_id "')[1].split('";')[0]
 67 |         self.transcript_lines[line[2]].append(line)
 68 | 
 69 |     def set_gene_id(self, new_gene_id):
 70 |         self.gene_id = new_gene_id
 71 |     
 72 |     def get_cds_len(self):
 73 |         cds = self.get_type_coords('CDS', False)
 74 |         return sum([c[1] - c[0] + 1 for c in cds])
 75 |     
 76 |     def get_type_coords(self, type, frame=True):
 77 |         """
 78 |             Get the coordinates and reading frame of the coding regions
 79 |             Returns:
 80 |                 (dict(list(list(int)))): Dictionary with list of type coords for
 81 |                                         each each frame phase (0,1,2)
 82 |         """
 83 |         # returns dict of cds_coords[phase] = [start_coord, end_coord] of all CDS
 84 |         if frame:
 85 |             coords = {'0' : [], '1' : [], '2' : [], '.' : []}
 86 |         else:
 87 |             coords = []
 88 |         # if type == 'CDS' and type not in self.transcript_lines.keys():
 89 |         #     type = 'exon'
 90 |         if type not in self.transcript_lines.keys():
 91 |             return coords
 92 |         for line in self.transcript_lines[type]:
 93 |             if frame:
 94 |                 coords[line[7]].append([line[3], line[4]])
 95 |             else:
 96 |                 coords.append([line[3], line[4]])
 97 |         if frame:
 98 |             for k in coords.keys():
 99 |                 coords[k].sort(key=lambda c: (c[0],c[1]))
100 |             if type == 'CDS':
101 |                 coords['0'] += coords['.']
102 |                 del coords['.']
103 |         else:
104 |             coords.sort(key=lambda c: (c[0],c[1]))
105 |         return coords
106 | 
107 |     def get_cds_coords(self):
108 |         """
109 |             Get the coordinates and reading frame of the coding regions
110 | 
111 |             Returns:
112 |                 (dict(list(list(int)))): Dictionary with list of CDS coords for
113 |                                         each each frame phase (0,1,2)
114 |         """
115 |         # returns dict of cds_coords[phase] = [start_coord, end_coord] of all CDS
116 |         if not self.cds_coords.keys():
117 |             self.cds_coords = {'0' : [], '1' : [], '2' : []}
118 |             if 'CDS' in self.transcript_lines.keys():
119 |                 key  = 'CDS'
120 |             else:
121 |                 key = 'exon'
122 |             for line in self.transcript_lines[key]:
123 |                 self.cds_coords[line[7]].append([line[3], line[4]])
124 |             for k in self.cds_coords.keys():
125 |                 self.cds_coords[k].sort(key=lambda c: (c[0],c[1]))
126 |         return self.cds_coords
127 | 
128 |     def add_missing_lines(self):
129 |         """
130 |             Add transcript, intron, CDS, exon coordinates if they were not
131 |             included in the gtf file
132 | 
133 |             Returns:
134 |                 (boolean): FALSE if no cds were found for the tx, TRUE otherwise
135 |         """
136 |         # add intron lines
137 |         self.find_introns()
138 |         # check if tx has cds or exon
139 |         if not self.check_cds_exons():
140 |             return False
141 |         # add transcript line
142 |         self.find_transcript()
143 |         # add start/stop codon line
144 |         self.find_start_stop_codon()
145 |         return True
146 | 
147 |     def check_cds_exons(self):
148 |         """
149 |             Check if tx has CDS or exons.
150 |         """
151 |         if 'CDS' not in self.transcript_lines.keys() and 'exon' not in self.transcript_lines.keys():
152 |             sys.stderr.write('Skipping transcript {}, no CDS nor exons in {}\n'.format(self.id, self.id))
153 |             return False
154 |         return True
155 | 
156 |     def find_introns(self):
157 |         """
158 |             Add intron lines.
159 |         """
160 |         if not 'intron' in self.transcript_lines.keys():
161 |             self.transcript_lines.update({'intron' : []})
162 |             key = ''
163 |             if 'CDS' in self.transcript_lines.keys():
164 |                 key = 'CDS'
165 |             elif 'exon' in self.transcript_lines.keys():
166 |                 key = 'exon'
167 |             if key:
168 |                 exon_lst = []
169 |                 for line in self.transcript_lines[key]:
170 |                     exon_lst.append(line)
171 |                 exon_lst = sorted(exon_lst, key=lambda e:e[3])
172 |                 for i in range(1, len(exon_lst)):
173 |                     intron = []
174 |                     intron += exon_lst[i][0:2]
175 |                     intron.append('intron')
176 |                     intron.append(exon_lst[i-1][4] + 1)
177 |                     intron.append(exon_lst[i][3] - 1)
178 |                     intron += exon_lst[i][5:8]
179 |                     intron.append("gene_id \"{}\"; transcript_id \"{}\";".format(\
180 |                     self.gene_id, self.id))
181 |                     self.transcript_lines['intron'].append(intron)
182 | 
183 |     def find_transcript(self):
184 |         """
185 |             Add transcript lines.
186 |         """
187 |         if not 'transcript' in self.transcript_lines.keys():
188 |             for k in self.transcript_lines.keys():
189 |                 for line in self.transcript_lines[k]:
190 |                     if line[3] < self.start or self.start < 0:
191 |                         self.start = line[3]
192 |                     if line[4] > self.end:
193 |                         self.end = line[4]
194 |             tx_line = [self.chr, line[1], 'transcript', self.start, self.end, \
195 |             '.', line[6], '.', self.id]
196 |             self.add_line(tx_line)
197 | 
198 |     def find_start_stop_codon(self):
199 |         """
200 |             Add start/stop codon lines.
201 |         """
202 | 
203 |         if not 'start_codon' in self.transcript_lines.keys():
204 |             self.transcript_lines.update({'start_codon' : []})
205 |         if not 'stop_codon' in self.transcript_lines.keys():
206 |             self.transcript_lines.update({'stop_codon' : []})
207 | 
208 | 
209 |         key = ''
210 |         if 'CDS' in self.transcript_lines.keys():
211 |             key = 'CDS'
212 |         elif 'exon' in self.transcript_lines.keys():
213 |             key = 'exon'
214 |         if key:
215 |             self.transcript_lines[key].sort(key = lambda x : x[3])
216 |             tx = self.transcript_lines[key][0]
217 |             line1 = [self.chr, tx[1], '', tx[3], tx[3] + 2, \
218 |             '.', self.strand, '0', "gene_id \"{}\"; transcript_id \"{}\";".format(\
219 |             self.gene_id, self.id)]
220 |             tx = self.transcript_lines[key][-1]
221 |             line2 = [self.chr, tx[1], '', tx[4] - 2, tx[4], \
222 |             '.', self.strand, '0', "gene_id \"{}\"; transcript_id \"{}\";".format(\
223 |             self.gene_id, self.id)]
224 | 
225 |             fragmented_transcript = True
226 |             if tx[6] == '+':
227 |                 line1[2] = 'start_codon'
228 |                 line2[2] = 'stop_codon'
229 |                 if self.transcript_lines[key][0][7] == 0:
230 |                     fragmented_transcript = False
231 |                 start = line1
232 |                 stop = line2
233 |             else:
234 |                 line1[2] = 'stop_codon'
235 |                 line2[2] = 'start_codon'
236 |                 if self.transcript_lines[key][-1][7] == 0:
237 |                     fragmented_transcript = False
238 |                 stop = line1
239 |                 start = line2
240 |             if not 'start_codon' in self.transcript_lines.keys() and not fragmented_transcript:
241 |                 if not fragmented_transcript:
242 |                     self.add_line(start)
243 |                 else:
244 |                     self.transcript_lines.update({'start_codon' : []})
245 |             if not 'stop_codon' in self.transcript_lines.keys():
246 |                 self.add_line(stop)
247 | 
248 | 
249 |     def get_gtf(self, prefix=''):
250 |         """
251 |             Creates gtf output for the transcript.
252 | 
253 |             Returns:
254 |                 (list(list(str))): List of lines in gtf format as lists
255 |         """
256 |         gtf = []
257 |         if prefix:
258 |             prefix += '.'
259 |         tx_line = []
260 |         for k in self.transcript_lines.keys():
261 |             for g in self.transcript_lines[k]:
262 |                 if k == 'transcript':
263 |                     tx_line  = g
264 |                     tx_line[8] = prefix + self.id
265 |                     continue
266 |                 else:
267 |                     g[8] = f'transcript_id \"{prefix + self.id}\"; gene_id \"{self.gene_id}";'
268 |                 gtf.append(g) 
269 | 
270 |         if not 'exon' in self.transcript_lines.keys():
271 |             for g in self.transcript_lines['CDS']: 
272 |                 gtf.append(g[:2] + ['exon'] + g[3:])                                
273 | 
274 |         gtf = sorted(gtf, key=lambda g: (g[3],g[4]))
275 |         if tx_line:
276 |             gtf = [tx_line] + gtf
277 |         return gtf
278 | 
class Anno:
    """
        Class handling the data structures and methods for one genome annotation file
    """
    def __init__(self, path, id):
        """
            Args:
                path (str): Path to the annotation/gene prediction file in gtf format.
                id (str): Annotation ID
        """
        self.id = id
        # self.genes[gene_id] = [transcript IDs], 'None' collects the
        # transcripts without a gene ID while the file is read
        self.genes = {'None' : []}
        # self.gene_gtf[gene_id] = gene line as list
        self.gene_gtf = {}
        # self.transcripts[transcript_id] = Transcript()
        self.transcripts = {}
        self.path = path
        self.translation_tab = []

    @staticmethod
    def _get_attribute(attributes, key):
        """
            Get the value of a quoted attribute (key "value";) from the
            attribute column of a gtf line.

            Returns:
                (str): Value of the attribute or None if it is not included
        """
        parts = attributes.split(key + ' "')
        if len(parts) > 1:
            return parts[1].split('";')[0]
        return None

    def _format_error(self, line):
        """
            Create the exception for a line that is not in gtf format.
        """
        return NotGtfFormat('File: "{}" is not in gtf format. \n'.format(\
            self.path) + 'Error in line {}\n'.format('\t'.join(map(str, line))))

    def addGtf(self):
        """
            Read a gtf file and create a dictionary of Transcript objects for
            all transcripts in the file. Empty lines and lines starting with
            '#' are skipped.

            The last column of gene and transcript lines can either be the
            plain ID or a list of quoted gtf attributes. Transcripts without
            a gene ID are assigned to a new gene '<transcript ID>_g'.

            Raises:
                NotGtfFormat: If a line has less than nine columns, if the
                              coordinates are not integers or if a feature
                              line has no transcript_id attribute.
        """
        with open(self.path, 'r') as file:
            # QUOTE_NONE: double quotes are part of the attribute column
            # and have to be read literally
            file_lines = csv.reader(file, delimiter='\t', quoting=csv.QUOTE_NONE)
            for line in file_lines:
                line = [l.strip(' ') for l in line]
                if not any(line) or line[0].startswith('#'):
                    continue
                if len(line) < 9:
                    raise self._format_error(line)
                try:
                    line[3] = int(line[3])
                    line[4] = int(line[4])
                except ValueError:
                    raise self._format_error(line) from None

                if line[2] == 'gene':
                    gene_id = self._get_attribute(line[8], 'gene_id')
                    if gene_id is None:
                        gene_id = line[8]
                    else:
                        # get_gtf() expects the plain gene ID in the last column
                        line[8] = gene_id
                    self.genes_update(gene_id)
                    if not gene_id in self.gene_gtf:
                        self.gene_gtf[gene_id] = line
                    else:
                        sys.stderr.write('ERROR, gene_id not unique: {}\n'.format(gene_id))
                elif line[2] == 'transcript':
                    transcript_id = self._get_attribute(line[8], 'transcript_id')
                    if transcript_id is None:
                        transcript_id = line[8]
                    gene_id = self._get_attribute(line[8], 'gene_id') or ''
                    self.transcript_update(transcript_id, gene_id, line[0], line[6])
                    self.transcripts[transcript_id].add_line(line)
                else:
                    transcript_id = self._get_attribute(line[8], 'transcript_id')
                    if transcript_id is None:
                        raise self._format_error(line)

                    gene_id = self._get_attribute(line[8], 'gene_id')
                    if gene_id is None:
                        # reuse the gene of the transcript, if it is known
                        gene_id = 'None'
                        known_tx = self.transcripts.get(transcript_id)
                        if known_tx is not None and known_tx.gene_id not in ('', 'None'):
                            gene_id = known_tx.gene_id

                    self.transcript_update(transcript_id, gene_id, line[0], line[6])
                    self.genes_update(gene_id, transcript_id)
                    self.transcripts[transcript_id].add_line(line)

        # genes_update() removes the transcript from self.genes['None'],
        # so the loop has to run over a copy of the list
        for tx_id in list(self.genes['None']):
            gene_id = tx_id + '_g'
            self.genes_update(gene_id, tx_id)

    def norm_tx_format(self):
        """
            Add to all Transcript objects the lines that were not included in
            the gtf file (see Transcript.add_missing_lines()).
            Delete all transcripts that have no exons or CDS
        """
        # collect first, a dict must not change its size while iterating
        tx_no_cds = [tx_id for tx_id, tx in self.transcripts.items()
                     if not tx.add_missing_lines()]
        for tx_id in tx_no_cds:
            del self.transcripts[tx_id]

    def genes_update(self, gene_id, transcript_id=''):
        """
            Update gene ID dict. If the transcript was listed without a gene
            (under 'None') before, it is moved to the new gene.
            Args:
                gene_id (str): Gene ID
                transcript_id (str): Transcript ID
        """
        tx_ids = self.genes.setdefault(gene_id, [])
        if transcript_id and transcript_id not in tx_ids:
            tx_ids.append(transcript_id)
        # the 'None' list is missing after find_genes() or rename_tx_ids()
        no_gene = self.genes.get('None', [])
        if gene_id != 'None' and transcript_id in no_gene:
            no_gene.remove(transcript_id)
            self.transcripts[transcript_id].gene_id = gene_id

    def transcript_update(self, t_id, g_id, chr, strand):
        """
            Update transcript ID dict. A new Transcript is only created, if
            the transcript ID is not known yet.
            Args:
                t_id (str): Transcript ID
                g_id (str): Gene ID
                chr (str): Chromosome name
                strand (str): Strand (+/-)
        """
        if t_id not in self.transcripts:
            self.transcripts[t_id] = Transcript(t_id, g_id, chr, self.id, strand)

    def find_genes(self):
        """
            Find all genes in the annotation and find the transcripts that
            belong to each gene. Also, create a dict with the gtf lines for each gene.
            If a gene ID is used on more than one sequence or strand, the
            affected transcripts are moved to the gene
            '<gene ID>.<sequence>.<strand>'.
        """
        self.gene_gtf = {}
        self.genes = {}
        for tx in self.transcripts.values():
            if tx.gene_id in self.genes:
                gene_line = self.gene_gtf[tx.gene_id]
                if tx.chr != gene_line[0] or tx.strand != gene_line[6]:
                    sys.stderr.write('ERROR, gene_id not unique: {}.'.format(tx.gene_id))
                    tx.gene_id = tx.gene_id + '.' + tx.chr + '.' + tx.strand
                    sys.stderr.write(' Adding new gene: {}\n'.format(tx.gene_id))
            # the renamed gene can already exist from an earlier transcript,
            # it has to be extended and not replaced
            if tx.gene_id in self.genes:
                gene_line = self.gene_gtf[tx.gene_id]
                self.genes[tx.gene_id].append(tx.id)
                gene_line[3] = min(gene_line[3], tx.start)
                gene_line[4] = max(gene_line[4], tx.end)
            else:
                self.genes[tx.gene_id] = [tx.id]
                self.gene_gtf[tx.gene_id] = [tx.chr, tx.source_method, 'gene', \
                    tx.start, tx.end, '.', tx.strand, '.', tx.gene_id]

    def get_gtf(self):
        """
            Get annotation file as gtf list. The genes are sorted by sequence
            name, start and end and each gene line is followed by the lines
            of its transcripts.
            Returns:
                list(list(str)): Gtf file as list of lists
        """
        gtf = []
        gene_gtf = sorted(self.gene_gtf.values(), key=lambda g: (g[0],g[3],g[4]))
        for gene in gene_gtf:
            gtf.append(gene)
            for tx_id in self.genes[gene[8]]:
                gtf += self.transcripts[tx_id].get_gtf()
        return gtf

    def add_transcripts(self, txs, id_prefix=''):
        """
            Adds a dict of transcripts to the transcripts of the annotation.
            Args:
                txs (dict(Transcript())): dictionary of Transcripts added to the annotation
                id_prefix (str): String added before the ID of each added
                                 transcript, the ID of the Transcript objects
                                 is changed as well
        """
        if not id_prefix:
            self.transcripts.update(txs)
        else:
            for tx in list(txs.values()):
                tx.id = id_prefix + tx.id
                self.transcripts[tx.id] = tx

    def get_subset(self, tx_list):
        """
            Get the transcripts for a subset of transcript IDs.
            Args:
                tx_list (list(str)): List of transcript IDs
            Returns:
                dict(Transcript()): Dictionary of the Transcripts with the
                                    transcript IDs as keys
        """
        return {tx_id : self.transcripts[tx_id] for tx_id in tx_list}

    def change_id(self, new_id):
        """
            Change annotation file ID of the annotation and all its transcripts.
        """
        self.id = new_id
        for tx in self.transcripts.values():
            tx.source_anno = new_id

    def get_transcript_list(self):
        """
            Returns:
                (List(Transcript)): List of all transcripts.
        """
        return list(self.transcripts.values())

    def rename_tx_ids(self, prefix=''):
        """
            Renames all tx and genes and returns translation table for new tx id to old tx id.
            The genes are numbered in the order of their position
            (<prefix>_g<n>) and the transcripts of each gene in the order
            of the gene's transcript list (<prefix>_g<n>.t<m>).
            Only genes included in self.gene_gtf are kept.
            Args:
                prefix (string): String added before each tx and gene ID.
            Returns:
                translation_tab (list(str, str)): List of [new tx id, old tx id].
        """
        self.translation_tab = []
        old_gene_gtf = sorted(self.gene_gtf.values(), key=lambda g: (g[0],g[3],g[4]))
        old_genes = self.genes
        old_txs = self.transcripts
        self.gene_gtf = {}
        self.genes = {}
        self.transcripts = {}
        if prefix:
            prefix += '_'
        for gene_numb, gene in enumerate(old_gene_gtf, start=1):
            old_gene_id = gene[8]
            new_gene_id = "{}g{}".format(prefix, gene_numb)
            gene[8] = new_gene_id
            self.genes[new_gene_id] = []
            self.gene_gtf[new_gene_id] = gene
            for tx_numb, old_tx_id in enumerate(old_genes[old_gene_id], start=1):
                new_tx_id = "{}g{}.t{}".format(prefix, gene_numb, tx_numb)
                tx = old_txs[old_tx_id]
                tx.id = new_tx_id
                tx.gene_id = new_gene_id
                self.transcripts[new_tx_id] = tx
                self.genes[new_gene_id].append(new_tx_id)
                self.translation_tab.append([new_tx_id, old_tx_id])
        return self.translation_tab

    def write_anno(self, out_path):
        """
            Write Annotation in gtf format to out_path.
            Args:
                out_path (str) : path to the output file
        """
        with open(out_path, 'w') as file:
            # The columns are written as they are. A csv writer would quote
            # fields that contain its quote character, e.g. sequence names
            # with '|'.
            for line in self.get_gtf():
                file.write('\t'.join(map(str, line)) + '\n')


--------------------------------------------------------------------------------