├── .gitignore ├── MANIFEST.in ├── tests ├── mock_args_container.py ├── test_projectcreator.py ├── test_change_db_format.py ├── test_plot_coverage_table.py ├── mock_helper.py ├── test_seqmodifier.py ├── test_splice_parser.py ├── mock_gff3.py ├── test_plot_mountain.py ├── test_TSSpredator.py ├── uni_report.py ├── test_modify_rbs_table.py ├── test_parser_wig.py ├── test_gff3.py ├── test_gen_table_tran.py ├── test_seq_editer.py ├── test_color_png.py ├── test_filter_TSS_pro.py ├── test_blast_class.py ├── test_goterm.py ├── test_expresssion.py ├── test_gen_svg.py ├── test_stat_TSSpredater.py ├── test_meme.py ├── test_stat_sublocal.py ├── test_optimize.py ├── test_paths.py ├── test_operon.py ├── test_plot_TSS_venn.py ├── test_sORF_intergenic.py └── test_compare_sRNA_sORF.py ├── docs └── source │ ├── logo │ ├── logo_annogesic.pdf │ ├── logo_annogesic.png │ ├── READemption_logo.png │ └── annogesic_logo_white.png │ ├── license.rst │ ├── docker.rst │ └── installation.rst ├── run_test.py ├── tutorial_data ├── mutation.csv └── replace_seq_id.py ├── CITATION.cff ├── annogesiclib ├── change_db_format.py ├── splice_parser.py ├── get_Rfam_ribo.py ├── print_rank_all.py ├── projectcreator.py ├── sRNA_filter_min_utr.py ├── extract_sec_info.py ├── plot_tran.py ├── blast_class.py ├── filter_TSS_pro.py ├── seqmodifier.py ├── map_ribos.py ├── plot_mountain.py ├── TSSpredator_parser.py ├── plot_coverage_table.py ├── sRNA_filter_frag.py ├── modify_rbs_table.py ├── parser_wig.py ├── gen_promoter_table.py ├── output_cutoff_table.py ├── rbs_overlap.py ├── reorganize_table.py ├── lib_reader.py ├── check_srna_overlap.py ├── gen_svg.py ├── overlap.py ├── compare_sRNA_sORF.py ├── compare_srna_promoter.py ├── sRNA_antisense.py ├── screen.py ├── gff3.py ├── get_input.py ├── color_png.py ├── expression.py ├── combine_frag_tex.py └── stat_operon.py ├── database ├── Rfam_RNA_thermometer_ID.csv └── Rfam_riboswitch_ID.csv ├── benchmark_sRNAs ├── Campylobacter.csv └── Helicobacter.csv ├── LICENSE ├── comparison ├── README.md ├── compare_TSS_Mendoza_Vargas.py ├── compare_term_ecocyc.py ├── compare_sORF.py ├── compare_promoter_regulondb.py ├── compare_tran.py ├── compare_term_regulon.py ├── compare_TSS_Salgado.py ├── compare_operon_regulondb.py ├── compare_srna.py ├── compare_operon_door.py └── gff3.py ├── setup.py ├── Makefile └── Table_dependency_version.txt /.gitignore: -------------------------------------------------------------------------------- 1 | *pyc 2 | *~ 3 | __pycache__ 4 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | #documentation 2 | recursive-include html * 3 | 4 | #Misc 5 | include LICENSE 6 | -------------------------------------------------------------------------------- /tests/mock_args_container.py: -------------------------------------------------------------------------------- 1 | class MockClass(object): 2 | 3 | def mock(self): 4 | return self 5 | -------------------------------------------------------------------------------- /docs/source/logo/logo_annogesic.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sung-Huan/ANNOgesic/HEAD/docs/source/logo/logo_annogesic.pdf -------------------------------------------------------------------------------- /docs/source/logo/logo_annogesic.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Sung-Huan/ANNOgesic/HEAD/docs/source/logo/logo_annogesic.png -------------------------------------------------------------------------------- /docs/source/logo/READemption_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sung-Huan/ANNOgesic/HEAD/docs/source/logo/READemption_logo.png -------------------------------------------------------------------------------- /docs/source/logo/annogesic_logo_white.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sung-Huan/ANNOgesic/HEAD/docs/source/logo/annogesic_logo_white.png -------------------------------------------------------------------------------- /run_test.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | all_tests = unittest.TestLoader().discover("./tests") 3 | unittest.TextTestRunner(verbosity=1).run(all_tests) 4 | -------------------------------------------------------------------------------- /tutorial_data/mutation.csv: -------------------------------------------------------------------------------- 1 | #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT BAM 2 | NC_009839.1 3 . g c . . . . . 3 | NC_009839.1 6 . t - . . . . . 4 | NC_009839.1 600 . - g . . . . . 5 | -------------------------------------------------------------------------------- /CITATION.cff: -------------------------------------------------------------------------------- 1 | cff-version: 1.0.3 2 | message: If you use ANNOgesic, please cite it as below. 3 | authors: 4 | - family-names: Yu 5 | given-names: Sung-Huan 6 | orcid: https://orcid.org/0000-0001-7955-8645 7 | - family-names: Vogel 8 | given-names: Jörg 9 | orcid: https://orcid.org/0000-0003-2220-1404 10 | - family-names: Förstner 11 | given-names: Konrad Ulrich 12 | orcid: https://orcid.org/0000-0002-1481-2996 13 | title: ANNOgesic 14 | version: 1.0.15 15 | doi: 10.1093/gigascience/giy096 16 | date-released: 2018-09-03 17 | -------------------------------------------------------------------------------- /annogesiclib/change_db_format.py: -------------------------------------------------------------------------------- 1 | def change_format(input_file, output_file): 2 | '''change the format of sRNA database''' 3 | num = 1 4 | out = open(output_file, "w") 5 | with open(input_file) as f_h: 6 | for line in f_h: 7 | line = line.strip() 8 | if line.startswith(">"): 9 | datas = line.split("|") 10 | if datas[0][1:] == "NA": 11 | datas[0] = ">srn_" + str(num) 12 | num += 1 13 | out.write("|".join(datas[:3]) + "\n") 14 | else: 15 | out.write(line + "\n") 16 | out.close() 17 | -------------------------------------------------------------------------------- /database/Rfam_RNA_thermometer_ID.csv: -------------------------------------------------------------------------------- 1 | #Rfam_ID Name Description 2 | RF01795 FourU FourU thermometer RNA element 3 | RF02358 hsp17 Hsp17 thermometer 4 | RF01832 ROSE_2 Repression of heat shock gene expression ROSE element 5 | RF02523 ROSE_3 Repression of heat shock gene expression ROSE element 6 | RF00038 PrfA PrfA thermoregulator UTR 7 | RF00433 Hsp90_CRE Hsp90 cis regulatory element 8 | RF00435 ROSE Repression of heat shock gene expression ROSE element 9 | RF01766 cspA cspA thermoregulator 10 | RF01859 Phe_leader Phenylalanine leader peptide 11 | RF01804 Lambda_thermo Lambda phage CIII thermoregulator element 12 | 
-------------------------------------------------------------------------------- /benchmark_sRNAs/Campylobacter.csv: -------------------------------------------------------------------------------- 1 | start end strand 2 | 75877 75984 + 3 | 100738 100833 + 4 | 248102 248257 - 5 | 427729 427864 + 6 | 439584 - + 7 | 518348 518664 - 8 | 650864 650959 + 9 | 681025 681305 - 10 | 879931 880026 + 11 | 879955 880052 - 12 | 996243 996319 + 13 | 1148713 1148849 + 14 | 1200515 1200700 + 15 | 1209854 1209926 + 16 | 1293194 1293552 - 17 | 1440760 1440797 + 18 | 1440826 1440863 + 19 | 1440893 1440930 + 20 | 1440958 1440995 + 21 | 1441025 1441062 + 22 | 1441090 1441127 + 23 | 1441156 1441193 + 24 | 1441289 1441362 + 25 | 1542619 1542645 + 26 | 1563092 1563246 + 27 | 1563121 1563337 - 28 | 1568600 1568750 + 29 | 1624613 1624711 - 30 | - 671301 - 31 | 174436 - + 32 | 947560 - + 33 | -------------------------------------------------------------------------------- /docs/source/license.rst: -------------------------------------------------------------------------------- 1 | License 2 | ========== 3 | 4 | ANNOgesic is open source software and available under the ISC license. 5 | 6 | Copyright (c) 2013-2020, Sung-Huan Yu 7 | 8 | Permission to use, copy, modify, and/or distribute this software for any purpose with or without fee is hereby granted, provided that the above copyright notice and this permission notice appear in all copies. 9 | 10 | THE SOFTWARE IS PROVIDED “AS IS” AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 11 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | ANNOgesic is open source software and available under the ISC license. 2 | 3 | Copyright (c) 2013-2023, Sung-Huan Yu 4 | Konrad Förstner 5 | 6 | Permission to use, copy, modify, and/or distribute this software for any purpose with or without fee is hereby granted, provided that the above copyright notice and this permission notice appear in all copies. 7 | 8 | THE SOFTWARE IS PROVIDED “AS IS” AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
9 | -------------------------------------------------------------------------------- /annogesiclib/splice_parser.py: -------------------------------------------------------------------------------- 1 | import csv 2 | 3 | 4 | class SpliceParser(object): 5 | '''parse the splice data of segemehl''' 6 | 7 | def parser(self, splice_fh): 8 | for row in csv.reader(splice_fh, delimiter="\t"): 9 | yield assign_value(row) 10 | 11 | 12 | class assign_value(object): 13 | 14 | def __init__(self, row): 15 | self.strain = row[0] 16 | self.start = int(row[1]) 17 | self.end = int(row[2]) 18 | self.splice = row[3] 19 | splice = row[3].split(":") 20 | self.supported_reads = int(splice[1]) 21 | self.start_site_reads = int(splice[2]) 22 | self.end_site_reads = int(splice[3]) 23 | self.splice_type = splice[4] 24 | self.situation = splice[5] 25 | self.strand = row[5] 26 | self.info = ("\t".join(row)) 27 | 28 | def __str__(self): 29 | return "{0} {1} {2} {3} {4}".format( 30 | self.strain, self.start, self.end, self.splice, self.strand) 31 | -------------------------------------------------------------------------------- /comparison/README.md: -------------------------------------------------------------------------------- 1 | The scripts for comparison between ANNOgesic predictions and several databases 2 | ------------------------------------------------------------------------------ 3 | 4 | 1. Please download the data from RegulonDB (http://regulondb.ccg.unam.mx/menu/download/datasets/index.jsp), 5 | EcoCyc (https://ecocyc.org/site-search.shtml) or DOOR2 (http://csbl.bmb.uga.edu/DOOR/index.php). 6 | 7 | 2. In order to make the comparison more reliable, please remove the non-expressed 8 | features from the databases. Otherwise the performance will be biased by the 9 | non-expressed features. 10 | 11 | 3. For terminators, we also suggest removing the terminators in the databases 12 | which do not show a significant decrease of coverage. 13 | 14 | 4. For sORFs, we used the study of Hemm et al. (2010) as the benchmarking set. Please 15 | convert the sORF information to GFF3 format (a conversion sketch is shown below). 16 | 17 | 5. The numbers of CRISPRs and riboswitches in the databases are small, so the manual 18 | comparison can be done easily. Thus, no scripts for these comparisons are provided.
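   A minimal conversion sketch (not part of ANNOgesic) is given below. It assumes the benchmarking set is a plain table with `start end strand` columns, like the files in `benchmark_sRNAs/`; the sequence ID, feature name and attribute keys are placeholders chosen for illustration.

```python
#!/usr/bin/python
# Hypothetical helper: convert a "start end strand" table into minimal GFF3
# records so that they can be read by Gff3Parser in the comparison scripts.
# The column layout and attribute keys are assumptions, not ANNOgesic output.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("-i", "--input_table", help="table with start, end and strand columns")
parser.add_argument("-n", "--seq_id", help="sequence ID for the first GFF3 column")
parser.add_argument("-o", "--output_gff", help="output GFF3 file")
args = parser.parse_args()

def main():
    num = 0
    with open(args.input_table) as in_fh, open(args.output_gff, "w") as out_fh:
        out_fh.write("##gff-version 3\n")
        for line in in_fh:
            fields = line.split()
            # skip the header line and entries without complete coordinates
            if (len(fields) < 3) or (fields[0] == "start") or ("-" in fields[:2]):
                continue
            num += 1
            attributes = "ID=sorf_{0};Name=sORF_{0}".format(num)
            out_fh.write("\t".join([args.seq_id, "benchmark", "sORF",
                                    fields[0], fields[1], ".", fields[2],
                                    ".", attributes]) + "\n")

if __name__ == "__main__":
    main()
```

   The resulting GFF3 file can then be passed to compare_sORF.py via its -k/--benchmark_file option.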
19 | -------------------------------------------------------------------------------- /annogesiclib/get_Rfam_ribo.py: -------------------------------------------------------------------------------- 1 | import csv 2 | 3 | 4 | def rbs_from_rfam(ribo_table, rfam_file, out_file): 5 | ribos = [] 6 | out = open(out_file, "w") 7 | f_h = open(ribo_table, "r") 8 | for row in csv.reader(f_h, delimiter="\t"): 9 | if not row[0].startswith("#"): 10 | ribos.append(row[0].strip()) 11 | detect = False 12 | with open(rfam_file, "r") as r_h: 13 | for line in r_h: 14 | line = line.rstrip("\n") 15 | datas = line.split(" ") 16 | if ("INFERNAL" in datas[0]) or ( 17 | "HMMER" in datas[0]): 18 | header = line 19 | detect = False 20 | elif "NAME" in datas[0]: 21 | name = line 22 | elif ("ACC" in datas[0]): 23 | for ribo in ribos: 24 | if datas[-1] == ribo: 25 | out.write("{0}\n{1}\n{2}\n".format(header, name, line)) 26 | detect = True 27 | else: 28 | if (detect): 29 | out.write(line + "\n") 30 | out.close() 31 | f_h.close() 32 | -------------------------------------------------------------------------------- /annogesiclib/print_rank_all.py: -------------------------------------------------------------------------------- 1 | import os 2 | import csv 3 | import shutil 4 | 5 | 6 | def print_rank_all(all_table, best_table): 7 | out = open("tmp_rank_table", "w") 8 | fh = open(best_table, "r") 9 | rank = 0 10 | bests = [] 11 | for row in csv.reader(fh, delimiter='\t'): 12 | if row[0] != "Rank": 13 | bests.append(row) 14 | rank = int(row[0]) 15 | out.write("\t".join(row) + "\n") 16 | fh.close() 17 | fh = open(all_table, "r") 18 | for row in csv.reader(fh, delimiter='\t'): 19 | detect = False 20 | if row[0] != "rank": 21 | for best in bests: 22 | if (row[1] == best[1]) and ( 23 | row[3] == best[3]) and ( 24 | row[4] == best[4]) and ( 25 | row[5] == best[5]): 26 | detect = True 27 | break 28 | if not detect: 29 | rank += 1 30 | row[0] = str(rank) 31 | out.write("\t".join(row) + "\n") 32 | os.remove(all_table) 33 | shutil.move("tmp_rank_table", all_table) 34 | -------------------------------------------------------------------------------- /annogesiclib/projectcreator.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | 5 | class ProjectCreator(object): 6 | 7 | def create_root_folder(self, project_name): 8 | """Create the root folder of a new project with the given name. 9 | Arguments: 10 | - `project_name`: Name of the project root folder 11 | """ 12 | if not os.path.exists(project_name): 13 | os.mkdir(project_name) 14 | else: 15 | sys.stderr.write("Cannot create folder \"%s\"! File/folder with " 16 | "the same name exists already.\n" % project_name) 17 | sys.exit(2) 18 | 19 | def create_subfolders(self, subfolders): 20 | """Create required subfolders in the given folder. 
21 | Arguments: 22 | - `subfolders`: Paths of the subfolders to create 23 | """ 24 | for folder in subfolders: 25 | if not os.path.exists(folder): 26 | os.mkdir(folder) 27 | 28 | def create_version_file(self, version_file_path, version): 29 | with open(version_file_path, "w") as fh: 30 | fh.write("ANNOgesic version %s" % version) 31 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from os import path 2 | try: 3 | from setuptools import setup 4 | except ImportError: 5 | from distutils.core import setup 6 | 7 | here = path.abspath(path.dirname(__file__)) 8 | 9 | with open(path.join(here, 'README.rst')) as f: 10 | long_description = f.read() 11 | 12 | setup( 13 | name='ANNOgesic', 14 | version='1.1.14', 15 | packages=['annogesiclib'], 16 | author='Sung-Huan Yu', 17 | author_email='silasysh@g-mail.nsysu.edu.tw', 18 | description='ANNOgesic - A tool for bacterial/archaeal RNA-Seq based genome annotations', 19 | long_description=long_description, 20 | url='https://github.com/Sung-Huan/ANNOgesic', 21 | install_requires=[ 22 | "biopython >= 1.65", 23 | "matplotlib >= 1.5.0", 24 | "numpy >= 1.9.2", 25 | "networkx >= 1.9.1" 26 | ], 27 | scripts=['bin/annogesic'], 28 | license='ISC License (ISCL)', 29 | classifiers=[ 30 | 'License :: OSI Approved :: ISC License (ISCL)', 31 | 'Operating System :: POSIX', 32 | 'Programming Language :: Python :: 3', 33 | 'Topic :: Scientific/Engineering :: Bio-Informatics', 34 | ] 35 | ) 36 | -------------------------------------------------------------------------------- /tutorial_data/replace_seq_id.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | import os 4 | import shutil 5 | import argparse 6 | 7 | __author__ = "Sung-Huan Yu " 8 | __email__ = "shyu@biochem.mpg.de" 9 | 10 | parser = argparse.ArgumentParser() 11 | parser.add_argument("-i","--input_wig_folder",help="input wig folder") 12 | parser.add_argument("-n","--strain_name",help="strain_name") 13 | args = parser.parse_args() 14 | 15 | def main(): 16 | for wig in os.listdir(args.input_wig_folder): 17 | out = open("tmp", "w") 18 | with open(os.path.join(args.input_wig_folder, wig)) as fh: 19 | for line in fh: 20 | if line.startswith("variableStep"): 21 | data = line.split(" ") 22 | choms = data[1].split("=") 23 | choms[-1] = args.strain_name 24 | data[1] = "=".join(choms) 25 | out.write(" ".join(data)) 26 | else: 27 | out.write(line) 28 | out.close() 29 | os.remove(os.path.join(args.input_wig_folder, wig)) 30 | shutil.move("tmp", os.path.join(args.input_wig_folder, wig)) 31 | 32 | if __name__ == "__main__": 33 | main() 34 | -------------------------------------------------------------------------------- /tests/test_projectcreator.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import os 3 | import sys 4 | import shutil 5 | sys.path.append(".") 6 | from annogesiclib.projectcreator import ProjectCreator 7 | 8 | 9 | class TestProjectCreator(unittest.TestCase): 10 | 11 | def setUp(self): 12 | self.root_folder_name = "a_test_project" 13 | self.projectcreator = ProjectCreator() 14 | 15 | def tearDown(self): 16 | if os.path.exists(self.root_folder_name): 17 | shutil.rmtree(self.root_folder_name) 18 | 19 | def test_create_root_folder(self): 20 | self.projectcreator.create_root_folder(self.root_folder_name) 21 | assert(os.path.exists(self.root_folder_name)) 22 |
shutil.rmtree(self.root_folder_name) 23 | 24 | def test_create_subfolders(self): 25 | self.projectcreator.create_root_folder(self.root_folder_name) 26 | subfolders = ["test_a", "test_b", "test_c"] 27 | subfolders = [self.root_folder_name + "/" + subfolder for 28 | subfolder in subfolders] 29 | self.projectcreator.create_subfolders(subfolders) 30 | for subfolder in subfolders: 31 | assert(os.path.exists(subfolder)) 32 | 33 | if __name__ == "__main__": 34 | unittest.main() 35 | -------------------------------------------------------------------------------- /tests/test_change_db_format.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import unittest 4 | import shutil 5 | from io import StringIO 6 | sys.path.append(".") 7 | from mock_gff3 import Create_generator 8 | from mock_helper import gen_file, import_data 9 | import annogesiclib.change_db_format as cdf 10 | 11 | 12 | class TestChangeDBFormat(unittest.TestCase): 13 | 14 | def setUp(self): 15 | self.test_folder = "test_folder" 16 | if (not os.path.exists(self.test_folder)): 17 | os.mkdir(self.test_folder) 18 | 19 | def tearDown(self): 20 | if os.path.exists(self.test_folder): 21 | shutil.rmtree(self.test_folder) 22 | 23 | def test_change_format(self): 24 | input_file = os.path.join(self.test_folder, "input") 25 | output_file = os.path.join(self.test_folder, "output") 26 | gen_file(input_file, 27 | ">srna_1|Staphylococcus|Aar|12314|12444|forward\nATAGATTCCCGCGTATAGTCATCATTGTAC") 28 | cdf.change_format(input_file, output_file) 29 | data = import_data(output_file) 30 | self.assertListEqual(data, ['>srna_1|Staphylococcus|Aar', 31 | 'ATAGATTCCCGCGTATAGTCATCATTGTAC']) 32 | 33 | if __name__ == "__main__": 34 | unittest.main() 35 | 36 | -------------------------------------------------------------------------------- /annogesiclib/sRNA_filter_min_utr.py: -------------------------------------------------------------------------------- 1 | import os 2 | import csv 3 | import shutil 4 | from annogesiclib.gff3 import Gff3Parser 5 | 6 | 7 | def filter_utr(srna_gff, srna_table, min_utr): 8 | out = open("tmp_utr_srna.gff", "w") 9 | out_ta = open("tmp_utr_srna.csv", "w") 10 | out.write("##gff-version 3\n") 11 | gffs = [] 12 | tables = [] 13 | gff_parser = Gff3Parser() 14 | g_f = open(srna_gff, "r") 15 | for entry in gff_parser.entries(g_f): 16 | gffs.append(entry) 17 | fh = open(srna_table, "r") 18 | for row in csv.reader(fh, delimiter='\t'): 19 | if row[0] != "rank": 20 | if (float(row[7]) >= min_utr): 21 | tables.append(row) 22 | out_ta.write("\t".join(row) + "\n") 23 | for gff in gffs: 24 | for table in tables: 25 | if (table[0] == gff.seq_id) and ( 26 | int(table[2]) == gff.start) and ( 27 | int(table[3]) == gff.end) and ( 28 | table[4] == gff.strand): 29 | out.write(gff.info + "\n") 30 | g_f.close() 31 | fh.close() 32 | os.remove(srna_gff) 33 | os.remove(srna_table) 34 | shutil.move("tmp_utr_srna.gff", srna_gff) 35 | shutil.move("tmp_utr_srna.csv", srna_table) 36 | out.close() 37 | out_ta.close() 38 | -------------------------------------------------------------------------------- /tests/test_plot_coverage_table.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import unittest 4 | import shutil 5 | from io import StringIO 6 | sys.path.append(".") 7 | from mock_gff3 import Create_generator 8 | from mock_helper import gen_file 9 | import annogesiclib.plot_coverage_table as pct 10 | 11 | 12 | class Mock_func(object): 
13 | 14 | def mock_fig(self, rowlabels, collabels, cells, filename, 15 | max_color, min_color): 16 | gen_file(filename, "test") 17 | pass 18 | 19 | class TestPlotCoverageTable(unittest.TestCase): 20 | 21 | def setUp(self): 22 | self.test_folder = "test_folder" 23 | if (not os.path.exists(self.test_folder)): 24 | os.mkdir(self.test_folder) 25 | 26 | def tearDown(self): 27 | if os.path.exists(self.test_folder): 28 | shutil.rmtree(self.test_folder) 29 | 30 | 31 | def test_plot_table(self): 32 | pct.fig = Mock_func().mock_fig 33 | plots = [{"aaa": {"cond_1": {"track_1": 3.543, "track_2": 4.523}, 34 | "cond_2": {"track_1": 4.43, "track_2": 0.523}}}] 35 | pct.plot_table(plots, 100, 0, os.path.join(self.test_folder, "test")) 36 | self.assertTrue(os.path.exists(os.path.join(self.test_folder, "test"))) 37 | 38 | if __name__ == "__main__": 39 | unittest.main() 40 | 41 | -------------------------------------------------------------------------------- /benchmark_sRNAs/Helicobacter.csv: -------------------------------------------------------------------------------- 1 | start end strand 2 | 22809 22931 - 3 | 78090 78365 + 4 | 141579 141833 + 5 | 170022 170227 + 6 | 314965 315169 - 7 | 367053 367555 + 8 | 439217 439567 + 9 | 444827 445139 - 10 | 466371 466793 - 11 | 479648 479856 - 12 | 513559 513627 - 13 | 515574 515653 - 14 | 516627 517186 + 15 | 537305 537624 - 16 | 540217 540473 - 17 | 541026 541298 - 18 | 567949 568607 - 19 | 664270 664447 - 20 | 684255 684584 + 21 | 756936 757177 + 22 | 804286 804580 - 23 | 865570 865916 + 24 | 946294 946540 + 25 | 964751 964805 + 26 | 968583 968616 + 27 | 968980 969164 + 28 | 996891 997299 + 29 | 998717 998995 + 30 | 1026267 1026428 - 31 | 1046292 1046837 + 32 | 1070879 1071067 + 33 | 1071536 1071801 + 34 | 1071960 1072171 + 35 | 1105620 1106041 + 36 | 1111333 1111469 + 37 | 1120506 1120704 - 38 | 1156100 1156429 + 39 | 1178410 1178489 + 40 | 1180436 1180504 + 41 | 1217306 1217526 + 42 | 1235678 1235907 + 43 | 1243404 1243474 - 44 | 1245610 1245780 - 45 | 1295413 1295598 - 46 | 1302757 1303070 - 47 | 1307821 1307963 - 48 | 1366650 1366889 - 49 | 1394991 1345126 + 50 | 1414321 1414603 + 51 | 1439464 1439745 + 52 | 1449788 1450126 + 53 | 1470642 1470983 - 54 | 1477003 1477319 + 55 | 1482579 1482926 - 56 | 1502823 1503160 - 57 | 1508086 1508595 - 58 | 1510538 1510962 + 59 | 1514863 1515121 - 60 | 1524329 1524681 - 61 | 1543943 1544194 + 62 | 1589890 1589984 - 63 | 1612281 1612596 - 64 | 1647007 1647568 + 65 | -------------------------------------------------------------------------------- /annogesiclib/extract_sec_info.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | 4 | def mod_file(input_file, out, indexs): 5 | with open(input_file) as fh: 6 | for line in fh: 7 | line = line.strip() 8 | if line.startswith(">"): 9 | out.write(indexs[line] + "\n") 10 | else: 11 | out.write(line + "\n") 12 | out.close() 13 | 14 | def extract_info_sec(sec_file, seq_file, index_file): 15 | out_sec = open(sec_file + "tmp", "w") 16 | out_seq = open(seq_file + "tmp", "w") 17 | indexs = {} 18 | with open(index_file) as hi: 19 | for line in hi: 20 | line = line.strip() 21 | if line.startswith(">"): 22 | tag = line.split("|")[0] 23 | indexs[tag] = line 24 | mod_file(sec_file, out_sec, indexs) 25 | mod_file(seq_file, out_seq, indexs) 26 | os.remove(sec_file) 27 | shutil.move(sec_file + "tmp", sec_file) 28 | os.remove(seq_file) 29 | shutil.move(seq_file + "tmp", seq_file) 30 | 31 | def modify_header(seq_file, index_file): 
32 | out = open(seq_file, "w") 33 | with open(index_file) as fh: 34 | for line in fh: 35 | line = line.strip() 36 | if line.startswith(">"): 37 | tag = line.split("|")[0] 38 | out.write(tag + "\n") 39 | else: 40 | out.write(line + "\n") 41 | -------------------------------------------------------------------------------- /tests/mock_helper.py: -------------------------------------------------------------------------------- 1 | from mock_gff3 import Create_generator 2 | 3 | def convert_dict(line_list): 4 | datas = {} 5 | for data in line_list: 6 | datas[data] = data 7 | return datas 8 | 9 | def gen_file(out_file, content): 10 | with open(out_file, "w") as fh: 11 | fh.write(content) 12 | 13 | def import_data(filename): 14 | datas = [] 15 | with open(filename) as fh: 16 | for line in fh: 17 | line = line.rstrip() 18 | datas.append(line) 19 | return datas 20 | 21 | def extract_info(out_file, type_): 22 | datas = [] 23 | attributes = [] 24 | if type_ == "file": 25 | with open(out_file) as fh: 26 | for line in fh: 27 | line = line.rstrip() 28 | if (line != "##gff-version 3") and len(line): 29 | attributes.append(line.split("\t")[-1].split(";")) 30 | datas.append("\t".join(line.split("\t")[0:-1])) 31 | else: 32 | for line in out_file.split("\n"): 33 | line = line.rstrip() 34 | if len(line): 35 | attributes.append(line.split("\t")[-1].split(";")) 36 | datas.append("\t".join(line.split("\t")[0:-1])) 37 | 38 | return datas, attributes 39 | 40 | def read_dict(num, gff, attributes): 41 | gffs = [] 42 | for index in range(0, num): 43 | gffs.append(Create_generator(gff[index], attributes[index], "gff")) 44 | return gffs 45 | -------------------------------------------------------------------------------- /tests/test_seqmodifier.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import unittest 4 | import shutil 5 | from io import StringIO 6 | sys.path.append(".") 7 | from mock_helper import gen_file, import_data 8 | from annogesiclib.seqmodifier import SeqModifier 9 | 10 | 11 | class TestSeqModifier(unittest.TestCase): 12 | 13 | def setUp(self): 14 | self.test_folder = "test_folder" 15 | if (not os.path.exists(self.test_folder)): 16 | os.mkdir(self.test_folder) 17 | self.seq = SeqModifier("AATTATATAGGAAGGCCC") 18 | 19 | def tearDown(self): 20 | if os.path.exists(self.test_folder): 21 | shutil.rmtree(self.test_folder) 22 | 23 | def test_init_pos_dict(self): 24 | self.seq._init_pos_dict() 25 | self.assertDictEqual(self.seq._org_pos_to_internal_pos, 26 | {1: 0, 2: 1, 3: 2, 4: 3, 5: 4, 6: 5, 7: 6, 27 | 8: 7, 9: 8, 10: 9, 11: 10, 12: 11, 13: 12, 28 | 14: 13, 15: 14, 16: 15, 17: 16, 18: 17}) 29 | 30 | def test_replace(self): 31 | self.seq.replace(2, "G") 32 | self.assertEqual(self.seq._seq, "AGTTATATAGGAAGGCCC") 33 | 34 | def test_remove(self): 35 | self.seq.remove(8, 1) 36 | self.assertEqual(self.seq._seq, "AATTATAAGGAAGGCCC") 37 | 38 | def test_insert(self): 39 | self.seq.insert(5, "C") 40 | self.assertEqual(self.seq._seq, "AATTCATATAGGAAGGCCC") 41 | 42 | if __name__ == "__main__": 43 | unittest.main() 44 | -------------------------------------------------------------------------------- /annogesiclib/plot_tran.py: -------------------------------------------------------------------------------- 1 | import os 2 | import math 3 | import matplotlib as mpl 4 | from annogesiclib.gff3 import Gff3Parser 5 | from annogesiclib.helper import Helper 6 | import numpy as np 7 | mpl.use('Agg') 8 | import matplotlib.pyplot as plt 9 | plt.style.use('ggplot') 
10 | 11 | 12 | def plot(lens, out_figure): 13 | ticks = max(lens) / 50 14 | bin_num = np.arange(0, max(lens), ticks) 15 | n, bins, hist1 = plt.hist(lens, bin_num, 16 | color="#FF9999", label='Transcript', 17 | edgecolor='black', linewidth=1) 18 | plt.xlabel("Transcript_length (nt)") 19 | plt.ylabel("Amount") 20 | plt.savefig(out_figure) 21 | plt.clf() 22 | 23 | 24 | def plot_tran(tran_folder, stat_folder, max_dist): 25 | lens = [] 26 | less = [] 27 | for tran in os.listdir(tran_folder): 28 | if tran.endswith(".gff"): 29 | prefix = tran.replace("_transcript.gff", "") 30 | gff_f = open(os.path.join(tran_folder, tran), "r") 31 | for entry in Gff3Parser().entries(gff_f): 32 | if entry.feature == "transcript": 33 | lens.append(entry.end - entry.start) 34 | if entry.end - entry.start <= max_dist: 35 | less.append(entry.end - entry.start) 36 | plot(lens, os.path.join(stat_folder, prefix + "_length_all.png")) 37 | plot(less, os.path.join(stat_folder, prefix + "_length_less_" + 38 | str(max_dist) + ".png")) 39 | 40 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | test: 2 | python3 -m pytest tests 3 | # python3 run_test.py 4 | 5 | coverage: 6 | python3 -m coverage run run_test.py 7 | @echo "computing coverage.." 8 | python3 -m coverage report > "unittest" 9 | python3 tests/uni_report.py -i "unittest" -o "uni_report" 10 | rm "unittest" 11 | @echo "check uni_report.." 12 | 13 | package: 14 | rm -rf dist 15 | python3 setup.py bdist_wheel 16 | rm -rf ANNOgesic.egg-info 17 | ls dist/* 18 | 19 | build: 20 | python3 setup.py bdist 21 | 22 | package_to_pypi: 23 | twine upload dist/* 24 | 25 | html_doc: 26 | cd docs && make html && cd .. 27 | 28 | new_release: 29 | new_release: 30 | @echo "* Create/checkout a release branch" 31 | @echo " git branch release_v0.3.X" 32 | @echo " git checkout release_v0.3.X" 33 | @echo "* Change bin/reademption" 34 | @echo "* Change setup.py" 35 | @echo "* Change docs/source/conf.py" 36 | @echo "* Change CHANGELOG.txt" 37 | @echo "* Create new docs" 38 | @echo "* Test package creation" 39 | @echo "* Test doc creation" 40 | @echo "* make package_to_pypi" 41 | @echo "* git add CHANGELOG.txt bin/reademption docs/source/conf.py setup.py" 42 | @echo "* Commit changes e.g. 'git commit -m \"Set version to 0.3.X\"'" 43 | @echo "* Tag the commit e.g. 
'git tag -a v0.3.X -m \"version v0.3.X\"'" 44 | @echo "* Merge release into dev and master" 45 | @echo "* Push it to github: git push" 46 | @echo "* Generate a new release based on this tag at" 47 | @echo " https://github.com/konrad/READemption/releases/new" 48 | @echo "* Upload new docs using 'make upload_doc'" 49 | -------------------------------------------------------------------------------- /annogesiclib/blast_class.py: -------------------------------------------------------------------------------- 1 | import csv 2 | 3 | 4 | def read_file(srna_file, nums): 5 | srna_f = open(srna_file, "r") 6 | for row in csv.reader(srna_f, delimiter="\t"): 7 | if (row[-6] != "NA") and (row[0] != "Rank"): 8 | if row[1] not in nums.keys(): 9 | nums[row[1]] = {} 10 | if row[2] not in nums[row[1]].keys(): 11 | nums[row[1]][row[2]] = 1 12 | else: 13 | nums[row[1]][row[2]] += 1 14 | if row[2] not in nums["total"].keys(): 15 | nums["total"][row[2]] = 1 16 | else: 17 | nums["total"][row[2]] += 1 18 | srna_f.close() 19 | 20 | 21 | def blast_class(srna_file, out_file): 22 | '''statistics of the results of blast sRNA database''' 23 | nums = {} 24 | nums["total"] = {} 25 | read_file(srna_file, nums) 26 | out = open(out_file, "w") 27 | if len(nums) > 1: 28 | if len(nums) > 2: 29 | out.write("All genomes:\n") 30 | out.write("sRNA_name\tamount\n") 31 | for blast, num in nums["total"].items(): 32 | out.write("{0}\t{1}\n".format(blast, num)) 33 | for strain, srna_name in nums.items(): 34 | if strain != "total": 35 | out.write(strain + ":\n") 36 | out.write("sRNA_name\tamount\n") 37 | for blast, num in srna_name.items(): 38 | out.write("{0}\t{1}\n".format(blast, num)) 39 | else: 40 | out.write("No known sRNA!!\n") 41 | out.close() 42 | -------------------------------------------------------------------------------- /annogesiclib/filter_TSS_pro.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | import math 4 | from annogesiclib.gff3 import Gff3Parser 5 | 6 | 7 | def read_gff(input_file): 8 | datas = [] 9 | gff_parser = Gff3Parser() 10 | f_h = open(input_file, "r") 11 | for entry in gff_parser.entries(f_h): 12 | datas.append(entry) 13 | datas = sorted(datas, key=lambda k: (k.seq_id, k.start, k.end, k.strand)) 14 | return datas 15 | 16 | 17 | def compare_tss_pro(tars, refs, out, cluster): 18 | '''compare between TSS and processing site''' 19 | for tar in tars: 20 | for ref in refs: 21 | if (tar.seq_id == ref.seq_id) and ( 22 | tar.strand == ref.strand): 23 | if math.fabs(tar.start - ref.start) <= cluster: 24 | break 25 | elif (ref.start - tar.start) > cluster: 26 | out.write(tar.info + "\n") 27 | break 28 | 29 | 30 | def filter_tss_pro(tss_file, pro_file, feature, cluster): 31 | '''deal with the overlap of TSS and processing site''' 32 | tsss = read_gff(tss_file) 33 | pros = read_gff(pro_file) 34 | out = open("tmp_filter", "w") 35 | out.write("##gff-version 3\n") 36 | if feature.lower() == "tss": 37 | compare_tss_pro(pros, tsss, out, cluster) 38 | os.remove(pro_file) 39 | shutil.move("tmp_filter", pro_file) 40 | elif feature.lower() == "processing": 41 | compare_tss_pro(tsss, pros, out, cluster) 42 | os.remove(tss_file) 43 | shutil.move("tmp_filter", tss_file) 44 | -------------------------------------------------------------------------------- /tests/test_splice_parser.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | import os 4 | import sys 5 | import csv 6 | import shutil 7 | import 
unittest 8 | from io import StringIO 9 | sys.path.append(".") 10 | from annogesiclib.splice_parser import SpliceParser 11 | 12 | 13 | class TestGff3Parser(unittest.TestCase): 14 | 15 | def setUp(self): 16 | self.example = Example() 17 | self.s_parser = SpliceParser() 18 | self.test_folder = "test_folder" 19 | if (not os.path.exists(self.test_folder)): 20 | os.mkdir(self.test_folder) 21 | 22 | def tearDown(self): 23 | if os.path.exists(self.test_folder): 24 | shutil.rmtree(self.test_folder) 25 | 26 | def test_parser(self): 27 | splice_fh = StringIO(self.example.splice) 28 | starts = [] 29 | splices = [] 30 | for entry in self.s_parser.parser(splice_fh): 31 | starts.append(entry.start) 32 | splices.append(entry.splice) 33 | self.assertListEqual(starts, [17647, 20734, 43490, 49952]) 34 | self.assertListEqual(splices, ['splits:1:1:1:N:F', 'splits:1:1:1:C:P', 35 | 'splits:1:1:1:N:P', 'splits:2:2:2:N:P']) 36 | 37 | class Example(object): 38 | 39 | splice = """Staphylococcus_aureus_HG003 17647 17667 splits:1:1:1:N:F 0 + 40 | Staphylococcus_aureus_HG003 20734 21396 splits:1:1:1:C:P 0 + 41 | Staphylococcus_aureus_HG003 43490 43644 splits:1:1:1:N:P 0 + 42 | Staphylococcus_aureus_HG003 49952 50016 splits:2:2:2:N:P 0 +""" 43 | 44 | if __name__ == "__main__": 45 | unittest.main() 46 | 47 | -------------------------------------------------------------------------------- /database/Rfam_riboswitch_ID.csv: -------------------------------------------------------------------------------- 1 | RF00162 SAM SAM riboswitch box leader 2 | RF00174 Cobalamin Cobalamin riboswitch 3 | RF00634 SAM-IV S adenosyl methionine SAM riboswitch 4 | RF00059 TPP TPP riboswitch THI element 5 | RF00167 Purine Purine riboswitch 6 | RF00168 Lysine Lysine riboswitch 7 | RF00504 Glycine Glycine riboswitch 8 | RF00521 SAM_alpha SAM riboswitch alpha proteobacteria 9 | RF01051 c-di-GMP-I Cyclic di GMP riboswitch 10 | RF01055 MOCO_RNA_motif Moco molybdenum cofactor riboswitch 11 | RF01057 SAH_riboswitch S adenosyl homocysteine riboswitch 12 | RF01510 MFR M florum riboswitch 13 | RF01767 SMK_box_riboswitch SMK box translational riboswitch 14 | RF01826 SAM_V SAM riboswitch 15 | RF01831 THF THF riboswitch 16 | RF01689 AdoCbl-variant AdoCbl variant RNA 17 | RF01725 SAM-I-IV-variant SAM IV variant riboswitch 18 | RF00050 FMN FMN riboswitch RFN element 19 | RF00234 glmS glmS glucosamine phosphate activated ribozyme 20 | RF00522 PreQ1 PreQ1 riboswitch 21 | RF01054 preQ1-II preQ1 II pre queuosine riboswitch 22 | RF01056 Mg_sensor Magnesium Sensor 23 | RF01482 AdoCbl_riboswitch AdoCbl riboswitch 24 | RF01727 SAM-SAH SAM SAH riboswitch 25 | RF01786 c-di-GMP-II Cyclic di GMP II riboswitch 26 | RF01787 drz-agam-1 drz agam riboswitch 27 | RF01788 drz-agam-2-2 drz agam riboswitch 28 | RF00080 yybP-ykoY yybP ykoY leader 29 | RF00379 ydaO-yuaA ydaO yuaA leader 30 | RF00380 ykoK ykoK leader 31 | RF00442 ykkC-yxkD ykkC yxkD leader 32 | RF00516 ylbH ylbH leader 33 | RF00517 serC serC leader 34 | RF00518 speF speF leader 35 | RF00519 suhB suhB 36 | RF00520 ybhL ybhL leader 37 | -------------------------------------------------------------------------------- /tests/mock_gff3.py: -------------------------------------------------------------------------------- 1 | class Create_generator(object): 2 | 3 | def __init__(self, gff, attributes, type_): 4 | if (type_ == "gff") or (type_ == "circ"): 5 | self.seq_id = gff["seq_id"] 6 | self.strain = gff["seq_id"] 7 | self.strand = gff["strand"] 8 | self.start = gff["start"] 9 | self.end = gff["end"] 10 | self.feature = 
gff["feature"] 11 | self.phase = gff["phase"] 12 | self.score = gff["score"] 13 | self.source = gff["source"] 14 | if type_ == "circ": 15 | self.supported_reads = gff["support"] 16 | self.start_site_reads = gff["start_site"] 17 | self.end_site_reads = gff["end_site"] 18 | self.situation = gff["situation"] 19 | self.splice_type = gff["splice_type"] 20 | self.attributes = {} 21 | for key, value in attributes.items(): 22 | self.attributes[key] = value 23 | self.attribute_string = ";".join( 24 | ["=".join(items) for items in self.attributes.items()]) 25 | self.info = "\t".join([str(field) for field in [ 26 | self.seq_id, self.source, self.feature, self.start, 27 | self.end, self.score, self.strand, self.phase, 28 | self.attribute_string]]) 29 | self.info_without_attributes = "\t".join([str(field) for field in [ 30 | self.seq_id, self.source, self.feature, self.start, 31 | self.end, self.score, self.strand, self.phase]]) 32 | if type_ == "wig": 33 | self.coverage = gff["coverage"] 34 | -------------------------------------------------------------------------------- /annogesiclib/seqmodifier.py: -------------------------------------------------------------------------------- 1 | class SeqModifier(object): 2 | """Help to apply SNPs, insertion and deletions to a sequence.""" 3 | 4 | def __init__(self, seq): 5 | self._seq = seq 6 | self._init_pos_dict() 7 | 8 | def seq(self): 9 | return self._seq 10 | 11 | def _init_pos_dict(self): 12 | self._org_pos_to_internal_pos = dict( 13 | [(pos, pos-1) 14 | for pos in range(1, len(self._seq) + 1)]) 15 | 16 | def replace(self, pos, nucleotide): 17 | seq_as_list = list(self._seq) 18 | seq_as_list[self._org_pos_to_internal_pos[pos]] = nucleotide 19 | self._seq = "".join(seq_as_list) 20 | 21 | def remove(self, pos, num): 22 | int_pos = self._org_pos_to_internal_pos[pos] 23 | self._seq = self._seq[:int_pos] + self._seq[int_pos+1:] 24 | del(self._org_pos_to_internal_pos[pos]) 25 | for pos in range(pos, len(self._seq) + 2): 26 | try: 27 | self._org_pos_to_internal_pos[pos] = ( 28 | self._org_pos_to_internal_pos[pos] - num) 29 | except KeyError: 30 | pass 31 | 32 | def insert(self, pos, nucleotide): 33 | """Insert after nucleotide of the given position""" 34 | int_pos = self._org_pos_to_internal_pos[pos] 35 | self._seq = self._seq[:int_pos] + nucleotide + self._seq[int_pos:] 36 | for pos in range(pos + 1, len(self._seq) + 1): 37 | try: 38 | self._org_pos_to_internal_pos[pos] = ( 39 | self._org_pos_to_internal_pos[pos] + len(nucleotide)) 40 | except KeyError: 41 | pass 42 | 43 | def get_nucl(self, pos): 44 | return self._seq[self._org_pos_to_internal_pos[pos]] 45 | -------------------------------------------------------------------------------- /comparison/compare_TSS_Mendoza_Vargas.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | import os 4 | import sys 5 | import csv 6 | import argparse 7 | import math 8 | from gff3 import Gff3Parser 9 | 10 | __author__ = "Sung-Huan Yu " 11 | __email__ = "sung-huan.yu@uni-wuerzburg.de" 12 | 13 | parser = argparse.ArgumentParser() 14 | parser.add_argument("-k","--regulondb_file",help="TSS of Mendoza-Vargas in RegulonDB") 15 | parser.add_argument("-p","--predict_file",help="ANNOgesic predicted TSS file") 16 | parser.add_argument("-f","--fuzzy", type=int, help="tolerance of nts for comparison") 17 | args = parser.parse_args() 18 | 19 | def main(): 20 | pros = {} 21 | tsss = [] 22 | total = 0 23 | detect = 0 24 | for entry in 
Gff3Parser().entries(open(args.predict_file)): 25 | tsss.append(entry) 26 | fh = open(args.regulondb_file, "r") 27 | for row in csv.reader(fh, delimiter='\t'): 28 | if (not row[0].startswith("#")) and (row[-1] != "weak"): 29 | total += 1 30 | if row[5] == "forward": 31 | strand = "+" 32 | else: 33 | strand = "-" 34 | pros[row[1]] = {"start": int(row[3]), "strand": strand} 35 | for ref in pros.values(): 36 | for pre in tsss: 37 | if pre.strand == ref["strand"]: 38 | if (math.fabs(ref["start"] - pre.start) <= args.fuzzy): 39 | detect += 1 40 | break 41 | print("the number of published TSSs which can be detected by ANNOgesic:" + str(detect)) 42 | print("the total number of TSSs from Mendoza-Vargas in Regulon DB" + str(total)) 43 | print("detection rate:" + str(float(detect)/float(total))) 44 | 45 | if __name__ == "__main__": 46 | main() 47 | -------------------------------------------------------------------------------- /annogesiclib/map_ribos.py: -------------------------------------------------------------------------------- 1 | import os 2 | import csv 3 | import shutil 4 | 5 | 6 | def mapping_ribos(table_folder, id_file, feature): 7 | ids = [] 8 | ih = open(id_file, "r") 9 | for row in csv.reader(ih, delimiter='\t'): 10 | if not row[0].startswith("#"): 11 | ids.append({"id": row[0].strip(), 12 | "name": row[1].strip(), 13 | "info": row[2].strip()}) 14 | for table_file in os.listdir(table_folder): 15 | if table_file.endswith("_" + feature + ".csv"): 16 | tmp_table = os.path.join(table_folder, "tmp" + table_file) 17 | table_file = os.path.join(table_folder, table_file) 18 | out = open(tmp_table, "w") 19 | tables = [] 20 | fh = open(table_file, "r") 21 | out.write("#ID\tGenome\tStrand\tAssociated_CDS\tStart_genome\t" 22 | "End_genome\tRfam_ID\tRfam_name\tE_value\tScore\t" 23 | "Start_align\tEnd_align\n") 24 | for row in csv.reader(fh, delimiter='\t'): 25 | if not row[0].startswith("#"): 26 | tables.append({"input": row[0:6], "Rfam": row[6], 27 | "e": row[7], "score": row[8], 28 | "start": row[9], "end": row[10]}) 29 | for table in tables: 30 | for id_ in ids: 31 | if table["Rfam"] == id_["id"]: 32 | name = id_["name"] 33 | out.write("\t".join(table["input"] + [table["Rfam"], name, 34 | table["e"], table["score"], 35 | table["start"], table["end"]]) + "\n") 36 | out.close() 37 | os.remove(table_file) 38 | shutil.move(tmp_table, table_file) 39 | -------------------------------------------------------------------------------- /comparison/compare_term_ecocyc.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | import os 4 | import sys 5 | import csv 6 | import argparse 7 | from gff3 import Gff3Parser 8 | 9 | __author__ = "Sung-Huan Yu " 10 | __email__ = "sung-huan.yu@uni-wuerzburg.de" 11 | 12 | parser = argparse.ArgumentParser() 13 | parser.add_argument("-k","--ecocyc_file",help="terminators of EcoCyc") 14 | parser.add_argument("-p","--predict_file",help="ANNOgesic predicted terminator file") 15 | args = parser.parse_args() 16 | 17 | def main(): 18 | terms = [] 19 | detect = 0 20 | total = 0 21 | fh = open(args.ecocyc_file, "r") 22 | for row in csv.reader(fh, delimiter='\t'): 23 | if len(row) >= 4: 24 | total += 1 25 | terms.append({"id": row[0], "start": int(row[1]), 26 | "end": int(row[2])}) 27 | tot_term = 0 28 | for pre in Gff3Parser().entries(open(args.predict_file)): 29 | tot_term += 1 30 | for ref in terms: 31 | if ((pre.start >= ref["start"]) and ( 32 | pre.end <= ref["end"])) or ( 33 | (pre.start <= ref["start"]) and ( 34 | 
pre.end >= ref["end"])) or ( 35 | (pre.start >= ref["start"]) and ( 36 | pre.start <= ref["end"]) and ( 37 | pre.end >= ref["end"])) or ( 38 | (pre.start <= ref["start"]) and ( 39 | pre.end >= ref["start"]) and ( 40 | pre.end <= ref["end"])): 41 | detect += 1 42 | break 43 | print("the number of published terminators can be detected by ANNOgesic:" + str(detect)) 44 | print("total number of terminators in EcoCyc:" + str(total)) 45 | print("detection rate:" + str(float(detect)/float(total))) 46 | 47 | if __name__ == "__main__": 48 | main() 49 | -------------------------------------------------------------------------------- /comparison/compare_sORF.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | import os 4 | import sys 5 | import csv 6 | import argparse 7 | from gff3 import Gff3Parser 8 | 9 | __author__ = "Sung-Huan Yu " 10 | __email__ = "sung-huan.yu@uni-wuerzburg.de" 11 | 12 | parser = argparse.ArgumentParser() 13 | parser.add_argument("-k","--benchmark_file",help="the benchmarking set of sORF") 14 | parser.add_argument("-p","--predict_file",help="ANNOgesic predicted sORF file") 15 | args = parser.parse_args() 16 | 17 | def main(): 18 | sorfs = [] 19 | pres = [] 20 | num_ref = 0 21 | detect = 0 22 | for sorf in Gff3Parser().entries(open(args.benchmark_file)): 23 | num_ref += 1 24 | sorfs.append(sorf) 25 | for pre in Gff3Parser().entries(open(args.predict_file)): 26 | pres.append(pre) 27 | for sorf in sorfs: 28 | for pre in pres: 29 | if pre.strand == sorf.strand: 30 | if ((pre.start >= sorf.start) and ( 31 | pre.end <= sorf.end)) or ( 32 | (pre.start <= sorf.start) and ( 33 | pre.end >= sorf.end)) or ( 34 | (pre.start >= sorf.start) and ( 35 | pre.start <= sorf.end) and ( 36 | pre.end >= sorf.end)) or ( 37 | (pre.start <= sorf.start) and ( 38 | pre.end >= sorf.start) and ( 39 | pre.end <= sorf.end)): 40 | detect += 1 41 | sorf.attributes["detect"] = True 42 | break 43 | print("the number of known sORFs which can be detected by ANNOgesic:" + str(detect)) 44 | print("the total number of known sORFs:" + str(num_ref)) 45 | print("the detection rate:"+ str(float(detect) / float(num_ref))) 46 | 47 | 48 | if __name__ == "__main__": 49 | main() 50 | -------------------------------------------------------------------------------- /tests/test_plot_mountain.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import unittest 4 | import shutil 5 | from io import StringIO 6 | sys.path.append(".") 7 | from mock_gff3 import Create_generator 8 | from mock_helper import gen_file 9 | import annogesiclib.plot_mountain as pm 10 | 11 | 12 | class TestPlotMountain(unittest.TestCase): 13 | 14 | def setUp(self): 15 | self.test_folder = "test_folder" 16 | if (not os.path.exists(self.test_folder)): 17 | os.mkdir(self.test_folder) 18 | self.example = Example() 19 | 20 | def tearDown(self): 21 | if os.path.exists(self.test_folder): 22 | shutil.rmtree(self.test_folder) 23 | 24 | def test_plot_mountain_plot(self): 25 | gen_file(os.path.join(self.test_folder, "test"), self.example.mountain) 26 | pm.plot_mountain_plot(os.path.join(self.test_folder, "test"), 27 | os.path.join(self.test_folder, "out")) 28 | 29 | self.assertTrue(os.path.exists(os.path.join(self.test_folder, "out"))) 30 | 31 | class Example(object): 32 | 33 | mountain = """ 1 0 34 | 2 0.001304 35 | 3 0.0037577 36 | 4 0.0068858 37 | 5 0.015473 38 | 6 0.025351 39 | 7 0.71432 40 | 8 1.6366 41 | 9 2.615 42 | 10 3.6091 43 | & 44 | 1 
0 45 | 2 0 46 | 3 0 47 | 4 0 48 | 5 0 49 | 6 0 50 | 7 1 51 | 8 2 52 | 9 3 53 | 10 4 54 | & 55 | 1 0.018708 56 | 2 0.035075 57 | 3 0.043831 58 | 4 0.093979 59 | 5 0.10259 60 | 6 0.96026 61 | 7 0.4509 62 | 8 0.18699 63 | 9 0.062985 64 | 10 0.0055594 65 | & 66 | 1 0 67 | 2 0 68 | 3 0 69 | 4 0 70 | 5 0 71 | 6 0 72 | 7 1 73 | 8 2 74 | 9 3 75 | 10 4""" 76 | 77 | if __name__ == "__main__": 78 | unittest.main() 79 | 80 | -------------------------------------------------------------------------------- /comparison/compare_promoter_regulondb.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | import os 4 | import sys 5 | import csv 6 | import argparse 7 | import math 8 | from gff3 import Gff3Parser 9 | 10 | __author__ = "Sung-Huan Yu " 11 | __email__ = "sung-huan.yu@uni-wuerzburg.de" 12 | 13 | parser = argparse.ArgumentParser() 14 | parser.add_argument("-k","--regulondb_file",help="RegulonDB promoter file") 15 | parser.add_argument("-p","--predict_file",help="ANNOgesic promoter table") 16 | parser.add_argument("-f","--fuzzy", type=int, help="the tolerance nts for comparison") 17 | args = parser.parse_args() 18 | 19 | def main(): 20 | pros = {} 21 | pres = [] 22 | total = 0 23 | detect = 0 24 | ph = open(args.predict_file, "r") 25 | for row in csv.reader(ph, delimiter='\t'): 26 | if row[0] != "strain": 27 | pres.append({"start": int(row[1]), "strand": row[2]}) 28 | fh = open(args.regulondb_file, "r") 29 | for row in csv.reader(fh, delimiter='\t'): 30 | if (not row[0].startswith("#")) and (row[-1].lower() != "weak"): 31 | if row[2] == "forward": 32 | strand = "+" 33 | else: 34 | strand = "-" 35 | if int(row[3]) != 0: 36 | total += 1 37 | pros[row[0]] = {"start": int(row[3]), "strand": strand} 38 | for ref in pros.values(): 39 | for pre in pres: 40 | if pre["strand"] == ref["strand"]: 41 | if (math.fabs(ref["start"] - pre["start"]) <= args.fuzzy): 42 | detect += 1 43 | break 44 | print("the number of published promoters which can be found by ANNOgesic:" + str(detect)) 45 | print("total number of promoters in RegulonDB:" + str(total)) 46 | print("detection rate:" + str(float(detect)/float(total))) 47 | 48 | if __name__ == "__main__": 49 | main() 50 | -------------------------------------------------------------------------------- /tests/test_TSSpredator.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | import os 4 | import sys 5 | import csv 6 | import shutil 7 | import unittest 8 | from io import StringIO 9 | sys.path.append(".") 10 | from annogesiclib.TSSpredator_parser import TSSPredatorReader 11 | 12 | 13 | class TestTSSPredatorReader(unittest.TestCase): 14 | 15 | def setUp(self): 16 | self.example = Example() 17 | self.test_folder = "test_folder" 18 | if (not os.path.exists(self.test_folder)): 19 | os.mkdir(self.test_folder) 20 | self.tss = TSSPredatorReader() 21 | 22 | def tearDown(self): 23 | if os.path.exists(self.test_folder): 24 | shutil.rmtree(self.test_folder) 25 | 26 | def test_entries(self): 27 | input_fh = StringIO(self.example.master) 28 | tsss = [] 29 | for entry in self.tss.entries(input_fh): 30 | tsss.append(entry) 31 | self.assertEqual(tsss[0].pos, 179) 32 | self.assertTrue(tsss[1].is_primary) 33 | self.assertTrue(tsss[2].is_internal) 34 | 35 | 36 | class Example(object): 37 | 38 | master = """SuperPos SuperStrand mapCount detCount Genome detected enriched stepHeight stepFactor enrichmentFactor classCount Pos Strand Locus_tag sRNA/asRNA Product UTRlength 
GeneLength Primary Secondary Internal Antisense Automated Manual Putative sRNA Putative asRNA Comment Sequence -50 nt upstream + TSS (51nt) 39 | 179 - 1 1 test 1 1 4.45 31.93 8.69 1 179 - orphan orphan NA NA 0 0 0 0 1 0 0 0 ACCCTTGAATTGAGGGTGTTTTATACCTAAATTTAAAAAATGATGCTATAA 40 | 681 - 1 1 test 1 1 4.2 3.0 3.54 2 681 - HP0001 transcription antitermination protein NusB 48 417 1 0 0 0 1 0 0 0 GATTGAAAGAGCGGGCAGTAAAGCCGGCAATAAGGGCTTTGAAGCGATGAG 41 | 681 - 1 1 test 1 1 4.2 3.0 3.54 2 681 - HP0002 6%2C7-dimethyl-8-ribityllumazine synthase NA 471 0 0 1 0 1 0 0 0 GATTGAAAGAGCGGGCAGTAAAGCCGGCAATAAGGGCTTTGAAGCGATGAG""" 42 | if __name__ == "__main__": 43 | unittest.main() 44 | 45 | -------------------------------------------------------------------------------- /tests/uni_report.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | import os 4 | import sys 5 | import csv 6 | import argparse 7 | 8 | parser = argparse.ArgumentParser() 9 | parser.add_argument("-i","--input_file", help="input file") 10 | parser.add_argument("-o","--output_file",help="output file") 11 | args = parser.parse_args() 12 | 13 | def main(): 14 | out = open(args.output_file, "w") 15 | out.write("Name\tStmts\tMiss\tCover\n----------------------------------------------------\n") 16 | sts = 0 17 | miss = 0 18 | with open(args.input_file) as fh: 19 | for line in fh: 20 | line = line.strip() 21 | if line.startswith("annogesic"): 22 | datas = line.split(" ") 23 | covers = [] 24 | for data in datas: 25 | if len(data): 26 | covers.append(data) 27 | sts = sts + int(covers[1]) 28 | miss = miss + int(covers[2]) 29 | out.write("\t".join(covers)) 30 | out.write("\n") 31 | # sts = 0 32 | # miss = 0 33 | # for input_file in args.input_files: 34 | # with open(input_file) as fh: 35 | # for line in fh: 36 | # line = line.strip() 37 | # datas = line.split(" ") 38 | # covers = [] 39 | # if datas[0].split("/")[-1] == input_file.split("/")[-1].replace("unitest_test_", ""): 40 | # for data in datas: 41 | # if len(data): 42 | # covers.append(data) 43 | # sts = sts + int(covers[1]) 44 | # miss = miss + int(covers[2]) 45 | # out.write("\t".join(covers)) 46 | # out.write("\n") 47 | out.write("----------------------------------------------------\n") 48 | out.write("Total = " + str(100 - (100*(float(miss) / float(sts)))) + "%") 49 | out.close() 50 | if __name__ == "__main__": 51 | main() 52 | -------------------------------------------------------------------------------- /tests/test_modify_rbs_table.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import unittest 4 | import shutil 5 | from io import StringIO 6 | sys.path.append(".") 7 | from mock_helper import gen_file, import_data 8 | import annogesiclib.modify_rbs_table as mrt 9 | 10 | 11 | class TestGenSvg(unittest.TestCase): 12 | 13 | def setUp(self): 14 | self.test_folder = "test_folder" 15 | self.example = Example() 16 | if (not os.path.exists(self.test_folder)): 17 | os.mkdir(self.test_folder) 18 | 19 | def tearDown(self): 20 | if os.path.exists(self.test_folder): 21 | shutil.rmtree(self.test_folder) 22 | 23 | def test_modify_table(self): 24 | result = """#ID\tGenome\tStrand\tAssociated_CDS\tStart_genome\tEnd_genome\tRfam\tE_value\tScore\tStart_align\tEnd_align 25 | riboswitch_5\tStaphylococcus_aureus_HG003\t+\tSAOUHSC_00013\t15948\t16046\tRF00162\t1.6e-18\t74\t1\t99 26 | 
riboswitch_11\tStaphylococcus_aureus_HG003\t-\tSAOUHSC_00007\t27955\t28053\tRF00162\t1.6e-18\t74\t1\t99 27 | riboswitch_183\tStaphylococcus_aureus_HG003\t+\tSAOUHSC_00372\t377996\t378098\tRF00167\t2.2e-18\t45\t1\t103""" 28 | table = os.path.join(self.test_folder, "test") 29 | gen_file(table, self.example.ribos) 30 | mrt.modify_table(table, True) 31 | data = import_data(table) 32 | self.assertEqual("\n".join(data), result) 33 | gen_file(table, self.example.ribos) 34 | mrt.modify_table(table, False) 35 | data = import_data(table) 36 | self.assertEqual("\n".join(data), result) 37 | 38 | class Example(object): 39 | 40 | ribos = """riboswitch_5\tStaphylococcus_aureus_HG003\t+\tSAOUHSC_00013\t15948\t16046\tRF00162\t1.6e-18\t74\t1\t99 41 | riboswitch_11\tStaphylococcus_aureus_HG003\t-\tSAOUHSC_00007\t27955\t28053\tRF00162\t1.6e-18\t74\t1\t99 42 | riboswitch_183\tStaphylococcus_aureus_HG003\t+\tSAOUHSC_00372\t377996\t378098\tRF00167\t2.2e-18\t45\t1\t103""" 43 | 44 | if __name__ == "__main__": 45 | unittest.main() 46 | -------------------------------------------------------------------------------- /tests/test_parser_wig.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | import os 4 | import sys 5 | import csv 6 | import unittest 7 | import shutil 8 | from io import StringIO 9 | sys.path.append(".") 10 | from annogesiclib.parser_wig import WigParser 11 | 12 | 13 | class TestParserWig(unittest.TestCase): 14 | 15 | def setUp(self): 16 | self.example = Example() 17 | self.wig_parser = WigParser() 18 | self.test_folder = "test_folder" 19 | if (not os.path.exists(self.test_folder)): 20 | os.mkdir(self.test_folder) 21 | 22 | def tearDown(self): 23 | if os.path.exists(self.test_folder): 24 | shutil.rmtree(self.test_folder) 25 | 26 | def test_parser(self): 27 | wigs = [] 28 | wig_f_fh = StringIO(self.example.wig_forward_file) 29 | for entry in self.wig_parser.parser(wig_f_fh, "+"): 30 | self.assertEqual(entry.strain, "aaa") 31 | self.assertEqual(entry.track, "TSB_t0_TEX_forward") 32 | wigs.append(entry) 33 | self.assertEqual(wigs[2].pos, 3) 34 | self.assertEqual(wigs[2].coverage, 1.4041251228308191) 35 | wigs = [] 36 | wig_r_fh = StringIO(self.example.wig_reverse_file) 37 | for entry in self.wig_parser.parser(wig_r_fh, "-"): 38 | self.assertEqual(entry.strain, "aaa") 39 | self.assertEqual(entry.track, "TSB_t0_TEX_reverse") 40 | wigs.append(entry) 41 | self.assertEqual(wigs[2].pos, 3) 42 | self.assertEqual(wigs[2].coverage, 1.4041251228308191) 43 | 44 | class Example(object): 45 | wig_forward_file = """track type=wiggle_0 name="TSB_t0_TEX_forward" 46 | variableStep chrom=aaa span=1 47 | 3 1.4041251228308191 48 | 4 56.867067474648174 49 | 5 56.867067474648174""" 50 | 51 | wig_reverse_file = """track type=wiggle_0 name="TSB_t0_TEX_reverse" 52 | variableStep chrom=aaa span=1 53 | 3 -1.4041251228308191 54 | 4 -56.867067474648174 55 | 5 -56.867067474648174""" 56 | if __name__ == "__main__": 57 | unittest.main() 58 | -------------------------------------------------------------------------------- /annogesiclib/plot_mountain.py: -------------------------------------------------------------------------------- 1 | import matplotlib as mpl 2 | mpl.use('Agg') 3 | import matplotlib.pyplot 4 | matplotlib.pyplot.style.use('ggplot') 5 | 6 | 7 | def plot_mountain_plot(input_file, output_name): 8 | poss = [] 9 | values = [] 10 | check = 0 11 | pre_check = 0 12 | f_h = open(input_file, "r") 13 | while True: 14 | line = f_h.readline() 15 | line = line.rstrip() 16 
| if not line: 17 | matplotlib.pyplot.figure(1) 18 | matplotlib.pyplot.subplot(212) 19 | matplotlib.pyplot.xlabel('Nucleotide position') 20 | matplotlib.pyplot.ylabel('Entropy') 21 | matplotlib.pyplot.plot(values, color="black") 22 | matplotlib.pyplot.savefig(output_name, format='pdf') 23 | break 24 | elif line == "&": 25 | line = f_h.readline() 26 | line = line.rstrip() 27 | check += 1 28 | else: 29 | poss.append(float(line[0:4].replace(" ", ""))) 30 | values.append(float(line[5:].replace(" ", ""))) 31 | if check != pre_check: 32 | pre_check = check 33 | if check == 1: 34 | matplotlib.pyplot.figure(1) 35 | matplotlib.pyplot.subplot(211) 36 | ylabel = ("Number of enclosing nucleotides\nor\n" 37 | "Min free energy structure") 38 | matplotlib.pyplot.ylabel( 39 | ylabel, fontsize=10, multialignment='left') 40 | matplotlib.pyplot.plot(values, label='pair probabilities') 41 | values = [] 42 | poss = [] 43 | elif check == 2: 44 | matplotlib.pyplot.plot(values, label='mfe structure') 45 | matplotlib.pyplot.legend( 46 | bbox_to_anchor=(0., 1.02, 1., .102), 47 | loc=3, ncol=2, mode="expand", borderaxespad=0.) 48 | values = [] 49 | poss = [] 50 | f_h.close() 51 | matplotlib.pyplot.cla() 52 | matplotlib.pyplot.clf() 53 | -------------------------------------------------------------------------------- /comparison/compare_tran.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | import os 4 | import sys 5 | import csv 6 | import argparse 7 | from gff3 import Gff3Parser 8 | 9 | __author__ = "Sung-Huan Yu " 10 | __email__ = "sung-huan.yu@uni-wuerzburg.de" 11 | 12 | parser = argparse.ArgumentParser() 13 | parser.add_argument("-k","--ecocyc_file",help="the transcripts from EcoCyc") 14 | parser.add_argument("-p","--predict_file",help="ANNOgesic predicted transcripts") 15 | args = parser.parse_args() 16 | 17 | def main(): 18 | trans = {} 19 | pres = [] 20 | total = 0 21 | detect = 0 22 | for entry in Gff3Parser().entries(open(args.predict_file)): 23 | pres.append(entry) 24 | fh = open(args.ecocyc_file, "r") 25 | for row in csv.reader(fh, delimiter='\t'): 26 | if row[0] not in trans.keys(): 27 | total += 1 28 | trans[row[0]] = {"start": int(row[1]), "end": int(row[2])} 29 | else: 30 | if int(row[1]) < trans[row[0]]["start"]: 31 | trans[row[0]]["start"] = int(row[1]) 32 | if int(row[2]) > trans[row[0]]["end"]: 33 | trans[row[0]]["end"] = int(row[2]) 34 | for ref in trans.values(): 35 | for pre in pres: 36 | if ((pre.start >= ref["start"]) and ( 37 | pre.end <= ref["end"])) or ( 38 | (pre.start <= ref["start"]) and ( 39 | pre.end >= ref["end"])) or ( 40 | (pre.start >= ref["start"]) and ( 41 | pre.start <= ref["end"]) and ( 42 | pre.end >= ref["end"])) or ( 43 | (pre.start <= ref["start"]) and ( 44 | pre.end >= ref["start"]) and ( 45 | pre.end <= ref["end"])): 46 | detect += 1 47 | break 48 | print("the number of published transcripts which can be detected by ANNOgesic:" + str(detect)) 49 | print("total number of transcripts in EcoCyc:" + str(total)) 50 | print("detection rate:" + str(float(detect)/float(total))) 51 | 52 | if __name__ == "__main__": 53 | main() 54 | -------------------------------------------------------------------------------- /Table_dependency_version.txt: -------------------------------------------------------------------------------- 1 | Basic requirement: 2 | Python : version higher or equal to 3.4. 3 | BioPython: version higher or equal to 1.65. 4 | Wget: version higher or equal to 1.17.1. 
5 | Matplotlib : version higher or equal to 1.5.0. 6 | 7 | Annotation transfer: 8 | BioPerl: version higher or equal to 1.6.1. 9 | RATT : version higher or equal to 1.64. 10 | 11 | SNP calling: 12 | Samtools : version higher or equal to 1.3.1 (using htslib 1.3.1). 13 | Bcftools : version higher or equal to 1.3.1 (using htslib 1.3.1). 14 | 15 | TSS and PS prediction: 16 | TSSpredator : version higher or equal to 1.06. 17 | 18 | TSS and PS parameter optimization: 19 | TSSpredator : version higher or equal to 1.06. 20 | 21 | sRNA detection: 22 | Blast+ : version higher or equal to 2.2.28+. 23 | ViennaRNA : version higher or equal to 2.3.2. RNAfold, mountain.pl and relplot.pl are needed for sRNA prediction. 24 | 25 | Terminator detection: 26 | TranstermHP : version higher or equal to 2.09. 27 | ViennaRNA : version higher or equal to 2.3.2. RNAfold is needed for terminator prediction. 28 | 29 | Promoter search: 30 | MEME : version higher or equal to 4.11.1. 31 | GLAM2 : version higher or equal to 4.11.1. 32 | MPICH : version higher or equal to 3.2. It is for parallel version of promoter detection. 33 | 34 | sRNA target prediction: 35 | ViennaRNA : version higher or equal to 2.3.2. 36 | RNAup, RNAplex, RNAplfold are required for executing many modules of ANNOgesic. 37 | IntaRNA: version higher or equal to 2.0.4. 38 | 39 | Circular RNA detection: 40 | Samtools : version higher or equal to 1.3.1 (using htslib 1.3.1). 41 | Segemehl : version higher or equal to 0.1.9. 42 | 43 | Riboswitch and RNA thermometer identification: 44 | Infernal : version higher or equal to 1.1.1. 45 | 46 | CRISPR detection: 47 | CRT: version higher or equal to 1.2. 48 | 49 | Subcellular localization prediction: 50 | Psortb : version higher or equal to 3.0. 51 | 52 | Protein-protein interaction detection: 53 | Networkx : version higher or equal to 1.10. 54 | 55 | Generating screenshots of IGV: 56 | IGV : version higher or equal to 2.3.20. 57 | 58 | Colorization of screenshots: 59 | ImageMagick : version higher or equal to 6.9.0-0. 
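A minimal Python sketch for spot-checking a few of the requirements listed above; it assumes BioPython and Matplotlib are importable and that the command-line tools are on PATH, and it covers only a handful of the entries:

import shutil
import sys

import Bio
import matplotlib

# Interpreter and Python-package versions
print("Python >= 3.4:", sys.version_info >= (3, 4))
print("BioPython:", Bio.__version__)          # expected >= 1.65
print("Matplotlib:", matplotlib.__version__)  # expected >= 1.5.0

# Command-line tools (presence only; versions still need a manual check)
for tool in ("wget", "samtools", "bcftools"):
    print(tool, "->", shutil.which(tool) or "not found")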
60 | -------------------------------------------------------------------------------- /annogesiclib/TSSpredator_parser.py: -------------------------------------------------------------------------------- 1 | import csv 2 | 3 | 4 | class TSSPredatorReader(object): 5 | 6 | def entries(self, input_fh): 7 | for row in csv.reader(input_fh, delimiter="\t"): 8 | if row[0].startswith("SuperPos"): 9 | continue 10 | yield TSSPredatorEntry(row) 11 | 12 | 13 | class TSSPredatorEntry(object): 14 | 15 | def __init__(self, row): 16 | assert(len(row) == 30) 17 | self.super_pos = int(row[0]) 18 | self.super_strand = row[1] 19 | self.map_count = int(row[2]) 20 | self.det_count = int(row[3]) 21 | self.genome = row[4] 22 | self.is_detected = True if row[5] == "1" else False 23 | self.is_enriched = True if row[6] == "1" else False 24 | self.step_heigth = row[7] 25 | self.step_factor = row[8] 26 | self.enrichment_factor = row[9] 27 | self.class_count = int(row[10]) 28 | self.pos = int(row[11]) 29 | self.strand = row[12] 30 | self.locus_tag = row[13] 31 | self.srna_asrna = row[14] 32 | self.product = row[15] 33 | self.utr_length = row[16] 34 | self.gene_length = row[17] 35 | self.is_primary = True if row[18] == "1" else False 36 | self.is_secondary = True if row[19] == "1" else False 37 | self.is_internal = True if row[20] == "1" else False 38 | self.is_antisense = True if row[21] == "1" else False 39 | self.is_automated = True if row[22] == "1" else False 40 | self.is_manual = True if row[23] == "1" else False 41 | self.is_putative_srna = True if row[24] == "1" else False 42 | self.is_putative_asrna = True if row[25] == "1" else False 43 | self.comment = row[26] 44 | self.seq = row[27] 45 | self.contig_pos = row[28] 46 | self.contig_id = row[29] 47 | self.is_orphan = False 48 | if (self.is_primary is False and self.is_secondary is False and 49 | self.is_internal is False and self.is_antisense is False): 50 | self.is_orphan = True 51 | 52 | def __str__(self): 53 | return "%s %s %s" % (self.super_pos, self.super_strand, self.genome) 54 | -------------------------------------------------------------------------------- /annogesiclib/plot_coverage_table.py: -------------------------------------------------------------------------------- 1 | from copy import deepcopy 2 | import matplotlib as mpl 3 | mpl.use('Agg') 4 | import matplotlib.pyplot as plt 5 | plt.style.use('ggplot') 6 | 7 | 8 | def fig(rowlabels, collabels, cells, filename, max_color, min_color): 9 | row_num = len(rowlabels) / 100 10 | if row_num == 0: 11 | row_num = 1 12 | col_num = len(collabels) / 8 13 | if col_num == 0: 14 | col_num = 1 15 | plt.figure(figsize=(18*col_num, 10*row_num), edgecolor=None) 16 | img = plt.imshow(cells, interpolation='none', aspect='auto', cmap="RdBu_r") 17 | plt.xticks(range(len(collabels)), collabels, fontsize=6) 18 | plt.yticks(range(len(rowlabels)), rowlabels, fontsize=6) 19 | plt.colorbar(fraction=0.046, pad=0.04) 20 | img.set_clim(vmin=min_color, vmax=max_color) 21 | plt.savefig(filename) 22 | 23 | 24 | def plot_table(plots, max_color, min_color, filename): 25 | rowlabels = [] 26 | collabels = [] 27 | cells = [] 28 | first = True 29 | t_num = 0 30 | for plot in plots: 31 | for key, value in plot.items(): 32 | rowlabels.append(key) 33 | cell = [] 34 | for cond, tracks in value.items(): 35 | for track, cover in tracks.items(): 36 | if first: 37 | name = track 38 | if len(track) > 16: 39 | diff = int(len(name) / 16) 40 | for i in range(diff): 41 | name = (name[:(16)*(i+1)+i] + "\n" + 42 | name[(16)*(i+1)+i:]) 43 | 
collabels.append(name) 44 | cell.append(round(cover, 1)) 45 | cells.append(deepcopy(cell)) 46 | first = False 47 | if len(rowlabels) >= 500: 48 | plotname = (filename[:-4] + "_" + str(t_num) + "-" + 49 | str(t_num + 500) + ".png") 50 | fig(rowlabels, collabels, cells, plotname, max_color, min_color) 51 | t_num = t_num + 500 52 | rowlabels = [] 53 | cells = [] 54 | if t_num == 0: 55 | fig(rowlabels, collabels, cells, filename, max_color, min_color) 56 | -------------------------------------------------------------------------------- /comparison/compare_term_regulon.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | import os 4 | import sys 5 | import csv 6 | import argparse 7 | from gff3 import Gff3Parser 8 | 9 | __author__ = "Sung-Huan Yu " 10 | __email__ = "sung-huan.yu@uni-wuerzburg.de" 11 | 12 | parser = argparse.ArgumentParser() 13 | parser.add_argument("-k","--regulondb_file",help="terminators in RegulonDB") 14 | parser.add_argument("-p","--predict_file",help="ANNOgesic predicted terminators") 15 | args = parser.parse_args() 16 | 17 | def main(): 18 | terms = [] 19 | detect = 0 20 | total = 0 21 | fh = open(args.regulondb_file, "r") 22 | for row in csv.reader(fh, delimiter='\t'): 23 | if row[3] == "forward": 24 | strand = "+" 25 | else: 26 | strand = "-" 27 | total += 1 28 | terms.append({"id": row[0], "start": int(row[1]), 29 | "end": int(row[2]), "strand": strand}) 30 | if row[3] == "both": 31 | terms.append({"id": row[0], "start": int(row[1]), 32 | "end": int(row[2]), "strand": "+"}) 33 | total += 1 34 | for pre in Gff3Parser().entries(open(args.predict_file)): 35 | for ref in terms: 36 | if pre.strand == ref["strand"]: 37 | if ((pre.start >= ref["start"]) and ( 38 | pre.end <= ref["end"])) or ( 39 | (pre.start <= ref["start"]) and ( 40 | pre.end >= ref["end"])) or ( 41 | (pre.start >= ref["start"]) and ( 42 | pre.start <= ref["end"]) and ( 43 | pre.end >= ref["end"])) or ( 44 | (pre.start <= ref["start"]) and ( 45 | pre.end >= ref["start"]) and ( 46 | pre.end <= ref["end"])): 47 | detect += 1 48 | break 49 | print("the number of published terminators which can be detected by ANNOgesic:" + str(detect)) 50 | print("total number of terminators in RegulonDB:" + str(total)) 51 | print("detection rate:" + str(float(detect)/float(total))) 52 | 53 | if __name__ == "__main__": 54 | main() 55 | -------------------------------------------------------------------------------- /annogesiclib/sRNA_filter_frag.py: -------------------------------------------------------------------------------- 1 | import os 2 | import csv 3 | import shutil 4 | from annogesiclib.gff3 import Gff3Parser 5 | 6 | 7 | def filter_frag(srna_table, srna_gff): 8 | out = open("tmp_srna.gff", "w") 9 | out_ta = open("tmp_srna.csv", "w") 10 | out.write("##gff-version 3\n") 11 | gffs = [] 12 | tables = [] 13 | gff_parser = Gff3Parser() 14 | g_f = open(srna_gff, "r") 15 | for entry in gff_parser.entries(g_f): 16 | gffs.append(entry) 17 | fh = open(srna_table, "r") 18 | for row in csv.reader(fh, delimiter='\t'): 19 | tables.append(row) 20 | new_gffs = [] 21 | for gff in gffs: 22 | if ("UTR_type" in gff.attributes.keys()): 23 | if ("5utr" in gff.attributes["UTR_type"]) or ( 24 | "interCDS" in gff.attributes["UTR_type"]): 25 | for table in tables: 26 | if (gff.seq_id == table[0]) and ( 27 | gff.start == int(table[2])) and ( 28 | gff.end == int(table[3])) and ( 29 | gff.strand == table[4]): 30 | if "frag" in table[5]: 31 | new_gffs.append(gff) 32 | elif "3utr" in 
gff.attributes["UTR_type"]: 33 | new_gffs.append(gff) 34 | else: 35 | new_gffs.append(gff) 36 | new_tables = [] 37 | for table in tables: 38 | for gff in new_gffs: 39 | if (gff.seq_id == table[0]) and ( 40 | gff.start == int(table[2])) and ( 41 | gff.end == int(table[3])) and ( 42 | gff.strand == table[4]): 43 | new_tables.append(table) 44 | out_ta.write("\t".join(table) + "\n") 45 | for gff in new_gffs: 46 | for table in new_tables: 47 | if (gff.seq_id == table[0]) and ( 48 | gff.start == int(table[2])) and ( 49 | gff.end == int(table[3])) and ( 50 | gff.strand == table[4]): 51 | out.write(gff.info + "\n") 52 | g_f.close() 53 | fh.close() 54 | out.close() 55 | out_ta.close() 56 | os.remove(srna_gff) 57 | os.remove(srna_table) 58 | shutil.move("tmp_srna.gff", srna_gff) 59 | shutil.move("tmp_srna.csv", srna_table) 60 | -------------------------------------------------------------------------------- /annogesiclib/modify_rbs_table.py: -------------------------------------------------------------------------------- 1 | import shutil 2 | import csv 3 | 4 | 5 | def import_data(row): 6 | return{"strain": row[1], "strand": row[2], 7 | "associate": row[3], "start_seq": int(row[4]), 8 | "end_seq": int(row[5]), "rfam": row[6], "e": row[7], 9 | "score": row[8], 10 | "start_align": int(row[9]), "end_align": int(row[10]), 11 | "info": row[0:6], "ID": row[0]} 12 | 13 | 14 | def modify_table(table, output_all): 15 | first = True 16 | rbss = [] 17 | out = open("tmp.csv", "w") 18 | out.write("#ID\tGenome\tStrand\tAssociated_CDS\tStart_genome\t" 19 | "End_genome\tRfam\tE_value\tScore\tStart_align\tEnd_align\n") 20 | if output_all: 21 | with open(table) as fh: 22 | for line in fh: 23 | line = line.strip() 24 | if first: 25 | first = False 26 | rbss.append(line) 27 | out.write(line + "\n") 28 | else: 29 | if line not in rbss: 30 | rbss.append(line) 31 | out.write(line + "\n") 32 | else: 33 | fh = open(table, "r") 34 | for row in csv.reader(fh, delimiter='\t'): 35 | rbss.append(import_data(row)) 36 | ids = [] 37 | for rbs1 in rbss: 38 | repeat = False 39 | if "print" not in rbs1.keys(): 40 | rbs1["print"] = True 41 | for rbs2 in rbss: 42 | if (rbs1["strain"] == rbs2["strain"]) and \ 43 | (rbs1["strand"] == rbs2["strand"]) and \ 44 | (rbs1["ID"] == rbs2["ID"]): 45 | if "print" not in rbs2.keys(): 46 | rbs2["print"] = True 47 | repeat = True 48 | if (not repeat) or (rbs1["ID"] not in ids): 49 | ids.append(rbs1["ID"]) 50 | out.write("\t".join(rbs1["info"] + [rbs1["rfam"], 51 | rbs1["e"], rbs1["score"], 52 | str(rbs1["start_align"]), 53 | str(rbs1["end_align"])]) + "\n") 54 | fh.close() 55 | out.close() 56 | shutil.move("tmp.csv", table) 57 | -------------------------------------------------------------------------------- /comparison/compare_TSS_Salgado.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | import os 4 | import sys 5 | import csv 6 | import argparse 7 | import math 8 | from gff3 import Gff3Parser 9 | 10 | __author__ = "Sung-Huan Yu " 11 | __email__ = "sung-huan.yu@uni-wuerzburg.de" 12 | 13 | parser = argparse.ArgumentParser() 14 | parser.add_argument("-k","--regulondb_file",help="TSS file of Salgado et. 
al in RegulonDB")
15 | parser.add_argument("-p","--predict_file",help="ANNOgesic predicted TSS file")
16 | parser.add_argument("-f","--fuzzy", type=int, help="tolerance of nts for comparison")
17 | args = parser.parse_args()
18 | 
19 | def main():
20 |     pros = {}
21 |     tsss = []
22 |     total = 0
23 |     detect = 0
24 |     for entry in Gff3Parser().entries(open(args.predict_file)):
25 |         tsss.append(entry)
26 |     refs = []
27 |     fh = open(args.regulondb_file, "r")
28 |     for row in csv.reader(fh, delimiter='\t'):
29 |         if not row[0].startswith("#"):
30 |             if row[5] == "forward":
31 |                 strand = "+"
32 |             else:
33 |                 strand = "-"
34 |             total += 1
35 |             refs.append({"start": int(row[0]),
36 |                          "end": int(row[1]), "strand": strand})
37 |     for ref in refs:
38 |         ref["start"] = ref["start"] - args.fuzzy
39 |         ref["end"] = ref["end"] + args.fuzzy
40 |         for pre in tsss:
41 |             if pre.strand == ref["strand"]:
42 |                 if ((pre.start >= ref["start"]) and (
43 |                         pre.end <= ref["end"])) or (
44 |                         (pre.start <= ref["start"]) and (
45 |                         pre.end >= ref["end"])) or (
46 |                         (pre.start >= ref["start"]) and (
47 |                         pre.start <= ref["end"]) and (
48 |                         pre.end >= ref["end"])) or (
49 |                         (pre.start <= ref["start"]) and (
50 |                         pre.end >= ref["start"]) and (
51 |                         pre.end <= ref["end"])):
52 |                     detect += 1
53 |                     break
54 |     print("the number of reported TSSs which can be detected by ANNOgesic:" + str(detect))
55 |     print("total number of TSSs from Salgado et. al in RegulonDB:" + str(total))
56 |     print("detection rate:" + str(float(detect)/float(total)))
57 | 
58 | if __name__ == "__main__":
59 |     main()
60 | 
--------------------------------------------------------------------------------
/annogesiclib/parser_wig.py:
--------------------------------------------------------------------------------
1 | class WigParser(object):
2 |     '''Parse the wiggle file into entries based on
3 |     strain, track, position and coverage.'''
4 | 
5 |     def parser(self, wig_fh, strand):
6 |         track = ""
7 |         strain = ""
8 |         for line in wig_fh.readlines():
9 |             line = line.strip()
10 |             if (len(line) != 0) and (not line.startswith("#")):
11 |                 datas = line.split(" ")
12 |                 if (datas[0] == "variableStep") or (datas[0] == "fixedStep"):
13 |                     strain = datas[1].split("=")
14 |                     strain = strain[1].strip()
15 |                     pre_pos = 0
16 |                     first = True
17 |                 if (datas[0] == "track"):
18 |                     track = datas[2].split("=")
19 |                     track = track[1].replace("\"", "")
20 |                     pre_pos = 0
21 |                     first = True
22 |                 if (datas[0] != "track") and (
23 |                         datas[0] != "variableStep") and (
24 |                         datas[0] != "fixedStep"):
25 |                     if len(datas) != 2:
26 |                         datas = line.split("\t")
27 |                     if int(datas[0]) - 1 != pre_pos:
28 |                         for pos in range(pre_pos + 1, int(datas[0])):
29 |                             yield AssignValue(pos, 0, strand, strain, track)
30 |                         pre_pos = int(datas[0])
31 |                         first = True
32 |                     if (int(datas[0]) - 1 == pre_pos) or (first):
33 |                         pre_pos = int(datas[0])
34 |                         first = False
35 |                         yield AssignValue(datas[0], datas[1],
36 |                                           strand, strain, track)
37 | 
38 | 
39 | class AssignValue(object):
40 | 
41 |     def __init__(self, pos, coverage, strand, strain, track):
42 |         self.pos = int(pos)
43 |         if strand == "+":
44 |             self.coverage = float(coverage)
45 |         else:
46 |             if float(coverage) < 0:
47 |                 self.coverage = -1 * float(coverage)
48 |             else:
49 |                 self.coverage = float(coverage)
50 |         self.strand = strand
51 |         self.strain = strain
52 |         self.track = track
53 | 
54 |     def __str__(self):
55 |         return "{0} {1} {2} {3} {4}".format(
56 |             self.pos, self.coverage, self.strand, self.strain, self.track)
57 | 
--------------------------------------------------------------------------------
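A short usage sketch for WigParser (modelled on tests/test_parser_wig.py shown earlier; the wiggle content, chromosome name and track name below are invented for illustration). parser() yields one AssignValue per position, fills gaps between reported positions with zero-coverage entries, and turns negative reverse-strand coverages into positive values:

from io import StringIO
from annogesiclib.parser_wig import WigParser

demo_wig = StringIO('track type=wiggle_0 name="demo_forward"\n'
                    'variableStep chrom=aaa span=1\n'
                    '3 1.5\n'
                    '5 2.0\n')
for entry in WigParser().parser(demo_wig, "+"):
    # positions 1, 2 and 4 come out with coverage 0.0,
    # positions 3 and 5 carry the values from the file
    print(entry)  # e.g. "3 1.5 + aaa demo_forward"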
/annogesiclib/gen_promoter_table.py: -------------------------------------------------------------------------------- 1 | from annogesiclib.gff3 import Gff3Parser 2 | 3 | 4 | def gen_promoter_table(input_file, output_file, tss_file, type_): 5 | '''generate the table of promoter based on MEME''' 6 | tsss = [] 7 | gff_f = open(tss_file, "r") 8 | for entry in Gff3Parser().entries(gff_f): 9 | tsss.append(entry) 10 | out = open(output_file, "w") 11 | out.write("\t".join(["Genome", "TSS_position", 12 | "TSS_strand", "Motif"]) + "\n") 13 | detect = False 14 | num = 1 15 | with open(input_file) as fh: 16 | for line in fh: 17 | line = line.strip() 18 | if type_ == "meme": 19 | if line.startswith("MOTIF"): 20 | motif = line.split("MEME")[0].strip() 21 | datas = motif.split(" ") 22 | motif = datas[0] + "_" + datas[-1] 23 | detect = False 24 | elif (line.startswith("Sequence name")) and ( 25 | line.endswith("Site")): 26 | detect = True 27 | elif (len(line) == 0): 28 | detect = False 29 | elif (detect) and (not line.startswith("---")): 30 | tag = line.split(" ")[0] 31 | datas = tag.split("_") 32 | for tss in tsss: 33 | if ("_".join(datas[2:]) in tss.seq_id) and ( 34 | datas[0] == str(tss.start)) and ( 35 | datas[1] == tss.strand): 36 | out.write("\t".join([tss.seq_id, datas[0], 37 | datas[1], motif]) + "\n") 38 | elif type_ == "glam2": 39 | if line.startswith("*"): 40 | detect = True 41 | motif = "MOTIF_" + str(num) 42 | num += 1 43 | elif len(line) == 0: 44 | detect = False 45 | elif detect: 46 | datas = line.split(" ")[0].split("_") 47 | for tss in tsss: 48 | if ("_".join(datas[2:]) in tss.seq_id) and ( 49 | datas[0] == str(tss.start)) and ( 50 | datas[1] == tss.strand): 51 | out.write("\t".join([tss.seq_id, datas[0], 52 | datas[1], motif]) + "\n") 53 | -------------------------------------------------------------------------------- /comparison/compare_operon_regulondb.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | import os 4 | import sys 5 | import csv 6 | import argparse 7 | 8 | __author__ = "Sung-Huan Yu " 9 | __email__ = "sung-huan.yu@uni-wuerzburg.de" 10 | 11 | parser = argparse.ArgumentParser() 12 | parser.add_argument("-d","--regulondb_file",help="RegulonDB file") 13 | parser.add_argument("-p","--predict_file",help="ANNOgesic operon table") 14 | args = parser.parse_args() 15 | 16 | def main(): 17 | pre_op = "" 18 | operons = [] 19 | nums = {"detect": 0, "total": 0} 20 | fh = open(args.regulondb_file, "r") 21 | for row in csv.reader(fh, delimiter='\t'): 22 | if (not row[0].startswith("#")) and (row[-1] != "Weak"): 23 | nums["total"] += 1 24 | if row[3] == "forward": 25 | row[3] = "+" 26 | else: 27 | row[3] = "-" 28 | operons.append({"start": int(row[1]), "end": int(row[2]), "strand": row[3]}) 29 | sh = open(args.predict_file, "r") 30 | uniqs = [] 31 | for row in csv.reader(sh, delimiter='\t'): 32 | if row[0] != "Operon_ID": 33 | start = int(row[2].split("-")[0]) 34 | end = int(row[2].split("-")[-1]) 35 | for operon in operons: 36 | if operon["strand"] == row[3]: 37 | if ((operon["start"] <= start) and ( 38 | operon["end"] >= end)) or ( 39 | (operon["start"] >= start) and ( 40 | operon["end"] >= end)) or ( 41 | (operon["start"] >= start) and ( 42 | operon["start"] <= end) and ( 43 | operon["end"] >= end)) or ( 44 | (operon["start"] <= start) and ( 45 | operon["end"] >= start) and ( 46 | operon["end"] <= end)): 47 | if operon not in uniqs : 48 | nums["detect"] += 1 49 | uniqs.append(operon) 50 | operon["detect"] = True 51 | 
break 52 | pre_op = {"start": start, "end": end, "strand": row[3]} 53 | print("detected operons by ANNOgesic:" + str(nums["detect"])) 54 | print("total operon in RegulonDB:" + str(nums["total"])) 55 | print("detection rate:" + str(float(nums["detect"]/nums["total"]))) 56 | 57 | if __name__ == "__main__": 58 | main() 59 | -------------------------------------------------------------------------------- /annogesiclib/output_cutoff_table.py: -------------------------------------------------------------------------------- 1 | import os 2 | import csv 3 | from annogesiclib.gff3 import Gff3Parser 4 | 5 | 6 | def output_coverage(table_file, gff_file, cutoff_cover, stat_file, out_folder): 7 | out = open(os.path.join(out_folder, "tmp_srna_table"), "w") 8 | out_g = open(os.path.join(out_folder, "tmp_srna_gff"), "w") 9 | out.write("\t".join([ 10 | "Rank", "Genome", "Name", "Start", "End", "Strand", 11 | "Start_with_TSS/Cleavage_site", "End_with_cleavage", "Candidates", 12 | "Lib_type", "Best_avg_coverage", "Best_highest_coverage", 13 | "Best_lower_coverage", "Track/Coverage", 14 | "Normalized_secondary_energy_change(by_length)", 15 | "UTR_derived/Intergenic", "Confliction_of_sORF", "nr_hit_number", 16 | "sRNA_hit_number", "nr_hit_top3|ID|e-value", "sRNA_hit|e-value", 17 | "Overlap_CDS", "Overlap_percent", "End_with_terminator"]) + "\n") 18 | out_g.write("##gff-version 3\n") 19 | stat_out = open(stat_file, "w") 20 | nums = {5: 0} 21 | for i in range(10, 100, 10): 22 | nums[i] = 0 23 | for i in range(100, 1000, 100): 24 | nums[i] = 0 25 | for i in range(1000, 5000, 500): 26 | nums[i] = 0 27 | gffs = [] 28 | gh = open(gff_file, "r") 29 | for entry in Gff3Parser().entries(gh): 30 | gffs.append(entry) 31 | fh = open(table_file, "r") 32 | rank = 1 33 | new_gffs = [] 34 | for row in csv.reader(fh, delimiter='\t'): 35 | if row[0] != "rank": 36 | for cutoff in nums.keys(): 37 | if float(row[10]) >= cutoff: 38 | nums[cutoff] += 1 39 | if float(row[10]) >= cutoff_cover: 40 | row[0] = str(rank) 41 | out.write("\t".join(row) + "\n") 42 | rank += 1 43 | for gff in gffs: 44 | if (row[1] == gff.seq_id) and ( 45 | row[3] == str(gff.start)) and ( 46 | row[4] == str(gff.end)) and ( 47 | row[5] == gff.strand): 48 | new_gffs.append(gff) 49 | sort_gffs = sorted(new_gffs, key=lambda k: (k.seq_id, k.start, 50 | k.end, k.strand)) 51 | for gff in sort_gffs: 52 | out_g.write(gff.info + "\n") 53 | coverlist = sorted(nums, key=lambda key: nums[key]) 54 | stat_out.write("coverage\tfrequency\n") 55 | for cover in coverlist: 56 | stat_out.write("\t".join([str(cover), str(nums[cover])]) + "\n") 57 | -------------------------------------------------------------------------------- /tests/test_gff3.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | import os 4 | import sys 5 | import csv 6 | import shutil 7 | import unittest 8 | from io import StringIO 9 | sys.path.append(".") 10 | from annogesiclib.gff3 import Gff3Parser 11 | 12 | 13 | class TestGff3Parser(unittest.TestCase): 14 | 15 | def setUp(self): 16 | self.example = Example() 17 | self.gff_parser = Gff3Parser() 18 | self.test_folder = "test_folder" 19 | if (not os.path.exists(self.test_folder)): 20 | os.mkdir(self.test_folder) 21 | 22 | def tearDown(self): 23 | if os.path.exists(self.test_folder): 24 | shutil.rmtree(self.test_folder) 25 | 26 | def test_gff_parser(self): 27 | strains = [] 28 | features = [] 29 | starts = [] 30 | ends = [] 31 | strands = [] 32 | IDs = [] 33 | fh = StringIO(self.example.gff_file) 34 | 
for entry in self.gff_parser.entries(fh): 35 | strains.append(entry.seq_id) 36 | features.append(entry.feature) 37 | starts.append(entry.start) 38 | ends.append(entry.end) 39 | strands.append(entry.strand) 40 | IDs.append(entry.attributes["ID"]) 41 | self.assertListEqual(strains, ["aaa", "aaa", "aaa", 42 | "aaa", "bbb", "bbb"]) 43 | self.assertListEqual(features, ["gene", "CDS", "gene", 44 | "CDS", "gene", "tRNA"]) 45 | self.assertListEqual(starts, [517, 517, 2156, 2156, 4444, 4444]) 46 | self.assertListEqual(ends, [1878, 1878, 3289, 3289, 5444, 5444]) 47 | self.assertListEqual(strands, ["+", "+", "-", "-", "+", "+"]) 48 | self.assertListEqual(IDs, ["gene0", "cds0", "gene1", 49 | "cds1", "gene2", "rna0"]) 50 | 51 | class Example(object): 52 | 53 | gff_file = """#gff3 54 | aaa Refseq gene 517 1878 . + . Name=dnaA;locus_tag=AAA_00001;gene=dnaA;ID=gene0;db_xref=GeneID:3919798 55 | aaa Refseq CDS 517 1878 . + . protein_id=YP_498609.1;ID=cds0;Name=YP_498609.1;product=chromosomal replication initiation protein;Parent=gene0 56 | aaa Refseq gene 2156 3289 . - . Name=AAA_00002;locus_tag=AAA_00002;ID=gene1;db_xref=GeneID:3919799 57 | aaa Refseq CDS 2156 3289 . - . protein_id=YP_498610.1;ID=cds1;Name=YP_498610.1;locus_tag=AAA_00002 58 | bbb Refseq gene 4444 5444 . + . Name=AAA_T00004;locus_tag=AAA_T00004;ID=gene2 59 | bbb Refseq tRNA 4444 5444 . + . Name=AAA_T00018;locus_tag=AAA_T00004;ID=rna0""" 60 | 61 | if __name__ == "__main__": 62 | unittest.main() 63 | 64 | -------------------------------------------------------------------------------- /tests/test_gen_table_tran.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import unittest 4 | import shutil 5 | from io import StringIO 6 | sys.path.append(".") 7 | from mock_gff3 import Create_generator 8 | import annogesiclib.gen_table_tran as gtt 9 | 10 | 11 | class Mock_func(object): 12 | 13 | def __init__(self): 14 | self.example = Example() 15 | 16 | class TestGenTableTran(unittest.TestCase): 17 | 18 | def setUp(self): 19 | self.example = Example() 20 | self.test_folder = "test_folder" 21 | if (not os.path.exists(self.test_folder)): 22 | os.mkdir(self.test_folder) 23 | 24 | def tearDown(self): 25 | if os.path.exists(self.test_folder): 26 | shutil.rmtree(self.test_folder) 27 | 28 | def test_detect_coverage(self): 29 | infos = {} 30 | gtt.detect_coverage(self.example.wigs, self.example.tas[0], infos) 31 | self.assertDictEqual(infos, {'track_1': { 32 | 'high': 100, 'low': 2, 'avg': 33.529411764705884}}) 33 | 34 | def test_print_coverage(self): 35 | out = StringIO() 36 | out_gff = StringIO() 37 | gtt.print_coverage(self.example.tas, out, out_gff, 38 | self.example.wigs, self.example.wigs, None) 39 | self.assertEqual( 40 | out.getvalue(), 41 | "aaa\tTranscript_0\t4\t20\t+\tfragmented&TEX+/-\tNA\tNA\tNA\ttrack_1(33.529411764705884)\n") 42 | self.assertListEqual(out_gff.getvalue().split("\t")[:-1], 43 | ["aaa", "ANNOgesic", "Transcript", 44 | "4", "20", ".", "+", "."]) 45 | self.assertEqual( 46 | set(out_gff.getvalue().split("\t")[-1].strip().split(";")), 47 | set(["Name=Transcript_0", "detect_lib=fragmented&TEX+/-", 48 | "best_avg_coverage=33.529411764705884", "ID=tran0"])) 49 | class Example(object): 50 | wigs = {"aaa": {"frag_1": {"track_1|+|frag": [ 51 | 100, 30, 23, 21, 21, 2, 100, 30, 23, 21, 21, 2, 52 | 100, 30, 23, 21, 21, 2, 100, 30, 23, 21, 21, 2]}}} 53 | ta_dict = [{"seq_id": "aaa", "source": "ANNOgesic", 54 | "feature": "Transcript", "start": 4, 55 | "end": 20, "phase": ".", 
"strand": "+", "score": "."}] 56 | attributes_tas = [{"ID": "tran0", "Name": "Transcript_0", 57 | "detect_lib": "fragmented&tex_notex"}] 58 | tas = [] 59 | for index in range(0, 1): 60 | tas.append(Create_generator(ta_dict[index], 61 | attributes_tas[index], "gff")) 62 | if __name__ == "__main__": 63 | unittest.main() 64 | -------------------------------------------------------------------------------- /tests/test_seq_editer.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import unittest 4 | import shutil 5 | from io import StringIO 6 | sys.path.append(".") 7 | from mock_helper import gen_file, import_data 8 | from annogesiclib.seq_editer import SeqEditer 9 | 10 | 11 | class TestSeqEditer(unittest.TestCase): 12 | 13 | def setUp(self): 14 | self.example = Example() 15 | self.test_folder = "test_folder" 16 | self.fasta = os.path.join(self.test_folder, "fasta") 17 | if (not os.path.exists(self.test_folder)): 18 | os.mkdir(self.test_folder) 19 | os.mkdir(self.fasta) 20 | self.seq = SeqEditer() 21 | 22 | def tearDown(self): 23 | if os.path.exists(self.test_folder): 24 | shutil.rmtree(self.test_folder) 25 | 26 | def test_import_data(self): 27 | mod_table = os.path.join(self.test_folder, "mod") 28 | gen_file(mod_table, self.example.mutation) 29 | datas = self.seq._import_data(mod_table, "test") 30 | self.assertListEqual(datas, [{'target_id': 'test_NC_000915.1', 31 | 'datas': [{'ref_nt': 'c', 'tar_nt': '', 'position': '3'}, 32 | {'ref_nt': '-', 'tar_nt': 'deletion', 'position': '6'}], 'ref_id': 'NC_000915.1'}]) 33 | 34 | def test_modify_seq(self): 35 | mod_table = os.path.join(self.test_folder, "mod") 36 | gen_file(mod_table, self.example.mutation) 37 | gen_file(os.path.join(self.fasta, "NC_000915.1.fa"), 38 | self.example.fasta) 39 | self.seq.modify_seq(self.fasta, mod_table, self.test_folder, "test") 40 | datas = import_data(os.path.join(self.test_folder, "test_NC_000915.1.fa")) 41 | self.assertEqual("\n".join(datas), self.example.out_1) 42 | 43 | def test_modify_header(self): 44 | input_file = os.path.join(self.test_folder, "test.fa") 45 | gen_file(input_file, ">AAA|BBB|CCC|DDD|EEE\nACATACAAGTACAGTT") 46 | self.seq.modify_header(input_file) 47 | datas = import_data(input_file) 48 | self.assertEqual("\n".join(datas), ">DDD\nACATACAAGTACAGTT") 49 | 50 | 51 | class Example(object): 52 | 53 | fasta = """>NC_000915.1 54 | ATAGATAACCCAAGTACGACTCAGGTCCCTCACA""" 55 | out_1 = """>test_NC_000915.1 56 | ATGATdeletionAACCCAAGTACGACTCAGGTCCCTCACA""" 57 | out_2 = """>test_case2 58 | ATAGAgTAACCCAAGTACGACTCAGGTCCCTCACA""" 59 | mutation = """#refernce_id target_id reference_nt position target_nt impact of correction locus tag gene Description 60 | NC_000915.1 3 a c SAOUHSC_00002 dnaA XXXXXX 61 | NC_000915.1 6 a - deletion YYYYYY""" 62 | 63 | if __name__ == "__main__": 64 | unittest.main() 65 | -------------------------------------------------------------------------------- /tests/test_color_png.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import unittest 4 | import shutil 5 | from io import StringIO 6 | sys.path.append(".") 7 | from mock_helper import gen_file, import_data 8 | from annogesiclib.color_png import ColorPNG 9 | 10 | 11 | class Mock_func(object): 12 | def __init__(self): 13 | self.color = ColorPNG() 14 | 15 | def mock_convert_svg(self, imagemagick_path, out_path, 16 | screenshot, svg_file, log): 17 | gen_file(os.path.join(out_path, svg_file), 18 | "= 
ref["start"]) and ( 37 | # pre.end <= ref["end"])) or ( 38 | # (pre.start <= ref["start"]) and ( 39 | # pre.end >= ref["end"])) or ( 40 | # (pre.start >= ref["start"]) and ( 41 | # pre.start <= ref["end"]) and ( 42 | # pre.end >= ref["end"])) or ( 43 | # (pre.start <= ref["start"]) and ( 44 | # pre.end >= ref["start"]) and ( 45 | # pre.end <= ref["end"])): 46 | if pre.strand == ref.strand: 47 | if ((pre.start >= ref.start) and ( 48 | pre.end <= ref.end)) or ( 49 | (pre.start <= ref.start) and ( 50 | pre.end >= ref.end)) or ( 51 | (pre.start >= ref.start) and ( 52 | pre.start <= ref.end) and ( 53 | pre.end >= ref.end)) or ( 54 | (pre.start <= ref.start) and ( 55 | pre.end >= ref.start) and ( 56 | pre.end <= ref.end)): 57 | detect += 1 58 | break 59 | print("the number of published sRNAs which can be detected by ANNOgesic:" + str(detect)) 60 | print("total number of sRNAs in RefSeq:" + str(num_ref)) 61 | print("detection rate:" + str(float(detect) / float(num_ref))) 62 | 63 | if __name__ == "__main__": 64 | main() 65 | -------------------------------------------------------------------------------- /tests/test_filter_TSS_pro.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | import unittest 4 | import os 5 | import sys 6 | import shutil 7 | sys.path.append(".") 8 | from io import StringIO 9 | from mock_gff3 import Create_generator 10 | import annogesiclib.filter_TSS_pro as ftp 11 | 12 | 13 | class TestFilterTSSPro(unittest.TestCase): 14 | 15 | def setUp(self): 16 | self.test_folder = "test_project" 17 | if os.path.exists(self.test_folder): 18 | shutil.rmtree(self.test_folder) 19 | os.mkdir(self.test_folder) 20 | self.example = Example() 21 | 22 | def tearDown(self): 23 | if os.path.exists(self.test_folder): 24 | shutil.rmtree(self.test_folder) 25 | 26 | def test_compare_tss_pro(self): 27 | out = StringIO() 28 | ftp.compare_tss_pro(self.example.tars, self.example.refs, out, 3) 29 | self.assertEqual("\t".join(out.getvalue().split("\t")[0:-1]), 30 | "aaa\tRefseq\tTSS\t24\t24\t.\t+\t.") 31 | 32 | 33 | class Example(object): 34 | tar_dict = [ 35 | {"seq_id": "aaa", "source": "Refseq", "feature": "TSS", "start": 3, 36 | "end": 3, "phase": ".", "strand": "+", "score": "."}, 37 | {"seq_id": "aaa", "source": "Refseq", "feature": "TSS", "start": 24, 38 | "end": 24, "phase": ".", "strand": "+", "score": "."}, 39 | {"seq_id": "aaa", "source": "Refseq", "feature": "TSS", "start": 1243, 40 | "end": 1243, "phase": ".", "strand": "+", "score": "."}] 41 | attributes_tar = [{"coverage": "3", "ID": "tss1", "Name": "TSS:3_+"}, 42 | {"coverage": "340", "ID": "tss2", "Name": "TSS:24_+"}, 43 | {"coverage": "4440", "ID": "tss3", "Name": "TSS:1243_+"}] 44 | ref_dict = [ 45 | {"seq_id": "aaa", "source": "Refseq", "feature": "Pro", "start": 3, 46 | "end": 3, "phase": ".", "strand": "+", "score": "."}, 47 | {"seq_id": "aaa", "source": "Refseq", "feature": "Pro", "start": 333, 48 | "end": 333, "phase": ".", "strand": "+", "score": "."}, 49 | {"seq_id": "aaa", "source": "Refseq", "feature": "Pro", "start": 1242, 50 | "end": 1242, "phase": ".", "strand": "+", "score": "."}] 51 | attributes_ref = [{"coverage": "3", "ID": "pro1", "Name": "Pro:3_+"}, 52 | {"coverage": "330", "ID": "pro2", "Name": "Pro:333_+"}, 53 | {"coverage": "1230", "ID": "pro3", "Name": "Pro:1242_+"}] 54 | tars = [] 55 | refs = [] 56 | for index in range(0, 3): 57 | tars.append(Create_generator(tar_dict[index], 58 | attributes_tar[index], "gff")) 59 | 
refs.append(Create_generator(ref_dict[index], 60 | attributes_ref[index], "gff")) 61 | if __name__ == "__main__": 62 | unittest.main() 63 | -------------------------------------------------------------------------------- /tests/test_blast_class.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import unittest 4 | import shutil 5 | from io import StringIO 6 | sys.path.append(".") 7 | import annogesiclib.blast_class as blast_class 8 | 9 | class Mock_func(object): 10 | 11 | def mock_read_file(self, blast_file, nums): 12 | nums['total']['dnaA'] = 2 13 | nums['aaa'] = {} 14 | nums['aaa']['dnaA'] = 2 15 | 16 | class TestBlastClass(unittest.TestCase): 17 | 18 | def setUp(self): 19 | self.example = Example() 20 | self.test_folder = "test_folder" 21 | if (not os.path.exists(self.test_folder)): 22 | os.mkdir(self.test_folder) 23 | self.blast_file = os.path.join(self.test_folder, "test.csv") 24 | with open(self.blast_file, "w") as rh: 25 | rh.write(self.example.blast) 26 | 27 | def tearDown(self): 28 | if os.path.exists(self.test_folder): 29 | shutil.rmtree(self.test_folder) 30 | 31 | def test_read_file(self): 32 | nums = {} 33 | nums["total"] = {} 34 | blast_class.read_file(self.blast_file, nums) 35 | self.assertDictEqual(nums, {'aaa': {'dnaA': 2}, 'total':{'dnaA': 2}}) 36 | 37 | def test_blast_class(self): 38 | blast_class.read_file = Mock_func().mock_read_file 39 | lines = [] 40 | out_file = os.path.join(self.test_folder, "test.out") 41 | blast_class.blast_class(self.blast_file, out_file) 42 | with open(out_file) as fh: 43 | for line in fh: 44 | line = line.strip() 45 | lines.append(line) 46 | self.assertEqual(set(lines), set(self.example.blast_table.split("\n"))) 47 | 48 | class Example(object): 49 | blast = """1\taaa\tdnaA\t2377296\t2377454\t-\tTSS:2377454_-\tNA\t2377296-2377454\tTEX+/-;Fragmented\t260123.91873361162\t446839.7634471806\t-0.0\tpMEM_t2_TEX_reverse(avg=155022.7050613754;high=266113.8349051722;low=0.6611942741842581)\t-0.2075\tIntergenic\tNA\tNA\t6\tNA\tsrn_4390|S._aureus_NCTC8325|dnaA|3e-55\tNA\tNA\tNA""" 50 | read_out = [{'ID': 'hit_1', 'srna_name': 'dnaA', 'blast_strain': 'strain_b', 51 | 'strain': 'aaa', 'start': '100', 'name': 'RNA_test1', 52 | 'strand': '+', 'end': '200', 'e': '0.0005'}, 53 | {'ID': 'hit_2', 'srna_name': 'dnaa', 'blast_strain': 'strain_c', 54 | 'strain': 'aaa', 'start': '100', 'name': 'RNA_test1', 55 | 'strand': '+', 'end': '200', 'e': '0.0007'}, 56 | {'ID': 'hit_3', 'srna_name': 'dnaC', 'blast_strain': 'strain_b', 57 | 'strain': 'aaa', 'start': '400', 'name': 'RNA_test2', 58 | 'strand': '-', 'end': '450', 'e': '0.000002'}] 59 | 60 | blast_table = """aaa: 61 | sRNA_name amount 62 | dnaA 2""" 63 | 64 | if __name__ == "__main__": 65 | unittest.main() 66 | 67 | -------------------------------------------------------------------------------- /tests/test_goterm.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import unittest 4 | import shutil 5 | from io import StringIO 6 | sys.path.append(".") 7 | from annogesiclib.goterm import GoTermFinding 8 | from mock_args_container import MockClass 9 | 10 | 11 | class TestGetPolyT(unittest.TestCase): 12 | 13 | def setUp(self): 14 | self.mock_args = MockClass() 15 | self.test_folder = "test_folder" 16 | if (not os.path.exists(self.test_folder)): 17 | os.mkdir(self.test_folder) 18 | self.gffs = os.path.join(self.test_folder, "gff_folder") 19 | if (not os.path.exists(self.gffs)): 20 | 
os.mkdir(self.gffs) 21 | self.go_folder = os.path.join(self.test_folder, "go_folder") 22 | if (not os.path.exists(self.go_folder)): 23 | os.mkdir(self.go_folder) 24 | self.all_strain = "all_genomes_uniprot.csv" 25 | self.trans = os.path.join(self.test_folder, "tran_folder") 26 | if (not os.path.exists(self.trans)): 27 | os.mkdir(self.trans) 28 | args = self.mock_args.mock() 29 | args.out_folder = self.test_folder 30 | args.gffs = self.gffs 31 | args.trans = self.trans 32 | self.go = GoTermFinding(args) 33 | 34 | def tearDown(self): 35 | if os.path.exists(self.test_folder): 36 | shutil.rmtree(self.test_folder) 37 | 38 | def test_merge_files(self): 39 | gff_folder = os.path.join(self.gffs, "test.gff_folder") 40 | if (not os.path.exists(gff_folder)): 41 | os.mkdir(gff_folder) 42 | test1_folder = os.path.join(self.go_folder, "test1") 43 | if (not os.path.exists(test1_folder)): 44 | os.mkdir(test1_folder) 45 | test2_folder = os.path.join(self.go_folder, "test2") 46 | if (not os.path.exists(test2_folder)): 47 | os.mkdir(test2_folder) 48 | with open(os.path.join(gff_folder, "test1.gff"), "w") as fh: 49 | fh.write("test1") 50 | with open(os.path.join(gff_folder, "test2.gff"), "w") as fh: 51 | fh.write("test2") 52 | with open(os.path.join(test1_folder, "test1_uniprot.csv"), "w") as fh: 53 | fh.write("test1") 54 | with open(os.path.join(test2_folder, "test2_uniprot.csv"), "w") as fh: 55 | fh.write("test2") 56 | log = open(os.path.join(self.test_folder, "test.log"), "w") 57 | self.go._merge_files(self.gffs, self.go_folder, self.test_folder, log) 58 | out_file = os.path.join(self.go_folder, "test", self.all_strain) 59 | self.assertTrue(os.path.exists(out_file)) 60 | data = [] 61 | with open(out_file) as fh: 62 | for line in fh: 63 | data.append(line) 64 | self.assertEqual("".join(data), "Genome Strand Start End Protein_id Go_term\ntest2\ntest1\n") 65 | log.close() 66 | 67 | if __name__ == "__main__": 68 | unittest.main() 69 | -------------------------------------------------------------------------------- /comparison/compare_operon_door.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | import os 4 | import sys 5 | import csv 6 | import argparse 7 | 8 | __author__ = "Sung-Huan Yu " 9 | __email__ = "sung-huan.yu@uni-wuerzburg.de" 10 | 11 | parser = argparse.ArgumentParser() 12 | parser.add_argument("-d","--door2_file",help="door2 data") 13 | parser.add_argument("-p","--predict_file",help="ANNOgesic gff file") 14 | args = parser.parse_args() 15 | 16 | def main(): 17 | pre_op = "" 18 | operons = [] 19 | nums = {"detect": 0, "total": 0} 20 | fh = open(args.door2_file, "r") 21 | for row in csv.reader(fh, delimiter='\t'): 22 | if row[0] != "OperonID": 23 | if row[0] != pre_op: 24 | if len(pre_op) != 0: 25 | nums["total"] += 1 26 | operons.append({"start": start, "end": end, "strand": strand}) 27 | start = int(row[3]) 28 | end = int(row[4]) 29 | strand = row[5] 30 | else: 31 | start = int(row[3]) 32 | end = int(row[4]) 33 | strand = row[5] 34 | pre_op = row[0] 35 | else: 36 | if start > int(row[3]): 37 | start = int(row[3]) 38 | if end < int(row[4]): 39 | end = int(row[4]) 40 | sh = open(args.predict_file, "r") 41 | total_p = 0 42 | uniqs = [] 43 | for row in csv.reader(sh, delimiter='\t'): 44 | if row[0] != "Operon_ID": 45 | start = int(row[2].split("-")[0]) 46 | end = int(row[2].split("-")[-1]) 47 | for operon in operons: 48 | if operon["strand"] == row[3]: 49 | if ((operon["start"] <= start) and ( 50 | operon["end"] >= end)) or ( 51 | 
(operon["start"] >= start) and ( 52 | operon["end"] >= end)) or ( 53 | (operon["start"] >= start) and ( 54 | operon["start"] <= end) and ( 55 | operon["end"] >= end)) or ( 56 | (operon["start"] <= start) and ( 57 | operon["end"] >= start) and ( 58 | operon["end"] <= end)): 59 | if operon not in uniqs : 60 | nums["detect"] += 1 61 | uniqs.append(operon) 62 | operon["detect"] = True 63 | break 64 | pre_op = {"start": start, "end": end, "strand": row[3]} 65 | print("detected operon by ANNOgesic:" + str(nums["detect"])) 66 | print("detection rate:" + str(float(nums["detect"]/nums["total"]))) 67 | print("total number of DOOR2:" + str(nums["total"])) 68 | 69 | if __name__ == "__main__": 70 | main() 71 | -------------------------------------------------------------------------------- /annogesiclib/rbs_overlap.py: -------------------------------------------------------------------------------- 1 | import os 2 | import csv 3 | import shutil 4 | from annogesiclib.gff3 import Gff3Parser 5 | from annogesiclib.helper import Helper 6 | 7 | 8 | def read_gff(gff_file, type_): 9 | cdss = [] 10 | g_h = open(gff_file) 11 | for entry in Gff3Parser().entries(g_h): 12 | if (Helper().feature_without_notgene(entry)): 13 | if (type_ == "riboswitch") and (entry.feature != "riboswitch"): 14 | cdss.append(entry) 15 | elif (type_ == "thermometer") and ( 16 | entry.feature != "RNA_thermometer"): 17 | cdss.append(entry) 18 | cdss = sorted(cdss, key=lambda k: (k.seq_id, k.start, k.end, k.strand)) 19 | g_h.close() 20 | return cdss 21 | 22 | 23 | def check_repeat(tab, strain, strand, start, end, fuzzy): 24 | start = start + fuzzy 25 | end = end - fuzzy 26 | if (tab["strain"] == strain) and ( 27 | tab["strand"] == strand): 28 | if ((tab["start"] <= start) and ( 29 | tab["end"] >= end)) or ( 30 | (tab["start"] >= start) and ( 31 | tab["end"] <= end)) or ( 32 | (tab["start"] <= start) and ( 33 | tab["end"] <= end) and ( 34 | tab["end"] >= start)) or ( 35 | (tab["start"] >= start) and ( 36 | tab["start"] <= end) and ( 37 | tab["end"] >= end)): 38 | return True 39 | return False 40 | 41 | 42 | def rbs_overlap(table_file, gff_file, type_, fuzzy): 43 | tmp_tab = table_file + "_tmp" 44 | cdss = read_gff(gff_file, type_) 45 | out = open(tmp_tab, "w") 46 | fh = open(table_file, "r") 47 | tables = [] 48 | for row in csv.reader(fh, delimiter='\t'): 49 | if not row[0].startswith("#"): 50 | tables.append({"strain": row[1], "strand": row[2], 51 | "start": int(row[4]), "end": int(row[5]), 52 | "info": "\t".join(row)}) 53 | fh.close() 54 | for tab in tables: 55 | overlap = False 56 | for cds in cdss: 57 | overlap = check_repeat(tab, cds.seq_id, cds.strand, 58 | cds.start, cds.end, fuzzy) 59 | if overlap: 60 | break 61 | for com in tables: 62 | if tab != com: 63 | repeat = check_repeat(tab, com["strain"], com["strand"], 64 | com["start"], com["end"], 0) 65 | if (not overlap): 66 | if ((repeat) and ( 67 | "print" not in tab.keys()) and ( 68 | "print" not in com.keys())) or ( 69 | not repeat): 70 | overlap = False 71 | else: 72 | overlap = True 73 | if not overlap: 74 | tab["print"] = True 75 | out.write(tab["info"] + "\n") 76 | out.close() 77 | os.remove(table_file) 78 | shutil.move(tmp_tab, table_file) 79 | -------------------------------------------------------------------------------- /tests/test_expresssion.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import unittest 4 | import shutil 5 | from io import StringIO 6 | sys.path.append(".") 7 | from mock_helper import 
gen_file, import_data 8 | import annogesiclib.expression as express_file 9 | from annogesiclib.expression import Expression 10 | 11 | 12 | class Mock_func(object): 13 | 14 | def mock_expression(self, input_libs, gffs, percent_tex, 15 | percent_frag, wig_f_file, wig_r_file, 16 | features, merge_wigs, cutoff_coverage, 17 | tex_notex, replicates, stat, gff_folder, 18 | cover_type, max_color, min_color): 19 | pass 20 | 21 | class TestExpression(unittest.TestCase): 22 | 23 | def setUp(self): 24 | self.mock = Mock_func() 25 | self.test_folder = "test_folder" 26 | self.tex_path = os.path.join(self.test_folder, "tex") 27 | self.frag_path = os.path.join(self.test_folder, "frag") 28 | if (not os.path.exists(self.test_folder)): 29 | os.mkdir(self.test_folder) 30 | os.mkdir(self.tex_path) 31 | os.mkdir(self.frag_path) 32 | self.express = Expression(self.test_folder) 33 | 34 | def tearDown(self): 35 | if os.path.exists(self.test_folder): 36 | shutil.rmtree(self.test_folder) 37 | 38 | def test_get_replicates(self): 39 | replicates = self.express._get_replicates(2, 1) 40 | self.assertDictEqual({'tex': 2, 'frag': 1}, replicates) 41 | 42 | def test_expression(self): 43 | express_file.gene_expression = self.mock.mock_expression 44 | tex_libs=["tex_-TEX_forward.wig:notex:1:a:+", 45 | "tex_-TEX_reverse.wig:notex:1:a:-", 46 | "tex_+TEX_forward.wig:tex:1:a:+", 47 | "tex_+TEX_reverse.wig:tex:1:a:-"] 48 | frag_libs=["frag_forward.wig:frag:1:a:+", 49 | "frag_reverse.wig:frag:1:a:-"] 50 | gen_file(os.path.join(self.tex_path, "tex_-TEX_forward.wig"), "tex1") 51 | gen_file(os.path.join(self.tex_path, "tex_-TEX_reverse.wig"), "tex2") 52 | gen_file(os.path.join(self.tex_path, "tex_+TEX_forward.wig"), "tex3") 53 | gen_file(os.path.join(self.tex_path, "tex_+TEX_reverse.wig"), "tex4") 54 | gen_file(os.path.join(self.frag_path, "frag_forward.wig"), "frag1") 55 | gen_file(os.path.join(self.frag_path, "frag_reverse.wig"), "frag2") 56 | self.express.expression(tex_libs, frag_libs, 2, 2, 57 | 1, self.tex_path, self.frag_path, "all", 58 | "all", 5, self.test_folder, "CDS", 59 | "high", 100, 0) 60 | self.assertTrue(os.path.exists(os.path.join( 61 | self.test_folder, "for_libs"))) 62 | self.assertTrue(os.path.exists(os.path.join( 63 | self.test_folder, "for_libs", "statistics"))) 64 | self.assertTrue(os.path.exists(os.path.join( 65 | self.test_folder, "for_libs", "gffs"))) 66 | if __name__ == "__main__": 67 | unittest.main() 68 | 69 | -------------------------------------------------------------------------------- /annogesiclib/reorganize_table.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import shutil 3 | from annogesiclib.lib_reader import read_libs 4 | 5 | 6 | def import_covers(row): 7 | cover_names = [] 8 | covers = [] 9 | for data in row.split("("): 10 | if ")" not in data: 11 | cover_names.append(data) 12 | else: 13 | covers.append(data.split(")")[0]) 14 | if len(data.split(");")) == 2: 15 | cover_names.append(data.split(");")[-1]) 16 | return cover_names, covers 17 | 18 | def get_lib_name(libs): 19 | tracks = [] 20 | double_tracks = [] 21 | track_list = [] 22 | for lib1 in libs: 23 | for lib2 in libs: 24 | if (lib1["cond"] == lib2["cond"]) and ( 25 | lib1["type"] == lib2["type"]) and ( 26 | lib1["rep"] == lib2["rep"]) and ( 27 | lib1["strand"] != lib2["strand"]): 28 | track = "/".join([lib1["name"], lib2["name"]]) 29 | if track not in double_tracks: 30 | double_tracks.append(track) 31 | double_tracks.append("/".join([lib2["name"], 32 | lib1["name"]])) 33 | 
tracks.append(track) 34 | track_list.append([lib1["name"], lib2["name"]]) 35 | return tracks, track_list 36 | 37 | def reorganize_table(input_libs, wigs, cover_header, table_file): 38 | libs, texs = read_libs(input_libs, wigs) 39 | fh = open(table_file, "r") 40 | first = True 41 | headers = [] 42 | tracks, track_list = get_lib_name(libs) 43 | out = open(table_file + "tmp", "w") 44 | for row in csv.reader(fh, delimiter='\t'): 45 | if first: 46 | detect = False 47 | header_num = 0 48 | for header in row: 49 | if header == cover_header: 50 | index = header_num 51 | detect = True 52 | header_num += 1 53 | if not detect: 54 | headers.append(header) 55 | else: 56 | detect = False 57 | first = False 58 | for track in tracks: 59 | headers.append("Avg_coverage:" + track) 60 | out.write("\t".join(headers) + "\n") 61 | else: 62 | if len(row) < (index + 1): 63 | cover_names = [] 64 | covers = [] 65 | else: 66 | cover_names, covers = import_covers(row[index]) 67 | if len(row) == index + 1: 68 | row = row[:index] 69 | else: 70 | row = row[:index] + row[index + 1:] 71 | detects = ["Not_detect"] * len(tracks) 72 | for name, cover in zip(cover_names, covers): 73 | num_track = 0 74 | for track in track_list: 75 | if name in track: 76 | detects[num_track] = cover 77 | num_track += 1 78 | out.write("\t".join(row + detects) + "\n") 79 | out.close() 80 | shutil.move(table_file + "tmp", table_file) 81 | -------------------------------------------------------------------------------- /docs/source/docker.rst: -------------------------------------------------------------------------------- 1 | Docker image 2 | ============== 3 | 4 | `Docker `_ is a platform for distributing package. 5 | It is light and easy to manage. ``ANNOgesic`` includes a ``Dockerfile`` which 6 | is for build up a environment and install all required tools for running ``ANNOgesic``. 7 | 8 | Two ways can be used to build or pull Docker image: 9 | 10 | 1. You can simply pull the Docker image by running 11 | 12 | :: 13 | 14 | $ docker pull silasysh/annogesic 15 | 16 | 2. Alternatively, you can build the image by ``Dockerfile``. 17 | Please go to the folder where ``Dockerfile`` are located. Then type 18 | 19 | :: 20 | 21 | $ sudo docker build -t="annogesic" . 22 | 23 | It will build up an image called annogesic. You can see the images by typing ``docker images`` 24 | 25 | Based on different ways of installing docker image of ANNOgesic, the name of the docker image 26 | will be different. Pulling from DockerHub is: 27 | 28 | :: 29 | 30 | REPOSITORY TAG IMAGE ID CREATED VIRTUAL SIZE 31 | silasysh/annogesic latest d35f555694ad 3 days ago 2.782 GB 32 | ubuntu 14.04 d2a0ecffe6fa 11 days ago 188.4 MB 33 | 34 | Building Docker image by ``Dockerfile`` is: 35 | 36 | :: 37 | 38 | REPOSITORY TAG IMAGE ID CREATED VIRTUAL SIZE 39 | annogesic latest d35f555694ad 3 days ago 2.782 GB 40 | ubuntu 14.04 d2a0ecffe6fa 11 days ago 188.4 MB 41 | 42 | Then we can use the image to create a container for running ``ANNOgesic``. Now, we used ``silasysh/annogesic`` 43 | to represent Docker image. If you built Docker image by yourself, please replace ``silasysh/annogesic`` by ``annogesic``. 44 | Please type 45 | 46 | :: 47 | 48 | $ docker run -t -i silasysh/annogesic bash 49 | 50 | Then you will jump into the container. 51 | 52 | :: 53 | 54 | root@c9de31fcd7e3:~ ls 55 | ANNOgesic 56 | 57 | If you want to mount the files from your host to the container, just add ``-v`` to the command. 
58 | 59 | :: 60 | 61 | $ docker run -t -i -v /host/path/target:/file/path/within/container silasysh/annogesic bash 62 | 63 | The paths should be absolute path. If we go to ``root`` in container. We can see the file. 64 | 65 | 66 | If you want to copy the files from container to host, you can use ``cp``. 67 | 68 | :: 69 | 70 | $ docker cp :/file/path/within/container /host/path/target 71 | 72 | If you have no root permission for running Docker, Singularity is another way to 73 | build up the image without root permission. 74 | 75 | :: 76 | 77 | $ singularity build \ 78 | annogesic.img \ 79 | docker://silasysh/annogesic:latest 80 | 81 | After building Singularity image of ANNOgesic, the user just needs to put the following line before 82 | the command that needs to be executed. 83 | 84 | :: 85 | 86 | singularity exec -B $STORAGE_PATH annogesic.img 87 | 88 | Please put the storage path of your home directory to ``$STORAGE_PATH``. ``df`` can be used to check the 89 | storage system. 90 | -------------------------------------------------------------------------------- /tests/test_gen_svg.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import unittest 4 | import shutil 5 | from io import StringIO 6 | sys.path.append(".") 7 | from mock_helper import gen_file, import_data 8 | import annogesiclib.gen_svg as gs 9 | 10 | 11 | class TestGenSvg(unittest.TestCase): 12 | 13 | def setUp(self): 14 | self.test_folder = "test_folder" 15 | self.example = Example() 16 | if (not os.path.exists(self.test_folder)): 17 | os.mkdir(self.test_folder) 18 | 19 | def tearDown(self): 20 | if os.path.exists(self.test_folder): 21 | shutil.rmtree(self.test_folder) 22 | 23 | def test_gen_svg(self): 24 | gs.gen_svg("test_folder/test.png", 4, 1000, 400) 25 | data = import_data("test_folder/test.svg") 26 | self.assertEqual("\n".join(data), self.example.svg) 27 | 28 | 29 | class Example(object): 30 | 31 | svg = """ 32 | 33 | 34 | 46 | 48 | 49 | 51 | image/svg+xml 52 | 54 | 55 | 56 | 57 | 58 | 60 | 66 | 73 | 80 | 87 | 94 | """ 95 | svg = svg.replace("/home/silas/ANNOgesic", os.getcwd()) 96 | 97 | if __name__ == "__main__": 98 | unittest.main() 99 | -------------------------------------------------------------------------------- /annogesiclib/lib_reader.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os, gc 3 | import numpy as np 4 | from annogesiclib.parser_wig import WigParser 5 | 6 | 7 | def read_libs(input_libs, wig_folder): 8 | libs = [] 9 | texs = {} 10 | for lib in input_libs: 11 | datas = lib.split(":") 12 | name = None 13 | for wig in os.listdir(wig_folder): 14 | if wig == datas[0]: 15 | with open(os.path.join(wig_folder, wig), "r") as w_h: 16 | for line in w_h: 17 | line = line.strip() 18 | if line.startswith("track"): 19 | name = line.split("=")[-1].replace("\"", "") 20 | break 21 | if name is None: 22 | print("Error: The {0} can not be found in lib names!!!".format(wig)) 23 | if (datas[1] == "tex") or (datas[1] == "notex"): 24 | cond = "texnotex" 25 | else: 26 | cond = datas[1] 27 | libs.append({"name": name, "type": datas[1], 28 | "cond": "_".join([datas[2], cond]), 29 | "rep": datas[3], "strand": datas[4]}) 30 | for lib1 in libs: 31 | if lib1["type"] == "frag": 32 | pass 33 | elif (lib1["type"] == "tex") or (lib1["type"] == "notex"): 34 | prefix1 = lib1["cond"].split("_")[0] 35 | for lib2 in libs: 36 | prefix2 = lib2["cond"].split("_")[0] 37 | if (prefix1 == prefix2) and \ 38 | 
(lib1["rep"] == lib2["rep"]) and \ 39 | (lib1["type"] == "tex") and \ 40 | (lib2["type"] == "notex") and \ 41 | (lib1["strand"] == lib2["strand"]): 42 | texs[lib1["name"] + "@AND@" + lib2["name"]] = 0 43 | else: 44 | print("Error: Wrong library types are detected, " 45 | "please assign frag, tex or notex.") 46 | sys.exit() 47 | return libs, texs 48 | 49 | 50 | def read_wig(filename, strand, libs): 51 | wig_parser = WigParser() 52 | wigs = {} 53 | if filename is not False: 54 | wig_fh = open(filename) 55 | for entry in wig_parser.parser(wig_fh, strand): 56 | if entry.strain not in wigs.keys(): 57 | wigs[entry.strain] = {} 58 | for lib in libs: 59 | if lib["cond"] not in wigs[entry.strain]: 60 | wigs[entry.strain][lib["cond"]] = {} 61 | for lib in libs: 62 | if (lib["name"] == entry.track) and ( 63 | lib["strand"] == entry.strand): 64 | lib_name = "|".join([ 65 | entry.track, entry.strand, lib["type"]]) 66 | if lib_name not in wigs[entry.strain][lib["cond"]].keys(): 67 | wigs[entry.strain][lib["cond"]][lib_name] = [] 68 | wigs[entry.strain][lib["cond"]][lib_name].append(entry.coverage) 69 | wig_fh.close() 70 | for strain, conds in wigs.items(): 71 | for cond, lib_names in conds.items(): 72 | for lib_name, cover_list in lib_names.items(): 73 | wigs[strain][cond][lib_name] = np.array( 74 | wigs[strain][cond][lib_name]) 75 | return wigs 76 | -------------------------------------------------------------------------------- /tests/test_stat_TSSpredater.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import unittest 4 | import shutil 5 | import copy 6 | from io import StringIO 7 | sys.path.append(".") 8 | from mock_gff3 import Create_generator 9 | from mock_helper import import_data, gen_file, extract_info 10 | import annogesiclib.stat_TSSpredator as st 11 | 12 | 13 | class TestStatTSSpredator(unittest.TestCase): 14 | 15 | def setUp(self): 16 | self.example = Example() 17 | self.test_folder = "test_folder" 18 | if (not os.path.exists(self.test_folder)): 19 | os.mkdir(self.test_folder) 20 | 21 | def tearDown(self): 22 | if os.path.exists(self.test_folder): 23 | shutil.rmtree(self.test_folder) 24 | 25 | def test_stat(self): 26 | detect = False 27 | out_stat = StringIO() 28 | out_lib = StringIO() 29 | st.stat(self.example.tsss, "aaa", "TSS", out_stat, "test", out_lib) 30 | for data in out_stat.getvalue().split("\n"): 31 | if "Primary" in data: 32 | self.assertEqual(data.split(" = ")[-1], "1 (1.0)") 33 | if ("TSB_OD_0.2" in out_lib.getvalue()) and ( 34 | "pMEM_OD_0.5" in out_lib.getvalue()) and ( 35 | "pMEM_t2" in out_lib.getvalue()): 36 | detect = True 37 | self.assertTrue(detect) 38 | os.remove("test_class_aaa.png") 39 | 40 | def test_plot(self): 41 | st.plot(20, 23, 10, 13, 5, 100, 200, "name", 42 | "TSS", os.path.join(self.test_folder, "test")) 43 | self.assertTrue(os.path.exists(os.path.join( 44 | self.test_folder, "test_class_name.png"))) 45 | 46 | def test_stat_tsspredator(self): 47 | detect = False 48 | tss_file = os.path.join(self.test_folder, "aaa_TSS.gff") 49 | stat_file = os.path.join(self.test_folder, "stat") 50 | lib_file = os.path.join(self.test_folder, "lib") 51 | gen_file(tss_file, self.example.tss) 52 | st.stat_tsspredator(tss_file, "TSS", stat_file, lib_file) 53 | datas = import_data(stat_file) 54 | for data in datas: 55 | if "Primary" in data: 56 | self.assertEqual(data.split(" = ")[-1], "1 (1.0)") 57 | datas = import_data(lib_file) 58 | line = "\n".join(datas) 59 | if ("TSB_OD_0.2" in line) and ( 60 | 
"pMEM_OD_0.5" in line) and ( 61 | "pMEM_t2" in line): 62 | detect = True 63 | self.assertTrue(detect) 64 | self.assertTrue(os.path.exists("TSS_class_aaa.png")) 65 | os.remove("TSS_class_aaa.png") 66 | 67 | class Example(object): 68 | 69 | tss = """aaa TSSpredator TSS 2131 2131 . + . UTR_length=Primary_25;type=Primary;ID=tss3;libs=TSB_OD_0.2&pMEM_OD_0.5&pMEM_t2;associated_gene=SAOUHSC_00002;Name=TSS:2131_f""" 70 | tss_dict = [{"seq_id": "aaa", "source": "TSSpredator", 71 | "feature": "TSS", "start": 2131, 72 | "end": 2131, "phase": ".", "strand": "+", "score": "."}] 73 | attributes_tss = [{ 74 | "ID": "tss3", "Name": "TSS:2131_f", "UTR_length": "Primary_25", 75 | "type": "Primary", "associated_gene": "SAOUHSC_00002", 76 | "libs": "TSB_OD_0.2&pMEM_OD_0.5&pMEM_t2"}] 77 | tsss = [] 78 | tsss.append(Create_generator(tss_dict[0], attributes_tss[0], "gff")) 79 | 80 | if __name__ == "__main__": 81 | unittest.main() 82 | 83 | -------------------------------------------------------------------------------- /docs/source/installation.rst: -------------------------------------------------------------------------------- 1 | Installation 2 | ============ 3 | 4 | There are three ways to install ANNOgesic. Please refer to the following 5 | sections. ANNOgesic can only work when the requirements are installed properly. If 6 | you install ANNOgesic through source code or ``pip3``, please install the pre-required 7 | tools by yourself. 8 | 9 | 10 | Github 11 | ---------- 12 | 13 | All the source code including a run script (contains all the commands which are presented in tutorial) 14 | of ANNOgesic can be retrieve from our Git repository. Using the following commands can clone the 15 | source code easily. 16 | 17 | :: 18 | 19 | $ git clone https://github.com/Sung-Huan/ANNOgesic.git 20 | 21 | or 22 | 23 | :: 24 | 25 | $ git clone git@github.com:Sung-Huan/ANNOgesic.git 26 | 27 | In order to make ANNOgesic runnable, we should create a soft link of ``annogesiclib`` in ``bin``. 28 | 29 | :: 30 | 31 | $ cd ANNOgesic/bin 32 | $ ln -s ../annogesiclib . 33 | 34 | Docker 35 | ---------- 36 | 37 | Some modules of ANNOgesic need third-party tools. In order to avoid all the possible issue caused by the dependencies, 38 | a Docker image is provided. For the details of Docker image, please check `here `_. 39 | 40 | For using Docker image, please use one of the following commands: 41 | 42 | 1. You can simply pull the Docker image as following 43 | 44 | :: 45 | 46 | $ docker pull silasysh/annogesic 47 | 48 | 2. Alternatively, you can build the image via Dockerfile. 49 | Please Download the `Dockerfile `_ from our Git repository. 50 | Then switch to the folder which Dockerfile are located. For the following commands, please 51 | refer to `here `_. 52 | 53 | If you want to check other commands of Docker, please refer to `here `_. 54 | 55 | Singularity 56 | ----------- 57 | 58 | `Singularity `_ is another way to install ANNOgesic via 59 | Docker image without root permission. 60 | 61 | :: 62 | 63 | $ singularity build \ 64 | annogesic.img \ 65 | docker://silasysh/annogesic:latest 66 | 67 | After building Singularity image of ANNOgesic, the user just needs to put the following line before 68 | the command that needs to be executed. 69 | 70 | :: 71 | 72 | singularity exec -B $STORAGE_PATH annogesic.img 73 | 74 | Please put the storage path of your home directory to ``$STORAGE_PATH``. ``df`` can be used to check the 75 | storage system. 76 | 77 | pip3 78 | ---------- 79 | 80 | ANNOgesic is also hosted in PyPI server. 
Thus, it can be simply installed via ``pip3``. 81 | 82 | :: 83 | 84 | $ pip3 install ANNOgesic 85 | $ pip3 install ANNOgesic --upgrade 86 | 87 | You can also install ANNOgesic without root permission. 88 | 89 | :: 90 | 91 | $ pip3 install --user ANNOgesic 92 | $ pip3 install ANNOgesic --user --upgrade 93 | 94 | Install Dependencies 95 | -------------------- 96 | 97 | If the user want to install ANNOgesic via source code, ``get_package_database.sh`` can 98 | provide a way to install tools and download database automatically. The required versions 99 | of the tools will be shown on the screen as well. 100 | -------------------------------------------------------------------------------- /annogesiclib/check_srna_overlap.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import shutil 3 | from annogesiclib.helper import Helper 4 | from annogesiclib.gff3 import Gff3Parser 5 | 6 | 7 | def import_cds(gff): 8 | if "Name" in gff.attributes.keys(): 9 | return gff.attributes["Name"] 10 | elif "ID" in gff.attributes.keys(): 11 | return gff.attributes["ID"] 12 | else: 13 | name = "".join([gff.feature, ":", str(gff.start), "-", str(gff.end), 14 | "_", gff.strand]) 15 | return name 16 | 17 | 18 | def check_overlap(table_file, gff_file): 19 | out = open(table_file + "tmp", "w") 20 | gffs = [] 21 | gff_f = open(gff_file, "r") 22 | for entry in Gff3Parser().entries(gff_f): 23 | if Helper().feature_without_notgene(entry): 24 | gffs.append(entry) 25 | fh = open(table_file, "r") 26 | out.write("\t".join([ 27 | "Rank", "Genome", "Name", "Start", "End", "Strand", 28 | "Start_with_TSS/Cleavage_site", "End_with_cleavage", "Candidates", 29 | "Lib_type", "Best_avg_coverage", "Track/Coverage", 30 | "Normalized_secondary_energy_change(by_length)", "sRNA_types", 31 | "Conflict_sORF", "nr_hit_number", "sRNA_hit_number", 32 | "nr_hit_top3|ID|e-value|score", "sRNA_hit|e-value|score", "Overlap_CDS_forward", 33 | "Overlap_nts_forward", "Overlap_CDS_reverse", 34 | "Overlap_nts_reverse","End_with_terminator", 35 | "Associated_promoter", "sRNA_length"]) + "\n") 36 | for row in csv.reader(fh, delimiter='\t'): 37 | if row[3] != "Start": 38 | overlaps = {"forward": [], "reverse": [], 39 | "CDS_f": [], "CDS_r": []} 40 | start = int(row[3]) 41 | end = int(row[4]) 42 | for gff in gffs: 43 | if ((gff.end < end) and ( 44 | gff.end > start) and ( 45 | gff.start <= start)) or ( 46 | (gff.start > start) and ( 47 | gff.start < end) and ( 48 | gff.end >= end)) or ( 49 | (gff.end >= end) and ( 50 | gff.start <= start)) or ( 51 | (gff.end <= end) and ( 52 | gff.start >= start)): 53 | overlap = min(gff.end, end) - max(gff.start, start) + 1 54 | percent = "{0:.0f}%".format((float(overlap) / float(end - start + 1)) * 100) 55 | if gff.strand == "+": 56 | overlaps["forward"].append(str(overlap) + "(" + str(percent) + ")") 57 | overlaps["CDS_f"].append(import_cds(gff)) 58 | else: 59 | overlaps["reverse"].append(str(overlap) + "(" + str(percent) + ")") 60 | overlaps["CDS_r"].append(import_cds(gff)) 61 | if len(overlaps["forward"]) == 0: 62 | overlaps["forward"] = ["NA"] 63 | overlaps["CDS_f"] = ["NA"] 64 | if len(overlaps["reverse"]) == 0: 65 | overlaps["reverse"] = ["NA"] 66 | overlaps["CDS_r"] = ["NA"] 67 | out.write("\t".join(row[0:19] + [";".join(overlaps["CDS_f"]), ";".join(overlaps["forward"]), 68 | ";".join(overlaps["CDS_r"]), ";".join(overlaps["reverse"])] + 69 | row[21:]) + "\n") 70 | shutil.move(table_file + "tmp", table_file) 71 | 
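Note: the nested four-branch condition in ``check_overlap`` above (and its siblings in ``overlap.py``, ``compare_sRNA_sORF.py`` and ``sRNA_antisense.py`` further down) encodes a closed-interval overlap test on 1-based coordinates, followed by an overlap length and a percentage relative to the sRNA span. The sketch below restates that logic in a compact form for illustration only; the function names are hypothetical and not part of ANNOgesic, and the simplified test differs from the four explicit cases only at single-nucleotide boundary touches.

def intervals_overlap(start1, end1, start2, end2):
    # Closed, 1-based intervals overlap when each one starts no later
    # than the other one ends (compact form of the four cases above).
    return (start1 <= end2) and (start2 <= end1)


def overlap_stats(srna_start, srna_end, cds_start, cds_end):
    # Overlap length in nucleotides and percentage of the sRNA covered,
    # using the same min/max formulas as check_overlap().
    if not intervals_overlap(srna_start, srna_end, cds_start, cds_end):
        return 0, "0%"
    overlap = min(srna_end, cds_end) - max(srna_start, cds_start) + 1
    percent = "{0:.0f}%".format(
        (float(overlap) / float(srna_end - srna_start + 1)) * 100)
    return overlap, percent


# Example: an sRNA at 100-200 overlapping a CDS at 150-400
print(overlap_stats(100, 200, 150, 400))  # (51, '50%')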
-------------------------------------------------------------------------------- /annogesiclib/gen_svg.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | 4 | def print_track(track_num, svg_out, figure_width): 5 | id_num = 3067 6 | x = 2.0744663 7 | y = 131 8 | for track in range(track_num): 9 | if (track % 2) == 0: 10 | svg_out.write(" \n") 25 | if (track % 2) == 1: 26 | svg_out.write(" \n") 38 | id_num += 1 39 | 40 | 41 | def gen_svg(input_png, track_num, figure_height, figure_width): 42 | svg_out = open(input_png[:-4] + ".svg", "w") 43 | svg_out.write(""" 44 | 45 | 46 | 59 | 61 | 62 | 64 | image/svg+xml 65 | 67 | 68 | 69 | 70 | 71 | 73 | \n""") 81 | print_track(track_num, svg_out, figure_width) 82 | svg_out.write("") 83 | svg_out.close() 84 | -------------------------------------------------------------------------------- /annogesiclib/overlap.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | from annogesiclib.gff3 import Gff3Parser 4 | 5 | def get_overlap(anno, source, finals, overlaps, detect, out): 6 | if (anno.source in source) and ( 7 | anno not in overlaps): 8 | finals.append(anno) 9 | detect = True 10 | return detect 11 | 12 | def deal_overlap(out_folder, source): 13 | gffs = {} 14 | num = 0 15 | for gff_file in os.listdir(out_folder): 16 | if gff_file.endswith(".gff"): 17 | gff_f = open(os.path.join(out_folder, gff_file), "r") 18 | for entry in Gff3Parser().entries(gff_f): 19 | if entry.feature not in gffs.keys(): 20 | gffs[entry.feature] = [] 21 | gffs[entry.feature].append(entry) 22 | gff_f.close() 23 | out = open(os.path.join(out_folder, gff_file + "tmp"), "w") 24 | finals = [] 25 | overlaps = [] 26 | for feature, annos in gffs.items(): 27 | for anno1 in annos: 28 | detect = False 29 | for anno2 in annos: 30 | if (anno1.seq_id == anno2.seq_id) and ( 31 | anno1.strand == anno2.strand) and ( 32 | anno1 != anno2) and ( 33 | anno1.feature == anno2.feature) and ( 34 | anno1.source != anno2.source): 35 | if ((anno1.start <= anno2.start) and ( 36 | anno1.end >= anno2.end)) or ( 37 | (anno1.start >= anno2.start) and ( 38 | anno1.end <= anno2.end)) or ( 39 | (anno1.start <= anno2.start) and ( 40 | anno1.end <= anno2.end) and ( 41 | anno1.end >= anno2.start)) or ( 42 | (anno1.start >= anno2.start) and ( 43 | anno1.start <= anno2.end) and ( 44 | anno1.end >= anno2.end)): 45 | detect = get_overlap(anno1, source, finals, 46 | overlaps, detect, out) 47 | detect = get_overlap(anno2, source, finals, 48 | overlaps, detect, out) 49 | if detect: 50 | overlaps.append(anno1) 51 | overlaps.append(anno2) 52 | if (not detect) and (anno1 not in overlaps): 53 | finals.append(anno1) 54 | finals = sorted(finals, key=lambda x: (x.seq_id, x.start, 55 | x.end, x.strand)) 56 | for final in finals: 57 | if (final.feature == "region") or ( 58 | final.feature == "source") or ( 59 | final.feature == "remark"): 60 | out.write(final.info + "\n") 61 | break 62 | for final in finals: 63 | if (final.feature != "region") and ( 64 | final.feature != "source"): 65 | out.write(final.info + "\n") 66 | out.close() 67 | shutil.move(os.path.join(out_folder, gff_file + "tmp"), 68 | os.path.join(out_folder, gff_file)) 69 | -------------------------------------------------------------------------------- /annogesiclib/compare_sRNA_sORF.py: -------------------------------------------------------------------------------- 1 | from annogesiclib.gff3 import Gff3Parser 2 | from annogesiclib.helper import 
Helper 3 | 4 | 5 | def print_file(datas, out, feature): 6 | for data in datas: 7 | if feature not in data.attributes.keys(): 8 | data.attributes[feature] = "NA" 9 | else: 10 | data.attributes[feature] = ",".join(data.attributes[feature]) 11 | data.attribute_string = ";".join( 12 | ["=".join(items) for items in data.attributes.items()]) 13 | out.write("\t".join([data.info_without_attributes, 14 | data.attribute_string]) + "\n") 15 | 16 | 17 | def del_attributes(feature, entry): 18 | '''Remove to the useless attributes''' 19 | attributes = {} 20 | for key, value in entry.attributes.items(): 21 | if feature not in key: 22 | attributes[key] = value 23 | return attributes 24 | 25 | 26 | def srna_sorf_comparison(sRNA_file, sORF_file, sRNA_out, sORF_out): 27 | '''Comparison of sRNA and sORF. It can be a filter of sRNA detection''' 28 | sorfs = [] 29 | srnas = [] 30 | out_r = open(sRNA_out, "w") 31 | out_o = open(sORF_out, "w") 32 | out_r.write("##gff-version 3\n") 33 | out_o.write("##gff-version 3\n") 34 | for entry in Gff3Parser().entries(open(sRNA_file)): 35 | entry.attributes = del_attributes("sORF", entry) 36 | srnas.append(entry) 37 | srnas = sorted(srnas, key=lambda k: (k.seq_id, k.start, k.end, k.strand)) 38 | for entry in Gff3Parser().entries(open(sORF_file)): 39 | entry.attributes = del_attributes("sRNA", entry) 40 | sorfs.append(entry) 41 | sorfs = sorted(sorfs, key=lambda k: (k.seq_id, k.start, k.end, k.strand)) 42 | for srna in srnas: 43 | for sorf in sorfs: 44 | if (srna.seq_id == sorf.seq_id) and (srna.strand == sorf.strand): 45 | if ((srna.start <= sorf.start) and ( 46 | srna.end >= sorf.end)) or ( 47 | (srna.start >= sorf.start) and ( 48 | srna.end <= sorf.end)) or ( 49 | (srna.start <= sorf.start) and ( 50 | srna.end >= sorf.start) and ( 51 | srna.end <= sorf.end)) or ( 52 | (srna.start >= sorf.start) and ( 53 | srna.start <= sorf.end) and ( 54 | srna.end >= sorf.end)): 55 | if "sORF" not in srna.attributes.keys(): 56 | srna.attributes["sORF"] = [] 57 | strand = Helper().get_strand_name(sorf.strand) 58 | srna.attributes["sORF"].append("".join([ 59 | "sORF:", 60 | str(sorf.start), "-", 61 | str(sorf.end), 62 | "_", strand])) 63 | if "sRNA" not in sorf.attributes.keys(): 64 | sorf.attributes["sRNA"] = [] 65 | strand = Helper().get_strand_name(srna.strand) 66 | sorf.attributes["sRNA"].append("".join([ 67 | "sRNA:", 68 | str(srna.start), "-", 69 | str(srna.end), 70 | "_", strand])) 71 | print_file(sorfs, out_o, "sRNA") 72 | print_file(srnas, out_r, "sORF") 73 | out_r.close() 74 | out_o.close() 75 | -------------------------------------------------------------------------------- /annogesiclib/compare_srna_promoter.py: -------------------------------------------------------------------------------- 1 | import os 2 | import csv 3 | import shutil 4 | from annogesiclib.gff3 import Gff3Parser 5 | 6 | 7 | def read_file(gff_file, args_srna): 8 | srnas = [] 9 | for entry in Gff3Parser().entries(open(gff_file)): 10 | attributes = {} 11 | for key, value in entry.attributes.items(): 12 | if "promoter" not in key: 13 | attributes[key] = value 14 | entry.attributes = attributes 15 | srnas.append(entry) 16 | srnas = sorted(srnas, key=lambda k: (k.seq_id, k.start, k.end, k.strand)) 17 | fh = open(args_srna.promoter_table, "r") 18 | pros = [] 19 | for row in csv.reader(fh, delimiter='\t'): 20 | if (row[0] != "Genome") and ( 21 | row[3] in args_srna.promoter_name): 22 | pros.append({"strain": row[0], "pos": row[1], 23 | "strand": row[2], "name": row[3]}) 24 | fh.close() 25 | return srnas, pros 26 | 27 
| 28 | def print_table(srna_table, out_t, srnas): 29 | fh = open(srna_table, "r") 30 | for row in csv.reader(fh, delimiter='\t'): 31 | for srna in srnas: 32 | if (row[0] == srna.seq_id) and ( 33 | int(row[2]) == srna.start) and ( 34 | int(row[3]) == srna.end) and ( 35 | row[4] == srna.strand): 36 | if "promoter" in srna.attributes.keys(): 37 | promoter = [srna.attributes["promoter"]] 38 | else: 39 | promoter = ["NA"] 40 | out_t.write("\t".join(row + promoter) + "\n") 41 | 42 | 43 | def compare_srna_promoter(srna_gff, srna_table, args_srna): 44 | '''compare sRNA and promoter to find the sRNA 45 | which is associated with a promoter. 46 | it is for the ranking of sRNA''' 47 | srnas, pros = read_file(srna_gff, args_srna) 48 | out_g = open("tmp_srna.gff", "w") 49 | out_t = open("tmp_srna.csv", "w") 50 | out_g.write("##gff-version 3\n") 51 | for srna in srnas: 52 | tsss = [] 53 | detect = False 54 | if "with_TSS" in srna.attributes.keys(): 55 | if srna.attributes["with_TSS"] != "NA": 56 | datas = srna.attributes["with_TSS"].split(",") 57 | for data in datas: 58 | info = data.split(":")[-1] 59 | tss = info.split("_") 60 | tsss.append({"pos": tss[0], "strand": tss[-1]}) 61 | if len(tsss) != 0: 62 | for tss in tsss: 63 | for pro in pros: 64 | if (srna.seq_id == pro["strain"]) and ( 65 | tss["strand"] == pro["strand"]) and ( 66 | tss["pos"] == pro["pos"]): 67 | detect = True 68 | if "promoter" not in srna.attributes.keys(): 69 | srna.attributes["promoter"] = pro["name"] 70 | else: 71 | srna.attributes["promoter"] = ",".join([ 72 | srna.attributes["promoter"], 73 | pro["name"]]) 74 | if detect: 75 | out_g.write(srna.info + ";promoter=" + 76 | srna.attributes["promoter"] + "\n") 77 | else: 78 | out_g.write(srna.info + ";promoter=NA" + "\n") 79 | print_table(srna_table, out_t, srnas) 80 | os.remove(srna_gff) 81 | os.remove(srna_table) 82 | out_t.close() 83 | out_g.close() 84 | shutil.move("tmp_srna.gff", srna_gff) 85 | shutil.move("tmp_srna.csv", srna_table) 86 | -------------------------------------------------------------------------------- /annogesiclib/sRNA_antisense.py: -------------------------------------------------------------------------------- 1 | import os 2 | import csv 3 | import shutil 4 | from annogesiclib.gff3 import Gff3Parser 5 | 6 | 7 | def compare_srna_gff(gffs, strain, strand, start, end, srna_types, file_type): 8 | for gff in gffs: 9 | if (strain == gff.seq_id) and ( 10 | strand != gff.strand): 11 | if ((start <= gff.start) and ( 12 | end >= gff.end)) or ( 13 | (start >= gff.start) and ( 14 | end <= gff.end)) or ( 15 | (start <= gff.start) and ( 16 | end <= gff.end) and ( 17 | end >= gff.start)) or ( 18 | (start >= gff.start) and ( 19 | start <= gff.end) and ( 20 | end >= gff.end)): 21 | if file_type == "gff": 22 | if "antisense" not in srna_types: 23 | srna_types = srna_types + "," + "antisense" 24 | else: 25 | if "Antisense" not in srna_types: 26 | srna_types = srna_types + "," + "Antisense" 27 | return srna_types 28 | 29 | 30 | def srna_antisense(srna_gff, srna_table, gff_file): 31 | tmp_srna_gff = srna_gff + "tmp" 32 | tmp_srna_table = srna_table + "tmp" 33 | out = open(tmp_srna_gff, "w") 34 | out.write("##gff-version 3\n") 35 | out_t = open(tmp_srna_table, "w") 36 | out_t.write("\t".join(["Rank", "Genome", "Name", "Start", "End", "Strand", 37 | "Start_with_TSS/Cleavage_site", "End_with_cleavage", 38 | "Candidates", "Lib_type", "Best_avg_coverage", 39 | "Best_highest_coverage", "Best_lower_coverage", 40 | "Track/Coverage", 41 | 
"Normalized_secondary_energy_change(by_length)", 42 | "sRNA_types", "Confliction_of_sORF", 43 | "nr_hit_number", "sRNA_hit_number", 44 | "nr_hit_top3|ID|e-value", "sRNA_hit|e-value", 45 | "Overlap_CDS", "Overlap_percent", 46 | "End_with_terminator"]) + "\n") 47 | srnas = [] 48 | sf = open(srna_gff, "r") 49 | for entry in Gff3Parser().entries(sf): 50 | srnas.append(entry) 51 | tabs = [] 52 | fh = open(srna_table, "r") 53 | for row in csv.reader(fh, delimiter='\t'): 54 | if row[0] != "rank": 55 | tabs.append({"info": row, "strain": row[1], "strand": row[5], 56 | "start": int(row[3]), "end": int(row[4]), 57 | "srna_type": row[15]}) 58 | else: 59 | out_t.write("\t".join(row) + "\n") 60 | gffs = [] 61 | gf = open(gff_file, "r") 62 | for entry in Gff3Parser().entries(gf): 63 | gffs.append(entry) 64 | for srna in srnas: 65 | compare_srna_gff(gffs, srna.seq_id, srna.strand, srna.start, srna.end, 66 | srna.attributes["sRNA_type"], "gff") 67 | attribute_string = ";".join( 68 | ["=".join(items) for items in srna.attributes.items()]) 69 | out.write("\t".join([srna.info_without_attributes, 70 | attribute_string]) + "\n") 71 | for tab in tabs: 72 | compare_srna_gff(gffs, tab["strain"], tab["strand"], tab["start"], 73 | tab["end"], tab["srna_type"], "table") 74 | tab["info"][15] = tab["srna_type"] 75 | out_t.write("\t".join(tab["info"]) + "\n") 76 | os.remove(srna_gff) 77 | shutil.move(tmp_srna_gff, srna_gff) 78 | os.remove(srna_table) 79 | shutil.move(tmp_srna_table, srna_table) 80 | -------------------------------------------------------------------------------- /comparison/gff3.py: -------------------------------------------------------------------------------- 1 | import csv 2 | 3 | 4 | class Gff3Parser(object): 5 | """ 6 | A format description can be found at: 7 | http://genome.ucsc.edu/FAQ/FAQformat.html#format3 8 | http://www.sequenceontology.org/gff3.shtml 9 | 10 | a validator can be found here: 11 | http://modencode.oicr.on.ca/cgi-bin/validate_gff3_online 12 | 13 | WARNING: Currently this class in not strict enough and would also 14 | parse file not following the standard. 
15 | """ 16 | 17 | def entries(self, input_gff_fh): 18 | """ 19 | """ 20 | for entry_dict in csv.DictReader( 21 | input_gff_fh, delimiter="\t", 22 | fieldnames=["seq_id", "source", "feature", "start", 23 | "end", "score", "strand", "phase", "attributes"]): 24 | if entry_dict["seq_id"].startswith("#"): 25 | continue 26 | yield self._dict_to_entry(entry_dict) 27 | 28 | def _dict_to_entry(self, entry_dict): 29 | return Gff3Entry(entry_dict) 30 | 31 | 32 | class Gff3Entry(object): 33 | 34 | """ 35 | 36 | Example: 37 | start, end = sorted([int(pos) for pos in [start, end]]) 38 | Gff3Entry({ 39 | "seq_id" : seq_id, 40 | "source" : "MyLab", 41 | "feature" : "sRNA", 42 | "start" : start, 43 | "end" : end, 44 | "strand" : strand, 45 | "score" : ".", 46 | "phase" : ".", 47 | "attributes" : "name=%s;locus_tag=%s" % (name, locus_tag)}) 48 | """ 49 | 50 | def __init__(self, entry_dict): 51 | self.seq_id = entry_dict["seq_id"] 52 | self.source = entry_dict["source"] 53 | self.feature = entry_dict["feature"] 54 | # 1-based coordinates 55 | # Make sure that start <= end 56 | start, end = sorted([int(entry_dict["start"]), int(entry_dict["end"])]) 57 | self.start = start 58 | self.end = end 59 | self.score = entry_dict["score"] 60 | self.strand = entry_dict["strand"] 61 | self.phase = entry_dict["phase"] 62 | self.attributes = self._attributes(entry_dict["attributes"]) 63 | self.attribute_string = entry_dict["attributes"] 64 | self.info = "\t".join([str(field) for field in [ 65 | self.seq_id, self.source, self.feature, self.start, 66 | self.end, self.score, self.strand, self.phase, 67 | self.attribute_string]]) 68 | self.info_without_attributes = "\t".join([str(field) for field in [ 69 | self.seq_id, self.source, self.feature, self.start, 70 | self.end, self.score, self.strand, self.phase]]) 71 | 72 | def _attributes(self, attributes_string): 73 | """Translate the attribute string to dictionary""" 74 | attributes = {} 75 | if len(attributes_string) > 0: 76 | for attribute in attributes_string.split(";"): 77 | key_value_pair = attribute.split("=") 78 | key = key_value_pair[0] 79 | if len(key_value_pair) > 2: 80 | value = "=".join(key_value_pair[1:]) 81 | else: 82 | value = key_value_pair[1] 83 | attributes[key] = value 84 | return attributes 85 | else: 86 | return attributes 87 | 88 | def add_attribute(self, key, value): 89 | self.attributes[key] = value 90 | self.attribute_string = ";".join( 91 | ["=".join(items) for items in self.attributes.items()]) 92 | 93 | def __str__(self): 94 | return "\t".join([str(field) for field in [ 95 | self.seq_id, self.source, self.feature, self.start, 96 | self.end, self.score, self.strand, self.phase, 97 | self.attribute_string]]) 98 | -------------------------------------------------------------------------------- /tests/test_meme.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import unittest 4 | import shutil 5 | from io import StringIO 6 | sys.path.append(".") 7 | import annogesiclib.meme as me 8 | from mock_helper import gen_file 9 | from mock_args_container import MockClass 10 | from annogesiclib.meme import MEME 11 | 12 | 13 | class Mock_func(object): 14 | 15 | def mock_del_repeat_fasta(self, tmp_fasta, all_no_orph): 16 | with open("tmp/all_type.fa", "w") as fh: 17 | fh.write("all") 18 | with open("tmp/without_orphan.fa", "w") as fh: 19 | fh.write("without_orphan") 20 | 21 | class TestMEME(unittest.TestCase): 22 | 23 | def setUp(self): 24 | self.mock_args = MockClass() 25 | self.test_folder = 
"test_folder" 26 | self.out_folder = "test_folder/output" 27 | if (not os.path.exists(self.test_folder)): 28 | os.mkdir(self.test_folder) 29 | os.mkdir(self.out_folder) 30 | os.mkdir(os.path.join(self.out_folder, "fasta_output")) 31 | self.tss_folder = os.path.join(self.test_folder, "tss_folder") 32 | if (not os.path.exists(self.tss_folder)): 33 | os.mkdir(self.tss_folder) 34 | self.gff_folder = os.path.join(self.test_folder, "gff_folder") 35 | if (not os.path.exists(self.gff_folder)): 36 | os.mkdir(self.gff_folder) 37 | self.fa_folder = os.path.join(self.test_folder, "fa_folder") 38 | if (not os.path.exists(self.fa_folder)): 39 | os.mkdir(self.fa_folder) 40 | args = self.mock_args.mock() 41 | args.tsss = self.tss_folder 42 | args.fastas = self.fa_folder 43 | args.gffs = self.gff_folder 44 | args.output_folder = self.out_folder 45 | self.meme = MEME(args) 46 | 47 | def tearDown(self): 48 | if os.path.exists(self.test_folder): 49 | shutil.rmtree(self.test_folder) 50 | 51 | def test_move_and_merge_fasta(self): 52 | me.del_repeat_fasta = Mock_func().mock_del_repeat_fasta 53 | if (not os.path.exists("tmp")): 54 | os.mkdir("tmp") 55 | gen_file("tmp/primary.fa", "primary") 56 | gen_file("tmp/secondary.fa", "secondary") 57 | gen_file("tmp/internal.fa", "internal") 58 | gen_file("tmp/antisense.fa", "antisense") 59 | gen_file("tmp/orphan.fa", "orphan") 60 | self.meme._move_and_merge_fasta(self.test_folder, "test") 61 | self.assertTrue(os.path.exists(os.path.join( 62 | self.test_folder, "test_allgenome_all_types.fa"))) 63 | self.assertTrue(os.path.exists(os.path.join( 64 | self.test_folder, "test_allgenome_primary.fa"))) 65 | self.assertTrue(os.path.exists(os.path.join( 66 | self.test_folder, "test_allgenome_secondary.fa"))) 67 | self.assertTrue(os.path.exists(os.path.join( 68 | self.test_folder, "test_allgenome_internal.fa"))) 69 | self.assertTrue(os.path.exists(os.path.join( 70 | self.test_folder, "test_allgenome_antisense.fa"))) 71 | self.assertTrue(os.path.exists(os.path.join( 72 | self.test_folder, "test_allgenome_orphan.fa"))) 73 | self.assertTrue(os.path.exists(os.path.join( 74 | self.test_folder, "test_allgenome_without_orphan.fa"))) 75 | 76 | def test_split_fasta_by_strain(self): 77 | with open(os.path.join(self.fa_folder, "allgenome.fa"), "w") as fh: 78 | fh.write(""">aaa_aaa_aaa 79 | ATTATATATA 80 | >bbb_bbb_bbb 81 | AATTAATTAA""") 82 | self.meme._split_fasta_by_strain(self.fa_folder) 83 | self.assertTrue(os.path.join(self.fa_folder, "aaa.fa")) 84 | self.assertTrue(os.path.join(self.fa_folder, "bbb.fa")) 85 | 86 | if __name__ == "__main__": 87 | unittest.main() 88 | -------------------------------------------------------------------------------- /annogesiclib/screen.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | from annogesiclib.gen_screenshots import gen_screenshot 4 | from annogesiclib.helper import Helper 5 | 6 | 7 | class Screen(object): 8 | '''generation of screenshot''' 9 | 10 | def __init__(self, args_sc, out_folder): 11 | self.helper = Helper() 12 | args_sc.output_folder = out_folder 13 | filename = args_sc.fasta.split("/")[-1] 14 | self.strain = ".".join(filename.split(".")[0:-1]) 15 | self.helper.check_make_folder(os.path.join(args_sc.output_folder, 16 | self.strain)) 17 | self.forward_file = os.path.join(args_sc.output_folder, 18 | self.strain, "forward") 19 | self.reverse_file = os.path.join(args_sc.output_folder, 20 | self.strain, "reverse") 21 | os.mkdir(self.forward_file) 22 | 
os.mkdir(self.reverse_file) 23 | 24 | def _import_libs(self, texs, strand, lib_dict): 25 | if strand == "+": 26 | tex = "ft" 27 | notex = "fn" 28 | else: 29 | tex = "rt" 30 | notex = "rn" 31 | for flib in texs: 32 | if (flib[1] == "tex"): 33 | lib_dict[tex].append(flib[0]) 34 | for nlib in texs: 35 | if (nlib[1] == "notex") and \ 36 | (flib[2] == nlib[2]) and \ 37 | (flib[3] == nlib[3]): 38 | lib_dict[notex].append(nlib[0]) 39 | 40 | def screenshot(self, args_sc, log): 41 | lib_dict = {"ft": [], "fn": [], "rt": [], "rn": [], "ff": [], "rf": []} 42 | f_texs = [] 43 | r_texs = [] 44 | if args_sc.tlibs is not None: 45 | for lib in args_sc.tlibs: 46 | lib_datas = lib.split(":") 47 | if not lib_datas[0].endswith(".wig"): 48 | log.write("Wiggle files should end with .wig.\n") 49 | print("Error: Wiggle files should end with .wig!") 50 | sys.exit() 51 | else: 52 | if lib_datas[-1] == "+": 53 | f_texs.append(lib_datas) 54 | else: 55 | r_texs.append(lib_datas) 56 | f_texs = sorted(f_texs, key=lambda x: (x[1], x[2], x[3])) 57 | r_texs = sorted(r_texs, key=lambda x: (x[1], x[2], x[3])) 58 | self._import_libs(f_texs, "+", lib_dict) 59 | self._import_libs(r_texs, "-", lib_dict) 60 | if args_sc.flibs is not None: 61 | for lib in args_sc.flibs: 62 | lib_datas = lib.split(":") 63 | if not lib_datas[0].endswith(".wig"): 64 | log.write("Wiggle files should end with .wig.\n") 65 | print("Error: Wiggle files should end with .wig!") 66 | sys.exit() 67 | else: 68 | if lib_datas[-1] == "+": 69 | lib_dict["ff"].append(lib_datas[0]) 70 | else: 71 | lib_dict["rf"].append(lib_datas[0]) 72 | log.write("Running gen_screenshots.py to generate IGV batch script.\n") 73 | gen_screenshot(args_sc, lib_dict, self.forward_file + ".txt", 74 | self.reverse_file + ".txt", self.strain) 75 | log.write("\t" + self.forward_file + ".txt is generated.\n") 76 | log.write("\t" + self.reverse_file + ".txt is generated.\n") 77 | if (args_sc.tlibs is None) and (args_sc.flibs is None): 78 | log.write("No wig files can be found.\n") 79 | print("Error: There is no wig file assigned!") 80 | sys.exit() 81 | -------------------------------------------------------------------------------- /tests/test_stat_sublocal.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import unittest 4 | import shutil 5 | import copy 6 | from io import StringIO 7 | sys.path.append(".") 8 | from mock_gff3 import Create_generator 9 | from mock_helper import import_data, gen_file, extract_info 10 | import annogesiclib.stat_sublocal as ss 11 | 12 | 13 | class TestStatSubLocal(unittest.TestCase): 14 | 15 | def setUp(self): 16 | self.example = Example() 17 | self.test_folder = "test_folder" 18 | if (not os.path.exists(self.test_folder)): 19 | os.mkdir(self.test_folder) 20 | 21 | def tearDown(self): 22 | if os.path.exists(self.test_folder): 23 | shutil.rmtree(self.test_folder) 24 | 25 | def test_read_table(self): 26 | psortb_file = os.path.join(self.test_folder, "test.csv") 27 | gen_file(psortb_file, self.example.table) 28 | subs, total_nums, unknown_nums = ss.read_table(psortb_file) 29 | self.assertDictEqual(subs, { 30 | 'Staphylococcus_aureus_HG002': {'Unknown': 1}, 31 | 'Staphylococcus_aureus_HG003': {'CellWall': 1, 'Cytoplasmic': 2}, 32 | 'all_genome': {'Unknown': 1, 'CellWall': 1, 'Cytoplasmic': 2}}) 33 | self.assertDictEqual(total_nums, { 34 | 'Staphylococcus_aureus_HG002': 1, 35 | 'Staphylococcus_aureus_HG003': 3, 'all_genome': 4}) 36 | self.assertDictEqual(unknown_nums, { 37 | 
'Staphylococcus_aureus_HG002': 1, 38 | 'Staphylococcus_aureus_HG003': 0, 'all_genome': 1}) 39 | 40 | def test_print_file_and_plot(self): 41 | out_stat = StringIO() 42 | sub = {'Unknown': 1, 'CellWall': 1, 'Cytoplasmic': 2} 43 | total_nums = {'Staphylococcus_aureus_HG002': 1, 44 | 'Staphylococcus_aureus_HG003': 3, 'all_strain': 4} 45 | unknown_nums = {'Staphylococcus_aureus_HG002': 1, 46 | 'Staphylococcus_aureus_HG003': 0, 'all_strain': 1} 47 | ss.print_file_and_plot(sub, total_nums, unknown_nums, 48 | "all_strain", out_stat, self.test_folder + "/") 49 | datas = out_stat.getvalue().split("\n") 50 | for data in datas: 51 | if "Total with Unknown" in data: 52 | self.assertEqual(data, 53 | ("Total including Unknown is 4; " 54 | "Total excluding Unknown is 3")) 55 | elif "CellWall" in data: 56 | self.assertEqual(data, 57 | ("\tCellWall\t1(including Unknown 0.25; " 58 | "excluding Unknonwn 0.3333333333333333)")) 59 | elif "Cytoplasmic" in data: 60 | self.assertEqual(data, 61 | ("\tCytoplasmic\t2(including Unknown 0.5; " 62 | "excluding Unknonwn 0.6666666666666666)")) 63 | else: 64 | if "include Unknown" in data: 65 | self.assertEqual(data, 66 | ("\tUnknown\t1(including Unknown 0.25)")) 67 | 68 | def test_plot(self): 69 | subs = {'Unknown': 1, 'CellWall': 1, 'Cytoplasmic': 2} 70 | ss.plot(subs, 4, 1, "test", self.test_folder + "/") 71 | self.assertTrue(os.path.exists(os.path.join( 72 | self.test_folder, "_test_sublocal.png"))) 73 | 74 | 75 | class Example(object): 76 | 77 | table = """Staphylococcus_aureus_HG003 YP_498609.1 + 517 1878 Cytoplasmic 9.97 78 | Staphylococcus_aureus_HG003 YP_498610.1 + 2156 3289 Cytoplasmic 9.97 79 | Staphylococcus_aureus_HG003 YP_498611.1 + 3670 3915 CellWall 7.50 80 | Staphylococcus_aureus_HG002 YP_498612.1 + 4676 5015 Unknown 7.50""" 81 | 82 | if __name__ == "__main__": 83 | unittest.main() 84 | 85 | -------------------------------------------------------------------------------- /tests/test_optimize.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import unittest 4 | import shutil 5 | from io import StringIO 6 | sys.path.append(".") 7 | from mock_helper import gen_file, import_data 8 | from mock_args_container import MockClass 9 | import annogesiclib.optimize as opt 10 | 11 | 12 | class Mock_helper(object): 13 | 14 | def __init__(self): 15 | pass 16 | 17 | def check_uni_attributes(self, gff_file): 18 | pass 19 | 20 | def remove_all_content(self, filename, feature, type_): 21 | pass 22 | 23 | def remove_tmp(self, wigs): 24 | pass 25 | 26 | def remove_tmp_dir(self, folder): 27 | pass 28 | 29 | 30 | class Mock_multiparser(object): 31 | 32 | def __init__(self): 33 | pass 34 | 35 | def parser_wig(self, wigs): 36 | pass 37 | 38 | def parser_gff(self, gffs, feature): 39 | pass 40 | 41 | def parser_fasta(self, fastas): 42 | pass 43 | 44 | class Mock_func(object): 45 | 46 | def mock_optimization(self, wig_path, fasta_file, gff_file, 47 | args, strain, manual, length, log): 48 | gen_file(os.path.join(args.output_folder, "test.csv"), "test") 49 | 50 | class TestOptimizeTSS(unittest.TestCase): 51 | 52 | def setUp(self): 53 | self.mock_args = MockClass() 54 | self.test_folder = "test_folder" 55 | self.fastas = os.path.join(self.test_folder, "fasta") 56 | self.wigs = os.path.join(self.test_folder, "wigs") 57 | self.gffs = os.path.join(self.test_folder, "gffs") 58 | self.manuals = os.path.join(self.test_folder, "manuals") 59 | if (not os.path.exists(self.test_folder)): 60 | os.mkdir(self.test_folder) 61 | 
os.mkdir(self.fastas) 62 | os.mkdir(os.path.join(self.fastas, "tmp")) 63 | os.mkdir(self.wigs) 64 | os.mkdir(os.path.join(self.wigs, "tmp")) 65 | os.mkdir(self.gffs) 66 | os.mkdir(os.path.join(self.gffs, "tmp")) 67 | os.mkdir(self.manuals) 68 | os.mkdir(os.path.join(self.manuals, "tmp")) 69 | 70 | def tearDown(self): 71 | if os.path.exists(self.test_folder): 72 | shutil.rmtree(self.test_folder) 73 | 74 | def test_optimize_tss(self): 75 | opt.Helper = Mock_helper 76 | opt.Multiparser = Mock_multiparser 77 | opt.optimization = Mock_func().mock_optimization 78 | gen_file(os.path.join(self.gffs, "tmp", "test.gff"), "test") 79 | gen_file(os.path.join(self.fastas, "tmp", "test.fa"), "test") 80 | args = self.mock_args.mock() 81 | args.fastas = self.fastas 82 | args.gffs = self.gffs 83 | args.wigs = self.wigs 84 | args.tsspredator_path = "test" 85 | args.manuals = self.manuals 86 | gen_file(os.path.join(self.manuals, "tmp", "test.gff"), "test") 87 | args.output_folder = self.test_folder 88 | args.project_strain = "test" 89 | args.height = 9 90 | args.height_reduction = 9 91 | args.factor = 9 92 | args.factor_reduction = 9 93 | args.base_height = 9 94 | args.enrichment = 9 95 | args.processing = 9 96 | args.utr = 200 97 | args.libs = "test" 98 | args.replicate_name = "test" 99 | args.cluster = 2 100 | args.strain_lengths = {"test": 100} 101 | args.cores = 4 102 | args.program = "TSS" 103 | args.replicate = 2 104 | args.steps = 2000 105 | log = open(os.path.join(self.test_folder, "test.log"), "w") 106 | opt.optimize_tss(args, log) 107 | self.assertTrue(os.path.exists(os.path.join( 108 | self.test_folder, "test.csv"))) 109 | log.close() 110 | 111 | if __name__ == "__main__": 112 | unittest.main() 113 | 114 | -------------------------------------------------------------------------------- /annogesiclib/gff3.py: -------------------------------------------------------------------------------- 1 | import csv 2 | 3 | 4 | class Gff3Parser(object): 5 | """ 6 | A format description can be found at: 7 | http://genome.ucsc.edu/FAQ/FAQformat.html#format3 8 | http://www.sequenceontology.org/gff3.shtml 9 | 10 | a validator can be found here: 11 | http://modencode.oicr.on.ca/cgi-bin/validate_gff3_online 12 | 13 | WARNING: Currently this class in not strict enough and would also 14 | parse file not following the standard. 
15 | """ 16 | 17 | def entries(self, input_gff_fh): 18 | """ 19 | """ 20 | for entry_dict in csv.DictReader( 21 | input_gff_fh, delimiter="\t", 22 | fieldnames=["seq_id", "source", "feature", "start", 23 | "end", "score", "strand", "phase", "attributes"]): 24 | if entry_dict["seq_id"].startswith("#"): 25 | continue 26 | yield self._dict_to_entry(entry_dict) 27 | 28 | def _dict_to_entry(self, entry_dict): 29 | return Gff3Entry(entry_dict) 30 | 31 | 32 | class Gff3Entry(object): 33 | 34 | """ 35 | 36 | Example: 37 | start, end = sorted([int(pos) for pos in [start, end]]) 38 | Gff3Entry({ 39 | "seq_id" : seq_id, 40 | "source" : "MyLab", 41 | "feature" : "sRNA", 42 | "start" : start, 43 | "end" : end, 44 | "strand" : strand, 45 | "score" : ".", 46 | "phase" : ".", 47 | "attributes" : "name=%s;locus_tag=%s" % (name, locus_tag)}) 48 | """ 49 | 50 | def __init__(self, entry_dict): 51 | self.seq_id = entry_dict["seq_id"] 52 | self.source = entry_dict["source"] 53 | self.feature = entry_dict["feature"] 54 | # 1-based coordinates 55 | # Make sure that start <= end 56 | start, end = sorted([int(entry_dict["start"]), int(entry_dict["end"])]) 57 | self.start = start 58 | self.end = end 59 | self.score = entry_dict["score"] 60 | self.strand = entry_dict["strand"] 61 | self.phase = entry_dict["phase"] 62 | self.attributes = self._attributes(entry_dict["attributes"]) 63 | self.attribute_string = entry_dict["attributes"] 64 | self.info = "\t".join([str(field) for field in [ 65 | self.seq_id, self.source, self.feature, self.start, 66 | self.end, self.score, self.strand, self.phase, 67 | self.attribute_string]]) 68 | self.info_without_attributes = "\t".join([str(field) for field in [ 69 | self.seq_id, self.source, self.feature, self.start, 70 | self.end, self.score, self.strand, self.phase]]) 71 | 72 | def _attributes(self, attributes_string): 73 | """Translate the attribute string to dictionary""" 74 | attributes = {} 75 | if len(attributes_string) > 0: 76 | for attribute in attributes_string.split(";"): 77 | key_value_pair = attribute.split("=") 78 | key = key_value_pair[0] 79 | if len(key_value_pair) > 2: 80 | value = "=".join(key_value_pair[1:]) 81 | elif len(key_value_pair) == 2: 82 | value = key_value_pair[1] 83 | else: 84 | value = "" 85 | attributes[key] = value 86 | return attributes 87 | else: 88 | return attributes 89 | 90 | def add_attribute(self, key, value): 91 | self.attributes[key] = value 92 | self.attribute_string = ";".join( 93 | ["=".join(items) for items in self.attributes.items()]) 94 | 95 | def __str__(self): 96 | return "\t".join([str(field) for field in [ 97 | self.seq_id, self.source, self.feature, self.start, 98 | self.end, self.score, self.strand, self.phase, 99 | self.attribute_string]]) 100 | -------------------------------------------------------------------------------- /tests/test_paths.py: -------------------------------------------------------------------------------- 1 | import os 2 | import unittest 3 | import shutil 4 | import sys 5 | sys.path.append(".") 6 | from annogesiclib.paths import Paths 7 | 8 | 9 | class TestPaths(unittest.TestCase): 10 | 11 | def setUp(self): 12 | self.test_folder = "test_folder" 13 | if not os.path.exists(self.test_folder): 14 | os.mkdir(self.test_folder) 15 | self.paths = Paths(base_path=self.test_folder) 16 | self.folder_names = [ 17 | self.paths.input_folder, 18 | self.paths.output_folder, 19 | self.paths.reference_input_folder, 20 | self.paths.wig_folder, 21 | self.paths.mutation_table_folder, 22 | self.paths.database_folder, 23 | 
self.paths.manual_TSS_folder, 24 | self.paths.manual_pro_folder, 25 | self.paths.read_folder, 26 | self.paths.bam_folder, 27 | self.paths.target_folder, 28 | self.paths.ratt_folder, 29 | self.paths.tsspredator_folder, 30 | self.paths.utr_folder, 31 | self.paths.transterm_folder, 32 | self.paths.transcript_output_folder, 33 | self.paths.processing_site_folder, 34 | self.paths.srna_folder, 35 | self.paths.sorf_folder, 36 | self.paths.promoter_output_folder, 37 | self.paths.operon_output_folder, 38 | self.paths.circrna_output_folder, 39 | self.paths.goterm_output_folder, 40 | self.paths.starget_output_folder, 41 | self.paths.snp_output_folder, 42 | self.paths.ppi_output_folder, 43 | self.paths.sublocal_output_folder, 44 | self.paths.ribos_output_folder] 45 | 46 | def tearDown(self): 47 | if os.path.exists(self.test_folder): 48 | shutil.rmtree(self.test_folder) 49 | 50 | def test_set_folder_names(self): 51 | self.paths._set_folder_names() 52 | for folder_name in self.folder_names: 53 | assert(folder_name != '') 54 | self.assertEqual(self.folder_names.count(folder_name), 1) 55 | 56 | 57 | def test_required_folders(self): 58 | self.assertEqual(len(self.paths.required_folders("root")), 22) 59 | self.assertEqual(len( 60 | self.paths.required_folders("get_target_fasta")), 25) 61 | self.assertEqual(len(self.paths.required_folders("TSS")), 27) 62 | self.assertEqual(len(self.paths.required_folders("transcript")), 26) 63 | self.assertEqual(len(self.paths.required_folders("terminator")), 27) 64 | self.assertEqual(len( 65 | self.paths.required_folders("annotation_transfer")), 25) 66 | self.assertEqual(len(self.paths.required_folders("utr")), 29) 67 | self.assertEqual(len(self.paths.required_folders("promoter")), 23) 68 | self.assertEqual(len(self.paths.required_folders("operon")), 26) 69 | self.assertEqual(len(self.paths.required_folders("srna")), 37) 70 | self.assertEqual(len(self.paths.required_folders("sorf")), 30) 71 | self.assertEqual(len(self.paths.required_folders("processing")), 27) 72 | self.assertEqual(len(self.paths.required_folders("riboswitch")), 27) 73 | self.assertEqual(len(self.paths.required_folders("go_term")), 29) 74 | self.assertEqual(len(self.paths.required_folders("ppi_network")), 26) 75 | self.assertEqual(len(self.paths.required_folders("circrna")), 28) 76 | self.assertEqual(len(self.paths.required_folders("crispr")), 26) 77 | self.assertEqual(len(self.paths.required_folders("thermometer")), 27) 78 | self.assertEqual(len(self.paths.required_folders("snp")), 39) 79 | self.assertEqual(len( 80 | self.paths.required_folders("subcellular_localization")), 29) 81 | self.assertEqual(len(self.paths.required_folders("srna_target")), 29) 82 | 83 | if __name__ == "__main__": 84 | unittest.main() 85 | -------------------------------------------------------------------------------- /tests/test_operon.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import unittest 4 | import shutil 5 | from io import StringIO 6 | sys.path.append(".") 7 | from mock_helper import gen_file 8 | import annogesiclib.operon as op 9 | from annogesiclib.operon import OperonDetection 10 | from mock_args_container import MockClass 11 | 12 | 13 | class Mock_func(object): 14 | 15 | def mock_operon(self, tran, tss, gff, term, tss_fuzzy, 16 | term_fuzzy, length, out_table, out_gff): 17 | gen_file(out_table, "test") 18 | 19 | def mock_stat(self, table, out_stat): 20 | gen_file(out_stat, "test") 21 | 22 | def mock_combine_gff(self, gff, tran, tss, utr5, utr3, 
term, 23 | tss_fuzzy, term_fuzzy, out_file): 24 | gen_file(out_file, "test") 25 | 26 | class TestOperonDetection(unittest.TestCase): 27 | 28 | def setUp(self): 29 | self.test_folder = "test_folder" 30 | self.mock_args = MockClass() 31 | self.mock = Mock_func() 32 | self.tsss = os.path.join(self.test_folder, "tsss") 33 | self.trans = os.path.join(self.test_folder, "trans") 34 | self.utr5s = os.path.join(self.test_folder, "utr5s") 35 | self.utr3s = os.path.join(self.test_folder, "utr3s") 36 | self.output = os.path.join(self.test_folder, "output") 37 | self.gffs = os.path.join(self.test_folder, "gffs") 38 | self.out_gff = os.path.join(self.output, "gffs") 39 | self.stat = os.path.join(self.test_folder, "stat") 40 | if (not os.path.exists(self.test_folder)): 41 | os.mkdir(self.test_folder) 42 | os.mkdir(self.gffs) 43 | os.mkdir(self.tsss) 44 | os.mkdir(self.stat) 45 | os.mkdir(os.path.join(self.tsss, "tmp")) 46 | os.mkdir(self.trans) 47 | os.mkdir(os.path.join(self.trans, "tmp")) 48 | os.mkdir(self.utr5s) 49 | os.mkdir(os.path.join(self.utr5s, "tmp")) 50 | os.mkdir(self.utr3s) 51 | os.mkdir(os.path.join(self.utr3s, "tmp")) 52 | os.mkdir(self.output) 53 | os.mkdir(self.out_gff) 54 | os.mkdir(os.path.join(self.output, "tables")) 55 | args = self.mock_args.mock() 56 | args.tsss = self.tsss 57 | args.trans = self.trans 58 | args.utr5s = self.utr5s 59 | args.utr3s = self.utr3s 60 | args.output_folder = self.output 61 | args.terms = None 62 | self.operon = OperonDetection(args) 63 | 64 | def tearDown(self): 65 | if os.path.exists(self.test_folder): 66 | shutil.rmtree(self.test_folder) 67 | 68 | def test_detect_operon(self): 69 | op.operon = self.mock.mock_operon 70 | gen_file(os.path.join(self.tsss, "tmp", "test_TSS.gff"), "test") 71 | gen_file(os.path.join(self.trans, "tmp", 72 | "test_transcript.gff"), "test") 73 | gen_file(os.path.join(self.gffs, "test.gff"), "test") 74 | args = self.mock_args.mock() 75 | args.gffs = self.out_gff 76 | args.term_fuzzy = 3 77 | args.tss_fuzzy = 3 78 | args.length = 100 79 | log = open(os.path.join(self.test_folder, "test.log"), "w") 80 | self.operon._detect_operon(["test"], args, log) 81 | self.assertTrue(os.path.exists(os.path.join(self.output, "tables", 82 | "test_operon.csv"))) 83 | log.close() 84 | 85 | def test_stat(self): 86 | op.stat = self.mock.mock_stat 87 | table_file = os.path.join(self.output, "tables", "test_operon.csv") 88 | log = open(os.path.join(self.test_folder, "test.log"), "w") 89 | if not os.path.exists(table_file): 90 | gen_file(table_file, "test") 91 | self.operon._stat(os.path.join(self.output, "tables"), self.stat, log) 92 | self.assertTrue(os.path.exists(os.path.join( 93 | self.stat, "stat_test_operon.csv"))) 94 | log.close() 95 | 96 | if __name__ == "__main__": 97 | unittest.main() 98 | 99 | -------------------------------------------------------------------------------- /annogesiclib/get_input.py: -------------------------------------------------------------------------------- 1 | import os 2 | import csv 3 | import shutil 4 | from subprocess import call 5 | from annogesiclib.seq_editer import SeqEditer 6 | 7 | 8 | def wget(input_folder, ftp, files_type, log): 9 | log.write("\t" + " ".join(["wget", "-cP", input_folder, ftp + "/*" + files_type]) + "\n") 10 | os.system(" ".join(["wget", "-cP", input_folder, ftp + "/*" + files_type])) 11 | log.write("Done!\n") 12 | 13 | def deal_detect(input_file, file_path, change, input_folder): 14 | '''deal with the header of fasta file and 15 | put the files to corresponding folders''' 16 | if change: 
17 | shutil.move(input_file, file_path) 18 | change = False 19 | SeqEditer().modify_header(file_path) 20 | with open(os.path.join(file_path)) as fh: 21 | for line in fh: 22 | line = line.strip() 23 | if line.startswith(">"): 24 | seq_name = line[1:] 25 | shutil.move(file_path, 26 | os.path.join(input_folder, seq_name + ".fa")) 27 | return change, seq_name 28 | 29 | 30 | def get_file(ftp, input_folder, files_type, log): 31 | checks = {"detect": False, "change": None} 32 | filename = None 33 | files = [] 34 | wget(input_folder, ftp, files_type, log) 35 | for file_ in os.listdir(input_folder): 36 | input_file = os.path.join(input_folder, file_) 37 | if (file_[-3:] == "fna"): 38 | filename = file_[0:-3] + "fa" 39 | checks = {"detect": True, "change": True} 40 | elif (file_[-5:] == "fasta"): 41 | filename = file_[0:-5] + "fa" 42 | checks = {"detect": True, "change": True} 43 | elif (file_[-2:] == "fa"): 44 | filename = file_[0:-2] + "fa" 45 | checks = {"detect": True, "change": True} 46 | elif (file_[-6:] == "fna.gz") and ("_genomic" in file_): 47 | if ("_cds_from_genomic" in file_) or ( 48 | "_rna_from_genomic" in file_): 49 | os.remove(input_file) 50 | else: 51 | filename = file_[0:-6] + "fa" 52 | checks = {"detect": True, "change": True} 53 | log.write("\tgunzip " + input_file + "\n") 54 | call(["gunzip", input_file]) 55 | input_file = input_file[:-3] 56 | elif (file_[-6:] == "gff.gz") or (file_[-3:] == "gff"): 57 | if ("_genomic" in file_) and (file_[-6:] == "gff.gz"): 58 | log.write("\tgunzip " + input_file + "\n") 59 | call(["gunzip", input_file]) 60 | input_file = input_file[:-3] 61 | fh = open(input_file, "r") 62 | for row in csv.reader(fh, delimiter='\t'): 63 | if not row[0].startswith("#"): 64 | gff_name = row[0] 65 | break 66 | shutil.move(input_file, os.path.join(input_folder, 67 | gff_name + ".gff")) 68 | fh.close() 69 | elif (file_[-3:] == "gbk") or (file_[-7:] == "gbff.gz") or ( 70 | file_[-4:] == "gbff"): 71 | if (file_[-7:] == "gbff.gz") and ("_genomic" in file_): 72 | log.write("\tgunzip " + input_file + "\n") 73 | call(["gunzip", input_file]) 74 | input_file = input_file[:-3] 75 | with open(input_file, "r") as g_f: 76 | for line in g_f: 77 | line = line.strip() 78 | if line.startswith("VERSION"): 79 | for data in line.split(" "): 80 | if (len(data) != 0) and (data != "VERSION"): 81 | break 82 | break 83 | print(os.path.join(input_folder, data + ".gbk")) 84 | shutil.move(input_file, os.path.join(input_folder, data + ".gbk")) 85 | if checks["detect"]: 86 | checks["detect"] = False 87 | checks["change"], seq_name = deal_detect( 88 | input_file, filename, checks["change"], input_folder) 89 | -------------------------------------------------------------------------------- /tests/test_plot_TSS_venn.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import unittest 4 | import shutil 5 | from io import StringIO 6 | sys.path.append(".") 7 | from mock_gff3 import Create_generator 8 | from mock_helper import gen_file 9 | import annogesiclib.plot_TSS_venn as ptv 10 | 11 | 12 | class Mock_func(object): 13 | 14 | def mock_plot_text(plt, xy1, xy2, tss_type, size, color_text): 15 | pass 16 | 17 | class TestPlotTSSVenn(unittest.TestCase): 18 | 19 | def setUp(self): 20 | self.test_folder = "test_folder" 21 | if (not os.path.exists(self.test_folder)): 22 | os.mkdir(self.test_folder) 23 | self.mock = Mock_func() 24 | self.example = Example() 25 | 26 | def tearDown(self): 27 | if os.path.exists(self.test_folder): 28 | 
shutil.rmtree(self.test_folder) 29 | 30 | def test_check_tss_class(self): 31 | strain = "test" 32 | tss_type = "Internal" 33 | total_types = {"test": {}} 34 | ptv.check_tss_class(total_types, strain, 35 | self.example.tsss[0], tss_type) 36 | self.assertDictEqual(total_types, {'test': {'Internal': 0}}) 37 | tss_type = "Primary" 38 | ptv.check_tss_class(total_types, strain, 39 | self.example.tsss[0], tss_type) 40 | self.assertDictEqual(total_types, {'test': { 41 | 'Internal': 0, 'Primary': 1}}) 42 | 43 | def test_import_types(self): 44 | tsss = {"test": self.example.tsss} 45 | types, total_types = ptv.import_types(tsss) 46 | self.assertDictEqual(types, {'test': {'Orphan': 1, 'Internal': 1, 47 | 'Primary': 1}, 'all': {}}) 48 | self.assertDictEqual(total_types, { 49 | 'test': {'Orphan': 1, 'Antisense': 0, 50 | 'Secondary': 0, 'Internal': 1, 51 | 'Primary': 1}, 'all': {}}) 52 | 53 | def test_read_gff(self): 54 | tss_file = os.path.join(self.test_folder, "test.gff") 55 | gen_file(tss_file, self.example.tss_file) 56 | tsss, tss_num = ptv.read_gff(tss_file) 57 | self.assertEqual(tsss["all"][0].start, 140) 58 | self.assertEqual(tsss["aaa"][0].start, 140) 59 | self.assertDictEqual(tss_num, {'all': 1, 'aaa': 1}) 60 | 61 | def test_plot(self): 62 | types = {'test': {'Orphan': 1, 'Internal': 1, 63 | 'Primary': 1}, 'all': {}} 64 | total_types = {'test': {'Orphan': 1, 'Antisense': 0, 65 | 'Secondary': 0, 'Internal': 1, 66 | 'Primary': 1}, 'all': {}} 67 | tss_num = {'all': 0, 'test': 3} 68 | ptv.plot(types, "TSS", "TSS", total_types, tss_num) 69 | self.assertTrue(os.path.exists("TSS_venn_test.png")) 70 | os.remove("TSS_venn_test.png") 71 | 72 | class Example(object): 73 | 74 | tss_dict = [ 75 | {"seq_id": "aaa", "source": "Refseq", "feature": "TSS", "start": 140, 76 | "end": 140, "phase": ".", "strand": "+", "score": "."}, 77 | {"seq_id": "aaa", "source": "Refseq", "feature": "TSS", "start": 230, 78 | "end": 230, "phase": ".", "strand": "+", "score": "."}, 79 | {"seq_id": "bbb", "source": "Refseq", "feature": "TSS", "start": 5166, 80 | "end": 5166, "phase": ".", "strand": "-", "score": "."}] 81 | attributes_tss = [{"ID": "tss0", "Name": "TSS_0", "type": "Primary", 82 | "associated_gene": "AAA_00001"}, 83 | {"ID": "tss1", "Name": "TSS_1", "type": "Internal", 84 | "associated_gene": "AAA_00002"}, 85 | {"ID": "tss2", "Name": "TSS_2", "type": "Orphan", 86 | "associated_gene": "orphan"}] 87 | tsss = [] 88 | for index in range(0, 3): 89 | tsss.append(Create_generator( 90 | tss_dict[index], attributes_tss[index], "gff")) 91 | tss_file = """aaa\tRefseq\tTSS\t140\t140\t.\t+\t.\tID=TSS_0;Name=TSS_00000;associated_gene=AAA_00001;type=Primary""" 92 | 93 | if __name__ == "__main__": 94 | unittest.main() 95 | 96 | -------------------------------------------------------------------------------- /annogesiclib/color_png.py: -------------------------------------------------------------------------------- 1 | import os 2 | from subprocess import call 3 | from annogesiclib.gen_svg import gen_svg 4 | from annogesiclib.helper import Helper 5 | 6 | 7 | class ColorPNG(object): 8 | 9 | def _convert_svg(self, imagemagick_path, out_path, screenshot, svg_file, log): 10 | call([imagemagick_path, 11 | os.path.join(out_path, screenshot), 12 | os.path.join(out_path, svg_file)]) 13 | log.write("\t" + " ".join([imagemagick_path, 14 | os.path.join(out_path, screenshot), 15 | os.path.join(out_path, svg_file)]) + "\n") 16 | 17 | def _convert_png(self, imagemagick_path, out_path, screenshot, png_file, log): 18 | call([imagemagick_path, 
"-background", "none", 19 | os.path.join(out_path, screenshot), 20 | os.path.join(out_path, png_file)]) 21 | log.write("\t" + " ".join([imagemagick_path, "-background", "none", 22 | os.path.join(out_path, screenshot), 23 | os.path.join(out_path, png_file)]) + "\n") 24 | 25 | def generate_color_png(self, track_num, out_folder, imagemagick_path, log): 26 | '''generation of color png based on tracks''' 27 | out_folder = os.path.join(out_folder, "screenshots") 28 | for strain in os.listdir(out_folder): 29 | if os.path.isdir(os.path.join(out_folder, strain)): 30 | for strand in ["forward", "reverse"]: 31 | print("Running for {0}_{1}".format(strain, strand)) 32 | out_path = os.path.join(out_folder, strain, strand) 33 | # convert original png to svg and give color on it. 34 | log.write("Converting png file in {0} to svg.\n".format( 35 | out_path)) 36 | log.write("Colorizing svg files.\n" 37 | "Make sure the version of ImageMagick is " 38 | "at least 6.9.0-0.\n") 39 | for screenshot in os.listdir(out_path): 40 | if screenshot.endswith(".png"): 41 | print("Converting {0} to svg files and " 42 | "Painting tracks now".format( 43 | screenshot)) 44 | svg_file = screenshot.replace(".png", ".svg") 45 | self._convert_svg(imagemagick_path, out_path, 46 | screenshot, svg_file, log) 47 | with open(os.path.join( 48 | out_path, svg_file), "r") as f_h: 49 | for line in f_h: 50 | line = line.strip() 51 | if line.startswith(" norm.start: 9 | frag.start = norm.start 10 | norm.attributes["print"] = True 11 | frag.attributes["print"] = True 12 | 13 | 14 | def print_file(data, out, name, num): 15 | attributes = {} 16 | attributes["ID"] = data.seq_id + "_transcript" + str(num) 17 | attributes["Name"] = "transcript_" + name 18 | attributes["detect_lib"] = data.attributes["detect_lib"] 19 | attribute_string = ";".join(["=".join(items) 20 | for items in attributes.items()]) 21 | out.write("\t".join([str(field) for field in [ 22 | data.seq_id, data.source, data.feature, data.start, 23 | data.end, data.score, data.strand, data.phase, 24 | attribute_string]]) + "\n") 25 | 26 | 27 | def store(data, source, finals): 28 | data.attributes["detect_lib"] = source 29 | data.attributes["print"] = False 30 | finals.append(data) 31 | 32 | 33 | def compare(data1, data2, overlap, tolerance): 34 | '''search the sRNA which can be detected in frag and tex libs. 
35 | Then, try to merge them to be a longer one''' 36 | if (data1.seq_id == data2.seq_id) and (data1.strand == data2.strand): 37 | if (data1.start <= (data2.end + tolerance)) and ( 38 | data1.start >= data2.start) and ( 39 | data1.end >= (data2.end + tolerance)): 40 | modify_position(data1, data2) 41 | overlap = True 42 | elif (data1.end >= (data2.start - tolerance)) and ( 43 | data1.end <= data2.end) and ( 44 | data1.start <= (data2.start - tolerance)): 45 | modify_position(data1, data2) 46 | overlap = True 47 | elif (data1.start <= data2.start) and ( 48 | data1.end >= data2.end): 49 | modify_position(data1, data2) 50 | overlap = True 51 | elif (data2.start <= data1.start) and ( 52 | data2.end >= data1.end): 53 | modify_position(data1, data2) 54 | overlap = True 55 | return overlap 56 | 57 | 58 | def combine(frag_file, tex_file, tolerance, output_file): 59 | '''merge the results of sRNA which detected by fragmented and dRNA''' 60 | frags = [] 61 | norms = [] 62 | finals = [] 63 | out = open(output_file, "w") 64 | out.write("##gff-version 3\n") 65 | f_h = open(frag_file, "r") 66 | for entry in Gff3Parser().entries(f_h): 67 | entry.attributes["print"] = False 68 | frags.append(entry) 69 | f_h.close() 70 | n_h = open(tex_file, "r") 71 | for entry in Gff3Parser().entries(n_h): 72 | entry.attributes["print"] = False 73 | norms.append(entry) 74 | n_h.close() 75 | sort_frags = sorted(frags, key=lambda k: (k.seq_id, k.start, 76 | k.end, k.strand)) 77 | sort_norms = sorted(norms, key=lambda k: (k.seq_id, k.start, 78 | k.end, k.strand)) 79 | for frag in sort_frags: 80 | overlap = False 81 | for norm in sort_norms: 82 | overlap = compare(frag, norm, overlap, tolerance) 83 | if overlap: 84 | store(frag, "fragmented,tex_notex", finals) 85 | else: 86 | store(frag, "fragmented", finals) 87 | for norm in sort_norms: 88 | if not norm.attributes["print"]: 89 | store(norm, "tex_notex", finals) 90 | sort_finals = sorted(finals, key=lambda k: (k.seq_id, k.start, 91 | k.end, k.strand)) 92 | num = 0 93 | for tar in sort_finals: 94 | if tar.attributes["print"]: 95 | continue 96 | overlap = False 97 | for ref in sort_finals: 98 | overlap = compare(tar, ref, overlap, tolerance) 99 | name = '%0*d' % (5, num) 100 | print_file(tar, out, name, num) 101 | num += 1 102 | out.close() 103 | -------------------------------------------------------------------------------- /annogesiclib/stat_operon.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import itertools 3 | 4 | 5 | def _boolean(data): 6 | if data == "False": 7 | result = False 8 | else: 9 | result = True 10 | return result 11 | 12 | 13 | def row_to_location(row): 14 | if row[4] == "0": 15 | sub = False 16 | nosub = True 17 | else: 18 | sub = True 19 | nosub = False 20 | tss = _boolean(row[6]) 21 | term = _boolean(row[8]) 22 | return {"have no sub-operons": nosub, "have sub-operons": sub, 23 | "start with tss": tss, "stop with terminator": term} 24 | 25 | 26 | def plus_num(num_total, strain, type_): 27 | num_total["total"][type_] += 1 28 | num_total[strain][type_] += 1 29 | num_total["total"]["total"] += 1 30 | num_total[strain]["total"] += 1 31 | 32 | 33 | def print_stat(operons, total_num, class_operon, out): 34 | num_features = {} 35 | out.write("Total number of operons is {0}\n".format(total_num)) 36 | out.write("The sub operon and features:\n") 37 | for operon in operons: 38 | for it in range(1, 5): 39 | for features in itertools.combinations(operon.keys(), it): 40 | check_key = 0 41 | for key in features: 
42 | if operon[key]: 43 | if it == 1: 44 | if key in num_features.keys(): 45 | num_features[key] += 1 46 | else: 47 | num_features[key] = 1 48 | check_key += 1 49 | if (check_key == it) and (it != 1): 50 | key = " and ".join(features) 51 | if key in num_features.keys(): 52 | num_features[key] += 1 53 | else: 54 | num_features[key] = 1 55 | for key, value in num_features.items(): 56 | out.write("\tthe number of operons which {0} = {1} ({2})\n".format( 57 | key, value, float(value) / float(total_num))) 58 | out.write("mono/polycistronic:\n") 59 | out.write("\tmonocistronic: {0} ({1})\n".format( 60 | class_operon["mono"], 61 | float(class_operon["mono"]) / float(class_operon["total"]))) 62 | out.write("\tpolycistronic: {0} ({1})\n".format( 63 | class_operon["poly"], 64 | float(class_operon["poly"]) / float(class_operon["total"]))) 65 | 66 | 67 | def stat(input_file, out_file): 68 | out = open(out_file, "w") 69 | operons = {} 70 | operons_all = [] 71 | tmp_id = "" 72 | f_h = open(input_file, "r") 73 | pre_seq_id = "" 74 | total_num = {} 75 | total_num_all = 0 76 | class_operon = {} 77 | class_operon["total"] = {"na": 0, "mono": 0, "poly": 0, "total": 0} 78 | for row in csv.reader(f_h, delimiter="\t"): 79 | if row[0] != "Operon_ID": 80 | if row[0] != tmp_id: 81 | if pre_seq_id != row[1]: 82 | pre_seq_id = row[1] 83 | operons[row[1]] = [] 84 | total_num[row[1]] = 0 85 | class_operon[row[1]] = {"na": 0, "mono": 0, 86 | "poly": 0, "total": 0} 87 | operons[row[1]].append(row_to_location(row)) 88 | operons_all.append(row_to_location(row)) 89 | total_num[row[1]] += 1 90 | total_num_all += 1 91 | if row[-1] == "NA": 92 | plus_num(class_operon, row[1], "na") 93 | elif len(row[-1].split(",")) == 1: 94 | plus_num(class_operon, row[1], "mono") 95 | elif len(row[-1].split(",")) > 1: 96 | plus_num(class_operon, row[1], "poly") 97 | tmp_id = row[0] 98 | if len(operons) > 1: 99 | out.write("All genomes:\n") 100 | print_stat(operons_all, total_num_all, class_operon["total"], out) 101 | for strain in operons.keys(): 102 | out.write("\n" + strain + ":\n") 103 | print_stat(operons[strain], total_num[strain], 104 | class_operon[strain], out) 105 | out.close() 106 | f_h.close() 107 | --------------------------------------------------------------------------------
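
The two annogesiclib modules shown above expose their entry points as plain functions, so they can be driven directly once the package is importable. The sketch below is a minimal, hypothetical driver (the file names are placeholders, not files shipped with the repository): combine() from annogesiclib/combine_frag_tex.py merges transcripts detected in the fragmented and dRNA (TEX+/-) libraries within a positional tolerance, and stat() from annogesiclib/stat_operon.py summarises a tab-separated operon table produced elsewhere in the pipeline.

# Hypothetical driver sketch; file names are placeholders and the
# annogesiclib package from this repository must be on the Python path.
from annogesiclib.combine_frag_tex import combine
from annogesiclib.stat_operon import stat

# Merge transcripts from the fragmented and dRNA (TEX+/-) libraries.
# Entries on the same strand of the same sequence that overlap within the
# tolerance (in nucleotides) are fused into one longer transcript, and the
# detect_lib attribute of each output entry records the supporting libraries.
combine("transcript_fragment.gff",     # placeholder: fragmented-library transcripts
        "transcript_tex_notex.gff",    # placeholder: dRNA-library transcripts
        tolerance=5,
        output_file="transcript_merged.gff")

# Summarise an operon table (tab-separated, header row starting with
# "Operon_ID"), counting sub-operons, TSS/terminator boundaries and
# mono-/polycistronic operons per genome.
stat("operon_table.csv", "stat_operon.txt")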