├── .gitignore ├── MANIFEST.in ├── tests ├── mock_args_container.py ├── test_projectcreator.py ├── test_change_db_format.py ├── test_plot_coverage_table.py ├── mock_helper.py ├── test_seqmodifier.py ├── test_splice_parser.py ├── mock_gff3.py ├── test_plot_mountain.py ├── test_TSSpredator.py ├── uni_report.py ├── test_modify_rbs_table.py ├── test_parser_wig.py ├── test_gff3.py ├── test_gen_table_tran.py ├── test_seq_editer.py ├── test_color_png.py ├── test_filter_TSS_pro.py ├── test_blast_class.py ├── test_goterm.py ├── test_expresssion.py ├── test_gen_svg.py ├── test_stat_TSSpredater.py ├── test_meme.py ├── test_stat_sublocal.py ├── test_optimize.py ├── test_paths.py ├── test_operon.py ├── test_plot_TSS_venn.py ├── test_sORF_intergenic.py └── test_compare_sRNA_sORF.py ├── docs └── source │ ├── logo │ ├── logo_annogesic.pdf │ ├── logo_annogesic.png │ ├── READemption_logo.png │ └── annogesic_logo_white.png │ ├── license.rst │ ├── docker.rst │ └── installation.rst ├── run_test.py ├── tutorial_data ├── mutation.csv └── replace_seq_id.py ├── CITATION.cff ├── annogesiclib ├── change_db_format.py ├── splice_parser.py ├── get_Rfam_ribo.py ├── print_rank_all.py ├── projectcreator.py ├── sRNA_filter_min_utr.py ├── extract_sec_info.py ├── plot_tran.py ├── blast_class.py ├── filter_TSS_pro.py ├── seqmodifier.py ├── map_ribos.py ├── plot_mountain.py ├── TSSpredator_parser.py ├── plot_coverage_table.py ├── sRNA_filter_frag.py ├── modify_rbs_table.py ├── parser_wig.py ├── gen_promoter_table.py ├── output_cutoff_table.py ├── rbs_overlap.py ├── reorganize_table.py ├── lib_reader.py ├── check_srna_overlap.py ├── gen_svg.py ├── overlap.py ├── compare_sRNA_sORF.py ├── compare_srna_promoter.py ├── sRNA_antisense.py ├── screen.py ├── gff3.py ├── get_input.py ├── color_png.py ├── expression.py ├── combine_frag_tex.py └── stat_operon.py ├── database ├── Rfam_RNA_thermometer_ID.csv └── Rfam_riboswitch_ID.csv ├── benchmark_sRNAs ├── Campylobacter.csv └── Helicobacter.csv ├── LICENSE ├── comparison ├── README.md ├── compare_TSS_Mendoza_Vargas.py ├── compare_term_ecocyc.py ├── compare_sORF.py ├── compare_promoter_regulondb.py ├── compare_tran.py ├── compare_term_regulon.py ├── compare_TSS_Salgado.py ├── compare_operon_regulondb.py ├── compare_srna.py ├── compare_operon_door.py └── gff3.py ├── setup.py ├── Makefile └── Table_dependency_version.txt /.gitignore: -------------------------------------------------------------------------------- 1 | *pyc 2 | *~ 3 | __pycache__ 4 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | #documentation 2 | recursive-include html * 3 | 4 | #Misc 5 | include LICENSE 6 | -------------------------------------------------------------------------------- /tests/mock_args_container.py: -------------------------------------------------------------------------------- 1 | class MockClass(object): 2 | 3 | def mock(self): 4 | return self 5 | -------------------------------------------------------------------------------- /docs/source/logo/logo_annogesic.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sung-Huan/ANNOgesic/HEAD/docs/source/logo/logo_annogesic.pdf -------------------------------------------------------------------------------- /docs/source/logo/logo_annogesic.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Sung-Huan/ANNOgesic/HEAD/docs/source/logo/logo_annogesic.png -------------------------------------------------------------------------------- /docs/source/logo/READemption_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sung-Huan/ANNOgesic/HEAD/docs/source/logo/READemption_logo.png -------------------------------------------------------------------------------- /docs/source/logo/annogesic_logo_white.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sung-Huan/ANNOgesic/HEAD/docs/source/logo/annogesic_logo_white.png -------------------------------------------------------------------------------- /run_test.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | all_tests = unittest.TestLoader().discover("./tests") 3 | unittest.TextTestRunner(verbosity=1).run(all_tests) 4 | -------------------------------------------------------------------------------- /tutorial_data/mutation.csv: -------------------------------------------------------------------------------- 1 | #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT BAM 2 | NC_009839.1 3 . g c . . . . . 3 | NC_009839.1 6 . t - . . . . . 4 | NC_009839.1 600 . - g . . . . . 5 | -------------------------------------------------------------------------------- /CITATION.cff: -------------------------------------------------------------------------------- 1 | cff-version: 1.0.3 2 | message: If you use ANNOgesic, please cite it as below. 3 | authors: 4 | - family-names: Yu 5 | given-names: Sung-Huan 6 | orcid: https://orcid.org/0000-0001-7955-8645 7 | - family-names: Vogel 8 | given-names: Jörg 9 | orcid: https://orcid.org/0000-0003-2220-1404 10 | - family-names: Förstner 11 | given-names: Konrad Ulrich 12 | orcid: https://orcid.org/0000-0002-1481-2996 13 | title: ANNOgesic 14 | version: 1.0.15 15 | doi: 10.1093/gigascience/giy096 16 | date-released: 2018-09-03 17 | -------------------------------------------------------------------------------- /annogesiclib/change_db_format.py: -------------------------------------------------------------------------------- 1 | def change_format(input_file, output_file): 2 | '''change the format of sRNA database''' 3 | num = 1 4 | out = open(output_file, "w") 5 | with open(input_file) as f_h: 6 | for line in f_h: 7 | line = line.strip() 8 | if line.startswith(">"): 9 | datas = line.split("|") 10 | if datas[0][1:] == "NA": 11 | datas[0] = ">srn_" + str(num) 12 | num += 1 13 | out.write("|".join(datas[:3]) + "\n") 14 | else: 15 | out.write(line + "\n") 16 | out.close() 17 | -------------------------------------------------------------------------------- /database/Rfam_RNA_thermometer_ID.csv: -------------------------------------------------------------------------------- 1 | #Rfam_ID Name Description 2 | RF01795 FourU FourU thermometer RNA element 3 | RF02358 hsp17 Hsp17 thermometer 4 | RF01832 ROSE_2 Repression of heat shock gene expression ROSE element 5 | RF02523 ROSE_3 Repression of heat shock gene expression ROSE element 6 | RF00038 PrfA PrfA thermoregulator UTR 7 | RF00433 Hsp90_CRE Hsp90 cis regulatory element 8 | RF00435 ROSE Repression of heat shock gene expression ROSE element 9 | RF01766 cspA cspA thermoregulator 10 | RF01859 Phe_leader Phenylalanine leader peptide 11 | RF01804 Lambda_thermo Lambda phage CIII thermoregulator element 12 | 
-------------------------------------------------------------------------------- /benchmark_sRNAs/Campylobacter.csv: -------------------------------------------------------------------------------- 1 | start end strand 2 | 75877 75984 + 3 | 100738 100833 + 4 | 248102 248257 - 5 | 427729 427864 + 6 | 439584 - + 7 | 518348 518664 - 8 | 650864 650959 + 9 | 681025 681305 - 10 | 879931 880026 + 11 | 879955 880052 - 12 | 996243 996319 + 13 | 1148713 1148849 + 14 | 1200515 1200700 + 15 | 1209854 1209926 + 16 | 1293194 1293552 - 17 | 1440760 1440797 + 18 | 1440826 1440863 + 19 | 1440893 1440930 + 20 | 1440958 1440995 + 21 | 1441025 1441062 + 22 | 1441090 1441127 + 23 | 1441156 1441193 + 24 | 1441289 1441362 + 25 | 1542619 1542645 + 26 | 1563092 1563246 + 27 | 1563121 1563337 - 28 | 1568600 1568750 + 29 | 1624613 1624711 - 30 | - 671301 - 31 | 174436 - + 32 | 947560 - + 33 | -------------------------------------------------------------------------------- /docs/source/license.rst: -------------------------------------------------------------------------------- 1 | License 2 | ========== 3 | 4 | ANNOgesic is open source software and available under the ISC license. 5 | 6 | Copyright (c) 2013-2020, Sung-Huan Yu 7 | 8 | Permission to use, copy, modify, and/or distribute this software for any purpose with or without fee is hereby granted, provided that the above copyright notice and this permission notice appear in all copies. 9 | 10 | THE SOFTWARE IS PROVIDED “AS IS” AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 11 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | ANNOgesic is open source software and available under the ISC license. 2 | 3 | Copyright (c) 2013-2023, Sung-Huan Yu 4 | Konrad Förstner 5 | 6 | Permission to use, copy, modify, and/or distribute this software for any purpose with or without fee is hereby granted, provided that the above copyright notice and this permission notice appear in all copies. 7 | 8 | THE SOFTWARE IS PROVIDED “AS IS” AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
9 | -------------------------------------------------------------------------------- /annogesiclib/splice_parser.py: -------------------------------------------------------------------------------- 1 | import csv 2 | 3 | 4 | class SpliceParser(object): 5 | '''parse the splice data of segemehl''' 6 | 7 | def parser(self, splice_fh): 8 | for row in csv.reader(splice_fh, delimiter="\t"): 9 | yield assign_value(row) 10 | 11 | 12 | class assign_value(object): 13 | 14 | def __init__(self, row): 15 | self.strain = row[0] 16 | self.start = int(row[1]) 17 | self.end = int(row[2]) 18 | self.splice = row[3] 19 | splice = row[3].split(":") 20 | self.supported_reads = int(splice[1]) 21 | self.start_site_reads = int(splice[2]) 22 | self.end_site_reads = int(splice[3]) 23 | self.splice_type = splice[4] 24 | self.situation = splice[5] 25 | self.strand = row[5] 26 | self.info = ("\t".join(row)) 27 | 28 | def __str__(self): 29 | return "{0} {1} {2} {3} {4}".format( 30 | self.strain, self.start, self.end, self.splice, self.strand) 31 | -------------------------------------------------------------------------------- /comparison/README.md: -------------------------------------------------------------------------------- 1 | The scripts for comparison between ANNOgesic predictions and several databases 2 | ------------------------------------------------------------------------------ 3 | 4 | 1. Please download the data from RegulonDB (http://regulondb.ccg.unam.mx/menu/download/datasets/index.jsp), 5 | EcoCyc (https://ecocyc.org/site-search.shtml) or DOOR2 (http://csbl.bmb.uga.edu/DOOR/index.php). 6 | 7 | 2. In order to make the comparison more reliable, please remove the non-expressed 8 | features from the databases. Otherwise the performance will be biased by the 9 | non-expressed features. 10 | 11 | 3. For terminators, we also suggest removing the terminators in the databases 12 | which do not show a significant decrease of coverage. 13 | 14 | 4. For sORFs, we used the study of Hemm et al. (2010) as the benchmarking set. Please 15 | convert the sORF information to GFF3 format (a conversion sketch is shown below). 16 | 17 | 5. The numbers of CRISPRs and riboswitches in the databases are small, so the manual 18 | comparison can be done easily. Thus, no scripts for these comparisons are provided.
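   A minimal conversion sketch (not part of ANNOgesic) is given below. It assumes the benchmarking set is a plain table with `start end strand` columns, like the files in `benchmark_sRNAs/`; the sequence ID, feature name and attribute keys are placeholders chosen for illustration.

```python
#!/usr/bin/python
# Hypothetical helper: convert a "start end strand" table into minimal GFF3
# records so that they can be read by Gff3Parser in the comparison scripts.
# The column layout and attribute keys are assumptions, not ANNOgesic output.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("-i", "--input_table", help="table with start, end and strand columns")
parser.add_argument("-n", "--seq_id", help="sequence ID for the first GFF3 column")
parser.add_argument("-o", "--output_gff", help="output GFF3 file")
args = parser.parse_args()

def main():
    num = 0
    with open(args.input_table) as in_fh, open(args.output_gff, "w") as out_fh:
        out_fh.write("##gff-version 3\n")
        for line in in_fh:
            fields = line.split()
            # skip the header line and entries without complete coordinates
            if (len(fields) < 3) or (fields[0] == "start") or ("-" in fields[:2]):
                continue
            num += 1
            attributes = "ID=sorf_{0};Name=sORF_{0}".format(num)
            out_fh.write("\t".join([args.seq_id, "benchmark", "sORF",
                                    fields[0], fields[1], ".", fields[2],
                                    ".", attributes]) + "\n")

if __name__ == "__main__":
    main()
```

   The resulting GFF3 file can then be passed to compare_sORF.py via its -k/--benchmark_file option.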
19 | -------------------------------------------------------------------------------- /annogesiclib/get_Rfam_ribo.py: -------------------------------------------------------------------------------- 1 | import csv 2 | 3 | 4 | def rbs_from_rfam(ribo_table, rfam_file, out_file): 5 | ribos = [] 6 | out = open(out_file, "w") 7 | f_h = open(ribo_table, "r") 8 | for row in csv.reader(f_h, delimiter="\t"): 9 | if not row[0].startswith("#"): 10 | ribos.append(row[0].strip()) 11 | detect = False 12 | with open(rfam_file, "r") as r_h: 13 | for line in r_h: 14 | line = line.rstrip("\n") 15 | datas = line.split(" ") 16 | if ("INFERNAL" in datas[0]) or ( 17 | "HMMER" in datas[0]): 18 | header = line 19 | detect = False 20 | elif "NAME" in datas[0]: 21 | name = line 22 | elif ("ACC" in datas[0]): 23 | for ribo in ribos: 24 | if datas[-1] == ribo: 25 | out.write("{0}\n{1}\n{2}\n".format(header, name, line)) 26 | detect = True 27 | else: 28 | if (detect): 29 | out.write(line + "\n") 30 | out.close() 31 | f_h.close() 32 | -------------------------------------------------------------------------------- /annogesiclib/print_rank_all.py: -------------------------------------------------------------------------------- 1 | import os 2 | import csv 3 | import shutil 4 | 5 | 6 | def print_rank_all(all_table, best_table): 7 | out = open("tmp_rank_table", "w") 8 | fh = open(best_table, "r") 9 | rank = 0 10 | bests = [] 11 | for row in csv.reader(fh, delimiter='\t'): 12 | if row[0] != "Rank": 13 | bests.append(row) 14 | rank = int(row[0]) 15 | out.write("\t".join(row) + "\n") 16 | fh.close() 17 | fh = open(all_table, "r") 18 | for row in csv.reader(fh, delimiter='\t'): 19 | detect = False 20 | if row[0] != "rank": 21 | for best in bests: 22 | if (row[1] == best[1]) and ( 23 | row[3] == best[3]) and ( 24 | row[4] == best[4]) and ( 25 | row[5] == best[5]): 26 | detect = True 27 | break 28 | if not detect: 29 | rank += 1 30 | row[0] = str(rank) 31 | out.write("\t".join(row) + "\n") 32 | os.remove(all_table) 33 | shutil.move("tmp_rank_table", all_table) 34 | -------------------------------------------------------------------------------- /annogesiclib/projectcreator.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | 5 | class ProjectCreator(object): 6 | 7 | def create_root_folder(self, project_name): 8 | """Create the root folder of a new project with the given name. 9 | Arguments: 10 | - `project_name`: Name of the project root folder 11 | """ 12 | if not os.path.exists(project_name): 13 | os.mkdir(project_name) 14 | else: 15 | sys.stderr.write("Cannot create folder \"%s\"! File/folder with " 16 | "the same name exists already.\n" % project_name) 17 | sys.exit(2) 18 | 19 | def create_subfolders(self, subfolders): 20 | """Create required subfolders in the given folder. 
21 | Arguments: 22 | - `subfolders`: Paths of the subfolders to create 23 | """ 24 | for folder in subfolders: 25 | if not os.path.exists(folder): 26 | os.mkdir(folder) 27 | 28 | def create_version_file(self, version_file_path, version): 29 | with open(version_file_path, "w") as fh: 30 | fh.write("ANNOgesic version %s" % version) 31 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from os import path 2 | try: 3 | from setuptools import setup 4 | except ImportError: 5 | from distutils.core import setup 6 | 7 | here = path.abspath(path.dirname(__file__)) 8 | 9 | with open(path.join(here, 'README.rst')) as f: 10 | long_description = f.read() 11 | 12 | setup( 13 | name='ANNOgesic', 14 | version='1.1.14', 15 | packages=['annogesiclib'], 16 | author='Sung-Huan Yu', 17 | author_email='silasysh@g-mail.nsysu.edu.tw', 18 | description='ANNOgesic - A tool for bacterial/archaeal RNA-Seq based genome annotations', 19 | long_description=long_description, 20 | url='https://github.com/Sung-Huan/ANNOgesic', 21 | install_requires=[ 22 | "biopython >= 1.65", 23 | "matplotlib >= 1.5.0", 24 | "numpy >= 1.9.2", 25 | "networkx >= 1.9.1" 26 | ], 27 | scripts=['bin/annogesic'], 28 | license='ISC License (ISCL)', 29 | classifiers=[ 30 | 'License :: OSI Approved :: ISC License (ISCL)', 31 | 'Operating System :: POSIX', 32 | 'Programming Language :: Python :: 3', 33 | 'Topic :: Scientific/Engineering :: Bio-Informatics', 34 | ] 35 | ) 36 | -------------------------------------------------------------------------------- /tutorial_data/replace_seq_id.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | import os 4 | import shutil 5 | import argparse 6 | 7 | __author__ = "Sung-Huan Yu " 8 | __email__ = "shyu@biochem.mpg.de" 9 | 10 | parser = argparse.ArgumentParser() 11 | parser.add_argument("-i","--input_wig_folder",help="input wig folder") 12 | parser.add_argument("-n","--strain_name",help="strain_name") 13 | args = parser.parse_args() 14 | 15 | def main(): 16 | for wig in os.listdir(args.input_wig_folder): 17 | out = open("tmp", "w") 18 | with open(os.path.join(args.input_wig_folder, wig)) as fh: 19 | for line in fh: 20 | if line.startswith("variableStep"): 21 | data = line.split(" ") 22 | choms = data[1].split("=") 23 | choms[-1] = args.strain_name 24 | data[1] = "=".join(choms) 25 | out.write(" ".join(data)) 26 | else: 27 | out.write(line) 28 | out.close() 29 | os.remove(os.path.join(args.input_wig_folder, wig)) 30 | shutil.move("tmp", os.path.join(args.input_wig_folder, wig)) 31 | 32 | if __name__ == "__main__": 33 | main() 34 | -------------------------------------------------------------------------------- /tests/test_projectcreator.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import os 3 | import sys 4 | import shutil 5 | sys.path.append(".") 6 | from annogesiclib.projectcreator import ProjectCreator 7 | 8 | 9 | class TestProjectCreator(unittest.TestCase): 10 | 11 | def setUp(self): 12 | self.root_folder_name = "a_test_project" 13 | self.projectcreator = ProjectCreator() 14 | 15 | def tearDown(self): 16 | if os.path.exists(self.root_folder_name): 17 | shutil.rmtree(self.root_folder_name) 18 | 19 | def test_create_root_folder(self): 20 | self.projectcreator.create_root_folder(self.root_folder_name) 21 | assert(os.path.exists(self.root_folder_name)) 22 |
shutil.rmtree(self.root_folder_name) 23 | 24 | def test_create_subfolders(self): 25 | self.projectcreator.create_root_folder(self.root_folder_name) 26 | subfolders = ["test_a", "test_b", "test_c"] 27 | subfolders = [self.root_folder_name + "/" + subfolder for 28 | subfolder in subfolders] 29 | self.projectcreator.create_subfolders(subfolders) 30 | for subfolder in subfolders: 31 | assert(os.path.exists(subfolder)) 32 | 33 | if __name__ == "__main__": 34 | unittest.main() 35 | -------------------------------------------------------------------------------- /tests/test_change_db_format.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import unittest 4 | import shutil 5 | from io import StringIO 6 | sys.path.append(".") 7 | from mock_gff3 import Create_generator 8 | from mock_helper import gen_file, import_data 9 | import annogesiclib.change_db_format as cdf 10 | 11 | 12 | class TestChangeDBFormat(unittest.TestCase): 13 | 14 | def setUp(self): 15 | self.test_folder = "test_folder" 16 | if (not os.path.exists(self.test_folder)): 17 | os.mkdir(self.test_folder) 18 | 19 | def tearDown(self): 20 | if os.path.exists(self.test_folder): 21 | shutil.rmtree(self.test_folder) 22 | 23 | def test_change_format(self): 24 | input_file = os.path.join(self.test_folder, "input") 25 | output_file = os.path.join(self.test_folder, "output") 26 | gen_file(input_file, 27 | ">srna_1|Staphylococcus|Aar|12314|12444|forward\nATAGATTCCCGCGTATAGTCATCATTGTAC") 28 | cdf.change_format(input_file, output_file) 29 | data = import_data(output_file) 30 | self.assertListEqual(data, ['>srna_1|Staphylococcus|Aar', 31 | 'ATAGATTCCCGCGTATAGTCATCATTGTAC']) 32 | 33 | if __name__ == "__main__": 34 | unittest.main() 35 | 36 | -------------------------------------------------------------------------------- /annogesiclib/sRNA_filter_min_utr.py: -------------------------------------------------------------------------------- 1 | import os 2 | import csv 3 | import shutil 4 | from annogesiclib.gff3 import Gff3Parser 5 | 6 | 7 | def filter_utr(srna_gff, srna_table, min_utr): 8 | out = open("tmp_utr_srna.gff", "w") 9 | out_ta = open("tmp_utr_srna.csv", "w") 10 | out.write("##gff-version 3\n") 11 | gffs = [] 12 | tables = [] 13 | gff_parser = Gff3Parser() 14 | g_f = open(srna_gff, "r") 15 | for entry in gff_parser.entries(g_f): 16 | gffs.append(entry) 17 | fh = open(srna_table, "r") 18 | for row in csv.reader(fh, delimiter='\t'): 19 | if row[0] != "rank": 20 | if (float(row[7]) >= min_utr): 21 | tables.append(row) 22 | out_ta.write("\t".join(row) + "\n") 23 | for gff in gffs: 24 | for table in tables: 25 | if (table[0] == gff.seq_id) and ( 26 | int(table[2]) == gff.start) and ( 27 | int(table[3]) == gff.end) and ( 28 | table[4] == gff.strand): 29 | out.write(gff.info + "\n") 30 | g_f.close() 31 | fh.close() 32 | os.remove(srna_gff) 33 | os.remove(srna_table) 34 | shutil.move("tmp_utr_srna.gff", srna_gff) 35 | shutil.move("tmp_utr_srna.csv", srna_table) 36 | out.close() 37 | out_ta.close() 38 | -------------------------------------------------------------------------------- /tests/test_plot_coverage_table.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import unittest 4 | import shutil 5 | from io import StringIO 6 | sys.path.append(".") 7 | from mock_gff3 import Create_generator 8 | from mock_helper import gen_file 9 | import annogesiclib.plot_coverage_table as pct 10 | 11 | 12 | class Mock_func(object): 
13 | 14 | def mock_fig(self, rowlabels, collabels, cells, filename, 15 | max_color, min_color): 16 | gen_file(filename, "test") 17 | pass 18 | 19 | class TestPlotCoverageTable(unittest.TestCase): 20 | 21 | def setUp(self): 22 | self.test_folder = "test_folder" 23 | if (not os.path.exists(self.test_folder)): 24 | os.mkdir(self.test_folder) 25 | 26 | def tearDown(self): 27 | if os.path.exists(self.test_folder): 28 | shutil.rmtree(self.test_folder) 29 | 30 | 31 | def test_plot_table(self): 32 | pct.fig = Mock_func().mock_fig 33 | plots = [{"aaa": {"cond_1": {"track_1": 3.543, "track_2": 4.523}, 34 | "cond_2": {"track_1": 4.43, "track_2": 0.523}}}] 35 | pct.plot_table(plots, 100, 0, os.path.join(self.test_folder, "test")) 36 | self.assertTrue(os.path.exists(os.path.join(self.test_folder, "test"))) 37 | 38 | if __name__ == "__main__": 39 | unittest.main() 40 | 41 | -------------------------------------------------------------------------------- /benchmark_sRNAs/Helicobacter.csv: -------------------------------------------------------------------------------- 1 | start end strand 2 | 22809 22931 - 3 | 78090 78365 + 4 | 141579 141833 + 5 | 170022 170227 + 6 | 314965 315169 - 7 | 367053 367555 + 8 | 439217 439567 + 9 | 444827 445139 - 10 | 466371 466793 - 11 | 479648 479856 - 12 | 513559 513627 - 13 | 515574 515653 - 14 | 516627 517186 + 15 | 537305 537624 - 16 | 540217 540473 - 17 | 541026 541298 - 18 | 567949 568607 - 19 | 664270 664447 - 20 | 684255 684584 + 21 | 756936 757177 + 22 | 804286 804580 - 23 | 865570 865916 + 24 | 946294 946540 + 25 | 964751 964805 + 26 | 968583 968616 + 27 | 968980 969164 + 28 | 996891 997299 + 29 | 998717 998995 + 30 | 1026267 1026428 - 31 | 1046292 1046837 + 32 | 1070879 1071067 + 33 | 1071536 1071801 + 34 | 1071960 1072171 + 35 | 1105620 1106041 + 36 | 1111333 1111469 + 37 | 1120506 1120704 - 38 | 1156100 1156429 + 39 | 1178410 1178489 + 40 | 1180436 1180504 + 41 | 1217306 1217526 + 42 | 1235678 1235907 + 43 | 1243404 1243474 - 44 | 1245610 1245780 - 45 | 1295413 1295598 - 46 | 1302757 1303070 - 47 | 1307821 1307963 - 48 | 1366650 1366889 - 49 | 1394991 1345126 + 50 | 1414321 1414603 + 51 | 1439464 1439745 + 52 | 1449788 1450126 + 53 | 1470642 1470983 - 54 | 1477003 1477319 + 55 | 1482579 1482926 - 56 | 1502823 1503160 - 57 | 1508086 1508595 - 58 | 1510538 1510962 + 59 | 1514863 1515121 - 60 | 1524329 1524681 - 61 | 1543943 1544194 + 62 | 1589890 1589984 - 63 | 1612281 1612596 - 64 | 1647007 1647568 + 65 | -------------------------------------------------------------------------------- /annogesiclib/extract_sec_info.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | 4 | def mod_file(input_file, out, indexs): 5 | with open(input_file) as fh: 6 | for line in fh: 7 | line = line.strip() 8 | if line.startswith(">"): 9 | out.write(indexs[line] + "\n") 10 | else: 11 | out.write(line + "\n") 12 | out.close() 13 | 14 | def extract_info_sec(sec_file, seq_file, index_file): 15 | out_sec = open(sec_file + "tmp", "w") 16 | out_seq = open(seq_file + "tmp", "w") 17 | indexs = {} 18 | with open(index_file) as hi: 19 | for line in hi: 20 | line = line.strip() 21 | if line.startswith(">"): 22 | tag = line.split("|")[0] 23 | indexs[tag] = line 24 | mod_file(sec_file, out_sec, indexs) 25 | mod_file(seq_file, out_seq, indexs) 26 | os.remove(sec_file) 27 | shutil.move(sec_file + "tmp", sec_file) 28 | os.remove(seq_file) 29 | shutil.move(seq_file + "tmp", seq_file) 30 | 31 | def modify_header(seq_file, index_file): 
32 | out = open(seq_file, "w") 33 | with open(index_file) as fh: 34 | for line in fh: 35 | line = line.strip() 36 | if line.startswith(">"): 37 | tag = line.split("|")[0] 38 | out.write(tag + "\n") 39 | else: 40 | out.write(line + "\n") 41 | -------------------------------------------------------------------------------- /tests/mock_helper.py: -------------------------------------------------------------------------------- 1 | from mock_gff3 import Create_generator 2 | 3 | def convert_dict(line_list): 4 | datas = {} 5 | for data in line_list: 6 | datas[data] = data 7 | return datas 8 | 9 | def gen_file(out_file, content): 10 | with open(out_file, "w") as fh: 11 | fh.write(content) 12 | 13 | def import_data(filename): 14 | datas = [] 15 | with open(filename) as fh: 16 | for line in fh: 17 | line = line.rstrip() 18 | datas.append(line) 19 | return datas 20 | 21 | def extract_info(out_file, type_): 22 | datas = [] 23 | attributes = [] 24 | if type_ == "file": 25 | with open(out_file) as fh: 26 | for line in fh: 27 | line = line.rstrip() 28 | if (line != "##gff-version 3") and len(line): 29 | attributes.append(line.split("\t")[-1].split(";")) 30 | datas.append("\t".join(line.split("\t")[0:-1])) 31 | else: 32 | for line in out_file.split("\n"): 33 | line = line.rstrip() 34 | if len(line): 35 | attributes.append(line.split("\t")[-1].split(";")) 36 | datas.append("\t".join(line.split("\t")[0:-1])) 37 | 38 | return datas, attributes 39 | 40 | def read_dict(num, gff, attributes): 41 | gffs = [] 42 | for index in range(0, num): 43 | gffs.append(Create_generator(gff[index], attributes[index], "gff")) 44 | return gffs 45 | -------------------------------------------------------------------------------- /tests/test_seqmodifier.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import unittest 4 | import shutil 5 | from io import StringIO 6 | sys.path.append(".") 7 | from mock_helper import gen_file, import_data 8 | from annogesiclib.seqmodifier import SeqModifier 9 | 10 | 11 | class TestSeqModifier(unittest.TestCase): 12 | 13 | def setUp(self): 14 | self.test_folder = "test_folder" 15 | if (not os.path.exists(self.test_folder)): 16 | os.mkdir(self.test_folder) 17 | self.seq = SeqModifier("AATTATATAGGAAGGCCC") 18 | 19 | def tearDown(self): 20 | if os.path.exists(self.test_folder): 21 | shutil.rmtree(self.test_folder) 22 | 23 | def test_init_pos_dict(self): 24 | self.seq._init_pos_dict() 25 | self.assertDictEqual(self.seq._org_pos_to_internal_pos, 26 | {1: 0, 2: 1, 3: 2, 4: 3, 5: 4, 6: 5, 7: 6, 27 | 8: 7, 9: 8, 10: 9, 11: 10, 12: 11, 13: 12, 28 | 14: 13, 15: 14, 16: 15, 17: 16, 18: 17}) 29 | 30 | def test_replace(self): 31 | self.seq.replace(2, "G") 32 | self.assertEqual(self.seq._seq, "AGTTATATAGGAAGGCCC") 33 | 34 | def test_remove(self): 35 | self.seq.remove(8, 1) 36 | self.assertEqual(self.seq._seq, "AATTATAAGGAAGGCCC") 37 | 38 | def test_insert(self): 39 | self.seq.insert(5, "C") 40 | self.assertEqual(self.seq._seq, "AATTCATATAGGAAGGCCC") 41 | 42 | if __name__ == "__main__": 43 | unittest.main() 44 | -------------------------------------------------------------------------------- /annogesiclib/plot_tran.py: -------------------------------------------------------------------------------- 1 | import os 2 | import math 3 | import matplotlib as mpl 4 | from annogesiclib.gff3 import Gff3Parser 5 | from annogesiclib.helper import Helper 6 | import numpy as np 7 | mpl.use('Agg') 8 | import matplotlib.pyplot as plt 9 | plt.style.use('ggplot') 
10 | 11 | 12 | def plot(lens, out_figure): 13 | ticks = max(lens) / 50 14 | bin_num = np.arange(0, max(lens), ticks) 15 | n, bins, hist1 = plt.hist(lens, bin_num, 16 | color="#FF9999", label='Transcript', 17 | edgecolor='black', linewidth=1) 18 | plt.xlabel("Transcript_length (nt)") 19 | plt.ylabel("Amount") 20 | plt.savefig(out_figure) 21 | plt.clf() 22 | 23 | 24 | def plot_tran(tran_folder, stat_folder, max_dist): 25 | lens = [] 26 | less = [] 27 | for tran in os.listdir(tran_folder): 28 | if tran.endswith(".gff"): 29 | prefix = tran.replace("_transcript.gff", "") 30 | gff_f = open(os.path.join(tran_folder, tran), "r") 31 | for entry in Gff3Parser().entries(gff_f): 32 | if entry.feature == "transcript": 33 | lens.append(entry.end - entry.start) 34 | if entry.end - entry.start <= max_dist: 35 | less.append(entry.end - entry.start) 36 | plot(lens, os.path.join(stat_folder, prefix + "_length_all.png")) 37 | plot(less, os.path.join(stat_folder, prefix + "_length_less_" + 38 | str(max_dist) + ".png")) 39 | 40 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | test: 2 | python3 -m pytest tests 3 | # python3 run_test.py 4 | 5 | coverage: 6 | python3 -m coverage run run_test.py 7 | @echo "computing coverage.." 8 | python3 -m coverage report > "unittest" 9 | python3 tests/uni_report.py -i "unittest" -o "uni_report" 10 | rm "unittest" 11 | @echo "check uni_report.." 12 | 13 | package: 14 | rm -rf dist 15 | python3 setup.py bdist_wheel 16 | rm -rf ANNOgesic.egg-info 17 | ls dist/* 18 | 19 | build: 20 | python3 setup.py bdist 21 | 22 | package_to_pypi: 23 | twine upload dist/* 24 | 25 | html_doc: 26 | cd docs && make html && cd .. 27 | 28 | new_release: 29 | new_release: 30 | @echo "* Create/checkout a release branch" 31 | @echo " git branch release_v0.3.X" 32 | @echo " git checkout release_v0.3.X" 33 | @echo "* Change bin/reademption" 34 | @echo "* Change setup.py" 35 | @echo "* Change docs/source/conf.py" 36 | @echo "* Change CHANGELOG.txt" 37 | @echo "* Create new docs" 38 | @echo "* Test package creation" 39 | @echo "* Test doc creation" 40 | @echo "* make package_to_pypi" 41 | @echo "* git add CHANGELOG.txt bin/reademption docs/source/conf.py setup.py" 42 | @echo "* Commit changes e.g. 'git commit -m \"Set version to 0.3.X\"'" 43 | @echo "* Tag the commit e.g. 
'git tag -a v0.3.X -m \"version v0.3.X\"'" 44 | @echo "* Merge release into dev and master" 45 | @echo "* Push it to github: git push" 46 | @echo "* Generate a new release based on this tag at" 47 | @echo " https://github.com/konrad/READemption/releases/new" 48 | @echo "* Upload new docs using 'make upload_doc'" 49 | -------------------------------------------------------------------------------- /annogesiclib/blast_class.py: -------------------------------------------------------------------------------- 1 | import csv 2 | 3 | 4 | def read_file(srna_file, nums): 5 | srna_f = open(srna_file, "r") 6 | for row in csv.reader(srna_f, delimiter="\t"): 7 | if (row[-6] != "NA") and (row[0] != "Rank"): 8 | if row[1] not in nums.keys(): 9 | nums[row[1]] = {} 10 | if row[2] not in nums[row[1]].keys(): 11 | nums[row[1]][row[2]] = 1 12 | else: 13 | nums[row[1]][row[2]] += 1 14 | if row[2] not in nums["total"].keys(): 15 | nums["total"][row[2]] = 1 16 | else: 17 | nums["total"][row[2]] += 1 18 | srna_f.close() 19 | 20 | 21 | def blast_class(srna_file, out_file): 22 | '''statistics of the results of blast sRNA database''' 23 | nums = {} 24 | nums["total"] = {} 25 | read_file(srna_file, nums) 26 | out = open(out_file, "w") 27 | if len(nums) > 1: 28 | if len(nums) > 2: 29 | out.write("All genomes:\n") 30 | out.write("sRNA_name\tamount\n") 31 | for blast, num in nums["total"].items(): 32 | out.write("{0}\t{1}\n".format(blast, num)) 33 | for strain, srna_name in nums.items(): 34 | if strain != "total": 35 | out.write(strain + ":\n") 36 | out.write("sRNA_name\tamount\n") 37 | for blast, num in srna_name.items(): 38 | out.write("{0}\t{1}\n".format(blast, num)) 39 | else: 40 | out.write("No known sRNA!!\n") 41 | out.close() 42 | -------------------------------------------------------------------------------- /annogesiclib/filter_TSS_pro.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | import math 4 | from annogesiclib.gff3 import Gff3Parser 5 | 6 | 7 | def read_gff(input_file): 8 | datas = [] 9 | gff_parser = Gff3Parser() 10 | f_h = open(input_file, "r") 11 | for entry in gff_parser.entries(f_h): 12 | datas.append(entry) 13 | datas = sorted(datas, key=lambda k: (k.seq_id, k.start, k.end, k.strand)) 14 | return datas 15 | 16 | 17 | def compare_tss_pro(tars, refs, out, cluster): 18 | '''compare between TSS and processing site''' 19 | for tar in tars: 20 | for ref in refs: 21 | if (tar.seq_id == ref.seq_id) and ( 22 | tar.strand == ref.strand): 23 | if math.fabs(tar.start - ref.start) <= cluster: 24 | break 25 | elif (ref.start - tar.start) > cluster: 26 | out.write(tar.info + "\n") 27 | break 28 | 29 | 30 | def filter_tss_pro(tss_file, pro_file, feature, cluster): 31 | '''deal with the overlap of TSS and processing site''' 32 | tsss = read_gff(tss_file) 33 | pros = read_gff(pro_file) 34 | out = open("tmp_filter", "w") 35 | out.write("##gff-version 3\n") 36 | if feature.lower() == "tss": 37 | compare_tss_pro(pros, tsss, out, cluster) 38 | os.remove(pro_file) 39 | shutil.move("tmp_filter", pro_file) 40 | elif feature.lower() == "processing": 41 | compare_tss_pro(tsss, pros, out, cluster) 42 | os.remove(tss_file) 43 | shutil.move("tmp_filter", tss_file) 44 | -------------------------------------------------------------------------------- /tests/test_splice_parser.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | import os 4 | import sys 5 | import csv 6 | import shutil 7 | import 
unittest 8 | from io import StringIO 9 | sys.path.append(".") 10 | from annogesiclib.splice_parser import SpliceParser 11 | 12 | 13 | class TestGff3Parser(unittest.TestCase): 14 | 15 | def setUp(self): 16 | self.example = Example() 17 | self.s_parser = SpliceParser() 18 | self.test_folder = "test_folder" 19 | if (not os.path.exists(self.test_folder)): 20 | os.mkdir(self.test_folder) 21 | 22 | def tearDown(self): 23 | if os.path.exists(self.test_folder): 24 | shutil.rmtree(self.test_folder) 25 | 26 | def test_parser(self): 27 | splice_fh = StringIO(self.example.splice) 28 | starts = [] 29 | splices = [] 30 | for entry in self.s_parser.parser(splice_fh): 31 | starts.append(entry.start) 32 | splices.append(entry.splice) 33 | self.assertListEqual(starts, [17647, 20734, 43490, 49952]) 34 | self.assertListEqual(splices, ['splits:1:1:1:N:F', 'splits:1:1:1:C:P', 35 | 'splits:1:1:1:N:P', 'splits:2:2:2:N:P']) 36 | 37 | class Example(object): 38 | 39 | splice = """Staphylococcus_aureus_HG003 17647 17667 splits:1:1:1:N:F 0 + 40 | Staphylococcus_aureus_HG003 20734 21396 splits:1:1:1:C:P 0 + 41 | Staphylococcus_aureus_HG003 43490 43644 splits:1:1:1:N:P 0 + 42 | Staphylococcus_aureus_HG003 49952 50016 splits:2:2:2:N:P 0 +""" 43 | 44 | if __name__ == "__main__": 45 | unittest.main() 46 | 47 | -------------------------------------------------------------------------------- /database/Rfam_riboswitch_ID.csv: -------------------------------------------------------------------------------- 1 | RF00162 SAM SAM riboswitch box leader 2 | RF00174 Cobalamin Cobalamin riboswitch 3 | RF00634 SAM-IV S adenosyl methionine SAM riboswitch 4 | RF00059 TPP TPP riboswitch THI element 5 | RF00167 Purine Purine riboswitch 6 | RF00168 Lysine Lysine riboswitch 7 | RF00504 Glycine Glycine riboswitch 8 | RF00521 SAM_alpha SAM riboswitch alpha proteobacteria 9 | RF01051 c-di-GMP-I Cyclic di GMP riboswitch 10 | RF01055 MOCO_RNA_motif Moco molybdenum cofactor riboswitch 11 | RF01057 SAH_riboswitch S adenosyl homocysteine riboswitch 12 | RF01510 MFR M florum riboswitch 13 | RF01767 SMK_box_riboswitch SMK box translational riboswitch 14 | RF01826 SAM_V SAM riboswitch 15 | RF01831 THF THF riboswitch 16 | RF01689 AdoCbl-variant AdoCbl variant RNA 17 | RF01725 SAM-I-IV-variant SAM IV variant riboswitch 18 | RF00050 FMN FMN riboswitch RFN element 19 | RF00234 glmS glmS glucosamine phosphate activated ribozyme 20 | RF00522 PreQ1 PreQ1 riboswitch 21 | RF01054 preQ1-II preQ1 II pre queuosine riboswitch 22 | RF01056 Mg_sensor Magnesium Sensor 23 | RF01482 AdoCbl_riboswitch AdoCbl riboswitch 24 | RF01727 SAM-SAH SAM SAH riboswitch 25 | RF01786 c-di-GMP-II Cyclic di GMP II riboswitch 26 | RF01787 drz-agam-1 drz agam riboswitch 27 | RF01788 drz-agam-2-2 drz agam riboswitch 28 | RF00080 yybP-ykoY yybP ykoY leader 29 | RF00379 ydaO-yuaA ydaO yuaA leader 30 | RF00380 ykoK ykoK leader 31 | RF00442 ykkC-yxkD ykkC yxkD leader 32 | RF00516 ylbH ylbH leader 33 | RF00517 serC serC leader 34 | RF00518 speF speF leader 35 | RF00519 suhB suhB 36 | RF00520 ybhL ybhL leader 37 | -------------------------------------------------------------------------------- /tests/mock_gff3.py: -------------------------------------------------------------------------------- 1 | class Create_generator(object): 2 | 3 | def __init__(self, gff, attributes, type_): 4 | if (type_ == "gff") or (type_ == "circ"): 5 | self.seq_id = gff["seq_id"] 6 | self.strain = gff["seq_id"] 7 | self.strand = gff["strand"] 8 | self.start = gff["start"] 9 | self.end = gff["end"] 10 | self.feature = 
gff["feature"] 11 | self.phase = gff["phase"] 12 | self.score = gff["score"] 13 | self.source = gff["source"] 14 | if type_ == "circ": 15 | self.supported_reads = gff["support"] 16 | self.start_site_reads = gff["start_site"] 17 | self.end_site_reads = gff["end_site"] 18 | self.situation = gff["situation"] 19 | self.splice_type = gff["splice_type"] 20 | self.attributes = {} 21 | for key, value in attributes.items(): 22 | self.attributes[key] = value 23 | self.attribute_string = ";".join( 24 | ["=".join(items) for items in self.attributes.items()]) 25 | self.info = "\t".join([str(field) for field in [ 26 | self.seq_id, self.source, self.feature, self.start, 27 | self.end, self.score, self.strand, self.phase, 28 | self.attribute_string]]) 29 | self.info_without_attributes = "\t".join([str(field) for field in [ 30 | self.seq_id, self.source, self.feature, self.start, 31 | self.end, self.score, self.strand, self.phase]]) 32 | if type_ == "wig": 33 | self.coverage = gff["coverage"] 34 | -------------------------------------------------------------------------------- /annogesiclib/seqmodifier.py: -------------------------------------------------------------------------------- 1 | class SeqModifier(object): 2 | """Help to apply SNPs, insertion and deletions to a sequence.""" 3 | 4 | def __init__(self, seq): 5 | self._seq = seq 6 | self._init_pos_dict() 7 | 8 | def seq(self): 9 | return self._seq 10 | 11 | def _init_pos_dict(self): 12 | self._org_pos_to_internal_pos = dict( 13 | [(pos, pos-1) 14 | for pos in range(1, len(self._seq) + 1)]) 15 | 16 | def replace(self, pos, nucleotide): 17 | seq_as_list = list(self._seq) 18 | seq_as_list[self._org_pos_to_internal_pos[pos]] = nucleotide 19 | self._seq = "".join(seq_as_list) 20 | 21 | def remove(self, pos, num): 22 | int_pos = self._org_pos_to_internal_pos[pos] 23 | self._seq = self._seq[:int_pos] + self._seq[int_pos+1:] 24 | del(self._org_pos_to_internal_pos[pos]) 25 | for pos in range(pos, len(self._seq) + 2): 26 | try: 27 | self._org_pos_to_internal_pos[pos] = ( 28 | self._org_pos_to_internal_pos[pos] - num) 29 | except KeyError: 30 | pass 31 | 32 | def insert(self, pos, nucleotide): 33 | """Insert after nucleotide of the given position""" 34 | int_pos = self._org_pos_to_internal_pos[pos] 35 | self._seq = self._seq[:int_pos] + nucleotide + self._seq[int_pos:] 36 | for pos in range(pos + 1, len(self._seq) + 1): 37 | try: 38 | self._org_pos_to_internal_pos[pos] = ( 39 | self._org_pos_to_internal_pos[pos] + len(nucleotide)) 40 | except KeyError: 41 | pass 42 | 43 | def get_nucl(self, pos): 44 | return self._seq[self._org_pos_to_internal_pos[pos]] 45 | -------------------------------------------------------------------------------- /comparison/compare_TSS_Mendoza_Vargas.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | import os 4 | import sys 5 | import csv 6 | import argparse 7 | import math 8 | from gff3 import Gff3Parser 9 | 10 | __author__ = "Sung-Huan Yu " 11 | __email__ = "sung-huan.yu@uni-wuerzburg.de" 12 | 13 | parser = argparse.ArgumentParser() 14 | parser.add_argument("-k","--regulondb_file",help="TSS of Mendoza-Vargas in RegulonDB") 15 | parser.add_argument("-p","--predict_file",help="ANNOgesic predicted TSS file") 16 | parser.add_argument("-f","--fuzzy", type=int, help="tolerance of nts for comparison") 17 | args = parser.parse_args() 18 | 19 | def main(): 20 | pros = {} 21 | tsss = [] 22 | total = 0 23 | detect = 0 24 | for entry in 
Gff3Parser().entries(open(args.predict_file)): 25 | tsss.append(entry) 26 | fh = open(args.regulondb_file, "r") 27 | for row in csv.reader(fh, delimiter='\t'): 28 | if (not row[0].startswith("#")) and (row[-1] != "weak"): 29 | total += 1 30 | if row[5] == "forward": 31 | strand = "+" 32 | else: 33 | strand = "-" 34 | pros[row[1]] = {"start": int(row[3]), "strand": strand} 35 | for ref in pros.values(): 36 | for pre in tsss: 37 | if pre.strand == ref["strand"]: 38 | if (math.fabs(ref["start"] - pre.start) <= args.fuzzy): 39 | detect += 1 40 | break 41 | print("the number of published TSSs which can be detected by ANNOgesic:" + str(detect)) 42 | print("the total number of TSSs from Mendoza-Vargas in Regulon DB" + str(total)) 43 | print("detection rate:" + str(float(detect)/float(total))) 44 | 45 | if __name__ == "__main__": 46 | main() 47 | -------------------------------------------------------------------------------- /annogesiclib/map_ribos.py: -------------------------------------------------------------------------------- 1 | import os 2 | import csv 3 | import shutil 4 | 5 | 6 | def mapping_ribos(table_folder, id_file, feature): 7 | ids = [] 8 | ih = open(id_file, "r") 9 | for row in csv.reader(ih, delimiter='\t'): 10 | if not row[0].startswith("#"): 11 | ids.append({"id": row[0].strip(), 12 | "name": row[1].strip(), 13 | "info": row[2].strip()}) 14 | for table_file in os.listdir(table_folder): 15 | if table_file.endswith("_" + feature + ".csv"): 16 | tmp_table = os.path.join(table_folder, "tmp" + table_file) 17 | table_file = os.path.join(table_folder, table_file) 18 | out = open(tmp_table, "w") 19 | tables = [] 20 | fh = open(table_file, "r") 21 | out.write("#ID\tGenome\tStrand\tAssociated_CDS\tStart_genome\t" 22 | "End_genome\tRfam_ID\tRfam_name\tE_value\tScore\t" 23 | "Start_align\tEnd_align\n") 24 | for row in csv.reader(fh, delimiter='\t'): 25 | if not row[0].startswith("#"): 26 | tables.append({"input": row[0:6], "Rfam": row[6], 27 | "e": row[7], "score": row[8], 28 | "start": row[9], "end": row[10]}) 29 | for table in tables: 30 | for id_ in ids: 31 | if table["Rfam"] == id_["id"]: 32 | name = id_["name"] 33 | out.write("\t".join(table["input"] + [table["Rfam"], name, 34 | table["e"], table["score"], 35 | table["start"], table["end"]]) + "\n") 36 | out.close() 37 | os.remove(table_file) 38 | shutil.move(tmp_table, table_file) 39 | -------------------------------------------------------------------------------- /comparison/compare_term_ecocyc.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | import os 4 | import sys 5 | import csv 6 | import argparse 7 | from gff3 import Gff3Parser 8 | 9 | __author__ = "Sung-Huan Yu " 10 | __email__ = "sung-huan.yu@uni-wuerzburg.de" 11 | 12 | parser = argparse.ArgumentParser() 13 | parser.add_argument("-k","--ecocyc_file",help="terminators of EcoCyc") 14 | parser.add_argument("-p","--predict_file",help="ANNOgesic predicted terminator file") 15 | args = parser.parse_args() 16 | 17 | def main(): 18 | terms = [] 19 | detect = 0 20 | total = 0 21 | fh = open(args.ecocyc_file, "r") 22 | for row in csv.reader(fh, delimiter='\t'): 23 | if len(row) >= 4: 24 | total += 1 25 | terms.append({"id": row[0], "start": int(row[1]), 26 | "end": int(row[2])}) 27 | tot_term = 0 28 | for pre in Gff3Parser().entries(open(args.predict_file)): 29 | tot_term += 1 30 | for ref in terms: 31 | if ((pre.start >= ref["start"]) and ( 32 | pre.end <= ref["end"])) or ( 33 | (pre.start <= ref["start"]) and ( 34 | 
pre.end >= ref["end"])) or ( 35 | (pre.start >= ref["start"]) and ( 36 | pre.start <= ref["end"]) and ( 37 | pre.end >= ref["end"])) or ( 38 | (pre.start <= ref["start"]) and ( 39 | pre.end >= ref["start"]) and ( 40 | pre.end <= ref["end"])): 41 | detect += 1 42 | break 43 | print("the number of published terminators can be detected by ANNOgesic:" + str(detect)) 44 | print("total number of terminators in EcoCyc:" + str(total)) 45 | print("detection rate:" + str(float(detect)/float(total))) 46 | 47 | if __name__ == "__main__": 48 | main() 49 | -------------------------------------------------------------------------------- /comparison/compare_sORF.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | import os 4 | import sys 5 | import csv 6 | import argparse 7 | from gff3 import Gff3Parser 8 | 9 | __author__ = "Sung-Huan Yu " 10 | __email__ = "sung-huan.yu@uni-wuerzburg.de" 11 | 12 | parser = argparse.ArgumentParser() 13 | parser.add_argument("-k","--benchmark_file",help="the benchmarking set of sORF") 14 | parser.add_argument("-p","--predict_file",help="ANNOgesic predicted sORF file") 15 | args = parser.parse_args() 16 | 17 | def main(): 18 | sorfs = [] 19 | pres = [] 20 | num_ref = 0 21 | detect = 0 22 | for sorf in Gff3Parser().entries(open(args.benchmark_file)): 23 | num_ref += 1 24 | sorfs.append(sorf) 25 | for pre in Gff3Parser().entries(open(args.predict_file)): 26 | pres.append(pre) 27 | for sorf in sorfs: 28 | for pre in pres: 29 | if pre.strand == sorf.strand: 30 | if ((pre.start >= sorf.start) and ( 31 | pre.end <= sorf.end)) or ( 32 | (pre.start <= sorf.start) and ( 33 | pre.end >= sorf.end)) or ( 34 | (pre.start >= sorf.start) and ( 35 | pre.start <= sorf.end) and ( 36 | pre.end >= sorf.end)) or ( 37 | (pre.start <= sorf.start) and ( 38 | pre.end >= sorf.start) and ( 39 | pre.end <= sorf.end)): 40 | detect += 1 41 | sorf.attributes["detect"] = True 42 | break 43 | print("the number of known sORFs which can be detected by ANNOgesic:" + str(detect)) 44 | print("the total number of known sORFs:" + str(num_ref)) 45 | print("the detection rate:"+ str(float(detect) / float(num_ref))) 46 | 47 | 48 | if __name__ == "__main__": 49 | main() 50 | -------------------------------------------------------------------------------- /tests/test_plot_mountain.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import unittest 4 | import shutil 5 | from io import StringIO 6 | sys.path.append(".") 7 | from mock_gff3 import Create_generator 8 | from mock_helper import gen_file 9 | import annogesiclib.plot_mountain as pm 10 | 11 | 12 | class TestPlotMountain(unittest.TestCase): 13 | 14 | def setUp(self): 15 | self.test_folder = "test_folder" 16 | if (not os.path.exists(self.test_folder)): 17 | os.mkdir(self.test_folder) 18 | self.example = Example() 19 | 20 | def tearDown(self): 21 | if os.path.exists(self.test_folder): 22 | shutil.rmtree(self.test_folder) 23 | 24 | def test_plot_mountain_plot(self): 25 | gen_file(os.path.join(self.test_folder, "test"), self.example.mountain) 26 | pm.plot_mountain_plot(os.path.join(self.test_folder, "test"), 27 | os.path.join(self.test_folder, "out")) 28 | 29 | self.assertTrue(os.path.exists(os.path.join(self.test_folder, "out"))) 30 | 31 | class Example(object): 32 | 33 | mountain = """ 1 0 34 | 2 0.001304 35 | 3 0.0037577 36 | 4 0.0068858 37 | 5 0.015473 38 | 6 0.025351 39 | 7 0.71432 40 | 8 1.6366 41 | 9 2.615 42 | 10 3.6091 43 | & 44 | 1 
0 45 | 2 0 46 | 3 0 47 | 4 0 48 | 5 0 49 | 6 0 50 | 7 1 51 | 8 2 52 | 9 3 53 | 10 4 54 | & 55 | 1 0.018708 56 | 2 0.035075 57 | 3 0.043831 58 | 4 0.093979 59 | 5 0.10259 60 | 6 0.96026 61 | 7 0.4509 62 | 8 0.18699 63 | 9 0.062985 64 | 10 0.0055594 65 | & 66 | 1 0 67 | 2 0 68 | 3 0 69 | 4 0 70 | 5 0 71 | 6 0 72 | 7 1 73 | 8 2 74 | 9 3 75 | 10 4""" 76 | 77 | if __name__ == "__main__": 78 | unittest.main() 79 | 80 | -------------------------------------------------------------------------------- /comparison/compare_promoter_regulondb.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | import os 4 | import sys 5 | import csv 6 | import argparse 7 | import math 8 | from gff3 import Gff3Parser 9 | 10 | __author__ = "Sung-Huan Yu " 11 | __email__ = "sung-huan.yu@uni-wuerzburg.de" 12 | 13 | parser = argparse.ArgumentParser() 14 | parser.add_argument("-k","--regulondb_file",help="RegulonDB promoter file") 15 | parser.add_argument("-p","--predict_file",help="ANNOgesic promoter table") 16 | parser.add_argument("-f","--fuzzy", type=int, help="the tolerance nts for comparison") 17 | args = parser.parse_args() 18 | 19 | def main(): 20 | pros = {} 21 | pres = [] 22 | total = 0 23 | detect = 0 24 | ph = open(args.predict_file, "r") 25 | for row in csv.reader(ph, delimiter='\t'): 26 | if row[0] != "strain": 27 | pres.append({"start": int(row[1]), "strand": row[2]}) 28 | fh = open(args.regulondb_file, "r") 29 | for row in csv.reader(fh, delimiter='\t'): 30 | if (not row[0].startswith("#")) and (row[-1].lower() != "weak"): 31 | if row[2] == "forward": 32 | strand = "+" 33 | else: 34 | strand = "-" 35 | if int(row[3]) != 0: 36 | total += 1 37 | pros[row[0]] = {"start": int(row[3]), "strand": strand} 38 | for ref in pros.values(): 39 | for pre in pres: 40 | if pre["strand"] == ref["strand"]: 41 | if (math.fabs(ref["start"] - pre["start"]) <= args.fuzzy): 42 | detect += 1 43 | break 44 | print("the number of published promoters which can be found by ANNOgesic:" + str(detect)) 45 | print("total number of promoters in RegulonDB:" + str(total)) 46 | print("detection rate:" + str(float(detect)/float(total))) 47 | 48 | if __name__ == "__main__": 49 | main() 50 | -------------------------------------------------------------------------------- /tests/test_TSSpredator.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | import os 4 | import sys 5 | import csv 6 | import shutil 7 | import unittest 8 | from io import StringIO 9 | sys.path.append(".") 10 | from annogesiclib.TSSpredator_parser import TSSPredatorReader 11 | 12 | 13 | class TestTSSPredatorReader(unittest.TestCase): 14 | 15 | def setUp(self): 16 | self.example = Example() 17 | self.test_folder = "test_folder" 18 | if (not os.path.exists(self.test_folder)): 19 | os.mkdir(self.test_folder) 20 | self.tss = TSSPredatorReader() 21 | 22 | def tearDown(self): 23 | if os.path.exists(self.test_folder): 24 | shutil.rmtree(self.test_folder) 25 | 26 | def test_entries(self): 27 | input_fh = StringIO(self.example.master) 28 | tsss = [] 29 | for entry in self.tss.entries(input_fh): 30 | tsss.append(entry) 31 | self.assertEqual(tsss[0].pos, 179) 32 | self.assertTrue(tsss[1].is_primary) 33 | self.assertTrue(tsss[2].is_internal) 34 | 35 | 36 | class Example(object): 37 | 38 | master = """SuperPos SuperStrand mapCount detCount Genome detected enriched stepHeight stepFactor enrichmentFactor classCount Pos Strand Locus_tag sRNA/asRNA Product UTRlength 
GeneLength Primary Secondary Internal Antisense Automated Manual Putative sRNA Putative asRNA Comment Sequence -50 nt upstream + TSS (51nt) 39 | 179 - 1 1 test 1 1 4.45 31.93 8.69 1 179 - orphan orphan NA NA 0 0 0 0 1 0 0 0 ACCCTTGAATTGAGGGTGTTTTATACCTAAATTTAAAAAATGATGCTATAA 40 | 681 - 1 1 test 1 1 4.2 3.0 3.54 2 681 - HP0001 transcription antitermination protein NusB 48 417 1 0 0 0 1 0 0 0 GATTGAAAGAGCGGGCAGTAAAGCCGGCAATAAGGGCTTTGAAGCGATGAG 41 | 681 - 1 1 test 1 1 4.2 3.0 3.54 2 681 - HP0002 6%2C7-dimethyl-8-ribityllumazine synthase NA 471 0 0 1 0 1 0 0 0 GATTGAAAGAGCGGGCAGTAAAGCCGGCAATAAGGGCTTTGAAGCGATGAG""" 42 | if __name__ == "__main__": 43 | unittest.main() 44 | 45 | -------------------------------------------------------------------------------- /tests/uni_report.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | import os 4 | import sys 5 | import csv 6 | import argparse 7 | 8 | parser = argparse.ArgumentParser() 9 | parser.add_argument("-i","--input_file", help="input file") 10 | parser.add_argument("-o","--output_file",help="output file") 11 | args = parser.parse_args() 12 | 13 | def main(): 14 | out = open(args.output_file, "w") 15 | out.write("Name\tStmts\tMiss\tCover\n----------------------------------------------------\n") 16 | sts = 0 17 | miss = 0 18 | with open(args.input_file) as fh: 19 | for line in fh: 20 | line = line.strip() 21 | if line.startswith("annogesic"): 22 | datas = line.split(" ") 23 | covers = [] 24 | for data in datas: 25 | if len(data): 26 | covers.append(data) 27 | sts = sts + int(covers[1]) 28 | miss = miss + int(covers[2]) 29 | out.write("\t".join(covers)) 30 | out.write("\n") 31 | # sts = 0 32 | # miss = 0 33 | # for input_file in args.input_files: 34 | # with open(input_file) as fh: 35 | # for line in fh: 36 | # line = line.strip() 37 | # datas = line.split(" ") 38 | # covers = [] 39 | # if datas[0].split("/")[-1] == input_file.split("/")[-1].replace("unitest_test_", ""): 40 | # for data in datas: 41 | # if len(data): 42 | # covers.append(data) 43 | # sts = sts + int(covers[1]) 44 | # miss = miss + int(covers[2]) 45 | # out.write("\t".join(covers)) 46 | # out.write("\n") 47 | out.write("----------------------------------------------------\n") 48 | out.write("Total = " + str(100 - (100*(float(miss) / float(sts)))) + "%") 49 | out.close() 50 | if __name__ == "__main__": 51 | main() 52 | -------------------------------------------------------------------------------- /tests/test_modify_rbs_table.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import unittest 4 | import shutil 5 | from io import StringIO 6 | sys.path.append(".") 7 | from mock_helper import gen_file, import_data 8 | import annogesiclib.modify_rbs_table as mrt 9 | 10 | 11 | class TestGenSvg(unittest.TestCase): 12 | 13 | def setUp(self): 14 | self.test_folder = "test_folder" 15 | self.example = Example() 16 | if (not os.path.exists(self.test_folder)): 17 | os.mkdir(self.test_folder) 18 | 19 | def tearDown(self): 20 | if os.path.exists(self.test_folder): 21 | shutil.rmtree(self.test_folder) 22 | 23 | def test_modify_table(self): 24 | result = """#ID\tGenome\tStrand\tAssociated_CDS\tStart_genome\tEnd_genome\tRfam\tE_value\tScore\tStart_align\tEnd_align 25 | riboswitch_5\tStaphylococcus_aureus_HG003\t+\tSAOUHSC_00013\t15948\t16046\tRF00162\t1.6e-18\t74\t1\t99 26 | 
riboswitch_11\tStaphylococcus_aureus_HG003\t-\tSAOUHSC_00007\t27955\t28053\tRF00162\t1.6e-18\t74\t1\t99 27 | riboswitch_183\tStaphylococcus_aureus_HG003\t+\tSAOUHSC_00372\t377996\t378098\tRF00167\t2.2e-18\t45\t1\t103""" 28 | table = os.path.join(self.test_folder, "test") 29 | gen_file(table, self.example.ribos) 30 | mrt.modify_table(table, True) 31 | data = import_data(table) 32 | self.assertEqual("\n".join(data), result) 33 | gen_file(table, self.example.ribos) 34 | mrt.modify_table(table, False) 35 | data = import_data(table) 36 | self.assertEqual("\n".join(data), result) 37 | 38 | class Example(object): 39 | 40 | ribos = """riboswitch_5\tStaphylococcus_aureus_HG003\t+\tSAOUHSC_00013\t15948\t16046\tRF00162\t1.6e-18\t74\t1\t99 41 | riboswitch_11\tStaphylococcus_aureus_HG003\t-\tSAOUHSC_00007\t27955\t28053\tRF00162\t1.6e-18\t74\t1\t99 42 | riboswitch_183\tStaphylococcus_aureus_HG003\t+\tSAOUHSC_00372\t377996\t378098\tRF00167\t2.2e-18\t45\t1\t103""" 43 | 44 | if __name__ == "__main__": 45 | unittest.main() 46 | -------------------------------------------------------------------------------- /tests/test_parser_wig.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | import os 4 | import sys 5 | import csv 6 | import unittest 7 | import shutil 8 | from io import StringIO 9 | sys.path.append(".") 10 | from annogesiclib.parser_wig import WigParser 11 | 12 | 13 | class TestParserWig(unittest.TestCase): 14 | 15 | def setUp(self): 16 | self.example = Example() 17 | self.wig_parser = WigParser() 18 | self.test_folder = "test_folder" 19 | if (not os.path.exists(self.test_folder)): 20 | os.mkdir(self.test_folder) 21 | 22 | def tearDown(self): 23 | if os.path.exists(self.test_folder): 24 | shutil.rmtree(self.test_folder) 25 | 26 | def test_parser(self): 27 | wigs = [] 28 | wig_f_fh = StringIO(self.example.wig_forward_file) 29 | for entry in self.wig_parser.parser(wig_f_fh, "+"): 30 | self.assertEqual(entry.strain, "aaa") 31 | self.assertEqual(entry.track, "TSB_t0_TEX_forward") 32 | wigs.append(entry) 33 | self.assertEqual(wigs[2].pos, 3) 34 | self.assertEqual(wigs[2].coverage, 1.4041251228308191) 35 | wigs = [] 36 | wig_r_fh = StringIO(self.example.wig_reverse_file) 37 | for entry in self.wig_parser.parser(wig_r_fh, "-"): 38 | self.assertEqual(entry.strain, "aaa") 39 | self.assertEqual(entry.track, "TSB_t0_TEX_reverse") 40 | wigs.append(entry) 41 | self.assertEqual(wigs[2].pos, 3) 42 | self.assertEqual(wigs[2].coverage, 1.4041251228308191) 43 | 44 | class Example(object): 45 | wig_forward_file = """track type=wiggle_0 name="TSB_t0_TEX_forward" 46 | variableStep chrom=aaa span=1 47 | 3 1.4041251228308191 48 | 4 56.867067474648174 49 | 5 56.867067474648174""" 50 | 51 | wig_reverse_file = """track type=wiggle_0 name="TSB_t0_TEX_reverse" 52 | variableStep chrom=aaa span=1 53 | 3 -1.4041251228308191 54 | 4 -56.867067474648174 55 | 5 -56.867067474648174""" 56 | if __name__ == "__main__": 57 | unittest.main() 58 | -------------------------------------------------------------------------------- /annogesiclib/plot_mountain.py: -------------------------------------------------------------------------------- 1 | import matplotlib as mpl 2 | mpl.use('Agg') 3 | import matplotlib.pyplot 4 | matplotlib.pyplot.style.use('ggplot') 5 | 6 | 7 | def plot_mountain_plot(input_file, output_name): 8 | poss = [] 9 | values = [] 10 | check = 0 11 | pre_check = 0 12 | f_h = open(input_file, "r") 13 | while True: 14 | line = f_h.readline() 15 | line = line.rstrip() 16 
| if not line: 17 | matplotlib.pyplot.figure(1) 18 | matplotlib.pyplot.subplot(212) 19 | matplotlib.pyplot.xlabel('Nucleotide position') 20 | matplotlib.pyplot.ylabel('Entropy') 21 | matplotlib.pyplot.plot(values, color="black") 22 | matplotlib.pyplot.savefig(output_name, format='pdf') 23 | break 24 | elif line == "&": 25 | line = f_h.readline() 26 | line = line.rstrip() 27 | check += 1 28 | else: 29 | poss.append(float(line[0:4].replace(" ", ""))) 30 | values.append(float(line[5:].replace(" ", ""))) 31 | if check != pre_check: 32 | pre_check = check 33 | if check == 1: 34 | matplotlib.pyplot.figure(1) 35 | matplotlib.pyplot.subplot(211) 36 | ylabel = ("Number of enclosing nucleotides\nor\n" 37 | "Min free energy structure") 38 | matplotlib.pyplot.ylabel( 39 | ylabel, fontsize=10, multialignment='left') 40 | matplotlib.pyplot.plot(values, label='pair probabilities') 41 | values = [] 42 | poss = [] 43 | elif check == 2: 44 | matplotlib.pyplot.plot(values, label='mfe structure') 45 | matplotlib.pyplot.legend( 46 | bbox_to_anchor=(0., 1.02, 1., .102), 47 | loc=3, ncol=2, mode="expand", borderaxespad=0.) 48 | values = [] 49 | poss = [] 50 | f_h.close() 51 | matplotlib.pyplot.cla() 52 | matplotlib.pyplot.clf() 53 | -------------------------------------------------------------------------------- /comparison/compare_tran.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | import os 4 | import sys 5 | import csv 6 | import argparse 7 | from gff3 import Gff3Parser 8 | 9 | __author__ = "Sung-Huan Yu " 10 | __email__ = "sung-huan.yu@uni-wuerzburg.de" 11 | 12 | parser = argparse.ArgumentParser() 13 | parser.add_argument("-k","--ecocyc_file",help="the transcripts from EcoCyc") 14 | parser.add_argument("-p","--predict_file",help="ANNOgesic predicted transcripts") 15 | args = parser.parse_args() 16 | 17 | def main(): 18 | trans = {} 19 | pres = [] 20 | total = 0 21 | detect = 0 22 | for entry in Gff3Parser().entries(open(args.predict_file)): 23 | pres.append(entry) 24 | fh = open(args.ecocyc_file, "r") 25 | for row in csv.reader(fh, delimiter='\t'): 26 | if row[0] not in trans.keys(): 27 | total += 1 28 | trans[row[0]] = {"start": int(row[1]), "end": int(row[2])} 29 | else: 30 | if int(row[1]) < trans[row[0]]["start"]: 31 | trans[row[0]]["start"] = int(row[1]) 32 | if int(row[2]) > trans[row[0]]["end"]: 33 | trans[row[0]]["end"] = int(row[2]) 34 | for ref in trans.values(): 35 | for pre in pres: 36 | if ((pre.start >= ref["start"]) and ( 37 | pre.end <= ref["end"])) or ( 38 | (pre.start <= ref["start"]) and ( 39 | pre.end >= ref["end"])) or ( 40 | (pre.start >= ref["start"]) and ( 41 | pre.start <= ref["end"]) and ( 42 | pre.end >= ref["end"])) or ( 43 | (pre.start <= ref["start"]) and ( 44 | pre.end >= ref["start"]) and ( 45 | pre.end <= ref["end"])): 46 | detect += 1 47 | break 48 | print("the number of published transcripts which can be detected by ANNOgesic:" + str(detect)) 49 | print("total number of transcripts in EcoCyc:" + str(total)) 50 | print("detection rate:" + str(float(detect)/float(total))) 51 | 52 | if __name__ == "__main__": 53 | main() 54 | -------------------------------------------------------------------------------- /Table_dependency_version.txt: -------------------------------------------------------------------------------- 1 | Basic requirement: 2 | Python : version higher or equal to 3.4. 3 | BioPython: version higher or equal to 1.65. 4 | Wget: version higher or equal to 1.17.1. 
5 | Matplotlib : version higher or equal to 1.5.0. 6 | 7 | Annotation transfer: 8 | BioPerl: version higher or equal to 1.6.1. 9 | RATT : version higher or equal to 1.64. 10 | 11 | SNP calling: 12 | Samtools : version higher or equal to 1.3.1 (using htslib 1.3.1). 13 | Bcftools : version higher or equal to 1.3.1 (using htslib 1.3.1). 14 | 15 | TSS and PS prediction: 16 | TSSpredator : version higher or equal to 1.06. 17 | 18 | TSS and PS parameter optimization: 19 | TSSpredator : version higher or equal to 1.06. 20 | 21 | sRNA detection: 22 | Blast+ : version higher or equal to 2.2.28+. 23 | ViennaRNA : version higher or equal to 2.3.2. RNAfold, mountain.pl and relplot.pl are needed for sRNA prediction. 24 | 25 | Terminator detection: 26 | TranstermHP : version higher or equal to 2.09. 27 | ViennaRNA : version higher or equal to 2.3.2. RNAfold is needed for terminator prediction. 28 | 29 | Promoter search: 30 | MEME : version higher or equal to 4.11.1. 31 | GLAM2 : version higher or equal to 4.11.1. 32 | MPICH : version higher or equal to 3.2. It is for parallel version of promoter detection. 33 | 34 | sRNA target prediction: 35 | ViennaRNA : version higher or equal to 2.3.2. 36 | RNAup, RNAplex, RNAplfold are required for executing many modules of ANNOgesic. 37 | IntaRNA: version higher or equal to 2.0.4. 38 | 39 | Circular RNA detection: 40 | Samtools : version higher or equal to 1.3.1 (using htslib 1.3.1). 41 | Segemehl : version higher or equal to 0.1.9. 42 | 43 | Riboswitch and RNA thermometer identification: 44 | Infernal : version higher or equal to 1.1.1. 45 | 46 | CRISPR detection: 47 | CRT: version higher or equal to 1.2. 48 | 49 | Subcellular localization prediction: 50 | Psortb : version higher or equal to 3.0. 51 | 52 | Protein-protein interaction detection: 53 | Networkx : version higher or equal to 1.10. 54 | 55 | Generating screenshots of IGV: 56 | IGV : version higher or equal to 2.3.20. 57 | 58 | Colorization of screenshots: 59 | ImageMagick : version higher or equal to 6.9.0-0. 
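A minimal Python sketch for spot-checking a few of the requirements listed above; it assumes BioPython and Matplotlib are importable and that the command-line tools are on PATH, and it covers only a handful of the entries:

import shutil
import sys

import Bio
import matplotlib

# Interpreter and Python-package versions
print("Python >= 3.4:", sys.version_info >= (3, 4))
print("BioPython:", Bio.__version__)          # expected >= 1.65
print("Matplotlib:", matplotlib.__version__)  # expected >= 1.5.0

# Command-line tools (presence only; versions still need a manual check)
for tool in ("wget", "samtools", "bcftools"):
    print(tool, "->", shutil.which(tool) or "not found")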
60 | -------------------------------------------------------------------------------- /annogesiclib/TSSpredator_parser.py: -------------------------------------------------------------------------------- 1 | import csv 2 | 3 | 4 | class TSSPredatorReader(object): 5 | 6 | def entries(self, input_fh): 7 | for row in csv.reader(input_fh, delimiter="\t"): 8 | if row[0].startswith("SuperPos"): 9 | continue 10 | yield TSSPredatorEntry(row) 11 | 12 | 13 | class TSSPredatorEntry(object): 14 | 15 | def __init__(self, row): 16 | assert(len(row) == 30) 17 | self.super_pos = int(row[0]) 18 | self.super_strand = row[1] 19 | self.map_count = int(row[2]) 20 | self.det_count = int(row[3]) 21 | self.genome = row[4] 22 | self.is_detected = True if row[5] == "1" else False 23 | self.is_enriched = True if row[6] == "1" else False 24 | self.step_heigth = row[7] 25 | self.step_factor = row[8] 26 | self.enrichment_factor = row[9] 27 | self.class_count = int(row[10]) 28 | self.pos = int(row[11]) 29 | self.strand = row[12] 30 | self.locus_tag = row[13] 31 | self.srna_asrna = row[14] 32 | self.product = row[15] 33 | self.utr_length = row[16] 34 | self.gene_length = row[17] 35 | self.is_primary = True if row[18] == "1" else False 36 | self.is_secondary = True if row[19] == "1" else False 37 | self.is_internal = True if row[20] == "1" else False 38 | self.is_antisense = True if row[21] == "1" else False 39 | self.is_automated = True if row[22] == "1" else False 40 | self.is_manual = True if row[23] == "1" else False 41 | self.is_putative_srna = True if row[24] == "1" else False 42 | self.is_putative_asrna = True if row[25] == "1" else False 43 | self.comment = row[26] 44 | self.seq = row[27] 45 | self.contig_pos = row[28] 46 | self.contig_id = row[29] 47 | self.is_orphan = False 48 | if (self.is_primary is False and self.is_secondary is False and 49 | self.is_internal is False and self.is_antisense is False): 50 | self.is_orphan = True 51 | 52 | def __str__(self): 53 | return "%s %s %s" % (self.super_pos, self.super_strand, self.genome) 54 | -------------------------------------------------------------------------------- /annogesiclib/plot_coverage_table.py: -------------------------------------------------------------------------------- 1 | from copy import deepcopy 2 | import matplotlib as mpl 3 | mpl.use('Agg') 4 | import matplotlib.pyplot as plt 5 | plt.style.use('ggplot') 6 | 7 | 8 | def fig(rowlabels, collabels, cells, filename, max_color, min_color): 9 | row_num = len(rowlabels) / 100 10 | if row_num == 0: 11 | row_num = 1 12 | col_num = len(collabels) / 8 13 | if col_num == 0: 14 | col_num = 1 15 | plt.figure(figsize=(18*col_num, 10*row_num), edgecolor=None) 16 | img = plt.imshow(cells, interpolation='none', aspect='auto', cmap="RdBu_r") 17 | plt.xticks(range(len(collabels)), collabels, fontsize=6) 18 | plt.yticks(range(len(rowlabels)), rowlabels, fontsize=6) 19 | plt.colorbar(fraction=0.046, pad=0.04) 20 | img.set_clim(vmin=min_color, vmax=max_color) 21 | plt.savefig(filename) 22 | 23 | 24 | def plot_table(plots, max_color, min_color, filename): 25 | rowlabels = [] 26 | collabels = [] 27 | cells = [] 28 | first = True 29 | t_num = 0 30 | for plot in plots: 31 | for key, value in plot.items(): 32 | rowlabels.append(key) 33 | cell = [] 34 | for cond, tracks in value.items(): 35 | for track, cover in tracks.items(): 36 | if first: 37 | name = track 38 | if len(track) > 16: 39 | diff = int(len(name) / 16) 40 | for i in range(diff): 41 | name = (name[:(16)*(i+1)+i] + "\n" + 42 | name[(16)*(i+1)+i:]) 43 | 
collabels.append(name) 44 | cell.append(round(cover, 1)) 45 | cells.append(deepcopy(cell)) 46 | first = False 47 | if len(rowlabels) >= 500: 48 | plotname = (filename[:-4] + "_" + str(t_num) + "-" + 49 | str(t_num + 500) + ".png") 50 | fig(rowlabels, collabels, cells, plotname, max_color, min_color) 51 | t_num = t_num + 500 52 | rowlabels = [] 53 | cells = [] 54 | if t_num == 0: 55 | fig(rowlabels, collabels, cells, filename, max_color, min_color) 56 | -------------------------------------------------------------------------------- /comparison/compare_term_regulon.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | import os 4 | import sys 5 | import csv 6 | import argparse 7 | from gff3 import Gff3Parser 8 | 9 | __author__ = "Sung-Huan Yu " 10 | __email__ = "sung-huan.yu@uni-wuerzburg.de" 11 | 12 | parser = argparse.ArgumentParser() 13 | parser.add_argument("-k","--regulondb_file",help="terminators in RegulonDB") 14 | parser.add_argument("-p","--predict_file",help="ANNOgesic predicted terminators") 15 | args = parser.parse_args() 16 | 17 | def main(): 18 | terms = [] 19 | detect = 0 20 | total = 0 21 | fh = open(args.regulondb_file, "r") 22 | for row in csv.reader(fh, delimiter='\t'): 23 | if row[3] == "forward": 24 | strand = "+" 25 | else: 26 | strand = "-" 27 | total += 1 28 | terms.append({"id": row[0], "start": int(row[1]), 29 | "end": int(row[2]), "strand": strand}) 30 | if row[3] == "both": 31 | terms.append({"id": row[0], "start": int(row[1]), 32 | "end": int(row[2]), "strand": "+"}) 33 | total += 1 34 | for pre in Gff3Parser().entries(open(args.predict_file)): 35 | for ref in terms: 36 | if pre.strand == ref["strand"]: 37 | if ((pre.start >= ref["start"]) and ( 38 | pre.end <= ref["end"])) or ( 39 | (pre.start <= ref["start"]) and ( 40 | pre.end >= ref["end"])) or ( 41 | (pre.start >= ref["start"]) and ( 42 | pre.start <= ref["end"]) and ( 43 | pre.end >= ref["end"])) or ( 44 | (pre.start <= ref["start"]) and ( 45 | pre.end >= ref["start"]) and ( 46 | pre.end <= ref["end"])): 47 | detect += 1 48 | break 49 | print("the number of published terminators which can be detected by ANNOgesic:" + str(detect)) 50 | print("total number of terminators in RegulonDB:" + str(total)) 51 | print("detection rate:" + str(float(detect)/float(total))) 52 | 53 | if __name__ == "__main__": 54 | main() 55 | -------------------------------------------------------------------------------- /annogesiclib/sRNA_filter_frag.py: -------------------------------------------------------------------------------- 1 | import os 2 | import csv 3 | import shutil 4 | from annogesiclib.gff3 import Gff3Parser 5 | 6 | 7 | def filter_frag(srna_table, srna_gff): 8 | out = open("tmp_srna.gff", "w") 9 | out_ta = open("tmp_srna.csv", "w") 10 | out.write("##gff-version 3\n") 11 | gffs = [] 12 | tables = [] 13 | gff_parser = Gff3Parser() 14 | g_f = open(srna_gff, "r") 15 | for entry in gff_parser.entries(g_f): 16 | gffs.append(entry) 17 | fh = open(srna_table, "r") 18 | for row in csv.reader(fh, delimiter='\t'): 19 | tables.append(row) 20 | new_gffs = [] 21 | for gff in gffs: 22 | if ("UTR_type" in gff.attributes.keys()): 23 | if ("5utr" in gff.attributes["UTR_type"]) or ( 24 | "interCDS" in gff.attributes["UTR_type"]): 25 | for table in tables: 26 | if (gff.seq_id == table[0]) and ( 27 | gff.start == int(table[2])) and ( 28 | gff.end == int(table[3])) and ( 29 | gff.strand == table[4]): 30 | if "frag" in table[5]: 31 | new_gffs.append(gff) 32 | elif "3utr" in 
gff.attributes["UTR_type"]: 33 | new_gffs.append(gff) 34 | else: 35 | new_gffs.append(gff) 36 | new_tables = [] 37 | for table in tables: 38 | for gff in new_gffs: 39 | if (gff.seq_id == table[0]) and ( 40 | gff.start == int(table[2])) and ( 41 | gff.end == int(table[3])) and ( 42 | gff.strand == table[4]): 43 | new_tables.append(table) 44 | out_ta.write("\t".join(table) + "\n") 45 | for gff in new_gffs: 46 | for table in new_tables: 47 | if (gff.seq_id == table[0]) and ( 48 | gff.start == int(table[2])) and ( 49 | gff.end == int(table[3])) and ( 50 | gff.strand == table[4]): 51 | out.write(gff.info + "\n") 52 | g_f.close() 53 | fh.close() 54 | out.close() 55 | out_ta.close() 56 | os.remove(srna_gff) 57 | os.remove(srna_table) 58 | shutil.move("tmp_srna.gff", srna_gff) 59 | shutil.move("tmp_srna.csv", srna_table) 60 | -------------------------------------------------------------------------------- /annogesiclib/modify_rbs_table.py: -------------------------------------------------------------------------------- 1 | import shutil 2 | import csv 3 | 4 | 5 | def import_data(row): 6 | return{"strain": row[1], "strand": row[2], 7 | "associate": row[3], "start_seq": int(row[4]), 8 | "end_seq": int(row[5]), "rfam": row[6], "e": row[7], 9 | "score": row[8], 10 | "start_align": int(row[9]), "end_align": int(row[10]), 11 | "info": row[0:6], "ID": row[0]} 12 | 13 | 14 | def modify_table(table, output_all): 15 | first = True 16 | rbss = [] 17 | out = open("tmp.csv", "w") 18 | out.write("#ID\tGenome\tStrand\tAssociated_CDS\tStart_genome\t" 19 | "End_genome\tRfam\tE_value\tScore\tStart_align\tEnd_align\n") 20 | if output_all: 21 | with open(table) as fh: 22 | for line in fh: 23 | line = line.strip() 24 | if first: 25 | first = False 26 | rbss.append(line) 27 | out.write(line + "\n") 28 | else: 29 | if line not in rbss: 30 | rbss.append(line) 31 | out.write(line + "\n") 32 | else: 33 | fh = open(table, "r") 34 | for row in csv.reader(fh, delimiter='\t'): 35 | rbss.append(import_data(row)) 36 | ids = [] 37 | for rbs1 in rbss: 38 | repeat = False 39 | if "print" not in rbs1.keys(): 40 | rbs1["print"] = True 41 | for rbs2 in rbss: 42 | if (rbs1["strain"] == rbs2["strain"]) and \ 43 | (rbs1["strand"] == rbs2["strand"]) and \ 44 | (rbs1["ID"] == rbs2["ID"]): 45 | if "print" not in rbs2.keys(): 46 | rbs2["print"] = True 47 | repeat = True 48 | if (not repeat) or (rbs1["ID"] not in ids): 49 | ids.append(rbs1["ID"]) 50 | out.write("\t".join(rbs1["info"] + [rbs1["rfam"], 51 | rbs1["e"], rbs1["score"], 52 | str(rbs1["start_align"]), 53 | str(rbs1["end_align"])]) + "\n") 54 | fh.close() 55 | out.close() 56 | shutil.move("tmp.csv", table) 57 | -------------------------------------------------------------------------------- /comparison/compare_TSS_Salgado.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | import os 4 | import sys 5 | import csv 6 | import argparse 7 | import math 8 | from gff3 import Gff3Parser 9 | 10 | __author__ = "Sung-Huan Yu " 11 | __email__ = "sung-huan.yu@uni-wuerzburg.de" 12 | 13 | parser = argparse.ArgumentParser() 14 | parser.add_argument("-k","--regulondb_file",help="TSS file of Salgado et. 
al in RegulonDB")
15 | parser.add_argument("-p","--predict_file",help="ANNOgesic predicted TSS file")
16 | parser.add_argument("-f","--fuzzy", type=int, help="tolerance of nts for comparison")
17 | args = parser.parse_args()
18 | 
19 | def main():
20 |     pros = {}
21 |     tsss = []
22 |     total = 0
23 |     detect = 0
24 |     for entry in Gff3Parser().entries(open(args.predict_file)):
25 |         tsss.append(entry)
26 |     refs = []
27 |     fh = open(args.regulondb_file, "r")
28 |     for row in csv.reader(fh, delimiter='\t'):
29 |         if not row[0].startswith("#"):
30 |             if row[5] == "forward":
31 |                 strand = "+"
32 |             else:
33 |                 strand = "-"
34 |             total += 1
35 |             refs.append({"start": int(row[0]),
36 |                          "end": int(row[1]), "strand": strand})
37 |     for ref in refs:
38 |         ref["start"] = ref["start"] - args.fuzzy
39 |         ref["end"] = ref["end"] + args.fuzzy
40 |         for pre in tsss:
41 |             if pre.strand == ref["strand"]:
42 |                 if ((pre.start >= ref["start"]) and (
43 |                         pre.end <= ref["end"])) or (
44 |                         (pre.start <= ref["start"]) and (
45 |                         pre.end >= ref["end"])) or (
46 |                         (pre.start >= ref["start"]) and (
47 |                         pre.start <= ref["end"]) and (
48 |                         pre.end >= ref["end"])) or (
49 |                         (pre.start <= ref["start"]) and (
50 |                         pre.end >= ref["start"]) and (
51 |                         pre.end <= ref["end"])):
52 |                     detect += 1
53 |                     break
54 |     print("the number of reported TSSs which can be detected by ANNOgesic:" + str(detect))
55 |     print("total number of TSSs from Salgado et. al in RegulonDB:" + str(total))
56 |     print("detection rate:" + str(float(detect)/float(total)))
57 | 
58 | if __name__ == "__main__":
59 |     main()
60 | 
--------------------------------------------------------------------------------
/annogesiclib/parser_wig.py:
--------------------------------------------------------------------------------
1 | class WigParser(object):
2 |     '''Parse the wiggle file into entries based on
3 |     strain, track, position and coverage.'''
4 | 
5 |     def parser(self, wig_fh, strand):
6 |         track = ""
7 |         strain = ""
8 |         for line in wig_fh.readlines():
9 |             line = line.strip()
10 |             if (len(line) != 0) and (not line.startswith("#")):
11 |                 datas = line.split(" ")
12 |                 if (datas[0] == "variableStep") or (datas[0] == "fixedStep"):
13 |                     strain = datas[1].split("=")
14 |                     strain = strain[1].strip()
15 |                     pre_pos = 0
16 |                     first = True
17 |                 if (datas[0] == "track"):
18 |                     track = datas[2].split("=")
19 |                     track = track[1].replace("\"", "")
20 |                     pre_pos = 0
21 |                     first = True
22 |                 if (datas[0] != "track") and (
23 |                         datas[0] != "variableStep") and (
24 |                         datas[0] != "fixedStep"):
25 |                     if len(datas) != 2:
26 |                         datas = line.split("\t")
27 |                     if int(datas[0]) - 1 != pre_pos:
28 |                         for pos in range(pre_pos + 1, int(datas[0])):
29 |                             yield AssignValue(pos, 0, strand, strain, track)
30 |                         pre_pos = int(datas[0])
31 |                         first = True
32 |                     if (int(datas[0]) - 1 == pre_pos) or (first):
33 |                         pre_pos = int(datas[0])
34 |                         first = False
35 |                         yield AssignValue(datas[0], datas[1],
36 |                                           strand, strain, track)
37 | 
38 | 
39 | class AssignValue(object):
40 | 
41 |     def __init__(self, pos, coverage, strand, strain, track):
42 |         self.pos = int(pos)
43 |         if strand == "+":
44 |             self.coverage = float(coverage)
45 |         else:
46 |             if float(coverage) < 0:
47 |                 self.coverage = -1 * float(coverage)
48 |             else:
49 |                 self.coverage = float(coverage)
50 |         self.strand = strand
51 |         self.strain = strain
52 |         self.track = track
53 | 
54 |     def __str__(self):
55 |         return "{0} {1} {2} {3} {4}".format(
56 |             self.pos, self.coverage, self.strand, self.strain, self.track)
57 | 
--------------------------------------------------------------------------------
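A short usage sketch for WigParser (modelled on tests/test_parser_wig.py shown earlier; the wiggle content, chromosome name and track name below are invented for illustration). parser() yields one AssignValue per position, fills gaps between reported positions with zero-coverage entries, and turns negative reverse-strand coverages into positive values:

from io import StringIO
from annogesiclib.parser_wig import WigParser

demo_wig = StringIO('track type=wiggle_0 name="demo_forward"\n'
                    'variableStep chrom=aaa span=1\n'
                    '3 1.5\n'
                    '5 2.0\n')
for entry in WigParser().parser(demo_wig, "+"):
    # positions 1, 2 and 4 come out with coverage 0.0,
    # positions 3 and 5 carry the values from the file
    print(entry)  # e.g. "3 1.5 + aaa demo_forward"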
/annogesiclib/gen_promoter_table.py: -------------------------------------------------------------------------------- 1 | from annogesiclib.gff3 import Gff3Parser 2 | 3 | 4 | def gen_promoter_table(input_file, output_file, tss_file, type_): 5 | '''generate the table of promoter based on MEME''' 6 | tsss = [] 7 | gff_f = open(tss_file, "r") 8 | for entry in Gff3Parser().entries(gff_f): 9 | tsss.append(entry) 10 | out = open(output_file, "w") 11 | out.write("\t".join(["Genome", "TSS_position", 12 | "TSS_strand", "Motif"]) + "\n") 13 | detect = False 14 | num = 1 15 | with open(input_file) as fh: 16 | for line in fh: 17 | line = line.strip() 18 | if type_ == "meme": 19 | if line.startswith("MOTIF"): 20 | motif = line.split("MEME")[0].strip() 21 | datas = motif.split(" ") 22 | motif = datas[0] + "_" + datas[-1] 23 | detect = False 24 | elif (line.startswith("Sequence name")) and ( 25 | line.endswith("Site")): 26 | detect = True 27 | elif (len(line) == 0): 28 | detect = False 29 | elif (detect) and (not line.startswith("---")): 30 | tag = line.split(" ")[0] 31 | datas = tag.split("_") 32 | for tss in tsss: 33 | if ("_".join(datas[2:]) in tss.seq_id) and ( 34 | datas[0] == str(tss.start)) and ( 35 | datas[1] == tss.strand): 36 | out.write("\t".join([tss.seq_id, datas[0], 37 | datas[1], motif]) + "\n") 38 | elif type_ == "glam2": 39 | if line.startswith("*"): 40 | detect = True 41 | motif = "MOTIF_" + str(num) 42 | num += 1 43 | elif len(line) == 0: 44 | detect = False 45 | elif detect: 46 | datas = line.split(" ")[0].split("_") 47 | for tss in tsss: 48 | if ("_".join(datas[2:]) in tss.seq_id) and ( 49 | datas[0] == str(tss.start)) and ( 50 | datas[1] == tss.strand): 51 | out.write("\t".join([tss.seq_id, datas[0], 52 | datas[1], motif]) + "\n") 53 | -------------------------------------------------------------------------------- /comparison/compare_operon_regulondb.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | import os 4 | import sys 5 | import csv 6 | import argparse 7 | 8 | __author__ = "Sung-Huan Yu " 9 | __email__ = "sung-huan.yu@uni-wuerzburg.de" 10 | 11 | parser = argparse.ArgumentParser() 12 | parser.add_argument("-d","--regulondb_file",help="RegulonDB file") 13 | parser.add_argument("-p","--predict_file",help="ANNOgesic operon table") 14 | args = parser.parse_args() 15 | 16 | def main(): 17 | pre_op = "" 18 | operons = [] 19 | nums = {"detect": 0, "total": 0} 20 | fh = open(args.regulondb_file, "r") 21 | for row in csv.reader(fh, delimiter='\t'): 22 | if (not row[0].startswith("#")) and (row[-1] != "Weak"): 23 | nums["total"] += 1 24 | if row[3] == "forward": 25 | row[3] = "+" 26 | else: 27 | row[3] = "-" 28 | operons.append({"start": int(row[1]), "end": int(row[2]), "strand": row[3]}) 29 | sh = open(args.predict_file, "r") 30 | uniqs = [] 31 | for row in csv.reader(sh, delimiter='\t'): 32 | if row[0] != "Operon_ID": 33 | start = int(row[2].split("-")[0]) 34 | end = int(row[2].split("-")[-1]) 35 | for operon in operons: 36 | if operon["strand"] == row[3]: 37 | if ((operon["start"] <= start) and ( 38 | operon["end"] >= end)) or ( 39 | (operon["start"] >= start) and ( 40 | operon["end"] >= end)) or ( 41 | (operon["start"] >= start) and ( 42 | operon["start"] <= end) and ( 43 | operon["end"] >= end)) or ( 44 | (operon["start"] <= start) and ( 45 | operon["end"] >= start) and ( 46 | operon["end"] <= end)): 47 | if operon not in uniqs : 48 | nums["detect"] += 1 49 | uniqs.append(operon) 50 | operon["detect"] = True 51 | 
break 52 | pre_op = {"start": start, "end": end, "strand": row[3]} 53 | print("detected operons by ANNOgesic:" + str(nums["detect"])) 54 | print("total operon in RegulonDB:" + str(nums["total"])) 55 | print("detection rate:" + str(float(nums["detect"]/nums["total"]))) 56 | 57 | if __name__ == "__main__": 58 | main() 59 | -------------------------------------------------------------------------------- /annogesiclib/output_cutoff_table.py: -------------------------------------------------------------------------------- 1 | import os 2 | import csv 3 | from annogesiclib.gff3 import Gff3Parser 4 | 5 | 6 | def output_coverage(table_file, gff_file, cutoff_cover, stat_file, out_folder): 7 | out = open(os.path.join(out_folder, "tmp_srna_table"), "w") 8 | out_g = open(os.path.join(out_folder, "tmp_srna_gff"), "w") 9 | out.write("\t".join([ 10 | "Rank", "Genome", "Name", "Start", "End", "Strand", 11 | "Start_with_TSS/Cleavage_site", "End_with_cleavage", "Candidates", 12 | "Lib_type", "Best_avg_coverage", "Best_highest_coverage", 13 | "Best_lower_coverage", "Track/Coverage", 14 | "Normalized_secondary_energy_change(by_length)", 15 | "UTR_derived/Intergenic", "Confliction_of_sORF", "nr_hit_number", 16 | "sRNA_hit_number", "nr_hit_top3|ID|e-value", "sRNA_hit|e-value", 17 | "Overlap_CDS", "Overlap_percent", "End_with_terminator"]) + "\n") 18 | out_g.write("##gff-version 3\n") 19 | stat_out = open(stat_file, "w") 20 | nums = {5: 0} 21 | for i in range(10, 100, 10): 22 | nums[i] = 0 23 | for i in range(100, 1000, 100): 24 | nums[i] = 0 25 | for i in range(1000, 5000, 500): 26 | nums[i] = 0 27 | gffs = [] 28 | gh = open(gff_file, "r") 29 | for entry in Gff3Parser().entries(gh): 30 | gffs.append(entry) 31 | fh = open(table_file, "r") 32 | rank = 1 33 | new_gffs = [] 34 | for row in csv.reader(fh, delimiter='\t'): 35 | if row[0] != "rank": 36 | for cutoff in nums.keys(): 37 | if float(row[10]) >= cutoff: 38 | nums[cutoff] += 1 39 | if float(row[10]) >= cutoff_cover: 40 | row[0] = str(rank) 41 | out.write("\t".join(row) + "\n") 42 | rank += 1 43 | for gff in gffs: 44 | if (row[1] == gff.seq_id) and ( 45 | row[3] == str(gff.start)) and ( 46 | row[4] == str(gff.end)) and ( 47 | row[5] == gff.strand): 48 | new_gffs.append(gff) 49 | sort_gffs = sorted(new_gffs, key=lambda k: (k.seq_id, k.start, 50 | k.end, k.strand)) 51 | for gff in sort_gffs: 52 | out_g.write(gff.info + "\n") 53 | coverlist = sorted(nums, key=lambda key: nums[key]) 54 | stat_out.write("coverage\tfrequency\n") 55 | for cover in coverlist: 56 | stat_out.write("\t".join([str(cover), str(nums[cover])]) + "\n") 57 | -------------------------------------------------------------------------------- /tests/test_gff3.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | import os 4 | import sys 5 | import csv 6 | import shutil 7 | import unittest 8 | from io import StringIO 9 | sys.path.append(".") 10 | from annogesiclib.gff3 import Gff3Parser 11 | 12 | 13 | class TestGff3Parser(unittest.TestCase): 14 | 15 | def setUp(self): 16 | self.example = Example() 17 | self.gff_parser = Gff3Parser() 18 | self.test_folder = "test_folder" 19 | if (not os.path.exists(self.test_folder)): 20 | os.mkdir(self.test_folder) 21 | 22 | def tearDown(self): 23 | if os.path.exists(self.test_folder): 24 | shutil.rmtree(self.test_folder) 25 | 26 | def test_gff_parser(self): 27 | strains = [] 28 | features = [] 29 | starts = [] 30 | ends = [] 31 | strands = [] 32 | IDs = [] 33 | fh = StringIO(self.example.gff_file) 34 | 
for entry in self.gff_parser.entries(fh): 35 | strains.append(entry.seq_id) 36 | features.append(entry.feature) 37 | starts.append(entry.start) 38 | ends.append(entry.end) 39 | strands.append(entry.strand) 40 | IDs.append(entry.attributes["ID"]) 41 | self.assertListEqual(strains, ["aaa", "aaa", "aaa", 42 | "aaa", "bbb", "bbb"]) 43 | self.assertListEqual(features, ["gene", "CDS", "gene", 44 | "CDS", "gene", "tRNA"]) 45 | self.assertListEqual(starts, [517, 517, 2156, 2156, 4444, 4444]) 46 | self.assertListEqual(ends, [1878, 1878, 3289, 3289, 5444, 5444]) 47 | self.assertListEqual(strands, ["+", "+", "-", "-", "+", "+"]) 48 | self.assertListEqual(IDs, ["gene0", "cds0", "gene1", 49 | "cds1", "gene2", "rna0"]) 50 | 51 | class Example(object): 52 | 53 | gff_file = """#gff3 54 | aaa Refseq gene 517 1878 . + . Name=dnaA;locus_tag=AAA_00001;gene=dnaA;ID=gene0;db_xref=GeneID:3919798 55 | aaa Refseq CDS 517 1878 . + . protein_id=YP_498609.1;ID=cds0;Name=YP_498609.1;product=chromosomal replication initiation protein;Parent=gene0 56 | aaa Refseq gene 2156 3289 . - . Name=AAA_00002;locus_tag=AAA_00002;ID=gene1;db_xref=GeneID:3919799 57 | aaa Refseq CDS 2156 3289 . - . protein_id=YP_498610.1;ID=cds1;Name=YP_498610.1;locus_tag=AAA_00002 58 | bbb Refseq gene 4444 5444 . + . Name=AAA_T00004;locus_tag=AAA_T00004;ID=gene2 59 | bbb Refseq tRNA 4444 5444 . + . Name=AAA_T00018;locus_tag=AAA_T00004;ID=rna0""" 60 | 61 | if __name__ == "__main__": 62 | unittest.main() 63 | 64 | -------------------------------------------------------------------------------- /tests/test_gen_table_tran.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import unittest 4 | import shutil 5 | from io import StringIO 6 | sys.path.append(".") 7 | from mock_gff3 import Create_generator 8 | import annogesiclib.gen_table_tran as gtt 9 | 10 | 11 | class Mock_func(object): 12 | 13 | def __init__(self): 14 | self.example = Example() 15 | 16 | class TestGenTableTran(unittest.TestCase): 17 | 18 | def setUp(self): 19 | self.example = Example() 20 | self.test_folder = "test_folder" 21 | if (not os.path.exists(self.test_folder)): 22 | os.mkdir(self.test_folder) 23 | 24 | def tearDown(self): 25 | if os.path.exists(self.test_folder): 26 | shutil.rmtree(self.test_folder) 27 | 28 | def test_detect_coverage(self): 29 | infos = {} 30 | gtt.detect_coverage(self.example.wigs, self.example.tas[0], infos) 31 | self.assertDictEqual(infos, {'track_1': { 32 | 'high': 100, 'low': 2, 'avg': 33.529411764705884}}) 33 | 34 | def test_print_coverage(self): 35 | out = StringIO() 36 | out_gff = StringIO() 37 | gtt.print_coverage(self.example.tas, out, out_gff, 38 | self.example.wigs, self.example.wigs, None) 39 | self.assertEqual( 40 | out.getvalue(), 41 | "aaa\tTranscript_0\t4\t20\t+\tfragmented&TEX+/-\tNA\tNA\tNA\ttrack_1(33.529411764705884)\n") 42 | self.assertListEqual(out_gff.getvalue().split("\t")[:-1], 43 | ["aaa", "ANNOgesic", "Transcript", 44 | "4", "20", ".", "+", "."]) 45 | self.assertEqual( 46 | set(out_gff.getvalue().split("\t")[-1].strip().split(";")), 47 | set(["Name=Transcript_0", "detect_lib=fragmented&TEX+/-", 48 | "best_avg_coverage=33.529411764705884", "ID=tran0"])) 49 | class Example(object): 50 | wigs = {"aaa": {"frag_1": {"track_1|+|frag": [ 51 | 100, 30, 23, 21, 21, 2, 100, 30, 23, 21, 21, 2, 52 | 100, 30, 23, 21, 21, 2, 100, 30, 23, 21, 21, 2]}}} 53 | ta_dict = [{"seq_id": "aaa", "source": "ANNOgesic", 54 | "feature": "Transcript", "start": 4, 55 | "end": 20, "phase": ".", 
"strand": "+", "score": "."}] 56 | attributes_tas = [{"ID": "tran0", "Name": "Transcript_0", 57 | "detect_lib": "fragmented&tex_notex"}] 58 | tas = [] 59 | for index in range(0, 1): 60 | tas.append(Create_generator(ta_dict[index], 61 | attributes_tas[index], "gff")) 62 | if __name__ == "__main__": 63 | unittest.main() 64 | -------------------------------------------------------------------------------- /tests/test_seq_editer.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import unittest 4 | import shutil 5 | from io import StringIO 6 | sys.path.append(".") 7 | from mock_helper import gen_file, import_data 8 | from annogesiclib.seq_editer import SeqEditer 9 | 10 | 11 | class TestSeqEditer(unittest.TestCase): 12 | 13 | def setUp(self): 14 | self.example = Example() 15 | self.test_folder = "test_folder" 16 | self.fasta = os.path.join(self.test_folder, "fasta") 17 | if (not os.path.exists(self.test_folder)): 18 | os.mkdir(self.test_folder) 19 | os.mkdir(self.fasta) 20 | self.seq = SeqEditer() 21 | 22 | def tearDown(self): 23 | if os.path.exists(self.test_folder): 24 | shutil.rmtree(self.test_folder) 25 | 26 | def test_import_data(self): 27 | mod_table = os.path.join(self.test_folder, "mod") 28 | gen_file(mod_table, self.example.mutation) 29 | datas = self.seq._import_data(mod_table, "test") 30 | self.assertListEqual(datas, [{'target_id': 'test_NC_000915.1', 31 | 'datas': [{'ref_nt': 'c', 'tar_nt': '', 'position': '3'}, 32 | {'ref_nt': '-', 'tar_nt': 'deletion', 'position': '6'}], 'ref_id': 'NC_000915.1'}]) 33 | 34 | def test_modify_seq(self): 35 | mod_table = os.path.join(self.test_folder, "mod") 36 | gen_file(mod_table, self.example.mutation) 37 | gen_file(os.path.join(self.fasta, "NC_000915.1.fa"), 38 | self.example.fasta) 39 | self.seq.modify_seq(self.fasta, mod_table, self.test_folder, "test") 40 | datas = import_data(os.path.join(self.test_folder, "test_NC_000915.1.fa")) 41 | self.assertEqual("\n".join(datas), self.example.out_1) 42 | 43 | def test_modify_header(self): 44 | input_file = os.path.join(self.test_folder, "test.fa") 45 | gen_file(input_file, ">AAA|BBB|CCC|DDD|EEE\nACATACAAGTACAGTT") 46 | self.seq.modify_header(input_file) 47 | datas = import_data(input_file) 48 | self.assertEqual("\n".join(datas), ">DDD\nACATACAAGTACAGTT") 49 | 50 | 51 | class Example(object): 52 | 53 | fasta = """>NC_000915.1 54 | ATAGATAACCCAAGTACGACTCAGGTCCCTCACA""" 55 | out_1 = """>test_NC_000915.1 56 | ATGATdeletionAACCCAAGTACGACTCAGGTCCCTCACA""" 57 | out_2 = """>test_case2 58 | ATAGAgTAACCCAAGTACGACTCAGGTCCCTCACA""" 59 | mutation = """#refernce_id target_id reference_nt position target_nt impact of correction locus tag gene Description 60 | NC_000915.1 3 a c SAOUHSC_00002 dnaA XXXXXX 61 | NC_000915.1 6 a - deletion YYYYYY""" 62 | 63 | if __name__ == "__main__": 64 | unittest.main() 65 | -------------------------------------------------------------------------------- /tests/test_color_png.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import unittest 4 | import shutil 5 | from io import StringIO 6 | sys.path.append(".") 7 | from mock_helper import gen_file, import_data 8 | from annogesiclib.color_png import ColorPNG 9 | 10 | 11 | class Mock_func(object): 12 | def __init__(self): 13 | self.color = ColorPNG() 14 | 15 | def mock_convert_svg(self, imagemagick_path, out_path, 16 | screenshot, svg_file, log): 17 | gen_file(os.path.join(out_path, svg_file), 18 | "= 
ref["start"]) and ( 37 | # pre.end <= ref["end"])) or ( 38 | # (pre.start <= ref["start"]) and ( 39 | # pre.end >= ref["end"])) or ( 40 | # (pre.start >= ref["start"]) and ( 41 | # pre.start <= ref["end"]) and ( 42 | # pre.end >= ref["end"])) or ( 43 | # (pre.start <= ref["start"]) and ( 44 | # pre.end >= ref["start"]) and ( 45 | # pre.end <= ref["end"])): 46 | if pre.strand == ref.strand: 47 | if ((pre.start >= ref.start) and ( 48 | pre.end <= ref.end)) or ( 49 | (pre.start <= ref.start) and ( 50 | pre.end >= ref.end)) or ( 51 | (pre.start >= ref.start) and ( 52 | pre.start <= ref.end) and ( 53 | pre.end >= ref.end)) or ( 54 | (pre.start <= ref.start) and ( 55 | pre.end >= ref.start) and ( 56 | pre.end <= ref.end)): 57 | detect += 1 58 | break 59 | print("the number of published sRNAs which can be detected by ANNOgesic:" + str(detect)) 60 | print("total number of sRNAs in RefSeq:" + str(num_ref)) 61 | print("detection rate:" + str(float(detect) / float(num_ref))) 62 | 63 | if __name__ == "__main__": 64 | main() 65 | -------------------------------------------------------------------------------- /tests/test_filter_TSS_pro.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | import unittest 4 | import os 5 | import sys 6 | import shutil 7 | sys.path.append(".") 8 | from io import StringIO 9 | from mock_gff3 import Create_generator 10 | import annogesiclib.filter_TSS_pro as ftp 11 | 12 | 13 | class TestFilterTSSPro(unittest.TestCase): 14 | 15 | def setUp(self): 16 | self.test_folder = "test_project" 17 | if os.path.exists(self.test_folder): 18 | shutil.rmtree(self.test_folder) 19 | os.mkdir(self.test_folder) 20 | self.example = Example() 21 | 22 | def tearDown(self): 23 | if os.path.exists(self.test_folder): 24 | shutil.rmtree(self.test_folder) 25 | 26 | def test_compare_tss_pro(self): 27 | out = StringIO() 28 | ftp.compare_tss_pro(self.example.tars, self.example.refs, out, 3) 29 | self.assertEqual("\t".join(out.getvalue().split("\t")[0:-1]), 30 | "aaa\tRefseq\tTSS\t24\t24\t.\t+\t.") 31 | 32 | 33 | class Example(object): 34 | tar_dict = [ 35 | {"seq_id": "aaa", "source": "Refseq", "feature": "TSS", "start": 3, 36 | "end": 3, "phase": ".", "strand": "+", "score": "."}, 37 | {"seq_id": "aaa", "source": "Refseq", "feature": "TSS", "start": 24, 38 | "end": 24, "phase": ".", "strand": "+", "score": "."}, 39 | {"seq_id": "aaa", "source": "Refseq", "feature": "TSS", "start": 1243, 40 | "end": 1243, "phase": ".", "strand": "+", "score": "."}] 41 | attributes_tar = [{"coverage": "3", "ID": "tss1", "Name": "TSS:3_+"}, 42 | {"coverage": "340", "ID": "tss2", "Name": "TSS:24_+"}, 43 | {"coverage": "4440", "ID": "tss3", "Name": "TSS:1243_+"}] 44 | ref_dict = [ 45 | {"seq_id": "aaa", "source": "Refseq", "feature": "Pro", "start": 3, 46 | "end": 3, "phase": ".", "strand": "+", "score": "."}, 47 | {"seq_id": "aaa", "source": "Refseq", "feature": "Pro", "start": 333, 48 | "end": 333, "phase": ".", "strand": "+", "score": "."}, 49 | {"seq_id": "aaa", "source": "Refseq", "feature": "Pro", "start": 1242, 50 | "end": 1242, "phase": ".", "strand": "+", "score": "."}] 51 | attributes_ref = [{"coverage": "3", "ID": "pro1", "Name": "Pro:3_+"}, 52 | {"coverage": "330", "ID": "pro2", "Name": "Pro:333_+"}, 53 | {"coverage": "1230", "ID": "pro3", "Name": "Pro:1242_+"}] 54 | tars = [] 55 | refs = [] 56 | for index in range(0, 3): 57 | tars.append(Create_generator(tar_dict[index], 58 | attributes_tar[index], "gff")) 59 | 
refs.append(Create_generator(ref_dict[index], 60 | attributes_ref[index], "gff")) 61 | if __name__ == "__main__": 62 | unittest.main() 63 | -------------------------------------------------------------------------------- /tests/test_blast_class.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import unittest 4 | import shutil 5 | from io import StringIO 6 | sys.path.append(".") 7 | import annogesiclib.blast_class as blast_class 8 | 9 | class Mock_func(object): 10 | 11 | def mock_read_file(self, blast_file, nums): 12 | nums['total']['dnaA'] = 2 13 | nums['aaa'] = {} 14 | nums['aaa']['dnaA'] = 2 15 | 16 | class TestBlastClass(unittest.TestCase): 17 | 18 | def setUp(self): 19 | self.example = Example() 20 | self.test_folder = "test_folder" 21 | if (not os.path.exists(self.test_folder)): 22 | os.mkdir(self.test_folder) 23 | self.blast_file = os.path.join(self.test_folder, "test.csv") 24 | with open(self.blast_file, "w") as rh: 25 | rh.write(self.example.blast) 26 | 27 | def tearDown(self): 28 | if os.path.exists(self.test_folder): 29 | shutil.rmtree(self.test_folder) 30 | 31 | def test_read_file(self): 32 | nums = {} 33 | nums["total"] = {} 34 | blast_class.read_file(self.blast_file, nums) 35 | self.assertDictEqual(nums, {'aaa': {'dnaA': 2}, 'total':{'dnaA': 2}}) 36 | 37 | def test_blast_class(self): 38 | blast_class.read_file = Mock_func().mock_read_file 39 | lines = [] 40 | out_file = os.path.join(self.test_folder, "test.out") 41 | blast_class.blast_class(self.blast_file, out_file) 42 | with open(out_file) as fh: 43 | for line in fh: 44 | line = line.strip() 45 | lines.append(line) 46 | self.assertEqual(set(lines), set(self.example.blast_table.split("\n"))) 47 | 48 | class Example(object): 49 | blast = """1\taaa\tdnaA\t2377296\t2377454\t-\tTSS:2377454_-\tNA\t2377296-2377454\tTEX+/-;Fragmented\t260123.91873361162\t446839.7634471806\t-0.0\tpMEM_t2_TEX_reverse(avg=155022.7050613754;high=266113.8349051722;low=0.6611942741842581)\t-0.2075\tIntergenic\tNA\tNA\t6\tNA\tsrn_4390|S._aureus_NCTC8325|dnaA|3e-55\tNA\tNA\tNA""" 50 | read_out = [{'ID': 'hit_1', 'srna_name': 'dnaA', 'blast_strain': 'strain_b', 51 | 'strain': 'aaa', 'start': '100', 'name': 'RNA_test1', 52 | 'strand': '+', 'end': '200', 'e': '0.0005'}, 53 | {'ID': 'hit_2', 'srna_name': 'dnaa', 'blast_strain': 'strain_c', 54 | 'strain': 'aaa', 'start': '100', 'name': 'RNA_test1', 55 | 'strand': '+', 'end': '200', 'e': '0.0007'}, 56 | {'ID': 'hit_3', 'srna_name': 'dnaC', 'blast_strain': 'strain_b', 57 | 'strain': 'aaa', 'start': '400', 'name': 'RNA_test2', 58 | 'strand': '-', 'end': '450', 'e': '0.000002'}] 59 | 60 | blast_table = """aaa: 61 | sRNA_name amount 62 | dnaA 2""" 63 | 64 | if __name__ == "__main__": 65 | unittest.main() 66 | 67 | -------------------------------------------------------------------------------- /tests/test_goterm.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import unittest 4 | import shutil 5 | from io import StringIO 6 | sys.path.append(".") 7 | from annogesiclib.goterm import GoTermFinding 8 | from mock_args_container import MockClass 9 | 10 | 11 | class TestGetPolyT(unittest.TestCase): 12 | 13 | def setUp(self): 14 | self.mock_args = MockClass() 15 | self.test_folder = "test_folder" 16 | if (not os.path.exists(self.test_folder)): 17 | os.mkdir(self.test_folder) 18 | self.gffs = os.path.join(self.test_folder, "gff_folder") 19 | if (not os.path.exists(self.gffs)): 20 | 
os.mkdir(self.gffs) 21 | self.go_folder = os.path.join(self.test_folder, "go_folder") 22 | if (not os.path.exists(self.go_folder)): 23 | os.mkdir(self.go_folder) 24 | self.all_strain = "all_genomes_uniprot.csv" 25 | self.trans = os.path.join(self.test_folder, "tran_folder") 26 | if (not os.path.exists(self.trans)): 27 | os.mkdir(self.trans) 28 | args = self.mock_args.mock() 29 | args.out_folder = self.test_folder 30 | args.gffs = self.gffs 31 | args.trans = self.trans 32 | self.go = GoTermFinding(args) 33 | 34 | def tearDown(self): 35 | if os.path.exists(self.test_folder): 36 | shutil.rmtree(self.test_folder) 37 | 38 | def test_merge_files(self): 39 | gff_folder = os.path.join(self.gffs, "test.gff_folder") 40 | if (not os.path.exists(gff_folder)): 41 | os.mkdir(gff_folder) 42 | test1_folder = os.path.join(self.go_folder, "test1") 43 | if (not os.path.exists(test1_folder)): 44 | os.mkdir(test1_folder) 45 | test2_folder = os.path.join(self.go_folder, "test2") 46 | if (not os.path.exists(test2_folder)): 47 | os.mkdir(test2_folder) 48 | with open(os.path.join(gff_folder, "test1.gff"), "w") as fh: 49 | fh.write("test1") 50 | with open(os.path.join(gff_folder, "test2.gff"), "w") as fh: 51 | fh.write("test2") 52 | with open(os.path.join(test1_folder, "test1_uniprot.csv"), "w") as fh: 53 | fh.write("test1") 54 | with open(os.path.join(test2_folder, "test2_uniprot.csv"), "w") as fh: 55 | fh.write("test2") 56 | log = open(os.path.join(self.test_folder, "test.log"), "w") 57 | self.go._merge_files(self.gffs, self.go_folder, self.test_folder, log) 58 | out_file = os.path.join(self.go_folder, "test", self.all_strain) 59 | self.assertTrue(os.path.exists(out_file)) 60 | data = [] 61 | with open(out_file) as fh: 62 | for line in fh: 63 | data.append(line) 64 | self.assertEqual("".join(data), "Genome Strand Start End Protein_id Go_term\ntest2\ntest1\n") 65 | log.close() 66 | 67 | if __name__ == "__main__": 68 | unittest.main() 69 | -------------------------------------------------------------------------------- /comparison/compare_operon_door.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | import os 4 | import sys 5 | import csv 6 | import argparse 7 | 8 | __author__ = "Sung-Huan Yu " 9 | __email__ = "sung-huan.yu@uni-wuerzburg.de" 10 | 11 | parser = argparse.ArgumentParser() 12 | parser.add_argument("-d","--door2_file",help="door2 data") 13 | parser.add_argument("-p","--predict_file",help="ANNOgesic gff file") 14 | args = parser.parse_args() 15 | 16 | def main(): 17 | pre_op = "" 18 | operons = [] 19 | nums = {"detect": 0, "total": 0} 20 | fh = open(args.door2_file, "r") 21 | for row in csv.reader(fh, delimiter='\t'): 22 | if row[0] != "OperonID": 23 | if row[0] != pre_op: 24 | if len(pre_op) != 0: 25 | nums["total"] += 1 26 | operons.append({"start": start, "end": end, "strand": strand}) 27 | start = int(row[3]) 28 | end = int(row[4]) 29 | strand = row[5] 30 | else: 31 | start = int(row[3]) 32 | end = int(row[4]) 33 | strand = row[5] 34 | pre_op = row[0] 35 | else: 36 | if start > int(row[3]): 37 | start = int(row[3]) 38 | if end < int(row[4]): 39 | end = int(row[4]) 40 | sh = open(args.predict_file, "r") 41 | total_p = 0 42 | uniqs = [] 43 | for row in csv.reader(sh, delimiter='\t'): 44 | if row[0] != "Operon_ID": 45 | start = int(row[2].split("-")[0]) 46 | end = int(row[2].split("-")[-1]) 47 | for operon in operons: 48 | if operon["strand"] == row[3]: 49 | if ((operon["start"] <= start) and ( 50 | operon["end"] >= end)) or ( 51 | 
(operon["start"] >= start) and ( 52 | operon["end"] >= end)) or ( 53 | (operon["start"] >= start) and ( 54 | operon["start"] <= end) and ( 55 | operon["end"] >= end)) or ( 56 | (operon["start"] <= start) and ( 57 | operon["end"] >= start) and ( 58 | operon["end"] <= end)): 59 | if operon not in uniqs : 60 | nums["detect"] += 1 61 | uniqs.append(operon) 62 | operon["detect"] = True 63 | break 64 | pre_op = {"start": start, "end": end, "strand": row[3]} 65 | print("detected operon by ANNOgesic:" + str(nums["detect"])) 66 | print("detection rate:" + str(float(nums["detect"]/nums["total"]))) 67 | print("total number of DOOR2:" + str(nums["total"])) 68 | 69 | if __name__ == "__main__": 70 | main() 71 | -------------------------------------------------------------------------------- /annogesiclib/rbs_overlap.py: -------------------------------------------------------------------------------- 1 | import os 2 | import csv 3 | import shutil 4 | from annogesiclib.gff3 import Gff3Parser 5 | from annogesiclib.helper import Helper 6 | 7 | 8 | def read_gff(gff_file, type_): 9 | cdss = [] 10 | g_h = open(gff_file) 11 | for entry in Gff3Parser().entries(g_h): 12 | if (Helper().feature_without_notgene(entry)): 13 | if (type_ == "riboswitch") and (entry.feature != "riboswitch"): 14 | cdss.append(entry) 15 | elif (type_ == "thermometer") and ( 16 | entry.feature != "RNA_thermometer"): 17 | cdss.append(entry) 18 | cdss = sorted(cdss, key=lambda k: (k.seq_id, k.start, k.end, k.strand)) 19 | g_h.close() 20 | return cdss 21 | 22 | 23 | def check_repeat(tab, strain, strand, start, end, fuzzy): 24 | start = start + fuzzy 25 | end = end - fuzzy 26 | if (tab["strain"] == strain) and ( 27 | tab["strand"] == strand): 28 | if ((tab["start"] <= start) and ( 29 | tab["end"] >= end)) or ( 30 | (tab["start"] >= start) and ( 31 | tab["end"] <= end)) or ( 32 | (tab["start"] <= start) and ( 33 | tab["end"] <= end) and ( 34 | tab["end"] >= start)) or ( 35 | (tab["start"] >= start) and ( 36 | tab["start"] <= end) and ( 37 | tab["end"] >= end)): 38 | return True 39 | return False 40 | 41 | 42 | def rbs_overlap(table_file, gff_file, type_, fuzzy): 43 | tmp_tab = table_file + "_tmp" 44 | cdss = read_gff(gff_file, type_) 45 | out = open(tmp_tab, "w") 46 | fh = open(table_file, "r") 47 | tables = [] 48 | for row in csv.reader(fh, delimiter='\t'): 49 | if not row[0].startswith("#"): 50 | tables.append({"strain": row[1], "strand": row[2], 51 | "start": int(row[4]), "end": int(row[5]), 52 | "info": "\t".join(row)}) 53 | fh.close() 54 | for tab in tables: 55 | overlap = False 56 | for cds in cdss: 57 | overlap = check_repeat(tab, cds.seq_id, cds.strand, 58 | cds.start, cds.end, fuzzy) 59 | if overlap: 60 | break 61 | for com in tables: 62 | if tab != com: 63 | repeat = check_repeat(tab, com["strain"], com["strand"], 64 | com["start"], com["end"], 0) 65 | if (not overlap): 66 | if ((repeat) and ( 67 | "print" not in tab.keys()) and ( 68 | "print" not in com.keys())) or ( 69 | not repeat): 70 | overlap = False 71 | else: 72 | overlap = True 73 | if not overlap: 74 | tab["print"] = True 75 | out.write(tab["info"] + "\n") 76 | out.close() 77 | os.remove(table_file) 78 | shutil.move(tmp_tab, table_file) 79 | -------------------------------------------------------------------------------- /tests/test_expresssion.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import unittest 4 | import shutil 5 | from io import StringIO 6 | sys.path.append(".") 7 | from mock_helper import 
gen_file, import_data 8 | import annogesiclib.expression as express_file 9 | from annogesiclib.expression import Expression 10 | 11 | 12 | class Mock_func(object): 13 | 14 | def mock_expression(self, input_libs, gffs, percent_tex, 15 | percent_frag, wig_f_file, wig_r_file, 16 | features, merge_wigs, cutoff_coverage, 17 | tex_notex, replicates, stat, gff_folder, 18 | cover_type, max_color, min_color): 19 | pass 20 | 21 | class TestExpression(unittest.TestCase): 22 | 23 | def setUp(self): 24 | self.mock = Mock_func() 25 | self.test_folder = "test_folder" 26 | self.tex_path = os.path.join(self.test_folder, "tex") 27 | self.frag_path = os.path.join(self.test_folder, "frag") 28 | if (not os.path.exists(self.test_folder)): 29 | os.mkdir(self.test_folder) 30 | os.mkdir(self.tex_path) 31 | os.mkdir(self.frag_path) 32 | self.express = Expression(self.test_folder) 33 | 34 | def tearDown(self): 35 | if os.path.exists(self.test_folder): 36 | shutil.rmtree(self.test_folder) 37 | 38 | def test_get_replicates(self): 39 | replicates = self.express._get_replicates(2, 1) 40 | self.assertDictEqual({'tex': 2, 'frag': 1}, replicates) 41 | 42 | def test_expression(self): 43 | express_file.gene_expression = self.mock.mock_expression 44 | tex_libs=["tex_-TEX_forward.wig:notex:1:a:+", 45 | "tex_-TEX_reverse.wig:notex:1:a:-", 46 | "tex_+TEX_forward.wig:tex:1:a:+", 47 | "tex_+TEX_reverse.wig:tex:1:a:-"] 48 | frag_libs=["frag_forward.wig:frag:1:a:+", 49 | "frag_reverse.wig:frag:1:a:-"] 50 | gen_file(os.path.join(self.tex_path, "tex_-TEX_forward.wig"), "tex1") 51 | gen_file(os.path.join(self.tex_path, "tex_-TEX_reverse.wig"), "tex2") 52 | gen_file(os.path.join(self.tex_path, "tex_+TEX_forward.wig"), "tex3") 53 | gen_file(os.path.join(self.tex_path, "tex_+TEX_reverse.wig"), "tex4") 54 | gen_file(os.path.join(self.frag_path, "frag_forward.wig"), "frag1") 55 | gen_file(os.path.join(self.frag_path, "frag_reverse.wig"), "frag2") 56 | self.express.expression(tex_libs, frag_libs, 2, 2, 57 | 1, self.tex_path, self.frag_path, "all", 58 | "all", 5, self.test_folder, "CDS", 59 | "high", 100, 0) 60 | self.assertTrue(os.path.exists(os.path.join( 61 | self.test_folder, "for_libs"))) 62 | self.assertTrue(os.path.exists(os.path.join( 63 | self.test_folder, "for_libs", "statistics"))) 64 | self.assertTrue(os.path.exists(os.path.join( 65 | self.test_folder, "for_libs", "gffs"))) 66 | if __name__ == "__main__": 67 | unittest.main() 68 | 69 | -------------------------------------------------------------------------------- /annogesiclib/reorganize_table.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import shutil 3 | from annogesiclib.lib_reader import read_libs 4 | 5 | 6 | def import_covers(row): 7 | cover_names = [] 8 | covers = [] 9 | for data in row.split("("): 10 | if ")" not in data: 11 | cover_names.append(data) 12 | else: 13 | covers.append(data.split(")")[0]) 14 | if len(data.split(");")) == 2: 15 | cover_names.append(data.split(");")[-1]) 16 | return cover_names, covers 17 | 18 | def get_lib_name(libs): 19 | tracks = [] 20 | double_tracks = [] 21 | track_list = [] 22 | for lib1 in libs: 23 | for lib2 in libs: 24 | if (lib1["cond"] == lib2["cond"]) and ( 25 | lib1["type"] == lib2["type"]) and ( 26 | lib1["rep"] == lib2["rep"]) and ( 27 | lib1["strand"] != lib2["strand"]): 28 | track = "/".join([lib1["name"], lib2["name"]]) 29 | if track not in double_tracks: 30 | double_tracks.append(track) 31 | double_tracks.append("/".join([lib2["name"], 32 | lib1["name"]])) 33 | 
tracks.append(track) 34 | track_list.append([lib1["name"], lib2["name"]]) 35 | return tracks, track_list 36 | 37 | def reorganize_table(input_libs, wigs, cover_header, table_file): 38 | libs, texs = read_libs(input_libs, wigs) 39 | fh = open(table_file, "r") 40 | first = True 41 | headers = [] 42 | tracks, track_list = get_lib_name(libs) 43 | out = open(table_file + "tmp", "w") 44 | for row in csv.reader(fh, delimiter='\t'): 45 | if first: 46 | detect = False 47 | header_num = 0 48 | for header in row: 49 | if header == cover_header: 50 | index = header_num 51 | detect = True 52 | header_num += 1 53 | if not detect: 54 | headers.append(header) 55 | else: 56 | detect = False 57 | first = False 58 | for track in tracks: 59 | headers.append("Avg_coverage:" + track) 60 | out.write("\t".join(headers) + "\n") 61 | else: 62 | if len(row) < (index + 1): 63 | cover_names = [] 64 | covers = [] 65 | else: 66 | cover_names, covers = import_covers(row[index]) 67 | if len(row) == index + 1: 68 | row = row[:index] 69 | else: 70 | row = row[:index] + row[index + 1:] 71 | detects = ["Not_detect"] * len(tracks) 72 | for name, cover in zip(cover_names, covers): 73 | num_track = 0 74 | for track in track_list: 75 | if name in track: 76 | detects[num_track] = cover 77 | num_track += 1 78 | out.write("\t".join(row + detects) + "\n") 79 | out.close() 80 | shutil.move(table_file + "tmp", table_file) 81 | -------------------------------------------------------------------------------- /docs/source/docker.rst: -------------------------------------------------------------------------------- 1 | Docker image 2 | ============== 3 | 4 | `Docker `_ is a platform for distributing package. 5 | It is light and easy to manage. ``ANNOgesic`` includes a ``Dockerfile`` which 6 | is for build up a environment and install all required tools for running ``ANNOgesic``. 7 | 8 | Two ways can be used to build or pull Docker image: 9 | 10 | 1. You can simply pull the Docker image by running 11 | 12 | :: 13 | 14 | $ docker pull silasysh/annogesic 15 | 16 | 2. Alternatively, you can build the image by ``Dockerfile``. 17 | Please go to the folder where ``Dockerfile`` are located. Then type 18 | 19 | :: 20 | 21 | $ sudo docker build -t="annogesic" . 22 | 23 | It will build up an image called annogesic. You can see the images by typing ``docker images`` 24 | 25 | Based on different ways of installing docker image of ANNOgesic, the name of the docker image 26 | will be different. Pulling from DockerHub is: 27 | 28 | :: 29 | 30 | REPOSITORY TAG IMAGE ID CREATED VIRTUAL SIZE 31 | silasysh/annogesic latest d35f555694ad 3 days ago 2.782 GB 32 | ubuntu 14.04 d2a0ecffe6fa 11 days ago 188.4 MB 33 | 34 | Building Docker image by ``Dockerfile`` is: 35 | 36 | :: 37 | 38 | REPOSITORY TAG IMAGE ID CREATED VIRTUAL SIZE 39 | annogesic latest d35f555694ad 3 days ago 2.782 GB 40 | ubuntu 14.04 d2a0ecffe6fa 11 days ago 188.4 MB 41 | 42 | Then we can use the image to create a container for running ``ANNOgesic``. Now, we used ``silasysh/annogesic`` 43 | to represent Docker image. If you built Docker image by yourself, please replace ``silasysh/annogesic`` by ``annogesic``. 44 | Please type 45 | 46 | :: 47 | 48 | $ docker run -t -i silasysh/annogesic bash 49 | 50 | Then you will jump into the container. 51 | 52 | :: 53 | 54 | root@c9de31fcd7e3:~ ls 55 | ANNOgesic 56 | 57 | If you want to mount the files from your host to the container, just add ``-v`` to the command. 
58 | 59 | :: 60 | 61 | $ docker run -t -i -v /host/path/target:/file/path/within/container silasysh/annogesic bash 62 | 63 | The paths should be absolute path. If we go to ``root`` in container. We can see the file. 64 | 65 | 66 | If you want to copy the files from container to host, you can use ``cp``. 67 | 68 | :: 69 | 70 | $ docker cp :/file/path/within/container /host/path/target 71 | 72 | If you have no root permission for running Docker, Singularity is another way to 73 | build up the image without root permission. 74 | 75 | :: 76 | 77 | $ singularity build \ 78 | annogesic.img \ 79 | docker://silasysh/annogesic:latest 80 | 81 | After building Singularity image of ANNOgesic, the user just needs to put the following line before 82 | the command that needs to be executed. 83 | 84 | :: 85 | 86 | singularity exec -B $STORAGE_PATH annogesic.img 87 | 88 | Please put the storage path of your home directory to ``$STORAGE_PATH``. ``df`` can be used to check the 89 | storage system. 90 | -------------------------------------------------------------------------------- /tests/test_gen_svg.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import unittest 4 | import shutil 5 | from io import StringIO 6 | sys.path.append(".") 7 | from mock_helper import gen_file, import_data 8 | import annogesiclib.gen_svg as gs 9 | 10 | 11 | class TestGenSvg(unittest.TestCase): 12 | 13 | def setUp(self): 14 | self.test_folder = "test_folder" 15 | self.example = Example() 16 | if (not os.path.exists(self.test_folder)): 17 | os.mkdir(self.test_folder) 18 | 19 | def tearDown(self): 20 | if os.path.exists(self.test_folder): 21 | shutil.rmtree(self.test_folder) 22 | 23 | def test_gen_svg(self): 24 | gs.gen_svg("test_folder/test.png", 4, 1000, 400) 25 | data = import_data("test_folder/test.svg") 26 | self.assertEqual("\n".join(data), self.example.svg) 27 | 28 | 29 | class Example(object): 30 | 31 | svg = """ 32 | 33 | 34 | 46 | 48 | 49 | 51 | image/svg+xml 52 | 54 | 55 | 56 | 57 | 58 | 60 | 66 | 73 | 80 | 87 | 94 | """ 95 | svg = svg.replace("/home/silas/ANNOgesic", os.getcwd()) 96 | 97 | if __name__ == "__main__": 98 | unittest.main() 99 | -------------------------------------------------------------------------------- /annogesiclib/lib_reader.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os, gc 3 | import numpy as np 4 | from annogesiclib.parser_wig import WigParser 5 | 6 | 7 | def read_libs(input_libs, wig_folder): 8 | libs = [] 9 | texs = {} 10 | for lib in input_libs: 11 | datas = lib.split(":") 12 | name = None 13 | for wig in os.listdir(wig_folder): 14 | if wig == datas[0]: 15 | with open(os.path.join(wig_folder, wig), "r") as w_h: 16 | for line in w_h: 17 | line = line.strip() 18 | if line.startswith("track"): 19 | name = line.split("=")[-1].replace("\"", "") 20 | break 21 | if name is None: 22 | print("Error: The {0} can not be found in lib names!!!".format(wig)) 23 | if (datas[1] == "tex") or (datas[1] == "notex"): 24 | cond = "texnotex" 25 | else: 26 | cond = datas[1] 27 | libs.append({"name": name, "type": datas[1], 28 | "cond": "_".join([datas[2], cond]), 29 | "rep": datas[3], "strand": datas[4]}) 30 | for lib1 in libs: 31 | if lib1["type"] == "frag": 32 | pass 33 | elif (lib1["type"] == "tex") or (lib1["type"] == "notex"): 34 | prefix1 = lib1["cond"].split("_")[0] 35 | for lib2 in libs: 36 | prefix2 = lib2["cond"].split("_")[0] 37 | if (prefix1 == prefix2) and \ 38 | 
(lib1["rep"] == lib2["rep"]) and \ 39 | (lib1["type"] == "tex") and \ 40 | (lib2["type"] == "notex") and \ 41 | (lib1["strand"] == lib2["strand"]): 42 | texs[lib1["name"] + "@AND@" + lib2["name"]] = 0 43 | else: 44 | print("Error: Wrong library types are detected, " 45 | "please assign frag, tex or notex.") 46 | sys.exit() 47 | return libs, texs 48 | 49 | 50 | def read_wig(filename, strand, libs): 51 | wig_parser = WigParser() 52 | wigs = {} 53 | if filename is not False: 54 | wig_fh = open(filename) 55 | for entry in wig_parser.parser(wig_fh, strand): 56 | if entry.strain not in wigs.keys(): 57 | wigs[entry.strain] = {} 58 | for lib in libs: 59 | if lib["cond"] not in wigs[entry.strain]: 60 | wigs[entry.strain][lib["cond"]] = {} 61 | for lib in libs: 62 | if (lib["name"] == entry.track) and ( 63 | lib["strand"] == entry.strand): 64 | lib_name = "|".join([ 65 | entry.track, entry.strand, lib["type"]]) 66 | if lib_name not in wigs[entry.strain][lib["cond"]].keys(): 67 | wigs[entry.strain][lib["cond"]][lib_name] = [] 68 | wigs[entry.strain][lib["cond"]][lib_name].append(entry.coverage) 69 | wig_fh.close() 70 | for strain, conds in wigs.items(): 71 | for cond, lib_names in conds.items(): 72 | for lib_name, cover_list in lib_names.items(): 73 | wigs[strain][cond][lib_name] = np.array( 74 | wigs[strain][cond][lib_name]) 75 | return wigs 76 | -------------------------------------------------------------------------------- /tests/test_stat_TSSpredater.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import unittest 4 | import shutil 5 | import copy 6 | from io import StringIO 7 | sys.path.append(".") 8 | from mock_gff3 import Create_generator 9 | from mock_helper import import_data, gen_file, extract_info 10 | import annogesiclib.stat_TSSpredator as st 11 | 12 | 13 | class TestStatTSSpredator(unittest.TestCase): 14 | 15 | def setUp(self): 16 | self.example = Example() 17 | self.test_folder = "test_folder" 18 | if (not os.path.exists(self.test_folder)): 19 | os.mkdir(self.test_folder) 20 | 21 | def tearDown(self): 22 | if os.path.exists(self.test_folder): 23 | shutil.rmtree(self.test_folder) 24 | 25 | def test_stat(self): 26 | detect = False 27 | out_stat = StringIO() 28 | out_lib = StringIO() 29 | st.stat(self.example.tsss, "aaa", "TSS", out_stat, "test", out_lib) 30 | for data in out_stat.getvalue().split("\n"): 31 | if "Primary" in data: 32 | self.assertEqual(data.split(" = ")[-1], "1 (1.0)") 33 | if ("TSB_OD_0.2" in out_lib.getvalue()) and ( 34 | "pMEM_OD_0.5" in out_lib.getvalue()) and ( 35 | "pMEM_t2" in out_lib.getvalue()): 36 | detect = True 37 | self.assertTrue(detect) 38 | os.remove("test_class_aaa.png") 39 | 40 | def test_plot(self): 41 | st.plot(20, 23, 10, 13, 5, 100, 200, "name", 42 | "TSS", os.path.join(self.test_folder, "test")) 43 | self.assertTrue(os.path.exists(os.path.join( 44 | self.test_folder, "test_class_name.png"))) 45 | 46 | def test_stat_tsspredator(self): 47 | detect = False 48 | tss_file = os.path.join(self.test_folder, "aaa_TSS.gff") 49 | stat_file = os.path.join(self.test_folder, "stat") 50 | lib_file = os.path.join(self.test_folder, "lib") 51 | gen_file(tss_file, self.example.tss) 52 | st.stat_tsspredator(tss_file, "TSS", stat_file, lib_file) 53 | datas = import_data(stat_file) 54 | for data in datas: 55 | if "Primary" in data: 56 | self.assertEqual(data.split(" = ")[-1], "1 (1.0)") 57 | datas = import_data(lib_file) 58 | line = "\n".join(datas) 59 | if ("TSB_OD_0.2" in line) and ( 60 | 
"pMEM_OD_0.5" in line) and ( 61 | "pMEM_t2" in line): 62 | detect = True 63 | self.assertTrue(detect) 64 | self.assertTrue(os.path.exists("TSS_class_aaa.png")) 65 | os.remove("TSS_class_aaa.png") 66 | 67 | class Example(object): 68 | 69 | tss = """aaa TSSpredator TSS 2131 2131 . + . UTR_length=Primary_25;type=Primary;ID=tss3;libs=TSB_OD_0.2&pMEM_OD_0.5&pMEM_t2;associated_gene=SAOUHSC_00002;Name=TSS:2131_f""" 70 | tss_dict = [{"seq_id": "aaa", "source": "TSSpredator", 71 | "feature": "TSS", "start": 2131, 72 | "end": 2131, "phase": ".", "strand": "+", "score": "."}] 73 | attributes_tss = [{ 74 | "ID": "tss3", "Name": "TSS:2131_f", "UTR_length": "Primary_25", 75 | "type": "Primary", "associated_gene": "SAOUHSC_00002", 76 | "libs": "TSB_OD_0.2&pMEM_OD_0.5&pMEM_t2"}] 77 | tsss = [] 78 | tsss.append(Create_generator(tss_dict[0], attributes_tss[0], "gff")) 79 | 80 | if __name__ == "__main__": 81 | unittest.main() 82 | 83 | -------------------------------------------------------------------------------- /docs/source/installation.rst: -------------------------------------------------------------------------------- 1 | Installation 2 | ============ 3 | 4 | There are three ways to install ANNOgesic. Please refer to the following 5 | sections. ANNOgesic can only work when the requirements are installed properly. If 6 | you install ANNOgesic through source code or ``pip3``, please install the pre-required 7 | tools by yourself. 8 | 9 | 10 | Github 11 | ---------- 12 | 13 | All the source code including a run script (contains all the commands which are presented in tutorial) 14 | of ANNOgesic can be retrieve from our Git repository. Using the following commands can clone the 15 | source code easily. 16 | 17 | :: 18 | 19 | $ git clone https://github.com/Sung-Huan/ANNOgesic.git 20 | 21 | or 22 | 23 | :: 24 | 25 | $ git clone git@github.com:Sung-Huan/ANNOgesic.git 26 | 27 | In order to make ANNOgesic runnable, we should create a soft link of ``annogesiclib`` in ``bin``. 28 | 29 | :: 30 | 31 | $ cd ANNOgesic/bin 32 | $ ln -s ../annogesiclib . 33 | 34 | Docker 35 | ---------- 36 | 37 | Some modules of ANNOgesic need third-party tools. In order to avoid all the possible issue caused by the dependencies, 38 | a Docker image is provided. For the details of Docker image, please check `here `_. 39 | 40 | For using Docker image, please use one of the following commands: 41 | 42 | 1. You can simply pull the Docker image as following 43 | 44 | :: 45 | 46 | $ docker pull silasysh/annogesic 47 | 48 | 2. Alternatively, you can build the image via Dockerfile. 49 | Please Download the `Dockerfile `_ from our Git repository. 50 | Then switch to the folder which Dockerfile are located. For the following commands, please 51 | refer to `here `_. 52 | 53 | If you want to check other commands of Docker, please refer to `here `_. 54 | 55 | Singularity 56 | ----------- 57 | 58 | `Singularity `_ is another way to install ANNOgesic via 59 | Docker image without root permission. 60 | 61 | :: 62 | 63 | $ singularity build \ 64 | annogesic.img \ 65 | docker://silasysh/annogesic:latest 66 | 67 | After building Singularity image of ANNOgesic, the user just needs to put the following line before 68 | the command that needs to be executed. 69 | 70 | :: 71 | 72 | singularity exec -B $STORAGE_PATH annogesic.img 73 | 74 | Please put the storage path of your home directory to ``$STORAGE_PATH``. ``df`` can be used to check the 75 | storage system. 76 | 77 | pip3 78 | ---------- 79 | 80 | ANNOgesic is also hosted in PyPI server. 
Thus, it can be simply installed via ``pip3``. 81 | 82 | :: 83 | 84 | $ pip3 install ANNOgesic 85 | $ pip3 install ANNOgesic --upgrade 86 | 87 | You can also install ANNOgesic without root permission. 88 | 89 | :: 90 | 91 | $ pip3 install --user ANNOgesic 92 | $ pip3 install ANNOgesic --user --upgrade 93 | 94 | Install Dependencies 95 | -------------------- 96 | 97 | If the user want to install ANNOgesic via source code, ``get_package_database.sh`` can 98 | provide a way to install tools and download database automatically. The required versions 99 | of the tools will be shown on the screen as well. 100 | -------------------------------------------------------------------------------- /annogesiclib/check_srna_overlap.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import shutil 3 | from annogesiclib.helper import Helper 4 | from annogesiclib.gff3 import Gff3Parser 5 | 6 | 7 | def import_cds(gff): 8 | if "Name" in gff.attributes.keys(): 9 | return gff.attributes["Name"] 10 | elif "ID" in gff.attributes.keys(): 11 | return gff.attributes["ID"] 12 | else: 13 | name = "".join([gff.feature, ":", str(gff.start), "-", str(gff.end), 14 | "_", gff.strand]) 15 | return name 16 | 17 | 18 | def check_overlap(table_file, gff_file): 19 | out = open(table_file + "tmp", "w") 20 | gffs = [] 21 | gff_f = open(gff_file, "r") 22 | for entry in Gff3Parser().entries(gff_f): 23 | if Helper().feature_without_notgene(entry): 24 | gffs.append(entry) 25 | fh = open(table_file, "r") 26 | out.write("\t".join([ 27 | "Rank", "Genome", "Name", "Start", "End", "Strand", 28 | "Start_with_TSS/Cleavage_site", "End_with_cleavage", "Candidates", 29 | "Lib_type", "Best_avg_coverage", "Track/Coverage", 30 | "Normalized_secondary_energy_change(by_length)", "sRNA_types", 31 | "Conflict_sORF", "nr_hit_number", "sRNA_hit_number", 32 | "nr_hit_top3|ID|e-value|score", "sRNA_hit|e-value|score", "Overlap_CDS_forward", 33 | "Overlap_nts_forward", "Overlap_CDS_reverse", 34 | "Overlap_nts_reverse","End_with_terminator", 35 | "Associated_promoter", "sRNA_length"]) + "\n") 36 | for row in csv.reader(fh, delimiter='\t'): 37 | if row[3] != "Start": 38 | overlaps = {"forward": [], "reverse": [], 39 | "CDS_f": [], "CDS_r": []} 40 | start = int(row[3]) 41 | end = int(row[4]) 42 | for gff in gffs: 43 | if ((gff.end < end) and ( 44 | gff.end > start) and ( 45 | gff.start <= start)) or ( 46 | (gff.start > start) and ( 47 | gff.start < end) and ( 48 | gff.end >= end)) or ( 49 | (gff.end >= end) and ( 50 | gff.start <= start)) or ( 51 | (gff.end <= end) and ( 52 | gff.start >= start)): 53 | overlap = min(gff.end, end) - max(gff.start, start) + 1 54 | percent = "{0:.0f}%".format((float(overlap) / float(end - start + 1)) * 100) 55 | if gff.strand == "+": 56 | overlaps["forward"].append(str(overlap) + "(" + str(percent) + ")") 57 | overlaps["CDS_f"].append(import_cds(gff)) 58 | else: 59 | overlaps["reverse"].append(str(overlap) + "(" + str(percent) + ")") 60 | overlaps["CDS_r"].append(import_cds(gff)) 61 | if len(overlaps["forward"]) == 0: 62 | overlaps["forward"] = ["NA"] 63 | overlaps["CDS_f"] = ["NA"] 64 | if len(overlaps["reverse"]) == 0: 65 | overlaps["reverse"] = ["NA"] 66 | overlaps["CDS_r"] = ["NA"] 67 | out.write("\t".join(row[0:19] + [";".join(overlaps["CDS_f"]), ";".join(overlaps["forward"]), 68 | ";".join(overlaps["CDS_r"]), ";".join(overlaps["reverse"])] + 69 | row[21:]) + "\n") 70 | shutil.move(table_file + "tmp", table_file) 71 | 
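Note: the nested four-branch condition in ``check_overlap`` above (and its siblings in ``overlap.py``, ``compare_sRNA_sORF.py`` and ``sRNA_antisense.py`` further down) encodes a closed-interval overlap test on 1-based coordinates, followed by an overlap length and a percentage relative to the sRNA span. The sketch below restates that logic in a compact form for illustration only; the function names are hypothetical and not part of ANNOgesic, and the simplified test differs from the four explicit cases only at single-nucleotide boundary touches.

def intervals_overlap(start1, end1, start2, end2):
    # Closed, 1-based intervals overlap when each one starts no later
    # than the other one ends (compact form of the four cases above).
    return (start1 <= end2) and (start2 <= end1)


def overlap_stats(srna_start, srna_end, cds_start, cds_end):
    # Overlap length in nucleotides and percentage of the sRNA covered,
    # using the same min/max formulas as check_overlap().
    if not intervals_overlap(srna_start, srna_end, cds_start, cds_end):
        return 0, "0%"
    overlap = min(srna_end, cds_end) - max(srna_start, cds_start) + 1
    percent = "{0:.0f}%".format(
        (float(overlap) / float(srna_end - srna_start + 1)) * 100)
    return overlap, percent


# Example: an sRNA at 100-200 overlapping a CDS at 150-400
print(overlap_stats(100, 200, 150, 400))  # (51, '50%')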
-------------------------------------------------------------------------------- /annogesiclib/gen_svg.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | 4 | def print_track(track_num, svg_out, figure_width): 5 | id_num = 3067 6 | x = 2.0744663 7 | y = 131 8 | for track in range(track_num): 9 | if (track % 2) == 0: 10 | svg_out.write(" \n") 25 | if (track % 2) == 1: 26 | svg_out.write(" \n") 38 | id_num += 1 39 | 40 | 41 | def gen_svg(input_png, track_num, figure_height, figure_width): 42 | svg_out = open(input_png[:-4] + ".svg", "w") 43 | svg_out.write(""" 44 | 45 | 46 | 59 | 61 | 62 | 64 | image/svg+xml 65 | 67 | 68 | 69 | 70 | 71 | 73 | \n""") 81 | print_track(track_num, svg_out, figure_width) 82 | svg_out.write("") 83 | svg_out.close() 84 | -------------------------------------------------------------------------------- /annogesiclib/overlap.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | from annogesiclib.gff3 import Gff3Parser 4 | 5 | def get_overlap(anno, source, finals, overlaps, detect, out): 6 | if (anno.source in source) and ( 7 | anno not in overlaps): 8 | finals.append(anno) 9 | detect = True 10 | return detect 11 | 12 | def deal_overlap(out_folder, source): 13 | gffs = {} 14 | num = 0 15 | for gff_file in os.listdir(out_folder): 16 | if gff_file.endswith(".gff"): 17 | gff_f = open(os.path.join(out_folder, gff_file), "r") 18 | for entry in Gff3Parser().entries(gff_f): 19 | if entry.feature not in gffs.keys(): 20 | gffs[entry.feature] = [] 21 | gffs[entry.feature].append(entry) 22 | gff_f.close() 23 | out = open(os.path.join(out_folder, gff_file + "tmp"), "w") 24 | finals = [] 25 | overlaps = [] 26 | for feature, annos in gffs.items(): 27 | for anno1 in annos: 28 | detect = False 29 | for anno2 in annos: 30 | if (anno1.seq_id == anno2.seq_id) and ( 31 | anno1.strand == anno2.strand) and ( 32 | anno1 != anno2) and ( 33 | anno1.feature == anno2.feature) and ( 34 | anno1.source != anno2.source): 35 | if ((anno1.start <= anno2.start) and ( 36 | anno1.end >= anno2.end)) or ( 37 | (anno1.start >= anno2.start) and ( 38 | anno1.end <= anno2.end)) or ( 39 | (anno1.start <= anno2.start) and ( 40 | anno1.end <= anno2.end) and ( 41 | anno1.end >= anno2.start)) or ( 42 | (anno1.start >= anno2.start) and ( 43 | anno1.start <= anno2.end) and ( 44 | anno1.end >= anno2.end)): 45 | detect = get_overlap(anno1, source, finals, 46 | overlaps, detect, out) 47 | detect = get_overlap(anno2, source, finals, 48 | overlaps, detect, out) 49 | if detect: 50 | overlaps.append(anno1) 51 | overlaps.append(anno2) 52 | if (not detect) and (anno1 not in overlaps): 53 | finals.append(anno1) 54 | finals = sorted(finals, key=lambda x: (x.seq_id, x.start, 55 | x.end, x.strand)) 56 | for final in finals: 57 | if (final.feature == "region") or ( 58 | final.feature == "source") or ( 59 | final.feature == "remark"): 60 | out.write(final.info + "\n") 61 | break 62 | for final in finals: 63 | if (final.feature != "region") and ( 64 | final.feature != "source"): 65 | out.write(final.info + "\n") 66 | out.close() 67 | shutil.move(os.path.join(out_folder, gff_file + "tmp"), 68 | os.path.join(out_folder, gff_file)) 69 | -------------------------------------------------------------------------------- /annogesiclib/compare_sRNA_sORF.py: -------------------------------------------------------------------------------- 1 | from annogesiclib.gff3 import Gff3Parser 2 | from annogesiclib.helper import 
Helper 3 | 4 | 5 | def print_file(datas, out, feature): 6 | for data in datas: 7 | if feature not in data.attributes.keys(): 8 | data.attributes[feature] = "NA" 9 | else: 10 | data.attributes[feature] = ",".join(data.attributes[feature]) 11 | data.attribute_string = ";".join( 12 | ["=".join(items) for items in data.attributes.items()]) 13 | out.write("\t".join([data.info_without_attributes, 14 | data.attribute_string]) + "\n") 15 | 16 | 17 | def del_attributes(feature, entry): 18 | '''Remove to the useless attributes''' 19 | attributes = {} 20 | for key, value in entry.attributes.items(): 21 | if feature not in key: 22 | attributes[key] = value 23 | return attributes 24 | 25 | 26 | def srna_sorf_comparison(sRNA_file, sORF_file, sRNA_out, sORF_out): 27 | '''Comparison of sRNA and sORF. It can be a filter of sRNA detection''' 28 | sorfs = [] 29 | srnas = [] 30 | out_r = open(sRNA_out, "w") 31 | out_o = open(sORF_out, "w") 32 | out_r.write("##gff-version 3\n") 33 | out_o.write("##gff-version 3\n") 34 | for entry in Gff3Parser().entries(open(sRNA_file)): 35 | entry.attributes = del_attributes("sORF", entry) 36 | srnas.append(entry) 37 | srnas = sorted(srnas, key=lambda k: (k.seq_id, k.start, k.end, k.strand)) 38 | for entry in Gff3Parser().entries(open(sORF_file)): 39 | entry.attributes = del_attributes("sRNA", entry) 40 | sorfs.append(entry) 41 | sorfs = sorted(sorfs, key=lambda k: (k.seq_id, k.start, k.end, k.strand)) 42 | for srna in srnas: 43 | for sorf in sorfs: 44 | if (srna.seq_id == sorf.seq_id) and (srna.strand == sorf.strand): 45 | if ((srna.start <= sorf.start) and ( 46 | srna.end >= sorf.end)) or ( 47 | (srna.start >= sorf.start) and ( 48 | srna.end <= sorf.end)) or ( 49 | (srna.start <= sorf.start) and ( 50 | srna.end >= sorf.start) and ( 51 | srna.end <= sorf.end)) or ( 52 | (srna.start >= sorf.start) and ( 53 | srna.start <= sorf.end) and ( 54 | srna.end >= sorf.end)): 55 | if "sORF" not in srna.attributes.keys(): 56 | srna.attributes["sORF"] = [] 57 | strand = Helper().get_strand_name(sorf.strand) 58 | srna.attributes["sORF"].append("".join([ 59 | "sORF:", 60 | str(sorf.start), "-", 61 | str(sorf.end), 62 | "_", strand])) 63 | if "sRNA" not in sorf.attributes.keys(): 64 | sorf.attributes["sRNA"] = [] 65 | strand = Helper().get_strand_name(srna.strand) 66 | sorf.attributes["sRNA"].append("".join([ 67 | "sRNA:", 68 | str(srna.start), "-", 69 | str(srna.end), 70 | "_", strand])) 71 | print_file(sorfs, out_o, "sRNA") 72 | print_file(srnas, out_r, "sORF") 73 | out_r.close() 74 | out_o.close() 75 | -------------------------------------------------------------------------------- /annogesiclib/compare_srna_promoter.py: -------------------------------------------------------------------------------- 1 | import os 2 | import csv 3 | import shutil 4 | from annogesiclib.gff3 import Gff3Parser 5 | 6 | 7 | def read_file(gff_file, args_srna): 8 | srnas = [] 9 | for entry in Gff3Parser().entries(open(gff_file)): 10 | attributes = {} 11 | for key, value in entry.attributes.items(): 12 | if "promoter" not in key: 13 | attributes[key] = value 14 | entry.attributes = attributes 15 | srnas.append(entry) 16 | srnas = sorted(srnas, key=lambda k: (k.seq_id, k.start, k.end, k.strand)) 17 | fh = open(args_srna.promoter_table, "r") 18 | pros = [] 19 | for row in csv.reader(fh, delimiter='\t'): 20 | if (row[0] != "Genome") and ( 21 | row[3] in args_srna.promoter_name): 22 | pros.append({"strain": row[0], "pos": row[1], 23 | "strand": row[2], "name": row[3]}) 24 | fh.close() 25 | return srnas, pros 26 | 27 
| 28 | def print_table(srna_table, out_t, srnas): 29 | fh = open(srna_table, "r") 30 | for row in csv.reader(fh, delimiter='\t'): 31 | for srna in srnas: 32 | if (row[0] == srna.seq_id) and ( 33 | int(row[2]) == srna.start) and ( 34 | int(row[3]) == srna.end) and ( 35 | row[4] == srna.strand): 36 | if "promoter" in srna.attributes.keys(): 37 | promoter = [srna.attributes["promoter"]] 38 | else: 39 | promoter = ["NA"] 40 | out_t.write("\t".join(row + promoter) + "\n") 41 | 42 | 43 | def compare_srna_promoter(srna_gff, srna_table, args_srna): 44 | '''compare sRNA and promoter to find the sRNA 45 | which is associated with a promoter. 46 | it is for the ranking of sRNA''' 47 | srnas, pros = read_file(srna_gff, args_srna) 48 | out_g = open("tmp_srna.gff", "w") 49 | out_t = open("tmp_srna.csv", "w") 50 | out_g.write("##gff-version 3\n") 51 | for srna in srnas: 52 | tsss = [] 53 | detect = False 54 | if "with_TSS" in srna.attributes.keys(): 55 | if srna.attributes["with_TSS"] != "NA": 56 | datas = srna.attributes["with_TSS"].split(",") 57 | for data in datas: 58 | info = data.split(":")[-1] 59 | tss = info.split("_") 60 | tsss.append({"pos": tss[0], "strand": tss[-1]}) 61 | if len(tsss) != 0: 62 | for tss in tsss: 63 | for pro in pros: 64 | if (srna.seq_id == pro["strain"]) and ( 65 | tss["strand"] == pro["strand"]) and ( 66 | tss["pos"] == pro["pos"]): 67 | detect = True 68 | if "promoter" not in srna.attributes.keys(): 69 | srna.attributes["promoter"] = pro["name"] 70 | else: 71 | srna.attributes["promoter"] = ",".join([ 72 | srna.attributes["promoter"], 73 | pro["name"]]) 74 | if detect: 75 | out_g.write(srna.info + ";promoter=" + 76 | srna.attributes["promoter"] + "\n") 77 | else: 78 | out_g.write(srna.info + ";promoter=NA" + "\n") 79 | print_table(srna_table, out_t, srnas) 80 | os.remove(srna_gff) 81 | os.remove(srna_table) 82 | out_t.close() 83 | out_g.close() 84 | shutil.move("tmp_srna.gff", srna_gff) 85 | shutil.move("tmp_srna.csv", srna_table) 86 | -------------------------------------------------------------------------------- /annogesiclib/sRNA_antisense.py: -------------------------------------------------------------------------------- 1 | import os 2 | import csv 3 | import shutil 4 | from annogesiclib.gff3 import Gff3Parser 5 | 6 | 7 | def compare_srna_gff(gffs, strain, strand, start, end, srna_types, file_type): 8 | for gff in gffs: 9 | if (strain == gff.seq_id) and ( 10 | strand != gff.strand): 11 | if ((start <= gff.start) and ( 12 | end >= gff.end)) or ( 13 | (start >= gff.start) and ( 14 | end <= gff.end)) or ( 15 | (start <= gff.start) and ( 16 | end <= gff.end) and ( 17 | end >= gff.start)) or ( 18 | (start >= gff.start) and ( 19 | start <= gff.end) and ( 20 | end >= gff.end)): 21 | if file_type == "gff": 22 | if "antisense" not in srna_types: 23 | srna_types = srna_types + "," + "antisense" 24 | else: 25 | if "Antisense" not in srna_types: 26 | srna_types = srna_types + "," + "Antisense" 27 | return srna_types 28 | 29 | 30 | def srna_antisense(srna_gff, srna_table, gff_file): 31 | tmp_srna_gff = srna_gff + "tmp" 32 | tmp_srna_table = srna_table + "tmp" 33 | out = open(tmp_srna_gff, "w") 34 | out.write("##gff-version 3\n") 35 | out_t = open(tmp_srna_table, "w") 36 | out_t.write("\t".join(["Rank", "Genome", "Name", "Start", "End", "Strand", 37 | "Start_with_TSS/Cleavage_site", "End_with_cleavage", 38 | "Candidates", "Lib_type", "Best_avg_coverage", 39 | "Best_highest_coverage", "Best_lower_coverage", 40 | "Track/Coverage", 41 | 
"Normalized_secondary_energy_change(by_length)", 42 | "sRNA_types", "Confliction_of_sORF", 43 | "nr_hit_number", "sRNA_hit_number", 44 | "nr_hit_top3|ID|e-value", "sRNA_hit|e-value", 45 | "Overlap_CDS", "Overlap_percent", 46 | "End_with_terminator"]) + "\n") 47 | srnas = [] 48 | sf = open(srna_gff, "r") 49 | for entry in Gff3Parser().entries(sf): 50 | srnas.append(entry) 51 | tabs = [] 52 | fh = open(srna_table, "r") 53 | for row in csv.reader(fh, delimiter='\t'): 54 | if row[0] != "rank": 55 | tabs.append({"info": row, "strain": row[1], "strand": row[5], 56 | "start": int(row[3]), "end": int(row[4]), 57 | "srna_type": row[15]}) 58 | else: 59 | out_t.write("\t".join(row) + "\n") 60 | gffs = [] 61 | gf = open(gff_file, "r") 62 | for entry in Gff3Parser().entries(gf): 63 | gffs.append(entry) 64 | for srna in srnas: 65 | compare_srna_gff(gffs, srna.seq_id, srna.strand, srna.start, srna.end, 66 | srna.attributes["sRNA_type"], "gff") 67 | attribute_string = ";".join( 68 | ["=".join(items) for items in srna.attributes.items()]) 69 | out.write("\t".join([srna.info_without_attributes, 70 | attribute_string]) + "\n") 71 | for tab in tabs: 72 | compare_srna_gff(gffs, tab["strain"], tab["strand"], tab["start"], 73 | tab["end"], tab["srna_type"], "table") 74 | tab["info"][15] = tab["srna_type"] 75 | out_t.write("\t".join(tab["info"]) + "\n") 76 | os.remove(srna_gff) 77 | shutil.move(tmp_srna_gff, srna_gff) 78 | os.remove(srna_table) 79 | shutil.move(tmp_srna_table, srna_table) 80 | -------------------------------------------------------------------------------- /comparison/gff3.py: -------------------------------------------------------------------------------- 1 | import csv 2 | 3 | 4 | class Gff3Parser(object): 5 | """ 6 | A format description can be found at: 7 | http://genome.ucsc.edu/FAQ/FAQformat.html#format3 8 | http://www.sequenceontology.org/gff3.shtml 9 | 10 | a validator can be found here: 11 | http://modencode.oicr.on.ca/cgi-bin/validate_gff3_online 12 | 13 | WARNING: Currently this class in not strict enough and would also 14 | parse file not following the standard. 
15 | """ 16 | 17 | def entries(self, input_gff_fh): 18 | """ 19 | """ 20 | for entry_dict in csv.DictReader( 21 | input_gff_fh, delimiter="\t", 22 | fieldnames=["seq_id", "source", "feature", "start", 23 | "end", "score", "strand", "phase", "attributes"]): 24 | if entry_dict["seq_id"].startswith("#"): 25 | continue 26 | yield self._dict_to_entry(entry_dict) 27 | 28 | def _dict_to_entry(self, entry_dict): 29 | return Gff3Entry(entry_dict) 30 | 31 | 32 | class Gff3Entry(object): 33 | 34 | """ 35 | 36 | Example: 37 | start, end = sorted([int(pos) for pos in [start, end]]) 38 | Gff3Entry({ 39 | "seq_id" : seq_id, 40 | "source" : "MyLab", 41 | "feature" : "sRNA", 42 | "start" : start, 43 | "end" : end, 44 | "strand" : strand, 45 | "score" : ".", 46 | "phase" : ".", 47 | "attributes" : "name=%s;locus_tag=%s" % (name, locus_tag)}) 48 | """ 49 | 50 | def __init__(self, entry_dict): 51 | self.seq_id = entry_dict["seq_id"] 52 | self.source = entry_dict["source"] 53 | self.feature = entry_dict["feature"] 54 | # 1-based coordinates 55 | # Make sure that start <= end 56 | start, end = sorted([int(entry_dict["start"]), int(entry_dict["end"])]) 57 | self.start = start 58 | self.end = end 59 | self.score = entry_dict["score"] 60 | self.strand = entry_dict["strand"] 61 | self.phase = entry_dict["phase"] 62 | self.attributes = self._attributes(entry_dict["attributes"]) 63 | self.attribute_string = entry_dict["attributes"] 64 | self.info = "\t".join([str(field) for field in [ 65 | self.seq_id, self.source, self.feature, self.start, 66 | self.end, self.score, self.strand, self.phase, 67 | self.attribute_string]]) 68 | self.info_without_attributes = "\t".join([str(field) for field in [ 69 | self.seq_id, self.source, self.feature, self.start, 70 | self.end, self.score, self.strand, self.phase]]) 71 | 72 | def _attributes(self, attributes_string): 73 | """Translate the attribute string to dictionary""" 74 | attributes = {} 75 | if len(attributes_string) > 0: 76 | for attribute in attributes_string.split(";"): 77 | key_value_pair = attribute.split("=") 78 | key = key_value_pair[0] 79 | if len(key_value_pair) > 2: 80 | value = "=".join(key_value_pair[1:]) 81 | else: 82 | value = key_value_pair[1] 83 | attributes[key] = value 84 | return attributes 85 | else: 86 | return attributes 87 | 88 | def add_attribute(self, key, value): 89 | self.attributes[key] = value 90 | self.attribute_string = ";".join( 91 | ["=".join(items) for items in self.attributes.items()]) 92 | 93 | def __str__(self): 94 | return "\t".join([str(field) for field in [ 95 | self.seq_id, self.source, self.feature, self.start, 96 | self.end, self.score, self.strand, self.phase, 97 | self.attribute_string]]) 98 | -------------------------------------------------------------------------------- /tests/test_meme.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import unittest 4 | import shutil 5 | from io import StringIO 6 | sys.path.append(".") 7 | import annogesiclib.meme as me 8 | from mock_helper import gen_file 9 | from mock_args_container import MockClass 10 | from annogesiclib.meme import MEME 11 | 12 | 13 | class Mock_func(object): 14 | 15 | def mock_del_repeat_fasta(self, tmp_fasta, all_no_orph): 16 | with open("tmp/all_type.fa", "w") as fh: 17 | fh.write("all") 18 | with open("tmp/without_orphan.fa", "w") as fh: 19 | fh.write("without_orphan") 20 | 21 | class TestMEME(unittest.TestCase): 22 | 23 | def setUp(self): 24 | self.mock_args = MockClass() 25 | self.test_folder = 
"test_folder" 26 | self.out_folder = "test_folder/output" 27 | if (not os.path.exists(self.test_folder)): 28 | os.mkdir(self.test_folder) 29 | os.mkdir(self.out_folder) 30 | os.mkdir(os.path.join(self.out_folder, "fasta_output")) 31 | self.tss_folder = os.path.join(self.test_folder, "tss_folder") 32 | if (not os.path.exists(self.tss_folder)): 33 | os.mkdir(self.tss_folder) 34 | self.gff_folder = os.path.join(self.test_folder, "gff_folder") 35 | if (not os.path.exists(self.gff_folder)): 36 | os.mkdir(self.gff_folder) 37 | self.fa_folder = os.path.join(self.test_folder, "fa_folder") 38 | if (not os.path.exists(self.fa_folder)): 39 | os.mkdir(self.fa_folder) 40 | args = self.mock_args.mock() 41 | args.tsss = self.tss_folder 42 | args.fastas = self.fa_folder 43 | args.gffs = self.gff_folder 44 | args.output_folder = self.out_folder 45 | self.meme = MEME(args) 46 | 47 | def tearDown(self): 48 | if os.path.exists(self.test_folder): 49 | shutil.rmtree(self.test_folder) 50 | 51 | def test_move_and_merge_fasta(self): 52 | me.del_repeat_fasta = Mock_func().mock_del_repeat_fasta 53 | if (not os.path.exists("tmp")): 54 | os.mkdir("tmp") 55 | gen_file("tmp/primary.fa", "primary") 56 | gen_file("tmp/secondary.fa", "secondary") 57 | gen_file("tmp/internal.fa", "internal") 58 | gen_file("tmp/antisense.fa", "antisense") 59 | gen_file("tmp/orphan.fa", "orphan") 60 | self.meme._move_and_merge_fasta(self.test_folder, "test") 61 | self.assertTrue(os.path.exists(os.path.join( 62 | self.test_folder, "test_allgenome_all_types.fa"))) 63 | self.assertTrue(os.path.exists(os.path.join( 64 | self.test_folder, "test_allgenome_primary.fa"))) 65 | self.assertTrue(os.path.exists(os.path.join( 66 | self.test_folder, "test_allgenome_secondary.fa"))) 67 | self.assertTrue(os.path.exists(os.path.join( 68 | self.test_folder, "test_allgenome_internal.fa"))) 69 | self.assertTrue(os.path.exists(os.path.join( 70 | self.test_folder, "test_allgenome_antisense.fa"))) 71 | self.assertTrue(os.path.exists(os.path.join( 72 | self.test_folder, "test_allgenome_orphan.fa"))) 73 | self.assertTrue(os.path.exists(os.path.join( 74 | self.test_folder, "test_allgenome_without_orphan.fa"))) 75 | 76 | def test_split_fasta_by_strain(self): 77 | with open(os.path.join(self.fa_folder, "allgenome.fa"), "w") as fh: 78 | fh.write(""">aaa_aaa_aaa 79 | ATTATATATA 80 | >bbb_bbb_bbb 81 | AATTAATTAA""") 82 | self.meme._split_fasta_by_strain(self.fa_folder) 83 | self.assertTrue(os.path.join(self.fa_folder, "aaa.fa")) 84 | self.assertTrue(os.path.join(self.fa_folder, "bbb.fa")) 85 | 86 | if __name__ == "__main__": 87 | unittest.main() 88 | -------------------------------------------------------------------------------- /annogesiclib/screen.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | from annogesiclib.gen_screenshots import gen_screenshot 4 | from annogesiclib.helper import Helper 5 | 6 | 7 | class Screen(object): 8 | '''generation of screenshot''' 9 | 10 | def __init__(self, args_sc, out_folder): 11 | self.helper = Helper() 12 | args_sc.output_folder = out_folder 13 | filename = args_sc.fasta.split("/")[-1] 14 | self.strain = ".".join(filename.split(".")[0:-1]) 15 | self.helper.check_make_folder(os.path.join(args_sc.output_folder, 16 | self.strain)) 17 | self.forward_file = os.path.join(args_sc.output_folder, 18 | self.strain, "forward") 19 | self.reverse_file = os.path.join(args_sc.output_folder, 20 | self.strain, "reverse") 21 | os.mkdir(self.forward_file) 22 | 
os.mkdir(self.reverse_file) 23 | 24 | def _import_libs(self, texs, strand, lib_dict): 25 | if strand == "+": 26 | tex = "ft" 27 | notex = "fn" 28 | else: 29 | tex = "rt" 30 | notex = "rn" 31 | for flib in texs: 32 | if (flib[1] == "tex"): 33 | lib_dict[tex].append(flib[0]) 34 | for nlib in texs: 35 | if (nlib[1] == "notex") and \ 36 | (flib[2] == nlib[2]) and \ 37 | (flib[3] == nlib[3]): 38 | lib_dict[notex].append(nlib[0]) 39 | 40 | def screenshot(self, args_sc, log): 41 | lib_dict = {"ft": [], "fn": [], "rt": [], "rn": [], "ff": [], "rf": []} 42 | f_texs = [] 43 | r_texs = [] 44 | if args_sc.tlibs is not None: 45 | for lib in args_sc.tlibs: 46 | lib_datas = lib.split(":") 47 | if not lib_datas[0].endswith(".wig"): 48 | log.write("Wiggle files should end with .wig.\n") 49 | print("Error: Wiggle files should end with .wig!") 50 | sys.exit() 51 | else: 52 | if lib_datas[-1] == "+": 53 | f_texs.append(lib_datas) 54 | else: 55 | r_texs.append(lib_datas) 56 | f_texs = sorted(f_texs, key=lambda x: (x[1], x[2], x[3])) 57 | r_texs = sorted(r_texs, key=lambda x: (x[1], x[2], x[3])) 58 | self._import_libs(f_texs, "+", lib_dict) 59 | self._import_libs(r_texs, "-", lib_dict) 60 | if args_sc.flibs is not None: 61 | for lib in args_sc.flibs: 62 | lib_datas = lib.split(":") 63 | if not lib_datas[0].endswith(".wig"): 64 | log.write("Wiggle files should end with .wig.\n") 65 | print("Error: Wiggle files should end with .wig!") 66 | sys.exit() 67 | else: 68 | if lib_datas[-1] == "+": 69 | lib_dict["ff"].append(lib_datas[0]) 70 | else: 71 | lib_dict["rf"].append(lib_datas[0]) 72 | log.write("Running gen_screenshots.py to generate IGV batch script.\n") 73 | gen_screenshot(args_sc, lib_dict, self.forward_file + ".txt", 74 | self.reverse_file + ".txt", self.strain) 75 | log.write("\t" + self.forward_file + ".txt is generated.\n") 76 | log.write("\t" + self.reverse_file + ".txt is generated.\n") 77 | if (args_sc.tlibs is None) and (args_sc.flibs is None): 78 | log.write("No wig files can be found.\n") 79 | print("Error: There is no wig file assigned!") 80 | sys.exit() 81 | -------------------------------------------------------------------------------- /tests/test_stat_sublocal.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import unittest 4 | import shutil 5 | import copy 6 | from io import StringIO 7 | sys.path.append(".") 8 | from mock_gff3 import Create_generator 9 | from mock_helper import import_data, gen_file, extract_info 10 | import annogesiclib.stat_sublocal as ss 11 | 12 | 13 | class TestStatSubLocal(unittest.TestCase): 14 | 15 | def setUp(self): 16 | self.example = Example() 17 | self.test_folder = "test_folder" 18 | if (not os.path.exists(self.test_folder)): 19 | os.mkdir(self.test_folder) 20 | 21 | def tearDown(self): 22 | if os.path.exists(self.test_folder): 23 | shutil.rmtree(self.test_folder) 24 | 25 | def test_read_table(self): 26 | psortb_file = os.path.join(self.test_folder, "test.csv") 27 | gen_file(psortb_file, self.example.table) 28 | subs, total_nums, unknown_nums = ss.read_table(psortb_file) 29 | self.assertDictEqual(subs, { 30 | 'Staphylococcus_aureus_HG002': {'Unknown': 1}, 31 | 'Staphylococcus_aureus_HG003': {'CellWall': 1, 'Cytoplasmic': 2}, 32 | 'all_genome': {'Unknown': 1, 'CellWall': 1, 'Cytoplasmic': 2}}) 33 | self.assertDictEqual(total_nums, { 34 | 'Staphylococcus_aureus_HG002': 1, 35 | 'Staphylococcus_aureus_HG003': 3, 'all_genome': 4}) 36 | self.assertDictEqual(unknown_nums, { 37 | 
'Staphylococcus_aureus_HG002': 1, 38 | 'Staphylococcus_aureus_HG003': 0, 'all_genome': 1}) 39 | 40 | def test_print_file_and_plot(self): 41 | out_stat = StringIO() 42 | sub = {'Unknown': 1, 'CellWall': 1, 'Cytoplasmic': 2} 43 | total_nums = {'Staphylococcus_aureus_HG002': 1, 44 | 'Staphylococcus_aureus_HG003': 3, 'all_strain': 4} 45 | unknown_nums = {'Staphylococcus_aureus_HG002': 1, 46 | 'Staphylococcus_aureus_HG003': 0, 'all_strain': 1} 47 | ss.print_file_and_plot(sub, total_nums, unknown_nums, 48 | "all_strain", out_stat, self.test_folder + "/") 49 | datas = out_stat.getvalue().split("\n") 50 | for data in datas: 51 | if "Total with Unknown" in data: 52 | self.assertEqual(data, 53 | ("Total including Unknown is 4; " 54 | "Total excluding Unknown is 3")) 55 | elif "CellWall" in data: 56 | self.assertEqual(data, 57 | ("\tCellWall\t1(including Unknown 0.25; " 58 | "excluding Unknonwn 0.3333333333333333)")) 59 | elif "Cytoplasmic" in data: 60 | self.assertEqual(data, 61 | ("\tCytoplasmic\t2(including Unknown 0.5; " 62 | "excluding Unknonwn 0.6666666666666666)")) 63 | else: 64 | if "include Unknown" in data: 65 | self.assertEqual(data, 66 | ("\tUnknown\t1(including Unknown 0.25)")) 67 | 68 | def test_plot(self): 69 | subs = {'Unknown': 1, 'CellWall': 1, 'Cytoplasmic': 2} 70 | ss.plot(subs, 4, 1, "test", self.test_folder + "/") 71 | self.assertTrue(os.path.exists(os.path.join( 72 | self.test_folder, "_test_sublocal.png"))) 73 | 74 | 75 | class Example(object): 76 | 77 | table = """Staphylococcus_aureus_HG003 YP_498609.1 + 517 1878 Cytoplasmic 9.97 78 | Staphylococcus_aureus_HG003 YP_498610.1 + 2156 3289 Cytoplasmic 9.97 79 | Staphylococcus_aureus_HG003 YP_498611.1 + 3670 3915 CellWall 7.50 80 | Staphylococcus_aureus_HG002 YP_498612.1 + 4676 5015 Unknown 7.50""" 81 | 82 | if __name__ == "__main__": 83 | unittest.main() 84 | 85 | -------------------------------------------------------------------------------- /tests/test_optimize.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import unittest 4 | import shutil 5 | from io import StringIO 6 | sys.path.append(".") 7 | from mock_helper import gen_file, import_data 8 | from mock_args_container import MockClass 9 | import annogesiclib.optimize as opt 10 | 11 | 12 | class Mock_helper(object): 13 | 14 | def __init__(self): 15 | pass 16 | 17 | def check_uni_attributes(self, gff_file): 18 | pass 19 | 20 | def remove_all_content(self, filename, feature, type_): 21 | pass 22 | 23 | def remove_tmp(self, wigs): 24 | pass 25 | 26 | def remove_tmp_dir(self, folder): 27 | pass 28 | 29 | 30 | class Mock_multiparser(object): 31 | 32 | def __init__(self): 33 | pass 34 | 35 | def parser_wig(self, wigs): 36 | pass 37 | 38 | def parser_gff(self, gffs, feature): 39 | pass 40 | 41 | def parser_fasta(self, fastas): 42 | pass 43 | 44 | class Mock_func(object): 45 | 46 | def mock_optimization(self, wig_path, fasta_file, gff_file, 47 | args, strain, manual, length, log): 48 | gen_file(os.path.join(args.output_folder, "test.csv"), "test") 49 | 50 | class TestOptimizeTSS(unittest.TestCase): 51 | 52 | def setUp(self): 53 | self.mock_args = MockClass() 54 | self.test_folder = "test_folder" 55 | self.fastas = os.path.join(self.test_folder, "fasta") 56 | self.wigs = os.path.join(self.test_folder, "wigs") 57 | self.gffs = os.path.join(self.test_folder, "gffs") 58 | self.manuals = os.path.join(self.test_folder, "manuals") 59 | if (not os.path.exists(self.test_folder)): 60 | os.mkdir(self.test_folder) 61 | 
os.mkdir(self.fastas) 62 | os.mkdir(os.path.join(self.fastas, "tmp")) 63 | os.mkdir(self.wigs) 64 | os.mkdir(os.path.join(self.wigs, "tmp")) 65 | os.mkdir(self.gffs) 66 | os.mkdir(os.path.join(self.gffs, "tmp")) 67 | os.mkdir(self.manuals) 68 | os.mkdir(os.path.join(self.manuals, "tmp")) 69 | 70 | def tearDown(self): 71 | if os.path.exists(self.test_folder): 72 | shutil.rmtree(self.test_folder) 73 | 74 | def test_optimize_tss(self): 75 | opt.Helper = Mock_helper 76 | opt.Multiparser = Mock_multiparser 77 | opt.optimization = Mock_func().mock_optimization 78 | gen_file(os.path.join(self.gffs, "tmp", "test.gff"), "test") 79 | gen_file(os.path.join(self.fastas, "tmp", "test.fa"), "test") 80 | args = self.mock_args.mock() 81 | args.fastas = self.fastas 82 | args.gffs = self.gffs 83 | args.wigs = self.wigs 84 | args.tsspredator_path = "test" 85 | args.manuals = self.manuals 86 | gen_file(os.path.join(self.manuals, "tmp", "test.gff"), "test") 87 | args.output_folder = self.test_folder 88 | args.project_strain = "test" 89 | args.height = 9 90 | args.height_reduction = 9 91 | args.factor = 9 92 | args.factor_reduction = 9 93 | args.base_height = 9 94 | args.enrichment = 9 95 | args.processing = 9 96 | args.utr = 200 97 | args.libs = "test" 98 | args.replicate_name = "test" 99 | args.cluster = 2 100 | args.strain_lengths = {"test": 100} 101 | args.cores = 4 102 | args.program = "TSS" 103 | args.replicate = 2 104 | args.steps = 2000 105 | log = open(os.path.join(self.test_folder, "test.log"), "w") 106 | opt.optimize_tss(args, log) 107 | self.assertTrue(os.path.exists(os.path.join( 108 | self.test_folder, "test.csv"))) 109 | log.close() 110 | 111 | if __name__ == "__main__": 112 | unittest.main() 113 | 114 | -------------------------------------------------------------------------------- /annogesiclib/gff3.py: -------------------------------------------------------------------------------- 1 | import csv 2 | 3 | 4 | class Gff3Parser(object): 5 | """ 6 | A format description can be found at: 7 | http://genome.ucsc.edu/FAQ/FAQformat.html#format3 8 | http://www.sequenceontology.org/gff3.shtml 9 | 10 | a validator can be found here: 11 | http://modencode.oicr.on.ca/cgi-bin/validate_gff3_online 12 | 13 | WARNING: Currently this class in not strict enough and would also 14 | parse file not following the standard. 
15 | """ 16 | 17 | def entries(self, input_gff_fh): 18 | """ 19 | """ 20 | for entry_dict in csv.DictReader( 21 | input_gff_fh, delimiter="\t", 22 | fieldnames=["seq_id", "source", "feature", "start", 23 | "end", "score", "strand", "phase", "attributes"]): 24 | if entry_dict["seq_id"].startswith("#"): 25 | continue 26 | yield self._dict_to_entry(entry_dict) 27 | 28 | def _dict_to_entry(self, entry_dict): 29 | return Gff3Entry(entry_dict) 30 | 31 | 32 | class Gff3Entry(object): 33 | 34 | """ 35 | 36 | Example: 37 | start, end = sorted([int(pos) for pos in [start, end]]) 38 | Gff3Entry({ 39 | "seq_id" : seq_id, 40 | "source" : "MyLab", 41 | "feature" : "sRNA", 42 | "start" : start, 43 | "end" : end, 44 | "strand" : strand, 45 | "score" : ".", 46 | "phase" : ".", 47 | "attributes" : "name=%s;locus_tag=%s" % (name, locus_tag)}) 48 | """ 49 | 50 | def __init__(self, entry_dict): 51 | self.seq_id = entry_dict["seq_id"] 52 | self.source = entry_dict["source"] 53 | self.feature = entry_dict["feature"] 54 | # 1-based coordinates 55 | # Make sure that start <= end 56 | start, end = sorted([int(entry_dict["start"]), int(entry_dict["end"])]) 57 | self.start = start 58 | self.end = end 59 | self.score = entry_dict["score"] 60 | self.strand = entry_dict["strand"] 61 | self.phase = entry_dict["phase"] 62 | self.attributes = self._attributes(entry_dict["attributes"]) 63 | self.attribute_string = entry_dict["attributes"] 64 | self.info = "\t".join([str(field) for field in [ 65 | self.seq_id, self.source, self.feature, self.start, 66 | self.end, self.score, self.strand, self.phase, 67 | self.attribute_string]]) 68 | self.info_without_attributes = "\t".join([str(field) for field in [ 69 | self.seq_id, self.source, self.feature, self.start, 70 | self.end, self.score, self.strand, self.phase]]) 71 | 72 | def _attributes(self, attributes_string): 73 | """Translate the attribute string to dictionary""" 74 | attributes = {} 75 | if len(attributes_string) > 0: 76 | for attribute in attributes_string.split(";"): 77 | key_value_pair = attribute.split("=") 78 | key = key_value_pair[0] 79 | if len(key_value_pair) > 2: 80 | value = "=".join(key_value_pair[1:]) 81 | elif len(key_value_pair) == 2: 82 | value = key_value_pair[1] 83 | else: 84 | value = "" 85 | attributes[key] = value 86 | return attributes 87 | else: 88 | return attributes 89 | 90 | def add_attribute(self, key, value): 91 | self.attributes[key] = value 92 | self.attribute_string = ";".join( 93 | ["=".join(items) for items in self.attributes.items()]) 94 | 95 | def __str__(self): 96 | return "\t".join([str(field) for field in [ 97 | self.seq_id, self.source, self.feature, self.start, 98 | self.end, self.score, self.strand, self.phase, 99 | self.attribute_string]]) 100 | -------------------------------------------------------------------------------- /tests/test_paths.py: -------------------------------------------------------------------------------- 1 | import os 2 | import unittest 3 | import shutil 4 | import sys 5 | sys.path.append(".") 6 | from annogesiclib.paths import Paths 7 | 8 | 9 | class TestPaths(unittest.TestCase): 10 | 11 | def setUp(self): 12 | self.test_folder = "test_folder" 13 | if not os.path.exists(self.test_folder): 14 | os.mkdir(self.test_folder) 15 | self.paths = Paths(base_path=self.test_folder) 16 | self.folder_names = [ 17 | self.paths.input_folder, 18 | self.paths.output_folder, 19 | self.paths.reference_input_folder, 20 | self.paths.wig_folder, 21 | self.paths.mutation_table_folder, 22 | self.paths.database_folder, 23 | 
self.paths.manual_TSS_folder, 24 | self.paths.manual_pro_folder, 25 | self.paths.read_folder, 26 | self.paths.bam_folder, 27 | self.paths.target_folder, 28 | self.paths.ratt_folder, 29 | self.paths.tsspredator_folder, 30 | self.paths.utr_folder, 31 | self.paths.transterm_folder, 32 | self.paths.transcript_output_folder, 33 | self.paths.processing_site_folder, 34 | self.paths.srna_folder, 35 | self.paths.sorf_folder, 36 | self.paths.promoter_output_folder, 37 | self.paths.operon_output_folder, 38 | self.paths.circrna_output_folder, 39 | self.paths.goterm_output_folder, 40 | self.paths.starget_output_folder, 41 | self.paths.snp_output_folder, 42 | self.paths.ppi_output_folder, 43 | self.paths.sublocal_output_folder, 44 | self.paths.ribos_output_folder] 45 | 46 | def tearDown(self): 47 | if os.path.exists(self.test_folder): 48 | shutil.rmtree(self.test_folder) 49 | 50 | def test_set_folder_names(self): 51 | self.paths._set_folder_names() 52 | for folder_name in self.folder_names: 53 | assert(folder_name != '') 54 | self.assertEqual(self.folder_names.count(folder_name), 1) 55 | 56 | 57 | def test_required_folders(self): 58 | self.assertEqual(len(self.paths.required_folders("root")), 22) 59 | self.assertEqual(len( 60 | self.paths.required_folders("get_target_fasta")), 25) 61 | self.assertEqual(len(self.paths.required_folders("TSS")), 27) 62 | self.assertEqual(len(self.paths.required_folders("transcript")), 26) 63 | self.assertEqual(len(self.paths.required_folders("terminator")), 27) 64 | self.assertEqual(len( 65 | self.paths.required_folders("annotation_transfer")), 25) 66 | self.assertEqual(len(self.paths.required_folders("utr")), 29) 67 | self.assertEqual(len(self.paths.required_folders("promoter")), 23) 68 | self.assertEqual(len(self.paths.required_folders("operon")), 26) 69 | self.assertEqual(len(self.paths.required_folders("srna")), 37) 70 | self.assertEqual(len(self.paths.required_folders("sorf")), 30) 71 | self.assertEqual(len(self.paths.required_folders("processing")), 27) 72 | self.assertEqual(len(self.paths.required_folders("riboswitch")), 27) 73 | self.assertEqual(len(self.paths.required_folders("go_term")), 29) 74 | self.assertEqual(len(self.paths.required_folders("ppi_network")), 26) 75 | self.assertEqual(len(self.paths.required_folders("circrna")), 28) 76 | self.assertEqual(len(self.paths.required_folders("crispr")), 26) 77 | self.assertEqual(len(self.paths.required_folders("thermometer")), 27) 78 | self.assertEqual(len(self.paths.required_folders("snp")), 39) 79 | self.assertEqual(len( 80 | self.paths.required_folders("subcellular_localization")), 29) 81 | self.assertEqual(len(self.paths.required_folders("srna_target")), 29) 82 | 83 | if __name__ == "__main__": 84 | unittest.main() 85 | -------------------------------------------------------------------------------- /tests/test_operon.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import unittest 4 | import shutil 5 | from io import StringIO 6 | sys.path.append(".") 7 | from mock_helper import gen_file 8 | import annogesiclib.operon as op 9 | from annogesiclib.operon import OperonDetection 10 | from mock_args_container import MockClass 11 | 12 | 13 | class Mock_func(object): 14 | 15 | def mock_operon(self, tran, tss, gff, term, tss_fuzzy, 16 | term_fuzzy, length, out_table, out_gff): 17 | gen_file(out_table, "test") 18 | 19 | def mock_stat(self, table, out_stat): 20 | gen_file(out_stat, "test") 21 | 22 | def mock_combine_gff(self, gff, tran, tss, utr5, utr3, 
term, 23 | tss_fuzzy, term_fuzzy, out_file): 24 | gen_file(out_file, "test") 25 | 26 | class TestOperonDetection(unittest.TestCase): 27 | 28 | def setUp(self): 29 | self.test_folder = "test_folder" 30 | self.mock_args = MockClass() 31 | self.mock = Mock_func() 32 | self.tsss = os.path.join(self.test_folder, "tsss") 33 | self.trans = os.path.join(self.test_folder, "trans") 34 | self.utr5s = os.path.join(self.test_folder, "utr5s") 35 | self.utr3s = os.path.join(self.test_folder, "utr3s") 36 | self.output = os.path.join(self.test_folder, "output") 37 | self.gffs = os.path.join(self.test_folder, "gffs") 38 | self.out_gff = os.path.join(self.output, "gffs") 39 | self.stat = os.path.join(self.test_folder, "stat") 40 | if (not os.path.exists(self.test_folder)): 41 | os.mkdir(self.test_folder) 42 | os.mkdir(self.gffs) 43 | os.mkdir(self.tsss) 44 | os.mkdir(self.stat) 45 | os.mkdir(os.path.join(self.tsss, "tmp")) 46 | os.mkdir(self.trans) 47 | os.mkdir(os.path.join(self.trans, "tmp")) 48 | os.mkdir(self.utr5s) 49 | os.mkdir(os.path.join(self.utr5s, "tmp")) 50 | os.mkdir(self.utr3s) 51 | os.mkdir(os.path.join(self.utr3s, "tmp")) 52 | os.mkdir(self.output) 53 | os.mkdir(self.out_gff) 54 | os.mkdir(os.path.join(self.output, "tables")) 55 | args = self.mock_args.mock() 56 | args.tsss = self.tsss 57 | args.trans = self.trans 58 | args.utr5s = self.utr5s 59 | args.utr3s = self.utr3s 60 | args.output_folder = self.output 61 | args.terms = None 62 | self.operon = OperonDetection(args) 63 | 64 | def tearDown(self): 65 | if os.path.exists(self.test_folder): 66 | shutil.rmtree(self.test_folder) 67 | 68 | def test_detect_operon(self): 69 | op.operon = self.mock.mock_operon 70 | gen_file(os.path.join(self.tsss, "tmp", "test_TSS.gff"), "test") 71 | gen_file(os.path.join(self.trans, "tmp", 72 | "test_transcript.gff"), "test") 73 | gen_file(os.path.join(self.gffs, "test.gff"), "test") 74 | args = self.mock_args.mock() 75 | args.gffs = self.out_gff 76 | args.term_fuzzy = 3 77 | args.tss_fuzzy = 3 78 | args.length = 100 79 | log = open(os.path.join(self.test_folder, "test.log"), "w") 80 | self.operon._detect_operon(["test"], args, log) 81 | self.assertTrue(os.path.exists(os.path.join(self.output, "tables", 82 | "test_operon.csv"))) 83 | log.close() 84 | 85 | def test_stat(self): 86 | op.stat = self.mock.mock_stat 87 | table_file = os.path.join(self.output, "tables", "test_operon.csv") 88 | log = open(os.path.join(self.test_folder, "test.log"), "w") 89 | if not os.path.exists(table_file): 90 | gen_file(table_file, "test") 91 | self.operon._stat(os.path.join(self.output, "tables"), self.stat, log) 92 | self.assertTrue(os.path.exists(os.path.join( 93 | self.stat, "stat_test_operon.csv"))) 94 | log.close() 95 | 96 | if __name__ == "__main__": 97 | unittest.main() 98 | 99 | -------------------------------------------------------------------------------- /annogesiclib/get_input.py: -------------------------------------------------------------------------------- 1 | import os 2 | import csv 3 | import shutil 4 | from subprocess import call 5 | from annogesiclib.seq_editer import SeqEditer 6 | 7 | 8 | def wget(input_folder, ftp, files_type, log): 9 | log.write("\t" + " ".join(["wget", "-cP", input_folder, ftp + "/*" + files_type]) + "\n") 10 | os.system(" ".join(["wget", "-cP", input_folder, ftp + "/*" + files_type])) 11 | log.write("Done!\n") 12 | 13 | def deal_detect(input_file, file_path, change, input_folder): 14 | '''deal with the header of fasta file and 15 | put the files to corresponding folders''' 16 | if change: 
17 | shutil.move(input_file, file_path) 18 | change = False 19 | SeqEditer().modify_header(file_path) 20 | with open(os.path.join(file_path)) as fh: 21 | for line in fh: 22 | line = line.strip() 23 | if line.startswith(">"): 24 | seq_name = line[1:] 25 | shutil.move(file_path, 26 | os.path.join(input_folder, seq_name + ".fa")) 27 | return change, seq_name 28 | 29 | 30 | def get_file(ftp, input_folder, files_type, log): 31 | checks = {"detect": False, "change": None} 32 | filename = None 33 | files = [] 34 | wget(input_folder, ftp, files_type, log) 35 | for file_ in os.listdir(input_folder): 36 | input_file = os.path.join(input_folder, file_) 37 | if (file_[-3:] == "fna"): 38 | filename = file_[0:-3] + "fa" 39 | checks = {"detect": True, "change": True} 40 | elif (file_[-5:] == "fasta"): 41 | filename = file_[0:-5] + "fa" 42 | checks = {"detect": True, "change": True} 43 | elif (file_[-2:] == "fa"): 44 | filename = file_[0:-2] + "fa" 45 | checks = {"detect": True, "change": True} 46 | elif (file_[-6:] == "fna.gz") and ("_genomic" in file_): 47 | if ("_cds_from_genomic" in file_) or ( 48 | "_rna_from_genomic" in file_): 49 | os.remove(input_file) 50 | else: 51 | filename = file_[0:-6] + "fa" 52 | checks = {"detect": True, "change": True} 53 | log.write("\tgunzip " + input_file + "\n") 54 | call(["gunzip", input_file]) 55 | input_file = input_file[:-3] 56 | elif (file_[-6:] == "gff.gz") or (file_[-3:] == "gff"): 57 | if ("_genomic" in file_) and (file_[-6:] == "gff.gz"): 58 | log.write("\tgunzip " + input_file + "\n") 59 | call(["gunzip", input_file]) 60 | input_file = input_file[:-3] 61 | fh = open(input_file, "r") 62 | for row in csv.reader(fh, delimiter='\t'): 63 | if not row[0].startswith("#"): 64 | gff_name = row[0] 65 | break 66 | shutil.move(input_file, os.path.join(input_folder, 67 | gff_name + ".gff")) 68 | fh.close() 69 | elif (file_[-3:] == "gbk") or (file_[-7:] == "gbff.gz") or ( 70 | file_[-4:] == "gbff"): 71 | if (file_[-7:] == "gbff.gz") and ("_genomic" in file_): 72 | log.write("\tgunzip " + input_file + "\n") 73 | call(["gunzip", input_file]) 74 | input_file = input_file[:-3] 75 | with open(input_file, "r") as g_f: 76 | for line in g_f: 77 | line = line.strip() 78 | if line.startswith("VERSION"): 79 | for data in line.split(" "): 80 | if (len(data) != 0) and (data != "VERSION"): 81 | break 82 | break 83 | print(os.path.join(input_folder, data + ".gbk")) 84 | shutil.move(input_file, os.path.join(input_folder, data + ".gbk")) 85 | if checks["detect"]: 86 | checks["detect"] = False 87 | checks["change"], seq_name = deal_detect( 88 | input_file, filename, checks["change"], input_folder) 89 | -------------------------------------------------------------------------------- /tests/test_plot_TSS_venn.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import unittest 4 | import shutil 5 | from io import StringIO 6 | sys.path.append(".") 7 | from mock_gff3 import Create_generator 8 | from mock_helper import gen_file 9 | import annogesiclib.plot_TSS_venn as ptv 10 | 11 | 12 | class Mock_func(object): 13 | 14 | def mock_plot_text(plt, xy1, xy2, tss_type, size, color_text): 15 | pass 16 | 17 | class TestPlotTSSVenn(unittest.TestCase): 18 | 19 | def setUp(self): 20 | self.test_folder = "test_folder" 21 | if (not os.path.exists(self.test_folder)): 22 | os.mkdir(self.test_folder) 23 | self.mock = Mock_func() 24 | self.example = Example() 25 | 26 | def tearDown(self): 27 | if os.path.exists(self.test_folder): 28 | 
shutil.rmtree(self.test_folder) 29 | 30 | def test_check_tss_class(self): 31 | strain = "test" 32 | tss_type = "Internal" 33 | total_types = {"test": {}} 34 | ptv.check_tss_class(total_types, strain, 35 | self.example.tsss[0], tss_type) 36 | self.assertDictEqual(total_types, {'test': {'Internal': 0}}) 37 | tss_type = "Primary" 38 | ptv.check_tss_class(total_types, strain, 39 | self.example.tsss[0], tss_type) 40 | self.assertDictEqual(total_types, {'test': { 41 | 'Internal': 0, 'Primary': 1}}) 42 | 43 | def test_import_types(self): 44 | tsss = {"test": self.example.tsss} 45 | types, total_types = ptv.import_types(tsss) 46 | self.assertDictEqual(types, {'test': {'Orphan': 1, 'Internal': 1, 47 | 'Primary': 1}, 'all': {}}) 48 | self.assertDictEqual(total_types, { 49 | 'test': {'Orphan': 1, 'Antisense': 0, 50 | 'Secondary': 0, 'Internal': 1, 51 | 'Primary': 1}, 'all': {}}) 52 | 53 | def test_read_gff(self): 54 | tss_file = os.path.join(self.test_folder, "test.gff") 55 | gen_file(tss_file, self.example.tss_file) 56 | tsss, tss_num = ptv.read_gff(tss_file) 57 | self.assertEqual(tsss["all"][0].start, 140) 58 | self.assertEqual(tsss["aaa"][0].start, 140) 59 | self.assertDictEqual(tss_num, {'all': 1, 'aaa': 1}) 60 | 61 | def test_plot(self): 62 | types = {'test': {'Orphan': 1, 'Internal': 1, 63 | 'Primary': 1}, 'all': {}} 64 | total_types = {'test': {'Orphan': 1, 'Antisense': 0, 65 | 'Secondary': 0, 'Internal': 1, 66 | 'Primary': 1}, 'all': {}} 67 | tss_num = {'all': 0, 'test': 3} 68 | ptv.plot(types, "TSS", "TSS", total_types, tss_num) 69 | self.assertTrue(os.path.exists("TSS_venn_test.png")) 70 | os.remove("TSS_venn_test.png") 71 | 72 | class Example(object): 73 | 74 | tss_dict = [ 75 | {"seq_id": "aaa", "source": "Refseq", "feature": "TSS", "start": 140, 76 | "end": 140, "phase": ".", "strand": "+", "score": "."}, 77 | {"seq_id": "aaa", "source": "Refseq", "feature": "TSS", "start": 230, 78 | "end": 230, "phase": ".", "strand": "+", "score": "."}, 79 | {"seq_id": "bbb", "source": "Refseq", "feature": "TSS", "start": 5166, 80 | "end": 5166, "phase": ".", "strand": "-", "score": "."}] 81 | attributes_tss = [{"ID": "tss0", "Name": "TSS_0", "type": "Primary", 82 | "associated_gene": "AAA_00001"}, 83 | {"ID": "tss1", "Name": "TSS_1", "type": "Internal", 84 | "associated_gene": "AAA_00002"}, 85 | {"ID": "tss2", "Name": "TSS_2", "type": "Orphan", 86 | "associated_gene": "orphan"}] 87 | tsss = [] 88 | for index in range(0, 3): 89 | tsss.append(Create_generator( 90 | tss_dict[index], attributes_tss[index], "gff")) 91 | tss_file = """aaa\tRefseq\tTSS\t140\t140\t.\t+\t.\tID=TSS_0;Name=TSS_00000;associated_gene=AAA_00001;type=Primary""" 92 | 93 | if __name__ == "__main__": 94 | unittest.main() 95 | 96 | -------------------------------------------------------------------------------- /annogesiclib/color_png.py: -------------------------------------------------------------------------------- 1 | import os 2 | from subprocess import call 3 | from annogesiclib.gen_svg import gen_svg 4 | from annogesiclib.helper import Helper 5 | 6 | 7 | class ColorPNG(object): 8 | 9 | def _convert_svg(self, imagemagick_path, out_path, screenshot, svg_file, log): 10 | call([imagemagick_path, 11 | os.path.join(out_path, screenshot), 12 | os.path.join(out_path, svg_file)]) 13 | log.write("\t" + " ".join([imagemagick_path, 14 | os.path.join(out_path, screenshot), 15 | os.path.join(out_path, svg_file)]) + "\n") 16 | 17 | def _convert_png(self, imagemagick_path, out_path, screenshot, png_file, log): 18 | call([imagemagick_path, 
"-background", "none", 19 | os.path.join(out_path, screenshot), 20 | os.path.join(out_path, png_file)]) 21 | log.write("\t" + " ".join([imagemagick_path, "-background", "none", 22 | os.path.join(out_path, screenshot), 23 | os.path.join(out_path, png_file)]) + "\n") 24 | 25 | def generate_color_png(self, track_num, out_folder, imagemagick_path, log): 26 | '''generation of color png based on tracks''' 27 | out_folder = os.path.join(out_folder, "screenshots") 28 | for strain in os.listdir(out_folder): 29 | if os.path.isdir(os.path.join(out_folder, strain)): 30 | for strand in ["forward", "reverse"]: 31 | print("Running for {0}_{1}".format(strain, strand)) 32 | out_path = os.path.join(out_folder, strain, strand) 33 | # convert original png to svg and give color on it. 34 | log.write("Converting png file in {0} to svg.\n".format( 35 | out_path)) 36 | log.write("Colorizing svg files.\n" 37 | "Make sure the version of ImageMagick is " 38 | "at least 6.9.0-0.\n") 39 | for screenshot in os.listdir(out_path): 40 | if screenshot.endswith(".png"): 41 | print("Converting {0} to svg files and " 42 | "Painting tracks now".format( 43 | screenshot)) 44 | svg_file = screenshot.replace(".png", ".svg") 45 | self._convert_svg(imagemagick_path, out_path, 46 | screenshot, svg_file, log) 47 | with open(os.path.join( 48 | out_path, svg_file), "r") as f_h: 49 | for line in f_h: 50 | line = line.strip() 51 | if line.startswith(" norm.start: 9 | frag.start = norm.start 10 | norm.attributes["print"] = True 11 | frag.attributes["print"] = True 12 | 13 | 14 | def print_file(data, out, name, num): 15 | attributes = {} 16 | attributes["ID"] = data.seq_id + "_transcript" + str(num) 17 | attributes["Name"] = "transcript_" + name 18 | attributes["detect_lib"] = data.attributes["detect_lib"] 19 | attribute_string = ";".join(["=".join(items) 20 | for items in attributes.items()]) 21 | out.write("\t".join([str(field) for field in [ 22 | data.seq_id, data.source, data.feature, data.start, 23 | data.end, data.score, data.strand, data.phase, 24 | attribute_string]]) + "\n") 25 | 26 | 27 | def store(data, source, finals): 28 | data.attributes["detect_lib"] = source 29 | data.attributes["print"] = False 30 | finals.append(data) 31 | 32 | 33 | def compare(data1, data2, overlap, tolerance): 34 | '''search the sRNA which can be detected in frag and tex libs. 
35 | Then, try to merge them to be a longer one''' 36 | if (data1.seq_id == data2.seq_id) and (data1.strand == data2.strand): 37 | if (data1.start <= (data2.end + tolerance)) and ( 38 | data1.start >= data2.start) and ( 39 | data1.end >= (data2.end + tolerance)): 40 | modify_position(data1, data2) 41 | overlap = True 42 | elif (data1.end >= (data2.start - tolerance)) and ( 43 | data1.end <= data2.end) and ( 44 | data1.start <= (data2.start - tolerance)): 45 | modify_position(data1, data2) 46 | overlap = True 47 | elif (data1.start <= data2.start) and ( 48 | data1.end >= data2.end): 49 | modify_position(data1, data2) 50 | overlap = True 51 | elif (data2.start <= data1.start) and ( 52 | data2.end >= data1.end): 53 | modify_position(data1, data2) 54 | overlap = True 55 | return overlap 56 | 57 | 58 | def combine(frag_file, tex_file, tolerance, output_file): 59 | '''merge the results of sRNA which detected by fragmented and dRNA''' 60 | frags = [] 61 | norms = [] 62 | finals = [] 63 | out = open(output_file, "w") 64 | out.write("##gff-version 3\n") 65 | f_h = open(frag_file, "r") 66 | for entry in Gff3Parser().entries(f_h): 67 | entry.attributes["print"] = False 68 | frags.append(entry) 69 | f_h.close() 70 | n_h = open(tex_file, "r") 71 | for entry in Gff3Parser().entries(n_h): 72 | entry.attributes["print"] = False 73 | norms.append(entry) 74 | n_h.close() 75 | sort_frags = sorted(frags, key=lambda k: (k.seq_id, k.start, 76 | k.end, k.strand)) 77 | sort_norms = sorted(norms, key=lambda k: (k.seq_id, k.start, 78 | k.end, k.strand)) 79 | for frag in sort_frags: 80 | overlap = False 81 | for norm in sort_norms: 82 | overlap = compare(frag, norm, overlap, tolerance) 83 | if overlap: 84 | store(frag, "fragmented,tex_notex", finals) 85 | else: 86 | store(frag, "fragmented", finals) 87 | for norm in sort_norms: 88 | if not norm.attributes["print"]: 89 | store(norm, "tex_notex", finals) 90 | sort_finals = sorted(finals, key=lambda k: (k.seq_id, k.start, 91 | k.end, k.strand)) 92 | num = 0 93 | for tar in sort_finals: 94 | if tar.attributes["print"]: 95 | continue 96 | overlap = False 97 | for ref in sort_finals: 98 | overlap = compare(tar, ref, overlap, tolerance) 99 | name = '%0*d' % (5, num) 100 | print_file(tar, out, name, num) 101 | num += 1 102 | out.close() 103 | -------------------------------------------------------------------------------- /annogesiclib/stat_operon.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import itertools 3 | 4 | 5 | def _boolean(data): 6 | if data == "False": 7 | result = False 8 | else: 9 | result = True 10 | return result 11 | 12 | 13 | def row_to_location(row): 14 | if row[4] == "0": 15 | sub = False 16 | nosub = True 17 | else: 18 | sub = True 19 | nosub = False 20 | tss = _boolean(row[6]) 21 | term = _boolean(row[8]) 22 | return {"have no sub-operons": nosub, "have sub-operons": sub, 23 | "start with tss": tss, "stop with terminator": term} 24 | 25 | 26 | def plus_num(num_total, strain, type_): 27 | num_total["total"][type_] += 1 28 | num_total[strain][type_] += 1 29 | num_total["total"]["total"] += 1 30 | num_total[strain]["total"] += 1 31 | 32 | 33 | def print_stat(operons, total_num, class_operon, out): 34 | num_features = {} 35 | out.write("Total number of operons is {0}\n".format(total_num)) 36 | out.write("The sub operon and features:\n") 37 | for operon in operons: 38 | for it in range(1, 5): 39 | for features in itertools.combinations(operon.keys(), it): 40 | check_key = 0 41 | for key in features: 
42 | if operon[key]: 43 | if it == 1: 44 | if key in num_features.keys(): 45 | num_features[key] += 1 46 | else: 47 | num_features[key] = 1 48 | check_key += 1 49 | if (check_key == it) and (it != 1): 50 | key = " and ".join(features) 51 | if key in num_features.keys(): 52 | num_features[key] += 1 53 | else: 54 | num_features[key] = 1 55 | for key, value in num_features.items(): 56 | out.write("\tthe number of operons which {0} = {1} ({2})\n".format( 57 | key, value, float(value) / float(total_num))) 58 | out.write("mono/polycistronic:\n") 59 | out.write("\tmonocistronic: {0} ({1})\n".format( 60 | class_operon["mono"], 61 | float(class_operon["mono"]) / float(class_operon["total"]))) 62 | out.write("\tpolycistronic: {0} ({1})\n".format( 63 | class_operon["poly"], 64 | float(class_operon["poly"]) / float(class_operon["total"]))) 65 | 66 | 67 | def stat(input_file, out_file): 68 | out = open(out_file, "w") 69 | operons = {} 70 | operons_all = [] 71 | tmp_id = "" 72 | f_h = open(input_file, "r") 73 | pre_seq_id = "" 74 | total_num = {} 75 | total_num_all = 0 76 | class_operon = {} 77 | class_operon["total"] = {"na": 0, "mono": 0, "poly": 0, "total": 0} 78 | for row in csv.reader(f_h, delimiter="\t"): 79 | if row[0] != "Operon_ID": 80 | if row[0] != tmp_id: 81 | if pre_seq_id != row[1]: 82 | pre_seq_id = row[1] 83 | operons[row[1]] = [] 84 | total_num[row[1]] = 0 85 | class_operon[row[1]] = {"na": 0, "mono": 0, 86 | "poly": 0, "total": 0} 87 | operons[row[1]].append(row_to_location(row)) 88 | operons_all.append(row_to_location(row)) 89 | total_num[row[1]] += 1 90 | total_num_all += 1 91 | if row[-1] == "NA": 92 | plus_num(class_operon, row[1], "na") 93 | elif len(row[-1].split(",")) == 1: 94 | plus_num(class_operon, row[1], "mono") 95 | elif len(row[-1].split(",")) > 1: 96 | plus_num(class_operon, row[1], "poly") 97 | tmp_id = row[0] 98 | if len(operons) > 1: 99 | out.write("All genomes:\n") 100 | print_stat(operons_all, total_num_all, class_operon["total"], out) 101 | for strain in operons.keys(): 102 | out.write("\n" + strain + ":\n") 103 | print_stat(operons[strain], total_num[strain], 104 | class_operon[strain], out) 105 | out.close() 106 | f_h.close() 107 | --------------------------------------------------------------------------------
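
The two annogesiclib modules shown above expose their entry points as plain functions, so they can be driven directly once the package is importable. The sketch below is a minimal, hypothetical driver (the file names are placeholders, not files shipped with the repository): combine() from annogesiclib/combine_frag_tex.py merges transcripts detected in the fragmented and dRNA (TEX+/-) libraries within a positional tolerance, and stat() from annogesiclib/stat_operon.py summarises a tab-separated operon table produced elsewhere in the pipeline.

# Hypothetical driver sketch; file names are placeholders and the
# annogesiclib package from this repository must be on the Python path.
from annogesiclib.combine_frag_tex import combine
from annogesiclib.stat_operon import stat

# Merge transcripts from the fragmented and dRNA (TEX+/-) libraries.
# Entries on the same strand of the same sequence that overlap within the
# tolerance (in nucleotides) are fused into one longer transcript, and the
# detect_lib attribute of each output entry records the supporting libraries.
combine("transcript_fragment.gff",     # placeholder: fragmented-library transcripts
        "transcript_tex_notex.gff",    # placeholder: dRNA-library transcripts
        tolerance=5,
        output_file="transcript_merged.gff")

# Summarise an operon table (tab-separated, header row starting with
# "Operon_ID"), counting sub-operons, TSS/terminator boundaries and
# mono-/polycistronic operons per genome.
stat("operon_table.csv", "stat_operon.txt")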