├── .gitattributes ├── .github └── workflows │ ├── ci.yml │ └── publish_to_pypi.yml ├── .gitignore ├── .pre-commit-config.yaml ├── CITATION.cff ├── LICENSE ├── README.md ├── example ├── example.zip ├── input │ ├── ecoli.faa │ ├── mycoplasma.faa │ └── synechocystis.faa ├── output │ ├── ecoli │ │ ├── cog_classify.tsv │ │ ├── cog_count.tsv │ │ ├── cog_count_barchart.html │ │ ├── cog_count_barchart.png │ │ ├── cog_count_piechart.html │ │ ├── cog_count_piechart.png │ │ ├── cogclassifier.log │ │ └── rpsblast.tsv │ ├── mycoplasma │ │ ├── cog_classify.tsv │ │ ├── cog_count.tsv │ │ ├── cog_count_barchart.html │ │ ├── cog_count_barchart.png │ │ ├── cog_count_piechart.html │ │ ├── cog_count_piechart.png │ │ ├── cogclassifier.log │ │ └── rpsblast.tsv │ └── synechocystis │ │ ├── cog_classify.tsv │ │ ├── cog_count.tsv │ │ ├── cog_count_barchart.html │ │ ├── cog_count_barchart.png │ │ ├── cog_count_piechart.html │ │ ├── cog_count_piechart.png │ │ ├── cogclassifier.log │ │ └── rpsblast.tsv └── plot │ ├── cog_count.tsv │ ├── cog_count_add_no_classify.tsv │ ├── cog_count_change_color.tsv │ └── plot_example.ipynb ├── pyproject.toml ├── requirements-dev.lock ├── requirements.lock ├── src └── cogclassifier │ ├── __init__.py │ ├── __main__.py │ ├── blast.py │ ├── cog.py │ ├── const.py │ ├── logger.py │ ├── main.py │ ├── plot.py │ ├── resources │ ├── cog_definition.tsv │ └── cog_func_category.tsv │ ├── scripts │ ├── __init__.py │ ├── cogclassifier.py │ ├── plot_cog_count_barchart.py │ └── plot_cog_count_piechart.py │ └── utils.py └── tests ├── conftest.py ├── data ├── cog_count.tsv └── example.faa ├── scripts ├── test_cogclassifier.py ├── test_plot_cog_count_barchart.py └── test_plot_cog_count_piechart.py └── test_cog.py /.gitattributes: -------------------------------------------------------------------------------- 1 | *.html linguist-documentation 2 | *.ipynb linguist-documentation 3 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | on: 3 | push: 4 | branches: [main, develop] 5 | paths: ["src/**", "tests/**", ".github/workflows/ci.yml"] 6 | pull_request: 7 | branches: [main, develop] 8 | paths: ["src/**", "tests/**", ".github/workflows/ci.yml"] 9 | workflow_dispatch: 10 | 11 | jobs: 12 | CI: 13 | runs-on: ${{ matrix.os }} 14 | strategy: 15 | matrix: 16 | os: [ubuntu-latest] 17 | python-version: ["3.9", "3.10", "3.11", "3.12", "3.13"] 18 | steps: 19 | - name: Checkout 20 | uses: actions/checkout@v4 21 | 22 | - name: Install Rye 23 | run: | 24 | curl -sSf https://rye.astral.sh/get | RYE_INSTALL_OPTION="--yes" bash 25 | echo "$HOME/.rye/shims" >> $GITHUB_PATH 26 | 27 | - name: Setup Python ${{matrix.python-version}} & Dependencies 28 | run: | 29 | rye pin ${{ matrix.python-version }} 30 | rye sync --update-all --all-features 31 | 32 | - name: Install external tool dependencies 33 | run: | 34 | sudo apt update -y 35 | sudo apt install -y ncbi-blast+ 36 | if: ${{ matrix.os=='ubuntu-latest' }} 37 | 38 | - name: Run ruff lint check 39 | run: rye run ruff check --diff 40 | 41 | - name: Run ruff format check 42 | run: rye run ruff format --check --diff 43 | 44 | - name: Run pytest 45 | run: rye run pytest 46 | -------------------------------------------------------------------------------- /.github/workflows/publish_to_pypi.yml: -------------------------------------------------------------------------------- 1 | name: Publish to PyPI 2 | on: 3 | release: 4 | types: [released] 5 | workflow_dispatch: 6 | 7 | jobs: 8 | publish_to_pypi: 9 | name: Publish to PyPI 10 | runs-on: ubuntu-latest 11 | env: 12 | PYPI_USERNAME: ${{ secrets.PYPI_USERNAME }} 13 | PYPI_PASSWORD: ${{ secrets.PYPI_PASSWORD }} 14 | steps: 15 | - name: Checkout 16 | uses: actions/checkout@v4 17 | 18 | - name: Install Rye 19 | run: | 20 | curl -sSf https://rye.astral.sh/get | RYE_INSTALL_OPTION="--yes" bash 21 | echo "$HOME/.rye/shims" >> $GITHUB_PATH 22 | 23 | - name: Build 24 | run: rye build 25 | 26 | - name: Publish 27 | run: rye publish -u $PYPI_USERNAME --token $PYPI_PASSWORD -y 28 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | cog_download/ 2 | # example/ 3 | # output/ 4 | .vscode/ 5 | 6 | # Byte-compiled / optimized / DLL files 7 | __pycache__/ 8 | *.py[cod] 9 | *$py.class 10 | 11 | # C extensions 12 | *.so 13 | 14 | # Distribution / packaging 15 | .Python 16 | build/ 17 | develop-eggs/ 18 | dist/ 19 | downloads/ 20 | eggs/ 21 | .eggs/ 22 | lib/ 23 | lib64/ 24 | parts/ 25 | sdist/ 26 | var/ 27 | wheels/ 28 | pip-wheel-metadata/ 29 | share/python-wheels/ 30 | *.egg-info/ 31 | .installed.cfg 32 | *.egg 33 | MANIFEST 34 | 35 | # PyInstaller 36 | # Usually these files are written by a python script from a template 37 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 38 | *.manifest 39 | *.spec 40 | 41 | # Installer logs 42 | pip-log.txt 43 | pip-delete-this-directory.txt 44 | 45 | # Unit test / coverage reports 46 | htmlcov/ 47 | .tox/ 48 | .nox/ 49 | .coverage 50 | .coverage.* 51 | .cache 52 | nosetests.xml 53 | coverage.xml 54 | *.cover 55 | *.py,cover 56 | .hypothesis/ 57 | .pytest_cache/ 58 | 59 | # Translations 60 | *.mo 61 | *.pot 62 | 63 | # Django stuff: 64 | # *.log 65 | local_settings.py 66 | db.sqlite3 67 | db.sqlite3-journal 68 | 69 | # Flask stuff: 70 | instance/ 71 | .webassets-cache 72 | 73 | # Scrapy stuff: 74 | .scrapy 75 | 76 | # Sphinx documentation 77 | docs/_build/ 78 | 79 | # PyBuilder 80 | target/ 81 | 82 | # Jupyter Notebook 83 | .ipynb_checkpoints 84 | 85 | # IPython 86 | profile_default/ 87 | ipython_config.py 88 | 89 | # pyenv 90 | .python-version 91 | 92 | # pipenv 93 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 94 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 95 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 96 | # install all needed dependencies. 97 | #Pipfile.lock 98 | 99 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 100 | __pypackages__/ 101 | 102 | # Celery stuff 103 | celerybeat-schedule 104 | celerybeat.pid 105 | 106 | # SageMath parsed files 107 | *.sage.py 108 | 109 | # Environments 110 | .env 111 | .venv 112 | env/ 113 | venv/ 114 | ENV/ 115 | env.bak/ 116 | venv.bak/ 117 | 118 | # Spyder project settings 119 | .spyderproject 120 | .spyproject 121 | 122 | # Rope project settings 123 | .ropeproject 124 | 125 | # mkdocs documentation 126 | /site 127 | 128 | # mypy 129 | .mypy_cache/ 130 | .dmypy.json 131 | dmypy.json 132 | 133 | # Pyre type checker 134 | .pyre/ 135 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | # See https://pre-commit.com for more information 2 | # See https://pre-commit.com/hooks.html for more hooks 3 | repos: 4 | - repo: https://github.com/astral-sh/ruff-pre-commit 5 | rev: v0.11.6 6 | hooks: 7 | - id: ruff 8 | name: ruff lint check 9 | types_or: [python, pyi] 10 | args: [--fix] 11 | - id: ruff-format 12 | name: ruff format check 13 | types_or: [python, pyi] 14 | -------------------------------------------------------------------------------- /CITATION.cff: -------------------------------------------------------------------------------- 1 | cff-version: 1.2.0 2 | message: If you use this software, please cite it as below. 3 | authors: 4 | - family-names: Shimoyama 5 | given-names: Yuki 6 | title: "COGclassifier: A tool for classifying prokaryote protein sequences into COG functional category" 7 | date-released: 2022-03-20 8 | url: https://github.com/moshi4/COGclassifier 9 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 moshi 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # COGclassifier 2 | 3 | ![Python3](https://img.shields.io/badge/Language-Python3-steelblue) 4 | ![OS](https://img.shields.io/badge/OS-Windows_|_Mac_|_Linux-steelblue) 5 | ![License](https://img.shields.io/badge/License-MIT-steelblue) 6 | [![Latest PyPI version](https://img.shields.io/pypi/v/cogclassifier.svg)](https://pypi.python.org/pypi/cogclassifier) 7 | [![Bioconda](https://img.shields.io/conda/vn/bioconda/cogclassifier.svg?color=green)](https://anaconda.org/bioconda/cogclassifier) 8 | ![CI workflow](https://github.com/moshi4/COGclassifier/actions/workflows/ci.yml/badge.svg) 9 | 10 | ## Table of Contents 11 | 12 | - [Overview](#overview) 13 | - [Installation](#installation) 14 | - [Workflow](#workflow) 15 | - [Usage](#usage) 16 | - [Output Contents](#output-contents) 17 | - [Customize Charts](#customize-charts) 18 | 19 | ## Overview 20 | 21 | COG(Cluster of Orthologous Genes) is a database that plays an important role in the annotation, classification, and analysis of microbial gene function. 22 | Functional annotation, classification, and analysis of each gene in newly sequenced bacterial genomes using the COG database is a common task. 23 | However, there was no COG functional classification command line software that is easy-to-use and capable of producing publication-ready figures. 24 | Therefore, I developed COGclassifier to fill this need. 25 | COGclassifier can automatically perform the processes from searching query sequences into the COG database, to annotation and classification of gene functions, to generation of publication-ready figures (See figure below). 26 | 27 | ![ecoli_barchart_fig](https://raw.githubusercontent.com/moshi4/COGclassifier/main/example/output/ecoli/cog_count_barchart.png) 28 | Fig.1: Barchart of COG funcitional category classification result for E.coli 29 | 30 | ![ecoli_piechart_fig](https://raw.githubusercontent.com/moshi4/COGclassifier/main/example/output/ecoli/cog_count_piechart.png) 31 | Fig.2: Piechart of COG funcitional category classification result for E.coli 32 | 33 | ## Installation 34 | 35 | `Python 3.9 or later` is required for installation. Installation of RPS-BLAST(ncbi-blast+) is also necessary. 36 | 37 | **Install bioconda package:** 38 | 39 | conda install -c conda-forge -c bioconda cogclassifier 40 | 41 | **Install PyPI stable package:** 42 | 43 | pip install cogclassifier 44 | 45 | ## Workflow 46 | 47 | Description of COGclassifier's automated workflow. 48 | This workflow was created based in part on [cdd2cog](https://github.com/aleimba/bac-genomics-scripts/tree/master/cdd2cog). 49 | 50 | ### 1. Setup COG & CDD resources 51 | 52 | Download & load 4 required COG & CDD files from FTP site. 53 | 54 | - `cog-24.fun.tab` () 55 | Descriptions of COG functional categories. 56 | This resource file is included in the package as `cog_func_category.tsv`. 57 | 58 |
59 | Show more information 60 | 61 | > Tab-delimited plain text file with descriptions of COG functional categories 62 | > The categories form four functional groups: 63 | > 1\. INFORMATION STORAGE AND PROCESSING 64 | > 2\. CELLULAR PROCESSES AND SIGNALING 65 | > 3\. METABOLISM 66 | > 4\. POORLY CHARACTERIZED 67 | > Columns: 68 | > 1\. Functional category ID (one letter) 69 | > 2\. Functional group (1-4, as above) 70 | > 3\. Hexadecimal RGB color associated with the functional category 71 | > 4\. Functional category description 72 | > Each line corresponds to one functional category. The order of the categories is meaningful (reflects a hierarchy of functions; determines the order of display) 73 | > 74 | > (From ) 75 | 76 |
77 | 78 | - `cog-24.def.tab` () 79 | COG descriptions such as 'COG ID', 'COG functional category', 'COG name', etc... 80 | This resource file is included in the package as `cog_definition.tsv`. 81 | 82 |
83 | Show more information 84 | 85 | > Tab-delimited plain text file with COG descriptions 86 | > Columns: 87 | > 1\. COG ID 88 | > 2\. COG functional category (could include multiple letters in the order of importance) 89 | > 3\. COG name 90 | > 4\. Gene name associated with the COG (optional) 91 | > 5\. Functional pathway associated with the COG (optional) 92 | > 6\. PubMed ID, associated with the COG (multiple entries are semicolon-separated; optional) 93 | > 7\. PDB ID of the structure associated with the COG (multiple entries are semicolon-separated; optional) 94 | > Each line corresponds to one COG. The order of the COGs is arbitrary (displayed in the lexicographic order) 95 | > 96 | > (From ) 97 | 98 |
99 | 100 | - `cddid.tbl.gz` () 101 | Summary information about the CD(Conserved Domain) model. 102 | 103 |
104 | Show more information 105 | 106 | >"cddid.tbl.gz" contains summary information about the CD models in this 107 | >distribution, which are part of the default "cdd" search database and are 108 | >indexed in NCBI's Entrez database. This is a tab-delimited text file, with a 109 | >single row per CD model and the following columns: 110 | > 111 | >PSSM-Id (unique numerical identifier) 112 | >CD accession (starting with 'cd', 'pfam', 'smart', 'COG', 'PRK' or "CHL') 113 | >CD "short name" 114 | >CD description 115 | >PSSM-Length (number of columns, the size of the search model) 116 | > 117 | > (From ) 118 | 119 |
120 | 121 | - `Cog_LE.tar.gz` () 122 | COG database, a part of CDD(Conserved Domain Database), for RPS-BLAST search. 123 | 124 | ### 2. RPS-BLAST search against COG database 125 | 126 | Run query sequences RPS-BLAST against COG database [Default: E-value = 1e-2]. 127 | Best-hit (=lowest e-value) blast results are extracted and used in next functional classification step. 128 | 129 | ### 3. Classify query sequences into COG functional category 130 | 131 | From best-hit results, extract relationship between query sequences and COG functional category as described below. 132 | 133 | 1. Best-hit results -> CDD ID 134 | 2. CDD ID -> COG ID (From `cddid.tbl.gz`) 135 | 3. COG ID -> COG Functional Category Letter (From `cog-24.def.tab`) 136 | 4. COG Functional Category Letter -> COG Functional Category Definition (From `cog-24.fun.tab`) 137 | 138 | > :warning: 139 | > If functional category with multiple letters exists, first letter is treated as functional category 140 | > (e.g. COG4862 has multiple letters `KTN`. A letter `K` is treated as functional category). 141 | 142 | Using the above information, the number of query sequences classified into each COG functional category is calculated and 143 | functional annotation and classification results are output. 144 | 145 | ## Usage 146 | 147 | ### Basic Command 148 | 149 | COGclassifier -i [protein fasta file] -o [output directory] 150 | 151 | ### Options 152 | 153 | $ COGclassifier --help 154 | 155 | Usage: COGclassifier [OPTIONS] 156 | 157 | A tool for classifying prokaryote protein sequences into COG functional category 158 | 159 | ╭─ Options ──────────────────────────────────────────────────────────────────────────────────────────────────────────╮ 160 | │ * --infile -i Input query protein fasta file [required] │ 161 | │ * --outdir -o Output directory [required] │ 162 | │ --download_dir -d Download COG & CDD resources directory [default: /home/user/.cache/cogclassifier_v2] │ 163 | │ --thread_num -t RPS-BLAST num_thread parameter [default: MaxThread - 1] │ 164 | │ --evalue -e RPS-BLAST e-value parameter [default: 0.01] │ 165 | │ --quiet -q No print log on screen │ 166 | │ --version -v Print version information │ 167 | │ --help -h Show this message and exit. │ 168 | ╰────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ 169 | 170 | ### Example Command 171 | 172 | Click [here](https://github.com/moshi4/COGclassifier/raw/main/example/example.zip) to download example protein fasta files. 173 | 174 | COGclassifier -i ./example/ecoli.faa -o ./ecoli_cogclassifier 175 | 176 | ## Output Contents 177 | 178 | - **`rpsblast.tsv`** ([example](https://github.com/moshi4/COGclassifier/blob/main/example/output/mycoplasma/rpsblast.tsv)) 179 | RPS-BLAST against COG database result (format = `outfmt 6`). 180 | 181 | - **`cog_classify.tsv`** ([example](https://github.com/moshi4/COGclassifier/blob/main/example/output/mycoplasma/cog_classify.tsv)) 182 | Query sequences classified into COG functional category result. 183 | This file contains all classified query sequences and associated COG information. 184 | 185 |
186 | Table of detailed tsv format information (9 columns) 187 | 188 | | Columns | Contents | Example Value | 189 | | ---------------- | -------------------------------------- | ----------------------------------- | 190 | | QUERY_ID | Query ID | NP_414544.1 | 191 | | COG_ID | COG ID of RPS-BLAST top hit result | COG0083 | 192 | | CDD_ID | CDD ID of RPS-BLAST top hit result | 223161 | 193 | | EVALUE | RPS-BLAST top hit evalue | 2.5e-150 | 194 | | IDENTITY | RPS-BLAST top hit identity | 45.806 | 195 | | GENE_NAME | Abbreviated gene name | ThrB | 196 | | COG_NAME | COG gene name | Homoserine kinase | 197 | | COG_LETTER | Letter of COG functional category | E | 198 | | COG_DESCRIPTION | Description of COG functional category | Amino acid transport and metabolism | 199 | 200 |
201 | 202 | - **`cog_count.tsv`** ([example](https://github.com/moshi4/COGclassifier/blob/main/example/output/ecoli/cog_count.tsv)) 203 | Count classified sequences per COG functional category result. 204 | 205 |
206 | Table of detailed tsv format information (5 columns) 207 | 208 | | Columns | Contents | Example Value | 209 | | ------------| --------------------------------------- | ----------------------------------------------- | 210 | | LETTER | Letter of COG functional category | J | 211 | | COUNT | Count of COG classified sequence | 259 | 212 | | GROUP | COG functional group | INFORMATION STORAGE AND PROCESSING | 213 | | COLOR | Symbol color of COG functional category | #FCCCFC | 214 | | DESCRIPTION | Description of COG functional category | Translation, ribosomal structure and biogenesis | 215 | 216 |
217 | 218 | - **`cogclassifier.log`** ([example](https://github.com/moshi4/COGclassifier/blob/main/example/output/ecoli/cogclassifier.log)) 219 | COGclassifier log file. 220 | 221 | - **`cog_count_barchart.[png|html]`** 222 | Barchart of COG funcitional category classification result. 223 | COGclassifier uses [`Altair`](https://altair-viz.github.io/) visualization library for plotting charts. 224 | 225 | ![cog_count_barchart](https://raw.githubusercontent.com/moshi4/COGclassifier/main/example/output/ecoli/cog_count_barchart.png) 226 | 227 | - **`cog_count_piechart.[png|html]`** 228 | Piechart of COG funcitional category classification result. 229 | Functional category with percentages less than 1% don't display letter on piechart. 230 | 231 | ![cog_count_piechart](https://raw.githubusercontent.com/moshi4/COGclassifier/main/example/output/ecoli/cog_count_piechart.png) 232 | 233 | ## Customize Charts 234 | 235 | COGclassifier also provides barchart & piechart plotting API/CLI to customize charts appearence. 236 | See [notebooks](https://github.com/moshi4/COGclassifier/blob/main/example/plot/plot_example.ipynb) and command below for details. 237 | 238 | ### plot_cog_count_barchart 239 | 240 | $ plot_cog_count_barchart --help 241 | 242 | Usage: plot_cog_count_barchart [OPTIONS] 243 | 244 | Plot COGclassifier count barchart figure 245 | 246 | ╭─ Options ───────────────────────────────────────────────────────────────────────────────────╮ 247 | │ * --infile -i Input COG count result file ('cog_count.tsv') [required] │ 248 | │ * --outfile -o Output barchart figure file (*.png|*.svg|*.html) [required] │ 249 | │ --width Figure pixel width [default: 440] │ 250 | │ --height Figure pixel height [default: 340] │ 251 | │ --bar_width Figure bar width [default: 15] │ 252 | │ --y_limit Y-axis max limit value │ 253 | │ --percent_style Plot percent style instead of number count │ 254 | │ --sort Enable descending sort by number count │ 255 | │ --dpi Figure DPI [default: 100] │ 256 | │ --help -h Show this message and exit. │ 257 | ╰─────────────────────────────────────────────────────────────────────────────────────────────╯ 258 | 259 | ### plot_cog_count_piechart 260 | 261 | $ plot_cog_count_piechart --help 262 | 263 | Usage: plot_cog_count_piechart [OPTIONS] 264 | 265 | Plot COGclassifier count piechart figure 266 | 267 | ╭─ Options ───────────────────────────────────────────────────────────────────────────────────╮ 268 | │ * --infile -i Input COG count result file ('cog_count.tsv') [required] │ 269 | │ * --outfile -o Output piechart figure file (*.png|*.svg|*.html) [required] │ 270 | │ --width Figure pixel width [default: 380] │ 271 | │ --height Figure pixel height [default: 380] │ 272 | │ --show_letter Show functional category lettter on piechart │ 273 | │ --sort Enable descending sort by number count │ 274 | │ --dpi Figure DPI [default: 100] │ 275 | │ --help -h Show this message and exit. │ 276 | ╰─────────────────────────────────────────────────────────────────────────────────────────────╯ 277 | -------------------------------------------------------------------------------- /example/example.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/moshi4/COGclassifier/9a626f609a8f7ea6f82dbc276ba0bcfbd320fa0b/example/example.zip -------------------------------------------------------------------------------- /example/output/ecoli/cog_count.tsv: -------------------------------------------------------------------------------- 1 | LETTER COUNT GROUP COLOR DESCRIPTION 2 | J 264 INFORMATION STORAGE AND PROCESSING #FCCCFC Translation, ribosomal structure and biogenesis 3 | A 8 INFORMATION STORAGE AND PROCESSING #FCDCFC RNA processing and modification 4 | K 272 INFORMATION STORAGE AND PROCESSING #FCDCEC Transcription 5 | L 147 INFORMATION STORAGE AND PROCESSING #FCDCDC Replication, recombination and repair 6 | B 8 INFORMATION STORAGE AND PROCESSING #FCDCCC Chromatin structure and dynamics 7 | D 51 CELLULAR PROCESSES AND SIGNALING #FCFCCC Cell cycle control, cell division, chromosome partitioning 8 | Y 0 CELLULAR PROCESSES AND SIGNALING #FCFCBC Nuclear structure 9 | V 95 CELLULAR PROCESSES AND SIGNALING #FCFCAC Defense mechanisms 10 | T 157 CELLULAR PROCESSES AND SIGNALING #ECFCAC Signal transduction mechanisms 11 | M 280 CELLULAR PROCESSES AND SIGNALING #DCFCAC Cell wall/membrane/envelope biogenesis 12 | N 106 CELLULAR PROCESSES AND SIGNALING #CCFCAC Cell motility 13 | Z 1 CELLULAR PROCESSES AND SIGNALING #BCFCAC Cytoskeleton 14 | W 14 CELLULAR PROCESSES AND SIGNALING #ACFCAC Extracellular structures 15 | U 39 CELLULAR PROCESSES AND SIGNALING #9CFCAC Intracellular trafficking, secretion, and vesicular transport 16 | O 149 CELLULAR PROCESSES AND SIGNALING #9CFC9C Posttranslational modification, protein turnover, chaperones 17 | X 69 CELLULAR PROCESSES AND SIGNALING #9CFC9C Mobilome: prophages, transposons 18 | C 280 METABOLISM #BCFCFC Energy production and conversion 19 | G 369 METABOLISM #CCFCFC Carbohydrate transport and metabolism 20 | E 371 METABOLISM #DCFCFC Amino acid transport and metabolism 21 | F 110 METABOLISM #DCECFC Nucleotide transport and metabolism 22 | H 180 METABOLISM #DCDCFC Coenzyme transport and metabolism 23 | I 127 METABOLISM #DCCCFC Lipid transport and metabolism 24 | P 199 METABOLISM #CCCCFC Inorganic ion transport and metabolism 25 | Q 46 METABOLISM #BCCCFC Secondary metabolites biosynthesis, transport and catabolism 26 | R 159 POORLY CHARACTERIZED #E0E0E0 General function prediction only 27 | S 116 POORLY CHARACTERIZED #CCCCCC Function unknown 28 | -------------------------------------------------------------------------------- /example/output/ecoli/cog_count_barchart.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 16 | 17 | 18 | 19 | 20 | 21 |
22 | 41 | 42 | -------------------------------------------------------------------------------- /example/output/ecoli/cog_count_barchart.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/moshi4/COGclassifier/9a626f609a8f7ea6f82dbc276ba0bcfbd320fa0b/example/output/ecoli/cog_count_barchart.png -------------------------------------------------------------------------------- /example/output/ecoli/cog_count_piechart.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 16 | 17 | 18 | 19 | 20 | 21 |
22 | 41 | 42 | -------------------------------------------------------------------------------- /example/output/ecoli/cog_count_piechart.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/moshi4/COGclassifier/9a626f609a8f7ea6f82dbc276ba0bcfbd320fa0b/example/output/ecoli/cog_count_piechart.png -------------------------------------------------------------------------------- /example/output/ecoli/cogclassifier.log: -------------------------------------------------------------------------------- 1 | 2025-04-19 21:09:56 | INFO | Run COGclassifier v2.0.0 2 | 2025-04-19 21:09:56 | INFO | $ COGclassifier -i example/input/ecoli.faa -o example/output/ecoli 3 | 2025-04-19 21:09:56 | INFO | Operating System: linux 4 | 2025-04-19 21:09:56 | INFO | Python Version: v3.9.19 5 | 2025-04-19 21:09:56 | INFO | Parameter: infile=example/input/ecoli.faa 6 | 2025-04-19 21:09:56 | INFO | Parameter: outdir=example/output/ecoli 7 | 2025-04-19 21:09:56 | INFO | Parameter: download_dir=/home/ys/.cache/cogclassifier_v2 8 | 2025-04-19 21:09:56 | INFO | Parameter: thread_num=11 9 | 2025-04-19 21:09:56 | INFO | Parameter: evalue=0.01 10 | 2025-04-19 21:09:56 | INFO | Download COG & CDD resources in NCBI FTP site 11 | 2025-04-19 21:09:56 | INFO | Download https://ftp.ncbi.nih.gov/pub/mmdb/cdd/cddid.tbl.gz 12 | 2025-04-19 21:09:56 | INFO | => Already file exists /home/ys/.cache/cogclassifier_v2/cddid.tbl.gz 13 | 2025-04-19 21:09:56 | INFO | Download https://ftp.ncbi.nih.gov/pub/mmdb/cdd/little_endian/Cog_LE.tar.gz 14 | 2025-04-19 21:09:56 | INFO | => Already file exists /home/ys/.cache/cogclassifier_v2/Cog_LE.tar.gz 15 | 2025-04-19 21:09:56 | INFO | Load COG Functional Category from /home/ys/Desktop/Project/Bioinformatics/COGclassifier/src/cogclassifier/resources/cog_func_category.tsv 16 | 2025-04-19 21:09:56 | INFO | Load COG Definition from /home/ys/Desktop/Project/Bioinformatics/COGclassifier/src/cogclassifier/resources/cog_definition.tsv 17 | 2025-04-19 21:09:56 | INFO | Load COG <=> CDD ID Conversion Table from /home/ys/.cache/cogclassifier_v2/cddid.tbl.gz 18 | 2025-04-19 21:09:57 | INFO | ********** Start RPS-BLAST(v2.12.0) Search ********** 19 | 2025-04-19 21:09:57 | INFO | $ rpsblast+ -query example/input/ecoli.faa -db /home/ys/.cache/cogclassifier_v2/Cog_LE/Cog -outfmt 6 -out /tmp/tmp5i0hoxl1/rpsblast.tsv -evalue 0.01 -num_threads 11 -mt_mode 1 20 | 2025-04-19 21:10:21 | INFO | ********** Finished RPS-BLAST Search ********** 21 | 2025-04-19 21:10:22 | INFO | 87.37% (3617 / 4140) sequences are classified into COG functional category 22 | 2025-04-19 21:10:22 | INFO | Write rpsblast search result 23 | 2025-04-19 21:10:22 | INFO | => example/output/ecoli/rpsblast.tsv 24 | 2025-04-19 21:10:22 | INFO | Write summary of COG functional category count 25 | 2025-04-19 21:10:22 | INFO | => example/output/ecoli/cog_count.tsv 26 | 2025-04-19 21:10:22 | INFO | Write result of COG classification per query 27 | 2025-04-19 21:10:22 | INFO | => example/output/ecoli/cog_classify.tsv 28 | 2025-04-19 21:10:22 | INFO | Plot COG count barchart figure 29 | 2025-04-19 21:10:22 | INFO | => example/output/ecoli/cog_count_barchart.html 30 | 2025-04-19 21:10:23 | INFO | => example/output/ecoli/cog_count_barchart.png 31 | 2025-04-19 21:10:23 | INFO | Plot COG count piechart figure 32 | 2025-04-19 21:10:23 | INFO | => example/output/ecoli/cog_count_piechart.html 33 | 2025-04-19 21:10:23 | INFO | => example/output/ecoli/cog_count_piechart.png 34 | 2025-04-19 21:10:23 | INFO | Done (elapsed time: 27.20[s]) 35 | -------------------------------------------------------------------------------- /example/output/mycoplasma/cog_count.tsv: -------------------------------------------------------------------------------- 1 | LETTER COUNT GROUP COLOR DESCRIPTION 2 | J 123 INFORMATION STORAGE AND PROCESSING #FCCCFC Translation, ribosomal structure and biogenesis 3 | A 3 INFORMATION STORAGE AND PROCESSING #FCDCFC RNA processing and modification 4 | K 18 INFORMATION STORAGE AND PROCESSING #FCDCEC Transcription 5 | L 38 INFORMATION STORAGE AND PROCESSING #FCDCDC Replication, recombination and repair 6 | B 1 INFORMATION STORAGE AND PROCESSING #FCDCCC Chromatin structure and dynamics 7 | D 22 CELLULAR PROCESSES AND SIGNALING #FCFCCC Cell cycle control, cell division, chromosome partitioning 8 | Y 0 CELLULAR PROCESSES AND SIGNALING #FCFCBC Nuclear structure 9 | V 16 CELLULAR PROCESSES AND SIGNALING #FCFCAC Defense mechanisms 10 | T 8 CELLULAR PROCESSES AND SIGNALING #ECFCAC Signal transduction mechanisms 11 | M 21 CELLULAR PROCESSES AND SIGNALING #DCFCAC Cell wall/membrane/envelope biogenesis 12 | N 1 CELLULAR PROCESSES AND SIGNALING #CCFCAC Cell motility 13 | Z 0 CELLULAR PROCESSES AND SIGNALING #BCFCAC Cytoskeleton 14 | W 0 CELLULAR PROCESSES AND SIGNALING #ACFCAC Extracellular structures 15 | U 6 CELLULAR PROCESSES AND SIGNALING #9CFCAC Intracellular trafficking, secretion, and vesicular transport 16 | O 21 CELLULAR PROCESSES AND SIGNALING #9CFC9C Posttranslational modification, protein turnover, chaperones 17 | X 0 CELLULAR PROCESSES AND SIGNALING #9CFC9C Mobilome: prophages, transposons 18 | C 20 METABOLISM #BCFCFC Energy production and conversion 19 | G 43 METABOLISM #CCFCFC Carbohydrate transport and metabolism 20 | E 25 METABOLISM #DCFCFC Amino acid transport and metabolism 21 | F 23 METABOLISM #DCECFC Nucleotide transport and metabolism 22 | H 24 METABOLISM #DCDCFC Coenzyme transport and metabolism 23 | I 13 METABOLISM #DCCCFC Lipid transport and metabolism 24 | P 18 METABOLISM #CCCCFC Inorganic ion transport and metabolism 25 | Q 0 METABOLISM #BCCCFC Secondary metabolites biosynthesis, transport and catabolism 26 | R 5 POORLY CHARACTERIZED #E0E0E0 General function prediction only 27 | S 12 POORLY CHARACTERIZED #CCCCCC Function unknown 28 | -------------------------------------------------------------------------------- /example/output/mycoplasma/cog_count_barchart.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 16 | 17 | 18 | 19 | 20 | 21 |
22 | 41 | 42 | -------------------------------------------------------------------------------- /example/output/mycoplasma/cog_count_barchart.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/moshi4/COGclassifier/9a626f609a8f7ea6f82dbc276ba0bcfbd320fa0b/example/output/mycoplasma/cog_count_barchart.png -------------------------------------------------------------------------------- /example/output/mycoplasma/cog_count_piechart.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 16 | 17 | 18 | 19 | 20 | 21 |
22 | 41 | 42 | -------------------------------------------------------------------------------- /example/output/mycoplasma/cog_count_piechart.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/moshi4/COGclassifier/9a626f609a8f7ea6f82dbc276ba0bcfbd320fa0b/example/output/mycoplasma/cog_count_piechart.png -------------------------------------------------------------------------------- /example/output/mycoplasma/cogclassifier.log: -------------------------------------------------------------------------------- 1 | 2025-04-19 21:06:37 | INFO | Run COGclassifier v2.0.0 2 | 2025-04-19 21:06:37 | INFO | $ COGclassifier -i example/input/mycoplasma.faa -o example/output/mycoplasma 3 | 2025-04-19 21:06:37 | INFO | Operating System: linux 4 | 2025-04-19 21:06:37 | INFO | Python Version: v3.9.19 5 | 2025-04-19 21:06:37 | INFO | Parameter: infile=example/input/mycoplasma.faa 6 | 2025-04-19 21:06:37 | INFO | Parameter: outdir=example/output/mycoplasma 7 | 2025-04-19 21:06:37 | INFO | Parameter: download_dir=/home/ys/.cache/cogclassifier_v2 8 | 2025-04-19 21:06:37 | INFO | Parameter: thread_num=11 9 | 2025-04-19 21:06:37 | INFO | Parameter: evalue=0.01 10 | 2025-04-19 21:06:37 | INFO | Download COG & CDD resources in NCBI FTP site 11 | 2025-04-19 21:06:37 | INFO | Download https://ftp.ncbi.nih.gov/pub/mmdb/cdd/cddid.tbl.gz 12 | 2025-04-19 21:06:37 | INFO | => Already file exists /home/ys/.cache/cogclassifier_v2/cddid.tbl.gz 13 | 2025-04-19 21:06:37 | INFO | Download https://ftp.ncbi.nih.gov/pub/mmdb/cdd/little_endian/Cog_LE.tar.gz 14 | 2025-04-19 21:06:37 | INFO | => Already file exists /home/ys/.cache/cogclassifier_v2/Cog_LE.tar.gz 15 | 2025-04-19 21:06:37 | INFO | Load COG Functional Category from /home/ys/Desktop/Project/Bioinformatics/COGclassifier/src/cogclassifier/resources/cog_func_category.tsv 16 | 2025-04-19 21:06:37 | INFO | Load COG Definition from /home/ys/Desktop/Project/Bioinformatics/COGclassifier/src/cogclassifier/resources/cog_definition.tsv 17 | 2025-04-19 21:06:37 | INFO | Load COG <=> CDD ID Conversion Table from /home/ys/.cache/cogclassifier_v2/cddid.tbl.gz 18 | 2025-04-19 21:06:38 | INFO | ********** Start RPS-BLAST(v2.12.0) Search ********** 19 | 2025-04-19 21:06:38 | INFO | $ rpsblast+ -query example/input/mycoplasma.faa -db /home/ys/.cache/cogclassifier_v2/Cog_LE/Cog -outfmt 6 -out /tmp/tmpp6_rz243/rpsblast.tsv -evalue 0.01 -num_threads 11 -mt_mode 1 20 | 2025-04-19 21:06:41 | INFO | ********** Finished RPS-BLAST Search ********** 21 | 2025-04-19 21:06:41 | INFO | 67.20% (461 / 686) sequences are classified into COG functional category 22 | 2025-04-19 21:06:41 | INFO | Write rpsblast search result 23 | 2025-04-19 21:06:41 | INFO | => example/output/mycoplasma/rpsblast.tsv 24 | 2025-04-19 21:06:41 | INFO | Write summary of COG functional category count 25 | 2025-04-19 21:06:41 | INFO | => example/output/mycoplasma/cog_count.tsv 26 | 2025-04-19 21:06:41 | INFO | Write result of COG classification per query 27 | 2025-04-19 21:06:41 | INFO | => example/output/mycoplasma/cog_classify.tsv 28 | 2025-04-19 21:06:41 | INFO | Plot COG count barchart figure 29 | 2025-04-19 21:06:41 | INFO | => example/output/mycoplasma/cog_count_barchart.html 30 | 2025-04-19 21:06:42 | INFO | => example/output/mycoplasma/cog_count_barchart.png 31 | 2025-04-19 21:06:42 | INFO | Plot COG count piechart figure 32 | 2025-04-19 21:06:42 | INFO | => example/output/mycoplasma/cog_count_piechart.html 33 | 2025-04-19 21:06:42 | INFO | => example/output/mycoplasma/cog_count_piechart.png 34 | 2025-04-19 21:06:42 | INFO | Done (elapsed time: 5.53[s]) 35 | -------------------------------------------------------------------------------- /example/output/synechocystis/cog_count.tsv: -------------------------------------------------------------------------------- 1 | LETTER COUNT GROUP COLOR DESCRIPTION 2 | J 212 INFORMATION STORAGE AND PROCESSING #FCCCFC Translation, ribosomal structure and biogenesis 3 | A 7 INFORMATION STORAGE AND PROCESSING #FCDCFC RNA processing and modification 4 | K 81 INFORMATION STORAGE AND PROCESSING #FCDCEC Transcription 5 | L 121 INFORMATION STORAGE AND PROCESSING #FCDCDC Replication, recombination and repair 6 | B 4 INFORMATION STORAGE AND PROCESSING #FCDCCC Chromatin structure and dynamics 7 | D 51 CELLULAR PROCESSES AND SIGNALING #FCFCCC Cell cycle control, cell division, chromosome partitioning 8 | Y 0 CELLULAR PROCESSES AND SIGNALING #FCFCBC Nuclear structure 9 | V 157 CELLULAR PROCESSES AND SIGNALING #FCFCAC Defense mechanisms 10 | T 213 CELLULAR PROCESSES AND SIGNALING #ECFCAC Signal transduction mechanisms 11 | M 218 CELLULAR PROCESSES AND SIGNALING #DCFCAC Cell wall/membrane/envelope biogenesis 12 | N 30 CELLULAR PROCESSES AND SIGNALING #CCFCAC Cell motility 13 | Z 0 CELLULAR PROCESSES AND SIGNALING #BCFCAC Cytoskeleton 14 | W 2 CELLULAR PROCESSES AND SIGNALING #ACFCAC Extracellular structures 15 | U 51 CELLULAR PROCESSES AND SIGNALING #9CFCAC Intracellular trafficking, secretion, and vesicular transport 16 | O 149 CELLULAR PROCESSES AND SIGNALING #9CFC9C Posttranslational modification, protein turnover, chaperones 17 | X 96 CELLULAR PROCESSES AND SIGNALING #9CFC9C Mobilome: prophages, transposons 18 | C 259 METABOLISM #BCFCFC Energy production and conversion 19 | G 133 METABOLISM #CCFCFC Carbohydrate transport and metabolism 20 | E 176 METABOLISM #DCFCFC Amino acid transport and metabolism 21 | F 61 METABOLISM #DCECFC Nucleotide transport and metabolism 22 | H 190 METABOLISM #DCDCFC Coenzyme transport and metabolism 23 | I 73 METABOLISM #DCCCFC Lipid transport and metabolism 24 | P 163 METABOLISM #CCCCFC Inorganic ion transport and metabolism 25 | Q 36 METABOLISM #BCCCFC Secondary metabolites biosynthesis, transport and catabolism 26 | R 282 POORLY CHARACTERIZED #E0E0E0 General function prediction only 27 | S 95 POORLY CHARACTERIZED #CCCCCC Function unknown 28 | -------------------------------------------------------------------------------- /example/output/synechocystis/cog_count_barchart.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 16 | 17 | 18 | 19 | 20 | 21 |
22 | 41 | 42 | -------------------------------------------------------------------------------- /example/output/synechocystis/cog_count_barchart.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/moshi4/COGclassifier/9a626f609a8f7ea6f82dbc276ba0bcfbd320fa0b/example/output/synechocystis/cog_count_barchart.png -------------------------------------------------------------------------------- /example/output/synechocystis/cog_count_piechart.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 16 | 17 | 18 | 19 | 20 | 21 |
22 | 41 | 42 | -------------------------------------------------------------------------------- /example/output/synechocystis/cog_count_piechart.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/moshi4/COGclassifier/9a626f609a8f7ea6f82dbc276ba0bcfbd320fa0b/example/output/synechocystis/cog_count_piechart.png -------------------------------------------------------------------------------- /example/output/synechocystis/cogclassifier.log: -------------------------------------------------------------------------------- 1 | 2025-04-19 21:10:43 | INFO | Run COGclassifier v2.0.0 2 | 2025-04-19 21:10:43 | INFO | $ COGclassifier -i example/input/synechocystis.faa -o example/output/synechocystis 3 | 2025-04-19 21:10:43 | INFO | Operating System: linux 4 | 2025-04-19 21:10:43 | INFO | Python Version: v3.9.19 5 | 2025-04-19 21:10:43 | INFO | Parameter: infile=example/input/synechocystis.faa 6 | 2025-04-19 21:10:43 | INFO | Parameter: outdir=example/output/synechocystis 7 | 2025-04-19 21:10:43 | INFO | Parameter: download_dir=/home/ys/.cache/cogclassifier_v2 8 | 2025-04-19 21:10:43 | INFO | Parameter: thread_num=11 9 | 2025-04-19 21:10:43 | INFO | Parameter: evalue=0.01 10 | 2025-04-19 21:10:43 | INFO | Download COG & CDD resources in NCBI FTP site 11 | 2025-04-19 21:10:43 | INFO | Download https://ftp.ncbi.nih.gov/pub/mmdb/cdd/cddid.tbl.gz 12 | 2025-04-19 21:10:43 | INFO | => Already file exists /home/ys/.cache/cogclassifier_v2/cddid.tbl.gz 13 | 2025-04-19 21:10:43 | INFO | Download https://ftp.ncbi.nih.gov/pub/mmdb/cdd/little_endian/Cog_LE.tar.gz 14 | 2025-04-19 21:10:43 | INFO | => Already file exists /home/ys/.cache/cogclassifier_v2/Cog_LE.tar.gz 15 | 2025-04-19 21:10:43 | INFO | Load COG Functional Category from /home/ys/Desktop/Project/Bioinformatics/COGclassifier/src/cogclassifier/resources/cog_func_category.tsv 16 | 2025-04-19 21:10:43 | INFO | Load COG Definition from /home/ys/Desktop/Project/Bioinformatics/COGclassifier/src/cogclassifier/resources/cog_definition.tsv 17 | 2025-04-19 21:10:43 | INFO | Load COG <=> CDD ID Conversion Table from /home/ys/.cache/cogclassifier_v2/cddid.tbl.gz 18 | 2025-04-19 21:10:44 | INFO | ********** Start RPS-BLAST(v2.12.0) Search ********** 19 | 2025-04-19 21:10:44 | INFO | $ rpsblast+ -query example/input/synechocystis.faa -db /home/ys/.cache/cogclassifier_v2/Cog_LE/Cog -outfmt 6 -out /tmp/tmpyhfw8ofs/rpsblast.tsv -evalue 0.01 -num_threads 11 -mt_mode 1 20 | 2025-04-19 21:11:06 | INFO | ********** Finished RPS-BLAST Search ********** 21 | 2025-04-19 21:11:07 | INFO | 80.25% (2860 / 3564) sequences are classified into COG functional category 22 | 2025-04-19 21:11:07 | INFO | Write rpsblast search result 23 | 2025-04-19 21:11:07 | INFO | => example/output/synechocystis/rpsblast.tsv 24 | 2025-04-19 21:11:07 | INFO | Write summary of COG functional category count 25 | 2025-04-19 21:11:07 | INFO | => example/output/synechocystis/cog_count.tsv 26 | 2025-04-19 21:11:07 | INFO | Write result of COG classification per query 27 | 2025-04-19 21:11:07 | INFO | => example/output/synechocystis/cog_classify.tsv 28 | 2025-04-19 21:11:07 | INFO | Plot COG count barchart figure 29 | 2025-04-19 21:11:07 | INFO | => example/output/synechocystis/cog_count_barchart.html 30 | 2025-04-19 21:11:08 | INFO | => example/output/synechocystis/cog_count_barchart.png 31 | 2025-04-19 21:11:08 | INFO | Plot COG count piechart figure 32 | 2025-04-19 21:11:08 | INFO | => example/output/synechocystis/cog_count_piechart.html 33 | 2025-04-19 21:11:08 | INFO | => example/output/synechocystis/cog_count_piechart.png 34 | 2025-04-19 21:11:08 | INFO | Done (elapsed time: 25.65[s]) 35 | -------------------------------------------------------------------------------- /example/plot/cog_count.tsv: -------------------------------------------------------------------------------- 1 | LETTER COUNT GROUP COLOR DESCRIPTION 2 | J 264 INFORMATION STORAGE AND PROCESSING #FCCCFC Translation, ribosomal structure and biogenesis 3 | A 8 INFORMATION STORAGE AND PROCESSING #FCDCFC RNA processing and modification 4 | K 272 INFORMATION STORAGE AND PROCESSING #FCDCEC Transcription 5 | L 147 INFORMATION STORAGE AND PROCESSING #FCDCDC Replication, recombination and repair 6 | B 8 INFORMATION STORAGE AND PROCESSING #FCDCCC Chromatin structure and dynamics 7 | D 51 CELLULAR PROCESSES AND SIGNALING #FCFCCC Cell cycle control, cell division, chromosome partitioning 8 | Y 0 CELLULAR PROCESSES AND SIGNALING #FCFCBC Nuclear structure 9 | V 95 CELLULAR PROCESSES AND SIGNALING #FCFCAC Defense mechanisms 10 | T 157 CELLULAR PROCESSES AND SIGNALING #ECFCAC Signal transduction mechanisms 11 | M 280 CELLULAR PROCESSES AND SIGNALING #DCFCAC Cell wall/membrane/envelope biogenesis 12 | N 106 CELLULAR PROCESSES AND SIGNALING #CCFCAC Cell motility 13 | Z 1 CELLULAR PROCESSES AND SIGNALING #BCFCAC Cytoskeleton 14 | W 14 CELLULAR PROCESSES AND SIGNALING #ACFCAC Extracellular structures 15 | U 39 CELLULAR PROCESSES AND SIGNALING #9CFCAC Intracellular trafficking, secretion, and vesicular transport 16 | O 149 CELLULAR PROCESSES AND SIGNALING #9CFC9C Posttranslational modification, protein turnover, chaperones 17 | X 69 CELLULAR PROCESSES AND SIGNALING #9CFC9C Mobilome: prophages, transposons 18 | C 280 METABOLISM #BCFCFC Energy production and conversion 19 | G 369 METABOLISM #CCFCFC Carbohydrate transport and metabolism 20 | E 371 METABOLISM #DCFCFC Amino acid transport and metabolism 21 | F 110 METABOLISM #DCECFC Nucleotide transport and metabolism 22 | H 180 METABOLISM #DCDCFC Coenzyme transport and metabolism 23 | I 127 METABOLISM #DCCCFC Lipid transport and metabolism 24 | P 199 METABOLISM #CCCCFC Inorganic ion transport and metabolism 25 | Q 46 METABOLISM #BCCCFC Secondary metabolites biosynthesis, transport and catabolism 26 | R 159 POORLY CHARACTERIZED #E0E0E0 General function prediction only 27 | S 116 POORLY CHARACTERIZED #CCCCCC Function unknown 28 | -------------------------------------------------------------------------------- /example/plot/cog_count_add_no_classify.tsv: -------------------------------------------------------------------------------- 1 | LETTER COUNT GROUP COLOR DESCRIPTION 2 | J 264 INFORMATION STORAGE AND PROCESSING #FCCCFC Translation, ribosomal structure and biogenesis 3 | A 8 INFORMATION STORAGE AND PROCESSING #FCDCFC RNA processing and modification 4 | K 272 INFORMATION STORAGE AND PROCESSING #FCDCEC Transcription 5 | L 147 INFORMATION STORAGE AND PROCESSING #FCDCDC Replication, recombination and repair 6 | B 8 INFORMATION STORAGE AND PROCESSING #FCDCCC Chromatin structure and dynamics 7 | D 51 CELLULAR PROCESSES AND SIGNALING #FCFCCC Cell cycle control, cell division, chromosome partitioning 8 | Y 0 CELLULAR PROCESSES AND SIGNALING #FCFCBC Nuclear structure 9 | V 95 CELLULAR PROCESSES AND SIGNALING #FCFCAC Defense mechanisms 10 | T 157 CELLULAR PROCESSES AND SIGNALING #ECFCAC Signal transduction mechanisms 11 | M 280 CELLULAR PROCESSES AND SIGNALING #DCFCAC Cell wall/membrane/envelope biogenesis 12 | N 106 CELLULAR PROCESSES AND SIGNALING #CCFCAC Cell motility 13 | Z 1 CELLULAR PROCESSES AND SIGNALING #BCFCAC Cytoskeleton 14 | W 14 CELLULAR PROCESSES AND SIGNALING #ACFCAC Extracellular structures 15 | U 39 CELLULAR PROCESSES AND SIGNALING #9CFCAC Intracellular trafficking, secretion, and vesicular transport 16 | O 149 CELLULAR PROCESSES AND SIGNALING #9CFC9C Posttranslational modification, protein turnover, chaperones 17 | X 69 CELLULAR PROCESSES AND SIGNALING #9CFC9C Mobilome: prophages, transposons 18 | C 280 METABOLISM #BCFCFC Energy production and conversion 19 | G 369 METABOLISM #CCFCFC Carbohydrate transport and metabolism 20 | E 371 METABOLISM #DCFCFC Amino acid transport and metabolism 21 | F 110 METABOLISM #DCECFC Nucleotide transport and metabolism 22 | H 180 METABOLISM #DCDCFC Coenzyme transport and metabolism 23 | I 127 METABOLISM #DCCCFC Lipid transport and metabolism 24 | P 199 METABOLISM #CCCCFC Inorganic ion transport and metabolism 25 | Q 46 METABOLISM #BCCCFC Secondary metabolites biosynthesis, transport and catabolism 26 | R 159 POORLY CHARACTERIZED #E0E0E0 General function prediction only 27 | S 116 POORLY CHARACTERIZED #CCCCCC Function unknown 28 | - 523 - #B8B8B8 No classified -------------------------------------------------------------------------------- /example/plot/cog_count_change_color.tsv: -------------------------------------------------------------------------------- 1 | LETTER COUNT GROUP COLOR DESCRIPTION 2 | J 264 INFORMATION STORAGE AND PROCESSING red Translation, ribosomal structure and biogenesis 3 | A 8 INFORMATION STORAGE AND PROCESSING red RNA processing and modification 4 | K 272 INFORMATION STORAGE AND PROCESSING red Transcription 5 | L 147 INFORMATION STORAGE AND PROCESSING red Replication, recombination and repair 6 | B 8 INFORMATION STORAGE AND PROCESSING red structure and dynamics 7 | D 51 CELLULAR PROCESSES AND SIGNALING blue cycle control, cell division, chromosome partitioning 8 | Y 0 CELLULAR PROCESSES AND SIGNALING blue structure 9 | V 95 CELLULAR PROCESSES AND SIGNALING blue mechanisms 10 | T 157 CELLULAR PROCESSES AND SIGNALING blue transduction mechanisms 11 | M 280 CELLULAR PROCESSES AND SIGNALING blue Cell wall/membrane/envelope biogenesis 12 | N 106 CELLULAR PROCESSES AND SIGNALING blue Cell motility 13 | Z 1 CELLULAR PROCESSES AND SIGNALING blue Cytoskeleton 14 | W 14 CELLULAR PROCESSES AND SIGNALING blue Extracellular structures 15 | U 39 CELLULAR PROCESSES AND SIGNALING blue Intracellular trafficking, secretion, and vesicular transport 16 | O 149 CELLULAR PROCESSES AND SIGNALING blue Posttranslational modification, protein turnover, chaperones 17 | X 69 CELLULAR PROCESSES AND SIGNALING blue Mobilome: prophages, transposons 18 | C 280 METABOLISM green Energy production and conversion 19 | G 369 METABOLISM green Carbohydrate transport and metabolism 20 | E 371 METABOLISM green Amino acid transport and metabolism 21 | F 110 METABOLISM green Nucleotide transport and metabolism 22 | H 180 METABOLISM green Coenzyme transport and metabolism 23 | I 127 METABOLISM green Lipid transport and metabolism 24 | P 199 METABOLISM green Inorganic ion transport and metabolism 25 | Q 46 METABOLISM green Secondary metabolites biosynthesis, transport and catabolism 26 | R 159 POORLY CHARACTERIZED grey General function prediction only 27 | S 116 POORLY CHARACTERIZED grey Function unknown 28 | - 523 - black No classified -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "cogclassifier" 3 | dynamic = ["version"] 4 | description = "A tool for classifying prokaryote protein sequences into COG functional category" 5 | authors = [{ name = "moshi4", email = "" }] 6 | license = "MIT" 7 | readme = "README.md" 8 | keywords = [ 9 | "COG", 10 | "bioinformatics", 11 | "genomics", 12 | "functional-annotation", 13 | "functional-analysis", 14 | ] 15 | classifiers = [ 16 | "Intended Audience :: Science/Research", 17 | "Topic :: Scientific/Engineering :: Bio-Informatics", 18 | ] 19 | requires-python = ">=3.9" 20 | dependencies = [ 21 | "requests>=2.27.1", 22 | "pandas>=2.0.0", 23 | "altair>=5.0.0", 24 | "pydantic>=2.11.3", 25 | "typer>=0.15.2", 26 | "vl-convert-python>=1.7.0", 27 | ] 28 | 29 | [project.urls] 30 | repository = "https://github.com/moshi4/COGclassifier/" 31 | 32 | [project.scripts] 33 | COGclassifier = "cogclassifier.scripts.cogclassifier:app" 34 | plot_cog_count_barchart = "cogclassifier.scripts.plot_cog_count_barchart:app" 35 | plot_cog_count_piechart = "cogclassifier.scripts.plot_cog_count_piechart:app" 36 | 37 | [tool.hatch.version] 38 | path = "src/cogclassifier/__init__.py" 39 | 40 | [tool.rye] 41 | managed = true 42 | dev-dependencies = [ 43 | "ruff>=0.11.0", 44 | "pre-commit>=4.2.0", 45 | "pytest>=8.0.0", 46 | "pytest-cov>=6.0.0", 47 | "ipykernel>=6.13.0", 48 | ] 49 | 50 | [tool.pytest.ini_options] 51 | minversion = "6.0" 52 | addopts = "--cov=src --tb=long -vv --cov-report=xml --cov-report=term" 53 | testpaths = ["tests"] 54 | 55 | [tool.ruff] 56 | include = ["src/**.py", "tests/**.py"] 57 | line-length = 88 58 | 59 | # Lint Rules: https://docs.astral.sh/ruff/rules/ 60 | [tool.ruff.lint] 61 | select = [ 62 | "F", # pyflakes 63 | "E", # pycodestyle (Error) 64 | "W", # pycodestyle (Warning) 65 | "I", # isort 66 | "D", # pydocstyle 67 | ] 68 | ignore = [ 69 | "D100", # Missing docstring in public module 70 | "D101", # Missing docstring in public class 71 | "D104", # Missing docstring in public package 72 | "D105", # Missing docstring in magic method 73 | "D205", # 1 blank line required between summary line and description 74 | "D400", # First line should end with a period 75 | "D401", # First line should be in imperative mood 76 | "D403", # First word of the first line should be properly capitalized 77 | "D415", # First line should end with a period, question mark, or exclamation point 78 | ] 79 | 80 | [tool.ruff.lint.pydocstyle] 81 | convention = "numpy" 82 | 83 | [build-system] 84 | requires = ["hatchling==1.26.3"] 85 | build-backend = "hatchling.build" 86 | -------------------------------------------------------------------------------- /requirements-dev.lock: -------------------------------------------------------------------------------- 1 | # generated by rye 2 | # use `rye lock` or `rye sync` to update this lockfile 3 | # 4 | # last locked with the following flags: 5 | # pre: false 6 | # features: [] 7 | # all-features: true 8 | # with-sources: false 9 | # generate-hashes: false 10 | # universal: false 11 | 12 | -e file:. 13 | altair==5.5.0 14 | # via cogclassifier 15 | annotated-types==0.7.0 16 | # via pydantic 17 | asttokens==3.0.0 18 | # via stack-data 19 | attrs==25.3.0 20 | # via jsonschema 21 | # via referencing 22 | certifi==2025.1.31 23 | # via requests 24 | cfgv==3.4.0 25 | # via pre-commit 26 | charset-normalizer==3.4.1 27 | # via requests 28 | click==8.1.8 29 | # via typer 30 | comm==0.2.2 31 | # via ipykernel 32 | coverage==7.8.0 33 | # via pytest-cov 34 | debugpy==1.8.14 35 | # via ipykernel 36 | decorator==5.2.1 37 | # via ipython 38 | distlib==0.3.9 39 | # via virtualenv 40 | exceptiongroup==1.2.2 41 | # via ipython 42 | # via pytest 43 | executing==2.2.0 44 | # via stack-data 45 | filelock==3.18.0 46 | # via virtualenv 47 | identify==2.6.9 48 | # via pre-commit 49 | idna==3.10 50 | # via requests 51 | importlib-metadata==8.6.1 52 | # via jupyter-client 53 | iniconfig==2.1.0 54 | # via pytest 55 | ipykernel==6.29.5 56 | ipython==8.18.1 57 | # via ipykernel 58 | jedi==0.19.2 59 | # via ipython 60 | jinja2==3.1.6 61 | # via altair 62 | jsonschema==4.23.0 63 | # via altair 64 | jsonschema-specifications==2024.10.1 65 | # via jsonschema 66 | jupyter-client==8.6.3 67 | # via ipykernel 68 | jupyter-core==5.7.2 69 | # via ipykernel 70 | # via jupyter-client 71 | markdown-it-py==3.0.0 72 | # via rich 73 | markupsafe==3.0.2 74 | # via jinja2 75 | matplotlib-inline==0.1.7 76 | # via ipykernel 77 | # via ipython 78 | mdurl==0.1.2 79 | # via markdown-it-py 80 | narwhals==1.35.0 81 | # via altair 82 | nest-asyncio==1.6.0 83 | # via ipykernel 84 | nodeenv==1.9.1 85 | # via pre-commit 86 | numpy==2.0.2 87 | # via pandas 88 | packaging==24.2 89 | # via altair 90 | # via ipykernel 91 | # via pytest 92 | pandas==2.2.3 93 | # via cogclassifier 94 | parso==0.8.4 95 | # via jedi 96 | pexpect==4.9.0 97 | # via ipython 98 | platformdirs==4.3.7 99 | # via jupyter-core 100 | # via virtualenv 101 | pluggy==1.5.0 102 | # via pytest 103 | pre-commit==4.2.0 104 | prompt-toolkit==3.0.51 105 | # via ipython 106 | psutil==7.0.0 107 | # via ipykernel 108 | ptyprocess==0.7.0 109 | # via pexpect 110 | pure-eval==0.2.3 111 | # via stack-data 112 | pydantic==2.11.3 113 | # via cogclassifier 114 | pydantic-core==2.33.1 115 | # via pydantic 116 | pygments==2.19.1 117 | # via ipython 118 | # via rich 119 | pytest==8.3.5 120 | # via pytest-cov 121 | pytest-cov==6.1.1 122 | python-dateutil==2.9.0.post0 123 | # via jupyter-client 124 | # via pandas 125 | pytz==2025.2 126 | # via pandas 127 | pyyaml==6.0.2 128 | # via pre-commit 129 | pyzmq==26.4.0 130 | # via ipykernel 131 | # via jupyter-client 132 | referencing==0.36.2 133 | # via jsonschema 134 | # via jsonschema-specifications 135 | requests==2.32.3 136 | # via cogclassifier 137 | rich==14.0.0 138 | # via typer 139 | rpds-py==0.24.0 140 | # via jsonschema 141 | # via referencing 142 | ruff==0.11.6 143 | shellingham==1.5.4 144 | # via typer 145 | six==1.17.0 146 | # via python-dateutil 147 | stack-data==0.6.3 148 | # via ipython 149 | tomli==2.2.1 150 | # via coverage 151 | # via pytest 152 | tornado==6.4.2 153 | # via ipykernel 154 | # via jupyter-client 155 | traitlets==5.14.3 156 | # via comm 157 | # via ipykernel 158 | # via ipython 159 | # via jupyter-client 160 | # via jupyter-core 161 | # via matplotlib-inline 162 | typer==0.15.2 163 | # via cogclassifier 164 | typing-extensions==4.13.2 165 | # via altair 166 | # via ipython 167 | # via pydantic 168 | # via pydantic-core 169 | # via referencing 170 | # via rich 171 | # via typer 172 | # via typing-inspection 173 | typing-inspection==0.4.0 174 | # via pydantic 175 | tzdata==2025.2 176 | # via pandas 177 | urllib3==2.4.0 178 | # via requests 179 | virtualenv==20.30.0 180 | # via pre-commit 181 | vl-convert-python==1.7.0 182 | # via cogclassifier 183 | wcwidth==0.2.13 184 | # via prompt-toolkit 185 | zipp==3.21.0 186 | # via importlib-metadata 187 | -------------------------------------------------------------------------------- /requirements.lock: -------------------------------------------------------------------------------- 1 | # generated by rye 2 | # use `rye lock` or `rye sync` to update this lockfile 3 | # 4 | # last locked with the following flags: 5 | # pre: false 6 | # features: [] 7 | # all-features: true 8 | # with-sources: false 9 | # generate-hashes: false 10 | # universal: false 11 | 12 | -e file:. 13 | altair==5.5.0 14 | # via cogclassifier 15 | annotated-types==0.7.0 16 | # via pydantic 17 | attrs==25.3.0 18 | # via jsonschema 19 | # via referencing 20 | certifi==2025.1.31 21 | # via requests 22 | charset-normalizer==3.4.1 23 | # via requests 24 | click==8.1.8 25 | # via typer 26 | idna==3.10 27 | # via requests 28 | jinja2==3.1.6 29 | # via altair 30 | jsonschema==4.23.0 31 | # via altair 32 | jsonschema-specifications==2024.10.1 33 | # via jsonschema 34 | markdown-it-py==3.0.0 35 | # via rich 36 | markupsafe==3.0.2 37 | # via jinja2 38 | mdurl==0.1.2 39 | # via markdown-it-py 40 | narwhals==1.35.0 41 | # via altair 42 | numpy==2.0.2 43 | # via pandas 44 | packaging==24.2 45 | # via altair 46 | pandas==2.2.3 47 | # via cogclassifier 48 | pydantic==2.11.3 49 | # via cogclassifier 50 | pydantic-core==2.33.1 51 | # via pydantic 52 | pygments==2.19.1 53 | # via rich 54 | python-dateutil==2.9.0.post0 55 | # via pandas 56 | pytz==2025.2 57 | # via pandas 58 | referencing==0.36.2 59 | # via jsonschema 60 | # via jsonschema-specifications 61 | requests==2.32.3 62 | # via cogclassifier 63 | rich==14.0.0 64 | # via typer 65 | rpds-py==0.24.0 66 | # via jsonschema 67 | # via referencing 68 | shellingham==1.5.4 69 | # via typer 70 | six==1.17.0 71 | # via python-dateutil 72 | typer==0.15.2 73 | # via cogclassifier 74 | typing-extensions==4.13.2 75 | # via altair 76 | # via pydantic 77 | # via pydantic-core 78 | # via referencing 79 | # via rich 80 | # via typer 81 | # via typing-inspection 82 | typing-inspection==0.4.0 83 | # via pydantic 84 | tzdata==2025.2 85 | # via pandas 86 | urllib3==2.4.0 87 | # via requests 88 | vl-convert-python==1.7.0 89 | # via cogclassifier 90 | -------------------------------------------------------------------------------- /src/cogclassifier/__init__.py: -------------------------------------------------------------------------------- 1 | import cogclassifier.logger as _logger 2 | from cogclassifier.main import CogClassifier 3 | 4 | __all__ = ["CogClassifier"] 5 | __version__ = "2.0.0" 6 | 7 | _logger.init_null_logger() 8 | -------------------------------------------------------------------------------- /src/cogclassifier/__main__.py: -------------------------------------------------------------------------------- 1 | from cogclassifier.scripts.cogclassifier import app 2 | 3 | app() 4 | -------------------------------------------------------------------------------- /src/cogclassifier/blast.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import csv 4 | import logging 5 | import re 6 | import shlex 7 | import shutil 8 | import subprocess as sp 9 | import tempfile 10 | from functools import cached_property 11 | from pathlib import Path 12 | 13 | from pydantic import BaseModel, ConfigDict 14 | 15 | from cogclassifier import const 16 | 17 | 18 | class RpsBlast: 19 | """RPS-BLAST Run Class""" 20 | 21 | def __init__( 22 | self, 23 | query: str | Path, 24 | db: str | Path, 25 | *, 26 | outfile: str | Path | None = None, 27 | evalue: float = 1e-2, 28 | thread_num: int = 1, 29 | ): 30 | self._query = query 31 | self._db = db 32 | self._outfile = outfile 33 | self._evalue = evalue 34 | self._thread_num = thread_num 35 | 36 | def run(self) -> BlastAlignmentRecord: 37 | """Run RPS-BLAST""" 38 | self.check_installation() 39 | with tempfile.TemporaryDirectory() as tmpdir: 40 | outfile = self._outfile 41 | if outfile is None: 42 | outfile = Path(tmpdir) / "rpsblast.tsv" 43 | cmd = f"{self.get_binary_name()} -query {self._query} -db {self._db} -outfmt 6 -out {outfile} -evalue {self._evalue} -num_threads {self._thread_num} -mt_mode 1" # noqa: E501 44 | version = self.get_version() 45 | logger = logging.getLogger(__name__) 46 | logger.info(f"{'*' * 10} Start RPS-BLAST(v{version}) Search {'*' * 10}") 47 | self._run_cmd(cmd, logger) 48 | logger.info(f"{'*' * 10} Finished RPS-BLAST Search {'*' * 10}") 49 | return BlastAlignmentRecord(outfile) 50 | 51 | @classmethod 52 | def check_installation(cls, raise_error: bool = True) -> bool: 53 | """Check tool installation""" 54 | if shutil.which("rpsblast") is None and shutil.which("rpsblast+") is None: 55 | if raise_error: 56 | raise RuntimeError("rpsblast is not installed!!") 57 | return False 58 | return True 59 | 60 | @classmethod 61 | def get_version(cls) -> str: 62 | """Get tool version""" 63 | try: 64 | cmd = f"{cls.get_binary_name()} -version" 65 | cmd_args = shlex.split(cmd) 66 | cmd_res = sp.run(cmd_args, capture_output=True, text=True) 67 | output = cmd_res.stderr if cmd_res.stdout == "" else cmd_res.stdout 68 | version = re.findall(r"blast (\d+.\d+.\d+)", output, re.MULTILINE)[0] 69 | return version 70 | except Exception: 71 | return const.UNKNOWN_VERSION 72 | 73 | @classmethod 74 | def get_binary_name(cls) -> str: 75 | """Binary name""" 76 | return "rpsblast+" if shutil.which("rpsblast") is None else "rpsblast" 77 | 78 | def _run_cmd( 79 | self, 80 | cmd: str, 81 | logger: logging.Logger, 82 | stdout_file: str | Path | None = None, 83 | ) -> None: 84 | """Run command 85 | 86 | Parameters 87 | ---------- 88 | cmd : str 89 | Command to run 90 | logger : logging.Logger 91 | Logger object 92 | stdout_file : str | Path | None, optional 93 | Write stdout result if file is set 94 | """ 95 | logger.info(f"$ {cmd}") 96 | cmd_args = shlex.split(cmd) 97 | try: 98 | cmd_res = sp.run(cmd_args, capture_output=True, text=True, check=True) 99 | # Write stdout result if stdout_file is set 100 | if stdout_file: 101 | logger.info(f"> Save cmd stdout results to '{stdout_file}'") 102 | with open(stdout_file, "w", encoding="utf-8") as f: 103 | f.write(cmd_res.stdout) 104 | except sp.CalledProcessError as e: 105 | returncode, stdout, stderr = e.returncode, str(e.stdout), str(e.stderr) 106 | logger.error(f"Failed to run command below ({returncode=})") 107 | logger.error(f"$ {cmd}") 108 | stdout_lines = stdout.splitlines() 109 | if len(stdout_lines) > 0: 110 | logger.error("STDOUT:") 111 | for line in stdout_lines: 112 | logger.error(f"> {line}") 113 | stderr_lines = stderr.splitlines() 114 | if len(stderr_lines) > 0: 115 | logger.error("STDERR:") 116 | for line in stderr_lines: 117 | logger.error(f"> {line}") 118 | logger.error("Failed to run 'RPS-BLAST'!!") 119 | raise 120 | except FileNotFoundError: 121 | raise 122 | 123 | 124 | class BlastAlignment(BaseModel): 125 | """Blast Alignment Class""" 126 | 127 | qaccver: str # Query Accession Version 128 | saccver: str # Subject Accession Version 129 | pident: float # Percent Identity 130 | length: int # Alignment Length 131 | mismatch: int 132 | gapopen: int 133 | qstart: int 134 | qend: int 135 | sstart: int 136 | send: int 137 | evalue: float 138 | bitscore: float 139 | 140 | model_config = ConfigDict(frozen=True) 141 | 142 | @property 143 | def as_tsv(self) -> str: 144 | """Return the fields as tsv""" 145 | return "\t".join(map(str, self.model_dump().values())) 146 | 147 | 148 | class BlastAlignmentRecord: 149 | def __init__(self, blast_outfile: str | Path): 150 | """Parse tsv format blast result file 151 | 152 | Parameters 153 | ---------- 154 | blast_outfile : str | Path 155 | TSV format blast result file 156 | 157 | Returns 158 | ------- 159 | blast_results : list[BlastResult] 160 | List of BlastResult 161 | """ 162 | blast_alns: list[BlastAlignment] = [] 163 | with open(blast_outfile, encoding="utf-8") as f: 164 | reader = csv.reader(f, delimiter="\t") 165 | for row in reader: 166 | # Ignore header line 167 | if row[0].startswith("#"): 168 | continue 169 | keys = BlastAlignment.model_fields 170 | blast_alns.append(BlastAlignment(**dict(zip(keys, row)))) # type: ignore 171 | 172 | self._blast_alns = blast_alns 173 | 174 | @property 175 | def alignments(self) -> list[BlastAlignment]: 176 | """Blast alignment results""" 177 | return self._blast_alns 178 | 179 | @cached_property 180 | def top_hit_alignments(self) -> list[BlastAlignment]: 181 | """Top hit blast alignment results""" 182 | top_hits = [] 183 | top_hit_blast_results = [] 184 | for br in self._blast_alns: 185 | if br.qaccver in top_hits: 186 | continue 187 | top_hits.append(br.qaccver) 188 | top_hit_blast_results.append(br) 189 | return top_hit_blast_results 190 | 191 | def __str__(self) -> str: 192 | return "\n".join([aln.as_tsv for aln in self.alignments]) 193 | -------------------------------------------------------------------------------- /src/cogclassifier/cog.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import csv 4 | import gzip 5 | import logging 6 | from functools import cached_property 7 | from pathlib import Path 8 | 9 | import pandas as pd 10 | from pydantic import BaseModel, ConfigDict 11 | 12 | from cogclassifier.blast import BlastAlignmentRecord 13 | 14 | 15 | class CogFuncCategory(BaseModel): 16 | """COG Functional Category Class""" 17 | 18 | letter: str 19 | group: str 20 | color: str 21 | desc: str 22 | 23 | model_config = ConfigDict(frozen=True) 24 | 25 | @property 26 | def as_tsv(self) -> str: 27 | """Return the fields as tsv""" 28 | return "\t".join(map(str, self.model_dump().values())) 29 | 30 | 31 | class CogFuncCategoryRecord: 32 | """COG Functional Category Record Class""" 33 | 34 | def __init__(self, cog_fc_file: str | Path): 35 | """ 36 | Parameters 37 | ---------- 38 | config_file : str | Path 39 | COG functional category file 40 | """ 41 | cfc_list: list[CogFuncCategory] = [] 42 | with open(cog_fc_file, encoding="utf-8") as f: 43 | reader = csv.reader(f, delimiter="\t") 44 | for row in reader: 45 | keys = CogFuncCategory.model_fields 46 | cfc_list.append(CogFuncCategory(**dict(zip(keys, row)))) 47 | 48 | self._letters = [cfc.letter for cfc in cfc_list] 49 | self._cfc_list = cfc_list 50 | self._letter2cfc = {cfc.letter: cfc for cfc in cfc_list} 51 | 52 | def get(self, letter: str) -> CogFuncCategory: 53 | """Get target letter COG info""" 54 | return self[letter] 55 | 56 | def get_all(self) -> list[CogFuncCategory]: 57 | """Get all COG info""" 58 | return self._cfc_list 59 | 60 | def get_letters(self) -> list[str]: 61 | """Get all COG letters""" 62 | return self._letters 63 | 64 | def __str__(self) -> str: 65 | return "\n".join([cfc.as_tsv for cfc in self.get_all()]) 66 | 67 | def __len__(self) -> int: 68 | return len(self.get_all()) 69 | 70 | def __getitem__(self, letter: str) -> CogFuncCategory: 71 | return self._letter2cfc[letter] 72 | 73 | 74 | class CogDefinition(BaseModel): 75 | """COG Definition Class""" 76 | 77 | id: str 78 | letter: str 79 | cog_name: str 80 | gene_name: str 81 | func_pathway: str 82 | pubmed_id_list: list[str] 83 | pdb_id_list: list[str] 84 | 85 | model_config = ConfigDict(frozen=True) 86 | 87 | @property 88 | def one_letter(self) -> str: 89 | """One letter (e.g. letter=`KT` -> one_letter=`K`)""" 90 | return self.letter[0] 91 | 92 | @property 93 | def as_tsv(self) -> str: 94 | """Return the fields as tsv""" 95 | return "\t".join( 96 | ( 97 | self.id, 98 | self.letter, 99 | self.cog_name, 100 | self.gene_name, 101 | self.func_pathway, 102 | ";".join(self.pubmed_id_list), 103 | ";".join(self.pdb_id_list), 104 | ) 105 | ) 106 | 107 | 108 | class CogDefinitionRecord: 109 | """COG Definition Record Class""" 110 | 111 | def __init__(self, cog_def_file: str | Path): 112 | cog_defs: list[CogDefinition] = [] 113 | with open(cog_def_file, encoding="utf-8") as f: 114 | reader = csv.reader(f, delimiter="\t") 115 | for row in reader: 116 | # COG definition file does not have a fixed number of columns per line 117 | cog_id, letter, cog_name, gene_name = row[0:4] 118 | func_pathway = row[4] if len(row) > 4 else "" 119 | pubmed_id_list = row[5].split(";") if len(row) > 5 else [] 120 | pdb_id_list = row[6].split(";") if len(row) > 6 else [] 121 | cog_defs.append( 122 | CogDefinition( 123 | id=cog_id, 124 | letter=letter, 125 | cog_name=cog_name, 126 | gene_name=gene_name, 127 | func_pathway=func_pathway, 128 | pubmed_id_list=pubmed_id_list, 129 | pdb_id_list=pdb_id_list, 130 | ) 131 | ) 132 | 133 | self._id_list = [cd.id for cd in cog_defs] 134 | self._cog_defs = cog_defs 135 | self._id2cog_def = {cd.id: cd for cd in cog_defs} 136 | 137 | def get(self, cog_id: str) -> CogDefinition | None: 138 | """Get target ID COG definition info""" 139 | return self[cog_id] 140 | 141 | def get_all(self) -> list[CogDefinition]: 142 | """Get all COG definition info""" 143 | return self._cog_defs 144 | 145 | def get_id_list(self) -> list[str]: 146 | """Get all COG ID""" 147 | return self._id_list 148 | 149 | def __str__(self) -> str: 150 | return "\n".join([cd.as_tsv for cd in self.get_all()]) 151 | 152 | def __len__(self) -> int: 153 | return len(self.get_all()) 154 | 155 | def __getitem__(self, cog_id: str) -> CogDefinition | None: 156 | return self._id2cog_def.get(cog_id) 157 | 158 | 159 | class CogCddIdTable: 160 | """COG ID & CDD ID table for ID conversion""" 161 | 162 | def __init__(self, cddid_table_file: str | Path): 163 | cdd_id2cog_id, cog_id2cdd_id = dict(), dict() 164 | xopen = gzip.open if Path(cddid_table_file).suffix == ".gz" else open 165 | with xopen(cddid_table_file, mode="rt", encoding="utf-8") as f: 166 | reader = csv.reader(f, delimiter="\t") 167 | for row in reader: 168 | cdd_id, acc_id = row[0], row[1] 169 | if acc_id.startswith("COG"): 170 | cdd_id2cog_id[cdd_id] = acc_id 171 | cog_id2cdd_id[acc_id] = cdd_id 172 | self._cdd_id2cog_id: dict[str, str] = cdd_id2cog_id 173 | self._cog_id2cdd_id: dict[str, str] = cog_id2cdd_id 174 | 175 | def to_cog_id(self, cddid: str) -> str: 176 | """Convert CDD ID to COG ID""" 177 | return self._cdd_id2cog_id[cddid] 178 | 179 | def to_cdd_id(self, cogid: str) -> str: 180 | """Convert COG ID to CDD ID""" 181 | return self._cog_id2cdd_id[cogid] 182 | 183 | 184 | class CogClassifyStats: 185 | """COG Classify Result Statistics Class""" 186 | 187 | def __init__( 188 | self, 189 | query: str | Path, 190 | blast_rec: BlastAlignmentRecord, 191 | cog_fc_rec: CogFuncCategoryRecord, 192 | cog_def_rec: CogDefinitionRecord, 193 | cog_cdd_id_table: CogCddIdTable, 194 | ): 195 | self._query = query 196 | self.blast_rec = blast_rec 197 | self.cog_fc_rec = cog_fc_rec 198 | self.cog_def_rec = cog_def_rec 199 | self.cog_cdd_id_table = cog_cdd_id_table 200 | 201 | @cached_property 202 | def classify_count(self) -> int: 203 | """Number of COG classified sequence""" 204 | return len(self.query_classify_df) 205 | 206 | @cached_property 207 | def query_count(self) -> int: 208 | """Number of query fasta sequence""" 209 | with open(self._query) as f: 210 | return len(list(filter(lambda line: line.startswith(">"), f.readlines()))) 211 | 212 | @cached_property 213 | def classify_ratio(self) -> float: 214 | """Ratio of COG classified sequence""" 215 | return self.classify_count / self.query_count 216 | 217 | @cached_property 218 | def query_classify_df(self) -> pd.DataFrame: 219 | """COG classified query dataframe""" 220 | df_rows = [] 221 | for aln in self.blast_rec.top_hit_alignments: 222 | # Get query & CDD ID from rpsblast hits 223 | query_id, cdd_id = aln.qaccver, aln.saccver.replace("CDD:", "") 224 | # Convert CDD ID to COG ID 225 | cog_id = self.cog_cdd_id_table.to_cog_id(cdd_id) 226 | # Get COG definition by COG ID (Some COG ID not found in definition) 227 | cog_def = self.cog_def_rec[cog_id] 228 | if cog_def is None: 229 | logger = logging.getLogger(__name__) 230 | logger.debug( 231 | f"{cog_id=} is not found in COG definition ({query_id=}, {cdd_id=})" 232 | ) 233 | continue 234 | # Get COG functional category by COG letter 235 | cog_fc = self.cog_fc_rec[cog_def.one_letter] 236 | df_rows.append( 237 | ( 238 | query_id, 239 | cog_id, 240 | cdd_id, 241 | aln.evalue, 242 | aln.pident, 243 | cog_def.gene_name, 244 | cog_def.cog_name, 245 | cog_def.one_letter, 246 | cog_fc.desc, 247 | ), 248 | ) 249 | return pd.DataFrame( 250 | df_rows, 251 | columns=[ 252 | "QUERY_ID", 253 | "COG_ID", 254 | "CDD_ID", 255 | "EVALUE", 256 | "IDENTITY", 257 | "GENE_NAME", 258 | "COG_NAME", 259 | "COG_LETTER", 260 | "COG_DESCRIPTION", 261 | ], 262 | ) 263 | 264 | @cached_property 265 | def count_summary_df(self) -> pd.DataFrame: 266 | """Summary COG classification count result dataframe""" 267 | df_rows = [] 268 | for cog_fc in self.cog_fc_rec.get_all(): 269 | count = ( 270 | self.query_classify_df["COG_LETTER"] 271 | .value_counts() 272 | .get(cog_fc.letter, 0) 273 | ) 274 | df_rows.append( 275 | ( 276 | cog_fc.letter, 277 | count, 278 | cog_fc.group, 279 | cog_fc.color, 280 | cog_fc.desc, 281 | ) 282 | ) 283 | return pd.DataFrame( 284 | df_rows, 285 | columns=[ 286 | "LETTER", 287 | "COUNT", 288 | "GROUP", 289 | "COLOR", 290 | "DESCRIPTION", 291 | ], 292 | ) 293 | -------------------------------------------------------------------------------- /src/cogclassifier/const.py: -------------------------------------------------------------------------------- 1 | import os 2 | from pathlib import Path 3 | 4 | COG_FUN_FTP = "https://ftp.ncbi.nih.gov/pub/COG/COG2024/data/fun-24.tab" 5 | COG_DEF_FTP = "https://ftp.ncbi.nih.gov/pub/COG/COG2024/data/cog-24.def.tab" 6 | 7 | CDDID_TBL_FTP = "https://ftp.ncbi.nih.gov/pub/mmdb/cdd/cddid.tbl.gz" 8 | COG_LE_FTP = "https://ftp.ncbi.nih.gov/pub/mmdb/cdd/little_endian/Cog_LE.tar.gz" 9 | 10 | RESOURCES_DIR = Path(__file__).parent / "resources" 11 | COG_FUNC_CATEGORY_FILE = RESOURCES_DIR / "cog_func_category.tsv" 12 | COG_DEFINITION_FILE = RESOURCES_DIR / "cog_definition.tsv" 13 | 14 | _cpu_count = os.cpu_count() 15 | MIN_CPU = 1 16 | MAX_CPU = 1 if _cpu_count is None else _cpu_count 17 | DEFAULT_CPU = 1 if MAX_CPU == 1 else MAX_CPU - 1 18 | 19 | CACHE_DIR = Path.home() / ".cache" / "cogclassifier_v2" 20 | 21 | UNKNOWN_VERSION = "?.?.?" 22 | -------------------------------------------------------------------------------- /src/cogclassifier/logger.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import logging 4 | import sys 5 | from pathlib import Path 6 | 7 | 8 | def init_null_logger(): 9 | """Initialize package root logger with NullHandler 10 | 11 | Configuring package root null logger for a library 12 | https://docs.python.org/3/howto/logging.html#configuring-logging-for-a-library 13 | """ 14 | pkg_root_name = __name__.split(".")[0] 15 | logger = logging.getLogger(pkg_root_name) 16 | logger.addHandler(logging.NullHandler()) 17 | 18 | 19 | def init_logger( 20 | *, 21 | quiet: bool = False, 22 | verbose: bool = False, 23 | log_file: str | Path | None = None, 24 | ): 25 | """Initialize package root logger with StreamHandler(& FileHandler) 26 | 27 | Configuring package root default logger for a CLI tool 28 | 29 | Parameters 30 | ---------- 31 | quiet : bool, optional 32 | If True, no print info log on screen 33 | verbose: bool, optional 34 | If True & quiet=False, print debug log on screen 35 | log_file : str | Path | None, optional 36 | Log file 37 | """ 38 | pkg_root_name = __name__.split(".")[0] 39 | logger = logging.getLogger(pkg_root_name) 40 | 41 | # Remove existing handler to avoid duplicate logging 42 | for handler in logger.handlers: 43 | logger.removeHandler(handler) 44 | handler.close() 45 | 46 | logger.setLevel(logging.DEBUG) 47 | log_formatter = logging.Formatter( 48 | fmt="$asctime | $levelname | $message", 49 | datefmt="%Y-%m-%d %H:%M:%S", 50 | style="$", 51 | ) 52 | # Add stream handler for terminal stderr 53 | stream_handler = logging.StreamHandler(sys.stderr) 54 | stream_handler.setFormatter(log_formatter) 55 | if quiet: 56 | log_level = logging.WARNING 57 | else: 58 | log_level = logging.DEBUG if verbose else logging.INFO 59 | stream_handler.setLevel(log_level) 60 | logger.addHandler(stream_handler) 61 | 62 | if log_file: 63 | # Add file handler for log file 64 | file_handler = logging.FileHandler(log_file, mode="w", encoding="utf-8") 65 | file_handler.setFormatter(log_formatter) 66 | log_level = logging.DEBUG if verbose else logging.INFO 67 | file_handler.setLevel(log_level) 68 | logger.addHandler(file_handler) 69 | -------------------------------------------------------------------------------- /src/cogclassifier/main.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import logging 4 | import shutil 5 | from pathlib import Path 6 | 7 | from cogclassifier import const, utils 8 | from cogclassifier.blast import RpsBlast 9 | from cogclassifier.cog import ( 10 | CogCddIdTable, 11 | CogClassifyStats, 12 | CogDefinitionRecord, 13 | CogFuncCategoryRecord, 14 | ) 15 | 16 | 17 | class CogClassifier: 18 | """COG Classification Class""" 19 | 20 | def __init__( 21 | self, 22 | query: str | Path, 23 | *, 24 | download_dir: str | Path | None = None, 25 | thread_num: int | None = None, 26 | evalue: float = 1e-2, 27 | ): 28 | download_dir = const.CACHE_DIR if download_dir is None else download_dir 29 | thread_num = const.DEFAULT_CPU if thread_num is None else thread_num 30 | 31 | self._query = Path(query) 32 | self._download_dir = Path(download_dir) 33 | self._thread_num = thread_num 34 | self._evalue = evalue 35 | 36 | def run(self) -> CogClassifyStats: 37 | """Run COGclassifier""" 38 | logger = logging.getLogger(__name__) 39 | 40 | # Download NCBI COG & CDD resources 41 | logger.info("Download COG & CDD resources in NCBI FTP site") 42 | cddid_tbl_gzfile = utils.ftp_download(const.CDDID_TBL_FTP, self._download_dir) 43 | 44 | cog_le_targz_file = utils.ftp_download(const.COG_LE_FTP, self._download_dir) 45 | cog_le_dir = self._download_dir / "Cog_LE" 46 | if not cog_le_dir.exists(): 47 | logger.info(f"Unpack {cog_le_targz_file} => {cog_le_dir}") 48 | shutil.unpack_archive(cog_le_targz_file, cog_le_dir) 49 | 50 | # Load NCBI COG & CDD resources 51 | logger.info(f"Load COG Functional Category from {const.COG_FUNC_CATEGORY_FILE}") 52 | cog_fc_rec = CogFuncCategoryRecord(const.COG_FUNC_CATEGORY_FILE) 53 | logger.info(f"Load COG Definition from {const.COG_DEFINITION_FILE}") 54 | cog_def_rec = CogDefinitionRecord(const.COG_DEFINITION_FILE) 55 | logger.info(f"Load COG <=> CDD ID Conversion Table from {cddid_tbl_gzfile}") 56 | cog_cdd_id_table = CogCddIdTable(cddid_tbl_gzfile) 57 | 58 | # Run RPS-BLAST 59 | rpsblast_db = cog_le_dir / "Cog" 60 | blast_rec = RpsBlast( 61 | self._query, 62 | rpsblast_db, 63 | outfile=None, 64 | evalue=self._evalue, 65 | thread_num=self._thread_num, 66 | ).run() 67 | 68 | stats = CogClassifyStats( 69 | self._query, 70 | blast_rec, 71 | cog_fc_rec, 72 | cog_def_rec, 73 | cog_cdd_id_table, 74 | ) 75 | logger.info( 76 | f"{stats.classify_ratio * 100:.2f}% ({stats.classify_count} / {stats.query_count}) sequences are classified into COG functional category" # noqa: E501 77 | ) 78 | 79 | return stats 80 | -------------------------------------------------------------------------------- /src/cogclassifier/plot.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from pathlib import Path 4 | 5 | import altair as alt 6 | import pandas as pd 7 | 8 | 9 | def plot_cog_count_barchart( 10 | data: str | Path | pd.DataFrame, 11 | outfile: str | Path | None = None, 12 | *, 13 | fig_width: int = 440, 14 | fig_height: int = 340, 15 | bar_width: int = 12, 16 | y_limit: float | None = None, 17 | percent_style: bool = False, 18 | sort: bool = False, 19 | dpi: int = 100, 20 | ) -> alt.Chart: 21 | """Plot altair barchart from COG count dataframe 22 | 23 | Parameters 24 | ---------- 25 | data : str | Path | pd.DataFrame 26 | COG count file or dataframe 27 | outfile : str | Path | None, optional 28 | Barchart output file (`*.png`|`*.svg`|`*.html`) 29 | fig_width : int, optional 30 | Figure pixel width 31 | fig_height : int, optional 32 | Figure pixel height 33 | bar_width : int, optional 34 | Figure pixel bar width 35 | y_limit : float | None, optional 36 | Y-axis max limit value 37 | percent_style : bool, optional 38 | Plot y-axis as percent(%) instead of count number 39 | sort : bool, optional 40 | Enable descending sort by count 41 | dpi : int, option 42 | Figure DPI 43 | 44 | Returns 45 | ------- 46 | barchart : alt.Chart 47 | Altair barchart 48 | """ 49 | if isinstance(data, pd.DataFrame): 50 | df = data 51 | else: 52 | df = pd.read_csv(data, sep="\t", encoding="utf-8") 53 | 54 | # Set 'percent style' or 'count style' 55 | if percent_style: 56 | yfield, ytitle, yformat = "RATIO", "Percent of Sequences", ".0%" 57 | y_limit = None if y_limit is None else y_limit / 100 58 | else: 59 | yfield, ytitle, yformat = "COUNT", "Number of Sequences", "c" 60 | 61 | # Set sort style (descending by count) 62 | df = df.sort_values("COUNT", ascending=False) if sort else df 63 | 64 | # Calculate count rate (%) 65 | df["RATIO"] = df["COUNT"] / df["COUNT"].sum() 66 | df["RATIO(%)"] = df["RATIO"].map("{:.2%}".format) 67 | 68 | # If no y_limit defined by user, set appropriate value 69 | ymax = df[yfield].max() 70 | y_limit = ymax + (ymax * 0.05) if y_limit is None else y_limit 71 | 72 | df["L_DESCRIPTION"] = df["LETTER"] + " : " + df["DESCRIPTION"] 73 | barchart = ( 74 | alt.Chart(df, title="COG Functional Classification") 75 | .mark_bar(stroke="black", strokeWidth=0.2) 76 | .encode( 77 | x=alt.X("LETTER", title="Functional Category", sort=None), 78 | y=alt.Y( 79 | yfield, 80 | title=ytitle, 81 | axis=alt.Axis(format=yformat), 82 | scale=alt.Scale(domainMax=y_limit, clamp=True), 83 | ), 84 | tooltip=["DESCRIPTION", "LETTER", "COUNT", "RATIO(%)"], 85 | color=alt.Color( 86 | "L_DESCRIPTION", 87 | title="", 88 | scale=alt.Scale( 89 | domain=df["L_DESCRIPTION"].to_list(), 90 | range=df["COLOR"].to_list(), 91 | ), 92 | ), 93 | stroke=alt.condition( 94 | alt.datum[yfield] > 0, alt.value("black"), alt.value("transparent") 95 | ), 96 | ) 97 | .properties(width=fig_width, height=fig_height) 98 | .configure_title(fontSize=15) 99 | .configure_legend(labelLimit=0) 100 | .configure_axisX(labelAngle=0, tickSize=0) 101 | .configure_mark( 102 | stroke="black", width=bar_width, strokeWidth=0.15, strokeOpacity=1 103 | ) 104 | ) 105 | if outfile is not None: 106 | if Path(outfile).suffix == ".png": 107 | barchart.save(outfile, ppi=dpi) 108 | else: 109 | barchart.save(outfile) 110 | 111 | return barchart 112 | 113 | 114 | def plot_cog_count_piechart( 115 | data: str | Path | pd.DataFrame, 116 | outfile: str | Path | None = None, 117 | *, 118 | fig_width: int = 380, 119 | fig_height: int = 380, 120 | show_letter: bool = False, 121 | sort: bool = False, 122 | dpi: int = 100, 123 | ) -> alt.LayerChart: 124 | """Plot altair piechart from COG count dataframe 125 | 126 | Parameters 127 | ---------- 128 | df : str | Path | pd.DataFrame 129 | COG count file or dataframe 130 | outfile : str | Path | None, optional 131 | Piechart output file (`*.png`|`*.svg`|`*.html`) 132 | fig_width : int, optional 133 | Figure pixel width 134 | fig_height : int, optional 135 | Figure pixel height 136 | show_letter : bool, optional 137 | Show letter on piechart 138 | sort : bool, optional 139 | Enable count descending sort 140 | dpi : int, optional 141 | Figure DPI 142 | 143 | Returns 144 | ------- 145 | piechart : alt.LayerChart 146 | Altair piechart 147 | """ 148 | if isinstance(data, pd.DataFrame): 149 | df = data 150 | else: 151 | df = pd.read_csv(data, sep="\t", encoding="utf-8") 152 | 153 | # Remove 0 Count (no assigned category) 154 | df = df[df["COUNT"] != 0] 155 | 156 | # Set sort style ("descending by count" or "ascending by index") 157 | if sort: 158 | sort_field, sort_order = "COUNT", "descending" 159 | df = df.sort_values("COUNT", ascending=False) 160 | else: 161 | df = df.reset_index() 162 | sort_field, sort_order = "index", "ascending" 163 | 164 | df["L_DESCRIPTION"] = df["LETTER"] + " : " + df["DESCRIPTION"] 165 | 166 | df["RATIO"] = df["COUNT"] / df["COUNT"].sum() * 100 167 | visible_letters = [] 168 | if show_letter: 169 | # Only visible 'LETTER' more than 1.0% ratio 170 | for ratio, letter in zip(df["RATIO"], df["LETTER"]): 171 | visible_letter = letter if ratio >= 1.0 else "" 172 | visible_letters.append(visible_letter) 173 | else: 174 | visible_letters = [""] * len(df) 175 | df["VISIBLE_LETTER"] = visible_letters 176 | 177 | # Format ratio to percentage (e.g. 10.293... -> "10.29%"") 178 | df["RATIO(%)"] = [f"{r:.2f}%" for r in df["RATIO"]] 179 | 180 | base = alt.Chart( 181 | df, 182 | title="COG Functional Classification", 183 | ).encode( 184 | theta=alt.Theta("COUNT", stack=True), 185 | tooltip=["DESCRIPTION", "LETTER", "COUNT", "RATIO(%)"], 186 | order=alt.Order(sort_field, sort=sort_order), 187 | color=alt.Color( 188 | "L_DESCRIPTION", 189 | title="", 190 | scale=alt.Scale( 191 | domain=df["L_DESCRIPTION"].to_list(), 192 | range=df["COLOR"].to_list(), 193 | ), 194 | ), 195 | ) 196 | 197 | outer_radius = int(min(fig_width, fig_height) / 2) 198 | piechart = base.mark_arc(outerRadius=outer_radius) 199 | text = base.mark_text( 200 | radius=outer_radius - 15, 201 | size=10, 202 | stroke="black", 203 | strokeWidth=1.0, 204 | strokeOpacity=1.0, 205 | ).encode(text="VISIBLE_LETTER") 206 | 207 | piechart_with_text = ( 208 | alt.layer(piechart + text) 209 | .properties(width=fig_width, height=fig_height) 210 | .configure_title(fontSize=15, offset=10) 211 | .configure_legend(labelLimit=0) 212 | .configure_view(strokeWidth=0) 213 | .configure_mark(stroke="white", strokeWidth=1.0, strokeOpacity=1.0) 214 | ) 215 | if outfile is not None: 216 | if Path(outfile).suffix == ".png": 217 | piechart_with_text.save(outfile, ppi=dpi) 218 | else: 219 | piechart_with_text.save(outfile) 220 | 221 | return piechart_with_text 222 | -------------------------------------------------------------------------------- /src/cogclassifier/resources/cog_func_category.tsv: -------------------------------------------------------------------------------- 1 | J INFORMATION STORAGE AND PROCESSING #FCCCFC Translation, ribosomal structure and biogenesis 2 | A INFORMATION STORAGE AND PROCESSING #FCDCFC RNA processing and modification 3 | K INFORMATION STORAGE AND PROCESSING #FCDCEC Transcription 4 | L INFORMATION STORAGE AND PROCESSING #FCDCDC Replication, recombination and repair 5 | B INFORMATION STORAGE AND PROCESSING #FCDCCC Chromatin structure and dynamics 6 | D CELLULAR PROCESSES AND SIGNALING #FCFCCC Cell cycle control, cell division, chromosome partitioning 7 | Y CELLULAR PROCESSES AND SIGNALING #FCFCBC Nuclear structure 8 | V CELLULAR PROCESSES AND SIGNALING #FCFCAC Defense mechanisms 9 | T CELLULAR PROCESSES AND SIGNALING #ECFCAC Signal transduction mechanisms 10 | M CELLULAR PROCESSES AND SIGNALING #DCFCAC Cell wall/membrane/envelope biogenesis 11 | N CELLULAR PROCESSES AND SIGNALING #CCFCAC Cell motility 12 | Z CELLULAR PROCESSES AND SIGNALING #BCFCAC Cytoskeleton 13 | W CELLULAR PROCESSES AND SIGNALING #ACFCAC Extracellular structures 14 | U CELLULAR PROCESSES AND SIGNALING #9CFCAC Intracellular trafficking, secretion, and vesicular transport 15 | O CELLULAR PROCESSES AND SIGNALING #9CFC9C Posttranslational modification, protein turnover, chaperones 16 | X CELLULAR PROCESSES AND SIGNALING #9CFC9C Mobilome: prophages, transposons 17 | C METABOLISM #BCFCFC Energy production and conversion 18 | G METABOLISM #CCFCFC Carbohydrate transport and metabolism 19 | E METABOLISM #DCFCFC Amino acid transport and metabolism 20 | F METABOLISM #DCECFC Nucleotide transport and metabolism 21 | H METABOLISM #DCDCFC Coenzyme transport and metabolism 22 | I METABOLISM #DCCCFC Lipid transport and metabolism 23 | P METABOLISM #CCCCFC Inorganic ion transport and metabolism 24 | Q METABOLISM #BCCCFC Secondary metabolites biosynthesis, transport and catabolism 25 | R POORLY CHARACTERIZED #E0E0E0 General function prediction only 26 | S POORLY CHARACTERIZED #CCCCCC Function unknown -------------------------------------------------------------------------------- /src/cogclassifier/scripts/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/moshi4/COGclassifier/9a626f609a8f7ea6f82dbc276ba0bcfbd320fa0b/src/cogclassifier/scripts/__init__.py -------------------------------------------------------------------------------- /src/cogclassifier/scripts/cogclassifier.py: -------------------------------------------------------------------------------- 1 | # from __future__ import annotations 2 | 3 | import logging 4 | import os 5 | import platform 6 | import sys 7 | from functools import partial 8 | from pathlib import Path 9 | from typing import Annotated 10 | 11 | import typer 12 | from typer import Option, Typer 13 | 14 | from cogclassifier import CogClassifier, __version__, const 15 | from cogclassifier.logger import init_logger 16 | from cogclassifier.plot import ( 17 | plot_cog_count_barchart, 18 | plot_cog_count_piechart, 19 | ) 20 | from cogclassifier.utils import exit_handler, logging_timeit 21 | 22 | Option = partial(Option, metavar="") 23 | 24 | app = Typer(add_completion=False) 25 | 26 | 27 | def version_callback(v: bool): 28 | """Callback function for print version""" 29 | if v: 30 | print(f"v{__version__}") 31 | raise typer.Exit() 32 | 33 | 34 | @app.command( 35 | no_args_is_help=True, 36 | epilog=None, 37 | context_settings=dict(help_option_names=["-h", "--help"]), 38 | ) 39 | @logging_timeit 40 | @exit_handler 41 | def cli( 42 | infile: Annotated[ 43 | Path, 44 | Option( 45 | "-i", 46 | "--infile", 47 | help="Input query protein fasta file", 48 | show_default=False, 49 | ), 50 | ], 51 | outdir: Annotated[ 52 | Path, 53 | Option( 54 | "-o", 55 | "--outdir", 56 | help="Output directory", 57 | show_default=False, 58 | ), 59 | ], 60 | download_dir: Annotated[ 61 | Path, 62 | Option("-d", "--download_dir", help="Download COG & CDD resources directory"), 63 | ] = const.CACHE_DIR, 64 | thread_num: Annotated[ 65 | int, 66 | Option("-t", "--thread_num", help="RPS-BLAST num_thread parameter"), 67 | ] = const.DEFAULT_CPU, 68 | evalue: Annotated[ 69 | float, 70 | Option("-e", "--evalue", help="RPS-BLAST e-value parameter"), 71 | ] = 1e-2, 72 | quiet: Annotated[ 73 | bool, 74 | Option("-q", "--quiet", help="No print log on screen"), 75 | ] = False, 76 | debug: Annotated[ 77 | bool, 78 | Option("--debug", help="Print debug log", hidden=True), 79 | ] = False, 80 | _: Annotated[ 81 | bool, 82 | Option( 83 | "-v", 84 | "--version", 85 | help="Print version information", 86 | callback=version_callback, 87 | is_eager=True, 88 | ), 89 | ] = False, 90 | ) -> None: 91 | """A tool for classifying prokaryote protein sequences into COG functional category""" # noqa: E501 92 | args = locals() 93 | os.makedirs(outdir, exist_ok=True) 94 | 95 | # Initialize logger 96 | log_file = outdir / "cogclassifier.log" 97 | init_logger(quiet=quiet, verbose=debug, log_file=log_file) 98 | logger = logging.getLogger(__name__) 99 | 100 | # Run COGclassifier 101 | logger.info(f"Run COGclassifier v{__version__}") 102 | logger.info(f"$ {Path(sys.argv[0]).name} {' '.join(sys.argv[1:])}") 103 | logger.info(f"Operating System: {sys.platform}") 104 | logger.info(f"Python Version: v{platform.python_version()}") 105 | for name, value in args.items(): 106 | if name not in ("quiet", "debug", "_"): 107 | logger.info(f"Parameter: {name}={value}") 108 | cog_stats = CogClassifier( 109 | infile, 110 | download_dir=download_dir, 111 | thread_num=thread_num, 112 | evalue=evalue, 113 | ).run() 114 | 115 | # Write RPS-BLAST result 116 | rpsblast_file = outdir / "rpsblast.tsv" 117 | with open(rpsblast_file, "w") as f: 118 | f.write(str(cog_stats.blast_rec)) 119 | logger.info("Write rpsblast search result") 120 | logger.info(f"=> {rpsblast_file}") 121 | 122 | # Write COG count summary 123 | cog_count_file = outdir / "cog_count.tsv" 124 | cog_stats.count_summary_df.to_csv(cog_count_file, sep="\t", index=False) 125 | logger.info("Write summary of COG functional category count") 126 | logger.info(f"=> {cog_count_file}") 127 | # Write COG classification result 128 | cog_classify_file = outdir / "cog_classify.tsv" 129 | cog_stats.query_classify_df.to_csv(cog_classify_file, sep="\t", index=False) 130 | logger.info("Write result of COG classification per query") 131 | logger.info(f"=> {cog_classify_file}") 132 | 133 | # Plot barchart 134 | barchart_html_file = outdir / "cog_count_barchart.html" 135 | barchart_png_file = barchart_html_file.with_suffix(".png") 136 | logger.info("Plot COG count barchart figure") 137 | plot_cog_count_barchart(cog_stats.count_summary_df, barchart_html_file) 138 | logger.info(f"=> {barchart_html_file}") 139 | plot_cog_count_barchart(cog_stats.count_summary_df, barchart_png_file) 140 | logger.info(f"=> {barchart_png_file}") 141 | # Plot piechart 142 | piechart_html_file = outdir / "cog_count_piechart.html" 143 | piechart_png_file = piechart_html_file.with_suffix(".png") 144 | props = dict(show_letter=True, sort=True) 145 | logger.info("Plot COG count piechart figure") 146 | plot_cog_count_piechart(cog_stats.count_summary_df, piechart_html_file, **props) # type: ignore 147 | logger.info(f"=> {piechart_html_file}") 148 | plot_cog_count_piechart(cog_stats.count_summary_df, piechart_png_file, **props) # type: ignore 149 | logger.info(f"=> {piechart_png_file}") 150 | 151 | 152 | if __name__ == "__main__": 153 | app() 154 | -------------------------------------------------------------------------------- /src/cogclassifier/scripts/plot_cog_count_barchart.py: -------------------------------------------------------------------------------- 1 | from functools import partial 2 | from pathlib import Path 3 | from typing import Annotated, Optional 4 | 5 | from typer import Option, Typer 6 | 7 | from cogclassifier.plot import plot_cog_count_barchart 8 | 9 | Option = partial(Option, metavar="") 10 | 11 | app = Typer(add_completion=False) 12 | 13 | 14 | @app.command( 15 | no_args_is_help=True, 16 | epilog=None, 17 | context_settings=dict(help_option_names=["-h", "--help"]), 18 | ) 19 | def cli( 20 | infile: Annotated[ 21 | Path, 22 | Option( 23 | "-i", 24 | "--infile", 25 | help="Input COG count result file ('cog_count.tsv')", 26 | show_default=False, 27 | ), 28 | ], 29 | outfile: Annotated[ 30 | Path, 31 | Option( 32 | "-o", 33 | "--outfile", 34 | help="Output barchart figure file (*.png|*.svg|*.html)", 35 | show_default=False, 36 | ), 37 | ], 38 | width: Annotated[ 39 | int, 40 | Option("--width", help="Figure pixel width"), 41 | ] = 440, 42 | height: Annotated[ 43 | int, 44 | Option("--height", help="Figure pixel height"), 45 | ] = 340, 46 | bar_width: Annotated[ 47 | int, 48 | Option("--bar_width", help="Figure bar width"), 49 | ] = 15, 50 | y_limit: Annotated[ 51 | Optional[int], 52 | Option("--y_limit", help="Y-axis max limit value", show_default=False), 53 | ] = None, 54 | percent_style: Annotated[ 55 | bool, 56 | Option("--percent_style", help="Plot percent style instead of number count"), 57 | ] = False, 58 | sort: Annotated[ 59 | bool, 60 | Option("--sort", help="Enable descending sort by number count"), 61 | ] = False, 62 | dpi: Annotated[ 63 | int, 64 | Option("--dpi", help="Figure DPI"), 65 | ] = 100, 66 | ) -> None: 67 | """Plot COGclassifier count barchart figure""" 68 | plot_cog_count_barchart( 69 | infile, 70 | outfile, 71 | fig_width=width, 72 | fig_height=height, 73 | bar_width=bar_width, 74 | y_limit=y_limit, 75 | percent_style=percent_style, 76 | sort=sort, 77 | dpi=dpi, 78 | ) 79 | 80 | 81 | if __name__ == "__main__": 82 | app() 83 | -------------------------------------------------------------------------------- /src/cogclassifier/scripts/plot_cog_count_piechart.py: -------------------------------------------------------------------------------- 1 | from functools import partial 2 | from pathlib import Path 3 | from typing import Annotated 4 | 5 | from typer import Option, Typer 6 | 7 | from cogclassifier.plot import plot_cog_count_piechart 8 | 9 | Option = partial(Option, metavar="") 10 | 11 | app = Typer(add_completion=False) 12 | 13 | 14 | @app.command( 15 | no_args_is_help=True, 16 | epilog=None, 17 | context_settings=dict(help_option_names=["-h", "--help"]), 18 | ) 19 | def cli( 20 | infile: Annotated[ 21 | Path, 22 | Option( 23 | "-i", 24 | "--infile", 25 | help="Input COG count result file ('cog_count.tsv')", 26 | show_default=False, 27 | ), 28 | ], 29 | outfile: Annotated[ 30 | Path, 31 | Option( 32 | "-o", 33 | "--outfile", 34 | help="Output piechart figure file (*.png|*.svg|*.html)", 35 | show_default=False, 36 | ), 37 | ], 38 | width: Annotated[ 39 | int, 40 | Option("--width", help="Figure pixel width"), 41 | ] = 380, 42 | height: Annotated[ 43 | int, 44 | Option("--height", help="Figure pixel height"), 45 | ] = 380, 46 | show_letter: Annotated[ 47 | bool, 48 | Option("--show_letter", help="Show functional category lettter on piechart"), 49 | ] = False, 50 | sort: Annotated[ 51 | bool, 52 | Option("--sort", help="Enable descending sort by number count"), 53 | ] = False, 54 | dpi: Annotated[ 55 | int, 56 | Option("--dpi", help="Figure DPI"), 57 | ] = 100, 58 | ) -> None: 59 | """Plot COGclassifier count piechart figure""" 60 | plot_cog_count_piechart( 61 | infile, 62 | outfile, 63 | fig_width=width, 64 | fig_height=height, 65 | show_letter=show_letter, 66 | sort=sort, 67 | dpi=dpi, 68 | ) 69 | 70 | 71 | if __name__ == "__main__": 72 | app() 73 | -------------------------------------------------------------------------------- /src/cogclassifier/utils.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import logging 4 | import os 5 | import signal 6 | import sys 7 | import time 8 | from functools import partial, wraps 9 | from pathlib import Path 10 | from typing import Callable 11 | 12 | import requests 13 | 14 | 15 | def ftp_download( 16 | url: str, 17 | outdir: str | Path, 18 | overwrite: bool = False, 19 | ) -> Path: 20 | """Download file from FTP site 21 | 22 | Parameters 23 | ---------- 24 | url : str 25 | FTP site url for download 26 | outdir : str | Path 27 | Output directory 28 | overwrite : bool, optional 29 | Overwrite or not 30 | 31 | Returns 32 | ------- 33 | download_file : Path 34 | Download file path 35 | """ 36 | os.makedirs(outdir, exist_ok=True) 37 | download_file = Path(outdir) / Path(url).name 38 | logger = logging.getLogger(__name__) 39 | logger.info(f"Download {url}") 40 | 41 | if download_file.exists() and not overwrite: 42 | logger.info(f"=> Already file exists {download_file}") 43 | return download_file 44 | try: 45 | res = requests.get(url, stream=True) 46 | with open(download_file, "wb") as f: 47 | f.write(res.content) 48 | logger.info(f"=> Successfully downloaded {download_file}") 49 | return download_file 50 | except requests.exceptions.ConnectionError: 51 | logger.exception("Failed to download file. Please check network connection.") 52 | raise 53 | 54 | 55 | def logging_timeit( 56 | func: Callable | None = None, 57 | /, 58 | *, 59 | show_func_name: bool = False, 60 | debug: bool = False, 61 | ): 62 | """Elapsed time logging decorator 63 | 64 | e.g. `Done (elapsed time: 82.3[s]) [module.function]` 65 | 66 | Parameters 67 | ---------- 68 | func : Callable | None, optional 69 | Target function 70 | show_func_name : bool, optional 71 | If True, show elapsed time message with `module.function` definition 72 | debug : bool, optional 73 | If True, use `logger.debug` (By default `logger.info`) 74 | """ 75 | if func is None: 76 | return partial(logging_timeit, show_func_name=show_func_name, debug=debug) 77 | 78 | @wraps(func) 79 | def wrapper(*args, **kwargs): 80 | start_time = time.time() 81 | result = func(*args, **kwargs) 82 | elapsed_time = time.time() - start_time 83 | logger = logging.getLogger(__name__) 84 | msg = f"Done (elapsed time: {elapsed_time:.2f}[s])" 85 | if show_func_name: 86 | msg = f"{msg} [{func.__module__}.{func.__name__}]" 87 | logger_func = logger.debug if debug else logger.info 88 | logger_func(msg) 89 | return result 90 | 91 | return wrapper 92 | 93 | 94 | def exit_handler(func): 95 | """Exit handling decorator on exception 96 | 97 | The main purpose is logging on keyboard interrupt exception 98 | """ 99 | 100 | @wraps(func) 101 | def wrapper(*args, **kwargs): 102 | logger = logging.getLogger(__name__) 103 | try: 104 | return func(*args, **kwargs) 105 | except KeyboardInterrupt: 106 | logger.exception("Keyboard Interrupt") 107 | sys.exit(signal.SIGINT) 108 | except Exception as e: 109 | logger.exception(e) 110 | sys.exit(1) 111 | 112 | return wrapper 113 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | import pytest 4 | 5 | 6 | @pytest.fixture(scope="session") 7 | def data_dir() -> Path: 8 | """Data directory fixture""" 9 | return Path(__file__).parent / "data" 10 | 11 | 12 | @pytest.fixture(scope="session") 13 | def example_fasta_file(data_dir: Path) -> Path: 14 | """example.faa file fixture""" 15 | return data_dir / "example.faa" 16 | 17 | 18 | @pytest.fixture(scope="session") 19 | def cog_count_file(data_dir: Path) -> Path: 20 | """cog_count.tsv file fixture""" 21 | return data_dir / "cog_count.tsv" 22 | 23 | 24 | @pytest.fixture(scope="session") 25 | def cog_download_dir(data_dir: Path) -> Path: 26 | """cog_download directory fixture""" 27 | cog_download_dir = data_dir / "cog_download" 28 | cog_download_dir.mkdir(exist_ok=True) 29 | return cog_download_dir 30 | -------------------------------------------------------------------------------- /tests/data/cog_count.tsv: -------------------------------------------------------------------------------- 1 | LETTER COUNT GROUP COLOR DESCRIPTION 2 | J 264 INFORMATION STORAGE AND PROCESSING #FCCCFC Translation, ribosomal structure and biogenesis 3 | A 8 INFORMATION STORAGE AND PROCESSING #FCDCFC RNA processing and modification 4 | K 272 INFORMATION STORAGE AND PROCESSING #FCDCEC Transcription 5 | L 147 INFORMATION STORAGE AND PROCESSING #FCDCDC Replication, recombination and repair 6 | B 8 INFORMATION STORAGE AND PROCESSING #FCDCCC Chromatin structure and dynamics 7 | D 51 CELLULAR PROCESSES AND SIGNALING #FCFCCC Cell cycle control, cell division, chromosome partitioning 8 | Y 0 CELLULAR PROCESSES AND SIGNALING #FCFCBC Nuclear structure 9 | V 95 CELLULAR PROCESSES AND SIGNALING #FCFCAC Defense mechanisms 10 | T 157 CELLULAR PROCESSES AND SIGNALING #ECFCAC Signal transduction mechanisms 11 | M 280 CELLULAR PROCESSES AND SIGNALING #DCFCAC Cell wall/membrane/envelope biogenesis 12 | N 106 CELLULAR PROCESSES AND SIGNALING #CCFCAC Cell motility 13 | Z 1 CELLULAR PROCESSES AND SIGNALING #BCFCAC Cytoskeleton 14 | W 14 CELLULAR PROCESSES AND SIGNALING #ACFCAC Extracellular structures 15 | U 39 CELLULAR PROCESSES AND SIGNALING #9CFCAC Intracellular trafficking, secretion, and vesicular transport 16 | O 149 CELLULAR PROCESSES AND SIGNALING #9CFC9C Posttranslational modification, protein turnover, chaperones 17 | X 69 CELLULAR PROCESSES AND SIGNALING #9CFC9C Mobilome: prophages, transposons 18 | C 280 METABOLISM #BCFCFC Energy production and conversion 19 | G 369 METABOLISM #CCFCFC Carbohydrate transport and metabolism 20 | E 371 METABOLISM #DCFCFC Amino acid transport and metabolism 21 | F 110 METABOLISM #DCECFC Nucleotide transport and metabolism 22 | H 180 METABOLISM #DCDCFC Coenzyme transport and metabolism 23 | I 127 METABOLISM #DCCCFC Lipid transport and metabolism 24 | P 199 METABOLISM #CCCCFC Inorganic ion transport and metabolism 25 | Q 46 METABOLISM #BCCCFC Secondary metabolites biosynthesis, transport and catabolism 26 | R 159 POORLY CHARACTERIZED #E0E0E0 General function prediction only 27 | S 116 POORLY CHARACTERIZED #CCCCCC Function unknown 28 | -------------------------------------------------------------------------------- /tests/data/example.faa: -------------------------------------------------------------------------------- 1 | >NP_414542.1 thr operon leader peptide 2 | MKRISTTITTTITITTGNGAG 3 | >NP_414543.1 Bifunctional aspartokinase/homoserine dehydrogenase 1 4 | MRVLKFGGTSVANAERFLRVADILESNARQGQVATVLSAPAKITNHLVAMIEKTISGQDALPNISDAERIFAELLTGLAAAQPGFPLAQLKTFVDQEFAQIKHVLHGISLLGQCPDSINAALICRGEKMSIAIMAGVLEARGHNVTVIDPVEKLLAVGHYLESTVDIAESTRRIAASRIPADHMVLMAGFTAGNEKGELVVLGRNGSDYSAAVLAACLRADCCEIWTDVDGVYTCDPRQVPDARLLKSMSYQEAMELSYFGAKVLHPRTITPIAQFQIPCLIKNTGNPQAPGTLIGASRDEDELPVKGISNLNNMAMFSVSGPGMKGMVGMAARVFAAMSRARISVVLITQSSSEYSISFCVPQSDCVRAERAMQEEFYLELKEGLLEPLAVTERLAIISVVGDGMRTLRGISAKFFAALARANINIVAIAQGSSERSISVVVNNDDATTGVRVTHQMLFNTDQVIEVFVIGVGGVGGALLEQLKRQQSWLKNKHIDLRVCGVANSKALLTNVHGLNLENWQEELAQAKEPFNLGRLIRLVKEYHLLNPVIVDCTSSQAVADQYADFLREGFHVVTPNKKANTSSMDYYHQLRYAAEKSRRKFLYDTNVGAGLPVIENLQNLLNAGDELMKFSGILSGSLSYIFGKLDEGMSFSEATTLAREMGYTEPDPRDDLSGMDVARKLLILARETGRELELADIEIEPVLPAEFNAEGDVAAFMANLSQLDDLFAARVAKARDEGKVLRYVGNIDEDGVCRVKIAEVDGNDPLFKVKNGENALAFYSHYYQPLPLVLRGYGAGNDVTAAGVFADLLRTLSWKLGV 5 | >NP_414544.1 homoserine kinase 6 | MVKVYAPASSANMSVGFDVLGAAVTPVDGALLGDVVTVEAAETFSLNNLGRFADKLPSEPRENIVYQCWERFCQELGKQIPVAMTLEKNMPIGSGLGSSACSVVAALMAMNEHCGKPLNDTRLLALMGELEGRISGSIHYDNVAPCFLGGMQLMIEENDIISQQVPGFDEWLWVLAYPGIKVSTAEARAILPAQYRRQDCIAHGRHLAGFIHACYSRQPELAAKLMKDVIAEPYRERLLPGFRQARQAVAEIGAVASGISGSGPTLFALCDKPETAQRVADWLGKNYLQNQEGFVHICRLDTAGARVLEN 7 | >NP_414545.1 L-threonine synthase 8 | MKLYNLKDHNEQVSFAQAVTQGLGKNQGLFFPHDLPEFSLTEIDEMLKLDFVTRSAKILSAFIGDEIPQEILEERVRAAFAFPAPVANVESDVGCLELFHGPTLAFKDFGGRFMAQMLTHIAGDKPVTILTATSGDTGAAVAHAFYGLPNVKVVILYPRGKISPLQEKLFCTLGGNIETVAIDGDFDACQALVKQAFDDEELKVALGLNSANSINISRLLAQICYYFEAVAQLPQETRNQLVVSVPSGNFGDLTAGLLAKSLGLPVKRFIAATNVNDTVPRFLHDGQWSPKATQATLSNAMDVSQPNNWPRVEELFRRKIWQLKELGYAAVDDETTQQTMRELKELGYTSEPHAAVAYRALRDQLNPGEYGLFLGTAHPAKFKESVEAILGETLDLPKELAERADLPLLSHNLPADFAALRKLMMNHQ 9 | >NP_414546.1 DUF2502 family putative periplasmic protein 10 | MKKMQSIVLALSLVLVAPMAAQAAEITLVPSVKLQIGDRDNRGYYWDGGHWRDHGWWKQHYEWRGNRWHLHGPPPPPRHHKKAPHDHHGGHGPGKHHR 11 | >NP_414547.1 peroxide resistance protein, lowers intracellular iron 12 | MLILISPAKTLDYQSPLTTTRYTLPELLDNSQQLIHEARKLTPPQISTLMRISDKLAGINAARFHDWQPDFTPANARQAILAFKGDVYTGLQAETFSEDDFDFAQQHLRMLSGLYGVLRPLDLMQPYRLEMGIRLENARGKDLYQFWGDIITNKLNEALAAQGDNVVINLASDEYFKSVKPKKLNAEIIKPVFLDEKNGKFKIISFYAKKARGLMSRFIIENRLTKPEQLTGFNSEGYFFDEDSSSNGELVFKRYEQR 13 | >NP_414548.1 putative transporter 14 | MPDFFSFINSVLWGSVMIYLLFGAGCWFTFRTGFVQFRYIRQFGKSLKNSIHPQPGGLTSFQSLCTSLAARVGSGNLAGVALAITAGGPGAVFWMWVAAFIGMATSFAECSLAQLYKERDVNGQFRGGPAWYMARGLGMRWMGVLFAVFLLIAYGIIFSGVQANAVARALSFSFDFPPLVTGIILAVFTLLAITRGLHGVARLMQGFVPLMAIIWVLTSLVICVMNIGQLPHVIWSIFESAFGWQEAAGGAAGYTLSQAITNGFQRSMFSNEAGMGSTPNAAAAAASWPPHPAAQGIVQMIGIFIDTLVICTASAMLILLAGNGTTYMPLEGIQLIQKAMRVLMGSWGAEFVTLVVILFAFSSIVANYIYAENNLFFLRLNNPKAIWCLRICTFATVIGGTLLSLPLMWQLADIIMACMAITNLTAILLLSPVVHTIASDYLRQRKLGVRPVFDPLRYPDIGRQLSPDAWDDVSQE 15 | >NP_414549.1 transaldolase B 16 | MTDKLTSLRQYTTVVADTGDIAAMKLYQPQDATTNPSLILNAAQIPEYRKLIDDAVAWAKQQSNDRAQQIVDATDKLAVNIGLEILKLVPGRISTEVDARLSYDTEASIAKAKRLIKLYNDAGISNDRILIKLASTWQGIRAAEQLEKEGINCNLTLLFSFAQARACAEAGVFLISPFVGRILDWYKANTDKKEYAPAEDPGVVSVSEIYQYYKEHGYETVVMGASFRNIGEILELAGCDRLTIAPALLKELAESEGAIERKLSYTGEVKARPARITESEFLWQHNQDPMAVDKLAEGIRKFAIDQEKLEKMIGDLL 17 | >NP_414550.1 molybdochelatase incorporating molybdenum into molybdopterin 18 | MNTLRIGLVSISDRASSGVYQDKGIPALEEWLTSALTTPFELETRLIPDEQAIIEQTLCELVDEMSCHLVLTTGGTGPARRDVTPDATLAVADREMPGFGEQMRQISLHFVPTAILSRQVGVIRKQALILNLPGQPKSIKETLEGVKDAEGNVVVHGIFASVPYCIQLLEGPYVETAPEVVAAFRPKSARRDVSE 19 | >NP_414551.1 succinate-acetate transporter 20 | MGNTKLANPAPLGLMGFGMTTILLNLHNVGYFALDGIILAMGIFYGGIAQIFAGLLEYKKGNTFGLTAFTSYGSFWLTLVAILLMPKLGLTDAPNAQFLGVYLGLWGVFTLFMFFGTLKGARVLQFVFFSLTVLFALLAIGNIAGNAAIIHFAGWIGLICGASAIYLAMGEVLNEQFGRTVLPIGESH 21 | >NP_414552.1 UPF0174 family protein 22 | MNVNYLNDSDLDFLQHCSEEQLANFARLLTHNEKGKTRLSSVLMRNELFKSMEGHPEQHRRNWQLIAGELQHFGGDSIANKLRGHGKLYRAILLDVSKRLKLKADKEMSTFEIEQQLLEQFLRNTWKKMDEEHKQEFLHAVDARVNELEELLPLLMKDKLLAKGVSHLLSSQLTRILRTHAAMSVLGHGLLRGAGLGGPVGAALNGVKAVSGSAYRVTIPAVLQIACLRRMVSATQV 23 | >NP_414554.1 UPF0412 family protein 24 | MKSVFTISASLAISLMLCCTAQANDHKLLGAIAMPRNETNDLALKLPVCRIVKRIQLSADHGDLQLSGASVYFKAARSASQSLNIPSEIKEGQTTDWININSDNDNKRCVSKITFSGHTVNSSDMATLKIIGDD 25 | >NP_414555.1 chaperone Hsp70, with co-chaperone DnaJ 26 | MGKIIGIDLGTTNSCVAIMDGTTPRVLENAEGDRTTPSIIAYTQDGETLVGQPAKRQAVTNPQNTLFAIKRLIGRRFQDEEVQRDVSIMPFKIIAADNGDAWVEVKGQKMAPPQISAEVLKKMKKTAEDYLGEPVTEAVITVPAYFNDAQRQATKDAGRIAGLEVKRIINEPTAAALAYGLDKGTGNRTIAVYDLGGGTFDISIIEIDEVDGEKTFEVLATNGDTHLGGEDFDSRLINYLVEEFKKDQGIDLRNDPLAMQRLKEAAEKAKIELSSAQQTDVNLPYITADATGPKHMNIKVTRAKLESLVEDLVNRSIEPLKVALQDAGLSVSDIDDVILVGGQTRMPMVQKKVAEFFGKEPRKDVNPDEAVAIGAAVQGGVLTGDVKDVLLLDVTPLSLGIETMGGVMTTLIAKNTTIPTKHSQVFSTAEDNQSAVTIHVLQGERKRAADNKSLGQFNLDGINPAPRGMPQIEVTFDIDADGILHVSAKDKNSGKEQKITIKASSGLNEDEIQKMVRDAEANAEADRKFEELVQTRNQGDHLLHSTRKQVEEAGDKLPADDKTAIESALTALETALKGEDKAAIEAKMQELAQVSQKLMEIAQQQHAQQQTAGADASANNAKDDDVVDAEFEEVKDKK 27 | >NP_414556.1 chaperone Hsp40, DnaK co-chaperone 28 | MAKQDYYEILGVSKTAEEREIRKAYKRLAMKYHPDRNQGDKEAEAKFKEIKEAYEVLTDSQKRAAYDQYGHAAFEQGGMGGGGFGGGADFSDIFGDVFGDIFGGGRGRQRAARGADLRYNMELTLEEAVRGVTKEIRIPTLEECDVCHGSGAKPGTQPQTCPTCHGSGQVQMRQGFFAVQQTCPHCQGRGTLIKDPCNKCHGHGRVERSKTLSVKIPAGVDTGDRIRLAGEGEAGEHGAPAGDLYVQVQVKQHPIFEREGNNLYCEVPINFAMAALGGEIEVPTLDGRVKLKVPGETQTGKLFRMRGKGVKSVRGGAQGDLLCRVVVETPVGLNERQKQLLQELQESFGGPTGEHNSPRSKSFFDGVKKFFDDLTR 29 | >NP_414557.1 IS186 transposase 30 | MNYSHDNWSAILAHIGKPEELDTSARNAGALTRRREIRDAATLLRLGLAYGPGGMSLREVTAWAQLHDVATLSDVALLKRLRNAADWFGILAAQTLAVRAAVTGCTSGKRLRLVDGTAISAPGGGSAEWRLHMGYDPHTCQFTDFELTDSRDAERLDRFAQTADEIRIADRGFGSRPECIRSLAFGEADYIVRVHWRGLRWLTAEGMRFDMMGFLRGLDCGKNGETTVMIGNSGNKKAGAPFPARLIAVSLPPEKALISKTRLLSENRRKGRVVQAETLEAAGHVLLLTSLPEDEYSAEQVADCYRLRWQIELAFKRLKSLLHLDALRAKEPELAKAWIFANLLAAFLIDDIIQPSLDFPPRSAGSEKKN 31 | >NP_414559.1 regulatory protein for HokC, overlaps CDS of hokC 32 | MLNTCRVPLTDRKVKEKRAMKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVFTAYESE 33 | >YP_025292.1 toxic membrane protein, small 34 | MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVFTAYESE 35 | >NP_414560.1 sodium-proton antiporter 36 | MKHLHRFFSSDASGGIILIIAAILAMIMANSGATSGWYHDFLETPVQLRVGSLEINKNMLLWINDALMAVFFLLVGLEVKRELMQGSLASLRQAAFPVIAAIGGMIVPALLYLAFNYADPITREGWAIPAATDIAFALGVLALLGSRVPLALKIFLMALAIIDDLGAIIIIALFYTNDLSMASLGVAAVAIAVLAVLNLCGARRTGVYILVGVVLWTAVLKSGVHATLAGVIVGFFIPLKEKHGRSPAKRLEHVLHPWVAYLILPLFAFANAGVSLQGVTLDGLTSILPLGIIAGLLIGKPLGISLFCWLALRLKLAHLPEGTTYQQIMVVGILCGIGFTMSIFIASLAFGSVDPELINWAKLGILVGSISSAVIGYSWLRVRLRPSV 37 | >NP_414561.1 transcriptional activator of nhaA 38 | MSMSHINYNHLYYFWHVYKEGSVVGAAEALYLTPQTITGQIRALEERLQGKLFKRKGRGLEPSELGELVYRYADKMFTLSQEMLDIVNYRKESNLLFDVGVADALSKRLVSSVLNAAVVEGEPIHLRCFESTHEMLLEQLSQHKLDMIISDCPIDSTQQEGLFSVRIGECGVSFWCTNPPPEKPFPACLEERRLLIPGRRSMLGRKLLNWFNSQGLNVEILGEFDDAALMKAFGAMHNAIFVAPTLYAYDFYADKTVVEIGRVENVMEEYHAIFAERMIQHPAVQRICNTDYSALFSPAVR 39 | >NP_414562.1 IS1 transposase B 40 | MPGNSPHYGRWPQHDFTSLKKLRPQSVTSRIQPGSDVIVCAEMDEQWGYVGAKSRQRWLFYAYDSLRKTVVAHVFGERTMATLGRLMSLLSPFDVVIWMTDGWPLYESRLKGKLHVISKRYTQRIERHNLNLRQHLARLGRKSLSFSKSVELHDKVIGHYLNIKHYQ 41 | >NP_414563.1 IS1 repressor TnpA 42 | MASVSISCPSCSATDGVVRNGKSTAGHQRYLCSHCRKTWQLQFTYTASQPGTHQKIIDMAMNGVGCRATARIMGVGLNTILRHLKNSGRSR 43 | >NP_414564.1 30S ribosomal subunit protein S20 44 | MANIKSAKKRAIQSEKARKHNASRRSMMRTFIKKVYAAIEAGDKAAAQKAFNEMQPIVDRQAAKGLIHKNKAARHKANLTAQINKLA 45 | >NP_414565.1 uncharacterized protein 46 | MCRHSLRSDGAGFYQLAGCEYSFSAIKIAAGGQFLPVICAMAMKSHFFLISVLNRRLTLTAVQGILGRFSLF 47 | >NP_414566.1 bifunctional riboflavin kinase/FAD synthetase 48 | MKLIRGIHNLSQAPQEGCVLTIGNFDGVHRGHRALLQGLQEEGRKRNLPVMVMLFEPQPLELFATDKAPARLTRLREKLRYLAECGVDYVLCVRFDRRFAALTAQNFISDLLVKHLRVKFLAVGDDFRFGAGREGDFLLLQKAGMEYGFDITSTQTFCEGGVRISSTAVRQALADDNLALAESLLGHPFAISGRVVHGDELGRTIGFPTANVPLRRQVSPVKGVYAVEVLGLGEKPLPGVANIGTRPTVAGIRQQLEVHLLDVAMDLYGRHIQVVLRKKIRNEQRFASLDELKAQIARDELTAREFFGLTKPA 49 | >NP_414567.1 isoleucyl-tRNA synthetase 50 | MSDYKSTLNLPETGFPMRGDLAKREPGMLARWTDDDLYGIIRAAKKGKKTFILHDGPPYANGSIHIGHSVNKILKDIIVKSKGLSGYDSPYVPGWDCHGLPIELKVEQEYGKPGEKFTAAEFRAKCREYAATQVDGQRKDFIRLGVLGDWSHPYLTMDFKTEANIIRALGKIIGNGHLHKGAKPVHWCVDCRSALAEAEVEYYDKTSPSIDVAFQAVDQDALKAKFAVSNVNGPISLVIWTTTPWTLPANRAISIAPDFDYALVQIDGQAVILAKDLVESVMQRIGVTDYTILGTVKGAELELLRFTHPFMGFDVPAILGDHVTLDAGTGAVHTAPGHGPDDYVIGQKYGLETANPVGPDGTYLPGTYPTLDGVNVFKANDIVVALLQEKGALLHVEKMQHSYPCCWRHKTPIIFRATPQWFVSMDQKGLRAQSLKEIKGVQWIPDWGQARIESMVANRPDWCISRQRTWGVPMSLFVHKDTEELHPRTLELMEEVAKRVEVDGIQAWWDLDAKEILGDEADQYVKVPDTLDVWFDSGSTHSSVVDVRPEFAGHAADMYLEGSDQHRGWFMSSLMISTAMKGKAPYRQVLTHGFTVDGQGRKMSKSIGNTVSPQDVMNKLGADILRLWVASTDYTGEMAVSDEILKRAADSYRRIRNTARFLLANLNGFDPAKDMVKPEEMVVLDRWAVGCAKAAQEDILKAYEAYDFHEVVQRLMRFCSVEMGSFYLDIIKDRQYTAKADSVARRSCQTALYHIAEALVRWMAPILSFTADEVWGYLPGEREKYVFTGEWYEGLFGLADSEAMNDAFWDELLKVRGEVNKVIEQARADKKVGGSLEAAVTLYAEPELSAKLTALGDELRFVLLTSGATVADYNDAPADAQQSEVLKGLKVALSKAEGEKCPRCWHYTQDVGKVAEHAEICGRCVSNVAGDGEKRKFA 51 | >NP_414568.1 prolipoprotein signal peptidase (signal peptidase II) 52 | MSQSICSTGLRWLWLVVVVLIIDLGSKYLILQNFALGDTVPLFPSLNLHYARNYGAAFSFLADSGGWQRWFFAGIAIGISVILAVMMYRSKATQKLNNIAYALIIGGALGNLFDRLWHGFVVDMIDFYVGDWHFATFNLADTAICVGAALIVLEGFLPSRAKKQ 53 | >NP_414569.1 FKBP-type peptidyl-prolyl cis-trans isomerase (rotamase) 54 | MSESVQSNSAVLVHFTLKLDDGTTAESTRNNGKPALFRLGDASLSEGLEQHLLGLKVGDKTTFSLEPDAAFGVPSPDLIQYFSRREFMDAGEPEIGAIMLFTAMDGSEMPGVIREINGDSITVDFNHPLAGQTVHFDIEVLEIDPALEA 55 | >NP_414570.1 4-hydroxy-3-methylbut-2-enyl diphosphate reductase, 4Fe-4S protein 56 | MQILLANPRGFCAGVDRAISIVENALAIYGAPIYVRHEVVHNRYVVDSLRERGAIFIEQISEVPDGAILIFSAHGVSQAVRNEAKSRDLTVFDATCPLVTKVHMEVARASRRGEESILIGHAGHPEVEGTMGQYSNPEGGMYLVESPDDVWKLTVKNEEKLSFMTQTTLSVDDTSDVIDALRKRFPKIVGPRKDDICYATTNRQEAVRALAEQAEVVLVVGSKNSSNSNRLAELAQRMGKRAFLIDDAKDIQEEWVKEVKCVGVTAGASAPDILVQNVVARLQQLGGGEAIPLEGREENIVFEVPKELRVDIREVD 57 | >NP_414571.1 ribonucleoside hydrolase 3 58 | MRLPIFLDTDPGIDDAVAIAAAIFAPELDLQLMTTVAGNVSVEKTTRNALQLLHFWNAEIPLAQGAAVPLVRAPRDAASVHGESGMAGYDFVEHNRKPLGIPAFLAIRDALMRAPEPVTLVAIGPLTNIALLLSQCPECKPYIRRLVIMGGSAGRGNCTPNAEFNIAADPEAAACVFRSGIEIVMCGLDVTNQAILTPDYLSTLPQLNRTGKMLHALFSHYRSGSMQSGLRMHDLCAIAWLVRPDLFTLKPCFVAVETQGEFTSGTTVVDIDGCLGKPANVQVALDLDVKGFQQWVAEVLALAS 59 | >NP_414572.1 dihydrodipicolinate reductase 60 | MHDANIRVAIAGAGGRMGRQLIQAALALEGVQLGAALEREGSSLLGSDAGELAGAGKTGVTVQSSLDAVKDDFDVFIDFTRPEGTLNHLAFCRQHGKGMVIGTTGFDEAGKQAIRDAAADIAIVFAANFSVGVNVMLKLLEKAAKVMGDYTDIEIIEAHHRHKVDAPSGTALAMGEAIAHALDKDLKDCAVYSREGHTGERVPGTIGFATVRAGDIVGEHTAMFADIGERLEITHKASSRMTFANGAVRSALWLSGKESGLFDMRDVLDLNNL 61 | >NP_414573.1 carbamoyl phosphate synthetase small subunit, glutamine amidotransferase 62 | MIKSALLVLEDGTQFHGRAIGATGSAVGEVVFNTSMTGYQEILTDPSYSRQIVTLTYPHIGNVGTNDADEESSQVHAQGLVIRDLPLIASNFRNTEDLSSYLKRHNIVAIADIDTRKLTRLLREKGAQNGCIIAGDNPDAALALEKARAFPGLNGMDLAKEVTTAEAYSWTQGSWTLTGGLPEAKKEDELPFHVVAYDFGAKRNILRMLVDRGCRLTIVPAQTSAEDVLKMNPDGIFLSNGPGDPAPCDYAITAIQKFLETDIPVFGICLGHQLLALASGAKTVKMKFGHHGGNHPVKDVEKNVVMITAQNHGFAVDEATLPANLRVTHKSLFDGTLQGIHRTDKPAFSFQGHPEASPGPHDAAPLFDHFIELIEQYRKTAK 63 | >NP_414574.1 carbamoyl-phosphate synthase large subunit 64 | MPKRTDIKSILILGAGPIVIGQACEFDYSGAQACKALREEGYRVILVNSNPATIMTDPEMADATYIEPIHWEVVRKIIEKERPDAVLPTMGGQTALNCALELERQGVLEEFGVTMIGATADAIDKAEDRRRFDVAMKKIGLETARSGIAHTMEEALAVAADVGFPCIIRPSFTMGGSGGGIAYNREEFEEICARGLDLSPTKELLIDESLIGWKEYEMEVVRDKNDNCIIVCSIENFDAMGIHTGDSITVAPAQTLTDKEYQIMRNASMAVLREIGVETGGSNVQFAVNPKNGRLIVIEMNPRVSRSSALASKATGFPIAKVAAKLAVGYTLDELMNDITGGRTPASFEPSIDYVVTKIPRFNFEKFAGANDRLTTQMKSVGEVMAIGRTQQESLQKALRGLEVGATGFDPKVSLDDPEALTKIRRELKDAGADRIWYIADAFRAGLSVDGVFNLTNIDRWFLVQIEELVRLEEKVAEVGITGLNADFLRQLKRKGFADARLAKLAGVREAEIRKLRDQYDLHPVYKRVDTCAAEFATDTAYMYSTYEEECEANPSTDREKIMVLGGGPNRIGQGIEFDYCCVHASLALREDGYETIMVNCNPETVSTDYDTSDRLYFEPVTLEDVLEIVRIEKPKGVIVQYGGQTPLKLARALEAAGVPVIGTSPDAIDRAEDRERFQHAVERLKLKQPANATVTAIEMAVEKAKEIGYPLVVRPSYVLGGRAMEIVYDEADLRRYFQTAVSVSNDAPVLLDHFLDDAVEVDVDAICDGEMVLIGGIMEHIEQAGVHSGDSACSLPAYTLSQEIQDVMRQQVQKLAFELQVRGLMNVQFAVKNNEVYLIEVNPRAARTVPFVSKATGVPLAKVAARVMAGKSLAEQGVTKEVIPPYYSVKEVVLPFNKFPGVDPLLGPEMRSTGEVMGVGRTFAEAFAKAQLGSNSTMKKHGRALLSVREGDKERVVDLAAKLLKQGFELDATHGTAIVLGEAGINPRLVNKVHEGRPHIQDRIKNGEYTYIINTTSGRRAIEDSRVIRRSALQYKVHYDTTLNGGFATAMALNADATEKVISVQEMHAQIK 65 | >NP_414576.4 cai operon transcriptional activator 66 | MCEGYVEKPLYLLIAEWMMAENRWVIAREISIHFDIEHSKAVNTLTYILSEVTEISCEVKMIPNKLEGRGCQCQRLVKVVDIDEQIYARLRNNSREKLVGVRKTPRIPAVPLTELNREQKWQMMLSKSMRR 67 | >NP_414577.2 stimulator of CaiD and CaiB enzyme activities 68 | MSYYAFEGLIPVVHPTAFVHPSAVLIGDVIVGAGVYIGPLASLRGDYGRLIVQAGANIQDGCIMHGYCDTDTIVGENGHIGHGAILHGCLIGRDALVGMNSVIMDGAVIGEESIVAAMSFVKAGFRGEKRQLLMGTPARAVRNVSDDELHWKRLNTKEYQDLVGRCHVSLHETQPLRQMEENRPRLQGTTDVTPKR 69 | >NP_414578.2 carnitinyl-CoA dehydratase 70 | MSESLHLTRNGSILEITLDRPKANAIDAKTSFEMGEVFLNFRDDPQLRVAIITGAGEKFFSAGWDLKAAAEGEAPDADFGPGGFAGLTEIFNLDKPVIAAVNGYAFGGGFELALAADFIVCADNASFALPEAKLGIVPDSGGVLRLPKILPPAIVNEMVMTGRRMGAEEALRWGIVNRVVSQAELMDNARELAQQLVNSAPLAIAALKEIYRTTSEMPVEEAYRYIRSGVLKHYPSVLHSEDAIEGPLAFAEKRDPVWKGR 71 | >NP_414579.4 putative crotonobetaine/carnitine-CoA ligase 72 | MDIIGGQHLRQMWDDLADVYGHKTALICESSGGVVNRYSYLELNQEINRTANLFYTLGIRKGDKVALHLDNCPEFIFCWFGLAKIGAIMVPINARLLCEESAWILQNSQACLLVTSAQFYPMYQQIQQEDATQLRHICLTDVALPADDGVSSFTQLKNQQPATLCYAPPLSTDDTAEILFTSGTTSRPKGVVITHYNLRFAGYYSAWQCALRDDDVYLTVMPAFHIDCQCTAAMAAFSAGATFVLVEKYSARAFWGQVQKYRATVTECIPMMIRTLMVQPPSANDQQHRLREVMFYLNLSEQEKDAFCERFGVRLLTSYGMTETIVGIIGDRPGDKRRWPSIGRVGFCYEAEIRDDHNRPLPAGEIGEICIKGIPGKTIFKEYFLNPQATAKVLEADGWLHTGDTGYRDEEDFFYFVDRRCNMIKRGGENVSCVELENIIAAHPKIQDIVVVGIKDSIRDEAIKAFVVLNEGETLSEEEFFRFCEQNMAKFKVPSYLEIRKDLPRNCSGKIIRKNLK 73 | >NP_414580.1 crotonobetainyl CoA:carnitine CoA transferase 74 | MDHLPMPKFGPLAGLRVVFSGIEIAGPFAGQMFAEWGAEVIWIENVAWADTIRVQPNYPQLSRRNLHALSLNIFKDEGREAFLKLMETTDIFIEASKGPAFARRGITDEVLWQHNPKLVIAHLSGFGQYGTEEYTNLPAYNTIAQAFSGYLIQNGDVDQPMPAFPYTADYFSGLTATTAALAALHKVRETGKGESIDIAMYEVMLRMGQYFMMDYFNGGEMCPRMSKGKDPYYAGCGLYKCADGYIVMELVGITQIEECFKDIGLAHLLGTPEIPEGTQLIHRIECPYGPLVEEKLDAWLATHTIAEVKERFAELNIACAKVLTVPELESNPQYVARESITQWQTMDGRTCKGPNIMPKFKNNPGQIWRGMPSHGMDTAAILKNIGYSENDIQELVSKGLAKVED 75 | >NP_414581.1 crotonobetaine reductase subunit II, FAD-binding 76 | MDFNLNDEQELFVAGIRELMASENWEAYFAECDRDSVYPERFVKALADMGIDSLLIPEEHGGLDAGFVTLAAVWMELGRLGAPTYVLYQLPGGFNTFLREGTQEQIDKIMAFRGTGKQMWNSAITEPGAGSDVGSLKTTYTRRNGKIYLNGSKCFITSSAYTPYIVVMARDGASPDKPVYTEWFVDMSKPGIKVTKLEKLGLRMDSCCEITFDDVELDEKDMFGREGNGFNRVKEEFDHERFLVALTNYGTAMCAFEDAARYANQRVQFGEAIGRFQLIQEKFAHMAIKLNSMKNMLYEAAWKADNGTITSGDAAMCKYFCANAAFEVVDSAMQVLGGVGIAGNHRISRFWRDLRVDRVSGGSDEMQILTLGRAVLKQYR 77 | >NP_414582.1 putative transporter 78 | MKNEKRKTGIEPKVFFPPLIIVGILCWLTVRDLDAANVVINAVFSYVTNVWGWAFEWYMVVMLFGWFWLVFGPYAKKRLGNEPPEFSTASWIFMMFASCTSAAVLFWGSIEIYYYISTPPFGLEPNSTGAKELGLAYSLFHWGPLPWATYSFLSVAFAYFFFVRKMEVIRPSSTLVPLVGEKHAKGLFGTIVDNFYLVALIFAMGTSLGLATPLVTECMQWLFGIPHTLQLDAIIITCWIILNAICVACGLQKGVRIASDVRSYLSFLMLGWVFIVSGASFIMNYFTDSVGMLLMYLPRMLFYTDPIAKGGFPQGWTVFYWAWWVIYAIQMSIFLARISRGRTVRELCFGMVLGLTASTWILWTVLGSNTLLLIDKNIINIPNLIEQYGVARAIIETWAALPLSTATMWGFFILCFIATVTLVNACSYTLAMSTCREVRDGEEPPLLVRIGWSILVGIIGIVLLALGGLKPIQTAIIAGGCPLFFVNIMVTLSFIKDAKQNWKD 79 | >NP_414583.2 anaerobic carnitine reduction putative electron transfer flavoprotein subunit 80 | MKIITCYKCVPDEQDIAVNNADGSLDFSKADAKISQYDLNAIEAACQLKQQAAEAQVTALSVGGKALTNAKGRKDVLSRGPDELIVVIDDQFEQALPQQTASALAAAAQKAGFDLILCGDGSSDLYAQQVGLLVGEILNIPAVNGVSKIISLTADTLTVERELEDETETLSIPLPAVVAVSTDINSPQIPSMKAILGAAKKPVQVWSAADIGFNAEAAWSEQQVAAPKQRERQRIVIEGDGEEQIAAFAENLRKVI 81 | >NP_414584.1 putative electron transfer flavoprotein, NAD/FAD-binding domain and ETFP adenine nucleotide-binding domain-like protein 82 | MNTFSQVWVFSDTPSRLPELMNGAQALANQINTFVLNDADGAQAIQLGANHVWKLNGKPDDRMIEDYAGVMADTIRQHGADGLVLLPNTRRGKLLAAKLGYRLKAAVSNDASTVSVQDGKATVKHMVYGGLAIGEERIATPYAVLTISSGTFDAAQPDASRTGETHTVEWQAPAVAITRTATQARQSNSVDLDKARLVVSVGRGIGSKENIALAEQLCKAIGAELACSRPVAENEKWMEHERYVGISNLMLKPELYLAVGISGQIQHMVGANASQTIFAINKDKNAPIFQYADYGIVGDAVKILPALTAALAR 83 | >NP_414585.1 putative oxidoreductase 84 | MSEDIFDAIIVGAGLAGSVAALVLAREGAQVLVIERGNSAGAKNVTGGRLYAHSLEHIIPGFADSAPVERLITHEKLAFMTEKSAMTMDYCNGDETSPSQRSYSVLRSKFDAWLMEQAEEAGAQLITGIRVDNLVQRDGKVVGVEADGDVIEAKTVILADGVNSILAEKLGMAKRVKPTDVAVGVKELIELPKSVIEDRFQLQGNQGAACLFAGSPTDGLMGGGFLYTNENTLSLGLVCGLHHLHDAKKSVPQMLEDFKQHPAVAPLIAGGKLVEYSAHVVPEAGINMLPELVGDGVLIAGDAAGMCMNLGFTIRGMDLAIAAGEAAAKTVLSAMKSDDFSKQKLAEYRQHLESGPLRDMRMYQKLPAFLDNPRMFSGYPELAVGVARDLFTIDGSAPELMRKKILRHGKKVGFINLIKDGMKGVTVL 85 | >NP_414586.1 putative 4Fe-4S ferredoxin-type protein 86 | MTSPVNVDVKLGVNKFNVDEEHPHIVVKADADKQALELLVKACPAGLYKKQDDGSVRFDYAGCLECGTCRILGLGSALEQWEYPRGTFGVEFRYG 87 | >NP_414587.1 putative MFS sugar transporter; membrane protein 88 | MQPSRNFDDLKFSSIHRRILLWGSGGPFLDGYVLVMIGVALEQLTPALKLDADWIGLLGAGTLAGLFVGTSLFGYISDKVGRRKMFLIDIIAIGVISVATMFVSSPVELLVMRVLIGIVIGADYPIATSMITEFSSTRQRAFSISFIAAMWYVGATCADLVGYWLYDVEGGWRWMLGSAAIPCLLILIGRFELPESPRWLLRKGRVKECEEMMIKLFGEPVAFDEEQPQQTRFRDLFNRRHFPFVLFVAAIWTCQVIPMFAIYTFGPQIVGLLGLGVGKNAALGNVVISLFFMLGCIPPMLWLNTAGRRPLLIGSFAMMTLALAVLGLIPDMGIWLVVMAFAVYAFFSGGPGNLQWLYPNELFPTDIRASAVGVIMSLSRIGTIVSTWALPIFINNYGISNTMLMGAGISLFGLLISVAFAPETRGMSLAQTSNMTIRGQRMG 89 | >NP_414588.1 potassium-efflux system ancillary protein for KefC, glutathione-regulated; quinone oxidoreductase, FMN-dependent 90 | MILIIYAHPYPHHSHANKRMLEQARTLEGVEIRSLYQLYPDFNIDIAAEQEALSRADLIVWQHPMQWYSIPPLLKLWIDKVFSHGWAYGHGGTALHGKHLLWAVTTGGGESHFEIGAHPGFDVLSQPLQATAIYCGLNWLPPFAMHCTFICDDETLEGQARHYKQRLLEWQEAHHG 91 | >NP_414589.1 potassium:proton antiporter 92 | MDSHTLIQALIYLGSAALIVPIAVRLGLGSVLGYLIAGCIIGPWGLRLVTDAESILHFAEIGVVLMLFIIGLELDPQRLWKLRAAVFGCGALQMVICGGLLGLFCMLLGLRWQVAELIGMTLALSSTAIAMQAMNERNLMVTQMGRSAFAVLLFQDIAAIPLVAMIPLLATSSASTTMGAFALSALKVAGALVLVVLLGRYVTRPALRFVARSGLREVFSAVALFLVFGFGLLLEEVGLSMAMGAFLAGVLLASSEYRHALESDIEPFKGLLLGLFFIGVGMSIDFGTLLENPLRIVILLLGFLIIKIAMLWLIARPLQVPNKQRRWFAVLLGQGSEFAFVVFGAAQMANVLEPEWAKSLTLAVALSMAATPILLVILNRLEQSSTEEAREADEIDEEQPRVIIAGFGRFGQITGRLLLSSGVKMVVLDHDPDHIETLRKFGMKVFYGDATRMDLLESAGAAKAEVLINAIDDPQTNLQLTEMVKEHFPHLQIIARARDVDHYIRLRQAGVEKPERETFEGALKTGRLALESLGLGPYEARERADVFRRFNIQMVEEMAMVENDTKARAAVYKRTSAMLSEIITEDREHLSLIQRHGWQGTEEGKHTGNMADEPETKPSS 93 | >NP_414590.1 dihydrofolate reductase 94 | MISLIAALAVDRVIGMENAMPWNLPADLAWFKRNTLNKPVIMGRHTWESIGRPLPGRKNIILSSQPGTDDRVTWVKSVDEAIAACGDVPEIMVIGGGRVYEQFLPKAQKLYLTHIDAEVEGDTHFPDYEPDDWESVFSEFHDADAQNSHSYCFEILERR 95 | >NP_414591.1 diadenosine tetraphosphatase 96 | MATYLIGDVHGCYDELIALLHKVEFTPGKDTLWLTGDLVARGPGSLDVLRYVKSLGDSVRLVLGNHDLHLLAVFAGISRNKPKDRLTPLLEAPDADELLNWLRRQPLLQIDEEKKLVMAHAGITPQWDLQTAKECARDVEAVLSSDSYPFFLDAMYGDMPNNWSPELRGLGRLRFITNAFTRMRFCFPNGQLDMYSKESPEEAPAPLKPWFAIPGPVAEEYSIAFGHWASLEGKGTPEGIYALDTGCCWGGTLTCLRWEDKQYFVQPSNRHKDLGEAAAS 97 | >NP_414592.1 protein associated with Co2+ and Mg2+ efflux 98 | MINSPRVCIQVQSVYIEAQSSPDNERYVFAYTVTIRNLGRAPVQLLGRYWLITNGNGRETEVQGEGVVGVQPLIAPGEEYQYTSGAIIETPLGTMQGHYEMIDENGVPFSIDIPVFRLAVPTLIH 99 | >NP_414593.1 16S rRNA m(6)A1518, m(6)A1519 dimethyltransferase, SAM-dependent 100 | MNNRVHQGHLARKRFGQNFLNDQFVIDSIVSAINPQKGQAMVEIGPGLAALTEPVGERLDQLTVIELDRDLAARLQTHPFLGPKLTIYQQDAMTFNFGELAEKMGQPLRVFGNLPYNISTPLMFHLFSYTDAIADMHFMLQKEVVNRLVAGPNSKAYGRLSVMAQYYCNVIPVLEVPPSAFTPPPKVDSAVVRLVPHATMPHPVKDVRVLSRITTEAFNQRRKTIRNSLGNLFSVEVLTGMGIDPAMRAENISVAQYCQMANYLAENAPLQES 101 | >NP_414594.1 4-hydroxy-L-threonine phosphate dehydrogenase, NAD-dependent 102 | MVKTQRVVITPGEPAGIGPDLVVQLAQREWPVELVVCADATLLTNRAAMLGLPLTLRPYSPNSPAQPQTAGTLTLLPVALRAPVTAGQLAVENGHYVVETLARACDGCLNGEFAALITGPVHKGVINDAGIPFTGHTEFFEERSQAKKVVMMLATEELRVALATTHLPLRDIADAITPALLHEVIAILHHDLRTKFGIAEPRILVCGLNPHAGEGGHMGTEEIDTIIPVLNELRAQGMKLNGPLPADTLFQPKYLDNADAVLAMYHDQGLPVLKYQGFGRGVNITLGLPFIRTSVDHGTALELAGRGKADVGSFITALNLAIKMIVNTQ 103 | >NP_414595.1 peptidyl-prolyl cis-trans isomerase (PPIase) 104 | MKNWKTLLLGIAMIANTSFAAPQVVDKVAAVVNNGVVLESDVDGLMQSVKLNAAQARQQLPDDATLRHQIMERLIMDQIILQMGQKMGVKISDEQLDQAIANIAKQNNMTLDQMRSRLAYDGLNYNTYRNQIRKEMIISEVRNNEVRRRITILPQEVESLAQQVGNQNDASTELNLSHILIPLPENPTSDQVNEAESQARAIVDQARNGADFGKLAIAHSADQQALNGGQMGWGRIQELPGIFAQALSTAKKGDIVGPIRSGVGFHILKVNDLRGESKNISVTEVHARHILLKPSPIMTDEQARVKLEQIAADIKSGKTTFAAAAKEFSQDPGSANQGGDLGWATPDIFDPAFRDALTRLNKGQMSAPVHSSFGWHLIELLDTRNVDKTDAAQKDRAYRMLMNRKFSEEAASWMQEQRASAYVKILSN 105 | >NP_414596.1 LPS assembly OM complex LptDE, beta-barrel component 106 | MKKRIPTLLATMIATALYSQQGLAADLASQCMLGVPSYDRPLVQGDTNDLPVTINADHAKGDYPDDAVFTGSVDIMQGNSRLQADEVQLHQKEAPGQPEPVRTVDALGNVHYDDNQVILKGPKGWANLNTKDTNVWEGDYQMVGRQGRGKADLMKQRGENRYTILDNGSFTSCLPGSDTWSVVGSEIIHDREEQVAEIWNARFKVGPVPIFYSPYLQLPVGDKRRSGFLIPNAKYTTTNYFEFYLPYYWNIAPNMDATITPHYMHRRGNIMWENEFRYLSQAGAGLMELDYLPSDKVYEDEHPNDDSSRRWLFYWNHSGVMDQVWRFNVDYTKVSDPSYFNDFDNKYGSSTDGYATQKFSVGYAVQNFNATVSTKQFQVFSEQNTSSYSAEPQLDVNYYQNDVGPFDTRIYGQAVHFVNTRDDMPEATRVHLEPTINLPLSNNWGSINTEAKLLATHYQQTNLDWYNSRNTTKLDESVNRVMPQFKVDGKMVFERDMEMLAPGYTQTLEPRAQYLYVPYRDQSDIYNYDSSLLQSDYSGLFRDRTYGGLDRIASANQVTTGVTSRIYDDAAVERFNISVGQIYYFTESRTGDDNITWENDDKTGSLVWAGDTYWRISERWGLRGGIQYDTRLDNVATSNSSIEYRRDEDRLVQLNYRYASPEYIQATLPKYYSTAEQYKNGISQVGAVASWPIADRWSIVGAYYYDTNANKQADSMLGVQYSSCCYAIRVGYERKLNGWDNDKQHAVYDNAIGFNIELRGLSSNYGLGTQEMLRSNILPYQNTL 107 | >NP_414597.1 membrane-anchored DnaK co-chaperone, DNA-binding protein 108 | MQYWGKIIGVAVALLMGGGFWGVVLGLLIGHMFDKARSRKMAWFANQRERQALFFATTFEVMGHLTKSKGRVTEADIHIASQLMDRMNLHGASRTAAQNAFRVGKSDNYPLREKMRQFRSVCFGRFDLIRMFLEIQIQAAFADGSLHPNERAVLYVIAEELGISRAQFDQFLRMMQGGAQFGGGYQQQTGGGNWQQAQRGPTLEDACNVLGVKPTDDATTIKRAYRKLMSEHHPDKLVAKGLPPEMMEMAKQKAQEIQQAYELIKQQKGFK 109 | >NP_414600.1 dual specificity 23S rRNA pseudouridine(746), tRNA pseudouridine(32) synthase, SAM-dependent 110 | MGMENYNPPQEPWLVILYQDDHIMVVNKPSGLLSVPGRLEEHKDSVMTRIQRDYPQAESVHRLDMATSGVIVVALTKAAERELKRQFREREPKKQYVARVWGHPSPAEGLVDLPLICDWPNRPKQKVCYETGKPAQTEYEVVEYAADNTARVVLKPITGRSHQLRVHMLALGHPILGDRFYASPEARAMAPRLLLHAEMLTITHPAYGNSMTFKAPADF 111 | >NP_414601.1 RNA polymerase remodeling/recycling factor ATPase; RNA polymerase-associated, ATP-dependent RNA translocase 112 | MPFTLGQRWISDTESELGLGTVVAVDARTVTLLFPSTGENRLYARSDSPVTRVMFNPGDTITSHDGWQMQVEEVKEENGLLTYIGTRLDTEESGVALREVFLDSKLVFSKPQDRLFAGQIDRMDRFALRYRARKYSSEQFRMPYSGLRGQRTSLIPHQLNIAHDVGRRHAPRVLLADEVGLGKTIEAGMILHQQLLSGAAERVLIIVPETLQHQWLVEMLRRFNLRFALFDDERYAEAQHDAYNPFDTEQLVICSLDFARRSKQRLEHLCEAEWDLLVVDEAHHLVWSEDAPSREYQAIEQLAEHVPGVLLLTATPEQLGMESHFARLRLLDPNRFHDFAQFVEEQKNYRPVADAVAMLLAGNKLSNDELNMLGEMIGEQDIEPLLQAANSDSEDAQSARQELVSMLMDRHGTSRVLFRNTRNGVKGFPKRELHTIKLPLPTQYQTAIKVSGIMGARKSAEDRARDMLYPERIYQEFEGDNATWWNFDPRVEWLMGYLTSHRSQKVLVICAKAATALQLEQVLREREGIRAAVFHEGMSIIERDRAAAWFAEEDTGAQVLLCSEIGSEGRNFQFASHMVMFDLPFNPDLLEQRIGRLDRIGQAHDIQIHVPYLEKTAQSVLVRWYHEGLDAFEHTCPTGRTIYDSVYNDLINYLASPDQTEGFDDLIKNCREQHEALKAQLEQGRDRLLEIHSNGGEKAQALAESIEEQDDDTNLIAFAMNLFDIIGINQDDRGDNMIVLTPSDHMLVPDFPGLSEDGITITFDREVALAREDAQFITWEHPLIRNGLDLILSGDTGSSTISLLKNKALPVGTLLVELIYVVEAQAPKQLQLNRFLPPTPVRMLLDKNGNNLAAQVEFETFNRQLNAVNRHTGSKLVNAVQQDVHAILQLGEAQIEKSARALIDAARNEADEKLSAELSRLEALRAVNPNIRDDELTAIESNRQQVMESLDQAGWRLDALRLIVVTHQ 113 | >NP_414602.1 DNA polymerase II 114 | MAQAGFILTRHWRDTPQGTEVSFWLATDNGPLQVTLAPQESVAFIPADQVPRAQHILQGEQGFRLTPLALKDFHRQPVYGLYCRAHRQLMNYEKRLREGGVTVYEADVRPPERYLMERFITSPVWVEGDMHNGTIVNARLKPHPDYRPPLKWVSIDIETTRHGELYCIGLEGCGQRIVYMLGPENGDASSLDFELEYVASRPQLLEKLNAWFANYDPDVIIGWNVVQFDLRMLQKHAERYRLPLRLGRDNSELEWREHGFKNGVFFAQAKGRLIIDGIEALKSAFWNFSSFSLETVAQELLGEGKSIDNPWDRMDEIDRRFAEDKPALATYNLKDCELVTQIFHKTEIMPFLLERATVNGLPVDRHGGSVAAFGHLYFPRMHRAGYVAPNLGEVPPHASPGGYVMDSRPGLYDSVLVLDYKSLYPSIIRTFLIDPVGLVEGMAQPDPEHSTEGFLDAWFSREKHCLPEIVTNIWHGRDEAKRQGNKPLSQALKIIMNAFYGVLGTTACRFFDPRLASSITMRGHQIMRQTKALIEAQGYDVIYGDTDSTFVWLKGAHSEEEAAKIGRALVQHVNAWWAETLQKQRLTSALELEYETHFCRFLMPTIRGADTGSKKRYAGLIQEGDKQRMVFKGLETVRTDWTPLAQQFQQELYLRIFRNEPYQEYVRETIDKLMAGELDARLVYRKRLRRPLSEYQRNVPPHVRAARLADEENQKRGRPLQYQNRGTIKYVWTTNGPEPLDYQRSPLDYEHYLTRQLQPVAEGILPFIEDNFATLMTGQLGLF 115 | >NP_414603.1 L-ribulose-5-phosphate 4-epimerase 116 | MLEDLKRQVLEANLALPKHNLVTLTWGNVSAVDRERGVFVIKPSGVDYSVMTADDMVVVSIETGEVVEGTKKPSSDTPTHRLLYQAFPSIGGIVHTHSRHATIWAQAGQSIPATGTTHADYFYGTIPCTRKMTDAEINGEYEWETGNVIVETFEKQGIDAAQMPGVLVHSHGPFAWGKNAEDAVHNAIVLEEVAYMGIFCRQLAPQLPDMQQTLLDKHYLRKHGAKAYYGQ 117 | >NP_414604.1 L-arabinose isomerase 118 | MTIFDNYEVWFVIGSQHLYGPETLRQVTQHAEHVVNALNTEAKLPCKLVLKPLGTTPDEITAICRDANYDDRCAGLVVWLHTFSPAKMWINGLTMLNKPLLQFHTQFNAALPWDSIDMDFMNLNQTAHGGREFGFIGARMRQQHAVVTGHWQDKQAHERIGSWMRQAVSKQDTRHLKVCRFGDNMREVAVTDGDKVAAQIKFGFSVNTWAVGDLVQVVNSISDGDVNALVDEYESCYTMTPATQIHGKKRQNVLEAARIELGMKRFLEQGGFHAFTTTFEDLHGLKQLPGLAVQRLMQQGYGFAGEGDWKTAALLRIMKVMSTGLQGGTSFMEDYTYHFEKGNDLVLGSHMLEVCPSIAAEEKPILDVQHLGIGGKDDPARLIFNTQTGPAIVASLIDLGDRYRLLVNCIDTVKTPHSLPKLPVANALWKAQPDLPTASEAWILAGGAHHTVFSHALNLNDMRQFAEMHDIEITVIDNDTRLPAFKDALRWNEVYYGFRR 119 | >NP_414605.1 L-ribulokinase 120 | MAIAIGLDFGSDSVRALAVDCATGEEIATSVEWYPRWQKGQFCDAPNNQFRHHPRDYIESMEAALKTVLAELSVEQRAAVVGIGVDSTGSTPAPIDADGNVLALRPEFAENPNAMFVLWKDHTAVEEAEEITRLCHAPGNVDYSRYIGGIYSSEWFWAKILHVTRQDSAVAQSAASWIELCDWVPALLSGTTRPQDIRRGRCSAGHKSLWHESWGGLPPASFFDELDPILNRHLPSPLFTDTWTADIPVGTLCPEWAQRLGLPESVVISGGAFDCHMGAVGAGAQPNALVKVIGTSTCDILIADKQSVGERAVKGICGQVDGSVVPGFIGLEAGQSAFGDIYAWFGRVLGWPLEQLAAQHPELKTQINASQKQLLPALTEAWAKNPSLDHLPVVLDWFNGRRTPNANQRLKGVITDLNLATDAPLLFGGLIAATAFGARAIMECFTDQGIAVNNVMALGGIARKNQVIMQACCDVLNRPLQIVASDQCCALGAAIFAAVAAKVHADIPSAQQKMASAVEKTLQPCSEQAQRFEQLYRRYQQWAMSAEQHYLPTSAPAQAAQAVATL 121 | >NP_414606.1 ara regulon transcriptional activator; autorepressor 122 | MAEAQNDPLLPGYSFNAHLVAGLTPIEANGYLDFFIDRPLGMKGYILNLTIRGQGVVKNQGREFVCRPGDILLFPPGEIHHYGRHPEAREWYHQWVYFRPRAYWHEWLNWPSIFANTGFFRPDEAHQPHFSDLFGQIINAGQGEGRYSELLAINLLEQLLLRRMEAINESLHPPMDNRVREACQYISDHLADSNFDIASVAQHVCLSPSRLSHLFRQQLGISVLSWREDQRISQAKLLLSTTRMPIATVGRNVGFDDQLYFSRVFKKCTGASPSEFRAGCEEKVNDVAVKLS 123 | >NP_414607.1 DedA family inner membrane protein 124 | MQALLEHFITQSTVYSLMAVVLVAFLESLALVGLILPGTVLMAGLGALIGSGELSFWHAWLAGIIGCLMGDWISFWLGWRFKKPLHRWSFLKKNKALLDKTEHALHQHSMFTILVGRFVGPTRPLVPMVAGMLDLPVAKFITPNIIGCLLWPPFYFLPGILAGAAIDIPAGMQSGEFKWLLLATAVFLWVGGWLCWRLWRSGKATDRLSHYLSRGRLLWLTPLISAIGVVALVVLIRHPLMPVYIDILRKVVGV 125 | >NP_414608.1 thiamine/thiamine pyrophosphate ABC transporter ATPase 126 | MLKLTDITWLYHHLPMRFSLTVERGEQVAILGPSGAGKSTLLNLIAGFLTPASGSLTIDGVDHTTMPPSRRPVSMLFQENNLFSHLTVAQNIGLGLNPGLKLNAVQQGKMHAIARQMGIDNLMARLPGELSGGQRQRVALARCLVREQPILLLDEPFSALDPALRQEMLTLVSTSCQQQKMTLLMVSHSVEDAARIATRSVVVADGRIAWQGMTNELLSGKASASALLGITG 127 | >NP_414609.1 thiamine/thiamine pyrophosphate ABC transporter permease 128 | MATRRQPLIPGWLIPGVSATTLVVAVALAAFLALWWNAPQDDWVAVWQDSYLWHVVRFSFWQAFLSALLSVIPAIFLARALYRRRFPGRLALLRLCAMTLILPVLVAVFGILSVYGRQGWLATLCQSLGLEWTFSPYGLQGILLAHVFFNLPMASRLLLQALENIPGEQRQLAAQLGMRSWHFFRFVEWPWLRRQIPPVAALIFMLCFASFATVLSLGGGPQATTIELAIYQALSYDYDPARAAMLALLQMVCCLGLVLLSQRLSKAIAPGTTLLQGWRDPDDRLHSRICDTVLIVLALLLLLPPLLAVIVDGVNRQLPEVLAQPVLWQALWTSLRIALAAGVLCVVLTMMLLWSSRELRARQKMLAGQVLEMSGMLILAMPGIVLATGFFLLLNNTIGLPQSADGIVIFTNALMAIPYALKVLENPMRDITARYSMLCQSLGIEGWSRLKVVELRALKRPLAQALAFACVLSIGDFGVVALFGNDDFRTLPFYLYQQIGSYRSQDGAVTALILLLLCFLLFTVIEKLPGRNVKTD 129 | >NP_414610.1 thiamine/thiamine pyrophosphate/thiamine monophosphate ABC transporter periplasmic binding protein 130 | MLKKCLPLLLLCTAPVFAKPVLTVYTYDSFAADWGPGPVVKKAFEADCNCELKLVALEDGVSLLNRLRMEGKNSKADVVLGLDNNLLDAASKTGLFAKSGVAADAVNVPGGWNNDTFVPFDYGYFAFVYDKNKLKNPPQSLKELVESDQNWRVIYQDPRTSTPGLGLLLWMQKVYGDDAPQAWQKLAKKTVTVTKGWSEAYGLFLKGESDLVLSYTTSPAYHILEEKKDNYAAANFSEGHYLQVEVAARTAASKQPELAQKFLQFMVSPAFQNAIPTGNWMYPVANVTLPAGFEKLTKPATTLEFTPAEVAAQRQAWISEWQRAVSR 131 | >NP_414611.1 transcriptional DNA-binding transcriptional activator of sgrS sRNA 132 | MPSARLQQQFIRLWQCCEGKSQDTTLNELAALLSCSRRHMRTLLNTMQDRGWLTWEAEVGRGKRSRLTFLYTGLALQQQRAEDLLEQDRIDQLVQLVGDKATVRQMLVSHLGRSFRQGRHILRVLYYRPLRNLLPGSALRRSETHIARQIFSSLTRINEENGELEADIAHHWQQISPLHWRFFLRPGVHFHHGRELEMDDVIASLKRINTLPLYSHIADIVSPTPWTLDIHLTQPDRWLPLLLGQVPAMILPREWETLSNFASHPIGTGPYAVIRNSTNQLKIQAFDDFFGYRALIDEVNVWVLPEIADEPAGGLMLKGPQGEEKEIESRLEEGCYYLLFDSRTHRGANQQVRDWVSYVLSPTNLVYFAEEQYQQLWFPAYGLLPRWHHARTIKSEKPAGLESLTLTFYQDHSEHRVIAGIMQQILASHQVTLKIKEIDYDQWHTGEIESDIWLNSANFTLPLDFSVFAHLCEVPLLQHCIPIDWQADAARWRNGEMNLANWCQQLVASKAMVPLLHHWLIIQGQRSMRGLRMNTLGWFDFKSAWFAPPDP 133 | >YP_002791237.1 inhibitor of glucose uptake 134 | MRQFYQHYFTATAKLCWLRWLSVPQRLTMLEGLMQWDDRNSES 135 | >YP_025293.1 broad specificity sugar efflux system 136 | MIWIMTMARRMNGVYAAFMLVAFMMGVAGALQAPTLSLFLSREVGAQPFWIGLFYTVNAIAGIGVSLWLAKRSDSQGDRRKLIIFCCLMAIGNALLFAFNRHYLTLITCGVLLASLANTAMPQLFALAREYADNSAREVVMFSSVMRAQLSLAWVIGPPLAFMLALNYGFTVMFSIAAGIFTLSLVLIAFMLPSVARVELPSENALSMQGGWQDSNVRMLFVASTLMWTCNTMYIIDMPLWISSELGLPDKLAGFLMGTAAGLEIPAMILAGYYVKRYGKRRMMVIAVAAGVLFYTGLIFFNSRMALMTLQLFNAVFIGIVAGIGMLWFQDLMPGRAGAATTLFTNSISTGVILAGVIQGAIAQSWGHFAVYWVIAVISVVALFLTAKVKDV 137 | >NP_414613.1 3-isopropylmalate dehydratase small subunit 138 | MAEKFIKHTGLVVPLDAANVDTDAIIPKQFLQKVTRTGFGAHLFNDWRFLDEKGQQPNPDFVLNFPQYQGASILLARENFGCGSSREHAPWALTDYGFKVVIAPSFADIFYGNSFNNQLLPVKLSDAEVDELFALVKANPGIHFDVDLEAQEVKAGEKTYRFTIDAFRRHCMMNGLDSIGLTLQHDDAIAAYEAKQPAFMN 139 | >NP_414614.1 3-isopropylmalate dehydratase large subunit 140 | MAKTLYEKLFDAHVVYEAENETPLLYIDRHLVHEVTSPQAFDGLRAHGRPVRQPGKTFATMDHNVSTQTKDINACGEMARIQMQELIKNCKEFGVELYDLNHPYQGIVHVMGPEQGVTLPGMTIVCGDSHTATHGAFGALAFGIGTSEVEHVLATQTLKQGRAKTMKIEVQGKAAPGITAKDIVLAIIGKTGSAGGTGHVVEFCGEAIRDLSMEGRMTLCNMAIEMGAKAGLVAPDETTFNYVKGRLHAPKGKDFDDAVAYWKTLQTDEGATFDTVVTLQAEEISPQVTWGTNPGQVISVNDNIPDPASFADPVERASAEKALAYMGLKPGIPLTEVAIDKVFIGSCTNSRIEDLRAAAEIAKGRKVAPGVQALVVPGSGPVKAQAEAEGLDKIFIEAGFEWRLPGCSMCLAMNNDRLNPGERCASTSNRNFEGRQGRGGRTHLVSPAMAAAAAVTGHFADIRNIK 141 | >NP_414615.4 3-isopropylmalate dehydrogenase, NAD(+)-dependent 142 | MSKNYHIAVLPGDGIGPEVMTQALKVLDAVRNRFAMRITTSHYDVGGAAIDNHGQPLPPATVEGCEQADAVLFGSVGGPKWEHLPPDQQPERGALLPLRKHFKLFSNLRPAKLYQGLEAFCPLRADIAANGFDILCVRELTGGIYFGQPKGREGSGQYEKAFDTEVYHRFEIERIARIAFESARKRRHKVTSIDKANVLQSSILWREIVNEIATEYPDVELAHMYIDNATMQLIKDPSQFDVLLCSNLFGDILSDECAMITGSMGMLPSASLNEQGFGLYEPAGGSAPDIAGKNIANPIAQILSLALLLRYSLDADDAACAIERAINRALEEGIRTGDLARGAAAVSTDEMGDIIARYVAEGV 143 | >NP_414616.1 2-isopropylmalate synthase 144 | MSQQVIIFDTTLRDGEQALQASLSVKEKLQIALALERMGVDVMEVGFPVSSPGDFESVQTIARQVKNSRVCALARCVEKDIDVAAESLKVAEAFRIHTFIATSPMHIATKLRSTLDEVIERAIYMVKRARNYTDDVEFSCEDAGRTPIADLARVVEAAINAGATTINIPDTVGYTMPFEFAGIISGLYERVPNIDKAIISVHTHDDLGLAVGNSLAAVHAGARQVEGAMNGIGERAGNCSLEEVIMAIKVRKDILNVHTAINHQEIWRTSQLVSQICNMPIPANKAIVGSGAFAHSSGIHQDGVLKNRENYEIMTPESIGLNQIQLNLTSRSGRAAVKHRMDEMGYKESEYNLDNLYDAFLKLADKKGQVFDYDLEALAFIGKQQEEPEHFRLDYFSVQSGSNDIATAAVKLACGEEVKAEAANGNGPVDAVYQAINRITEYNVELVKYSLTAKGHGKDALGQVDIVANYNGRRFHGVGLATDIVESSAKAMVHVLNNIWRAAEVEKELQRKAQHNENNKETV 145 | >NP_414617.1 leu operon leader peptide 146 | MTHIVRFIGLLLLNASSLRGRRVSGIQH 147 | >NP_414618.4 global transcription factor 148 | MPEVQTDHPETAELSKPQLRMVDLNLLTVFDAVMQEQNITRAAHVLGMSQPAVSNAVARLKVMFNDELFVRYGRGIQPTARAFQLFGSVRQALQLVQNELPGSGFEPASSERVFHLCVCSPLDSILTSQIYNHIEQIAPNIHVMFKSSLNQNTEHQLRYQETEFVISYEDFHRPEFTSVPLFKDEMVLVASKNHPTIKGPLLKHDVYNEQHAAVSLDRFASFSQPWYDTVDKQASIAYQGMAMMSVLSVVSQTHLVAIAPRWLAEEFAESLELQVLPLPLKQNSRTCYLSWHEAAGRDKGHQWMEEQLVSICKR 149 | >YP_025294.2 acetolactate synthase 3 large subunit 150 | MEMLSGAEMVVRSLIDQGVKQVFGYPGGAVLDIYDALHTVGGIDHVLVRHEQAAVHMADGLARATGEVGVVLVTSGPGATNAITGIATAYMDSIPLVVLSGQVATSLIGYDAFQECDMVGISRPVVKHSFLVKQTEDIPQVLKKAFWLAASGRPGPVVVDLPKDILNPANKLPYVWPESVSMRSYNPTTTGHKGQIKRALQTLVAAKKPVVYVGGGAITAGCHQQLKETVEALNLPVVCSLMGLGAFPATHRQALGMLGMHGTYEANMTMHNADVIFAVGVRFDDRTTNNLAKYCPNATVLHIDIDPTSISKTVTADIPIVGDARQVLEQMLELLSQESAHQPLDEIRDWWQQIEQWRARQCLKYDTHSEKIKPQAVIETLWRLTKGDAYVTSDVGQHQMFAALYYPFDKPRRWINSGGLGTMGFGLPAALGVKMALPEETVVCVTGDGSIQMNIQELSTALQYELPVLVVNLNNRYLGMVKQWQDMIYSGRHSQSYMQSLPDFVRLAEAYGHVGIQISHPHELESKLSEALEQVRNNRLVFVDVTVDGSEHVYPMQIRGGGMDEMWLSKTERT 151 | >NP_414620.1 acetolactate synthase 3, small subunit, valine-sensitive 152 | MRRILSVLLENESGALSRVIGLFSQRGYNIESLTVAPTDDPTLSRMTIQTVGDEKVLEQIEKQLHKLVDVLRVSELGQGAHVEREIMLVKIQASGYGRDEVKRNTEIFRGQIIDVTPSLYTVQLAGTSGKLDAFLASIRDVAKIVEVARSGVVGLSRGDKIMR 153 | >NP_414622.1 transcriptional repressor-activator for carbon metabolism 154 | MKLDEIARLAGVSRTTASYVINGKAKQYRVSDKTVEKVMAVVREHNYHPNAVAAGLRAGRTRSIGLVIPDLENTSYTRIANYLERQARQRGYQLLIACSEDQPDNEMRCIEHLLQRQVDAIIVSTSLPPEHPFYQRWANDPFPIVALDRALDREHFTSVVGADQDDAEMLAEELRKFPAETVLYLGALPELSVSFLREQGFRTAWKDDPREVHFLYANSYEREAAAQLFEKWLETHPMPQALFTTSFALLQGVMDVTLRRDGKLPSDLAIATFGDNELLDFLQCPVLAVAQRHRDVAERVLEIVLASLDEPRKPKPGLTRIKRNLYRRGVLSRS 155 | >NP_414623.1 RsmH methytransferase inhibitor 156 | MFRGATLVNLDSKGRLSVPTRYREQLLENAAGQMVCTIDIYHPCLLLYPLPEWEIIEQKLSRLSSMNPVERRVQRLLLGHASECQMDGAGRLLIAPVLRQHAGLTKEVMLVGQFNKFELWDETTWHQQVKEDIDAEQLATGDLSERLQDLSL 157 | >NP_414624.1 16S rRNA m(4)C1402 methyltransferase, SAM-dependent 158 | MMENYKHTTVLLDEAVNGLNIRPDGIYIDGTFGRGGHSRLILSQLGEEGRLLAIDRDPQAIAVAKTIDDPRFSIIHGPFSALGEYVAERDLIGKIDGILLDLGVSSPQLDDAERGFSFMRDGPLDMRMDPTRGQSAAEWLQTAEEADIAWVLKTYGEERFAKRIARAIVERNREQPMTRTKELAEVVAAATPVKDKFKHPATRTFQAVRIWVNSELEEIEQALKSSLNVLAPGGRLSIISFHSLEDRIVKRFMRENSRGPQVPAGLPMTEEQLKKLGGRQLRALGKLMPGEEEVAENPRARSSVLRIAERTNA 159 | >NP_414625.1 membrane bound cell division leucine zipper septum protein 160 | MISRVTEALSKVKGSMGSHERHALPGVIGDDLLRFGKLPLCLFICIILTAVTVVTTAHHTRLLTAQREQLVLERDALDIEWRNLILEENALGDHSRVERIATEKLQMQHVDPSQENIVVQK 161 | >NP_414626.1 transpeptidase involved in septal peptidoglycan synthesis; penicillin-binding protein 3 162 | MKAAAKTQKPKRQEEHANFISWRFALLCGCILLALAFLLGRVAWLQVISPDMLVKEGDMRSLRVQQVSTSRGMITDRSGRPLAVSVPVKAIWADPKEVHDAGGISVGDRWKALANALNIPLDQLSARINANPKGRFIYLARQVNPDMADYIKKLKLPGIHLREESRRYYPSGEVTAHLIGFTNVDSQGIEGVEKSFDKWLTGQPGERIVRKDRYGRVIEDISSTDSQAAHNLALSIDERLQALVYRELNNAVAFNKAESGSAVLVDVNTGEVLAMANSPSYNPNNLSGTPKEAMRNRTITDVFEPGSTVKPMVVMTALQRGVVRENSVLNTIPYRINGHEIKDVARYSELTLTGVLQKSSNVGVSKLALAMPSSALVDTYSRFGLGKATNLGLVGERSGLYPQKQRWSDIERATFSFGYGLMVTPLQLARVYATIGSYGIYRPLSITKVDPPVPGERVFPESIVRTVVHMMESVALPGGGGVKAAIKGYRIAIKTGTAKKVGPDGRYINKYIAYTAGVAPASQPRFALVVVINDPQAGKYYGGAVSAPVFGAIMGGVLRTMNIEPDALTTGDKNEFVINQGEGTGGRS 163 | >NP_414627.1 UDP-N-acetylmuramoyl-L-alanyl-D-glutamate:meso- diaminopimelate ligase 164 | MADRNLRDLLAPWVPDAPSRALREMTLDSRVAAAGDLFVAVVGHQADGRRYIPQAIAQGVAAIIAEAKDEATDGEIREMHGVPVIYLSQLNERLSALAGRFYHEPSDNLRLVGVTGTNGKTTTTQLLAQWSQLLGEISAVMGTVGNGLLGKVIPTENTTGSAVDVQHELAGLVDQGATFCAMEVSSHGLVQHRVAALKFAASVFTNLSRDHLDYHGDMEHYEAAKWLLYSEHHCGQAIINADDEVGRRWLAKLPDAVAVSMEDHINPNCHGRWLKATEVNYHDSGATIRFSSSWGDGEIESHLMGAFNVSNLLLALATLLALGYPLADLLKTAARLQPVCGRMEVFTAPGKPTVVVDYAHTPDALEKALQAARLHCAGKLWCVFGCGGDRDKGKRPLMGAIAEEFADVAVVTDDNPRTEEPRAIINDILAGMLDAGHAKVMEGRAEAVTCAVMQAKENDVVLVAGKGHEDYQIVGNQRLDYSDRVTVARLLGVIA 165 | >NP_414628.1 UDP-N-acetylmuramoyl-tripeptide:D-alanyl-D- alanine ligase 166 | MISVTLSQLTDILNGELQGADITLDAVTTDTRKLTPGCLFVALKGERFDAHDFADQAKAGGAGALLVSRPLDIDLPQLIVKDTRLAFGELAAWVRQQVPARVVALTGSSGKTSVKEMTAAILSQCGNTLYTAGNLNNDIGVPMTLLRLTPEYDYAVIELGANHQGEIAWTVSLTRPEAALVNNLAAAHLEGFGSLAGVAKAKGEIFSGLPENGIAIMNADNNDWLNWQSVIGSRKVWRFSPNAANSDFTATNIHVTSHGTEFTLQTPTGSVDVLLPLPGRHNIANALAAAALSMSVGATLDAIKAGLANLKAVPGRLFPIQLAENQLLLDDSYNANVGSMTAAVQVLAEMPGYRVLVVGDMAELGAESEACHVQVGEAAKAAGIDRVLSVGKQSHAISTASGVGEHFADKTALITRLKLLIAEQQVITILVKGSRSAAMEEVVRALQENGTC 167 | >NP_414629.1 phospho-N-acetylmuramoyl-pentapeptide transferase 168 | MLVWLAEHLVKYYSGFNVFSYLTFRAIVSLLTALFISLWMGPRMIAHLQKLSFGQVVRNDGPESHFSKRGTPTMGGIMILTAIVISVLLWAYPSNPYVWCVLVVLVGYGVIGFVDDYRKVVRKDTKGLIARWKYFWMSVIALGVAFALYLAGKDTPATQLVVPFFKDVMPQLGLFYILLAYFVIVGTGNAVNLTDGLDGLAIMPTVFVAGGFALVAWATGNMNFASYLHIPYLRHAGELVIVCTAIVGAGLGFLWFNTYPAQVFMGDVGSLALGGALGIIAVLLRQEFLLVIMGGVFVVETLSVILQVGSFKLRGQRIFRMAPIHHHYELKGWPEPRVIVRFWIISLMLVLIGLATLKVR 169 | >NP_414630.1 UDP-N-acetylmuramoyl-L-alanine:D-glutamate ligase 170 | MADYQGKNVVIIGLGLTGLSCVDFFLARGVTPRVMDTRMTPPGLDKLPEAVERHTGSLNDEWLMAADLIVASPGIALAHPSLSAAADAGIEIVGDIELFCREAQAPIVAITGSNGKSTVTTLVGEMAKAAGVNVGVGGNIGLPALMLLDDECELYVLELSSFQLETTSSLQAVAATILNVTEDHMDRYPFGLQQYRAAKLRIYENAKVCVVNADDALTMPIRGADERCVSFGVNMGDYHLNHQQGETWLRVKGEKVLNVKEMKLSGQHNYTNALAALALADAAGLPRASSLKALTTFTGLPHRFEVVLEHNGVRWINDSKATNVGSTEAALNGLHVDGTLHLLLGGDGKSADFSPLARYLNGDNVRLYCFGRDGAQLAALRPEVAEQTETMEQAMRLLAPRVQPGDMVLLSPACASLDQFKNFEQRGNEFARLAKELG 171 | >NP_414631.1 putative lipid II flippase; integral membrane protein; FtsZ ring stabilizer 172 | MRLSLPRLKMPRLPGFSILVWISTALKGWVMGSREKDTDSLIMYDRTLLWLTFGLAAIGFIMVTSASMPIGQRLTNDPFFFAKRDGVYLILAFILAIITLRLPMEFWQRYSATMLLGSIILLMIVLVVGSSVKGASRWIDLGLLRIQPAELTKLSLFCYIANYLVRKGDEVRNNLRGFLKPMGVILVLAVLLLAQPDLGTVVVLFVTTLAMLFLAGAKLWQFIAIIGMGISAVVLLILAEPYRIRRVTAFWNPWEDPFGSGYQLTQSLMAFGRGELWGQGLGNSVQKLEYLPEAHTDFIFAIIGEELGYVGVVLALLMVFFVAFRAMSIGRKALEIDHRFSGFLACSIGIWFSFQALVNVGAAAGMLPTKGLTLPLISYGGSSLLIMSTAIMMLLRIDYETRLEKAQAFVRGSR 173 | >NP_414632.1 N-acetylglucosaminyl transferase 174 | MSGQGKRLMVMAGGTGGHVFPGLAVAHHLMAQGWQVRWLGTADRMEADLVPKHGIEIDFIRISGLRGKGIKALIAAPLRIFNAWRQARAIMKAYKPDVVLGMGGYVSGPGGLAAWSLGIPVVLHEQNGIAGLTNKWLAKIATKVMQAFPGAFPNAEVVGNPVRTDVLALPLPQQRLAGREGPVRVLVVGGSQGARILNQTMPQVAAKLGDSVTIWHQSGKGSQQSVEQAYAEAGQPQHKVTEFIDDMAAAYAWADVVVCRSGALTVSEIAAAGLPALFVPFQHKDRQQYWNALPLEKAGAAKIIEQPQLSVDAVANTLAGWSRETLLTMAERARAASIPDATERVANEVSRVARA 175 | >NP_414633.1 UDP-N-acetylmuramate:L-alanine ligase 176 | MNTQQLAKLRSIVPEMRRVRHIHFVGIGGAGMGGIAEVLANEGYQISGSDLAPNPVTQQLMNLGATIYFNHRPENVRDASVVVVSSAISADNPEIVAAHEARIPVIRRAEMLAELMRFRHGIAIAGTHGKTTTTAMVSSIYAEAGLDPTFVNGGLVKAAGVHARLGHGRYLIAEADESDASFLHLQPMVAIVTNIEADHMDTYQGDFENLKQTFINFLHNLPFYGRAVMCVDDPVIRELLPRVGRQTTTYGFSEDADVRVEDYQQIGPQGHFTLLRQDKEPMRVTLNAPGRHNALNAAAAVAVATEEGIDDEAILRALESFQGTGRRFDFLGEFPLEPVNGKSGTAMLVDDYGHHPTEVDATIKAARAGWPDKNLVMLFQPHRFTRTRDLYDDFANVLTQVDTLLMLEVYPAGEAPIPGADSRSLCRTIRGRGKIDPILVPDPARVAEMLAPVLTGNDLILVQGAGNIGKIARSLAEIKLKPQTPEEEQHD 177 | >NP_414634.1 D-alanine:D-alanine ligase 178 | MTDKIAVLLGGTSAEREVSLNSGAAVLAGLREGGIDAYPVDPKEVDVTQLKSMGFQKVFIALHGRGGEDGTLQGMLELMGLPYTGSGVMASALSMDKLRSKLLWQGAGLPVAPWVALTRAEFEKGLSDKQLAEISALGLPVIVKPSREGSSVGMSKVVAENALQDALRLAFQHDEEVLIEKWLSGPEFTVAILGEEILPSIRIQPSGTFYDYEAKYLSDETQYFCPAGLEASQEANLQALVLKAWTTLGCKGWGRIDVMLDSDGQFYLLEANTSPGMTSHSLVPMAARQAGMSFSQLVVRILELAD 179 | >NP_414635.1 divisome assembly protein, membrane anchored protein involved in growth of wall at septum 180 | MSQAALNTRNSEEEVSSRRNNGTRLAGILFLLTVLTTVLVSGWVVLGWMEDAQRLPLSKLVLTGERHYTRNDDIRQSILALGEPGTFMTQDVNIIQTQIEQRLPWIKQVSVRKQWPDELKIHLVEYVPIARWNDQHMVDAEGNTFSVPPERTSKQVLPMLYGPEGSANEVLQGYREMGQMLAKDRFTLKEAAMTARRSWQLTLNNDIKLNLGRGDTMKRLARFVELYPVLQQQAQTDGKRISYVDLRYDSGAAVGWAPLPPEESTQQQNQAQAEQQ 181 | >NP_414636.1 ATP-binding cell division FtsK recruitment protein 182 | MIKATDRKLVVGLEIGTAKVAALVGEVLPDGMVNIIGVGSCPSRGMDKGGVNDLESVVKCVQRAIDQAELMADCQISSVYLALSGKHISCQNEIGMVPISEEEVTQEDVENVVHTAKSVRVRDEHRVLHVIPQEYAIDYQEGIKNPVGLSGVRMQAKVHLITCHNDMAKNIVKAVERCGLKVDQLIFAGLASSYSVLTEDERELGVCVVDIGGGTMDIAVYTGGALRHTKVIPYAGNVVTSDIAYAFGTPPSDAEAIKVRHGCALGSIVGKDESVEVPSVGGRPPRSLQRQTLAEVIEPRYTELLNLVNEEILQLQEKLRQQGVKHHLAAGIVLTGGAAQIEGLAACAQRVFHTQVRIGAPLNITGLTDYAQEPYYSTAVGLLHYGKESHLNGEAEVEKRVTASVGSWIKRLNSWLRKEF 183 | >NP_414637.1 GTP-binding tubulin-like cell division protein 184 | MFEPMELTNDAVIKVIGVGGGGGNAVEHMVRERIEGVEFFAVNTDAQALRKTAVGQTIQIGSGITKGLGAGANPEVGRNAADEDRDALRAALEGADMVFIAAGMGGGTGTGAAPVVAEVAKDLGILTVAVVTKPFNFEGKKRMAFAEQGITELSKHVDSLITIPNDKLLKVLGRGISLLDAFGAANDVLKGAVQGIAELITRPGLMNVDFADVRTVMSEMGYAMMGSGVASGEDRAEEAAEMAISSPLLEDIDLSGARGVLVNITAGFDLRLDEFETVGNTIRAFASDNATVVIGTSLDPDMNDELRVTVVATGIGMDKRPEITLVTNKQVQQPVMDRYQQHGMAPLTQEQKPVAKVVNDNAPQTAKEPDYLDIPAFLRKQAD 185 | >NP_414638.1 UDP-3-O-acyl N-acetylglucosamine deacetylase 186 | MIKQRTLKRIVQATGVGLHTGKKVTLTLRPAPANTGVIYRRTDLNPPVDFPADAKSVRDTMLCTCLVNEHDVRISTVEHLNAALAGLGIDNIVIEVNAPEIPIMDGSAAPFVYLLLDAGIDELNCAKKFVRIKETVRVEDGDKWAEFKPYNGFSLDFTIDFNHPAIDSSNQRYAMNFSADAFMRQISRARTFGFMRDIEYLQSRGLCLGGSFDCAIVVDDYRVLNEDGLRFEDEFVRHKMLDAIGDLFMCGHNIIGAFTAYKSGHALNNKLLQAVLAKQEAWEYVTFQDDAELPLAFKAPSAVLA 187 | >NP_414639.2 regulator of secA translation 188 | MSGILTRWRQFGKRYFWPHLLLGMVAASLGLPALSNAAEPNAPAKATTRNHEPSAKVNFGQLALLEANTRRPNSNYSVDYWHQHAIRTVIRHLSFAMAPQTLPVAEESLPLQAQHLALLDTLSALLTQEGTPSEKGYRIDYAHFTPQAKFSTPVWISQAQGIRAGPQRLT 189 | >NP_414640.1 preprotein translocase subunit, ATPase 190 | MLIKLLTKVFGSRNDRTLRRMRKVVNIINAMEPEMEKLSDEELKGKTAEFRARLEKGEVLENLIPEAFAVVREASKRVFGMRHFDVQLLGGMVLNERCIAEMRTGEGKTLTATLPAYLNALTGKGVHVVTVNDYLAQRDAENNRPLFEFLGLTVGINLPGMPAPAKREAYAADITYGTNNEYGFDYLRDNMAFSPEERVQRKLHYALVDEVDSILIDEARTPLIISGPAEDSSEMYKRVNKIIPHLIRQEKEDSETFQGEGHFSVDEKSRQVNLTERGLVLIEELLVKEGIMDEGESLYSPANIMLMHHVTAALRAHALFTRDVDYIVKDGEVIIVDEHTGRTMQGRRWSDGLHQAVEAKEGVQIQNENQTLASITFQNYFRLYEKLAGMTGTADTEAFEFSSIYKLDTVVVPTNRPMIRKDLPDLVYMTEAEKIQAIIEDIKERTAKGQPVLVGTISIEKSELVSNELTKAGIKHNVLNAKFHANEAAIVAQAGYPAAVTIATNMAGRGTDIVLGGSWQAEVAALENPTAEQIEKIKADWQVRHDAVLEAGGLHIIGTERHESRRIDNQLRGRSGRQGDAGSSRFYLSMEDALMRIFASDRVSGMMRKLGMKPGEAIEHPWVTKAIANAQRKVESRNFDIRKQLLEYDDVANDQRRAIYSQRNELLDVSDVSETINSIREDVFKATIDAYIPPQSLEEMWDIPGLQERLKNDFDLDLPIAEWLDKEPELHEETLRERILAQSIEVYQRKEEVVGAEMMRHFEKGVMLQTLDSLWKEHLAAMDYLRQGIHLRGYAQKDPKQEYKRESFSMFAAMLESLKYEVISTLSKVQVRMPEEVEELEQQRRMEAERLAQMQQLSHQDDDSAAAAALAAQTGERKVGRNDPCPCGSGKKYKQCHGRLQ 191 | >NP_414641.1 dGTP-preferring nucleoside triphosphate pyrophosphohydrolase 192 | MKKLQIAVGIIRNENNEIFITRRAADAHMANKLEFPGGKIEMGETPEQAVVRELQEEVGITPQHFSLFEKLEYEFPDRHITLWFWLVERWEGEPWGKEGQPGEWMSLVGLNADDFPPANEPVIAKLKRL 193 | >NP_414643.1 DNA gyrase inhibitor 194 | MSETITVNCPTCGKTVVWGEISPFRPFCSKRCQLIDLGEWAAEEKRIPSSGDLSESDDWSEEPKQ 195 | >NP_414644.1 FtsZ stabilizer 196 | MQTQVLFEHPLNEKMRTWLRIEFLIQQLTVNLPIVDHAGALHFFRNVSELLDVFERGEVRTELLKELDRQQRKLQTWIGVPGVDQSRIEALIQQLKAAGSVLISAPRIGQFLREDRLIALVRQRLSIPGGCCSFDLPTLHIWLHLPQAQRDSQVETWIASLNPLTQALTMVLDLIRQSAPFRKQTSLNGFYQDNGGDADLLRLNLSLDSQLYPQISGHKSRFAIRFMPLDTENGQVPERLDFELACC 197 | >NP_414645.1 dephospho-CoA kinase 198 | MRYIVALTGGIGSGKSTVANAFADLGINVIDADIIARQVVEPGAPALHAIADHFGANMIAADGTLQRRALRERIFANPEEKNWLNALLHPLIQQETQHQIQQATSPYVLWVVPLLVENSLYKKANRVLVVDVSPETQLKRTMQRDDVTREHVEQILAAQATREARLAVADDVIDNNGAPDAIASDVARLHAHYLQLASQFVSQEKP 199 | >NP_414646.1 GMP reductase 200 | MRIEEDLKLGFKDVLIRPKRSTLKSRSDVELERQFTFKHSGQSWSGVPIIAANMDTVGTFSMASALASFDILTAVHKHYSVEEWQAFINNSSADVLKHVMVSTGTSDADFEKTKQILDLNPALNFVCIDVANGYSEHFVQFVAKAREAWPTKTICAGNVVTGEMCEELILSGADIVKVGIGPGSVCTTRVKTGVGYPQLSAVIECADAAHGLGGMIVSDGGCTTPGDVAKAFGGGADFVMLGGMLAGHEESGGRIVEENGEKFMLFYGMSSESAMKRHVGGVAEYRAAEGKTVKLPLRGPVENTARDILGGLRSACTYVGASRLKELTKRTTFIRVQEQENRIFNNL 201 | -------------------------------------------------------------------------------- /tests/scripts/test_cogclassifier.py: -------------------------------------------------------------------------------- 1 | import shlex 2 | import subprocess as sp 3 | from pathlib import Path 4 | 5 | 6 | def test_cli(example_fasta_file: Path, tmp_path: Path): 7 | """Test COGclassifier CLI""" 8 | cmd = f"COGclassifier -i {example_fasta_file} -o {tmp_path} --thread_num 1 --evalue 1e-2" # noqa: E501 9 | cmd_args = shlex.split(cmd) 10 | result = sp.run(cmd_args) 11 | assert result.returncode == 0 12 | outfile_names = [ 13 | "rpsblast.tsv", 14 | "cog_count.tsv", 15 | "cog_classify.tsv", 16 | "cog_count_barchart.html", 17 | "cog_count_piechart.html", 18 | "cogclassifier.log", 19 | ] 20 | for outfile_name in outfile_names: 21 | outfile = tmp_path / outfile_name 22 | assert outfile.exists() 23 | -------------------------------------------------------------------------------- /tests/scripts/test_plot_cog_count_barchart.py: -------------------------------------------------------------------------------- 1 | import shlex 2 | import subprocess as sp 3 | from pathlib import Path 4 | 5 | 6 | def test_cli_html(cog_count_file: Path, tmp_path: Path): 7 | """Test plot_cog_count_barchart CLI (HTML format)""" 8 | outfile = tmp_path / "cog_count_barchart.html" 9 | cmd = f"plot_cog_count_barchart -i {cog_count_file} -o {outfile} --width 540 --height 340 --bar_width 15" # noqa: E501 10 | cmd_args = shlex.split(cmd) 11 | result = sp.run(cmd_args) 12 | assert result.returncode == 0 13 | assert outfile.exists() 14 | 15 | 16 | def test_cli_png(cog_count_file: Path, tmp_path: Path): 17 | """Test plot_cog_count_barchart CLI (PNG format)""" 18 | outfile = tmp_path / "cog_count_barchart.png" 19 | cmd = f"plot_cog_count_barchart -i {cog_count_file} -o {outfile} --width 540 --height 340 --bar_width 15" # noqa: E501 20 | cmd_args = shlex.split(cmd) 21 | result = sp.run(cmd_args) 22 | assert result.returncode == 0 23 | assert outfile.exists() 24 | -------------------------------------------------------------------------------- /tests/scripts/test_plot_cog_count_piechart.py: -------------------------------------------------------------------------------- 1 | import shlex 2 | import subprocess as sp 3 | from pathlib import Path 4 | 5 | 6 | def test_cli_html(cog_count_file: Path, tmp_path: Path): 7 | """Test plot_cog_count_piechart CLI (HTML format)""" 8 | outfile = tmp_path / "cog_count_barchart.html" 9 | cmd = f"plot_cog_count_piechart -i {cog_count_file} -o {outfile} --width 380 --height 380 --show_letter --sort" # noqa: E501 10 | cmd_args = shlex.split(cmd) 11 | result = sp.run(cmd_args) 12 | assert result.returncode == 0 13 | assert outfile.exists() 14 | 15 | 16 | def test_cli_png(cog_count_file: Path, tmp_path: Path): 17 | """Test plot_cog_count_piechart CLI (PNG format)""" 18 | outfile = tmp_path / "cog_count_barchart.png" 19 | cmd = f"plot_cog_count_piechart -i {cog_count_file} -o {outfile} --width 380 --height 380 --show_letter --sort" # noqa: E501 20 | cmd_args = shlex.split(cmd) 21 | result = sp.run(cmd_args) 22 | assert result.returncode == 0 23 | assert outfile.exists() 24 | -------------------------------------------------------------------------------- /tests/test_cog.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | from cogclassifier import const 4 | from cogclassifier.cog import CogDefinitionRecord, CogFuncCategoryRecord 5 | 6 | 7 | class TestCogFuncCategoryRecord: 8 | def test_len_method(self): 9 | """Test dunder len method""" 10 | cog_fc_rec = CogFuncCategoryRecord(const.COG_FUNC_CATEGORY_FILE) 11 | assert len(cog_fc_rec) == 26 12 | 13 | def test_str_method(self): 14 | """Test dunder str method""" 15 | cog_fc_rec = CogFuncCategoryRecord(const.COG_FUNC_CATEGORY_FILE) 16 | with open(const.COG_FUNC_CATEGORY_FILE) as f: 17 | expected_str = f.read() 18 | assert str(cog_fc_rec) == expected_str 19 | 20 | 21 | class TestCogDefinitionRecord: 22 | def test_len_method(self): 23 | """Test dunder len method""" 24 | cog_def_rec = CogDefinitionRecord(const.COG_DEFINITION_FILE) 25 | assert len(cog_def_rec) == 5050 26 | 27 | def test_str_method(self): 28 | """Test dunder str method 29 | 30 | The officially provided COG Definition files do not always have 31 | a constant number of tabs at the end of each line. 32 | This test checks if the result is the same after removing trailing tabs. 33 | """ 34 | 35 | def _remove_trailing_tabs(text: str) -> str: 36 | return re.sub(r"\t+$", "", text, flags=re.MULTILINE) 37 | 38 | cog_def_rec = CogDefinitionRecord(const.COG_DEFINITION_FILE) 39 | with open(const.COG_DEFINITION_FILE) as f: 40 | expected_str = _remove_trailing_tabs(f.read()) 41 | assert _remove_trailing_tabs(str(cog_def_rec)) == expected_str 42 | --------------------------------------------------------------------------------