├── .gitignore
├── LICENSE.txt
├── README.md
├── conda_env.yml
├── docs
├── README.md
├── developing_domainator.md
├── esm_3b_foldseek.md
├── examples.md
├── file_formats.md
├── limitations_and_FAQ.md
└── media
│ ├── Domainator_logo.svg
│ ├── Overview_diagram.png
│ ├── detective_Domainic_transparent_background.png
│ ├── genome_mining_workflow.png
│ └── hmm_profile_comparison.png
├── domainator.def
├── domainator_esmologs.def
├── package.sh
├── pyproject.toml
├── src
└── domainator
│ ├── Bio
│ ├── Data
│ │ ├── CodonTable.py
│ │ ├── IUPACData.py
│ │ ├── SCOPData.py
│ │ └── __init__.py
│ ├── File.py
│ ├── GenBank
│ │ ├── Scanner.py
│ │ ├── __init__.py
│ │ └── utils.py
│ ├── LICENSE.rst
│ ├── Seq.py
│ ├── SeqFeature.py
│ ├── SeqIO
│ │ ├── FastaIO.py
│ │ ├── InsdcIO.py
│ │ ├── InsdcIO_new.py
│ │ ├── Interfaces.py
│ │ ├── SwissIO.py
│ │ ├── TabIO.py
│ │ └── __init__.py
│ ├── SeqRecord.py
│ ├── SwissProt
│ │ └── __init__.py
│ └── __init__.py
│ ├── Taxonomy
│ └── __init__.py
│ ├── __init__.py
│ ├── build_projection.py
│ ├── build_ssn.py
│ ├── build_tree.py
│ ├── color_genbank.py
│ ├── color_table_to_legend.py
│ ├── compare_contigs.py
│ ├── cytoscape.py
│ ├── data_matrix.py
│ ├── deduplicate_genbank.py
│ ├── domain_search.py
│ ├── domainate.py
│ ├── domainator_db_download.py
│ ├── enum_report.py
│ ├── extract_domains.py
│ ├── extract_peptides.py
│ ├── extract_unannotated.py
│ ├── filter_domains.py
│ ├── foldseek.py
│ ├── genbank_to_fasta.py
│ ├── hmmer_build.py
│ ├── hmmer_compare.py
│ ├── hmmer_report.py
│ ├── hmmer_search.py
│ ├── hmmer_select.py
│ ├── matrix_report.py
│ ├── partition_seqfile.py
│ ├── partition_seqids.py
│ ├── plot_contigs.py
│ ├── select_by_cds.py
│ ├── select_by_contig.py
│ ├── seq_dist.py
│ ├── summary_report.py
│ ├── transform_matrix.py
│ ├── trim_contigs.py
│ └── utils.py
└── test
├── data
├── 206.gb
├── CcdB.hmm
├── CcdB.hmm.h3f
├── CcdB.hmm.h3i
├── CcdB.hmm.h3m
├── CcdB.hmm.h3p
├── CuSOD_enum_report_test.gb
├── FeSOD_20.fasta
├── FeSOD_20.gb
├── FeSOD_20_pfam.gb
├── FeSOD_dist.dense.hdf5
├── FeSOD_dist.sparse.hdf5
├── FeSOD_dist.tsv
├── FeSOD_metadata.tsv
├── FeSOD_pfam.hmm
├── FeSOD_score_dist.newick
├── FeSOD_score_dist.tsv
├── FeSOD_score_dist.xgmml
├── JABFVH010000506_extraction.gb
├── MT_nbs.enum_report.tsv
├── MT_nbs.gb
├── Peptidase_M28.hmm
├── Peptidase_M28.hmm.h3f
├── Peptidase_M28.hmm.h3i
├── Peptidase_M28.hmm.h3m
├── Peptidase_M28.hmm.h3p
├── Polymorphism_feature.gb
├── SPR.hmm
├── Staph_phages.gb
├── bacillus_phage_SPR.gb
├── bacillus_phage_SPR_with_annotations.gb
├── bacillus_phage_SPR_with_annotations_reversed.gb
├── bin3.sparse.hdf5
├── bin3.sparse.tsv
├── bin3.tsv
├── ccdb.gb
├── color_domain_search_test.gb
├── color_specification.tsv
├── color_table_123.tsv
├── domain_search_test_out1.gb
├── domain_search_test_out2.gb
├── domain_search_test_out3.gb
├── domain_search_translate_out.gb
├── empty.gb
├── enum_report_html_max_size_out.html
├── enum_report_html_out.html
├── enum_report_html_out_quote_escape.html
├── extract_peptides_test_1_out.gb
├── extract_peptides_test_2.gb
├── foldseek
│ ├── FeSOD
│ ├── FeSOD.dbtype
│ ├── FeSOD.index
│ ├── FeSOD.lookup
│ ├── FeSOD_20.3di.fasta
│ ├── FeSOD_20.fasta
│ ├── FeSOD_h
│ ├── FeSOD_h.dbtype
│ ├── FeSOD_h.index
│ ├── FeSOD_ss
│ ├── FeSOD_ss.dbtype
│ └── FeSOD_ss.index
├── metadata_FeSOD_20.tsv
├── pDONR201.fasta
├── pDONR201.gb
├── pDONR201_domainator_circular.gb
├── pDONR201_empty.gb
├── pDONR201_genemark.gb
├── pDONR201_genemark.gff
├── pDONR201_multi.fasta
├── pDONR201_multi_genemark.gb
├── pDONR201_multi_genemark.gff
├── pDONR201_multi_genemark_clipped_domainator.gb
├── pDONR201_multi_genemark_domainator.gb
├── pDONR201_multi_genemark_domainator_multi_hmm.gb
├── pDONR201_multi_genemark_domainator_multi_hmm_2.gb
├── pDONR201_multi_subset.txt
├── pDONR201_multigenemark_partition.gb
├── pDONR201_no_CDSs.gb
├── pDONR201_partly_CDSs.gb
├── pDONR201_pseudo.gb
├── pDONR_201_domain_search.gb
├── pDONR_201_domain_search_long_annotations.gb
├── pDONR_201_domainator.gb
├── pDONR_201_domainator_domain_reorder.gb
├── pDONR_201_hmm_scores.tsv
├── pdonr_hmms.hmm
├── pdonr_hmms_1.hmm
├── pdonr_hmms_2.hmm
├── pdonr_peptides.fasta
├── saccharomyces_defense_finder.hmm
├── saccharomyces_extraction.gb
├── saccharomyces_extraction_circular.gb
├── score3.sparse.tsv
├── score4.sparse.tsv
├── scorefull.dense.hdf5
├── scorefull.tsv
├── simple_genpept.gb
├── simple_genpept_contigs.txt
├── simple_genpept_equals_second_line.gb
├── simple_genpept_quote_name.gb
├── ssn_FeSOD.sparse.xgmml
├── ssn_FeSOD.xgmml
├── ssn_FeSOD_clusters.tsv
├── ssn_FeSOD_clusters_header.tsv
├── swissprot_CuSOD_subset.fasta
├── taxdmp
│ ├── delnodes.dmp
│ ├── merged.dmp
│ ├── names.dmp
│ ├── nodes.dmp
│ └── taxdump.tar.gz
├── test_matrix.dense.hdf5
├── test_matrix.dense.tsv
├── test_matrix.sparse.hdf5
└── thymidylate_synthase.fasta
├── helpers.py
├── test_SeqFeature.py
├── test_build_projection.py
├── test_build_ssn.py
├── test_build_tree.py
├── test_color_genbank.py
├── test_color_table_to_legend.py
├── test_compare_contigs.py
├── test_data_matrix.py
├── test_deduplicate_genbank.py
├── test_domain_search.py
├── test_domainate.py
├── test_domainator_db_download.py
├── test_enum_report.py
├── test_extract_domains.py
├── test_extract_peptides.py
├── test_extract_unannotated.py
├── test_filter_domains.py
├── test_hmmer_build.py
├── test_hmmer_compare.py
├── test_hmmer_report.py
├── test_hmmer_search.py
├── test_hmmer_select.py
├── test_matrix_report.py
├── test_ncbi_taxonomy.py
├── test_partition_seqfile.py
├── test_partition_seqids.py
├── test_plot_contigs.py
├── test_select_by_cds.py
├── test_select_by_contig.py
├── test_seq_dist.py
├── test_summary_report.py
├── test_transform_matrix.py
├── test_trim_contigs.py
└── test_utils.py
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | pip-wheel-metadata/
24 | share/python-wheels/
25 | *.egg-info/
26 | .installed.cfg
27 | *.egg
28 | MANIFEST
29 |
30 | # PyInstaller
31 | # Usually these files are written by a python script from a template
32 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
33 | *.manifest
34 | *.spec
35 |
36 | # Installer logs
37 | pip-log.txt
38 | pip-delete-this-directory.txt
39 |
40 | # Unit test / coverage reports
41 | htmlcov/
42 | .tox/
43 | .nox/
44 | .coverage
45 | .coverage.*
46 | .cache
47 | nosetests.xml
48 | coverage.xml
49 | *.cover
50 | *.py,cover
51 | .hypothesis/
52 | .pytest_cache/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | target/
76 |
77 | # Jupyter Notebook
78 | .ipynb_checkpoints
79 |
80 | # IPython
81 | profile_default/
82 | ipython_config.py
83 |
84 | # pyenv
85 | .python-version
86 |
87 | # pipenv
88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
91 | # install all needed dependencies.
92 | #Pipfile.lock
93 |
94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
95 | __pypackages__/
96 |
97 | # Celery stuff
98 | celerybeat-schedule
99 | celerybeat.pid
100 |
101 | # SageMath parsed files
102 | *.sage.py
103 |
104 | # Environments
105 | .env
106 | .venv
107 | env/
108 | venv/
109 | ENV/
110 | env.bak/
111 | venv.bak/
112 |
113 | # Spyder project settings
114 | .spyderproject
115 | .spyproject
116 |
117 | # Rope project settings
118 | .ropeproject
119 |
120 | # mkdocs documentation
121 | /site
122 |
123 | # mypy
124 | .mypy_cache/
125 | .dmypy.json
126 | dmypy.json
127 |
128 | # Pyre type checker
129 | .pyre/
130 |
131 |
132 | #other
133 | tmp/
134 |
135 | pfam/
136 |
137 | test_out/
138 | nextflow/conda
139 | .nextflow*
140 | work
141 | output
142 |
143 | *.code-workspace
144 | *.zip
145 | *.pdf
146 | src/domainator/_lib/*
147 |
148 | .vscode/*
149 |
150 | nextflow/*
151 |
152 | *.sif
--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
1 | Code derived from Biopython (found in src/domainator/Bio) is released under the "Biopython License Agreement" (given in full below). Unless stated otherwise in individual file headers, all Biopython's files are under the "Biopython License Agreement".
2 |
3 | Some files are explicitly dual licensed under your choice of the "Biopython License Agreement" or the "BSD 3-Clause License" (both given in full below). This is with the intention of later offering all of Biopython under this dual licensing approach.
4 |
5 | All other code (that is, code not in the "src/domainator/Bio" directory) is Licensed under the MIT License agreement (given in full below).
6 |
7 | MIT License
8 |
9 | Copyright (c) 2023 Sean R. Johnson
10 |
11 | Permission is hereby granted, free of charge, to any person obtaining a copy
12 | of this software and associated documentation files (the "Software"), to deal
13 | in the Software without restriction, including without limitation the rights
14 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
15 | copies of the Software, and to permit persons to whom the Software is
16 | furnished to do so, subject to the following conditions:
17 |
18 | The above copyright notice and this permission notice shall be included in all
19 | copies or substantial portions of the Software.
20 |
21 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
22 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
23 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
24 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
25 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
26 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
27 | SOFTWARE.
28 |
29 |
30 | Biopython License Agreement
31 | Permission to use, copy, modify, and distribute this software and its documentation with or without modifications and for any purpose and without fee is hereby granted, provided that any copyright notices appear in all copies and that both those copyright notices and this permission notice appear in supporting documentation, and that the names of the contributors or copyright holders not be used in advertising or publicity pertaining to distribution of the software without specific prior permission.
32 |
33 | THE CONTRIBUTORS AND COPYRIGHT HOLDERS OF THIS SOFTWARE DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
34 |
35 | BSD 3-Clause License
36 | Copyright (c) 1999-2023, The Biopython Contributors All rights reserved.
37 |
38 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
39 |
40 | Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
41 | Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
42 | Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
43 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
44 |
--------------------------------------------------------------------------------
/conda_env.yml:
--------------------------------------------------------------------------------
1 | name: domainator
2 | channels:
3 | - conda-forge
4 | - bioconda
5 | dependencies:
6 | - python>=3.10
7 | - pip>=23.3
8 | - setuptools>=61.0.0
9 | - coverage~=6.3.2
10 | - cd-hit~=4.8.1
11 | - diamond>=2.0.0
12 | - hmmer~=3.3.2
13 | - usearch
14 | - pip:
15 | - -e .[test]
16 |
--------------------------------------------------------------------------------
/docs/README.md:
--------------------------------------------------------------------------------
1 | # Documentation
2 | - [Home](../README.md)
3 | - [File Formats](file_formats.md)
4 | - [Examples](examples.md)
5 | - [Developing Domainator](developing_domainator.md)
6 | - [ESM-2 3B 3Di and foldseek integration](esm_3b_foldseek.md)
7 | - [Limitations and FAQ](limitations_and_FAQ.md)
8 |
--------------------------------------------------------------------------------
/docs/developing_domainator.md:
--------------------------------------------------------------------------------
1 | [index](README.md)
2 | # Developing Domainator
3 |
4 | ## testing
5 |
6 | run all tests
7 | ```
8 | pytest test
9 | ```
10 |
11 | examine test coverage
12 | ```
13 | coverage run -m pytest test
14 | coverage report -m
15 | coverage html
16 | ```
17 | then open htmlcov/index.html in a browser to see coverage
18 |
19 |
20 | ## Converting the manual into a pdf
21 | on Ubuntu
22 |
23 | Install pandoc
24 | ```
25 | sudo apt-get install pandoc texlive-latex-base texlive-fonts-recommended texlive-extra-utils texlive-latex-extra librsvg2-bin
26 | ```
27 |
28 | run pandoc
29 | ```
30 | pandoc -V geometry:margin=0.5in -V geometry:paperwidth=13.5in README.md -o README.pdf
31 | ```
32 |
33 | ## Projects that domainator depends on
34 |
35 | - hmmer3
36 | - usearch
37 | - python3
38 | - biopython
39 | - pytest
40 | - pytest-datadir
41 | - pandas
42 | - seaborn
43 | - cd-hit
44 | - scipy
45 | - pyhmmer
46 | - umap-learn
47 | - diamond
48 | - coverage
49 |
--------------------------------------------------------------------------------
/docs/esm_3b_foldseek.md:
--------------------------------------------------------------------------------
1 | Domainator can support sequence searches at high sensitivity across deep evolutionary distances by leveraging the ESM-2 3B 3Di model, described in the manuscript:
2 | [https://www.biorxiv.org/content/10.1101/2023.07.26.550718v1](https://www.biorxiv.org/content/10.1101/2023.07.26.550718v1)
3 |
4 | # Installation
5 | In addition to Domainator, the esmologs package ([https://github.com/seanrjohnson/esmologs](https://github.com/seanrjohnson/esmologs)) and pytorch with CUDA support must be installed. The easiest way to accomplish this is to create a new conda environment with esmologs, and install domainator into that environment.
6 |
7 | You also need to download the ESM-2 3B 3Di fine tuning checkpoint, from [https://zenodo.org/record/8174960](https://zenodo.org/record/8174960)
8 |
9 | ## download the checkpoint
10 |
11 | ```bash
12 | wget https://zenodo.org/record/8174960/files/ESM-2_3B_3Di.pt
13 | ```
14 |
15 | ## Install via conda
16 |
17 | ```bash
18 | git clone https://github.com/seanrjohnson/esmologs.git
19 | cd esmologs
20 |
21 | conda env create --name domainator_esmologs --file conda_env.yml
22 |
23 | cd ..
24 | git clone https://github.com/nebiolabs/domainator.git
25 | cd domainator
26 | conda env update --name domainator_esmologs --file conda_env.yml
27 |
28 | conda activate domainator_esmologs
29 | pytest test
30 | cd ..
31 | ```
32 |
33 | ## install via Apptainer/Singularity
34 |
35 | ```bash
36 | git clone https://github.com/nebiolabs/domainator.git
37 | cd domainator
38 |
39 | apptainer build domainator_esmologs.sif domainator_esmologs.def
40 |
41 | # if using wsl, you need to use --nvccli. In other linux, --nv also works. These flags make the GPU visible to the container.
42 | apptainer shell --nvccli domainator_esmologs.sif
43 |
44 | # in the apptainer shell
45 | cd /opt/domainator
46 | pytest test
47 | exit # or ctrl + d
48 | ```
49 |
50 | # Using ESM-2 3B 3Di with domainate.py
51 |
52 | In this workflow, we first create a reference Foldseek 3Di database, and then use Domainator to annotate contigs using that database as the reference.
53 |
54 | (Note that domain_search.py with ESM-2 3B 3Di is not yet supported)
55 |
56 | ## conda
57 | ```bash
58 | conda activate domainator_esmologs
59 |
60 | # convert a reference file to 3di
61 | predict_from_ESM2_to_3Di.py -i domainator/test/data/foldseek/FeSOD_20.fasta -o FeSOD_20.3di.fasta --weights ESM-2_3B_3Di.pt --device cuda:0
62 |
63 | # convert the amino acid and 3di fasta files into a foldseek database
64 | fasta2foldseek.py --aa domainator/test/data/foldseek/FeSOD_20.fasta --tdi FeSOD_20.3di.fasta -o FeSOD
65 |
66 | # run domainate.py with the foldseek reference database. In this example, our query is the same file we used to make the database, but it could be any fasta or genbank file.
67 | domainate.py -i domainator/test/data/foldseek/FeSOD_20.fasta -o FeSOD_all_to_all_3Di.gb --foldseek FeSOD --esm2_3Di_weights ESM-2_3B_3Di.pt --esm2_3Di_device cuda:0
68 | ```
69 |
70 | ## Apptainer/Singularity
71 |
72 | ```bash
73 |
74 | # convert a reference file to 3di
75 | apptainer exec --nv domainator/domainator_esmologs.sif predict_from_ESM2_to_3Di.py -i domainator/test/data/foldseek/FeSOD_20.fasta -o FeSOD_20.3di.fasta --weights ESM-2_3B_3Di.pt --device cuda:0
76 |
77 | # convert the amino acid and 3di fasta files into a foldseek database
78 | apptainer exec domainator/domainator_esmologs.sif fasta2foldseek.py --aa domainator/test/data/foldseek/FeSOD_20.fasta --tdi FeSOD_20.3di.fasta -o FeSOD
79 |
80 | # run domainate.py with the foldseek reference database. In this example, our query is the same file we used to make the database, but it could be any fasta or genbank file.
81 | apptainer exec --nv domainator/domainator_esmologs.sif domainate.py -i domainator/test/data/foldseek/FeSOD_20.fasta -o FeSOD_all_to_all_3Di.gb --foldseek FeSOD --esm2_3Di_weights ESM-2_3B_3Di.pt --esm2_3Di_device cuda:0
82 | ```
83 |
84 |
--------------------------------------------------------------------------------
/docs/limitations_and_FAQ.md:
--------------------------------------------------------------------------------
1 | # Limitations
2 | ## Large reference databases
3 | Domainator currently loads reference databases into memory, so it is not suitable for large reference databases. For example, it is suitable for annotating using Pfam as a reference with ~30,000 profiles, but not NCBI nr with millions of sequences.
4 |
5 | ## Contigs vs genomes
6 | A major limitation of Domainator is that it operates on the contig level, not the genome level. So for example, when reporting taxonomy, fragmented genome assemblies will be counted multiple times, once for each contig in the assembly.
7 |
8 | ## Scores and E-values
9 | Domainator uses scores and E-values somewhat inconsistently. Some programs allow filtering by E-value, others allow filtering by local alignment scores. It would be nice to be more consistent about that. Also, Domainator E-values are typically not adjusted by database size; Z = 1000 by default in most cases. One possible solution could be to stop using E-values and scores altogether, and do most operations in the space of EFI scores.
10 |
11 | ## plot_contigs.py handling of large contigs and large lists of contigs
12 |
13 | plot_contigs.py output looks best when the contigs are of sizes in the range of kb to 10s of kb, and when there are fewer than about 300 of them.
14 |
15 | For whole genomes, [genome_notebook](https://github.com/dbikard/genomenotebook) might work better.
16 |
17 | [Geneious Prime](https://www.geneious.com/) works very well for visualizing Domainator annotations, but is commercial software.
18 |
19 | We welcome any other recommendations for contig visualization software.
20 |
21 | # Frequently asked questions
22 |
23 | 
24 | ## What's with all the otters?
25 | The Domainator mascot is Domainic, the Domain-otter. Domainic is an American river otter. Besides the irresistible pun and cuteness, the choice of an otter mascot was inspired by the river otters that live in the pond on the New England Biolabs Ipswich campus, where most of Domainator was written. You may see Domainic throughout our documentation, donning his detective gear as he investigates new proteins and genome neighborhoods. The otters holding hands as the M in our logo are a metaphor for a multidomain protein. Similar to the domains of many two-domain proteins, the individual otters are connected by a flexible linker, and can function independently when separated, but are happiest when they are together. While river otters are not known to hold hands (it's sea otters that exhibit that behavior), we hope you'll forgive our creative license in incorporating hand-holding river otters into our logo.
26 | ## Can Domainator handle eukaryotic genomes with introns in their CDSs?
27 | Yes! Domainator can add domain annotations across introns and process those files just the same as files without introns. Input files must be in GenBank format, and the gene must be annotated as a CDS with a complex location, for example `join(11356..11374,11523..17083)`. One potential complication is that eukaryotic genome annotations often include multiple gene models for the same gene. Domainator considers these to be distinct CDSs, so tools like `domain_search.py` and `select_by_cds.py` may behave in unexpected ways, extracting redundant hits, or smaller neighborhoods than expected. If possible, select non-redundant representative gene models and delete others from the GenBank file before using it as input to Domainator. If there is enough demand, we will consider adding automated tools to Domainator for better handling of alternative gene models.
28 |
--------------------------------------------------------------------------------
/docs/media/Overview_diagram.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nebiolabs/domainator/663454806768f7462b33ade914684c6b3487dab2/docs/media/Overview_diagram.png
--------------------------------------------------------------------------------
/docs/media/detective_Domainic_transparent_background.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nebiolabs/domainator/663454806768f7462b33ade914684c6b3487dab2/docs/media/detective_Domainic_transparent_background.png
--------------------------------------------------------------------------------
/docs/media/genome_mining_workflow.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nebiolabs/domainator/663454806768f7462b33ade914684c6b3487dab2/docs/media/genome_mining_workflow.png
--------------------------------------------------------------------------------
/docs/media/hmm_profile_comparison.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nebiolabs/domainator/663454806768f7462b33ade914684c6b3487dab2/docs/media/hmm_profile_comparison.png
--------------------------------------------------------------------------------
/domainator.def:
--------------------------------------------------------------------------------
1 | Bootstrap: docker
2 | From: condaforge/miniforge3:24.9.2-0
3 |
4 | %files
5 | ./src /opt/domainator/src
6 | ./conda_env.yml /opt/domainator
7 | ./test /opt/domainator/test
8 | ./pyproject.toml /opt/domainator
9 |
10 | %post
11 | cd /opt/domainator
12 | conda env update --name base --file conda_env.yml
13 |
--------------------------------------------------------------------------------
/domainator_esmologs.def:
--------------------------------------------------------------------------------
1 | Bootstrap: docker
2 | From: continuumio/miniconda3:23.10.0-1
3 |
4 | %files
5 | ./src /opt/domainator/src
6 | ./conda_env.yml /opt/domainator
7 | ./test /opt/domainator/test
8 | ./pyproject.toml /opt/domainator
9 |
10 |
11 | %post
12 |
13 | git clone https://github.com/seanrjohnson/esmologs.git
14 | cd esmologs
15 |
16 | conda env update --name base --file conda_env.yml
17 |
18 | cd /opt/domainator
19 | conda env update --name base --file conda_env.yml
20 |
--------------------------------------------------------------------------------
/package.sh:
--------------------------------------------------------------------------------
1 | pandoc -V geometry:margin=0.5in -V geometry:paperwidth=13.5in README.md -o README.pdf
2 | zip -r domainator.zip README.md README.pdf pyproject.toml conda_env.yml test src -x "src/domainator.egg-info/*" "src/domainator/__pycache__/*" "test/__pycache__/*" "test/.ipynb_checkpoints/*" "test/data/.ipynb_checkpoints/*" "src/Bio/__pycache__/*"
3 |
4 | # maybe create a directory in the zip?
5 | # maybe instead of having all of those exclusions, make a temporary directory, copy in everything important, zip it, and delete it.
6 |
7 |
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = ["setuptools>=61.0.0", "wheel"]
3 | build-backend = "setuptools.build_meta"
4 |
5 | [project]
6 | name="domainator"
7 | authors = [
8 | {name = "Sean Johnson", email = "sjohnson@neb.com"},
9 | {name = "Andrew Ge"},
10 | {name = "Yu-Cheng Lin"},
11 | {name = "Zhiyi Sun"}
12 | ]
13 | description="A flexible and modular software suite for domain-based gene neighborhood and protein search, extraction, and clustering."
14 | readme = "README.md"
15 | requires-python = ">=3.9"
16 |
17 | classifiers=[
18 | "Programming Language :: Python :: 3",
19 | ]
20 | license = {file = "LICENSE.txt"}
21 | dynamic = ["version"]
22 | dependencies = [
23 | "pandas >=2.1.0",
24 | "seaborn >=0.13.0",
25 | "scipy >=1.11.2",
26 | "pyhmmer >=0.10.2 ",
27 | "umap-learn >=0.5.4",
28 | "h5py >=3.9.0",
29 | "jsonargparse >=4.18.0",
30 | "psutil >=5.9.6",
31 | "tqdm >=4.65.0",
32 | "pyrodigal >=3.0.1",
33 | "bashplotlib >=0.6.5",
34 | "requests >=2.31.0"
35 | ]
36 |
37 | [project.optional-dependencies]
38 | test = ["pytest >=7.4.2",
39 | "pytest-datadir~=1.4.1"]
40 | # add dependency for Foldseek/ESM2 related stuff.
41 |
42 | [project.urls]
43 | Bug_Tracker = "https://github.com/nebiolabs/domainator/issues"
44 |
45 |
46 | [tool.setuptools]
47 | package-dir = {"" = "src"}
48 |
49 |
50 | [tool.setuptools.dynamic]
51 | version = {attr = "domainator.__version__"}
52 |
53 | [project.scripts]
54 | "build_projection.py" = "domainator:build_projection._entrypoint"
55 | "build_ssn.py" = "domainator:build_ssn._entrypoint"
56 | "build_tree.py" = "domainator:build_tree._entrypoint"
57 | "color_genbank.py" = "domainator:color_genbank._entrypoint"
58 | "color_table_to_legend.py" = "domainator:color_table_to_legend._entrypoint"
59 | "compare_contigs.py" = "domainator:compare_contigs._entrypoint"
60 | "deduplicate_genbank.py" = "domainator:deduplicate_genbank._entrypoint"
61 | "domain_search.py" = "domainator:domain_search._entrypoint"
62 | "domainate.py" = "domainator:domainate._entrypoint"
63 | "domainator_db_download.py" = "domainator:domainator_db_download._entrypoint"
64 | "enum_report.py" = "domainator:enum_report._entrypoint"
65 | "extract_domains.py" = "domainator:extract_domains._entrypoint"
66 | "extract_peptides.py" = "domainator:extract_peptides._entrypoint"
67 | "extract_unannotated.py" = "domainator:extract_unannotated._entrypoint"
68 | "filter_domains.py" = "domainator:filter_domains._entrypoint"
69 | "genbank_to_fasta.py" = "domainator:genbank_to_fasta._entrypoint"
70 | "hmmer_build.py" = "domainator:hmmer_build._entrypoint"
71 | "hmmer_compare.py" = "domainator:hmmer_compare._entrypoint"
72 | "hmmer_report.py" = "domainator:hmmer_report._entrypoint"
73 | "hmmer_search.py" = "domainator:hmmer_search._entrypoint"
74 | "hmmer_select.py" = "domainator:hmmer_select._entrypoint"
75 | "matrix_report.py" = "domainator:matrix_report._entrypoint"
76 | "partition_seqfile.py" = "domainator:partition_seqfile._entrypoint"
77 | "select_by_cds.py" = "domainator:select_by_cds._entrypoint"
78 | "select_by_contig.py" = "domainator:select_by_contig._entrypoint"
79 | "seq_dist.py" = "domainator:seq_dist._entrypoint"
80 | "summary_report.py" = "domainator:summary_report._entrypoint"
81 | "transform_matrix.py" = "domainator:transform_matrix._entrypoint"
82 | "plot_contigs.py" = "domainator:plot_contigs._entrypoint"
83 | "trim_contigs.py" = "domainator:trim_contigs._entrypoint"
--------------------------------------------------------------------------------
/src/domainator/Bio/Data/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright 2000 Andrew Dalke. All rights reserved.
2 | #
3 | # This file is part of the Biopython distribution and governed by your
4 | # choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
5 | # Please see the LICENSE file that should have been included as part of this
6 | # package.
7 |
8 | """Collections of various bits of useful biological data."""
9 |
--------------------------------------------------------------------------------
/src/domainator/Bio/GenBank/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright 2000 by Jeffrey Chang, Brad Chapman. All rights reserved.
2 | # Copyright 2006-2017 by Peter Cock. All rights reserved.
3 | #
4 | # This code is part of the Biopython distribution and governed by its
5 | # license. Please see the LICENSE file that should have been included
6 | # as part of this package.
7 |
8 | from domainator.Bio.GenBank.utils import *
9 |
10 | # if __name__ == "__main__":
11 | # from Bio._utils import run_doctest
12 |
13 | # run_doctest()
14 |
--------------------------------------------------------------------------------
/src/domainator/Bio/LICENSE.rst:
--------------------------------------------------------------------------------
1 | Files in the Bio directory are modified from the Biopython project. The original files are licensed under the Biopython License Agreement and the BSD 3-Clause License. The original files are available at: https://github.com/biopython/biopython
2 |
3 | Files in this directory are licensed as follows:
4 |
5 | Biopython is currently released under the "Biopython License Agreement" (given in full below). Unless stated otherwise in individual file headers, all Biopython's files are under the "Biopython License Agreement".
6 |
7 | Some files are explicitly dual licensed under your choice of the "Biopython License Agreement" or the "BSD 3-Clause License" (both given in full below). This is with the intention of later offering all of Biopython under this dual licensing approach.
8 |
9 | Biopython License Agreement
10 | Permission to use, copy, modify, and distribute this software and its documentation with or without modifications and for any purpose and without fee is hereby granted, provided that any copyright notices appear in all copies and that both those copyright notices and this permission notice appear in supporting documentation, and that the names of the contributors or copyright holders not be used in advertising or publicity pertaining to distribution of the software without specific prior permission.
11 |
12 | THE CONTRIBUTORS AND COPYRIGHT HOLDERS OF THIS SOFTWARE DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
13 |
14 | BSD 3-Clause License
15 | Copyright (c) 1999-2023, The Biopython Contributors All rights reserved.
16 |
17 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
18 |
19 | Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
20 | Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
21 | Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
22 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--------------------------------------------------------------------------------
/src/domainator/Bio/SeqIO/SwissIO.py:
--------------------------------------------------------------------------------
1 | # Copyright 2006-2013,2020 by Peter Cock.
2 | # Revisions copyright 2008-2009 by Michiel de Hoon.
3 | # All rights reserved.
4 | #
5 | # This file is part of the Biopython distribution and governed by your
6 | # choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
7 | # Please see the LICENSE file that should have been included as part of this
8 | # package.
9 | """Bio.SeqIO support for the "swiss" (aka SwissProt/UniProt) file format.
10 | You are expected to use this module via the Bio.SeqIO functions.
11 | See also the Bio.SwissProt module which offers more than just accessing
12 | the sequences as SeqRecord objects.
13 | See also Bio.SeqIO.UniprotIO.py which supports the "uniprot-xml" format.
14 | """
15 | from domainator.Bio import SeqFeature
16 | from domainator.Bio import SwissProt
17 | from domainator.Bio.Seq import Seq
18 | from domainator.Bio.SeqRecord import SeqRecord
19 |
20 |
def SwissIterator(source):
    """Yield one SeqRecord per entry of a Swiss-Prot/UniProt flat file.

    Argument source is a file-like object or a path to a file; it is handed
    to SwissProt.parse(). Each parsed entry (from its ID line to the
    terminating //) is converted into a single SeqRecord carrying the
    entry's annotations and features.

    This covers the flat file "swiss" format as used by:
     - Swiss-Prot aka SwissProt
     - TrEMBL
     - UniProtKB aka UniProt Knowledgebase

    The format is called "swiss" for consistency with BioPerl and EMBOSS.
    You are expected to reach this parser through
    Bio.SeqIO.parse(..., format="swiss") rather than calling it directly.
    """
    for entry in SwissProt.parse(source):
        # Core SeqRecord: the first accession serves as the record id.
        seq_record = SeqRecord(
            Seq(entry.sequence),
            id=entry.accessions[0],
            name=entry.entry_name,
            description=entry.description,
            features=entry.features,
        )

        # Cross references become "DATABASE:ACCESSION" strings, de-duplicated.
        for xref in entry.cross_references:
            if len(xref) >= 2:
                database, accession = xref[:2]
                tag = f"{database}:{accession}"
                if tag not in seq_record.dbxrefs:
                    seq_record.dbxrefs.append(tag)

        info = seq_record.annotations
        info["molecule_type"] = "protein"
        info["accessions"] = entry.accessions
        if entry.protein_existence:
            info["protein_existence"] = entry.protein_existence

        # Each stamp is a (date, version) pair. A later sequence update
        # overwrites the "sequence_version" taken from the creation stamp.
        stamps = (
            (entry.created, "date", "sequence_version"),
            (entry.sequence_update, "date_last_sequence_update", "sequence_version"),
            (entry.annotation_update, "date_last_annotation_update", "entry_version"),
        )
        for stamp, date_key, version_key in stamps:
            if stamp:
                date, version = stamp
                info[date_key] = date
                info[version_key] = version

        if entry.gene_name:
            info["gene_name"] = entry.gene_name
        info["organism"] = entry.organism.rstrip(".")
        info["taxonomy"] = entry.organism_classification
        info["ncbi_taxid"] = entry.taxonomy_id
        if entry.host_organism:
            info["organism_host"] = entry.host_organism
        if entry.host_taxonomy_id:
            info["host_ncbi_taxid"] = entry.host_taxonomy_id
        if entry.comments:
            info["comment"] = "\n".join(entry.comments)

        if entry.references:
            converted = []
            info["references"] = converted
            for citation in entry.references:
                ref = SeqFeature.Reference()
                ref.comment = " ".join("%s=%s;" % pair for pair in citation.comments)
                for key, value in citation.references:
                    if key == "PubMed":
                        ref.pubmed_id = value
                    elif key == "MEDLINE":
                        ref.medline_id = value
                    elif key not in ("DOI", "AGRICOLA"):
                        # DOI and AGRICOLA are recognised but not stored.
                        raise ValueError(f"Unknown key {key} found in references")
                ref.authors = citation.authors
                ref.title = citation.title
                ref.journal = citation.location
                converted.append(ref)

        if entry.keywords:
            info["keywords"] = entry.keywords
        yield seq_record
--------------------------------------------------------------------------------
/src/domainator/Bio/SeqIO/TabIO.py:
--------------------------------------------------------------------------------
1 | # Copyright 2008-2017,2020 by Peter Cock. All rights reserved.
2 | #
3 | # This file is part of the Biopython distribution and governed by your
4 | # choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
5 | # Please see the LICENSE file that should have been included as part of this
6 | # package.
7 | """Bio.SeqIO support for the "tab" (simple tab separated) file format.
8 |
9 | You are expected to use this module via the Bio.SeqIO functions.
10 |
11 | The "tab" format is an ad-hoc plain text file format where each sequence is
12 | on one (long) line. Each line contains the identifier/description, followed
13 | by a tab, followed by the sequence. For example, consider the following
14 | short FASTA format file::
15 |
16 | >ID123456 possible binding site?
17 | CATCNAGATGACACTACGACTACGACTCAGACTAC
18 | >ID123457 random sequence
19 | ACACTACGACTACGACTCAGACTACAAN
20 |
21 | Apart from the descriptions, this can be represented in the simple two column
22 | tab separated format as follows::
23 |
24 | ID123456(tab)CATCNAGATGACACTACGACTACGACTCAGACTAC
25 | ID123457(tab)ACACTACGACTACGACTCAGACTACAAN
26 |
27 | When reading this file, "ID123456" or "ID123457" will be taken as the record's
28 | .id and .name property. There is no other information to record.
29 |
30 | Similarly, when writing to this format, Biopython will ONLY record the record's
31 | .id and .seq (and not the description or any other information) as in the
32 | example above.
33 | """
34 | from domainator.Bio.Seq import Seq
35 | from domainator.Bio.SeqRecord import SeqRecord
36 |
37 | from .Interfaces import _clean
38 | from .Interfaces import _get_seq_string
39 | from .Interfaces import SequenceIterator
40 | from .Interfaces import SequenceWriter
41 |
42 |
class TabIterator(SequenceIterator):
    """Parser for tab-delimited files."""

    def __init__(self, source):
        """Iterate over tab separated lines as SeqRecord objects.

        Every line of the file is expected to hold exactly one tab, which
        splits the line into an identifier and the full sequence.

        Arguments:
         - source - file-like object opened in text mode, or a path to a file

        The first field becomes the record's .id and .name (regardless of
        any spaces within the text); the second field is the sequence.

        Blank lines are skipped.

        Examples
        --------
        >>> with open("GenBank/NC_005816.tsv") as handle:
        ...     for record in TabIterator(handle):
        ...         print("%s length %i" % (record.id, len(record)))
        gi|45478712|ref|NP_995567.1| length 340
        gi|45478713|ref|NP_995568.1| length 260
        gi|45478714|ref|NP_995569.1| length 64
        gi|45478715|ref|NP_995570.1| length 123
        gi|45478716|ref|NP_995571.1| length 145
        gi|45478717|ref|NP_995572.1| length 357
        gi|45478718|ref|NP_995573.1| length 138
        gi|45478719|ref|NP_995574.1| length 312
        gi|45478720|ref|NP_995575.1| length 99
        gi|45478721|ref|NP_995576.1| length 90

        """
        super().__init__(source, mode="t", fmt="Tab-separated plain-text")

    def parse(self, handle):
        """Start parsing the file, and return a SeqRecord generator."""
        return self.iterate(handle)

    def iterate(self, handle):
        """Parse the file and generate SeqRecord objects."""
        for line in handle:
            fields = line.split("\t")
            if len(fields) != 2:
                # Anything other than exactly one tab is only acceptable
                # when the line is blank.
                if line.strip() == "":
                    continue
                raise ValueError(
                    "Each line should have one tab separating the"
                    + " title and sequence, this line has %i tabs: %r"
                    % (line.count("\t"), line)
                ) from None
            raw_title, raw_seq = fields
            identifier = raw_title.strip()
            residues = raw_seq.strip()  # also drops the trailing new line
            yield SeqRecord(Seq(residues), id=identifier, name=identifier, description="")
101 |
102 |
class TabWriter(SequenceWriter):
    """Writer for the simple tab separated format.

    Every output line is "id(tab)sequence" and nothing else; descriptions,
    names and other annotation are not recorded.

    This class is not intended to be used directly. Instead, please use
    the function ``as_tab``, or the top level ``Bio.SeqIO.write()`` function
    with ``format="tab"``.
    """

    def write_record(self, record):
        """Write a single tab line to the file."""
        # The header must already be written and the footer not yet written.
        assert self._header_written and not self._footer_written
        self._record_written = True
        tab_line = as_tab(record)
        self.handle.write(tab_line)
121 |
122 |
def as_tab(record):
    """Return record as tab separated (id(tab)seq) string."""
    title = _clean(record.id)
    seq = _get_seq_string(record)  # Catches sequence being None
    # Neither field may contain the characters that delimit fields or lines.
    for field in (title, seq):
        for forbidden in ("\t", "\n", "\r"):
            assert forbidden not in field
    return f"{title}\t{seq}\n"
134 |
135 |
136 | # if __name__ == "__main__":
137 | # from Bio._utils import run_doctest
138 |
139 | # run_doctest(verbose=0)
140 |
--------------------------------------------------------------------------------
/src/domainator/__init__.py:
--------------------------------------------------------------------------------
1 | import argparse
# Package version. Remember to update the version in README.md also. And if you make
# breaking changes to the matrix file format, increment the matrix file version in data_matrix.py.
__version__ = "0.7.1"
# Changing this will break backwards compatibility with files generated by other versions of Domainator.
DOMAIN_FEATURE_NAME="Domainator"
# Changing this will break backwards compatibility with files generated by other versions of Domainator.
DOMAIN_SEARCH_BEST_HIT_NAME="Domain_Search"
# SYS_ANNOTATION_NAME="Domainator_sys_" #Changing this will (probably) NOT break backwards compatibility with files generated by other versions of Domainator
6 |
class RawAndDefaultsFormatter(argparse.ArgumentDefaultsHelpFormatter, argparse.RawTextHelpFormatter):
    """Help formatter combining argparse's defaults formatter with its raw-text formatter."""
9 |
10 |
--------------------------------------------------------------------------------
/src/domainator/color_table_to_legend.py:
--------------------------------------------------------------------------------
1 | """ Generates an SVG legend based on a color table.
2 | The color table is a tsv file that maps annotations to hex color codes.
3 | The generated SVG file contains rectangles filled with the corresponding colors and text labels for each annotation.
4 | """
5 |
6 |
7 | from jsonargparse import ArgumentParser, ActionConfigFile
8 | import sys
9 | from typing import Tuple, List, Optional, Set, NamedTuple, Union, Dict
10 | from os import PathLike
11 | from domainator import __version__, RawAndDefaultsFormatter
12 | from domainator.color_genbank import read_color_table
13 | from collections import OrderedDict
14 | import html
15 |
16 |
def color_table_to_legend(table: Dict[str, str], svg: str, title: str) -> None:
    """Write a legend for a color table to an SVG file.

    Args:
        table: mapping of annotation name to color (per the module docstring,
            hex color codes).
        svg: path of the output file; it is opened for writing and overwritten.
        title: legend title; its length feeds the width estimate.

    NOTE(review): max() below raises ValueError when ``table`` is empty.
    """
    # Constants for layout and styling
    title_font_size = 24
    item_font_size = 20
    stroke_width = 2 # Width of the stroke for boxes
    padding = 10 # Padding around the content inside the legend box
    title_space = 40 # Space allocated for the title at the top of the legend

    box_height = item_font_size * 2 # Height of each color box, increased to add whitespace
    box_width = box_height # Width of color boxes
    text_offset_y = box_height/2 # Adjust to vertically center text in the color box

    # Length (in characters) of the longest annotation name; used for the width estimate.
    longest_key = max(len(k) for k in table.keys())

    text_offset_x = box_width + padding*2 # X offset for text to align it nicely with boxes
    total_height = title_space + len(table) * box_height + 2 * padding
    # Width is the larger of the item-row estimate and the title estimate,
    # each approximating a character as half its font size wide.
    total_width = max(box_width + padding*3 + (longest_key * item_font_size)/2, padding*3 + (len(title) * title_font_size) /2) # Assumed extra space for text


    # NOTE(review): the template between the triple quotes is empty as seen here, so the
    # layout values computed above are not referenced by any visible code - presumably the
    # SVG markup belongs here; confirm against the repository copy.
    with open(svg, "w") as f:
        f.write(f"""

""")
53 |
def main(argv):
    """Parse command-line arguments, merge the color tables, and write the SVG legend.

    Args:
        argv: list of command-line arguments (without the program name).
    """
    parser = ArgumentParser(f"\nversion: {__version__}\n\n" + __doc__, formatter_class=RawAndDefaultsFormatter)

    parser.add_argument(
        "-i", "--input", default=None, nargs="+", type=str, required=False,
        help="""names of color table files. If not supplied, reads from stdin.\n
Files are tab separated with two columns and no header, columns are: annotation, hex color. For example: CCDB cc0000""",
    )
    parser.add_argument(
        "--svg", default=None, type=str, required=True,
        help="name of output svg file",
    )
    parser.add_argument(
        "--title", default="Legend", type=str, required=False,
        help="Title of the legend. Default: 'Legend'",
    )
    #TODO: font, font size, indicator shape, etc.

    parser.add_argument('--config', action=ActionConfigFile)

    params = parser.parse_args(argv)

    ### validate input

    # Fall back to stdin when no color table files were named.
    sources = (sys.stdin,) if params.input is None else params.input

    # Later tables override the colors of earlier ones; first-seen order is kept.
    table = OrderedDict()
    for source in sources:
        table.update(read_color_table(source).items())

    ### Run

    color_table_to_legend(table, params.svg, params.title)
86 |
def _entrypoint():
    """Run main() on the process command-line arguments (program name excluded)."""
    cli_args = sys.argv[1:]
    main(cli_args)

if __name__ == '__main__':
    _entrypoint()
92 |
93 |
94 |
--------------------------------------------------------------------------------
/src/domainator/foldseek.py:
--------------------------------------------------------------------------------
1 | try:
2 | from esmologs.ESM2_to_3Di import ESM2_to_3Di
3 | from esmologs.predict_from_ESM2_to_3Di import convert_batch
4 | from esmologs.fasta2foldseek import fasta2foldseek
5 | import torch
6 | except ImportError:
7 | pass
8 |
9 | import psutil
10 | import tempfile
11 | import subprocess
12 | from typing import List, Iterable, Tuple, Union, Iterator
13 | from collections import namedtuple
14 |
# Named tuple for one foldseek hit. The fields match, in order, the columns requested from
# "foldseek convertalis --format-output" in search():
# query,target,qheader,theader,pident,alnlen,mismatch,gapopen,qstart,qend,tstart,tend,evalue,bits,qlen,tlen
# search() fills every field with the raw string split from the results line (no numeric conversion).
FoldseekHit = namedtuple("Hit", ["query","target","qheader","theader","pident","alnlen","mismatch","gapopen","qstart","qend","tstart","tend","evalue","bits", "qlen", "tlen"])

# Proteins longer than this many residues are skipped by foldseekBuilder.__call__ (it returns None).
MAX_PROTEIN_SIZE = 2500
19 |
def _run_foldseek_step(args, label):
    """Run one foldseek command to completion; raise RuntimeError on a non-zero exit."""
    # subprocess.run drains stdout/stderr while waiting. Popen(...).wait() with
    # PIPE can deadlock once the child fills an OS pipe buffer.
    completed = subprocess.run(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    if completed.returncode != 0:
        raise RuntimeError(f"{label} exited with code {completed.returncode}:\n{completed.stderr.decode('utf-8', errors='replace')}")


def search(database_path, proteins, foldseek, cpu, E) -> Iterable[FoldseekHit]:
    """Search predicted 3Di sequences against a foldseek database and yield the hits.

    Args:
        database_path: path of the foldseek target database.
        proteins: indexable collection of protein sequence objects; each must
            support ``textize()`` and expose ``name``/``description`` (bytes)
            and ``sequence``.
        foldseek: iterable parallel to ``proteins`` holding a FASTA-formatted
            3Di entry per protein, or None for proteins that must be skipped.
        cpu: number of foldseek threads; None or a non-positive value leaves
            the foldseek default in place.
        E: E-value threshold passed to ``foldseek search -e``.

    Yields:
        FoldseekHit tuples, one per line of the convertalis output; all fields
        are strings.

    Raises:
        RuntimeError: if ``foldseek search`` or ``foldseek convertalis`` exits
            with a non-zero status.
    """
    with tempfile.TemporaryDirectory() as tmpdirname:
        out_base_name = tmpdirname + "/output"
        protein_fasta_name = tmpdirname + "/protein.fasta"
        threedi_fasta_name = tmpdirname + "/threedi.fasta"
        foldseek_tmpfolder = tmpdirname + "/foldseek_tmpfolder"
        aln_path = tmpdirname + "/aln"
        results_path = foldseek_tmpfolder + "/results.tsv"

        # Write the amino-acid and 3Di FASTA files side by side, skipping
        # proteins for which no 3Di sequence is available.
        num_seqs = 0
        with open(protein_fasta_name, "w") as protein_f, open(threedi_fasta_name, "w") as threedi_f:
            for i, foldseek_seq in enumerate(foldseek):
                if foldseek_seq is None:
                    continue
                num_seqs += 1

                protein = proteins[i].textize()

                protein_f.write(f">{protein.name.decode('utf-8')} {protein.description.decode('utf-8')}\n{protein.sequence}\n")
                threedi_f.write(foldseek_seq + "\n")

        if num_seqs == 0:
            return  # no sequences to search, yield nothing

        fasta2foldseek(protein_fasta_name, threedi_fasta_name, out_base_name)

        foldseek_options = ["foldseek", "search", out_base_name, database_path, aln_path, foldseek_tmpfolder, "-e", str(E)]
        # Test for None first: comparing None with 0 raises TypeError.
        if cpu is not None and cpu > 0:
            foldseek_options += ["--threads", str(cpu)]
        _run_foldseek_step(foldseek_options, "foldseek")

        _run_foldseek_step(
            ["foldseek", "convertalis", "--format-output", "query,target,qheader,theader,pident,alnlen,mismatch,gapopen,qstart,qend,tstart,tend,evalue,bits,qlen,tlen", out_base_name, database_path, aln_path, results_path],
            "convertalis",
        )

        with open(results_path, "r") as f:
            for line in f:
                yield FoldseekHit(*line.strip().split("\t"))
62 |
63 |
class foldseekBuilder():
    """Callable that predicts a 3Di sequence for a protein using an ESM2-to-3Di model."""

    def __init__(self, device="cuda:0", checkpoint=None):
        """Load the model checkpoint onto ``device`` and switch it to eval mode.

        Args:
            device: torch device string the model is loaded onto and run on.
            checkpoint: path of the model checkpoint passed to ``torch.load``.

        Raises:
            ValueError: if ``checkpoint`` is None (there is nothing to load).
        """
        if checkpoint is None:
            # Fail early with a clear message instead of an opaque error from torch.load(None).
            raise ValueError("foldseekBuilder requires a checkpoint path, got None")
        self.device = device
        self.checkpoint = checkpoint
        self.model = ESM2_to_3Di("esm2_t36_3B_UR50D", torch.load(checkpoint, map_location=device))
        self.model.to(self.device)
        self.model.eval()

    def __call__(self, name:str, prot:str) -> Union[str, None]:
        """Return a FASTA-formatted 3Di entry for ``prot``, or None if it is skipped.

        Args:
            name: identifier written on the FASTA header line.
            prot: amino-acid sequence.

        Returns:
            ``">{name}\\n{predicted 3Di sequence}"``, or None when the protein
            is longer than MAX_PROTEIN_SIZE or contains any character other
            than the 20 upper-case standard amino-acid letters.
        """
        if len(prot) > MAX_PROTEIN_SIZE: #TODO: maybe warn?
            return None
        # skip if contains non-amino acid characters: strip() stops at the first
        # disallowed character from either end, so the result is empty only
        # when every character is allowed.
        if prot.strip("ACDEFGHIKLMNPQRSTVWY") != "": #TODO: maybe warn?
            return None
        predicted_seqs = convert_batch(self.model, [prot], device=self.device)
        return f">{name}\n{predicted_seqs[0]}"
--------------------------------------------------------------------------------
/src/domainator/genbank_to_fasta.py:
--------------------------------------------------------------------------------
1 | """
2 | Convert a GenBank file to a FASTA file.
3 | """
4 | import argparse
5 | import sys
6 | from domainator.Bio import SeqIO
7 | from domainator import __version__, RawAndDefaultsFormatter
8 | from domainator.utils import parse_seqfiles
9 |
10 | # TODO: replace this with a more general purpose script that can convert between any of the supported formats
11 | # allow specification of various paramters, such as species, accession, etc.
12 |
def main(argv):
    """Convert GenBank records to FASTA.

    Args:
        argv: list of command-line arguments (without the program name).

    Reads from the files given with -i/--input (stdin when omitted) and writes
    FASTA to -o/--output (stdout when omitted).
    """
    parser = argparse.ArgumentParser(f"\nversion: {__version__}\n\n" + __doc__, formatter_class=RawAndDefaultsFormatter)

    parser.add_argument('-i', '--input', nargs='+', required=False, default=None,
                        help="Genbank filenames. If not supplied, reads from stdin.")

    parser.add_argument('-o', '--output', default=None, required=False,
                        help="the name of the output fasta file. If not supplied writes to stdout.")

    params = parser.parse_args(argv)

    ### Figure out what input and output files ####

    genbanks = [sys.stdin] if params.input is None else params.input
    out = sys.stdout if params.output is None else open(params.output, "w")

    try:
        for rec in parse_seqfiles(genbanks, filetype_override="genbank"):
            SeqIO.write(rec, out, "fasta")
    finally:
        # Close the file we opened even if parsing or writing fails; never close stdout.
        if params.output is not None:
            out.close()
41 |
def _entrypoint():
    """Run main() on the process command-line arguments (program name excluded)."""
    cli_args = sys.argv[1:]
    main(cli_args)

if __name__ == "__main__":
    _entrypoint()
47 |
--------------------------------------------------------------------------------
/src/domainator/hmmer_build.py:
--------------------------------------------------------------------------------
1 | """
2 | Script to build a HMM profile from a multiple sequence alignment (MSA).
3 | Allows the user to specify the ACC, NAME, and DESC fields of the HMM profile.
4 | """
5 | from jsonargparse import ArgumentParser, ActionConfigFile
6 | from pyhmmer.easel import MSAFile
7 | from domainator import __version__, RawAndDefaultsFormatter
8 | import sys
9 | import pyhmmer
10 | from typing import Optional,BinaryIO,Union
11 | import re
12 |
# Anything that is not a space, word character, digit, underscore, hyphen,
# period, semicolon or colon. A raw string keeps the regex escapes from being
# treated as (invalid) string escapes.
_UNSAFE_CHARS = re.compile(r"[^ \w\d_\-\.;:]")


def sanitize_string(s:str) -> str:
    """Return ``s`` with every character outside the allowed set replaced by "_".

    Allowed characters are spaces, word characters (letters, digits,
    underscore), and ``-``, ``.``, ``;``, ``:``.
    """
    return _UNSAFE_CHARS.sub("_", s)
15 |
def hmmer_build(file:Union[str,BinaryIO], alphabet:Optional[pyhmmer.easel.Alphabet]=None, name:Optional[str]=None, acc:Optional[str]=None, desc:Optional[str]=None) -> pyhmmer.plan7.HMM:
    """Build an HMM profile from the first alignment in an MSA file.

    Args:
        file: path or binary file object of the multiple sequence alignment.
        alphabet: alphabet forwarded to MSAFile; None is passed through unchanged.
        name: value for the profile NAME field; when None the name already
            carried by the alignment (if any) is left untouched.
        acc: value for the profile ACC field; unchanged when None.
        desc: value for the profile DESC field; unchanged when None.

    Strings are passed through sanitize_string() before being stored.

    Returns:
        The HMM built from the alignment.

    Raises:
        ValueError: if the file contains no alignment.
    """
    with MSAFile(file, digital=True, alphabet=alphabet) as msa_file:
        msa = msa_file.read()

    if msa is None:
        # read() yields None when there is no alignment to read.
        raise ValueError("No alignment found in the input MSA file.")

    # name defaults to None in the signature, so only override when supplied.
    if name is not None:
        msa.name = sanitize_string(name).encode()
    if acc is not None:
        msa.accession = sanitize_string(acc).encode()
    if desc is not None:
        msa.description = sanitize_string(desc).encode()

    builder = pyhmmer.plan7.Builder(msa.alphabet)
    background = pyhmmer.plan7.Background(msa.alphabet)
    hmm, _, _ = builder.build_msa(msa, background)
    return hmm
30 |
31 |
def main(argv):
    """Build an HMM profile from an MSA according to command-line arguments.

    Args:
        argv: list of command-line arguments (without the program name).

    The MSA is read from -i/--input (stdin when omitted) and the profile is
    written to -o/--output (stdout when omitted). The output file is only
    opened once the profile has been built, so a failed build does not leave
    a truncated output file behind.
    """
    parser = ArgumentParser(f"\nversion: {__version__}\n\n" + __doc__, formatter_class=RawAndDefaultsFormatter)

    parser.add_argument("-i", "--input", default=None, required=False, type=str,
                        help="Path of input msa. If not supplied, reads from stdin. Acceptable formats are the same as for hmmbuild.")

    parser.add_argument("-o", "--output", default=None, required=False, type=str,
                        help="hmm output file path. If not supplied writes to stdout.")

    parser.add_argument("--name", default=None, required=True, type=str,
                        help="Name of the HMM profile.")
    parser.add_argument("--acc", default=None, required=False, type=str,
                        help="Accession of the HMM profile.")
    parser.add_argument("--desc", default=None, required=False, type=str,
                        help="Description of the HMM profile.")
    # A list (rather than a set) keeps the order of choices stable in the help text.
    parser.add_argument("--alphabet", default=None, required=False, type=str.lower, choices=["amino", "dna", "rna"],)

    parser.add_argument("--config", action=ActionConfigFile)

    params = parser.parse_args(argv)

    alphabet_factories = {
        "amino": pyhmmer.easel.Alphabet.amino,
        "dna": pyhmmer.easel.Alphabet.dna,
        "rna": pyhmmer.easel.Alphabet.rna,
    }
    factory = alphabet_factories.get(params.alphabet)
    alphabet = factory() if factory is not None else None

    input_file = sys.stdin.buffer if params.input is None else open(params.input, "rb")
    try:
        hmm = hmmer_build(file=input_file, alphabet=alphabet, name=params.name, acc=params.acc, desc=params.desc)
    finally:
        # Close the file we opened even if the build fails; never close stdin.
        if params.input is not None:
            input_file.close()

    if params.output is None:
        hmm.write(sys.stdout.buffer)
    else:
        with open(params.output, "wb") as output_handle:
            hmm.write(output_handle)
80 |
def _entrypoint():
    """Run main() on the process command-line arguments (program name excluded)."""
    cli_args = sys.argv[1:]
    main(cli_args)

if __name__ == "__main__":
    _entrypoint()
--------------------------------------------------------------------------------
/src/domainator/hmmer_compare.py:
--------------------------------------------------------------------------------
1 | """Aligns and calculates alignment scores between hmmer3 profiles
2 |
3 | Kind of like hhsearch except much slower and for hmmer3 profiles instead of hhsuite profiles.
4 |
5 | Adapted from pseudocode in:
6 | Steinegger, Martin, Markus Meier, Milot Mirdita, Harald Vöhringer, Stephan J. Haunsberger, and Johannes Söding. “HH-Suite3 for Fast Remote Homology Detection and Deep Protein Annotation.” BMC Bioinformatics 20, no. 1 (September 14, 2019): 473. https://doi.org/10.1186/s12859-019-3019-7.
7 |
8 | and
9 |
10 | Söding, Johannes. “Protein Homology Detection by HMM–HMM Comparison.” Bioinformatics 21, no. 7 (April 1, 2005): 951–60. https://doi.org/10.1093/bioinformatics/bti125.
11 |
12 | """
13 | import argparse
14 | import sys
15 | import pyhmmer
16 | from typing import Iterable, TextIO
17 | import heapq
18 | from multiprocessing import Pool
19 | from domainator import __version__, RawAndDefaultsFormatter
20 | from domainator.hmmer_search import read_hmms, compare_hmmer, traceback, HmmerHit
21 |
class _hmmer_compare_worker():
    """Callable that scores one query profile against every reference profile."""

    def __init__(self, hmmer_targets, alignment=False, k=None, score_cutoff=float("-inf")):
        """Store the reference profiles and the reporting options.

        hmmer_targets is a mapping of mappings of profiles (both levels are
        walked with .values()); alignment toggles traceback generation; k caps
        the number of hits kept per query (None keeps all); score_cutoff is
        the minimum score a hit must reach.
        """
        self.hmmer_targets = hmmer_targets
        self.alignment = alignment
        self.k = k
        self.score_cutoff = score_cutoff

    def __call__(self, input_profile):
        """Return the hits for input_profile, sorted in descending order."""
        best = []  # min-heap, so the weakest retained hit is evicted first
        for dataset in self.hmmer_targets.values():
            for reference in dataset.values():
                score, backtrace, max_index, match_scores = compare_hmmer(input_profile, reference)
                if not score >= self.score_cutoff:
                    continue

                alignment = None
                if self.alignment:
                    alignment = traceback(input_profile, reference, backtrace, max_index, match_scores)

                hit = HmmerHit(score, input_profile.name.decode(), reference.name.decode(), alignment)
                if self.k is not None and len(best) >= self.k:
                    # Heap is full: push the new hit and drop the smallest.
                    heapq.heappushpop(best, hit)
                else:
                    heapq.heappush(best, hit)
        best.sort(reverse=True)
        return best
47 |
48 |
49 |
def hmmer_compare(query_files:Iterable[str], reference_files:Iterable[str], out_handle:TextIO, score_cutoff:float, alignments:bool, k:int, cpu:int):
    """Score every query profile against every reference profile and write a report.

    Args:
        query_files: paths of hmm files holding the query profiles.
        reference_files: paths of hmm files holding the reference profiles.
        out_handle: text handle that receives the tab-separated report.
        score_cutoff: hits scoring below this are not reported.
        alignments: when True, each hit line is followed by its alignment.
        k: maximum number of hits reported per query (None for all).
        cpu: number of worker processes.
    """
    # The worker walks this with .values() at two levels: a mapping of mappings of profiles.
    references = read_hmms(hmm_files=reference_files)

    worker = _hmmer_compare_worker(references, alignments, k, score_cutoff)

    sep="\t"
    print(sep.join(("query","reference","score")), file=out_handle) #TODO: how to write the alignment?

    # One pool serves all query files instead of starting new processes per file.
    with Pool(processes=cpu) as pool:
        for file in query_files:
            # file_name = os.path.basename(Path(file).stem)
            # The context manager closes the HMM file once its profiles are consumed.
            with pyhmmer.plan7.HMMFile(file) as hmm_file:
                for hits in pool.imap(worker, hmm_file, chunksize=1): # I tested some chunk sizes and it didn't seem to make a difference
                    for hit in hits:
                        print(sep.join( (hit.query_name,hit.reference_name,f"{round(hit.score,2):.2f}") ), file=out_handle)
                        if alignments:
                            print(hit.alignment, file=out_handle)
                            print("\n\n", file=out_handle)
67 |
68 |
def main(argv):
    """Compare hmmer3 profiles according to command-line arguments.

    Args:
        argv: list of command-line arguments (without the program name).

    The report is written to -o/--output, or to stdout when omitted.
    """
    parser = argparse.ArgumentParser(f"\nversion: {__version__}\n\n" + __doc__, formatter_class=RawAndDefaultsFormatter)
    parser.add_argument('-i', '--input', type=str, required=True, nargs='+',
                        help="Input query files. One or more hmm text files with one or more hmmer3 profiles.")
    parser.add_argument('-r', "--reference", type=str, required=True, nargs='+',
                        help="Reference files. One or more hmm text files with one or more hmmer3 profiles.")

    parser.add_argument('--score_cutoff', type=float, default = 0,
                        help="Report alignments with scores greater than or equal to this.") #TODO: what is a reasonable cutoff?

    parser.add_argument('-k', type=int, required=False, default=None,
                        help="Include at most this many of the top hits for each query. Default: Include all hits.")

    parser.add_argument('-o', '--output', type=str, default=None,
                        help="File to write the scores and alignments to.")

    parser.add_argument('--alignments', action='store_true', default=False,
                        help="when activated, will write the alignments to the output.")

    parser.add_argument('--cpu', type=int, default=8, required=False,
                        help="how many cpu threads to use. Default: 8")


    params = parser.parse_args(argv)

    out = sys.stdout if params.output is None else open(params.output, "w")

    try:
        hmmer_compare(params.input, params.reference, out, params.score_cutoff, params.alignments, params.k, params.cpu)
    finally:
        # Close the file we opened even if the comparison fails; never close stdout.
        if params.output is not None:
            out.close()
103 |
def _entrypoint():
    """Run main() on the process command-line arguments (program name excluded)."""
    cli_args = sys.argv[1:]
    main(cli_args)

if __name__ == '__main__':
    _entrypoint()
109 |
--------------------------------------------------------------------------------
/src/domainator/partition_seqfile.py:
--------------------------------------------------------------------------------
1 | """Find record offsets in a sequence file
2 |
3 | Given an input sequence file, writes the total count of CDSs/protein sequences,
4 | followed by a list of offset\trecords pairs to divide the sequence file.
5 |
6 | Seeking to each offset and reading the specified number of records will result in reading the entire file once.
7 | """
8 | import argparse
9 | import sys
10 | from domainator import __version__, RawAndDefaultsFormatter
11 | from domainator import utils
12 |
13 |
def i_partition_seqfiles(input_paths, cdss_per_partition): #TODO: test this!
    """
        Lazily split sequence files into partitions of consecutive records.

        input:
            input_paths: a list of paths to genbank (peptide or nucleotide) or fasta (peptide) files.
            cdss_per_partition: how many cdss to try to include in each partition

        yields:
            (input_path, offset, recs_to_read)

        A partition is emitted as soon as its accumulated cds count reaches
        cdss_per_partition; any records left at the end of a file form a
        final, smaller partition. Partitions never span two files.
    """

    for input_path in input_paths:
        cds_total = 0
        rec_count = 0
        start_offset = None  # offset of the first record in the open partition
        for offset, cds_count in utils.i_get_offsets(input_path):
            if start_offset is None:
                start_offset = offset
            cds_total += cds_count
            rec_count += 1
            if cds_total < cdss_per_partition:
                continue
            yield (input_path, start_offset, rec_count)
            cds_total, rec_count, start_offset = 0, 0, None
        if rec_count > 0:
            yield (input_path, start_offset, rec_count)
41 |
42 |
def partition_seqfile(input_path, partitions=None, cdss_per_partition=None):
    """
    Divide a sequence file into runs of consecutive records of roughly equal CDS count.

    input:
        input_path: path to a genbank (peptide or nucleotide) or fasta (peptide) file.
        partitions: desired number of partitions to divide the file into (must be >= 1).
            The per-partition CDS target is total_cds_count // partitions, and the
            split is greedy, so the actual number of partitions can differ from
            this value.
        cdss_per_partition: how many cdss to try to include in each partition (mutually exclusive with partitions)

    output:
        total_cds_count, [(input_path, offset, recs_to_read)]

    raises:
        ValueError: if neither or both of partitions / cdss_per_partition are
            given, if partitions is smaller than 1, or if the input file
            contains no records.
    """
    if partitions is None and cdss_per_partition is None:
        raise ValueError("Error: Must specify either partitions or cdss_per_partition")

    if partitions is not None and cdss_per_partition is not None:
        raise ValueError("Error: Must specify only one of partitions or cdss_per_partition")

    # Fail early with a clear message instead of a ZeroDivisionError (0) or a
    # silently meaningless negative threshold (< 0) further down.
    if partitions is not None and partitions < 1:
        raise ValueError("Error: partitions must be a positive integer")

    offsets, num_proteins = utils.get_offsets(input_path)

    if len(offsets) == 0:
        raise ValueError("Error: No sequence records found in input file (maybe the file has the wrong extension or there is a formatting error in the input file).")

    cds_count = sum(num_proteins)

    if cdss_per_partition is None:
        # Integer floor division: same result as int(a / b) for these values,
        # without a round trip through float.
        cdss_per_partition = cds_count // partitions

    out_list = []
    running_sum = 0
    recs_in_buffer = 0
    next_offset = offsets[0]
    for offset, rec_cdss in zip(offsets, num_proteins):
        if recs_in_buffer == 0:
            # First record of a new partition: this is where a reader must seek to.
            next_offset = offset
        running_sum += rec_cdss
        recs_in_buffer += 1
        if running_sum >= cdss_per_partition:
            out_list.append((input_path, next_offset, recs_in_buffer))
            running_sum = 0
            recs_in_buffer = 0
    if recs_in_buffer > 0:
        # Trailing records that never reached the threshold.
        out_list.append((input_path, next_offset, recs_in_buffer))

    return cds_count, out_list
92 |
93 |
def main(args):
    """Command-line driver: partition one sequence file and write the split table.

    Writes the total CDS count on the first line, followed by one
    ``offset<TAB>records`` line per partition, to --output (or stdout).

    input:
        args: list of command-line arguments, without the program name.
    """
    parser = argparse.ArgumentParser(f"\nversion: {__version__}\n\n" + __doc__, formatter_class=RawAndDefaultsFormatter)

    parser.add_argument('-i', '--input', default=None, type=str, required=True,
                        help="the genbank or fasta file to split the contig names of. Genbank files can be nucleotide (with CDS annotations) or peptide. Fasta files must be peptide.")
    parser.add_argument('-o', "--output", default=None, type=str,
                        help="Name of output file. Default: stdout")

    overwrite_group = parser.add_mutually_exclusive_group(required=True)
    overwrite_group.add_argument('--partitions', type=int, default=None,
                                 help="The number of partitions to divide the ids into, roughly evenly.")
    overwrite_group.add_argument('--cdss_per_partition', type=int, default=None,
                                 help="The approximate number of ids to write to each partition. Partitioning algorithm is greedy, it adds records until the CDS count is met or exceeded, then goes to the next start pointer.")
    params = parser.parse_args(args)

    if params.output is None:
        out = sys.stdout
    else:
        out = open(params.output, "w")

    try:
        cds_count, splits = partition_seqfile(params.input, params.partitions, params.cdss_per_partition)
        print(cds_count, file=out)
        for _, offset, recs in splits:
            print(f"{offset}\t{recs}", file=out)
    finally:
        # Close the output file even when partitioning or writing fails;
        # stdout is never closed because we do not own it.
        if params.output is not None:
            out.close()
122 |
def _entrypoint():
    """Run ``main`` on the process's command-line arguments, excluding the program name.

    NOTE(review): presumably the target of a console-script entry in the
    packaging configuration -- confirm against pyproject.toml.
    """
    main(sys.argv[1:])

if __name__ == '__main__':
    # Executed directly as a script: same call as _entrypoint().
    main(sys.argv[1:])
128 |
--------------------------------------------------------------------------------
/src/domainator/partition_seqids.py:
--------------------------------------------------------------------------------
1 | """Partitions the sequence IDs from a sequence file
2 |
3 | Given an input sequence file, write new text files consisting of the sequence IDs partitioned into groups.
4 | Also prints the total number of records in the file (without a newline at the end).
5 | The "number of records" is the number of CDSs in nucleotide databases, and the number of contigs in protein databases.
6 |
7 | """
8 | #TODO: maybe deprecate this whole file?
9 |
10 | #TODO: change the purpose of the file to split based on some kind of annotation, like SSN_cluster, then it could be piped into a script for generating group-specific HMMS.
11 |
12 | import argparse
13 | from domainator.utils import parse_seqfiles, count_peptides_in_record
14 | from domainator import __version__, RawAndDefaultsFormatter
15 | import sys
16 |
def partition_seqids(input_path, output_prefix, partitions, ids_per_partition, filetype=None):
    """
    Write the record ids of the input sequence file(s) into numbered text files.

    Ids are written in input order, one per line, to files named
    [output_prefix][index].txt, where index starts at 1 and is zero-padded to
    the width of the partition count. The total peptide count over all records
    is printed to stdout without a trailing newline.

    input:
        input_path: passed to parse_seqfiles to read the records.
        output_prefix: prefix of the output file names.
        partitions: maximum number of files to write; ids per file is
            ceil(num_records / partitions), so fewer files may be produced.
            Mutually exclusive with ids_per_partition.
        ids_per_partition: number of ids to write to each file (the last file
            may hold fewer). Mutually exclusive with partitions.
        filetype: forwarded to parse_seqfiles as filetype_override.

    raises:
        ValueError: if neither or both of partitions / ids_per_partition are
            given, if the given one is smaller than 1, or if the input
            contains no records.
    """
    if partitions is None and ids_per_partition is None:
        raise ValueError("Error: Must specify either partitions or ids_per_partition")

    if partitions is not None and ids_per_partition is not None:
        raise ValueError("Error: Must specify only one of partitions or ids_per_partition")

    # Reject sizes that would otherwise surface as a ZeroDivisionError or as
    # nonsensical chunking after the whole input has already been parsed.
    if partitions is not None and partitions < 1:
        raise ValueError("Error: partitions must be a positive integer")
    if ids_per_partition is not None and ids_per_partition < 1:
        raise ValueError("Error: ids_per_partition must be a positive integer")

    cds_count = 0
    rec_names = []
    for rec in parse_seqfiles(input_path, filetype_override=filetype):
        cds_count += count_peptides_in_record(rec)
        rec_names.append(rec.id)

    num_recs = len(rec_names)

    if num_recs == 0:
        raise ValueError("Error: No sequence records found in input file (maybe you specified the wrong file type or there is a formatting error in the input file).")

    # Exactly one of the two sizes is known here; derive the other by ceiling
    # division so that every id lands in some partition.
    if partitions is None:
        partitions = -(-num_recs // ids_per_partition)
    else:
        ids_per_partition = -(-num_recs // partitions)

    max_digits = len(str(partitions))

    for out_index, start in enumerate(range(0, num_recs, ids_per_partition), start=1):
        out_name = output_prefix + str(out_index).zfill(max_digits) + ".txt"
        # The context manager closes each file even if a write fails.
        with open(out_name, "w") as out_file:
            for rec_id in rec_names[start:start + ids_per_partition]:
                print(rec_id, file=out_file)

    print(cds_count, end='')
60 |
def main(argv):
    """Command-line driver: parse the options and hand them to partition_seqids.

    input:
        argv: list of command-line arguments, without the program name.
    """
    parser = argparse.ArgumentParser(
        f"\nversion: {__version__}\n\n" + __doc__,
        formatter_class=RawAndDefaultsFormatter,
    )

    parser.add_argument(
        '-i', '--input',
        nargs='+',
        type=str,
        default=None,
        help="the genbank or fasta files to split the contig names of. Genbank files can be nucleotide (with CDS annotations) or peptide. Fasta files must be peptide.",
    )
    parser.add_argument(
        '--output_prefix',
        type=str,
        required=True,
        help="Output files will be named [output_prefix][0-9]+.txt",
    )

    # Exactly one of the two sizing options has to be supplied.
    sizing = parser.add_mutually_exclusive_group(required=True)
    sizing.add_argument(
        '--partitions',
        type=int,
        default=None,
        help="The number of partitions to divide the ids into, roughly evenly. Actual number of partitions will usually be a little smaller than this number due to bin rounding.",
    )
    sizing.add_argument(
        '--ids_per_partition',
        type=int,
        default=None,
        help="The number of ids to write to each partition.",
    )

    opts = parser.parse_args(argv)

    partition_seqids(
        opts.input,
        opts.output_prefix,
        opts.partitions,
        opts.ids_per_partition,
    )
79 |
def _entrypoint():
    """Run ``main`` on the process's command-line arguments, excluding the program name.

    NOTE(review): presumably the target of a console-script entry in the
    packaging configuration -- confirm against pyproject.toml.
    """
    main(sys.argv[1:])

if __name__ == '__main__':
    # Executed directly as a script: same call as _entrypoint().
    main(sys.argv[1:])
--------------------------------------------------------------------------------
/test/data/CcdB.hmm.h3f:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nebiolabs/domainator/663454806768f7462b33ade914684c6b3487dab2/test/data/CcdB.hmm.h3f
--------------------------------------------------------------------------------
/test/data/CcdB.hmm.h3i:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nebiolabs/domainator/663454806768f7462b33ade914684c6b3487dab2/test/data/CcdB.hmm.h3i
--------------------------------------------------------------------------------
/test/data/CcdB.hmm.h3m:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nebiolabs/domainator/663454806768f7462b33ade914684c6b3487dab2/test/data/CcdB.hmm.h3m
--------------------------------------------------------------------------------
/test/data/CcdB.hmm.h3p:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nebiolabs/domainator/663454806768f7462b33ade914684c6b3487dab2/test/data/CcdB.hmm.h3p
--------------------------------------------------------------------------------
/test/data/CuSOD_enum_report_test.gb:
--------------------------------------------------------------------------------
1 | LOCUS sp|P0AGD1|SODC_ECOLI 173 aa UNK 06-JUN-2023
2 | DEFINITION sp|P0AGD1|SODC_ECOLI Superoxide dismutase [Cu-Zn] OS=Escherichia
3 | coli (strain K12) OX=562 GN=sodC PE=1 SV=1.
4 | ACCESSION sp|P0AGD1|SODC_ECOLI
5 | VERSION sp|P0AGD1|SODC_ECOLI
6 | KEYWORDS .
7 | SOURCE .
8 | ORGANISM .
9 | .
10 | FEATURES Location/Qualifiers
11 | Domain_Search 1..173
12 | /program="phmmer"
13 | /database="swissprot_CuSOD_subset"
14 | /description="Superoxide dismutase [Cu-Zn] OS=Escherichia coli (strain K12) OX=562 GN=sodC PE=1 SV=1"
15 | /evalue="2.0e-114"
16 | /score="375.2"
17 | /name="sp|P0AGD1|SODC_ECOLI"
18 | /identity="100.0"
19 | /cds_id="0_1_173"
20 | /rstart="3"
21 | /rend="79"
22 | /rlen="100"
23 | Domainator 1..173
24 | /program="phmmer"
25 | /database="swissprot_CuSOD_subset"
26 | /description="Superoxide dismutase [Cu-Zn] OS=Escherichia coli (strain K12) OX=562 GN=sodC PE=1 SV=1"
27 | /evalue="2.0e-114"
28 | /score="375.2"
29 | /name="sp|P0AGD1|SODC_ECOLI"
30 | /identity="100.0"
31 | /cds_id="0_1_173"
32 | /rstart="3"
33 | /rend="79"
34 | /rlen="100"
35 | Domainator 14..173
36 | /program="phmmer"
37 | /database="swissprot_CuSOD_subset"
38 | /description="Superoxide dismutase-like protein YojM OS=Bacillus subtilis (strain 168) OX=1423 GN=yojM PE=1 SV=1"
39 | /evalue="6.5e-13"
40 | /score="44.8"
41 | /name="sp|O31851|YOJM_BACSU"
42 | /identity="38.1"
43 | /cds_id="0_1_173"
44 | /rstart="3"
45 | /rend="79"
46 | /rlen="100"
47 | ORIGIN
48 | 1 mkrfslaila lvvatgaqaa sekvemnlvt sqgvgqsigs vtitetdkgl efspdlkalp
49 | 61 pgehgfhiha kgscqpatkd gkasaaesag ghldpqntgk hegpegaghl gdlpalvvnn
50 | 121 dgkatdavia prlksldeik dkalmvhvgg dnmsdqpkpl ggggeryacg vik
51 | //
52 | LOCUS sp|O31851|YOJM_BACSU 196 aa UNK 06-JUN-2023
53 | DEFINITION sp|O31851|YOJM_BACSU Superoxide dismutase-like protein YojM
54 | OS=Bacillus subtilis (strain 168) OX=1423 GN=yojM PE=1 SV=1.
55 | ACCESSION sp|O31851|YOJM_BACSU
56 | VERSION sp|O31851|YOJM_BACSU
57 | KEYWORDS .
58 | SOURCE .
59 | ORGANISM .
60 | .
61 | FEATURES Location/Qualifiers
62 | Domain_Search 1..196
63 | /program="phmmer"
64 | /database="swissprot_CuSOD_subset"
65 | /description="Superoxide dismutase-like protein YojM OS=Bacillus subtilis (strain 168) OX=1423 GN=yojM PE=1 SV=1"
66 | /evalue="6.8e-135"
67 | /score="442.9"
68 | /name="sp|O31851|YOJM_BACSU"
69 | /identity="100.0"
70 | /cds_id="0_1_196"
71 | /rstart="3"
72 | /rend="79"
73 | /rlen="100"
74 | Domainator 1..196
75 | /program="phmmer"
76 | /database="swissprot_CuSOD_subset"
77 | /description="Superoxide dismutase-like protein YojM OS=Bacillus subtilis (strain 168) OX=1423 GN=yojM PE=1 SV=1"
78 | /evalue="6.8e-135"
79 | /score="442.9"
80 | /name="sp|O31851|YOJM_BACSU"
81 | /identity="100.0"
82 | /cds_id="0_1_196"
83 | /rstart="3"
84 | /rend="79"
85 | /rlen="100"
86 | Domainator 36..190
87 | /program="phmmer"
88 | /database="swissprot_CuSOD_subset"
89 | /description="Superoxide dismutase [Cu-Zn] OS=Escherichia coli (strain K12) OX=562 GN=sodC PE=1 SV=1"
90 | /evalue="9.8e-13"
91 | /score="44.3"
92 | /name="sp|P0AGD1|SODC_ECOLI"
93 | /identity="37.4"
94 | /cds_id="0_1_196"
95 | /rstart="3"
96 | /rend="79"
97 | /rlen="100"
98 | ORIGIN
99 | 1 mhrllllmml talgvagcgq kkppdppnrv pekkvvetsa fghhvqlvnr egkavgfiei
100 | 61 kesddegldi hisanslrpg aslgfhiyek gscvrpdfes aggpfnplnk ehgfnnpmgh
101 | 121 hagdlpnlev gadgkvdvim napdtslkkg sklnildedg safiiheqad dyltnpsgns
102 | 181 garivcgall gnnekq
103 | //
104 |
--------------------------------------------------------------------------------
/test/data/FeSOD_20.fasta:
--------------------------------------------------------------------------------
1 | >FeSOD_A0A1F4ZT98|unreviewed|Superoxide
2 | MFTLPPLPYPTNALEPYLDTQTLEIHFGKHHATYLKNLNDLLPEKSDADLIPVLQHLDDL
3 | PQDIRVKVRNNAGGVYNHNLYWQCMSPKSKSPSPRLLSSIESGFGTLDAFKEKFSQAALT
4 | HFGSGWAWLVKGTKGLEIVTTPNQDSPVSTGLTPILGLDVWEHAYYLKYQNRRVEYIQAW
5 | WNVVNWDYVSSLLADR
6 | >FeSOD_A0A067LT26|unreviewed|Superoxide
7 | MADYTLVDLPYDYSALEPSISGRIMELHHDKHHKTYVDGANTALVKLQEARDAGDLTFVN
8 | KLQKDLAFNLAGHVNHTVFWNNLSPDGGDKPTGELAAAIDEFFGSYDKFQAHFTASALGI
9 | QGSGWSILAWDSLGQKLIIEQLYDHQGNLAAATVPILLLDMWEHAFYLDYVNVKADYVKA
10 | FWNIVNWADVQARFDAARTKTQGLFLLS
11 | >FeSOD_A0A2E1RF15|unreviewed|Superoxide
12 | MAHQLPELPYSKDALSPHISAETLDYHHGKHHNAYVTKLNAAIEGTEHAEKSIEELVKTT
13 | SGGLFNNAAQHYNHSFYWNCLAPNAGGSATGTVGEMISSKWGSFDKFKEDFSNAGAANFG
14 | SGWTWLVKNASGDLEIVNTDDAECPLTEGHTPLLTMDVWEHAYYVDYRNARPKYIEAFWN
15 | LVNWDFVNSNL
16 | >FeSOD_A0A538G8K1|unreviewed|Superoxide
17 | MAYSVPPLAYDFDALEPHIDAQTMEIHHDKHHGAYVTNLNAALEGTEWMDRPIESVLASL
18 | DVIPEDKRTAVRNNGGGHANHTFFWEIMGPNGGGEPSGALADAIADTFGGLDQLKTQVND
19 | AGVKRFGSGWTWLVWDGTGLAVKSTPNQDSPVMDSDVPLLGIDVWEHAYYLRYQNRRPDY
20 | LAAWWNVVNWEAVATRYEAAK
21 | >FeSOD_A0A6L8W9C4|unreviewed|Superoxide
22 | MAFELPPLPYAKDALAPHISENTLDFHYGKHHNAYVTNLNGLLEDGDSRSLEEIMKDTAG
23 | DAAKAGVFNNAAQVWNHTFYWHSMKPNGGGKPTGAIADKINEDFGSYEKFAEEFKAAGAT
24 | QFGSGWAWLVLDGGKLKVTKTPNAACPLTDGAKPLLTMDVWEHAYYLDYQNARPKYMETF
25 | LESLVNWDFANENLG
26 | >FeSOD_A0A2H0YVA1|unreviewed|Superoxide
27 | MKYELPKLNYAYDALEPYLDAQTMEIHHSKHHQAYTDNFNKALAEHPDLAEMPVEELLAK
28 | VNELSIKDRDKNALRNHGGGYYNHKLFWEIMDPANKKDESLIKDIETEFGSVDSFKEQFS
29 | ETAKTLLGSGWAWLARDKEGKLHVYGLPNQDSPFQKGHTPIICLDVWEHAYYLKYQNRRP
30 | EYIENWWNVLKLI
31 | >FeSOD_R7J7P3|unreviewed|Superoxide
32 | MHQMPKLPYEMEALAPLMSKETFDFHYGKHLQTYVNNLNKLIVGTPYENLELEQIVCQAD
33 | GGIYNNAAQTWNHTFFFQLLTPEQPSLPDDLAGLLTRDFGSVDQFKEDFTKAALGLFGSG
34 | WVWLVLGKDGKLSLLPTPNAGNPLKDGLKPLLVIDVWEHAYYIDYRNNRAAFIEAFWKLV
35 | NWEKVADLLG
36 | >FeSOD_B8LFE6|unreviewed|Superoxide
37 | MAEYTLPDLPYDYAALEPHISGRIMELHHTKHHATYVKGANDTLDKLAEARADGSIAGKV
38 | YGLSATLSFHLGGHTNHSIFWKNLSPNGGDKPEGDLAAAIDDQFGGFDKFQAHFTAAATT
39 | LQGSGWAILGYDTIGGKLVILQLTDQSDNIPAAIIPVVMLDDMWEHAFYLDYQNVKPDYV
40 | KAWWNVVNWADAAERLGRATSQGKGLIVPA
41 | >FeSOD_A0A1C0AS03|unreviewed|Superoxide
42 | MTYTLPDLDYDYGALAPHIAPEIMELHHSKHHAAYVKGINDALEQLAAAREKGDLGAVNK
43 | LSKDAAFHLGGHINHSVFWKNMSPDGGGRPDGELASAIDEYFGSFDGFQKHFNAVANGIQ
44 | GSGWSMLVWDTLGQRMNINQLFDQQGNLPAGQIPLLQLDMWEHAFYLQYKNVKGDYVTAW
45 | WNVVNWTDVTERFARAKAASAGLV
46 | >FeSOD_A0A2M8Q2V9|unreviewed|Superoxide
47 | MAFELPPLPYAYDALEPHIDARTMEIHHDKHHAGYVSKLNSAIAGTEWESKSIDEILRNI
48 | NSVPEDIRVAVQNNGGGHANHSLFWEIMGPNGGGSPSGALAEAINAAFGSFDAFKEKFSN
49 | AAASRFGSGWAWLVVDDAGNLAVYSTANQDSPYMQGHTPILGLDVWEHAYYLNYQNRRPD
50 | YIAAFWNVVNWDAVAEKFAAAKK
51 | >FeSOD_A0A2E1VW30|unreviewed|Superoxide
52 | MAYELPALPYGENDLAPHITAETIQYHYGKHHAAYVGKLNELLDGDDSKSLEELIQSTDG
53 | GVFNNAAQVWNHTFYWNCMKPGGGGAPSGDLAAAIERDFGSYDAFVTKFKAAAMARFGSG
54 | WAWLVADADGKLSVVETLNAGNPMTDGLKPLLTCDVWEHAYYIDFRNARPKYLDVFFDSL
55 | VNWDFVASNL
56 | >FeSOD_A0A1V4UH29|unreviewed|Superoxide
57 | MEPNRSYTLPKLPYDYSALAPSISEEQLRIHHTKHHQGYVNGANAIYEKLDKTRKDGGDI
58 | DQKATLKELSFHVGGFLLHALYWENMAPAGQGGGVPKGALGSRIEKEFGSFDQFKKEFTA
59 | AANSVEGSGWAALTYCQKTGRLLLMQIEKHNVHVFPSFSVLMVVDVWEHAYYIDYKNDRA
60 | KYLENFWNIINWDAVNSRLEKALKG
61 | >FeSOD_A0A7V9SPC9|unreviewed|Superoxide
62 | MPTEVYELPDLSYDYSALEPHINARIMELHHDKHHATYVKGANTALEKLAEVRATGDFAT
63 | IAMLEKNLAFNVSGHVLHSIFWTNLSPNGGGEPDGELATALTDTFGGFEHFRKQMNEAAA
64 | TVQGAGWALASWEPIAQRLIVQQVHDHQGNHGQGTIPLLAIDAWEHAYYLQYENRKTEFF
65 | DAVWNVVSWGDVEARFKAARNAELIRQT
66 | >FeSOD_R7F5H2|unreviewed|Superoxide
67 | MIKKINLEYPLDSLEPYYSRETLNIHYNTLYVGYVDNTNITLEKLEKARKERNFENIKCL
68 | EKNLSFFGSGVILHELFFENMGPAIPSSPDINLMEQINKDFGSFELFKEQFTESSKVVEA
69 | SGWNLLVWVPRFNKLEIIQCEKHQDLTLWNCKPILVLDMWEHSYFLQYKANRGEYIKAFW
70 | NIINWNNVNKRFRNTIKY
71 | >FeSOD_A0A2N5YS14|unreviewed|Superoxide
72 | MFELPKLPYEFNSLEPKISAKTVEFHYTKHHQVYVNKLNGLIEGTDYAGKTLEEIIKTSE
73 | GGIFNNAAQVWNHTFYWEGFGPNPQSAPSGKLAEMINETFGSFEKFKEEFSTKAATLFGS
74 | GWAWLVLDNGQLKITGTSNAGSPLTEGHKPILTCDVWEHAYYLDYQNLRPKYIENFWELV
75 | DWKKIEGRI
76 | >FeSOD_A0A0F6MY72|unreviewed|Superoxide
77 | MAFELPSLPFDQDALESSKMSANTLSYHHGKHHAAYVKNLNAAIEGTDMANMSLEEIIKA
78 | TYNDPSKSGIFNNAAQVWNHSFFWKCLKPNGGGQPTGALADKIQADFGSFDAFIQEFKNA
79 | AATQFGSGWAWLVLDNGTLKVTKTANAVNPMVEGKTPLLTLDVWEHAYYLDFQNARPGFI
80 | DNFIENLVNWDFVAENLASAS
81 | >FeSOD_A0A060HP82|unreviewed|Superoxide
82 | MPRRPSHLMANFTLPQLPYAYDALEPHIDATTMQIHHTKHHQAYTDGLNKALGSLDAKFQ
83 | SMDAVDILKNIDTVPENARGAVNFHGGGYNNHTLFWNNMKKGGGGEPSGELADAIKKAFG
84 | SFADFKTKFQTDSVAIQGSGWGWLVKNASGGVQFITMPNQTSPWTRWKAEKLTPLLGLDV
85 | WEHSYYLKYQNRRADYVTAWWNVVNWDEVAKRFKA
86 | >FeSOD_A0A4P7WS39|unreviewed|Superoxide
87 | MKITHQLPELPFNKSALNPIITEETFDYHYGKHHAAYVNNLATLIQDTELINFSIEDIIK
88 | KGFYEKNASLFNNAAQHWSHTFFWNCLSPNGGKAPVGRITELITRDFGSFELFKDQFSNA
89 | AIKLFGCGWAWLVQDENDKLEIIAMKEAQTPLILNKKPILTLDVWEHAYYIDYKNARPKF
90 | VEGFWDIVNWDFANKNVI
91 | >FeSOD_A0A076JJX0|unreviewed|Superoxide
92 | MPVYTLPELPYDYSALEPYVSGKIMELHHDKHHQAYVNGANQALEQIHDAAESGNVAQSN
93 | LLEKNLAFNLAGHKNHTIFWKNMAPSIGQEPTGELKAAIEDQFGSFEGFQRYFESMCAGI
94 | QGSGWAVLAWDSLGERLVTLQMYDHQGNLPVTIFPLILLDLWEHAYYLDYLNVRADYVKA
95 | WWHIVNWEDASKRFDEVRNLNTNLVK
96 | >FeSOD_G8R729|unreviewed|Superoxide
97 | MSFELPDLPYSKSALEPYIDAQTMEIHHDKHHAGYTTKLNDAIEGTELEKQSIEDILKNV
98 | SKHSGGVRNNGGGYYNHSLFWSIMGPDAGGDPTGDVGAAIDDAFGSYENFKTEFSNAAAT
99 | RFGSGWAWLIVNGEGKLEVTSSPNQDNPLMDVAEKKGTPILGLDVWEHAYYLKYQNKRPD
100 | YISAFFNVINWDEVNRRFAEAK
101 |
--------------------------------------------------------------------------------
/test/data/FeSOD_dist.dense.hdf5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nebiolabs/domainator/663454806768f7462b33ade914684c6b3487dab2/test/data/FeSOD_dist.dense.hdf5
--------------------------------------------------------------------------------
/test/data/FeSOD_dist.sparse.hdf5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nebiolabs/domainator/663454806768f7462b33ade914684c6b3487dab2/test/data/FeSOD_dist.sparse.hdf5
--------------------------------------------------------------------------------
/test/data/FeSOD_dist.tsv:
--------------------------------------------------------------------------------
1 | FeSOD_A0A1F4ZT98|unreviewed|Superoxide FeSOD_A0A067LT26|unreviewed|Superoxide FeSOD_A0A2E1RF15|unreviewed|Superoxide FeSOD_A0A538G8K1|unreviewed|Superoxide FeSOD_A0A6L8W9C4|unreviewed|Superoxide FeSOD_A0A2H0YVA1|unreviewed|Superoxide FeSOD_R7J7P3|unreviewed|Superoxide FeSOD_B8LFE6|unreviewed|Superoxide FeSOD_A0A1C0AS03|unreviewed|Superoxide FeSOD_A0A2M8Q2V9|unreviewed|Superoxide FeSOD_A0A2E1VW30|unreviewed|Superoxide FeSOD_A0A1V4UH29|unreviewed|Superoxide FeSOD_A0A7V9SPC9|unreviewed|Superoxide FeSOD_R7F5H2|unreviewed|Superoxide FeSOD_A0A2N5YS14|unreviewed|Superoxide FeSOD_A0A0F6MY72|unreviewed|Superoxide FeSOD_A0A060HP82|unreviewed|Superoxide FeSOD_A0A4P7WS39|unreviewed|Superoxide FeSOD_A0A076JJX0|unreviewed|Superoxide FeSOD_G8R729|unreviewed|Superoxide
2 | FeSOD_A0A1F4ZT98|unreviewed|Superoxide 410.0 0.0 0.0 199.0 0.0 189.0 0.0 0.0 0.0 216.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 187.0
3 | FeSOD_A0A067LT26|unreviewed|Superoxide 0.0 429.0 0.0 0.0 0.0 0.0 0.0 271.0 279.0 0.0 0.0 0.0 247.0 0.0 0.0 0.0 0.0 0.0 277.0 0.0
4 | FeSOD_A0A2E1RF15|unreviewed|Superoxide 0.0 0.0 405.0 0.0 239.0 0.0 0.0 0.0 0.0 0.0 232.0 0.0 0.0 0.0 228.0 0.0 0.0 228.0 0.0 0.0
5 | FeSOD_A0A538G8K1|unreviewed|Superoxide 199.0 0.0 0.0 425.0 0.0 0.0 0.0 0.0 0.0 287.0 0.0 0.0 0.0 0.0 0.0 0.0 205.0 0.0 0.0 229.0
6 | FeSOD_A0A6L8W9C4|unreviewed|Superoxide 0.0 0.0 240.0 0.0 411.0 0.0 0.0 0.0 0.0 0.0 258.0 0.0 0.0 0.0 234.0 271.0 0.0 0.0 0.0 0.0
7 | FeSOD_A0A2H0YVA1|unreviewed|Superoxide 191.0 0.0 0.0 184.0 0.0 410.0 0.0 0.0 0.0 208.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 192.0
8 | FeSOD_R7J7P3|unreviewed|Superoxide 0.0 0.0 192.0 0.0 0.0 0.0 400.0 0.0 0.0 0.0 199.0 0.0 0.0 0.0 209.0 0.0 0.0 191.0 0.0 0.0
9 | FeSOD_B8LFE6|unreviewed|Superoxide 0.0 267.0 0.0 0.0 0.0 0.0 0.0 434.0 264.0 0.0 0.0 0.0 214.0 0.0 0.0 0.0 0.0 0.0 239.0 0.0
10 | FeSOD_A0A1C0AS03|unreviewed|Superoxide 0.0 275.0 0.0 0.0 0.0 0.0 0.0 264.0 426.0 0.0 0.0 0.0 232.0 0.0 0.0 0.0 0.0 0.0 249.0 0.0
11 | FeSOD_A0A2M8Q2V9|unreviewed|Superoxide 0.0 0.0 214.0 279.0 0.0 0.0 0.0 0.0 0.0 414.0 0.0 0.0 0.0 0.0 0.0 0.0 221.0 0.0 0.0 261.0
12 | FeSOD_A0A2E1VW30|unreviewed|Superoxide 0.0 0.0 233.0 0.0 258.0 0.0 0.0 0.0 0.0 0.0 398.0 0.0 0.0 0.0 219.0 239.0 0.0 0.0 0.0 0.0
13 | FeSOD_A0A1V4UH29|unreviewed|Superoxide 0.0 171.0 0.0 0.0 0.0 0.0 0.0 177.0 169.0 0.0 0.0 424.0 0.0 0.0 0.0 0.0 0.0 0.0 169.0 0.0
14 | FeSOD_A0A7V9SPC9|unreviewed|Superoxide 0.0 244.0 0.0 0.0 0.0 0.0 0.0 216.0 234.0 0.0 0.0 0.0 430.0 0.0 0.0 0.0 0.0 0.0 222.0 0.0
15 | FeSOD_R7F5H2|unreviewed|Superoxide 0.0 135.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 149.0 140.0 413.0 0.0 0.0 0.0 0.0 135.0 0.0
16 | FeSOD_A0A2N5YS14|unreviewed|Superoxide 0.0 0.0 233.0 0.0 237.0 0.0 0.0 0.0 0.0 0.0 222.0 0.0 0.0 0.0 402.0 223.0 0.0 0.0 0.0 0.0
17 | FeSOD_A0A0F6MY72|unreviewed|Superoxide 0.0 0.0 231.0 0.0 275.0 0.0 0.0 0.0 0.0 0.0 243.0 0.0 0.0 0.0 224.0 417.0 0.0 0.0 0.0 0.0
18 | FeSOD_A0A060HP82|unreviewed|Superoxide 177.0 0.0 0.0 202.0 0.0 0.0 0.0 0.0 0.0 226.0 0.0 0.0 0.0 0.0 0.0 0.0 451.0 0.0 0.0 208.0
19 | FeSOD_A0A4P7WS39|unreviewed|Superoxide 0.0 0.0 234.0 0.0 205.0 0.0 0.0 0.0 0.0 0.0 203.0 0.0 0.0 0.0 0.0 208.0 0.0 419.0 0.0 0.0
20 | FeSOD_A0A076JJX0|unreviewed|Superoxide 0.0 271.0 0.0 0.0 0.0 0.0 0.0 238.0 248.0 0.0 0.0 0.0 219.0 0.0 0.0 0.0 0.0 0.0 429.0 0.0
21 | FeSOD_G8R729|unreviewed|Superoxide 0.0 0.0 218.0 231.0 0.0 0.0 0.0 0.0 0.0 271.0 0.0 0.0 0.0 0.0 0.0 0.0 213.0 0.0 0.0 422.0
22 |
--------------------------------------------------------------------------------
/test/data/FeSOD_metadata.tsv:
--------------------------------------------------------------------------------
1 | Name Type Accession Status Substrate active expression level
2 | FeSOD_A0A060HP82|unreviewed|Superoxide FeSOD A0A060HP82 unreviewed Superoxide 1 0.602844258
3 | FeSOD_A0A067LT26|unreviewed|Superoxide FeSOD A0A067LT26 unreviewed Superoxide 0 0.828329655
4 | FeSOD_A0A076JJX0|unreviewed|Superoxide FeSOD A0A076JJX0 unreviewed Superoxide 1 0.342002723
5 | FeSOD_A0A0F6MY72|unreviewed|Superoxide FeSOD A0A0F6MY72 unreviewed Superoxide 1 0.341330749
6 | FeSOD_A0A1C0AS03|unreviewed|Superoxide FeSOD A0A1C0AS03 unreviewed Superoxide 0 0.59654186
7 | FeSOD_A0A1F4ZT98|unreviewed|Superoxide FeSOD A0A1F4ZT98 unreviewed Superoxide 0 0.097440548
8 | FeSOD_A0A1V4UH29|unreviewed|Superoxide FeSOD A0A1V4UH29 unreviewed Superoxide 1 0.549335033
9 | FeSOD_A0A2E1RF15|unreviewed|Superoxide FeSOD A0A2E1RF15 unreviewed Superoxide 0 0.340114417
10 | FeSOD_A0A2E1VW30|unreviewed|Superoxide FeSOD A0A2E1VW30 unreviewed Superoxide 0 0.450472955
11 | FeSOD_A0A2H0YVA1|unreviewed|Superoxide FeSOD A0A2H0YVA1 unreviewed Superoxide 1 0.08286451
12 | FeSOD_A0A2M8Q2V9|unreviewed|Superoxide FeSOD A0A2M8Q2V9 unreviewed Superoxide 1 0.146398972
13 | FeSOD_A0A2N5YS14|unreviewed|Superoxide FeSOD A0A2N5YS14 unreviewed Superoxide 0 0.91315881
14 | FeSOD_A0A4P7WS39|unreviewed|Superoxide FeSOD A0A4P7WS39 unreviewed Superoxide 1 0.468473632
15 | FeSOD_A0A538G8K1|unreviewed|Superoxide FeSOD A0A538G8K1 unreviewed Superoxide 0 0.021717089
16 | FeSOD_A0A6L8W9C4|unreviewed|Superoxide FeSOD A0A6L8W9C4 unreviewed Superoxide 1 0.05147778
17 | FeSOD_A0A7V9SPC9|unreviewed|Superoxide FeSOD A0A7V9SPC9 unreviewed Superoxide 1 0.129515554
18 | FeSOD_B8LFE6|unreviewed|Superoxide FeSOD B8LFE6 unreviewed Superoxide 0 0.433221718
19 | FeSOD_G8R729|unreviewed|Superoxide FeSOD G8R729 unreviewed Superoxide 1 0.265199983
20 | FeSOD_R7F5H2|unreviewed|Superoxide FeSOD R7F5H2 unreviewed Superoxide 0 0.441960291
21 | FeSOD_R7J7P3|unreviewed|Superoxide FeSOD R7J7P3 unreviewed Superoxide 1 0.029610013
22 |
--------------------------------------------------------------------------------
/test/data/FeSOD_score_dist.newick:
--------------------------------------------------------------------------------
1 | ((((((FeSOD_A0A1C0AS03|unreviewed|Superoxide:0.40,FeSOD_B8LFE6|unreviewed|Superoxide:0.40):0.01,(FeSOD_A0A076JJX0|unreviewed|Superoxide:0.37,FeSOD_A0A067LT26|unreviewed|Superoxide:0.37):0.04):0.07,FeSOD_A0A7V9SPC9|unreviewed|Superoxide:0.49):0.14,FeSOD_A0A1V4UH29|unreviewed|Superoxide:0.63):0.08,FeSOD_R7F5H2|unreviewed|Superoxide:0.70):0.02,((((((FeSOD_A0A2M8Q2V9|unreviewed|Superoxide:0.34,FeSOD_A0A538G8K1|unreviewed|Superoxide:0.34):0.10,FeSOD_G8R729|unreviewed|Superoxide:0.43):0.08,FeSOD_A0A060HP82|unreviewed|Superoxide:0.52):0.03,FeSOD_A0A1F4ZT98|unreviewed|Superoxide:0.54):0.02,FeSOD_A0A2H0YVA1|unreviewed|Superoxide:0.56):0.07,((((FeSOD_A0A2N5YS14|unreviewed|Superoxide:0.44,FeSOD_A0A2E1RF15|unreviewed|Superoxide:0.44):0.01,((FeSOD_A0A0F6MY72|unreviewed|Superoxide:0.36,FeSOD_A0A6L8W9C4|unreviewed|Superoxide:0.36):0.05,FeSOD_A0A2E1VW30|unreviewed|Superoxide:0.40):0.05):0.06,FeSOD_A0A4P7WS39|unreviewed|Superoxide:0.51):0.04,FeSOD_R7J7P3|unreviewed|Superoxide:0.55):0.08):0.09);
2 |
--------------------------------------------------------------------------------
/test/data/MT_nbs.enum_report.tsv:
--------------------------------------------------------------------------------
1 | contig cds_count domain_count taxid_species
2 | BX548174_369054:361090rc 9 6 1219
3 | AP010935_277029:265042rc 9 10 1334
4 | AP010935_936190:944542 9 13 1334
5 | AP010958_937230:927711rc 9 10 562
6 | AP010958_5381772:5387197 8 10 562
7 | AP010958_2919630:2924198 9 10 562
8 | AP010958_2902985:2909899 9 9 562
9 | AP010958_2275839:2267457rc 9 10 562
10 | AP010958_4084369:4094899 9 10 562
11 | AP010958_2177479:2170952rc 8 8 562
12 | AP010958_4182350:4173105rc 9 13 562
13 | AP010958_1458605:1465836 9 13 562
14 | AP010958_578280:583796 9 8 562
15 | AP011121_834340:842417 9 10 438
16 | AP011121_1001485:1013239 9 12 438
17 | BA000040_5565877:5553106rc 9 11 1355477
18 | BA000040_2743729:2731520rc 9 16 1355477
19 | AP011115_461187:454129rc 9 10 37919
20 | AP010950_263272:271283 9 12 137722
21 | AP011115_4384225:4376630rc 9 8 37919
22 |
--------------------------------------------------------------------------------
/test/data/Peptidase_M28.hmm.h3f:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nebiolabs/domainator/663454806768f7462b33ade914684c6b3487dab2/test/data/Peptidase_M28.hmm.h3f
--------------------------------------------------------------------------------
/test/data/Peptidase_M28.hmm.h3i:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nebiolabs/domainator/663454806768f7462b33ade914684c6b3487dab2/test/data/Peptidase_M28.hmm.h3i
--------------------------------------------------------------------------------
/test/data/Peptidase_M28.hmm.h3m:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nebiolabs/domainator/663454806768f7462b33ade914684c6b3487dab2/test/data/Peptidase_M28.hmm.h3m
--------------------------------------------------------------------------------
/test/data/Peptidase_M28.hmm.h3p:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nebiolabs/domainator/663454806768f7462b33ade914684c6b3487dab2/test/data/Peptidase_M28.hmm.h3p
--------------------------------------------------------------------------------
/test/data/bin3.sparse.hdf5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nebiolabs/domainator/663454806768f7462b33ade914684c6b3487dab2/test/data/bin3.sparse.hdf5
--------------------------------------------------------------------------------
/test/data/bin3.sparse.tsv:
--------------------------------------------------------------------------------
1 | FeSOD_A0A1F4ZT98|unreviewed|Superoxide FeSOD_A0A1F4ZT98|unreviewed|Superoxide 1.0
2 | FeSOD_A0A1F4ZT98|unreviewed|Superoxide FeSOD_A0A2M8Q2V9|unreviewed|Superoxide 1.0
3 | FeSOD_A0A1F4ZT98|unreviewed|Superoxide FeSOD_A0A538G8K1|unreviewed|Superoxide 1.0
4 | FeSOD_A0A067LT26|unreviewed|Superoxide FeSOD_A0A067LT26|unreviewed|Superoxide 1.0
5 | FeSOD_A0A067LT26|unreviewed|Superoxide FeSOD_A0A1C0AS03|unreviewed|Superoxide 1.0
6 | FeSOD_A0A067LT26|unreviewed|Superoxide FeSOD_A0A076JJX0|unreviewed|Superoxide 1.0
7 | FeSOD_A0A2E1RF15|unreviewed|Superoxide FeSOD_A0A2E1RF15|unreviewed|Superoxide 1.0
8 | FeSOD_A0A2E1RF15|unreviewed|Superoxide FeSOD_A0A6L8W9C4|unreviewed|Superoxide 1.0
9 | FeSOD_A0A2E1RF15|unreviewed|Superoxide FeSOD_A0A2E1VW30|unreviewed|Superoxide 1.0
10 | FeSOD_A0A538G8K1|unreviewed|Superoxide FeSOD_A0A538G8K1|unreviewed|Superoxide 1.0
11 | FeSOD_A0A538G8K1|unreviewed|Superoxide FeSOD_A0A2M8Q2V9|unreviewed|Superoxide 1.0
12 | FeSOD_A0A538G8K1|unreviewed|Superoxide FeSOD_G8R729|unreviewed|Superoxide 1.0
13 | FeSOD_A0A6L8W9C4|unreviewed|Superoxide FeSOD_A0A6L8W9C4|unreviewed|Superoxide 1.0
14 | FeSOD_A0A6L8W9C4|unreviewed|Superoxide FeSOD_A0A0F6MY72|unreviewed|Superoxide 1.0
15 | FeSOD_A0A6L8W9C4|unreviewed|Superoxide FeSOD_A0A2E1VW30|unreviewed|Superoxide 1.0
16 | FeSOD_A0A2H0YVA1|unreviewed|Superoxide FeSOD_A0A2H0YVA1|unreviewed|Superoxide 1.0
17 | FeSOD_A0A2H0YVA1|unreviewed|Superoxide FeSOD_A0A2M8Q2V9|unreviewed|Superoxide 1.0
18 | FeSOD_A0A2H0YVA1|unreviewed|Superoxide FeSOD_G8R729|unreviewed|Superoxide 1.0
19 | FeSOD_R7J7P3|unreviewed|Superoxide FeSOD_R7J7P3|unreviewed|Superoxide 1.0
20 | FeSOD_R7J7P3|unreviewed|Superoxide FeSOD_A0A2N5YS14|unreviewed|Superoxide 1.0
21 | FeSOD_R7J7P3|unreviewed|Superoxide FeSOD_A0A2E1VW30|unreviewed|Superoxide 1.0
22 | FeSOD_B8LFE6|unreviewed|Superoxide FeSOD_B8LFE6|unreviewed|Superoxide 1.0
23 | FeSOD_B8LFE6|unreviewed|Superoxide FeSOD_A0A067LT26|unreviewed|Superoxide 1.0
24 | FeSOD_B8LFE6|unreviewed|Superoxide FeSOD_A0A1C0AS03|unreviewed|Superoxide 1.0
25 | FeSOD_A0A1C0AS03|unreviewed|Superoxide FeSOD_A0A1C0AS03|unreviewed|Superoxide 1.0
26 | FeSOD_A0A1C0AS03|unreviewed|Superoxide FeSOD_A0A067LT26|unreviewed|Superoxide 1.0
27 | FeSOD_A0A1C0AS03|unreviewed|Superoxide FeSOD_B8LFE6|unreviewed|Superoxide 1.0
28 | FeSOD_A0A2M8Q2V9|unreviewed|Superoxide FeSOD_A0A2M8Q2V9|unreviewed|Superoxide 1.0
29 | FeSOD_A0A2M8Q2V9|unreviewed|Superoxide FeSOD_A0A538G8K1|unreviewed|Superoxide 1.0
30 | FeSOD_A0A2M8Q2V9|unreviewed|Superoxide FeSOD_G8R729|unreviewed|Superoxide 1.0
31 | FeSOD_A0A2E1VW30|unreviewed|Superoxide FeSOD_A0A2E1VW30|unreviewed|Superoxide 1.0
32 | FeSOD_A0A2E1VW30|unreviewed|Superoxide FeSOD_A0A6L8W9C4|unreviewed|Superoxide 1.0
33 | FeSOD_A0A2E1VW30|unreviewed|Superoxide FeSOD_A0A0F6MY72|unreviewed|Superoxide 1.0
34 | FeSOD_A0A1V4UH29|unreviewed|Superoxide FeSOD_A0A1V4UH29|unreviewed|Superoxide 1.0
35 | FeSOD_A0A1V4UH29|unreviewed|Superoxide FeSOD_B8LFE6|unreviewed|Superoxide 1.0
36 | FeSOD_A0A1V4UH29|unreviewed|Superoxide FeSOD_A0A067LT26|unreviewed|Superoxide 1.0
37 | FeSOD_A0A7V9SPC9|unreviewed|Superoxide FeSOD_A0A7V9SPC9|unreviewed|Superoxide 1.0
38 | FeSOD_A0A7V9SPC9|unreviewed|Superoxide FeSOD_A0A067LT26|unreviewed|Superoxide 1.0
39 | FeSOD_A0A7V9SPC9|unreviewed|Superoxide FeSOD_A0A1C0AS03|unreviewed|Superoxide 1.0
40 | FeSOD_R7F5H2|unreviewed|Superoxide FeSOD_R7F5H2|unreviewed|Superoxide 1.0
41 | FeSOD_R7F5H2|unreviewed|Superoxide FeSOD_A0A1V4UH29|unreviewed|Superoxide 1.0
42 | FeSOD_R7F5H2|unreviewed|Superoxide FeSOD_A0A7V9SPC9|unreviewed|Superoxide 1.0
43 | FeSOD_A0A2N5YS14|unreviewed|Superoxide FeSOD_A0A2N5YS14|unreviewed|Superoxide 1.0
44 | FeSOD_A0A2N5YS14|unreviewed|Superoxide FeSOD_A0A6L8W9C4|unreviewed|Superoxide 1.0
45 | FeSOD_A0A2N5YS14|unreviewed|Superoxide FeSOD_A0A2E1RF15|unreviewed|Superoxide 1.0
46 | FeSOD_A0A0F6MY72|unreviewed|Superoxide FeSOD_A0A0F6MY72|unreviewed|Superoxide 1.0
47 | FeSOD_A0A0F6MY72|unreviewed|Superoxide FeSOD_A0A6L8W9C4|unreviewed|Superoxide 1.0
48 | FeSOD_A0A0F6MY72|unreviewed|Superoxide FeSOD_A0A2E1VW30|unreviewed|Superoxide 1.0
49 | FeSOD_A0A060HP82|unreviewed|Superoxide FeSOD_A0A060HP82|unreviewed|Superoxide 1.0
50 | FeSOD_A0A060HP82|unreviewed|Superoxide FeSOD_A0A2M8Q2V9|unreviewed|Superoxide 1.0
51 | FeSOD_A0A060HP82|unreviewed|Superoxide FeSOD_G8R729|unreviewed|Superoxide 1.0
52 | FeSOD_A0A4P7WS39|unreviewed|Superoxide FeSOD_A0A4P7WS39|unreviewed|Superoxide 1.0
53 | FeSOD_A0A4P7WS39|unreviewed|Superoxide FeSOD_A0A2E1RF15|unreviewed|Superoxide 1.0
54 | FeSOD_A0A4P7WS39|unreviewed|Superoxide FeSOD_A0A0F6MY72|unreviewed|Superoxide 1.0
55 | FeSOD_A0A076JJX0|unreviewed|Superoxide FeSOD_A0A076JJX0|unreviewed|Superoxide 1.0
56 | FeSOD_A0A076JJX0|unreviewed|Superoxide FeSOD_A0A067LT26|unreviewed|Superoxide 1.0
57 | FeSOD_A0A076JJX0|unreviewed|Superoxide FeSOD_A0A1C0AS03|unreviewed|Superoxide 1.0
58 | FeSOD_G8R729|unreviewed|Superoxide FeSOD_G8R729|unreviewed|Superoxide 1.0
59 | FeSOD_G8R729|unreviewed|Superoxide FeSOD_A0A2M8Q2V9|unreviewed|Superoxide 1.0
60 | FeSOD_G8R729|unreviewed|Superoxide FeSOD_A0A538G8K1|unreviewed|Superoxide 1.0
61 |
--------------------------------------------------------------------------------
/test/data/bin3.tsv:
--------------------------------------------------------------------------------
1 | 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19
2 | FeSOD_A0A1F4ZT98|unreviewed|Superoxide 1 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
3 | FeSOD_A0A067LT26|unreviewed|Superoxide 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0
4 | FeSOD_A0A2E1RF15|unreviewed|Superoxide 0 0 1 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0
5 | FeSOD_A0A538G8K1|unreviewed|Superoxide 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1
6 | FeSOD_A0A6L8W9C4|unreviewed|Superoxide 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0
7 | FeSOD_A0A2H0YVA1|unreviewed|Superoxide 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 1
8 | FeSOD_R7J7P3|unreviewed|Superoxide 0 0 0 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 0 0
9 | FeSOD_B8LFE6|unreviewed|Superoxide 0 1 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0
10 | FeSOD_A0A1C0AS03|unreviewed|Superoxide 0 1 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0
11 | FeSOD_A0A2M8Q2V9|unreviewed|Superoxide 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1
12 | FeSOD_A0A2E1VW30|unreviewed|Superoxide 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0
13 | FeSOD_A0A1V4UH29|unreviewed|Superoxide 0 1 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0
14 | FeSOD_A0A7V9SPC9|unreviewed|Superoxide 0 1 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0
15 | FeSOD_R7F5H2|unreviewed|Superoxide 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0
16 | FeSOD_A0A2N5YS14|unreviewed|Superoxide 0 0 1 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0
17 | FeSOD_A0A0F6MY72|unreviewed|Superoxide 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0
18 | FeSOD_A0A060HP82|unreviewed|Superoxide 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 1
19 | FeSOD_A0A4P7WS39|unreviewed|Superoxide 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0
20 | FeSOD_A0A076JJX0|unreviewed|Superoxide 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0
21 | FeSOD_G8R729|unreviewed|Superoxide 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1
22 |
--------------------------------------------------------------------------------
/test/data/ccdb.gb:
--------------------------------------------------------------------------------
1 | LOCUS pDONR201_1265:958rc 306 bp DNA UNK 27-DEC-2023
2 | DEFINITION Gateway donor vector.
3 | ACCESSION urn.local...v2-dgufz7r
4 | VERSION urn.local...v2-dgufz7r
5 | KEYWORDS .
6 | SOURCE
7 | ORGANISM .
8 | .
9 | FEATURES Location/Qualifiers
10 | source complement(1..306)
11 | /invitrogen="""584"
12 | /label="Invitrogen vector 584"
13 | CDS 1..306
14 | /invitrogen="""1060000"
15 | /label="ccdB"
16 | /translation="MQFKVYTYKRESRYRLFVDVQSDIIDTPGRRMVIPLASARLLSDK
17 | VSRELYPVVHIGDESWRMMTTDMASVPVSVIGEEVADLSHRENDIKNAINLMFWGI*"
18 | /cds_id="1264_-1_959"
19 | /domainator_CcdB="CcdB (CcdB protein, 4.4e-31, 103.1)"
20 | Domainator 4..300
21 | /program="hmmsearch"
22 | /database="CcdB"
23 | /description="CcdB protein"
24 | /evalue="4.4e-31"
25 | /score="103.1"
26 | /name="CcdB"
27 | /identity="39.0"
28 | /cds_id="1264_-1_959"
29 | /rstart="1"
30 | /rend="100"
31 | /rlen="100"
32 | Domain_Search 4..300
33 | /program="hmmsearch"
34 | /database="CcdB"
35 | /description="CcdB protein"
36 | /evalue="4.4e-31"
37 | /score="103.1"
38 | /name="CcdB"
39 | /identity="39.0"
40 | /cds_id="1264_-1_959"
41 | /rstart="1"
42 | /rend="100"
43 | /rlen="100"
44 | ORIGIN
45 | 1 atgcagttta aggtttacac ctataaaaga gagagccgtt atcgtctgtt tgtggatgta
46 | 61 cagagtgata ttattgacac gcccgggcga cggatggtga tccccctggc cagtgcacgt
47 | 121 ctgctgtcag ataaagtctc ccgtgaactt tacccggtgg tgcatatcgg ggatgaaagc
48 | 181 tggcgcatga tgaccaccga tatggccagt gtgccggtct ccgttatcgg ggaagaagtg
49 | 241 gctgatctca gccaccgcga aaatgacatc aaaaacgcca ttaacctgat gttctgggga
50 | 301 atataa
51 | //
52 |
--------------------------------------------------------------------------------
/test/data/color_specification.tsv:
--------------------------------------------------------------------------------
1 | CcdB #ff0000
2 | APH #00ff00
3 | CAT #0000ff
4 | Condensation #ff00ff
5 | 2-oxoacid_dh #ffffff
6 |
--------------------------------------------------------------------------------
/test/data/color_table_123.tsv:
--------------------------------------------------------------------------------
1 | 1 #1F77C4
2 | 2 #FEC7E8
3 | 3 #FFFF0E
4 |
--------------------------------------------------------------------------------
/test/data/domain_search_translate_out.gb:
--------------------------------------------------------------------------------
1 | LOCUS 1264_-1_959 102 aa UNK 27-DEC-2023
2 | DEFINITION Gateway donor vector.
3 | ACCESSION 1264_-1_959
4 | VERSION 1264_-1_959
5 | KEYWORDS .
6 | SOURCE
7 | ORGANISM .
8 | .
9 | FEATURES Location/Qualifiers
10 | source complement(1..102)
11 | /invitrogen="""584"
12 | /label="Invitrogen vector 584"
13 | CDS 1..102
14 | /invitrogen="""1060000"
15 | /label="ccdB"
16 | /translation="MQFKVYTYKRESRYRLFVDVQSDIIDTPGRRMVIPLASARLLSDK
17 | VSRELYPVVHIGDESWRMMTTDMASVPVSVIGEEVADLSHRENDIKNAINLMFWGI*"
18 | /cds_id="1264_-1_959"
19 | /source_contig="pDONR201"
20 | Domain_Search 2..100
21 | /program="hmmsearch"
22 | /database="CcdB"
23 | /description="CcdB protein"
24 | /evalue="4.4e-31"
25 | /score="103.1"
26 | /name="CcdB"
27 | /identity="39.0"
28 | /cds_id="1264_-1_959"
29 | /rstart="1"
30 | /rend="100"
31 | /rlen="100"
32 | ORIGIN
33 | 1 mqfkvytykr esryrlfvdv qsdiidtpgr rmviplasar llsdkvsrel ypvvhigdes
34 | 61 wrmmttdmas vpvsvigeev adlshrendi knainlmfwg i*
35 | //
36 |
--------------------------------------------------------------------------------
/test/data/empty.gb:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nebiolabs/domainator/663454806768f7462b33ade914684c6b3487dab2/test/data/empty.gb
--------------------------------------------------------------------------------
/test/data/enum_report_html_max_size_out.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | Enum Report
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
64 |
65 |
--------------------------------------------------------------------------------
/test/data/enum_report_html_out.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | Enum Report
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
63 |
64 |
--------------------------------------------------------------------------------
/test/data/enum_report_html_out_quote_escape.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | Enum Report
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
55 |
56 |
--------------------------------------------------------------------------------
/test/data/foldseek/FeSOD:
--------------------------------------------------------------------------------
1 | MFTLPPLPYPTNALEPYLDTQTLEIHFGKHHATYLKNLNDLLPEKSDADLIPVLQHLDDLPQDIRVKVRNNAGGVYNHNLYWQCMSPKSKSPSPRLLSSIESGFGTLDAFKEKFSQAALTHFGSGWAWLVKGTKGLEIVTTPNQDSPVSTGLTPILGLDVWEHAYYLKYQNRRVEYIQAWWNVVNWDYVSSLLADR
2 | MADYTLVDLPYDYSALEPSISGRIMELHHDKHHKTYVDGANTALVKLQEARDAGDLTFVNKLQKDLAFNLAGHVNHTVFWNNLSPDGGDKPTGELAAAIDEFFGSYDKFQAHFTASALGIQGSGWSILAWDSLGQKLIIEQLYDHQGNLAAATVPILLLDMWEHAFYLDYVNVKADYVKAFWNIVNWADVQARFDAARTKTQGLFLLS
3 | MAHQLPELPYSKDALSPHISAETLDYHHGKHHNAYVTKLNAAIEGTEHAEKSIEELVKTTSGGLFNNAAQHYNHSFYWNCLAPNAGGSATGTVGEMISSKWGSFDKFKEDFSNAGAANFGSGWTWLVKNASGDLEIVNTDDAECPLTEGHTPLLTMDVWEHAYYVDYRNARPKYIEAFWNLVNWDFVNSNL
4 | MAYSVPPLAYDFDALEPHIDAQTMEIHHDKHHGAYVTNLNAALEGTEWMDRPIESVLASLDVIPEDKRTAVRNNGGGHANHTFFWEIMGPNGGGEPSGALADAIADTFGGLDQLKTQVNDAGVKRFGSGWTWLVWDGTGLAVKSTPNQDSPVMDSDVPLLGIDVWEHAYYLRYQNRRPDYLAAWWNVVNWEAVATRYEAAK
5 | MAFELPPLPYAKDALAPHISENTLDFHYGKHHNAYVTNLNGLLEDGDSRSLEEIMKDTAGDAAKAGVFNNAAQVWNHTFYWHSMKPNGGGKPTGAIADKINEDFGSYEKFAEEFKAAGATQFGSGWAWLVLDGGKLKVTKTPNAACPLTDGAKPLLTMDVWEHAYYLDYQNARPKYMETFLESLVNWDFANENLG
6 | MKYELPKLNYAYDALEPYLDAQTMEIHHSKHHQAYTDNFNKALAEHPDLAEMPVEELLAKVNELSIKDRDKNALRNHGGGYYNHKLFWEIMDPANKKDESLIKDIETEFGSVDSFKEQFSETAKTLLGSGWAWLARDKEGKLHVYGLPNQDSPFQKGHTPIICLDVWEHAYYLKYQNRRPEYIENWWNVLKLI
7 | MHQMPKLPYEMEALAPLMSKETFDFHYGKHLQTYVNNLNKLIVGTPYENLELEQIVCQADGGIYNNAAQTWNHTFFFQLLTPEQPSLPDDLAGLLTRDFGSVDQFKEDFTKAALGLFGSGWVWLVLGKDGKLSLLPTPNAGNPLKDGLKPLLVIDVWEHAYYIDYRNNRAAFIEAFWKLVNWEKVADLLG
8 | MAEYTLPDLPYDYAALEPHISGRIMELHHTKHHATYVKGANDTLDKLAEARADGSIAGKVYGLSATLSFHLGGHTNHSIFWKNLSPNGGDKPEGDLAAAIDDQFGGFDKFQAHFTAAATTLQGSGWAILGYDTIGGKLVILQLTDQSDNIPAAIIPVVMLDDMWEHAFYLDYQNVKPDYVKAWWNVVNWADAAERLGRATSQGKGLIVPA
9 | MTYTLPDLDYDYGALAPHIAPEIMELHHSKHHAAYVKGINDALEQLAAAREKGDLGAVNKLSKDAAFHLGGHINHSVFWKNMSPDGGGRPDGELASAIDEYFGSFDGFQKHFNAVANGIQGSGWSMLVWDTLGQRMNINQLFDQQGNLPAGQIPLLQLDMWEHAFYLQYKNVKGDYVTAWWNVVNWTDVTERFARAKAASAGLV
10 | MAFELPPLPYAYDALEPHIDARTMEIHHDKHHAGYVSKLNSAIAGTEWESKSIDEILRNINSVPEDIRVAVQNNGGGHANHSLFWEIMGPNGGGSPSGALAEAINAAFGSFDAFKEKFSNAAASRFGSGWAWLVVDDAGNLAVYSTANQDSPYMQGHTPILGLDVWEHAYYLNYQNRRPDYIAAFWNVVNWDAVAEKFAAAKK
11 | MAYELPALPYGENDLAPHITAETIQYHYGKHHAAYVGKLNELLDGDDSKSLEELIQSTDGGVFNNAAQVWNHTFYWNCMKPGGGGAPSGDLAAAIERDFGSYDAFVTKFKAAAMARFGSGWAWLVADADGKLSVVETLNAGNPMTDGLKPLLTCDVWEHAYYIDFRNARPKYLDVFFDSLVNWDFVASNL
12 | MEPNRSYTLPKLPYDYSALAPSISEEQLRIHHTKHHQGYVNGANAIYEKLDKTRKDGGDIDQKATLKELSFHVGGFLLHALYWENMAPAGQGGGVPKGALGSRIEKEFGSFDQFKKEFTAAANSVEGSGWAALTYCQKTGRLLLMQIEKHNVHVFPSFSVLMVVDVWEHAYYIDYKNDRAKYLENFWNIINWDAVNSRLEKALKG
13 | MPTEVYELPDLSYDYSALEPHINARIMELHHDKHHATYVKGANTALEKLAEVRATGDFATIAMLEKNLAFNVSGHVLHSIFWTNLSPNGGGEPDGELATALTDTFGGFEHFRKQMNEAAATVQGAGWALASWEPIAQRLIVQQVHDHQGNHGQGTIPLLAIDAWEHAYYLQYENRKTEFFDAVWNVVSWGDVEARFKAARNAELIRQT
14 | MIKKINLEYPLDSLEPYYSRETLNIHYNTLYVGYVDNTNITLEKLEKARKERNFENIKCLEKNLSFFGSGVILHELFFENMGPAIPSSPDINLMEQINKDFGSFELFKEQFTESSKVVEASGWNLLVWVPRFNKLEIIQCEKHQDLTLWNCKPILVLDMWEHSYFLQYKANRGEYIKAFWNIINWNNVNKRFRNTIKY
15 | MFELPKLPYEFNSLEPKISAKTVEFHYTKHHQVYVNKLNGLIEGTDYAGKTLEEIIKTSEGGIFNNAAQVWNHTFYWEGFGPNPQSAPSGKLAEMINETFGSFEKFKEEFSTKAATLFGSGWAWLVLDNGQLKITGTSNAGSPLTEGHKPILTCDVWEHAYYLDYQNLRPKYIENFWELVDWKKIEGRI
16 | MAFELPSLPFDQDALESSKMSANTLSYHHGKHHAAYVKNLNAAIEGTDMANMSLEEIIKATYNDPSKSGIFNNAAQVWNHSFFWKCLKPNGGGQPTGALADKIQADFGSFDAFIQEFKNAAATQFGSGWAWLVLDNGTLKVTKTANAVNPMVEGKTPLLTLDVWEHAYYLDFQNARPGFIDNFIENLVNWDFVAENLASAS
17 | MPRRPSHLMANFTLPQLPYAYDALEPHIDATTMQIHHTKHHQAYTDGLNKALGSLDAKFQSMDAVDILKNIDTVPENARGAVNFHGGGYNNHTLFWNNMKKGGGGEPSGELADAIKKAFGSFADFKTKFQTDSVAIQGSGWGWLVKNASGGVQFITMPNQTSPWTRWKAEKLTPLLGLDVWEHSYYLKYQNRRADYVTAWWNVVNWDEVAKRFKA
18 | MKITHQLPELPFNKSALNPIITEETFDYHYGKHHAAYVNNLATLIQDTELINFSIEDIIKKGFYEKNASLFNNAAQHWSHTFFWNCLSPNGGKAPVGRITELITRDFGSFELFKDQFSNAAIKLFGCGWAWLVQDENDKLEIIAMKEAQTPLILNKKPILTLDVWEHAYYIDYKNARPKFVEGFWDIVNWDFANKNVI
19 | MPVYTLPELPYDYSALEPYVSGKIMELHHDKHHQAYVNGANQALEQIHDAAESGNVAQSNLLEKNLAFNLAGHKNHTIFWKNMAPSIGQEPTGELKAAIEDQFGSFEGFQRYFESMCAGIQGSGWAVLAWDSLGERLVTLQMYDHQGNLPVTIFPLILLDLWEHAYYLDYLNVRADYVKAWWHIVNWEDASKRFDEVRNLNTNLVK
20 | MSFELPDLPYSKSALEPYIDAQTMEIHHDKHHAGYTTKLNDAIEGTELEKQSIEDILKNVSKHSGGVRNNGGGYYNHSLFWSIMGPDAGGDPTGDVGAAIDDAFGSYENFKTEFSNAAATRFGSGWAWLIVNGEGKLEVTSSPNQDNPLMDVAEKKGTPILGLDVWEHAYYLKYQNKRPDYISAFFNVINWDEVNRRFAEAK
21 |
--------------------------------------------------------------------------------
/test/data/foldseek/FeSOD.dbtype:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/test/data/foldseek/FeSOD.index:
--------------------------------------------------------------------------------
1 | 0 0 198
2 | 1 198 210
3 | 2 408 193
4 | 3 601 203
5 | 4 804 197
6 | 5 1001 195
7 | 6 1196 192
8 | 7 1388 212
9 | 8 1600 206
10 | 9 1806 205
11 | 10 2011 192
12 | 11 2203 207
13 | 12 2410 210
14 | 13 2620 200
15 | 14 2820 191
16 | 15 3011 203
17 | 16 3214 217
18 | 17 3431 200
19 | 18 3631 208
20 | 19 3839 204
21 |
--------------------------------------------------------------------------------
/test/data/foldseek/FeSOD.lookup:
--------------------------------------------------------------------------------
1 | 0 FeSOD_A0A1F4ZT98 0
2 | 1 FeSOD_A0A067LT26 1
3 | 2 FeSOD_A0A2E1RF15 2
4 | 3 FeSOD_A0A538G8K1 3
5 | 4 FeSOD_A0A6L8W9C4 4
6 | 5 FeSOD_A0A2H0YVA1 5
7 | 6 FeSOD_R7J7P3 6
8 | 7 FeSOD_B8LFE6 7
9 | 8 FeSOD_A0A1C0AS03 8
10 | 9 FeSOD_A0A2M8Q2V9 9
11 | 10 FeSOD_A0A2E1VW30 10
12 | 11 FeSOD_A0A1V4UH29 11
13 | 12 FeSOD_A0A7V9SPC9 12
14 | 13 FeSOD_R7F5H2 13
15 | 14 FeSOD_A0A2N5YS14 14
16 | 15 FeSOD_A0A0F6MY72 15
17 | 16 FeSOD_A0A060HP82 16
18 | 17 FeSOD_A0A4P7WS39 17
19 | 18 FeSOD_A0A076JJX0 18
20 | 19 FeSOD_G8R729 19
21 |
--------------------------------------------------------------------------------
/test/data/foldseek/FeSOD_20.3di.fasta:
--------------------------------------------------------------------------------
1 | >FeSOD_A0A1F4ZT98 unreviewed Superoxide
2 | DDDADDAPDDLCLLPPLAHNVLSCCQRNPQLVVLSVLLCVLCVVPDPDDLLVCLVCLVVDDPVSSVSSLQSSLSNLLSSLLSVLFGSDFDAFDPVLQVQCCVQQNHPVSVLVVQLVQLVPDPAWWKWFWFQDPVGTHIDIDHRSHDCSNVVTDTSDMDTCHCSNPCVVCNVPSSSSSVSRSSTTNRVSSNVSSVVD
3 | >FeSOD_A0A067LT26 unreviewed Superoxide
4 | DDDDDADAAPDDLCLLPPLAHSVLSCCCRPPQLVVLRVQLRVLVVQLVVCVVVVNPVSNVVSLLSNLLSVLSNLLSNLLSVLFANQFFAADDDPLQVLCCVQQNHPVSVLVVQLVQLLPDDFFWKWFWWQQLVVRGTYIDIDHHSPPCDSVRTDTLDIDTSHCSRCCVVQNPPSSSSSVSRVRTTPRVSSNVSSVCSVVVDPDPDPDD
5 | >FeSOD_A0A2E1RF15 unreviewed Superoxide
6 | DADAADDQPDDLCLLPPLAHSVLSCCCRNPQLVVLRVQLRVLCPPHPCSPDDLLVCLQPDDDSSNLSSLLSVLSNLLSVQFANPFFAAADDPLQVLCCVQQNHPVSVLVVQLVQLLPDAAWWKWFWWQDPVRRTDIDIDHRRHDCNNVRTDTSDMDTCHCSNPCVRCNPPSSSSSVSNSSTTPRVSSNVSD
7 | >FeSOD_A0A538G8K1 unreviewed Superoxide
8 | DADDQDDFPDDLCLLPPLAHSVLSCCCRPPQLNVLSVQLRVLCPPHPCLVDDLLVCLVPLVPDDPVSSLSNQQSSLSNLLSSLLSVQFANQFFAAADDPLQVLCCVQQNHPVSVLVVQLVQLVPDPAWWKWFWWDPPPGTDIDIDHRSHHCSDPPTRTLDMDTCHCSNPCVVQNVPSSSSSVSSSRTYNRVSSNVSSVVSD
9 | >FeSOD_A0A6L8W9C4 unreviewed Superoxide
10 | DADDADDQPDDLCLLPPLAHSVLSCCCRNPQLVVLRVQLCVLCVVPDPDDLLVLLLVQLPPPVSVSSNLSSQLSVLSCLLSVLFGNNFFDAADDPLQVQCCVQQNHPVSVLVVQLVQLVPDPAWWKWFWWDDPSGTHIDIDHRSHGSSNVPTDTLDMDTCHCSNCCPVCNPPSSSSSVSCRVRTGDNVSSNVSSD
11 | >FeSOD_A0A2H0YVA1 unreviewed Superoxide
12 | DADDADDFPDDLCLLPPLAHRVLSCCCRPPQLVVLSVQLRVLCVVPVVLSPDDLLVCLVCLVVDPDDDPSSVSSQLSSLSNLLSVLLSVQLHSPDDDDPVLQVQCCVQQNHPVSVLVVQLVQLLPDDAWWKWFWWADPVRGTDIDIDHRSHDCSNVVTHTSDMDTCHCSNPCVVCNVCSSSSSVSSVVSGDRD
13 | >FeSOD_R7J7P3 unreviewed Superoxide
14 | DDDQDDQPDDLCLCPPLAHSVLSCCCRNPQLVVLSVQLCVLCPPHPCNPDDLLVSLQPDDDSSNLSSLLNCLSNVLLVLFAPDFDAADPVLQVLCCVQQNHPVSVLVVQLCQQLPDDAWWKWFWFQDPVRGTDIDTDHSSHGCVVVVTDTLDIDTCHCSNCCVVCNPPSNSSSVSSSSTTPSVSSSVSND
15 | >FeSOD_B8LFE6 unreviewed Superoxide
16 | DDADDADDQPDDLCLLPPLFHSVLSCCCRPPQLVVLSVQLRVLVVVLVVCVVVVNCPVCNVVSVVSNLLSSLSNLLSVLLSVLFHSQFFDAADDPLQVLCCVQQNHPVSVLVVQLCQLLPDDAWWKWWWWQQLQVRGTYIDIDHHSSPCDDSSRTDTSDIDTCHCSNCCVVCNPPSSSSSVSRVRTTNRVSSNVSSVVSVVSSPPDDDDD
17 | >FeSOD_A0A1C0AS03 unreviewed Superoxide
18 | DADDADDFPDDLCLLPPFAHSVLSCCCRPPQLVVLSVQLNVLVVQLVVCVVVVNPVSNLVSLLSNLQSVLSNLLSNLLSVLFASQFFAAADDPLQVLCCVQQNHPVSVLVVQLVQLLPDDAWWKWFWWQQLVVRGTYIDIDHGSSPDDDPRIDTLDIDTCHCSNPCVVQNPPSSSSSVSSSRTTPRVSSNVSSVSSVVSSVVSD
19 | >FeSOD_A0A2M8Q2V9 unreviewed Superoxide
20 | DADDADDQPDDLCLLPPLAHSVLSCCCRPPQLVVLSVLLRVLCPPHPCSVDDLLVCLVPLVSDDPVSSVSNLQSSLSNLQSSLLSVQFGNNFADAADDPLQVLCCVQQNHPVSVLVVLLCQLLPDPAWWKWFWKQALVLGTDIDIDHRSHDCSNVRTDTSDMDTCHCSNPCVVQNVCSSSSSVSSSRTTNRVSSNVSSVVSVD
21 | >FeSOD_A0A2E1VW30 unreviewed Superoxide
22 | DADDADDQPDDLCLLPPLAHSVLSCCCRNPQLVVLSVQLRVLCVVPPPDDLLRCLQPDDDPSNLSSQLNVLSNLLSNLFGNPFFDADDDPLQVQCCVQQNHPVSVLVVQLVQLVPDPAWWKWFWWADPVRGTDIDTDHSSHDSSDVPTDTSDMDTCHCSNPCPVCNPPRSSSSVSCRVTTTDNVSSNVSD
23 | >FeSOD_A0A1V4UH29 unreviewed Superoxide
24 | DDDPDADDQDDQPDDLCLLPPLAHSVLSCCCRRPQLSLLSVQLRVLSVVVVVCVVVVHDDDPVVSLLSNLQSSQSNLLSQLLSVLFAAQVLFFAADDDPLQVLCCVQQNHPVSVLVVQLVQLVPDPAWWKWFWWQDLVPRGTYIDIDGRSDPVDDPRTRTSDIDTCHCSNPCVVQNPPSNSSSVSSSRTTNSVSSNVSSVVSVVD
25 | >FeSOD_A0A7V9SPC9 unreviewed Superoxide
26 | DDDDADDADFQPDDLCLLPPQAHSVLSCCCRPPQLVVLSVQLNVLVVVLVVCVVVVPPVSNVVSLVSNLLSSLSNLLSNLLSVLFGNQFFAAADDPLQVLCCVQQNHPVSVLVVQLVQLLPDPAWWKWFWWQQLVVRGIYIDTDHHSCPCDDPRTQTSDIDGSHCSNPCVVPNPPSSSSSVSRVRTGDRVSSNVSSVVSVVSVVSVVD
27 | >FeSOD_R7F5H2 unreviewed Superoxide
28 | DDDADAAPDDLCLLPPLAHSVLSCCCRPPQLVVLSVQLRVLVVVLVVCVVVVNCVSNVVSLVSNLLSVLSNLLSCLLSVQFGAFDPDAFDPVLQVLCCVQQNHPVSVLVVQLVQLVPDPAWWKWFWFQQLVVRGTDIDIDGGSSDCVVSRIDTLDMDTCHCSNPCVVQNPPSSSSSVSRVRTTPRVSSNVSSVVRVVD
29 | >FeSOD_A0A2N5YS14 unreviewed Superoxide
30 | DDDADDQPDDLCLLPPLAHSVLSCCCRPPQLVVLRVVLCVLCPPHPCNPDDLLVCLQPDDDSSNLSSLLSCLSNLLSVQFALDDPDADDDPLQVQCCVQQNHPVSVLVVQLVQLLPDAAWWKWFWWQDPSGTHIDIDHRSHGSSNVPTDTSDMDTCHCSNQCVVCNPPSSSSSVSSVVGHPSVSSNVSD
31 | >FeSOD_A0A0F6MY72 unreviewed Superoxide
32 | DADDADDAPDDLCQCPDQLAHSVLSCCCRNPQLVVLSVQLRVLCPPDPCNPDDLLVLLLVQLPPPVSVSSNLSSQLSVLSVLLSVLFGNNFFAFADDPLQVQCCVQQNHPVSVLVVQLVLLVPPPAWWKWFFWDDPSGTHIDIDHRSDDCSDVPTDTLDMDTCHCSNCCVVCNVPRSSSSVSCRVTTTDNVSSNVSSVVND
33 | >FeSOD_A0A060HP82 unreviewed Superoxide
34 | DDDDPPPPPDQDDADDQPDDLCLLPPLAHSVLSCCCRPPQLVVLSVQLRVLCVPDDPVLLPDDLLVCLLPLVSRDPSSSVSNQQSSLSNLLSSLQSVQAGHQFFDAADDPLQVLCCVQQNHPVSVLVVQLVQQLPDDAWWKWWWWADPVRGTDIDIDHRSHDSSDDPPPVRIDTSDMDTSHCSNQCVVQNPPSNSSSVSNVRTTPRVSSNVSNVD
35 | >FeSOD_A0A4P7WS39 unreviewed Superoxide
36 | DDDDDDADDQPDDLCLLPPLAHSVLSCCCRNPQLSVLSVQLRVLCPPDPCSPDDLLVLLVVCVVVVNVSSNLSSQLNVLSNLLSVLFGLQFFDADDDPLVVQCCVQQNHPVSVLVVQLCCLLPDDAWWKWFWWADPVRGTDIDTDHHSHHCVVVVTHTLDMDTCHCSNCCVVQNPPSSSSSVSRVSTTNRVSSNVSRD
37 | >FeSOD_A0A076JJX0 unreviewed Superoxide
38 | DDADDADDQPDDLCLLPPQFHSVLSCCCRPPQLVVLSVQLRVLVVQLVVCVVVVNPVSNLVSLLSNLLSVLSNLLSSLLSVLLGNDDDAAADDPLQVLCCVQQNHPVSVLVVQLCQLLPDDAWWKWFWWQQLVVRGIYIDIDHHSPPDDDPRIDTLDIDTCHCSRCCVVQNPPSSSSSVSRVRTTPRVSSNVSSVVSVVVDVVSVD
39 | >FeSOD_G8R729 unreviewed Superoxide
40 | DADDADAAPDDLCLLPPLAHNVLSCCCRPPQLVVLRVQLRVLCPPHPLLVDDLLVCLVCLVVDDPSSLLSSLSNLLSSLLSVQFGNQLFDFADDPQQVLCCVQQNHPVSVLVVQLVQLVPDPAWWKWFWWQALVLGTDIDIDHRSHDSCPPPVPRRTRTSDMDTSHCSNQCVVQNPPSSSSSVSNVRTTNRVSSNVSSVVSD
41 |
--------------------------------------------------------------------------------
/test/data/foldseek/FeSOD_20.fasta:
--------------------------------------------------------------------------------
1 | >FeSOD_A0A1F4ZT98 unreviewed Superoxide
2 | MFTLPPLPYPTNALEPYLDTQTLEIHFGKHHATYLKNLNDLLPEKSDADLIPVLQHLDDL
3 | PQDIRVKVRNNAGGVYNHNLYWQCMSPKSKSPSPRLLSSIESGFGTLDAFKEKFSQAALT
4 | HFGSGWAWLVKGTKGLEIVTTPNQDSPVSTGLTPILGLDVWEHAYYLKYQNRRVEYIQAW
5 | WNVVNWDYVSSLLADR
6 | >FeSOD_A0A067LT26 unreviewed Superoxide
7 | MADYTLVDLPYDYSALEPSISGRIMELHHDKHHKTYVDGANTALVKLQEARDAGDLTFVN
8 | KLQKDLAFNLAGHVNHTVFWNNLSPDGGDKPTGELAAAIDEFFGSYDKFQAHFTASALGI
9 | QGSGWSILAWDSLGQKLIIEQLYDHQGNLAAATVPILLLDMWEHAFYLDYVNVKADYVKA
10 | FWNIVNWADVQARFDAARTKTQGLFLLS
11 | >FeSOD_A0A2E1RF15 unreviewed Superoxide
12 | MAHQLPELPYSKDALSPHISAETLDYHHGKHHNAYVTKLNAAIEGTEHAEKSIEELVKTT
13 | SGGLFNNAAQHYNHSFYWNCLAPNAGGSATGTVGEMISSKWGSFDKFKEDFSNAGAANFG
14 | SGWTWLVKNASGDLEIVNTDDAECPLTEGHTPLLTMDVWEHAYYVDYRNARPKYIEAFWN
15 | LVNWDFVNSNL
16 | >FeSOD_A0A538G8K1 unreviewed Superoxide
17 | MAYSVPPLAYDFDALEPHIDAQTMEIHHDKHHGAYVTNLNAALEGTEWMDRPIESVLASL
18 | DVIPEDKRTAVRNNGGGHANHTFFWEIMGPNGGGEPSGALADAIADTFGGLDQLKTQVND
19 | AGVKRFGSGWTWLVWDGTGLAVKSTPNQDSPVMDSDVPLLGIDVWEHAYYLRYQNRRPDY
20 | LAAWWNVVNWEAVATRYEAAK
21 | >FeSOD_A0A6L8W9C4 unreviewed Superoxide
22 | MAFELPPLPYAKDALAPHISENTLDFHYGKHHNAYVTNLNGLLEDGDSRSLEEIMKDTAG
23 | DAAKAGVFNNAAQVWNHTFYWHSMKPNGGGKPTGAIADKINEDFGSYEKFAEEFKAAGAT
24 | QFGSGWAWLVLDGGKLKVTKTPNAACPLTDGAKPLLTMDVWEHAYYLDYQNARPKYMETF
25 | LESLVNWDFANENLG
26 | >FeSOD_A0A2H0YVA1 unreviewed Superoxide
27 | MKYELPKLNYAYDALEPYLDAQTMEIHHSKHHQAYTDNFNKALAEHPDLAEMPVEELLAK
28 | VNELSIKDRDKNALRNHGGGYYNHKLFWEIMDPANKKDESLIKDIETEFGSVDSFKEQFS
29 | ETAKTLLGSGWAWLARDKEGKLHVYGLPNQDSPFQKGHTPIICLDVWEHAYYLKYQNRRP
30 | EYIENWWNVLKLI
31 | >FeSOD_R7J7P3 unreviewed Superoxide
32 | MHQMPKLPYEMEALAPLMSKETFDFHYGKHLQTYVNNLNKLIVGTPYENLELEQIVCQAD
33 | GGIYNNAAQTWNHTFFFQLLTPEQPSLPDDLAGLLTRDFGSVDQFKEDFTKAALGLFGSG
34 | WVWLVLGKDGKLSLLPTPNAGNPLKDGLKPLLVIDVWEHAYYIDYRNNRAAFIEAFWKLV
35 | NWEKVADLLG
36 | >FeSOD_B8LFE6 unreviewed Superoxide
37 | MAEYTLPDLPYDYAALEPHISGRIMELHHTKHHATYVKGANDTLDKLAEARADGSIAGKV
38 | YGLSATLSFHLGGHTNHSIFWKNLSPNGGDKPEGDLAAAIDDQFGGFDKFQAHFTAAATT
39 | LQGSGWAILGYDTIGGKLVILQLTDQSDNIPAAIIPVVMLDDMWEHAFYLDYQNVKPDYV
40 | KAWWNVVNWADAAERLGRATSQGKGLIVPA
41 | >FeSOD_A0A1C0AS03 unreviewed Superoxide
42 | MTYTLPDLDYDYGALAPHIAPEIMELHHSKHHAAYVKGINDALEQLAAAREKGDLGAVNK
43 | LSKDAAFHLGGHINHSVFWKNMSPDGGGRPDGELASAIDEYFGSFDGFQKHFNAVANGIQ
44 | GSGWSMLVWDTLGQRMNINQLFDQQGNLPAGQIPLLQLDMWEHAFYLQYKNVKGDYVTAW
45 | WNVVNWTDVTERFARAKAASAGLV
46 | >FeSOD_A0A2M8Q2V9 unreviewed Superoxide
47 | MAFELPPLPYAYDALEPHIDARTMEIHHDKHHAGYVSKLNSAIAGTEWESKSIDEILRNI
48 | NSVPEDIRVAVQNNGGGHANHSLFWEIMGPNGGGSPSGALAEAINAAFGSFDAFKEKFSN
49 | AAASRFGSGWAWLVVDDAGNLAVYSTANQDSPYMQGHTPILGLDVWEHAYYLNYQNRRPD
50 | YIAAFWNVVNWDAVAEKFAAAKK
51 | >FeSOD_A0A2E1VW30 unreviewed Superoxide
52 | MAYELPALPYGENDLAPHITAETIQYHYGKHHAAYVGKLNELLDGDDSKSLEELIQSTDG
53 | GVFNNAAQVWNHTFYWNCMKPGGGGAPSGDLAAAIERDFGSYDAFVTKFKAAAMARFGSG
54 | WAWLVADADGKLSVVETLNAGNPMTDGLKPLLTCDVWEHAYYIDFRNARPKYLDVFFDSL
55 | VNWDFVASNL
56 | >FeSOD_A0A1V4UH29 unreviewed Superoxide
57 | MEPNRSYTLPKLPYDYSALAPSISEEQLRIHHTKHHQGYVNGANAIYEKLDKTRKDGGDI
58 | DQKATLKELSFHVGGFLLHALYWENMAPAGQGGGVPKGALGSRIEKEFGSFDQFKKEFTA
59 | AANSVEGSGWAALTYCQKTGRLLLMQIEKHNVHVFPSFSVLMVVDVWEHAYYIDYKNDRA
60 | KYLENFWNIINWDAVNSRLEKALKG
61 | >FeSOD_A0A7V9SPC9 unreviewed Superoxide
62 | MPTEVYELPDLSYDYSALEPHINARIMELHHDKHHATYVKGANTALEKLAEVRATGDFAT
63 | IAMLEKNLAFNVSGHVLHSIFWTNLSPNGGGEPDGELATALTDTFGGFEHFRKQMNEAAA
64 | TVQGAGWALASWEPIAQRLIVQQVHDHQGNHGQGTIPLLAIDAWEHAYYLQYENRKTEFF
65 | DAVWNVVSWGDVEARFKAARNAELIRQT
66 | >FeSOD_R7F5H2 unreviewed Superoxide
67 | MIKKINLEYPLDSLEPYYSRETLNIHYNTLYVGYVDNTNITLEKLEKARKERNFENIKCL
68 | EKNLSFFGSGVILHELFFENMGPAIPSSPDINLMEQINKDFGSFELFKEQFTESSKVVEA
69 | SGWNLLVWVPRFNKLEIIQCEKHQDLTLWNCKPILVLDMWEHSYFLQYKANRGEYIKAFW
70 | NIINWNNVNKRFRNTIKY
71 | >FeSOD_A0A2N5YS14 unreviewed Superoxide
72 | MFELPKLPYEFNSLEPKISAKTVEFHYTKHHQVYVNKLNGLIEGTDYAGKTLEEIIKTSE
73 | GGIFNNAAQVWNHTFYWEGFGPNPQSAPSGKLAEMINETFGSFEKFKEEFSTKAATLFGS
74 | GWAWLVLDNGQLKITGTSNAGSPLTEGHKPILTCDVWEHAYYLDYQNLRPKYIENFWELV
75 | DWKKIEGRI
76 | >FeSOD_A0A0F6MY72 unreviewed Superoxide
77 | MAFELPSLPFDQDALESSKMSANTLSYHHGKHHAAYVKNLNAAIEGTDMANMSLEEIIKA
78 | TYNDPSKSGIFNNAAQVWNHSFFWKCLKPNGGGQPTGALADKIQADFGSFDAFIQEFKNA
79 | AATQFGSGWAWLVLDNGTLKVTKTANAVNPMVEGKTPLLTLDVWEHAYYLDFQNARPGFI
80 | DNFIENLVNWDFVAENLASAS
81 | >FeSOD_A0A060HP82 unreviewed Superoxide
82 | MPRRPSHLMANFTLPQLPYAYDALEPHIDATTMQIHHTKHHQAYTDGLNKALGSLDAKFQ
83 | SMDAVDILKNIDTVPENARGAVNFHGGGYNNHTLFWNNMKKGGGGEPSGELADAIKKAFG
84 | SFADFKTKFQTDSVAIQGSGWGWLVKNASGGVQFITMPNQTSPWTRWKAEKLTPLLGLDV
85 | WEHSYYLKYQNRRADYVTAWWNVVNWDEVAKRFKA
86 | >FeSOD_A0A4P7WS39 unreviewed Superoxide
87 | MKITHQLPELPFNKSALNPIITEETFDYHYGKHHAAYVNNLATLIQDTELINFSIEDIIK
88 | KGFYEKNASLFNNAAQHWSHTFFWNCLSPNGGKAPVGRITELITRDFGSFELFKDQFSNA
89 | AIKLFGCGWAWLVQDENDKLEIIAMKEAQTPLILNKKPILTLDVWEHAYYIDYKNARPKF
90 | VEGFWDIVNWDFANKNVI
91 | >FeSOD_A0A076JJX0 unreviewed Superoxide
92 | MPVYTLPELPYDYSALEPYVSGKIMELHHDKHHQAYVNGANQALEQIHDAAESGNVAQSN
93 | LLEKNLAFNLAGHKNHTIFWKNMAPSIGQEPTGELKAAIEDQFGSFEGFQRYFESMCAGI
94 | QGSGWAVLAWDSLGERLVTLQMYDHQGNLPVTIFPLILLDLWEHAYYLDYLNVRADYVKA
95 | WWHIVNWEDASKRFDEVRNLNTNLVK
96 | >FeSOD_G8R729 unreviewed Superoxide
97 | MSFELPDLPYSKSALEPYIDAQTMEIHHDKHHAGYTTKLNDAIEGTELEKQSIEDILKNV
98 | SKHSGGVRNNGGGYYNHSLFWSIMGPDAGGDPTGDVGAAIDDAFGSYENFKTEFSNAAAT
99 | RFGSGWAWLIVNGEGKLEVTSSPNQDNPLMDVAEKKGTPILGLDVWEHAYYLKYQNKRPD
100 | YISAFFNVINWDEVNRRFAEAK
101 |
--------------------------------------------------------------------------------
/test/data/foldseek/FeSOD_h:
--------------------------------------------------------------------------------
1 | FeSOD_A0A1F4ZT98 unreviewed Superoxide
2 | FeSOD_A0A067LT26 unreviewed Superoxide
3 | FeSOD_A0A2E1RF15 unreviewed Superoxide
4 | FeSOD_A0A538G8K1 unreviewed Superoxide
5 | FeSOD_A0A6L8W9C4 unreviewed Superoxide
6 | FeSOD_A0A2H0YVA1 unreviewed Superoxide
7 | FeSOD_R7J7P3 unreviewed Superoxide
8 | FeSOD_B8LFE6 unreviewed Superoxide
9 | FeSOD_A0A1C0AS03 unreviewed Superoxide
10 | FeSOD_A0A2M8Q2V9 unreviewed Superoxide
11 | FeSOD_A0A2E1VW30 unreviewed Superoxide
12 | FeSOD_A0A1V4UH29 unreviewed Superoxide
13 | FeSOD_A0A7V9SPC9 unreviewed Superoxide
14 | FeSOD_R7F5H2 unreviewed Superoxide
15 | FeSOD_A0A2N5YS14 unreviewed Superoxide
16 | FeSOD_A0A0F6MY72 unreviewed Superoxide
17 | FeSOD_A0A060HP82 unreviewed Superoxide
18 | FeSOD_A0A4P7WS39 unreviewed Superoxide
19 | FeSOD_A0A076JJX0 unreviewed Superoxide
20 | FeSOD_G8R729 unreviewed Superoxide
21 |
--------------------------------------------------------------------------------
/test/data/foldseek/FeSOD_h.dbtype:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/test/data/foldseek/FeSOD_h.index:
--------------------------------------------------------------------------------
1 | 0 0 40
2 | 1 40 40
3 | 2 80 40
4 | 3 120 40
5 | 4 160 40
6 | 5 200 40
7 | 6 240 36
8 | 7 276 36
9 | 8 312 40
10 | 9 352 40
11 | 10 392 40
12 | 11 432 40
13 | 12 472 40
14 | 13 512 36
15 | 14 548 40
16 | 15 588 40
17 | 16 628 40
18 | 17 668 40
19 | 18 708 40
20 | 19 748 36
21 |
--------------------------------------------------------------------------------
/test/data/foldseek/FeSOD_ss:
--------------------------------------------------------------------------------
1 | DDDADDAPDDLCLLPPLAHNVLSCCQRNPQLVVLSVLLCVLCVVPDPDDLLVCLVCLVVDDPVSSVSSLQSSLSNLLSSLLSVLFGSDFDAFDPVLQVQCCVQQNHPVSVLVVQLVQLVPDPAWWKWFWFQDPVGTHIDIDHRSHDCSNVVTDTSDMDTCHCSNPCVVCNVPSSSSSVSRSSTTNRVSSNVSSVVD
2 | DDDDDADAAPDDLCLLPPLAHSVLSCCCRPPQLVVLRVQLRVLVVQLVVCVVVVNPVSNVVSLLSNLLSVLSNLLSNLLSVLFANQFFAADDDPLQVLCCVQQNHPVSVLVVQLVQLLPDDFFWKWFWWQQLVVRGTYIDIDHHSPPCDSVRTDTLDIDTSHCSRCCVVQNPPSSSSSVSRVRTTPRVSSNVSSVCSVVVDPDPDPDD
3 | DADAADDQPDDLCLLPPLAHSVLSCCCRNPQLVVLRVQLRVLCPPHPCSPDDLLVCLQPDDDSSNLSSLLSVLSNLLSVQFANPFFAAADDPLQVLCCVQQNHPVSVLVVQLVQLLPDAAWWKWFWWQDPVRRTDIDIDHRRHDCNNVRTDTSDMDTCHCSNPCVRCNPPSSSSSVSNSSTTPRVSSNVSD
4 | DADDQDDFPDDLCLLPPLAHSVLSCCCRPPQLNVLSVQLRVLCPPHPCLVDDLLVCLVPLVPDDPVSSLSNQQSSLSNLLSSLLSVQFANQFFAAADDPLQVLCCVQQNHPVSVLVVQLVQLVPDPAWWKWFWWDPPPGTDIDIDHRSHHCSDPPTRTLDMDTCHCSNPCVVQNVPSSSSSVSSSRTYNRVSSNVSSVVSD
5 | DADDADDQPDDLCLLPPLAHSVLSCCCRNPQLVVLRVQLCVLCVVPDPDDLLVLLLVQLPPPVSVSSNLSSQLSVLSCLLSVLFGNNFFDAADDPLQVQCCVQQNHPVSVLVVQLVQLVPDPAWWKWFWWDDPSGTHIDIDHRSHGSSNVPTDTLDMDTCHCSNCCPVCNPPSSSSSVSCRVRTGDNVSSNVSSD
6 | DADDADDFPDDLCLLPPLAHRVLSCCCRPPQLVVLSVQLRVLCVVPVVLSPDDLLVCLVCLVVDPDDDPSSVSSQLSSLSNLLSVLLSVQLHSPDDDDPVLQVQCCVQQNHPVSVLVVQLVQLLPDDAWWKWFWWADPVRGTDIDIDHRSHDCSNVVTHTSDMDTCHCSNPCVVCNVCSSSSSVSSVVSGDRD
7 | DDDQDDQPDDLCLCPPLAHSVLSCCCRNPQLVVLSVQLCVLCPPHPCNPDDLLVSLQPDDDSSNLSSLLNCLSNVLLVLFAPDFDAADPVLQVLCCVQQNHPVSVLVVQLCQQLPDDAWWKWFWFQDPVRGTDIDTDHSSHGCVVVVTDTLDIDTCHCSNCCVVCNPPSNSSSVSSSSTTPSVSSSVSND
8 | DDADDADDQPDDLCLLPPLFHSVLSCCCRPPQLVVLSVQLRVLVVVLVVCVVVVNCPVCNVVSVVSNLLSSLSNLLSVLLSVLFHSQFFDAADDPLQVLCCVQQNHPVSVLVVQLCQLLPDDAWWKWWWWQQLQVRGTYIDIDHHSSPCDDSSRTDTSDIDTCHCSNCCVVCNPPSSSSSVSRVRTTNRVSSNVSSVVSVVSSPPDDDDD
9 | DADDADDFPDDLCLLPPFAHSVLSCCCRPPQLVVLSVQLNVLVVQLVVCVVVVNPVSNLVSLLSNLQSVLSNLLSNLLSVLFASQFFAAADDPLQVLCCVQQNHPVSVLVVQLVQLLPDDAWWKWFWWQQLVVRGTYIDIDHGSSPDDDPRIDTLDIDTCHCSNPCVVQNPPSSSSSVSSSRTTPRVSSNVSSVSSVVSSVVSD
10 | DADDADDQPDDLCLLPPLAHSVLSCCCRPPQLVVLSVLLRVLCPPHPCSVDDLLVCLVPLVSDDPVSSVSNLQSSLSNLQSSLLSVQFGNNFADAADDPLQVLCCVQQNHPVSVLVVLLCQLLPDPAWWKWFWKQALVLGTDIDIDHRSHDCSNVRTDTSDMDTCHCSNPCVVQNVCSSSSSVSSSRTTNRVSSNVSSVVSVD
11 | DADDADDQPDDLCLLPPLAHSVLSCCCRNPQLVVLSVQLRVLCVVPPPDDLLRCLQPDDDPSNLSSQLNVLSNLLSNLFGNPFFDADDDPLQVQCCVQQNHPVSVLVVQLVQLVPDPAWWKWFWWADPVRGTDIDTDHSSHDSSDVPTDTSDMDTCHCSNPCPVCNPPRSSSSVSCRVTTTDNVSSNVSD
12 | DDDPDADDQDDQPDDLCLLPPLAHSVLSCCCRRPQLSLLSVQLRVLSVVVVVCVVVVHDDDPVVSLLSNLQSSQSNLLSQLLSVLFAAQVLFFAADDDPLQVLCCVQQNHPVSVLVVQLVQLVPDPAWWKWFWWQDLVPRGTYIDIDGRSDPVDDPRTRTSDIDTCHCSNPCVVQNPPSNSSSVSSSRTTNSVSSNVSSVVSVVD
13 | DDDDADDADFQPDDLCLLPPQAHSVLSCCCRPPQLVVLSVQLNVLVVVLVVCVVVVPPVSNVVSLVSNLLSSLSNLLSNLLSVLFGNQFFAAADDPLQVLCCVQQNHPVSVLVVQLVQLLPDPAWWKWFWWQQLVVRGIYIDTDHHSCPCDDPRTQTSDIDGSHCSNPCVVPNPPSSSSSVSRVRTGDRVSSNVSSVVSVVSVVSVVD
14 | DDDADAAPDDLCLLPPLAHSVLSCCCRPPQLVVLSVQLRVLVVVLVVCVVVVNCVSNVVSLVSNLLSVLSNLLSCLLSVQFGAFDPDAFDPVLQVLCCVQQNHPVSVLVVQLVQLVPDPAWWKWFWFQQLVVRGTDIDIDGGSSDCVVSRIDTLDMDTCHCSNPCVVQNPPSSSSSVSRVRTTPRVSSNVSSVVRVVD
15 | DDDADDQPDDLCLLPPLAHSVLSCCCRPPQLVVLRVVLCVLCPPHPCNPDDLLVCLQPDDDSSNLSSLLSCLSNLLSVQFALDDPDADDDPLQVQCCVQQNHPVSVLVVQLVQLLPDAAWWKWFWWQDPSGTHIDIDHRSHGSSNVPTDTSDMDTCHCSNQCVVCNPPSSSSSVSSVVGHPSVSSNVSD
16 | DADDADDAPDDLCQCPDQLAHSVLSCCCRNPQLVVLSVQLRVLCPPDPCNPDDLLVLLLVQLPPPVSVSSNLSSQLSVLSVLLSVLFGNNFFAFADDPLQVQCCVQQNHPVSVLVVQLVLLVPPPAWWKWFFWDDPSGTHIDIDHRSDDCSDVPTDTLDMDTCHCSNCCVVCNVPRSSSSVSCRVTTTDNVSSNVSSVVND
17 | DDDDPPPPPDQDDADDQPDDLCLLPPLAHSVLSCCCRPPQLVVLSVQLRVLCVPDDPVLLPDDLLVCLLPLVSRDPSSSVSNQQSSLSNLLSSLQSVQAGHQFFDAADDPLQVLCCVQQNHPVSVLVVQLVQQLPDDAWWKWWWWADPVRGTDIDIDHRSHDSSDDPPPVRIDTSDMDTSHCSNQCVVQNPPSNSSSVSNVRTTPRVSSNVSNVD
18 | DDDDDDADDQPDDLCLLPPLAHSVLSCCCRNPQLSVLSVQLRVLCPPDPCSPDDLLVLLVVCVVVVNVSSNLSSQLNVLSNLLSVLFGLQFFDADDDPLVVQCCVQQNHPVSVLVVQLCCLLPDDAWWKWFWWADPVRGTDIDTDHHSHHCVVVVTHTLDMDTCHCSNCCVVQNPPSSSSSVSRVSTTNRVSSNVSRD
19 | DDADDADDQPDDLCLLPPQFHSVLSCCCRPPQLVVLSVQLRVLVVQLVVCVVVVNPVSNLVSLLSNLLSVLSNLLSSLLSVLLGNDDDAAADDPLQVLCCVQQNHPVSVLVVQLCQLLPDDAWWKWFWWQQLVVRGIYIDIDHHSPPDDDPRIDTLDIDTCHCSRCCVVQNPPSSSSSVSRVRTTPRVSSNVSSVVSVVVDVVSVD
20 | DADDADAAPDDLCLLPPLAHNVLSCCCRPPQLVVLRVQLRVLCPPHPLLVDDLLVCLVCLVVDDPSSLLSSLSNLLSSLLSVQFGNQLFDFADDPQQVLCCVQQNHPVSVLVVQLVQLVPDPAWWKWFWWQALVLGTDIDIDHRSHDSCPPPVPRRTRTSDMDTSHCSNQCVVQNPPSSSSSVSNVRTTNRVSSNVSSVVSD
21 |
--------------------------------------------------------------------------------
/test/data/foldseek/FeSOD_ss.dbtype:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/test/data/foldseek/FeSOD_ss.index:
--------------------------------------------------------------------------------
1 | 0 0 198
2 | 1 198 210
3 | 2 408 193
4 | 3 601 203
5 | 4 804 197
6 | 5 1001 195
7 | 6 1196 192
8 | 7 1388 212
9 | 8 1600 206
10 | 9 1806 205
11 | 10 2011 192
12 | 11 2203 207
13 | 12 2410 210
14 | 13 2620 200
15 | 14 2820 191
16 | 15 3011 203
17 | 16 3214 217
18 | 17 3431 200
19 | 18 3631 208
20 | 19 3839 204
21 |
--------------------------------------------------------------------------------
/test/data/metadata_FeSOD_20.tsv:
--------------------------------------------------------------------------------
1 | group
2 | FeSOD_A0A1F4ZT98|unreviewed|Superoxide g2
3 | FeSOD_A0A067LT26|unreviewed|Superoxide g3
4 | FeSOD_A0A2E1RF15|unreviewed|Superoxide g1
5 | FeSOD_A0A538G8K1|unreviewed|Superoxide g2
6 | FeSOD_A0A6L8W9C4|unreviewed|Superoxide g1
7 | FeSOD_A0A2H0YVA1|unreviewed|Superoxide g2
8 | FeSOD_R7J7P3|unreviewed|Superoxide g1
9 | FeSOD_B8LFE6|unreviewed|Superoxide g3
10 | FeSOD_A0A1C0AS03|unreviewed|Superoxide g3
11 | FeSOD_A0A2M8Q2V9|unreviewed|Superoxide g2
12 | FeSOD_A0A2E1VW30|unreviewed|Superoxide g1
13 | FeSOD_A0A1V4UH29|unreviewed|Superoxide g2
14 | FeSOD_A0A7V9SPC9|unreviewed|Superoxide g3
15 | FeSOD_R7F5H2|unreviewed|Superoxide g2
16 | FeSOD_A0A2N5YS14|unreviewed|Superoxide g1
17 | FeSOD_A0A0F6MY72|unreviewed|Superoxide g1
18 | FeSOD_A0A060HP82|unreviewed|Superoxide g2
19 | FeSOD_A0A4P7WS39|unreviewed|Superoxide g1
20 | FeSOD_A0A076JJX0|unreviewed|Superoxide g3
21 | FeSOD_G8R729|unreviewed|Superoxide g2
22 |
--------------------------------------------------------------------------------
/test/data/pDONR201.fasta:
--------------------------------------------------------------------------------
1 | >pDONR201
2 | CTTTCCTGCGTTATCCCCTGATTCTGTGGATAACCGTATTACCGCTAGCCAGGAAGAGTTTGTAGAAACGCAAAAAGGCCATCCGTCAGGATGGCCTTCTGCTTAGTTTGATGCCTGGCAGTTTATGGCGGGCGTCCTGCCCGCCACCCTCCGGGCCGTTGCTTCACAACGTTCAAATCCGCTCCCGGCGGATTTGTCCTACTCAGGAGAGCGTTCACCGACAAACAACAGATAAAACGAAAGGCCCAGTCTTCCGACTGAGCCTTTCGTTTTATTTGATGCCTGGCAGTTCCCTACTCTCGCGTTAACGCTAGCATGGATCTCGGGCCCCAAATAATGATTTTATTTTGACTGATAGTGACCTGTTCGTTGCAACAAATTGATGAGCAATGCTTTTTTATAATGCCAAGTTTGTACAAAAAAGCAGAACGAGAAACGTAAAATGATATAAATATCAATATATTAAATTAGATTTTGCATAAAAAACAGACTACATAATACTGTAAAACACAACATATCCAGTCACTATGAATCAACTACTTAGATGGTATTAGTGACCTGTAGTCGACCGACAGCCTTCCAAATGTTCTTCGGGTGATGCTGCCAACTTAGTCGACCGACAGCCTTCCAAATGTTCTTCTCAAACGGAATCGTCGTATCCAGCCTACTCGCTATTGTCCTCAATGCCGTATTAAATCATAAAAAGAAATAAGAAAAAGAGGTGCGAGCCTCTTTTTTGTGTGACAAAATAAAAACATCTACCTATTCATATACGCTAGTGTCATAGTCCTGAAAATCATCTGCATCAAGAACAATTTCACAACTCTTATACTTTTCTCTTACAAGTCGTTCGGCTTCATCTGGATTTTCAGCCTCTATACTTACTAAACGTGATAAAGTTTCTGTAATTTCTACTGTATCGACCTGCAGACTGGCTGTGTATAAGGGAGCCTGACATTTATATTCCCCAGAACATCAGGTTAATGGCGTTTTTGATGTCATTTTCGCGGTGGCTGAGATCAGCCACTTCTTCCCCGATAACGGAGACCGGCACACTGGCCATATCGGTGGTCATCATGCGCCAGCTTTCATCCCCGATATGCACCACCGGGTAAAGTTCACGGGAGACTTTATCTGACAGCAGACGTGCACTGGCCAGGGGGATCACCATCCGTCGCCCGGGCGTGTCAATAATATCACTCTGTACATCCACAAACAGACGATAACGGCTCTCTCTTTTATAGGTGTAAACCTTAAACTGCATTTCACCAGTCCCTGTTCTCGTCAGCAAAAGAGCCGTTCATTTCAATAAACCGGGCGACCTCAGCCATCCCTTCCTGATTTTCCGCTTTCCAGCGTTCGGCACGCAGACGACGGGCTTCATTCTGCATGGTTGTGCTTACCAGACCGGAGATATTGACATCATATATGCCTTGAGCAACTGATAGCTGTCGCTGTCAACTGTCACTGTAATACGCTGCTTCATAGCACACCTCTTTTTGACATACTTCGGGTATACATATCAGTATATATTCTTATACCGCAAAAATCAGCGCGCAAATACGCATACTGTTATCTGGCTTTTAGTAAGCCGGATCCACGCGATTACGCCCCGCCCTGCCACTCATCGCAGTACTGTTGTAATTCATTAAGCATTCTGCCGACATGGAAGCCATCACAGACGGCATGATGAACCTGAATCGCCAGCGGCATCAGCACCTTGTCGCCTTGCGTATAATATTTGCCCATGGTGAAAACGGGGGCGAAGAAGTTGTCCATATTGGCCACGTTTAAATCAAAACTGGTGAAACTCACCCAGGGATTGGCTGAGACGAAAAACATATTCTCAATAAACCCTTTAGGGAAATAGGCCAGGTTTTCACCGTAACACGCCACATCTTGCGAATATATGTGTAGAAACTGCCGGAAATCGTCGTGGTATTCACTCCAGAGCGATGAAAACGTTTCAGTTTGCTCATGGAAAACGGTGTAACAA
GGGTGAACACTATCCCATATCACCAGCTCACCGTCTTTCATTGCCATACGGAATTCCGGATGAGCATTCATCAGGCGGGCAAGAATGTGAATAAAGGCCGGATAAAACTTGTGCTTATTTTTCTTTACGGTCTTTAAAAAGGCCGTAATATCCAGCTGAACGGTCTGGTTATAGGTACATTGAGCAACTGACTGAAATGCCTCAAAATGTTCTTTACGATGCCATTGGGATATATCAACGGTGGTATATCCAGTGATTTTTTTCTCCATTTTAGCTTCCTTAGCTCCTGAAAATCTCGATAACTCAAAAAATACGCCCGGTAGTGATCTTATTTCATTATGGTGAAAGTTGGAACCTCTTACGTGCCGATCAACGTCTCATTTTCGCCAAAAGTTGGCCCAGGGCTTCCCGGTATCAACAGGGACACCAGGATTTATTTATTCTGCGAAGTGATCTTCCGTCACAGGTATTTATTCGGCGCAAAGTGCGTCGGGTGATGCTGCCAACTTAGTCGACTACAGGTCACTAATACCATCTAAGTAGTTGATTCATAGTGACTGGATATGTTGTGTTTTACAGTATTATGTAGTCTGTTTTTTATGCAAAATCTAATTTAATATATTGATATTTATATCATTTTACGTTTCTCGTTCAGCTTTCTTGTACAAAGTGGGCATTATAAGAAAGCATTGCTTATCAATTTGTTGCAACGAACAGGTCACTATCAGTCAAAATAAAATCATTATTTGCCATCCAGCTGCAGCTCTGGCCCGTGTCTCAAAATCTCTGATGTTACATTGCACAAGATAAAAATATATCATCATGAACAATAAAACTGTCTGCTTACATAAACAGTAATACAAGGGGTGTTATGAGCCATATTCAACGGGAAACGTCGAGGCCGCGATTAAATTCCAACATGGATGCTGATTTATATGGGTATAAATGGGCTCGCGATAATGTCGGGCAATCAGGTGCGACAATCTATCGCTTGTATGGGAAGCCCGATGCGCCAGAGTTGTTTCTGAAACATGGCAAAGGTAGCGTTGCCAATGATGTTACAGATGAGATGGTCAGACTAAACTGGCTGACGGAATTTATGCCTCTTCCGACCATCAAGCATTTTATCCGTACTCCTGATGATGCATGGTTACTCACCACTGCGATCCCCGGAAAAACAGCATTCCAGGTATTAGAAGAATATCCTGATTCAGGTGAAAATATTGTTGATGCGCTGGCAGTGTTCCTGCGCCGGTTGCATTCGATTCCTGTTTGTAATTGTCCTTTTAACAGCGATCGCGTATTTCGTCTCGCTCAGGCGCAATCACGAATGAATAACGGTTTGGTTGATGCGAGTGATTTTGATGACGAGCGTAATGGCTGGCCTGTTGAACAAGTCTGGAAAGAAATGCATAAACTTTTGCCATTCTCACCGGATTCAGTCGTCACTCATGGTGATTTCTCACTTGATAACCTTATTTTTGACGAGGGGAAATTAATAGGTTGTATTGATGTTGGACGAGTCGGAATCGCAGACCGATACCAGGATCTTGCCATCCTATGGAACTGCCTCGGTGAGTTTTCTCCTTCATTACAGAAACGGCTTTTTCAAAAATATGGTATTGATAATCCTGATATGAATAAATTGCAGTTTCATTTGATGCTCGATGAGTTTTTCTAATCAGAATTGGTTAATTGGTTGTAACACTGGCAGAGCATTACGCTGACTTGACGGGACGGCGCAAGCTCATGACCAAAATCCCTTAACGTGAGTTTTCGTTCCACTGAGCGTCAGACCCCGTAGAAAAGATCAAAGGATCTTCTTGAGATCCTTTTTTTCTGCGCGTAATCTGCTGCTTGCAAACAAAAAAACCACCGCTACCAGCGGTGGTTTGTTTGCCGGATCAAGAGCTACCAACTCTTTTTCCGAAGGTAACTGGCTTCAGCAGAGCGCAGATACCAAATACTGTCCTTCTAGTGTAGCCGTAGTTAGGCCACCA
CTTCAAGAACTCTGTAGCACCGCCTACATACCTCGCTCTGCTAATCCTGTTACCAGTGGCTGCTGCCAGTGGCGATAAGTCGTGTCTTACCGGGTTGGACTCAAGACGATAGTTACCGGATAAGGCGCAGCGGTCGGGCTGAACGGGGGGTTCGTGCACACAGCCCAGCTTGGAGCGAACGACCTACACCGAACTGAGATACCTACAGCGTGAGCTATGAGAAAGCGCCACGCTTCCCGAAGGGAGAAAGGCGGACAGGTATCCGGTAAGCGGCAGGGTCGGAACAGGAGAGCGCACGAGGGAGCTTCCAGGGGGAAACGCCTGGTATCTTTATAGTCCTGTCGGGTTTCGCCACCTCTGACTTGAGCGTCGATTTTTGTGATGCTCGTCAGGGGGGCGGAGCCTATGGAAAAACGCCAGCAACGCGGCCTTTTTACGGTTCCTGGCCTTTTGCTGGCCTTTTGCTCACATGTT
3 |
--------------------------------------------------------------------------------
/test/data/pDONR201_genemark.gff:
--------------------------------------------------------------------------------
1 | ##gff-version 2
2 | # GeneMark.hmm-2 prokaryotic version: 1.24
3 | # File with sequence: pDONR201.fasta
4 | # File with MetaGeneMark parameters: /mnt/rbg/programs/srj_nextflow/workflows/metagenemark/resources/mgm_11.mod
5 | # translation table: 11
6 | # output date start: Wed Sep 22 12:11:10 2021
7 |
8 | ##sequence-region pDONR201 1 4470
9 | pDONR201 GeneMark.hmm2 CDS 2 106 5.09 + 0 gene_id pDONR201_1; gene_type bacteria; partial 10; gc 49; length 105;
10 | pDONR201 GeneMark.hmm2 CDS 959 1264 20.71 - 0 gene_id pDONR201_2; gene_type bacteria; complete; gc 50; length 306;
11 | pDONR201 GeneMark.hmm2 CDS 1266 1391 5.68 - 0 gene_id pDONR201_3; gene_type bacteria; complete; gc 54; length 126;
12 | pDONR201 GeneMark.hmm2 CDS 1606 2265 41.74 - 0 gene_id pDONR201_4; gene_type bacteria; complete; gc 45; length 660;
13 | pDONR201 GeneMark.hmm2 CDS 2916 3677 51.92 + 0 gene_id pDONR201_5; gene_type bacteria; complete; gc 43; length 762;
14 | pDONR201 GeneMark.hmm2 CDS 4378 4470 2.26 + 0 gene_id pDONR201_6; gene_type bacteria; partial 01; gc 56; length 93;
15 | # pDONR201 total_logodd 127.403 average_length 342 average_density 1.34
16 |
--------------------------------------------------------------------------------
/test/data/pDONR201_multi_genemark.gff:
--------------------------------------------------------------------------------
1 | ##gff-version 2
2 | # GeneMark.hmm-2 prokaryotic version: 1.24
3 | # File with sequence: pDONR201.fasta
4 | # File with MetaGeneMark parameters: /mnt/rbg/programs/srj_nextflow/workflows/metagenemark/resources/mgm_11.mod
5 | # translation table: 11
6 | # output date start: Wed Sep 22 12:11:10 2021
7 |
8 | pDONR201_1 GeneMark.hmm2 CDS 2 106 5.09 + 0 gene_id pDONR201_1; gene_type bacteria; partial 10; gc 49; length 105;
9 | pDONR201_1 GeneMark.hmm2 CDS 959 1264 20.71 - 0 gene_id pDONR201_2; gene_type bacteria; complete; gc 50; length 306;
10 | pDONR201_1 GeneMark.hmm2 CDS 1266 1391 5.68 - 0 gene_id pDONR201_3; gene_type bacteria; complete; gc 54; length 126;
11 | pDONR201_1 GeneMark.hmm2 CDS 1606 2265 41.74 - 0 gene_id pDONR201_4; gene_type bacteria; complete; gc 45; length 660;
12 | pDONR201_1 GeneMark.hmm2 CDS 2916 3677 51.92 + 0 gene_id pDONR201_5; gene_type bacteria; complete; gc 43; length 762;
13 | pDONR201_1 GeneMark.hmm2 CDS 4378 4470 2.26 + 0 gene_id pDONR201_6; gene_type bacteria; partial 01; gc 56; length 93;
14 |
15 | pDONR201_2 GeneMark.hmm2 CDS 2 106 5.09 + 0 gene_id pDONR201_1; gene_type bacteria; partial 10; gc 49; length 105;
16 | pDONR201_2 GeneMark.hmm2 CDS 959 1264 20.71 - 0 gene_id pDONR201_2; gene_type bacteria; complete; gc 50; length 306;
17 | pDONR201_2 GeneMark.hmm2 CDS 1266 1391 5.68 - 0 gene_id pDONR201_3; gene_type bacteria; complete; gc 54; length 126;
18 | pDONR201_2 GeneMark.hmm2 CDS 1606 2265 41.74 - 0 gene_id pDONR201_4; gene_type bacteria; complete; gc 45; length 660;
19 | pDONR201_2 GeneMark.hmm2 CDS 2916 3677 51.92 + 0 gene_id pDONR201_5; gene_type bacteria; complete; gc 43; length 762;
20 | pDONR201_2 GeneMark.hmm2 CDS 4378 4470 2.26 + 0 gene_id pDONR201_6; gene_type bacteria; partial 01; gc 56; length 93;
21 |
22 | pDONR201_3 GeneMark.hmm2 CDS 2 106 5.09 + 0 gene_id pDONR201_1; gene_type bacteria; partial 10; gc 49; length 105;
23 | pDONR201_3 GeneMark.hmm2 CDS 959 1264 20.71 - 0 gene_id pDONR201_2; gene_type bacteria; complete; gc 50; length 306;
24 | pDONR201_3 GeneMark.hmm2 CDS 1266 1391 5.68 - 0 gene_id pDONR201_3; gene_type bacteria; complete; gc 54; length 126;
25 | pDONR201_3 GeneMark.hmm2 CDS 1606 2265 41.74 - 0 gene_id pDONR201_4; gene_type bacteria; complete; gc 45; length 660;
26 | pDONR201_3 GeneMark.hmm2 CDS 2916 3677 51.92 + 0 gene_id pDONR201_5; gene_type bacteria; complete; gc 43; length 762;
27 | pDONR201_3 GeneMark.hmm2 CDS 4378 4470 2.26 + 0 gene_id pDONR201_6; gene_type bacteria; partial 01; gc 56; length 93;
28 |
29 | pDONR201_4 GeneMark.hmm2 CDS 2 106 5.09 + 0 gene_id pDONR201_1; gene_type bacteria; partial 10; gc 49; length 105;
30 | pDONR201_4 GeneMark.hmm2 CDS 959 1264 20.71 - 0 gene_id pDONR201_2; gene_type bacteria; complete; gc 50; length 306;
31 | pDONR201_4 GeneMark.hmm2 CDS 1266 1391 5.68 - 0 gene_id pDONR201_3; gene_type bacteria; complete; gc 54; length 126;
32 | pDONR201_4 GeneMark.hmm2 CDS 1606 2265 41.74 - 0 gene_id pDONR201_4; gene_type bacteria; complete; gc 45; length 660;
33 | pDONR201_4 GeneMark.hmm2 CDS 2916 3677 51.92 + 0 gene_id pDONR201_5; gene_type bacteria; complete; gc 43; length 762;
34 | pDONR201_4 GeneMark.hmm2 CDS 4378 4470 2.26 + 0 gene_id pDONR201_6; gene_type bacteria; partial 01; gc 56; length 93;
--------------------------------------------------------------------------------
/test/data/pDONR201_multi_subset.txt:
--------------------------------------------------------------------------------
1 |
2 | pDONR201_4
3 |
--------------------------------------------------------------------------------
/test/data/pdonr_peptides.fasta:
--------------------------------------------------------------------------------
1 | >pDONR201_1
2 | FPALSPDSVDNRITASQEEFVETQKGHPSGWPSA*
3 | >pDONR201_2
4 | MQFKVYTYKRESRYRLFVDVQSDIIDTPGRRMVIPLASARLLSDKVSRELYPVVHIGDESWRMMTTDMASVPVSVIGEEV
5 | ADLSHRENDIKNAINLMFWGI*
6 | >pDONR201_3
7 | MQNEARRLRAERWKAENQEGMAEVARFIEMNGSFADENRDW*
8 | >pDONR201_4
9 | MEKKITGYTTVDISQWHRKEHFEAFQSVAQCTYNQTVQLDITAFLKTVKKNKHKFYPAFIHILARLMNAHPEFRMAMKDG
10 | ELVIWDSVHPCYTVFHEQTETFSSLWSEYHDDFRQFLHIYSQDVACYGENLAYFPKGFIENMFFVSANPWVSFTSFDLNV
11 | ANMDNFFAPVFTMGKYYTQGDKVLMPLAIQVHHAVCDGFHVGRMLNELQQYCDEWQGGA*
12 | >pDONR201_5
13 | MDADLYGYKWARDNVGQSGATIYRLYGKPDAPELFLKHGKGSVANDVTDEMVRLNWLTEFMPLPTIKHFIRTPDDAWLLT
14 | TAIPGKTAFQVLEEYPDSGENIVDALAVFLRRLHSIPVCNCPFNSDRVFRLAQAQSRMNNGLVDASDFDDERNGWPVEQV
15 | WKEMHKLLPFSPDSVVTHGDFSLDNLIFDEGKLIGCIDVGRVGIADRYQDLAILWNCLGEFSPSLQKRLFQKYGIDNPDM
16 | NKLQFHLMLDEFF*
17 |
--------------------------------------------------------------------------------
/test/data/score3.sparse.tsv:
--------------------------------------------------------------------------------
1 | FeSOD_A0A1F4ZT98|unreviewed|Superoxide FeSOD_A0A1F4ZT98|unreviewed|Superoxide 410.0
2 | FeSOD_A0A1F4ZT98|unreviewed|Superoxide FeSOD_A0A2M8Q2V9|unreviewed|Superoxide 216.0
3 | FeSOD_A0A1F4ZT98|unreviewed|Superoxide FeSOD_A0A538G8K1|unreviewed|Superoxide 199.0
4 | FeSOD_A0A067LT26|unreviewed|Superoxide FeSOD_A0A067LT26|unreviewed|Superoxide 429.0
5 | FeSOD_A0A067LT26|unreviewed|Superoxide FeSOD_A0A1C0AS03|unreviewed|Superoxide 279.0
6 | FeSOD_A0A067LT26|unreviewed|Superoxide FeSOD_A0A076JJX0|unreviewed|Superoxide 277.0
7 | FeSOD_A0A2E1RF15|unreviewed|Superoxide FeSOD_A0A2E1RF15|unreviewed|Superoxide 405.0
8 | FeSOD_A0A2E1RF15|unreviewed|Superoxide FeSOD_A0A6L8W9C4|unreviewed|Superoxide 239.0
9 | FeSOD_A0A2E1RF15|unreviewed|Superoxide FeSOD_A0A2E1VW30|unreviewed|Superoxide 232.0
10 | FeSOD_A0A538G8K1|unreviewed|Superoxide FeSOD_A0A538G8K1|unreviewed|Superoxide 425.0
11 | FeSOD_A0A538G8K1|unreviewed|Superoxide FeSOD_A0A2M8Q2V9|unreviewed|Superoxide 287.0
12 | FeSOD_A0A538G8K1|unreviewed|Superoxide FeSOD_G8R729|unreviewed|Superoxide 229.0
13 | FeSOD_A0A6L8W9C4|unreviewed|Superoxide FeSOD_A0A6L8W9C4|unreviewed|Superoxide 411.0
14 | FeSOD_A0A6L8W9C4|unreviewed|Superoxide FeSOD_A0A0F6MY72|unreviewed|Superoxide 271.0
15 | FeSOD_A0A6L8W9C4|unreviewed|Superoxide FeSOD_A0A2E1VW30|unreviewed|Superoxide 258.0
16 | FeSOD_A0A2H0YVA1|unreviewed|Superoxide FeSOD_A0A2H0YVA1|unreviewed|Superoxide 410.0
17 | FeSOD_A0A2H0YVA1|unreviewed|Superoxide FeSOD_A0A2M8Q2V9|unreviewed|Superoxide 208.0
18 | FeSOD_A0A2H0YVA1|unreviewed|Superoxide FeSOD_G8R729|unreviewed|Superoxide 192.0
19 | FeSOD_R7J7P3|unreviewed|Superoxide FeSOD_R7J7P3|unreviewed|Superoxide 400.0
20 | FeSOD_R7J7P3|unreviewed|Superoxide FeSOD_A0A2N5YS14|unreviewed|Superoxide 209.0
21 | FeSOD_R7J7P3|unreviewed|Superoxide FeSOD_A0A2E1VW30|unreviewed|Superoxide 199.0
22 | FeSOD_B8LFE6|unreviewed|Superoxide FeSOD_B8LFE6|unreviewed|Superoxide 434.0
23 | FeSOD_B8LFE6|unreviewed|Superoxide FeSOD_A0A067LT26|unreviewed|Superoxide 267.0
24 | FeSOD_B8LFE6|unreviewed|Superoxide FeSOD_A0A1C0AS03|unreviewed|Superoxide 264.0
25 | FeSOD_A0A1C0AS03|unreviewed|Superoxide FeSOD_A0A1C0AS03|unreviewed|Superoxide 426.0
26 | FeSOD_A0A1C0AS03|unreviewed|Superoxide FeSOD_A0A067LT26|unreviewed|Superoxide 275.0
27 | FeSOD_A0A1C0AS03|unreviewed|Superoxide FeSOD_B8LFE6|unreviewed|Superoxide 264.0
28 | FeSOD_A0A2M8Q2V9|unreviewed|Superoxide FeSOD_A0A2M8Q2V9|unreviewed|Superoxide 414.0
29 | FeSOD_A0A2M8Q2V9|unreviewed|Superoxide FeSOD_A0A538G8K1|unreviewed|Superoxide 279.0
30 | FeSOD_A0A2M8Q2V9|unreviewed|Superoxide FeSOD_G8R729|unreviewed|Superoxide 261.0
31 | FeSOD_A0A2E1VW30|unreviewed|Superoxide FeSOD_A0A2E1VW30|unreviewed|Superoxide 398.0
32 | FeSOD_A0A2E1VW30|unreviewed|Superoxide FeSOD_A0A6L8W9C4|unreviewed|Superoxide 258.0
33 | FeSOD_A0A2E1VW30|unreviewed|Superoxide FeSOD_A0A0F6MY72|unreviewed|Superoxide 239.0
34 | FeSOD_A0A1V4UH29|unreviewed|Superoxide FeSOD_A0A1V4UH29|unreviewed|Superoxide 424.0
35 | FeSOD_A0A1V4UH29|unreviewed|Superoxide FeSOD_B8LFE6|unreviewed|Superoxide 177.0
36 | FeSOD_A0A1V4UH29|unreviewed|Superoxide FeSOD_A0A067LT26|unreviewed|Superoxide 171.0
37 | FeSOD_A0A7V9SPC9|unreviewed|Superoxide FeSOD_A0A7V9SPC9|unreviewed|Superoxide 430.0
38 | FeSOD_A0A7V9SPC9|unreviewed|Superoxide FeSOD_A0A067LT26|unreviewed|Superoxide 244.0
39 | FeSOD_A0A7V9SPC9|unreviewed|Superoxide FeSOD_A0A1C0AS03|unreviewed|Superoxide 234.0
40 | FeSOD_R7F5H2|unreviewed|Superoxide FeSOD_R7F5H2|unreviewed|Superoxide 413.0
41 | FeSOD_R7F5H2|unreviewed|Superoxide FeSOD_A0A1V4UH29|unreviewed|Superoxide 149.0
42 | FeSOD_R7F5H2|unreviewed|Superoxide FeSOD_A0A7V9SPC9|unreviewed|Superoxide 140.0
43 | FeSOD_A0A2N5YS14|unreviewed|Superoxide FeSOD_A0A2N5YS14|unreviewed|Superoxide 402.0
44 | FeSOD_A0A2N5YS14|unreviewed|Superoxide FeSOD_A0A6L8W9C4|unreviewed|Superoxide 237.0
45 | FeSOD_A0A2N5YS14|unreviewed|Superoxide FeSOD_A0A2E1RF15|unreviewed|Superoxide 233.0
46 | FeSOD_A0A0F6MY72|unreviewed|Superoxide FeSOD_A0A0F6MY72|unreviewed|Superoxide 417.0
47 | FeSOD_A0A0F6MY72|unreviewed|Superoxide FeSOD_A0A6L8W9C4|unreviewed|Superoxide 275.0
48 | FeSOD_A0A0F6MY72|unreviewed|Superoxide FeSOD_A0A2E1VW30|unreviewed|Superoxide 243.0
49 | FeSOD_A0A060HP82|unreviewed|Superoxide FeSOD_A0A060HP82|unreviewed|Superoxide 451.0
50 | FeSOD_A0A060HP82|unreviewed|Superoxide FeSOD_A0A2M8Q2V9|unreviewed|Superoxide 226.0
51 | FeSOD_A0A060HP82|unreviewed|Superoxide FeSOD_G8R729|unreviewed|Superoxide 208.0
52 | FeSOD_A0A4P7WS39|unreviewed|Superoxide FeSOD_A0A4P7WS39|unreviewed|Superoxide 419.0
53 | FeSOD_A0A4P7WS39|unreviewed|Superoxide FeSOD_A0A2E1RF15|unreviewed|Superoxide 234.0
54 | FeSOD_A0A4P7WS39|unreviewed|Superoxide FeSOD_A0A0F6MY72|unreviewed|Superoxide 208.0
55 | FeSOD_A0A076JJX0|unreviewed|Superoxide FeSOD_A0A076JJX0|unreviewed|Superoxide 429.0
56 | FeSOD_A0A076JJX0|unreviewed|Superoxide FeSOD_A0A067LT26|unreviewed|Superoxide 271.0
57 | FeSOD_A0A076JJX0|unreviewed|Superoxide FeSOD_A0A1C0AS03|unreviewed|Superoxide 248.0
58 | FeSOD_G8R729|unreviewed|Superoxide FeSOD_G8R729|unreviewed|Superoxide 422.0
59 | FeSOD_G8R729|unreviewed|Superoxide FeSOD_A0A2M8Q2V9|unreviewed|Superoxide 271.0
60 | FeSOD_G8R729|unreviewed|Superoxide FeSOD_A0A538G8K1|unreviewed|Superoxide 231.0
61 |
--------------------------------------------------------------------------------
/test/data/scorefull.dense.hdf5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nebiolabs/domainator/663454806768f7462b33ade914684c6b3487dab2/test/data/scorefull.dense.hdf5
--------------------------------------------------------------------------------
/test/data/scorefull.tsv:
--------------------------------------------------------------------------------
1 | 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19
2 | FeSOD_A0A1F4ZT98|unreviewed|Superoxide 410.0 120.0 169.0 199.0 161.0 189.0 157.0 132.0 134.0 216.0 165.0 119.0 112.0 107.0 160.0 151.0 178.0 147.0 127.0 187.0
3 | FeSOD_A0A067LT26|unreviewed|Superoxide 125.0 429.0 136.0 153.0 130.0 119.0 111.0 271.0 279.0 162.0 128.0 174.0 247.0 138.0 135.0 115.0 166.0 130.0 277.0 153.0
4 | FeSOD_A0A2E1RF15|unreviewed|Superoxide 168.0 131.0 405.0 186.0 239.0 150.0 187.0 130.0 136.0 221.0 232.0 140.0 119.0 97.4 228.0 227.0 154.0 228.0 130.0 215.0
5 | FeSOD_A0A538G8K1|unreviewed|Superoxide 199.0 149.0 187.0 425.0 180.0 183.0 139.0 150.0 166.0 287.0 159.0 125.0 167.0 91.3 169.0 174.0 205.0 148.0 148.0 229.0
6 | FeSOD_A0A6L8W9C4|unreviewed|Superoxide 162.0 125.0 240.0 181.0 411.0 134.0 174.0 128.0 128.0 191.0 258.0 131.0 110.0 86.7 234.0 271.0 144.0 201.0 115.0 189.0
7 | FeSOD_A0A2H0YVA1|unreviewed|Superoxide 191.0 117.0 153.0 184.0 138.0 410.0 134.0 135.0 138.0 208.0 126.0 124.0 120.0 109.0 158.0 144.0 175.0 132.0 127.0 192.0
8 | FeSOD_R7J7P3|unreviewed|Superoxide 161.0 110.0 192.0 143.0 178.0 136.0 400.0 109.0 104.0 154.0 199.0 122.0 89.7 104.0 209.0 171.0 119.0 191.0 108.0 152.0
9 | FeSOD_B8LFE6|unreviewed|Superoxide 130.0 267.0 130.0 150.0 128.0 132.0 105.0 434.0 264.0 162.0 122.0 175.0 214.0 109.0 129.0 120.0 158.0 121.0 239.0 150.0
10 | FeSOD_A0A1C0AS03|unreviewed|Superoxide 134.0 275.0 137.0 166.0 128.0 136.0 100.0 264.0 426.0 172.0 127.0 167.0 232.0 130.0 124.0 128.0 169.0 118.0 249.0 159.0
11 | FeSOD_A0A2M8Q2V9|unreviewed|Superoxide 209.0 149.0 214.0 279.0 184.0 199.0 142.0 155.0 165.0 414.0 180.0 135.0 155.0 108.0 196.0 203.0 221.0 163.0 139.0 261.0
12 | FeSOD_A0A2E1VW30|unreviewed|Superoxide 165.0 124.0 233.0 160.0 258.0 125.0 196.0 122.0 127.0 188.0 398.0 124.0 101.0 73.6 219.0 239.0 151.0 199.0 115.0 185.0
13 | FeSOD_A0A1V4UH29|unreviewed|Superoxide 120.0 171.0 142.0 127.0 131.0 123.0 119.0 177.0 169.0 143.0 126.0 424.0 162.0 150.0 137.0 117.0 140.0 126.0 169.0 131.0
14 | FeSOD_A0A7V9SPC9|unreviewed|Superoxide 115.0 244.0 122.0 170.0 112.0 119.0 87.8 216.0 234.0 166.0 103.0 163.0 430.0 141.0 122.0 118.0 145.0 118.0 222.0 148.0
15 | FeSOD_R7F5H2|unreviewed|Superoxide 108.0 135.0 99.8 93.2 87.4 109.0 103.0 111.0 132.0 116.0 75.9 149.0 140.0 413.0 106.0 69.7 115.0 110.0 135.0 115.0
16 | FeSOD_A0A2N5YS14|unreviewed|Superoxide 163.0 134.0 233.0 172.0 237.0 159.0 207.0 133.0 127.0 206.0 222.0 139.0 124.0 108.0 402.0 223.0 144.0 198.0 132.0 185.0
17 | FeSOD_A0A0F6MY72|unreviewed|Superoxide 155.0 114.0 231.0 178.0 275.0 146.0 170.0 124.0 131.0 215.0 243.0 119.0 120.0 71.2 224.0 417.0 137.0 207.0 112.0 190.0
18 | FeSOD_A0A060HP82|unreviewed|Superoxide 177.0 158.0 152.0 202.0 141.0 171.0 113.0 155.0 166.0 226.0 148.0 137.0 140.0 110.0 137.0 130.0 451.0 122.0 158.0 208.0
19 | FeSOD_A0A4P7WS39|unreviewed|Superoxide 152.0 129.0 234.0 153.0 205.0 134.0 191.0 126.0 122.0 174.0 203.0 130.0 121.0 112.0 200.0 208.0 129.0 419.0 138.0 176.0
20 | FeSOD_A0A076JJX0|unreviewed|Superoxide 126.0 271.0 130.0 147.0 114.0 124.0 103.0 238.0 248.0 146.0 114.0 166.0 219.0 132.0 128.0 107.0 160.0 132.0 429.0 151.0
21 | FeSOD_G8R729|unreviewed|Superoxide 190.0 150.0 218.0 231.0 191.0 192.0 151.0 152.0 161.0 271.0 187.0 133.0 148.0 115.0 183.0 188.0 213.0 173.0 155.0 422.0
22 |
--------------------------------------------------------------------------------
/test/data/simple_genpept.gb:
--------------------------------------------------------------------------------
1 | LOCUS pDONR201_1 35 aa UNK 19-OCT-2021
2 | DEFINITION .
3 | ACCESSION pDONR201_1
4 | VERSION pDONR201_1
5 | KEYWORDS .
6 | SOURCE .
7 | ORGANISM .
8 | .
9 | FEATURES Location/Qualifiers
10 | ORIGIN
11 | 1 fpalspdsvd nritasqeef vetqkghpsg wpsa*
12 | //
13 | LOCUS pDONR201_2 102 aa UNK 19-OCT-2021
14 | DEFINITION .
15 | ACCESSION pDONR201_2
16 | VERSION pDONR201_2
17 | KEYWORDS .
18 | SOURCE .
19 | ORGANISM .
20 | .
21 | FEATURES Location/Qualifiers
22 | Domainator 2..100
23 | /program="hmmsearch"
24 | /name="CcdB"
25 | /description="CcdB protein"
26 | /evalue="1.1e-32"
27 | /score="103.0"
28 | /cds_id="958_-1_1264"
29 | /database="Pfam-A"
30 | ORIGIN
31 | 1 mqfkvytykr esryrlfvdv qsdiidtpgr rmviplasar llsdkvsrel ypvvhigdes
32 | 61 wrmmttdmas vpvsvigeev adlshrendi knainlmfwg i*
33 | //
34 | LOCUS pDONR201_3 42 aa UNK 19-OCT-2021
35 | DEFINITION .
36 | ACCESSION pDONR201_3
37 | VERSION pDONR201_3
38 | KEYWORDS .
39 | SOURCE .
40 | ORGANISM .
41 | .
42 | FEATURES Location/Qualifiers
43 | Domainator 1..41
44 | /program="hmmsearch"
45 | /name="CcdA"
46 | /description="Post-segregation antitoxin CcdA"
47 | /evalue="3.8e-16"
48 | /score="50.1"
49 | /cds_id="1265_-1_1391"
50 | /database="Pfam-A"
51 | ORIGIN
52 | 1 mqnearrlra erwkaenqeg maevarfiem ngsfadenrd w*
53 | //
54 | LOCUS pDONR201_4 220 aa UNK 19-OCT-2021
55 | DEFINITION .
56 | ACCESSION pDONR201_4
57 | VERSION pDONR201_4
58 | KEYWORDS .
59 | SOURCE .
60 | ORGANISM .
61 | .
62 | FEATURES Location/Qualifiers
63 | Domainator 11..212
64 | /program="hmmsearch"
65 | /name="CAT"
66 | /description="Chloramphenicol acetyltransferase"
67 | /evalue="9.7e-102"
68 | /score="329.7"
69 | /cds_id="1605_-1_2265"
70 | /database="Pfam-A"
71 | Domainator 2..33
72 | /program="hmmsearch"
73 | /name="Condensation"
74 | /description="Condensation domain"
75 | /evalue="0.00015"
76 | /score="11.2"
77 | /cds_id="1605_-1_2265"
78 | /database="Pfam-A"
79 | Domainator 6..36
80 | /program="hmmsearch"
81 | /name="2-oxoacid_dh"
82 | /description="2-oxoacid dehydrogenases acyltransferase
83 | (catalytic domain)"
84 | /evalue="0.0037"
85 | /score="7.3"
86 | /cds_id="1605_-1_2265"
87 | /database="Pfam-A"
88 | Domainator 137..196
89 | /program="hmmsearch"
90 | /name="2-oxoacid_dh"
91 | /description="2-oxoacid dehydrogenases acyltransferase
92 | (catalytic domain)"
93 | /evalue="0.0047"
94 | /score="7.0"
95 | /cds_id="1605_-1_2265"
96 | /database="Pfam-A"
97 | Domainator 139..159
98 | /program="hmmsearch"
99 | /name="Condensation"
100 | /description="Condensation domain"
101 | /evalue="0.81"
102 | /score="-1.2"
103 | /cds_id="1605_-1_2265"
104 | /database="Pfam-A"
105 | ORIGIN
106 | 1 mekkitgytt vdisqwhrke hfeafqsvaq ctynqtvqld itaflktvkk nkhkfypafi
107 | 61 hilarlmnah pefrmamkdg elviwdsvhp cytvfheqte tfsslwseyh ddfrqflhiy
108 | 121 sqdvacygen layfpkgfie nmffvsanpw vsftsfdlnv anmdnffapv ftmgkyytqg
109 | 181 dkvlmplaiq vhhavcdgfh vgrmlnelqq ycdewqgga*
110 | //
111 | LOCUS pDONR201_5 254 aa UNK 19-OCT-2021
112 | DEFINITION .
113 | ACCESSION pDONR201_5
114 | VERSION pDONR201_5
115 | KEYWORDS .
116 | SOURCE .
117 | ORGANISM .
118 | .
119 | FEATURES Location/Qualifiers
120 | Domainator 48..238
121 | /program="hmmsearch"
122 | /name="APH"
123 | /description="Phosphotransferase enzyme family"
124 | /evalue="2e-28"
125 | /score="90.5"
126 | /cds_id="2915_1_3677"
127 | /database="Pfam-A"
128 | Domainator 174..218
129 | /program="hmmsearch"
130 | /name="TCAD9"
131 | /description="Ternary complex associated domain 9"
132 | /evalue="0.00029"
133 | /score="10.5"
134 | /cds_id="2915_1_3677"
135 | /database="Pfam-A"
136 | ORIGIN
137 | 1 mdadlygykw ardnvgqsga tiyrlygkpd apelflkhgk gsvandvtde mvrlnwltef
138 | 61 mplptikhfi rtpddawllt taipgktafq vleeypdsge nivdalavfl rrlhsipvcn
139 | 121 cpfnsdrvfr laqaqsrmnn glvdasdfdd erngwpveqv wkemhkllpf spdsvvthgd
140 | 181 fsldnlifde gkligcidvg rvgiadryqd lailwnclge fspslqkrlf qkygidnpdm
141 | 241 nklqfhlmld eff*
142 | //
143 |
--------------------------------------------------------------------------------
/test/data/simple_genpept_contigs.txt:
--------------------------------------------------------------------------------
1 | pDONR201_1
2 | pDONR201_5
3 |
--------------------------------------------------------------------------------
/test/data/simple_genpept_quote_name.gb:
--------------------------------------------------------------------------------
1 | LOCUS pDONR201_1 35 aa UNK 19-OCT-2021
2 | DEFINITION .
3 | ACCESSION pDONR201_1
4 | VERSION pDONR201_1
5 | KEYWORDS .
6 | SOURCE .
7 | ORGANISM .
8 | .
9 | FEATURES Location/Qualifiers
10 | ORIGIN
11 | 1 fpalspdsvd nritasqeef vetqkghpsg wpsa*
12 | //
13 | LOCUS pDONR201_2 102 aa UNK 19-OCT-2021
14 | DEFINITION .
15 | ACCESSION pDONR201_2
16 | VERSION pDONR201_2
17 | KEYWORDS .
18 | SOURCE .
19 | ORGANISM .
20 | .
21 | FEATURES Location/Qualifiers
22 | Domainator 2..100
23 | /program="phmmer"
24 | /name="""CcdB"""
25 | /description="CcdB protein"
26 | /evalue="1.1e-32"
27 | /score="103.0"
28 | /cds_id="958_-1_1264"
29 | /database="Pfam-A"
30 | ORIGIN
31 | 1 mqfkvytykr esryrlfvdv qsdiidtpgr rmviplasar llsdkvsrel ypvvhigdes
32 | 61 wrmmttdmas vpvsvigeev adlshrendi knainlmfwg i*
33 | //
34 |
--------------------------------------------------------------------------------
/test/data/ssn_FeSOD_clusters.tsv:
--------------------------------------------------------------------------------
1 | FeSOD_A0A1F4ZT98|unreviewed|Superoxide 1
2 | FeSOD_A0A067LT26|unreviewed|Superoxide 2
3 | FeSOD_A0A2E1RF15|unreviewed|Superoxide 1
4 | FeSOD_A0A538G8K1|unreviewed|Superoxide 1
5 | FeSOD_A0A6L8W9C4|unreviewed|Superoxide 1
6 | FeSOD_A0A2H0YVA1|unreviewed|Superoxide 1
7 | FeSOD_R7J7P3|unreviewed|Superoxide 1
8 | FeSOD_B8LFE6|unreviewed|Superoxide 2
9 | FeSOD_A0A1C0AS03|unreviewed|Superoxide 2
10 | FeSOD_A0A2M8Q2V9|unreviewed|Superoxide 1
11 | FeSOD_A0A2E1VW30|unreviewed|Superoxide 1
12 | FeSOD_A0A1V4UH29|unreviewed|Superoxide 2
13 | FeSOD_A0A7V9SPC9|unreviewed|Superoxide 2
14 | FeSOD_R7F5H2|unreviewed|Superoxide 3
15 | FeSOD_A0A2N5YS14|unreviewed|Superoxide 1
16 | FeSOD_A0A0F6MY72|unreviewed|Superoxide 1
17 | FeSOD_A0A060HP82|unreviewed|Superoxide 1
18 | FeSOD_A0A4P7WS39|unreviewed|Superoxide 1
19 | FeSOD_A0A076JJX0|unreviewed|Superoxide 2
20 | FeSOD_G8R729|unreviewed|Superoxide 1
21 |
--------------------------------------------------------------------------------
/test/data/ssn_FeSOD_clusters_header.tsv:
--------------------------------------------------------------------------------
1 | contig cluster
2 | FeSOD_A0A1F4ZT98|unreviewed|Superoxide 1
3 | FeSOD_A0A067LT26|unreviewed|Superoxide 2
4 | FeSOD_A0A2E1RF15|unreviewed|Superoxide 1
5 | FeSOD_A0A538G8K1|unreviewed|Superoxide 1
6 | FeSOD_A0A6L8W9C4|unreviewed|Superoxide 1
7 | FeSOD_A0A2H0YVA1|unreviewed|Superoxide 1
8 | FeSOD_R7J7P3|unreviewed|Superoxide 1
9 | FeSOD_B8LFE6|unreviewed|Superoxide 2
10 | FeSOD_A0A1C0AS03|unreviewed|Superoxide 2
11 | FeSOD_A0A2M8Q2V9|unreviewed|Superoxide 1
12 | FeSOD_A0A2E1VW30|unreviewed|Superoxide 1
13 | FeSOD_A0A1V4UH29|unreviewed|Superoxide 2
14 | FeSOD_A0A7V9SPC9|unreviewed|Superoxide 2
15 | FeSOD_R7F5H2|unreviewed|Superoxide 3
16 | FeSOD_A0A2N5YS14|unreviewed|Superoxide 1
17 | FeSOD_A0A0F6MY72|unreviewed|Superoxide 1
18 | FeSOD_A0A060HP82|unreviewed|Superoxide 1
19 | FeSOD_A0A4P7WS39|unreviewed|Superoxide 1
20 | FeSOD_A0A076JJX0|unreviewed|Superoxide 2
21 | FeSOD_G8R729|unreviewed|Superoxide 1
22 |
--------------------------------------------------------------------------------
/test/data/swissprot_CuSOD_subset.fasta:
--------------------------------------------------------------------------------
1 | >sp|P0AGD1|SODC_ECOLI Superoxide dismutase [Cu-Zn] OS=Escherichia coli (strain K12) OX=562 GN=sodC PE=1 SV=1
2 | MKRFSLAILALVVATGAQAASEKVEMNLVTSQGVGQSIGSVTITETDKGLEFSPDLKALP
3 | PGEHGFHIHAKGSCQPATKDGKASAAESAGGHLDPQNTGKHEGPEGAGHLGDLPALVVNN
4 | DGKATDAVIAPRLKSLDEIKDKALMVHVGGDNMSDQPKPLGGGGERYACGVIK
5 | >sp|O31851|YOJM_BACSU Superoxide dismutase-like protein YojM OS=Bacillus subtilis (strain 168) OX=1423 GN=yojM PE=1 SV=1
6 | MHRLLLLMMLTALGVAGCGQKKPPDPPNRVPEKKVVETSAFGHHVQLVNREGKAVGFIEI
7 | KESDDEGLDIHISANSLRPGASLGFHIYEKGSCVRPDFESAGGPFNPLNKEHGFNNPMGH
8 | HAGDLPNLEVGADGKVDVIMNAPDTSLKKGSKLNILDEDGSAFIIHEQADDYLTNPSGNS
9 | GARIVCGALLGNNEKQ
--------------------------------------------------------------------------------
/test/data/taxdmp/delnodes.dmp:
--------------------------------------------------------------------------------
1 | 1985417 |
--------------------------------------------------------------------------------
/test/data/taxdmp/merged.dmp:
--------------------------------------------------------------------------------
1 | 17 | 561 |
--------------------------------------------------------------------------------
/test/data/taxdmp/nodes.dmp:
--------------------------------------------------------------------------------
1 | 1 | 1 | no rank | | 8 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | |
2 | 131567 | 1 | no rank | | 8 | 1 | 1 | 1 | 0 | 1 | 1 | 0 | |
3 | 2 | 131567 | superkingdom | | 0 | 0 | 11 | 0 | 0 | 0 | 0 | 0 | |
4 | 2759 | 131567 | superkingdom | | 0 | 0 | 11 | 0 | 0 | 0 | 0 | 0 | |
5 | 1224 | 2 | phylum | | 0 | 1 | 11 | 1 | 0 | 1 | 0 | 0 | |
6 | 1236 | 1224 | class | | 0 | 1 | 11 | 1 | 0 | 1 | 0 | 0 | |
7 | 91347 | 1236 | order | | 0 | 1 | 11 | 1 | 0 | 1 | 0 | 0 | |
8 | 543 | 91347 | family | | 0 | 1 | 11 | 1 | 0 | 1 | 0 | 0 | |
9 | 561 | 543 | genus | | 0 | 1 | 11 | 1 | 0 | 1 | 0 | 0 | |
10 | 17 | 543 | genus | | 8 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | |
11 | 562 | 17 | species | EC | 0 | 1 | 11 | 1 | 0 | 1 | 1 | 0 | |
12 | 1783272 | 2 | clade | | 0 | 1 | 11 | 1 | 0 | 1 | 1 | 0 | |
13 | 1239 | 1783272 | phylum | | 0 | 1 | 11 | 1 | 0 | 1 | 0 | 0 | |
14 | 91061 | 1239 | class | | 0 | 1 | 11 | 1 | 0 | 1 | 0 | 0 | |
15 | 1385 | 91061 | order | | 0 | 1 | 11 | 1 | 0 | 1 | 0 | 0 | |
16 | 186817 | 1385 | family | | 0 | 1 | 11 | 1 | 0 | 1 | 0 | 0 | |
17 | 1386 | 186817 | genus | | 0 | 1 | 11 | 1 | 0 | 1 | 0 | 0 | |
18 | 653685 | 1386 | species group | | 0 | 1 | 11 | 1 | 0 | 1 | 1 | 0 | |
19 | 1423 | 653685 | species | BS | 0 | 1 | 11 | 1 | 0 | 1 | 1 | 0 | |
20 |
--------------------------------------------------------------------------------
/test/data/taxdmp/taxdump.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nebiolabs/domainator/663454806768f7462b33ade914684c6b3487dab2/test/data/taxdmp/taxdump.tar.gz
--------------------------------------------------------------------------------
/test/data/test_matrix.dense.hdf5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nebiolabs/domainator/663454806768f7462b33ade914684c6b3487dab2/test/data/test_matrix.dense.hdf5
--------------------------------------------------------------------------------
/test/data/test_matrix.dense.tsv:
--------------------------------------------------------------------------------
1 | X Y Z
2 | A 1 2 3
3 | B 4 5 6
4 | C 7 8 9
5 |
--------------------------------------------------------------------------------
/test/data/test_matrix.sparse.hdf5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nebiolabs/domainator/663454806768f7462b33ade914684c6b3487dab2/test/data/test_matrix.sparse.hdf5
--------------------------------------------------------------------------------
/test/data/thymidylate_synthase.fasta:
--------------------------------------------------------------------------------
1 | >Thymidylate_synthase
2 | mkqylelmqkvldegtqkndrtgtgtlsifghqmrfnlqegfplvttkrchlrsiihellwflqgdtniaylhennvtiwdewadengdlgpvygkqwrawptpdgrhidqiatvlsqlk
--------------------------------------------------------------------------------
/test/helpers.py:
--------------------------------------------------------------------------------
1 | from domainator.Bio import SeqIO
2 |
def compare_files(f1, f2, skip_lines=0):
    """Assert that two text files have identical contents.

    The first ``skip_lines`` lines of each file are read and discarded before
    the remainder of both files is compared as a single string.
    """
    with open(f1, "r") as candidate, open(f2, "r") as reference:
        for _ in range(skip_lines):
            candidate.readline()
            reference.readline()
        candidate_rest = candidate.read()
        reference_rest = reference.read()
        assert candidate_rest == reference_rest
9 |
def compare_iterables(i1, i2):
    """Assert that two iterables yield equal items in the same order.

    Unlike a plain ``zip``, which stops at the shorter input, a length
    mismatch is treated as a failure so that a truncated result cannot
    silently pass.
    """
    from itertools import zip_longest

    # Unique sentinel: marks the position where one iterable ran out.
    missing = object()
    assert all(
        a is not missing and b is not missing and a == b
        for a, b in zip_longest(i1, i2, fillvalue=missing)
    )
12 |
def compare_seqfiles(gb1, gb2, format="genbank", skip_attrs={}, skip_qualifiers={}):
    """Parse two sequence files and assert their records match pairwise.

    Both files are read fully with ``SeqIO.parse`` in the given ``format``;
    they must contain the same number of records, and each pair of records
    at the same position is checked with ``compare_seqrecords``.
    """
    first_records = list(SeqIO.parse(gb1, format))
    second_records = list(SeqIO.parse(gb2, format))
    assert len(first_records) == len(second_records)
    for first, second in zip(first_records, second_records):
        compare_seqrecords(first, second, skip_attrs=skip_attrs, skip_qualifiers=skip_qualifiers)
19 |
def compare_seqrecords(rec1, rec2, skip_attrs={}, skip_qualifiers={}):
    """Assert that two sequence records are equivalent.

    Compared: the ``seq``, ``id``, ``description`` and ``name`` attributes
    (minus any listed in ``skip_attrs``), the letter annotations, every
    annotation present on ``rec1`` except ``"date"``, the number of features,
    and, feature by feature, every qualifier present on the ``rec1`` feature
    (minus any listed in ``skip_qualifiers``).

    Raises:
        AssertionError: when a compared value differs.
        KeyError: when an annotation or qualifier of ``rec1`` is absent
            from ``rec2``.
    """
    skip_attrs = set(skip_attrs)
    skip_qualifiers = set(skip_qualifiers)

    for attr in {"seq", "id", "description", "name"} - skip_attrs:
        try:
            assert getattr(rec1, attr) == getattr(rec2, attr)
        except AssertionError as e:
            # Attach the offending attribute and both records to the failure.
            e.args += (attr, rec1, rec2)
            raise

    # Mapping equality already covers every key/value pair.
    assert rec1.letter_annotations == rec2.letter_annotations
    for key, value in rec1.annotations.items():
        if key != "date":
            assert value == rec2.annotations[key]
    assert len(rec1.features) == len(rec2.features)

    for feature1, feature2 in zip(rec1.features, rec2.features):
        for qualifier, value1 in feature1.qualifiers.items():
            if qualifier in skip_qualifiers:
                continue
            try:
                assert value1 == feature2.qualifiers[qualifier], f"qualifiers not equal in: {rec1}, {rec2}"
            except (AssertionError, KeyError):
                print(f"{feature1}, {feature2}")
                # .get() keeps the diagnostic itself from raising when the
                # qualifier is missing from the second feature.
                print(f"{value1}, {feature2.qualifiers.get(qualifier)}")
                raise
57 |
--------------------------------------------------------------------------------
/test/test_SeqFeature.py:
--------------------------------------------------------------------------------
1 | import os
2 | from domainator.Bio.SeqFeature import SeqFeature, FeatureLocation, CompoundLocation
3 | import tempfile
4 | from glob import glob
5 | import pytest
6 |
7 |
def test_overlay_1():
    """
    Check the parts returned by CompoundLocation.overlay(4, 40) on a six-part location
    """
    spans = [(1, 10), (20, 30), (40, 50), (60, 70), (80, 90), (100, 110)]
    compound = CompoundLocation([FeatureLocation(start, end) for start, end in spans])

    overlaid = compound.overlay(4, 40)

    expected_parts = [(5, 10), (20, 30), (40, 50), (60, 70), (80, 85)]
    assert len(overlaid.parts) == 5
    for index, (start, end) in enumerate(expected_parts):
        assert overlaid.parts[index].start == start
        assert overlaid.parts[index].end == end
32 |
33 |
def test_overlaps():
    """
    Test CompoundLocation.overlaps on compound locations built from six
    end-to-end FeatureLocations (1-10, 10-40, 40-50, 50-70, 70-90, 90-110)
    """
    boundaries = [1, 10, 40, 50, 70, 90, 110]
    loc1, loc2, loc3, loc4, loc5, loc6 = [
        FeatureLocation(start, end) for start, end in zip(boundaries, boundaries[1:])
    ]

    cloc1 = CompoundLocation([loc1, loc2, loc3, loc4, loc5, loc6])
    cloc2 = CompoundLocation([loc1, loc2, loc3])
    cloc3 = CompoundLocation([loc4, loc5, loc6])
    cloc4 = CompoundLocation([loc2, loc3, loc4])
    cloc5 = CompoundLocation([loc3, loc4, loc5])
    cloc6 = CompoundLocation([loc2, loc3, loc5])
    cloc7 = CompoundLocation([loc1, loc6])

    # (left operand, right operand, expected result of left.overlaps(right))
    cases = [
        (cloc1, cloc2, True),
        (cloc1, cloc3, True),
        (cloc2, cloc3, False),
        (cloc2, cloc4, True),
        (cloc3, cloc4, True),
        (cloc3, cloc5, True),
        (cloc4, cloc5, True),
        (cloc4, cloc6, True),
        (cloc5, cloc6, True),
        (cloc1, cloc7, True),
        (cloc2, cloc7, True),
        (cloc3, cloc7, True),
        (cloc4, cloc7, False),
        (cloc5, cloc7, False),
        (cloc6, cloc7, False),
    ]
    for left, right, expected in cases:
        assert left.overlaps(right) == expected
--------------------------------------------------------------------------------
/test/test_build_ssn.py:
--------------------------------------------------------------------------------
1 | from domainator import build_ssn
2 | import pytest
3 | import tempfile
4 | import pandas as pd
5 | from pathlib import Path
6 | from helpers import compare_files
7 | import re
8 |
@pytest.mark.parametrize(
    "input_file,expected_output",
    [
        ["FeSOD_dist.tsv", "ssn_FeSOD.xgmml"],
        ["FeSOD_dist.sparse.hdf5", "ssn_FeSOD.sparse.xgmml"],
        ["FeSOD_dist.dense.hdf5", "ssn_FeSOD.xgmml"],
    ],
)
def test_build_ssn(input_file, expected_output, shared_datadir):
    """Run build_ssn with ``--no_cluster_header`` and compare both outputs to stored references.

    The XGMML comparison skips the first two lines of each file.
    """
    with tempfile.TemporaryDirectory() as output_dir:
        # output_dir = "test_out"
        cluster_path = f"{output_dir}/{input_file}_out_clusters.tsv"
        xgmml_path = f"{output_dir}/{input_file}_out.xgmml"
        argv = [
            "-i", str(shared_datadir / input_file),
            "--xgmml", xgmml_path,
            "--lb", "175",
            "--color_by", "SSN_cluster",
            "--cluster_tsv", cluster_path,
            "--no_cluster_header",
            "--metadata", str(shared_datadir / "FeSOD_metadata.tsv"),
        ]
        build_ssn.main(argv)

        for produced in (xgmml_path, cluster_path):
            assert Path(produced).is_file()
        compare_files(cluster_path, shared_datadir / 'ssn_FeSOD_clusters.tsv')
        compare_files(xgmml_path, shared_datadir / expected_output, skip_lines=2)
26 |
@pytest.mark.parametrize(
    "input_file,expected_output",
    [
        ["FeSOD_dist.tsv", "ssn_FeSOD.xgmml"],
    ],
)
def test_build_ssn_2(input_file, expected_output, shared_datadir):
    """Run build_ssn without ``--no_cluster_header`` and compare against the header-bearing cluster reference.

    The XGMML comparison skips the first two lines of each file.
    """
    with tempfile.TemporaryDirectory() as output_dir:
        # output_dir = "test_out"
        cluster_path = f"{output_dir}/{input_file}_out_clusters.tsv"
        xgmml_path = f"{output_dir}/{input_file}_out.xgmml"
        argv = [
            "-i", str(shared_datadir / input_file),
            "--xgmml", xgmml_path,
            "--lb", "175",
            "--color_by", "SSN_cluster",
            "--cluster_tsv", cluster_path,
            "--metadata", str(shared_datadir / "FeSOD_metadata.tsv"),
        ]
        build_ssn.main(argv)

        for produced in (xgmml_path, cluster_path):
            assert Path(produced).is_file()
        compare_files(cluster_path, shared_datadir / 'ssn_FeSOD_clusters_header.tsv')
        compare_files(xgmml_path, shared_datadir / expected_output, skip_lines=2)
42 |
43 |
def test_build_ssn_3(shared_datadir):
    """Run build_ssn with ``--color_table_out`` and validate the written colour table.

    Besides the usual cluster/XGMML comparisons, the colour table must map
    exactly the keys "1", "2" and "3" to three distinct ``#RRGGBB`` values.
    """
    input_file = "FeSOD_dist.tsv"
    with tempfile.TemporaryDirectory() as output_dir:
        # output_dir = "test_out"
        metadata = str(shared_datadir / "FeSOD_metadata.tsv")
        out_clusters = output_dir + f"/{input_file}_out_clusters.tsv"
        out_cytoscape = output_dir + f"/{input_file}_out.xgmml"
        color_table_path = output_dir + "/color_table.tsv"
        build_ssn.main(["-i", str(shared_datadir / input_file), "--xgmml", out_cytoscape, "--lb", "175", "--color_by", "SSN_cluster",
                        "--cluster_tsv", out_clusters, "--metadata", metadata, "--color_table_out", color_table_path])
        assert Path(out_cytoscape).is_file()
        assert Path(out_clusters).is_file()
        assert Path(color_table_path).is_file()
        compare_files(out_clusters, shared_datadir / 'ssn_FeSOD_clusters_header.tsv')
        compare_files(out_cytoscape, shared_datadir / "ssn_FeSOD.xgmml", skip_lines=2)

        color_table_dict = {}
        with open(color_table_path, "r") as f:
            for line in f:
                domain, color = line.strip().split("\t")
                color_table_dict[domain] = color
        assert len(color_table_dict) == 3
        assert set(color_table_dict.keys()) == {"1", "2", "3"}
        # fullmatch anchors both ends; re.match would accept trailing characters.
        assert all(re.fullmatch(r"#[0-9a-fA-F]{6}", x) for x in color_table_dict.values())
        assert len(set(color_table_dict.values())) == 3
68 |
def test_build_ssn_4(shared_datadir):
    """Run build_ssn with both ``--color_table`` and ``--color_table_out``.

    The colour table written out must be identical to the one passed in.
    """
    input_file = "FeSOD_dist.tsv"
    with tempfile.TemporaryDirectory() as output_dir:
        # output_dir = "test_out"
        cluster_path = f"{output_dir}/{input_file}_out_clusters.tsv"
        xgmml_path = f"{output_dir}/{input_file}_out.xgmml"
        color_table_path = output_dir + "/color_table.tsv"
        argv = [
            "-i", str(shared_datadir / input_file),
            "--xgmml", xgmml_path,
            "--lb", "175",
            "--color_by", "SSN_cluster",
            "--cluster_tsv", cluster_path,
            "--metadata", str(shared_datadir / "FeSOD_metadata.tsv"),
            "--color_table_out", color_table_path,
            "--color_table", str(shared_datadir / "color_table_123.tsv"),
        ]
        build_ssn.main(argv)

        for produced in (xgmml_path, cluster_path, color_table_path):
            assert Path(produced).is_file()
        compare_files(cluster_path, shared_datadir / 'ssn_FeSOD_clusters_header.tsv')
        compare_files(color_table_path, shared_datadir / "color_table_123.tsv")
83 |
--------------------------------------------------------------------------------
/test/test_build_tree.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | import tempfile
3 | from domainator.build_tree import get_newick
4 | from domainator import build_tree
5 | from helpers import compare_files
6 |
7 | # Simple tree structure for testing purposes
class TreeNode:
    """Minimal binary-tree node used to hand-build trees for these tests.

    Each node carries an ``id``, a ``dist`` value and optional ``left`` /
    ``right`` children, which start out as ``None``.
    """

    def __init__(self, id, dist=0):
        self.id, self.dist = id, dist
        self.left = self.right = None

    def is_leaf(self):
        """Return True when the node has neither a left nor a right child."""
        has_child = self.left is not None or self.right is not None
        return not has_child

    def get_left(self):
        """Return the left child (or None)."""
        return self.left

    def get_right(self):
        """Return the right child (or None)."""
        return self.right
23 |
@pytest.fixture
def leaf_names():
    """Five single-letter leaf labels, 'A' through 'E'."""
    return list("ABCDE")
27 |
@pytest.fixture
def tree():
    """Three-level tree: a root (dist 4) over two internal nodes (dist 2 and 1), each holding two leaves with ids 0-3."""
    left_branch = TreeNode(-1, 2)
    left_branch.left = TreeNode(0, 0)
    left_branch.right = TreeNode(1, 0)

    right_branch = TreeNode(-1, 1)
    right_branch.left = TreeNode(2, 0)
    right_branch.right = TreeNode(3, 0)

    root = TreeNode(-1, 4)
    root.left = left_branch
    root.right = right_branch
    return root
38 |
def test_get_newick(tree, leaf_names):
    """get_newick on the fixture tree yields the expected Newick string."""
    expected = '((D:1.00,C:1.00):3.00,(B:2.00,A:2.00):2.00);'
    newick = get_newick(tree, tree.dist, leaf_names)
    assert newick == expected
42 |
def test_get_newick_single_node():
    """A lone leaf node is rendered as a one-element Newick tree."""
    lone_leaf = TreeNode(0, 0)
    newick = get_newick(lone_leaf, 0, ['A'])
    assert newick == '(A:0.00);'
48 |
def test_get_newick_single_level(leaf_names):
    """A root with two leaf children is rendered with both leaves at the root's distance."""
    parent = TreeNode(-1, 3)
    parent.left, parent.right = TreeNode(0, 0), TreeNode(1, 0)
    newick = get_newick(parent, parent.dist, leaf_names)
    assert newick == '(B:3.00,A:3.00);'
55 |
def test_get_newick_with_empty_leaf_names(tree, leaf_names):
    """With every leaf label empty, only the branch lengths appear in the output."""
    blank_names = ['' for _ in leaf_names]
    newick = get_newick(tree, tree.dist, blank_names)
    assert newick == '((:1.00,:1.00):3.00,(:2.00,:2.00):2.00);'
59 |
60 |
def test_newick_output(shared_datadir):
    """Run build_tree end to end and compare the Newick and XGMML outputs with stored references."""
    with tempfile.TemporaryDirectory() as output_dir:
        # output_dir = "test_out"
        newick_path = str(output_dir + "/test.newick")
        xgmml_path = str(output_dir + "/test.xgmml")
        argv = [
            '--input', str(shared_datadir / 'FeSOD_score_dist.tsv'),
            '--newick', newick_path,
            '--xgmml', xgmml_path,
            "--metadata", str(shared_datadir / 'FeSOD_metadata.tsv'),
        ]

        build_tree.main(argv)

        compare_files(newick_path, str(shared_datadir / 'FeSOD_score_dist.newick'))
        compare_files(xgmml_path, str(shared_datadir / 'FeSOD_score_dist.xgmml'))
72 |
--------------------------------------------------------------------------------
/test/test_color_table_to_legend.py:
--------------------------------------------------------------------------------
1 | from collections import OrderedDict
2 | from domainator import color_table_to_legend
3 |
def test_color_table_legend(shared_datadir, tmp_path):
    """Render a legend SVG from color_specification.tsv and check it mentions the title, colours and names."""
    svg_file = tmp_path / "test.svg"
    title = "Test Legend"

    color_table_to_legend.main(["-i", str(shared_datadir / "color_specification.tsv"), "--svg", str(svg_file), "--title", title])

    # Per the original author's inline note, the colour table contains:
    #   CcdB          #ff0000
    #   APH           #00ff00
    #   CAT           #0000ff
    #   Condensation  #ff00ff
    #   2-oxoacid_dh  #ffffff
    svg_text = svg_file.read_text()

    expected_fragments = [
        "Test Legend",
        "#FF0000",
        "#00FF00",
        "#0000FF",
        "#FF00FF",
        "#FFFFFF",
        "CcdB",
        "APH",
        "CAT",
        "Condensation",
        "2-oxoacid_dh",
    ]
    for fragment in expected_fragments:
        assert fragment in svg_text
33 |
34 |
--------------------------------------------------------------------------------
/test/test_data_matrix.py:
--------------------------------------------------------------------------------
1 | import warnings
2 | warnings.filterwarnings("ignore", module='numpy')
3 | import pytest
4 | from domainator.data_matrix import DataMatrix
5 | import scipy.sparse
6 | import numpy as np
7 | import pytest_datadir
8 |
9 | # Test initialization of DataMatrix
def test_init():
    """Check DataMatrix attributes after construction with data and with no arguments."""
    # Constructed from a 3x3 array plus row and column names.
    row_names = ['A', 'B', 'C']
    col_names = ['X', 'Y', 'Z']
    populated = DataMatrix(np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]), row_names, col_names)
    assert populated.shape == (3, 3)
    assert populated.size == 9
    assert not populated.sparse
    assert populated.rows == row_names
    assert populated.columns == col_names
    assert populated.row_lengths is None
    assert populated.column_lengths is None
    assert populated.data_type == ""

    # Constructed with no arguments at all.
    empty = DataMatrix()
    assert empty.shape == (0, 0)
    assert empty.size == 0
    assert not empty.sparse
    assert empty.rows is None
    assert empty.columns is None
    assert empty.row_lengths is None
    assert empty.column_lengths is None
    assert empty.data_type == ""
35 |
36 | # Test from_file method of DataMatrix
37 |
@pytest.mark.parametrize(
    "filename,sparse",
    [
        ("test_matrix.dense.hdf5", False),
        ("test_matrix.dense.tsv", False),
        ("test_matrix.sparse.hdf5", True),
    ],
)
def test_from_file(shared_datadir, filename, sparse):
    """Load the same 3x3 matrix from each stored file and check its metadata and sparsity flag."""
    loaded = DataMatrix.from_file(shared_datadir / filename)

    assert loaded.shape == (3, 3)
    assert loaded.size == 9
    assert loaded.sparse is sparse
    assert loaded.rows == ['A', 'B', 'C']
    assert loaded.columns == ['X', 'Y', 'Z']
    assert loaded.row_lengths is None
    assert loaded.column_lengths is None
    assert loaded.data_type == ""
56 |
57 |
58 | # Test convert_to_sparse method of DataMatrix
def test_convert_to_sparse():
    """convert_to_sparse leaves the matrix flagged sparse with its 3x3 shape, from a dense or an already-sparse start."""
    # Start from a dense matrix.
    dense_values = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
    from_dense = DataMatrix(dense_values, ['A', 'B', 'C'], ['X', 'Y', 'Z'])
    from_dense.convert_to_sparse()
    assert from_dense.sparse
    assert from_dense.data.shape == (3, 3)

    # Start from a matrix that is already marked sparse.
    already_sparse = DataMatrix()
    already_sparse.sparse = True
    already_sparse.data = scipy.sparse.csr_matrix([[1, 0, 0], [0, 2, 0], [0, 0, 3]])
    already_sparse.convert_to_sparse()
    assert already_sparse.sparse
    assert already_sparse.data.shape == (3, 3)
76 |
--------------------------------------------------------------------------------
/test/test_domainator_db_download.py:
--------------------------------------------------------------------------------
1 | from domainator import domainator_db_download
2 | import tempfile
3 | from domainator import utils
4 | from pathlib import Path
5 |
6 |
7 |
def test_uniprot_download_fasta(shared_datadir):
    """Download two records with ``--db swissprot`` and check the output file parses to two records."""
    with tempfile.TemporaryDirectory() as output_dir:
        # output_dir = "test_out"
        target = Path(output_dir) / "uniprot_sprot.fasta"
        argv = ["--db", "swissprot", "--num_recs", "2", "--output", str(target)]

        domainator_db_download.main(argv)

        assert target.exists()
        assert target.stat().st_size > 0
        parsed = list(utils.parse_seqfiles([str(target)]))
        assert len(parsed) == 2
17 |
def test_uniprot_download_genbank(shared_datadir):
    """Download two records with ``--db swissprot_gb`` and check the output file parses to two records."""
    with tempfile.TemporaryDirectory() as output_dir:
        # output_dir = "test_out"
        target = Path(output_dir) / "uniprot_sprot.gb"
        argv = ["--db", "swissprot_gb", "--num_recs", "2", "--output", str(target)]

        domainator_db_download.main(argv)

        assert target.exists()
        assert target.stat().st_size > 0
        parsed = list(utils.parse_seqfiles([str(target)]))
        assert len(parsed) == 2
27 |
28 | #TODO: add tests for genbank downloads
29 |
30 |
31 |
def test_genbank_download_genbank_1(shared_datadir):
    """Process two assembly paths with ``num_recs=1`` and expect a single record in the output."""
    small_genbanks = [
        'https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/008/766/775/GCA_008766775.1_ASM876677v1',
        'https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/031/580/395/GCA_031580395.1_ASM3158039v1',
    ]
    accessions = [{'ftp_path': url} for url in small_genbanks]
    with tempfile.TemporaryDirectory() as output_dir:
        # output_dir = "test_out"
        target = Path(output_dir) / "gb.gb"
        domainator_db_download.process_genbank_accessions(accessions, target, gene_call=None, num_recs=1, cpus=3)
        assert target.exists()
        # read output file
        parsed = list(utils.parse_seqfiles([str(target)]))
        assert len(parsed) == 1
42 |
def test_genbank_download_genbank_2(shared_datadir):
    """Process two assembly paths with no record limit and expect two records in the output."""
    small_genbanks = [
        'https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/008/766/775/GCA_008766775.1_ASM876677v1',
        'https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/031/580/395/GCA_031580395.1_ASM3158039v1',
    ]
    accessions = [{'ftp_path': url} for url in small_genbanks]
    with tempfile.TemporaryDirectory() as output_dir:
        # output_dir = "test_out"
        target = Path(output_dir) / "gb.gb"
        domainator_db_download.process_genbank_accessions(accessions, target, gene_call=None, num_recs=None, cpus=3)
        assert target.exists()
        # read output file
        parsed = list(utils.parse_seqfiles([str(target)]))
        assert len(parsed) == 2
53 |
def test_genbank_download_genbank_3(shared_datadir):
    """Process two assembly paths with ``gene_call="all"`` and check the output text contains CDS features and a specific gene_id."""
    small_genbanks = [
        'https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/008/766/775/GCA_008766775.1_ASM876677v1',
        'https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/002/987/885/GCA_002987885.1_ASM298788v1',
    ]
    accessions = [{'ftp_path': url} for url in small_genbanks]
    with tempfile.TemporaryDirectory() as output_dir:
        #output_dir = "test_out"
        target = Path(output_dir) / "gb.gb"
        domainator_db_download.process_genbank_accessions(accessions, target, gene_call="all", num_recs=None, cpus=2)
        assert target.exists()
        # read output file
        parsed = list(utils.parse_seqfiles([str(target)]))
        assert len(parsed) == 2
        written_text = target.read_text()
        for fragment in ("CDS", '/gene_id="AM260465_1"'):
            assert fragment in written_text
67 |
68 | # def test_genbank_download_genbank_skipped_record_log(shared_datadir):
69 | # small_genbanks=['https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/008/766/775/GCA_008766775.1_ASM87667v1', 'https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/002/987/885/GCA_002987885.1_ASM298788v1']
70 | # with tempfile.TemporaryDirectory() as output_dir:
71 | # #output_dir = "test_out"
72 | # outfile = Path(output_dir) / "gb.gb"
73 | # skipped_record_log = Path(output_dir) / "skipped_record_log.txt"
74 | # domainator_db_download.process_genbank_accessions([{'ftp_path':small_genbank} for small_genbank in small_genbanks], outfile, gene_call="all", num_recs=None, cpus=2, skipped_record_log=skipped_record_log)
75 | # assert outfile.exists()
76 | # # read output file
77 | # recs = list(utils.parse_seqfiles([str(outfile)]))
78 | # assert len(recs) == 1
79 | # outfile_text = outfile.read_text()
80 | # assert "CDS" in outfile_text
81 | # assert '/gene_id="AM260465_1"' in outfile_text
82 | # assert skipped_record_log.exists()
83 | # skipped_record_log_text = skipped_record_log.read_text()
84 | # assert "GCA_008766775.1_ASM87667v1" in skipped_record_log_text
85 | # assert "GCA_002987885.1_ASM298788v1" not in skipped_record_log_text
--------------------------------------------------------------------------------
/test/test_extract_unannotated.py:
--------------------------------------------------------------------------------
1 | import tempfile
2 | from domainator.Bio import SeqIO
3 | from domainator import extract_unannotated
4 | from domainator.utils import parse_seqfiles, DomainatorCDS
5 | import pytest
6 | from io import StringIO
7 | import sys
8 | import subprocess
9 |
10 |
def test_extract_unannotated_1(shared_datadir):
    """Extract unannotated regions from simple_genpept.gb and check their count and sequences."""
    with tempfile.TemporaryDirectory() as output_dir:
        #output_dir = "test_out"
        out = output_dir + "/extraction.gb"
        extract_unannotated.main(["-i", str(shared_datadir / "simple_genpept.gb"), "-o", out])
        seqs = list(parse_seqfiles([out]))
        assert len(seqs) == 8
        # NOTE(review): this comparison previously had no `assert` and compared
        # str(record) rather than the sequence, so it was never enforced. The
        # expected sequences below are unchanged and have not been re-verified
        # against the tool's output - confirm on first run.
        assert [str(seq.seq) for seq in seqs] == ["FPALSPDSVDNRITASQEEFVETQKGHPSGWPSA*","M","I*","*","M","DEWQGGA*","MDADLYGYKWARDNVGQSGATIYRLYGKPDAPELFLKHGKGSVANDV","DMNKLQFHLMLDEFF*"]
20 |
def test_extract_unannotated_largest_keep_name_2(shared_datadir):
    """Extract with ``--largest --keep_name`` and check the resulting count and sequences."""
    with tempfile.TemporaryDirectory() as output_dir:
        #output_dir = "test_out"
        out = output_dir + "/extraction.gb"
        extract_unannotated.main(["-i", str(shared_datadir / "simple_genpept.gb"), "-o", out, "--largest", "--keep_name"])
        seqs = list(parse_seqfiles([out]))
        assert len(seqs) == 5
        # NOTE(review): this comparison previously had no `assert` and compared
        # str(record) rather than the sequence, so it was never enforced. The
        # expected sequences below are unchanged and have not been re-verified
        # against the tool's output - confirm on first run.
        assert [str(seq.seq) for seq in seqs] == ["FPALSPDSVDNRITASQEEFVETQKGHPSGWPSA*","I*","*","DEWQGGA*","MDADLYGYKWARDNVGQSGATIYRLYGKPDAPELFLKHGKGSVANDV"]
29 |
def test_extract_unannotated_lb_2(shared_datadir):
    """Extract with ``--lb 10`` and check the resulting count and sequences."""
    with tempfile.TemporaryDirectory() as output_dir:
        #output_dir = "test_out"
        out = output_dir + "/extraction.gb"
        extract_unannotated.main(["-i", str(shared_datadir / "simple_genpept.gb"), "-o", out, "--lb", "10"])
        seqs = list(parse_seqfiles([out]))
        assert len(seqs) == 3
        # NOTE(review): this comparison previously had no `assert` and compared
        # str(record) rather than the sequence, so it was never enforced. The
        # expected sequences below are unchanged and have not been re-verified
        # against the tool's output - confirm on first run.
        assert [str(seq.seq) for seq in seqs] == ["FPALSPDSVDNRITASQEEFVETQKGHPSGWPSA*", "MDADLYGYKWARDNVGQSGATIYRLYGKPDAPELFLKHGKGSVANDV", "DMNKLQFHLMLDEFF*"]
--------------------------------------------------------------------------------
/test/test_hmmer_build.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | import tempfile
3 | import os
4 | from io import BytesIO, StringIO
5 | from pyhmmer import easel
6 | from pyhmmer.plan7 import HMM
7 | from domainator.hmmer_build import hmmer_build, main
8 |
9 |
@pytest.fixture
def msa_content():
    """Raw bytes of a two-record FASTA file (seq1 and seq2, eight letters each)."""
    records = (b">seq1\nACGTACGT\n", b">seq2\nACGTACGA\n")
    return b"".join(records)
13 |
def create_msa_file(content):
    """Write ``content`` (bytes) to a new named temporary file and return its path.

    The file is created with ``delete=False``, so removing it is the
    caller's responsibility.
    """
    handle = tempfile.NamedTemporaryFile(delete=False, mode="wb")
    try:
        handle.write(content)
    finally:
        handle.close()
    return handle.name
18 |
@pytest.fixture
def msa_file(msa_content):
    """Yield the path of a temporary file holding ``msa_content``; the file is removed after the test."""
    temp_path = create_msa_file(msa_content)
    yield temp_path
    os.unlink(temp_path)
24 |
def test_hmmer_build_with_required_params():
    """hmmer_build with only a file handle and a name returns an HMM carrying that name."""
    msa_content = b">seq1\nACGTACGT\n>seq2\nACGTACGA\n"
    msa_file = create_msa_file(msa_content)

    # try/finally: the temporary file is removed even when an assertion fails.
    try:
        with open(msa_file, "rb") as file:
            hmm = hmmer_build(file, name="test_profile")
            assert isinstance(hmm, HMM)
            assert hmm.name.decode() == "test_profile"
    finally:
        os.unlink(msa_file)
35 |
def test_hmmer_build_with_optional_params():
    """hmmer_build with name, accession, description and a DNA alphabet returns an HMM carrying those values."""
    msa_content = b">seq1\nACGTACGT\n>seq2\nACGTACGA\n"
    msa_file = create_msa_file(msa_content)

    # try/finally: the temporary file is removed even when an assertion fails.
    try:
        with open(msa_file, "rb") as file:
            hmm = hmmer_build(file, name="test_profile", acc="P12345", desc="Test profile description", alphabet=easel.Alphabet.dna())
            assert isinstance(hmm, HMM)
            assert hmm.name.decode() == "test_profile"
            assert hmm.accession.decode() == "P12345"
            assert hmm.description.decode() == "Test profile description"
    finally:
        os.unlink(msa_file)
48 |
def test_hmmer_build_with_binaryio():
    """hmmer_build accepts an in-memory BytesIO object in place of an open file."""
    in_memory_msa = BytesIO(b">seq1\nACGTACGT\n>seq2\nACGTACGA\n")

    profile = hmmer_build(
        in_memory_msa,
        name="test_profile",
        acc="P12345",
        desc="Test profile description",
        alphabet=easel.Alphabet.dna(),
    )

    assert isinstance(profile, HMM)
    assert profile.name.decode() == "test_profile"
    assert profile.accession.decode() == "P12345"
    assert profile.description.decode() == "Test profile description"
58 |
59 |
def test_main_with_required_params(msa_file, capsys):
    """main with ``--name`` and ``--input`` prints an HMM containing the format tag and the profile name."""
    main(["--name", "test_profile", "--input", msa_file])
    printed = capsys.readouterr().out
    for fragment in ("HMMER3/f", "NAME test_profile"):
        assert fragment in printed
65 |
def test_main_with_optional_params(msa_file, capsys):
    """main with name, accession, description and ``--alphabet dna`` prints an HMM containing all of them."""
    argv = [
        "--name", "test_profile",
        "--acc", "P12345",
        "--desc", "Test profile description",
        "--input", msa_file,
        "--alphabet", "dna",
    ]
    main(argv)
    printed = capsys.readouterr().out
    for fragment in ("HMMER3/f", "NAME test_profile", "ACC P12345", "DESC Test profile description"):
        assert fragment in printed
74 |
75 |
--------------------------------------------------------------------------------
/test/test_hmmer_compare.py:
--------------------------------------------------------------------------------
1 | import tempfile
2 | from domainator import hmmer_compare
3 | from helpers import compare_files
4 |
5 | #TODO: better tests!
6 |
def test_hmmer_compare_1(shared_datadir):
    """Compare pdonr_hmms.hmm against itself and check the score table against the stored reference."""
    profiles = str(shared_datadir / "pdonr_hmms.hmm")
    with tempfile.TemporaryDirectory() as output_dir:
        # output_dir = "test_out"
        scores_path = output_dir + "/out_scores.tsv"
        argv = [
            "-i", profiles,
            "-r", profiles,
            "-o", scores_path,
            "--alignment",
            "--score_cutoff", "13",
            "--cpu", "10",
        ]
        hmmer_compare.main(argv)
        compare_files(scores_path, shared_datadir / "pDONR_201_hmm_scores.tsv")
14 |
15 |
--------------------------------------------------------------------------------
/test/test_hmmer_report.py:
--------------------------------------------------------------------------------
1 | from pathlib import Path
2 | import tempfile
3 | from glob import glob
4 | from helpers import compare_files
5 | import pytest
6 |
7 | from domainator import hmmer_report
8 |
def test_hmmer_report_1(shared_datadir):
    """Run hmmer_report with every optional column plus three appended columns.

    Checks the line count, the header row, and the first and last data rows
    of the resulting TSV.
    """
    expected_header = ["name","source","acc","desc","length","consensus","one","two","three"]
    expected_first_row = "2-oxoacid_dh\tpdonr_hmms\tPF00198.25\t2-oxoacid dehydrogenases acyltransferase (catalytic domain)\t233\teqeeervplsgirkaiakrlteskqeiphftlsdevdvtallalrkelkedeakeekakltlldflikavalAlkefPelnasvdeeekeivlkkhvniGvAvatprGLlvPviknadkkslleiakelkelaeraregklkpedleggtftisNlGmlGvtsftPiinppqvaIlgvgrikerpvvkegelvarkvmplslsaDHRvidGaeaarFlntlkkllenpeelll\t1\t2.0\tthree"
    expected_last_row = "TCAD9\tpdonr_hmms\tPF19974.1\tTernary complex associated domain 9\t437\tdqvevvrvLtgGrSGaqVlevtvfvkeknqalrhVlKigsaseiakEweAyqrliqpllnalfatIiavsesvlengdqvldelgavvYshagqfagepgeklrsLedlfqealrgpeaadravallerlletllnllYagateeplqtlreelnsrLGpdlvvevkevdseqlvvypdDllqakmssysaseynskvagilvsvelsrlevkvrgprlsavdddvrvevllsggalseleeqgdefleGsvvatranlrlrllkeledelvleetllevdglqlahPfaalrsaLtealearvtssvHGDLNprNiLlaeedrvyLIDfartreggpllsDlAwLevnLlrtvladrldlqellrLqrlLalasrllelealaealagesealakafrllaaiRrfarkqyplerrelwwreylaaLllaahrtLk\t1\t2.0\tthree"

    with tempfile.TemporaryDirectory() as output_dir:
        # output_dir = "test_out"
        report_path = output_dir + "/hmmer_report.tsv"
        argv = [
            "-i", str(shared_datadir / "pdonr_hmms.hmm"),
            "-o", report_path,
            '--source', '--acc', '--desc', '--length', '--consensus',
            '--append', 'one', 'int', '1',
            '--append', 'two', 'float', '2.0',
            '--append', 'three', 'str', 'three',
        ]
        hmmer_report.main(argv)
        assert Path(report_path).is_file()

        with open(report_path) as f:
            lines = f.readlines()
        assert len(lines) == 8
        assert lines[0].strip().split("\t") == expected_header
        assert lines[1].strip() == expected_first_row
        assert lines[-1].strip() == expected_last_row
23 |
--------------------------------------------------------------------------------
/test/test_hmmer_search.py:
--------------------------------------------------------------------------------
1 | import tempfile
2 | from domainator import hmmer_search
3 | from pathlib import Path
4 |
5 | #TODO: better tests!
6 |
def test_hmmer_search_1(shared_datadir):
    """Search pdonr_hmms.hmm with pdonr_hmms_1.hmm at score cutoff 13 and check which profile names appear in the output."""
    expected_present = ["NAME CAT", "NAME 2-oxoacid_dh", "NAME APH"]
    expected_absent = ["NAME CcdA", "NAME CcdB", "NAME Condensation", "NAME TCAD9"]

    with tempfile.TemporaryDirectory() as output_dir:
        # output_dir = "test_out"
        hits_path = output_dir + "/out_scores.hmm"
        argv = [
            "-i", str(shared_datadir / "pdonr_hmms_1.hmm"),
            "-r", str(shared_datadir / "pdonr_hmms.hmm"),
            "-o", hits_path,
            "--score_cutoff", "13",
        ]
        hmmer_search.main(argv)
        file_contents = Path(hits_path).read_text()

        for marker in expected_present:
            assert marker in file_contents
        for marker in expected_absent:
            assert marker not in file_contents
21 |
def test_hmmer_search_2(shared_datadir):
    """Same search as test_hmmer_search_1 but with ``--max_hits 2``; APH is no longer expected in the output."""
    expected_present = ["NAME CAT", "NAME 2-oxoacid_dh"]
    expected_absent = ["NAME APH", "NAME CcdA", "NAME CcdB", "NAME Condensation", "NAME TCAD9"]

    with tempfile.TemporaryDirectory() as output_dir:
        # output_dir = "test_out"
        hits_path = output_dir + "/out_scores.hmm"
        argv = [
            "-i", str(shared_datadir / "pdonr_hmms_1.hmm"),
            "-r", str(shared_datadir / "pdonr_hmms.hmm"),
            "-o", hits_path,
            "--score_cutoff", "13",
            "--max_hits", "2",
        ]
        hmmer_search.main(argv)
        file_contents = Path(hits_path).read_text()

        for marker in expected_present:
            assert marker in file_contents
        for marker in expected_absent:
            assert marker not in file_contents
36 |
--------------------------------------------------------------------------------
/test/test_hmmer_select.py:
--------------------------------------------------------------------------------
1 | from domainator.hmmer_select import main, hmmer_select
2 | import tempfile
3 | import pyhmmer
4 | import os
5 |
def test_hmmer_select_1(shared_datadir):
    """Select with ``--field all --regex dehyd.*`` and expect exactly one profile in the output."""
    with tempfile.TemporaryDirectory() as output_dir:
        # output_dir = "test_out"
        out = output_dir + "/out.hmm"
        main(['--input', str(shared_datadir/"pdonr_hmms.hmm"), "--output", out, "--field", "all", "--regex", "dehyd.*"])
        # Read inside a context manager so the HMM file is closed before the
        # temporary directory is removed.
        with pyhmmer.plan7.HMMFile(out) as hmm_file:
            output_hmms = list(hmm_file)
        assert len(output_hmms) == 1
13 |
def test_hmmer_select_2(shared_datadir):
    """Select with ``--field name --exact TCAD9`` and expect exactly one profile in the output."""
    with tempfile.TemporaryDirectory() as output_dir:
        # output_dir = "test_out"
        out = output_dir + "/out.hmm"
        main(['--input', str(shared_datadir/"pdonr_hmms.hmm"), "--output", out, "--field", "name", "--exact", "TCAD9"])
        # Read inside a context manager so the HMM file is closed before the
        # temporary directory is removed.
        with pyhmmer.plan7.HMMFile(out) as hmm_file:
            output_hmms = list(hmm_file)
        assert len(output_hmms) == 1
21 |
def test_hmmer_select_3(shared_datadir):
    """Select with ``--field name --exact TCAD`` (a partial name) and expect an empty output file."""
    with tempfile.TemporaryDirectory() as output_dir:
        # output_dir = "test_out"
        selected_path = output_dir + "/out.hmm"
        argv = [
            '--input', str(shared_datadir / "pdonr_hmms.hmm"),
            "--output", selected_path,
            "--field", "name",
            "--exact", "TCAD",
        ]
        main(argv)

        # check that the file size of the output is 0
        output_size = os.path.getsize(selected_path)
        assert output_size == 0
def test_hmmer_select_4(shared_datadir):
    """Select with ``--field acc --contains PF19974`` and expect exactly one profile in the output."""
    with tempfile.TemporaryDirectory() as output_dir:
        # output_dir = "test_out"
        out = output_dir + "/out.hmm"
        main(['--input', str(shared_datadir/"pdonr_hmms.hmm"), "--output", out, "--field", "acc", "--contains", "PF19974"])

        # Read inside a context manager so the HMM file is closed before the
        # temporary directory is removed.
        with pyhmmer.plan7.HMMFile(out) as hmm_file:
            output_hmms = list(hmm_file)
        assert len(output_hmms) == 1
38 |
39 |
def test_hmmer_select_case_sensitivity_1(shared_datadir):
    """A lower-case ``--contains`` query matches when ``--case_sensitive`` is absent.

    Queries the accession field with ``pf19974`` and expects exactly one
    profile in the output file.
    """
    with tempfile.TemporaryDirectory() as output_dir:
        out = output_dir + "/out.hmm"
        main(['--input', str(shared_datadir/"pdonr_hmms.hmm"), "--output", out, "--field", "acc", "--contains", "pf19974"])

        # Read the profiles inside a context manager so the handle is closed
        # before the temporary directory is removed.
        with pyhmmer.plan7.HMMFile(out) as hmm_file:
            output_hmms = list(hmm_file)
        assert len(output_hmms) == 1
48 |
def test_hmmer_select_case_sensitivity_2(shared_datadir):
    """A lower-case ``--regex`` query matches when ``--case_sensitive`` is absent.

    Queries the accession field with the pattern ``pf19974`` and expects
    exactly one profile in the output file.
    """
    with tempfile.TemporaryDirectory() as output_dir:
        out = output_dir + "/out.hmm"
        main(['--input', str(shared_datadir/"pdonr_hmms.hmm"), "--output", out, "--field", "acc", "--regex", "pf19974"])

        # Read the profiles inside a context manager so the handle is closed
        # before the temporary directory is removed.
        with pyhmmer.plan7.HMMFile(out) as hmm_file:
            output_hmms = list(hmm_file)
        assert len(output_hmms) == 1
57 |
def test_hmmer_select_case_sensitivity_3(shared_datadir):
    """With ``--case_sensitive``, a lower-case ``--contains`` query selects nothing."""
    hmm_db = str(shared_datadir / "pdonr_hmms.hmm")
    with tempfile.TemporaryDirectory() as tmp_dir:
        out = f"{tmp_dir}/out.hmm"
        cli_args = [
            '--input', hmm_db,
            "--output", out,
            "--field", "acc",
            "--contains", "pf19974",
            "--case_sensitive",
        ]
        main(cli_args)

        # No selection is expected, so the output file must be empty.
        written_bytes = os.path.getsize(out)
        assert written_bytes == 0
65 |
def test_hmmer_select_case_sensitivity_4(shared_datadir):
    """With ``--case_sensitive``, a lower-case ``--regex`` query selects nothing."""
    hmm_db = str(shared_datadir / "pdonr_hmms.hmm")
    with tempfile.TemporaryDirectory() as tmp_dir:
        out = f"{tmp_dir}/out.hmm"
        cli_args = [
            '--input', hmm_db,
            "--output", out,
            "--field", "acc",
            "--regex", "pf19974",
            "--case_sensitive",
        ]
        main(cli_args)

        # No selection is expected, so the output file must be empty.
        written_bytes = os.path.getsize(out)
        assert written_bytes == 0
--------------------------------------------------------------------------------
/test/test_matrix_report.py:
--------------------------------------------------------------------------------
1 | from domainator import matrix_report
2 | import tempfile
3 | import pytest
4 |
@pytest.mark.parametrize("input_file",
[
    "scorefull.tsv",
    "scorefull.dense.hdf5"
])
def test_matrix_report_1(shared_datadir, input_file):
    """Generate text and HTML matrix reports and check their contents.

    Runs ``matrix_report.main`` on each parametrized input matrix and asserts
    that both the text and the HTML output contain the expected headings and
    values.
    """
    with tempfile.TemporaryDirectory() as output_dir:
        out_html = output_dir + "/matrix_report_test.html"
        out_txt = output_dir + "/matrix_report_test.txt"
        matrix_report.main(["-i", str(shared_datadir / input_file), "-o", out_txt, "--html", out_html])
        for report_path in (out_html, out_txt):
            # Close each report promptly instead of leaking the handle until
            # garbage collection.
            with open(report_path) as report:
                f_txt = report.read()
            assert "Matrix Report" in f_txt
            assert "Min" in f_txt
            assert "152.0" in f_txt
            assert "451.0" in f_txt
            assert "Total values" in f_txt
            assert f_txt.count("400") == 2
25 |
26 |
27 | # def test_matrix_report_empty_input(shared_datadir):
28 | # pass
29 | # #TODO
--------------------------------------------------------------------------------
/test/test_ncbi_taxonomy.py:
--------------------------------------------------------------------------------
1 | from domainator.Taxonomy import NCBITaxonomy
2 | import pytest
3 |
4 |
def test_NCBItaxonomy(shared_datadir):
    """Check lineage, rank and name lookups for taxids 562 and 1423."""
    taxonomy = NCBITaxonomy(shared_datadir / "taxdmp", overwrite=False)

    expected_lineages = {
        562: [562, 561, 543, 91347, 1236, 1224, 2, 131567, 1],
        1423: [1423, 653685, 1386, 186817, 1385, 91061, 1239, 1783272, 2, 131567, 1],
    }
    expected_ranks = {562: "species", 1423: "species"}
    expected_names = {562: "Escherichia coli", 1423: "Bacillus subtilis"}

    # Query order is kept as lineage, then rank, then name.
    for taxid, lineage in expected_lineages.items():
        assert taxonomy.lineage(taxid) == lineage
    for taxid, rank in expected_ranks.items():
        assert taxonomy.rank(taxid) == rank
    for taxid, name in expected_names.items():
        assert taxonomy.name(taxid) == name
14 |
def test_NCBItaxonomy_2(shared_datadir):
    """Taxid 1985417 must emit a ``UserWarning`` and return an empty lineage."""
    taxonomy = NCBITaxonomy(shared_datadir / "taxdmp", overwrite=False)
    with pytest.warns(UserWarning):
        lineage = taxonomy.lineage(1985417)
    assert lineage == []
19 |
--------------------------------------------------------------------------------
/test/test_partition_seqfile.py:
--------------------------------------------------------------------------------
1 | from math import prod
2 | from domainator.partition_seqfile import main
3 | import tempfile
4 | from glob import glob
5 | from io import StringIO
6 | import pytest
7 |
@pytest.mark.parametrize(
    "file,option,value,num_proteins,offsets,recs_to_read",
    [
        ("pDONR201.gb", "--partitions", 2, 3, [0], [1]),
        ("pDONR201.gb", "--cdss_per_partition", 1, 3, [0], [1]),
        ("pDONR201_multigenemark_partition.gb", "--partitions", 2, 24, [0, 16382], [2, 2]),
        ("pDONR201_multigenemark_partition.gb", "--partitions", 1, 24, [0], [4]),
        ("pDONR201_multigenemark_partition.gb", "--partitions", 10, 24, [0, 8191, 16382, 24573], [1, 1, 1, 1]),
        ("pDONR201_multigenemark_partition.gb", "--partitions", 4, 24, [0, 8191, 16382, 24573], [1, 1, 1, 1]),
        ("pDONR201_multigenemark_partition.gb", "--cdss_per_partition", 6, 24, [0, 8191, 16382, 24573], [1, 1, 1, 1]),
        ("pDONR201_multigenemark_partition.gb", "--cdss_per_partition", 7, 24, [0, 16382], [2, 2]),
    ],
)
def test_partition_seqfile(file, num_proteins, option, value, offsets, recs_to_read, shared_datadir):
    """Run ``partition_seqfile`` and compare the written table with expectations.

    The output file is parsed as one integer on the first line (compared with
    ``num_proteins``) followed by whitespace-separated rows whose first two
    columns are compared with ``offsets`` and ``recs_to_read``.
    """
    with tempfile.TemporaryDirectory() as tmp_dir:
        outfile = tmp_dir + "/" + "outfile.txt"
        main(["-i", str(shared_datadir / file), "-o", outfile, option, str(value)])

        with open(outfile) as handle:
            header = handle.readline()
            assert int(header.strip()) == num_proteins
            # Remaining lines: column 0 is the offset, column 1 the record count.
            rows = [row.split() for row in handle]

        produced_offsets = [int(fields[0]) for fields in rows]
        produced_recs_to_read = [int(fields[1]) for fields in rows]
        assert produced_offsets == offsets
        assert produced_recs_to_read == recs_to_read
34 |
--------------------------------------------------------------------------------
/test/test_partition_seqids.py:
--------------------------------------------------------------------------------
1 | from domainator.partition_seqids import partition_seqids
2 | import tempfile
3 | from glob import glob
4 | from io import StringIO
5 | import pytest
6 |
@pytest.mark.parametrize(
    "partitions, ids_per_partition, rec_count",
    [
        (1, None, 10),
        (3, None, 10),
        (6, None, 10),
        (9, None, 10),
        (10, None, 10),
        (None, 5, 10),
        (None, 9, 10),
        (None, 2, 10),
    ],
)
def test_partition_seqids_fasta(partitions, ids_per_partition, rec_count, shared_datadir, capsys):
    """Partition an in-memory FASTA and validate the id files that are written.

    Checks the record count printed to stdout, the number of ``*.txt`` files
    produced under the output prefix, the per-file id counts (when
    ``ids_per_partition`` is given), and that the ids across all files are
    exactly the input names.
    """
    names = [f"seq{idx}" for idx in range(rec_count)]
    fasta_handle = StringIO("\n".join(f">{name}\nMAGICCATS" for name in names))

    # Expected upper bound on the number of partition files.
    if partitions is None:
        intended_partitions, leftover = divmod(rec_count, ids_per_partition)
        if leftover != 0:
            intended_partitions += 1
    else:
        intended_partitions = partitions

    with tempfile.TemporaryDirectory() as tmp_dir:
        output_prefix = tmp_dir + "/out"
        partition_seqids([fasta_handle], output_prefix, partitions, ids_per_partition, filetype="fasta")

        stdout_text = capsys.readouterr().out
        assert stdout_text == str(rec_count)

        partition_files = glob(output_prefix + "*.txt")
        assert len(partition_files) <= intended_partitions
        assert len(partition_files) >= int(intended_partitions / 2)

        ids_in_files = set()
        bins_with_too_few = 0
        for partition_file in partition_files:
            with open(partition_file) as inf:
                file_ids = [line.rstrip() for line in inf]
            ids_in_files.update(file_ids)

            if ids_per_partition is None:
                continue
            assert len(file_ids) <= ids_per_partition
            if len(file_ids) < ids_per_partition:
                bins_with_too_few += 1

        # At most one partition may be under-filled.
        assert bins_with_too_few <= 1
        assert len(ids_in_files) == len(names)
        assert ids_in_files.issubset(names)
53 |
def test_partition_seqids_genbank(shared_datadir, capsys):
    """Partition a GenBank file into two and check the stdout count and file count."""
    genbank_path = str(shared_datadir / "pDONR201_multi_genemark_domainator.gb")
    with tempfile.TemporaryDirectory() as tmp_dir:
        output_prefix = tmp_dir + "/out"
        partition_seqids([genbank_path], output_prefix, 2, None, filetype="genbank")

        stdout_text = capsys.readouterr().out
        assert stdout_text == "24"

        partition_files = glob(output_prefix + "*.txt")
        assert len(partition_files) == 2
62 |
63 |
def test_partition_seqids_genbank_peptide(shared_datadir, capsys):
    """Partition ``FeSOD_20.gb`` into two and check the stdout count and file count."""
    genbank_path = str(shared_datadir / "FeSOD_20.gb")
    with tempfile.TemporaryDirectory() as tmp_dir:
        output_prefix = tmp_dir + "/out"
        partition_seqids([genbank_path], output_prefix, 2, None, filetype="genbank")

        stdout_text = capsys.readouterr().out
        assert stdout_text == "20"

        partition_files = glob(output_prefix + "*.txt")
        assert len(partition_files) == 2
72 |
--------------------------------------------------------------------------------
/test/test_plot_contigs.py:
--------------------------------------------------------------------------------
1 | from domainator import plot_contigs
2 | import tempfile
3 |
def test_plot_contigs_1(shared_datadir):
    """Render ``MT_nbs.gb`` to HTML and check the expected page content.

    Asserts that the plot title and one contig identifier are present and
    that no ``"type": "source"`` entry appears in the output.
    """
    with tempfile.TemporaryDirectory() as output_dir:
        # Named *_path so the builtin ``input`` is not shadowed.
        input_path = str(shared_datadir / "MT_nbs.gb")
        output_path = output_dir + "/contigs.html"
        plot_contigs.main(["-i", input_path, "--html", output_path])
        # Close the report promptly instead of leaking the handle.
        with open(output_path) as handle:
            output_text = handle.read()
        assert "Domainator Contigs Plot" in output_text
        assert "BX548174_369054:361090rc" in output_text
        assert '"type": "source"' not in output_text
14 |
15 | def test_plot_contigs_2(shared_datadir):
16 | with tempfile.TemporaryDirectory() as output_dir:
17 | # output_dir = "test_out"
18 | input = str(shared_datadir / "MT_nbs.gb")
19 | output = output_dir + "/contigs.html"
20 | plot_contigs.main(["-i", input, "--html", output, "--height", "1000", "--width", "800"])
21 | output_text = open(output).read()
22 | assert '