├── .gitignore ├── LICENSE.txt ├── README.md ├── conda_env.yml ├── docs ├── README.md ├── developing_domainator.md ├── esm_3b_foldseek.md ├── examples.md ├── file_formats.md ├── limitations_and_FAQ.md └── media │ ├── Domainator_logo.svg │ ├── Overview_diagram.png │ ├── detective_Domainic_transparent_background.png │ ├── genome_mining_workflow.png │ └── hmm_profile_comparison.png ├── domainator.def ├── domainator_esmologs.def ├── package.sh ├── pyproject.toml ├── src └── domainator │ ├── Bio │ ├── Data │ │ ├── CodonTable.py │ │ ├── IUPACData.py │ │ ├── SCOPData.py │ │ └── __init__.py │ ├── File.py │ ├── GenBank │ │ ├── Scanner.py │ │ ├── __init__.py │ │ └── utils.py │ ├── LICENSE.rst │ ├── Seq.py │ ├── SeqFeature.py │ ├── SeqIO │ │ ├── FastaIO.py │ │ ├── InsdcIO.py │ │ ├── InsdcIO_new.py │ │ ├── Interfaces.py │ │ ├── SwissIO.py │ │ ├── TabIO.py │ │ └── __init__.py │ ├── SeqRecord.py │ ├── SwissProt │ │ └── __init__.py │ └── __init__.py │ ├── Taxonomy │ └── __init__.py │ ├── __init__.py │ ├── build_projection.py │ ├── build_ssn.py │ ├── build_tree.py │ ├── color_genbank.py │ ├── color_table_to_legend.py │ ├── compare_contigs.py │ ├── cytoscape.py │ ├── data_matrix.py │ ├── deduplicate_genbank.py │ ├── domain_search.py │ ├── domainate.py │ ├── domainator_db_download.py │ ├── enum_report.py │ ├── extract_domains.py │ ├── extract_peptides.py │ ├── extract_unannotated.py │ ├── filter_domains.py │ ├── foldseek.py │ ├── genbank_to_fasta.py │ ├── hmmer_build.py │ ├── hmmer_compare.py │ ├── hmmer_report.py │ ├── hmmer_search.py │ ├── hmmer_select.py │ ├── matrix_report.py │ ├── partition_seqfile.py │ ├── partition_seqids.py │ ├── plot_contigs.py │ ├── select_by_cds.py │ ├── select_by_contig.py │ ├── seq_dist.py │ ├── summary_report.py │ ├── transform_matrix.py │ ├── trim_contigs.py │ └── utils.py └── test ├── data ├── 206.gb ├── CcdB.hmm ├── CcdB.hmm.h3f ├── CcdB.hmm.h3i ├── CcdB.hmm.h3m ├── CcdB.hmm.h3p ├── CuSOD_enum_report_test.gb ├── FeSOD_20.fasta ├── FeSOD_20.gb ├── 
FeSOD_20_pfam.gb ├── FeSOD_dist.dense.hdf5 ├── FeSOD_dist.sparse.hdf5 ├── FeSOD_dist.tsv ├── FeSOD_metadata.tsv ├── FeSOD_pfam.hmm ├── FeSOD_score_dist.newick ├── FeSOD_score_dist.tsv ├── FeSOD_score_dist.xgmml ├── JABFVH010000506_extraction.gb ├── MT_nbs.enum_report.tsv ├── MT_nbs.gb ├── Peptidase_M28.hmm ├── Peptidase_M28.hmm.h3f ├── Peptidase_M28.hmm.h3i ├── Peptidase_M28.hmm.h3m ├── Peptidase_M28.hmm.h3p ├── Polymorphism_feature.gb ├── SPR.hmm ├── Staph_phages.gb ├── bacillus_phage_SPR.gb ├── bacillus_phage_SPR_with_annotations.gb ├── bacillus_phage_SPR_with_annotations_reversed.gb ├── bin3.sparse.hdf5 ├── bin3.sparse.tsv ├── bin3.tsv ├── ccdb.gb ├── color_domain_search_test.gb ├── color_specification.tsv ├── color_table_123.tsv ├── domain_search_test_out1.gb ├── domain_search_test_out2.gb ├── domain_search_test_out3.gb ├── domain_search_translate_out.gb ├── empty.gb ├── enum_report_html_max_size_out.html ├── enum_report_html_out.html ├── enum_report_html_out_quote_escape.html ├── extract_peptides_test_1_out.gb ├── extract_peptides_test_2.gb ├── foldseek │ ├── FeSOD │ ├── FeSOD.dbtype │ ├── FeSOD.index │ ├── FeSOD.lookup │ ├── FeSOD_20.3di.fasta │ ├── FeSOD_20.fasta │ ├── FeSOD_h │ ├── FeSOD_h.dbtype │ ├── FeSOD_h.index │ ├── FeSOD_ss │ ├── FeSOD_ss.dbtype │ └── FeSOD_ss.index ├── metadata_FeSOD_20.tsv ├── pDONR201.fasta ├── pDONR201.gb ├── pDONR201_domainator_circular.gb ├── pDONR201_empty.gb ├── pDONR201_genemark.gb ├── pDONR201_genemark.gff ├── pDONR201_multi.fasta ├── pDONR201_multi_genemark.gb ├── pDONR201_multi_genemark.gff ├── pDONR201_multi_genemark_clipped_domainator.gb ├── pDONR201_multi_genemark_domainator.gb ├── pDONR201_multi_genemark_domainator_multi_hmm.gb ├── pDONR201_multi_genemark_domainator_multi_hmm_2.gb ├── pDONR201_multi_subset.txt ├── pDONR201_multigenemark_partition.gb ├── pDONR201_no_CDSs.gb ├── pDONR201_partly_CDSs.gb ├── pDONR201_pseudo.gb ├── pDONR_201_domain_search.gb ├── pDONR_201_domain_search_long_annotations.gb ├── 
pDONR_201_domainator.gb ├── pDONR_201_domainator_domain_reorder.gb ├── pDONR_201_hmm_scores.tsv ├── pdonr_hmms.hmm ├── pdonr_hmms_1.hmm ├── pdonr_hmms_2.hmm ├── pdonr_peptides.fasta ├── saccharomyces_defense_finder.hmm ├── saccharomyces_extraction.gb ├── saccharomyces_extraction_circular.gb ├── score3.sparse.tsv ├── score4.sparse.tsv ├── scorefull.dense.hdf5 ├── scorefull.tsv ├── simple_genpept.gb ├── simple_genpept_contigs.txt ├── simple_genpept_equals_second_line.gb ├── simple_genpept_quote_name.gb ├── ssn_FeSOD.sparse.xgmml ├── ssn_FeSOD.xgmml ├── ssn_FeSOD_clusters.tsv ├── ssn_FeSOD_clusters_header.tsv ├── swissprot_CuSOD_subset.fasta ├── taxdmp │ ├── delnodes.dmp │ ├── merged.dmp │ ├── names.dmp │ ├── nodes.dmp │ └── taxdump.tar.gz ├── test_matrix.dense.hdf5 ├── test_matrix.dense.tsv ├── test_matrix.sparse.hdf5 └── thymidylate_synthase.fasta ├── helpers.py ├── test_SeqFeature.py ├── test_build_projection.py ├── test_build_ssn.py ├── test_build_tree.py ├── test_color_genbank.py ├── test_color_table_to_legend.py ├── test_compare_contigs.py ├── test_data_matrix.py ├── test_deduplicate_genbank.py ├── test_domain_search.py ├── test_domainate.py ├── test_domainator_db_download.py ├── test_enum_report.py ├── test_extract_domains.py ├── test_extract_peptides.py ├── test_extract_unannotated.py ├── test_filter_domains.py ├── test_hmmer_build.py ├── test_hmmer_compare.py ├── test_hmmer_report.py ├── test_hmmer_search.py ├── test_hmmer_select.py ├── test_matrix_report.py ├── test_ncbi_taxonomy.py ├── test_partition_seqfile.py ├── test_partition_seqids.py ├── test_plot_contigs.py ├── test_select_by_cds.py ├── test_select_by_contig.py ├── test_seq_dist.py ├── test_summary_report.py ├── test_transform_matrix.py ├── test_trim_contigs.py └── test_utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # 
Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | 132 | #other 133 | tmp/ 134 | 135 | pfam/ 136 | 137 | test_out/ 138 | nextflow/conda 139 | .nextflow* 140 | work 141 | output 142 | 143 | *.code-workspace 144 | *.zip 145 | *.pdf 146 | src/domainator/_lib/* 147 | 148 | .vscode/* 149 | 150 | nextflow/* 151 | 152 | *.sif -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Code derived from Biopython (found in src/domainator/Bio) is released under the "Biopython License Agreement" (given in full below). Unless stated otherwise in individual file headers, all Biopython's files are under the "Biopython License Agreement". 2 | 3 | Some files are explicitly dual licensed under your choice of the "Biopython License Agreement" or the "BSD 3-Clause License" (both given in full below). This is with the intention of later offering all of Biopython under this dual licensing approach. 4 | 5 | All other code (that is, code not in the "src/domainator/Bio" directory) is Licensed under the MIT License agreement (given in full below). 6 | 7 | MIT License 8 | 9 | Copyright (c) 2023 Sean R. 
Johnson 10 | 11 | Permission is hereby granted, free of charge, to any person obtaining a copy 12 | of this software and associated documentation files (the "Software"), to deal 13 | in the Software without restriction, including without limitation the rights 14 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 15 | copies of the Software, and to permit persons to whom the Software is 16 | furnished to do so, subject to the following conditions: 17 | 18 | The above copyright notice and this permission notice shall be included in all 19 | copies or substantial portions of the Software. 20 | 21 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 22 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 23 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 24 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 25 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 26 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 27 | SOFTWARE. 28 | 29 | 30 | Biopython License Agreement 31 | Permission to use, copy, modify, and distribute this software and its documentation with or without modifications and for any purpose and without fee is hereby granted, provided that any copyright notices appear in all copies and that both those copyright notices and this permission notice appear in supporting documentation, and that the names of the contributors or copyright holders not be used in advertising or publicity pertaining to distribution of the software without specific prior permission. 
32 | 33 | THE CONTRIBUTORS AND COPYRIGHT HOLDERS OF THIS SOFTWARE DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 34 | 35 | BSD 3-Clause License 36 | Copyright (c) 1999-2023, The Biopython Contributors All rights reserved. 37 | 38 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 39 | 40 | Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 41 | Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 42 | Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 43 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 44 | -------------------------------------------------------------------------------- /conda_env.yml: -------------------------------------------------------------------------------- 1 | name: domainator 2 | channels: 3 | - conda-forge 4 | - bioconda 5 | dependencies: 6 | - python>=3.10 7 | - pip>=23.3 8 | - setuptools>=61.0.0 9 | - coverage~=6.3.2 10 | - cd-hit~=4.8.1 11 | - diamond>=2.0.0 12 | - hmmer~=3.3.2 13 | - usearch 14 | - pip: 15 | - -e .[test] 16 | -------------------------------------------------------------------------------- /docs/README.md: -------------------------------------------------------------------------------- 1 | # Documentation 2 | - [Home](../README.md) 3 | - [File Formats](file_formats.md) 4 | - [Examples](examples.md) 5 | - [Developing Domainator](developing_domainator.md) 6 | - [ESM-2 3B 3Di and foldseek integration](esm_3b_foldseek.md) 7 | - [Limitations and FAQ](limitations_and_FAQ.md) 8 | -------------------------------------------------------------------------------- /docs/developing_domainator.md: -------------------------------------------------------------------------------- 1 | [index](README.md) 2 | # Developing Domainator 3 | 4 | ## testing 5 | 6 | run all tests 7 | ``` 8 | pytest test 9 | ``` 10 | 11 | examine test coverage 12 | ``` 13 | coverage run -m pytest test 14 | coverage report -m 15 | coverage html 16 | ``` 17 | then open htmlcov/index.html in a browser to see coverage 18 | 19 | 20 | ## Converting the manual 
into a pdf 21 | on Ubuntu 22 | 23 | Install pandoc 24 | ``` 25 | sudo apt-get install pandoc texlive-latex-base texlive-fonts-recommended texlive-extra-utils texlive-latex-extra librsvg2-bin 26 | ``` 27 | 28 | run pandoc 29 | ``` 30 | pandoc -V geometry:margin=0.5in -V geometry:paperwidth=13.5in README.md -o README.pdf 31 | ``` 32 | 33 | ## Projects that domainator depends on 34 | 35 | - hmmer3 36 | - usearch 37 | - python3 38 | - biopython 39 | - pytest 40 | - pytest-datadir 41 | - pandas 42 | - seaborn 43 | - cd-hit 44 | - scipy 45 | - pyhmmer 46 | - umap-learn 47 | - diamond 48 | - coverage 49 | -------------------------------------------------------------------------------- /docs/esm_3b_foldseek.md: -------------------------------------------------------------------------------- 1 | Domainator can support sequence searches at high sensitivity across deep evolutionary distances by leveraging the ESM-2 3B 3Di model, described in the manuscript: 2 | [https://www.biorxiv.org/content/10.1101/2023.07.26.550718v1](https://www.biorxiv.org/content/10.1101/2023.07.26.550718v1) 3 | 4 | # Installation 5 | In addition to Domainator, the esmologs package ([https://github.com/seanrjohnson/esmologs](https://github.com/seanrjohnson/esmologs)) and pytorch with CUDA support must be installed. The easiest way to accomplish this is to create a new conda environment with esmologs, and install domainator into that environment. 6 | 7 | You also need to download the ESM-2 3B 3Di fine tuning checkpoint, from [https://zenodo.org/record/8174960](https://zenodo.org/record/8174960) 8 | 9 | ## download the checkpoint 10 | 11 | ```bash 12 | wget https://zenodo.org/record/8174960/files/ESM-2_3B_3Di.pt 13 | ``` 14 | 15 | ## Install via conda 16 | 17 | ```bash 18 | git clone https://github.com/seanrjohnson/esmologs.git 19 | cd esmologs 20 | 21 | conda env create --name domainator_esmologs --file conda_env.yml 22 | 23 | cd .. 
24 | git clone https://github.com/nebiolabs/domainator.git 25 | cd domainator 26 | conda env update --name domainator_esmologs --file conda_env.yml 27 | 28 | conda activate domainator_esmologs 29 | pytest test 30 | cd .. 31 | ``` 32 | 33 | ## install via Apptainer/Singularity 34 | 35 | ```bash 36 | git clone https://github.com/nebiolabs/domainator.git 37 | cd domainator 38 | 39 | apptainer build domainator_esmologs.sif domainator_esmologs.def 40 | 41 | # if using wsl, you need to use --nvccli. In other linux, --nv also works. These flags make the GPU visible to the container. 42 | apptainer shell --nvccli domainator_esmologs.sif 43 | 44 | # in the apptainer shell 45 | cd /opt/domainator 46 | pytest test 47 | exit # or ctrl + d 48 | ``` 49 | 50 | # Using ESM-2 3B 3Di with domainate.py 51 | 52 | In this workflow, we first create a reference Foldseek 3Di database, and then use domainator to annotate contigs from that database 53 | 54 | (Note that domain_search.py with ESM-2 3B 3Di is not yet supported) 55 | 56 | ## conda 57 | ```bash 58 | conda activate domainator_esmologs 59 | 60 | # convert a reference file to 3di 61 | predict_from_ESM2_to_3Di.py -i domainator/test/data/foldseek/FeSOD_20.fasta -o FeSOD_20.3di.fasta --weights ESM-2_3B_3Di.pt --device cuda:0 62 | 63 | # convert the amino acid and 3di fasta files into a foldseek database 64 | fasta2foldseek.py --aa domainator/test/data/foldseek/FeSOD_20.fasta --tdi FeSOD_20.3di.fasta -o FeSOD 65 | 66 | # run domainate.py with the foldseek reference database. In this example, our query is the same file we used to make the database, but it could be any fasta or genbank file. 
67 | domainate.py -i domainator/test/data/foldseek/FeSOD_20.fasta -o FeSOD_all_to_all_3Di.gb --foldseek FeSOD --esm2_3Di_weights ESM-2_3B_3Di.pt --esm2_3Di_device cuda:0 68 | ``` 69 | 70 | ## Apptainer/Singularity 71 | 72 | ```bash 73 | 74 | # convert a reference file to 3di 75 | apptainer exec --nv domainator/domainator_esmologs.sif predict_from_ESM2_to_3Di.py -i domainator/test/data/foldseek/FeSOD_20.fasta -o FeSOD_20.3di.fasta --weights ESM-2_3B_3Di.pt --device cuda:0 76 | 77 | # convert the amino acid and 3di fasta files into a foldseek database 78 | apptainer exec domainator/domainator_esmologs.sif fasta2foldseek.py --aa domainator/test/data/foldseek/FeSOD_20.fasta --tdi FeSOD_20.3di.fasta -o FeSOD 79 | 80 | # run domainate.py with the foldseek reference database. In this example, our query is the same file we used to make the database, but it could be any fasta or genbank file. 81 | apptainer exec --nv domainator/domainator_esmologs.sif domainate.py -i domainator/test/data/foldseek/FeSOD_20.fasta -o FeSOD_all_to_all_3Di.gb --foldseek FeSOD --esm2_3Di_weights ESM-2_3B_3Di.pt --esm2_3Di_device cuda:0 82 | ``` 83 | 84 | -------------------------------------------------------------------------------- /docs/limitations_and_FAQ.md: -------------------------------------------------------------------------------- 1 | # Limitations 2 | ## Large reference databases 3 | Domainator currently loads reference databases into memory, so it is not suitable for large reference databases. For example, it is suitable for annotating using Pfam as a reference with ~30,000 profiles, but not NCBI nr with millions of sequences. 4 | 5 | ## Contigs vs genomes 6 | A major limitation of Domainator is that it operates on the contig level, not the genome level. So, for example, when reporting taxonomy, fragmented genome assemblies will be counted multiple times, once for each contig in the assembly. 
7 | 8 | ## Scores and E-values 9 | Domainator uses scores and evalues somewhat inconsistently. Some programs allow filtering by evalue; others allow filtering by local alignment scores. It would be nice to be more consistent about that. Also, Domainator evalues are typically not adjusted by database size. Z = 1000 by default in most cases. One possible solution could be to stop using evalues and scores altogether, and do most operations in the space of EFI scores. 10 | 11 | ## plot_contigs.py handling of large contigs and large lists of contigs 12 | 13 | plot_contigs.py output looks best when the contigs are of sizes in the range of kb to 10s of kb, and when there are fewer than about 300 of them. 14 | 15 | For whole genomes, [genome_notebook](https://github.com/dbikard/genomenotebook) might work better. 16 | 17 | [Geneious Prime](https://www.geneious.com/) works very well for visualizing Domainator annotations, but is commercial software. 18 | 19 | We welcome any other recommendations for contig visualization software. 20 | 21 | # Frequently asked questions 22 | 23 | ![Detective Domainic](media/detective_Domainic_transparent_background.png) 24 | ## What's with all the otters? 25 | The Domainator mascot is Domainic, the Domain-otter. Domainic is a North American river otter. Besides the irresistible pun and cuteness, the choice of an otter mascot was inspired by the river otters that live in the pond on the New England Biolabs Ipswich campus, where most of Domainator was written. You may see Domainic throughout our documentation, donning his detective gear as he investigates new proteins and genome neighborhoods. The otters holding hands as the M in our logo are a metaphor for a multidomain protein. Similar to the domains of many two-domain proteins, the individual otters are connected by a flexible linker, and can function independently when separated, but are happiest when they are together. 
While river otters are not known to hold hands (it's sea otters that exhibit that behavior), we hope you'll forgive our creative license in incorporating hand-holding river otters into our logo. 26 | ## Can Domainator handle eukaryotic genomes with introns in their CDSs? 27 | Yes! Domainator can add domain annotations across introns and process those files just the same as files without introns. Input files must be in GenBank format, and the gene must be annotated as a CDS with a complex location, for example `join(11356..11374,11523..17083)`. One potential complication is that eukaryotic genome annotations often include multiple gene models for the same gene. Domainator considers these to be distinct CDSs, so tools like `domain_search.py` and `select_by_cds.py` may behave in unexpected ways, extracting redundant hits, or smaller neighborhoods than expected. If possible, select non-redundant representative gene models and delete others from the GenBank file before using it as input to Domainator. If there is enough demand, we will consider adding automated tools to Domainator for better handling of alternative gene models. 
28 | -------------------------------------------------------------------------------- /docs/media/Overview_diagram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nebiolabs/domainator/663454806768f7462b33ade914684c6b3487dab2/docs/media/Overview_diagram.png -------------------------------------------------------------------------------- /docs/media/detective_Domainic_transparent_background.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nebiolabs/domainator/663454806768f7462b33ade914684c6b3487dab2/docs/media/detective_Domainic_transparent_background.png -------------------------------------------------------------------------------- /docs/media/genome_mining_workflow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nebiolabs/domainator/663454806768f7462b33ade914684c6b3487dab2/docs/media/genome_mining_workflow.png -------------------------------------------------------------------------------- /docs/media/hmm_profile_comparison.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nebiolabs/domainator/663454806768f7462b33ade914684c6b3487dab2/docs/media/hmm_profile_comparison.png -------------------------------------------------------------------------------- /domainator.def: -------------------------------------------------------------------------------- 1 | Bootstrap: docker 2 | From: condaforge/miniforge3:24.9.2-0 3 | 4 | %files 5 | ./src /opt/domainator/src 6 | ./conda_env.yml /opt/domainator 7 | ./test /opt/domainator/test 8 | ./pyproject.toml /opt/domainator 9 | 10 | %post 11 | cd /opt/domainator 12 | conda env update --name base --file conda_env.yml 13 | -------------------------------------------------------------------------------- /domainator_esmologs.def: 
-------------------------------------------------------------------------------- 1 | Bootstrap: docker 2 | From: continuumio/miniconda3:23.10.0-1 3 | 4 | %files 5 | ./src /opt/domainator/src 6 | ./conda_env.yml /opt/domainator 7 | ./test /opt/domainator/test 8 | ./pyproject.toml /opt/domainator 9 | 10 | 11 | %post 12 | 13 | git clone https://github.com/seanrjohnson/esmologs.git 14 | cd esmologs 15 | 16 | conda env update --name base --file conda_env.yml 17 | 18 | cd /opt/domainator 19 | conda env update --name base --file conda_env.yml 20 | -------------------------------------------------------------------------------- /package.sh: -------------------------------------------------------------------------------- 1 | pandoc -V geometry:margin=0.5in -V geometry:paperwidth=13.5in README.md -o README.pdf 2 | zip -r domainator.zip README.md README.pdf pyproject.toml conda_env.yml test src -x "src/domainator.egg-info/*" "src/domainator/__pycache__/*" "test/__pycache__/*" "test/.ipynb_checkpoints/*" "test/data/.ipynb_checkpoints/*" "src/Bio/__pycache__/*" 3 | 4 | # maybe create a directory in the zip? 5 | # maybe instead of having all of those exclusions, make a temporary directory, copy in everything important, zip it, and delete it. 6 | 7 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=61.0.0", "wheel"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name="domainator" 7 | authors = [ 8 | {name = "Sean Johnson", email = "sjohnson@neb.com"}, 9 | {name = "Andrew Ge"}, 10 | {name = "Yu-Cheng Lin"}, 11 | {name = "Zhiyi Sun"} 12 | ] 13 | description="A flexible and modular software suite for domain-based gene neighborhood and protein search, extraction, and clustering." 
14 | readme = "README.md" 15 | requires-python = ">=3.9" 16 | 17 | classifiers=[ 18 | "Programming Language :: Python :: 3", 19 | ] 20 | license = {file = "LICENSE.txt"} 21 | dynamic = ["version"] 22 | dependencies = [ 23 | "pandas >=2.1.0", 24 | "seaborn >=0.13.0", 25 | "scipy >=1.11.2", 26 | "pyhmmer >=0.10.2 ", 27 | "umap-learn >=0.5.4", 28 | "h5py >=3.9.0", 29 | "jsonargparse >=4.18.0", 30 | "psutil >=5.9.6", 31 | "tqdm >=4.65.0", 32 | "pyrodigal >=3.0.1", 33 | "bashplotlib >=0.6.5", 34 | "requests >=2.31.0" 35 | ] 36 | 37 | [project.optional-dependencies] 38 | test = ["pytest >=7.4.2", 39 | "pytest-datadir~=1.4.1"] 40 | # add dependency for Foldseek/ESM2 related stuff. 41 | 42 | [project.urls] 43 | Bug_Tracker = "https://github.com/nebiolabs/domainator/issues" 44 | 45 | 46 | [tool.setuptools] 47 | package-dir = {"" = "src"} 48 | 49 | 50 | [tool.setuptools.dynamic] 51 | version = {attr = "domainator.__version__"} 52 | 53 | [project.scripts] 54 | "build_projection.py" = "domainator:build_projection._entrypoint" 55 | "build_ssn.py" = "domainator:build_ssn._entrypoint" 56 | "build_tree.py" = "domainator:build_tree._entrypoint" 57 | "color_genbank.py" = "domainator:color_genbank._entrypoint" 58 | "color_table_to_legend.py" = "domainator:color_table_to_legend._entrypoint" 59 | "compare_contigs.py" = "domainator:compare_contigs._entrypoint" 60 | "deduplicate_genbank.py" = "domainator:deduplicate_genbank._entrypoint" 61 | "domain_search.py" = "domainator:domain_search._entrypoint" 62 | "domainate.py" = "domainator:domainate._entrypoint" 63 | "domainator_db_download.py" = "domainator:domainator_db_download._entrypoint" 64 | "enum_report.py" = "domainator:enum_report._entrypoint" 65 | "extract_domains.py" = "domainator:extract_domains._entrypoint" 66 | "extract_peptides.py" = "domainator:extract_peptides._entrypoint" 67 | "extract_unannotated.py" = "domainator:extract_unannotated._entrypoint" 68 | "filter_domains.py" = "domainator:filter_domains._entrypoint" 69 | 
"genbank_to_fasta.py" = "domainator:genbank_to_fasta._entrypoint" 70 | "hmmer_build.py" = "domainator:hmmer_build._entrypoint" 71 | "hmmer_compare.py" = "domainator:hmmer_compare._entrypoint" 72 | "hmmer_report.py" = "domainator:hmmer_report._entrypoint" 73 | "hmmer_search.py" = "domainator:hmmer_search._entrypoint" 74 | "hmmer_select.py" = "domainator:hmmer_select._entrypoint" 75 | "matrix_report.py" = "domainator:matrix_report._entrypoint" 76 | "partition_seqfile.py" = "domainator:partition_seqfile._entrypoint" 77 | "select_by_cds.py" = "domainator:select_by_cds._entrypoint" 78 | "select_by_contig.py" = "domainator:select_by_contig._entrypoint" 79 | "seq_dist.py" = "domainator:seq_dist._entrypoint" 80 | "summary_report.py" = "domainator:summary_report._entrypoint" 81 | "transform_matrix.py" = "domainator:transform_matrix._entrypoint" 82 | "plot_contigs.py" = "domainator:plot_contigs._entrypoint" 83 | "trim_contigs.py" = "domainator:trim_contigs._entrypoint" -------------------------------------------------------------------------------- /src/domainator/Bio/Data/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2000 Andrew Dalke. All rights reserved. 2 | # 3 | # This file is part of the Biopython distribution and governed by your 4 | # choice of the "Biopython License Agreement" or the "BSD 3-Clause License". 5 | # Please see the LICENSE file that should have been included as part of this 6 | # package. 7 | 8 | """Collections of various bits of useful biological data.""" 9 | -------------------------------------------------------------------------------- /src/domainator/Bio/GenBank/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2000 by Jeffrey Chang, Brad Chapman. All rights reserved. 2 | # Copyright 2006-2017 by Peter Cock. All rights reserved. 3 | # 4 | # This code is part of the Biopython distribution and governed by its 5 | # license. 
Please see the LICENSE file that should have been included 6 | # as part of this package. 7 | 8 | from domainator.Bio.GenBank.utils import * 9 | 10 | # if __name__ == "__main__": 11 | # from Bio._utils import run_doctest 12 | 13 | # run_doctest() 14 | -------------------------------------------------------------------------------- /src/domainator/Bio/LICENSE.rst: -------------------------------------------------------------------------------- 1 | Files in the Bio directory are modified from the Biopython project. The original files are licensed under the Biopython License Agreement and the BSD 3-Clause License. The original files are available at:https://github.com/biopython/biopython 2 | 3 | Files in this directory are licensed as follows: 4 | 5 | Biopython is currently released under the "Biopython License Agreement" (given in full below). Unless stated otherwise in individual file headers, all Biopython's files are under the "Biopython License Agreement". 6 | 7 | Some files are explicitly dual licensed under your choice of the "Biopython License Agreement" or the "BSD 3-Clause License" (both given in full below). This is with the intention of later offering all of Biopython under this dual licensing approach. 8 | 9 | Biopython License Agreement 10 | Permission to use, copy, modify, and distribute this software and its documentation with or without modifications and for any purpose and without fee is hereby granted, provided that any copyright notices appear in all copies and that both those copyright notices and this permission notice appear in supporting documentation, and that the names of the contributors or copyright holders not be used in advertising or publicity pertaining to distribution of the software without specific prior permission. 
11 | 12 | THE CONTRIBUTORS AND COPYRIGHT HOLDERS OF THIS SOFTWARE DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 13 | 14 | BSD 3-Clause License 15 | Copyright (c) 1999-2023, The Biopython Contributors All rights reserved. 16 | 17 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 18 | 19 | Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 20 | Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 21 | Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 22 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
def SwissIterator(source):
    """Yield one SeqRecord per entry of a Swiss-Prot/UniProt flat file.

    ``source`` is handed unchanged to ``SwissProt.parse`` (a file-like
    object or a path). Each parsed entry is turned into a ``SeqRecord``
    whose id is the entry's first accession, and whose annotations,
    database cross-references and literature references are copied over
    from the parsed entry.

    This handles the flat-file "swiss" format (Swiss-Prot, TrEMBL,
    UniProtKB); it is normally reached through
    ``SeqIO.parse(..., format="swiss")`` rather than called directly.

    Raises:
        ValueError: if a literature reference carries a cross-reference key
            other than PubMed, MEDLINE, DOI or AGRICOLA.
    """
    for entry in SwissProt.parse(source):
        record = SeqRecord(
            Seq(entry.sequence),
            id=entry.accessions[0],
            name=entry.entry_name,
            description=entry.description,
            features=entry.features,
        )

        # Cross references become "database:accession" strings, without duplicates.
        for xref in entry.cross_references:
            if len(xref) >= 2:
                tag = f"{xref[0]}:{xref[1]}"
                if tag not in record.dbxrefs:
                    record.dbxrefs.append(tag)

        notes = record.annotations
        notes["molecule_type"] = "protein"
        notes["accessions"] = entry.accessions
        if entry.protein_existence:
            notes["protein_existence"] = entry.protein_existence

        # Each of these is a (date, version) pair when present. The
        # sequence-update version deliberately overwrites the one taken from
        # the creation stamp, because both map to "sequence_version".
        dated_fields = (
            (entry.created, "date", "sequence_version"),
            (entry.sequence_update, "date_last_sequence_update", "sequence_version"),
            (entry.annotation_update, "date_last_annotation_update", "entry_version"),
        )
        for stamp, date_key, version_key in dated_fields:
            if stamp:
                notes[date_key], notes[version_key] = stamp

        if entry.gene_name:
            notes["gene_name"] = entry.gene_name
        notes["organism"] = entry.organism.rstrip(".")
        notes["taxonomy"] = entry.organism_classification
        notes["ncbi_taxid"] = entry.taxonomy_id
        if entry.host_organism:
            notes["organism_host"] = entry.host_organism
        if entry.host_taxonomy_id:
            notes["host_ncbi_taxid"] = entry.host_taxonomy_id
        if entry.comments:
            notes["comment"] = "\n".join(entry.comments)

        if entry.references:
            collected = notes["references"] = []
            for cited in entry.references:
                ref = SeqFeature.Reference()
                ref.comment = " ".join("%s=%s;" % pair for pair in cited.comments)
                for key, value in cited.references:
                    if key == "PubMed":
                        ref.pubmed_id = value
                    elif key == "MEDLINE":
                        ref.medline_id = value
                    elif key in ("DOI", "AGRICOLA"):
                        # Recognised, but not stored on the Reference object.
                        continue
                    else:
                        raise ValueError(f"Unknown key {key} found in references")
                ref.authors = cited.authors
                ref.title = cited.title
                ref.journal = cited.location
                collected.append(ref)

        if entry.keywords:
            notes["keywords"] = entry.keywords

        yield record
class TabIterator(SequenceIterator):
    """Iterate over the records of a simple two-column tab-separated file."""

    def __init__(self, source):
        """Set up iteration over "identifier<TAB>sequence" lines.

        Arguments:
         - source - file-like object opened in text mode, or a path to a file

        Every non-blank line must contain exactly one tab. The text before
        the tab becomes both the record's .id and .name (spaces included);
        the text after it is the sequence. Blank lines are skipped.

        Examples
        --------
        >>> with open("GenBank/NC_005816.tsv") as handle:
        ...     for record in TabIterator(handle):
        ...         print("%s length %i" % (record.id, len(record)))
        gi|45478712|ref|NP_995567.1| length 340
        gi|45478713|ref|NP_995568.1| length 260
        gi|45478714|ref|NP_995569.1| length 64
        gi|45478715|ref|NP_995570.1| length 123
        gi|45478716|ref|NP_995571.1| length 145
        gi|45478717|ref|NP_995572.1| length 357
        gi|45478718|ref|NP_995573.1| length 138
        gi|45478719|ref|NP_995574.1| length 312
        gi|45478720|ref|NP_995575.1| length 99
        gi|45478721|ref|NP_995576.1| length 90

        """
        super().__init__(source, mode="t", fmt="Tab-separated plain-text")

    def parse(self, handle):
        """Return a generator of SeqRecord objects read from ``handle``."""
        return self.iterate(handle)

    def iterate(self, handle):
        """Generate a SeqRecord for every non-blank line of ``handle``."""
        for line in handle:
            columns = line.split("\t")
            if len(columns) != 2:
                # Whitespace-only lines are tolerated whatever their tab count.
                if line.strip() == "":
                    continue
                raise ValueError(
                    "Each line should have one tab separating the"
                    " title and sequence, this line has %i tabs: %r"
                    % (line.count("\t"), line)
                )
            # strip() also drops the trailing newline from the sequence column
            title = columns[0].strip()
            residues = columns[1].strip()
            yield SeqRecord(Seq(residues), id=title, name=title, description="")
def as_tab(record):
    """Format ``record`` as a single "id<TAB>sequence" line ending in a newline.

    The id is passed through ``_clean`` and the sequence is obtained with
    ``_get_seq_string`` (which rejects a missing sequence). Neither field
    may contain a tab, newline or carriage return.
    """
    title = _clean(record.id)
    seq = _get_seq_string(record)  # Catches sequence being None
    # Same checks, same order as spelling them out: title first, then seq.
    for field in (title, seq):
        for forbidden in ("\t", "\n", "\r"):
            assert forbidden not in field
    return f"{title}\t{seq}\n"
class RawAndDefaultsFormatter(argparse.ArgumentDefaultsHelpFormatter, argparse.RawTextHelpFormatter):
    """argparse help formatter combining both parent behaviours.

    Keeps the description and help strings as written (raw text, no
    re-wrapping) and appends each argument's default value to its help.
    """
4 | """ 5 | 6 | 7 | from jsonargparse import ArgumentParser, ActionConfigFile 8 | import sys 9 | from typing import Tuple, List, Optional, Set, NamedTuple, Union, Dict 10 | from os import PathLike 11 | from domainator import __version__, RawAndDefaultsFormatter 12 | from domainator.color_genbank import read_color_table 13 | from collections import OrderedDict 14 | import html 15 | 16 | 17 | def color_table_to_legend(table: Dict[str, str], svg: str, title: str): 18 | # Constants for layout and styling 19 | title_font_size = 24 20 | item_font_size = 20 21 | stroke_width = 2 # Width of the stroke for boxes 22 | padding = 10 # Padding around the content inside the legend box 23 | title_space = 40 # Space allocated for the title at the top of the legend 24 | 25 | box_height = item_font_size * 2 # Height of each color box, increased to add whitespace 26 | box_width = box_height # Width of color boxes 27 | text_offset_y = box_height/2 # Adjust to vertically center text in the color box 28 | 29 | longest_key = max(len(k) for k in table.keys()) 30 | 31 | text_offset_x = box_width + padding*2 # X offset for text to align it nicely with boxes 32 | total_height = title_space + len(table) * box_height + 2 * padding 33 | total_width = max(box_width + padding*3 + (longest_key * item_font_size)/2, padding*3 + (len(title) * title_font_size) /2) # Assumed extra space for text 34 | 35 | 36 | with open(svg, "w") as f: 37 | f.write(f""" 38 | 39 | 40 | 41 | {html.escape(title)} 42 | """) 43 | y = title_space 44 | for k, v in table.items(): 45 | # Draw the color box with a black border 46 | f.write(f""" 47 | {html.escape(k)} 48 | """) 49 | y += box_height 50 | f.write(""" 51 | 52 | """) 53 | 54 | def main(argv): 55 | parser = ArgumentParser(f"\nversion: {__version__}\n\n" + __doc__, formatter_class=RawAndDefaultsFormatter) 56 | 57 | parser.add_argument("-i", "--input", default=None, nargs="+", type=str, required=False, 58 | help="""names of color table files. 
If not supplied, reads from stdin.\n 59 | Files are tab separated with two columns and no header, columns are: annotation, hex color. For example: CCDB cc0000""") 60 | parser.add_argument("--svg", default=None, type=str, required=True, 61 | help="name of output svg file") 62 | parser.add_argument("--title", default="Legend", type=str, required=False, 63 | help="Title of the legend. Default: 'Legend'") 64 | #TODO: font, font size, indicator shape, etc. 65 | 66 | parser.add_argument('--config', action=ActionConfigFile) 67 | 68 | params = parser.parse_args(argv) 69 | 70 | ### validate input 71 | 72 | 73 | if params.input is None: 74 | input_path = (sys.stdin,) 75 | else: 76 | input_path = params.input 77 | 78 | table = OrderedDict() 79 | for color_table_file in input_path: 80 | for k, v in read_color_table(color_table_file).items(): 81 | table[k] = v 82 | 83 | ### Run 84 | 85 | color_table_to_legend(table, params.svg, params.title) 86 | 87 | def _entrypoint(): 88 | main(sys.argv[1:]) 89 | 90 | if __name__ == '__main__': 91 | main(sys.argv[1:]) 92 | 93 | 94 | -------------------------------------------------------------------------------- /src/domainator/foldseek.py: -------------------------------------------------------------------------------- 1 | try: 2 | from esmologs.ESM2_to_3Di import ESM2_to_3Di 3 | from esmologs.predict_from_ESM2_to_3Di import convert_batch 4 | from esmologs.fasta2foldseek import fasta2foldseek 5 | import torch 6 | except ImportError: 7 | pass 8 | 9 | import psutil 10 | import tempfile 11 | import subprocess 12 | from typing import List, Iterable, Tuple, Union, Iterator 13 | from collections import namedtuple 14 | 15 | # define a named tuple for hits with fields "query,target,qheader,theader,fident,alnlen,mismatch,gapopen,qstart,qend,tstart,tend,evalue,bits" 16 | FoldseekHit = namedtuple("Hit", ["query","target","qheader","theader","pident","alnlen","mismatch","gapopen","qstart","qend","tstart","tend","evalue","bits", "qlen", "tlen"]) 17 | 18 
def search(database_path, proteins, foldseek, cpu, E) -> Iterable[FoldseekHit]:
    """Run a foldseek search of the given sequences against a database.

    Args:
        database_path: path of the foldseek target database.
        proteins: indexable collection of protein sequences, parallel to
            ``foldseek``. Each used entry must support ``textize()`` and the
            result must expose ``name``/``description`` (bytes) and
            ``sequence``.
        foldseek: iterable of 3Di FASTA strings, one per protein. ``None``
            entries mark proteins to skip.
        cpu: number of threads to pass to foldseek. ``None`` or a
            non-positive value leaves the thread count to foldseek.
        E: E-value threshold passed to ``foldseek search -e``.

    Yields:
        FoldseekHit: one per line of the ``convertalis`` output, with every
        field left as the string read from the file.

    Raises:
        RuntimeError: if ``foldseek search`` or ``foldseek convertalis``
            exits with a non-zero status; the message includes its stderr.
    """
    with tempfile.TemporaryDirectory() as tmpdirname:
        out_base_name = tmpdirname + "/output"
        protein_fasta_name = tmpdirname + "/protein.fasta"
        threedi_fasta_name = tmpdirname + "/threedi.fasta"
        foldseek_tmpfolder = tmpdirname + "/foldseek_tmpfolder"
        aln_path = tmpdirname + "/aln"
        results_path = foldseek_tmpfolder + "/results.tsv"

        # Write the amino-acid and 3Di sequences to two parallel FASTA files.
        num_seqs = 0
        with open(protein_fasta_name, "w") as protein_f, open(threedi_fasta_name, "w") as threedi_f:
            for i, foldseek_seq in enumerate(foldseek):
                if foldseek_seq is None:
                    continue
                num_seqs += 1

                protein = proteins[i].textize()

                protein_f.write(f">{protein.name.decode('utf-8')} {protein.description.decode('utf-8')}\n{protein.sequence}\n")
                threedi_f.write(foldseek_seq + "\n")

        if num_seqs == 0:
            return  # no sequences to search, yield nothing

        fasta2foldseek(protein_fasta_name, threedi_fasta_name, out_base_name)

        foldseek_options = ["foldseek", "search", out_base_name, database_path, aln_path, foldseek_tmpfolder, "-e", str(E)]
        # Test for None first: comparing None with an int raises TypeError.
        if cpu is not None and cpu > 0:
            foldseek_options += ["--threads", str(cpu)]

        # subprocess.run drains both pipes while waiting. Popen(...).wait()
        # with PIPE can deadlock once the child fills a pipe buffer.
        foldseek_out = subprocess.run(foldseek_options, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        if foldseek_out.returncode != 0:
            raise RuntimeError(f"foldseek exited with code {foldseek_out.returncode}:\n{foldseek_out.stderr.decode('utf-8')}")

        convertalis_out = subprocess.run(
            [
                "foldseek", "convertalis",
                "--format-output", "query,target,qheader,theader,pident,alnlen,mismatch,gapopen,qstart,qend,tstart,tend,evalue,bits,qlen,tlen",
                out_base_name, database_path, aln_path, results_path,
            ],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )
        if convertalis_out.returncode != 0:
            raise RuntimeError(f"convertalis exited with code {convertalis_out.returncode}:\n{convertalis_out.stderr.decode('utf-8')}")

        with open(results_path, "r") as f:
            for line in f:
                yield FoldseekHit(*line.strip().split("\t"))
If not supplied, reads from stdin.") 18 | 19 | parser.add_argument('-o', '--output', default=None, required=False, 20 | help="the name of the output fasta file. If not supplied writes to stdout.") 21 | 22 | params = parser.parse_args(argv) 23 | 24 | ### Figure out what input and output files #### 25 | 26 | if params.input is None: 27 | genbanks = [sys.stdin] 28 | else: 29 | genbanks = params.input 30 | 31 | if params.output is None: 32 | out = sys.stdout 33 | else: 34 | out = open(params.output, "w") 35 | 36 | for rec in parse_seqfiles(genbanks, filetype_override="genbank"): 37 | SeqIO.write(rec, out, "fasta") 38 | 39 | if params.output is not None: 40 | out.close() 41 | 42 | def _entrypoint(): 43 | main(sys.argv[1:]) 44 | 45 | if __name__ == "__main__": 46 | main(sys.argv[1:]) 47 | -------------------------------------------------------------------------------- /src/domainator/hmmer_build.py: -------------------------------------------------------------------------------- 1 | """ 2 | Script to build a HMM profile from a multiple sequence alignment (MSA). 3 | Allows the user to specify the ACC, NAME, and DESC fields of the HMM profile. 
def hmmer_build(file:Union[str,BinaryIO], alphabet:Optional[pyhmmer.easel.Alphabet]=None, name:Optional[str]=None, acc:Optional[str]=None, desc:Optional[str]=None) -> pyhmmer.plan7.HMM:
    """Build an HMM profile from the first alignment in an MSA file.

    Args:
        file: path of the MSA file, or a binary file object to read it from.
        alphabet: alphabet to read the alignment with; ``None`` is passed
            through to ``MSAFile`` unchanged.
        name: NAME field for the profile. When ``None`` the name stored in
            the alignment (if any) is left untouched.
        acc: ACC field for the profile; ``None`` leaves it untouched.
        desc: DESC field for the profile; ``None`` leaves it untouched.

    All supplied strings are passed through ``sanitize_string`` before being
    stored on the alignment.

    Returns:
        The HMM built from the alignment with a default ``Builder`` and
        ``Background`` for the alignment's alphabet.

    Raises:
        ValueError: if the input contains no alignment.
    """
    with MSAFile(file, digital=True, alphabet=alphabet) as msa_file:
        msa = msa_file.read()

    if msa is None:
        # read() returns None at end of input; fail with a clear message
        # instead of an AttributeError on the assignments below.
        raise ValueError("Error: No alignment found in input MSA file.")

    # name defaults to None, so it must be guarded like acc and desc;
    # sanitize_string(None) would raise TypeError inside re.sub.
    if name is not None:
        msa.name = sanitize_string(name).encode()
    if acc is not None:
        msa.accession = sanitize_string(acc).encode()
    if desc is not None:
        msa.description = sanitize_string(desc).encode()

    builder = pyhmmer.plan7.Builder(msa.alphabet)
    background = pyhmmer.plan7.Background(msa.alphabet)
    hmm, _, _ = builder.build_msa(msa, background)
    return hmm
If not supplied writes to stdout.") 40 | 41 | parser.add_argument("--name", default=None, required=True, type=str, 42 | help="Name of the HMM profile.") 43 | parser.add_argument("--acc", default=None, required=False, type=str, 44 | help="Accession of the HMM profile.") 45 | parser.add_argument("--desc", default=None, required=False, type=str, 46 | help="Description of the HMM profile.") 47 | parser.add_argument("--alphabet", default=None, required=False, type=str.lower, choices={"amino", "dna", "rna"},) 48 | 49 | parser.add_argument("--config", action=ActionConfigFile) 50 | 51 | params = parser.parse_args(argv) 52 | 53 | 54 | if params.input is None: 55 | input_file = sys.stdin.buffer 56 | else: 57 | input_file = open(params.input, "rb") 58 | 59 | if params.output is None: 60 | output_handle = sys.stdout.buffer 61 | else: 62 | output_handle = open(params.output, "wb") 63 | 64 | alphabet = None 65 | if params.alphabet == "amino": 66 | alphabet = pyhmmer.easel.Alphabet.amino() 67 | elif params.alphabet == "dna": 68 | alphabet = pyhmmer.easel.Alphabet.dna() 69 | elif params.alphabet == "rna": 70 | alphabet = pyhmmer.easel.Alphabet.rna() 71 | 72 | hmm = hmmer_build(file=input_file, alphabet=alphabet, name=params.name, acc=params.acc, desc=params.desc) 73 | hmm.write(output_handle) 74 | 75 | if params.input is not None: 76 | input_file.close() 77 | 78 | if params.output is not None: 79 | output_handle.close() 80 | 81 | def _entrypoint(): 82 | main(sys.argv[1:]) 83 | 84 | if __name__ == "__main__": 85 | main(sys.argv[1:]) -------------------------------------------------------------------------------- /src/domainator/hmmer_compare.py: -------------------------------------------------------------------------------- 1 | """Aligns and calculates alignment scores between hmmer3 profiles 2 | 3 | Kind of like hhsearch except much slower and for hmmer3 profiles instead of hhsuite profiles. 
class _hmmer_compare_worker():
    """Callable that scores one query profile against every reference profile.

    Instances are handed to ``Pool.imap`` and so carry all their settings as
    plain attributes.
    """

    def __init__(self, hmmer_targets, alignment=False, k=None, score_cutoff=float("-inf")):
        """Store the reference profiles and the reporting options.

        hmmer_targets: mapping whose values are themselves mappings of
            reference profiles (only ``.values()`` of each level is used).
        alignment: when true, a traceback alignment is computed for each hit.
        k: keep at most this many best hits per query; None keeps all.
        score_cutoff: hits scoring below this are discarded.
        """
        self.hmmer_targets = hmmer_targets
        self.alignment = alignment
        self.k = k
        self.score_cutoff = score_cutoff

    def __call__(self, input_profile):
        """Return the hits for ``input_profile``, best first."""
        best = []
        all_targets = (
            profile
            for dataset in self.hmmer_targets.values()
            for profile in dataset.values()
        )
        for target_profile in all_targets:
            score, backtrace, max_index, match_scores = compare_hmmer(input_profile, target_profile)
            if not (score >= self.score_cutoff):
                continue

            alignment = (
                traceback(input_profile, target_profile, backtrace, max_index, match_scores)
                if self.alignment
                else None
            )
            hit = HmmerHit(score, input_profile.name.decode(), target_profile.name.decode(), alignment)

            # Min-heap of the best hits so far: once it holds k entries,
            # push-then-pop evicts the current smallest.
            has_room = (self.k is None) or (len(best) < self.k)
            if has_room:
                heapq.heappush(best, hit)
            else:
                heapq.heappushpop(best, hit)

        best.sort(reverse=True)
        return best
def main(argv):
    """Command-line entry point for comparing hmmer3 profiles.

    Parses ``argv`` (the argument list without the program name), then runs
    ``hmmer_compare`` writing to the ``--output`` file, or to stdout when no
    output file is given.
    """
    parser = argparse.ArgumentParser(f"\nversion: {__version__}\n\n" + __doc__, formatter_class=RawAndDefaultsFormatter)
    parser.add_argument('-i', '--input', type=str, required=True, nargs='+',
                        help="Input query files. One or more hmm text files with one or more hmmer3 profiles.")
    parser.add_argument('-r', "--reference", type=str, required=True, nargs='+',
                        help="Reference files. One or more hmm text files with one or more hmmer3 profiles.")

    parser.add_argument('--score_cutoff', type=float, default = 0,
                        help="Report alignments with scores greater than or equal to this.") #TODO: what is a reasonable cutoff?

    parser.add_argument('-k', type=int, required=False, default=None,
                        help="Include at most this many of the top hits for each query. Default: Include all hits.")

    parser.add_argument('-o', '--output', type=str, default=None,
                        help="File to write the scores and alignments to.")

    parser.add_argument('--alignments', action='store_true', default=False,
                        help="when activated, will write the alignments to the output.")

    parser.add_argument('--cpu', type=int, default=8, required=False,
                        help="how many cpu threads to use. Default: 8")

    params = parser.parse_args(argv)

    if params.output is None:
        out = sys.stdout
    else:
        out = open(params.output, "w")

    try:
        hmmer_compare(params.input, params.reference, out, params.score_cutoff, params.alignments, params.k, params.cpu)
    finally:
        # Close (and so flush) the output file even when the comparison
        # fails part-way; stdout is never closed here.
        if params.output is not None:
            out.close()
def partition_seqfile(input_path, partitions=None, cdss_per_partition=None):
    """Divide a sequence file into contiguous runs of records.

    input:
        input_path: path to a genbank (peptide or nucleotide) or fasta (peptide) file.
        partitions: desired number of partitions to divide the file into
        cdss_per_partition: how many cdss to try to include in each partition (mutually exclusive with partitions)

    output:
        total_cds_count, [(input_path, offset, recs_to_read)]

    Partitioning is greedy: records are added to the current partition until
    its CDS count meets or exceeds the target, then a new partition starts
    at the next record. When ``partitions`` is given, the target is
    ``total_cds_count // partitions``.

    raises:
        ValueError: if neither or both of partitions / cdss_per_partition
            are given, if partitions is less than 1, or if the file contains
            no sequence records.
    """
    if partitions is None and cdss_per_partition is None:
        raise ValueError("Error: Must specify either partitions or cdss_per_partition")

    if partitions is not None and cdss_per_partition is not None:
        raise ValueError("Error: Must specify only one of partitions or cdss_per_partition")

    if partitions is not None and partitions < 1:
        # Previously 0 gave a bare ZeroDivisionError and negative values
        # silently put every record in its own partition.
        raise ValueError("Error: partitions must be a positive integer")

    offsets, num_proteins = utils.get_offsets(input_path)

    if len(offsets) == 0:
        raise ValueError("Error: No sequence records found in input file (maybe the file has the wrong extension or there is a formatting error in the input file).")

    cds_count = sum(num_proteins)

    if cdss_per_partition is None:
        cdss_per_partition = cds_count // partitions

    out_list = []
    running_sum = 0
    recs_in_buffer = 0
    next_offset = None
    for offset, record_cds_count in zip(offsets, num_proteins):
        if recs_in_buffer == 0:
            # first record of a new partition: remember where it starts
            next_offset = offset
        running_sum += record_cds_count
        recs_in_buffer += 1
        if running_sum >= cdss_per_partition:
            out_list.append((input_path, next_offset, recs_in_buffer))
            running_sum = 0
            recs_in_buffer = 0
    if recs_in_buffer > 0:
        # trailing records that never reached the target
        out_list.append((input_path, next_offset, recs_in_buffer))

    return cds_count, out_list
Partitioning algorithm is greedy, it adds records until the CDS count is met or exceeded, then goes to the next start pointer.") 107 | params = parser.parse_args(args) 108 | 109 | if params.output is None: 110 | out = sys.stdout 111 | else: 112 | out = open(params.output, "w") 113 | 114 | 115 | cds_count, splits = partition_seqfile(params.input, params.partitions, params.cdss_per_partition) 116 | print(cds_count, file=out) 117 | for _, offset, recs in splits: 118 | print(f"{offset}\t{recs}", file=out) 119 | 120 | if params.output is not None: 121 | out.close() 122 | 123 | def _entrypoint(): 124 | main(sys.argv[1:]) 125 | 126 | if __name__ == '__main__': 127 | main(sys.argv[1:]) 128 | -------------------------------------------------------------------------------- /src/domainator/partition_seqids.py: -------------------------------------------------------------------------------- 1 | """Partitions the sequence IDs from a sequence file 2 | 3 | Given an input sequence file, write new text files consisting of the sequence IDs partitioned into groups. 4 | Also prints the total number of records in the file (without a newline at the end). 5 | The "number of records" is the number of CDSs in nucleotide databases, and the number of contigs in protein databases. 6 | 7 | """ 8 | #TODO: maybe deprecate this whole file? 9 | 10 | #TODO: change the purpose of the file to split based on some kind of annotation, like SSN_cluster, then it could be piped into a script for generating group-specific HMMS. 
def partition_seqids(input_path, output_prefix, partitions, ids_per_partition, filetype=None):
    """Write the record ids of the input sequence file(s) into numbered text files.

    input:
        input_path: passed to parse_seqfiles (the command line supplies a list of paths).
        output_prefix: output files are named [output_prefix][zero padded index].txt, index starting at 1.
        partitions: desired number of output files (mutually exclusive with ids_per_partition).
        ids_per_partition: number of ids to write to each file (mutually exclusive with partitions).
        filetype: forwarded to parse_seqfiles as filetype_override.

    output:
        None. Prints the total peptide count of all records to stdout, without a trailing newline.

    raises:
        ValueError: if neither or both of partitions / ids_per_partition are given, if the
            given one is smaller than 1, or if no records are found.
    """
    if partitions is None and ids_per_partition is None:
        raise ValueError("Error: Must specify either partitions or ids_per_partition")

    if partitions is not None and ids_per_partition is not None:
        raise ValueError("Error: Must specify only one of partitions or ids_per_partition")

    # Reject non-positive sizes up front; they used to end in a ZeroDivisionError
    # (or nonsensical output for negative numbers) only after the whole input was parsed.
    requested = partitions if partitions is not None else ids_per_partition
    if requested < 1:
        raise ValueError("Error: partitions and ids_per_partition must be positive integers")

    cds_count = 0
    rec_names = []
    for rec in parse_seqfiles(input_path, filetype_override=filetype):
        cds_count += count_peptides_in_record(rec)
        rec_names.append(rec.id)

    num_recs = len(rec_names)

    if num_recs == 0:
        raise ValueError("Error: No sequence records found in input file (maybe you specified the wrong file type or there is a formatting error in the input file).")

    # -(-a // b) is integer ceiling division: a remainder needs one extra slot.
    if partitions is None:
        partitions = -(-num_recs // ids_per_partition)
    else:
        ids_per_partition = -(-num_recs // partitions)

    # pad the file index so that the output names sort in partition order
    max_digits = len(f"{partitions}")

    for out_index, start in enumerate(range(0, num_recs, ids_per_partition), start=1):
        out_name = output_prefix + str(out_index).zfill(max_digits) + ".txt"
        with open(out_name, "w") as out_file:
            for rec_id in rec_names[start:start + ids_per_partition]:
                print(rec_id, file=out_file)

    print(cds_count, end='')


def main(argv):
    """Command line interface for partition_seqids."""
    parser = argparse.ArgumentParser(f"\nversion: {__version__}\n\n" + __doc__, formatter_class=RawAndDefaultsFormatter)

    parser.add_argument('-i', '--input', default=None,
                        nargs='+', type=str,
                        help="the genbank or fasta files to split the contig names of. Genbank files can be nucleotide (with CDS annotations) or peptide. Fasta files must be peptide.")
    parser.add_argument('--output_prefix', required=True, type=str,
                        help="Output files will be named [output_prefix][0-9]+.txt")

    overwrite_group = parser.add_mutually_exclusive_group(required=True)
    overwrite_group.add_argument('--partitions', type=int, default=None,
                                 help="The number of partitions to divide the ids into, roughly evenly. Actual number of partitions will usually be a little smaller than this number due to bin rounding.")
    overwrite_group.add_argument('--ids_per_partition', type=int, default=None,
                                 help="The number of ids to write to each partition.")
    params = parser.parse_args(argv)

    partition_seqids(params.input, params.output_prefix, params.partitions, params.ids_per_partition)


def _entrypoint():
    """Console-script entry point."""
    main(sys.argv[1:])


if __name__ == '__main__':
    main(sys.argv[1:])
-------------------------------------------------------------------------------- /test/data/CcdB.hmm.h3m: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nebiolabs/domainator/663454806768f7462b33ade914684c6b3487dab2/test/data/CcdB.hmm.h3m -------------------------------------------------------------------------------- /test/data/CcdB.hmm.h3p: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nebiolabs/domainator/663454806768f7462b33ade914684c6b3487dab2/test/data/CcdB.hmm.h3p -------------------------------------------------------------------------------- /test/data/CuSOD_enum_report_test.gb: -------------------------------------------------------------------------------- 1 | LOCUS sp|P0AGD1|SODC_ECOLI 173 aa UNK 06-JUN-2023 2 | DEFINITION sp|P0AGD1|SODC_ECOLI Superoxide dismutase [Cu-Zn] OS=Escherichia 3 | coli (strain K12) OX=562 GN=sodC PE=1 SV=1. 4 | ACCESSION sp|P0AGD1|SODC_ECOLI 5 | VERSION sp|P0AGD1|SODC_ECOLI 6 | KEYWORDS . 7 | SOURCE . 8 | ORGANISM . 9 | . 
10 | FEATURES Location/Qualifiers 11 | Domain_Search 1..173 12 | /program="phmmer" 13 | /database="swissprot_CuSOD_subset" 14 | /description="Superoxide dismutase [Cu-Zn] OS=Escherichia coli (strain K12) OX=562 GN=sodC PE=1 SV=1" 15 | /evalue="2.0e-114" 16 | /score="375.2" 17 | /name="sp|P0AGD1|SODC_ECOLI" 18 | /identity="100.0" 19 | /cds_id="0_1_173" 20 | /rstart="3" 21 | /rend="79" 22 | /rlen="100" 23 | Domainator 1..173 24 | /program="phmmer" 25 | /database="swissprot_CuSOD_subset" 26 | /description="Superoxide dismutase [Cu-Zn] OS=Escherichia coli (strain K12) OX=562 GN=sodC PE=1 SV=1" 27 | /evalue="2.0e-114" 28 | /score="375.2" 29 | /name="sp|P0AGD1|SODC_ECOLI" 30 | /identity="100.0" 31 | /cds_id="0_1_173" 32 | /rstart="3" 33 | /rend="79" 34 | /rlen="100" 35 | Domainator 14..173 36 | /program="phmmer" 37 | /database="swissprot_CuSOD_subset" 38 | /description="Superoxide dismutase-like protein YojM OS=Bacillus subtilis (strain 168) OX=1423 GN=yojM PE=1 SV=1" 39 | /evalue="6.5e-13" 40 | /score="44.8" 41 | /name="sp|O31851|YOJM_BACSU" 42 | /identity="38.1" 43 | /cds_id="0_1_173" 44 | /rstart="3" 45 | /rend="79" 46 | /rlen="100" 47 | ORIGIN 48 | 1 mkrfslaila lvvatgaqaa sekvemnlvt sqgvgqsigs vtitetdkgl efspdlkalp 49 | 61 pgehgfhiha kgscqpatkd gkasaaesag ghldpqntgk hegpegaghl gdlpalvvnn 50 | 121 dgkatdavia prlksldeik dkalmvhvgg dnmsdqpkpl ggggeryacg vik 51 | // 52 | LOCUS sp|O31851|YOJM_BACSU 196 aa UNK 06-JUN-2023 53 | DEFINITION sp|O31851|YOJM_BACSU Superoxide dismutase-like protein YojM 54 | OS=Bacillus subtilis (strain 168) OX=1423 GN=yojM PE=1 SV=1. 55 | ACCESSION sp|O31851|YOJM_BACSU 56 | VERSION sp|O31851|YOJM_BACSU 57 | KEYWORDS . 58 | SOURCE . 59 | ORGANISM . 60 | . 
61 | FEATURES Location/Qualifiers 62 | Domain_Search 1..196 63 | /program="phmmer" 64 | /database="swissprot_CuSOD_subset" 65 | /description="Superoxide dismutase-like protein YojM OS=Bacillus subtilis (strain 168) OX=1423 GN=yojM PE=1 SV=1" 66 | /evalue="6.8e-135" 67 | /score="442.9" 68 | /name="sp|O31851|YOJM_BACSU" 69 | /identity="100.0" 70 | /cds_id="0_1_196" 71 | /rstart="3" 72 | /rend="79" 73 | /rlen="100" 74 | Domainator 1..196 75 | /program="phmmer" 76 | /database="swissprot_CuSOD_subset" 77 | /description="Superoxide dismutase-like protein YojM OS=Bacillus subtilis (strain 168) OX=1423 GN=yojM PE=1 SV=1" 78 | /evalue="6.8e-135" 79 | /score="442.9" 80 | /name="sp|O31851|YOJM_BACSU" 81 | /identity="100.0" 82 | /cds_id="0_1_196" 83 | /rstart="3" 84 | /rend="79" 85 | /rlen="100" 86 | Domainator 36..190 87 | /program="phmmer" 88 | /database="swissprot_CuSOD_subset" 89 | /description="Superoxide dismutase [Cu-Zn] OS=Escherichia coli (strain K12) OX=562 GN=sodC PE=1 SV=1" 90 | /evalue="9.8e-13" 91 | /score="44.3" 92 | /name="sp|P0AGD1|SODC_ECOLI" 93 | /identity="37.4" 94 | /cds_id="0_1_196" 95 | /rstart="3" 96 | /rend="79" 97 | /rlen="100" 98 | ORIGIN 99 | 1 mhrllllmml talgvagcgq kkppdppnrv pekkvvetsa fghhvqlvnr egkavgfiei 100 | 61 kesddegldi hisanslrpg aslgfhiyek gscvrpdfes aggpfnplnk ehgfnnpmgh 101 | 121 hagdlpnlev gadgkvdvim napdtslkkg sklnildedg safiiheqad dyltnpsgns 102 | 181 garivcgall gnnekq 103 | // 104 | -------------------------------------------------------------------------------- /test/data/FeSOD_20.fasta: -------------------------------------------------------------------------------- 1 | >FeSOD_A0A1F4ZT98|unreviewed|Superoxide 2 | MFTLPPLPYPTNALEPYLDTQTLEIHFGKHHATYLKNLNDLLPEKSDADLIPVLQHLDDL 3 | PQDIRVKVRNNAGGVYNHNLYWQCMSPKSKSPSPRLLSSIESGFGTLDAFKEKFSQAALT 4 | HFGSGWAWLVKGTKGLEIVTTPNQDSPVSTGLTPILGLDVWEHAYYLKYQNRRVEYIQAW 5 | WNVVNWDYVSSLLADR 6 | >FeSOD_A0A067LT26|unreviewed|Superoxide 7 | MADYTLVDLPYDYSALEPSISGRIMELHHDKHHKTYVDGANTALVKLQEARDAGDLTFVN 8 
| KLQKDLAFNLAGHVNHTVFWNNLSPDGGDKPTGELAAAIDEFFGSYDKFQAHFTASALGI 9 | QGSGWSILAWDSLGQKLIIEQLYDHQGNLAAATVPILLLDMWEHAFYLDYVNVKADYVKA 10 | FWNIVNWADVQARFDAARTKTQGLFLLS 11 | >FeSOD_A0A2E1RF15|unreviewed|Superoxide 12 | MAHQLPELPYSKDALSPHISAETLDYHHGKHHNAYVTKLNAAIEGTEHAEKSIEELVKTT 13 | SGGLFNNAAQHYNHSFYWNCLAPNAGGSATGTVGEMISSKWGSFDKFKEDFSNAGAANFG 14 | SGWTWLVKNASGDLEIVNTDDAECPLTEGHTPLLTMDVWEHAYYVDYRNARPKYIEAFWN 15 | LVNWDFVNSNL 16 | >FeSOD_A0A538G8K1|unreviewed|Superoxide 17 | MAYSVPPLAYDFDALEPHIDAQTMEIHHDKHHGAYVTNLNAALEGTEWMDRPIESVLASL 18 | DVIPEDKRTAVRNNGGGHANHTFFWEIMGPNGGGEPSGALADAIADTFGGLDQLKTQVND 19 | AGVKRFGSGWTWLVWDGTGLAVKSTPNQDSPVMDSDVPLLGIDVWEHAYYLRYQNRRPDY 20 | LAAWWNVVNWEAVATRYEAAK 21 | >FeSOD_A0A6L8W9C4|unreviewed|Superoxide 22 | MAFELPPLPYAKDALAPHISENTLDFHYGKHHNAYVTNLNGLLEDGDSRSLEEIMKDTAG 23 | DAAKAGVFNNAAQVWNHTFYWHSMKPNGGGKPTGAIADKINEDFGSYEKFAEEFKAAGAT 24 | QFGSGWAWLVLDGGKLKVTKTPNAACPLTDGAKPLLTMDVWEHAYYLDYQNARPKYMETF 25 | LESLVNWDFANENLG 26 | >FeSOD_A0A2H0YVA1|unreviewed|Superoxide 27 | MKYELPKLNYAYDALEPYLDAQTMEIHHSKHHQAYTDNFNKALAEHPDLAEMPVEELLAK 28 | VNELSIKDRDKNALRNHGGGYYNHKLFWEIMDPANKKDESLIKDIETEFGSVDSFKEQFS 29 | ETAKTLLGSGWAWLARDKEGKLHVYGLPNQDSPFQKGHTPIICLDVWEHAYYLKYQNRRP 30 | EYIENWWNVLKLI 31 | >FeSOD_R7J7P3|unreviewed|Superoxide 32 | MHQMPKLPYEMEALAPLMSKETFDFHYGKHLQTYVNNLNKLIVGTPYENLELEQIVCQAD 33 | GGIYNNAAQTWNHTFFFQLLTPEQPSLPDDLAGLLTRDFGSVDQFKEDFTKAALGLFGSG 34 | WVWLVLGKDGKLSLLPTPNAGNPLKDGLKPLLVIDVWEHAYYIDYRNNRAAFIEAFWKLV 35 | NWEKVADLLG 36 | >FeSOD_B8LFE6|unreviewed|Superoxide 37 | MAEYTLPDLPYDYAALEPHISGRIMELHHTKHHATYVKGANDTLDKLAEARADGSIAGKV 38 | YGLSATLSFHLGGHTNHSIFWKNLSPNGGDKPEGDLAAAIDDQFGGFDKFQAHFTAAATT 39 | LQGSGWAILGYDTIGGKLVILQLTDQSDNIPAAIIPVVMLDDMWEHAFYLDYQNVKPDYV 40 | KAWWNVVNWADAAERLGRATSQGKGLIVPA 41 | >FeSOD_A0A1C0AS03|unreviewed|Superoxide 42 | MTYTLPDLDYDYGALAPHIAPEIMELHHSKHHAAYVKGINDALEQLAAAREKGDLGAVNK 43 | LSKDAAFHLGGHINHSVFWKNMSPDGGGRPDGELASAIDEYFGSFDGFQKHFNAVANGIQ 44 | GSGWSMLVWDTLGQRMNINQLFDQQGNLPAGQIPLLQLDMWEHAFYLQYKNVKGDYVTAW 45 | 
WNVVNWTDVTERFARAKAASAGLV 46 | >FeSOD_A0A2M8Q2V9|unreviewed|Superoxide 47 | MAFELPPLPYAYDALEPHIDARTMEIHHDKHHAGYVSKLNSAIAGTEWESKSIDEILRNI 48 | NSVPEDIRVAVQNNGGGHANHSLFWEIMGPNGGGSPSGALAEAINAAFGSFDAFKEKFSN 49 | AAASRFGSGWAWLVVDDAGNLAVYSTANQDSPYMQGHTPILGLDVWEHAYYLNYQNRRPD 50 | YIAAFWNVVNWDAVAEKFAAAKK 51 | >FeSOD_A0A2E1VW30|unreviewed|Superoxide 52 | MAYELPALPYGENDLAPHITAETIQYHYGKHHAAYVGKLNELLDGDDSKSLEELIQSTDG 53 | GVFNNAAQVWNHTFYWNCMKPGGGGAPSGDLAAAIERDFGSYDAFVTKFKAAAMARFGSG 54 | WAWLVADADGKLSVVETLNAGNPMTDGLKPLLTCDVWEHAYYIDFRNARPKYLDVFFDSL 55 | VNWDFVASNL 56 | >FeSOD_A0A1V4UH29|unreviewed|Superoxide 57 | MEPNRSYTLPKLPYDYSALAPSISEEQLRIHHTKHHQGYVNGANAIYEKLDKTRKDGGDI 58 | DQKATLKELSFHVGGFLLHALYWENMAPAGQGGGVPKGALGSRIEKEFGSFDQFKKEFTA 59 | AANSVEGSGWAALTYCQKTGRLLLMQIEKHNVHVFPSFSVLMVVDVWEHAYYIDYKNDRA 60 | KYLENFWNIINWDAVNSRLEKALKG 61 | >FeSOD_A0A7V9SPC9|unreviewed|Superoxide 62 | MPTEVYELPDLSYDYSALEPHINARIMELHHDKHHATYVKGANTALEKLAEVRATGDFAT 63 | IAMLEKNLAFNVSGHVLHSIFWTNLSPNGGGEPDGELATALTDTFGGFEHFRKQMNEAAA 64 | TVQGAGWALASWEPIAQRLIVQQVHDHQGNHGQGTIPLLAIDAWEHAYYLQYENRKTEFF 65 | DAVWNVVSWGDVEARFKAARNAELIRQT 66 | >FeSOD_R7F5H2|unreviewed|Superoxide 67 | MIKKINLEYPLDSLEPYYSRETLNIHYNTLYVGYVDNTNITLEKLEKARKERNFENIKCL 68 | EKNLSFFGSGVILHELFFENMGPAIPSSPDINLMEQINKDFGSFELFKEQFTESSKVVEA 69 | SGWNLLVWVPRFNKLEIIQCEKHQDLTLWNCKPILVLDMWEHSYFLQYKANRGEYIKAFW 70 | NIINWNNVNKRFRNTIKY 71 | >FeSOD_A0A2N5YS14|unreviewed|Superoxide 72 | MFELPKLPYEFNSLEPKISAKTVEFHYTKHHQVYVNKLNGLIEGTDYAGKTLEEIIKTSE 73 | GGIFNNAAQVWNHTFYWEGFGPNPQSAPSGKLAEMINETFGSFEKFKEEFSTKAATLFGS 74 | GWAWLVLDNGQLKITGTSNAGSPLTEGHKPILTCDVWEHAYYLDYQNLRPKYIENFWELV 75 | DWKKIEGRI 76 | >FeSOD_A0A0F6MY72|unreviewed|Superoxide 77 | MAFELPSLPFDQDALESSKMSANTLSYHHGKHHAAYVKNLNAAIEGTDMANMSLEEIIKA 78 | TYNDPSKSGIFNNAAQVWNHSFFWKCLKPNGGGQPTGALADKIQADFGSFDAFIQEFKNA 79 | AATQFGSGWAWLVLDNGTLKVTKTANAVNPMVEGKTPLLTLDVWEHAYYLDFQNARPGFI 80 | DNFIENLVNWDFVAENLASAS 81 | >FeSOD_A0A060HP82|unreviewed|Superoxide 82 | 
MPRRPSHLMANFTLPQLPYAYDALEPHIDATTMQIHHTKHHQAYTDGLNKALGSLDAKFQ 83 | SMDAVDILKNIDTVPENARGAVNFHGGGYNNHTLFWNNMKKGGGGEPSGELADAIKKAFG 84 | SFADFKTKFQTDSVAIQGSGWGWLVKNASGGVQFITMPNQTSPWTRWKAEKLTPLLGLDV 85 | WEHSYYLKYQNRRADYVTAWWNVVNWDEVAKRFKA 86 | >FeSOD_A0A4P7WS39|unreviewed|Superoxide 87 | MKITHQLPELPFNKSALNPIITEETFDYHYGKHHAAYVNNLATLIQDTELINFSIEDIIK 88 | KGFYEKNASLFNNAAQHWSHTFFWNCLSPNGGKAPVGRITELITRDFGSFELFKDQFSNA 89 | AIKLFGCGWAWLVQDENDKLEIIAMKEAQTPLILNKKPILTLDVWEHAYYIDYKNARPKF 90 | VEGFWDIVNWDFANKNVI 91 | >FeSOD_A0A076JJX0|unreviewed|Superoxide 92 | MPVYTLPELPYDYSALEPYVSGKIMELHHDKHHQAYVNGANQALEQIHDAAESGNVAQSN 93 | LLEKNLAFNLAGHKNHTIFWKNMAPSIGQEPTGELKAAIEDQFGSFEGFQRYFESMCAGI 94 | QGSGWAVLAWDSLGERLVTLQMYDHQGNLPVTIFPLILLDLWEHAYYLDYLNVRADYVKA 95 | WWHIVNWEDASKRFDEVRNLNTNLVK 96 | >FeSOD_G8R729|unreviewed|Superoxide 97 | MSFELPDLPYSKSALEPYIDAQTMEIHHDKHHAGYTTKLNDAIEGTELEKQSIEDILKNV 98 | SKHSGGVRNNGGGYYNHSLFWSIMGPDAGGDPTGDVGAAIDDAFGSYENFKTEFSNAAAT 99 | RFGSGWAWLIVNGEGKLEVTSSPNQDNPLMDVAEKKGTPILGLDVWEHAYYLKYQNKRPD 100 | YISAFFNVINWDEVNRRFAEAK 101 | -------------------------------------------------------------------------------- /test/data/FeSOD_dist.dense.hdf5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nebiolabs/domainator/663454806768f7462b33ade914684c6b3487dab2/test/data/FeSOD_dist.dense.hdf5 -------------------------------------------------------------------------------- /test/data/FeSOD_dist.sparse.hdf5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nebiolabs/domainator/663454806768f7462b33ade914684c6b3487dab2/test/data/FeSOD_dist.sparse.hdf5 -------------------------------------------------------------------------------- /test/data/FeSOD_dist.tsv: -------------------------------------------------------------------------------- 1 | FeSOD_A0A1F4ZT98|unreviewed|Superoxide FeSOD_A0A067LT26|unreviewed|Superoxide 
FeSOD_A0A2E1RF15|unreviewed|Superoxide FeSOD_A0A538G8K1|unreviewed|Superoxide FeSOD_A0A6L8W9C4|unreviewed|Superoxide FeSOD_A0A2H0YVA1|unreviewed|Superoxide FeSOD_R7J7P3|unreviewed|Superoxide FeSOD_B8LFE6|unreviewed|Superoxide FeSOD_A0A1C0AS03|unreviewed|Superoxide FeSOD_A0A2M8Q2V9|unreviewed|Superoxide FeSOD_A0A2E1VW30|unreviewed|Superoxide FeSOD_A0A1V4UH29|unreviewed|Superoxide FeSOD_A0A7V9SPC9|unreviewed|Superoxide FeSOD_R7F5H2|unreviewed|Superoxide FeSOD_A0A2N5YS14|unreviewed|Superoxide FeSOD_A0A0F6MY72|unreviewed|Superoxide FeSOD_A0A060HP82|unreviewed|Superoxide FeSOD_A0A4P7WS39|unreviewed|Superoxide FeSOD_A0A076JJX0|unreviewed|Superoxide FeSOD_G8R729|unreviewed|Superoxide 2 | FeSOD_A0A1F4ZT98|unreviewed|Superoxide 410.0 0.0 0.0 199.0 0.0 189.0 0.0 0.0 0.0 216.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 187.0 3 | FeSOD_A0A067LT26|unreviewed|Superoxide 0.0 429.0 0.0 0.0 0.0 0.0 0.0 271.0 279.0 0.0 0.0 0.0 247.0 0.0 0.0 0.0 0.0 0.0 277.0 0.0 4 | FeSOD_A0A2E1RF15|unreviewed|Superoxide 0.0 0.0 405.0 0.0 239.0 0.0 0.0 0.0 0.0 0.0 232.0 0.0 0.0 0.0 228.0 0.0 0.0 228.0 0.0 0.0 5 | FeSOD_A0A538G8K1|unreviewed|Superoxide 199.0 0.0 0.0 425.0 0.0 0.0 0.0 0.0 0.0 287.0 0.0 0.0 0.0 0.0 0.0 0.0 205.0 0.0 0.0 229.0 6 | FeSOD_A0A6L8W9C4|unreviewed|Superoxide 0.0 0.0 240.0 0.0 411.0 0.0 0.0 0.0 0.0 0.0 258.0 0.0 0.0 0.0 234.0 271.0 0.0 0.0 0.0 0.0 7 | FeSOD_A0A2H0YVA1|unreviewed|Superoxide 191.0 0.0 0.0 184.0 0.0 410.0 0.0 0.0 0.0 208.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 192.0 8 | FeSOD_R7J7P3|unreviewed|Superoxide 0.0 0.0 192.0 0.0 0.0 0.0 400.0 0.0 0.0 0.0 199.0 0.0 0.0 0.0 209.0 0.0 0.0 191.0 0.0 0.0 9 | FeSOD_B8LFE6|unreviewed|Superoxide 0.0 267.0 0.0 0.0 0.0 0.0 0.0 434.0 264.0 0.0 0.0 0.0 214.0 0.0 0.0 0.0 0.0 0.0 239.0 0.0 10 | FeSOD_A0A1C0AS03|unreviewed|Superoxide 0.0 275.0 0.0 0.0 0.0 0.0 0.0 264.0 426.0 0.0 0.0 0.0 232.0 0.0 0.0 0.0 0.0 0.0 249.0 0.0 11 | FeSOD_A0A2M8Q2V9|unreviewed|Superoxide 0.0 0.0 214.0 279.0 0.0 0.0 0.0 0.0 0.0 414.0 0.0 0.0 0.0 0.0 0.0 0.0 221.0 0.0 
0.0 261.0 12 | FeSOD_A0A2E1VW30|unreviewed|Superoxide 0.0 0.0 233.0 0.0 258.0 0.0 0.0 0.0 0.0 0.0 398.0 0.0 0.0 0.0 219.0 239.0 0.0 0.0 0.0 0.0 13 | FeSOD_A0A1V4UH29|unreviewed|Superoxide 0.0 171.0 0.0 0.0 0.0 0.0 0.0 177.0 169.0 0.0 0.0 424.0 0.0 0.0 0.0 0.0 0.0 0.0 169.0 0.0 14 | FeSOD_A0A7V9SPC9|unreviewed|Superoxide 0.0 244.0 0.0 0.0 0.0 0.0 0.0 216.0 234.0 0.0 0.0 0.0 430.0 0.0 0.0 0.0 0.0 0.0 222.0 0.0 15 | FeSOD_R7F5H2|unreviewed|Superoxide 0.0 135.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 149.0 140.0 413.0 0.0 0.0 0.0 0.0 135.0 0.0 16 | FeSOD_A0A2N5YS14|unreviewed|Superoxide 0.0 0.0 233.0 0.0 237.0 0.0 0.0 0.0 0.0 0.0 222.0 0.0 0.0 0.0 402.0 223.0 0.0 0.0 0.0 0.0 17 | FeSOD_A0A0F6MY72|unreviewed|Superoxide 0.0 0.0 231.0 0.0 275.0 0.0 0.0 0.0 0.0 0.0 243.0 0.0 0.0 0.0 224.0 417.0 0.0 0.0 0.0 0.0 18 | FeSOD_A0A060HP82|unreviewed|Superoxide 177.0 0.0 0.0 202.0 0.0 0.0 0.0 0.0 0.0 226.0 0.0 0.0 0.0 0.0 0.0 0.0 451.0 0.0 0.0 208.0 19 | FeSOD_A0A4P7WS39|unreviewed|Superoxide 0.0 0.0 234.0 0.0 205.0 0.0 0.0 0.0 0.0 0.0 203.0 0.0 0.0 0.0 0.0 208.0 0.0 419.0 0.0 0.0 20 | FeSOD_A0A076JJX0|unreviewed|Superoxide 0.0 271.0 0.0 0.0 0.0 0.0 0.0 238.0 248.0 0.0 0.0 0.0 219.0 0.0 0.0 0.0 0.0 0.0 429.0 0.0 21 | FeSOD_G8R729|unreviewed|Superoxide 0.0 0.0 218.0 231.0 0.0 0.0 0.0 0.0 0.0 271.0 0.0 0.0 0.0 0.0 0.0 0.0 213.0 0.0 0.0 422.0 22 | -------------------------------------------------------------------------------- /test/data/FeSOD_metadata.tsv: -------------------------------------------------------------------------------- 1 | Name Type Accession Status Substrate active expression level 2 | FeSOD_A0A060HP82|unreviewed|Superoxide FeSOD A0A060HP82 unreviewed Superoxide 1 0.602844258 3 | FeSOD_A0A067LT26|unreviewed|Superoxide FeSOD A0A067LT26 unreviewed Superoxide 0 0.828329655 4 | FeSOD_A0A076JJX0|unreviewed|Superoxide FeSOD A0A076JJX0 unreviewed Superoxide 1 0.342002723 5 | FeSOD_A0A0F6MY72|unreviewed|Superoxide FeSOD A0A0F6MY72 unreviewed Superoxide 1 0.341330749 6 | 
FeSOD_A0A1C0AS03|unreviewed|Superoxide FeSOD A0A1C0AS03 unreviewed Superoxide 0 0.59654186 7 | FeSOD_A0A1F4ZT98|unreviewed|Superoxide FeSOD A0A1F4ZT98 unreviewed Superoxide 0 0.097440548 8 | FeSOD_A0A1V4UH29|unreviewed|Superoxide FeSOD A0A1V4UH29 unreviewed Superoxide 1 0.549335033 9 | FeSOD_A0A2E1RF15|unreviewed|Superoxide FeSOD A0A2E1RF15 unreviewed Superoxide 0 0.340114417 10 | FeSOD_A0A2E1VW30|unreviewed|Superoxide FeSOD A0A2E1VW30 unreviewed Superoxide 0 0.450472955 11 | FeSOD_A0A2H0YVA1|unreviewed|Superoxide FeSOD A0A2H0YVA1 unreviewed Superoxide 1 0.08286451 12 | FeSOD_A0A2M8Q2V9|unreviewed|Superoxide FeSOD A0A2M8Q2V9 unreviewed Superoxide 1 0.146398972 13 | FeSOD_A0A2N5YS14|unreviewed|Superoxide FeSOD A0A2N5YS14 unreviewed Superoxide 0 0.91315881 14 | FeSOD_A0A4P7WS39|unreviewed|Superoxide FeSOD A0A4P7WS39 unreviewed Superoxide 1 0.468473632 15 | FeSOD_A0A538G8K1|unreviewed|Superoxide FeSOD A0A538G8K1 unreviewed Superoxide 0 0.021717089 16 | FeSOD_A0A6L8W9C4|unreviewed|Superoxide FeSOD A0A6L8W9C4 unreviewed Superoxide 1 0.05147778 17 | FeSOD_A0A7V9SPC9|unreviewed|Superoxide FeSOD A0A7V9SPC9 unreviewed Superoxide 1 0.129515554 18 | FeSOD_B8LFE6|unreviewed|Superoxide FeSOD B8LFE6 unreviewed Superoxide 0 0.433221718 19 | FeSOD_G8R729|unreviewed|Superoxide FeSOD G8R729 unreviewed Superoxide 1 0.265199983 20 | FeSOD_R7F5H2|unreviewed|Superoxide FeSOD R7F5H2 unreviewed Superoxide 0 0.441960291 21 | FeSOD_R7J7P3|unreviewed|Superoxide FeSOD R7J7P3 unreviewed Superoxide 1 0.029610013 22 | -------------------------------------------------------------------------------- /test/data/FeSOD_score_dist.newick: -------------------------------------------------------------------------------- 1 | 
((((((FeSOD_A0A1C0AS03|unreviewed|Superoxide:0.40,FeSOD_B8LFE6|unreviewed|Superoxide:0.40):0.01,(FeSOD_A0A076JJX0|unreviewed|Superoxide:0.37,FeSOD_A0A067LT26|unreviewed|Superoxide:0.37):0.04):0.07,FeSOD_A0A7V9SPC9|unreviewed|Superoxide:0.49):0.14,FeSOD_A0A1V4UH29|unreviewed|Superoxide:0.63):0.08,FeSOD_R7F5H2|unreviewed|Superoxide:0.70):0.02,((((((FeSOD_A0A2M8Q2V9|unreviewed|Superoxide:0.34,FeSOD_A0A538G8K1|unreviewed|Superoxide:0.34):0.10,FeSOD_G8R729|unreviewed|Superoxide:0.43):0.08,FeSOD_A0A060HP82|unreviewed|Superoxide:0.52):0.03,FeSOD_A0A1F4ZT98|unreviewed|Superoxide:0.54):0.02,FeSOD_A0A2H0YVA1|unreviewed|Superoxide:0.56):0.07,((((FeSOD_A0A2N5YS14|unreviewed|Superoxide:0.44,FeSOD_A0A2E1RF15|unreviewed|Superoxide:0.44):0.01,((FeSOD_A0A0F6MY72|unreviewed|Superoxide:0.36,FeSOD_A0A6L8W9C4|unreviewed|Superoxide:0.36):0.05,FeSOD_A0A2E1VW30|unreviewed|Superoxide:0.40):0.05):0.06,FeSOD_A0A4P7WS39|unreviewed|Superoxide:0.51):0.04,FeSOD_R7J7P3|unreviewed|Superoxide:0.55):0.08):0.09); 2 | -------------------------------------------------------------------------------- /test/data/MT_nbs.enum_report.tsv: -------------------------------------------------------------------------------- 1 | contig cds_count domain_count taxid_species 2 | BX548174_369054:361090rc 9 6 1219 3 | AP010935_277029:265042rc 9 10 1334 4 | AP010935_936190:944542 9 13 1334 5 | AP010958_937230:927711rc 9 10 562 6 | AP010958_5381772:5387197 8 10 562 7 | AP010958_2919630:2924198 9 10 562 8 | AP010958_2902985:2909899 9 9 562 9 | AP010958_2275839:2267457rc 9 10 562 10 | AP010958_4084369:4094899 9 10 562 11 | AP010958_2177479:2170952rc 8 8 562 12 | AP010958_4182350:4173105rc 9 13 562 13 | AP010958_1458605:1465836 9 13 562 14 | AP010958_578280:583796 9 8 562 15 | AP011121_834340:842417 9 10 438 16 | AP011121_1001485:1013239 9 12 438 17 | BA000040_5565877:5553106rc 9 11 1355477 18 | BA000040_2743729:2731520rc 9 16 1355477 19 | AP011115_461187:454129rc 9 10 37919 20 | AP010950_263272:271283 9 12 137722 21 | 
AP011115_4384225:4376630rc 9 8 37919 22 | -------------------------------------------------------------------------------- /test/data/Peptidase_M28.hmm.h3f: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nebiolabs/domainator/663454806768f7462b33ade914684c6b3487dab2/test/data/Peptidase_M28.hmm.h3f -------------------------------------------------------------------------------- /test/data/Peptidase_M28.hmm.h3i: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nebiolabs/domainator/663454806768f7462b33ade914684c6b3487dab2/test/data/Peptidase_M28.hmm.h3i -------------------------------------------------------------------------------- /test/data/Peptidase_M28.hmm.h3m: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nebiolabs/domainator/663454806768f7462b33ade914684c6b3487dab2/test/data/Peptidase_M28.hmm.h3m -------------------------------------------------------------------------------- /test/data/Peptidase_M28.hmm.h3p: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nebiolabs/domainator/663454806768f7462b33ade914684c6b3487dab2/test/data/Peptidase_M28.hmm.h3p -------------------------------------------------------------------------------- /test/data/bin3.sparse.hdf5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nebiolabs/domainator/663454806768f7462b33ade914684c6b3487dab2/test/data/bin3.sparse.hdf5 -------------------------------------------------------------------------------- /test/data/bin3.sparse.tsv: -------------------------------------------------------------------------------- 1 | FeSOD_A0A1F4ZT98|unreviewed|Superoxide FeSOD_A0A1F4ZT98|unreviewed|Superoxide 1.0 2 | FeSOD_A0A1F4ZT98|unreviewed|Superoxide 
FeSOD_A0A2M8Q2V9|unreviewed|Superoxide 1.0 3 | FeSOD_A0A1F4ZT98|unreviewed|Superoxide FeSOD_A0A538G8K1|unreviewed|Superoxide 1.0 4 | FeSOD_A0A067LT26|unreviewed|Superoxide FeSOD_A0A067LT26|unreviewed|Superoxide 1.0 5 | FeSOD_A0A067LT26|unreviewed|Superoxide FeSOD_A0A1C0AS03|unreviewed|Superoxide 1.0 6 | FeSOD_A0A067LT26|unreviewed|Superoxide FeSOD_A0A076JJX0|unreviewed|Superoxide 1.0 7 | FeSOD_A0A2E1RF15|unreviewed|Superoxide FeSOD_A0A2E1RF15|unreviewed|Superoxide 1.0 8 | FeSOD_A0A2E1RF15|unreviewed|Superoxide FeSOD_A0A6L8W9C4|unreviewed|Superoxide 1.0 9 | FeSOD_A0A2E1RF15|unreviewed|Superoxide FeSOD_A0A2E1VW30|unreviewed|Superoxide 1.0 10 | FeSOD_A0A538G8K1|unreviewed|Superoxide FeSOD_A0A538G8K1|unreviewed|Superoxide 1.0 11 | FeSOD_A0A538G8K1|unreviewed|Superoxide FeSOD_A0A2M8Q2V9|unreviewed|Superoxide 1.0 12 | FeSOD_A0A538G8K1|unreviewed|Superoxide FeSOD_G8R729|unreviewed|Superoxide 1.0 13 | FeSOD_A0A6L8W9C4|unreviewed|Superoxide FeSOD_A0A6L8W9C4|unreviewed|Superoxide 1.0 14 | FeSOD_A0A6L8W9C4|unreviewed|Superoxide FeSOD_A0A0F6MY72|unreviewed|Superoxide 1.0 15 | FeSOD_A0A6L8W9C4|unreviewed|Superoxide FeSOD_A0A2E1VW30|unreviewed|Superoxide 1.0 16 | FeSOD_A0A2H0YVA1|unreviewed|Superoxide FeSOD_A0A2H0YVA1|unreviewed|Superoxide 1.0 17 | FeSOD_A0A2H0YVA1|unreviewed|Superoxide FeSOD_A0A2M8Q2V9|unreviewed|Superoxide 1.0 18 | FeSOD_A0A2H0YVA1|unreviewed|Superoxide FeSOD_G8R729|unreviewed|Superoxide 1.0 19 | FeSOD_R7J7P3|unreviewed|Superoxide FeSOD_R7J7P3|unreviewed|Superoxide 1.0 20 | FeSOD_R7J7P3|unreviewed|Superoxide FeSOD_A0A2N5YS14|unreviewed|Superoxide 1.0 21 | FeSOD_R7J7P3|unreviewed|Superoxide FeSOD_A0A2E1VW30|unreviewed|Superoxide 1.0 22 | FeSOD_B8LFE6|unreviewed|Superoxide FeSOD_B8LFE6|unreviewed|Superoxide 1.0 23 | FeSOD_B8LFE6|unreviewed|Superoxide FeSOD_A0A067LT26|unreviewed|Superoxide 1.0 24 | FeSOD_B8LFE6|unreviewed|Superoxide FeSOD_A0A1C0AS03|unreviewed|Superoxide 1.0 25 | FeSOD_A0A1C0AS03|unreviewed|Superoxide FeSOD_A0A1C0AS03|unreviewed|Superoxide 1.0 26 
| FeSOD_A0A1C0AS03|unreviewed|Superoxide FeSOD_A0A067LT26|unreviewed|Superoxide 1.0 27 | FeSOD_A0A1C0AS03|unreviewed|Superoxide FeSOD_B8LFE6|unreviewed|Superoxide 1.0 28 | FeSOD_A0A2M8Q2V9|unreviewed|Superoxide FeSOD_A0A2M8Q2V9|unreviewed|Superoxide 1.0 29 | FeSOD_A0A2M8Q2V9|unreviewed|Superoxide FeSOD_A0A538G8K1|unreviewed|Superoxide 1.0 30 | FeSOD_A0A2M8Q2V9|unreviewed|Superoxide FeSOD_G8R729|unreviewed|Superoxide 1.0 31 | FeSOD_A0A2E1VW30|unreviewed|Superoxide FeSOD_A0A2E1VW30|unreviewed|Superoxide 1.0 32 | FeSOD_A0A2E1VW30|unreviewed|Superoxide FeSOD_A0A6L8W9C4|unreviewed|Superoxide 1.0 33 | FeSOD_A0A2E1VW30|unreviewed|Superoxide FeSOD_A0A0F6MY72|unreviewed|Superoxide 1.0 34 | FeSOD_A0A1V4UH29|unreviewed|Superoxide FeSOD_A0A1V4UH29|unreviewed|Superoxide 1.0 35 | FeSOD_A0A1V4UH29|unreviewed|Superoxide FeSOD_B8LFE6|unreviewed|Superoxide 1.0 36 | FeSOD_A0A1V4UH29|unreviewed|Superoxide FeSOD_A0A067LT26|unreviewed|Superoxide 1.0 37 | FeSOD_A0A7V9SPC9|unreviewed|Superoxide FeSOD_A0A7V9SPC9|unreviewed|Superoxide 1.0 38 | FeSOD_A0A7V9SPC9|unreviewed|Superoxide FeSOD_A0A067LT26|unreviewed|Superoxide 1.0 39 | FeSOD_A0A7V9SPC9|unreviewed|Superoxide FeSOD_A0A1C0AS03|unreviewed|Superoxide 1.0 40 | FeSOD_R7F5H2|unreviewed|Superoxide FeSOD_R7F5H2|unreviewed|Superoxide 1.0 41 | FeSOD_R7F5H2|unreviewed|Superoxide FeSOD_A0A1V4UH29|unreviewed|Superoxide 1.0 42 | FeSOD_R7F5H2|unreviewed|Superoxide FeSOD_A0A7V9SPC9|unreviewed|Superoxide 1.0 43 | FeSOD_A0A2N5YS14|unreviewed|Superoxide FeSOD_A0A2N5YS14|unreviewed|Superoxide 1.0 44 | FeSOD_A0A2N5YS14|unreviewed|Superoxide FeSOD_A0A6L8W9C4|unreviewed|Superoxide 1.0 45 | FeSOD_A0A2N5YS14|unreviewed|Superoxide FeSOD_A0A2E1RF15|unreviewed|Superoxide 1.0 46 | FeSOD_A0A0F6MY72|unreviewed|Superoxide FeSOD_A0A0F6MY72|unreviewed|Superoxide 1.0 47 | FeSOD_A0A0F6MY72|unreviewed|Superoxide FeSOD_A0A6L8W9C4|unreviewed|Superoxide 1.0 48 | FeSOD_A0A0F6MY72|unreviewed|Superoxide FeSOD_A0A2E1VW30|unreviewed|Superoxide 1.0 49 | 
FeSOD_A0A060HP82|unreviewed|Superoxide FeSOD_A0A060HP82|unreviewed|Superoxide 1.0 50 | FeSOD_A0A060HP82|unreviewed|Superoxide FeSOD_A0A2M8Q2V9|unreviewed|Superoxide 1.0 51 | FeSOD_A0A060HP82|unreviewed|Superoxide FeSOD_G8R729|unreviewed|Superoxide 1.0 52 | FeSOD_A0A4P7WS39|unreviewed|Superoxide FeSOD_A0A4P7WS39|unreviewed|Superoxide 1.0 53 | FeSOD_A0A4P7WS39|unreviewed|Superoxide FeSOD_A0A2E1RF15|unreviewed|Superoxide 1.0 54 | FeSOD_A0A4P7WS39|unreviewed|Superoxide FeSOD_A0A0F6MY72|unreviewed|Superoxide 1.0 55 | FeSOD_A0A076JJX0|unreviewed|Superoxide FeSOD_A0A076JJX0|unreviewed|Superoxide 1.0 56 | FeSOD_A0A076JJX0|unreviewed|Superoxide FeSOD_A0A067LT26|unreviewed|Superoxide 1.0 57 | FeSOD_A0A076JJX0|unreviewed|Superoxide FeSOD_A0A1C0AS03|unreviewed|Superoxide 1.0 58 | FeSOD_G8R729|unreviewed|Superoxide FeSOD_G8R729|unreviewed|Superoxide 1.0 59 | FeSOD_G8R729|unreviewed|Superoxide FeSOD_A0A2M8Q2V9|unreviewed|Superoxide 1.0 60 | FeSOD_G8R729|unreviewed|Superoxide FeSOD_A0A538G8K1|unreviewed|Superoxide 1.0 61 | -------------------------------------------------------------------------------- /test/data/bin3.tsv: -------------------------------------------------------------------------------- 1 | 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 2 | FeSOD_A0A1F4ZT98|unreviewed|Superoxide 1 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 3 | FeSOD_A0A067LT26|unreviewed|Superoxide 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 4 | FeSOD_A0A2E1RF15|unreviewed|Superoxide 0 0 1 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 5 | FeSOD_A0A538G8K1|unreviewed|Superoxide 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 6 | FeSOD_A0A6L8W9C4|unreviewed|Superoxide 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 7 | FeSOD_A0A2H0YVA1|unreviewed|Superoxide 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 1 8 | FeSOD_R7J7P3|unreviewed|Superoxide 0 0 0 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 0 0 9 | FeSOD_B8LFE6|unreviewed|Superoxide 0 1 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 10 | FeSOD_A0A1C0AS03|unreviewed|Superoxide 0 1 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 
0 11 | FeSOD_A0A2M8Q2V9|unreviewed|Superoxide 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 12 | FeSOD_A0A2E1VW30|unreviewed|Superoxide 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 13 | FeSOD_A0A1V4UH29|unreviewed|Superoxide 0 1 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 14 | FeSOD_A0A7V9SPC9|unreviewed|Superoxide 0 1 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 15 | FeSOD_R7F5H2|unreviewed|Superoxide 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 16 | FeSOD_A0A2N5YS14|unreviewed|Superoxide 0 0 1 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 17 | FeSOD_A0A0F6MY72|unreviewed|Superoxide 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 18 | FeSOD_A0A060HP82|unreviewed|Superoxide 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 1 19 | FeSOD_A0A4P7WS39|unreviewed|Superoxide 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 20 | FeSOD_A0A076JJX0|unreviewed|Superoxide 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 21 | FeSOD_G8R729|unreviewed|Superoxide 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 22 | -------------------------------------------------------------------------------- /test/data/ccdb.gb: -------------------------------------------------------------------------------- 1 | LOCUS pDONR201_1265:958rc 306 bp DNA UNK 27-DEC-2023 2 | DEFINITION Gateway donor vector. 3 | ACCESSION urn.local...v2-dgufz7r 4 | VERSION urn.local...v2-dgufz7r 5 | KEYWORDS . 6 | SOURCE 7 | ORGANISM . 8 | . 
9 | FEATURES Location/Qualifiers 10 | source complement(1..306) 11 | /invitrogen="""584" 12 | /label="Invitrogen vector 584" 13 | CDS 1..306 14 | /invitrogen="""1060000" 15 | /label="ccdB" 16 | /translation="MQFKVYTYKRESRYRLFVDVQSDIIDTPGRRMVIPLASARLLSDK 17 | VSRELYPVVHIGDESWRMMTTDMASVPVSVIGEEVADLSHRENDIKNAINLMFWGI*" 18 | /cds_id="1264_-1_959" 19 | /domainator_CcdB="CcdB (CcdB protein, 4.4e-31, 103.1)" 20 | Domainator 4..300 21 | /program="hmmsearch" 22 | /database="CcdB" 23 | /description="CcdB protein" 24 | /evalue="4.4e-31" 25 | /score="103.1" 26 | /name="CcdB" 27 | /identity="39.0" 28 | /cds_id="1264_-1_959" 29 | /rstart="1" 30 | /rend="100" 31 | /rlen="100" 32 | Domain_Search 4..300 33 | /program="hmmsearch" 34 | /database="CcdB" 35 | /description="CcdB protein" 36 | /evalue="4.4e-31" 37 | /score="103.1" 38 | /name="CcdB" 39 | /identity="39.0" 40 | /cds_id="1264_-1_959" 41 | /rstart="1" 42 | /rend="100" 43 | /rlen="100" 44 | ORIGIN 45 | 1 atgcagttta aggtttacac ctataaaaga gagagccgtt atcgtctgtt tgtggatgta 46 | 61 cagagtgata ttattgacac gcccgggcga cggatggtga tccccctggc cagtgcacgt 47 | 121 ctgctgtcag ataaagtctc ccgtgaactt tacccggtgg tgcatatcgg ggatgaaagc 48 | 181 tggcgcatga tgaccaccga tatggccagt gtgccggtct ccgttatcgg ggaagaagtg 49 | 241 gctgatctca gccaccgcga aaatgacatc aaaaacgcca ttaacctgat gttctgggga 50 | 301 atataa 51 | // 52 | -------------------------------------------------------------------------------- /test/data/color_specification.tsv: -------------------------------------------------------------------------------- 1 | CcdB #ff0000 2 | APH #00ff00 3 | CAT #0000ff 4 | Condensation #ff00ff 5 | 2-oxoacid_dh #ffffff 6 | -------------------------------------------------------------------------------- /test/data/color_table_123.tsv: -------------------------------------------------------------------------------- 1 | 1 #1F77C4 2 | 2 #FEC7E8 3 | 3 #FFFF0E 4 | -------------------------------------------------------------------------------- 
/test/data/domain_search_translate_out.gb: -------------------------------------------------------------------------------- 1 | LOCUS 1264_-1_959 102 aa UNK 27-DEC-2023 2 | DEFINITION Gateway donor vector. 3 | ACCESSION 1264_-1_959 4 | VERSION 1264_-1_959 5 | KEYWORDS . 6 | SOURCE 7 | ORGANISM . 8 | . 9 | FEATURES Location/Qualifiers 10 | source complement(1..102) 11 | /invitrogen="""584" 12 | /label="Invitrogen vector 584" 13 | CDS 1..102 14 | /invitrogen="""1060000" 15 | /label="ccdB" 16 | /translation="MQFKVYTYKRESRYRLFVDVQSDIIDTPGRRMVIPLASARLLSDK 17 | VSRELYPVVHIGDESWRMMTTDMASVPVSVIGEEVADLSHRENDIKNAINLMFWGI*" 18 | /cds_id="1264_-1_959" 19 | /source_contig="pDONR201" 20 | Domain_Search 2..100 21 | /program="hmmsearch" 22 | /database="CcdB" 23 | /description="CcdB protein" 24 | /evalue="4.4e-31" 25 | /score="103.1" 26 | /name="CcdB" 27 | /identity="39.0" 28 | /cds_id="1264_-1_959" 29 | /rstart="1" 30 | /rend="100" 31 | /rlen="100" 32 | ORIGIN 33 | 1 mqfkvytykr esryrlfvdv qsdiidtpgr rmviplasar llsdkvsrel ypvvhigdes 34 | 61 wrmmttdmas vpvsvigeev adlshrendi knainlmfwg i* 35 | // 36 | -------------------------------------------------------------------------------- /test/data/empty.gb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nebiolabs/domainator/663454806768f7462b33ade914684c6b3487dab2/test/data/empty.gb -------------------------------------------------------------------------------- /test/data/enum_report_html_max_size_out.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Enum Report 5 | 6 | 7 | 8 | 9 | 10 |
11 | 12 | 13 | 14 |
15 |
16 | 64 | 65 | -------------------------------------------------------------------------------- /test/data/enum_report_html_out.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Enum Report 5 | 6 | 7 | 8 | 9 | 10 |
11 | 12 | 13 | 14 |
15 |
16 | 63 | 64 | -------------------------------------------------------------------------------- /test/data/enum_report_html_out_quote_escape.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Enum Report 5 | 6 | 7 | 8 | 9 | 10 |
11 | 12 | 13 | 14 |
15 |
16 | 55 | 56 | -------------------------------------------------------------------------------- /test/data/foldseek/FeSOD: -------------------------------------------------------------------------------- 1 | MFTLPPLPYPTNALEPYLDTQTLEIHFGKHHATYLKNLNDLLPEKSDADLIPVLQHLDDLPQDIRVKVRNNAGGVYNHNLYWQCMSPKSKSPSPRLLSSIESGFGTLDAFKEKFSQAALTHFGSGWAWLVKGTKGLEIVTTPNQDSPVSTGLTPILGLDVWEHAYYLKYQNRRVEYIQAWWNVVNWDYVSSLLADR 2 | MADYTLVDLPYDYSALEPSISGRIMELHHDKHHKTYVDGANTALVKLQEARDAGDLTFVNKLQKDLAFNLAGHVNHTVFWNNLSPDGGDKPTGELAAAIDEFFGSYDKFQAHFTASALGIQGSGWSILAWDSLGQKLIIEQLYDHQGNLAAATVPILLLDMWEHAFYLDYVNVKADYVKAFWNIVNWADVQARFDAARTKTQGLFLLS 3 | MAHQLPELPYSKDALSPHISAETLDYHHGKHHNAYVTKLNAAIEGTEHAEKSIEELVKTTSGGLFNNAAQHYNHSFYWNCLAPNAGGSATGTVGEMISSKWGSFDKFKEDFSNAGAANFGSGWTWLVKNASGDLEIVNTDDAECPLTEGHTPLLTMDVWEHAYYVDYRNARPKYIEAFWNLVNWDFVNSNL 4 | MAYSVPPLAYDFDALEPHIDAQTMEIHHDKHHGAYVTNLNAALEGTEWMDRPIESVLASLDVIPEDKRTAVRNNGGGHANHTFFWEIMGPNGGGEPSGALADAIADTFGGLDQLKTQVNDAGVKRFGSGWTWLVWDGTGLAVKSTPNQDSPVMDSDVPLLGIDVWEHAYYLRYQNRRPDYLAAWWNVVNWEAVATRYEAAK 5 | MAFELPPLPYAKDALAPHISENTLDFHYGKHHNAYVTNLNGLLEDGDSRSLEEIMKDTAGDAAKAGVFNNAAQVWNHTFYWHSMKPNGGGKPTGAIADKINEDFGSYEKFAEEFKAAGATQFGSGWAWLVLDGGKLKVTKTPNAACPLTDGAKPLLTMDVWEHAYYLDYQNARPKYMETFLESLVNWDFANENLG 6 | MKYELPKLNYAYDALEPYLDAQTMEIHHSKHHQAYTDNFNKALAEHPDLAEMPVEELLAKVNELSIKDRDKNALRNHGGGYYNHKLFWEIMDPANKKDESLIKDIETEFGSVDSFKEQFSETAKTLLGSGWAWLARDKEGKLHVYGLPNQDSPFQKGHTPIICLDVWEHAYYLKYQNRRPEYIENWWNVLKLI 7 | MHQMPKLPYEMEALAPLMSKETFDFHYGKHLQTYVNNLNKLIVGTPYENLELEQIVCQADGGIYNNAAQTWNHTFFFQLLTPEQPSLPDDLAGLLTRDFGSVDQFKEDFTKAALGLFGSGWVWLVLGKDGKLSLLPTPNAGNPLKDGLKPLLVIDVWEHAYYIDYRNNRAAFIEAFWKLVNWEKVADLLG 8 | MAEYTLPDLPYDYAALEPHISGRIMELHHTKHHATYVKGANDTLDKLAEARADGSIAGKVYGLSATLSFHLGGHTNHSIFWKNLSPNGGDKPEGDLAAAIDDQFGGFDKFQAHFTAAATTLQGSGWAILGYDTIGGKLVILQLTDQSDNIPAAIIPVVMLDDMWEHAFYLDYQNVKPDYVKAWWNVVNWADAAERLGRATSQGKGLIVPA 9 | 
MTYTLPDLDYDYGALAPHIAPEIMELHHSKHHAAYVKGINDALEQLAAAREKGDLGAVNKLSKDAAFHLGGHINHSVFWKNMSPDGGGRPDGELASAIDEYFGSFDGFQKHFNAVANGIQGSGWSMLVWDTLGQRMNINQLFDQQGNLPAGQIPLLQLDMWEHAFYLQYKNVKGDYVTAWWNVVNWTDVTERFARAKAASAGLV 10 | MAFELPPLPYAYDALEPHIDARTMEIHHDKHHAGYVSKLNSAIAGTEWESKSIDEILRNINSVPEDIRVAVQNNGGGHANHSLFWEIMGPNGGGSPSGALAEAINAAFGSFDAFKEKFSNAAASRFGSGWAWLVVDDAGNLAVYSTANQDSPYMQGHTPILGLDVWEHAYYLNYQNRRPDYIAAFWNVVNWDAVAEKFAAAKK 11 | MAYELPALPYGENDLAPHITAETIQYHYGKHHAAYVGKLNELLDGDDSKSLEELIQSTDGGVFNNAAQVWNHTFYWNCMKPGGGGAPSGDLAAAIERDFGSYDAFVTKFKAAAMARFGSGWAWLVADADGKLSVVETLNAGNPMTDGLKPLLTCDVWEHAYYIDFRNARPKYLDVFFDSLVNWDFVASNL 12 | MEPNRSYTLPKLPYDYSALAPSISEEQLRIHHTKHHQGYVNGANAIYEKLDKTRKDGGDIDQKATLKELSFHVGGFLLHALYWENMAPAGQGGGVPKGALGSRIEKEFGSFDQFKKEFTAAANSVEGSGWAALTYCQKTGRLLLMQIEKHNVHVFPSFSVLMVVDVWEHAYYIDYKNDRAKYLENFWNIINWDAVNSRLEKALKG 13 | MPTEVYELPDLSYDYSALEPHINARIMELHHDKHHATYVKGANTALEKLAEVRATGDFATIAMLEKNLAFNVSGHVLHSIFWTNLSPNGGGEPDGELATALTDTFGGFEHFRKQMNEAAATVQGAGWALASWEPIAQRLIVQQVHDHQGNHGQGTIPLLAIDAWEHAYYLQYENRKTEFFDAVWNVVSWGDVEARFKAARNAELIRQT 14 | MIKKINLEYPLDSLEPYYSRETLNIHYNTLYVGYVDNTNITLEKLEKARKERNFENIKCLEKNLSFFGSGVILHELFFENMGPAIPSSPDINLMEQINKDFGSFELFKEQFTESSKVVEASGWNLLVWVPRFNKLEIIQCEKHQDLTLWNCKPILVLDMWEHSYFLQYKANRGEYIKAFWNIINWNNVNKRFRNTIKY 15 | MFELPKLPYEFNSLEPKISAKTVEFHYTKHHQVYVNKLNGLIEGTDYAGKTLEEIIKTSEGGIFNNAAQVWNHTFYWEGFGPNPQSAPSGKLAEMINETFGSFEKFKEEFSTKAATLFGSGWAWLVLDNGQLKITGTSNAGSPLTEGHKPILTCDVWEHAYYLDYQNLRPKYIENFWELVDWKKIEGRI 16 | MAFELPSLPFDQDALESSKMSANTLSYHHGKHHAAYVKNLNAAIEGTDMANMSLEEIIKATYNDPSKSGIFNNAAQVWNHSFFWKCLKPNGGGQPTGALADKIQADFGSFDAFIQEFKNAAATQFGSGWAWLVLDNGTLKVTKTANAVNPMVEGKTPLLTLDVWEHAYYLDFQNARPGFIDNFIENLVNWDFVAENLASAS 17 | MPRRPSHLMANFTLPQLPYAYDALEPHIDATTMQIHHTKHHQAYTDGLNKALGSLDAKFQSMDAVDILKNIDTVPENARGAVNFHGGGYNNHTLFWNNMKKGGGGEPSGELADAIKKAFGSFADFKTKFQTDSVAIQGSGWGWLVKNASGGVQFITMPNQTSPWTRWKAEKLTPLLGLDVWEHSYYLKYQNRRADYVTAWWNVVNWDEVAKRFKA 18 | 
MKITHQLPELPFNKSALNPIITEETFDYHYGKHHAAYVNNLATLIQDTELINFSIEDIIKKGFYEKNASLFNNAAQHWSHTFFWNCLSPNGGKAPVGRITELITRDFGSFELFKDQFSNAAIKLFGCGWAWLVQDENDKLEIIAMKEAQTPLILNKKPILTLDVWEHAYYIDYKNARPKFVEGFWDIVNWDFANKNVI 19 | MPVYTLPELPYDYSALEPYVSGKIMELHHDKHHQAYVNGANQALEQIHDAAESGNVAQSNLLEKNLAFNLAGHKNHTIFWKNMAPSIGQEPTGELKAAIEDQFGSFEGFQRYFESMCAGIQGSGWAVLAWDSLGERLVTLQMYDHQGNLPVTIFPLILLDLWEHAYYLDYLNVRADYVKAWWHIVNWEDASKRFDEVRNLNTNLVK 20 | MSFELPDLPYSKSALEPYIDAQTMEIHHDKHHAGYTTKLNDAIEGTELEKQSIEDILKNVSKHSGGVRNNGGGYYNHSLFWSIMGPDAGGDPTGDVGAAIDDAFGSYENFKTEFSNAAATRFGSGWAWLIVNGEGKLEVTSSPNQDNPLMDVAEKKGTPILGLDVWEHAYYLKYQNKRPDYISAFFNVINWDEVNRRFAEAK 21 | -------------------------------------------------------------------------------- /test/data/foldseek/FeSOD.dbtype: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /test/data/foldseek/FeSOD.index: -------------------------------------------------------------------------------- 1 | 0 0 198 2 | 1 198 210 3 | 2 408 193 4 | 3 601 203 5 | 4 804 197 6 | 5 1001 195 7 | 6 1196 192 8 | 7 1388 212 9 | 8 1600 206 10 | 9 1806 205 11 | 10 2011 192 12 | 11 2203 207 13 | 12 2410 210 14 | 13 2620 200 15 | 14 2820 191 16 | 15 3011 203 17 | 16 3214 217 18 | 17 3431 200 19 | 18 3631 208 20 | 19 3839 204 21 | -------------------------------------------------------------------------------- /test/data/foldseek/FeSOD.lookup: -------------------------------------------------------------------------------- 1 | 0 FeSOD_A0A1F4ZT98 0 2 | 1 FeSOD_A0A067LT26 1 3 | 2 FeSOD_A0A2E1RF15 2 4 | 3 FeSOD_A0A538G8K1 3 5 | 4 FeSOD_A0A6L8W9C4 4 6 | 5 FeSOD_A0A2H0YVA1 5 7 | 6 FeSOD_R7J7P3 6 8 | 7 FeSOD_B8LFE6 7 9 | 8 FeSOD_A0A1C0AS03 8 10 | 9 FeSOD_A0A2M8Q2V9 9 11 | 10 FeSOD_A0A2E1VW30 10 12 | 11 FeSOD_A0A1V4UH29 11 13 | 12 FeSOD_A0A7V9SPC9 12 14 | 13 FeSOD_R7F5H2 13 15 | 14 FeSOD_A0A2N5YS14 14 16 | 15 FeSOD_A0A0F6MY72 15 17 | 16 FeSOD_A0A060HP82 16 18 | 17 FeSOD_A0A4P7WS39 17 19 
| 18 FeSOD_A0A076JJX0 18 20 | 19 FeSOD_G8R729 19 21 | -------------------------------------------------------------------------------- /test/data/foldseek/FeSOD_20.3di.fasta: -------------------------------------------------------------------------------- 1 | >FeSOD_A0A1F4ZT98 unreviewed Superoxide 2 | DDDADDAPDDLCLLPPLAHNVLSCCQRNPQLVVLSVLLCVLCVVPDPDDLLVCLVCLVVDDPVSSVSSLQSSLSNLLSSLLSVLFGSDFDAFDPVLQVQCCVQQNHPVSVLVVQLVQLVPDPAWWKWFWFQDPVGTHIDIDHRSHDCSNVVTDTSDMDTCHCSNPCVVCNVPSSSSSVSRSSTTNRVSSNVSSVVD 3 | >FeSOD_A0A067LT26 unreviewed Superoxide 4 | DDDDDADAAPDDLCLLPPLAHSVLSCCCRPPQLVVLRVQLRVLVVQLVVCVVVVNPVSNVVSLLSNLLSVLSNLLSNLLSVLFANQFFAADDDPLQVLCCVQQNHPVSVLVVQLVQLLPDDFFWKWFWWQQLVVRGTYIDIDHHSPPCDSVRTDTLDIDTSHCSRCCVVQNPPSSSSSVSRVRTTPRVSSNVSSVCSVVVDPDPDPDD 5 | >FeSOD_A0A2E1RF15 unreviewed Superoxide 6 | DADAADDQPDDLCLLPPLAHSVLSCCCRNPQLVVLRVQLRVLCPPHPCSPDDLLVCLQPDDDSSNLSSLLSVLSNLLSVQFANPFFAAADDPLQVLCCVQQNHPVSVLVVQLVQLLPDAAWWKWFWWQDPVRRTDIDIDHRRHDCNNVRTDTSDMDTCHCSNPCVRCNPPSSSSSVSNSSTTPRVSSNVSD 7 | >FeSOD_A0A538G8K1 unreviewed Superoxide 8 | DADDQDDFPDDLCLLPPLAHSVLSCCCRPPQLNVLSVQLRVLCPPHPCLVDDLLVCLVPLVPDDPVSSLSNQQSSLSNLLSSLLSVQFANQFFAAADDPLQVLCCVQQNHPVSVLVVQLVQLVPDPAWWKWFWWDPPPGTDIDIDHRSHHCSDPPTRTLDMDTCHCSNPCVVQNVPSSSSSVSSSRTYNRVSSNVSSVVSD 9 | >FeSOD_A0A6L8W9C4 unreviewed Superoxide 10 | DADDADDQPDDLCLLPPLAHSVLSCCCRNPQLVVLRVQLCVLCVVPDPDDLLVLLLVQLPPPVSVSSNLSSQLSVLSCLLSVLFGNNFFDAADDPLQVQCCVQQNHPVSVLVVQLVQLVPDPAWWKWFWWDDPSGTHIDIDHRSHGSSNVPTDTLDMDTCHCSNCCPVCNPPSSSSSVSCRVRTGDNVSSNVSSD 11 | >FeSOD_A0A2H0YVA1 unreviewed Superoxide 12 | DADDADDFPDDLCLLPPLAHRVLSCCCRPPQLVVLSVQLRVLCVVPVVLSPDDLLVCLVCLVVDPDDDPSSVSSQLSSLSNLLSVLLSVQLHSPDDDDPVLQVQCCVQQNHPVSVLVVQLVQLLPDDAWWKWFWWADPVRGTDIDIDHRSHDCSNVVTHTSDMDTCHCSNPCVVCNVCSSSSSVSSVVSGDRD 13 | >FeSOD_R7J7P3 unreviewed Superoxide 14 | DDDQDDQPDDLCLCPPLAHSVLSCCCRNPQLVVLSVQLCVLCPPHPCNPDDLLVSLQPDDDSSNLSSLLNCLSNVLLVLFAPDFDAADPVLQVLCCVQQNHPVSVLVVQLCQQLPDDAWWKWFWFQDPVRGTDIDTDHSSHGCVVVVTDTLDIDTCHCSNCCVVCNPPSNSSSVSSSSTTPSVSSSVSND 15 | >FeSOD_B8LFE6 
unreviewed Superoxide 16 | DDADDADDQPDDLCLLPPLFHSVLSCCCRPPQLVVLSVQLRVLVVVLVVCVVVVNCPVCNVVSVVSNLLSSLSNLLSVLLSVLFHSQFFDAADDPLQVLCCVQQNHPVSVLVVQLCQLLPDDAWWKWWWWQQLQVRGTYIDIDHHSSPCDDSSRTDTSDIDTCHCSNCCVVCNPPSSSSSVSRVRTTNRVSSNVSSVVSVVSSPPDDDDD 17 | >FeSOD_A0A1C0AS03 unreviewed Superoxide 18 | DADDADDFPDDLCLLPPFAHSVLSCCCRPPQLVVLSVQLNVLVVQLVVCVVVVNPVSNLVSLLSNLQSVLSNLLSNLLSVLFASQFFAAADDPLQVLCCVQQNHPVSVLVVQLVQLLPDDAWWKWFWWQQLVVRGTYIDIDHGSSPDDDPRIDTLDIDTCHCSNPCVVQNPPSSSSSVSSSRTTPRVSSNVSSVSSVVSSVVSD 19 | >FeSOD_A0A2M8Q2V9 unreviewed Superoxide 20 | DADDADDQPDDLCLLPPLAHSVLSCCCRPPQLVVLSVLLRVLCPPHPCSVDDLLVCLVPLVSDDPVSSVSNLQSSLSNLQSSLLSVQFGNNFADAADDPLQVLCCVQQNHPVSVLVVLLCQLLPDPAWWKWFWKQALVLGTDIDIDHRSHDCSNVRTDTSDMDTCHCSNPCVVQNVCSSSSSVSSSRTTNRVSSNVSSVVSVD 21 | >FeSOD_A0A2E1VW30 unreviewed Superoxide 22 | DADDADDQPDDLCLLPPLAHSVLSCCCRNPQLVVLSVQLRVLCVVPPPDDLLRCLQPDDDPSNLSSQLNVLSNLLSNLFGNPFFDADDDPLQVQCCVQQNHPVSVLVVQLVQLVPDPAWWKWFWWADPVRGTDIDTDHSSHDSSDVPTDTSDMDTCHCSNPCPVCNPPRSSSSVSCRVTTTDNVSSNVSD 23 | >FeSOD_A0A1V4UH29 unreviewed Superoxide 24 | DDDPDADDQDDQPDDLCLLPPLAHSVLSCCCRRPQLSLLSVQLRVLSVVVVVCVVVVHDDDPVVSLLSNLQSSQSNLLSQLLSVLFAAQVLFFAADDDPLQVLCCVQQNHPVSVLVVQLVQLVPDPAWWKWFWWQDLVPRGTYIDIDGRSDPVDDPRTRTSDIDTCHCSNPCVVQNPPSNSSSVSSSRTTNSVSSNVSSVVSVVD 25 | >FeSOD_A0A7V9SPC9 unreviewed Superoxide 26 | DDDDADDADFQPDDLCLLPPQAHSVLSCCCRPPQLVVLSVQLNVLVVVLVVCVVVVPPVSNVVSLVSNLLSSLSNLLSNLLSVLFGNQFFAAADDPLQVLCCVQQNHPVSVLVVQLVQLLPDPAWWKWFWWQQLVVRGIYIDTDHHSCPCDDPRTQTSDIDGSHCSNPCVVPNPPSSSSSVSRVRTGDRVSSNVSSVVSVVSVVSVVD 27 | >FeSOD_R7F5H2 unreviewed Superoxide 28 | DDDADAAPDDLCLLPPLAHSVLSCCCRPPQLVVLSVQLRVLVVVLVVCVVVVNCVSNVVSLVSNLLSVLSNLLSCLLSVQFGAFDPDAFDPVLQVLCCVQQNHPVSVLVVQLVQLVPDPAWWKWFWFQQLVVRGTDIDIDGGSSDCVVSRIDTLDMDTCHCSNPCVVQNPPSSSSSVSRVRTTPRVSSNVSSVVRVVD 29 | >FeSOD_A0A2N5YS14 unreviewed Superoxide 30 | DDDADDQPDDLCLLPPLAHSVLSCCCRPPQLVVLRVVLCVLCPPHPCNPDDLLVCLQPDDDSSNLSSLLSCLSNLLSVQFALDDPDADDDPLQVQCCVQQNHPVSVLVVQLVQLLPDAAWWKWFWWQDPSGTHIDIDHRSHGSSNVPTDTSDMDTCHCSNQCVVCNPPSSSSSVSSVVGHPSVSSNVSD 31 | 
>FeSOD_A0A0F6MY72 unreviewed Superoxide 32 | DADDADDAPDDLCQCPDQLAHSVLSCCCRNPQLVVLSVQLRVLCPPDPCNPDDLLVLLLVQLPPPVSVSSNLSSQLSVLSVLLSVLFGNNFFAFADDPLQVQCCVQQNHPVSVLVVQLVLLVPPPAWWKWFFWDDPSGTHIDIDHRSDDCSDVPTDTLDMDTCHCSNCCVVCNVPRSSSSVSCRVTTTDNVSSNVSSVVND 33 | >FeSOD_A0A060HP82 unreviewed Superoxide 34 | DDDDPPPPPDQDDADDQPDDLCLLPPLAHSVLSCCCRPPQLVVLSVQLRVLCVPDDPVLLPDDLLVCLLPLVSRDPSSSVSNQQSSLSNLLSSLQSVQAGHQFFDAADDPLQVLCCVQQNHPVSVLVVQLVQQLPDDAWWKWWWWADPVRGTDIDIDHRSHDSSDDPPPVRIDTSDMDTSHCSNQCVVQNPPSNSSSVSNVRTTPRVSSNVSNVD 35 | >FeSOD_A0A4P7WS39 unreviewed Superoxide 36 | DDDDDDADDQPDDLCLLPPLAHSVLSCCCRNPQLSVLSVQLRVLCPPDPCSPDDLLVLLVVCVVVVNVSSNLSSQLNVLSNLLSVLFGLQFFDADDDPLVVQCCVQQNHPVSVLVVQLCCLLPDDAWWKWFWWADPVRGTDIDTDHHSHHCVVVVTHTLDMDTCHCSNCCVVQNPPSSSSSVSRVSTTNRVSSNVSRD 37 | >FeSOD_A0A076JJX0 unreviewed Superoxide 38 | DDADDADDQPDDLCLLPPQFHSVLSCCCRPPQLVVLSVQLRVLVVQLVVCVVVVNPVSNLVSLLSNLLSVLSNLLSSLLSVLLGNDDDAAADDPLQVLCCVQQNHPVSVLVVQLCQLLPDDAWWKWFWWQQLVVRGIYIDIDHHSPPDDDPRIDTLDIDTCHCSRCCVVQNPPSSSSSVSRVRTTPRVSSNVSSVVSVVVDVVSVD 39 | >FeSOD_G8R729 unreviewed Superoxide 40 | DADDADAAPDDLCLLPPLAHNVLSCCCRPPQLVVLRVQLRVLCPPHPLLVDDLLVCLVCLVVDDPSSLLSSLSNLLSSLLSVQFGNQLFDFADDPQQVLCCVQQNHPVSVLVVQLVQLVPDPAWWKWFWWQALVLGTDIDIDHRSHDSCPPPVPRRTRTSDMDTSHCSNQCVVQNPPSSSSSVSNVRTTNRVSSNVSSVVSD 41 | -------------------------------------------------------------------------------- /test/data/foldseek/FeSOD_20.fasta: -------------------------------------------------------------------------------- 1 | >FeSOD_A0A1F4ZT98 unreviewed Superoxide 2 | MFTLPPLPYPTNALEPYLDTQTLEIHFGKHHATYLKNLNDLLPEKSDADLIPVLQHLDDL 3 | PQDIRVKVRNNAGGVYNHNLYWQCMSPKSKSPSPRLLSSIESGFGTLDAFKEKFSQAALT 4 | HFGSGWAWLVKGTKGLEIVTTPNQDSPVSTGLTPILGLDVWEHAYYLKYQNRRVEYIQAW 5 | WNVVNWDYVSSLLADR 6 | >FeSOD_A0A067LT26 unreviewed Superoxide 7 | MADYTLVDLPYDYSALEPSISGRIMELHHDKHHKTYVDGANTALVKLQEARDAGDLTFVN 8 | KLQKDLAFNLAGHVNHTVFWNNLSPDGGDKPTGELAAAIDEFFGSYDKFQAHFTASALGI 9 | QGSGWSILAWDSLGQKLIIEQLYDHQGNLAAATVPILLLDMWEHAFYLDYVNVKADYVKA 10 | 
FWNIVNWADVQARFDAARTKTQGLFLLS 11 | >FeSOD_A0A2E1RF15 unreviewed Superoxide 12 | MAHQLPELPYSKDALSPHISAETLDYHHGKHHNAYVTKLNAAIEGTEHAEKSIEELVKTT 13 | SGGLFNNAAQHYNHSFYWNCLAPNAGGSATGTVGEMISSKWGSFDKFKEDFSNAGAANFG 14 | SGWTWLVKNASGDLEIVNTDDAECPLTEGHTPLLTMDVWEHAYYVDYRNARPKYIEAFWN 15 | LVNWDFVNSNL 16 | >FeSOD_A0A538G8K1 unreviewed Superoxide 17 | MAYSVPPLAYDFDALEPHIDAQTMEIHHDKHHGAYVTNLNAALEGTEWMDRPIESVLASL 18 | DVIPEDKRTAVRNNGGGHANHTFFWEIMGPNGGGEPSGALADAIADTFGGLDQLKTQVND 19 | AGVKRFGSGWTWLVWDGTGLAVKSTPNQDSPVMDSDVPLLGIDVWEHAYYLRYQNRRPDY 20 | LAAWWNVVNWEAVATRYEAAK 21 | >FeSOD_A0A6L8W9C4 unreviewed Superoxide 22 | MAFELPPLPYAKDALAPHISENTLDFHYGKHHNAYVTNLNGLLEDGDSRSLEEIMKDTAG 23 | DAAKAGVFNNAAQVWNHTFYWHSMKPNGGGKPTGAIADKINEDFGSYEKFAEEFKAAGAT 24 | QFGSGWAWLVLDGGKLKVTKTPNAACPLTDGAKPLLTMDVWEHAYYLDYQNARPKYMETF 25 | LESLVNWDFANENLG 26 | >FeSOD_A0A2H0YVA1 unreviewed Superoxide 27 | MKYELPKLNYAYDALEPYLDAQTMEIHHSKHHQAYTDNFNKALAEHPDLAEMPVEELLAK 28 | VNELSIKDRDKNALRNHGGGYYNHKLFWEIMDPANKKDESLIKDIETEFGSVDSFKEQFS 29 | ETAKTLLGSGWAWLARDKEGKLHVYGLPNQDSPFQKGHTPIICLDVWEHAYYLKYQNRRP 30 | EYIENWWNVLKLI 31 | >FeSOD_R7J7P3 unreviewed Superoxide 32 | MHQMPKLPYEMEALAPLMSKETFDFHYGKHLQTYVNNLNKLIVGTPYENLELEQIVCQAD 33 | GGIYNNAAQTWNHTFFFQLLTPEQPSLPDDLAGLLTRDFGSVDQFKEDFTKAALGLFGSG 34 | WVWLVLGKDGKLSLLPTPNAGNPLKDGLKPLLVIDVWEHAYYIDYRNNRAAFIEAFWKLV 35 | NWEKVADLLG 36 | >FeSOD_B8LFE6 unreviewed Superoxide 37 | MAEYTLPDLPYDYAALEPHISGRIMELHHTKHHATYVKGANDTLDKLAEARADGSIAGKV 38 | YGLSATLSFHLGGHTNHSIFWKNLSPNGGDKPEGDLAAAIDDQFGGFDKFQAHFTAAATT 39 | LQGSGWAILGYDTIGGKLVILQLTDQSDNIPAAIIPVVMLDDMWEHAFYLDYQNVKPDYV 40 | KAWWNVVNWADAAERLGRATSQGKGLIVPA 41 | >FeSOD_A0A1C0AS03 unreviewed Superoxide 42 | MTYTLPDLDYDYGALAPHIAPEIMELHHSKHHAAYVKGINDALEQLAAAREKGDLGAVNK 43 | LSKDAAFHLGGHINHSVFWKNMSPDGGGRPDGELASAIDEYFGSFDGFQKHFNAVANGIQ 44 | GSGWSMLVWDTLGQRMNINQLFDQQGNLPAGQIPLLQLDMWEHAFYLQYKNVKGDYVTAW 45 | WNVVNWTDVTERFARAKAASAGLV 46 | >FeSOD_A0A2M8Q2V9 unreviewed Superoxide 47 | MAFELPPLPYAYDALEPHIDARTMEIHHDKHHAGYVSKLNSAIAGTEWESKSIDEILRNI 
48 | NSVPEDIRVAVQNNGGGHANHSLFWEIMGPNGGGSPSGALAEAINAAFGSFDAFKEKFSN 49 | AAASRFGSGWAWLVVDDAGNLAVYSTANQDSPYMQGHTPILGLDVWEHAYYLNYQNRRPD 50 | YIAAFWNVVNWDAVAEKFAAAKK 51 | >FeSOD_A0A2E1VW30 unreviewed Superoxide 52 | MAYELPALPYGENDLAPHITAETIQYHYGKHHAAYVGKLNELLDGDDSKSLEELIQSTDG 53 | GVFNNAAQVWNHTFYWNCMKPGGGGAPSGDLAAAIERDFGSYDAFVTKFKAAAMARFGSG 54 | WAWLVADADGKLSVVETLNAGNPMTDGLKPLLTCDVWEHAYYIDFRNARPKYLDVFFDSL 55 | VNWDFVASNL 56 | >FeSOD_A0A1V4UH29 unreviewed Superoxide 57 | MEPNRSYTLPKLPYDYSALAPSISEEQLRIHHTKHHQGYVNGANAIYEKLDKTRKDGGDI 58 | DQKATLKELSFHVGGFLLHALYWENMAPAGQGGGVPKGALGSRIEKEFGSFDQFKKEFTA 59 | AANSVEGSGWAALTYCQKTGRLLLMQIEKHNVHVFPSFSVLMVVDVWEHAYYIDYKNDRA 60 | KYLENFWNIINWDAVNSRLEKALKG 61 | >FeSOD_A0A7V9SPC9 unreviewed Superoxide 62 | MPTEVYELPDLSYDYSALEPHINARIMELHHDKHHATYVKGANTALEKLAEVRATGDFAT 63 | IAMLEKNLAFNVSGHVLHSIFWTNLSPNGGGEPDGELATALTDTFGGFEHFRKQMNEAAA 64 | TVQGAGWALASWEPIAQRLIVQQVHDHQGNHGQGTIPLLAIDAWEHAYYLQYENRKTEFF 65 | DAVWNVVSWGDVEARFKAARNAELIRQT 66 | >FeSOD_R7F5H2 unreviewed Superoxide 67 | MIKKINLEYPLDSLEPYYSRETLNIHYNTLYVGYVDNTNITLEKLEKARKERNFENIKCL 68 | EKNLSFFGSGVILHELFFENMGPAIPSSPDINLMEQINKDFGSFELFKEQFTESSKVVEA 69 | SGWNLLVWVPRFNKLEIIQCEKHQDLTLWNCKPILVLDMWEHSYFLQYKANRGEYIKAFW 70 | NIINWNNVNKRFRNTIKY 71 | >FeSOD_A0A2N5YS14 unreviewed Superoxide 72 | MFELPKLPYEFNSLEPKISAKTVEFHYTKHHQVYVNKLNGLIEGTDYAGKTLEEIIKTSE 73 | GGIFNNAAQVWNHTFYWEGFGPNPQSAPSGKLAEMINETFGSFEKFKEEFSTKAATLFGS 74 | GWAWLVLDNGQLKITGTSNAGSPLTEGHKPILTCDVWEHAYYLDYQNLRPKYIENFWELV 75 | DWKKIEGRI 76 | >FeSOD_A0A0F6MY72 unreviewed Superoxide 77 | MAFELPSLPFDQDALESSKMSANTLSYHHGKHHAAYVKNLNAAIEGTDMANMSLEEIIKA 78 | TYNDPSKSGIFNNAAQVWNHSFFWKCLKPNGGGQPTGALADKIQADFGSFDAFIQEFKNA 79 | AATQFGSGWAWLVLDNGTLKVTKTANAVNPMVEGKTPLLTLDVWEHAYYLDFQNARPGFI 80 | DNFIENLVNWDFVAENLASAS 81 | >FeSOD_A0A060HP82 unreviewed Superoxide 82 | MPRRPSHLMANFTLPQLPYAYDALEPHIDATTMQIHHTKHHQAYTDGLNKALGSLDAKFQ 83 | SMDAVDILKNIDTVPENARGAVNFHGGGYNNHTLFWNNMKKGGGGEPSGELADAIKKAFG 84 | 
SFADFKTKFQTDSVAIQGSGWGWLVKNASGGVQFITMPNQTSPWTRWKAEKLTPLLGLDV 85 | WEHSYYLKYQNRRADYVTAWWNVVNWDEVAKRFKA 86 | >FeSOD_A0A4P7WS39 unreviewed Superoxide 87 | MKITHQLPELPFNKSALNPIITEETFDYHYGKHHAAYVNNLATLIQDTELINFSIEDIIK 88 | KGFYEKNASLFNNAAQHWSHTFFWNCLSPNGGKAPVGRITELITRDFGSFELFKDQFSNA 89 | AIKLFGCGWAWLVQDENDKLEIIAMKEAQTPLILNKKPILTLDVWEHAYYIDYKNARPKF 90 | VEGFWDIVNWDFANKNVI 91 | >FeSOD_A0A076JJX0 unreviewed Superoxide 92 | MPVYTLPELPYDYSALEPYVSGKIMELHHDKHHQAYVNGANQALEQIHDAAESGNVAQSN 93 | LLEKNLAFNLAGHKNHTIFWKNMAPSIGQEPTGELKAAIEDQFGSFEGFQRYFESMCAGI 94 | QGSGWAVLAWDSLGERLVTLQMYDHQGNLPVTIFPLILLDLWEHAYYLDYLNVRADYVKA 95 | WWHIVNWEDASKRFDEVRNLNTNLVK 96 | >FeSOD_G8R729 unreviewed Superoxide 97 | MSFELPDLPYSKSALEPYIDAQTMEIHHDKHHAGYTTKLNDAIEGTELEKQSIEDILKNV 98 | SKHSGGVRNNGGGYYNHSLFWSIMGPDAGGDPTGDVGAAIDDAFGSYENFKTEFSNAAAT 99 | RFGSGWAWLIVNGEGKLEVTSSPNQDNPLMDVAEKKGTPILGLDVWEHAYYLKYQNKRPD 100 | YISAFFNVINWDEVNRRFAEAK 101 | -------------------------------------------------------------------------------- /test/data/foldseek/FeSOD_h: -------------------------------------------------------------------------------- 1 | FeSOD_A0A1F4ZT98 unreviewed Superoxide 2 | FeSOD_A0A067LT26 unreviewed Superoxide 3 | FeSOD_A0A2E1RF15 unreviewed Superoxide 4 | FeSOD_A0A538G8K1 unreviewed Superoxide 5 | FeSOD_A0A6L8W9C4 unreviewed Superoxide 6 | FeSOD_A0A2H0YVA1 unreviewed Superoxide 7 | FeSOD_R7J7P3 unreviewed Superoxide 8 | FeSOD_B8LFE6 unreviewed Superoxide 9 | FeSOD_A0A1C0AS03 unreviewed Superoxide 10 | FeSOD_A0A2M8Q2V9 unreviewed Superoxide 11 | FeSOD_A0A2E1VW30 unreviewed Superoxide 12 | FeSOD_A0A1V4UH29 unreviewed Superoxide 13 | FeSOD_A0A7V9SPC9 unreviewed Superoxide 14 | FeSOD_R7F5H2 unreviewed Superoxide 15 | FeSOD_A0A2N5YS14 unreviewed Superoxide 16 | FeSOD_A0A0F6MY72 unreviewed Superoxide 17 | FeSOD_A0A060HP82 unreviewed Superoxide 18 | FeSOD_A0A4P7WS39 unreviewed Superoxide 19 | FeSOD_A0A076JJX0 unreviewed Superoxide 20 | FeSOD_G8R729 unreviewed Superoxide 21 | 
-------------------------------------------------------------------------------- /test/data/foldseek/FeSOD_h.dbtype: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /test/data/foldseek/FeSOD_h.index: -------------------------------------------------------------------------------- 1 | 0 0 40 2 | 1 40 40 3 | 2 80 40 4 | 3 120 40 5 | 4 160 40 6 | 5 200 40 7 | 6 240 36 8 | 7 276 36 9 | 8 312 40 10 | 9 352 40 11 | 10 392 40 12 | 11 432 40 13 | 12 472 40 14 | 13 512 36 15 | 14 548 40 16 | 15 588 40 17 | 16 628 40 18 | 17 668 40 19 | 18 708 40 20 | 19 748 36 21 | -------------------------------------------------------------------------------- /test/data/foldseek/FeSOD_ss: -------------------------------------------------------------------------------- 1 | DDDADDAPDDLCLLPPLAHNVLSCCQRNPQLVVLSVLLCVLCVVPDPDDLLVCLVCLVVDDPVSSVSSLQSSLSNLLSSLLSVLFGSDFDAFDPVLQVQCCVQQNHPVSVLVVQLVQLVPDPAWWKWFWFQDPVGTHIDIDHRSHDCSNVVTDTSDMDTCHCSNPCVVCNVPSSSSSVSRSSTTNRVSSNVSSVVD 2 | DDDDDADAAPDDLCLLPPLAHSVLSCCCRPPQLVVLRVQLRVLVVQLVVCVVVVNPVSNVVSLLSNLLSVLSNLLSNLLSVLFANQFFAADDDPLQVLCCVQQNHPVSVLVVQLVQLLPDDFFWKWFWWQQLVVRGTYIDIDHHSPPCDSVRTDTLDIDTSHCSRCCVVQNPPSSSSSVSRVRTTPRVSSNVSSVCSVVVDPDPDPDD 3 | DADAADDQPDDLCLLPPLAHSVLSCCCRNPQLVVLRVQLRVLCPPHPCSPDDLLVCLQPDDDSSNLSSLLSVLSNLLSVQFANPFFAAADDPLQVLCCVQQNHPVSVLVVQLVQLLPDAAWWKWFWWQDPVRRTDIDIDHRRHDCNNVRTDTSDMDTCHCSNPCVRCNPPSSSSSVSNSSTTPRVSSNVSD 4 | DADDQDDFPDDLCLLPPLAHSVLSCCCRPPQLNVLSVQLRVLCPPHPCLVDDLLVCLVPLVPDDPVSSLSNQQSSLSNLLSSLLSVQFANQFFAAADDPLQVLCCVQQNHPVSVLVVQLVQLVPDPAWWKWFWWDPPPGTDIDIDHRSHHCSDPPTRTLDMDTCHCSNPCVVQNVPSSSSSVSSSRTYNRVSSNVSSVVSD 5 | DADDADDQPDDLCLLPPLAHSVLSCCCRNPQLVVLRVQLCVLCVVPDPDDLLVLLLVQLPPPVSVSSNLSSQLSVLSCLLSVLFGNNFFDAADDPLQVQCCVQQNHPVSVLVVQLVQLVPDPAWWKWFWWDDPSGTHIDIDHRSHGSSNVPTDTLDMDTCHCSNCCPVCNPPSSSSSVSCRVRTGDNVSSNVSSD 6 | 
DADDADDFPDDLCLLPPLAHRVLSCCCRPPQLVVLSVQLRVLCVVPVVLSPDDLLVCLVCLVVDPDDDPSSVSSQLSSLSNLLSVLLSVQLHSPDDDDPVLQVQCCVQQNHPVSVLVVQLVQLLPDDAWWKWFWWADPVRGTDIDIDHRSHDCSNVVTHTSDMDTCHCSNPCVVCNVCSSSSSVSSVVSGDRD 7 | DDDQDDQPDDLCLCPPLAHSVLSCCCRNPQLVVLSVQLCVLCPPHPCNPDDLLVSLQPDDDSSNLSSLLNCLSNVLLVLFAPDFDAADPVLQVLCCVQQNHPVSVLVVQLCQQLPDDAWWKWFWFQDPVRGTDIDTDHSSHGCVVVVTDTLDIDTCHCSNCCVVCNPPSNSSSVSSSSTTPSVSSSVSND 8 | DDADDADDQPDDLCLLPPLFHSVLSCCCRPPQLVVLSVQLRVLVVVLVVCVVVVNCPVCNVVSVVSNLLSSLSNLLSVLLSVLFHSQFFDAADDPLQVLCCVQQNHPVSVLVVQLCQLLPDDAWWKWWWWQQLQVRGTYIDIDHHSSPCDDSSRTDTSDIDTCHCSNCCVVCNPPSSSSSVSRVRTTNRVSSNVSSVVSVVSSPPDDDDD 9 | DADDADDFPDDLCLLPPFAHSVLSCCCRPPQLVVLSVQLNVLVVQLVVCVVVVNPVSNLVSLLSNLQSVLSNLLSNLLSVLFASQFFAAADDPLQVLCCVQQNHPVSVLVVQLVQLLPDDAWWKWFWWQQLVVRGTYIDIDHGSSPDDDPRIDTLDIDTCHCSNPCVVQNPPSSSSSVSSSRTTPRVSSNVSSVSSVVSSVVSD 10 | DADDADDQPDDLCLLPPLAHSVLSCCCRPPQLVVLSVLLRVLCPPHPCSVDDLLVCLVPLVSDDPVSSVSNLQSSLSNLQSSLLSVQFGNNFADAADDPLQVLCCVQQNHPVSVLVVLLCQLLPDPAWWKWFWKQALVLGTDIDIDHRSHDCSNVRTDTSDMDTCHCSNPCVVQNVCSSSSSVSSSRTTNRVSSNVSSVVSVD 11 | DADDADDQPDDLCLLPPLAHSVLSCCCRNPQLVVLSVQLRVLCVVPPPDDLLRCLQPDDDPSNLSSQLNVLSNLLSNLFGNPFFDADDDPLQVQCCVQQNHPVSVLVVQLVQLVPDPAWWKWFWWADPVRGTDIDTDHSSHDSSDVPTDTSDMDTCHCSNPCPVCNPPRSSSSVSCRVTTTDNVSSNVSD 12 | DDDPDADDQDDQPDDLCLLPPLAHSVLSCCCRRPQLSLLSVQLRVLSVVVVVCVVVVHDDDPVVSLLSNLQSSQSNLLSQLLSVLFAAQVLFFAADDDPLQVLCCVQQNHPVSVLVVQLVQLVPDPAWWKWFWWQDLVPRGTYIDIDGRSDPVDDPRTRTSDIDTCHCSNPCVVQNPPSNSSSVSSSRTTNSVSSNVSSVVSVVD 13 | DDDDADDADFQPDDLCLLPPQAHSVLSCCCRPPQLVVLSVQLNVLVVVLVVCVVVVPPVSNVVSLVSNLLSSLSNLLSNLLSVLFGNQFFAAADDPLQVLCCVQQNHPVSVLVVQLVQLLPDPAWWKWFWWQQLVVRGIYIDTDHHSCPCDDPRTQTSDIDGSHCSNPCVVPNPPSSSSSVSRVRTGDRVSSNVSSVVSVVSVVSVVD 14 | DDDADAAPDDLCLLPPLAHSVLSCCCRPPQLVVLSVQLRVLVVVLVVCVVVVNCVSNVVSLVSNLLSVLSNLLSCLLSVQFGAFDPDAFDPVLQVLCCVQQNHPVSVLVVQLVQLVPDPAWWKWFWFQQLVVRGTDIDIDGGSSDCVVSRIDTLDMDTCHCSNPCVVQNPPSSSSSVSRVRTTPRVSSNVSSVVRVVD 15 | 
DDDADDQPDDLCLLPPLAHSVLSCCCRPPQLVVLRVVLCVLCPPHPCNPDDLLVCLQPDDDSSNLSSLLSCLSNLLSVQFALDDPDADDDPLQVQCCVQQNHPVSVLVVQLVQLLPDAAWWKWFWWQDPSGTHIDIDHRSHGSSNVPTDTSDMDTCHCSNQCVVCNPPSSSSSVSSVVGHPSVSSNVSD 16 | DADDADDAPDDLCQCPDQLAHSVLSCCCRNPQLVVLSVQLRVLCPPDPCNPDDLLVLLLVQLPPPVSVSSNLSSQLSVLSVLLSVLFGNNFFAFADDPLQVQCCVQQNHPVSVLVVQLVLLVPPPAWWKWFFWDDPSGTHIDIDHRSDDCSDVPTDTLDMDTCHCSNCCVVCNVPRSSSSVSCRVTTTDNVSSNVSSVVND 17 | DDDDPPPPPDQDDADDQPDDLCLLPPLAHSVLSCCCRPPQLVVLSVQLRVLCVPDDPVLLPDDLLVCLLPLVSRDPSSSVSNQQSSLSNLLSSLQSVQAGHQFFDAADDPLQVLCCVQQNHPVSVLVVQLVQQLPDDAWWKWWWWADPVRGTDIDIDHRSHDSSDDPPPVRIDTSDMDTSHCSNQCVVQNPPSNSSSVSNVRTTPRVSSNVSNVD 18 | DDDDDDADDQPDDLCLLPPLAHSVLSCCCRNPQLSVLSVQLRVLCPPDPCSPDDLLVLLVVCVVVVNVSSNLSSQLNVLSNLLSVLFGLQFFDADDDPLVVQCCVQQNHPVSVLVVQLCCLLPDDAWWKWFWWADPVRGTDIDTDHHSHHCVVVVTHTLDMDTCHCSNCCVVQNPPSSSSSVSRVSTTNRVSSNVSRD 19 | DDADDADDQPDDLCLLPPQFHSVLSCCCRPPQLVVLSVQLRVLVVQLVVCVVVVNPVSNLVSLLSNLLSVLSNLLSSLLSVLLGNDDDAAADDPLQVLCCVQQNHPVSVLVVQLCQLLPDDAWWKWFWWQQLVVRGIYIDIDHHSPPDDDPRIDTLDIDTCHCSRCCVVQNPPSSSSSVSRVRTTPRVSSNVSSVVSVVVDVVSVD 20 | DADDADAAPDDLCLLPPLAHNVLSCCCRPPQLVVLRVQLRVLCPPHPLLVDDLLVCLVCLVVDDPSSLLSSLSNLLSSLLSVQFGNQLFDFADDPQQVLCCVQQNHPVSVLVVQLVQLVPDPAWWKWFWWQALVLGTDIDIDHRSHDSCPPPVPRRTRTSDMDTSHCSNQCVVQNPPSSSSSVSNVRTTNRVSSNVSSVVSD 21 | -------------------------------------------------------------------------------- /test/data/foldseek/FeSOD_ss.dbtype: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /test/data/foldseek/FeSOD_ss.index: -------------------------------------------------------------------------------- 1 | 0 0 198 2 | 1 198 210 3 | 2 408 193 4 | 3 601 203 5 | 4 804 197 6 | 5 1001 195 7 | 6 1196 192 8 | 7 1388 212 9 | 8 1600 206 10 | 9 1806 205 11 | 10 2011 192 12 | 11 2203 207 13 | 12 2410 210 14 | 13 2620 200 15 | 14 2820 191 16 | 15 3011 203 17 | 16 3214 217 18 | 17 3431 200 19 | 18 3631 208 20 | 19 3839 204 21 | 
-------------------------------------------------------------------------------- /test/data/metadata_FeSOD_20.tsv: -------------------------------------------------------------------------------- 1 | group 2 | FeSOD_A0A1F4ZT98|unreviewed|Superoxide g2 3 | FeSOD_A0A067LT26|unreviewed|Superoxide g3 4 | FeSOD_A0A2E1RF15|unreviewed|Superoxide g1 5 | FeSOD_A0A538G8K1|unreviewed|Superoxide g2 6 | FeSOD_A0A6L8W9C4|unreviewed|Superoxide g1 7 | FeSOD_A0A2H0YVA1|unreviewed|Superoxide g2 8 | FeSOD_R7J7P3|unreviewed|Superoxide g1 9 | FeSOD_B8LFE6|unreviewed|Superoxide g3 10 | FeSOD_A0A1C0AS03|unreviewed|Superoxide g3 11 | FeSOD_A0A2M8Q2V9|unreviewed|Superoxide g2 12 | FeSOD_A0A2E1VW30|unreviewed|Superoxide g1 13 | FeSOD_A0A1V4UH29|unreviewed|Superoxide g2 14 | FeSOD_A0A7V9SPC9|unreviewed|Superoxide g3 15 | FeSOD_R7F5H2|unreviewed|Superoxide g2 16 | FeSOD_A0A2N5YS14|unreviewed|Superoxide g1 17 | FeSOD_A0A0F6MY72|unreviewed|Superoxide g1 18 | FeSOD_A0A060HP82|unreviewed|Superoxide g2 19 | FeSOD_A0A4P7WS39|unreviewed|Superoxide g1 20 | FeSOD_A0A076JJX0|unreviewed|Superoxide g3 21 | FeSOD_G8R729|unreviewed|Superoxide g2 22 | -------------------------------------------------------------------------------- /test/data/pDONR201.fasta: -------------------------------------------------------------------------------- 1 | >pDONR201 2 | 
CTTTCCTGCGTTATCCCCTGATTCTGTGGATAACCGTATTACCGCTAGCCAGGAAGAGTTTGTAGAAACGCAAAAAGGCCATCCGTCAGGATGGCCTTCTGCTTAGTTTGATGCCTGGCAGTTTATGGCGGGCGTCCTGCCCGCCACCCTCCGGGCCGTTGCTTCACAACGTTCAAATCCGCTCCCGGCGGATTTGTCCTACTCAGGAGAGCGTTCACCGACAAACAACAGATAAAACGAAAGGCCCAGTCTTCCGACTGAGCCTTTCGTTTTATTTGATGCCTGGCAGTTCCCTACTCTCGCGTTAACGCTAGCATGGATCTCGGGCCCCAAATAATGATTTTATTTTGACTGATAGTGACCTGTTCGTTGCAACAAATTGATGAGCAATGCTTTTTTATAATGCCAAGTTTGTACAAAAAAGCAGAACGAGAAACGTAAAATGATATAAATATCAATATATTAAATTAGATTTTGCATAAAAAACAGACTACATAATACTGTAAAACACAACATATCCAGTCACTATGAATCAACTACTTAGATGGTATTAGTGACCTGTAGTCGACCGACAGCCTTCCAAATGTTCTTCGGGTGATGCTGCCAACTTAGTCGACCGACAGCCTTCCAAATGTTCTTCTCAAACGGAATCGTCGTATCCAGCCTACTCGCTATTGTCCTCAATGCCGTATTAAATCATAAAAAGAAATAAGAAAAAGAGGTGCGAGCCTCTTTTTTGTGTGACAAAATAAAAACATCTACCTATTCATATACGCTAGTGTCATAGTCCTGAAAATCATCTGCATCAAGAACAATTTCACAACTCTTATACTTTTCTCTTACAAGTCGTTCGGCTTCATCTGGATTTTCAGCCTCTATACTTACTAAACGTGATAAAGTTTCTGTAATTTCTACTGTATCGACCTGCAGACTGGCTGTGTATAAGGGAGCCTGACATTTATATTCCCCAGAACATCAGGTTAATGGCGTTTTTGATGTCATTTTCGCGGTGGCTGAGATCAGCCACTTCTTCCCCGATAACGGAGACCGGCACACTGGCCATATCGGTGGTCATCATGCGCCAGCTTTCATCCCCGATATGCACCACCGGGTAAAGTTCACGGGAGACTTTATCTGACAGCAGACGTGCACTGGCCAGGGGGATCACCATCCGTCGCCCGGGCGTGTCAATAATATCACTCTGTACATCCACAAACAGACGATAACGGCTCTCTCTTTTATAGGTGTAAACCTTAAACTGCATTTCACCAGTCCCTGTTCTCGTCAGCAAAAGAGCCGTTCATTTCAATAAACCGGGCGACCTCAGCCATCCCTTCCTGATTTTCCGCTTTCCAGCGTTCGGCACGCAGACGACGGGCTTCATTCTGCATGGTTGTGCTTACCAGACCGGAGATATTGACATCATATATGCCTTGAGCAACTGATAGCTGTCGCTGTCAACTGTCACTGTAATACGCTGCTTCATAGCACACCTCTTTTTGACATACTTCGGGTATACATATCAGTATATATTCTTATACCGCAAAAATCAGCGCGCAAATACGCATACTGTTATCTGGCTTTTAGTAAGCCGGATCCACGCGATTACGCCCCGCCCTGCCACTCATCGCAGTACTGTTGTAATTCATTAAGCATTCTGCCGACATGGAAGCCATCACAGACGGCATGATGAACCTGAATCGCCAGCGGCATCAGCACCTTGTCGCCTTGCGTATAATATTTGCCCATGGTGAAAACGGGGGCGAAGAAGTTGTCCATATTGGCCACGTTTAAATCAAAACTGGTGAAACTCACCCAGGGATTGGCTGAGACGAAAAACATATTCTCAATAAACCCTTTAGGGAAATAGGCCAGGTTTTCACCGTAACACGCCACATCTTGCGAATATATGTGTAGAAACTGCCGGAAATCGTCGTGGTATTCACTCCAGAGCGATGAAAACGTTTCAGTTTGCTCATGGAAAACGGTGTAACAAGGGT
GAACACTATCCCATATCACCAGCTCACCGTCTTTCATTGCCATACGGAATTCCGGATGAGCATTCATCAGGCGGGCAAGAATGTGAATAAAGGCCGGATAAAACTTGTGCTTATTTTTCTTTACGGTCTTTAAAAAGGCCGTAATATCCAGCTGAACGGTCTGGTTATAGGTACATTGAGCAACTGACTGAAATGCCTCAAAATGTTCTTTACGATGCCATTGGGATATATCAACGGTGGTATATCCAGTGATTTTTTTCTCCATTTTAGCTTCCTTAGCTCCTGAAAATCTCGATAACTCAAAAAATACGCCCGGTAGTGATCTTATTTCATTATGGTGAAAGTTGGAACCTCTTACGTGCCGATCAACGTCTCATTTTCGCCAAAAGTTGGCCCAGGGCTTCCCGGTATCAACAGGGACACCAGGATTTATTTATTCTGCGAAGTGATCTTCCGTCACAGGTATTTATTCGGCGCAAAGTGCGTCGGGTGATGCTGCCAACTTAGTCGACTACAGGTCACTAATACCATCTAAGTAGTTGATTCATAGTGACTGGATATGTTGTGTTTTACAGTATTATGTAGTCTGTTTTTTATGCAAAATCTAATTTAATATATTGATATTTATATCATTTTACGTTTCTCGTTCAGCTTTCTTGTACAAAGTGGGCATTATAAGAAAGCATTGCTTATCAATTTGTTGCAACGAACAGGTCACTATCAGTCAAAATAAAATCATTATTTGCCATCCAGCTGCAGCTCTGGCCCGTGTCTCAAAATCTCTGATGTTACATTGCACAAGATAAAAATATATCATCATGAACAATAAAACTGTCTGCTTACATAAACAGTAATACAAGGGGTGTTATGAGCCATATTCAACGGGAAACGTCGAGGCCGCGATTAAATTCCAACATGGATGCTGATTTATATGGGTATAAATGGGCTCGCGATAATGTCGGGCAATCAGGTGCGACAATCTATCGCTTGTATGGGAAGCCCGATGCGCCAGAGTTGTTTCTGAAACATGGCAAAGGTAGCGTTGCCAATGATGTTACAGATGAGATGGTCAGACTAAACTGGCTGACGGAATTTATGCCTCTTCCGACCATCAAGCATTTTATCCGTACTCCTGATGATGCATGGTTACTCACCACTGCGATCCCCGGAAAAACAGCATTCCAGGTATTAGAAGAATATCCTGATTCAGGTGAAAATATTGTTGATGCGCTGGCAGTGTTCCTGCGCCGGTTGCATTCGATTCCTGTTTGTAATTGTCCTTTTAACAGCGATCGCGTATTTCGTCTCGCTCAGGCGCAATCACGAATGAATAACGGTTTGGTTGATGCGAGTGATTTTGATGACGAGCGTAATGGCTGGCCTGTTGAACAAGTCTGGAAAGAAATGCATAAACTTTTGCCATTCTCACCGGATTCAGTCGTCACTCATGGTGATTTCTCACTTGATAACCTTATTTTTGACGAGGGGAAATTAATAGGTTGTATTGATGTTGGACGAGTCGGAATCGCAGACCGATACCAGGATCTTGCCATCCTATGGAACTGCCTCGGTGAGTTTTCTCCTTCATTACAGAAACGGCTTTTTCAAAAATATGGTATTGATAATCCTGATATGAATAAATTGCAGTTTCATTTGATGCTCGATGAGTTTTTCTAATCAGAATTGGTTAATTGGTTGTAACACTGGCAGAGCATTACGCTGACTTGACGGGACGGCGCAAGCTCATGACCAAAATCCCTTAACGTGAGTTTTCGTTCCACTGAGCGTCAGACCCCGTAGAAAAGATCAAAGGATCTTCTTGAGATCCTTTTTTTCTGCGCGTAATCTGCTGCTTGCAAACAAAAAAACCACCGCTACCAGCGGTGGTTTGTTTGCCGGATCAAGAGCTACCAACTCTTTTTCCGAAGGTAACTGGCTTCAGCAGAGCGCAGATACCAAATACTGTCCTTCTAGTGTAGCCGTAGTTAGGCCACCACTTC
AAGAACTCTGTAGCACCGCCTACATACCTCGCTCTGCTAATCCTGTTACCAGTGGCTGCTGCCAGTGGCGATAAGTCGTGTCTTACCGGGTTGGACTCAAGACGATAGTTACCGGATAAGGCGCAGCGGTCGGGCTGAACGGGGGGTTCGTGCACACAGCCCAGCTTGGAGCGAACGACCTACACCGAACTGAGATACCTACAGCGTGAGCTATGAGAAAGCGCCACGCTTCCCGAAGGGAGAAAGGCGGACAGGTATCCGGTAAGCGGCAGGGTCGGAACAGGAGAGCGCACGAGGGAGCTTCCAGGGGGAAACGCCTGGTATCTTTATAGTCCTGTCGGGTTTCGCCACCTCTGACTTGAGCGTCGATTTTTGTGATGCTCGTCAGGGGGGCGGAGCCTATGGAAAAACGCCAGCAACGCGGCCTTTTTACGGTTCCTGGCCTTTTGCTGGCCTTTTGCTCACATGTT 3 | -------------------------------------------------------------------------------- /test/data/pDONR201_genemark.gff: -------------------------------------------------------------------------------- 1 | ##gff-version 2 2 | # GeneMark.hmm-2 prokaryotic version: 1.24 3 | # File with sequence: pDONR201.fasta 4 | # File with MetaGeneMark parameters: /mnt/rbg/programs/srj_nextflow/workflows/metagenemark/resources/mgm_11.mod 5 | # translation table: 11 6 | # output date start: Wed Sep 22 12:11:10 2021 7 | 8 | ##sequence-region pDONR201 1 4470 9 | pDONR201 GeneMark.hmm2 CDS 2 106 5.09 + 0 gene_id pDONR201_1; gene_type bacteria; partial 10; gc 49; length 105; 10 | pDONR201 GeneMark.hmm2 CDS 959 1264 20.71 - 0 gene_id pDONR201_2; gene_type bacteria; complete; gc 50; length 306; 11 | pDONR201 GeneMark.hmm2 CDS 1266 1391 5.68 - 0 gene_id pDONR201_3; gene_type bacteria; complete; gc 54; length 126; 12 | pDONR201 GeneMark.hmm2 CDS 1606 2265 41.74 - 0 gene_id pDONR201_4; gene_type bacteria; complete; gc 45; length 660; 13 | pDONR201 GeneMark.hmm2 CDS 2916 3677 51.92 + 0 gene_id pDONR201_5; gene_type bacteria; complete; gc 43; length 762; 14 | pDONR201 GeneMark.hmm2 CDS 4378 4470 2.26 + 0 gene_id pDONR201_6; gene_type bacteria; partial 01; gc 56; length 93; 15 | # pDONR201 total_logodd 127.403 average_length 342 average_density 1.34 16 | -------------------------------------------------------------------------------- /test/data/pDONR201_multi_genemark.gff: 
-------------------------------------------------------------------------------- 1 | ##gff-version 2 2 | # GeneMark.hmm-2 prokaryotic version: 1.24 3 | # File with sequence: pDONR201.fasta 4 | # File with MetaGeneMark parameters: /mnt/rbg/programs/srj_nextflow/workflows/metagenemark/resources/mgm_11.mod 5 | # translation table: 11 6 | # output date start: Wed Sep 22 12:11:10 2021 7 | 8 | pDONR201_1 GeneMark.hmm2 CDS 2 106 5.09 + 0 gene_id pDONR201_1; gene_type bacteria; partial 10; gc 49; length 105; 9 | pDONR201_1 GeneMark.hmm2 CDS 959 1264 20.71 - 0 gene_id pDONR201_2; gene_type bacteria; complete; gc 50; length 306; 10 | pDONR201_1 GeneMark.hmm2 CDS 1266 1391 5.68 - 0 gene_id pDONR201_3; gene_type bacteria; complete; gc 54; length 126; 11 | pDONR201_1 GeneMark.hmm2 CDS 1606 2265 41.74 - 0 gene_id pDONR201_4; gene_type bacteria; complete; gc 45; length 660; 12 | pDONR201_1 GeneMark.hmm2 CDS 2916 3677 51.92 + 0 gene_id pDONR201_5; gene_type bacteria; complete; gc 43; length 762; 13 | pDONR201_1 GeneMark.hmm2 CDS 4378 4470 2.26 + 0 gene_id pDONR201_6; gene_type bacteria; partial 01; gc 56; length 93; 14 | 15 | pDONR201_2 GeneMark.hmm2 CDS 2 106 5.09 + 0 gene_id pDONR201_1; gene_type bacteria; partial 10; gc 49; length 105; 16 | pDONR201_2 GeneMark.hmm2 CDS 959 1264 20.71 - 0 gene_id pDONR201_2; gene_type bacteria; complete; gc 50; length 306; 17 | pDONR201_2 GeneMark.hmm2 CDS 1266 1391 5.68 - 0 gene_id pDONR201_3; gene_type bacteria; complete; gc 54; length 126; 18 | pDONR201_2 GeneMark.hmm2 CDS 1606 2265 41.74 - 0 gene_id pDONR201_4; gene_type bacteria; complete; gc 45; length 660; 19 | pDONR201_2 GeneMark.hmm2 CDS 2916 3677 51.92 + 0 gene_id pDONR201_5; gene_type bacteria; complete; gc 43; length 762; 20 | pDONR201_2 GeneMark.hmm2 CDS 4378 4470 2.26 + 0 gene_id pDONR201_6; gene_type bacteria; partial 01; gc 56; length 93; 21 | 22 | pDONR201_3 GeneMark.hmm2 CDS 2 106 5.09 + 0 gene_id pDONR201_1; gene_type bacteria; partial 10; gc 49; length 105; 23 | pDONR201_3 
GeneMark.hmm2 CDS 959 1264 20.71 - 0 gene_id pDONR201_2; gene_type bacteria; complete; gc 50; length 306; 24 | pDONR201_3 GeneMark.hmm2 CDS 1266 1391 5.68 - 0 gene_id pDONR201_3; gene_type bacteria; complete; gc 54; length 126; 25 | pDONR201_3 GeneMark.hmm2 CDS 1606 2265 41.74 - 0 gene_id pDONR201_4; gene_type bacteria; complete; gc 45; length 660; 26 | pDONR201_3 GeneMark.hmm2 CDS 2916 3677 51.92 + 0 gene_id pDONR201_5; gene_type bacteria; complete; gc 43; length 762; 27 | pDONR201_3 GeneMark.hmm2 CDS 4378 4470 2.26 + 0 gene_id pDONR201_6; gene_type bacteria; partial 01; gc 56; length 93; 28 | 29 | pDONR201_4 GeneMark.hmm2 CDS 2 106 5.09 + 0 gene_id pDONR201_1; gene_type bacteria; partial 10; gc 49; length 105; 30 | pDONR201_4 GeneMark.hmm2 CDS 959 1264 20.71 - 0 gene_id pDONR201_2; gene_type bacteria; complete; gc 50; length 306; 31 | pDONR201_4 GeneMark.hmm2 CDS 1266 1391 5.68 - 0 gene_id pDONR201_3; gene_type bacteria; complete; gc 54; length 126; 32 | pDONR201_4 GeneMark.hmm2 CDS 1606 2265 41.74 - 0 gene_id pDONR201_4; gene_type bacteria; complete; gc 45; length 660; 33 | pDONR201_4 GeneMark.hmm2 CDS 2916 3677 51.92 + 0 gene_id pDONR201_5; gene_type bacteria; complete; gc 43; length 762; 34 | pDONR201_4 GeneMark.hmm2 CDS 4378 4470 2.26 + 0 gene_id pDONR201_6; gene_type bacteria; partial 01; gc 56; length 93; -------------------------------------------------------------------------------- /test/data/pDONR201_multi_subset.txt: -------------------------------------------------------------------------------- 1 | 2 | pDONR201_4 3 | -------------------------------------------------------------------------------- /test/data/pdonr_peptides.fasta: -------------------------------------------------------------------------------- 1 | >pDONR201_1 2 | FPALSPDSVDNRITASQEEFVETQKGHPSGWPSA* 3 | >pDONR201_2 4 | MQFKVYTYKRESRYRLFVDVQSDIIDTPGRRMVIPLASARLLSDKVSRELYPVVHIGDESWRMMTTDMASVPVSVIGEEV 5 | ADLSHRENDIKNAINLMFWGI* 6 | >pDONR201_3 7 | MQNEARRLRAERWKAENQEGMAEVARFIEMNGSFADENRDW* 
8 | >pDONR201_4 9 | MEKKITGYTTVDISQWHRKEHFEAFQSVAQCTYNQTVQLDITAFLKTVKKNKHKFYPAFIHILARLMNAHPEFRMAMKDG 10 | ELVIWDSVHPCYTVFHEQTETFSSLWSEYHDDFRQFLHIYSQDVACYGENLAYFPKGFIENMFFVSANPWVSFTSFDLNV 11 | ANMDNFFAPVFTMGKYYTQGDKVLMPLAIQVHHAVCDGFHVGRMLNELQQYCDEWQGGA* 12 | >pDONR201_5 13 | MDADLYGYKWARDNVGQSGATIYRLYGKPDAPELFLKHGKGSVANDVTDEMVRLNWLTEFMPLPTIKHFIRTPDDAWLLT 14 | TAIPGKTAFQVLEEYPDSGENIVDALAVFLRRLHSIPVCNCPFNSDRVFRLAQAQSRMNNGLVDASDFDDERNGWPVEQV 15 | WKEMHKLLPFSPDSVVTHGDFSLDNLIFDEGKLIGCIDVGRVGIADRYQDLAILWNCLGEFSPSLQKRLFQKYGIDNPDM 16 | NKLQFHLMLDEFF* 17 | -------------------------------------------------------------------------------- /test/data/score3.sparse.tsv: -------------------------------------------------------------------------------- 1 | FeSOD_A0A1F4ZT98|unreviewed|Superoxide FeSOD_A0A1F4ZT98|unreviewed|Superoxide 410.0 2 | FeSOD_A0A1F4ZT98|unreviewed|Superoxide FeSOD_A0A2M8Q2V9|unreviewed|Superoxide 216.0 3 | FeSOD_A0A1F4ZT98|unreviewed|Superoxide FeSOD_A0A538G8K1|unreviewed|Superoxide 199.0 4 | FeSOD_A0A067LT26|unreviewed|Superoxide FeSOD_A0A067LT26|unreviewed|Superoxide 429.0 5 | FeSOD_A0A067LT26|unreviewed|Superoxide FeSOD_A0A1C0AS03|unreviewed|Superoxide 279.0 6 | FeSOD_A0A067LT26|unreviewed|Superoxide FeSOD_A0A076JJX0|unreviewed|Superoxide 277.0 7 | FeSOD_A0A2E1RF15|unreviewed|Superoxide FeSOD_A0A2E1RF15|unreviewed|Superoxide 405.0 8 | FeSOD_A0A2E1RF15|unreviewed|Superoxide FeSOD_A0A6L8W9C4|unreviewed|Superoxide 239.0 9 | FeSOD_A0A2E1RF15|unreviewed|Superoxide FeSOD_A0A2E1VW30|unreviewed|Superoxide 232.0 10 | FeSOD_A0A538G8K1|unreviewed|Superoxide FeSOD_A0A538G8K1|unreviewed|Superoxide 425.0 11 | FeSOD_A0A538G8K1|unreviewed|Superoxide FeSOD_A0A2M8Q2V9|unreviewed|Superoxide 287.0 12 | FeSOD_A0A538G8K1|unreviewed|Superoxide FeSOD_G8R729|unreviewed|Superoxide 229.0 13 | FeSOD_A0A6L8W9C4|unreviewed|Superoxide FeSOD_A0A6L8W9C4|unreviewed|Superoxide 411.0 14 | FeSOD_A0A6L8W9C4|unreviewed|Superoxide FeSOD_A0A0F6MY72|unreviewed|Superoxide 271.0 15 | 
FeSOD_A0A6L8W9C4|unreviewed|Superoxide FeSOD_A0A2E1VW30|unreviewed|Superoxide 258.0 16 | FeSOD_A0A2H0YVA1|unreviewed|Superoxide FeSOD_A0A2H0YVA1|unreviewed|Superoxide 410.0 17 | FeSOD_A0A2H0YVA1|unreviewed|Superoxide FeSOD_A0A2M8Q2V9|unreviewed|Superoxide 208.0 18 | FeSOD_A0A2H0YVA1|unreviewed|Superoxide FeSOD_G8R729|unreviewed|Superoxide 192.0 19 | FeSOD_R7J7P3|unreviewed|Superoxide FeSOD_R7J7P3|unreviewed|Superoxide 400.0 20 | FeSOD_R7J7P3|unreviewed|Superoxide FeSOD_A0A2N5YS14|unreviewed|Superoxide 209.0 21 | FeSOD_R7J7P3|unreviewed|Superoxide FeSOD_A0A2E1VW30|unreviewed|Superoxide 199.0 22 | FeSOD_B8LFE6|unreviewed|Superoxide FeSOD_B8LFE6|unreviewed|Superoxide 434.0 23 | FeSOD_B8LFE6|unreviewed|Superoxide FeSOD_A0A067LT26|unreviewed|Superoxide 267.0 24 | FeSOD_B8LFE6|unreviewed|Superoxide FeSOD_A0A1C0AS03|unreviewed|Superoxide 264.0 25 | FeSOD_A0A1C0AS03|unreviewed|Superoxide FeSOD_A0A1C0AS03|unreviewed|Superoxide 426.0 26 | FeSOD_A0A1C0AS03|unreviewed|Superoxide FeSOD_A0A067LT26|unreviewed|Superoxide 275.0 27 | FeSOD_A0A1C0AS03|unreviewed|Superoxide FeSOD_B8LFE6|unreviewed|Superoxide 264.0 28 | FeSOD_A0A2M8Q2V9|unreviewed|Superoxide FeSOD_A0A2M8Q2V9|unreviewed|Superoxide 414.0 29 | FeSOD_A0A2M8Q2V9|unreviewed|Superoxide FeSOD_A0A538G8K1|unreviewed|Superoxide 279.0 30 | FeSOD_A0A2M8Q2V9|unreviewed|Superoxide FeSOD_G8R729|unreviewed|Superoxide 261.0 31 | FeSOD_A0A2E1VW30|unreviewed|Superoxide FeSOD_A0A2E1VW30|unreviewed|Superoxide 398.0 32 | FeSOD_A0A2E1VW30|unreviewed|Superoxide FeSOD_A0A6L8W9C4|unreviewed|Superoxide 258.0 33 | FeSOD_A0A2E1VW30|unreviewed|Superoxide FeSOD_A0A0F6MY72|unreviewed|Superoxide 239.0 34 | FeSOD_A0A1V4UH29|unreviewed|Superoxide FeSOD_A0A1V4UH29|unreviewed|Superoxide 424.0 35 | FeSOD_A0A1V4UH29|unreviewed|Superoxide FeSOD_B8LFE6|unreviewed|Superoxide 177.0 36 | FeSOD_A0A1V4UH29|unreviewed|Superoxide FeSOD_A0A067LT26|unreviewed|Superoxide 171.0 37 | FeSOD_A0A7V9SPC9|unreviewed|Superoxide FeSOD_A0A7V9SPC9|unreviewed|Superoxide 430.0 38 | 
FeSOD_A0A7V9SPC9|unreviewed|Superoxide FeSOD_A0A067LT26|unreviewed|Superoxide 244.0 39 | FeSOD_A0A7V9SPC9|unreviewed|Superoxide FeSOD_A0A1C0AS03|unreviewed|Superoxide 234.0 40 | FeSOD_R7F5H2|unreviewed|Superoxide FeSOD_R7F5H2|unreviewed|Superoxide 413.0 41 | FeSOD_R7F5H2|unreviewed|Superoxide FeSOD_A0A1V4UH29|unreviewed|Superoxide 149.0 42 | FeSOD_R7F5H2|unreviewed|Superoxide FeSOD_A0A7V9SPC9|unreviewed|Superoxide 140.0 43 | FeSOD_A0A2N5YS14|unreviewed|Superoxide FeSOD_A0A2N5YS14|unreviewed|Superoxide 402.0 44 | FeSOD_A0A2N5YS14|unreviewed|Superoxide FeSOD_A0A6L8W9C4|unreviewed|Superoxide 237.0 45 | FeSOD_A0A2N5YS14|unreviewed|Superoxide FeSOD_A0A2E1RF15|unreviewed|Superoxide 233.0 46 | FeSOD_A0A0F6MY72|unreviewed|Superoxide FeSOD_A0A0F6MY72|unreviewed|Superoxide 417.0 47 | FeSOD_A0A0F6MY72|unreviewed|Superoxide FeSOD_A0A6L8W9C4|unreviewed|Superoxide 275.0 48 | FeSOD_A0A0F6MY72|unreviewed|Superoxide FeSOD_A0A2E1VW30|unreviewed|Superoxide 243.0 49 | FeSOD_A0A060HP82|unreviewed|Superoxide FeSOD_A0A060HP82|unreviewed|Superoxide 451.0 50 | FeSOD_A0A060HP82|unreviewed|Superoxide FeSOD_A0A2M8Q2V9|unreviewed|Superoxide 226.0 51 | FeSOD_A0A060HP82|unreviewed|Superoxide FeSOD_G8R729|unreviewed|Superoxide 208.0 52 | FeSOD_A0A4P7WS39|unreviewed|Superoxide FeSOD_A0A4P7WS39|unreviewed|Superoxide 419.0 53 | FeSOD_A0A4P7WS39|unreviewed|Superoxide FeSOD_A0A2E1RF15|unreviewed|Superoxide 234.0 54 | FeSOD_A0A4P7WS39|unreviewed|Superoxide FeSOD_A0A0F6MY72|unreviewed|Superoxide 208.0 55 | FeSOD_A0A076JJX0|unreviewed|Superoxide FeSOD_A0A076JJX0|unreviewed|Superoxide 429.0 56 | FeSOD_A0A076JJX0|unreviewed|Superoxide FeSOD_A0A067LT26|unreviewed|Superoxide 271.0 57 | FeSOD_A0A076JJX0|unreviewed|Superoxide FeSOD_A0A1C0AS03|unreviewed|Superoxide 248.0 58 | FeSOD_G8R729|unreviewed|Superoxide FeSOD_G8R729|unreviewed|Superoxide 422.0 59 | FeSOD_G8R729|unreviewed|Superoxide FeSOD_A0A2M8Q2V9|unreviewed|Superoxide 271.0 60 | FeSOD_G8R729|unreviewed|Superoxide FeSOD_A0A538G8K1|unreviewed|Superoxide 
231.0 61 | -------------------------------------------------------------------------------- /test/data/scorefull.dense.hdf5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nebiolabs/domainator/663454806768f7462b33ade914684c6b3487dab2/test/data/scorefull.dense.hdf5 -------------------------------------------------------------------------------- /test/data/scorefull.tsv: -------------------------------------------------------------------------------- 1 | 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 2 | FeSOD_A0A1F4ZT98|unreviewed|Superoxide 410.0 120.0 169.0 199.0 161.0 189.0 157.0 132.0 134.0 216.0 165.0 119.0 112.0 107.0 160.0 151.0 178.0 147.0 127.0 187.0 3 | FeSOD_A0A067LT26|unreviewed|Superoxide 125.0 429.0 136.0 153.0 130.0 119.0 111.0 271.0 279.0 162.0 128.0 174.0 247.0 138.0 135.0 115.0 166.0 130.0 277.0 153.0 4 | FeSOD_A0A2E1RF15|unreviewed|Superoxide 168.0 131.0 405.0 186.0 239.0 150.0 187.0 130.0 136.0 221.0 232.0 140.0 119.0 97.4 228.0 227.0 154.0 228.0 130.0 215.0 5 | FeSOD_A0A538G8K1|unreviewed|Superoxide 199.0 149.0 187.0 425.0 180.0 183.0 139.0 150.0 166.0 287.0 159.0 125.0 167.0 91.3 169.0 174.0 205.0 148.0 148.0 229.0 6 | FeSOD_A0A6L8W9C4|unreviewed|Superoxide 162.0 125.0 240.0 181.0 411.0 134.0 174.0 128.0 128.0 191.0 258.0 131.0 110.0 86.7 234.0 271.0 144.0 201.0 115.0 189.0 7 | FeSOD_A0A2H0YVA1|unreviewed|Superoxide 191.0 117.0 153.0 184.0 138.0 410.0 134.0 135.0 138.0 208.0 126.0 124.0 120.0 109.0 158.0 144.0 175.0 132.0 127.0 192.0 8 | FeSOD_R7J7P3|unreviewed|Superoxide 161.0 110.0 192.0 143.0 178.0 136.0 400.0 109.0 104.0 154.0 199.0 122.0 89.7 104.0 209.0 171.0 119.0 191.0 108.0 152.0 9 | FeSOD_B8LFE6|unreviewed|Superoxide 130.0 267.0 130.0 150.0 128.0 132.0 105.0 434.0 264.0 162.0 122.0 175.0 214.0 109.0 129.0 120.0 158.0 121.0 239.0 150.0 10 | FeSOD_A0A1C0AS03|unreviewed|Superoxide 134.0 275.0 137.0 166.0 128.0 136.0 100.0 264.0 426.0 172.0 127.0 167.0 232.0 130.0 
124.0 128.0 169.0 118.0 249.0 159.0 11 | FeSOD_A0A2M8Q2V9|unreviewed|Superoxide 209.0 149.0 214.0 279.0 184.0 199.0 142.0 155.0 165.0 414.0 180.0 135.0 155.0 108.0 196.0 203.0 221.0 163.0 139.0 261.0 12 | FeSOD_A0A2E1VW30|unreviewed|Superoxide 165.0 124.0 233.0 160.0 258.0 125.0 196.0 122.0 127.0 188.0 398.0 124.0 101.0 73.6 219.0 239.0 151.0 199.0 115.0 185.0 13 | FeSOD_A0A1V4UH29|unreviewed|Superoxide 120.0 171.0 142.0 127.0 131.0 123.0 119.0 177.0 169.0 143.0 126.0 424.0 162.0 150.0 137.0 117.0 140.0 126.0 169.0 131.0 14 | FeSOD_A0A7V9SPC9|unreviewed|Superoxide 115.0 244.0 122.0 170.0 112.0 119.0 87.8 216.0 234.0 166.0 103.0 163.0 430.0 141.0 122.0 118.0 145.0 118.0 222.0 148.0 15 | FeSOD_R7F5H2|unreviewed|Superoxide 108.0 135.0 99.8 93.2 87.4 109.0 103.0 111.0 132.0 116.0 75.9 149.0 140.0 413.0 106.0 69.7 115.0 110.0 135.0 115.0 16 | FeSOD_A0A2N5YS14|unreviewed|Superoxide 163.0 134.0 233.0 172.0 237.0 159.0 207.0 133.0 127.0 206.0 222.0 139.0 124.0 108.0 402.0 223.0 144.0 198.0 132.0 185.0 17 | FeSOD_A0A0F6MY72|unreviewed|Superoxide 155.0 114.0 231.0 178.0 275.0 146.0 170.0 124.0 131.0 215.0 243.0 119.0 120.0 71.2 224.0 417.0 137.0 207.0 112.0 190.0 18 | FeSOD_A0A060HP82|unreviewed|Superoxide 177.0 158.0 152.0 202.0 141.0 171.0 113.0 155.0 166.0 226.0 148.0 137.0 140.0 110.0 137.0 130.0 451.0 122.0 158.0 208.0 19 | FeSOD_A0A4P7WS39|unreviewed|Superoxide 152.0 129.0 234.0 153.0 205.0 134.0 191.0 126.0 122.0 174.0 203.0 130.0 121.0 112.0 200.0 208.0 129.0 419.0 138.0 176.0 20 | FeSOD_A0A076JJX0|unreviewed|Superoxide 126.0 271.0 130.0 147.0 114.0 124.0 103.0 238.0 248.0 146.0 114.0 166.0 219.0 132.0 128.0 107.0 160.0 132.0 429.0 151.0 21 | FeSOD_G8R729|unreviewed|Superoxide 190.0 150.0 218.0 231.0 191.0 192.0 151.0 152.0 161.0 271.0 187.0 133.0 148.0 115.0 183.0 188.0 213.0 173.0 155.0 422.0 22 | -------------------------------------------------------------------------------- /test/data/simple_genpept.gb: 
-------------------------------------------------------------------------------- 1 | LOCUS pDONR201_1 35 aa UNK 19-OCT-2021 2 | DEFINITION . 3 | ACCESSION pDONR201_1 4 | VERSION pDONR201_1 5 | KEYWORDS . 6 | SOURCE . 7 | ORGANISM . 8 | . 9 | FEATURES Location/Qualifiers 10 | ORIGIN 11 | 1 fpalspdsvd nritasqeef vetqkghpsg wpsa* 12 | // 13 | LOCUS pDONR201_2 102 aa UNK 19-OCT-2021 14 | DEFINITION . 15 | ACCESSION pDONR201_2 16 | VERSION pDONR201_2 17 | KEYWORDS . 18 | SOURCE . 19 | ORGANISM . 20 | . 21 | FEATURES Location/Qualifiers 22 | Domainator 2..100 23 | /program="hmmsearch" 24 | /name="CcdB" 25 | /description="CcdB protein" 26 | /evalue="1.1e-32" 27 | /score="103.0" 28 | /cds_id="958_-1_1264" 29 | /database="Pfam-A" 30 | ORIGIN 31 | 1 mqfkvytykr esryrlfvdv qsdiidtpgr rmviplasar llsdkvsrel ypvvhigdes 32 | 61 wrmmttdmas vpvsvigeev adlshrendi knainlmfwg i* 33 | // 34 | LOCUS pDONR201_3 42 aa UNK 19-OCT-2021 35 | DEFINITION . 36 | ACCESSION pDONR201_3 37 | VERSION pDONR201_3 38 | KEYWORDS . 39 | SOURCE . 40 | ORGANISM . 41 | . 42 | FEATURES Location/Qualifiers 43 | Domainator 1..41 44 | /program="hmmsearch" 45 | /name="CcdA" 46 | /description="Post-segregation antitoxin CcdA" 47 | /evalue="3.8e-16" 48 | /score="50.1" 49 | /cds_id="1265_-1_1391" 50 | /database="Pfam-A" 51 | ORIGIN 52 | 1 mqnearrlra erwkaenqeg maevarfiem ngsfadenrd w* 53 | // 54 | LOCUS pDONR201_4 220 aa UNK 19-OCT-2021 55 | DEFINITION . 56 | ACCESSION pDONR201_4 57 | VERSION pDONR201_4 58 | KEYWORDS . 59 | SOURCE . 60 | ORGANISM . 61 | . 
62 | FEATURES Location/Qualifiers 63 | Domainator 11..212 64 | /program="hmmsearch" 65 | /name="CAT" 66 | /description="Chloramphenicol acetyltransferase" 67 | /evalue="9.7e-102" 68 | /score="329.7" 69 | /cds_id="1605_-1_2265" 70 | /database="Pfam-A" 71 | Domainator 2..33 72 | /program="hmmsearch" 73 | /name="Condensation" 74 | /description="Condensation domain" 75 | /evalue="0.00015" 76 | /score="11.2" 77 | /cds_id="1605_-1_2265" 78 | /database="Pfam-A" 79 | Domainator 6..36 80 | /program="hmmsearch" 81 | /name="2-oxoacid_dh" 82 | /description="2-oxoacid dehydrogenases acyltransferase 83 | (catalytic domain)" 84 | /evalue="0.0037" 85 | /score="7.3" 86 | /cds_id="1605_-1_2265" 87 | /database="Pfam-A" 88 | Domainator 137..196 89 | /program="hmmsearch" 90 | /name="2-oxoacid_dh" 91 | /description="2-oxoacid dehydrogenases acyltransferase 92 | (catalytic domain)" 93 | /evalue="0.0047" 94 | /score="7.0" 95 | /cds_id="1605_-1_2265" 96 | /database="Pfam-A" 97 | Domainator 139..159 98 | /program="hmmsearch" 99 | /name="Condensation" 100 | /description="Condensation domain" 101 | /evalue="0.81" 102 | /score="-1.2" 103 | /cds_id="1605_-1_2265" 104 | /database="Pfam-A" 105 | ORIGIN 106 | 1 mekkitgytt vdisqwhrke hfeafqsvaq ctynqtvqld itaflktvkk nkhkfypafi 107 | 61 hilarlmnah pefrmamkdg elviwdsvhp cytvfheqte tfsslwseyh ddfrqflhiy 108 | 121 sqdvacygen layfpkgfie nmffvsanpw vsftsfdlnv anmdnffapv ftmgkyytqg 109 | 181 dkvlmplaiq vhhavcdgfh vgrmlnelqq ycdewqgga* 110 | // 111 | LOCUS pDONR201_5 254 aa UNK 19-OCT-2021 112 | DEFINITION . 113 | ACCESSION pDONR201_5 114 | VERSION pDONR201_5 115 | KEYWORDS . 116 | SOURCE . 117 | ORGANISM . 118 | . 
119 | FEATURES Location/Qualifiers 120 | Domainator 48..238 121 | /program="hmmsearch" 122 | /name="APH" 123 | /description="Phosphotransferase enzyme family" 124 | /evalue="2e-28" 125 | /score="90.5" 126 | /cds_id="2915_1_3677" 127 | /database="Pfam-A" 128 | Domainator 174..218 129 | /program="hmmsearch" 130 | /name="TCAD9" 131 | /description="Ternary complex associated domain 9" 132 | /evalue="0.00029" 133 | /score="10.5" 134 | /cds_id="2915_1_3677" 135 | /database="Pfam-A" 136 | ORIGIN 137 | 1 mdadlygykw ardnvgqsga tiyrlygkpd apelflkhgk gsvandvtde mvrlnwltef 138 | 61 mplptikhfi rtpddawllt taipgktafq vleeypdsge nivdalavfl rrlhsipvcn 139 | 121 cpfnsdrvfr laqaqsrmnn glvdasdfdd erngwpveqv wkemhkllpf spdsvvthgd 140 | 181 fsldnlifde gkligcidvg rvgiadryqd lailwnclge fspslqkrlf qkygidnpdm 141 | 241 nklqfhlmld eff* 142 | // 143 | -------------------------------------------------------------------------------- /test/data/simple_genpept_contigs.txt: -------------------------------------------------------------------------------- 1 | pDONR201_1 2 | pDONR201_5 3 | -------------------------------------------------------------------------------- /test/data/simple_genpept_quote_name.gb: -------------------------------------------------------------------------------- 1 | LOCUS pDONR201_1 35 aa UNK 19-OCT-2021 2 | DEFINITION . 3 | ACCESSION pDONR201_1 4 | VERSION pDONR201_1 5 | KEYWORDS . 6 | SOURCE . 7 | ORGANISM . 8 | . 9 | FEATURES Location/Qualifiers 10 | ORIGIN 11 | 1 fpalspdsvd nritasqeef vetqkghpsg wpsa* 12 | // 13 | LOCUS pDONR201_2 102 aa UNK 19-OCT-2021 14 | DEFINITION . 15 | ACCESSION pDONR201_2 16 | VERSION pDONR201_2 17 | KEYWORDS . 18 | SOURCE . 19 | ORGANISM . 20 | . 
21 | FEATURES Location/Qualifiers 22 | Domainator 2..100 23 | /program="phmmer" 24 | /name="""CcdB""" 25 | /description="CcdB protein" 26 | /evalue="1.1e-32" 27 | /score="103.0" 28 | /cds_id="958_-1_1264" 29 | /database="Pfam-A" 30 | ORIGIN 31 | 1 mqfkvytykr esryrlfvdv qsdiidtpgr rmviplasar llsdkvsrel ypvvhigdes 32 | 61 wrmmttdmas vpvsvigeev adlshrendi knainlmfwg i* 33 | // 34 | -------------------------------------------------------------------------------- /test/data/ssn_FeSOD_clusters.tsv: -------------------------------------------------------------------------------- 1 | FeSOD_A0A1F4ZT98|unreviewed|Superoxide 1 2 | FeSOD_A0A067LT26|unreviewed|Superoxide 2 3 | FeSOD_A0A2E1RF15|unreviewed|Superoxide 1 4 | FeSOD_A0A538G8K1|unreviewed|Superoxide 1 5 | FeSOD_A0A6L8W9C4|unreviewed|Superoxide 1 6 | FeSOD_A0A2H0YVA1|unreviewed|Superoxide 1 7 | FeSOD_R7J7P3|unreviewed|Superoxide 1 8 | FeSOD_B8LFE6|unreviewed|Superoxide 2 9 | FeSOD_A0A1C0AS03|unreviewed|Superoxide 2 10 | FeSOD_A0A2M8Q2V9|unreviewed|Superoxide 1 11 | FeSOD_A0A2E1VW30|unreviewed|Superoxide 1 12 | FeSOD_A0A1V4UH29|unreviewed|Superoxide 2 13 | FeSOD_A0A7V9SPC9|unreviewed|Superoxide 2 14 | FeSOD_R7F5H2|unreviewed|Superoxide 3 15 | FeSOD_A0A2N5YS14|unreviewed|Superoxide 1 16 | FeSOD_A0A0F6MY72|unreviewed|Superoxide 1 17 | FeSOD_A0A060HP82|unreviewed|Superoxide 1 18 | FeSOD_A0A4P7WS39|unreviewed|Superoxide 1 19 | FeSOD_A0A076JJX0|unreviewed|Superoxide 2 20 | FeSOD_G8R729|unreviewed|Superoxide 1 21 | -------------------------------------------------------------------------------- /test/data/ssn_FeSOD_clusters_header.tsv: -------------------------------------------------------------------------------- 1 | contig cluster 2 | FeSOD_A0A1F4ZT98|unreviewed|Superoxide 1 3 | FeSOD_A0A067LT26|unreviewed|Superoxide 2 4 | FeSOD_A0A2E1RF15|unreviewed|Superoxide 1 5 | FeSOD_A0A538G8K1|unreviewed|Superoxide 1 6 | FeSOD_A0A6L8W9C4|unreviewed|Superoxide 1 7 | FeSOD_A0A2H0YVA1|unreviewed|Superoxide 1 8 | 
FeSOD_R7J7P3|unreviewed|Superoxide 1 9 | FeSOD_B8LFE6|unreviewed|Superoxide 2 10 | FeSOD_A0A1C0AS03|unreviewed|Superoxide 2 11 | FeSOD_A0A2M8Q2V9|unreviewed|Superoxide 1 12 | FeSOD_A0A2E1VW30|unreviewed|Superoxide 1 13 | FeSOD_A0A1V4UH29|unreviewed|Superoxide 2 14 | FeSOD_A0A7V9SPC9|unreviewed|Superoxide 2 15 | FeSOD_R7F5H2|unreviewed|Superoxide 3 16 | FeSOD_A0A2N5YS14|unreviewed|Superoxide 1 17 | FeSOD_A0A0F6MY72|unreviewed|Superoxide 1 18 | FeSOD_A0A060HP82|unreviewed|Superoxide 1 19 | FeSOD_A0A4P7WS39|unreviewed|Superoxide 1 20 | FeSOD_A0A076JJX0|unreviewed|Superoxide 2 21 | FeSOD_G8R729|unreviewed|Superoxide 1 22 | -------------------------------------------------------------------------------- /test/data/swissprot_CuSOD_subset.fasta: -------------------------------------------------------------------------------- 1 | >sp|P0AGD1|SODC_ECOLI Superoxide dismutase [Cu-Zn] OS=Escherichia coli (strain K12) OX=562 GN=sodC PE=1 SV=1 2 | MKRFSLAILALVVATGAQAASEKVEMNLVTSQGVGQSIGSVTITETDKGLEFSPDLKALP 3 | PGEHGFHIHAKGSCQPATKDGKASAAESAGGHLDPQNTGKHEGPEGAGHLGDLPALVVNN 4 | DGKATDAVIAPRLKSLDEIKDKALMVHVGGDNMSDQPKPLGGGGERYACGVIK 5 | >sp|O31851|YOJM_BACSU Superoxide dismutase-like protein YojM OS=Bacillus subtilis (strain 168) OX=1423 GN=yojM PE=1 SV=1 6 | MHRLLLLMMLTALGVAGCGQKKPPDPPNRVPEKKVVETSAFGHHVQLVNREGKAVGFIEI 7 | KESDDEGLDIHISANSLRPGASLGFHIYEKGSCVRPDFESAGGPFNPLNKEHGFNNPMGH 8 | HAGDLPNLEVGADGKVDVIMNAPDTSLKKGSKLNILDEDGSAFIIHEQADDYLTNPSGNS 9 | GARIVCGALLGNNEKQ -------------------------------------------------------------------------------- /test/data/taxdmp/delnodes.dmp: -------------------------------------------------------------------------------- 1 | 1985417 | -------------------------------------------------------------------------------- /test/data/taxdmp/merged.dmp: -------------------------------------------------------------------------------- 1 | 17 | 561 | -------------------------------------------------------------------------------- /test/data/taxdmp/nodes.dmp: 
-------------------------------------------------------------------------------- 1 | 1 | 1 | no rank | | 8 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | | 2 | 131567 | 1 | no rank | | 8 | 1 | 1 | 1 | 0 | 1 | 1 | 0 | | 3 | 2 | 131567 | superkingdom | | 0 | 0 | 11 | 0 | 0 | 0 | 0 | 0 | | 4 | 2759 | 131567 | superkingdom | | 0 | 0 | 11 | 0 | 0 | 0 | 0 | 0 | | 5 | 1224 | 2 | phylum | | 0 | 1 | 11 | 1 | 0 | 1 | 0 | 0 | | 6 | 1236 | 1224 | class | | 0 | 1 | 11 | 1 | 0 | 1 | 0 | 0 | | 7 | 91347 | 1236 | order | | 0 | 1 | 11 | 1 | 0 | 1 | 0 | 0 | | 8 | 543 | 91347 | family | | 0 | 1 | 11 | 1 | 0 | 1 | 0 | 0 | | 9 | 561 | 543 | genus | | 0 | 1 | 11 | 1 | 0 | 1 | 0 | 0 | | 10 | 17 | 543 | genus | | 8 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | | 11 | 562 | 17 | species | EC | 0 | 1 | 11 | 1 | 0 | 1 | 1 | 0 | | 12 | 1783272 | 2 | clade | | 0 | 1 | 11 | 1 | 0 | 1 | 1 | 0 | | 13 | 1239 | 1783272 | phylum | | 0 | 1 | 11 | 1 | 0 | 1 | 0 | 0 | | 14 | 91061 | 1239 | class | | 0 | 1 | 11 | 1 | 0 | 1 | 0 | 0 | | 15 | 1385 | 91061 | order | | 0 | 1 | 11 | 1 | 0 | 1 | 0 | 0 | | 16 | 186817 | 1385 | family | | 0 | 1 | 11 | 1 | 0 | 1 | 0 | 0 | | 17 | 1386 | 186817 | genus | | 0 | 1 | 11 | 1 | 0 | 1 | 0 | 0 | | 18 | 653685 | 1386 | species group | | 0 | 1 | 11 | 1 | 0 | 1 | 1 | 0 | | 19 | 1423 | 653685 | species | BS | 0 | 1 | 11 | 1 | 0 | 1 | 1 | 0 | | 20 | -------------------------------------------------------------------------------- /test/data/taxdmp/taxdump.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nebiolabs/domainator/663454806768f7462b33ade914684c6b3487dab2/test/data/taxdmp/taxdump.tar.gz -------------------------------------------------------------------------------- /test/data/test_matrix.dense.hdf5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nebiolabs/domainator/663454806768f7462b33ade914684c6b3487dab2/test/data/test_matrix.dense.hdf5 
def compare_files(f1, f2, skip_lines=0):
    """Assert that two text files have identical content.

    Args:
        f1: Path of the first file.
        f2: Path of the second file.
        skip_lines: Number of leading lines read and discarded from both
            files before the remaining content is compared.

    Raises:
        AssertionError: If the remaining content of the two files differs.
    """
    with open(f1, "r") as newfile, open(f2, "r") as oldfile:
        for _ in range(skip_lines):
            newfile.readline()
            oldfile.readline()
        assert newfile.read() == oldfile.read()


def compare_iterables(i1, i2):
    """Assert that two iterables yield equal items, in order, and have equal length.

    A plain ``zip`` stops at the shorter input, so a missing or extra trailing
    item would go unnoticed; pairing with a private sentinel makes a length
    mismatch fail as well.

    Raises:
        AssertionError: If the iterables differ in length or in any item.
    """
    from itertools import zip_longest

    exhausted = object()  # cannot compare equal to a real item by accident
    for index, (a, b) in enumerate(zip_longest(i1, i2, fillvalue=exhausted)):
        assert a is not exhausted and b is not exhausted, (
            f"iterables differ in length (first unpaired item at index {index})"
        )
        assert a == b, f"items differ at index {index}: {a!r} != {b!r}"


def compare_seqfiles(gb1, gb2, format="genbank", skip_attrs=(), skip_qualifiers=()):
    """Assert that two sequence files contain pairwise-equivalent records.

    Both files are parsed completely with ``SeqIO.parse``; the record counts
    must match and each pair of records is checked with compare_seqrecords.

    Args:
        gb1: First sequence file, passed to ``SeqIO.parse``.
        gb2: Second sequence file, passed to ``SeqIO.parse``.
        format: Format name handed to ``SeqIO.parse``.
        skip_attrs: Record attribute names to leave out of the comparison.
        skip_qualifiers: Feature qualifier names to leave out of the comparison.

    Raises:
        AssertionError: If the record counts or any pair of records differ.
    """
    # Imported here because this is the only helper that needs the parser;
    # the other helpers stay usable without it.
    from domainator.Bio import SeqIO

    recs1 = list(SeqIO.parse(gb1, format))
    recs2 = list(SeqIO.parse(gb2, format))
    assert len(recs1) == len(recs2)
    for rec1, rec2 in zip(recs1, recs2):
        compare_seqrecords(rec1, rec2, skip_attrs=skip_attrs, skip_qualifiers=skip_qualifiers)


def compare_seqrecords(rec1, rec2, skip_attrs=(), skip_qualifiers=()):
    """Assert that two sequence records are equivalent.

    Compared: the ``seq``, ``id``, ``description`` and ``name`` attributes
    (minus ``skip_attrs``), ``letter_annotations``, every annotation of
    ``rec1`` except ``"date"``, the number of features, and, feature by
    feature, every qualifier of ``rec1``'s feature (minus ``skip_qualifiers``).

    NOTE(review): the check is one-directional. Annotations and qualifiers
    that exist only on ``rec2`` are not examined, and feature types and
    locations are not compared.

    Args:
        rec1: Record whose attributes, annotations and qualifiers drive the comparison.
        rec2: Record compared against ``rec1``.
        skip_attrs: Iterable of attribute names to ignore.
        skip_qualifiers: Iterable of qualifier names to ignore.

    Raises:
        AssertionError: If any compared value differs. For attribute
            mismatches the attribute name and both records are appended to
            the exception's ``args``.
        KeyError: If ``rec2`` lacks an annotation or qualifier that ``rec1`` has.
    """
    skip_qualifiers = set(skip_qualifiers)
    attrs = {"seq", "id", "description", "name"} - set(skip_attrs)

    for attr in attrs:
        try:
            assert getattr(rec1, attr) == getattr(rec2, attr)
        except AssertionError as e:
            # Attach context so the failure report names the attribute.
            e.args += (attr, rec1, rec2)
            raise

    assert rec1.letter_annotations == rec2.letter_annotations
    for k in rec1.letter_annotations:
        assert rec1.letter_annotations[k] == rec2.letter_annotations[k]
    for k in rec1.annotations:
        if k != "date":
            assert rec1.annotations[k] == rec2.annotations[k]
    assert len(rec1.features) == len(rec2.features)

    for feature1, feature2 in zip(rec1.features, rec2.features):
        for qualifier in feature1.qualifiers:
            if qualifier in skip_qualifiers:
                continue
            try:
                assert feature1.qualifiers[qualifier] == feature2.qualifiers[qualifier], f"qualifiers not equal in: {rec1}, {rec2}"
            except (AssertionError, KeyError):
                # Print both sides before re-raising. Use .get() for the
                # second feature so a missing qualifier does not raise a
                # second time inside the handler.
                print(f"{feature1}, {feature2}")
                print(f"{feature1.qualifiers[qualifier]}, {feature2.qualifiers.get(qualifier)}")
                raise
def test_overlaps():
    """
    Check CompoundLocation.overlaps on compound locations assembled from six
    end-to-end FeatureLocations (1-10, 10-40, 40-50, 50-70, 70-90, 90-110).
    """
    bounds = [(1, 10), (10, 40), (40, 50), (50, 70), (70, 90), (90, 110)]
    loc1, loc2, loc3, loc4, loc5, loc6 = [FeatureLocation(start, end) for start, end in bounds]

    cloc1 = CompoundLocation([loc1, loc2, loc3, loc4, loc5, loc6])
    cloc2 = CompoundLocation([loc1, loc2, loc3])
    cloc3 = CompoundLocation([loc4, loc5, loc6])
    cloc4 = CompoundLocation([loc2, loc3, loc4])
    cloc5 = CompoundLocation([loc3, loc4, loc5])
    cloc6 = CompoundLocation([loc2, loc3, loc5])
    cloc7 = CompoundLocation([loc1, loc6])

    # (left, right, expected result of left.overlaps(right))
    expectations = [
        (cloc1, cloc2, True),
        (cloc1, cloc3, True),
        (cloc2, cloc3, False),
        (cloc2, cloc4, True),
        (cloc3, cloc4, True),
        (cloc3, cloc5, True),
        (cloc4, cloc5, True),
        (cloc4, cloc6, True),
        (cloc5, cloc6, True),
        (cloc1, cloc7, True),
        (cloc2, cloc7, True),
        (cloc3, cloc7, True),
        (cloc4, cloc7, False),
        (cloc5, cloc7, False),
        (cloc6, cloc7, False),
    ]
    for left, right, expected in expectations:
        assert left.overlaps(right) == expected
import re 8 | 9 | @pytest.mark.parametrize("input_file,expected_output", 10 | [ 11 | ["FeSOD_dist.tsv","ssn_FeSOD.xgmml"], 12 | ["FeSOD_dist.sparse.hdf5","ssn_FeSOD.sparse.xgmml"], 13 | ["FeSOD_dist.dense.hdf5","ssn_FeSOD.xgmml"] 14 | ]) 15 | def test_build_ssn(input_file, expected_output, shared_datadir): 16 | with tempfile.TemporaryDirectory() as output_dir: 17 | # output_dir = "test_out" 18 | metadata = str(shared_datadir / "FeSOD_metadata.tsv") 19 | out_clusters = output_dir + f"/{input_file}_out_clusters.tsv" 20 | out_cytoscape = output_dir + f"/{input_file}_out.xgmml" 21 | build_ssn.main(["-i", str(shared_datadir / input_file),"--xgmml", out_cytoscape, "--lb", "175", "--color_by", "SSN_cluster", "--cluster_tsv", out_clusters, "--no_cluster_header", "--metadata", metadata]) 22 | assert Path(out_cytoscape).is_file() 23 | assert Path(out_clusters).is_file() 24 | compare_files(out_clusters,shared_datadir/'ssn_FeSOD_clusters.tsv') 25 | compare_files(out_cytoscape, shared_datadir/expected_output, skip_lines=2) 26 | 27 | @pytest.mark.parametrize("input_file,expected_output", 28 | [ 29 | ["FeSOD_dist.tsv","ssn_FeSOD.xgmml"], 30 | ]) 31 | def test_build_ssn_2(input_file, expected_output, shared_datadir): 32 | with tempfile.TemporaryDirectory() as output_dir: 33 | # output_dir = "test_out" 34 | metadata = str(shared_datadir / "FeSOD_metadata.tsv") 35 | out_clusters = output_dir + f"/{input_file}_out_clusters.tsv" 36 | out_cytoscape = output_dir + f"/{input_file}_out.xgmml" 37 | build_ssn.main(["-i", str(shared_datadir / input_file),"--xgmml", out_cytoscape, "--lb", "175", "--color_by", "SSN_cluster", "--cluster_tsv", out_clusters, "--metadata", metadata]) 38 | assert Path(out_cytoscape).is_file() 39 | assert Path(out_clusters).is_file() 40 | compare_files(out_clusters,shared_datadir/'ssn_FeSOD_clusters_header.tsv') 41 | compare_files(out_cytoscape, shared_datadir/expected_output, skip_lines=2) 42 | 43 | 44 | def test_build_ssn_3(shared_datadir): 45 | input_file = 
"FeSOD_dist.tsv" 46 | with tempfile.TemporaryDirectory() as output_dir: 47 | # output_dir = "test_out" 48 | metadata = str(shared_datadir / "FeSOD_metadata.tsv") 49 | out_clusters = output_dir + f"/{input_file}_out_clusters.tsv" 50 | out_cytoscape = output_dir + f"/{input_file}_out.xgmml" 51 | build_ssn.main(["-i", str(shared_datadir / input_file),"--xgmml", out_cytoscape, "--lb", "175", "--color_by", "SSN_cluster", 52 | "--cluster_tsv", out_clusters, "--metadata", metadata, "--color_table_out", output_dir + "/color_table.tsv"]) 53 | assert Path(out_cytoscape).is_file() 54 | assert Path(out_clusters).is_file() 55 | assert Path(output_dir + "/color_table.tsv").is_file() 56 | compare_files(out_clusters,shared_datadir/'ssn_FeSOD_clusters_header.tsv') 57 | compare_files(out_cytoscape, shared_datadir/"ssn_FeSOD.xgmml", skip_lines=2) 58 | 59 | color_table_dict = {} 60 | with open(output_dir + "/color_table.tsv", "r") as f: 61 | for line in f: 62 | domain, color = line.strip().split("\t") 63 | color_table_dict[domain] = color 64 | assert len(color_table_dict) == 3 65 | assert set(color_table_dict.keys()) == {"1","2","3"} 66 | assert all([re.match(r"#[0-9a-fA-F]{6}",x) for x in color_table_dict.values()]) 67 | assert len(set(color_table_dict.values())) == 3 68 | 69 | def test_build_ssn_4(shared_datadir): 70 | input_file = "FeSOD_dist.tsv" 71 | with tempfile.TemporaryDirectory() as output_dir: 72 | # output_dir = "test_out" 73 | metadata = str(shared_datadir / "FeSOD_metadata.tsv") 74 | out_clusters = output_dir + f"/{input_file}_out_clusters.tsv" 75 | out_cytoscape = output_dir + f"/{input_file}_out.xgmml" 76 | build_ssn.main(["-i", str(shared_datadir / input_file),"--xgmml", out_cytoscape, "--lb", "175", "--color_by", "SSN_cluster", 77 | "--cluster_tsv", out_clusters, "--metadata", metadata, "--color_table_out", output_dir + "/color_table.tsv", "--color_table", str(shared_datadir / "color_table_123.tsv")]) 78 | assert Path(out_cytoscape).is_file() 79 | assert 
Path(out_clusters).is_file() 80 | assert Path(output_dir + "/color_table.tsv").is_file() 81 | compare_files(out_clusters,shared_datadir/'ssn_FeSOD_clusters_header.tsv') 82 | compare_files(output_dir + "/color_table.tsv", shared_datadir/"color_table_123.tsv") 83 | -------------------------------------------------------------------------------- /test/test_build_tree.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import tempfile 3 | from domainator.build_tree import get_newick 4 | from domainator import build_tree 5 | from helpers import compare_files 6 | 7 | # Simple tree structure for testing purposes 8 | class TreeNode: 9 | def __init__(self, id, dist=0): 10 | self.id = id 11 | self.dist = dist 12 | self.left = None 13 | self.right = None 14 | 15 | def is_leaf(self): 16 | return self.left is None and self.right is None 17 | 18 | def get_left(self): 19 | return self.left 20 | 21 | def get_right(self): 22 | return self.right 23 | 24 | @pytest.fixture 25 | def leaf_names(): 26 | return ['A', 'B', 'C', 'D', 'E'] 27 | 28 | @pytest.fixture 29 | def tree(): 30 | root = TreeNode(-1, 4) 31 | root.left = TreeNode(-1, 2) 32 | root.right = TreeNode(-1, 1) 33 | root.left.left = TreeNode(0, 0) 34 | root.left.right = TreeNode(1, 0) 35 | root.right.left = TreeNode(2, 0) 36 | root.right.right = TreeNode(3, 0) 37 | return root 38 | 39 | def test_get_newick(tree, leaf_names): 40 | result = get_newick(tree, tree.dist, leaf_names) 41 | assert result == '((D:1.00,C:1.00):3.00,(B:2.00,A:2.00):2.00);' 42 | 43 | def test_get_newick_single_node(): 44 | single_node_tree = TreeNode(0, 0) 45 | leaf_names = ['A'] 46 | result = get_newick(single_node_tree, 0, leaf_names) 47 | assert result == '(A:0.00);' 48 | 49 | def test_get_newick_single_level(leaf_names): 50 | root = TreeNode(-1, 3) 51 | root.left = TreeNode(0, 0) 52 | root.right = TreeNode(1, 0) 53 | result = get_newick(root, root.dist, leaf_names) 54 | assert result == 
'(B:3.00,A:3.00);' 55 | 56 | def test_get_newick_with_empty_leaf_names(tree, leaf_names): 57 | result = get_newick(tree, tree.dist, [''] * len(leaf_names)) 58 | assert result == '((:1.00,:1.00):3.00,(:2.00,:2.00):2.00);' 59 | 60 | 61 | def test_newick_output(shared_datadir): 62 | 63 | with tempfile.TemporaryDirectory() as output_dir: 64 | # output_dir = "test_out" 65 | newick_out = str(output_dir + "/test.newick") 66 | xgmm_out = str(output_dir + "/test.xgmml") 67 | metadata = str(shared_datadir / 'FeSOD_metadata.tsv') 68 | 69 | build_tree.main(['--input', str(shared_datadir / 'FeSOD_score_dist.tsv'), '--newick', newick_out, '--xgmml', xgmm_out, "--metadata", metadata]) 70 | compare_files(newick_out, str(shared_datadir / 'FeSOD_score_dist.newick')) 71 | compare_files(xgmm_out, str(shared_datadir / 'FeSOD_score_dist.xgmml')) 72 | -------------------------------------------------------------------------------- /test/test_color_table_to_legend.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | from domainator import color_table_to_legend 3 | 4 | def test_color_table_legend(shared_datadir, tmp_path): 5 | svg_file = tmp_path / "test.svg" 6 | title = "Test Legend" 7 | 8 | color_table_to_legend.main(["-i", str(shared_datadir / "color_specification.tsv"), "--svg", str(svg_file), "--title", title]) 9 | """ 10 | CcdB #ff0000 11 | APH #00ff00 12 | CAT #0000ff 13 | Condensation #ff00ff 14 | 2-oxoacid_dh #ffffff 15 | 16 | """ 17 | 18 | with open(svg_file, "r") as f: 19 | # read entire file into string 20 | text = f.read() 21 | 22 | assert "Test Legend" in text 23 | assert "#FF0000" in text 24 | assert "#00FF00" in text 25 | assert "#0000FF" in text 26 | assert "#FF00FF" in text 27 | assert "#FFFFFF" in text 28 | assert "CcdB" in text 29 | assert "APH" in text 30 | assert "CAT" in text 31 | assert "Condensation" in text 32 | assert "2-oxoacid_dh" in text 33 | 34 | 
-------------------------------------------------------------------------------- /test/test_data_matrix.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | warnings.filterwarnings("ignore", module='numpy') 3 | import pytest 4 | from domainator.data_matrix import DataMatrix 5 | import scipy.sparse 6 | import numpy as np 7 | import pytest_datadir 8 | 9 | # Test initialization of DataMatrix 10 | def test_init(): 11 | # Test case 1: Initialize with data 12 | data = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) 13 | row_names = ['A', 'B', 'C'] 14 | col_names = ['X', 'Y', 'Z'] 15 | matrix = DataMatrix(data, row_names, col_names) 16 | assert matrix.shape == (3, 3) 17 | assert matrix.size == 9 18 | assert not matrix.sparse 19 | assert matrix.rows == row_names 20 | assert matrix.columns == col_names 21 | assert matrix.row_lengths is None 22 | assert matrix.column_lengths is None 23 | assert matrix.data_type == "" 24 | 25 | # Test case 2: Initialize without data 26 | matrix = DataMatrix() 27 | assert matrix.shape == (0, 0) 28 | assert matrix.size == 0 29 | assert not matrix.sparse 30 | assert matrix.rows is None 31 | assert matrix.columns is None 32 | assert matrix.row_lengths is None 33 | assert matrix.column_lengths is None 34 | assert matrix.data_type == "" 35 | 36 | # Test from_file method of DataMatrix 37 | 38 | @pytest.mark.parametrize("filename,sparse", 39 | [ 40 | ("test_matrix.dense.hdf5",False), 41 | ("test_matrix.dense.tsv",False), 42 | ("test_matrix.sparse.hdf5",True) 43 | ]) 44 | def test_from_file(shared_datadir, filename, sparse): 45 | # Test case 1: Read dense matrix from file 46 | matrix_file = shared_datadir / filename 47 | matrix = DataMatrix.from_file(matrix_file) 48 | assert matrix.shape == (3, 3) 49 | assert matrix.size == 9 50 | assert matrix.sparse is sparse 51 | assert matrix.rows == ['A', 'B', 'C'] 52 | assert matrix.columns == ['X', 'Y', 'Z'] 53 | assert matrix.row_lengths is None 54 | assert 
matrix.column_lengths is None 55 | assert matrix.data_type == "" 56 | 57 | 58 | # Test convert_to_sparse method of DataMatrix 59 | def test_convert_to_sparse(): 60 | # Test case 1: Convert dense matrix to sparse 61 | data = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) 62 | row_names = ['A', 'B', 'C'] 63 | col_names = ['X', 'Y', 'Z'] 64 | matrix = DataMatrix(data, row_names, col_names) 65 | matrix.convert_to_sparse() 66 | assert matrix.sparse 67 | assert matrix.data.shape == (3, 3) 68 | 69 | # Test case 2: Convert already sparse matrix to sparse 70 | matrix = DataMatrix() 71 | matrix.sparse = True 72 | matrix.data = scipy.sparse.csr_matrix([[1, 0, 0], [0, 2, 0], [0, 0, 3]]) 73 | matrix.convert_to_sparse() 74 | assert matrix.sparse 75 | assert matrix.data.shape == (3, 3) 76 | -------------------------------------------------------------------------------- /test/test_domainator_db_download.py: -------------------------------------------------------------------------------- 1 | from domainator import domainator_db_download 2 | import tempfile 3 | from domainator import utils 4 | from pathlib import Path 5 | 6 | 7 | 8 | def test_uniprot_download_fasta(shared_datadir): 9 | with tempfile.TemporaryDirectory() as output_dir: 10 | # output_dir = "test_out" 11 | outfile = Path(output_dir) / "uniprot_sprot.fasta" 12 | domainator_db_download.main(["--db", "swissprot", "--num_recs", "2", "--output", str(outfile)]) 13 | assert outfile.exists() 14 | assert outfile.stat().st_size > 0 15 | recs = list(utils.parse_seqfiles([str(outfile)])) 16 | assert len(recs) == 2 17 | 18 | def test_uniprot_download_genbank(shared_datadir): 19 | with tempfile.TemporaryDirectory() as output_dir: 20 | # output_dir = "test_out" 21 | outfile = Path(output_dir) / "uniprot_sprot.gb" 22 | domainator_db_download.main(["--db", "swissprot_gb", "--num_recs", "2", "--output", str(outfile)]) 23 | assert outfile.exists() 24 | assert outfile.stat().st_size > 0 25 | recs = list(utils.parse_seqfiles([str(outfile)])) 
26 | assert len(recs) == 2 27 | 28 | #TODO: add tests for genbank downloads 29 | 30 | 31 | 32 | def test_genbank_download_genbank_1(shared_datadir): 33 | small_genbanks=['https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/008/766/775/GCA_008766775.1_ASM876677v1','https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/031/580/395/GCA_031580395.1_ASM3158039v1'] 34 | with tempfile.TemporaryDirectory() as output_dir: 35 | # output_dir = "test_out" 36 | outfile = Path(output_dir) / "gb.gb" 37 | domainator_db_download.process_genbank_accessions([{'ftp_path':small_genbank} for small_genbank in small_genbanks], outfile, gene_call=None, num_recs=1, cpus=3) 38 | assert outfile.exists() 39 | # read output file 40 | recs = list(utils.parse_seqfiles([str(outfile)])) 41 | assert len(recs) == 1 42 | 43 | def test_genbank_download_genbank_2(shared_datadir): 44 | small_genbanks=['https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/008/766/775/GCA_008766775.1_ASM876677v1','https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/031/580/395/GCA_031580395.1_ASM3158039v1'] 45 | with tempfile.TemporaryDirectory() as output_dir: 46 | # output_dir = "test_out" 47 | outfile = Path(output_dir) / "gb.gb" 48 | domainator_db_download.process_genbank_accessions([{'ftp_path':small_genbank} for small_genbank in small_genbanks], outfile, gene_call=None, num_recs=None, cpus=3) 49 | assert outfile.exists() 50 | # read output file 51 | recs = list(utils.parse_seqfiles([str(outfile)])) 52 | assert len(recs) == 2 53 | 54 | def test_genbank_download_genbank_3(shared_datadir): 55 | small_genbanks=['https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/008/766/775/GCA_008766775.1_ASM876677v1', 'https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/002/987/885/GCA_002987885.1_ASM298788v1'] 56 | with tempfile.TemporaryDirectory() as output_dir: 57 | #output_dir = "test_out" 58 | outfile = Path(output_dir) / "gb.gb" 59 | domainator_db_download.process_genbank_accessions([{'ftp_path':small_genbank} for small_genbank in small_genbanks], outfile, 
gene_call="all", num_recs=None, cpus=2) 60 | assert outfile.exists() 61 | # read output file 62 | recs = list(utils.parse_seqfiles([str(outfile)])) 63 | assert len(recs) == 2 64 | outfile_text = outfile.read_text() 65 | assert "CDS" in outfile_text 66 | assert '/gene_id="AM260465_1"' in outfile_text 67 | 68 | # def test_genbank_download_genbank_skipped_record_log(shared_datadir): 69 | # small_genbanks=['https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/008/766/775/GCA_008766775.1_ASM87667v1', 'https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/002/987/885/GCA_002987885.1_ASM298788v1'] 70 | # with tempfile.TemporaryDirectory() as output_dir: 71 | # #output_dir = "test_out" 72 | # outfile = Path(output_dir) / "gb.gb" 73 | # skipped_record_log = Path(output_dir) / "skipped_record_log.txt" 74 | # domainator_db_download.process_genbank_accessions([{'ftp_path':small_genbank} for small_genbank in small_genbanks], outfile, gene_call="all", num_recs=None, cpus=2, skipped_record_log=skipped_record_log) 75 | # assert outfile.exists() 76 | # # read output file 77 | # recs = list(utils.parse_seqfiles([str(outfile)])) 78 | # assert len(recs) == 1 79 | # outfile_text = outfile.read_text() 80 | # assert "CDS" in outfile_text 81 | # assert '/gene_id="AM260465_1"' in outfile_text 82 | # assert skipped_record_log.exists() 83 | # skipped_record_log_text = skipped_record_log.read_text() 84 | # assert "GCA_008766775.1_ASM87667v1" in skipped_record_log_text 85 | # assert "GCA_002987885.1_ASM298788v1" not in skipped_record_log_text -------------------------------------------------------------------------------- /test/test_extract_unannotated.py: -------------------------------------------------------------------------------- 1 | import tempfile 2 | from domainator.Bio import SeqIO 3 | from domainator import extract_unannotated 4 | from domainator.utils import parse_seqfiles, DomainatorCDS 5 | import pytest 6 | from io import StringIO 7 | import sys 8 | import subprocess 9 | 10 | 11 | def 
test_extract_unannotated_1(shared_datadir): 12 | with tempfile.TemporaryDirectory() as output_dir: 13 | #output_dir = "test_out" 14 | out = output_dir + "/extraction.gb" 15 | extract_unannotated.main(["-i", str(shared_datadir / "simple_genpept.gb"), "-o", out]) 16 | # assert 0 17 | seqs = list(parse_seqfiles([out])) 18 | assert len(seqs) == 8 19 | [str(seq) for seq in seqs] == ["FPALSPDSVDNRITASQEEFVETQKGHPSGWPSA*","M","I*","*","M","DEWQGGA*","MDADLYGYKWARDNVGQSGATIYRLYGKPDAPELFLKHGKGSVANDV","DMNKLQFHLMLDEFF*"] 20 | 21 | def test_extract_unannotated_largest_keep_name_2(shared_datadir): 22 | with tempfile.TemporaryDirectory() as output_dir: 23 | #output_dir = "test_out" 24 | out = output_dir + "/extraction.gb" 25 | extract_unannotated.main(["-i", str(shared_datadir / "simple_genpept.gb"), "-o", out, "--largest", "--keep_name"]) 26 | seqs = list(parse_seqfiles([out])) 27 | assert len(seqs) == 5 28 | [str(seq) for seq in seqs] == ["FPALSPDSVDNRITASQEEFVETQKGHPSGWPSA*","I*","*","DEWQGGA*","MDADLYGYKWARDNVGQSGATIYRLYGKPDAPELFLKHGKGSVANDV"] 29 | 30 | def test_extract_unannotated_lb_2(shared_datadir): 31 | with tempfile.TemporaryDirectory() as output_dir: 32 | #output_dir = "test_out" 33 | out = output_dir + "/extraction.gb" 34 | extract_unannotated.main(["-i", str(shared_datadir / "simple_genpept.gb"), "-o", out, "--lb", "10"]) 35 | seqs = list(parse_seqfiles([out])) 36 | assert len(seqs) == 3 37 | [str(seq) for seq in seqs] == ["FPALSPDSVDNRITASQEEFVETQKGHPSGWPSA*", "MDADLYGYKWARDNVGQSGATIYRLYGKPDAPELFLKHGKGSVANDV", "DMNKLQFHLMLDEFF*"] -------------------------------------------------------------------------------- /test/test_hmmer_build.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import tempfile 3 | import os 4 | from io import BytesIO, StringIO 5 | from pyhmmer import easel 6 | from pyhmmer.plan7 import HMM 7 | from domainator.hmmer_build import hmmer_build, main 8 | 9 | 10 | @pytest.fixture 11 | def 
msa_content(): 12 | return b">seq1\nACGTACGT\n>seq2\nACGTACGA\n" 13 | 14 | def create_msa_file(content): 15 | with tempfile.NamedTemporaryFile(delete=False, mode="wb") as f: 16 | f.write(content) 17 | return f.name 18 | 19 | @pytest.fixture 20 | def msa_file(msa_content): 21 | file_path = create_msa_file(msa_content) 22 | yield file_path 23 | os.unlink(file_path) 24 | 25 | def test_hmmer_build_with_required_params(): 26 | msa_content = b">seq1\nACGTACGT\n>seq2\nACGTACGA\n" 27 | msa_file = create_msa_file(msa_content) 28 | 29 | with open(msa_file, "rb") as file: 30 | hmm = hmmer_build(file, name="test_profile") 31 | assert isinstance(hmm, HMM) 32 | assert hmm.name.decode() == "test_profile" 33 | 34 | os.unlink(msa_file) 35 | 36 | def test_hmmer_build_with_optional_params(): 37 | msa_content = b">seq1\nACGTACGT\n>seq2\nACGTACGA\n" 38 | msa_file = create_msa_file(msa_content) 39 | 40 | with open(msa_file, "rb") as file: 41 | hmm = hmmer_build(file, name="test_profile", acc="P12345", desc="Test profile description", alphabet=easel.Alphabet.dna()) 42 | assert isinstance(hmm, HMM) 43 | assert hmm.name.decode() == "test_profile" 44 | assert hmm.accession.decode() == "P12345" 45 | assert hmm.description.decode() == "Test profile description" 46 | 47 | os.unlink(msa_file) 48 | 49 | def test_hmmer_build_with_binaryio(): 50 | msa_content = b">seq1\nACGTACGT\n>seq2\nACGTACGA\n" 51 | msa_file = BytesIO(msa_content) 52 | 53 | hmm = hmmer_build(msa_file, name="test_profile", acc="P12345", desc="Test profile description", alphabet=easel.Alphabet.dna()) 54 | assert isinstance(hmm, HMM) 55 | assert hmm.name.decode() == "test_profile" 56 | assert hmm.accession.decode() == "P12345" 57 | assert hmm.description.decode() == "Test profile description" 58 | 59 | 60 | def test_main_with_required_params(msa_file, capsys): 61 | main(["--name", "test_profile", "--input", msa_file]) 62 | captured = capsys.readouterr() 63 | assert "HMMER3/f" in captured.out 64 | assert "NAME test_profile" in 
captured.out 65 | 66 | def test_main_with_optional_params(msa_file, capsys): 67 | main(["--name", "test_profile", "--acc", "P12345", "--desc", "Test profile description", "--input", msa_file, "--alphabet", "dna"]) 68 | caputured = capsys.readouterr() 69 | output = caputured.out 70 | assert "HMMER3/f" in output 71 | assert "NAME test_profile" in output 72 | assert "ACC P12345" in output 73 | assert "DESC Test profile description" in output 74 | 75 | -------------------------------------------------------------------------------- /test/test_hmmer_compare.py: -------------------------------------------------------------------------------- 1 | import tempfile 2 | from domainator import hmmer_compare 3 | from helpers import compare_files 4 | 5 | #TODO: better tests! 6 | 7 | def test_hmmer_compare_1(shared_datadir): 8 | 9 | with tempfile.TemporaryDirectory() as output_dir: 10 | # output_dir = "test_out" 11 | out_path = output_dir + f"/out_scores.tsv" 12 | hmmer_compare.main(["-i", str(shared_datadir / "pdonr_hmms.hmm"), "-r", str(shared_datadir / "pdonr_hmms.hmm"), "-o", out_path, "--alignment", "--score_cutoff", "13", "--cpu", "10"]) 13 | compare_files(out_path, shared_datadir / "pDONR_201_hmm_scores.tsv") 14 | 15 | -------------------------------------------------------------------------------- /test/test_hmmer_report.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | import tempfile 3 | from glob import glob 4 | from helpers import compare_files 5 | import pytest 6 | 7 | from domainator import hmmer_report 8 | 9 | def test_hmmer_report_1(shared_datadir): 10 | 11 | with tempfile.TemporaryDirectory() as output_dir: 12 | # output_dir = "test_out" 13 | out = output_dir + "/hmmer_report.tsv" 14 | hmmer_report.main(["-i", str(shared_datadir / "pdonr_hmms.hmm"), "-o", out, '--source', '--acc', '--desc', '--length', '--consensus', '--append', 'one', 'int', '1', '--append', 'two', 'float', '2.0', '--append', 
'three', 'str', 'three']) 15 | assert Path(out).is_file() 16 | 17 | with open(out) as f: 18 | lines = f.readlines() 19 | assert len(lines) == 8 20 | assert lines[0].strip().split("\t") == ["name","source","acc","desc","length","consensus","one","two","three"] 21 | assert lines[1].strip() == "2-oxoacid_dh\tpdonr_hmms\tPF00198.25\t2-oxoacid dehydrogenases acyltransferase (catalytic domain)\t233\teqeeervplsgirkaiakrlteskqeiphftlsdevdvtallalrkelkedeakeekakltlldflikavalAlkefPelnasvdeeekeivlkkhvniGvAvatprGLlvPviknadkkslleiakelkelaeraregklkpedleggtftisNlGmlGvtsftPiinppqvaIlgvgrikerpvvkegelvarkvmplslsaDHRvidGaeaarFlntlkkllenpeelll\t1\t2.0\tthree" 22 | assert lines[-1].strip() == "TCAD9\tpdonr_hmms\tPF19974.1\tTernary complex associated domain 9\t437\tdqvevvrvLtgGrSGaqVlevtvfvkeknqalrhVlKigsaseiakEweAyqrliqpllnalfatIiavsesvlengdqvldelgavvYshagqfagepgeklrsLedlfqealrgpeaadravallerlletllnllYagateeplqtlreelnsrLGpdlvvevkevdseqlvvypdDllqakmssysaseynskvagilvsvelsrlevkvrgprlsavdddvrvevllsggalseleeqgdefleGsvvatranlrlrllkeledelvleetllevdglqlahPfaalrsaLtealearvtssvHGDLNprNiLlaeedrvyLIDfartreggpllsDlAwLevnLlrtvladrldlqellrLqrlLalasrllelealaealagesealakafrllaaiRrfarkqyplerrelwwreylaaLllaahrtLk\t1\t2.0\tthree" 23 | -------------------------------------------------------------------------------- /test/test_hmmer_search.py: -------------------------------------------------------------------------------- 1 | import tempfile 2 | from domainator import hmmer_search 3 | from pathlib import Path 4 | 5 | #TODO: better tests! 
6 | 7 | def test_hmmer_search_1(shared_datadir): 8 | 9 | with tempfile.TemporaryDirectory() as output_dir: 10 | # output_dir = "test_out" 11 | out_path = output_dir + f"/out_scores.hmm" 12 | hmmer_search.main(["-i", str(shared_datadir / "pdonr_hmms_1.hmm"), "-r", str(shared_datadir / "pdonr_hmms.hmm"), "-o", out_path, "--score_cutoff", "13"]) 13 | file_contents = Path(out_path).read_text() 14 | assert "NAME CAT" in file_contents 15 | assert "NAME 2-oxoacid_dh" in file_contents 16 | assert "NAME APH" in file_contents 17 | assert "NAME CcdA" not in file_contents 18 | assert "NAME CcdB" not in file_contents 19 | assert "NAME Condensation" not in file_contents 20 | assert "NAME TCAD9" not in file_contents 21 | 22 | def test_hmmer_search_2(shared_datadir): 23 | 24 | with tempfile.TemporaryDirectory() as output_dir: 25 | # output_dir = "test_out" 26 | out_path = output_dir + f"/out_scores.hmm" 27 | hmmer_search.main(["-i", str(shared_datadir / "pdonr_hmms_1.hmm"), "-r", str(shared_datadir / "pdonr_hmms.hmm"), "-o", out_path, "--score_cutoff", "13", "--max_hits", "2"]) 28 | file_contents = Path(out_path).read_text() 29 | assert "NAME CAT" in file_contents 30 | assert "NAME 2-oxoacid_dh" in file_contents 31 | assert "NAME APH" not in file_contents 32 | assert "NAME CcdA" not in file_contents 33 | assert "NAME CcdB" not in file_contents 34 | assert "NAME Condensation" not in file_contents 35 | assert "NAME TCAD9" not in file_contents 36 | -------------------------------------------------------------------------------- /test/test_hmmer_select.py: -------------------------------------------------------------------------------- 1 | from domainator.hmmer_select import main, hmmer_select 2 | import tempfile 3 | import pyhmmer 4 | import os 5 | 6 | def test_hmmer_select_1(shared_datadir): 7 | with tempfile.TemporaryDirectory() as output_dir: 8 | # output_dir = "test_out" 9 | out = output_dir + "/out.hmm" 10 | main(['--input', str(shared_datadir/"pdonr_hmms.hmm"), "--output", out, 
"--field", "all", "--regex", "dehyd.*"]) 11 | output_hmms = list(pyhmmer.plan7.HMMFile(out)) 12 | assert len(output_hmms) == 1 13 | 14 | def test_hmmer_select_2(shared_datadir): 15 | with tempfile.TemporaryDirectory() as output_dir: 16 | # output_dir = "test_out" 17 | out = output_dir + "/out.hmm" 18 | main(['--input', str(shared_datadir/"pdonr_hmms.hmm"), "--output", out, "--field", "name", "--exact", "TCAD9"]) 19 | output_hmms = list(pyhmmer.plan7.HMMFile(out)) 20 | assert len(output_hmms) == 1 21 | 22 | def test_hmmer_select_3(shared_datadir): 23 | with tempfile.TemporaryDirectory() as output_dir: 24 | # output_dir = "test_out" 25 | out = output_dir + "/out.hmm" 26 | main(['--input', str(shared_datadir/"pdonr_hmms.hmm"), "--output", out, "--field", "name", "--exact", "TCAD"]) 27 | 28 | # check that the file size of out is 0 29 | assert os.path.getsize(out) == 0 30 | def test_hmmer_select_4(shared_datadir): 31 | with tempfile.TemporaryDirectory() as output_dir: 32 | # output_dir = "test_out" 33 | out = output_dir + "/out.hmm" 34 | main(['--input', str(shared_datadir/"pdonr_hmms.hmm"), "--output", out, "--field", "acc", "--contains", "PF19974"]) 35 | 36 | output_hmms = list(pyhmmer.plan7.HMMFile(out)) 37 | assert len(output_hmms) == 1 38 | 39 | 40 | def test_hmmer_select_case_sensitivity_1(shared_datadir): 41 | with tempfile.TemporaryDirectory() as output_dir: 42 | # output_dir = "test_out" 43 | out = output_dir + "/out.hmm" 44 | main(['--input', str(shared_datadir/"pdonr_hmms.hmm"), "--output", out, "--field", "acc", "--contains", "pf19974"]) 45 | 46 | output_hmms = list(pyhmmer.plan7.HMMFile(out)) 47 | assert len(output_hmms) == 1 48 | 49 | def test_hmmer_select_case_sensitivity_2(shared_datadir): 50 | with tempfile.TemporaryDirectory() as output_dir: 51 | # output_dir = "test_out" 52 | out = output_dir + "/out.hmm" 53 | main(['--input', str(shared_datadir/"pdonr_hmms.hmm"), "--output", out, "--field", "acc", "--regex", "pf19974"]) 54 | 55 | output_hmms = 
list(pyhmmer.plan7.HMMFile(out)) 56 | assert len(output_hmms) == 1 57 | 58 | def test_hmmer_select_case_sensitivity_3(shared_datadir): 59 | with tempfile.TemporaryDirectory() as output_dir: 60 | # output_dir = "test_out" 61 | out = output_dir + "/out.hmm" 62 | main(['--input', str(shared_datadir/"pdonr_hmms.hmm"), "--output", out, "--field", "acc", "--contains", "pf19974", "--case_sensitive"]) 63 | 64 | assert os.path.getsize(out) == 0 65 | 66 | def test_hmmer_select_case_sensitivity_4(shared_datadir): 67 | with tempfile.TemporaryDirectory() as output_dir: 68 | # output_dir = "test_out" 69 | out = output_dir + "/out.hmm" 70 | main(['--input', str(shared_datadir/"pdonr_hmms.hmm"), "--output", out, "--field", "acc", "--regex", "pf19974", "--case_sensitive"]) 71 | 72 | assert os.path.getsize(out) == 0 -------------------------------------------------------------------------------- /test/test_matrix_report.py: -------------------------------------------------------------------------------- 1 | from domainator import matrix_report 2 | import tempfile 3 | import pytest 4 | 5 | @pytest.mark.parametrize("input_file", 6 | [ 7 | "scorefull.tsv", 8 | "scorefull.dense.hdf5" 9 | ]) 10 | def test_matrix_report_1(shared_datadir, input_file): 11 | 12 | with tempfile.TemporaryDirectory() as output_dir: 13 | # output_dir = "test_out" 14 | out_html = output_dir + "/matrix_report_test.html" 15 | out_txt = output_dir + "/matrix_report_test.txt" 16 | matrix_report.main(["-i", str(shared_datadir / input_file), "-o", out_txt, "--html", out_html]) 17 | for fh in (out_html, out_txt): 18 | f_txt = open(fh).read() 19 | assert "Matrix Report" in f_txt 20 | assert "Min" in f_txt 21 | assert "152.0" in f_txt 22 | assert "451.0" in f_txt 23 | assert "Total values" in f_txt 24 | assert f_txt.count("400") == 2 25 | 26 | 27 | # def test_matrix_report_empty_input(shared_datadir): 28 | # pass 29 | # #TODO -------------------------------------------------------------------------------- 
# /test/test_ncbi_taxonomy.py:
# --------------------------------------------------------------------------------
from domainator.Taxonomy import NCBITaxonomy
import pytest


def test_NCBItaxonomy(shared_datadir):
    """Check lineage, rank and name lookups for taxids 562 and 1423."""
    tx = NCBITaxonomy(shared_datadir / "taxdmp", overwrite=False)

    assert tx.lineage(562) == [562, 561, 543, 91347, 1236, 1224, 2, 131567, 1]
    assert tx.lineage(1423) == [
        1423, 653685, 1386, 186817, 1385, 91061, 1239, 1783272, 2, 131567, 1,
    ]
    assert tx.rank(562) == "species"
    assert tx.rank(1423) == "species"
    assert tx.name(562) == "Escherichia coli"
    assert tx.name(1423) == "Bacillus subtilis"


def test_NCBItaxonomy_2(shared_datadir):
    """Taxid 1985417 must emit a UserWarning and yield an empty lineage."""
    tx = NCBITaxonomy(shared_datadir / "taxdmp", overwrite=False)
    with pytest.warns(UserWarning):
        assert tx.lineage(1985417) == []


# --------------------------------------------------------------------------------
# /test/test_partition_seqfile.py:
# --------------------------------------------------------------------------------
from math import prod
from domainator.partition_seqfile import main
import tempfile
from glob import glob
from io import StringIO
import pytest


@pytest.mark.parametrize(
    "file,option,value,num_proteins,offsets,recs_to_read",
    [
        ("pDONR201.gb", "--partitions", 2, 3, [0], [1]),
        ("pDONR201.gb", "--cdss_per_partition", 1, 3, [0], [1]),
        ("pDONR201_multigenemark_partition.gb", "--partitions", 2, 24, [0, 16382], [2, 2]),
        ("pDONR201_multigenemark_partition.gb", "--partitions", 1, 24, [0], [4]),
        ("pDONR201_multigenemark_partition.gb", "--partitions", 10, 24, [0, 8191, 16382, 24573], [1, 1, 1, 1]),
        ("pDONR201_multigenemark_partition.gb", "--partitions", 4, 24, [0, 8191, 16382, 24573], [1, 1, 1, 1]),
        ("pDONR201_multigenemark_partition.gb", "--cdss_per_partition", 6, 24, [0, 8191, 16382, 24573], [1, 1, 1, 1]),
        ("pDONR201_multigenemark_partition.gb", "--cdss_per_partition", 7, 24, [0, 16382], [2, 2]),
    ],
)
def test_partition_seqfile(file, num_proteins, option, value, offsets, recs_to_read, shared_datadir):
    """Run partition_seqfile and check the protein count, offsets and record counts it writes.

    The output file is read as: first line = total protein count, then one
    whitespace-separated ``offset records_to_read`` pair per line.
    """
    with tempfile.TemporaryDirectory() as output_dir:
        outfile = output_dir + "/" + "outfile.txt"
        main(["-i", str(shared_datadir / file), "-o", outfile, option, str(value)])

        with open(outfile) as handle:
            proteins = int(handle.readline().strip())
            assert proteins == num_proteins
            fields = [line.split() for line in handle]

        assert [int(parts[0]) for parts in fields] == offsets
        assert [int(parts[1]) for parts in fields] == recs_to_read


# --------------------------------------------------------------------------------
# /test/test_partition_seqids.py:
# --------------------------------------------------------------------------------
from domainator.partition_seqids import partition_seqids
import tempfile
from glob import glob
from io import StringIO
import pytest


@pytest.mark.parametrize(
    "partitions, ids_per_partition, rec_count",
    [
        (1, None, 10),
        (3, None, 10),
        (6, None, 10),
        (9, None, 10),
        (10, None, 10),
        (None, 5, 10),
        (None, 9, 10),
        (None, 2, 10),
        # (1, None, ":memory:", 10),
        # (1, None, ":memory:", 10),
    ],
)
def test_partition_seqids_fasta(partitions, ids_per_partition, rec_count, shared_datadir, capsys):
    """Partition an in-memory FASTA and check file count, bin sizes and id coverage.

    Exactly one of ``partitions`` / ``ids_per_partition`` is set per case. The
    record count is expected on stdout, and every input id must appear in the
    produced ``*.txt`` files.
    """
    names = [f"seq{x}" for x in range(rec_count)]
    fasta_handle = StringIO("\n".join([f">{x}\nMAGICCATS" for x in names]))

    if partitions is not None:
        intended_partitions = partitions
    else:
        # Ceiling division: a partial final bin still needs its own file.
        intended_partitions = -(-rec_count // ids_per_partition)

    with tempfile.TemporaryDirectory() as output_dir:
        output_prefix = output_dir + "/out"
        partition_seqids([fasta_handle], output_prefix, partitions, ids_per_partition, filetype="fasta")
        captured = capsys.readouterr()
        assert captured.out == str(rec_count)

        partition_files = glob(output_prefix + "*.txt")
        assert len(partition_files) <= intended_partitions
        assert len(partition_files) >= intended_partitions // 2

        ids_in_files = set()
        bins_with_too_few = 0
        # A distinct loop name keeps the FASTA handle above from being rebound.
        for id_file in partition_files:
            with open(id_file) as inf:
                ids_here = [line.rstrip() for line in inf]
            ids_in_files.update(ids_here)
            if ids_per_partition is not None:
                assert len(ids_here) <= ids_per_partition
                if len(ids_here) < ids_per_partition:
                    bins_with_too_few += 1

        # At most one (the last) bin may be under-filled.
        assert bins_with_too_few <= 1
        assert len(ids_in_files) == len(names)
        assert len(ids_in_files.intersection(set(names))) == len(ids_in_files)


def test_partition_seqids_genbank(shared_datadir, capsys):
    """Two partitions of the nucleotide GenBank fixture: 24 reported on stdout, two id files."""
    with tempfile.TemporaryDirectory() as output_dir:
        output_prefix = output_dir + "/out"
        partition_seqids(
            [str(shared_datadir / "pDONR201_multi_genemark_domainator.gb")],
            output_prefix, 2, None, filetype="genbank",
        )
        captured = capsys.readouterr()
        assert captured.out == "24"
        assert len(glob(output_prefix + "*.txt")) == 2


def test_partition_seqids_genbank_peptide(shared_datadir, capsys):
    """Two partitions of the FeSOD_20.gb fixture: 20 reported on stdout, two id files."""
    with tempfile.TemporaryDirectory() as output_dir:
        output_prefix = output_dir + "/out"
        partition_seqids(
            [str(shared_datadir / "FeSOD_20.gb")],
            output_prefix, 2, None, filetype="genbank",
        )
        captured = capsys.readouterr()
        assert captured.out == "20"
        assert len(glob(output_prefix + "*.txt")) == 2


# --------------------------------------------------------------------------------
# /test/test_plot_contigs.py:
# --------------------------------------------------------------------------------
from domainator import plot_contigs
import tempfile


def test_plot_contigs_1(shared_datadir):
    """Render MT_nbs.gb to HTML and check the title text, a contig name and no 'source' features."""
    with tempfile.TemporaryDirectory() as output_dir:
        input_path = str(shared_datadir / "MT_nbs.gb")
        output = output_dir + "/contigs.html"
        plot_contigs.main(["-i", input_path, "--html", output])
        # Context manager so the handle is closed before the temp dir is removed.
        with open(output) as handle:
            output_text = handle.read()
        assert "Domainator Contigs Plot" in output_text
        assert "BX548174_369054:361090rc" in output_text
        assert not ('"type": "source"' in output_text)


def test_plot_contigs_2(shared_datadir):
    """Render MT_nbs.gb to HTML with explicit --height/--width options."""
    with tempfile.TemporaryDirectory() as output_dir:
        input_path = str(shared_datadir / "MT_nbs.gb")
        output = output_dir + "/contigs.html"
        plot_contigs.main(["-i", input_path, "--html", output, "--height", "1000", "--width", "800"])
        with open(output) as handle:
            output_text = handle.read()
        # NOTE(review): this dump is truncated here. Original lines 22-57 of
        # test_plot_contigs.py are missing (it looks as if text between "<" and
        # ">" was stripped during extraction - confirm against the repository).
        # The surviving residue is kept verbatim below and must be restored
        # from the real file; the last two asserts belong to a later test
        # whose definition was lost.
        #   22 | assert 'taxid_species: Domainator Contigs Plot" in output_text
        #   58 | assert "FeSOD_A0A1F4ZT98|unreviewed|Superoxide" in output_text
        #   59 | assert not ('"type": "source"' in output_text)


# --------------------------------------------------------------------------------
# /test/test_summary_report.py:
# --------------------------------------------------------------------------------
from domainator import summary_report
import tempfile


def test_contig_stats_1(shared_datadir):
    """Summary of FeSOD_20_pfam.gb restricted to two domains; checks both text and HTML outputs."""
    with tempfile.TemporaryDirectory() as output_dir:
        out_html = output_dir + "/contig_stats_test.html"
        out_txt = output_dir + "/contig_stats_test.txt"
        summary_report.main([
            "-i", str(shared_datadir / "FeSOD_20_pfam.gb"),
            "-o", out_txt, "--html", out_html,
            "--domains", "Sod_Fe_C", "Sod_Fe_N",
        ])
        for report_path in (out_html, out_txt):
            with open(report_path) as handle:
                f_txt = handle.read()
            assert "Domain Stats" in f_txt
            assert "Sod_Fe_C" in f_txt
            assert "avg score" in f_txt
            assert "100.0" in f_txt
            assert "101" in f_txt


def test_contig_stats_empty_input(shared_datadir):
    """An empty GenBank input must still produce reports containing the section headings."""
    with tempfile.TemporaryDirectory() as output_dir:
        out_html = output_dir + "/contig_stats_test.html"
        out_txt = output_dir + "/contig_stats_test.txt"
        summary_report.main([
            "-i", str(shared_datadir / "empty.gb"),
            "-o", out_txt, "--html", out_html,
        ])
        for report_path in (out_html, out_txt):
            with open(report_path) as handle:
                f_txt = handle.read()
            assert "LOCUS" not in f_txt
            assert "avg score" in f_txt
            assert "Domain Stats" in f_txt


def test_contig_stats_taxonomy_1(shared_datadir):
    """Smoke test: the --taxonomy report runs to completion on the CuSOD subset."""
    with tempfile.TemporaryDirectory() as output_dir:
        out_html = output_dir + "/contig_stats_test.html"
        out_txt = output_dir + "/contig_stats_test.txt"
        summary_report.main([
            "-i", str(shared_datadir / "swissprot_CuSOD_subset.fasta"),
            "-o", out_txt, "--html", out_html,
            "--taxonomy", "--ncbi_taxonomy_path", str(shared_datadir / "taxdmp"),
        ])
        # TODO: test output


def test_summary_report_database_1(shared_datadir):
    """With --databases pdonr_hmms_1, only that database name may appear in the reports."""
    with tempfile.TemporaryDirectory() as output_dir:
        out_html = output_dir + "/contig_stats_test.html"
        out_txt = output_dir + "/contig_stats_test.txt"
        summary_report.main([
            "-i", str(shared_datadir / "pDONR201_multi_genemark_domainator_multi_hmm_2.gb"),
            "-o", out_txt, "--html", out_html,
            "--databases", "pdonr_hmms_1",
        ])
        for report_path in (out_html, out_txt):
            with open(report_path) as handle:
                f_txt = handle.read()
            assert "pdonr_hmms_1" in f_txt
            assert "pdonr_hmms_2" not in f_txt


# --------------------------------------------------------------------------------
# /test/test_transform_matrix.py:
# --------------------------------------------------------------------------------
import warnings
warnings.filterwarnings("ignore", module='numpy')
from domainator import transform_matrix
import tempfile
import pandas as pd
import numpy as np
import scipy
import scipy.sparse
from domainator.data_matrix import DataMatrix
from helpers import compare_iterables
import pytest


def compare_data_matrix(matrix_1:DataMatrix, matrix_2:DataMatrix):
    """Assert that two DataMatrix objects agree on labels, lengths and data type.

    NOTE(review): only ``rows``, ``columns``, ``row_lengths``,
    ``column_lengths`` and ``data_type`` are compared. The numeric matrix
    contents are never inspected, so the ``expected`` arrays used by the
    tests below are not actually validated - confirm whether that is intended.
    """
    assert matrix_1.rows == matrix_2.rows
    assert matrix_1.columns == matrix_2.columns
    # Lengths may be array-like, so compare them element-wise through numpy.
    for lengths_1, lengths_2 in (
        (matrix_1.row_lengths, matrix_2.row_lengths),
        (matrix_1.column_lengths, matrix_2.column_lengths),
    ):
        np.testing.assert_array_equal(lengths_1, lengths_2)
    assert matrix_1.data_type == matrix_2.data_type


def _score_fixture():
    """Return the shared 5x5 raw score array with its row/column labels and lengths."""
    scores = np.array([
        [-1, 2, 1, 3, 0],
        [1, 2, 3, 4, 5],
        [5, 4, 3, 2, 1],
        [1, 3, 3, 3, 2],
        [1, 1, 1, 3, 2],
    ])
    rows = ["a", "b", "c", "d", "e"]
    columns = ["A", "B", "C", "D", "E"]
    row_lengths = [1, 2, 3, 4, 5]
    col_lengths = [7, 8, 9, 10, 11]
    return scores, rows, columns, row_lengths, col_lengths


# NOTE(review): the "norm_score" expected array below is identical to the
# "score_dist" array in test_transform_matrix_2 - verify it is not a copy/paste slip.
@pytest.mark.parametrize(
    "output_mode,expected",
    [
        (
            "row_norm_score",
            np.array([
                [-1/3., 2/3., 1/3., 1, 0],
                [1/5., 2/5., 3/5., 4/5., 1],
                [1, 4/5., 3/5., 2/5., 1/5.],
                [1/3., 1, 1, 1, 2/3.],
                [1/3., 1/3., 1/3., 1, 2/3.],
            ]),
        ),
        (
            "norm_score",
            np.array([
                [1 + 1/3., 1/3., 2/3., 0., 1.],
                [4/5., 1/2., 0., 0., 0.],
                [0., 0., 0., 1/2., 4/5.],
                [2/3., 0., 0., 0., 1/3.],
                [2/3., 2/3., 2/3., 0., 1/3.],
            ]),
        ),
        (
            "efi_score",
            np.array([
                [-0., 29.19990958, 14.09725727, 44.15449935, -0.],
                [13.90537175, 28.89887958, 43.89922684, 58.90496914, 73.91507624],
                [73.93527962, 58.82578789, 43.72313559, 28.62587831, 13.53298584],
                [13.60434175, 43.64934937, 43.59819685, 43.55243936, 28.45954689],
                [13.50743174, 13.44943979, 13.39828727, 43.45552935, 28.36263688],
            ]),
        ),
    ],
)
def test_transform_matrix_1(output_mode, expected, shared_datadir):
    """Transform a dense score matrix and compare both dense and sparse outputs to the expectation."""
    with tempfile.TemporaryDirectory() as output_dir:
        scores, rows, columns, row_lengths, col_lengths = _score_fixture()

        input_file = output_dir + "/input.hdf5"
        dense_out = output_dir + "/dense_out.hdf5"
        sparse_out = output_dir + "/sparse_out.hdf5"

        matrix = DataMatrix(scores, rows, columns, row_lengths, col_lengths, "score")
        expected_matrix = DataMatrix(expected, rows, columns, row_lengths, col_lengths, output_mode)
        matrix.write(input_file, "dense")

        cli_args = ["-i", input_file, "--dense", dense_out, "--sparse", sparse_out, "--mode", output_mode]
        transform_matrix.main(cli_args)
        dense_output_matrix = DataMatrix.from_file(dense_out)
        sparse_output_matrix = DataMatrix.from_file(sparse_out)

        compare_data_matrix(dense_output_matrix, expected_matrix)
        # Convert the expectation before comparing it with the sparse output.
        expected_matrix.convert_to_sparse()
        compare_data_matrix(sparse_output_matrix, expected_matrix)


def test_transform_matrix_2(shared_datadir):
    """Transform a dense score matrix in "score_dist" mode and compare the dense output."""
    with tempfile.TemporaryDirectory() as output_dir:
        scores, rows, columns, row_lengths, col_lengths = _score_fixture()

        output_mode = "score_dist"
        expected = np.array([
            [1 + 1/3., 1/3., 2/3., 0., 1.],
            [4/5., 1/2., 0., 0., 0.],
            [0., 0., 0., 1/2., 4/5.],
            [2/3., 0., 0., 0., 1/3.],
            [2/3., 2/3., 2/3., 0., 1/3.],
        ])

        input_file = output_dir + "/input.hdf5"
        dense_out = output_dir + "/dense_out.hdf5"

        matrix = DataMatrix(scores, rows, columns, row_lengths, col_lengths, "score")
        expected_matrix = DataMatrix(expected, rows, columns, row_lengths, col_lengths, output_mode)
        matrix.write(input_file, "dense")

        transform_matrix.main(["-i", input_file, "--dense", dense_out, "--mode", output_mode])
        dense_output_matrix = DataMatrix.from_file(dense_out)

        compare_data_matrix(dense_output_matrix, expected_matrix)


#TODO: test more conversion modes
# --------------------------------------------------------------------------------
# /test/test_trim_contigs.py:
# --------------------------------------------------------------------------------
import tempfile
from domainator.Bio import SeqIO
from domainator import trim_contigs
from domainator.utils import parse_seqfiles, DomainatorCDS
import pytest
from io import StringIO
import sys
import subprocess


def _trim(shared_datadir, input_name, *options):
    """Run trim_contigs on a fixture file and return the GenBank records it wrote.

    The output is parsed into a list before the temporary directory is removed.
    """
    with tempfile.TemporaryDirectory() as output_dir:
        out = output_dir + "/extraction.gb"
        trim_contigs.main(["-i", str(shared_datadir / input_name), "-o", out, *options])
        return list(SeqIO.parse(out, "genbank"))


def test_trim_contigs_cds_both_1(shared_datadir):
    """--cds_both 2 on pDONR201_1 leaves one 1000-long record named for 1266:2265."""
    new_file = _trim(
        shared_datadir, "pDONR201_multi_genemark_domainator.gb",
        "--contigs", "pDONR201_1", "--cds_both", "2",
    )
    assert len(new_file) == 1
    for record in new_file:
        assert len(record) == 1000
        assert record.name == "pDONR201_1_1266:2265"


def test_trim_contigs_cds_both_2(shared_datadir):
    """--cds_both 3 on pDONR201_1 yields no records."""
    new_file = _trim(
        shared_datadir, "pDONR201_multi_genemark_domainator.gb",
        "--contigs", "pDONR201_1", "--cds_both", "3",
    )
    assert len(new_file) == 0


def test_trim_contigs_cds_both_domains_1(shared_datadir):
    """--domains APH with --cds_both 1 leaves one 1307-long record named for 959:2265."""
    new_file = _trim(
        shared_datadir, "pDONR201_multi_genemark_domainator.gb",
        "--domains", "APH", "--contigs", "pDONR201_1", "--cds_both", "1",
    )
    assert len(new_file) == 1
    for record in new_file:
        assert len(record) == 1307
        assert record.name == "pDONR201_1_959:2265"


def test_trim_contigs_domain_expr_2(shared_datadir):
    """A --domain_expr selection with --cds_both 1 leaves one 1000-long record."""
    new_file = _trim(
        shared_datadir, "pDONR201_multi_genemark_domainator.gb",
        "--domain_expr", "(APH & TCAD9) | (CcdB)", "--contigs", "pDONR201_1", "--cds_both", "1",
    )
    assert len(new_file) == 1
    for record in new_file:
        assert len(record) == 1000
        assert record.name == "pDONR201_1_1266:2265"


def test_trim_contigs_no_domain_1(shared_datadir):
    """The same --domain_expr with --no_domain leaves one 1000-long record."""
    new_file = _trim(
        shared_datadir, "pDONR201_multi_genemark_domainator.gb",
        "--domain_expr", "(APH & TCAD9) | (CcdB)", "--contigs", "pDONR201_1", "--no_domain",
    )
    assert len(new_file) == 1
    for record in new_file:
        assert len(record) == 1000
        assert record.name == "pDONR201_1_1266:2265"


def test_trim_contigs_kb_both_1(shared_datadir):
    """--kb_both 2 on pDONR201_1 leaves one 470-long record named for 2001:2470."""
    new_file = _trim(
        shared_datadir, "pDONR201_multi_genemark_domainator.gb",
        "--contigs", "pDONR201_1", "--kb_both", "2",
    )
    assert len(new_file) == 1
    for record in new_file:
        assert len(record) == 470
        assert record.name == "pDONR201_1_2001:2470"


def test_trim_contigs_kb_both_2(shared_datadir):
    """--kb_both 4 on pDONR201_1 yields no records."""
    new_file = _trim(
        shared_datadir, "pDONR201_multi_genemark_domainator.gb",
        "--contigs", "pDONR201_1", "--kb_both", "4",
    )
    assert len(new_file) == 0


def test_trim_contigs_kb_both_peptides_1(shared_datadir):
    """--kb_both 0.010 on the peptide FASTA leaves one record of length 82."""
    new_file = _trim(
        shared_datadir, "pdonr_peptides.fasta",
        "--contigs", "pDONR201_2", "--kb_both", "0.010",
    )
    assert len(new_file) == 1
    assert len(new_file[0]) == 82


# --------------------------------------------------------------------------------
# /test/test_utils.py:
# --------------------------------------------------------------------------------
from domainator import utils
import tempfile
from domainator.Bio import SeqIO, SeqRecord, Seq
import io
import pytest
from array import array
import re


def test_split_string_list():
    """split_string_list returns one list of parts per input string."""
    data = ["abcde asdfasdf asdf", " abcdef::GACAF", ""]
    wanted = (["abcde asdfasdf asdf"], ["abcdef", "GACAF"], [""])
    out = utils.split_string_list(data)
    for position, parts in enumerate(wanted):
        assert out[position] == parts


def test_write_genbank_1(shared_datadir):
    """Smoke test: write_genbank accepts a record whose id, name and description are very long."""
    rec = SeqRecord.SeqRecord(
        Seq.Seq("GACT"),
        id="BIGNAMEBIGNAMEBIGNAMEBIGNAMEBIGNAMEBIGNAMEBIGNAMEBIGNAMEBIGNAMEBIGNAMEBIGNAMEBIGNAMEBIGNAMEBIGNAMEBIGNAME",
        name="BIGNAMEBIGNAMEBIGNAMEBIGNAMEBIGNAMEBIGNAMEBIGNAMEBIGNAMEBIGNAMEBIGNAMEBIGNAMEBIGNAMEBIGNAMEBIGNAMEBIGNAME",
        description="BIGNAMEBIGNAMEBIGNAMEBIGNAMEBIGNAMEBIGNAMEBIGNAMEBIGNAMEBIGNAMEBIGNAMEBIGNAMEBIGNAMEBIGNAMEBIGNAMEBIGNAME",
    )
    sink = io.StringIO()
    utils.write_genbank([rec], sink)


def test_write_genbank_space_name(shared_datadir):
    """Smoke test: write_genbank accepts long id/name/description values that contain a space."""
    rec = SeqRecord.SeqRecord(
        Seq.Seq("GACT"),
        id="BIGNAMEBIGNAMEBIGNAMEBIGNAMEBIGNAMEBIGNAMEBIGNAMEBIGNAMEBIGNAMEBIGNAMEBIGNAMEBIGNAMEBIGNAMEBIGNAMEBIGNAME with_space",
        name="BIGNAMEBIGNAMEBIGNAMEBIGNAMEBIGNAMEBIGNAMEBIGNAMEBIGNAMEBIGNAMEBIGNAMEBIGNAMEBIGNAMEBIGNAMEBIGNAMEBIGNAME with_space",
        description="BIGNAMEBIGNAMEBIGNAMEBIGNAMEBIGNAMEBIGNAMEBIGNAMEBIGNAMEBIGNAMEBIGNAMEBIGNAMEBIGNAMEBIGNAMEBIGNAMEBIGNAME with_space",
    )
    sink = io.StringIO()
    utils.write_genbank([rec], sink)


def test_list_and_file_to_dict_keys(shared_datadir):
    """Keys read from the CcdB.hmm fixture must include 'CcdB'."""
    hmm_path = str(shared_datadir / "CcdB.hmm")
    keys = utils.list_and_file_to_dict_keys(None, hmm_path)
    print(keys)
    assert 'CcdB' in keys


# regions are tuples of start and stop coordinates
# returns true if a fraction of region2 >= min_overlap_fraction overlaps with region1
# coordinates within regions must be sorted low to high
@pytest.mark.parametrize(
    "region1,region2,min_overlap_fraction,expected",
    [
        ((327, 503), (325, 507), 0.6, True),
        ((325, 507), (327, 503), 0.6, True),
    ],
)
def test_regions_overlap(region1, region2, min_overlap_fraction, expected):
    """regions_overlap must return the expected flag for each region pair."""
    overlaps = utils.regions_overlap(region1, region2, min_overlap_fraction)
    assert overlaps == expected


@pytest.mark.parametrize(
    "files,offset,read_count,expected_record_ct,rec0_name",
    [
        (["pDONR201.gb"], 0, 1, 1, "pDONR201"),
        (["pDONR201.gb"], 0, 10, 1, "pDONR201"),
        (["pDONR201.gb"], 0, 0, 0, ""),
        (["pDONR201_multi_genemark.gb", "pDONR201.gb"], 16382, 10, 2, "pDONR201_3"),  # seeks past the end of pDONR201.gb
        # (["simple_genpept_equals_second_line.gb"],0,float("inf"),5,"pDONR201_1") #uncomment to test multiline qualifier name handling
    ],
)
def test_parse_seqfiles(files,offset,read_count,expected_record_ct,rec0_name,shared_datadir):
    """parse_seqfiles with an offset and read count yields the expected number of records."""
    paths = [str(shared_datadir / x) for x in files]
    recs = list(utils.parse_seqfiles(paths, None, None, offset, read_count))
    assert len(recs) == expected_record_ct
    if recs:
        assert recs[0].id == rec0_name


@pytest.mark.parametrize(
    "file,offsets,num_proteins",
    [
        ("pDONR201.gb", [0], [3]),
        ("pDONR201_multigenemark_partition.gb", [0, 8191, 16382, 24573], [6, 6, 6, 6]),
        ("pdonr_peptides.fasta", [0, 50, 169, 226, 465], [1, 1, 1, 1, 1]),
        ("pDONR201_empty.gb", [0], [0]),
        ("simple_genpept.gb", [0, 296, 987, 1620, 3904], [1, 1, 1, 1, 1]),
    ],
)
def test_get_offsets(file,offsets,num_proteins,shared_datadir):
    """get_offsets returns parallel 'Q' arrays of record offsets and per-record protein counts."""
    new_offsets, new_num_proteins = utils.get_offsets(str(shared_datadir / file))
    assert len(new_offsets) == len(new_num_proteins)
    assert new_offsets == array('Q', offsets)
    assert new_num_proteins == array('Q', num_proteins)


def test_get_palette_1():
    """get_palette maps every key to a distinct colour string that starts as #RRGGBB."""
    labels = ["A", "B", "C"]
    palette = utils.get_palette(labels)
    assert set(palette.keys()) == {"A", "B", "C"}
    assert len(palette) == 3
    assert len(set(palette.values())) == 3
    # NOTE(review): re.match only anchors the start, so trailing characters
    # after the six hex digits would still pass - confirm whether that is intended.
    assert all(re.match(r"#[0-9a-fA-F]{6}", x) for x in palette.values())
# --------------------------------------------------------------------------------