├── commec ├── tests │ ├── __init__.py │ ├── test_dbs │ │ ├── biorisk │ │ │ ├── reg_taxids.txt │ │ │ ├── biorisk.hmm.h3f │ │ │ ├── biorisk.hmm.h3i │ │ │ ├── biorisk.hmm.h3m │ │ │ ├── biorisk.hmm.h3p │ │ │ └── biorisk_annotations.csv │ │ ├── low_concern │ │ │ ├── vax_taxids.txt │ │ │ ├── syn_taxids.txt │ │ │ ├── low_concern_annotations.tsv │ │ │ ├── dna │ │ │ │ └── benign.fasta │ │ │ ├── rna │ │ │ │ ├── benign.cm.i1f │ │ │ │ ├── benign.cm.i1i │ │ │ │ ├── benign.cm.i1m │ │ │ │ ├── benign.cm.i1p │ │ │ │ └── benign.cm.exemplar.out │ │ │ └── protein │ │ │ │ ├── benign.hmm.h3f │ │ │ │ ├── benign.hmm.h3i │ │ │ │ ├── benign.hmm.h3m │ │ │ │ ├── benign.hmm.h3p │ │ │ │ └── benign.hmm.exemplar.out │ │ ├── nr_blast │ │ │ ├── nr.pto │ │ │ ├── nr.pdb │ │ │ ├── nr.phr │ │ │ ├── nr.pin │ │ │ ├── nr.ptf │ │ │ ├── nr.pot │ │ │ ├── nr.pjs │ │ │ └── nr.exemplar.out │ │ ├── nt_blast │ │ │ ├── core_nt.nto │ │ │ ├── core_nt.ndb │ │ │ ├── core_nt.nhr │ │ │ ├── core_nt.nin │ │ │ ├── core_nt.nsq │ │ │ ├── core_nt.ntf │ │ │ ├── core_nt.not │ │ │ ├── core_nt.njs │ │ │ └── core_nt.exemplar.out │ │ ├── taxonomy │ │ │ └── taxonomy_placeholder.txt │ │ └── nr_dmnd │ │ │ ├── nr.1.dmnd │ │ │ ├── nr.2.dmnd │ │ │ ├── nr.3.dmnd │ │ │ └── nr.dmnd.exemplar.out │ ├── test_data │ │ ├── has_empty_description.fasta │ │ ├── screen-files │ │ │ ├── biorisk-error-2025-02.screen │ │ │ ├── README.md │ │ │ ├── no-hits-2024-06.screen │ │ │ ├── fast-mode-2025-02.screen │ │ │ ├── prot-error-2024-08.screen │ │ │ ├── prot-mixed-hit-2024-06.screen │ │ │ ├── prot-hit-not-cleared-2024-06.screen │ │ │ ├── prot-multiple-hits-2024-06.screen │ │ │ └── prot-nt-hits-cleared-2024-09.screen │ │ ├── input_has_empty_description │ │ │ └── has_empty_description.cleaned.fasta │ │ ├── single_record.fasta │ │ ├── input_single_record │ │ │ └── single_record.cleaned.fasta │ │ ├── psuedo_query.fasta │ │ ├── has_empty_record.fasta │ │ ├── input_has_empty_record │ │ │ └── has_empty_record.cleaned.fasta │ │ ├── has_records_with_same_description.fasta │ │ ├── multiple_records.fasta │ │ ├── input_multiple_records │ │ │ └── multiple_records.cleaned.fasta │ │ ├── input_has_records_with_same_description │ │ │ └── has_records_with_same_description.cleaned.fasta │ │ └── single_record.transeq.faa │ ├── .pylintrc │ ├── test_rationales.py │ ├── test_flag.py │ ├── test_trim.py │ ├── test_coverage.py │ ├── test_check_biorisk.py │ ├── test_screen_io.py │ ├── test_nc_to_nt.py │ ├── test_split.py │ ├── test_aa_to_nt.py │ ├── test_blast_tools.py │ ├── test_fetch_nc_bits.py │ ├── test_json.py │ ├── test_dbs.py │ └── test_query.py ├── __init__.py ├── config │ ├── constants.py │ ├── screen_tools.py │ └── json_io.py ├── screen-default-config.yaml ├── utils │ ├── concat_seqs.py │ ├── file_utils.py │ ├── template.html │ ├── dict_utils.py │ ├── coordinates.py │ └── logger.py ├── split.py ├── tools │ ├── blastn.py │ ├── blastx.py │ ├── cmscan.py │ ├── fetch_nc_bits.py │ ├── search_handler.py │ └── hmmer.py └── cli.py ├── conftest.py ├── environment.yaml ├── .gitignore ├── .github ├── workflows │ ├── automate_tests.yml │ ├── automate_release.yml │ └── release-version-sha-update.yml └── PULL_REQUEST_TEMPLATE.md ├── example_data ├── screen_pipeline_status.csv ├── output_commec-examples │ ├── commec-examples.low_concern.cmscan │ ├── commec-examples.biorisk.hmmscan │ ├── commec-examples.low_concern.blastn │ ├── commec-examples.low_concern.hmmscan │ └── commec-examples.nt.blastn ├── input_commec-examples │ ├── commec-examples_config.yaml │ └── commec-examples.noncoding.fasta └── README.md ├── 
LICENSE ├── pyproject.toml ├── conda-recipe └── meta.yaml ├── dev_scripts ├── split_fasta.py ├── run_blastx.sh ├── run_diamond.sh ├── summarize_screens.py └── collate-screens.py └── README.md /commec/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /commec/tests/test_dbs/biorisk/reg_taxids.txt: -------------------------------------------------------------------------------- 1 | 12345 -------------------------------------------------------------------------------- /commec/tests/test_dbs/low_concern/vax_taxids.txt: -------------------------------------------------------------------------------- 1 | 11589 -------------------------------------------------------------------------------- /commec/tests/test_dbs/low_concern/syn_taxids.txt: -------------------------------------------------------------------------------- 1 | 394040 2 | 32630 -------------------------------------------------------------------------------- /commec/tests/test_dbs/nr_blast/nr.pto: -------------------------------------------------------------------------------- 1 | 2 |  -------------------------------------------------------------------------------- /commec/tests/test_dbs/nt_blast/core_nt.nto: -------------------------------------------------------------------------------- 1 | 2 |  -------------------------------------------------------------------------------- /commec/tests/test_dbs/low_concern/low_concern_annotations.tsv: -------------------------------------------------------------------------------- 1 | ID Description 2 | Benign1 TEST_BENIGN_DESCRIPTION 3 | -------------------------------------------------------------------------------- /commec/tests/test_dbs/taxonomy/taxonomy_placeholder.txt: -------------------------------------------------------------------------------- 1 | This is an empty file, as we need the taxonomy directory to exist. 
-------------------------------------------------------------------------------- /commec/tests/test_dbs/nr_blast/nr.pdb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ibbis-bio/common-mechanism/HEAD/commec/tests/test_dbs/nr_blast/nr.pdb -------------------------------------------------------------------------------- /commec/tests/test_dbs/nr_blast/nr.phr: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ibbis-bio/common-mechanism/HEAD/commec/tests/test_dbs/nr_blast/nr.phr -------------------------------------------------------------------------------- /commec/tests/test_dbs/nr_blast/nr.pin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ibbis-bio/common-mechanism/HEAD/commec/tests/test_dbs/nr_blast/nr.pin -------------------------------------------------------------------------------- /commec/tests/test_dbs/nr_blast/nr.ptf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ibbis-bio/common-mechanism/HEAD/commec/tests/test_dbs/nr_blast/nr.ptf -------------------------------------------------------------------------------- /commec/tests/test_dbs/nr_dmnd/nr.1.dmnd: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ibbis-bio/common-mechanism/HEAD/commec/tests/test_dbs/nr_dmnd/nr.1.dmnd -------------------------------------------------------------------------------- /commec/tests/test_dbs/nr_dmnd/nr.2.dmnd: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ibbis-bio/common-mechanism/HEAD/commec/tests/test_dbs/nr_dmnd/nr.2.dmnd -------------------------------------------------------------------------------- /commec/tests/test_dbs/nr_dmnd/nr.3.dmnd: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ibbis-bio/common-mechanism/HEAD/commec/tests/test_dbs/nr_dmnd/nr.3.dmnd -------------------------------------------------------------------------------- /commec/tests/test_dbs/nt_blast/core_nt.ndb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ibbis-bio/common-mechanism/HEAD/commec/tests/test_dbs/nt_blast/core_nt.ndb -------------------------------------------------------------------------------- /commec/tests/test_dbs/nt_blast/core_nt.nhr: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ibbis-bio/common-mechanism/HEAD/commec/tests/test_dbs/nt_blast/core_nt.nhr -------------------------------------------------------------------------------- /commec/tests/test_dbs/nt_blast/core_nt.nin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ibbis-bio/common-mechanism/HEAD/commec/tests/test_dbs/nt_blast/core_nt.nin -------------------------------------------------------------------------------- /commec/tests/test_dbs/nt_blast/core_nt.nsq: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ibbis-bio/common-mechanism/HEAD/commec/tests/test_dbs/nt_blast/core_nt.nsq -------------------------------------------------------------------------------- /commec/tests/test_dbs/nt_blast/core_nt.ntf: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/ibbis-bio/common-mechanism/HEAD/commec/tests/test_dbs/nt_blast/core_nt.ntf -------------------------------------------------------------------------------- /commec/tests/test_dbs/biorisk/biorisk.hmm.h3f: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ibbis-bio/common-mechanism/HEAD/commec/tests/test_dbs/biorisk/biorisk.hmm.h3f -------------------------------------------------------------------------------- /commec/tests/test_dbs/biorisk/biorisk.hmm.h3i: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ibbis-bio/common-mechanism/HEAD/commec/tests/test_dbs/biorisk/biorisk.hmm.h3i -------------------------------------------------------------------------------- /commec/tests/test_dbs/biorisk/biorisk.hmm.h3m: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ibbis-bio/common-mechanism/HEAD/commec/tests/test_dbs/biorisk/biorisk.hmm.h3m -------------------------------------------------------------------------------- /commec/tests/test_dbs/biorisk/biorisk.hmm.h3p: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ibbis-bio/common-mechanism/HEAD/commec/tests/test_dbs/biorisk/biorisk.hmm.h3p -------------------------------------------------------------------------------- /commec/tests/test_dbs/low_concern/dna/benign.fasta: -------------------------------------------------------------------------------- 1 | >TEST_BENIGN_FASTA 2 | aaagaggagaaatactagatgaaaaacataaatgccgacgacacatacagaataattaataaaattaaagcttgtagaag 3 | -------------------------------------------------------------------------------- /commec/tests/test_dbs/low_concern/rna/benign.cm.i1f: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ibbis-bio/common-mechanism/HEAD/commec/tests/test_dbs/low_concern/rna/benign.cm.i1f -------------------------------------------------------------------------------- /commec/tests/test_dbs/low_concern/rna/benign.cm.i1i: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ibbis-bio/common-mechanism/HEAD/commec/tests/test_dbs/low_concern/rna/benign.cm.i1i -------------------------------------------------------------------------------- /commec/tests/test_dbs/low_concern/rna/benign.cm.i1m: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ibbis-bio/common-mechanism/HEAD/commec/tests/test_dbs/low_concern/rna/benign.cm.i1m -------------------------------------------------------------------------------- /commec/tests/test_dbs/low_concern/rna/benign.cm.i1p: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ibbis-bio/common-mechanism/HEAD/commec/tests/test_dbs/low_concern/rna/benign.cm.i1p -------------------------------------------------------------------------------- /commec/tests/test_dbs/nr_blast/nr.pot: -------------------------------------------------------------------------------- 1 | 2 |  3 | -------------------------------------------------------------------------------- /commec/tests/test_dbs/low_concern/protein/benign.hmm.h3f: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/ibbis-bio/common-mechanism/HEAD/commec/tests/test_dbs/low_concern/protein/benign.hmm.h3f -------------------------------------------------------------------------------- /commec/tests/test_dbs/low_concern/protein/benign.hmm.h3i: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ibbis-bio/common-mechanism/HEAD/commec/tests/test_dbs/low_concern/protein/benign.hmm.h3i -------------------------------------------------------------------------------- /commec/tests/test_dbs/low_concern/protein/benign.hmm.h3m: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ibbis-bio/common-mechanism/HEAD/commec/tests/test_dbs/low_concern/protein/benign.hmm.h3m -------------------------------------------------------------------------------- /commec/tests/test_dbs/low_concern/protein/benign.hmm.h3p: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ibbis-bio/common-mechanism/HEAD/commec/tests/test_dbs/low_concern/protein/benign.hmm.h3p -------------------------------------------------------------------------------- /commec/tests/test_dbs/nt_blast/core_nt.not: -------------------------------------------------------------------------------- 1 | 2 |  3 | -------------------------------------------------------------------------------- /commec/tests/test_dbs/biorisk/biorisk_annotations.csv: -------------------------------------------------------------------------------- 1 | ID,Description,Must flag 2 | Toxin1, TestBioriskToxinFlag,TRUE 3 | Toxin2, TestBioriskToxin,False 4 | Toxin3, TestBioriskToxin,False -------------------------------------------------------------------------------- /commec/__init__.py: -------------------------------------------------------------------------------- 1 | from importlib.metadata import version, PackageNotFoundError 2 | try: 3 | __version__ = version("commec") 4 | except (ImportError, PackageNotFoundError): 5 | __version__ = "X.X.X" 6 | -------------------------------------------------------------------------------- /commec/tests/test_data/has_empty_description.fasta: -------------------------------------------------------------------------------- 1 | > 2 | gtagacaacaaattcaacaaagaacaacaaaacgcgttctatgagatcttacatttacctaacttaaacgaagaacaacgaaacgccttcatccaaagtttaaaagatgacccaagccaaagcgctaaccttttagcagaagctaaaaagctaaatgatgctcaggcgccgaaa -------------------------------------------------------------------------------- /commec/tests/test_dbs/nr_dmnd/nr.dmnd.exemplar.out: -------------------------------------------------------------------------------- 1 | BBa_K380009_A_20830_Coding_"Protein_A_Z-domain" Pseudogene_E"FakeGeneE_Biorisk"[taxid=560]_1 Pseudogene_E"FakeGeneE_Biorisk"[taxid=560]_1 1.97e-23 74.3 90.9 174 41 172 45 1 0 2 | -------------------------------------------------------------------------------- /commec/tests/test_data/screen-files/biorisk-error-2025-02.screen: -------------------------------------------------------------------------------- 1 | INFO | Validating Inputs... 2 | INFO | >> STEP 1: Checking for biorisk genes... 
3 | ERROR | ...Biorisk annotations file does not exist: commec-dbs/biorisk_db/biorisk_annotations.csv 4 | -------------------------------------------------------------------------------- /commec/tests/test_data/input_has_empty_description/has_empty_description.cleaned.fasta: -------------------------------------------------------------------------------- 1 | > 2 | gtagacaacaaattcaacaaagaacaacaaaacgcgttctatgagatcttacatttacctaacttaaacgaagaacaacgaaacgccttcatccaaagtttaaaagatgacccaagccaaagcgctaaccttttagcagaagctaaaaagctaaatgatgctcaggcgccgaaa 3 | -------------------------------------------------------------------------------- /commec/tests/test_data/single_record.fasta: -------------------------------------------------------------------------------- 1 | >BBa_K380009_A_20830_Coding_"Protein_A_Z-domain" 2 | gtagacaacaaattcaacaaagaacaacaaaacgcgttctatgagatcttacatttacctaacttaaacgaagaacaacgaaacgccttcatccaaagtttaaaagatgacccaagccaaagcgctaaccttttagcagaagctaaaaagctaaatgatgctcaggcgccgaaa -------------------------------------------------------------------------------- /commec/tests/.pylintrc: -------------------------------------------------------------------------------- 1 | [MASTER] 2 | load-plugins=pylint.extensions.docparams 3 | 4 | [MESSAGES CONTROL] 5 | disable= 6 | missing-function-docstring, 7 | missing-class-docstring, 8 | missing-module-docstring 9 | 10 | [DOCSTRING] 11 | ignore-private-members=yes 12 | -------------------------------------------------------------------------------- /commec/tests/test_data/input_single_record/single_record.cleaned.fasta: -------------------------------------------------------------------------------- 1 | >BBa_K380009_A_20830 2 | gtagacaacaaattcaacaaagaacaacaaaacgcgttctatgagatcttacatttacct 3 | aacttaaacgaagaacaacgaaacgccttcatccaaagtttaaaagatgacccaagccaa 4 | agcgctaaccttttagcagaagctaaaaagctaaatgatgctcaggcgccgaaa 5 | -------------------------------------------------------------------------------- /commec/config/constants.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) 2021-2024 International Biosecurity and Biosafety Initiative for Science 3 | 4 | # SCREENING 5 | MINIMUM_QUERY_LENGTH = 41 6 | 7 | # I/O 8 | DEFAULT_CONFIG_YAML_PATH = "screen-default-config.yaml" 9 | MAXIMUM_FILENAME_SIZE = 255 10 | -------------------------------------------------------------------------------- /commec/tests/test_data/psuedo_query.fasta: -------------------------------------------------------------------------------- 1 | > TEST_QUERY_01"TEST01"|taxid=562 2 | AGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTC 3 | > TEST_QUERY_02"TEST01"|taxid=562 4 | AGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTC -------------------------------------------------------------------------------- /conftest.py: -------------------------------------------------------------------------------- 1 | """ 2 | Additional Configurations for Pytest 3 | """ 4 | def pytest_addoption(parser): 5 | """ Adds unique argument to pytest for commec database example outputs generation.""" 6 | print("Test Configuration loaded!") 7 | parser.addoption( 8 | "--gen-examples", action="store_true", default=False, 9 | help="Generate exemplar output files instead of testing against them." 
10 | ) 11 | -------------------------------------------------------------------------------- /commec/tests/test_data/screen-files/README.md: -------------------------------------------------------------------------------- 1 | These screen files each represent different outcomes. Files are named according to the unique outcome they show as well as the date they were generated. 2 | 3 | On a few occasions, lines from multiple screen files produced by the same commec version were combined. 4 | 5 | Future versions of `commec flag` should maintain backwards compatibility with all the files in this directory if possible. 6 | -------------------------------------------------------------------------------- /commec/tests/test_data/has_empty_record.fasta: -------------------------------------------------------------------------------- 1 | >BBa_K205004_A_16908_Coding_"MerT_-_Membranous_Mercury_transporter_" 2 | atgtctgaacctcaaaacgggcgcggggcgctcttcactggcgggctagccgccatcctcgcctcggcttgctgcctggggccgctggttctgatcgccctggggttcagcggcgcttggatcggcaacttgacggtgttggaaccttatcgcccgatcttcatcggcgcggcgttggtggcgctgtttttcgcctggcggcgcatctaccgaccggcgcaagcctgcaaaccaggggatgtgtgtgcgattccccaagtgcgcgctacttacaagctcattttctgggtcgtggccgcgctggttctggtcgcgctcggatttccctacgtcatgccatttttctattaa 3 | >BBa_K380009_A_20830_Coding_"Protein_A_Z-domain" -------------------------------------------------------------------------------- /commec/tests/test_data/input_has_empty_record/has_empty_record.cleaned.fasta: -------------------------------------------------------------------------------- 1 | >BBa_K205004_A_16908_Coding_"MerT_-_Membranous_Mercury_transporter_" 2 | atgtctgaacctcaaaacgggcgcggggcgctcttcactggcgggctagccgccatcctcgcctcggcttgctgcctggggccgctggttctgatcgccctggggttcagcggcgcttggatcggcaacttgacggtgttggaaccttatcgcccgatcttcatcggcgcggcgttggtggcgctgtttttcgcctggcggcgcatctaccgaccggcgcaagcctgcaaaccaggggatgtgtgtgcgattccccaagtgcgcgctacttacaagctcattttctgggtcgtggccgcgctggttctggtcgcgctcggatttccctacgtcatgccatttttctattaa 3 | >BBa_K380009_A_20830_Coding_"Protein_A_Z-domain" 4 | -------------------------------------------------------------------------------- /environment.yaml: -------------------------------------------------------------------------------- 1 | name: commec-dev 2 | channels: 3 | - conda-forge 4 | - bioconda 5 | - defaults 6 | dependencies: 7 | - python>=3.10 8 | # Runtime Python dependencies 9 | - biopython 10 | - numpy 11 | - pandas 12 | - pytaxonkit 13 | - pyyaml 14 | # Runtime non-Python dependencies 15 | - blast 16 | - diamond>=2.1 17 | - hmmer 18 | - infernal 19 | - plotly 20 | - yaml 21 | - mako 22 | - wget 23 | # Development dependencies 24 | - pip 25 | - pytest 26 | - matplotlib 27 | - pip: 28 | - -e . 
29 | -------------------------------------------------------------------------------- /commec/tests/test_dbs/nr_blast/nr.pjs: -------------------------------------------------------------------------------- 1 | { 2 | "version": "1.2", 3 | "dbname": "nr", 4 | "dbtype": "Protein", 5 | "db-version": 5, 6 | "description": "pseudo_queries_aa.fasta", 7 | "number-of-letters": 6970, 8 | "number-of-sequences": 10, 9 | "last-updated": "2024-09-03T00:32:00", 10 | "number-of-volumes": 1, 11 | "bytes-total": 45266, 12 | "bytes-to-cache": 7165, 13 | "files": [ 14 | "nr.pdb", 15 | "nr.phr", 16 | "nr.pin", 17 | "nr.pot", 18 | "nr.psq", 19 | "nr.ptf", 20 | "nr.pto" 21 | ] 22 | } 23 | -------------------------------------------------------------------------------- /commec/tests/test_dbs/nt_blast/core_nt.njs: -------------------------------------------------------------------------------- 1 | { 2 | "version": "1.2", 3 | "dbname": "core_nt", 4 | "dbtype": "Nucleotide", 5 | "db-version": 5, 6 | "description": "pseudo_queries.fasta", 7 | "number-of-letters": 31640, 8 | "number-of-sequences": 10, 9 | "last-updated": "2025-05-21T08:12:00", 10 | "number-of-volumes": 1, 11 | "bytes-total": 46226, 12 | "bytes-to-cache": 8145, 13 | "files": [ 14 | "core_nt.ndb", 15 | "core_nt.nhr", 16 | "core_nt.nin", 17 | "core_nt.not", 18 | "core_nt.nsq", 19 | "core_nt.ntf", 20 | "core_nt.nto" 21 | ] 22 | } 23 | -------------------------------------------------------------------------------- /commec/tests/test_dbs/nr_blast/nr.exemplar.out: -------------------------------------------------------------------------------- 1 | # BLASTX 2.15.0+ 2 | # Query: BBa_K380009_A_20830_Coding_"Protein_A_Z-domain" 3 | # Database: /root/repo/json/common-mechanism/commec/tests/test_dbs/nr_blast/nr 4 | # Fields: query acc., subject title, subject acc., subject tax ids, evalue, bit score, % identity, query length, q. start, q. end, subject length, s. start, s. end 5 | # 1 hits found 6 | BBa_K380009_A_20830_Coding_"Protein_A_Z-domain" Pseudogene_E"FakeGeneE_Biorisk"[taxid=560]_1 Pseudogene_E"FakeGeneE_Biorisk"[taxid=560]_1 0 3.67e-23 73.6 90.909 174 41 172 45 1 44 7 | # BLAST processed 1 queries 8 | -------------------------------------------------------------------------------- /commec/tests/test_dbs/nt_blast/core_nt.exemplar.out: -------------------------------------------------------------------------------- 1 | # BLASTN 2.15.0+ 2 | # Query: BBa_K380009_A_20830_Coding_"Protein_A_Z-domain" 3 | # Database: /root/repo/json/common-mechanism/commec/tests/test_dbs/nt_blast/core_nt 4 | # Fields: query acc., subject title, subject acc., subject tax ids, evalue, bit score, % identity, query length, q. start, q. end, subject length, s. start, s. end 5 | # 1 hits found 6 | BBa_K380009_A_20830_Coding_"Protein_A_Z-domain" Pseudogene_E"FakeGeneE_Biorisk"[taxid=560] Pseudogene_E"FakeGeneE_Biorisk"[taxid=560] 0 4.33e-91 322 100.000 174 1 174 174 1 174 7 | # BLAST processed 1 queries 8 | -------------------------------------------------------------------------------- /commec/tests/test_data/screen-files/no-hits-2024-06.screen: -------------------------------------------------------------------------------- 1 | >> STEP 1: Checking for biorisk genes... 2 | --> Biorisks: no significant hits detected, PASS 3 | STEP 1 completed at 2024-06-24 16:49:29 4 | >> STEP 2: Checking regulated pathogen proteins... 5 | ...no hits 6 | STEP 2 completed at 2024-06-24 16:51:11 7 | >> STEP 3: Checking regulated pathogen nucleotides... 
8 | ...no hits to the nr database 9 | ...no hits 10 | STEP 3 completed at 2024-06-24 16:51:12 11 | >> STEP 4: Checking any pathogen regions for benign components... 12 | ...no regulated regions to clear 13 | >> COMPLETED AT 2024-06-24 16:51:13 -------------------------------------------------------------------------------- /commec/tests/test_data/screen-files/fast-mode-2025-02.screen: -------------------------------------------------------------------------------- 1 | Validating Inputs... 2 | >> STEP 1: Checking for biorisk genes... 3 | --> Biorisks: Regulated genes not found, PASS 4 | 5 | --> Virulence factor found in bases 4 to 471, WARNING 6 | Gene: putative secreted protein-tyrosine phosphatase [Yersinia pestis CO92] 7 | 8 | STEP 1 completed at 2025-02-18 02:05:16 9 | SKIPPING STEP 2: Protein search 10 | SKIPPING STEP 3: Nucleotide search 11 | >> STEP 4: Checking any pathogen regions for low_concern components... 12 | ...no regulated regions to clear 13 | 14 | >> STEP 4 completed at 2025-02-18 02:05:16 15 | >> COMPLETED AT 2025-02-18 02:05:16 16 | -------------------------------------------------------------------------------- /commec/tests/test_data/has_records_with_same_description.fasta: -------------------------------------------------------------------------------- 1 | >BBa_K380009 A_20830_Coding_"Protein_A_Z-domain" 2 | atgtctgaacctcaaaacgggcgcggggcgctcttcactggcgggctagccgccatcctcgcctcggcttgctgcctggggccgctggttctgatcgccctggggttcagcggcgcttggatcggcaacttgacggtgttggaaccttatcgcccgatcttcatcggcgcggcgttggtggcgctgtttttcgcctggcggcgcatctaccgaccggcgcaagcctgcaaaccaggggatgtgtgtgcgattccccaagtgcgcgctacttacaagctcattttctgggtcgtggccgcgctggttctggtcgcgctcggatttccctacgtcatgccatttttctattaa 3 | >BBa_K380009 A_20830_Coding_"Protein_A_Z-domain" 4 | gtagacaacaaattcaacaaagaacaacaaaacgcgttctatgagatcttacatttacctaacttaaacgaagaacaacgaaacgccttcatccaaagtttaaaagatgacccaagccaaagcgctaaccttttagcagaagctaaaaagctaaatgatgctcaggcgccgaaa -------------------------------------------------------------------------------- /commec/tests/test_data/multiple_records.fasta: -------------------------------------------------------------------------------- 1 | >BBa_K205004_A_16908_Coding_"MerT_-_Membranous_Mercury_transporter_" 2 | atgtctgaacctcaaaacgggcgcggggcgctcttcactggcgggctagccgccatcctcgcctcggcttgctgcctggggccgctggttctgatcgccctggggttcagcggcgcttggatcggcaacttgacggtgttggaaccttatcgcccgatcttcatcggcgcggcgttggtggcgctgtttttcgcctggcggcgcatctaccgaccggcgcaagcctgcaaaccaggggatgtgtgtgcgattccccaagtgcgcgctacttacaagctcattttctgggtcgtggccgcgctggttctggtcgcgctcggatttccctacgtcatgccatttttctattaa 3 | >BBa_K380009_A_20830_Coding_"Protein_A_Z-domain" 4 | gtagacaacaaattcaacaaagaacaacaaaacgcgttctatgagatcttacatttacctaacttaaacgaagaacaacgaaacgccttcatccaaagtttaaaagatgacccaagccaaagcgctaaccttttagcagaagctaaaaagctaaatgatgctcaggcgccgaaa -------------------------------------------------------------------------------- /commec/tests/test_data/input_multiple_records/multiple_records.cleaned.fasta: -------------------------------------------------------------------------------- 1 | >BBa_K205004_A_16908 2 | atgtctgaacctcaaaacgggcgcggggcgctcttcactggcgggctagccgccatcctc 3 | gcctcggcttgctgcctggggccgctggttctgatcgccctggggttcagcggcgcttgg 4 | atcggcaacttgacggtgttggaaccttatcgcccgatcttcatcggcgcggcgttggtg 5 | gcgctgtttttcgcctggcggcgcatctaccgaccggcgcaagcctgcaaaccaggggat 6 | gtgtgtgcgattccccaagtgcgcgctacttacaagctcattttctgggtcgtggccgcg 7 | ctggttctggtcgcgctcggatttccctacgtcatgccatttttctattaa 8 | >BBa_K380009_A_20830 9 | gtagacaacaaattcaacaaagaacaacaaaacgcgttctatgagatcttacatttacct 10 | 
aacttaaacgaagaacaacgaaacgccttcatccaaagtttaaaagatgacccaagccaa 11 | agcgctaaccttttagcagaagctaaaaagctaaatgatgctcaggcgccgaaa 12 | -------------------------------------------------------------------------------- /commec/tests/test_data/input_has_records_with_same_description/has_records_with_same_description.cleaned.fasta: -------------------------------------------------------------------------------- 1 | >BBa_K380009_A_20830_Coding_"Protein_A_Z-domain" 2 | atgtctgaacctcaaaacgggcgcggggcgctcttcactggcgggctagccgccatcctcgcctcggcttgctgcctggggccgctggttctgatcgccctggggttcagcggcgcttggatcggcaacttgacggtgttggaaccttatcgcccgatcttcatcggcgcggcgttggtggcgctgtttttcgcctggcggcgcatctaccgaccggcgcaagcctgcaaaccaggggatgtgtgtgcgattccccaagtgcgcgctacttacaagctcattttctgggtcgtggccgcgctggttctggtcgcgctcggatttccctacgtcatgccatttttctattaa 3 | >BBa_K380009_A_20830_Coding_"Protein_A_Z-domain" 4 | gtagacaacaaattcaacaaagaacaacaaaacgcgttctatgagatcttacatttacctaacttaaacgaagaacaacgaaacgccttcatccaaagtttaaaagatgacccaagccaaagcgctaaccttttagcagaagctaaaaagctaaatgatgctcaggcgccgaaa 5 | -------------------------------------------------------------------------------- /commec/tests/test_data/single_record.transeq.faa: -------------------------------------------------------------------------------- 1 | >BBa_K380009_A_20830_Coding_"Protein_A_Z-domain"_1 2 | VDNKFNKEQQNAFYEILHLPNLNEEQRNAFIQSLKDDPSQSANLLAEAKKLNDAQAPK 3 | >BBa_K380009_A_20830_Coding_"Protein_A_Z-domain"_2 4 | XTTNSTKNNKTRSMRSYIYLTXTKNNETPSSKVXKMTQAKALTFXQKLKSXMMLRRRX 5 | >BBa_K380009_A_20830_Coding_"Protein_A_Z-domain"_3 6 | RQQIQQRTTKRVLXDLTFTXLKRRTTKRLHPKFKRXPKPKRXPFSRSXKAKXCSGAEX 7 | >BBa_K380009_A_20830_Coding_"Protein_A_Z-domain"_4 8 | FRRLSIIXLFSFCXKVSALAWVIFXTLDEGVSLFFVXVRXMXDLIERVLLFFVEFVVY 9 | >BBa_K380009_A_20830_Coding_"Protein_A_Z-domain"_5 10 | SAPEHHLAFXLLLKGXRFGLGHLLNFGXRRFVVLRLSXVNVRSHRTRFVVLCXICCLX 11 | >BBa_K380009_A_20830_Coding_"Protein_A_Z-domain"_6 12 | FGAXASFSFLASAKRLALWLGSSFKLWMKAFRCSSFKLGKCKISXNAFCCSLLNLLST 13 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # Distribution / packaging 7 | .Python 8 | build/ 9 | dist/ 10 | *.egg-info/ 11 | *.egg 12 | 13 | # PyInstaller 14 | # Usually these files are written by a python script from a template 15 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 16 | *.manifest 17 | *.spec 18 | 19 | # Installer logs 20 | pip-log.txt 21 | pip-delete-this-directory.txt 22 | 23 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 24 | __pypackages__/ 25 | 26 | # Environments 27 | .venv 28 | venv/ 29 | .conda 30 | .vscode 31 | .pylintrc 32 | *:Zone.Identifier 33 | 34 | # test visual output 35 | commec/tests/test_data/functional.html -------------------------------------------------------------------------------- /.github/workflows/automate_tests.yml: -------------------------------------------------------------------------------- 1 | name: Test Commec 2 | 3 | on: 4 | pull_request: 5 | branches: 6 | - develop 7 | - main 8 | workflow_dispatch: 9 | 10 | jobs: 11 | test: 12 | runs-on: ubuntu-latest 13 | 14 | steps: 15 | - name: Checkout code 16 | uses: actions/checkout@v4 17 | 18 | - name: Set up Conda environment 19 | uses: conda-incubator/setup-miniconda@v3 20 | with: 21 | activate-environment: commec-env 22 | environment-file: environment.yaml 23 | auto-activate-base: false 24 | clean-patched-environment-file: true 25 | 26 | - name: Run tests 27 | shell: bash -l {0} 28 | run: | 29 | conda activate commec-env 30 | pytest -vv 31 | -------------------------------------------------------------------------------- /example_data/screen_pipeline_status.csv: -------------------------------------------------------------------------------- 1 | name,filepath,flag,biorisk,protein,nucleotide,low_concern,virus_flag,bacteria_flag,eukaryote_flag,low_concern_protein,low_concern_rna,low_concern_dna 2 | Part:BBa_K5108009_creA_-,2025-08-06/commec-examples.output.json,Warning,Warning,Pass,Skip,Warning,False,False,False,False,False,False 3 | encrypted,2025-08-06/commec-examples.output.json,Warning,Pass,Pass,Pass,Pass,False,False,False,False,False,False 4 | xylanase_zero_shot_des31,2025-08-06/commec-examples.output.json,Pass,Pass,Pass,Pass,Pass,False,False,False,False,False,False 5 | RVFV_Rift_valley_fever,2025-08-06/commec-examples.output.json,Flag,Warning,Flag,Flag,Flag,True,False,False,False,False,False 6 | BBa_K209429_A_15261,2025-08-06/commec-examples.output.json,Pass,Pass,Mixed,Pass,Pass,False,True,False,False,False,False 7 | -------------------------------------------------------------------------------- /commec/tests/test_data/screen-files/prot-error-2024-08.screen: -------------------------------------------------------------------------------- 1 | >> STEP 1: Checking for biorisk genes... 2 | --> Biorisks: no hits detected, PASS 3 | STEP 1 completed at 2024-08-27 16:58:14 4 | >> STEP 2: Checking regulated pathogen proteins... 
5 | Traceback (most recent call last): 6 | File "/blue/salemi/brittany.rife/nti/common-mechanism/commec/check_reg_path.py", line 158, in 7 | main() 8 | File "/blue/salemi/brittany.rife/nti/common-mechanism/commec/check_reg_path.py", line 57, in main 9 | blast = taxdist(blast, reg_ids, vax_ids, args.db + "/taxonomy/", args.threads) 10 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 11 | File "/blue/salemi/brittany.rife/nti/common-mechanism/commec/utils.py", line 118, in taxdist 12 | zip( 13 | TypeError: 'float' object is not iterable 14 | ERROR: command 'python /blue/salemi/brittany.rife/nti/common-mechanism/commec/check_reg_path.py -i AF006966.1_8_6_dmnd.nr.blastx -d /orange/salemi/brittany.rife/databases -t 6' failed 15 | -------------------------------------------------------------------------------- /commec/screen-default-config.yaml: -------------------------------------------------------------------------------- 1 | base_paths: 2 | default: commec-dbs/ 3 | databases: 4 | biorisk: 5 | path: '{default}biorisk/biorisk.hmm' 6 | taxids: "{default}biorisk/reg_taxids.txt" 7 | annotations: '{default}biorisk/biorisk_annotations.csv' 8 | regulated_protein: 9 | blast: 10 | path: '{default}nr_blast/nr' 11 | diamond: 12 | path: '{default}nr_dmnd/nr.dmnd' 13 | regulated_nt: 14 | path: '{default}nt_blast/core_nt' 15 | low_concern: 16 | rna: 17 | path: '{default}low_concern/rna/benign.cm' 18 | dna: 19 | path: '{default}low_concern/dna/benign.fasta' 20 | protein: 21 | path: '{default}low_concern/protein/benign.hmm' 22 | taxids: "{default}low_concern/vax_taxids.txt" 23 | annotations: '{default}low_concern/low_concern_annotations.tsv' 24 | taxonomy: 25 | path: "{default}taxonomy/" 26 | threads: 1 27 | diamond_jobs: null 28 | do_cleanup: False 29 | force: False 30 | skip_taxonomy_search: False 31 | protein_search_tool: 'blastx' 32 | resume: False 33 | skip_nt_search: False 34 | verbose: False -------------------------------------------------------------------------------- /commec/tests/test_rationales.py: -------------------------------------------------------------------------------- 1 | """ 2 | Unit tests for controlled rationale outcomes 3 | """ 4 | 5 | from commec.tests.screen_factory import ( 6 | ScreenTesterFactory, 7 | ScreenStep 8 | ) 9 | from commec.config.result import ScreenStatus, Rationale 10 | 11 | def test_hmmer(tmp_path): 12 | """ 13 | When there are hits to Biorisk with a large E-value, but no other hits, and we 14 | are running in the skip taxonomy mode, we correctly label the outcome 15 | as warning, however the rationale is set to "Matches to ." instead of 16 | the correct Rationale text indicating no hits. 17 | """ 18 | screen_test = ScreenTesterFactory("low_evalue_hmmer", tmp_path) 19 | screen_test.add_query("query1",1200) 20 | screen_test.add_hit(ScreenStep.BIORISK, "query1", 100, 200, "HighEvalueHit", "HEH", 500, regulated=True, evalue = 100.0) 21 | result = screen_test.run("--skip-tx") 22 | assert result.queries["query1"].status.screen_status == ScreenStatus.WARN 23 | assert result.queries["query1"].status.rationale == str(Rationale.NO_HITS_SKIP_NOTE) 24 | -------------------------------------------------------------------------------- /commec/utils/concat_seqs.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) 2021-2024 International Biosecurity and Biosafety Initiative for Science 3 | """ 4 | Script that concatenates all sequences in a FASTA file. 
5 | 6 | Usage: 7 | concat_seqs.py input.fasta 8 | """ 9 | import sys 10 | 11 | # read in the file based on the command line argument 12 | filename = sys.argv[1] 13 | # open the file 14 | f = open(filename, "r") 15 | # read the file 16 | lines = f.readlines() 17 | # close the file 18 | f.close() 19 | 20 | # use the first line as the sequence ID 21 | seq_id = lines[0].rstrip() 22 | 23 | # concatenate all other lines that don't start with '>' 24 | seq = "" 25 | for line in lines[1:]: 26 | if line.startswith(">"): 27 | continue 28 | seq += line.rstrip() 29 | 30 | # print the sequence ID and the sequence to a FASTA file 31 | # open output file (filename but with _concat appended before suffix) 32 | out_filename = filename.replace(".fasta", "_concat.fasta") 33 | out_file = open(out_filename, "w") 34 | # write the sequence ID and sequence to the file 35 | out_file.write(">" + seq_id + "\n") 36 | out_file.write(seq + "\n") 37 | -------------------------------------------------------------------------------- /commec/tests/test_dbs/low_concern/rna/benign.cm.exemplar.out: -------------------------------------------------------------------------------- 1 | #target name accession query name accession mdl mdl from mdl to seq from seq to strand trunc pass gc bias score E-value inc description of target 2 | #------------------- --------- ----------------------------------------------- --------- --- -------- -------- -------- -------- ------ ----- ---- ---- ----- ------ --------- --- --------------------- 3 | # 4 | # Program: cmscan 5 | # Version: 1.1.5 (Sep 2023) 6 | # Pipeline mode: SCAN 7 | # Query file: /root/repo/json/common-mechanism/commec/tests/test_data/single_record.fasta 8 | # Target file: /root/repo/json/common-mechanism/commec/tests/test_dbs/benign_db/benign.cmscan 9 | # Option settings: cmscan --tblout /root/repo/json/common-mechanism/commec/tests/test_dbs/benign_db/benign.cmscan.exemplar.out /root/repo/json/common-mechanism/commec/tests/test_dbs/benign_db/benign.cmscan /root/repo/json/common-mechanism/commec/tests/test_data/single_record.fasta 10 | # Current dir: /root/repo/json/common-mechanism 11 | # Date: Thu Sep 19 12:15:55 2024 12 | # [ok] 13 | -------------------------------------------------------------------------------- /commec/utils/file_utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) 2021-2024 International Biosecurity and Biosafety Initiative for Science 3 | """ 4 | Static functions useful for dealing with common file parsing tasks. 5 | """ 6 | 7 | import argparse 8 | import os 9 | 10 | # Below go to config parameters. 
11 | @staticmethod 12 | def directory_arg(path): 13 | """Raise ArgumentTypeError if `path` is not a directory.""" 14 | if not os.path.isdir(path): 15 | raise argparse.ArgumentTypeError(f"{path} is not a valid directory path") 16 | return path 17 | 18 | @staticmethod 19 | def file_arg(path): 20 | """Raise ArgumentTypeError if `path` is not a file.""" 21 | if not os.path.isfile(path): 22 | raise argparse.ArgumentTypeError(f"{path} is not a valid file") 23 | if not os.path.getsize(path) > 0: 24 | raise argparse.ArgumentTypeError(f"{path} is an empty file") 25 | return path 26 | 27 | @staticmethod 28 | def expand_and_normalize(path): 29 | """Expand ~ and $var path elements, and normalize path, removing double slashes, etc.""" 30 | return os.path.normpath(os.path.expandvars(os.path.expanduser(path))) 31 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 International Biosecurity and Biosafety Initiative for Science 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /commec/utils/template.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | ${page_title | h} 6 | 31 | 32 | 33 |
34 | % for fig_html in figures_html: 35 |
36 | ${fig_html | n} 37 |
38 | % endfor 39 |
40 | 41 | 42 | -------------------------------------------------------------------------------- /example_data/output_commec-examples/commec-examples.low_concern.cmscan: -------------------------------------------------------------------------------- 1 | #target name accession query name accession mdl mdl from mdl to seq from seq to strand trunc pass gc bias score E-value inc description of target 2 | #------------------- --------- ------------------------ --------- --- -------- -------- -------- -------- ------ ----- ---- ---- ----- ------ --------- --- --------------------- 3 | # 4 | # Program: cmscan 5 | # Version: 1.1.5 (Sep 2023) 6 | # Pipeline mode: SCAN 7 | # Query file: /mnt/data/home/ec2-user/analysis/v1.0.0-2025-07/2025-08-06/input_commec-examples/commec-examples.cleaned.fasta 8 | # Target file: /mnt/data/home/ec2-user/cm-dbs/low_concern/rna/benign.cm 9 | # Option settings: cmscan --tblout /mnt/data/home/ec2-user/analysis/v1.0.0-2025-07/2025-08-06/output_commec-examples/commec-examples.low_concern.cmscan --cpu 12 /mnt/data/home/ec2-user/cm-dbs/low_concern/rna/benign.cm /mnt/data/home/ec2-user/analysis/v1.0.0-2025-07/2025-08-06/input_commec-examples/commec-examples.cleaned.fasta 10 | # Current dir: /mnt/data/home/ec2-user/analysis/v1.0.0-2025-07 11 | # Date: Wed Aug 6 22:30:56 2025 12 | # [ok] 13 | -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | ## Background 2 | 9 | 10 | 11 | 14 | **Issues**: 15 | 21 | 22 | ## Changes 23 | 26 | ### Bug fixes 27 | * 28 | 29 | ### New features 30 | * 31 | 32 | ### Breaking changes 33 | * 34 | 35 | ### Refactoring 36 | * 37 | 38 | ## Relevant logs, error messages, etc. 
39 | -------------------------------------------------------------------------------- /example_data/input_commec-examples/commec-examples_config.yaml: -------------------------------------------------------------------------------- 1 | base_paths: 2 | default: /mnt/data/home/ec2-user/cm-dbs/ 3 | databases: 4 | biorisk: 5 | annotations: /mnt/data/home/ec2-user/cm-dbs/biorisk/biorisk_annotations.csv 6 | path: /mnt/data/home/ec2-user/cm-dbs/biorisk/biorisk.hmm 7 | taxids: /mnt/data/home/ec2-user/cm-dbs/biorisk/reg_taxids.txt 8 | low_concern: 9 | annotations: /mnt/data/home/ec2-user/cm-dbs/low_concern/low_concern_annotations.tsv 10 | dna: 11 | path: /mnt/data/home/ec2-user/cm-dbs/low_concern/dna/benign.fasta 12 | protein: 13 | path: /mnt/data/home/ec2-user/cm-dbs/low_concern/protein/benign.hmm 14 | rna: 15 | path: /mnt/data/home/ec2-user/cm-dbs/low_concern/rna/benign.cm 16 | taxids: /mnt/data/home/ec2-user/cm-dbs/low_concern/vax_taxids.txt 17 | regulated_nt: 18 | path: /mnt/data/home/ec2-user/cm-dbs/nt_blast/core_nt 19 | regulated_protein: 20 | blast: 21 | path: /mnt/data/home/ec2-user/cm-dbs/nr_blast/nr 22 | diamond: 23 | path: /mnt/data/home/ec2-user/cm-dbs/nr_dmnd/nr.dmnd 24 | taxonomy: 25 | path: /mnt/data/home/ec2-user/cm-dbs/taxonomy/ 26 | diamond_jobs: null 27 | do_cleanup: false 28 | force: false 29 | protein_search_tool: blastx 30 | resume: false 31 | skip_nt_search: false 32 | skip_taxonomy_search: false 33 | threads: 12 34 | verbose: true 35 | -------------------------------------------------------------------------------- /commec/tests/test_dbs/low_concern/protein/benign.hmm.exemplar.out: -------------------------------------------------------------------------------- 1 | # --- full sequence --- -------------- this domain ------------- hmm coord ali coord env coord 2 | # target name accession tlen query name accession qlen E-value score bias # of c-Evalue i-Evalue score bias from to from to from to acc description of target 3 | #------------------- ---------- ----- -------------------- ---------- ----- --------- ------ ----- --- --- --------- --------- ------ ----- ----- ----- ----- ----- ----- ----- ---- --------------------- 4 | # 5 | # Program: hmmscan 6 | # Version: 3.4 (Aug 2023) 7 | # Pipeline mode: SCAN 8 | # Query file: /root/repo/json/common-mechanism/commec/tests/test_data/single_record.fasta 9 | # Target file: /root/repo/json/common-mechanism/commec/tests/test_dbs/benign_db/benign.hmm 10 | # Option settings: hmmscan --domtblout /root/repo/json/common-mechanism/commec/tests/test_dbs/benign_db/benign.hmm.exemplar.out /root/repo/json/common-mechanism/commec/tests/test_dbs/benign_db/benign.hmm /root/repo/json/common-mechanism/commec/tests/test_data/single_record.fasta 11 | # Current dir: /root/repo/json/common-mechanism 12 | # Date: Thu Sep 19 12:15:55 2024 13 | # [ok] 14 | -------------------------------------------------------------------------------- /commec/tests/test_data/screen-files/prot-mixed-hit-2024-06.screen: -------------------------------------------------------------------------------- 1 | >> STEP 1: Checking for biorisk genes... 2 | --> Biorisks: no hits detected, PASS 3 | STEP 1 completed at 2024-06-24 19:29:55 4 | >> STEP 2: Checking regulated pathogen proteins... 
5 | 0 10239;2559587;2732396;2732408;2732506;76804;2499399;11118;2501931;694002;2509511;694009;1508227 6 | 1 10239;2559587;2732396;2732408;2732506;76804;2499399;11118;2501931;694002;2509511;694009;1508227 7 | --> Best match to sequence(s) QGA88265, QGA88308, WEG19430, QZX47334, QWN56262, QZX47339, QGA88261 at bases 3 - 365 found in both regulated and non-regulated organisms 8 | Species: Severe acute respiratory syndrome-related coronavirus, Betacoronavirus sp. RsYN09, Sarbecovirus sp. (taxid(s): 2833184, 694009, 2872810, 1508227) (100.0 percent identity to query) 9 | Description: ORF1ab polyprotein [Severe acute respiratory syndrome-related coronavirus] 10 | --> no top hit exclusive to a regulated pathogen: PASS 11 | STEP 2 completed at 2024-06-24 19:32:11 12 | >> STEP 3: Checking regulated pathogen nucleotides... 13 | ...protein hits found, fetching nt regions not covered by a 90% ID hit or better 14 | --> no noncoding regions >= 50 bases found, skipping nt scan 15 | STEP 3 completed at 2024-06-24 19:32:12 16 | >> STEP 4: Checking any pathogen regions for benign components... 17 | ...no regulated regions to clear 18 | >> COMPLETED AT 2024-06-24 19:32:13 19 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools >= 61.0"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "commec" 7 | version = '1.0.2' 8 | requires-python = ">=3.10" 9 | # This is not a pure python project; dependencies are managed through environment.yml 10 | authors = [ 11 | { name = "Nicole Wheeler" }, 12 | { name = "Jen Lu" }, 13 | { name = "Michael Barnett" }, 14 | { name = "Tessa Alexanian", email = "tessa@ibbis.bio" }, 15 | ] 16 | maintainers = [ 17 | { name = "International Biosecurity and Biosafety Initiative for Science (IBBIS)", email = "info@ibbis.bio" }, 18 | ] 19 | description = 'Free, open-source, globally available tool for DNA sequence screening' 20 | readme = "README.md" 21 | license = { file = "LICENSE" } 22 | keywords = ["synthesis screening", "DNA synthesis"] 23 | classifiers = [ 24 | "Development Status :: 3 - Alpha", 25 | "Intended Audience :: Developers", 26 | "License :: OSI Approved :: MIT License", 27 | "Programming Language :: Python :: 3", 28 | "Topic :: Scientific/Engineering :: Bio-Informatics", 29 | ] 30 | 31 | [project.urls] 32 | Homepage = "https://ibbis.bio/common-mechanism" 33 | Repository = "https://github.com/ibbis-screening/common-mechanism.git" 34 | 35 | [project.scripts] 36 | "commec" = "commec.cli:main" 37 | 38 | [tool.setuptools] 39 | packages = { find = { "include" = ["commec", "commec.*"] } } 40 | package-data = { "commec" = ["utils/template.html", "screen-default-config.yaml"] } -------------------------------------------------------------------------------- /commec/tests/test_data/screen-files/prot-hit-not-cleared-2024-06.screen: -------------------------------------------------------------------------------- 1 | >> STEP 1: Checking for biorisk genes... 2 | --> Biorisks: no hits detected, PASS 3 | STEP 1 completed at 2024-09-03 05:57:18 4 | >> STEP 2: Checking regulated pathogen proteins... 
5 | --> Best match to sequence(s) AMT79992, AHL83753, AAF04797, AMT80120, AMT80254, ADW86059, ANB41717, WFP21365, ACY66806, AHL83655, WFP21263, AHL83687, AMT80218, ANB41697, AHL83781, AMT80072, WGH73008, WFP21271, Q4QXJ7, WFP21337, 7V0N_A, WFP21323, ANB41727, AMN91481, AMN91457, AMT80132, WFP21377, AHL83797, ANB41585, WFP21295, ALE15082, AMT80102, AMT79954, ANB41701, AHL83809, WFP21289, AMN91563, AHL83649, AMT80320, AHL83707, WGH73012, WFP21353, AHL83735, AMN91527, ADB08660, ADW86014, ADW86051, ANB41675, AMN91485, ANB41577 at bases 3 - 629 found in only regulated organisms: FLAG (virus) 6 | Species: Eastern equine encephalitis virus (taxid(s): 11021, 374598) (100.0 percent identity to query) 7 | Description: E1 glycoprotein, partial [Eastern equine encephalitis virus] 8 | STEP 2 completed at 2024-09-03 06:42:37 9 | >> STEP 3: Checking regulated pathogen nucleotides... 10 | ...protein hits found, fetching nt regions not covered by a 90% ID hit or better 11 | --> no noncoding regions >= 50 bases found, skipping nt scan 12 | STEP 3 completed at 2024-09-03 06:42:38 13 | >> STEP 4: Checking any pathogen regions for benign components... 14 | ...no housekeeping protein hits 15 | ...no benign RNA hits 16 | ...no Synbio sequence hits 17 | -->Regulated region at bases 3 to 629 failed to clear: FLAG 18 | >> COMPLETED AT 2024-09-03 06:42:41 19 | -------------------------------------------------------------------------------- /conda-recipe/meta.yaml: -------------------------------------------------------------------------------- 1 | {% set name = "commec" %} 2 | {% set version = "1.0.2" %} 3 | {% set sha256 = "" %} 4 | 5 | package: 6 | name: "{{ name }}" 7 | version: "{{ version }}" 8 | 9 | source: 10 | url: https://github.com/ibbis-screening/common-mechanism/archive/refs/tags/v{{version}}.tar.gz 11 | sha256: {{ sha256 }} 12 | 13 | build: 14 | number: 0 15 | noarch: python 16 | script: "{{ PYTHON }} -m pip install . --no-deps --no-build-isolation --no-cache-dir -vvv" 17 | run_exports: 18 | - {{ pin_subpackage('commec', max_pin="x.x.x") }} 19 | 20 | requirements: 21 | build: 22 | - python >=3.10 23 | - pip 24 | - setuptools 25 | host: 26 | - python >=3.10 27 | - pip 28 | - setuptools 29 | run: 30 | - python >=3.10 31 | # Runtime Python dependencies 32 | - biopython 33 | - numpy 34 | - pandas 35 | - pytaxonkit 36 | - pyyaml 37 | # Runtime non-Python dependencies 38 | - blast >=2.16 39 | - diamond >=2.1 40 | - hmmer 41 | - infernal 42 | - wget 43 | - plotly 44 | - yaml 45 | - mako 46 | 47 | test: 48 | commands: 49 | - commec screen --help 50 | - commec flag --help 51 | - commec split --help 52 | 53 | about: 54 | home: https://github.com/ibbis-screening/common-mechanism 55 | license: MIT 56 | license_family: MIT 57 | doc_url: https://github.com/ibbis-screening/common-mechanism/wiki 58 | summary: "commec: a free, open-source, globally available tool for DNA sequence screening" 59 | dev_url: https://github.com/ibbis-screening/common-mechanism 60 | 61 | extra: 62 | identifiers: 63 | - biotools:commec 64 | container: 65 | image: "quay.io/biocontainers/commec" -------------------------------------------------------------------------------- /dev_scripts/split_fasta.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) 2021-2024 International Biosecurity and Biosafety Initiative for Science 3 | """ 4 | Split a multi-record FASTA file into files with a set number of sequences per file. Output will be 5 | input.#.fa. 
6 | 7 | Command-line usage: 8 | split_fasta.py --i input_fasta -n num_seqs 9 | """ 10 | import os, sys, argparse 11 | from Bio import SeqIO 12 | 13 | def main(): 14 | parser = argparse.ArgumentParser() 15 | parser.add_argument("-i","--input", dest="i_file", 16 | required=True, help="multi-FASTA file to split") 17 | parser.add_argument("-n","--num", dest="num_seqs", 18 | type=int, 19 | required=True, help="Number of sequences per file (min)") 20 | args = parser.parse_args() 21 | 22 | basename = os.path.splitext(args.i_file)[0] 23 | basename = os.path.basename(basename) 24 | count_curr = 0 25 | count_total = 0 26 | num_splits = 0 27 | sys.stdout.write("\t%i sequences printed (%i splits)" % (count_total,num_splits)) 28 | sys.stdout.flush() 29 | for record in SeqIO.parse(args.i_file,"fasta"): 30 | if count_curr == 0: 31 | num_splits += 1 32 | o_file = open(basename + "." + str(num_splits) + ".fa" , 'w') 33 | SeqIO.write(record, o_file, "fasta") 34 | count_curr += 1 35 | count_total += 1 36 | if count_total % 10000 == 0: 37 | sys.stdout.write("\r\t%i sequences printed (%i splits)" % (count_total,num_splits)) 38 | sys.stdout.flush() 39 | if count_curr == args.num_seqs: 40 | count_curr = 0 41 | o_file.close() 42 | 43 | sys.stdout.write("\t%i sequences printed (%i splits)\n" % (count_total,num_splits)) 44 | sys.stdout.flush() 45 | 46 | 47 | if __name__ == "__main__": 48 | main() 49 | -------------------------------------------------------------------------------- /commec/utils/dict_utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) 2021-2024 International Biosecurity and Biosafety Initiative for Science 3 | """ 4 | Static functions useful for dealing with common dictionary tasks. 5 | """ 6 | 7 | @staticmethod 8 | def deep_update(to_update: dict[str, any], 9 | has_updates: dict[str, any]) -> tuple[ 10 | dict[str,any], 11 | list[tuple[str,any]]]: 12 | """ 13 | Recursively update a nested dictionary without completely overwriting nested dictionaries. 14 | Only already existing keys are updated. Any keys not existing in the dictionary 15 | to be updated are returned as a list of rejected key value pairs. 16 | ----- 17 | Inputs: 18 | * to_update : dict[str, any] Dictionary to be updated. 19 | * has_updates : dict[str, any] New dictionary information to be added. 20 | ---- 21 | Outputs: 22 | * updated : dict[str, any] a copy of the to_update dictionary, with values 23 | from any matching keys overridden by has_updates. 24 | * rejected : list[tuple[str,any] A list of the rejected key value pairs, i.e. 25 | keys present in has_updates, but not present in to_update. 26 | """ 27 | rejected = [] 28 | updated = to_update.copy() 29 | for key, value in has_updates.items(): 30 | # If both values are dictionaries, recursively update 31 | if key in updated and isinstance(updated[key], dict) and isinstance(value, dict): 32 | updated[key], additional_rejects = deep_update(updated[key], value) 33 | rejected.extend(additional_rejects) 34 | # If not a dictionary, just copy the value. 35 | elif key in updated: 36 | updated[key] = value 37 | # If not present, we log an unexpected input one. 
38 | else: 39 | rejected.append((key, value)) 40 | return updated, rejected 41 | -------------------------------------------------------------------------------- /commec/tests/test_flag.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import textwrap 4 | 5 | from commec.flag import add_args, run 6 | 7 | SCREEN_DIR = os.path.join(os.path.dirname(__file__), "test_data") 8 | 9 | def test_flag(tmp_path): 10 | """We are lazily writing tests for a full run of flag instead of unit tests.""" 11 | parser = argparse.ArgumentParser() 12 | add_args(parser) 13 | args = parser.parse_args([SCREEN_DIR, "-o", str(tmp_path), "-r"]) 14 | run(args) 15 | 16 | # Check if the output file exists 17 | status_output = tmp_path / "screen_pipeline_status.csv" 18 | assert status_output.exists() 19 | 20 | expected_status = textwrap.dedent( 21 | f"""\ 22 | name,filepath,flag,biorisk,protein,nucleotide,low_concern,virus_flag,bacteria_flag,eukaryote_flag,low_concern_protein,low_concern_rna,low_concern_dna,rationale 23 | FLAG_TEST_01,{SCREEN_DIR}/flag_tests.json,Flag,Flag,Pass,Pass,Flag,False,False,False,False,False,False, 24 | FLAG_TEST_02,{SCREEN_DIR}/flag_tests.json,Flag,Pass,Flag,Pass,Flag,True,False,False,False,False,False, 25 | FLAG_TEST_03,{SCREEN_DIR}/flag_tests.json,Flag,Pass,Flag,Pass,Flag,False,True,False,False,False,False, 26 | FLAG_TEST_04,{SCREEN_DIR}/flag_tests.json,Flag,Pass,Flag,Pass,Flag,False,False,True,False,False,False, 27 | FLAG_TEST_05,{SCREEN_DIR}/flag_tests.json,Flag,Pass,Pass,Flag,Flag,True,True,True,False,False,False, 28 | FLAG_TEST_06,{SCREEN_DIR}/flag_tests.json,Pass,Pass,Mixed,Pass,Pass,True,False,False,False,False,False, 29 | FCTEST1,{SCREEN_DIR}/functional.json,Flag,Flag,Flag,Flag,Flag,True,False,False,True,True,True,"Matches sequence with pathogenic or toxin function, and protein and nucleotide sequence with regulated organisms; as well as virulence factor; as well as flags cleared as common or non-hazardous." 
30 | """ 31 | ) 32 | actual_status = status_output.read_text() 33 | assert expected_status.strip() == actual_status.strip() -------------------------------------------------------------------------------- /example_data/output_commec-examples/commec-examples.biorisk.hmmscan: -------------------------------------------------------------------------------- 1 | # --- full sequence --- -------------- this domain ------------- hmm coord ali coord env coord 2 | # target name accession tlen query name accession qlen E-value score bias # of c-Evalue i-Evalue score bias from to from to from to acc description of target 3 | #------------------- ---------- ----- -------------------- ---------- ----- --------- ------ ----- --- --- --------- --------- ------ ----- ----- ----- ----- ----- ----- ----- ---- --------------------- 4 | PeptidaseM24 PF00557.27 209 Part:BBa_K5108009_creA_-_1 - 686 1.3e-25 87.5 0.0 1 2 0.6 1.4e+03 -4.1 0.0 20 43 38 58 31 76 0.64 Metallopeptidase family M24 5 | PeptidaseM24 PF00557.27 209 Part:BBa_K5108009_creA_-_1 - 686 1.3e-25 87.5 0.0 2 2 1e-28 2.5e-25 86.6 0.0 2 207 174 389 173 391 0.84 Metallopeptidase family M24 6 | gi3006115embCAA73290.1 - 200 RVFV_Rift_valley_fever_2 - 563 2.2e-139 458.7 0.2 1 1 1.3e-142 3.1e-139 458.3 0.2 1 200 33 232 33 232 1.00 - 7 | # 8 | # Program: hmmscan 9 | # Version: 3.4 (Aug 2023) 10 | # Pipeline mode: SCAN 11 | # Query file: /mnt/data/home/ec2-user/analysis/v1.0.0-2025-07/2025-08-06/input_commec-examples/commec-examples.faa 12 | # Target file: /mnt/data/home/ec2-user/cm-dbs/biorisk/biorisk.hmm 13 | # Option settings: hmmscan --domtblout /mnt/data/home/ec2-user/analysis/v1.0.0-2025-07/2025-08-06/output_commec-examples/commec-examples.biorisk.hmmscan --cpu 12 /mnt/data/home/ec2-user/cm-dbs/biorisk/biorisk.hmm /mnt/data/home/ec2-user/analysis/v1.0.0-2025-07/2025-08-06/input_commec-examples/commec-examples.faa 14 | # Current dir: /mnt/data/home/ec2-user/analysis/v1.0.0-2025-07 15 | # Date: Wed Aug 6 21:23:52 2025 16 | # [ok] 17 | -------------------------------------------------------------------------------- /example_data/input_commec-examples/commec-examples.noncoding.fasta: -------------------------------------------------------------------------------- 1 | >encrypted (1-552) 2 | GACCAAGCCTGCAAAAACAAACGGCAAATTGACGGACTACTAAAGAAGACTTGCCAAATCGGACGAAGACTTGCTCGAAGTACTATGATATATATCGAAGTGGTAGGCGAAGGAGTTAAGAGATTAAGAGCTTAAGACATTCCCTAAGAGATCCCCTTTCGAAGGTATTAAAGAAGCAATTAAGGCCTGAAGGTTTTAAGGAATGGTCTAAGAAATTAAGCAGTTAAGACATGAAGGATTACAAGCAAGCTAACACGAAGATTTTAGACGGATAACACCACCCAGAAGTGGTAGCCTAAGGTGTAACATAAGAAGTAGCATAAGACCTCCCCGAGCGGTATAAGCATTTAAGCCGTGAAGGAATTAAGGCTTTAAGTTCTGAAGTAGTTAAGAAAACTTTCAAAGAAATAAGTTTTCGGATCTCTAAGGCATTAAGGAGTGAAGCCTTGAAGCTGTTAAGATATACTACTCCCTTCGAAGGCCTTAAGTCATGAAGTAGTTGCATTTCGTGCTAAGAAATTTACGAAGTATTTAAGGCGTGCACCCACCGAC 3 | >xylanase_zero_shot_des31 (1-756) 4 | 
atggaagaagtgctggcgaaaattgtgcgcgataaaaaaatttgggtggaagaacgcaaacagcagcagccgctggaaagctttcgcgataaagtgcagccgagcacccgcaacttttatgatgcgctgcagggcgataaaaccgcgtttattctggtgtgcctgaaagcgagcccgagcaaaggcctgattcgcgaagattttgatccggtgcgcattgcggcggtgtatcgccattatgcgaacgcgattaccgtgctgaccgatgaaaaatattttcagggcagctttgattttctgcgccaggtgagccaggtggcgccgcagccgattctgtgctttgattttattgtggatgaatggcagctgtatctggcggcgctgtttggcgcggcggcgattctgctgattgtggcgattctgggcgatcgcaccaaagaatttattgatatggcgcgcgaactgggcctggatgtgctggtggatgtgcatgatgaagaagatctggaaaaagtgtttagctattgccgcccgaaaattattggcgtggtgaacaccgattggcgcaccatggaaaccgatctgaacaccaccgaaaaactggcgaaactgattccgccggataccattgtgattgcgattagcggcattagcgaaccggaacaggtgaaacgcctgcgcaaagcgggcgtggatggcgtgctgattggcagcacctttgcgcgcaacccggataaagcggcggaagcgaaagaa 5 | >RVFV_Rift_valley_fever (830.0-960.0) 6 | TAGAGATTAAGGCTGCCCCACCCCCCACCCCCAATCCCGACCGTAACCCCAACCACCCCCTTTTCCCCAAACCCCTGGGCAGCCACTTAGGCTGCTGTCTTGTACGCCTGAGCAGCTGCCATGACAGCTGC 7 | >BBa_K209429_A_15261 (643.0-758.0) (1833.0-1933.0) (2633.0-2764.0) (3497.0-3550.0) 8 | gccaactttgtacaaaaaagcaggctttaaggagcaaggcaggtggacaagaggagttctagtggatccttgaacttgtctagaagctggaactcccacctgcaacatgcgaatactaatcagaattggttaattggttgtaacactggcagagcattacgctgacttgacgggacggcgcaagctcatgaccaaaatcccttaacgtgagttacgctactagatggacagcaaaggttcgtcgcagaaagggtcccgcctgctcctgctgctggtggtgtcaaatctactcttgtgccagggtgtggtctccgattacaaagatgatgatgatgtcgactccccgatctactagatgaataactcaacaaactcctctaacaatagcctggctcttacaagt 9 | -------------------------------------------------------------------------------- /commec/tests/test_data/screen-files/prot-multiple-hits-2024-06.screen: -------------------------------------------------------------------------------- 1 | >> STEP 1: Checking for biorisk genes... 2 | --> Biorisks: Regulated genes not found, PASS 3 | --> Virulence factor found in bases 526 to 802, WARNING 4 | Gene: PE2 [Venezuelan equine encephalitis virus] 5 | STEP 1 completed at 2024-06-24 23:10:58 6 | >> STEP 2: Checking regulated pathogen proteins... 7 | --> Best match to sequence(s) KXB23588 at bases 7507 - 7607 found in only regulated organisms: FLAG (bacteria) 8 | Species: Legionella pneumophila (taxid(s): 446) (73.469 percent identity to query) 9 | Description: hypothetical protein PtVF66_13895, partial [Legionella pneumophila] 10 | --> Best match to sequence(s) WP_099588326 at bases 6123 - 6929 found in both regulated and non-regulated organisms 11 | Species: Bacillus cereus group sp. BY9-3LC, Clostridioides difficile, Streptococcus dysgalactiae, 12 | Lactiplantibacillus plantarum, Bacillus cereus, Francisella tularensis, Sinomonas cellulolyticus, 13 | Pseudomonas sp. MWU13-2860, Lacticaseibacillus rhamnosus, Escherichia coli, Corynebacterium 14 | glutamicum, Borreliella burgdorferi (taxid(s): 2071714, 47715, 139, 99822, 119857, 562, 1396, 1590, 1718, 1496, 3018075, 2801916) (100.0 percent identity to query) 15 | Description: MULTISPECIES: APH(3')-I family aminoglycoside O-phosphotransferase [Bacteria] 16 | STEP 2 completed at 2024-06-24 23:23:21 17 | >> STEP 3: Checking regulated pathogen nucleotides... 18 | ...protein hits found, fetching nt regions not covered by a 90% ID hit or better 19 | --> no top hit exclusive to a regulated pathogen: PASS 20 | STEP 3 completed at 2024-06-24 23:23:32 21 | >> STEP 4: Checking any pathogen regions for benign components... 
22 | -->Housekeeping proteins covering 2.0 to 925.0 = PASS 23 | COG0449: glutamine-fructose-6-phosphate transaminase (isomerizing) activity (E-value: 5.6e-50 24 | ...no benign RNA hits 25 | -->Synbio sequences - <80% coverage achieved = FAIL 26 | -->Regulated region at bases 7507 to 7607 failed to clear: FLAG 27 | >> COMPLETED AT 2024-06-24 23:23:43 28 | -------------------------------------------------------------------------------- /commec/tests/test_trim.py: -------------------------------------------------------------------------------- 1 | """ 2 | Simple tests to ensure the correct trimming of overlapping components in a Hmmer database 3 | when parsed to remove_overlaps. 4 | The following behaviour is expected: 5 | * Fully encapsulated hits, should be removed. 6 | * Partially overlapping hits, should both be kept (To maximise extents) 7 | * Hits from different queries are independant in logic. 8 | """ 9 | 10 | import pandas as pd 11 | import pytest 12 | from commec.tools.hmmer import remove_overlaps 13 | 14 | # Test the following hmmer configuration: 15 | # 10-----------------50 (Largest, should stay.) 16 | # 40-------60 (Extends 1. Should stay.) 17 | # 20-------40 (encapsulated, but high score, should stay!) 18 | # 10-----------------50 (lower score than 1, should be removed.) 19 | # 20---30 (Different query, removed) 20 | # 10------------------------------90 (different query) 21 | 22 | # Example DataFrame 23 | example_hmmer_01 = pd.DataFrame({ 24 | "query name": ["one","one","one","one","two", "two"], 25 | "q. start": [10, 40, 20, 10, 20, 10], 26 | "q. end": [50, 60, 40, 50, 30, 90], 27 | "score": [3, 5, 6, 1, 1, 2] 28 | }) 29 | 30 | # Example DataFrame 31 | example_hmmer_01_output = pd.DataFrame({ 32 | "query name": ["one","one", "one", "two"], 33 | "q. start": [10, 40, 20, 10], 34 | "q. end": [50, 60, 40, 90], 35 | "score": [3, 5, 6, 2] 36 | }) 37 | 38 | @pytest.mark.parametrize( 39 | "input_hmmer, expected_output_hmmer", 40 | [ 41 | (example_hmmer_01, example_hmmer_01_output), 42 | ] 43 | ) 44 | def test_hmmer_overlaps( 45 | input_hmmer : pd.DataFrame, 46 | expected_output_hmmer : pd.DataFrame 47 | ): 48 | """ 49 | Checks common configurations that require trimming in Hmmer outputs, 50 | In particular partial overlaps, full encapsulations, score differences, and different queries. 
51 | """ 52 | trimmed_input = remove_overlaps(input_hmmer) 53 | print("INPUT:") 54 | print(input_hmmer) 55 | print("TRIMMED:") 56 | print(trimmed_input) 57 | print("CORRECT:") 58 | print(expected_output_hmmer) 59 | assert trimmed_input.equals(expected_output_hmmer) 60 | -------------------------------------------------------------------------------- /example_data/output_commec-examples/commec-examples.low_concern.blastn: -------------------------------------------------------------------------------- 1 | # BLASTN 2.16.0+ 2 | # Query: Part:BBa_K5108009_creA_- 3 | # Database: /mnt/data/home/ec2-user/cm-dbs/low_concern/dna/benign.fasta 4 | # 0 hits found 5 | # BLASTN 2.16.0+ 6 | # Query: encrypted 7 | # Database: /mnt/data/home/ec2-user/cm-dbs/low_concern/dna/benign.fasta 8 | # 0 hits found 9 | # BLASTN 2.16.0+ 10 | # Query: xylanase_zero_shot_des31 11 | # Database: /mnt/data/home/ec2-user/cm-dbs/low_concern/dna/benign.fasta 12 | # 0 hits found 13 | # BLASTN 2.16.0+ 14 | # Query: RVFV_Rift_valley_fever 15 | # Database: /mnt/data/home/ec2-user/cm-dbs/low_concern/dna/benign.fasta 16 | # 0 hits found 17 | # BLASTN 2.16.0+ 18 | # Query: BBa_K209429_A_15261 19 | # Database: /mnt/data/home/ec2-user/cm-dbs/low_concern/dna/benign.fasta 20 | # Fields: query acc., subject title, subject acc., subject tax ids, evalue, bit score, % identity, query length, q. start, q. end, subject length, s. start, s. end 21 | # 9 hits found 22 | BBa_K209429_A_15261 BBa_K209429 A 15261 Composite "SSFYFP-hM2D-Tuba" BBa_K209429 0 0.0 17813 100.000 9646 1 9646 9646 1 9646 23 | BBa_K209429_A_15261 BBa_K209440 A 15305 Composite "pEF1-SSFYFP-hM2D-Tuba" BBa_K209440 0 0.0 12942 100.000 9646 2639 9646 7008 1 7008 24 | BBa_K209429_A_15261 BBa_K209427 A 15259 Composite "SSFYFP-hM2D-ActA(30-612)" BBa_K209427 0 0.0 9070 100.000 9646 1 4911 6649 1 4911 25 | BBa_K209429_A_15261 BBa_K209430 A 15262 Composite "SSFYFP-hM2D-ITSN" BBa_K209430 0 0.0 9068 100.000 9646 1 4910 6586 1 4910 26 | BBa_K209429_A_15261 BBa_K209431 A 15263 Composite "SSFYFP-hM2D-Beta Pix" BBa_K209431 0 0.0 9066 100.000 9646 1 4909 6844 1 4909 27 | BBa_K209429_A_15261 BBa_K209428 A 15260 Composite "SSFYFP-hM2D-Vav" BBa_K209428 0 0.0 9066 100.000 9646 1 4909 7246 1 4909 28 | BBa_K209429_A_15261 BBa_K209445 A 15308 Composite "pEF1-SSFYFP-Rs1.3-Tuba" BBa_K209445 0 0.0 8765 100.000 9646 4901 9646 7019 2274 7019 29 | BBa_K209429_A_15261 BBa_K209424 A 15256 Composite "SSFYFP-Rs1.3-Tuba" BBa_K209424 0 0.0 8765 100.000 9646 4901 9646 9657 4912 9657 30 | BBa_K209429_A_15261 BBa_K209409 A 15183 Coding "AarI C-D part, Tuba" BBa_K209409 0 0.0 8754 100.000 9646 4907 9646 4740 1 4740 31 | # BLAST processed 5 queries 32 | -------------------------------------------------------------------------------- /example_data/README.md: -------------------------------------------------------------------------------- 1 | # Example data 2 | 3 | This directory contains a file, `commec-examples.fasta`, which inludes queries illustrating different possible screening outcomes, as well as the results of running `commec screen` on that file. 4 | 5 | A guide to interpreting these results is provided in the [Tutorial](https://github.com/ibbis-bio/common-mechanism/wiki/tutorial) on the `commec` wiki. 6 | 7 | ### Examples included 8 | 9 | * **BBa_K5108009_creA_** (`WARN`): This is a [composite DNA part](https://parts.igem.org/Part:BBa_K5108009) developed by 2024 iGEM team Toulouse-INSA-UPS for space exploration applications. 
It is an artificial operon composed of four basic parts: creatinase and creatinine amidohydrolase ORFs (creA BBa K5108003, crnA BBa K5108004) and two RBS (BBa K5108006, BBaK5108007) enabling their expression in the plant growth-promoting rhizobacteria, _Pseudomonas fluorescens_, enabling the metabolization of creatinine by this organism. 10 | * **encrypted** (`WARN`): This DNA sequence contains an encrypted message generated using the [CryptoGErM](https://2016.igem.org/Team:Groningen/Tour) algorithm developed by the 2016 iGEM team from Groningen. It is therefore an entirely artificial sequence, with no biological function or related taxonomy across the domains of life. 11 | * **xylanase_zero_shot_des31** (`PASS`): This sequence is one of the xylanase variants used in the zero-shot enzyme activity prediction challenge problem from [Align Bio’s](https://alignbio.org/) 2023 [Protein Engineering Tournament](https://alignbio.org/tournamentpilot-results-2023). Xylanase is an enzyme that degrades the second-most-abundant polysaccharide and should not be flagged. 12 | * **BBa_K209429_A_15261** (`PASS`): This sequence is another [composite DNA part](https://parts.igem.org/Part:BBa_K209429) created by the [igem UCSF team in 2009](https://2009.igem.org/Team:UCSF) with the goal of manipulating signaling pathways to mediate chemotaxis. 13 | * **RVFV_Rift_valley_fever** (`FLAG`): The Rift Valley Fever virus sample is successfully flagged during the taxonomic steps as containing extensive regions of material from regulated organisms, namely nucleocapsid proteins from _Phlebovirus riftense_ AKA Rift Valley Fever virus. 14 | -------------------------------------------------------------------------------- /commec/tests/test_coverage.py: -------------------------------------------------------------------------------- 1 | """ 2 | Simple tests to ensure the correct trimming of overlapping components in a Hmmer database 3 | when parsed to remove_overlaps. 4 | The following behaviour is expected: 5 | * Fully encapsulated hits, should be removed. 6 | * Partially overlapping hits, should both be kept (To maximise extents) 7 | * Hits from different queries are independant in logic. 8 | """ 9 | 10 | import pandas as pd 11 | import pytest 12 | from commec.screeners.check_low_concern import _calculate_coverage 13 | from commec.config.result import MatchRange 14 | 15 | # Test the following hmmer configuration: 16 | # 10-----------------50 (Largest, should stay.) 17 | # 40-------60 (Extends 1. Should stay.) 18 | # 20-------40 (encapsulated, but high score, should stay!) 19 | # 10-----------------50 (lower score than 1, should be removed.) 20 | # 20---30 (Different query, removed) 21 | # 10------------------------------90 (different query) 22 | 23 | # Example DataFrame 24 | example_hmmer_01 = pd.DataFrame({ 25 | "q. start": [100, 50, 0, 0], 26 | "q. end": [200, 150, 100, 200], 27 | }) 28 | 29 | # Example DataFrame 30 | example_hmmer_01_output = pd.DataFrame({ 31 | "q. start": [100, 50, 0, 0], 32 | "q. 
end": [200, 150, 100, 200], 33 | "coverage_nt": [100, 50, 0, 100], 34 | "coverage_ratio": [1.0, 0.5, 0.0, 1.0] 35 | }) 36 | 37 | reg_range_01 = MatchRange(0.0, 100, 200, 100, 200) 38 | 39 | @pytest.mark.parametrize( 40 | "input_hmmer, input_region, expected_output_hmmer", 41 | [ 42 | (example_hmmer_01, reg_range_01, example_hmmer_01_output), 43 | ] 44 | ) 45 | def test_coverage_overlaps( 46 | input_hmmer : pd.DataFrame, 47 | input_region : MatchRange, 48 | expected_output_hmmer : pd.DataFrame 49 | ): 50 | """ 51 | Checks common configurations that require trimming in Hmmer outputs, 52 | In particular partial overlaps, full encapsulations, score differences, and different queries. 53 | """ 54 | trimmed_input = _calculate_coverage(input_hmmer, input_region) 55 | print("INPUT:") 56 | print(input_hmmer) 57 | print("TRIMMED:") 58 | print(trimmed_input) 59 | print("CORRECT:") 60 | print(expected_output_hmmer) 61 | assert trimmed_input.equals(expected_output_hmmer) 62 | -------------------------------------------------------------------------------- /example_data/output_commec-examples/commec-examples.low_concern.hmmscan: -------------------------------------------------------------------------------- 1 | # --- full sequence --- -------------- this domain ------------- hmm coord ali coord env coord 2 | # target name accession tlen query name accession qlen E-value score bias # of c-Evalue i-Evalue score bias from to from to from to acc description of target 3 | #------------------- ---------- ----- -------------------- ---------- ----- --------- ------ ----- --- --- --------- --------- ------ ----- ----- ----- ----- ----- ----- ----- ---- --------------------- 4 | COG0006 - 321 Part:BBa_K5108009_creA_-_1 - 686 9.9e-57 187.1 0.8 1 1 8e-59 1.4e-56 186.6 0.8 3 314 29 402 26 411 0.82 - 5 | COG0024 - 475 Part:BBa_K5108009_creA_-_1 - 686 9.5e-17 55.5 0.0 1 1 8.2e-19 1.4e-16 54.9 0.0 217 428 159 373 29 406 0.86 - 6 | COG0042 - 326 xylanase_zero_shot_des31_1 - 252 0.0017 12.0 0.0 1 2 0.58 1e+02 -3.7 0.0 70 88 136 155 129 163 0.73 - 7 | COG0042 - 326 xylanase_zero_shot_des31_1 - 252 0.0017 12.0 0.0 2 2 1.7e-05 0.0031 11.2 0.0 184 240 193 250 180 252 0.82 - 8 | COG0516 - 472 xylanase_zero_shot_des31_1 - 252 0.0027 11.1 0.0 1 1 2.1e-05 0.0037 10.6 0.0 225 283 156 237 115 241 0.72 - 9 | # 10 | # Program: hmmscan 11 | # Version: 3.4 (Aug 2023) 12 | # Pipeline mode: SCAN 13 | # Query file: /mnt/data/home/ec2-user/analysis/v1.0.0-2025-07/2025-08-06/input_commec-examples/commec-examples.faa 14 | # Target file: /mnt/data/home/ec2-user/cm-dbs/low_concern/protein/benign.hmm 15 | # Option settings: hmmscan --domtblout /mnt/data/home/ec2-user/analysis/v1.0.0-2025-07/2025-08-06/output_commec-examples/commec-examples.low_concern.hmmscan --cpu 12 /mnt/data/home/ec2-user/cm-dbs/low_concern/protein/benign.hmm /mnt/data/home/ec2-user/analysis/v1.0.0-2025-07/2025-08-06/input_commec-examples/commec-examples.faa 16 | # Current dir: /mnt/data/home/ec2-user/analysis/v1.0.0-2025-07 17 | # Date: Wed Aug 6 22:30:55 2025 18 | # [ok] 19 | -------------------------------------------------------------------------------- /commec/utils/coordinates.py: -------------------------------------------------------------------------------- 1 | """ 2 | Helper functions associated with the handling of 3 | basepair|amino-acid, nucleotide|peptide coordinate systems. 
4 | """ 5 | 6 | import numpy as np 7 | 8 | def convert_protein_to_nucleotide_coords(frame, 9 | protein_start, 10 | protein_end, 11 | seq_length): 12 | """ 13 | Convert protein coordinates to nucleotide coordinates considering the reading frame. 14 | 15 | Parameters: 16 | frame (int, or [int]): Reading frame (1-6) 17 | Frames 1-3: Forward frames starting at positions 0, 1, 2 18 | Frames 4-6: Reverse frames starting from the end at positions 0, 1, 2 19 | protein_start (int, or [int]): Start position in protein coordinates, counting from 1. 20 | protein_end (int, or [int]): End position in protein coordinates, counting from 1. 21 | seq_length (int): Length of the original sequence, mandatory for reverse frames (4,5,6) only. 22 | 23 | Returns: 24 | tuple: (nucleotide_start, nucleotide_end) 25 | """ 26 | # Convert protein coordinates to 0-based, for calculation. 27 | protein_start = np.asarray(protein_start, dtype=np.int64) - 1 28 | protein_end = np.asarray(protein_end, dtype=np.int64) - 1 29 | frame = np.asarray(frame, dtype=np.int64) 30 | seq_length = np.asarray(seq_length, dtype=np.int64) 31 | 32 | # Reverse frame offsets, for when total length not divisible into codons. 33 | reverse_offset = seq_length % 3 34 | 35 | # Initialize arrays for nucleotide start and end 36 | nucleotide_start = np.zeros_like(protein_start, dtype=np.int64) 37 | nucleotide_end = np.zeros_like(protein_end, dtype=np.int64) 38 | 39 | # Forward frames (1, 2, 3) 40 | forward_mask = frame <= 3 41 | nucleotide_start[forward_mask] = (protein_start[forward_mask] * 3) + (frame[forward_mask] - 1) 42 | nucleotide_end[forward_mask] = (protein_end[forward_mask] * 3) + 2 + (frame[forward_mask] - 1) 43 | 44 | # Reverse frames (4, 5, 6) 45 | reverse_mask = frame > 3 46 | reverse_frame = frame[reverse_mask] - 3 47 | nuc_start_reverse = (protein_start[reverse_mask] * 3) + (reverse_frame - 1) 48 | nuc_end_reverse = (protein_end[reverse_mask] * 3) + 2 + (reverse_frame - 1) 49 | 50 | nucleotide_start[reverse_mask] = seq_length[reverse_mask] - nuc_end_reverse - 1 + reverse_offset[reverse_mask] 51 | nucleotide_end[reverse_mask] = seq_length[reverse_mask] - nuc_start_reverse - 1 + reverse_offset[reverse_mask] 52 | 53 | # Convert to back to 1-based coordinates for reporting. 
54 | nucleotide_start += 1 55 | nucleotide_end += 1 56 | 57 | return nucleotide_start, nucleotide_end 58 | -------------------------------------------------------------------------------- /commec/tests/test_check_biorisk.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from unittest.mock import patch 3 | import pandas as pd 4 | import os 5 | from Bio.SeqRecord import SeqRecord, Seq 6 | 7 | from commec.screeners.check_biorisk import parse_biorisk_hits, HmmerHandler 8 | from commec.config.result import ScreenResult 9 | from commec.config.query import Query 10 | 11 | INPUT_QUERY = os.path.join(os.path.dirname(__file__), "test_data/single_record.fasta") 12 | DATABASE_DIRECTORY = os.path.join(os.path.dirname(__file__), "test_dbs/") 13 | 14 | @pytest.mark.parametrize( 15 | "annotations_exists, has_empty_output, has_hits, expected_return", 16 | [ 17 | # Case 1: annotations file doesn't exist 18 | (False, False, False, 1), 19 | # Case 2: HMMER output is empty or doesn't exist 20 | (True, True, False, 1), 21 | # Case 3: No hits detected (successful pass) 22 | (True, False, False, 0), 23 | # Case 4: Successful execution with hits 24 | (True, False, True, 0), 25 | ], 26 | ) 27 | def test_check_biorisk_return_codes(annotations_exists, has_empty_output, has_hits, expected_return): 28 | mock_hit_df = pd.DataFrame( 29 | { 30 | "target name": ["test_id"], 31 | "query name": ["testname_1"], 32 | "E-value": [1e-30], 33 | "ali from": [100], 34 | "ali to": [200], 35 | "qlen": [1000], 36 | "frame" : 1 37 | } 38 | ) 39 | 40 | mock_annot_df = pd.DataFrame( 41 | {"ID": ["test_id"], "description": ["test description"], "Must flag": [True]} 42 | ) 43 | 44 | # No filesystem interactions, patch ALL the things 45 | with ( 46 | patch("os.path.exists", return_value=annotations_exists), 47 | patch("pandas.read_csv", return_value=mock_annot_df), 48 | patch("commec.screeners.check_biorisk.readhmmer", return_value=mock_hit_df), 49 | patch("commec.screeners.check_biorisk.remove_overlaps", return_value=mock_hit_df), 50 | patch("commec.screeners.check_biorisk.HmmerHandler.has_empty_output", return_value=has_empty_output), 51 | patch("commec.screeners.check_biorisk.HmmerHandler.has_hits", return_value=has_hits), 52 | ): 53 | handler = HmmerHandler(DATABASE_DIRECTORY + "biorisk/biorisk.hmm", INPUT_QUERY, "/mock/path/test.hmmscan") 54 | results = ScreenResult() 55 | queries : dict[str,Query] = {"testname" : Query(SeqRecord(Seq("atgatgatgatgatgatgatg"),"testname","testname"))} 56 | # Run the function - input paths are unused given all the mocking above 57 | result = parse_biorisk_hits(handler, "/mock/path/biorisk/biorisk_annotations.csv", results, queries) 58 | 59 | # Check the result 60 | assert result == expected_return -------------------------------------------------------------------------------- /commec/tests/test_screen_io.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from unittest.mock import mock_open, patch 3 | import os 4 | 5 | from commec.config.screen_io import ScreenIO, IoValidationError 6 | from commec.screen import add_args, ScreenArgumentParser 7 | 8 | 9 | @pytest.fixture 10 | def test_data_dir(): 11 | return os.path.join(os.path.dirname(__file__), "test_data") 12 | 13 | 14 | @pytest.fixture 15 | def database_dir(): 16 | return os.path.join(os.path.dirname(__file__), "test_dbs") 17 | 18 | 19 | @pytest.mark.parametrize( 20 | "fasta_name", 21 | [ 22 | "single_record.fasta", 23 | 
"multiple_records.fasta", 24 | "has_empty_record.fasta", 25 | "has_empty_description.fasta", 26 | "has_records_with_same_description.fasta", 27 | ], 28 | ) 29 | def test_default_parameters(fasta_name, test_data_dir, database_dir, tmp_path): 30 | input_fasta = os.path.join(test_data_dir, fasta_name) 31 | with patch( 32 | "sys.argv", 33 | ["test.py", "--skip-tx", input_fasta, "-d", database_dir, "-o", str(tmp_path)], 34 | ): 35 | parser = ScreenArgumentParser() 36 | add_args(parser) 37 | screen_io = ScreenIO(parser.parse_args()) 38 | assert screen_io.setup() 39 | 40 | 41 | @pytest.mark.parametrize( 42 | "fasta_name,expected_record_count", 43 | [ 44 | pytest.param("single_record.fasta", 1), 45 | pytest.param("multiple_records.fasta", 2), 46 | ], 47 | ) 48 | def test_parse_input_fasta( 49 | fasta_name, expected_record_count, test_data_dir, database_dir, tmp_path 50 | ): 51 | input_fasta = os.path.join(test_data_dir, fasta_name) 52 | with patch( 53 | "sys.argv", 54 | ["test.py", "--skip-tx", input_fasta, "-d", database_dir, "-o", str(tmp_path)], 55 | ): 56 | parser = ScreenArgumentParser() 57 | add_args(parser) 58 | screen_io = ScreenIO(parser.parse_args()) 59 | screen_io.setup() 60 | 61 | queries = screen_io.parse_input_fasta() 62 | assert len(queries) == expected_record_count 63 | 64 | 65 | @pytest.mark.parametrize( 66 | "fasta_name", 67 | [ 68 | "has_empty_record.fasta", 69 | "has_empty_description.fasta", 70 | "has_records_with_same_description.fasta", 71 | ], 72 | ) 73 | def test_parse_invalid_input_fasta(fasta_name, test_data_dir, database_dir, tmp_path): 74 | input_fasta = os.path.join(test_data_dir, fasta_name) 75 | with patch( 76 | "sys.argv", 77 | ["test.py", "--skip-tx", input_fasta, "-d", database_dir, "-o", str(tmp_path)], 78 | ): 79 | parser = ScreenArgumentParser() 80 | add_args(parser) 81 | screen_io = ScreenIO(parser.parse_args()) 82 | screen_io.setup() 83 | 84 | with pytest.raises(IoValidationError): 85 | screen_io.parse_input_fasta() -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # commec: a free, open-source, globally available tool for DNA sequence screening 2 | 3 | 4 | 5 | commec logo 6 | 7 | 8 | The `commec` package is a tool for DNA sequence screening that is part of the 9 | [Common Mechanism for DNA Synthesis screening](https://ibbis.bio/common-mechanism/). The package offers several sub-commands through the `commec` entrypoint: 10 | 11 | screen Run Common Mechanism screening on an input FASTA. 12 | flag Parse .screen.json files in a directory and create a CSV file of outcomes 13 | setup A command-line helper tool to download the required databases 14 | split Split a multi-record FASTA file into individual files, one for each record 15 | 16 | The `commec screen` command runs an input FASTA through the following screening steps: 17 | 18 | 1. **Biorisk search**: Fast HMM-based search against curated sequence profiles 19 | 2. **Taxonomy Search**: look for best matches to regulated pathogens using a two-step process: 20 | * **Protein search**: BLASTX/DIAMOND search against NCBI nr 21 | * **Nucleotide search**: BLASTN search against NCBI core_nt 22 | 3. 
**Low concern search**: Clear earlier flags based on matches to common or conserved sequences 23 | 24 | ![Flowchart showing decision-making by the common mechanism flag module.](https://ibbis.bio/wp-content/uploads/2025/08/commec-screening-flow-v1.jpg "Decision Flow") 25 | 26 | Information about the databases supporting screening can be found in the [commec-databases](https://github.com/ibbis-bio/commec-databases/) repository. 27 | 28 | ## Documentation 29 | The [GitHub Wiki](https://github.com/ibbis-screening/common-mechanism/wiki) has documentation for this package, including information about installing `commec` and interpreting screening results. 30 | 31 | More information about the Common Mechanism project is available on the [IBBIS project page](https://ibbis.bio/common-mechanism/) and [Common Mechanism FAQ](https://ibbis.bio/our-work/common-mechanism/faq/). 32 | 33 | ## Development 34 | The `commec` package is being actively developed by IBBIS staff. We welcome contributions! To get started, install conda, and make sure 35 | that [your channels are configured correctly](http://bioconda.github.io/). Then create the dev environment with: 36 | 37 | ``` 38 | conda env create -f environment.yaml 39 | conda activate commec-dev 40 | ``` 41 | 42 | From here, you should have an editable install of the package (via `pip install -e .`) and the necessary shell dependencies. 43 | -------------------------------------------------------------------------------- /commec/split.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) 2021-2024 International Biosecurity and Biosafety Initiative for Science 3 | """ 4 | Split a multi-record FASTA file into individual files, one for each record. 5 | 6 | Command-line usage: 7 | split.py input.fasta 8 | """ 9 | import argparse 10 | import os 11 | import string 12 | from Bio import SeqIO 13 | from commec.utils.file_utils import file_arg 14 | 15 | VALID_FILENAME_CHARS = f"-._{string.ascii_letters}{string.digits}" 16 | DESCRIPTION = ( 17 | "Split a multi-record FASTA file into individual files, one for each record" 18 | ) 19 | 20 | 21 | def add_args(parser): 22 | """ 23 | Add module arguments to an ArgumentParser object. 24 | """ 25 | parser.add_argument( 26 | action="store", dest="fasta_file", type=file_arg, help="Input fasta file" 27 | ) 28 | return parser 29 | 30 | 31 | def clean_description(description): 32 | """ 33 | Cleans the description from a sequence record for use as part of a filename. 34 | """ 35 | cleaned = description.strip() 36 | cleaned = "".join(x for x in cleaned if x in VALID_FILENAME_CHARS) 37 | if len(cleaned) > 150: 38 | cleaned = cleaned[:150] 39 | return cleaned 40 | 41 | 42 | def write_split_fasta(fasta_file): 43 | """ 44 | Parse all sequence records in an input FASTA file, and write a new file for each record.
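    Each record is written alongside the input as `<cleaned description>.fasta`; if the
    cleaned description is empty or matches the input filename, the fallback name
    `<input name>-split-<index>.fasta` is used instead (see the handling below).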
45 | """ 46 | output_dir = os.path.dirname(fasta_file) 47 | fasta_name = os.path.splitext(os.path.basename(fasta_file))[0] 48 | 49 | with open(fasta_file, "r", encoding="utf-8") as input_file: 50 | for i, record in enumerate(SeqIO.parse(input_file, "fasta")): 51 | desc = clean_description(record.description) 52 | 53 | # Handle empty descriptions and avoid overwriting input files 54 | if not desc or desc == fasta_name: 55 | output_basename = f"{fasta_name}-split-{i}.fasta" 56 | else: 57 | output_basename = f"{desc}.fasta" 58 | 59 | output_path = os.path.join(output_dir, output_basename) 60 | with open(output_path, "w", encoding="utf-8") as output_file: 61 | output_file.write(f">{desc}{os.linesep}") 62 | output_file.write(str(record.seq)) 63 | 64 | 65 | def run(parsed_args): 66 | """ 67 | Wrapper so that args be parsed in main() or commec.py interface. 68 | """ 69 | write_split_fasta(parsed_args.fasta_file) 70 | 71 | 72 | def main(): 73 | """ 74 | Main function. Passes FASTA file to `write_split_fasta`. 75 | 76 | Arguments: 77 | - fasta_file: Path to the input FASTA file. 78 | 79 | """ 80 | parser = argparse.ArgumentParser(description=DESCRIPTION) 81 | add_args(parser) 82 | run(parser.parse_args()) 83 | 84 | 85 | if __name__ == "__main__": 86 | main() 87 | -------------------------------------------------------------------------------- /commec/tests/test_data/screen-files/prot-nt-hits-cleared-2024-09.screen: -------------------------------------------------------------------------------- 1 | >> STEP 1: Checking for biorisk genes... 2 | --> Biorisks: no hits detected, PASS 3 | STEP 1 completed at 2024-09-03 05:57:18 4 | >> STEP 2: Checking regulated pathogen proteins... 5 | --> Best match to sequence(s) NDS77015 at bases 89 - 268 found in only regulated organisms: FLAG (bacteria) 6 | Species: Francisella tularensis (taxid(s): 119857) (98.333 percent identity to query) 7 | Description: hypothetical protein [Francisella tularensis subsp. holarctica] 8 | --> Best match to sequence(s) CEE24912 at bases 391 - 501 found in only regulated organisms: FLAG (bacteria) 9 | Species: Xanthomonas citri (taxid(s): 611301, 487854) (75.758 percent identity to query) 10 | Description: conserved hypothetical protein [Xanthomonas citri pv. citri] 11 | STEP 2 completed at 2024-09-03 06:42:50 12 | >> STEP 3: Checking regulated pathogen nucleotides... 13 | ...protein hits found, fetching nt regions not covered by a 90% ID hit or better 14 | ...Regulated protein region at bases 89 to 268 overlapped with a nucleotide hit 15 | Species: Francisella tularensis (taxid(s): 263) (100.0 percent identity to query) 16 | ...Regulated protein region at bases 391 to 501 overlapped with a nucleotide hit 17 | Species: Francisella tularensis (taxid(s): 263) (100.0 percent identity to query) 18 | --> Best match to sequence(s) AF143093 at bases 1 - 411 found in only regulated organisms: FLAG (bacteria) 19 | Species: Francisella tularensis (taxid(s): 263) (100.0 percent identity to query) 20 | Description: Francisella tularensis 16S ribosomal RNA gene, partial sequence 21 | STEP 3 completed at 2024-09-03 11:29:43 22 | >> STEP 4: Checking any pathogen regions for benign components... 
23 | ...no housekeeping protein hits 24 | -->Housekeeping RNAs - <50 bases unaccounted for: PASS 25 | RNA family: SSU_rRNA_bacteria 26 | -->Housekeeping RNAs - <50 bases unaccounted for: PASS 27 | RNA family: SSU_rRNA_bacteria 28 | -->Housekeeping RNAs - <50 bases unaccounted for: PASS 29 | RNA family: SSU_rRNA_bacteria 30 | -->Synbio sequences - >80% coverage achieved = PASS 31 | Synbio parts: BBa_I1100 P 3987 Composite "SBWB Ribosome Test System" 32 | BBa_C0101 P 3986 Coding "16s rRNA (non-standard 5%27 end, matched to B0035)" 33 | -->Synbio sequences - >80% coverage achieved = PASS 34 | Synbio parts: BBa_I1100 P 3987 Composite "SBWB Ribosome Test System" 35 | BBa_C0101 P 3986 Coding "16s rRNA (non-standard 5%27 end, matched to B0035)" 36 | -->Synbio sequences - >80% coverage achieved = PASS 37 | Synbio parts: BBa_I1100 P 3987 Composite "SBWB Ribosome Test System" 38 | BBa_C0101 P 3986 Coding "16s rRNA (non-standard 5%27 end, matched to B0035)" 39 | 40 | -->all regulated regions cleared: PASS 41 | >> COMPLETED AT 2024-09-03 11:29:50 42 | -------------------------------------------------------------------------------- /commec/tests/test_nc_to_nt.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from Bio.Seq import Seq 3 | from Bio.SeqRecord import SeqRecord 4 | from commec.config.query import Query, QueryValueError 5 | 6 | @pytest.fixture 7 | def seq_record(): 8 | """ 9 | Fixture to generate a SeqRecord with 10 | defined coding ('t') and non-coding ('a') regions. 11 | Total size = 150 12 | non-coding regions = 100 13 | """ 14 | non_coding_1 = "a" * 50 15 | coding_1 = "t" * 20 16 | non_coding_2 = "a" * 30 17 | coding_2 = "t" * 30 18 | non_coding_3 = "a" * 20 # Final non-coding segment 19 | 20 | sequence = non_coding_1 + coding_1 + non_coding_2 + coding_2 + non_coding_3 21 | return SeqRecord(Seq(sequence), id="test_seq", description="") 22 | 23 | @pytest.fixture 24 | def non_coding_regions(): 25 | """ 26 | Fixture to generate non-coding region tuples based on the sequence definition. 27 | Uses the same lengths as in seq_record() to compute (start, end) values. 28 | """ 29 | regions = [ 30 | (1, 50), # First 'a' region 31 | (71, 100), # Second 'a' region (starts after first coding region) 32 | (131, 150) # Third 'a' region (starts after second coding region) 33 | ] 34 | return regions 35 | 36 | # 0 based coordinates: 37 | # NT COORDS: 0 - 49 50 - 69 70 - 99 100 - 129 130 - 149 38 | # NC COORDS: 0 - 49 50 - 79 80 - 99 39 | 40 | # 1 based coordinates: 41 | # NT COORDS: 1 - 50 51 - 70 71 - 100 101 - 130 131 - 150 42 | # NC COORDS: 1 - 50 51 - 80 81 - 100 43 | 44 | @pytest.fixture 45 | def test_cases(): 46 | """ 47 | Fixture providing a list of (input_coordinate, expected_output) tuples. 48 | The input is a sequence coordinate, and the expected output is its transformed coordinate. 49 | """ 50 | return [ 51 | (1, 1), 52 | (10, 10), 53 | (50, 50), 54 | (51, 71), 55 | (80, 100), 56 | (81, 131), 57 | (99, 149), 58 | (100, 150), 59 | ] 60 | 61 | def test_coordinate_conversion(seq_record, non_coding_regions, test_cases): 62 | """ 63 | Placeholder test function for coordinate conversion. 64 | """ 65 | # Query setup 66 | test_query : Query = Query(seq_record) 67 | test_query.non_coding_regions = non_coding_regions 68 | 69 | # Test Correct coords: 70 | for nc, nt in test_cases: 71 | assert nt == test_query.nc_to_nt_query_coords(nc) 72 | 73 | # Test Failure out of bounds. 
74 | try: 75 | _x = test_query.nc_to_nt_query_coords(test_cases[-1][0]+1) 76 | assert False 77 | except QueryValueError: 78 | assert True 79 | 80 | # Test Failure out of bounds. 81 | try: 82 | _x = test_query.nc_to_nt_query_coords(0) 83 | assert False 84 | except QueryValueError: 85 | assert True 86 | 87 | -------------------------------------------------------------------------------- /commec/tools/blastn.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) 2021-2024 International Biosecurity and Biosafety Initiative for Science 3 | """ 4 | Handler for BLASTN search of nucleotide databases using nucleotide queries. 5 | Initialise with local input database, fasta to screen, and output file. 6 | Throws error if inputs are invalid. Creates a temporary log file, which is deleted on completion. 7 | """ 8 | 9 | import subprocess 10 | from commec.tools.blast_tools import BlastHandler 11 | from commec.tools.search_handler import SearchToolVersion 12 | 13 | 14 | class BlastNHandler(BlastHandler): 15 | """ 16 | A search handler specifically for BLASTN command-line during commec screening. 17 | Modify `arguments_dictionary` to change passed to the command line call. 18 | """ 19 | 20 | def __init__( 21 | self, database_file: str, input_file: str, out_file: str, **kwargs, 22 | ): 23 | super().__init__(database_file, input_file, out_file, **kwargs) 24 | # We fill this with defaults, however they can always be overridden before screening. 25 | self.arguments_dictionary = { 26 | "-outfmt": [ 27 | "7", 28 | "qacc", 29 | "stitle", 30 | "sacc", 31 | "staxids", 32 | "evalue", 33 | "bitscore", 34 | "pident", 35 | "qlen", 36 | "qstart", 37 | "qend", 38 | "slen", 39 | "sstart", 40 | "send", 41 | ], 42 | "-num_threads": self.threads, 43 | "-evalue": 10, 44 | "-max_target_seqs": 50, 45 | "-culling_limit": 5, 46 | } 47 | self.blastcall = "blastn" 48 | 49 | def _search(self): 50 | command = [ 51 | self.blastcall, 52 | "-db", 53 | self.db_file, 54 | "-query", 55 | self.input_file, 56 | "-out", 57 | self.out_file, 58 | ] 59 | command.extend(self.format_args_for_cli()) 60 | self.run_as_subprocess(command, self.temp_log_file) 61 | 62 | def get_version_information(self) -> SearchToolVersion: 63 | try: 64 | result = subprocess.run( 65 | ["blastn", "-version"], capture_output=True, text=True, check=True 66 | ) 67 | tool_info = result.stdout.strip() 68 | 69 | result = subprocess.run( 70 | ["blastdbcmd", "-info", "-db", self.db_file, "-dbtype", "nucl"], 71 | capture_output=True, 72 | text=True, 73 | check=True, 74 | ) 75 | lines = result.stdout.splitlines() 76 | database_info: str = lines[5] + lines[3] 77 | 78 | return SearchToolVersion(tool_info, database_info) 79 | except (subprocess.CalledProcessError, FileNotFoundError): 80 | return SearchToolVersion() -------------------------------------------------------------------------------- /dev_scripts/run_blastx.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # Copyright (c) 2021-2024 International Biosecurity and Biosafety Initiative for Science 3 | ##################################################################### 4 | #run_blastx.sh runs blastx against a specified database. 
5 | ##################################################################### 6 | #Usage: run_blastx.sh -d DB -q QUERY -o OUTPUT [-t THREADS -f FURTHEROPT] 7 | 8 | #set -eux #debug mode 9 | set -eu 10 | DB="" 11 | QUERY="" 12 | OUTPUT="out" 13 | THREADS=1 14 | FURTHEROPT="" 15 | 16 | #Get options from user 17 | while getopts "t:d:q:o:f:" OPTION 18 | do 19 | case $OPTION in 20 | t) 21 | THREADS=$OPTARG 22 | ;; 23 | d) 24 | DB=$OPTARG 25 | ;; 26 | q) 27 | QUERY=$OPTARG 28 | ;; 29 | o) 30 | OUTPUT=$OPTARG 31 | ;; 32 | f) 33 | FURTHEROPT=$OPTARG 34 | ;; 35 | \?) 36 | echo "Usage: run_blastx.sh -d DB -q QUERY -o OUTPUT [-t THREADS -f FURTHEROPT]" 37 | echo " DB full path to database (required)" 38 | echo " QUERY query file to align to each database (required)" 39 | echo " OUTPUT output prefix for alignments (default: out)" 40 | echo " THREADS number of threads for each database run (default: 1)" 41 | echo " FURTHEROPT any further options to specify" 42 | exit 43 | ;; 44 | esac 45 | done 46 | 47 | #Check for values 48 | if [ "$DB" == "" ] || [ "$QUERY" == "" ] 49 | then 50 | echo "Usage: run_blastx.sh -d DB -q QUERY -o OUTPUT [-t THREADS -f FURTHEROPT]" 51 | echo " DB full path to database (required)" 52 | echo " QUERY query file to align to each database (required)" 53 | echo " OUTPUT output prefix for alignments (default: out)" 54 | echo " THREADS number of threads for each database run (default: 1)" 55 | echo " FURTHEROPT any further options to specify" 56 | exit 57 | fi 58 | 59 | #Check for database 60 | echo -e "\t...checking for valid options..." 61 | if [ ! -f "$DB".pal ] # this is v. blast specific 62 | then 63 | echo " ERROR: blastx database $DB does not exist" 64 | exit 65 | fi 66 | 67 | #Check for input file 68 | if [ ! -f "$QUERY" ] 69 | then 70 | echo " ERROR: input file $QUERY does not exist" 71 | exit 72 | fi 73 | 74 | echo -e "\t...running protein search..."
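# Note: the flags passed to blastx below mirror the BlastXHandler defaults in
# commec/tools/blastx.py (e-value 1e-10, word size 6, threshold 21, BLOSUM62, etc.);
# if either set of parameters changes, keep the two in sync.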
75 | blastx -db ${DB} -query ${QUERY} -out ${OUTPUT}.blastx -outfmt "7 qacc stitle sacc staxids evalue bitscore pident qlen qstart qend slen sstart send" -evalue 1e-10 -word_size 6 -threshold 21 -max_target_seqs 5000 -culling_limit 50 -window_size 40 -matrix BLOSUM62 -gapopen 11 -gapextend 1 -seg yes -num_threads ${THREADS} ${FURTHEROPT} 76 | -------------------------------------------------------------------------------- /commec/tests/test_split.py: -------------------------------------------------------------------------------- 1 | import os 2 | from unittest.mock import mock_open, patch 3 | import pytest 4 | from Bio import SeqIO 5 | from commec.split import clean_description, write_split_fasta 6 | 7 | 8 | @pytest.fixture 9 | def test_data_dir(): 10 | return os.path.join(os.path.dirname(__file__), "test_data") 11 | 12 | 13 | @pytest.fixture 14 | def fasta_records(test_data_dir): 15 | """Fixture to parse records from multiple FASTA files into a dictionary.""" 16 | files = [ 17 | "multiple_records.fasta", 18 | "single_record.fasta", 19 | "has_empty_record.fasta", 20 | "has_empty_description.fasta", 21 | ] 22 | record_dict = {} 23 | for filename in files: 24 | file_path = os.path.join(test_data_dir, filename) 25 | with open(file_path, "r", encoding="utf-8") as input_file: 26 | records = list(SeqIO.parse(input_file, "fasta")) 27 | record_dict[filename] = records 28 | return record_dict 29 | 30 | 31 | @pytest.mark.parametrize( 32 | "description, expected", 33 | [ 34 | ( 35 | 'BBa_K620001_P_22737_Coding_"WT-F87A_(p450)"', 36 | "BBa_K620001_P_22737_Coding_WT-F87A_p450", 37 | ), 38 | ("long description" * 20, "longdescription" * 10), 39 | ("", ""), 40 | ], 41 | ) 42 | def test_clean_description(description, expected): 43 | assert clean_description(description) == expected 44 | 45 | 46 | @pytest.mark.parametrize( 47 | "filename", 48 | [ 49 | "multiple_records.fasta", 50 | "single_record.fasta", 51 | "has_empty_record.fasta", 52 | "has_empty_description.fasta", 53 | ], 54 | ) 55 | @patch("builtins.open", new_callable=mock_open) 56 | @patch("os.path.join", side_effect=lambda a, b: f"{a}/{b}") 57 | @patch("commec.split.SeqIO.parse") 58 | def test_write_split_fasta( 59 | mock_seqio_parse, 60 | mock_os_path_join, 61 | mock_open, 62 | filename, 63 | test_data_dir, 64 | fasta_records, 65 | ): 66 | filepath = os.path.join(test_data_dir, filename) 67 | records = fasta_records[filename] 68 | mock_seqio_parse.return_value = records 69 | write_split_fasta(filepath) 70 | 71 | # Check the correct number of output files were opened (one input + as many outputs as records) 72 | assert mock_open.call_count == len(records) + 1 73 | 74 | for record in records: 75 | desc = clean_description(record.description) 76 | 77 | if desc: 78 | output_filename = f"{desc}.fasta" 79 | else: 80 | output_filename = f"{os.path.splitext(filename)[0]}-split-0.fasta" 81 | 82 | mock_os_path_join.assert_any_call(os.path.dirname(filepath), output_filename) 83 | mock_open.assert_any_call( 84 | os.path.join(os.path.dirname(filepath), output_filename), 85 | "w", 86 | encoding="utf-8", 87 | ) 88 | mock_open().write.assert_any_call(f">{desc}{os.linesep}") 89 | mock_open().write.assert_any_call(f"{record.seq}") 90 | -------------------------------------------------------------------------------- /dev_scripts/run_diamond.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # Copyright (c) 2021-2024 International Biosecurity and Biosafety Initiative for Science 3 | 
##################################################################### 4 | # run_diamond.sh runs DIAMOND against a specified NCBI nr database. 5 | # DIAMOND citation: 6 | # Buchfink B, Reuter K, Drost HG, "Sensitive protein alignments at 7 | # tree-of-life scale using DIAMOND", 8 | # Nature Methods 18, 366–368 (2021). 9 | # doi:10.1038/s41592-021-01101-x 10 | ##################################################################### 11 | 12 | set -eu 13 | 14 | # Default values 15 | JOBS="" 16 | THREADS=1 17 | DB_PATH="" 18 | INPUT="" 19 | OUTPUT="out" 20 | 21 | usage() { 22 | echo "Usage: run_diamond.sh -d MY_DB -i INPUT_FILE [-o OUTPUT_FILE] [-j JOBS] [-t THREADS]" 23 | echo " MY_DB location of NCBI nr database (required)" 24 | echo " INPUT_FILE input file to align to each database (required)" 25 | echo " OUTPUT_FILE output prefix for alignments (default: out)" 26 | echo " JOBS number of diamond runs to do in parallel (default: # CPUs / THREADS)" 27 | echo " THREADS number of threads for each diamond run (default: 1)" 28 | exit 1 29 | } 30 | 31 | # Parse command line arguments 32 | while getopts "j:t:d:i:o:" opt; do 33 | case $opt in 34 | j) JOBS=$OPTARG ;; 35 | t) THREADS=$OPTARG ;; 36 | d) DB_PATH=$OPTARG ;; 37 | i) INPUT=$OPTARG ;; 38 | o) OUTPUT=$OPTARG ;; 39 | \?) usage ;; 40 | esac 41 | done 42 | 43 | # Check for required arguments 44 | if [[ -z "$DB_PATH" || -z "$INPUT" ]]; then 45 | usage 46 | fi 47 | 48 | # Validate input 49 | if [[ ! -d "$DB_PATH" ]]; then 50 | echo "ERROR: nr diamond database folder $DB_PATH does not exist" >&2 51 | exit 1 52 | fi 53 | 54 | shopt -s failglob 55 | if ! files=("${DB_PATH}"/nr*.dmnd); then 56 | echo "ERROR: No nr diamond database files (nr*.dmnd) found in $DB_PATH" >&2 57 | exit 1 58 | fi 59 | shopt -u failglob 60 | 61 | if [[ ! -f "$INPUT" ]]; then 62 | echo "ERROR: input file $INPUT does not exist" >&2 63 | exit 1 64 | fi 65 | 66 | # Set JOBS if not specified by user 67 | if [[ -z "$JOBS" ]]; then 68 | CPU_COUNT=$(parallel --number-of-cpus) 69 | JOBS=$((CPU_COUNT / THREADS)) 70 | fi 71 | 72 | echo "Running diamond protein search..." 73 | echo "Using $JOBS job(s) in parallel with $THREADS thread(s) each" 74 | 75 | # Run diamond 76 | ls ${DB_PATH}/nr*.dmnd | parallel --will-cite --use-cpus-instead-of-cores --jobs ${JOBS} \ 77 | diamond blastx --quiet \ 78 | -d {} \ 79 | --threads ${THREADS} \ 80 | -q ${INPUT} \ 81 | -o ${OUTPUT}.{/.}.tsv \ 82 | --outfmt 6 qseqid stitle sseqid staxids evalue bitscore pident qlen qstart qend slen sstart send \ 83 | --frameshift 15 --range-culling 84 | 85 | # Combine results and clean up 86 | cat ${OUTPUT}.*.tsv > ${OUTPUT}.dmnd 87 | rm ${OUTPUT}.*.tsv 88 | 89 | echo "Diamond protein search completed. Results are in ${OUTPUT}.dmnd" 90 | -------------------------------------------------------------------------------- /commec/tools/blastx.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) 2021-2024 International Biosecurity and Biosafety Initiative for Science 3 | """ 4 | Handler for BLASTX search of protein databases using nucleotide queries. 5 | Initialise with local input database, fasta to screen, and output file. 6 | Throws error if inputs are invalid. Creates a temporary log file, which is deleted on completion. 
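A minimal usage sketch (file paths here are placeholders):

    handler = BlastXHandler("path/to/nr", "queries.fasta", "queries.nr.blastx")
    handler.arguments_dictionary["-evalue"] = 1e-20  # optionally override a default before searching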
7 | """ 8 | 9 | import subprocess 10 | from commec.tools.blast_tools import BlastHandler 11 | from commec.tools.search_handler import SearchToolVersion 12 | 13 | 14 | class BlastXHandler(BlastHandler): 15 | """ 16 | A search handler specifically for BLASTX command-line during commec screening. 17 | Modify `arguments_dictionary` to change arguments passed to the CLI. 18 | """ 19 | 20 | def __init__( 21 | self, database_file: str, input_file: str, out_file: str, **kwargs, 22 | ): 23 | super().__init__(database_file, input_file, out_file, **kwargs) 24 | # We fill this with defaults, however they can always be overridden before screening. 25 | self.arguments_dictionary = { 26 | "-num_threads": self.threads, 27 | "-evalue": 1e-10, 28 | "-word_size": 6, 29 | "-threshold": 21, 30 | "-max_target_seqs": 5000, 31 | "-culling_limit": 50, 32 | "-window_size": 40, 33 | "-matrix": "BLOSUM62", 34 | "-gapopen": 11, 35 | "-gapextend": 1, 36 | "-seg": "yes", 37 | "-outfmt": [ 38 | "7", 39 | "qacc", 40 | "stitle", 41 | "sacc", 42 | "staxids", 43 | "evalue", 44 | "bitscore", 45 | "pident", 46 | "qlen", 47 | "qstart", 48 | "qend", 49 | "slen", 50 | "sstart", 51 | "send", 52 | ], 53 | } 54 | self.blastcall = "blastx" 55 | 56 | def _search(self): 57 | command = [ 58 | self.blastcall, 59 | "-db", 60 | self.db_file, 61 | "-query", 62 | self.input_file, 63 | "-out", 64 | self.out_file, 65 | ] 66 | command.extend(self.format_args_for_cli()) 67 | self.run_as_subprocess(command, self.temp_log_file) 68 | 69 | def get_version_information(self) -> SearchToolVersion: 70 | try: 71 | result = subprocess.run( 72 | ["blastx", "-version"], capture_output=True, text=True, check=True 73 | ) 74 | tool_info = result.stdout.strip() 75 | 76 | result = subprocess.run( 77 | ["blastdbcmd", "-info", "-db", self.db_file, "-dbtype", "prot"], 78 | capture_output=True, 79 | text=True, 80 | check=True, 81 | ) 82 | lines = result.stdout.splitlines() 83 | database_info: str = lines[5] + lines[3] 84 | 85 | return SearchToolVersion(tool_info, database_info) 86 | 87 | except (subprocess.CalledProcessError, FileNotFoundError): 88 | return SearchToolVersion() 89 | -------------------------------------------------------------------------------- /commec/cli.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) 2021-2024 International Biosecurity and Biosafety Initiative for Science 3 | """ 4 | Command-line entrypoint for the package. Calls `screen.py`, `flag.py` and `split.py` as subcommands. 5 | 6 | The subcommands: 7 | screen Run Common Mechanism screening on an input FASTA. 
8 | flag Parse all .screen files in a directory and create two CSVs file of flags raised 9 | split Split a multi-record FASTA file into individual files, one for each record 10 | 11 | Command-line usage: 12 | - commec screen -d /path/to/databases input.fasta 13 | - commec flag /path/to/directory/with/output.screen 14 | - commec split input.fasta 15 | - commec -h, --help 16 | - commec -v, --version 17 | """ 18 | from commec.flag import ( 19 | DESCRIPTION as flag_DESCRIPTION, 20 | add_args as flag_add_args, 21 | run as flag_run, 22 | ) 23 | from commec.screen import ( 24 | DESCRIPTION as screen_DESCRIPTION, 25 | add_args as screen_add_args, 26 | run as screen_run, 27 | ScreenArgumentParser 28 | ) 29 | from commec.split import ( 30 | DESCRIPTION as split_DESCRIPTION, 31 | add_args as split_add_args, 32 | run as split_run, 33 | ) 34 | from commec.setup import ( 35 | DESCRIPTION as setup_DESCRIPTION, 36 | add_args as setup_add_args, 37 | run as setup_run, 38 | ) 39 | 40 | from commec import __version__ as COMMEC_VERSION 41 | 42 | def main(): 43 | """ 44 | Parse the command line arguments and call the relevant sub-command. 45 | """ 46 | parser = ScreenArgumentParser( 47 | prog="commec", description="Command-line entrypoint for the Common Mechanism" 48 | ) 49 | # Sub argument for version information 50 | parser.add_argument( 51 | "-v", 52 | "--version", 53 | dest="version", 54 | action="store_true", 55 | default=False, 56 | help="show version information and exit", 57 | ) 58 | 59 | # Setup sub parsers: 60 | subparsers = parser.add_subparsers(dest="command") 61 | 62 | # Sub-command for "screen" 63 | screen_parser = subparsers.add_parser("screen", description=screen_DESCRIPTION) 64 | screen_add_args(screen_parser) 65 | 66 | # Sub-command for "flag" 67 | flag_parser = subparsers.add_parser("flag", description=flag_DESCRIPTION) 68 | flag_add_args(flag_parser) 69 | 70 | # Sub-command for "split" 71 | split_parser = subparsers.add_parser("split", description=split_DESCRIPTION) 72 | split_add_args(split_parser) 73 | 74 | # Sub-command for "setup" 75 | setup_parser = subparsers.add_parser("setup", description=setup_DESCRIPTION) 76 | setup_add_args(setup_parser) 77 | 78 | args = parser.parse_args() 79 | 80 | if args.command == "screen": 81 | screen_run(args) 82 | elif args.command == "flag": 83 | flag_run(args) 84 | elif args.command == "split": 85 | split_run(args) 86 | elif args.command == "setup": 87 | setup_run(args) 88 | elif args.version: 89 | print( "Commec : The Common Mechanism\n" 90 | f"Version : {COMMEC_VERSION}\n" 91 | "Copyright IBBIS (c) 2021-2025\n" 92 | "International Biosecurity and Biosafety Initiative for Science") 93 | else: 94 | parser.print_help() 95 | 96 | 97 | if __name__ == "__main__": 98 | main() 99 | -------------------------------------------------------------------------------- /commec/tools/cmscan.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) 2021-2024 International Biosecurity and Biosafety Initiative for Science 3 | """ 4 | Cmscan search handler, and calling cmscan command line interface. 5 | Additional methods for reading handler output, readcmscan, which returns a pandas database. 6 | Instantiate a CmscanHandler, with input local database, input fasta, and output file. 7 | Throws if inputs are invalid. Creates a temporary log file, which is deleted on completion. 
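For example (paths are placeholders), CmscanHandler("benign.cm", "queries.fasta", "out.cmscan")
exposes read_output(), which returns the parsed table with columns renamed to the blast-style
names used by the rest of the pipeline ("q. start", "q. end", "subject title", "evalue", ...).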
8 | """ 9 | import subprocess 10 | import re 11 | import pandas as pd 12 | from commec.tools.search_handler import SearchHandler, SearchToolVersion 13 | 14 | 15 | class CmscanHandler(SearchHandler): 16 | """A search handler specifically for running Infernal cmscan against covariance model databases during commec screening.""" 17 | 18 | def _search(self): 19 | command = [ 20 | "cmscan", 21 | "--cpu", 22 | str(self.threads), 23 | "--tblout", 24 | self.out_file, 25 | self.db_file, 26 | self.input_file, 27 | ] 28 | self.run_as_subprocess(command, self.temp_log_file) 29 | 30 | def read_output(self): 31 | output_dataframe = readcmscan(self.out_file) 32 | # Standardize the output column names to be like blast: 33 | output_dataframe = output_dataframe.rename(columns={ 34 | "seq from": "q. start", 35 | "seq to": "q. end", 36 | "coverage": "q. coverage", 37 | "target name": "subject title", 38 | "mdl from": "s. start", 39 | "mdl to" : "s. end", 40 | 'E-value': "evalue", 41 | }) 42 | return output_dataframe 43 | 44 | 45 | def get_version_information(self) -> SearchToolVersion: 46 | try: 47 | database_info = None 48 | with open(self.db_file, "r", encoding="utf-8") as file: 49 | for line in file: 50 | if line.startswith("INFERNAL1/a"): 51 | database_info = line.strip() 52 | continue 53 | # Early exit if data has been found 54 | if database_info: 55 | break 56 | 57 | result = subprocess.run( 58 | ["cmscan", "-h"], capture_output=True, text=True, check=True 59 | ) 60 | tool_info = result.stdout.splitlines()[1].strip()[2:] or "error retrieving info" 61 | 62 | return SearchToolVersion(tool_info, database_info or "error") 63 | 64 | except (subprocess.CalledProcessError, FileNotFoundError): 65 | return None 66 | 67 | 68 | def readcmscan(fileh): 69 | """ 70 | Read cmscan tabular output files into a pandas DataFrame. 71 | """ 72 | columns = [ 73 | "target name", 74 | "accession", 75 | "query name", 76 | "accession", 77 | "mdl", 78 | "mdl from", 79 | "mdl to", 80 | "seq from", 81 | "seq to", 82 | "strand", 83 | "trunc", 84 | "pass", 85 | "gc", 86 | "bias", 87 | "score", 88 | "E-value", 89 | "inc", 90 | "description of target", 91 | ] 92 | 93 | cmscan = [] 94 | 95 | with open(fileh, "r", encoding="utf-8") as f: 96 | for line in f: 97 | if "# Program: cmscan" in line: 98 | break 99 | if "#" in line: 100 | continue 101 | bits = re.split(r"\s+", line) 102 | description = " ".join(bits[17:]) 103 | bits = bits[:17] 104 | bits.append(description) 105 | cmscan.append(bits) 106 | cmscan = pd.DataFrame(cmscan, columns=columns) 107 | cmscan["E-value"] = pd.to_numeric(cmscan["E-value"]) 108 | cmscan["score"] = pd.to_numeric(cmscan["score"]) 109 | cmscan["seq from"] = pd.to_numeric(cmscan["seq from"]) 110 | cmscan["seq to"] = pd.to_numeric(cmscan["seq to"]) 111 | 112 | return cmscan -------------------------------------------------------------------------------- /commec/tests/test_aa_to_nt.py: -------------------------------------------------------------------------------- 1 | """ 2 | Simple tests to ensure the correct conversion of per-frame amino-acid HMMER hit coordinates 3 | back to nucleotide query coordinates by recalculate_hmmer_query_coordinates. 4 | The following behaviour is expected: 5 | * Forward-frame hits (frames 1-3) map directly onto forward-strand nucleotide coordinates. 6 | * Reverse-frame hits (frames 4-6) map onto reverse-strand coordinates, matching observed HMMER behaviour. 7 | * Hits from different frames are handled independently.
8 | """ 9 | 10 | import pandas as pd 11 | from pandas.testing import assert_frame_equal 12 | import pytest 13 | from commec.tools.hmmer import recalculate_hmmer_query_coordinates 14 | 15 | # Example DataFrame 16 | example_hmmer_01 = pd.DataFrame({ 17 | "query name": ["F1","F2","F3","R1","R2", "R3"], 18 | "frame": [1,2,3,4,5,6], 19 | "ali from": [1, 2, 3, 1, 2, 3], 20 | "ali to": [4, 5, 6, 4, 5, 6], 21 | "nt_qlen": [31, 31, 31, 31, 31, 31] 22 | }) 23 | 24 | # Logically, we would expect this to match Fwd and Rev all frame AA to NT coordinates: 25 | # 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 26 | # [ 1 ] [ 2 ] [ 3 ] [ 4 ] [ 5 ] [ 6 ] [ 7 ] [ 8 ] [ 9 ] [ 10 ] 27 | # [ 1 ] [ 2 ] [ 3 ] [ 4 ] [ 5 ] [ 6 ] [ 7 ] [ 8 ] [ 9 ] [ 10 ] 28 | # [ 1 ] [ 2 ] [ 3 ] [ 4 ] [ 5 ] [ 6 ] [ 7 ] [ 8 ] [ 9 ] 29 | # [ 10 ] [ 9 ] [ 8 ] [ 7 ] [ 6 ] [ 5 ] [ 4 ] [ 3 ] [ 2 ] [ 1 ] 30 | # [ 9 ] [ 8 ] [ 7 ] [ 6 ] [ 5 ] [ 4 ] [ 3 ] [ 2 ] [ 1 ] 31 | # [ 9 ] [ 8 ] [ 7 ] [ 6 ] [ 5 ] [ 4 ] [ 3 ] [ 2 ] [ 1 ] [ -1 ] 32 | 33 | # However, HMMER Biorisk behaves like the following 34 | #(According to testing with BBa_I766605 YopH-EE under medium constitutive promotor, and reverse complements.) 35 | # 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 36 | # [ 1 ] [ 2 ] [ 3 ] [ 4 ] [ 5 ] [ 6 ] [ 7 ] [ 8 ] [ 9 ] [ 10 ] 37 | # [ 1 ] [ 2 ] [ 3 ] [ 4 ] [ 5 ] [ 6 ] [ 7 ] [ 8 ] [ 9 ] [ 10 ] 38 | # [ 1 ] [ 2 ] [ 3 ] [ 4 ] [ 5 ] [ 6 ] [ 7 ] [ 8 ] [ 9 ] 39 | # [ 10 ] [ 9 ] [ 8 ] [ 7 ] [ 6 ] [ 5 ] [ 4 ] [ 3 ] [ 2 ] [ 1 ] 40 | # [ 9 ] [ 8 ] [ 7 ] [ 6 ] [ 5 ] [ 4 ] [ 3 ] [ 2 ] [ 1 ] 41 | # [ 9 ] [ 8 ] [ 7 ] [ 6 ] [ 5 ] [ 4 ] [ 3 ] [ 2 ] [ 1 ] [ -1 ] 42 | 43 | # Example DataFrame 44 | original_expected_output = pd.DataFrame({ 45 | "query name": ["F1","F2","F3","R1","R2", "R3"], 46 | "frame": [1,2,3,4,5,6], 47 | "ali from": [1, 2, 3, 1, 2, 3], 48 | "ali to": [4, 5, 6, 4, 5, 6], 49 | "nt_qlen": [31, 31, 31, 31, 31, 31], 50 | "q. start": [ 1, 5, 9, 19, 15, 11], 51 | "q. end": [12, 16, 20, 30, 26, 22] 52 | }) 53 | 54 | # Example DataFrame which matches biorisk hmmer outputs: 55 | example_hmmer_01_output = pd.DataFrame({ 56 | "query name": ["F1","F2","F3","R1","R2", "R3"], 57 | "frame": [1,2,3,4,5,6], 58 | "ali from": [1, 2, 3, 1, 2, 3], 59 | "ali to": [4, 5, 6, 4, 5, 6], 60 | "nt_qlen": [31, 31, 31, 31, 31, 31], 61 | "q. start": [ 1, 5, 9, 21, 17, 13], 62 | "q. end": [12, 16, 20, 32, 28, 24] 63 | }) 64 | 65 | @pytest.mark.parametrize( 66 | "input_hmmer, expected_output_hmmer", 67 | [ 68 | (example_hmmer_01, example_hmmer_01_output), 69 | ] 70 | ) 71 | def test_hmmer_overlaps( 72 | input_hmmer : pd.DataFrame, 73 | expected_output_hmmer : pd.DataFrame 74 | ): 75 | """ 76 | Checks common configurations that require trimming in Hmmer outputs, 77 | In particular partial overlaps, 78 | full encapsulations, score differences, 79 | and different queries. 
80 | """ 81 | print("INPUT:") 82 | print(input_hmmer) 83 | print(input_hmmer.dtypes) 84 | recalculate_hmmer_query_coordinates(input_hmmer) 85 | print("PROCESSED:") 86 | print(input_hmmer) 87 | print(input_hmmer.dtypes) 88 | print("CORRECT:") 89 | print(expected_output_hmmer) 90 | print(expected_output_hmmer.dtypes) 91 | assert_frame_equal(input_hmmer, expected_output_hmmer) 92 | #assert input_hmmer.equals(expected_output_hmmer) 93 | -------------------------------------------------------------------------------- /commec/tests/test_blast_tools.py: -------------------------------------------------------------------------------- 1 | from io import StringIO 2 | import pytest 3 | import textwrap 4 | from unittest.mock import patch 5 | import numpy as np 6 | import pandas as pd 7 | from commec.tools.blast_tools import _split_by_tax_id, read_blast, _get_lineages, get_taxonomic_labels 8 | 9 | 10 | @pytest.fixture 11 | def blast_df(): 12 | """ 13 | Return a dataframe containing 3 BLAST hits, 2 with multiple taxids, 1 of which is invalid and 1 14 | of which is a synthetic taxid 15 | """ 16 | blast_to_parse = textwrap.dedent( 17 | """\ 18 | # BLASTX 2.15.0+ 19 | # Query: NC_TEST 20 | # Database: /root/commec-dbs/mock 21 | #query acc. subject title subject acc. subject tax ids evalue bit score % identity query length q. start q. end subject length s. start s. end 22 | # 3 hits found 23 | BT_01 SUBJECT SUBJECT_ACC 2371;644357 0.0 BITSCORE 99.999 300 101 200 500 1 100 24 | BT_01 SUBJECT SUBJECT_ACC 10760;110011001100 0.0 BITSCORE 99.999 300 25 80 500 1 100 25 | BT_01 SUBJECT SUBJECT_ACC 32630 0.0 BITSCORE 99.999 300 275 300 500 1 100 26 | """ 27 | ) 28 | return read_blast(StringIO(blast_to_parse)) 29 | 30 | 31 | @pytest.fixture 32 | def lineage_df(): 33 | """ 34 | Dataframe subsetting columns from the results of pytaxonkit.lineage applied to blast_df 35 | """ 36 | return pd.DataFrame( 37 | { 38 | "TaxID": [2371, 644357, 10760, 110011001100, 32630], 39 | "Code": [2371, 644357, 10760, -1, 32630], 40 | "FullLineage": [ 41 | "cellular organisms;Bacteria;Pseudomonadota;Gammaproteobacteria;Lysobacterales;Lysobacteraceae;Xylella;Xylella fastidiosa", 42 | "cellular organisms;Bacteria;Pseudomonadota;Gammaproteobacteria;Lysobacterales;Lysobacteraceae;Xylella;Xylella fastidiosa;Xylella fastidiosa subsp. 
multiplex", 43 | "Viruses;Duplodnaviria;Heunggongvirae;Uroviricota;Caudoviricetes;Autographiviridae;Studiervirinae;Teseptimavirus;Teseptimavirus T7;Escherichia phage T7", 44 | np.nan, 45 | "other entries;other sequences;artificial sequences;synthetic construct", 46 | ], 47 | "FullLineageTaxIDs": [ 48 | "131567;2;1224;1236;135614;32033;2370;2371", 49 | "131567;2;1224;1236;135614;32033;2370;2371;644357", 50 | "10239;2731341;2731360;2731618;2731619;2731643;2731653;110456;1985738;10760", 51 | np.nan, 52 | "2787854;28384;81077;32630", 53 | ], 54 | "FullLineageRanks": [ 55 | "no rank;superkingdom;phylum;class;order;family;genus;species", 56 | "no rank;superkingdom;phylum;class;order;family;genus;species;subspecies", 57 | "superkingdom;clade;kingdom;phylum;class;family;subfamily;genus;species;no rank", 58 | np.nan, 59 | "no rank;no rank;no rank;species", 60 | ], 61 | } 62 | ) 63 | 64 | 65 | def test_split_by_tax_id(blast_df: pd.DataFrame): 66 | assert len(blast_df) == 3 67 | split_blast = _split_by_tax_id(blast_df) 68 | assert len(split_blast) == 5 69 | expected_tax_ids = {2371, 644357, 10760, 110011001100, 32630} 70 | assert set(split_blast["subject tax ids"]) == expected_tax_ids 71 | 72 | 73 | @patch("pytaxonkit.lineage") 74 | def test_get_lineages(mock_lineage, blast_df, lineage_df): 75 | mock_lineage.return_value = lineage_df 76 | blast_df = _split_by_tax_id(blast_df) 77 | lin = _get_lineages( 78 | blast_df["subject tax ids"], "commec-dbs/taxonomy/", 8 79 | ) 80 | # Expect the invalid taxid to be filtered out 81 | expected_tax_ids = {2371, 644357, 10760, 32630} 82 | assert set(lin["TaxID"]) == expected_tax_ids 83 | 84 | 85 | @patch("pytaxonkit.lineage") 86 | def test_taxdist(mock_lineage, blast_df, lineage_df): 87 | mock_lineage.return_value = lineage_df 88 | # Fake values - should find 1 regulated hit after filtering 89 | reg_taxids = ['644357', '10760'] 90 | vax_taxids = ['10760'] 91 | reg_df = get_taxonomic_labels( 92 | blast_df, reg_taxids, vax_taxids, "commec-dbs/taxonomy/", 8 93 | ) 94 | # Expect the synthetic taxid to be filtered out 95 | expected_tax_ids = {2371, 644357, 10760} 96 | assert set(reg_df["subject tax ids"]) == expected_tax_ids 97 | 98 | # Expect only taxid 644357 to be marked as "regulated" 99 | assert reg_df[reg_df["subject tax ids"] == 2371]["regulated"].iloc[0] == False 100 | assert reg_df[reg_df["subject tax ids"] == 644357]["regulated"].iloc[0] == True 101 | assert reg_df[reg_df["subject tax ids"] == 10760]["regulated"].iloc[0] == False 102 | -------------------------------------------------------------------------------- /commec/tools/fetch_nc_bits.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) 2021-2024 International Biosecurity and Biosafety Initiative for Science 3 | """ 4 | Fetch parts of a query that had no high-quality protein matches for use in nucloetide screening. 5 | 6 | Usage: 7 | fetch_nc_bits.py query_name fasta_file_path 8 | """ 9 | import argparse 10 | import logging 11 | import shutil 12 | import re 13 | import pandas as pd 14 | from Bio import SeqIO 15 | from commec.config.query import Query 16 | from commec.tools.blast_tools import get_high_identity_hits 17 | from commec.tools.search_handler import SearchHandler 18 | from commec.config.result import ScreenStatus 19 | 20 | logger = logging.getLogger(__name__) 21 | 22 | def _get_ranges_with_no_hits(input_df : pd.DataFrame): 23 | """ 24 | Get indices not covered by the query start / end ranges in the BLAST results. 
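A minimal illustrative sketch (hypothetical values, similar to the cases exercised in test_fetch_nc_bits.py):

    >>> example = pd.DataFrame({"q. start": [1, 140], "q. end": [40, 265], "query length": [300, 300]})
    >>> _get_ranges_with_no_hits(example)
    [(41, 139)]

Only gaps of at least 50 bases are reported, so the trailing 266-300 stretch above is not returned.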
25 | """ 26 | 27 | assert "q. start" in input_df.columns, ( 28 | "Column \"q. start\" does not exist for get_ranges_with_no_hits().\n" 29 | f"Existing columns: {', '.join(input_df.columns)}" 30 | ) 31 | 32 | assert "q. end" in input_df.columns, ( 33 | "Column \"q. end\" does not exist for get_ranges_with_no_hits().\n" 34 | f"Existing columns: {', '.join(input_df.columns)}" 35 | ) 36 | 37 | assert "query length" in input_df.columns, ( 38 | "Column \"query length\" does not exist for get_ranges_with_no_hits().\n" 39 | f"Existing columns: {', '.join(input_df.columns)}" 40 | ) 41 | 42 | assert not input_df.empty, "Input dataframe for get_ranges_with_no_hits() is empty." 43 | 44 | unique_hits = input_df.drop_duplicates(subset=["q. start", "q. end"]) 45 | hit_ranges = unique_hits[["q. start", "q. end"]].values.tolist() 46 | 47 | # Sort each pair to ensure that start < end, then sort entire list of ranges 48 | hit_ranges = sorted([sorted(pair) for pair in hit_ranges]) 49 | 50 | nc_ranges : list[tuple[int,int]] = [] 51 | 52 | # Include the start if the first hit begins more than 50 bp after the start 53 | if hit_ranges[0][0] > 50: 54 | nc_ranges.append((1, hit_ranges[0][0] - 1)) 55 | 56 | # Add ranges if there is a noncoding region of >=50 between hits 57 | for i in range(len(hit_ranges) - 1): 58 | nc_start = hit_ranges[i][1] + 1 # starts after this hit 59 | nc_end = hit_ranges[i + 1][0] - 1 # ends before next hit 60 | 61 | if nc_end - nc_start + 1 >= 50: 62 | nc_ranges.append((nc_start, nc_end)) 63 | 64 | # Include the end if the last hit ends more than 50 bp before the end 65 | query_length = input_df["query length"].iloc[0] 66 | if query_length - hit_ranges[-1][1] >= 50: 67 | nc_ranges.append((hit_ranges[-1][1] + 1, int(query_length))) 68 | 69 | return nc_ranges 70 | 71 | def _set_no_coding_regions(query : Query): 72 | """Set the query to be entirely non-coding (i.e. no high-quality protein hits).""" 73 | query.non_coding_regions.append((1, query.length)) 74 | 75 | def calculate_noncoding_regions_per_query( 76 | protein_search_handler : SearchHandler, 77 | queries : dict[str, Query] 78 | ): 79 | """ 80 | Fetch noncoding regions > 50bp for every query, and 81 | updates the Query dictionary to include non-coding meta-data. 82 | """ 83 | logger.debug("Checking protein hits in: %s", protein_search_handler.out_file) 84 | 85 | if not protein_search_handler.has_hits(): 86 | logger.info("No protein hits found, screening entire sequence.") 87 | for query in queries.values(): 88 | _set_no_coding_regions(query) 89 | return 90 | 91 | protein_hits = get_high_identity_hits(protein_search_handler.out_file) 92 | 93 | query_col = "query acc." 94 | 95 | for query in queries.values(): 96 | protein_hits_for_query = protein_hits[protein_hits[query_col] == query.name].copy() 97 | 98 | if protein_hits_for_query.empty: 99 | logger.info("No protein hits found for %s, screening entire sequence.", query.name) 100 | _set_no_coding_regions(query) 101 | continue 102 | 103 | # Correcting query length in nc coordinate output. 
104 | protein_hits_for_query.loc[:, "q.len"] = query.length 105 | 106 | logger.debug("\t --> Protein hits found for %s, fetching nt regions not covered by a 90%% ID hit or better", query.name) 107 | 108 | ranges_to_screen = _get_ranges_with_no_hits(protein_hits_for_query) 109 | # if the entire sequence, save regions <50 bases, is covered with protein, skip nt scan 110 | if not ranges_to_screen: 111 | logger.info("\t --> no noncoding regions >= 50 bases found for %s, skipping nt scan for query.", query.name) 112 | query.result.status.nucleotide_taxonomy = ScreenStatus.SKIP 113 | continue 114 | 115 | # Update the list of start and end non-coding tuples for query. 116 | query.non_coding_regions.extend(ranges_to_screen) 117 | 118 | ranges_str = ", ".join(f"{start}-{end}" for start, end in ranges_to_screen) 119 | logger.info("\t --> Identified noncoding regions for query %s: [%s]", query.name, ranges_str) 120 | -------------------------------------------------------------------------------- /commec/tests/test_fetch_nc_bits.py: -------------------------------------------------------------------------------- 1 | from io import StringIO 2 | import os 3 | import pandas as pd 4 | import pytest 5 | import textwrap 6 | from Bio import SeqIO 7 | from unittest.mock import patch 8 | 9 | from commec.tools.fetch_nc_bits import ( 10 | _get_ranges_with_no_hits, 11 | calculate_noncoding_regions_per_query, 12 | ) 13 | 14 | from commec.config.screen_io import ScreenIO 15 | from commec.config.result import QueryResult, ScreenStatus 16 | from commec.screen import add_args, ScreenArgumentParser 17 | from commec.tools.blastx import BlastXHandler 18 | 19 | DATABASE_DIRECTORY = os.path.join(os.path.dirname(__file__), "test_dbs") 20 | 21 | @pytest.mark.parametrize( 22 | "hits, nc_ranges", 23 | [ 24 | # Two protein hits, no noncoding regions > 50bp 25 | ([(1, 50), (100, 150), (175, 299)], []), 26 | # One protein hit, < 50bp nocoding regions on the ends 27 | ([(50, 251)], []), 28 | # One protein hit, > 50bp nocoding regions on the ends 29 | ([(51, 250)], [(1, 50), (251, 300)]), 30 | # Three protein hits, one noncoding region >50bp 31 | ( 32 | [(1, 40), (140, 265), (300, 349)], 33 | [(41, 139)], 34 | ), 35 | ], 36 | ) 37 | def test_get_ranges_with_no_hits(hits, nc_ranges): 38 | """ 39 | Test the BLAST hits are successfully converted into noncoding ranges. 40 | """ 41 | 42 | def _create_mock_blast_df_from(hits): 43 | data = { 44 | "q. start": [hit[0] for hit in hits], 45 | "q. 
end": [hit[1] for hit in hits], 46 | "query length": [300] * len(hits), 47 | } 48 | df = pd.DataFrame(data) 49 | return df.reset_index(drop=True) # This adds a numeric index 50 | 51 | blast_df = _create_mock_blast_df_from(hits) 52 | assert _get_ranges_with_no_hits(blast_df) == nc_ranges 53 | 54 | 55 | def test_fetch_nocoding_regions(tmp_path): 56 | """Full test, including file parsing.""" 57 | 58 | desc_1 = "NC_TEST01" 59 | desc_2 = "NC_TEST02" 60 | seq_1 = textwrap.dedent( 61 | """\ 62 | ggtagttccctaaacttatcattaagcgatcttcatcgtcaggtatctcgattggtgcagcaagagagcggtgattgt 63 | accgggaaattaagaggtaacgttgctgccaataaagaaactacctttcaaggtttgaccatagccagtggagccaga 64 | gagtcagaaaaagtatttgctcaaactgtactaagccacgtagcaaatgttgttctaactcaagaagataccgctaag 65 | ctattgcaaagtacggtaaagcataatttgaataattatgacttaagaagtgtcggcaatggtaat 66 | """ 67 | ) 68 | seq_2 = textwrap.dedent( 69 | """\ 70 | atggcacaagtcattaataccaacagcctctcgctgatcactcaaaataatatcaacaagaaccagtctgcgctgtcg 71 | agttctatcgagcgtctgtcttctggcttgcgtattaacagcgcgaaggatgacgcagcgggtcaggcgattgctaac 72 | cgtttcacctctaacattaaaggcctgactcaggcggcccgtaacgccaacgacggtatctccgttgcgcagaccacc 73 | gaaggcgcgctgtccgaaatcaacaacaacttacagcgtgtgcgtgaactgacggtacaggccact 74 | """ 75 | ) 76 | 77 | blast_to_parse = textwrap.dedent( 78 | """\ 79 | # BLASTX 2.15.0+ 80 | # Query: NC_TEST 81 | # Database: /root/commec-dbs/mock 82 | #query acc. subject title subject acc. subject tax ids evalue bit score % identity query length q. start q. end subject length s. start s. end 83 | # 3 hits found 84 | NC_TEST01 SUBJECT SUBJECT_ACC TAXID 0.0 BITSCORE 99.999 300 101 200 500 1 100 85 | NC_TEST02 SUBJECT SUBJECT_ACC TAXID 0.0 BITSCORE 99.999 300 25 80 500 1 100 86 | NC_TEST02 SUBJECT SUBJECT_ACC TAXID 0.0 BITSCORE 99.999 300 100 300 500 1 100 87 | """ 88 | ) 89 | 90 | expected_output = textwrap.dedent( 91 | """\ 92 | >NC_TEST01 (1-100) (201-300) 93 | ggtagttccctaaacttatcattaagcgatcttcatcgtcaggtatctcgattggtgcagcaagagagcggtgattgtaccgggaaattaagaggtaacgaaatgttgttctaactcaagaagataccgctaagctattgcaaagtacggtaaagcataatttgaataattatgacttaagaagtgtcggcaatggtaat 94 | """ 95 | ) 96 | 97 | # Setup Expected files 98 | input_fasta = tmp_path / "fetch_nc_input.fasta" 99 | input_fasta.write_text(f">{desc_1}\n{seq_1}\n>{desc_2}\n{seq_2}\n") 100 | input_blast = tmp_path / "fetch_nc_input.blastx" 101 | input_blast.write_text(blast_to_parse) 102 | 103 | # Create Dictionary of queries for funciton input. 104 | with patch( 105 | "sys.argv", 106 | ["test.py", "--skip-tx", str(input_fasta), "-d", str(DATABASE_DIRECTORY), "-o", str(tmp_path)], 107 | ): 108 | parser = ScreenArgumentParser() 109 | add_args(parser) 110 | screen_io = ScreenIO(parser.parse_args()) 111 | screen_io.setup() 112 | 113 | queries = screen_io.parse_input_fasta() 114 | for query in queries.values(): 115 | query.result = QueryResult() 116 | 117 | # Setup result handler for function input. 118 | db_file = os.path.join(DATABASE_DIRECTORY, "nr_blast/nr") 119 | handler = BlastXHandler(db_file, input_fasta, input_blast, force=True) 120 | 121 | calculate_noncoding_regions_per_query(handler, queries) 122 | 123 | # Generate the non-coding fasta text. 
124 | actual_output = "" 125 | for query in queries.values(): 126 | if query.result.status.nucleotide_taxonomy == ScreenStatus.SKIP: 127 | continue 128 | actual_output += query.get_non_coding_regions_as_fasta() 129 | 130 | assert actual_output.strip() == expected_output.strip() 131 | -------------------------------------------------------------------------------- /.github/workflows/automate_release.yml: -------------------------------------------------------------------------------- 1 | name: Automate Release 2 | 3 | on: 4 | workflow_dispatch: 5 | inputs: 6 | release_branch: 7 | description: 'Branch to release (e.g. release_v1.2.3)' 8 | required: true 9 | type: string 10 | version_string: 11 | description: 'Version string (e.g. 1.2.3)' 12 | required: true 13 | type: string 14 | 15 | permissions: 16 | contents: write 17 | pull-requests: write 18 | 19 | jobs: 20 | release: 21 | runs-on: ubuntu-latest 22 | 23 | steps: 24 | - name: Check token permissions 25 | run: gh auth status 26 | env: 27 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 28 | 29 | - name: Checkout release branch 30 | uses: actions/checkout@v4 31 | with: 32 | ref: ${{ github.event.inputs.release_branch }} 33 | 34 | #- name: Set up Python 35 | # uses: actions/setup-python@v5 36 | # with: 37 | # python-version: '3.11' 38 | 39 | - name: Check and update pyproject.toml version 40 | run: | 41 | ver="${{ github.event.inputs.version_string }}" 42 | current=$(grep "^version = '" pyproject.toml | head -n1 | cut -d"'" -f2) 43 | if [ "$current" != "$ver" ]; then 44 | sed -i "s/^version = '.*'/version = '$ver'/" pyproject.toml 45 | echo "updated_pyproject=true" >> $GITHUB_ENV 46 | fi 47 | 48 | - name: Check and update conda meta.yaml version 49 | run: | 50 | ver="${{ github.event.inputs.version_string }}" 51 | current=$(grep '{% set version = "' conda-recipe/meta.yaml | head -n1 | cut -d'"' -f2) 52 | if [ "$current" != "$ver" ]; then 53 | sed -i "s/{% set version = \".*\" %}/{% set version = \"$ver\" %}/" conda-recipe/meta.yaml 54 | echo "updated_meta=true" >> $GITHUB_ENV 55 | fi 56 | 57 | - name: Update functional test JSON with new version 58 | if: env.updated_pyproject == 'true' 59 | run: | 60 | ver="${{ github.event.inputs.version_string }}" 61 | json_file="commec/tests/test_data/functional.json" 62 | 63 | if [ -f "$json_file" ]; then 64 | # Use jq to replace the version field safely 65 | tmpfile=$(mktemp) 66 | jq --arg ver "$ver" '.commec_info.commec_version = $ver' "$json_file" > "$tmpfile" && mv "$tmpfile" "$json_file" 67 | echo "Updated commec_version in $json_file to $ver" 68 | else 69 | echo "File $json_file not found!" 70 | exit 1 71 | fi 72 | 73 | - name: Commit version changes (if needed) 74 | if: env.updated_pyproject == 'true' || env.updated_meta == 'true' 75 | run: | 76 | git config user.name "github-actions" 77 | git config user.email "github-actions@github.com" 78 | git add pyproject.toml conda-recipe/meta.yaml commec/tests/test_data/functional.json 79 | if git commit -m "Update version texts to ${{ github.event.inputs.version_string }}"; then 80 | git push origin HEAD:${{ github.event.inputs.release_branch }} 81 | else 82 | echo "No changes to commit, skipping push." 
83 | fi 84 | 85 | - name: Tag release 86 | run: | 87 | git tag -a "v${{ github.event.inputs.version_string }}" -m "Release ${{ github.event.inputs.version_string }}" 88 | git push origin "v${{ github.event.inputs.version_string }}" 89 | 90 | - name: Download tar.gz from GitHub release 91 | run: | 92 | curl -L -o source.tar.gz https://github.com/${{ github.repository }}/archive/refs/tags/v${{ github.event.inputs.version_string }}.tar.gz 93 | 94 | - name: Calculate SHA256 95 | id: hash 96 | run: | 97 | sha256=$(sha256sum source.tar.gz | cut -d ' ' -f1) 98 | echo "sha256=$sha256" >> $GITHUB_OUTPUT 99 | 100 | - name: Clone bioconda fork 101 | run: | 102 | git clone https://github.com/ibbis-bio/bioconda-recipes.git 103 | cd bioconda-recipes 104 | git remote add upstream https://github.com/bioconda/bioconda-recipes.git 105 | git fetch upstream 106 | git reset --hard upstream/master 107 | git push origin master --force 108 | 109 | - name: Update recipe with new meta.yaml 110 | run: | 111 | cp conda-recipe/meta.yaml bioconda-recipes/recipes/commec/meta.yaml 112 | cd bioconda-recipes 113 | sed -i '3s/.*/{% set sha256 = "'"${{ steps.hash.outputs.sha256 }}"'" %}/' recipes/commec/meta.yaml 114 | git config user.name "github-actions" 115 | git config user.email "github-actions@github.com" 116 | git add recipes/commec/meta.yaml 117 | git commit -m "Update commec recipe to v${{ github.event.inputs.version_string }}" 118 | git push origin master 119 | 120 | - name: Create PR to bioconda upstream 121 | env: 122 | GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} 123 | run: | 124 | gh pr create \ 125 | --repo bioconda/bioconda-recipes \ 126 | --head ibbis-bio:master \ 127 | --base master \ 128 | --title "Update commec to v${{ github.event.inputs.version_string }}" \ 129 | --body "This PR updates the commec recipe to version ${{ github.event.inputs.version_string }}." 
130 | -------------------------------------------------------------------------------- /.github/workflows/release-version-sha-update.yml: -------------------------------------------------------------------------------- 1 | name: Update version + SHA for release 2 | 3 | on: 4 | pull_request: 5 | types: [closed] 6 | branches: 7 | - main 8 | workflow_dispatch: 9 | inputs: 10 | release_version: 11 | description: 'Release version (without v prefix)' 12 | required: true 13 | default: '' 14 | 15 | jobs: 16 | update-version-sha: 17 | # Run if the PR was merged (not just closed) and it was from a release branch OR manually triggered 18 | if: (github.event_name == 'pull_request' && github.event.pull_request.merged == true && startsWith(github.event.pull_request.head.ref, 'release-v')) || 19 | github.event_name == 'workflow_dispatch' 20 | runs-on: ubuntu-latest 21 | permissions: 22 | contents: write 23 | 24 | steps: 25 | - name: Check out repository code 26 | uses: actions/checkout@v3 27 | with: 28 | ref: main # Ensure we're on the main branch after merge 29 | fetch-depth: 0 # Need full history 30 | 31 | - name: Extract version 32 | id: get_version 33 | run: | 34 | if [ "${{ github.event_name }}" == "pull_request" ]; then 35 | # Extract version from the release branch name (release-vX.Y.Z -> X.Y.Z) 36 | BRANCH_NAME="${{ github.event.pull_request.head.ref }}" 37 | VERSION=$(echo $BRANCH_NAME | sed 's/release-v//') 38 | else 39 | # Use the manually provided version 40 | VERSION="${{ github.event.inputs.release_version }}" 41 | fi 42 | echo "VERSION=$VERSION" >> $GITHUB_ENV 43 | echo "Version: $VERSION" 44 | 45 | - name: Find pyproject.toml and meta.yaml files 46 | id: find_files 47 | run: | 48 | PYPROJECT_PATH=$(find . -name "pyproject.toml" -type f | head -n 1) 49 | if [ -z "$PYPROJECT_PATH" ]; then 50 | echo "ERROR: pyproject.toml not found" 51 | exit 1 52 | fi 53 | echo "Found pyproject.toml at: $PYPROJECT_PATH" 54 | echo "PYPROJECT_PATH=$PYPROJECT_PATH" >> $GITHUB_ENV 55 | 56 | META_PATH=$(find . 
-name "meta.yaml" -type f | head -n 1) 57 | if [ -z "$META_PATH" ]; then 58 | echo "ERROR: meta.yaml not found" 59 | exit 1 60 | fi 61 | echo "Found meta.yaml at: $META_PATH" 62 | echo "META_PATH=$META_PATH" >> $GITHUB_ENV 63 | 64 | - name: Set up Python 65 | uses: actions/setup-python@v4 66 | with: 67 | python-version: '3.10' 68 | 69 | - name: Update version in files 70 | run: | 71 | # Update version in pyproject.toml 72 | sed -i "s/^version = \".*\"/version = \"${{ env.VERSION }}\"/" "${{ env.PYPROJECT_PATH }}" 73 | echo "Updated version to ${{ env.VERSION }} in ${{ env.PYPROJECT_PATH }}" 74 | 75 | # Update version in meta.yaml 76 | sed -i "s/{% set version = \".*\" %}/{% set version = \"${{ env.VERSION }}\" %}/" "${{ env.META_PATH }}" 77 | echo "Updated version to ${{ env.VERSION }} in ${{ env.META_PATH }}" 78 | 79 | - name: Calculate SHA256 hash 80 | id: calculate_sha 81 | run: | 82 | # Create source distribution 83 | python -m pip install build 84 | python -m build --sdist 85 | 86 | # Find the generated tar.gz file 87 | SDIST_FILE=$(find dist -name "*.tar.gz" | head -n 1) 88 | 89 | if [ -z "$SDIST_FILE" ]; then 90 | echo "ERROR: No tar.gz file found in dist directory" 91 | exit 1 92 | fi 93 | echo "Found sdist file: $SDIST_FILE" 94 | 95 | # Get the SHA256 hash of the generated tar.gz file 96 | SHA256=$(sha256sum "$SDIST_FILE" | cut -d ' ' -f 1) 97 | echo "SHA256=$SHA256" >> $GITHUB_ENV 98 | echo "SHA256 hash: $SHA256" 99 | 100 | - name: Update SHA in meta.yaml 101 | run: | 102 | # Update SHA256 in meta.yaml 103 | sed -i "s/{% set sha256 = \".*\" %}/{% set sha256 = \"${{ env.SHA256 }}\" %}/" "${{ env.META_PATH }}" 104 | echo "Updated SHA256 in ${{ env.META_PATH }}" 105 | 106 | - name: Check for changes 107 | id: check_changes 108 | run: | 109 | if git diff --quiet; then 110 | echo "No changes detected" 111 | echo "CHANGES_DETECTED=false" >> $GITHUB_ENV 112 | else 113 | echo "Changes detected" 114 | echo "CHANGES_DETECTED=true" >> $GITHUB_ENV 115 | fi 116 | 117 | - name: Commit changes 118 | if: env.CHANGES_DETECTED == 'true' 119 | run: | 120 | git config --local user.email "github-actions[bot]@users.noreply.github.com" 121 | git config --local user.name "github-actions[bot]" 122 | 123 | # Add the files 124 | git add "${{ env.PYPROJECT_PATH }}" 125 | git add "${{ env.META_PATH }}" 126 | 127 | git commit -m "Bump version to ${{ env.VERSION }} and update SHA [skip ci]" 128 | git push 129 | echo "Pushed changes to repository" 130 | 131 | - name: Create version tag 132 | run: | 133 | git tag "v${{ env.VERSION }}" 134 | git push origin "v${{ env.VERSION }}" 135 | echo "Created and pushed tag v${{ env.VERSION }}" 136 | -------------------------------------------------------------------------------- /commec/config/screen_tools.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) 2021-2024 International Biosecurity and Biosafety Initiative for Science 3 | 4 | """ 5 | Container for search handlers used throughout the Commec screen workflow. 6 | Sets and alters defaults based on input parameters. 
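A minimal usage sketch (assumes `screen_io` is an already-configured ScreenIO instance):

    tools = ScreenTools(screen_io)
    tools.biorisk.search()  # run the biorisk hmmscan via its search handler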
7 | """ 8 | 9 | import logging 10 | import os 11 | from commec.config.screen_io import ScreenIO 12 | from commec.tools.blastn import BlastNHandler 13 | from commec.tools.blastx import BlastXHandler 14 | from commec.tools.diamond import DiamondHandler 15 | from commec.tools.cmscan import CmscanHandler 16 | from commec.tools.hmmer import HmmerHandler 17 | 18 | logger = logging.getLogger(__name__) 19 | 20 | class ScreenTools: 21 | """ 22 | Using parameters and filenames in `ScreenIO`, set up the tools needed to search databases. 23 | """ 24 | 25 | def __init__(self, params: ScreenIO): 26 | self.biorisk: HmmerHandler = None 27 | self.regulated_protein : BlastXHandler | DiamondHandler = None 28 | self.regulated_nt: BlastNHandler = None 29 | self.low_concern_hmm: HmmerHandler = None 30 | self.low_concern_blastn: BlastNHandler = None 31 | self.low_concern_cmscan: CmscanHandler = None 32 | 33 | self.taxonomy_path: str | os.PathLike = None 34 | self.biorisk_taxid_path: str | os.PathLike = None 35 | self.low_concern_taxid_path: str | os.PathLike = None 36 | self.biorisk_annotations_csv: str | os.PathLike = None 37 | 38 | # Paths for vaxid, taxids, and taxonomy directory, used for check_regulated_pathogens 39 | # (Declared this way for backwards compatibility to old database structure at this stage) 40 | self.taxonomy_path = params.config["databases"]["taxonomy"]["path"] 41 | self.biorisk_taxid_path = params.config["databases"]["biorisk"]["taxids"] 42 | self.low_concern_taxid_path = params.config["databases"]["low_concern"]["taxids"] 43 | self.biorisk_annotations = params.config["databases"]["biorisk"]["annotations"] 44 | self.low_concern_annotations = params.config["databases"]["low_concern"]["annotations"] 45 | 46 | # Database tools for Biorisks / Protein and NT screens / Benign screen: 47 | self.biorisk = HmmerHandler( 48 | params.config["databases"]["biorisk"]["path"], 49 | params.aa_path, 50 | f"{params.output_prefix}.biorisk.hmmscan", 51 | threads=params.config["threads"], 52 | force=params.config["force"], 53 | ) 54 | 55 | if params.should_do_protein_screening: 56 | if params.config["protein_search_tool"] == "blastx": 57 | self.regulated_protein = BlastXHandler( 58 | params.config["databases"]["regulated_protein"]["blast"]["path"], 59 | input_file=params.nt_path, 60 | out_file=f"{params.output_prefix}.nr.blastx", 61 | threads=params.config["threads"], 62 | force=params.config["force"], 63 | ) 64 | elif params.config["protein_search_tool"] in ("nr.dmnd", "diamond"): 65 | self.regulated_protein = DiamondHandler( 66 | params.config["databases"]["regulated_protein"]["diamond"]["path"], 67 | input_file=params.nt_path, 68 | out_file=f"{params.output_prefix}.nr.dmnd", 69 | threads=params.config["threads"], 70 | force=params.config["force"], 71 | ) 72 | self.regulated_protein.jobs = params.config["diamond_jobs"] 73 | if params.config["protein_search_tool"] == "nr.dmnd": 74 | logger.info( 75 | "Using the old \"nr.dmnd\" keyword for the search tool will not be supported" 76 | " in future releases; consider using \"diamond\" instead."
77 | ) 78 | else: 79 | raise RuntimeError('Search tool not defined as "blastx" or "diamond"') 80 | 81 | if params.should_do_nucleotide_screening: 82 | self.regulated_nt = BlastNHandler( 83 | params.config["databases"]["regulated_nt"]["path"], 84 | input_file=params.nc_path, 85 | out_file=f"{params.output_prefix}.nt.blastn", 86 | threads=params.config["threads"], 87 | force=params.config["force"], 88 | ) 89 | 90 | if params.should_do_low_concern_screening: 91 | self.low_concern_hmm = HmmerHandler( 92 | params.config["databases"]["low_concern"]["protein"]["path"], 93 | input_file=params.aa_path, 94 | out_file=f"{params.output_prefix}.low_concern.hmmscan", 95 | threads=params.config["threads"], 96 | force=params.config["force"], 97 | ) 98 | self.low_concern_blastn = BlastNHandler( 99 | params.config["databases"]["low_concern"]["dna"]["path"], 100 | input_file=params.nt_path, 101 | out_file=f"{params.output_prefix}.low_concern.blastn", 102 | threads=params.config["threads"], 103 | force=params.config["force"], 104 | ) 105 | self.low_concern_cmscan = CmscanHandler( 106 | params.config["databases"]["low_concern"]["rna"]["path"], 107 | input_file=params.nt_path, 108 | out_file=f"{params.output_prefix}.low_concern.cmscan", 109 | threads=params.config["threads"], 110 | force=params.config["force"], 111 | ) -------------------------------------------------------------------------------- /example_data/output_commec-examples/commec-examples.nt.blastn: -------------------------------------------------------------------------------- 1 | # BLASTN 2.16.0+ 2 | # Query: encrypted (1-552) 3 | # Database: /mnt/data/home/ec2-user/cm-dbs/nt_blast/core_nt 4 | # 0 hits found 5 | # BLASTN 2.16.0+ 6 | # Query: xylanase_zero_shot_des31 (1-756) 7 | # Database: /mnt/data/home/ec2-user/cm-dbs/nt_blast/core_nt 8 | # Fields: query acc., subject title, subject acc., subject tax ids, evalue, bit score, % identity, query length, q. start, q. end, subject length, s. start, s. end 9 | # 5 hits found 10 | xylanase_zero_shot_des31 Escherichia albertii strain 205_2_TBG_B chromosome, complete genome CP099890 208962 2.75e-31 150 75.542 756 16 333 4752472 4352667 4352984 11 | xylanase_zero_shot_des31 Enterobacter hormaechei strain A26358 chromosome, complete genome CP163152 158836 1.28e-29 145 74.854 756 16 351 4839353 2207862 2207527 12 | xylanase_zero_shot_des31 Enterobacter hormaechei subsp. xiangfangensis strain HD2292 chromosome, complete genome CP130333 1296536 1.28e-29 145 74.854 756 16 351 4806739 3158310 3157975 13 | xylanase_zero_shot_des31 Enterobacter hormaechei strain 2020CK-00204 chromosome, complete genome CP115689 158836 1.28e-29 145 75.152 756 16 339 4935101 2693068 2693391 14 | xylanase_zero_shot_des31 Enterobacter hormaechei strain UCI161 chromosome CP060481 158836 1.28e-29 145 75.152 756 16 339 4802325 2306527 2306850 15 | # BLASTN 2.16.0+ 16 | # Query: RVFV_Rift_valley_fever (830.0-960.0) 17 | # Database: /mnt/data/home/ec2-user/cm-dbs/nt_blast/core_nt 18 | # Fields: query acc., subject title, subject acc., subject tax ids, evalue, bit score, % identity, query length, q. start, q. end, subject length, s. start, s. 
end 19 | # 5 hits found 20 | RVFV_Rift_valley_fever Rift Valley fever virus strain SA-75 segment S, complete sequence DQ380175 11588 5.95e-60 243 100.000 131 1 131 1691 830 960 21 | RVFV_Rift_valley_fever Rift Valley fever virus segment S nonstructural protein and nucleocapsid genes, complete cds OM744402 11588 5.95e-60 243 100.000 131 1 131 1691 830 960 22 | RVFV_Rift_valley_fever Rift Valley fever virus strain H1825RSA75 segment S, complete sequence EU312120 11588 5.95e-60 243 100.000 131 1 131 1691 830 960 23 | RVFV_Rift_valley_fever Rift Valley fever virus strain 35/74 segment S, complete sequence JF784388 11588 5.95e-60 243 100.000 131 1 131 1691 830 960 24 | RVFV_Rift_valley_fever Rift Valley fever virus isolate M57/74 nonstructural protein and nucleocapsid protein genes, complete cds KX944821 11588 5.95e-60 243 100.000 131 1 131 1654 801 931 25 | # BLASTN 2.16.0+ 26 | # Query: BBa_K209429_A_15261 (643.0-758.0) (1833.0-1933.0) (2633.0-2764.0) (3497.0-3550.0) 27 | # Database: /mnt/data/home/ec2-user/cm-dbs/nt_blast/core_nt 28 | # Fields: query acc., subject title, subject acc., subject tax ids, evalue, bit score, % identity, query length, q. start, q. end, subject length, s. start, s. end 29 | # 20 hits found 30 | BBa_K209429_A_15261 Mammalian expression vector pNBioSec, complete sequence EU082004 478810 6.39e-45 195 98.198 403 224 334 7032 1335 1445 31 | BBa_K209429_A_15261 Gateway entry vector pMpGE_En01 DNA, complete sequence LC090754 1740675 2.97e-43 189 100.000 403 116 217 6126 5326 5427 32 | BBa_K209429_A_15261 Transformation vector pGL193, complete sequence OK017460 2902901 2.97e-43 189 100.000 403 116 217 10319 9451 9552 33 | BBa_K209429_A_15261 Cloning vector pNRVL-caSAT1, complete sequence MN989861 2713480 2.97e-43 189 100.000 403 116 217 5220 4420 4521 34 | BBa_K209429_A_15261 Cloning vector p5E-CAGGS, complete sequence JN715850 1115837 2.97e-43 189 100.000 403 116 217 4511 3711 3812 35 | BBa_K209429_A_15261 Cloning vector pNRVL-N5L-ffDronpa, complete sequence PP986966 3231919 2.97e-43 189 100.000 403 116 217 4951 4151 4252 36 | BBa_K209429_A_15261 PREDICTED: Bos indicus x Bos taurus prolactin (PRL), transcript variant X2, mRNA XM_027524257 30522 1.39e-36 167 100.000 403 224 313 923 83 172 37 | BBa_K209429_A_15261 PREDICTED: Moschus berezovskii prolactin (LOC129543410), mRNA XM_055406509 68408 1.39e-36 167 100.000 403 224 313 690 1 90 38 | BBa_K209429_A_15261 PREDICTED: Bison bison bison prolactin (LOC104992619), mRNA XM_010845567 43346 1.39e-36 167 100.000 403 224 313 891 56 145 39 | BBa_K209429_A_15261 PREDICTED: Bos indicus prolactin (LOC109577296), transcript variant X2, mRNA XM_019986186 9915 1.39e-36 167 100.000 403 224 313 945 105 194 40 | BBa_K209429_A_15261 Homo sapiens cholinergic receptor, muscarinic 2, mRNA (cDNA clone MGC:111772 IMAGE:6519221), complete cds BC095547 9606 3.10e-13 89.8 100.000 403 356 403 1404 4 51 41 | BBa_K209429_A_15261 PREDICTED: Pan paniscus cholinergic receptor muscarinic 2 (CHRM2), transcript variant X4, mRNA XM_055115415 9597 3.10e-13 89.8 100.000 403 356 403 6013 386 433 42 | BBa_K209429_A_15261 PREDICTED: Pan troglodytes cholinergic receptor muscarinic 2 (CHRM2), transcript variant X1, mRNA XM_009454280 9598 3.10e-13 89.8 100.000 403 356 403 9169 766 813 43 | BBa_K209429_A_15261 PREDICTED: Pan paniscus cholinergic receptor muscarinic 2 (CHRM2), transcript variant X1, mRNA XM_055115412 9597 3.10e-13 89.8 100.000 403 356 403 11057 5430 5477 44 | BBa_K209429_A_15261 Homo sapiens cholinergic receptor muscarinic 2 (CHRM2), transcript 
variant 5, mRNA NM_001006631 9606 3.10e-13 89.8 100.000 403 356 403 5663 316 363 45 | BBa_K209429_A_15261 Cloning vector pE3n, complete sequence EU334818 490921 8.74e-04 58.4 88.000 403 1 48 2595 432 481 46 | BBa_K209429_A_15261 Vector pENTR1A-3Flag-WFS1-IRES2-mCherry, complete sequence OQ238874 3062755 0.003 56.5 97.059 403 1 33 6432 432 465 47 | BBa_K209429_A_15261 Cloning vector pIPKTA33, complete sequence EF622217 444601 0.003 56.5 97.059 403 1 33 2311 141 174 48 | BBa_K209429_A_15261 Vector pETNR1A-R-GECO, complete sequence OQ238871 3062760 0.003 56.5 97.059 403 1 33 3534 432 465 49 | BBa_K209429_A_15261 Cloning vector pE1c, complete sequence EU334822 490916 0.003 56.5 97.059 403 1 33 2336 432 465 50 | # BLAST processed 4 queries 51 | -------------------------------------------------------------------------------- /commec/tests/test_json.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from dataclasses import asdict 3 | from commec.config.json_io import * 4 | from commec.config.result import * 5 | from commec.tools.search_handler import SearchToolVersion 6 | 7 | @pytest.fixture 8 | def test_screendata(): 9 | '''Fixture to provide the ScreenResult for testing.''' 10 | return ScreenResult( 11 | #recommendation="PASS", 12 | commec_info = ScreenRunInfo( 13 | commec_version="0.1.2", 14 | json_output_version=JSON_COMMEC_FORMAT_VERSION, 15 | time_taken="00:00:00:00", 16 | date_run="1.1.2024", 17 | search_tool_info= SearchToolInfo( 18 | biorisk_search_info=SearchToolVersion("HMM 0.0.0","DB 0.0.0"), 19 | protein_search_info=SearchToolVersion("Blast 0.0.0","DB 0.0.0"), 20 | nucleotide_search_info=SearchToolVersion("Blast 0.0.0","DB 0.0.0"), 21 | low_concern_protein_search_info=SearchToolVersion("Blast 0.0.0","DB 0.0.0"), 22 | low_concern_rna_search_info=SearchToolVersion("Blast 0.0.0","DB 0.0.0"), 23 | low_concern_dna_search_info=SearchToolVersion("Blast 0.0.0","DB 0.0.0"), 24 | ) 25 | ), 26 | query_info = ScreenQueryInfo( 27 | file="no file", 28 | number_of_queries=1, 29 | total_query_length=10 30 | ), 31 | queries= { 32 | "Query1": 33 | QueryResult( 34 | query="Query1", 35 | length=10, 36 | status = QueryScreenStatus(), 37 | hits = { 38 | "ImportantProtein1": 39 | HitResult( 40 | recommendation=HitScreenStatus(ScreenStatus.WARN, ScreenStep.BIORISK), 41 | name="ImportantProtein1", 42 | annotations = {"domain" : ["Bacteria"]}, 43 | ranges = [ 44 | MatchRange( 45 | e_value = 0.0, 46 | match_start = 0, 47 | match_end = 10, 48 | query_start = 0, 49 | query_end = 10 50 | ) 51 | ] 52 | ) 53 | } 54 | ) 55 | }, 56 | ) 57 | 58 | @pytest.fixture 59 | def empty_screendata(): 60 | '''Fixture to provide the ScreenResult for testing.''' 61 | return ScreenResult() 62 | 63 | @pytest.mark.parametrize("test_data_fixture",["test_screendata", "empty_screendata"]) 64 | def test_json_io(tmp_path, request, test_data_fixture): 65 | ''' Test to ensure that read/write for JSON ScreenResult I/O is working correctly.''' 66 | test_data = request.getfixturevalue(test_data_fixture) 67 | json_filename1 = tmp_path / "testread1.json" 68 | json_filename2 = tmp_path / "testread2.json" 69 | encode_screen_data_to_json(test_data, json_filename1) 70 | test_data_retrieved = get_screen_data_from_json(json_filename1) 71 | encode_screen_data_to_json(test_data_retrieved, json_filename2) 72 | test_data_retrieved_twice = get_screen_data_from_json(json_filename2) 73 | 74 | # Convert both original and retrieved data to dictionaries and compare 75 | assert asdict(test_data) == 
asdict(test_data_retrieved), ( 76 | f"JSON Write/Read interpreter failed.\n" 77 | f"Test JSON Reference data: \n{asdict(test_data)}\n" 78 | f"Test JSON output data: \n{asdict(test_data_retrieved)}" 79 | ) 80 | 81 | # Convert both original and retrieved data to dictionaries and compare 82 | assert asdict(test_data) == asdict(test_data_retrieved_twice), ( 83 | f"JSON Write/Read/Write/Read interpreter failed.\n" 84 | f"Test JSON Reference data: \n{asdict(test_data)}\n" 85 | f"Test JSON output data: \n{asdict(test_data_retrieved)}" 86 | ) 87 | 88 | def test_erroneous_info(tmp_path, test_screendata): 89 | ''' Test to ensure that read/write for JSON ScreenResult I/O is working correctly.''' 90 | test_data = test_screendata 91 | json_filename3 = tmp_path / "testread3.json" 92 | json_filename4 = tmp_path / "testread4.json" 93 | 94 | encode_screen_data_to_json(test_data, json_filename3) 95 | test_data_retrieved = get_screen_data_from_json(json_filename3) 96 | 97 | # Add erroneous information 98 | test_data_dict = asdict(test_data_retrieved) 99 | test_data_dict["ExtraStuff1"] = "ExtraBitStuff1" 100 | test_data_dict["queries"]["Query1"]["ExtraStuff2"] = "ExtraBitStuff2" 101 | test_data_dict["queries"]["Query1"]["hits"]["ImportantProtein1"]["ranges"].append("ExtraStuff3") 102 | test_data_dict["queries"]["Query1"]["hits"]["ImportantProtein1"]["ranges"].append({"ExtraDictStuff4" : 9999}) 103 | test_data_dict2 = encode_dict_to_screen_data(test_data_dict) 104 | encode_screen_data_to_json(test_data_dict2, json_filename4) 105 | test_data_retrieved = get_screen_data_from_json(json_filename4) 106 | 107 | # Convert both original and retrieved data to dictionaries and compare 108 | assert asdict(test_data) == asdict(test_data_retrieved), ( 109 | f"JSON Write/Read interpreter failed.\n" 110 | f"Test JSON Reference data: \n{asdict(test_data)}\n\n\n\n" 111 | f"Test JSON output data: \n{asdict(test_data_retrieved)}\n\n\n\n" 112 | ) 113 | 114 | def test_recommendation_ordering(): 115 | assert ScreenStatus.PASS.importance < ScreenStatus.FLAG.importance 116 | assert compare(ScreenStatus.PASS, ScreenStatus.FLAG) == ScreenStatus.FLAG 117 | 118 | def test_adding_data_to_existing(): 119 | """ 120 | Tests to ensure the mutability of writing to queries is working as expected. 121 | """ 122 | def write_info(input_query : QueryResult): 123 | input_query.status.biorisk = ScreenStatus.PASS 124 | 125 | new_screen_data = ScreenResult() 126 | new_screen_data.queries["test01"] = QueryResult("test01", 10, ScreenStatus.FLAG) 127 | write_query = new_screen_data.get_query("test01") 128 | write_info(write_query) 129 | assert new_screen_data.queries["test01"].status.biorisk == ScreenStatus.PASS 130 | -------------------------------------------------------------------------------- /commec/tests/test_dbs.py: -------------------------------------------------------------------------------- 1 | """ 2 | Unit test for ensuring that the databases are being called without errors. 3 | Will fail if databases have not been installed as expected, with correct versions. 
4 | """ 5 | import os 6 | import pytest 7 | from commec.tools.diamond import DiamondHandler 8 | from commec.tools.blastn import BlastNHandler 9 | from commec.tools.blastx import BlastXHandler 10 | from commec.tools.hmmer import HmmerHandler 11 | from commec.tools.cmscan import CmscanHandler 12 | from commec.tools.search_handler import DatabaseValidationError 13 | 14 | INPUT_QUERY = os.path.join(os.path.dirname(__file__), "test_data/single_record.fasta") 15 | DATABASE_DIRECTORY = os.path.join(os.path.dirname(__file__), "test_dbs") 16 | 17 | databases_to_implement = [ 18 | [DiamondHandler, "nr_dmnd", "nr"], 19 | [BlastNHandler, "nt_blast", "core_nt"], 20 | [BlastXHandler, "nr_blast", "nr"], 21 | [HmmerHandler, "low_concern/protein", "benign.hmm"], 22 | [CmscanHandler, "low_concern/rna", "benign.cm"], 23 | ] 24 | 25 | def print_tmp_path_contents(tmp_path): 26 | print(f"Contents of {tmp_path}:") 27 | for path in tmp_path.rglob("*"): # Recursively list all files and directories 28 | print(path.relative_to(tmp_path), "->", "DIR" if path.is_dir() else "FILE") 29 | 30 | @pytest.mark.parametrize("input_db", databases_to_implement) 31 | def test_database_can_run(input_db): 32 | """ 33 | Opens a database object on a test database, and runs the test query on it. 34 | Fails if commec environment is not setup correctly, or if the database object 35 | defaults are invalid etc. 36 | 37 | Something similar to this would be useful to be run 38 | instead of --help during the conda recipe checks. 39 | """ 40 | 41 | db_dir = os.path.join(DATABASE_DIRECTORY, input_db[1]) 42 | db_file = os.path.join(db_dir, input_db[2]) 43 | 44 | output_file = "db.out" 45 | 46 | new_db = input_db[0](db_file, INPUT_QUERY, output_file, force=True) 47 | new_db.search() 48 | assert new_db.validate_output() 49 | 50 | version: str = new_db.get_version_information() 51 | assert version 52 | 53 | if os.path.isfile(output_file): 54 | os.remove(output_file) 55 | 56 | 57 | bad_databases = [ 58 | [DiamondHandler, "nr_dmnd", "bad"], 59 | [BlastNHandler, "nt_blast", "bad"], 60 | [BlastXHandler, "nr_blast", "bad"], 61 | [HmmerHandler, "low_concern_db", "bad.hmm"], 62 | [CmscanHandler, "low_concern_db", "bad.cmscan"], 63 | [DiamondHandler, "bad", "bad"], 64 | [BlastNHandler, "bad", "bad"], 65 | [BlastXHandler, "bad", "bad"], 66 | [HmmerHandler, "bad", "bad.hmm"], 67 | [CmscanHandler, "bad", "bad.cmscan"], 68 | ] 69 | 70 | 71 | @pytest.mark.parametrize("input_db", bad_databases) 72 | def test_database_no_file(input_db): 73 | """ 74 | Simply ensures that the input databases are failing there validation. 75 | """ 76 | db_dir = os.path.join(DATABASE_DIRECTORY, input_db[1]) 77 | db_file = os.path.join(db_dir, input_db[2]) 78 | output_file = "db.out" 79 | 80 | try: 81 | input_db[0](db_file, INPUT_QUERY, output_file) 82 | assert False 83 | except DatabaseValidationError: 84 | assert True 85 | 86 | n_jobs = [ 87 | None, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 88 | ] 89 | 90 | @pytest.mark.parametrize("input_jobs", n_jobs) 91 | def test_diamond_job_and_threads_calculations(input_jobs): 92 | """ 93 | Tests a range of threads, and diamond database sizes, 94 | for automatically calculating the optimum number of runs, 95 | and threads per run. Such that no CPU time is wasted. 96 | No specific expected outcomes, but we can check general expectations: 97 | - Never exceed max threads. 98 | No specific expected outcomes, but we check general expectations (e.g. 
never exceeding max_threads) 99 | """ 100 | handler = DiamondHandler( 101 | "commec/tests/test_dbs/nr_dmnd/nr", 102 | "commec/tests/test_data/single_record.fasta", 103 | "output.test", 104 | ) 105 | handler.jobs = input_jobs 106 | 107 | for max_threads in range(1, 25): 108 | for n_database_files in range(3, 9): 109 | concurrent_runs, threads_per_run = handler.determine_runs_and_threads( 110 | max_threads, n_database_files 111 | ) 112 | 113 | # If input jobs is provided, we should never exceed max threads. 114 | assert concurrent_runs * threads_per_run <= max_threads 115 | 116 | # If no number of input jobs is provided: 117 | # We should ALWAYS use all available threads. 118 | # We may use less than Max Threads if the remainder is 0 for the no. of database files 119 | if input_jobs is None: 120 | assert ((concurrent_runs * threads_per_run == max_threads) or 121 | (concurrent_runs * threads_per_run % n_database_files == 0)), f""" 122 | {concurrent_runs} runs with {threads_per_run} threads. Input settings: 123 | {max_threads} max threads, {n_database_files} dbs, {input_jobs} input jobs no. 124 | """ 125 | 126 | 127 | @pytest.mark.parametrize( 128 | "input_jobs, max_threads, n_database_files, expected_runs, expected_threads", 129 | [ 130 | (None, 20, 6, 2, 10), # jobs capped by db count, using all threads 131 | (None, 8, 5, 1, 5), # jobs capped by db count, not using all threads 132 | (3, 12, 6, 3, 4), # jobs=3 --> 3 runs with 4 threads each 133 | (10, 20, 5, 5, 4), # jobs=10 > db=5, capped to 5 runs with 4 threads each 134 | (20, 10, 5, 5, 2), # jobs=20 > threads=10, capped to 5 runs with 2 threads each 135 | (10, 4, 5, 4, 1), # jobs=10 > db, threads, capped to 4 runs with 1 thread each 136 | ] 137 | ) 138 | def test_diamond_job_and_threads_calculations_parametrized( 139 | input_jobs, max_threads, n_database_files, expected_runs, expected_threads 140 | ): 141 | """ 142 | Specific test cases for Diamond Jobs. 143 | """ 144 | handler = DiamondHandler( 145 | "commec/tests/test_dbs/nr_dmnd/nr", 146 | "commec/tests/test_data/single_record.fasta", 147 | "output.test", 148 | ) 149 | handler.jobs = input_jobs 150 | concurrent_runs, threads_per_run = handler.determine_runs_and_threads( 151 | max_threads, n_database_files 152 | ) 153 | 154 | assert concurrent_runs == expected_runs, f""" 155 | {input_jobs} jobs, {max_threads} threads failed 156 | {concurrent_runs} for expected ({expected_runs}) concurrent runs. 157 | """ 158 | assert threads_per_run == expected_threads, f""" 159 | {input_jobs} jobs, {max_threads} threads failed 160 | {threads_per_run} for expected ({expected_threads}) threads per run. 161 | """ -------------------------------------------------------------------------------- /commec/config/json_io.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) 2021-2024 International Biosecurity and Biosafety Initiative for Science 3 | ''' 4 | Set of tools for retrieving and storing information important to screen 5 | outputs. Information is stored as a structure of dataclasses (headed by ScreenResult), 6 | and converted between the dataclass / dict / json_file as required. The 7 | conversions are done dynamically, and it is recommended to use and 8 | interact with the dataclasses only, to maintain version format, and not 9 | create erroneous outputs to the JSON which won't be read back in. This 10 | ensures an expected i/o behaviour.
11 | 12 | The single exception to this is the "annotations" dictionary, present 13 | in the HitDescription, which contains non-structured information, and is 14 | populated with differing information under differing keys depending on 15 | which step the information is derived (Biorisk, Taxonomy etc) 16 | 17 | In this way, the JSON object serves as a common state, that can be updated 18 | whilst not being temporally appended like a log file i.e. .screen file. 19 | 20 | The JSON stores all pertinent information of a run. 21 | ''' 22 | 23 | # Consider whether this can get away with being part of config. rename to IO config? 24 | 25 | import json 26 | import string 27 | import os 28 | from dataclasses import asdict, fields, is_dataclass 29 | from typing import Dict, Type, get_origin, Any, get_args 30 | from enum import StrEnum 31 | from commec.config.result import ScreenResult, JSON_COMMEC_FORMAT_VERSION 32 | 33 | class IoVersionError(RuntimeError): 34 | """Custom exception when handling differing versions with Commec output JSON.""" 35 | 36 | def encode_screen_data_to_json(input_result: ScreenResult, 37 | 38 | output_json_filepath: string = "output.json") -> None: 39 | ''' Converts a ScreenResult class object into a JSON file at the given filepath.''' 40 | try: 41 | with open(output_json_filepath, "w", encoding="utf-8") as json_file: 42 | json.dump(asdict(input_result), json_file, indent=2) 43 | except TypeError as e: 44 | print("Error outputting JSON:", e) 45 | print(input_result) 46 | 47 | def encode_dict_to_screen_data(input_dict : dict) -> ScreenResult: 48 | ''' Converts a dictionary into a ScreenResult object, 49 | any keys within the dictionary not part of the ScreenResult format are lost. 50 | any missing information will be simple set as defaults.''' 51 | return dict_to_dataclass(ScreenResult, input_dict) 52 | 53 | # Convert the dictionary back to the dataclass or list of dataclass 54 | def dict_to_dataclass(cls: Type, data: Dict[str, Any]) -> Any: 55 | ''' 56 | Convert a dict, into appropriate dataclass, or list of dataclass, 57 | invalid keys to the dataclass structure are ignored. 
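A minimal illustrative sketch (`Point` is a hypothetical dataclass, not part of commec):

    >>> from dataclasses import dataclass
    >>> @dataclass
    ... class Point:
    ...     x: int = 0
    ...     y: int = 0
    >>> dict_to_dataclass(Point, {"x": 1, "unknown": 5})
    Point(x=1, y=0)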
58 | ''' 59 | # Prepare a dictionary for filtered data 60 | filtered_data = {} 61 | 62 | if data is None: 63 | return filtered_data 64 | 65 | for f in fields(cls): 66 | field_name = f.name 67 | field_type = f.type 68 | 69 | if field_name in data: 70 | field_value = data[field_name] 71 | 72 | # Check if the field is a dataclass 73 | if is_dataclass(field_type): 74 | filtered_data[field_name] = dict_to_dataclass(field_type, field_value) 75 | continue 76 | 77 | # Check if the field is a list 78 | if get_origin(field_type) is list: 79 | item_type = get_args(field_type)[0] 80 | 81 | # Handle lists of StrEnums 82 | if issubclass(item_type, StrEnum): 83 | filtered_data[field_name] = [item_type(item) for item in field_value] 84 | 85 | #Handles Dataclasses 86 | if is_dataclass(item_type) and isinstance(field_value, list): 87 | filtered_data[field_name] = [ 88 | dict_to_dataclass(item_type, item) for item in field_value 89 | if isinstance(item, dict) 90 | and any(key in {f.name for f in fields(item_type)} 91 | for key in item.keys()) or isinstance(item, item_type)] 92 | continue 93 | 94 | filtered_data[field_name] = field_value 95 | continue 96 | 97 | # Check if the field is a dict of dataclasses 98 | if get_origin(field_type) is dict: 99 | _key_type, value_type = get_args(field_type) 100 | 101 | # Handle dicts of dataclasses 102 | if is_dataclass(value_type): 103 | filtered_data[field_name] = { 104 | key: dict_to_dataclass(value_type, value) if isinstance(value, dict) 105 | else value for key, value in field_value.items() 106 | if isinstance(value, (dict, value_type)) 107 | } 108 | continue 109 | 110 | filtered_data[field_name] = field_value 111 | continue 112 | 113 | # Handle custom StrEnums 114 | if issubclass(field_type, StrEnum): 115 | try: 116 | filtered_data[field_name] = field_type(field_value) 117 | except ValueError: 118 | print(f"Invalid value '{field_value}' for " 119 | f"field '{field_name}' of type {field_type}.") 120 | continue 121 | 122 | # Handle other field types 123 | filtered_data[field_name] = field_value 124 | 125 | # Create an instance of the dataclass with the filtered data 126 | return cls(**filtered_data) 127 | 128 | def get_screen_data_from_json(input_json_filepath: string) -> ScreenResult: 129 | ''' Loads a JSON file from given filepath and returns 130 | a populated ScreenResult object from its contents. If the file does not 131 | exist, then returns a new screen data object.''' 132 | if not os.path.exists(input_json_filepath): 133 | return ScreenResult() 134 | 135 | json_string : str 136 | with open(input_json_filepath, "r", encoding="utf-8") as json_file: 137 | # Read the file contents as a string 138 | json_string = json_file.read() 139 | my_data : dict = json.loads(json_string) 140 | 141 | # Check version of imported json. 142 | input_version = my_data["commec_info"]["json_output_version"] 143 | if not input_version == JSON_COMMEC_FORMAT_VERSION: 144 | raise IoVersionError(f"Version difference between input (v.{input_version}) and" 145 | f" expected (v.{JSON_COMMEC_FORMAT_VERSION})" 146 | f": {input_json_filepath}") 147 | return encode_dict_to_screen_data(my_data) 148 | -------------------------------------------------------------------------------- /dev_scripts/summarize_screens.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) 2021-2024 International Biosecurity and Biosafety Initiative for Science 3 | """ 4 | Summarize the screening for all .screen files in a directory. 
This is intended to be useful for 5 | debugging the pipeline, rather than for interpreting the outputs. 6 | 7 | Produces a CSV (name set by -o, defaults to 'output.csv') which contains the outcome for each step 8 | of the pipeline: 9 | 10 | * flag the sequence was flagged in this step 11 | * pass the sequence passed in this step 12 | * skip this step was intentionally not run 13 | * error an error occurred during this step 14 | * - this step was not run due to an error, interrupt, or other unexpected outcome 15 | * mix (protein only) the best match is to a mix of regulated- and non-regulated organisms 16 | * warn (biorisk only) found a significant hit to a virulence not from a regulated pathogen 17 | 18 | Each line in the CSV corresponds to a .screen file. The full paths to the files are also provided. 19 | 20 | Additionally, it includes three columns indicating whether the sequence was flagged as a regulated 21 | virus, bacteria, or eukaryote. 22 | """ 23 | import os 24 | import csv 25 | import argparse 26 | import re 27 | 28 | 29 | def process_step(step_content, step_number): 30 | """ 31 | Process the .screen file output to determine the outcome of the step. 32 | """ 33 | step_processors = { 34 | 1: get_biorisk_outcome, 35 | 2: get_protein_outcome, 36 | 3: get_nucleotide_outcome, 37 | 4: process_benign_step, 38 | } 39 | return step_processors.get(step_number, lambda _: "-")(step_content) 40 | 41 | 42 | def get_biorisk_outcome(step_content): 43 | """Process biorisk scan step from .screen file.""" 44 | if "FLAG" in step_content: 45 | return "flag" 46 | if "Virulence factor found" in step_content: 47 | return "warn" 48 | if ( 49 | "Biorisks: no hits detected, PASS" in step_content 50 | or "Biorisks: no significant hits detected, PASS" in step_content 51 | ): 52 | return "pass" 53 | if "ERROR:" in step_content: 54 | return "error" 55 | return "-" 56 | 57 | 58 | def get_protein_outcome(step_content): 59 | """Process protein scan step from .screen file.""" 60 | if "Best match to sequence(s)" in step_content and "FLAG" in step_content: 61 | return "flag" 62 | if "found in both regulated and non-regulated organisms" in step_content: 63 | return "mix" 64 | if "no top hit exclusive to a regulated pathogen: PASS" in step_content: 65 | return "pass" 66 | if "ERROR:" in step_content: 67 | return "error" 68 | return "-" 69 | 70 | 71 | def get_nucleotide_outcome(step_content): 72 | """Process nucleotide scan step from .screen file.""" 73 | if "no noncoding regions >= 50 bases found, skipping nt scan" in step_content: 74 | return "skip" 75 | if "Best match to sequence(s)" in step_content and "FLAG" in step_content: 76 | return "flag" 77 | if "no top hit exclusive to a regulated pathogen: PASS" in step_content: 78 | return "pass" 79 | if "ERROR:" in step_content: 80 | return "error" 81 | return "-" 82 | 83 | 84 | def process_benign_step(step_content): 85 | """Process benign scan step from .screen file.""" 86 | if "no regulated regions to clear" in step_content: 87 | return "skip" 88 | if ( 89 | "Regulated region at bases" in step_content 90 | and "failed to clear: FLAG" in step_content 91 | ): 92 | return "flag" 93 | if "all regulated regions cleared: PASS" in step_content: 94 | return "pass" 95 | if "ERROR:" in step_content: 96 | return "error" 97 | return "-" 98 | 99 | 100 | def check_regulated_flags(content): 101 | """ 102 | Check for regulated virus, bacteria, and eukaryote flags in the content. 
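For example, content containing "FLAG (virus)" and no other organism flags yields {"virus_flag": "true", "bacteria_flag": "false", "eukaryote_flag": "false"}.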
103 | """ 104 | return { 105 | "virus_flag": "true" if "FLAG (virus)" in content else "false", 106 | "bacteria_flag": "true" if "FLAG (bacteria)" in content else "false", 107 | "eukaryote_flag": "true" if "FLAG (eukaryote)" in content else "false", 108 | } 109 | 110 | 111 | def process_file(file_path): 112 | """ 113 | Read input screen file, split into steps, and prepare dict of results for CSV output. 114 | """ 115 | with open(file_path, "r", encoding="utf-8") as file: 116 | content = file.read() 117 | 118 | filename = os.path.basename(file_path) 119 | filename_without_extension = os.path.splitext(filename)[0] 120 | 121 | steps = re.split(r">> STEP \d:", content) 122 | steps = [step.strip() for step in steps if step.strip()] 123 | 124 | results = { 125 | "filename": filename_without_extension, 126 | "location": file_path, 127 | "biorisk": process_step(steps[0] if len(steps) > 0 else "-", 1), 128 | "protein": process_step(steps[1] if len(steps) > 1 else "-", 2), 129 | "nucleotide": process_step(steps[2] if len(steps) > 2 else "-", 3), 130 | "benign": process_step(steps[3] if len(steps) > 3 else "-", 4), 131 | } 132 | 133 | # Add regulated flags 134 | results.update(check_regulated_flags(content)) 135 | 136 | return results 137 | 138 | 139 | def main(directory, output_file): 140 | """ 141 | Read all files that end with .screen in the input directory, then summarize their outcomes in a 142 | CSV. 143 | """ 144 | results = [] 145 | for root, _, files in os.walk(directory): 146 | for file in files: 147 | if file.endswith(".screen"): 148 | file_path = os.path.join(root, file) 149 | results.append(process_file(file_path)) 150 | 151 | # Write results to CSV 152 | with open(output_file, "w", newline="", encoding="utf-8") as csvfile: 153 | fieldnames = [ 154 | "filename", 155 | "location", 156 | "biorisk", 157 | "protein", 158 | "nucleotide", 159 | "virus_flag", 160 | "bacteria_flag", 161 | "eukaryote_flag", 162 | "benign", 163 | ] 164 | writer = csv.DictWriter(csvfile, fieldnames=fieldnames) 165 | 166 | writer.writeheader() 167 | for result in results: 168 | writer.writerow(result) 169 | 170 | print(f"Results written to {output_file}") 171 | 172 | 173 | if __name__ == "__main__": 174 | parser = argparse.ArgumentParser( 175 | description="Process .screen files and output results to CSV." 176 | ) 177 | parser.add_argument("directory", help="Directory to search for .screen files") 178 | parser.add_argument( 179 | "-o", 180 | "--output", 181 | default="output.csv", 182 | help="Output CSV file name (default: output.csv)", 183 | ) 184 | 185 | args = parser.parse_args() 186 | 187 | main(args.directory, args.output) 188 | -------------------------------------------------------------------------------- /dev_scripts/collate-screens.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Collate screen files located in any subdirectories of an input directory, then match description 4 | fields between FASTAs in adjacent _input directories and FASTAs in another directory, renaming 5 | collated screen files with the names of matched FASTA files. Created as a workaround for breaking 6 | changes introduced in the commec output format in v0.3. 
7 | 8 | Required inputs: 9 | -i, --input-dir input directory to recursively search for .screen files 10 | -o, --output-dir output directory where screen files should be collated 11 | -f, --fasta-dir directory to search recursively for FASTAs input to commec screen 12 | 13 | Example: 14 | $ python collate-screens.py -i . -o ./test-collate-pls -f ../functional-json-test/ 15 | """ 16 | import argparse 17 | import csv 18 | import os 19 | import shutil 20 | from pathlib import Path 21 | from typing import Dict, List, Tuple 22 | 23 | def clean_header(header: str) -> str: 24 | """ 25 | Clean a FASTA header by replacing whitespace and special characters with underscores. 26 | 27 | Args: 28 | header: Original FASTA header 29 | 30 | Returns: 31 | Cleaned header string 32 | """ 33 | return "".join( 34 | "_" if c.isspace() or c == "\xc2\xa0" or c == "#" else c 35 | for c in header 36 | ) 37 | 38 | def parse_fasta_header(fasta_path: Path) -> Tuple[str, str]: 39 | """ 40 | Extract filename and cleaned FASTA header from a FASTA file. 41 | 42 | Args: 43 | fasta_path: Path to the FASTA file 44 | 45 | Returns: 46 | Tuple of (filename, cleaned_header) 47 | """ 48 | filename = fasta_path.name 49 | with open(fasta_path) as f: 50 | for line in f: 51 | if line.startswith('>'): 52 | header = line.strip() 53 | cleaned_header = clean_header(header) 54 | return filename, cleaned_header 55 | raise ValueError(f"No valid FASTA header found in {fasta_path}") 56 | 57 | def build_fasta_mapping(fasta_dir: Path) -> Dict[str, str]: 58 | """ 59 | Build mapping of FASTA headers to filenames from all FASTA files in directory. 60 | 61 | Args: 62 | fasta_dir: Directory containing FASTA files 63 | 64 | Returns: 65 | Dictionary mapping FASTA headers to original filenames 66 | """ 67 | mapping = {} 68 | for root, _, files in os.walk(fasta_dir): 69 | for file in files: 70 | if (file.endswith('.fasta') and 71 | not file.endswith('.noncoding.fasta') and 72 | not file.endswith('.cleaned.fasta')): 73 | fasta_path = Path(root) / file 74 | try: 75 | filename, cleaned_header = parse_fasta_header(fasta_path) 76 | mapping[cleaned_header] = filename 77 | except (ValueError, IOError) as e: 78 | print(f"Warning: Could not process {fasta_path}: {e}") 79 | return mapping 80 | 81 | def find_screen_files(input_dir: Path) -> List[Path]: 82 | """ 83 | Find all .screen files in the input directory. 84 | """ 85 | screen_files = [] 86 | for root, _, files in os.walk(input_dir): 87 | for file in files: 88 | if file.endswith('.screen'): 89 | screen_files.append(Path(root) / file) 90 | return screen_files 91 | 92 | def get_matching_fasta_header(screen_path: Path) -> str: 93 | """ 94 | Get the FASTA header from the input screen file. 
95 | """ 96 | # For a file named "something.screen", look for "input_something/something.cleaned.fasta" 97 | screen_name = screen_path.stem # removes .screen extension 98 | fasta_path = screen_path.parent / f"input_{screen_name}" / f"{screen_name}.cleaned.fasta" 99 | if not fasta_path.exists(): 100 | raise FileNotFoundError(f"Expected FASTA not found at {fasta_path}") 101 | 102 | with open(fasta_path) as f: 103 | for line in f: 104 | if line.startswith('>'): 105 | header = line.strip() 106 | return clean_header(header) 107 | raise ValueError(f"No valid FASTA header found in {fasta_path}") 108 | 109 | def main(): 110 | parser = argparse.ArgumentParser(description='Collate and rename screen files based on FASTA headers') 111 | parser.add_argument('-i', '--input-dir', required=True, help='Input directory containing screen files') 112 | parser.add_argument('-o', '--output-dir', required=True, help='Output directory for renamed screen files') 113 | parser.add_argument('-f', '--fasta-dir', required=True, help='Directory containing original FASTA files') 114 | 115 | args = parser.parse_args() 116 | 117 | input_dir = Path(args.input_dir).resolve() 118 | output_dir = Path(args.output_dir).resolve() 119 | fasta_dir = Path(args.fasta_dir).resolve() 120 | 121 | # Create output directory if it doesn't exist 122 | output_dir.mkdir(parents=True, exist_ok=True) 123 | 124 | # Build mapping of FASTA headers to original filenames 125 | print(f"Building FASTA header mapping based on files found in {fasta_dir}...") 126 | header_to_filename = build_fasta_mapping(fasta_dir) 127 | num_fastas_mapped = len(header_to_filename) 128 | if num_fastas_mapped == 0: 129 | print("Could not find any FASTAs to map! Note that .cleaned and .noncoding are filtered out.\nExiting...") 130 | exit(0) 131 | print(f"Found {num_fastas_mapped} FASTAS for mapping...") 132 | 133 | # Find all screen files 134 | print(f"Finding screen files in {input_dir}...") 135 | screen_files = find_screen_files(input_dir) 136 | print(f"Processing {len(screen_files)} screen files...") 137 | 138 | mappings = [] 139 | for screen_path in screen_files: 140 | try: 141 | # Get the FASTA header for this screen file 142 | fasta_header = get_matching_fasta_header(screen_path) 143 | 144 | # Look up the matching filename 145 | if fasta_header not in header_to_filename: 146 | print(f"Warning: No matching FASTA file found for {screen_path}") 147 | continue 148 | 149 | matching_filename = header_to_filename[fasta_header] 150 | new_filename = f"{Path(matching_filename).stem}.screen" 151 | output_path = output_dir / new_filename 152 | 153 | # Copy the screen file with the new name 154 | shutil.copy2(screen_path, output_path) 155 | 156 | # Record the mapping 157 | mappings.append({ 158 | 'screen': str(screen_path), 159 | 'matched_fasta': matching_filename, 160 | 'renamed_screen': new_filename 161 | }) 162 | 163 | print(f"Processed: {screen_path} -> {output_path}") 164 | 165 | except (FileNotFoundError, ValueError, IOError) as e: 166 | print(f"Error processing {screen_path}: {e}") 167 | 168 | # Write mapping CSV 169 | csv_path = output_dir / 'screen_mappings.csv' 170 | with open(csv_path, 'w', newline='') as f: 171 | writer = csv.DictWriter(f, fieldnames=['screen', 'matched_fasta', 'renamed_screen']) 172 | writer.writeheader() 173 | writer.writerows(mappings) 174 | 175 | print(f"\nProcessed {len(mappings)} screen files") 176 | print(f"Mapping saved to {csv_path}") 177 | 178 | if __name__ == '__main__': 179 | main() 180 | 
-------------------------------------------------------------------------------- /commec/tests/test_query.py: -------------------------------------------------------------------------------- 1 | from io import StringIO 2 | import os 3 | import pandas as pd 4 | import pytest 5 | import textwrap 6 | from Bio.Seq import Seq 7 | from Bio.SeqRecord import SeqRecord 8 | from commec.config.query import Query, QueryTranslation 9 | 10 | INPUT_QUERY = os.path.join(os.path.dirname(__file__), "test_data/single_record.fasta") 11 | 12 | def test_get_frame_length(): 13 | # 11 nt query 14 | query = Query(SeqRecord(Seq("atgtgccatgg"), id="test")) 15 | assert 9 == query._get_frame_length(frame_offset=0) 16 | assert 9 == query._get_frame_length(frame_offset=1) 17 | assert 9 == query._get_frame_length(frame_offset=2) 18 | 19 | # 15 nt query 20 | query = Query(SeqRecord(Seq("atgtgccatggatgc"), id="test")) 21 | assert 15 == query._get_frame_length(frame_offset=0) 22 | assert 12 == query._get_frame_length(frame_offset=1) 23 | assert 12 == query._get_frame_length(frame_offset=2) 24 | 25 | # 16 nt query 26 | query = Query(SeqRecord(Seq("atgtgccatggatgca"), id="test")) 27 | assert 15 == query._get_frame_length(frame_offset=0) 28 | assert 15 == query._get_frame_length(frame_offset=1) 29 | assert 12 == query._get_frame_length(frame_offset=2) 30 | 31 | def test_translate_to_file(tmp_path): 32 | query = Query(SeqRecord(Seq("atgtgccatgg"), id="test")) 33 | 34 | expected_output = textwrap.dedent( 35 | """\ 36 | >test_1 37 | MCH 38 | >test_2 39 | CAM 40 | >test_3 41 | VPW 42 | >test_4 43 | MAH 44 | >test_5 45 | HGT 46 | >test_6 47 | PWH 48 | """ 49 | ) 50 | 51 | aa_output = tmp_path / "test_translated.faa" 52 | 53 | query.translate(aa_output) 54 | 55 | # Check if the output file exists 56 | assert aa_output.exists() 57 | 58 | actual_output = aa_output.read_text() 59 | assert expected_output.strip() == actual_output.strip() 60 | 61 | 62 | def test_translate(): 63 | """ 64 | Test translation from nucleotide to 6 frames of protein sequences. 
65 | """ 66 | # 11nt query 67 | query = Query(SeqRecord(Seq("atgtgccatgg"), id="test")) 68 | 69 | # Input sequence: atgtgccatgg 70 | # Translations: 71 | # Frame Pos Codon split Translation 72 | # 1 0 atg tgc cat gg MCH 73 | # 2 1 a tgt gcc atg g CAM 74 | # 3 2 at gtg cca tgg VPW 75 | # 4 -0 cc atg gca cat MAH 76 | # 5 -1 c cat ggc aca t HGT 77 | # 6 -2 cca tgg cac at PWH 78 | expected_translations = [ 79 | QueryTranslation(frame=1, sequence="MCH"), 80 | QueryTranslation(frame=2, sequence="CAM"), 81 | QueryTranslation(frame=3, sequence="VPW"), 82 | QueryTranslation(frame=4, sequence="MAH"), 83 | QueryTranslation(frame=5, sequence="HGT"), 84 | QueryTranslation(frame=6, sequence="PWH"), 85 | ] 86 | 87 | query._translate() 88 | assert expected_translations == query.translations 89 | 90 | # 15nt query 91 | query = Query(SeqRecord(Seq("acgcacctgatcgct"), id="test")) 92 | 93 | 94 | # Input sequence: acgcacctgatcgct 95 | # Translations: 96 | # Frame Pos Codon split Translation 97 | # 1 0 acg cac ctg atc gct THLIA 98 | # 2 1 a cgc acc tga tcg ct RTXS 99 | # 3 2 ac gca cct gat cgc t APDR 100 | # 4 -0 agc gat cag gtg cgt SDQVR 101 | # 5 -1 a gcg atc agg tgc gt RSGA 102 | # 6 -2 ag cga tca ggt gcg t AIRC 103 | expected_translations = [ 104 | QueryTranslation(frame=1, sequence="THLIA"), 105 | QueryTranslation(frame=2, sequence="RTXS"), 106 | QueryTranslation(frame=3, sequence="APDR"), 107 | QueryTranslation(frame=4, sequence="SDQVR"), 108 | QueryTranslation(frame=5, sequence="RSGA"), 109 | QueryTranslation(frame=6, sequence="AIRC"), 110 | ] 111 | 112 | query._translate() 113 | assert expected_translations == query.translations 114 | 115 | 116 | def test_ambigious(): 117 | """ 118 | Test translation from nucleotide to 6 frames of protein sequences using ambigious nts 119 | | --------------------------------------------------------------------- | 120 | | Code | Bases Represented | Meaning | 121 | | ---------------- | ----------------- | ------------------------------ | 122 | | **A** | A | Adenine | 123 | | **C** | C | Cytosine | 124 | | **G** | G | Guanine | 125 | | **T** (or **U**) | T (or U in RNA) | Thymine (or Uracil) | 126 | | **R** | A or G | puRine | 127 | | **Y** | C or T | pYrimidine | 128 | | **S** | G or C | Strong interaction (3 H-bonds) | 129 | | **W** | A or T | Weak interaction (2 H-bonds) | 130 | | **K** | G or T | Keto | 131 | | **M** | A or C | aMino | 132 | | **B** | C or G or T | not A | 133 | | **D** | A or G or T | not C | 134 | | **H** | A or C or T | not G | 135 | | **V** | A or C or G | not T | 136 | | **N** | A or C or G or T | any base (completely unknown) | 137 | | --------------------------------------------------------------------- | 138 | | Codon | Ambiguity | Expansions | Amino Acid | 139 | | ----- | --------- | ------------------ | ---------- | 140 | | AAR | R=A/G | AAA, AAG | Lys | 141 | | TAY | Y=C/T | TAC, TAT | Tyr | 142 | | GCN | N=A/C/G/T | GCA, GCC, GCG, GCT | Ala | 143 | | AAY | Y=C/T | AAC, AAT | Asn | 144 | | GAR | R=A/G | GAA, GAG | Glu | 145 | | ACM | M=A/C | ACA, ACC | Thr | 146 | | CCS | S=C/G | CCC, CCG | Pro | 147 | | GGW | W=A/T | GGA, GGT | Gly | 148 | | GTK | K=G/T | GTG, GTT | Val | 149 | | ATH | H=A/C/T | ATA, ATC, ATT | Ile | 150 | | GTD | D=A/G/T | GTA, GTG, GTT | Val | 151 | | CCB | B=C/G/T | CCC, CCG, CCT | Pro | 152 | | GTV | V=A/C/G | GTA, GTC, GTG | Val | 153 | """ 154 | # 11nt query 155 | #query = Query(SeqRecord(Seq("atntnccatgg"), id="test")) 156 | #query = Query(SeqRecord(Seq("ATGAARTAYGCNAAYGARACNABGGADCAHGAVACNTGG"), id="test")) 157 | 
query = Query(SeqRecord(Seq("ATGAARTAYGCNAAYGARACMCCSGGWGTKATHGTDCCBGTV"), id="test")) 158 | 159 | expected_translations = [ 160 | QueryTranslation(frame=1, sequence="MKYANETPGVIVPV"), 161 | QueryTranslation(frame=2, sequence="XXXXXXXXXXXXX"), 162 | QueryTranslation(frame=3, sequence="EXXXXBXXXXXXX"), 163 | QueryTranslation(frame=4, sequence="XXXXXXXXXXXXXH"), 164 | QueryTranslation(frame=5, sequence="XXXXXXXXXXXXS"), 165 | QueryTranslation(frame=6, sequence="TGTITPGVSXAYF"), 166 | ] 167 | 168 | query._translate() 169 | assert expected_translations == query.translations, query.translations[2:4] 170 | -------------------------------------------------------------------------------- /commec/utils/logger.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) 2021-2024 International Biosecurity and Biosafety Initiative for Science 3 | """ 4 | Utilities to set up commec package logging. 5 | """ 6 | 7 | import logging 8 | import sys 9 | import textwrap 10 | 11 | class TextWrapFormatter(logging.Formatter): 12 | """ 13 | Format multi-line log messages with proper vertical alignment, 14 | configurable styling, and text wrapping for longer messages. 15 | """ 16 | 17 | def __init__(self, *args, fmt=None, continuation_marker="│ ", line_width=120, **kwargs): 18 | if fmt is None: 19 | fmt = f"%(levelname)-8s{continuation_marker}%(message)s" 20 | super().__init__(fmt, *args, **kwargs) 21 | self.continuation_marker = continuation_marker 22 | self.line_width = line_width 23 | 24 | # String to prepended to all lines of wrapped output except the first 25 | self.indent_size = self._find_message_start() - len(self.continuation_marker) 26 | self.indent = " " * self.indent_size + self.continuation_marker 27 | 28 | def _find_message_start(self): 29 | """ 30 | Deterine how far to indent messages by formatting a dummy message. 31 | """ 32 | sample = logging.LogRecord( 33 | name="dummy", 34 | level=logging.INFO, 35 | pathname="./test", 36 | lineno=0, 37 | msg="DUMMY_MESSAGE", 38 | args=(), 39 | exc_info=None, 40 | ) 41 | sample.asctime = self.formatTime(sample) 42 | sample_formatted = super().format(sample) 43 | return sample_formatted.find(sample.msg) 44 | 45 | def format(self, record): 46 | """ 47 | Custom formatter for Commec logging. 48 | 49 | Accepts the following keywords in the `extra` dictionary: 50 | 51 | - **no_wrap**: 52 | Skips text wrapping. 53 | 54 | - **no_prefix**: 55 | Skips the `INFO │ ` prefixes. 56 | 57 | - **box**, **box_up**, **box_down**: 58 | Use Unicode box-drawing characters (e.g., `"─┘"` or `"─┐"`) to tie off 59 | formatted prefixes when switching to no-prefix lines. 
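A usage sketch (the logger name and message here are illustrative); elsewhere in commec this is used as, e.g.:

    logging.getLogger("commec").info("raw tool output", extra={"no_prefix": True, "cap": True})

where "cap" is handled below as shorthand for setting both box_up and box_down.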
60 | """ 61 | 62 | # Check extra options for format removal: 63 | if getattr(record, "no_prefix", False): 64 | box_up = getattr(record, "box_up", False) 65 | box_down = getattr(record, "box_down", False) 66 | if getattr(record, "cap", False): 67 | box_up = True 68 | box_down = True 69 | 70 | prefix = self.indent_size * "─" + "┘\n" if box_up else "" 71 | suffix = "\n" + self.indent_size * "─" + "┐" if box_down else "" 72 | 73 | return prefix + record.getMessage() + suffix # No formatting 74 | 75 | message = super().format(record) 76 | 77 | if getattr(record, "no_wrap", False): 78 | return message 79 | 80 | lines = message.splitlines() 81 | 82 | formatted_lines = [] 83 | # First line gets the levelname/timestamp/etc from super().format, then 84 | # long lines are wrapped with the indent 85 | wrapped_first = textwrap.wrap( 86 | lines[0], 87 | width=self.line_width, 88 | subsequent_indent=self.indent, 89 | break_long_words=False, 90 | break_on_hyphens=False, 91 | ) 92 | formatted_lines.extend(wrapped_first) 93 | 94 | # When a message has newlines, lines after the first should be indented even if short 95 | for line in lines[1:]: 96 | wrapped = textwrap.wrap( 97 | line, 98 | width=self.line_width, 99 | initial_indent=self.indent, 100 | subsequent_indent=self.indent, 101 | break_long_words=False, 102 | break_on_hyphens=False, 103 | ) 104 | formatted_lines.extend(wrapped) 105 | 106 | return "\n".join(formatted_lines) 107 | 108 | 109 | def setup_console_logging(log_level=logging.INFO): 110 | """Set up logging to console.""" 111 | commec_logger = logging.getLogger("commec") 112 | commec_logger.setLevel(log_level) 113 | 114 | # Check if the handler already exists to avoid duplicates 115 | if not any(isinstance(h, logging.StreamHandler) for h in commec_logger.handlers): 116 | console_handler = logging.StreamHandler() 117 | console_handler.setLevel(log_level) 118 | console_handler.setFormatter(TextWrapFormatter()) 119 | commec_logger.addHandler(console_handler) 120 | 121 | add_logging_to_excepthook() 122 | 123 | 124 | def setup_file_logging(filename, log_level=logging.INFO, log_mode="w"): 125 | """Set up logging to a file. Format determined based on level.""" 126 | commec_logger = logging.getLogger("commec") 127 | 128 | # Ensure the logger level is set to the lowest level of any handler 129 | current_level = commec_logger.level or logging.INFO 130 | commec_logger.setLevel(min(current_level, log_level)) 131 | 132 | # Log format has more detail if logging down to the debug level 133 | if log_level == logging.DEBUG: 134 | formatter = TextWrapFormatter( 135 | fmt="%(asctime)s│ %(levelname)-8s│ %(message)s", 136 | datefmt="%Y-%m-%d %H:%M:%S", # Full ISO-like format 137 | line_width = 300, # Longer lines for debug purposes. 138 | ) 139 | else: 140 | formatter = TextWrapFormatter("%(levelname)-8s│ %(message)s") 141 | 142 | # Update existing filehandlers, avoiding duplicates 143 | file_handler = None 144 | for handler in commec_logger.handlers: 145 | if ( 146 | isinstance(handler, logging.FileHandler) 147 | and getattr(handler, "baseFilename", None) == filename 148 | ): 149 | file_handler = handler 150 | break 151 | 152 | file_handler = file_handler or logging.FileHandler(filename, log_mode) 153 | file_handler.setLevel(log_level) 154 | file_handler.setFormatter(formatter) 155 | commec_logger.addHandler(file_handler) 156 | 157 | 158 | def add_logging_to_excepthook(): 159 | """ 160 | Ensure unhandled exceptions are logged to the commec package logger; 161 | original excepthook is still called. 
162 | """ 163 | original_excepthook = sys.excepthook 164 | 165 | def commec_exception_logger(exc_type, exc_value, exc_traceback): 166 | """Log exception to package logger.""" 167 | commec_logger = logging.getLogger("commec") 168 | 169 | if commec_logger.handlers: 170 | # Log the exception message at ERROR level 171 | error_message = f"Unhandled exception: {exc_type.__name__}: {exc_value}" 172 | commec_logger.error(error_message) 173 | 174 | # Log the full traceback at the DEBUG level 175 | commec_logger.debug( 176 | "Exception traceback:", exc_info=(exc_type, exc_value, exc_traceback) 177 | ) 178 | 179 | # Still call the original handler for console output 180 | original_excepthook(exc_type, exc_value, exc_traceback) 181 | 182 | sys.excepthook = commec_exception_logger 183 | 184 | 185 | def set_log_level(log_level, update_only_handler_type=None): 186 | """ 187 | Update the log level for the commec logger, as well as associated handlers. 188 | Optionally, restrict updates to only a particular class of handlers (e.g. StreamHandler). 189 | """ 190 | commec_logger = logging.getLogger("commec") 191 | commec_logger.setLevel(log_level) 192 | 193 | handlers_to_update = commec_logger.handlers 194 | if update_only_handler_type: 195 | handlers_to_update = [ 196 | h for h in handlers_to_update if isinstance(h, update_only_handler_type) 197 | ] 198 | 199 | for handler in handlers_to_update: 200 | handler.setLevel(log_level) 201 | -------------------------------------------------------------------------------- /commec/tools/search_handler.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) 2021-2024 International Biosecurity and Biosafety Initiative for Science 3 | """ 4 | Abstract base class defining a shared interface for search tools. 5 | """ 6 | from abc import ABC, abstractmethod 7 | import os 8 | from dataclasses import dataclass 9 | import subprocess 10 | import logging 11 | 12 | logger = logging.getLogger(__name__) 13 | 14 | @dataclass 15 | class SearchToolVersion: 16 | """Container class for outputting version related information from a database.""" 17 | 18 | tool_info: str = "x.x.x" 19 | database_info: str = "x.x.x" 20 | 21 | 22 | class DatabaseValidationError(Exception): 23 | """Custom exception for database validation errors.""" 24 | 25 | class SearchHandler(ABC): 26 | """ 27 | Abstract class defining tool interface including a database directory / file to search, an input 28 | query, and an output file to be used for screening. 29 | """ 30 | 31 | def __init__( 32 | self, 33 | database_file: str | os.PathLike, 34 | input_file: str | os.PathLike, 35 | out_file: str | os.PathLike, 36 | **kwargs, 37 | ): 38 | """ 39 | Initialise a Search Handler. 40 | 41 | Parameters 42 | ---------- 43 | database_file : str | os.PathLike 44 | Path to the database file. 45 | input_file : str | os.PathLike 46 | Path to the input file to be processed. 47 | out_file : str | os.PathLike 48 | Path where the output will be saved. 49 | 50 | Keyword Arguments 51 | ----------------- 52 | threads : int, optional 53 | Number of threads to use for processing. Default is 1. 54 | force : bool, optional 55 | Whether to force overwrite existing files. Default is False. 56 | 57 | Notes 58 | ----- 59 | - `database_file`, `input_file`, and `out_file` are validated on instantiation. 
60 | """ 61 | 62 | self.db_file = os.path.abspath(os.path.expanduser(database_file)) 63 | self.input_file = os.path.abspath(os.path.expanduser(input_file)) 64 | self.out_file = os.path.abspath(os.path.expanduser(out_file)) 65 | self.threads = kwargs.get('threads', 1) 66 | self.force = kwargs.get('force', False) 67 | self.arguments_dictionary = {} 68 | self.successful = True 69 | 70 | # Only validate database files if we actually intend on using them 71 | if not self.should_use_existing_output: 72 | self._validate_db() 73 | 74 | self.version_info = self.get_version_information() 75 | 76 | @property 77 | def db_directory(self): 78 | """Directory where databases to be searched are located.""" 79 | return os.path.dirname(self.db_file) 80 | 81 | @property 82 | def temp_log_file(self): 83 | """Temporary log file used for this search. Based on outfile name.""" 84 | return f"{self.out_file}.log.tmp" 85 | 86 | @property 87 | def should_use_existing_output(self) -> bool: 88 | """ 89 | True if (1) search is not forced and (2) output exists and is valid. 90 | """ 91 | return not self.force and self.validate_output() 92 | 93 | def search(self): 94 | """ 95 | Wrapper for _search, skipping if existing output should not be overwritten. 96 | """ 97 | if self.should_use_existing_output: 98 | logger.warning("%s expected output data already exists, " 99 | "will use existing data found in:", 100 | self.__class__.__name__) 101 | logger.warning(self.out_file, extra = {"no_prefix" : True, "cap":True}) 102 | else: 103 | self._search() 104 | 105 | @abstractmethod 106 | def _search(self): 107 | """ 108 | Use a tool to search the input query against a database. 109 | Should be implemented by all subclasses to perform the actual search against the database. 110 | """ 111 | 112 | @abstractmethod 113 | def read_output(self): 114 | """ 115 | Returns the output of the handler in the form of a pandas dataframe. 116 | """ 117 | 118 | @abstractmethod 119 | def get_version_information(self) -> SearchToolVersion: 120 | """ 121 | Provide version for the search tool used, to allow reproducibility. 122 | This method should be implemented by all subclasses to return tool-specific version info. 123 | """ 124 | 125 | def validate_output(self): 126 | """ 127 | Check the output file contains something, indicating that the search ran. 128 | Can be overridden if more complex checks for a particular tool are desired. 129 | Is overridden for Diamond outputs, which have no header information, and simply only 130 | checks for file-existance, rather than lack of content, for example. 131 | """ 132 | return not self.has_empty_output() 133 | 134 | def _validate_db(self): 135 | """ 136 | Validates that the database directory and file exists. Called on init. 137 | """ 138 | if not os.path.isdir(self.db_directory): 139 | raise DatabaseValidationError( 140 | f"Screening database directory not found at: {self.db_directory}." 141 | " Screening directory path can be set via --databases option or --config yaml." 142 | ) 143 | 144 | if not os.path.isfile(self.db_file): 145 | raise DatabaseValidationError( 146 | f"Provided database file not found: {self.db_file}." 147 | " File location can be set via --databases option or --config yaml." 
148 | ) 149 | 150 | def has_empty_output(self) -> bool: 151 | """Check if the output file is empty or non-existent.""" 152 | try: 153 | return os.path.getsize(self.out_file) == 0 154 | except OSError: 155 | # Errors such as FileNotFoundError considered empty 156 | return True 157 | 158 | def has_hits(self) -> bool: 159 | """Check if the output file has any hits (lines that do not start with '#').""" 160 | try: 161 | with open(self.out_file, "r", encoding="utf-8") as file: 162 | return any(not line.strip().startswith("#") for line in file) 163 | except FileNotFoundError: 164 | return False 165 | 166 | def format_args_for_cli(self) -> list: 167 | """ 168 | Format `self.arguments_dictionary` into a list of strings for use in the command line. 169 | """ 170 | formatted_args = [] 171 | for key, value in self.arguments_dictionary.items(): 172 | formatted_args.append(str(key)) 173 | if isinstance(value, list): 174 | formatted_args.append(" ".join(map(str, value))) 175 | elif value is not None: 176 | formatted_args.append(str(value)) 177 | return formatted_args 178 | 179 | def run_as_subprocess(self, command, out_file, raise_errors=False): 180 | """ 181 | Run a command using subprocess.run, piping stdout and stderr to `out_file`. 182 | """ 183 | self.successful = False 184 | 185 | logger.debug("SUBPROCESS: %s", " ".join(command)) 186 | logger.debug(" ".join(command), extra = {"no_prefix":True,"cap":True}) 187 | 188 | with open(out_file, "a", encoding="utf-8") as f: 189 | result = subprocess.run( 190 | command, stdout=f, stderr=subprocess.STDOUT, check=raise_errors 191 | ) 192 | 193 | if result.returncode != 0: 194 | command_str = " ".join(command) 195 | logger.error( 196 | "\t command '%s' failed with return code %s; stderr was piped to the output file", 197 | command_str, 198 | result.returncode, 199 | ) 200 | raise RuntimeError( 201 | f"subprocess.run of command '{command_str}' encountered error." 202 | f" Check {out_file} for logs." 203 | ) 204 | 205 | self.successful = True 206 | 207 | def __del__(self): 208 | if os.path.exists(self.temp_log_file) and self.successful: 209 | os.remove(self.temp_log_file) -------------------------------------------------------------------------------- /commec/tools/hmmer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) 2021-2024 International Biosecurity and Biosafety Initiative for Science 3 | """ 4 | Module for a hidden Markov model handler, specifically for calling the hmmscan command line interface. 5 | Additional methods for reading hmmscan output, such as readhmmer, which returns a pandas DataFrame. 6 | Instantiate a HmmerHandler with a local database, an input fasta, and an output file. 7 | Raises if inputs are invalid. Creates a temporary log file, which is deleted on completion. 
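A minimal usage sketch (paths are illustrative):

    handler = HmmerHandler("biorisk.hmm", "query.transeq.faa", "query.biorisk.hmmscan")
    handler.search()
    if handler.has_hits():
        hits = handler.read_output()  # pandas DataFrame with blast-style column names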
8 | """ 9 | import re 10 | import subprocess 11 | import pandas as pd 12 | import itertools 13 | from commec.config.query import Query 14 | from commec.tools.search_handler import SearchHandler, SearchToolVersion 15 | from commec.utils.coordinates import convert_protein_to_nucleotide_coords 16 | 17 | 18 | class HmmerHandler(SearchHandler): 19 | """A Database handler specifically for use with Hmmer files for commec screening.""" 20 | 21 | def _search(self): 22 | command = [ 23 | "hmmscan", 24 | "--cpu", 25 | str(self.threads), 26 | "--domtblout", 27 | self.out_file, 28 | self.db_file, 29 | self.input_file, 30 | ] 31 | self.run_as_subprocess(command, self.temp_log_file) 32 | 33 | def read_output(self): 34 | output_dataframe = readhmmer(self.out_file) 35 | # Standardize the output column names to be like blast: 36 | output_dataframe = output_dataframe.rename(columns={ 37 | #"ali from": "q. start", # These are no re-calculated to Query NT coordinates. 38 | #"ali to": "q. end", 39 | "coverage": "q. coverage", 40 | "target name": "subject title", 41 | "qlen":"query length", 42 | "hmm from":"s. start", 43 | "hmm to":"s. end", 44 | 'E-value': "evalue", 45 | }) 46 | return output_dataframe 47 | 48 | def get_version_information(self) -> SearchToolVersion: 49 | """ 50 | The first line of the HMM database typically contains creation date 51 | information, and some version information. 52 | """ 53 | database_info: str = None 54 | try: 55 | with open(self.db_file, "r", encoding="utf-8") as file: 56 | for line in file: 57 | if line.startswith("HMMER3/f"): 58 | database_info = line.split(";", maxsplit=1)[0].strip() 59 | continue 60 | # Early exit if data has been found 61 | if database_info: 62 | break 63 | 64 | tool_version_result = subprocess.run( 65 | ["hmmscan", "-h"], capture_output=True, text=True, check=True 66 | ) 67 | tool_info: str = tool_version_result.stdout.splitlines()[1].strip() 68 | return SearchToolVersion(tool_info, database_info) 69 | 70 | except (subprocess.CalledProcessError, FileNotFoundError): 71 | return None 72 | 73 | 74 | def readhmmer(fileh): 75 | """ 76 | Read in HMMER output files 77 | """ 78 | columns = [ 79 | "target name", 80 | "accession", 81 | "tlen", 82 | "query name", 83 | " accession", 84 | "qlen", 85 | "E-value", 86 | "score", 87 | "bias", 88 | "hit #", 89 | "of", 90 | "c-Evalue", 91 | "i-Evalue", 92 | "score2", 93 | "bias", 94 | "hmm from", 95 | "hmm to", 96 | "ali from", 97 | "ali to", 98 | "env from", 99 | "env to", 100 | "acc", 101 | "description of target", 102 | ] 103 | 104 | hmmer = [] 105 | 106 | with open(fileh, "r", encoding="utf-8") as f: 107 | for line in f: 108 | if "# Program: hmmscan" in line: 109 | break 110 | if "#" in line: 111 | continue 112 | bits = re.split(r"\s+", line) 113 | description = " ".join(bits[22:]) 114 | bits = bits[:22] 115 | bits.append(description) 116 | hmmer.append(bits) 117 | hmmer = pd.DataFrame(hmmer, columns=columns) 118 | hmmer["E-value"] = pd.to_numeric(hmmer["E-value"]) 119 | hmmer["score"] = pd.to_numeric(hmmer["score"]) 120 | hmmer["ali from"] = pd.to_numeric(hmmer["ali from"]) 121 | hmmer["ali to"] = pd.to_numeric(hmmer["ali to"]) 122 | hmmer["qlen"] = pd.to_numeric(hmmer["qlen"]) 123 | # Extract the frame information. 
124 | hmmer["frame"] = hmmer["query name"].str.split('_').str[-1].astype(int) 125 | return hmmer 126 | 127 | def remove_overlaps(hmmer : pd.DataFrame) -> pd.DataFrame: 128 | """ 129 | Trims verbosity of a HMMER output, 130 | by removing weaker hits which are 131 | encompassed in their extent by higher scoring hits. 132 | 133 | Note, works to trim nucleotide coordinates relative to the query, 134 | not ali from and ali to from the HMMER itself. 135 | 136 | This means it can be used on any DataFrame with the q. start and q. end NT headings. 137 | (Consider moving to a general coordinates tool function?) 138 | """ 139 | assert "q. start" in hmmer.columns, ("No \"q. start\" heading in HMMER output dataframe being " 140 | "passed to remove overlaps, ensure that the dataframe has " 141 | "been processed for converstion to nucleotide coordinates.") 142 | 143 | assert "q. end" in hmmer.columns, ("No \"q. end\" heading in HMMER output dataframe being " 144 | "passed to remove overlaps, ensure that the dataframe has " 145 | "been processed for converstion to nucleotide coordinates.") 146 | 147 | trimmed_hmmer = hmmer # Direct Assignment, reassigned later with .drop() for deep-copy. 148 | 149 | # Ensure all logic is performed per unique Query name. 150 | for query in hmmer["query name"].unique(): 151 | 152 | hmmer_for_query = hmmer[hmmer["query name"] == query] 153 | sorted_values = hmmer_for_query.sort_values(by=["score"], ascending = False) 154 | 155 | for i, j in itertools.combinations(sorted_values.index, 2): 156 | # If J is encapsulated: 157 | if (sorted_values.loc[i, "q. start"] <= sorted_values.loc[j, "q. start"] 158 | and sorted_values.loc[i, "q. end"] >= sorted_values.loc[j, "q. end"] 159 | and sorted_values.loc[i, "score"] >= sorted_values.loc[j, "score"]): 160 | if j in trimmed_hmmer.index: 161 | trimmed_hmmer = trimmed_hmmer.drop([j]) 162 | continue 163 | # If I is encapsulated: 164 | if (sorted_values.loc[i, "q. start"] >= sorted_values.loc[j, "q. start"] 165 | and sorted_values.loc[i, "q. end"] <= sorted_values.loc[j, "q. end"] 166 | and sorted_values.loc[i, "score"] <= sorted_values.loc[j, "score"]): 167 | if i in trimmed_hmmer.index: 168 | trimmed_hmmer = trimmed_hmmer.drop([i]) 169 | 170 | # Tidy the output indices. 171 | trimmed_hmmer = trimmed_hmmer.reset_index(drop=True) 172 | 173 | return trimmed_hmmer 174 | 175 | def recalculate_hmmer_query_coordinates(hmmer : pd.DataFrame): 176 | """ 177 | Recalculate the coordinates of the hmmer database , such that each translated frame 178 | reverts to original nucleotide coordinates. 179 | """ 180 | assert "nt_qlen" in hmmer.columns, ("No \"nt_qlen\" heading in HMMER output dataframe being " 181 | "passed to calculate nt coordinates, ensure that the dataframe has " 182 | "been processed to include nucleotide query length data.") 183 | hmmer["q. start"], hmmer["q. end"] = convert_protein_to_nucleotide_coords( 184 | hmmer["frame"].to_numpy(), 185 | hmmer["ali from"].to_numpy(), 186 | hmmer["ali to"].to_numpy(), 187 | hmmer["nt_qlen"].to_numpy()) 188 | 189 | def append_nt_querylength_info(hmmer : pd.DataFrame, queries : dict[str, Query]): 190 | """ 191 | Take the hmmer output, and add a series (nt_qlen) 192 | of the true nt length based on query name. 193 | """ 194 | hmmer["nt_qlen"] = [queries[q[:-2]].length for q in hmmer["query name"]] 195 | --------------------------------------------------------------------------------