├── commec ├── tests │ ├── __init__.py │ ├── test_dbs │ │ ├── biorisk │ │ │ ├── reg_taxids.txt │ │ │ ├── biorisk.hmm.h3f │ │ │ ├── biorisk.hmm.h3i │ │ │ ├── biorisk.hmm.h3m │ │ │ ├── biorisk.hmm.h3p │ │ │ └── biorisk_annotations.csv │ │ ├── low_concern │ │ │ ├── vax_taxids.txt │ │ │ ├── syn_taxids.txt │ │ │ ├── low_concern_annotations.tsv │ │ │ ├── dna │ │ │ │ └── benign.fasta │ │ │ ├── rna │ │ │ │ ├── benign.cm.i1f │ │ │ │ ├── benign.cm.i1i │ │ │ │ ├── benign.cm.i1m │ │ │ │ ├── benign.cm.i1p │ │ │ │ └── benign.cm.exemplar.out │ │ │ └── protein │ │ │ │ ├── benign.hmm.h3f │ │ │ │ ├── benign.hmm.h3i │ │ │ │ ├── benign.hmm.h3m │ │ │ │ ├── benign.hmm.h3p │ │ │ │ └── benign.hmm.exemplar.out │ │ ├── nr_blast │ │ │ ├── nr.pto │ │ │ ├── nr.pdb │ │ │ ├── nr.phr │ │ │ ├── nr.pin │ │ │ ├── nr.ptf │ │ │ ├── nr.pot │ │ │ ├── nr.pjs │ │ │ └── nr.exemplar.out │ │ ├── nt_blast │ │ │ ├── core_nt.nto │ │ │ ├── core_nt.ndb │ │ │ ├── core_nt.nhr │ │ │ ├── core_nt.nin │ │ │ ├── core_nt.nsq │ │ │ ├── core_nt.ntf │ │ │ ├── core_nt.not │ │ │ ├── core_nt.njs │ │ │ └── core_nt.exemplar.out │ │ ├── taxonomy │ │ │ └── taxonomy_placeholder.txt │ │ └── nr_dmnd │ │ │ ├── nr.1.dmnd │ │ │ ├── nr.2.dmnd │ │ │ ├── nr.3.dmnd │ │ │ └── nr.dmnd.exemplar.out │ ├── test_data │ │ ├── has_empty_description.fasta │ │ ├── screen-files │ │ │ ├── biorisk-error-2025-02.screen │ │ │ ├── README.md │ │ │ ├── no-hits-2024-06.screen │ │ │ ├── fast-mode-2025-02.screen │ │ │ ├── prot-error-2024-08.screen │ │ │ ├── prot-mixed-hit-2024-06.screen │ │ │ ├── prot-hit-not-cleared-2024-06.screen │ │ │ ├── prot-multiple-hits-2024-06.screen │ │ │ └── prot-nt-hits-cleared-2024-09.screen │ │ ├── input_has_empty_description │ │ │ └── has_empty_description.cleaned.fasta │ │ ├── single_record.fasta │ │ ├── input_single_record │ │ │ └── single_record.cleaned.fasta │ │ ├── psuedo_query.fasta │ │ ├── has_empty_record.fasta │ │ ├── input_has_empty_record │ │ │ └── has_empty_record.cleaned.fasta │ │ ├── has_records_with_same_description.fasta │ │ ├── multiple_records.fasta │ │ ├── input_multiple_records │ │ │ └── multiple_records.cleaned.fasta │ │ ├── input_has_records_with_same_description │ │ │ └── has_records_with_same_description.cleaned.fasta │ │ └── single_record.transeq.faa │ ├── .pylintrc │ ├── test_rationales.py │ ├── test_flag.py │ ├── test_trim.py │ ├── test_coverage.py │ ├── test_check_biorisk.py │ ├── test_screen_io.py │ ├── test_nc_to_nt.py │ ├── test_split.py │ ├── test_aa_to_nt.py │ ├── test_blast_tools.py │ ├── test_fetch_nc_bits.py │ ├── test_json.py │ ├── test_dbs.py │ └── test_query.py ├── __init__.py ├── config │ ├── constants.py │ ├── screen_tools.py │ └── json_io.py ├── screen-default-config.yaml ├── utils │ ├── concat_seqs.py │ ├── file_utils.py │ ├── template.html │ ├── dict_utils.py │ ├── coordinates.py │ └── logger.py ├── split.py ├── tools │ ├── blastn.py │ ├── blastx.py │ ├── cmscan.py │ ├── fetch_nc_bits.py │ ├── search_handler.py │ └── hmmer.py └── cli.py ├── conftest.py ├── environment.yaml ├── .gitignore ├── .github ├── workflows │ ├── automate_tests.yml │ ├── automate_release.yml │ └── release-version-sha-update.yml └── PULL_REQUEST_TEMPLATE.md ├── example_data ├── screen_pipeline_status.csv ├── output_commec-examples │ ├── commec-examples.low_concern.cmscan │ ├── commec-examples.biorisk.hmmscan │ ├── commec-examples.low_concern.blastn │ ├── commec-examples.low_concern.hmmscan │ └── commec-examples.nt.blastn ├── input_commec-examples │ ├── commec-examples_config.yaml │ └── commec-examples.noncoding.fasta └── README.md ├── 
LICENSE ├── pyproject.toml ├── conda-recipe └── meta.yaml ├── dev_scripts ├── split_fasta.py ├── run_blastx.sh ├── run_diamond.sh ├── summarize_screens.py └── collate-screens.py └── README.md /commec/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /commec/tests/test_dbs/biorisk/reg_taxids.txt: -------------------------------------------------------------------------------- 1 | 12345 -------------------------------------------------------------------------------- /commec/tests/test_dbs/low_concern/vax_taxids.txt: -------------------------------------------------------------------------------- 1 | 11589 -------------------------------------------------------------------------------- /commec/tests/test_dbs/low_concern/syn_taxids.txt: -------------------------------------------------------------------------------- 1 | 394040 2 | 32630 -------------------------------------------------------------------------------- /commec/tests/test_dbs/nr_blast/nr.pto: -------------------------------------------------------------------------------- 1 | 2 |  -------------------------------------------------------------------------------- /commec/tests/test_dbs/nt_blast/core_nt.nto: -------------------------------------------------------------------------------- 1 | 2 |  -------------------------------------------------------------------------------- /commec/tests/test_dbs/low_concern/low_concern_annotations.tsv: -------------------------------------------------------------------------------- 1 | ID Description 2 | Benign1 TEST_BENIGN_DESCRIPTION 3 | -------------------------------------------------------------------------------- /commec/tests/test_dbs/taxonomy/taxonomy_placeholder.txt: -------------------------------------------------------------------------------- 1 | This is an empty file, as we need the taxonomy directory to exist. 
-------------------------------------------------------------------------------- /commec/tests/test_dbs/nr_blast/nr.pdb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ibbis-bio/common-mechanism/HEAD/commec/tests/test_dbs/nr_blast/nr.pdb -------------------------------------------------------------------------------- /commec/tests/test_dbs/nr_blast/nr.phr: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ibbis-bio/common-mechanism/HEAD/commec/tests/test_dbs/nr_blast/nr.phr -------------------------------------------------------------------------------- /commec/tests/test_dbs/nr_blast/nr.pin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ibbis-bio/common-mechanism/HEAD/commec/tests/test_dbs/nr_blast/nr.pin -------------------------------------------------------------------------------- /commec/tests/test_dbs/nr_blast/nr.ptf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ibbis-bio/common-mechanism/HEAD/commec/tests/test_dbs/nr_blast/nr.ptf -------------------------------------------------------------------------------- /commec/tests/test_dbs/nr_dmnd/nr.1.dmnd: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ibbis-bio/common-mechanism/HEAD/commec/tests/test_dbs/nr_dmnd/nr.1.dmnd -------------------------------------------------------------------------------- /commec/tests/test_dbs/nr_dmnd/nr.2.dmnd: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ibbis-bio/common-mechanism/HEAD/commec/tests/test_dbs/nr_dmnd/nr.2.dmnd -------------------------------------------------------------------------------- /commec/tests/test_dbs/nr_dmnd/nr.3.dmnd: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ibbis-bio/common-mechanism/HEAD/commec/tests/test_dbs/nr_dmnd/nr.3.dmnd -------------------------------------------------------------------------------- /commec/tests/test_dbs/nt_blast/core_nt.ndb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ibbis-bio/common-mechanism/HEAD/commec/tests/test_dbs/nt_blast/core_nt.ndb -------------------------------------------------------------------------------- /commec/tests/test_dbs/nt_blast/core_nt.nhr: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ibbis-bio/common-mechanism/HEAD/commec/tests/test_dbs/nt_blast/core_nt.nhr -------------------------------------------------------------------------------- /commec/tests/test_dbs/nt_blast/core_nt.nin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ibbis-bio/common-mechanism/HEAD/commec/tests/test_dbs/nt_blast/core_nt.nin -------------------------------------------------------------------------------- /commec/tests/test_dbs/nt_blast/core_nt.nsq: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ibbis-bio/common-mechanism/HEAD/commec/tests/test_dbs/nt_blast/core_nt.nsq -------------------------------------------------------------------------------- /commec/tests/test_dbs/nt_blast/core_nt.ntf: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/ibbis-bio/common-mechanism/HEAD/commec/tests/test_dbs/nt_blast/core_nt.ntf -------------------------------------------------------------------------------- /commec/tests/test_dbs/biorisk/biorisk.hmm.h3f: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ibbis-bio/common-mechanism/HEAD/commec/tests/test_dbs/biorisk/biorisk.hmm.h3f -------------------------------------------------------------------------------- /commec/tests/test_dbs/biorisk/biorisk.hmm.h3i: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ibbis-bio/common-mechanism/HEAD/commec/tests/test_dbs/biorisk/biorisk.hmm.h3i -------------------------------------------------------------------------------- /commec/tests/test_dbs/biorisk/biorisk.hmm.h3m: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ibbis-bio/common-mechanism/HEAD/commec/tests/test_dbs/biorisk/biorisk.hmm.h3m -------------------------------------------------------------------------------- /commec/tests/test_dbs/biorisk/biorisk.hmm.h3p: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ibbis-bio/common-mechanism/HEAD/commec/tests/test_dbs/biorisk/biorisk.hmm.h3p -------------------------------------------------------------------------------- /commec/tests/test_dbs/low_concern/dna/benign.fasta: -------------------------------------------------------------------------------- 1 | >TEST_BENIGN_FASTA 2 | aaagaggagaaatactagatgaaaaacataaatgccgacgacacatacagaataattaataaaattaaagcttgtagaag 3 | -------------------------------------------------------------------------------- /commec/tests/test_dbs/low_concern/rna/benign.cm.i1f: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ibbis-bio/common-mechanism/HEAD/commec/tests/test_dbs/low_concern/rna/benign.cm.i1f -------------------------------------------------------------------------------- /commec/tests/test_dbs/low_concern/rna/benign.cm.i1i: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ibbis-bio/common-mechanism/HEAD/commec/tests/test_dbs/low_concern/rna/benign.cm.i1i -------------------------------------------------------------------------------- /commec/tests/test_dbs/low_concern/rna/benign.cm.i1m: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ibbis-bio/common-mechanism/HEAD/commec/tests/test_dbs/low_concern/rna/benign.cm.i1m -------------------------------------------------------------------------------- /commec/tests/test_dbs/low_concern/rna/benign.cm.i1p: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ibbis-bio/common-mechanism/HEAD/commec/tests/test_dbs/low_concern/rna/benign.cm.i1p -------------------------------------------------------------------------------- /commec/tests/test_dbs/nr_blast/nr.pot: -------------------------------------------------------------------------------- 1 | 2 |  3 | -------------------------------------------------------------------------------- /commec/tests/test_dbs/low_concern/protein/benign.hmm.h3f: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/ibbis-bio/common-mechanism/HEAD/commec/tests/test_dbs/low_concern/protein/benign.hmm.h3f -------------------------------------------------------------------------------- /commec/tests/test_dbs/low_concern/protein/benign.hmm.h3i: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ibbis-bio/common-mechanism/HEAD/commec/tests/test_dbs/low_concern/protein/benign.hmm.h3i -------------------------------------------------------------------------------- /commec/tests/test_dbs/low_concern/protein/benign.hmm.h3m: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ibbis-bio/common-mechanism/HEAD/commec/tests/test_dbs/low_concern/protein/benign.hmm.h3m -------------------------------------------------------------------------------- /commec/tests/test_dbs/low_concern/protein/benign.hmm.h3p: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ibbis-bio/common-mechanism/HEAD/commec/tests/test_dbs/low_concern/protein/benign.hmm.h3p -------------------------------------------------------------------------------- /commec/tests/test_dbs/nt_blast/core_nt.not: -------------------------------------------------------------------------------- 1 | 2 |  3 | -------------------------------------------------------------------------------- /commec/tests/test_dbs/biorisk/biorisk_annotations.csv: -------------------------------------------------------------------------------- 1 | ID,Description,Must flag 2 | Toxin1, TestBioriskToxinFlag,TRUE 3 | Toxin2, TestBioriskToxin,False 4 | Toxin3, TestBioriskToxin,False -------------------------------------------------------------------------------- /commec/__init__.py: -------------------------------------------------------------------------------- 1 | from importlib.metadata import version, PackageNotFoundError 2 | try: 3 | __version__ = version("commec") 4 | except (ImportError, PackageNotFoundError): 5 | __version__ = "X.X.X" 6 | -------------------------------------------------------------------------------- /commec/tests/test_data/has_empty_description.fasta: -------------------------------------------------------------------------------- 1 | > 2 | gtagacaacaaattcaacaaagaacaacaaaacgcgttctatgagatcttacatttacctaacttaaacgaagaacaacgaaacgccttcatccaaagtttaaaagatgacccaagccaaagcgctaaccttttagcagaagctaaaaagctaaatgatgctcaggcgccgaaa -------------------------------------------------------------------------------- /commec/tests/test_dbs/nr_dmnd/nr.dmnd.exemplar.out: -------------------------------------------------------------------------------- 1 | BBa_K380009_A_20830_Coding_"Protein_A_Z-domain" Pseudogene_E"FakeGeneE_Biorisk"[taxid=560]_1 Pseudogene_E"FakeGeneE_Biorisk"[taxid=560]_1 1.97e-23 74.3 90.9 174 41 172 45 1 0 2 | -------------------------------------------------------------------------------- /commec/tests/test_data/screen-files/biorisk-error-2025-02.screen: -------------------------------------------------------------------------------- 1 | INFO | Validating Inputs... 2 | INFO | >> STEP 1: Checking for biorisk genes... 
3 | ERROR | ...Biorisk annotations file does not exist: commec-dbs/biorisk_db/biorisk_annotations.csv 4 | -------------------------------------------------------------------------------- /commec/tests/test_data/input_has_empty_description/has_empty_description.cleaned.fasta: -------------------------------------------------------------------------------- 1 | > 2 | gtagacaacaaattcaacaaagaacaacaaaacgcgttctatgagatcttacatttacctaacttaaacgaagaacaacgaaacgccttcatccaaagtttaaaagatgacccaagccaaagcgctaaccttttagcagaagctaaaaagctaaatgatgctcaggcgccgaaa 3 | -------------------------------------------------------------------------------- /commec/tests/test_data/single_record.fasta: -------------------------------------------------------------------------------- 1 | >BBa_K380009_A_20830_Coding_"Protein_A_Z-domain" 2 | gtagacaacaaattcaacaaagaacaacaaaacgcgttctatgagatcttacatttacctaacttaaacgaagaacaacgaaacgccttcatccaaagtttaaaagatgacccaagccaaagcgctaaccttttagcagaagctaaaaagctaaatgatgctcaggcgccgaaa -------------------------------------------------------------------------------- /commec/tests/.pylintrc: -------------------------------------------------------------------------------- 1 | [MASTER] 2 | load-plugins=pylint.extensions.docparams 3 | 4 | [MESSAGES CONTROL] 5 | disable= 6 | missing-function-docstring, 7 | missing-class-docstring, 8 | missing-module-docstring 9 | 10 | [DOCSTRING] 11 | ignore-private-members=yes 12 | -------------------------------------------------------------------------------- /commec/tests/test_data/input_single_record/single_record.cleaned.fasta: -------------------------------------------------------------------------------- 1 | >BBa_K380009_A_20830 2 | gtagacaacaaattcaacaaagaacaacaaaacgcgttctatgagatcttacatttacct 3 | aacttaaacgaagaacaacgaaacgccttcatccaaagtttaaaagatgacccaagccaa 4 | agcgctaaccttttagcagaagctaaaaagctaaatgatgctcaggcgccgaaa 5 | -------------------------------------------------------------------------------- /commec/config/constants.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) 2021-2024 International Biosecurity and Biosafety Initiative for Science 3 | 4 | # SCREENING 5 | MINIMUM_QUERY_LENGTH = 41 6 | 7 | # I/O 8 | DEFAULT_CONFIG_YAML_PATH = "screen-default-config.yaml" 9 | MAXIMUM_FILENAME_SIZE = 255 10 | -------------------------------------------------------------------------------- /commec/tests/test_data/psuedo_query.fasta: -------------------------------------------------------------------------------- 1 | > TEST_QUERY_01"TEST01"|taxid=562 2 | AGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTC 3 | > TEST_QUERY_02"TEST01"|taxid=562 4 | AGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTC -------------------------------------------------------------------------------- /conftest.py: -------------------------------------------------------------------------------- 1 | """ 2 | Additional Configurations for Pytest 3 | """ 4 | def pytest_addoption(parser): 5 | """ Adds unique argument to pytest for commec database example outputs generation.""" 6 | print("Test Configuration loaded!") 7 | parser.addoption( 8 | "--gen-examples", action="store_true", default=False, 9 | help="Generate exemplar output files instead of testing against them." 
10 | ) 11 | -------------------------------------------------------------------------------- /commec/tests/test_data/screen-files/README.md: -------------------------------------------------------------------------------- 1 | These screen files each represent different outcomes. Files are named according to the unique outcome they show as well as the date they were generated. 2 | 3 | On a few occasions, lines from multiple screen files produced by the same commec version were combined. 4 | 5 | Future versions of `commec flag` should maintain backwards compatibility with all the files in this directory if possible. 6 | -------------------------------------------------------------------------------- /commec/tests/test_data/has_empty_record.fasta: -------------------------------------------------------------------------------- 1 | >BBa_K205004_A_16908_Coding_"MerT_-_Membranous_Mercury_transporter_" 2 | atgtctgaacctcaaaacgggcgcggggcgctcttcactggcgggctagccgccatcctcgcctcggcttgctgcctggggccgctggttctgatcgccctggggttcagcggcgcttggatcggcaacttgacggtgttggaaccttatcgcccgatcttcatcggcgcggcgttggtggcgctgtttttcgcctggcggcgcatctaccgaccggcgcaagcctgcaaaccaggggatgtgtgtgcgattccccaagtgcgcgctacttacaagctcattttctgggtcgtggccgcgctggttctggtcgcgctcggatttccctacgtcatgccatttttctattaa 3 | >BBa_K380009_A_20830_Coding_"Protein_A_Z-domain" -------------------------------------------------------------------------------- /commec/tests/test_data/input_has_empty_record/has_empty_record.cleaned.fasta: -------------------------------------------------------------------------------- 1 | >BBa_K205004_A_16908_Coding_"MerT_-_Membranous_Mercury_transporter_" 2 | atgtctgaacctcaaaacgggcgcggggcgctcttcactggcgggctagccgccatcctcgcctcggcttgctgcctggggccgctggttctgatcgccctggggttcagcggcgcttggatcggcaacttgacggtgttggaaccttatcgcccgatcttcatcggcgcggcgttggtggcgctgtttttcgcctggcggcgcatctaccgaccggcgcaagcctgcaaaccaggggatgtgtgtgcgattccccaagtgcgcgctacttacaagctcattttctgggtcgtggccgcgctggttctggtcgcgctcggatttccctacgtcatgccatttttctattaa 3 | >BBa_K380009_A_20830_Coding_"Protein_A_Z-domain" 4 | -------------------------------------------------------------------------------- /environment.yaml: -------------------------------------------------------------------------------- 1 | name: commec-dev 2 | channels: 3 | - conda-forge 4 | - bioconda 5 | - defaults 6 | dependencies: 7 | - python>=3.10 8 | # Runtime Python dependencies 9 | - biopython 10 | - numpy 11 | - pandas 12 | - pytaxonkit 13 | - pyyaml 14 | # Runtime non-Python dependencies 15 | - blast 16 | - diamond>=2.1 17 | - hmmer 18 | - infernal 19 | - plotly 20 | - yaml 21 | - mako 22 | - wget 23 | # Development dependencies 24 | - pip 25 | - pytest 26 | - matplotlib 27 | - pip: 28 | - -e . 
29 | -------------------------------------------------------------------------------- /commec/tests/test_dbs/nr_blast/nr.pjs: -------------------------------------------------------------------------------- 1 | { 2 | "version": "1.2", 3 | "dbname": "nr", 4 | "dbtype": "Protein", 5 | "db-version": 5, 6 | "description": "pseudo_queries_aa.fasta", 7 | "number-of-letters": 6970, 8 | "number-of-sequences": 10, 9 | "last-updated": "2024-09-03T00:32:00", 10 | "number-of-volumes": 1, 11 | "bytes-total": 45266, 12 | "bytes-to-cache": 7165, 13 | "files": [ 14 | "nr.pdb", 15 | "nr.phr", 16 | "nr.pin", 17 | "nr.pot", 18 | "nr.psq", 19 | "nr.ptf", 20 | "nr.pto" 21 | ] 22 | } 23 | -------------------------------------------------------------------------------- /commec/tests/test_dbs/nt_blast/core_nt.njs: -------------------------------------------------------------------------------- 1 | { 2 | "version": "1.2", 3 | "dbname": "core_nt", 4 | "dbtype": "Nucleotide", 5 | "db-version": 5, 6 | "description": "pseudo_queries.fasta", 7 | "number-of-letters": 31640, 8 | "number-of-sequences": 10, 9 | "last-updated": "2025-05-21T08:12:00", 10 | "number-of-volumes": 1, 11 | "bytes-total": 46226, 12 | "bytes-to-cache": 8145, 13 | "files": [ 14 | "core_nt.ndb", 15 | "core_nt.nhr", 16 | "core_nt.nin", 17 | "core_nt.not", 18 | "core_nt.nsq", 19 | "core_nt.ntf", 20 | "core_nt.nto" 21 | ] 22 | } 23 | -------------------------------------------------------------------------------- /commec/tests/test_dbs/nr_blast/nr.exemplar.out: -------------------------------------------------------------------------------- 1 | # BLASTX 2.15.0+ 2 | # Query: BBa_K380009_A_20830_Coding_"Protein_A_Z-domain" 3 | # Database: /root/repo/json/common-mechanism/commec/tests/test_dbs/nr_blast/nr 4 | # Fields: query acc., subject title, subject acc., subject tax ids, evalue, bit score, % identity, query length, q. start, q. end, subject length, s. start, s. end 5 | # 1 hits found 6 | BBa_K380009_A_20830_Coding_"Protein_A_Z-domain" Pseudogene_E"FakeGeneE_Biorisk"[taxid=560]_1 Pseudogene_E"FakeGeneE_Biorisk"[taxid=560]_1 0 3.67e-23 73.6 90.909 174 41 172 45 1 44 7 | # BLAST processed 1 queries 8 | -------------------------------------------------------------------------------- /commec/tests/test_dbs/nt_blast/core_nt.exemplar.out: -------------------------------------------------------------------------------- 1 | # BLASTN 2.15.0+ 2 | # Query: BBa_K380009_A_20830_Coding_"Protein_A_Z-domain" 3 | # Database: /root/repo/json/common-mechanism/commec/tests/test_dbs/nt_blast/core_nt 4 | # Fields: query acc., subject title, subject acc., subject tax ids, evalue, bit score, % identity, query length, q. start, q. end, subject length, s. start, s. end 5 | # 1 hits found 6 | BBa_K380009_A_20830_Coding_"Protein_A_Z-domain" Pseudogene_E"FakeGeneE_Biorisk"[taxid=560] Pseudogene_E"FakeGeneE_Biorisk"[taxid=560] 0 4.33e-91 322 100.000 174 1 174 174 1 174 7 | # BLAST processed 1 queries 8 | -------------------------------------------------------------------------------- /commec/tests/test_data/screen-files/no-hits-2024-06.screen: -------------------------------------------------------------------------------- 1 | >> STEP 1: Checking for biorisk genes... 2 | --> Biorisks: no significant hits detected, PASS 3 | STEP 1 completed at 2024-06-24 16:49:29 4 | >> STEP 2: Checking regulated pathogen proteins... 5 | ...no hits 6 | STEP 2 completed at 2024-06-24 16:51:11 7 | >> STEP 3: Checking regulated pathogen nucleotides... 
8 | ...no hits to the nr database 9 | ...no hits 10 | STEP 3 completed at 2024-06-24 16:51:12 11 | >> STEP 4: Checking any pathogen regions for benign components... 12 | ...no regulated regions to clear 13 | >> COMPLETED AT 2024-06-24 16:51:13 -------------------------------------------------------------------------------- /commec/tests/test_data/screen-files/fast-mode-2025-02.screen: -------------------------------------------------------------------------------- 1 | Validating Inputs... 2 | >> STEP 1: Checking for biorisk genes... 3 | --> Biorisks: Regulated genes not found, PASS 4 | 5 | --> Virulence factor found in bases 4 to 471, WARNING 6 | Gene: putative secreted protein-tyrosine phosphatase [Yersinia pestis CO92] 7 | 8 | STEP 1 completed at 2025-02-18 02:05:16 9 | SKIPPING STEP 2: Protein search 10 | SKIPPING STEP 3: Nucleotide search 11 | >> STEP 4: Checking any pathogen regions for low_concern components... 12 | ...no regulated regions to clear 13 | 14 | >> STEP 4 completed at 2025-02-18 02:05:16 15 | >> COMPLETED AT 2025-02-18 02:05:16 16 | -------------------------------------------------------------------------------- /commec/tests/test_data/has_records_with_same_description.fasta: -------------------------------------------------------------------------------- 1 | >BBa_K380009 A_20830_Coding_"Protein_A_Z-domain" 2 | atgtctgaacctcaaaacgggcgcggggcgctcttcactggcgggctagccgccatcctcgcctcggcttgctgcctggggccgctggttctgatcgccctggggttcagcggcgcttggatcggcaacttgacggtgttggaaccttatcgcccgatcttcatcggcgcggcgttggtggcgctgtttttcgcctggcggcgcatctaccgaccggcgcaagcctgcaaaccaggggatgtgtgtgcgattccccaagtgcgcgctacttacaagctcattttctgggtcgtggccgcgctggttctggtcgcgctcggatttccctacgtcatgccatttttctattaa 3 | >BBa_K380009 A_20830_Coding_"Protein_A_Z-domain" 4 | gtagacaacaaattcaacaaagaacaacaaaacgcgttctatgagatcttacatttacctaacttaaacgaagaacaacgaaacgccttcatccaaagtttaaaagatgacccaagccaaagcgctaaccttttagcagaagctaaaaagctaaatgatgctcaggcgccgaaa -------------------------------------------------------------------------------- /commec/tests/test_data/multiple_records.fasta: -------------------------------------------------------------------------------- 1 | >BBa_K205004_A_16908_Coding_"MerT_-_Membranous_Mercury_transporter_" 2 | atgtctgaacctcaaaacgggcgcggggcgctcttcactggcgggctagccgccatcctcgcctcggcttgctgcctggggccgctggttctgatcgccctggggttcagcggcgcttggatcggcaacttgacggtgttggaaccttatcgcccgatcttcatcggcgcggcgttggtggcgctgtttttcgcctggcggcgcatctaccgaccggcgcaagcctgcaaaccaggggatgtgtgtgcgattccccaagtgcgcgctacttacaagctcattttctgggtcgtggccgcgctggttctggtcgcgctcggatttccctacgtcatgccatttttctattaa 3 | >BBa_K380009_A_20830_Coding_"Protein_A_Z-domain" 4 | gtagacaacaaattcaacaaagaacaacaaaacgcgttctatgagatcttacatttacctaacttaaacgaagaacaacgaaacgccttcatccaaagtttaaaagatgacccaagccaaagcgctaaccttttagcagaagctaaaaagctaaatgatgctcaggcgccgaaa -------------------------------------------------------------------------------- /commec/tests/test_data/input_multiple_records/multiple_records.cleaned.fasta: -------------------------------------------------------------------------------- 1 | >BBa_K205004_A_16908 2 | atgtctgaacctcaaaacgggcgcggggcgctcttcactggcgggctagccgccatcctc 3 | gcctcggcttgctgcctggggccgctggttctgatcgccctggggttcagcggcgcttgg 4 | atcggcaacttgacggtgttggaaccttatcgcccgatcttcatcggcgcggcgttggtg 5 | gcgctgtttttcgcctggcggcgcatctaccgaccggcgcaagcctgcaaaccaggggat 6 | gtgtgtgcgattccccaagtgcgcgctacttacaagctcattttctgggtcgtggccgcg 7 | ctggttctggtcgcgctcggatttccctacgtcatgccatttttctattaa 8 | >BBa_K380009_A_20830 9 | gtagacaacaaattcaacaaagaacaacaaaacgcgttctatgagatcttacatttacct 10 | 
aacttaaacgaagaacaacgaaacgccttcatccaaagtttaaaagatgacccaagccaa 11 | agcgctaaccttttagcagaagctaaaaagctaaatgatgctcaggcgccgaaa 12 | -------------------------------------------------------------------------------- /commec/tests/test_data/input_has_records_with_same_description/has_records_with_same_description.cleaned.fasta: -------------------------------------------------------------------------------- 1 | >BBa_K380009_A_20830_Coding_"Protein_A_Z-domain" 2 | atgtctgaacctcaaaacgggcgcggggcgctcttcactggcgggctagccgccatcctcgcctcggcttgctgcctggggccgctggttctgatcgccctggggttcagcggcgcttggatcggcaacttgacggtgttggaaccttatcgcccgatcttcatcggcgcggcgttggtggcgctgtttttcgcctggcggcgcatctaccgaccggcgcaagcctgcaaaccaggggatgtgtgtgcgattccccaagtgcgcgctacttacaagctcattttctgggtcgtggccgcgctggttctggtcgcgctcggatttccctacgtcatgccatttttctattaa 3 | >BBa_K380009_A_20830_Coding_"Protein_A_Z-domain" 4 | gtagacaacaaattcaacaaagaacaacaaaacgcgttctatgagatcttacatttacctaacttaaacgaagaacaacgaaacgccttcatccaaagtttaaaagatgacccaagccaaagcgctaaccttttagcagaagctaaaaagctaaatgatgctcaggcgccgaaa 5 | -------------------------------------------------------------------------------- /commec/tests/test_data/single_record.transeq.faa: -------------------------------------------------------------------------------- 1 | >BBa_K380009_A_20830_Coding_"Protein_A_Z-domain"_1 2 | VDNKFNKEQQNAFYEILHLPNLNEEQRNAFIQSLKDDPSQSANLLAEAKKLNDAQAPK 3 | >BBa_K380009_A_20830_Coding_"Protein_A_Z-domain"_2 4 | XTTNSTKNNKTRSMRSYIYLTXTKNNETPSSKVXKMTQAKALTFXQKLKSXMMLRRRX 5 | >BBa_K380009_A_20830_Coding_"Protein_A_Z-domain"_3 6 | RQQIQQRTTKRVLXDLTFTXLKRRTTKRLHPKFKRXPKPKRXPFSRSXKAKXCSGAEX 7 | >BBa_K380009_A_20830_Coding_"Protein_A_Z-domain"_4 8 | FRRLSIIXLFSFCXKVSALAWVIFXTLDEGVSLFFVXVRXMXDLIERVLLFFVEFVVY 9 | >BBa_K380009_A_20830_Coding_"Protein_A_Z-domain"_5 10 | SAPEHHLAFXLLLKGXRFGLGHLLNFGXRRFVVLRLSXVNVRSHRTRFVVLCXICCLX 11 | >BBa_K380009_A_20830_Coding_"Protein_A_Z-domain"_6 12 | FGAXASFSFLASAKRLALWLGSSFKLWMKAFRCSSFKLGKCKISXNAFCCSLLNLLST 13 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # Distribution / packaging 7 | .Python 8 | build/ 9 | dist/ 10 | *.egg-info/ 11 | *.egg 12 | 13 | # PyInstaller 14 | # Usually these files are written by a python script from a template 15 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 16 | *.manifest 17 | *.spec 18 | 19 | # Installer logs 20 | pip-log.txt 21 | pip-delete-this-directory.txt 22 | 23 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 24 | __pypackages__/ 25 | 26 | # Environments 27 | .venv 28 | venv/ 29 | .conda 30 | .vscode 31 | .pylintrc 32 | *:Zone.Identifier 33 | 34 | # test visual output 35 | commec/tests/test_data/functional.html -------------------------------------------------------------------------------- /.github/workflows/automate_tests.yml: -------------------------------------------------------------------------------- 1 | name: Test Commec 2 | 3 | on: 4 | pull_request: 5 | branches: 6 | - develop 7 | - main 8 | workflow_dispatch: 9 | 10 | jobs: 11 | test: 12 | runs-on: ubuntu-latest 13 | 14 | steps: 15 | - name: Checkout code 16 | uses: actions/checkout@v4 17 | 18 | - name: Set up Conda environment 19 | uses: conda-incubator/setup-miniconda@v3 20 | with: 21 | activate-environment: commec-env 22 | environment-file: environment.yaml 23 | auto-activate-base: false 24 | clean-patched-environment-file: true 25 | 26 | - name: Run tests 27 | shell: bash -l {0} 28 | run: | 29 | conda activate commec-env 30 | pytest -vv 31 | -------------------------------------------------------------------------------- /example_data/screen_pipeline_status.csv: -------------------------------------------------------------------------------- 1 | name,filepath,flag,biorisk,protein,nucleotide,low_concern,virus_flag,bacteria_flag,eukaryote_flag,low_concern_protein,low_concern_rna,low_concern_dna 2 | Part:BBa_K5108009_creA_-,2025-08-06/commec-examples.output.json,Warning,Warning,Pass,Skip,Warning,False,False,False,False,False,False 3 | encrypted,2025-08-06/commec-examples.output.json,Warning,Pass,Pass,Pass,Pass,False,False,False,False,False,False 4 | xylanase_zero_shot_des31,2025-08-06/commec-examples.output.json,Pass,Pass,Pass,Pass,Pass,False,False,False,False,False,False 5 | RVFV_Rift_valley_fever,2025-08-06/commec-examples.output.json,Flag,Warning,Flag,Flag,Flag,True,False,False,False,False,False 6 | BBa_K209429_A_15261,2025-08-06/commec-examples.output.json,Pass,Pass,Mixed,Pass,Pass,False,True,False,False,False,False 7 | -------------------------------------------------------------------------------- /commec/tests/test_data/screen-files/prot-error-2024-08.screen: -------------------------------------------------------------------------------- 1 | >> STEP 1: Checking for biorisk genes... 2 | --> Biorisks: no hits detected, PASS 3 | STEP 1 completed at 2024-08-27 16:58:14 4 | >> STEP 2: Checking regulated pathogen proteins... 
5 | Traceback (most recent call last): 6 | File "/blue/salemi/brittany.rife/nti/common-mechanism/commec/check_reg_path.py", line 158, in 7 | main() 8 | File "/blue/salemi/brittany.rife/nti/common-mechanism/commec/check_reg_path.py", line 57, in main 9 | blast = taxdist(blast, reg_ids, vax_ids, args.db + "/taxonomy/", args.threads) 10 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 11 | File "/blue/salemi/brittany.rife/nti/common-mechanism/commec/utils.py", line 118, in taxdist 12 | zip( 13 | TypeError: 'float' object is not iterable 14 | ERROR: command 'python /blue/salemi/brittany.rife/nti/common-mechanism/commec/check_reg_path.py -i AF006966.1_8_6_dmnd.nr.blastx -d /orange/salemi/brittany.rife/databases -t 6' failed 15 | -------------------------------------------------------------------------------- /commec/screen-default-config.yaml: -------------------------------------------------------------------------------- 1 | base_paths: 2 | default: commec-dbs/ 3 | databases: 4 | biorisk: 5 | path: '{default}biorisk/biorisk.hmm' 6 | taxids: "{default}biorisk/reg_taxids.txt" 7 | annotations: '{default}biorisk/biorisk_annotations.csv' 8 | regulated_protein: 9 | blast: 10 | path: '{default}nr_blast/nr' 11 | diamond: 12 | path: '{default}nr_dmnd/nr.dmnd' 13 | regulated_nt: 14 | path: '{default}nt_blast/core_nt' 15 | low_concern: 16 | rna: 17 | path: '{default}low_concern/rna/benign.cm' 18 | dna: 19 | path: '{default}low_concern/dna/benign.fasta' 20 | protein: 21 | path: '{default}low_concern/protein/benign.hmm' 22 | taxids: "{default}low_concern/vax_taxids.txt" 23 | annotations: '{default}low_concern/low_concern_annotations.tsv' 24 | taxonomy: 25 | path: "{default}taxonomy/" 26 | threads: 1 27 | diamond_jobs: null 28 | do_cleanup: False 29 | force: False 30 | skip_taxonomy_search: False 31 | protein_search_tool: 'blastx' 32 | resume: False 33 | skip_nt_search: False 34 | verbose: False -------------------------------------------------------------------------------- /commec/tests/test_rationales.py: -------------------------------------------------------------------------------- 1 | """ 2 | Unit tests for controlled rationale outcomes 3 | """ 4 | 5 | from commec.tests.screen_factory import ( 6 | ScreenTesterFactory, 7 | ScreenStep 8 | ) 9 | from commec.config.result import ScreenStatus, Rationale 10 | 11 | def test_hmmer(tmp_path): 12 | """ 13 | When there are hits to Biorisk with a large E-value, but no other hits, and we 14 | are running in the skip taxonomy mode, we correctly label the outcome 15 | as warning, however the rationale is set to "Matches to ." instead of 16 | the correct Rationale text indicating no hits. 17 | """ 18 | screen_test = ScreenTesterFactory("low_evalue_hmmer", tmp_path) 19 | screen_test.add_query("query1",1200) 20 | screen_test.add_hit(ScreenStep.BIORISK, "query1", 100, 200, "HighEvalueHit", "HEH", 500, regulated=True, evalue = 100.0) 21 | result = screen_test.run("--skip-tx") 22 | assert result.queries["query1"].status.screen_status == ScreenStatus.WARN 23 | assert result.queries["query1"].status.rationale == str(Rationale.NO_HITS_SKIP_NOTE) 24 | -------------------------------------------------------------------------------- /commec/utils/concat_seqs.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) 2021-2024 International Biosecurity and Biosafety Initiative for Science 3 | """ 4 | Script that concatenates all sequences in a FASTA file. 
5 | 6 | Usage: 7 | concat_seqs.py input.fasta 8 | """ 9 | import sys 10 | 11 | # read in the file based on the command line argument 12 | filename = sys.argv[1] 13 | # open the file 14 | f = open(filename, "r") 15 | # read the file 16 | lines = f.readlines() 17 | # close the file 18 | f.close() 19 | 20 | # use the first line as the sequence ID 21 | seq_id = lines[0].rstrip() 22 | 23 | # concatenate all other lines that don't start with '>' 24 | seq = "" 25 | for line in lines[1:]: 26 | if line.startswith(">"): 27 | continue 28 | seq += line.rstrip() 29 | 30 | # print the sequence ID and the sequence to a FASTA file 31 | # open output file (filename but with _concat appended before suffix) 32 | out_filename = filename.replace(".fasta", "_concat.fasta") 33 | out_file = open(out_filename, "w") 34 | # write the sequence ID and sequence to the file 35 | out_file.write(">" + seq_id + "\n") 36 | out_file.write(seq + "\n") 37 | -------------------------------------------------------------------------------- /commec/tests/test_dbs/low_concern/rna/benign.cm.exemplar.out: -------------------------------------------------------------------------------- 1 | #target name accession query name accession mdl mdl from mdl to seq from seq to strand trunc pass gc bias score E-value inc description of target 2 | #------------------- --------- ----------------------------------------------- --------- --- -------- -------- -------- -------- ------ ----- ---- ---- ----- ------ --------- --- --------------------- 3 | # 4 | # Program: cmscan 5 | # Version: 1.1.5 (Sep 2023) 6 | # Pipeline mode: SCAN 7 | # Query file: /root/repo/json/common-mechanism/commec/tests/test_data/single_record.fasta 8 | # Target file: /root/repo/json/common-mechanism/commec/tests/test_dbs/benign_db/benign.cmscan 9 | # Option settings: cmscan --tblout /root/repo/json/common-mechanism/commec/tests/test_dbs/benign_db/benign.cmscan.exemplar.out /root/repo/json/common-mechanism/commec/tests/test_dbs/benign_db/benign.cmscan /root/repo/json/common-mechanism/commec/tests/test_data/single_record.fasta 10 | # Current dir: /root/repo/json/common-mechanism 11 | # Date: Thu Sep 19 12:15:55 2024 12 | # [ok] 13 | -------------------------------------------------------------------------------- /commec/utils/file_utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) 2021-2024 International Biosecurity and Biosafety Initiative for Science 3 | """ 4 | Static functions useful for dealing with common file parsing tasks. 5 | """ 6 | 7 | import argparse 8 | import os 9 | 10 | # Below go to config parameters. 
11 | @staticmethod 12 | def directory_arg(path): 13 | """Raise ArgumentTypeError if `path` is not a directory.""" 14 | if not os.path.isdir(path): 15 | raise argparse.ArgumentTypeError(f"{path} is not a valid directory path") 16 | return path 17 | 18 | @staticmethod 19 | def file_arg(path): 20 | """Raise ArgumentTypeError if `path` is not a file.""" 21 | if not os.path.isfile(path): 22 | raise argparse.ArgumentTypeError(f"{path} is not a valid file") 23 | if not os.path.getsize(path) > 0: 24 | raise argparse.ArgumentTypeError(f"{path} is an empty file") 25 | return path 26 | 27 | @staticmethod 28 | def expand_and_normalize(path): 29 | """Expand ~ and $var path elements, and normalize path, removing double slashes, etc.""" 30 | return os.path.normpath(os.path.expandvars(os.path.expanduser(path))) 31 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 International Biosecurity and Biosafety Initiative for Science 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /commec/utils/template.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | ${page_title | h} 6 | 31 | 32 | 33 |
34 | % for fig_html in figures_html: 35 |
36 | ${fig_html | n} 37 |
38 | % endfor 39 |
40 | 41 | 42 | -------------------------------------------------------------------------------- /example_data/output_commec-examples/commec-examples.low_concern.cmscan: -------------------------------------------------------------------------------- 1 | #target name accession query name accession mdl mdl from mdl to seq from seq to strand trunc pass gc bias score E-value inc description of target 2 | #------------------- --------- ------------------------ --------- --- -------- -------- -------- -------- ------ ----- ---- ---- ----- ------ --------- --- --------------------- 3 | # 4 | # Program: cmscan 5 | # Version: 1.1.5 (Sep 2023) 6 | # Pipeline mode: SCAN 7 | # Query file: /mnt/data/home/ec2-user/analysis/v1.0.0-2025-07/2025-08-06/input_commec-examples/commec-examples.cleaned.fasta 8 | # Target file: /mnt/data/home/ec2-user/cm-dbs/low_concern/rna/benign.cm 9 | # Option settings: cmscan --tblout /mnt/data/home/ec2-user/analysis/v1.0.0-2025-07/2025-08-06/output_commec-examples/commec-examples.low_concern.cmscan --cpu 12 /mnt/data/home/ec2-user/cm-dbs/low_concern/rna/benign.cm /mnt/data/home/ec2-user/analysis/v1.0.0-2025-07/2025-08-06/input_commec-examples/commec-examples.cleaned.fasta 10 | # Current dir: /mnt/data/home/ec2-user/analysis/v1.0.0-2025-07 11 | # Date: Wed Aug 6 22:30:56 2025 12 | # [ok] 13 | -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | ## Background 2 | 9 | 10 | 11 | 14 | **Issues**: 15 | 21 | 22 | ## Changes 23 | 26 | ### Bug fixes 27 | * 28 | 29 | ### New features 30 | * 31 | 32 | ### Breaking changes 33 | * 34 | 35 | ### Refactoring 36 | * 37 | 38 | ## Relevant logs, error messages, etc. 
39 | -------------------------------------------------------------------------------- /example_data/input_commec-examples/commec-examples_config.yaml: -------------------------------------------------------------------------------- 1 | base_paths: 2 | default: /mnt/data/home/ec2-user/cm-dbs/ 3 | databases: 4 | biorisk: 5 | annotations: /mnt/data/home/ec2-user/cm-dbs/biorisk/biorisk_annotations.csv 6 | path: /mnt/data/home/ec2-user/cm-dbs/biorisk/biorisk.hmm 7 | taxids: /mnt/data/home/ec2-user/cm-dbs/biorisk/reg_taxids.txt 8 | low_concern: 9 | annotations: /mnt/data/home/ec2-user/cm-dbs/low_concern/low_concern_annotations.tsv 10 | dna: 11 | path: /mnt/data/home/ec2-user/cm-dbs/low_concern/dna/benign.fasta 12 | protein: 13 | path: /mnt/data/home/ec2-user/cm-dbs/low_concern/protein/benign.hmm 14 | rna: 15 | path: /mnt/data/home/ec2-user/cm-dbs/low_concern/rna/benign.cm 16 | taxids: /mnt/data/home/ec2-user/cm-dbs/low_concern/vax_taxids.txt 17 | regulated_nt: 18 | path: /mnt/data/home/ec2-user/cm-dbs/nt_blast/core_nt 19 | regulated_protein: 20 | blast: 21 | path: /mnt/data/home/ec2-user/cm-dbs/nr_blast/nr 22 | diamond: 23 | path: /mnt/data/home/ec2-user/cm-dbs/nr_dmnd/nr.dmnd 24 | taxonomy: 25 | path: /mnt/data/home/ec2-user/cm-dbs/taxonomy/ 26 | diamond_jobs: null 27 | do_cleanup: false 28 | force: false 29 | protein_search_tool: blastx 30 | resume: false 31 | skip_nt_search: false 32 | skip_taxonomy_search: false 33 | threads: 12 34 | verbose: true 35 | -------------------------------------------------------------------------------- /commec/tests/test_dbs/low_concern/protein/benign.hmm.exemplar.out: -------------------------------------------------------------------------------- 1 | # --- full sequence --- -------------- this domain ------------- hmm coord ali coord env coord 2 | # target name accession tlen query name accession qlen E-value score bias # of c-Evalue i-Evalue score bias from to from to from to acc description of target 3 | #------------------- ---------- ----- -------------------- ---------- ----- --------- ------ ----- --- --- --------- --------- ------ ----- ----- ----- ----- ----- ----- ----- ---- --------------------- 4 | # 5 | # Program: hmmscan 6 | # Version: 3.4 (Aug 2023) 7 | # Pipeline mode: SCAN 8 | # Query file: /root/repo/json/common-mechanism/commec/tests/test_data/single_record.fasta 9 | # Target file: /root/repo/json/common-mechanism/commec/tests/test_dbs/benign_db/benign.hmm 10 | # Option settings: hmmscan --domtblout /root/repo/json/common-mechanism/commec/tests/test_dbs/benign_db/benign.hmm.exemplar.out /root/repo/json/common-mechanism/commec/tests/test_dbs/benign_db/benign.hmm /root/repo/json/common-mechanism/commec/tests/test_data/single_record.fasta 11 | # Current dir: /root/repo/json/common-mechanism 12 | # Date: Thu Sep 19 12:15:55 2024 13 | # [ok] 14 | -------------------------------------------------------------------------------- /commec/tests/test_data/screen-files/prot-mixed-hit-2024-06.screen: -------------------------------------------------------------------------------- 1 | >> STEP 1: Checking for biorisk genes... 2 | --> Biorisks: no hits detected, PASS 3 | STEP 1 completed at 2024-06-24 19:29:55 4 | >> STEP 2: Checking regulated pathogen proteins... 
5 | 0 10239;2559587;2732396;2732408;2732506;76804;2499399;11118;2501931;694002;2509511;694009;1508227 6 | 1 10239;2559587;2732396;2732408;2732506;76804;2499399;11118;2501931;694002;2509511;694009;1508227 7 | --> Best match to sequence(s) QGA88265, QGA88308, WEG19430, QZX47334, QWN56262, QZX47339, QGA88261 at bases 3 - 365 found in both regulated and non-regulated organisms 8 | Species: Severe acute respiratory syndrome-related coronavirus, Betacoronavirus sp. RsYN09, Sarbecovirus sp. (taxid(s): 2833184, 694009, 2872810, 1508227) (100.0 percent identity to query) 9 | Description: ORF1ab polyprotein [Severe acute respiratory syndrome-related coronavirus] 10 | --> no top hit exclusive to a regulated pathogen: PASS 11 | STEP 2 completed at 2024-06-24 19:32:11 12 | >> STEP 3: Checking regulated pathogen nucleotides... 13 | ...protein hits found, fetching nt regions not covered by a 90% ID hit or better 14 | --> no noncoding regions >= 50 bases found, skipping nt scan 15 | STEP 3 completed at 2024-06-24 19:32:12 16 | >> STEP 4: Checking any pathogen regions for benign components... 17 | ...no regulated regions to clear 18 | >> COMPLETED AT 2024-06-24 19:32:13 19 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools >= 61.0"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "commec" 7 | version = '1.0.2' 8 | requires-python = ">=3.10" 9 | # This is not a pure python project; dependencies are managed through environment.yml 10 | authors = [ 11 | { name = "Nicole Wheeler" }, 12 | { name = "Jen Lu" }, 13 | { name = "Michael Barnett" }, 14 | { name = "Tessa Alexanian", email = "tessa@ibbis.bio" }, 15 | ] 16 | maintainers = [ 17 | { name = "International Biosecurity and Biosafety Initiative for Science (IBBIS)", email = "info@ibbis.bio" }, 18 | ] 19 | description = 'Free, open-source, globally available tool for DNA sequence screening' 20 | readme = "README.md" 21 | license = { file = "LICENSE" } 22 | keywords = ["synthesis screening", "DNA synthesis"] 23 | classifiers = [ 24 | "Development Status :: 3 - Alpha", 25 | "Intended Audience :: Developers", 26 | "License :: OSI Approved :: MIT License", 27 | "Programming Language :: Python :: 3", 28 | "Topic :: Scientific/Engineering :: Bio-Informatics", 29 | ] 30 | 31 | [project.urls] 32 | Homepage = "https://ibbis.bio/common-mechanism" 33 | Repository = "https://github.com/ibbis-screening/common-mechanism.git" 34 | 35 | [project.scripts] 36 | "commec" = "commec.cli:main" 37 | 38 | [tool.setuptools] 39 | packages = { find = { "include" = ["commec", "commec.*"] } } 40 | package-data = { "commec" = ["utils/template.html", "screen-default-config.yaml"] } -------------------------------------------------------------------------------- /commec/tests/test_data/screen-files/prot-hit-not-cleared-2024-06.screen: -------------------------------------------------------------------------------- 1 | >> STEP 1: Checking for biorisk genes... 2 | --> Biorisks: no hits detected, PASS 3 | STEP 1 completed at 2024-09-03 05:57:18 4 | >> STEP 2: Checking regulated pathogen proteins... 
5 | --> Best match to sequence(s) AMT79992, AHL83753, AAF04797, AMT80120, AMT80254, ADW86059, ANB41717, WFP21365, ACY66806, AHL83655, WFP21263, AHL83687, AMT80218, ANB41697, AHL83781, AMT80072, WGH73008, WFP21271, Q4QXJ7, WFP21337, 7V0N_A, WFP21323, ANB41727, AMN91481, AMN91457, AMT80132, WFP21377, AHL83797, ANB41585, WFP21295, ALE15082, AMT80102, AMT79954, ANB41701, AHL83809, WFP21289, AMN91563, AHL83649, AMT80320, AHL83707, WGH73012, WFP21353, AHL83735, AMN91527, ADB08660, ADW86014, ADW86051, ANB41675, AMN91485, ANB41577 at bases 3 - 629 found in only regulated organisms: FLAG (virus) 6 | Species: Eastern equine encephalitis virus (taxid(s): 11021, 374598) (100.0 percent identity to query) 7 | Description: E1 glycoprotein, partial [Eastern equine encephalitis virus] 8 | STEP 2 completed at 2024-09-03 06:42:37 9 | >> STEP 3: Checking regulated pathogen nucleotides... 10 | ...protein hits found, fetching nt regions not covered by a 90% ID hit or better 11 | --> no noncoding regions >= 50 bases found, skipping nt scan 12 | STEP 3 completed at 2024-09-03 06:42:38 13 | >> STEP 4: Checking any pathogen regions for benign components... 14 | ...no housekeeping protein hits 15 | ...no benign RNA hits 16 | ...no Synbio sequence hits 17 | -->Regulated region at bases 3 to 629 failed to clear: FLAG 18 | >> COMPLETED AT 2024-09-03 06:42:41 19 | -------------------------------------------------------------------------------- /conda-recipe/meta.yaml: -------------------------------------------------------------------------------- 1 | {% set name = "commec" %} 2 | {% set version = "1.0.2" %} 3 | {% set sha256 = "" %} 4 | 5 | package: 6 | name: "{{ name }}" 7 | version: "{{ version }}" 8 | 9 | source: 10 | url: https://github.com/ibbis-screening/common-mechanism/archive/refs/tags/v{{version}}.tar.gz 11 | sha256: {{ sha256 }} 12 | 13 | build: 14 | number: 0 15 | noarch: python 16 | script: "{{ PYTHON }} -m pip install . --no-deps --no-build-isolation --no-cache-dir -vvv" 17 | run_exports: 18 | - {{ pin_subpackage('commec', max_pin="x.x.x") }} 19 | 20 | requirements: 21 | build: 22 | - python >=3.10 23 | - pip 24 | - setuptools 25 | host: 26 | - python >=3.10 27 | - pip 28 | - setuptools 29 | run: 30 | - python >=3.10 31 | # Runtime Python dependencies 32 | - biopython 33 | - numpy 34 | - pandas 35 | - pytaxonkit 36 | - pyyaml 37 | # Runtime non-Python dependencies 38 | - blast >=2.16 39 | - diamond >=2.1 40 | - hmmer 41 | - infernal 42 | - wget 43 | - plotly 44 | - yaml 45 | - mako 46 | 47 | test: 48 | commands: 49 | - commec screen --help 50 | - commec flag --help 51 | - commec split --help 52 | 53 | about: 54 | home: https://github.com/ibbis-screening/common-mechanism 55 | license: MIT 56 | license_family: MIT 57 | doc_url: https://github.com/ibbis-screening/common-mechanism/wiki 58 | summary: "commec: a free, open-source, globally available tool for DNA sequence screening" 59 | dev_url: https://github.com/ibbis-screening/common-mechanism 60 | 61 | extra: 62 | identifiers: 63 | - biotools:commec 64 | container: 65 | image: "quay.io/biocontainers/commec" -------------------------------------------------------------------------------- /dev_scripts/split_fasta.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) 2021-2024 International Biosecurity and Biosafety Initiative for Science 3 | """ 4 | Split a multi-record FASTA file into files with a set number of sequences per file. Output will be 5 | input.#.fa. 
6 | 7 | Command-line usage: 8 | split_fasta.py --i input_fasta -n num_seqs 9 | """ 10 | import os, sys, argparse 11 | from Bio import SeqIO 12 | 13 | def main(): 14 | parser = argparse.ArgumentParser() 15 | parser.add_argument("-i","--input", dest="i_file", 16 | required=True, help="multi-FASTA file to split") 17 | parser.add_argument("-n","--num", dest="num_seqs", 18 | type=int, 19 | required=True, help="Number of sequences per file (min)") 20 | args = parser.parse_args() 21 | 22 | basename = os.path.splitext(args.i_file)[0] 23 | basename = os.path.basename(basename) 24 | count_curr = 0 25 | count_total = 0 26 | num_splits = 0 27 | sys.stdout.write("\t%i sequences printed (%i splits)" % (count_total,num_splits)) 28 | sys.stdout.flush() 29 | for record in SeqIO.parse(args.i_file,"fasta"): 30 | if count_curr == 0: 31 | num_splits += 1 32 | o_file = open(basename + "." + str(num_splits) + ".fa" , 'w') 33 | SeqIO.write(record, o_file, "fasta") 34 | count_curr += 1 35 | count_total += 1 36 | if count_total % 10000 == 0: 37 | sys.stdout.write("\r\t%i sequences printed (%i splits)" % (count_total,num_splits)) 38 | sys.stdout.flush() 39 | if count_curr == args.num_seqs: 40 | count_curr = 0 41 | o_file.close() 42 | 43 | sys.stdout.write("\t%i sequences printed (%i splits)\n" % (count_total,num_splits)) 44 | sys.stdout.flush() 45 | 46 | 47 | if __name__ == "__main__": 48 | main() 49 | -------------------------------------------------------------------------------- /commec/utils/dict_utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) 2021-2024 International Biosecurity and Biosafety Initiative for Science 3 | """ 4 | Static functions useful for dealing with common dictionary tasks. 5 | """ 6 | 7 | @staticmethod 8 | def deep_update(to_update: dict[str, any], 9 | has_updates: dict[str, any]) -> tuple[ 10 | dict[str,any], 11 | list[tuple[str,any]]]: 12 | """ 13 | Recursively update a nested dictionary without completely overwriting nested dictionaries. 14 | Only already existing keys are updated. Any keys not existing in the dictionary 15 | to be updated are returned as a list of rejected key value pairs. 16 | ----- 17 | Inputs: 18 | * to_update : dict[str, any] Dictionary to be updated. 19 | * has_updates : dict[str, any] New dictionary information to be added. 20 | ---- 21 | Outputs: 22 | * updated : dict[str, any] a copy of the to_update dictionary, with values 23 | from any matching keys overridden by has_updates. 24 | * rejected : list[tuple[str,any] A list of the rejected key value pairs, i.e. 25 | keys present in has_updates, but not present in to_update. 26 | """ 27 | rejected = [] 28 | updated = to_update.copy() 29 | for key, value in has_updates.items(): 30 | # If both values are dictionaries, recursively update 31 | if key in updated and isinstance(updated[key], dict) and isinstance(value, dict): 32 | updated[key], additional_rejects = deep_update(updated[key], value) 33 | rejected.extend(additional_rejects) 34 | # If not a dictionary, just copy the value. 35 | elif key in updated: 36 | updated[key] = value 37 | # If not present, we log an unexpected input one. 
38 | else: 39 | rejected.append((key, value)) 40 | return updated, rejected 41 | -------------------------------------------------------------------------------- /commec/tests/test_flag.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import textwrap 4 | 5 | from commec.flag import add_args, run 6 | 7 | SCREEN_DIR = os.path.join(os.path.dirname(__file__), "test_data") 8 | 9 | def test_flag(tmp_path): 10 | """We are lazily writing tests for a full run of flag instead of unit tests.""" 11 | parser = argparse.ArgumentParser() 12 | add_args(parser) 13 | args = parser.parse_args([SCREEN_DIR, "-o", str(tmp_path), "-r"]) 14 | run(args) 15 | 16 | # Check if the output file exists 17 | status_output = tmp_path / "screen_pipeline_status.csv" 18 | assert status_output.exists() 19 | 20 | expected_status = textwrap.dedent( 21 | f"""\ 22 | name,filepath,flag,biorisk,protein,nucleotide,low_concern,virus_flag,bacteria_flag,eukaryote_flag,low_concern_protein,low_concern_rna,low_concern_dna,rationale 23 | FLAG_TEST_01,{SCREEN_DIR}/flag_tests.json,Flag,Flag,Pass,Pass,Flag,False,False,False,False,False,False, 24 | FLAG_TEST_02,{SCREEN_DIR}/flag_tests.json,Flag,Pass,Flag,Pass,Flag,True,False,False,False,False,False, 25 | FLAG_TEST_03,{SCREEN_DIR}/flag_tests.json,Flag,Pass,Flag,Pass,Flag,False,True,False,False,False,False, 26 | FLAG_TEST_04,{SCREEN_DIR}/flag_tests.json,Flag,Pass,Flag,Pass,Flag,False,False,True,False,False,False, 27 | FLAG_TEST_05,{SCREEN_DIR}/flag_tests.json,Flag,Pass,Pass,Flag,Flag,True,True,True,False,False,False, 28 | FLAG_TEST_06,{SCREEN_DIR}/flag_tests.json,Pass,Pass,Mixed,Pass,Pass,True,False,False,False,False,False, 29 | FCTEST1,{SCREEN_DIR}/functional.json,Flag,Flag,Flag,Flag,Flag,True,False,False,True,True,True,"Matches sequence with pathogenic or toxin function, and protein and nucleotide sequence with regulated organisms; as well as virulence factor; as well as flags cleared as common or non-hazardous." 
30 | """ 31 | ) 32 | actual_status = status_output.read_text() 33 | assert expected_status.strip() == actual_status.strip() -------------------------------------------------------------------------------- /example_data/output_commec-examples/commec-examples.biorisk.hmmscan: -------------------------------------------------------------------------------- 1 | # --- full sequence --- -------------- this domain ------------- hmm coord ali coord env coord 2 | # target name accession tlen query name accession qlen E-value score bias # of c-Evalue i-Evalue score bias from to from to from to acc description of target 3 | #------------------- ---------- ----- -------------------- ---------- ----- --------- ------ ----- --- --- --------- --------- ------ ----- ----- ----- ----- ----- ----- ----- ---- --------------------- 4 | PeptidaseM24 PF00557.27 209 Part:BBa_K5108009_creA_-_1 - 686 1.3e-25 87.5 0.0 1 2 0.6 1.4e+03 -4.1 0.0 20 43 38 58 31 76 0.64 Metallopeptidase family M24 5 | PeptidaseM24 PF00557.27 209 Part:BBa_K5108009_creA_-_1 - 686 1.3e-25 87.5 0.0 2 2 1e-28 2.5e-25 86.6 0.0 2 207 174 389 173 391 0.84 Metallopeptidase family M24 6 | gi3006115embCAA73290.1 - 200 RVFV_Rift_valley_fever_2 - 563 2.2e-139 458.7 0.2 1 1 1.3e-142 3.1e-139 458.3 0.2 1 200 33 232 33 232 1.00 - 7 | # 8 | # Program: hmmscan 9 | # Version: 3.4 (Aug 2023) 10 | # Pipeline mode: SCAN 11 | # Query file: /mnt/data/home/ec2-user/analysis/v1.0.0-2025-07/2025-08-06/input_commec-examples/commec-examples.faa 12 | # Target file: /mnt/data/home/ec2-user/cm-dbs/biorisk/biorisk.hmm 13 | # Option settings: hmmscan --domtblout /mnt/data/home/ec2-user/analysis/v1.0.0-2025-07/2025-08-06/output_commec-examples/commec-examples.biorisk.hmmscan --cpu 12 /mnt/data/home/ec2-user/cm-dbs/biorisk/biorisk.hmm /mnt/data/home/ec2-user/analysis/v1.0.0-2025-07/2025-08-06/input_commec-examples/commec-examples.faa 14 | # Current dir: /mnt/data/home/ec2-user/analysis/v1.0.0-2025-07 15 | # Date: Wed Aug 6 21:23:52 2025 16 | # [ok] 17 | -------------------------------------------------------------------------------- /example_data/input_commec-examples/commec-examples.noncoding.fasta: -------------------------------------------------------------------------------- 1 | >encrypted (1-552) 2 | GACCAAGCCTGCAAAAACAAACGGCAAATTGACGGACTACTAAAGAAGACTTGCCAAATCGGACGAAGACTTGCTCGAAGTACTATGATATATATCGAAGTGGTAGGCGAAGGAGTTAAGAGATTAAGAGCTTAAGACATTCCCTAAGAGATCCCCTTTCGAAGGTATTAAAGAAGCAATTAAGGCCTGAAGGTTTTAAGGAATGGTCTAAGAAATTAAGCAGTTAAGACATGAAGGATTACAAGCAAGCTAACACGAAGATTTTAGACGGATAACACCACCCAGAAGTGGTAGCCTAAGGTGTAACATAAGAAGTAGCATAAGACCTCCCCGAGCGGTATAAGCATTTAAGCCGTGAAGGAATTAAGGCTTTAAGTTCTGAAGTAGTTAAGAAAACTTTCAAAGAAATAAGTTTTCGGATCTCTAAGGCATTAAGGAGTGAAGCCTTGAAGCTGTTAAGATATACTACTCCCTTCGAAGGCCTTAAGTCATGAAGTAGTTGCATTTCGTGCTAAGAAATTTACGAAGTATTTAAGGCGTGCACCCACCGAC 3 | >xylanase_zero_shot_des31 (1-756) 4 | 
atggaagaagtgctggcgaaaattgtgcgcgataaaaaaatttgggtggaagaacgcaaacagcagcagccgctggaaagctttcgcgataaagtgcagccgagcacccgcaacttttatgatgcgctgcagggcgataaaaccgcgtttattctggtgtgcctgaaagcgagcccgagcaaaggcctgattcgcgaagattttgatccggtgcgcattgcggcggtgtatcgccattatgcgaacgcgattaccgtgctgaccgatgaaaaatattttcagggcagctttgattttctgcgccaggtgagccaggtggcgccgcagccgattctgtgctttgattttattgtggatgaatggcagctgtatctggcggcgctgtttggcgcggcggcgattctgctgattgtggcgattctgggcgatcgcaccaaagaatttattgatatggcgcgcgaactgggcctggatgtgctggtggatgtgcatgatgaagaagatctggaaaaagtgtttagctattgccgcccgaaaattattggcgtggtgaacaccgattggcgcaccatggaaaccgatctgaacaccaccgaaaaactggcgaaactgattccgccggataccattgtgattgcgattagcggcattagcgaaccggaacaggtgaaacgcctgcgcaaagcgggcgtggatggcgtgctgattggcagcacctttgcgcgcaacccggataaagcggcggaagcgaaagaa 5 | >RVFV_Rift_valley_fever (830.0-960.0) 6 | TAGAGATTAAGGCTGCCCCACCCCCCACCCCCAATCCCGACCGTAACCCCAACCACCCCCTTTTCCCCAAACCCCTGGGCAGCCACTTAGGCTGCTGTCTTGTACGCCTGAGCAGCTGCCATGACAGCTGC 7 | >BBa_K209429_A_15261 (643.0-758.0) (1833.0-1933.0) (2633.0-2764.0) (3497.0-3550.0) 8 | gccaactttgtacaaaaaagcaggctttaaggagcaaggcaggtggacaagaggagttctagtggatccttgaacttgtctagaagctggaactcccacctgcaacatgcgaatactaatcagaattggttaattggttgtaacactggcagagcattacgctgacttgacgggacggcgcaagctcatgaccaaaatcccttaacgtgagttacgctactagatggacagcaaaggttcgtcgcagaaagggtcccgcctgctcctgctgctggtggtgtcaaatctactcttgtgccagggtgtggtctccgattacaaagatgatgatgatgtcgactccccgatctactagatgaataactcaacaaactcctctaacaatagcctggctcttacaagt 9 | -------------------------------------------------------------------------------- /commec/tests/test_data/screen-files/prot-multiple-hits-2024-06.screen: -------------------------------------------------------------------------------- 1 | >> STEP 1: Checking for biorisk genes... 2 | --> Biorisks: Regulated genes not found, PASS 3 | --> Virulence factor found in bases 526 to 802, WARNING 4 | Gene: PE2 [Venezuelan equine encephalitis virus] 5 | STEP 1 completed at 2024-06-24 23:10:58 6 | >> STEP 2: Checking regulated pathogen proteins... 7 | --> Best match to sequence(s) KXB23588 at bases 7507 - 7607 found in only regulated organisms: FLAG (bacteria) 8 | Species: Legionella pneumophila (taxid(s): 446) (73.469 percent identity to query) 9 | Description: hypothetical protein PtVF66_13895, partial [Legionella pneumophila] 10 | --> Best match to sequence(s) WP_099588326 at bases 6123 - 6929 found in both regulated and non-regulated organisms 11 | Species: Bacillus cereus group sp. BY9-3LC, Clostridioides difficile, Streptococcus dysgalactiae, 12 | Lactiplantibacillus plantarum, Bacillus cereus, Francisella tularensis, Sinomonas cellulolyticus, 13 | Pseudomonas sp. MWU13-2860, Lacticaseibacillus rhamnosus, Escherichia coli, Corynebacterium 14 | glutamicum, Borreliella burgdorferi (taxid(s): 2071714, 47715, 139, 99822, 119857, 562, 1396, 1590, 1718, 1496, 3018075, 2801916) (100.0 percent identity to query) 15 | Description: MULTISPECIES: APH(3')-I family aminoglycoside O-phosphotransferase [Bacteria] 16 | STEP 2 completed at 2024-06-24 23:23:21 17 | >> STEP 3: Checking regulated pathogen nucleotides... 18 | ...protein hits found, fetching nt regions not covered by a 90% ID hit or better 19 | --> no top hit exclusive to a regulated pathogen: PASS 20 | STEP 3 completed at 2024-06-24 23:23:32 21 | >> STEP 4: Checking any pathogen regions for benign components... 
22 | -->Housekeeping proteins covering 2.0 to 925.0 = PASS 23 | COG0449: glutamine-fructose-6-phosphate transaminase (isomerizing) activity (E-value: 5.6e-50 24 | ...no benign RNA hits 25 | -->Synbio sequences - <80% coverage achieved = FAIL 26 | -->Regulated region at bases 7507 to 7607 failed to clear: FLAG 27 | >> COMPLETED AT 2024-06-24 23:23:43 28 | -------------------------------------------------------------------------------- /commec/tests/test_trim.py: -------------------------------------------------------------------------------- 1 | """ 2 | Simple tests to ensure the correct trimming of overlapping components in a Hmmer database 3 | when parsed to remove_overlaps. 4 | The following behaviour is expected: 5 | * Fully encapsulated hits, should be removed. 6 | * Partially overlapping hits, should both be kept (To maximise extents) 7 | * Hits from different queries are independant in logic. 8 | """ 9 | 10 | import pandas as pd 11 | import pytest 12 | from commec.tools.hmmer import remove_overlaps 13 | 14 | # Test the following hmmer configuration: 15 | # 10-----------------50 (Largest, should stay.) 16 | # 40-------60 (Extends 1. Should stay.) 17 | # 20-------40 (encapsulated, but high score, should stay!) 18 | # 10-----------------50 (lower score than 1, should be removed.) 19 | # 20---30 (Different query, removed) 20 | # 10------------------------------90 (different query) 21 | 22 | # Example DataFrame 23 | example_hmmer_01 = pd.DataFrame({ 24 | "query name": ["one","one","one","one","two", "two"], 25 | "q. start": [10, 40, 20, 10, 20, 10], 26 | "q. end": [50, 60, 40, 50, 30, 90], 27 | "score": [3, 5, 6, 1, 1, 2] 28 | }) 29 | 30 | # Example DataFrame 31 | example_hmmer_01_output = pd.DataFrame({ 32 | "query name": ["one","one", "one", "two"], 33 | "q. start": [10, 40, 20, 10], 34 | "q. end": [50, 60, 40, 90], 35 | "score": [3, 5, 6, 2] 36 | }) 37 | 38 | @pytest.mark.parametrize( 39 | "input_hmmer, expected_output_hmmer", 40 | [ 41 | (example_hmmer_01, example_hmmer_01_output), 42 | ] 43 | ) 44 | def test_hmmer_overlaps( 45 | input_hmmer : pd.DataFrame, 46 | expected_output_hmmer : pd.DataFrame 47 | ): 48 | """ 49 | Checks common configurations that require trimming in Hmmer outputs, 50 | In particular partial overlaps, full encapsulations, score differences, and different queries. 
51 | """ 52 | trimmed_input = remove_overlaps(input_hmmer) 53 | print("INPUT:") 54 | print(input_hmmer) 55 | print("TRIMMED:") 56 | print(trimmed_input) 57 | print("CORRECT:") 58 | print(expected_output_hmmer) 59 | assert trimmed_input.equals(expected_output_hmmer) 60 | -------------------------------------------------------------------------------- /example_data/output_commec-examples/commec-examples.low_concern.blastn: -------------------------------------------------------------------------------- 1 | # BLASTN 2.16.0+ 2 | # Query: Part:BBa_K5108009_creA_- 3 | # Database: /mnt/data/home/ec2-user/cm-dbs/low_concern/dna/benign.fasta 4 | # 0 hits found 5 | # BLASTN 2.16.0+ 6 | # Query: encrypted 7 | # Database: /mnt/data/home/ec2-user/cm-dbs/low_concern/dna/benign.fasta 8 | # 0 hits found 9 | # BLASTN 2.16.0+ 10 | # Query: xylanase_zero_shot_des31 11 | # Database: /mnt/data/home/ec2-user/cm-dbs/low_concern/dna/benign.fasta 12 | # 0 hits found 13 | # BLASTN 2.16.0+ 14 | # Query: RVFV_Rift_valley_fever 15 | # Database: /mnt/data/home/ec2-user/cm-dbs/low_concern/dna/benign.fasta 16 | # 0 hits found 17 | # BLASTN 2.16.0+ 18 | # Query: BBa_K209429_A_15261 19 | # Database: /mnt/data/home/ec2-user/cm-dbs/low_concern/dna/benign.fasta 20 | # Fields: query acc., subject title, subject acc., subject tax ids, evalue, bit score, % identity, query length, q. start, q. end, subject length, s. start, s. end 21 | # 9 hits found 22 | BBa_K209429_A_15261 BBa_K209429 A 15261 Composite "SSFYFP-hM2D-Tuba" BBa_K209429 0 0.0 17813 100.000 9646 1 9646 9646 1 9646 23 | BBa_K209429_A_15261 BBa_K209440 A 15305 Composite "pEF1-SSFYFP-hM2D-Tuba" BBa_K209440 0 0.0 12942 100.000 9646 2639 9646 7008 1 7008 24 | BBa_K209429_A_15261 BBa_K209427 A 15259 Composite "SSFYFP-hM2D-ActA(30-612)" BBa_K209427 0 0.0 9070 100.000 9646 1 4911 6649 1 4911 25 | BBa_K209429_A_15261 BBa_K209430 A 15262 Composite "SSFYFP-hM2D-ITSN" BBa_K209430 0 0.0 9068 100.000 9646 1 4910 6586 1 4910 26 | BBa_K209429_A_15261 BBa_K209431 A 15263 Composite "SSFYFP-hM2D-Beta Pix" BBa_K209431 0 0.0 9066 100.000 9646 1 4909 6844 1 4909 27 | BBa_K209429_A_15261 BBa_K209428 A 15260 Composite "SSFYFP-hM2D-Vav" BBa_K209428 0 0.0 9066 100.000 9646 1 4909 7246 1 4909 28 | BBa_K209429_A_15261 BBa_K209445 A 15308 Composite "pEF1-SSFYFP-Rs1.3-Tuba" BBa_K209445 0 0.0 8765 100.000 9646 4901 9646 7019 2274 7019 29 | BBa_K209429_A_15261 BBa_K209424 A 15256 Composite "SSFYFP-Rs1.3-Tuba" BBa_K209424 0 0.0 8765 100.000 9646 4901 9646 9657 4912 9657 30 | BBa_K209429_A_15261 BBa_K209409 A 15183 Coding "AarI C-D part, Tuba" BBa_K209409 0 0.0 8754 100.000 9646 4907 9646 4740 1 4740 31 | # BLAST processed 5 queries 32 | -------------------------------------------------------------------------------- /example_data/README.md: -------------------------------------------------------------------------------- 1 | # Example data 2 | 3 | This directory contains a file, `commec-examples.fasta`, which inludes queries illustrating different possible screening outcomes, as well as the results of running `commec screen` on that file. 4 | 5 | A guide to interpreting these results is provided in the [Tutorial](https://github.com/ibbis-bio/common-mechanism/wiki/tutorial) on the `commec` wiki. 6 | 7 | ### Examples included 8 | 9 | * **BBa_K5108009_creA_** (`WARN`): This is a [composite DNA part](https://parts.igem.org/Part:BBa_K5108009) developed by 2024 iGEM team Toulouse-INSA-UPS for space exploration applications. 
It is an artificial operon composed of four basic parts: creatinase and creatinine amidohydrolase ORFs (creA BBa K5108003, crnA BBa K5108004) and two RBS (BBa K5108006, BBaK5108007) enabling their expression in the plant growth-promoting rhizobacteria, _Pseudomonas fluorescens_, enabling the metabolization of creatinine by this organism. 10 | * **encrypted** (`WARN`): This DNA sequence contains an encrypted message generated using the [CryptoGErM](https://2016.igem.org/Team:Groningen/Tour) algorithm developed by the 2016 iGEM team from Groningen. It is therefore an entirely artificial sequence, with no biological function or related taxonomy across the domains of life. 11 | * **xylanase_zero_shot_des31** (`PASS`): This sequence is one of the xylanase variants used in the zero-shot enzyme activity prediction challenge problem from [Align Bio’s](https://alignbio.org/) 2023 [Protein Engineering Tournament](https://alignbio.org/tournamentpilot-results-2023). Xylanase is an enzyme that degrades the second-most-abundant polysaccharide and should not be flagged. 12 | * **BBa_K209429_A_15261** (`PASS`): This sequence is another [composite DNA part](https://parts.igem.org/Part:BBa_K209429) created by the [igem UCSF team in 2009](https://2009.igem.org/Team:UCSF) with the goal of manipulating signaling pathways to mediate chemotaxis. 13 | * **RVFV_Rift_valley_fever** (`FLAG`): The Rift Valley Fever virus sample is successfully flagged during the taxonomic steps as containing extensive regions of material from regulated organisms, namely nucleocapsid proteins from _Phlebovirus riftense_ AKA Rift Valley Fever virus. 14 | -------------------------------------------------------------------------------- /commec/tests/test_coverage.py: -------------------------------------------------------------------------------- 1 | """ 2 | Simple tests to ensure the correct trimming of overlapping components in a Hmmer database 3 | when parsed to remove_overlaps. 4 | The following behaviour is expected: 5 | * Fully encapsulated hits, should be removed. 6 | * Partially overlapping hits, should both be kept (To maximise extents) 7 | * Hits from different queries are independant in logic. 8 | """ 9 | 10 | import pandas as pd 11 | import pytest 12 | from commec.screeners.check_low_concern import _calculate_coverage 13 | from commec.config.result import MatchRange 14 | 15 | # Test the following hmmer configuration: 16 | # 10-----------------50 (Largest, should stay.) 17 | # 40-------60 (Extends 1. Should stay.) 18 | # 20-------40 (encapsulated, but high score, should stay!) 19 | # 10-----------------50 (lower score than 1, should be removed.) 20 | # 20---30 (Different query, removed) 21 | # 10------------------------------90 (different query) 22 | 23 | # Example DataFrame 24 | example_hmmer_01 = pd.DataFrame({ 25 | "q. start": [100, 50, 0, 0], 26 | "q. end": [200, 150, 100, 200], 27 | }) 28 | 29 | # Example DataFrame 30 | example_hmmer_01_output = pd.DataFrame({ 31 | "q. start": [100, 50, 0, 0], 32 | "q. 
end": [200, 150, 100, 200], 33 | "coverage_nt": [100, 50, 0, 100], 34 | "coverage_ratio": [1.0, 0.5, 0.0, 1.0] 35 | }) 36 | 37 | reg_range_01 = MatchRange(0.0, 100, 200, 100, 200) 38 | 39 | @pytest.mark.parametrize( 40 | "input_hmmer, input_region, expected_output_hmmer", 41 | [ 42 | (example_hmmer_01, reg_range_01, example_hmmer_01_output), 43 | ] 44 | ) 45 | def test_coverage_overlaps( 46 | input_hmmer : pd.DataFrame, 47 | input_region : MatchRange, 48 | expected_output_hmmer : pd.DataFrame 49 | ): 50 | """ 51 | Checks common configurations that require trimming in Hmmer outputs, 52 | In particular partial overlaps, full encapsulations, score differences, and different queries. 53 | """ 54 | trimmed_input = _calculate_coverage(input_hmmer, input_region) 55 | print("INPUT:") 56 | print(input_hmmer) 57 | print("TRIMMED:") 58 | print(trimmed_input) 59 | print("CORRECT:") 60 | print(expected_output_hmmer) 61 | assert trimmed_input.equals(expected_output_hmmer) 62 | -------------------------------------------------------------------------------- /example_data/output_commec-examples/commec-examples.low_concern.hmmscan: -------------------------------------------------------------------------------- 1 | # --- full sequence --- -------------- this domain ------------- hmm coord ali coord env coord 2 | # target name accession tlen query name accession qlen E-value score bias # of c-Evalue i-Evalue score bias from to from to from to acc description of target 3 | #------------------- ---------- ----- -------------------- ---------- ----- --------- ------ ----- --- --- --------- --------- ------ ----- ----- ----- ----- ----- ----- ----- ---- --------------------- 4 | COG0006 - 321 Part:BBa_K5108009_creA_-_1 - 686 9.9e-57 187.1 0.8 1 1 8e-59 1.4e-56 186.6 0.8 3 314 29 402 26 411 0.82 - 5 | COG0024 - 475 Part:BBa_K5108009_creA_-_1 - 686 9.5e-17 55.5 0.0 1 1 8.2e-19 1.4e-16 54.9 0.0 217 428 159 373 29 406 0.86 - 6 | COG0042 - 326 xylanase_zero_shot_des31_1 - 252 0.0017 12.0 0.0 1 2 0.58 1e+02 -3.7 0.0 70 88 136 155 129 163 0.73 - 7 | COG0042 - 326 xylanase_zero_shot_des31_1 - 252 0.0017 12.0 0.0 2 2 1.7e-05 0.0031 11.2 0.0 184 240 193 250 180 252 0.82 - 8 | COG0516 - 472 xylanase_zero_shot_des31_1 - 252 0.0027 11.1 0.0 1 1 2.1e-05 0.0037 10.6 0.0 225 283 156 237 115 241 0.72 - 9 | # 10 | # Program: hmmscan 11 | # Version: 3.4 (Aug 2023) 12 | # Pipeline mode: SCAN 13 | # Query file: /mnt/data/home/ec2-user/analysis/v1.0.0-2025-07/2025-08-06/input_commec-examples/commec-examples.faa 14 | # Target file: /mnt/data/home/ec2-user/cm-dbs/low_concern/protein/benign.hmm 15 | # Option settings: hmmscan --domtblout /mnt/data/home/ec2-user/analysis/v1.0.0-2025-07/2025-08-06/output_commec-examples/commec-examples.low_concern.hmmscan --cpu 12 /mnt/data/home/ec2-user/cm-dbs/low_concern/protein/benign.hmm /mnt/data/home/ec2-user/analysis/v1.0.0-2025-07/2025-08-06/input_commec-examples/commec-examples.faa 16 | # Current dir: /mnt/data/home/ec2-user/analysis/v1.0.0-2025-07 17 | # Date: Wed Aug 6 22:30:55 2025 18 | # [ok] 19 | -------------------------------------------------------------------------------- /commec/utils/coordinates.py: -------------------------------------------------------------------------------- 1 | """ 2 | Helper functions associated with the handling of 3 | basepair|amino-acid, nucleotide|peptide coordinate systems. 
4 | """ 5 | 6 | import numpy as np 7 | 8 | def convert_protein_to_nucleotide_coords(frame, 9 | protein_start, 10 | protein_end, 11 | seq_length): 12 | """ 13 | Convert protein coordinates to nucleotide coordinates considering the reading frame. 14 | 15 | Parameters: 16 | frame (int, or [int]): Reading frame (1-6) 17 | Frames 1-3: Forward frames starting at positions 0, 1, 2 18 | Frames 4-6: Reverse frames starting from the end at positions 0, 1, 2 19 | protein_start (int, or [int]): Start position in protein coordinates, counting from 1. 20 | protein_end (int, or [int]): End position in protein coordinates, counting from 1. 21 | seq_length (int): Length of the original sequence, mandatory for reverse frames (4,5,6) only. 22 | 23 | Returns: 24 | tuple: (nucleotide_start, nucleotide_end) 25 | """ 26 | # Convert protein coordinates to 0-based, for calculation. 27 | protein_start = np.asarray(protein_start, dtype=np.int64) - 1 28 | protein_end = np.asarray(protein_end, dtype=np.int64) - 1 29 | frame = np.asarray(frame, dtype=np.int64) 30 | seq_length = np.asarray(seq_length, dtype=np.int64) 31 | 32 | # Reverse frame offsets, for when total length not divisible into codons. 33 | reverse_offset = seq_length % 3 34 | 35 | # Initialize arrays for nucleotide start and end 36 | nucleotide_start = np.zeros_like(protein_start, dtype=np.int64) 37 | nucleotide_end = np.zeros_like(protein_end, dtype=np.int64) 38 | 39 | # Forward frames (1, 2, 3) 40 | forward_mask = frame <= 3 41 | nucleotide_start[forward_mask] = (protein_start[forward_mask] * 3) + (frame[forward_mask] - 1) 42 | nucleotide_end[forward_mask] = (protein_end[forward_mask] * 3) + 2 + (frame[forward_mask] - 1) 43 | 44 | # Reverse frames (4, 5, 6) 45 | reverse_mask = frame > 3 46 | reverse_frame = frame[reverse_mask] - 3 47 | nuc_start_reverse = (protein_start[reverse_mask] * 3) + (reverse_frame - 1) 48 | nuc_end_reverse = (protein_end[reverse_mask] * 3) + 2 + (reverse_frame - 1) 49 | 50 | nucleotide_start[reverse_mask] = seq_length[reverse_mask] - nuc_end_reverse - 1 + reverse_offset[reverse_mask] 51 | nucleotide_end[reverse_mask] = seq_length[reverse_mask] - nuc_start_reverse - 1 + reverse_offset[reverse_mask] 52 | 53 | # Convert to back to 1-based coordinates for reporting. 
54 | nucleotide_start += 1 55 | nucleotide_end += 1 56 | 57 | return nucleotide_start, nucleotide_end 58 | -------------------------------------------------------------------------------- /commec/tests/test_check_biorisk.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from unittest.mock import patch 3 | import pandas as pd 4 | import os 5 | from Bio.SeqRecord import SeqRecord, Seq 6 | 7 | from commec.screeners.check_biorisk import parse_biorisk_hits, HmmerHandler 8 | from commec.config.result import ScreenResult 9 | from commec.config.query import Query 10 | 11 | INPUT_QUERY = os.path.join(os.path.dirname(__file__), "test_data/single_record.fasta") 12 | DATABASE_DIRECTORY = os.path.join(os.path.dirname(__file__), "test_dbs/") 13 | 14 | @pytest.mark.parametrize( 15 | "annotations_exists, has_empty_output, has_hits, expected_return", 16 | [ 17 | # Case 1: annotations file doesn't exist 18 | (False, False, False, 1), 19 | # Case 2: HMMER output is empty or doesn't exist 20 | (True, True, False, 1), 21 | # Case 3: No hits detected (successful pass) 22 | (True, False, False, 0), 23 | # Case 4: Successful execution with hits 24 | (True, False, True, 0), 25 | ], 26 | ) 27 | def test_check_biorisk_return_codes(annotations_exists, has_empty_output, has_hits, expected_return): 28 | mock_hit_df = pd.DataFrame( 29 | { 30 | "target name": ["test_id"], 31 | "query name": ["testname_1"], 32 | "E-value": [1e-30], 33 | "ali from": [100], 34 | "ali to": [200], 35 | "qlen": [1000], 36 | "frame" : 1 37 | } 38 | ) 39 | 40 | mock_annot_df = pd.DataFrame( 41 | {"ID": ["test_id"], "description": ["test description"], "Must flag": [True]} 42 | ) 43 | 44 | # No filesystem interactions, patch ALL the things 45 | with ( 46 | patch("os.path.exists", return_value=annotations_exists), 47 | patch("pandas.read_csv", return_value=mock_annot_df), 48 | patch("commec.screeners.check_biorisk.readhmmer", return_value=mock_hit_df), 49 | patch("commec.screeners.check_biorisk.remove_overlaps", return_value=mock_hit_df), 50 | patch("commec.screeners.check_biorisk.HmmerHandler.has_empty_output", return_value=has_empty_output), 51 | patch("commec.screeners.check_biorisk.HmmerHandler.has_hits", return_value=has_hits), 52 | ): 53 | handler = HmmerHandler(DATABASE_DIRECTORY + "biorisk/biorisk.hmm", INPUT_QUERY, "/mock/path/test.hmmscan") 54 | results = ScreenResult() 55 | queries : dict[str,Query] = {"testname" : Query(SeqRecord(Seq("atgatgatgatgatgatgatg"),"testname","testname"))} 56 | # Run the function - input paths are unused given all the mocking above 57 | result = parse_biorisk_hits(handler, "/mock/path/biorisk/biorisk_annotations.csv", results, queries) 58 | 59 | # Check the result 60 | assert result == expected_return -------------------------------------------------------------------------------- /commec/tests/test_screen_io.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from unittest.mock import mock_open, patch 3 | import os 4 | 5 | from commec.config.screen_io import ScreenIO, IoValidationError 6 | from commec.screen import add_args, ScreenArgumentParser 7 | 8 | 9 | @pytest.fixture 10 | def test_data_dir(): 11 | return os.path.join(os.path.dirname(__file__), "test_data") 12 | 13 | 14 | @pytest.fixture 15 | def database_dir(): 16 | return os.path.join(os.path.dirname(__file__), "test_dbs") 17 | 18 | 19 | @pytest.mark.parametrize( 20 | "fasta_name", 21 | [ 22 | "single_record.fasta", 23 | 
"multiple_records.fasta", 24 | "has_empty_record.fasta", 25 | "has_empty_description.fasta", 26 | "has_records_with_same_description.fasta", 27 | ], 28 | ) 29 | def test_default_parameters(fasta_name, test_data_dir, database_dir, tmp_path): 30 | input_fasta = os.path.join(test_data_dir, fasta_name) 31 | with patch( 32 | "sys.argv", 33 | ["test.py", "--skip-tx", input_fasta, "-d", database_dir, "-o", str(tmp_path)], 34 | ): 35 | parser = ScreenArgumentParser() 36 | add_args(parser) 37 | screen_io = ScreenIO(parser.parse_args()) 38 | assert screen_io.setup() 39 | 40 | 41 | @pytest.mark.parametrize( 42 | "fasta_name,expected_record_count", 43 | [ 44 | pytest.param("single_record.fasta", 1), 45 | pytest.param("multiple_records.fasta", 2), 46 | ], 47 | ) 48 | def test_parse_input_fasta( 49 | fasta_name, expected_record_count, test_data_dir, database_dir, tmp_path 50 | ): 51 | input_fasta = os.path.join(test_data_dir, fasta_name) 52 | with patch( 53 | "sys.argv", 54 | ["test.py", "--skip-tx", input_fasta, "-d", database_dir, "-o", str(tmp_path)], 55 | ): 56 | parser = ScreenArgumentParser() 57 | add_args(parser) 58 | screen_io = ScreenIO(parser.parse_args()) 59 | screen_io.setup() 60 | 61 | queries = screen_io.parse_input_fasta() 62 | assert len(queries) == expected_record_count 63 | 64 | 65 | @pytest.mark.parametrize( 66 | "fasta_name", 67 | [ 68 | "has_empty_record.fasta", 69 | "has_empty_description.fasta", 70 | "has_records_with_same_description.fasta", 71 | ], 72 | ) 73 | def test_parse_invalid_input_fasta(fasta_name, test_data_dir, database_dir, tmp_path): 74 | input_fasta = os.path.join(test_data_dir, fasta_name) 75 | with patch( 76 | "sys.argv", 77 | ["test.py", "--skip-tx", input_fasta, "-d", database_dir, "-o", str(tmp_path)], 78 | ): 79 | parser = ScreenArgumentParser() 80 | add_args(parser) 81 | screen_io = ScreenIO(parser.parse_args()) 82 | screen_io.setup() 83 | 84 | with pytest.raises(IoValidationError): 85 | screen_io.parse_input_fasta() -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # commec: a free, open-source, globally available tool for DNA sequence screening 2 | 3 | 4 | 5 | commec logo 6 | 7 | 8 | The `commec` package is a tool for DNA sequence screening that is part of the 9 | [Common Mechanism for DNA Synthesis screening](https://ibbis.bio/common-mechanism/). The package offers several sub-commands through the `commec` entrypoint: 10 | 11 | screen Run Common Mechanism screening on an input FASTA. 12 | flag Parse .screen.json files in a directory and create a CSV file of outcomes 13 | setup A command-line helper tool to download the required databases 14 | split Split a multi-record FASTA file into individual files, one for each record 15 | 16 | The `commec screen` command runs an input FASTA through the following screening steps: 17 | 18 | 1. **Biorisk search**: Fast HMM-based search against curated sequence profiles 19 | 2. **Taxonomy Search**: look for best matches to regulated pathogens using a two-step process: 20 | * **Protein search**: BLASTX/DIAMOND search against NCBI nr 21 | * **Nucleotide search**: BLASTN search against NCBI core_nt 22 | 3. 
**Low concern search**: Clear earlier flags based on matches to common or conserved sequences 23 | 24 | ![Flowchart showing decision-making by the common mechanism flag module.](https://ibbis.bio/wp-content/uploads/2025/08/commec-screening-flow-v1.jpg "Decision Flow") 25 | 26 | Information about the databases supporting screening can be found in the [commec-databases](https://github.com/ibbis-bio/commec-databases/) repository. 27 | 28 | ## Documentation 29 | The [GitHub Wiki](https://github.com/ibbis-screening/common-mechanism/wiki) has documentation for this package, including information about installing `commec` and interpreting screening results. 30 | 31 | More information about the Common Mechanism project is available on the [IBBIS project page](https://ibbis.bio/common-mechanism/) and [Common Mechanism FAQ](https://ibbis.bio/our-work/common-mechanism/faq/). 32 | 33 | ## Development 34 | The `commec` package is being actively developed by IBBIS staff. We welcome contributions! To get started, install conda, and make sure 35 | that [your channels are configured correctly](http://bioconda.github.io/). Then create the dev environment with: 36 | 37 | ``` 38 | conda env create -f environment.yaml 39 | conda activate commec-dev 40 | ``` 41 | 42 | From here, you should have an editable install of the package (via `pip install -e .`) and the necessary shell dependencies. 43 | -------------------------------------------------------------------------------- /commec/split.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) 2021-2024 International Biosecurity and Biosafety Initiative for Science 3 | """ 4 | Split a multi-record FASTA file into individual files, one for each record. 5 | 6 | Command-line usage: 7 | split.py input.fasta 8 | """ 9 | import argparse 10 | import os 11 | import string 12 | from Bio import SeqIO 13 | from commec.utils.file_utils import file_arg 14 | 15 | VALID_FILENAME_CHARS = f"-._{string.ascii_letters}{string.digits}" 16 | DESCRIPTION = ( 17 | "Split a multi-record FASTA file into individual files, one for each record" 18 | ) 19 | 20 | 21 | def add_args(parser): 22 | """ 23 | Add module arguments to an ArgumentParser object. 24 | """ 25 | parser.add_argument( 26 | action="store", dest="fasta_file", type=file_arg, help="Input fasta file" 27 | ) 28 | return parser 29 | 30 | 31 | def clean_description(description): 32 | """ 33 | Cleans the description from a sequence record for use as part of a filename. 34 | """ 35 | cleaned = description.strip() 36 | cleaned = "".join(x for x in cleaned if x in VALID_FILENAME_CHARS) 37 | if len(cleaned) > 150: 38 | cleaned = cleaned[:150] 39 | return cleaned 40 | 41 | 42 | def write_split_fasta(fasta_file): 43 | """ 44 | Parse all sequence records in an input FASTA file, and write a new file for each record.
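    Each record is written alongside the input as `<cleaned description>.fasta`; if the
    cleaned description is empty or matches the input filename, the fallback name
    `<input name>-split-<index>.fasta` is used instead (see the handling below).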
45 | """ 46 | output_dir = os.path.dirname(fasta_file) 47 | fasta_name = os.path.splitext(os.path.basename(fasta_file))[0] 48 | 49 | with open(fasta_file, "r", encoding="utf-8") as input_file: 50 | for i, record in enumerate(SeqIO.parse(input_file, "fasta")): 51 | desc = clean_description(record.description) 52 | 53 | # Handle empty descriptions and avoid overwriting input files 54 | if not desc or desc == fasta_name: 55 | output_basename = f"{fasta_name}-split-{i}.fasta" 56 | else: 57 | output_basename = f"{desc}.fasta" 58 | 59 | output_path = os.path.join(output_dir, output_basename) 60 | with open(output_path, "w", encoding="utf-8") as output_file: 61 | output_file.write(f">{desc}{os.linesep}") 62 | output_file.write(str(record.seq)) 63 | 64 | 65 | def run(parsed_args): 66 | """ 67 | Wrapper so that args be parsed in main() or commec.py interface. 68 | """ 69 | write_split_fasta(parsed_args.fasta_file) 70 | 71 | 72 | def main(): 73 | """ 74 | Main function. Passes FASTA file to `write_split_fasta`. 75 | 76 | Arguments: 77 | - fasta_file: Path to the input FASTA file. 78 | 79 | """ 80 | parser = argparse.ArgumentParser(description=DESCRIPTION) 81 | add_args(parser) 82 | run(parser.parse_args()) 83 | 84 | 85 | if __name__ == "__main__": 86 | main() 87 | -------------------------------------------------------------------------------- /commec/tests/test_data/screen-files/prot-nt-hits-cleared-2024-09.screen: -------------------------------------------------------------------------------- 1 | >> STEP 1: Checking for biorisk genes... 2 | --> Biorisks: no hits detected, PASS 3 | STEP 1 completed at 2024-09-03 05:57:18 4 | >> STEP 2: Checking regulated pathogen proteins... 5 | --> Best match to sequence(s) NDS77015 at bases 89 - 268 found in only regulated organisms: FLAG (bacteria) 6 | Species: Francisella tularensis (taxid(s): 119857) (98.333 percent identity to query) 7 | Description: hypothetical protein [Francisella tularensis subsp. holarctica] 8 | --> Best match to sequence(s) CEE24912 at bases 391 - 501 found in only regulated organisms: FLAG (bacteria) 9 | Species: Xanthomonas citri (taxid(s): 611301, 487854) (75.758 percent identity to query) 10 | Description: conserved hypothetical protein [Xanthomonas citri pv. citri] 11 | STEP 2 completed at 2024-09-03 06:42:50 12 | >> STEP 3: Checking regulated pathogen nucleotides... 13 | ...protein hits found, fetching nt regions not covered by a 90% ID hit or better 14 | ...Regulated protein region at bases 89 to 268 overlapped with a nucleotide hit 15 | Species: Francisella tularensis (taxid(s): 263) (100.0 percent identity to query) 16 | ...Regulated protein region at bases 391 to 501 overlapped with a nucleotide hit 17 | Species: Francisella tularensis (taxid(s): 263) (100.0 percent identity to query) 18 | --> Best match to sequence(s) AF143093 at bases 1 - 411 found in only regulated organisms: FLAG (bacteria) 19 | Species: Francisella tularensis (taxid(s): 263) (100.0 percent identity to query) 20 | Description: Francisella tularensis 16S ribosomal RNA gene, partial sequence 21 | STEP 3 completed at 2024-09-03 11:29:43 22 | >> STEP 4: Checking any pathogen regions for benign components... 
23 | ...no housekeeping protein hits 24 | -->Housekeeping RNAs - <50 bases unaccounted for: PASS 25 | RNA family: SSU_rRNA_bacteria 26 | -->Housekeeping RNAs - <50 bases unaccounted for: PASS 27 | RNA family: SSU_rRNA_bacteria 28 | -->Housekeeping RNAs - <50 bases unaccounted for: PASS 29 | RNA family: SSU_rRNA_bacteria 30 | -->Synbio sequences - >80% coverage achieved = PASS 31 | Synbio parts: BBa_I1100 P 3987 Composite "SBWB Ribosome Test System" 32 | BBa_C0101 P 3986 Coding "16s rRNA (non-standard 5%27 end, matched to B0035)" 33 | -->Synbio sequences - >80% coverage achieved = PASS 34 | Synbio parts: BBa_I1100 P 3987 Composite "SBWB Ribosome Test System" 35 | BBa_C0101 P 3986 Coding "16s rRNA (non-standard 5%27 end, matched to B0035)" 36 | -->Synbio sequences - >80% coverage achieved = PASS 37 | Synbio parts: BBa_I1100 P 3987 Composite "SBWB Ribosome Test System" 38 | BBa_C0101 P 3986 Coding "16s rRNA (non-standard 5%27 end, matched to B0035)" 39 | 40 | -->all regulated regions cleared: PASS 41 | >> COMPLETED AT 2024-09-03 11:29:50 42 | -------------------------------------------------------------------------------- /commec/tests/test_nc_to_nt.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from Bio.Seq import Seq 3 | from Bio.SeqRecord import SeqRecord 4 | from commec.config.query import Query, QueryValueError 5 | 6 | @pytest.fixture 7 | def seq_record(): 8 | """ 9 | Fixture to generate a SeqRecord with 10 | defined coding ('t') and non-coding ('a') regions. 11 | Total size = 150 12 | non-coding regions = 100 13 | """ 14 | non_coding_1 = "a" * 50 15 | coding_1 = "t" * 20 16 | non_coding_2 = "a" * 30 17 | coding_2 = "t" * 30 18 | non_coding_3 = "a" * 20 # Final non-coding segment 19 | 20 | sequence = non_coding_1 + coding_1 + non_coding_2 + coding_2 + non_coding_3 21 | return SeqRecord(Seq(sequence), id="test_seq", description="") 22 | 23 | @pytest.fixture 24 | def non_coding_regions(): 25 | """ 26 | Fixture to generate non-coding region tuples based on the sequence definition. 27 | Uses the same lengths as in seq_record() to compute (start, end) values. 28 | """ 29 | regions = [ 30 | (1, 50), # First 'a' region 31 | (71, 100), # Second 'a' region (starts after first coding region) 32 | (131, 150) # Third 'a' region (starts after second coding region) 33 | ] 34 | return regions 35 | 36 | # 0 based coordinates: 37 | # NT COORDS: 0 - 49 50 - 69 70 - 99 100 - 129 130 - 149 38 | # NC COORDS: 0 - 49 50 - 79 80 - 99 39 | 40 | # 1 based coordinates: 41 | # NT COORDS: 1 - 50 51 - 70 71 - 100 101 - 130 131 - 150 42 | # NC COORDS: 1 - 50 51 - 80 81 - 100 43 | 44 | @pytest.fixture 45 | def test_cases(): 46 | """ 47 | Fixture providing a list of (input_coordinate, expected_output) tuples. 48 | The input is a sequence coordinate, and the expected output is its transformed coordinate. 49 | """ 50 | return [ 51 | (1, 1), 52 | (10, 10), 53 | (50, 50), 54 | (51, 71), 55 | (80, 100), 56 | (81, 131), 57 | (99, 149), 58 | (100, 150), 59 | ] 60 | 61 | def test_coordinate_conversion(seq_record, non_coding_regions, test_cases): 62 | """ 63 | Placeholder test function for coordinate conversion. 64 | """ 65 | # Query setup 66 | test_query : Query = Query(seq_record) 67 | test_query.non_coding_regions = non_coding_regions 68 | 69 | # Test Correct coords: 70 | for nc, nt in test_cases: 71 | assert nt == test_query.nc_to_nt_query_coords(nc) 72 | 73 | # Test Failure out of bounds. 
74 | try: 75 | _x = test_query.nc_to_nt_query_coords(test_cases[-1][0]+1) 76 | assert False 77 | except QueryValueError: 78 | assert True 79 | 80 | # Test Failure out of bounds. 81 | try: 82 | _x = test_query.nc_to_nt_query_coords(0) 83 | assert False 84 | except QueryValueError: 85 | assert True 86 | 87 | -------------------------------------------------------------------------------- /commec/tools/blastn.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) 2021-2024 International Biosecurity and Biosafety Initiative for Science 3 | """ 4 | Handler for BLASTN search of nucleotide databases using nucleotide queries. 5 | Initialise with local input database, fasta to screen, and output file. 6 | Throws error if inputs are invalid. Creates a temporary log file, which is deleted on completion. 7 | """ 8 | 9 | import subprocess 10 | from commec.tools.blast_tools import BlastHandler 11 | from commec.tools.search_handler import SearchToolVersion 12 | 13 | 14 | class BlastNHandler(BlastHandler): 15 | """ 16 | A search handler specifically for BLASTN command-line during commec screening. 17 | Modify `arguments_dictionary` to change passed to the command line call. 18 | """ 19 | 20 | def __init__( 21 | self, database_file: str, input_file: str, out_file: str, **kwargs, 22 | ): 23 | super().__init__(database_file, input_file, out_file, **kwargs) 24 | # We fill this with defaults, however they can always be overridden before screening. 25 | self.arguments_dictionary = { 26 | "-outfmt": [ 27 | "7", 28 | "qacc", 29 | "stitle", 30 | "sacc", 31 | "staxids", 32 | "evalue", 33 | "bitscore", 34 | "pident", 35 | "qlen", 36 | "qstart", 37 | "qend", 38 | "slen", 39 | "sstart", 40 | "send", 41 | ], 42 | "-num_threads": self.threads, 43 | "-evalue": 10, 44 | "-max_target_seqs": 50, 45 | "-culling_limit": 5, 46 | } 47 | self.blastcall = "blastn" 48 | 49 | def _search(self): 50 | command = [ 51 | self.blastcall, 52 | "-db", 53 | self.db_file, 54 | "-query", 55 | self.input_file, 56 | "-out", 57 | self.out_file, 58 | ] 59 | command.extend(self.format_args_for_cli()) 60 | self.run_as_subprocess(command, self.temp_log_file) 61 | 62 | def get_version_information(self) -> SearchToolVersion: 63 | try: 64 | result = subprocess.run( 65 | ["blastn", "-version"], capture_output=True, text=True, check=True 66 | ) 67 | tool_info = result.stdout.strip() 68 | 69 | result = subprocess.run( 70 | ["blastdbcmd", "-info", "-db", self.db_file, "-dbtype", "nucl"], 71 | capture_output=True, 72 | text=True, 73 | check=True, 74 | ) 75 | lines = result.stdout.splitlines() 76 | database_info: str = lines[5] + lines[3] 77 | 78 | return SearchToolVersion(tool_info, database_info) 79 | except (subprocess.CalledProcessError, FileNotFoundError): 80 | return SearchToolVersion() -------------------------------------------------------------------------------- /dev_scripts/run_blastx.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # Copyright (c) 2021-2024 International Biosecurity and Biosafety Initiative for Science 3 | ##################################################################### 4 | #run_blastx.sh runs blastx against a specified database. 
5 | ##################################################################### 6 | #Usage: run_blastx.sh -d DB -q QUERY -o OUTPUT [-t THREADS -f FURTHEROPT] 7 | 8 | #set -eux #debug mode 9 | set -eu 10 | DB="" 11 | QUERY="" 12 | OUTPUT="out" 13 | THREADS=1 14 | FURTHEROPT="" 15 | 16 | #Get options from user 17 | while getopts "t:d:q:o:f:" OPTION 18 | do 19 | case $OPTION in 20 | t) 21 | THREADS=$OPTARG 22 | ;; 23 | d) 24 | DB=$OPTARG 25 | ;; 26 | q) 27 | QUERY=$OPTARG 28 | ;; 29 | o) 30 | OUTPUT=$OPTARG 31 | ;; 32 | f) 33 | FURTHEROPT=$OPTARG 34 | ;; 35 | \?) 36 | echo "Usage: run_blastx.sh -d DB -q QUERY -o OUTPUT [-t THREADS -f FURTHEROPT]" 37 | echo " DB full path to database (required)" 38 | echo " QUERY query file to align to each database (required)" 39 | echo " OUTPUT output prefix for alignments (default: out)" 40 | echo " THREADS number of threads for each database run (default: 1)" 41 | echo " FURTHEROPT any further options to specify" 42 | exit 43 | ;; 44 | esac 45 | done 46 | 47 | #Check for values 48 | if [ "$DB" == "" ] || [ "$QUERY" == "" ] 49 | then 50 | echo "Usage: run_blastx.sh -d DB -q QUERY -o OUTPUT [-t THREADS -f FURTHEROPT]" 51 | echo " DB full path to database (required)" 52 | echo " QUERY query file to align to each database (required)" 53 | echo " OUTPUT output prefix for alignments (default: out)" 54 | echo " THREADS number of threads for each database run (default: 1)" 55 | echo " FURTHEROPT any further options to specify" 56 | exit 57 | fi 58 | 59 | #Check for database 60 | echo -e "\t...checking for valid options..." 61 | if [ ! -f "$DB".pal ] # this is v. blast specific 62 | then 63 | echo " ERROR: blastx database $DB does not exist" 64 | exit 65 | fi 66 | 67 | #Check for input file 68 | if [ ! -f "$QUERY" ] 69 | then 70 | echo " ERROR: input file $QUERY does not exist" 71 | exit 72 | fi 73 | 74 | echo -e "\t...running protein search..."
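# Note: the flags passed to blastx below mirror the BlastXHandler defaults in
# commec/tools/blastx.py (e-value 1e-10, word size 6, threshold 21, BLOSUM62, etc.);
# if either set of parameters changes, keep the two in sync.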
75 | blastx -db ${DB} -query ${QUERY} -out ${OUTPUT}.blastx -outfmt "7 qacc stitle sacc staxids evalue bitscore pident qlen qstart qend slen sstart send" -evalue 1e-10 -word_size 6 -threshold 21 -max_target_seqs 5000 -culling_limit 50 -window_size 40 -matrix BLOSUM62 -gapopen 11 -gapextend 1 -seg yes -num_threads ${THREADS} ${FURTHEROPT} 76 | -------------------------------------------------------------------------------- /commec/tests/test_split.py: -------------------------------------------------------------------------------- 1 | import os 2 | from unittest.mock import mock_open, patch 3 | import pytest 4 | from Bio import SeqIO 5 | from commec.split import clean_description, write_split_fasta 6 | 7 | 8 | @pytest.fixture 9 | def test_data_dir(): 10 | return os.path.join(os.path.dirname(__file__), "test_data") 11 | 12 | 13 | @pytest.fixture 14 | def fasta_records(test_data_dir): 15 | """Fixture to parse records from multiple FASTA files into a dictionary.""" 16 | files = [ 17 | "multiple_records.fasta", 18 | "single_record.fasta", 19 | "has_empty_record.fasta", 20 | "has_empty_description.fasta", 21 | ] 22 | record_dict = {} 23 | for filename in files: 24 | file_path = os.path.join(test_data_dir, filename) 25 | with open(file_path, "r", encoding="utf-8") as input_file: 26 | records = list(SeqIO.parse(input_file, "fasta")) 27 | record_dict[filename] = records 28 | return record_dict 29 | 30 | 31 | @pytest.mark.parametrize( 32 | "description, expected", 33 | [ 34 | ( 35 | 'BBa_K620001_P_22737_Coding_"WT-F87A_(p450)"', 36 | "BBa_K620001_P_22737_Coding_WT-F87A_p450", 37 | ), 38 | ("long description" * 20, "longdescription" * 10), 39 | ("", ""), 40 | ], 41 | ) 42 | def test_clean_description(description, expected): 43 | assert clean_description(description) == expected 44 | 45 | 46 | @pytest.mark.parametrize( 47 | "filename", 48 | [ 49 | "multiple_records.fasta", 50 | "single_record.fasta", 51 | "has_empty_record.fasta", 52 | "has_empty_description.fasta", 53 | ], 54 | ) 55 | @patch("builtins.open", new_callable=mock_open) 56 | @patch("os.path.join", side_effect=lambda a, b: f"{a}/{b}") 57 | @patch("commec.split.SeqIO.parse") 58 | def test_write_split_fasta( 59 | mock_seqio_parse, 60 | mock_os_path_join, 61 | mock_open, 62 | filename, 63 | test_data_dir, 64 | fasta_records, 65 | ): 66 | filepath = os.path.join(test_data_dir, filename) 67 | records = fasta_records[filename] 68 | mock_seqio_parse.return_value = records 69 | write_split_fasta(filepath) 70 | 71 | # Check the correct number of output files were opened (one input + as many outputs as records) 72 | assert mock_open.call_count == len(records) + 1 73 | 74 | for record in records: 75 | desc = clean_description(record.description) 76 | 77 | if desc: 78 | output_filename = f"{desc}.fasta" 79 | else: 80 | output_filename = f"{os.path.splitext(filename)[0]}-split-0.fasta" 81 | 82 | mock_os_path_join.assert_any_call(os.path.dirname(filepath), output_filename) 83 | mock_open.assert_any_call( 84 | os.path.join(os.path.dirname(filepath), output_filename), 85 | "w", 86 | encoding="utf-8", 87 | ) 88 | mock_open().write.assert_any_call(f">{desc}{os.linesep}") 89 | mock_open().write.assert_any_call(f"{record.seq}") 90 | -------------------------------------------------------------------------------- /dev_scripts/run_diamond.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # Copyright (c) 2021-2024 International Biosecurity and Biosafety Initiative for Science 3 | 
##################################################################### 4 | # run_diamond.sh runs DIAMOND against a specified NCBI nr database. 5 | # DIAMOND citation: 6 | # Buchfink B, Reuter K, Drost HG, "Sensitive protein alignments at 7 | # tree-of-life scale using DIAMOND", 8 | # Nature Methods 18, 366–368 (2021). 9 | # doi:10.1038/s41592-021-01101-x 10 | ##################################################################### 11 | 12 | set -eu 13 | 14 | # Default values 15 | JOBS="" 16 | THREADS=1 17 | DB_PATH="" 18 | INPUT="" 19 | OUTPUT="out" 20 | 21 | usage() { 22 | echo "Usage: run_diamond.sh -d MY_DB -i INPUT_FILE [-o OUTPUT_FILE] [-j JOBS] [-t THREADS]" 23 | echo " MY_DB location of NCBI nr database (required)" 24 | echo " INPUT_FILE input file to align to each database (required)" 25 | echo " OUTPUT_FILE output prefix for alignments (default: out)" 26 | echo " JOBS number of diamond runs to do in parallel (default: # CPUs / THREADS)" 27 | echo " THREADS number of threads for each diamond run (default: 1)" 28 | exit 1 29 | } 30 | 31 | # Parse command line arguments 32 | while getopts "j:t:d:i:o:" opt; do 33 | case $opt in 34 | j) JOBS=$OPTARG ;; 35 | t) THREADS=$OPTARG ;; 36 | d) DB_PATH=$OPTARG ;; 37 | i) INPUT=$OPTARG ;; 38 | o) OUTPUT=$OPTARG ;; 39 | \?) usage ;; 40 | esac 41 | done 42 | 43 | # Check for required arguments 44 | if [[ -z "$DB_PATH" || -z "$INPUT" ]]; then 45 | usage 46 | fi 47 | 48 | # Validate input 49 | if [[ ! -d "$DB_PATH" ]]; then 50 | echo "ERROR: nr diamond database folder $DB_PATH does not exist" >&2 51 | exit 1 52 | fi 53 | 54 | shopt -s failglob 55 | if ! files=("${DB_PATH}"/nr*.dmnd); then 56 | echo "ERROR: No nr diamond database files (nr*.dmnd) found in $DB_PATH" >&2 57 | exit 1 58 | fi 59 | shopt -u failglob 60 | 61 | if [[ ! -f "$INPUT" ]]; then 62 | echo "ERROR: input file $INPUT does not exist" >&2 63 | exit 1 64 | fi 65 | 66 | # Set JOBS if not specified by user 67 | if [[ -z "$JOBS" ]]; then 68 | CPU_COUNT=$(parallel --number-of-cpus) 69 | JOBS=$((CPU_COUNT / THREADS)) 70 | fi 71 | 72 | echo "Running diamond protein search..." 73 | echo "Using $JOBS job(s) in parallel with $THREADS thread(s) each" 74 | 75 | # Run diamond 76 | ls ${DB_PATH}/nr*.dmnd | parallel --will-cite --use-cpus-instead-of-cores --jobs ${JOBS} \ 77 | diamond blastx --quiet \ 78 | -d {} \ 79 | --threads ${THREADS} \ 80 | -q ${INPUT} \ 81 | -o ${OUTPUT}.{/.}.tsv \ 82 | --outfmt 6 qseqid stitle sseqid staxids evalue bitscore pident qlen qstart qend slen sstart send \ 83 | --frameshift 15 --range-culling 84 | 85 | # Combine results and clean up 86 | cat ${OUTPUT}.*.tsv > ${OUTPUT}.dmnd 87 | rm ${OUTPUT}.*.tsv 88 | 89 | echo "Diamond protein search completed. Results are in ${OUTPUT}.dmnd" 90 | -------------------------------------------------------------------------------- /commec/tools/blastx.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) 2021-2024 International Biosecurity and Biosafety Initiative for Science 3 | """ 4 | Handler for BLASTX search of protein databases using nucleotide queries. 5 | Initialise with local input database, fasta to screen, and output file. 6 | Throws error if inputs are invalid. Creates a temporary log file, which is deleted on completion. 
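A minimal usage sketch (file paths here are placeholders):

    handler = BlastXHandler("path/to/nr", "queries.fasta", "queries.nr.blastx")
    handler.arguments_dictionary["-evalue"] = 1e-20  # optionally override a default before searching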
7 | """ 8 | 9 | import subprocess 10 | from commec.tools.blast_tools import BlastHandler 11 | from commec.tools.search_handler import SearchToolVersion 12 | 13 | 14 | class BlastXHandler(BlastHandler): 15 | """ 16 | A search handler specifically for BLASTX command-line during commec screening. 17 | Modify `arguments_dictionary` to change arguments passed to the CLI. 18 | """ 19 | 20 | def __init__( 21 | self, database_file: str, input_file: str, out_file: str, **kwargs, 22 | ): 23 | super().__init__(database_file, input_file, out_file, **kwargs) 24 | # We fill this with defaults, however they can always be overridden before screening. 25 | self.arguments_dictionary = { 26 | "-num_threads": self.threads, 27 | "-evalue": 1e-10, 28 | "-word_size": 6, 29 | "-threshold": 21, 30 | "-max_target_seqs": 5000, 31 | "-culling_limit": 50, 32 | "-window_size": 40, 33 | "-matrix": "BLOSUM62", 34 | "-gapopen": 11, 35 | "-gapextend": 1, 36 | "-seg": "yes", 37 | "-outfmt": [ 38 | "7", 39 | "qacc", 40 | "stitle", 41 | "sacc", 42 | "staxids", 43 | "evalue", 44 | "bitscore", 45 | "pident", 46 | "qlen", 47 | "qstart", 48 | "qend", 49 | "slen", 50 | "sstart", 51 | "send", 52 | ], 53 | } 54 | self.blastcall = "blastx" 55 | 56 | def _search(self): 57 | command = [ 58 | self.blastcall, 59 | "-db", 60 | self.db_file, 61 | "-query", 62 | self.input_file, 63 | "-out", 64 | self.out_file, 65 | ] 66 | command.extend(self.format_args_for_cli()) 67 | self.run_as_subprocess(command, self.temp_log_file) 68 | 69 | def get_version_information(self) -> SearchToolVersion: 70 | try: 71 | result = subprocess.run( 72 | ["blastx", "-version"], capture_output=True, text=True, check=True 73 | ) 74 | tool_info = result.stdout.strip() 75 | 76 | result = subprocess.run( 77 | ["blastdbcmd", "-info", "-db", self.db_file, "-dbtype", "prot"], 78 | capture_output=True, 79 | text=True, 80 | check=True, 81 | ) 82 | lines = result.stdout.splitlines() 83 | database_info: str = lines[5] + lines[3] 84 | 85 | return SearchToolVersion(tool_info, database_info) 86 | 87 | except (subprocess.CalledProcessError, FileNotFoundError): 88 | return SearchToolVersion() 89 | -------------------------------------------------------------------------------- /commec/cli.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) 2021-2024 International Biosecurity and Biosafety Initiative for Science 3 | """ 4 | Command-line entrypoint for the package. Calls `screen.py`, `flag.py` and `split.py` as subcommands. 5 | 6 | The subcommands: 7 | screen Run Common Mechanism screening on an input FASTA. 
8 | flag Parse all .screen files in a directory and create two CSVs file of flags raised 9 | split Split a multi-record FASTA file into individual files, one for each record 10 | 11 | Command-line usage: 12 | - commec screen -d /path/to/databases input.fasta 13 | - commec flag /path/to/directory/with/output.screen 14 | - commec split input.fasta 15 | - commec -h, --help 16 | - commec -v, --version 17 | """ 18 | from commec.flag import ( 19 | DESCRIPTION as flag_DESCRIPTION, 20 | add_args as flag_add_args, 21 | run as flag_run, 22 | ) 23 | from commec.screen import ( 24 | DESCRIPTION as screen_DESCRIPTION, 25 | add_args as screen_add_args, 26 | run as screen_run, 27 | ScreenArgumentParser 28 | ) 29 | from commec.split import ( 30 | DESCRIPTION as split_DESCRIPTION, 31 | add_args as split_add_args, 32 | run as split_run, 33 | ) 34 | from commec.setup import ( 35 | DESCRIPTION as setup_DESCRIPTION, 36 | add_args as setup_add_args, 37 | run as setup_run, 38 | ) 39 | 40 | from commec import __version__ as COMMEC_VERSION 41 | 42 | def main(): 43 | """ 44 | Parse the command line arguments and call the relevant sub-command. 45 | """ 46 | parser = ScreenArgumentParser( 47 | prog="commec", description="Command-line entrypoint for the Common Mechanism" 48 | ) 49 | # Sub argument for version information 50 | parser.add_argument( 51 | "-v", 52 | "--version", 53 | dest="version", 54 | action="store_true", 55 | default=False, 56 | help="show version information and exit", 57 | ) 58 | 59 | # Setup sub parsers: 60 | subparsers = parser.add_subparsers(dest="command") 61 | 62 | # Sub-command for "screen" 63 | screen_parser = subparsers.add_parser("screen", description=screen_DESCRIPTION) 64 | screen_add_args(screen_parser) 65 | 66 | # Sub-command for "flag" 67 | flag_parser = subparsers.add_parser("flag", description=flag_DESCRIPTION) 68 | flag_add_args(flag_parser) 69 | 70 | # Sub-command for "split" 71 | split_parser = subparsers.add_parser("split", description=split_DESCRIPTION) 72 | split_add_args(split_parser) 73 | 74 | # Sub-command for "setup" 75 | setup_parser = subparsers.add_parser("setup", description=setup_DESCRIPTION) 76 | setup_add_args(setup_parser) 77 | 78 | args = parser.parse_args() 79 | 80 | if args.command == "screen": 81 | screen_run(args) 82 | elif args.command == "flag": 83 | flag_run(args) 84 | elif args.command == "split": 85 | split_run(args) 86 | elif args.command == "setup": 87 | setup_run(args) 88 | elif args.version: 89 | print( "Commec : The Common Mechanism\n" 90 | f"Version : {COMMEC_VERSION}\n" 91 | "Copyright IBBIS (c) 2021-2025\n" 92 | "International Biosecurity and Biosafety Initiative for Science") 93 | else: 94 | parser.print_help() 95 | 96 | 97 | if __name__ == "__main__": 98 | main() 99 | -------------------------------------------------------------------------------- /commec/tools/cmscan.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) 2021-2024 International Biosecurity and Biosafety Initiative for Science 3 | """ 4 | Cmscan search handler, and calling cmscan command line interface. 5 | Additional methods for reading handler output, readcmscan, which returns a pandas database. 6 | Instantiate a CmscanHandler, with input local database, input fasta, and output file. 7 | Throws if inputs are invalid. Creates a temporary log file, which is deleted on completion. 
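For example (paths are placeholders), CmscanHandler("benign.cm", "queries.fasta", "out.cmscan")
exposes read_output(), which returns the parsed table with columns renamed to the blast-style
names used by the rest of the pipeline ("q. start", "q. end", "subject title", "evalue", ...).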
8 | """ 9 | import subprocess 10 | import re 11 | import pandas as pd 12 | from commec.tools.search_handler import SearchHandler, SearchToolVersion 13 | 14 | 15 | class CmscanHandler(SearchHandler): 16 | """A search handler specifically for running Infernal cmscan against covariance model databases during commec screening.""" 17 | 18 | def _search(self): 19 | command = [ 20 | "cmscan", 21 | "--cpu", 22 | str(self.threads), 23 | "--tblout", 24 | self.out_file, 25 | self.db_file, 26 | self.input_file, 27 | ] 28 | self.run_as_subprocess(command, self.temp_log_file) 29 | 30 | def read_output(self): 31 | output_dataframe = readcmscan(self.out_file) 32 | # Standardize the output column names to be like blast: 33 | output_dataframe = output_dataframe.rename(columns={ 34 | "seq from": "q. start", 35 | "seq to": "q. end", 36 | "coverage": "q. coverage", 37 | "target name": "subject title", 38 | "mdl from": "s. start", 39 | "mdl to" : "s. end", 40 | 'E-value': "evalue", 41 | }) 42 | return output_dataframe 43 | 44 | 45 | def get_version_information(self) -> SearchToolVersion: 46 | try: 47 | database_info = None 48 | with open(self.db_file, "r", encoding="utf-8") as file: 49 | for line in file: 50 | if line.startswith("INFERNAL1/a"): 51 | database_info = line.strip() 52 | continue 53 | # Early exit if data has been found 54 | if database_info: 55 | break 56 | 57 | result = subprocess.run( 58 | ["cmscan", "-h"], capture_output=True, text=True, check=True 59 | ) 60 | tool_info = result.stdout.splitlines()[1].strip()[2:] or "error retrieving info" 61 | 62 | return SearchToolVersion(tool_info, database_info or "error") 63 | 64 | except (subprocess.CalledProcessError, FileNotFoundError): 65 | return None 66 | 67 | 68 | def readcmscan(fileh): 69 | """ 70 | Read cmscan tabular output files into a pandas DataFrame. 71 | """ 72 | columns = [ 73 | "target name", 74 | "accession", 75 | "query name", 76 | "accession", 77 | "mdl", 78 | "mdl from", 79 | "mdl to", 80 | "seq from", 81 | "seq to", 82 | "strand", 83 | "trunc", 84 | "pass", 85 | "gc", 86 | "bias", 87 | "score", 88 | "E-value", 89 | "inc", 90 | "description of target", 91 | ] 92 | 93 | cmscan = [] 94 | 95 | with open(fileh, "r", encoding="utf-8") as f: 96 | for line in f: 97 | if "# Program: cmscan" in line: 98 | break 99 | if "#" in line: 100 | continue 101 | bits = re.split(r"\s+", line) 102 | description = " ".join(bits[17:]) 103 | bits = bits[:17] 104 | bits.append(description) 105 | cmscan.append(bits) 106 | cmscan = pd.DataFrame(cmscan, columns=columns) 107 | cmscan["E-value"] = pd.to_numeric(cmscan["E-value"]) 108 | cmscan["score"] = pd.to_numeric(cmscan["score"]) 109 | cmscan["seq from"] = pd.to_numeric(cmscan["seq from"]) 110 | cmscan["seq to"] = pd.to_numeric(cmscan["seq to"]) 111 | 112 | return cmscan -------------------------------------------------------------------------------- /commec/tests/test_aa_to_nt.py: -------------------------------------------------------------------------------- 1 | """ 2 | Simple tests to ensure the correct conversion of per-frame amino-acid HMMER hit coordinates 3 | back to nucleotide query coordinates by recalculate_hmmer_query_coordinates. 4 | The following behaviour is expected: 5 | * Forward-frame hits (frames 1-3) map directly onto forward-strand nucleotide coordinates. 6 | * Reverse-frame hits (frames 4-6) map onto reverse-strand coordinates, matching observed HMMER behaviour. 7 | * Hits from different frames are handled independently.
8 | """ 9 | 10 | import pandas as pd 11 | from pandas.testing import assert_frame_equal 12 | import pytest 13 | from commec.tools.hmmer import recalculate_hmmer_query_coordinates 14 | 15 | # Example DataFrame 16 | example_hmmer_01 = pd.DataFrame({ 17 | "query name": ["F1","F2","F3","R1","R2", "R3"], 18 | "frame": [1,2,3,4,5,6], 19 | "ali from": [1, 2, 3, 1, 2, 3], 20 | "ali to": [4, 5, 6, 4, 5, 6], 21 | "nt_qlen": [31, 31, 31, 31, 31, 31] 22 | }) 23 | 24 | # Logically, we would expect this to match Fwd and Rev all frame AA to NT coordinates: 25 | # 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 26 | # [ 1 ] [ 2 ] [ 3 ] [ 4 ] [ 5 ] [ 6 ] [ 7 ] [ 8 ] [ 9 ] [ 10 ] 27 | # [ 1 ] [ 2 ] [ 3 ] [ 4 ] [ 5 ] [ 6 ] [ 7 ] [ 8 ] [ 9 ] [ 10 ] 28 | # [ 1 ] [ 2 ] [ 3 ] [ 4 ] [ 5 ] [ 6 ] [ 7 ] [ 8 ] [ 9 ] 29 | # [ 10 ] [ 9 ] [ 8 ] [ 7 ] [ 6 ] [ 5 ] [ 4 ] [ 3 ] [ 2 ] [ 1 ] 30 | # [ 9 ] [ 8 ] [ 7 ] [ 6 ] [ 5 ] [ 4 ] [ 3 ] [ 2 ] [ 1 ] 31 | # [ 9 ] [ 8 ] [ 7 ] [ 6 ] [ 5 ] [ 4 ] [ 3 ] [ 2 ] [ 1 ] [ -1 ] 32 | 33 | # However, HMMER Biorisk behaves like the following 34 | #(According to testing with BBa_I766605 YopH-EE under medium constitutive promotor, and reverse complements.) 35 | # 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 36 | # [ 1 ] [ 2 ] [ 3 ] [ 4 ] [ 5 ] [ 6 ] [ 7 ] [ 8 ] [ 9 ] [ 10 ] 37 | # [ 1 ] [ 2 ] [ 3 ] [ 4 ] [ 5 ] [ 6 ] [ 7 ] [ 8 ] [ 9 ] [ 10 ] 38 | # [ 1 ] [ 2 ] [ 3 ] [ 4 ] [ 5 ] [ 6 ] [ 7 ] [ 8 ] [ 9 ] 39 | # [ 10 ] [ 9 ] [ 8 ] [ 7 ] [ 6 ] [ 5 ] [ 4 ] [ 3 ] [ 2 ] [ 1 ] 40 | # [ 9 ] [ 8 ] [ 7 ] [ 6 ] [ 5 ] [ 4 ] [ 3 ] [ 2 ] [ 1 ] 41 | # [ 9 ] [ 8 ] [ 7 ] [ 6 ] [ 5 ] [ 4 ] [ 3 ] [ 2 ] [ 1 ] [ -1 ] 42 | 43 | # Example DataFrame 44 | original_expected_output = pd.DataFrame({ 45 | "query name": ["F1","F2","F3","R1","R2", "R3"], 46 | "frame": [1,2,3,4,5,6], 47 | "ali from": [1, 2, 3, 1, 2, 3], 48 | "ali to": [4, 5, 6, 4, 5, 6], 49 | "nt_qlen": [31, 31, 31, 31, 31, 31], 50 | "q. start": [ 1, 5, 9, 19, 15, 11], 51 | "q. end": [12, 16, 20, 30, 26, 22] 52 | }) 53 | 54 | # Example DataFrame which matches biorisk hmmer outputs: 55 | example_hmmer_01_output = pd.DataFrame({ 56 | "query name": ["F1","F2","F3","R1","R2", "R3"], 57 | "frame": [1,2,3,4,5,6], 58 | "ali from": [1, 2, 3, 1, 2, 3], 59 | "ali to": [4, 5, 6, 4, 5, 6], 60 | "nt_qlen": [31, 31, 31, 31, 31, 31], 61 | "q. start": [ 1, 5, 9, 21, 17, 13], 62 | "q. end": [12, 16, 20, 32, 28, 24] 63 | }) 64 | 65 | @pytest.mark.parametrize( 66 | "input_hmmer, expected_output_hmmer", 67 | [ 68 | (example_hmmer_01, example_hmmer_01_output), 69 | ] 70 | ) 71 | def test_hmmer_overlaps( 72 | input_hmmer : pd.DataFrame, 73 | expected_output_hmmer : pd.DataFrame 74 | ): 75 | """ 76 | Checks common configurations that require trimming in Hmmer outputs, 77 | In particular partial overlaps, 78 | full encapsulations, score differences, 79 | and different queries. 
80 | """ 81 | print("INPUT:") 82 | print(input_hmmer) 83 | print(input_hmmer.dtypes) 84 | recalculate_hmmer_query_coordinates(input_hmmer) 85 | print("PROCESSED:") 86 | print(input_hmmer) 87 | print(input_hmmer.dtypes) 88 | print("CORRECT:") 89 | print(expected_output_hmmer) 90 | print(expected_output_hmmer.dtypes) 91 | assert_frame_equal(input_hmmer, expected_output_hmmer) 92 | #assert input_hmmer.equals(expected_output_hmmer) 93 | -------------------------------------------------------------------------------- /commec/tests/test_blast_tools.py: -------------------------------------------------------------------------------- 1 | from io import StringIO 2 | import pytest 3 | import textwrap 4 | from unittest.mock import patch 5 | import numpy as np 6 | import pandas as pd 7 | from commec.tools.blast_tools import _split_by_tax_id, read_blast, _get_lineages, get_taxonomic_labels 8 | 9 | 10 | @pytest.fixture 11 | def blast_df(): 12 | """ 13 | Return a dataframe containing 3 BLAST hits, 2 with multiple taxids, 1 of which is invalid and 1 14 | of which is a synthetic taxid 15 | """ 16 | blast_to_parse = textwrap.dedent( 17 | """\ 18 | # BLASTX 2.15.0+ 19 | # Query: NC_TEST 20 | # Database: /root/commec-dbs/mock 21 | #query acc. subject title subject acc. subject tax ids evalue bit score % identity query length q. start q. end subject length s. start s. end 22 | # 3 hits found 23 | BT_01 SUBJECT SUBJECT_ACC 2371;644357 0.0 BITSCORE 99.999 300 101 200 500 1 100 24 | BT_01 SUBJECT SUBJECT_ACC 10760;110011001100 0.0 BITSCORE 99.999 300 25 80 500 1 100 25 | BT_01 SUBJECT SUBJECT_ACC 32630 0.0 BITSCORE 99.999 300 275 300 500 1 100 26 | """ 27 | ) 28 | return read_blast(StringIO(blast_to_parse)) 29 | 30 | 31 | @pytest.fixture 32 | def lineage_df(): 33 | """ 34 | Dataframe subsetting columns from the results of pytaxonkit.lineage applied to blast_df 35 | """ 36 | return pd.DataFrame( 37 | { 38 | "TaxID": [2371, 644357, 10760, 110011001100, 32630], 39 | "Code": [2371, 644357, 10760, -1, 32630], 40 | "FullLineage": [ 41 | "cellular organisms;Bacteria;Pseudomonadota;Gammaproteobacteria;Lysobacterales;Lysobacteraceae;Xylella;Xylella fastidiosa", 42 | "cellular organisms;Bacteria;Pseudomonadota;Gammaproteobacteria;Lysobacterales;Lysobacteraceae;Xylella;Xylella fastidiosa;Xylella fastidiosa subsp. 
multiplex", 43 | "Viruses;Duplodnaviria;Heunggongvirae;Uroviricota;Caudoviricetes;Autographiviridae;Studiervirinae;Teseptimavirus;Teseptimavirus T7;Escherichia phage T7", 44 | np.nan, 45 | "other entries;other sequences;artificial sequences;synthetic construct", 46 | ], 47 | "FullLineageTaxIDs": [ 48 | "131567;2;1224;1236;135614;32033;2370;2371", 49 | "131567;2;1224;1236;135614;32033;2370;2371;644357", 50 | "10239;2731341;2731360;2731618;2731619;2731643;2731653;110456;1985738;10760", 51 | np.nan, 52 | "2787854;28384;81077;32630", 53 | ], 54 | "FullLineageRanks": [ 55 | "no rank;superkingdom;phylum;class;order;family;genus;species", 56 | "no rank;superkingdom;phylum;class;order;family;genus;species;subspecies", 57 | "superkingdom;clade;kingdom;phylum;class;family;subfamily;genus;species;no rank", 58 | np.nan, 59 | "no rank;no rank;no rank;species", 60 | ], 61 | } 62 | ) 63 | 64 | 65 | def test_split_by_tax_id(blast_df: pd.DataFrame): 66 | assert len(blast_df) == 3 67 | split_blast = _split_by_tax_id(blast_df) 68 | assert len(split_blast) == 5 69 | expected_tax_ids = {2371, 644357, 10760, 110011001100, 32630} 70 | assert set(split_blast["subject tax ids"]) == expected_tax_ids 71 | 72 | 73 | @patch("pytaxonkit.lineage") 74 | def test_get_lineages(mock_lineage, blast_df, lineage_df): 75 | mock_lineage.return_value = lineage_df 76 | blast_df = _split_by_tax_id(blast_df) 77 | lin = _get_lineages( 78 | blast_df["subject tax ids"], "commec-dbs/taxonomy/", 8 79 | ) 80 | # Expect the invalid taxid to be filtered out 81 | expected_tax_ids = {2371, 644357, 10760, 32630} 82 | assert set(lin["TaxID"]) == expected_tax_ids 83 | 84 | 85 | @patch("pytaxonkit.lineage") 86 | def test_taxdist(mock_lineage, blast_df, lineage_df): 87 | mock_lineage.return_value = lineage_df 88 | # Fake values - should find 1 regulated hit after filtering 89 | reg_taxids = ['644357', '10760'] 90 | vax_taxids = ['10760'] 91 | reg_df = get_taxonomic_labels( 92 | blast_df, reg_taxids, vax_taxids, "commec-dbs/taxonomy/", 8 93 | ) 94 | # Expect the synthetic taxid to be filtered out 95 | expected_tax_ids = {2371, 644357, 10760} 96 | assert set(reg_df["subject tax ids"]) == expected_tax_ids 97 | 98 | # Expect only taxid 644357 to be marked as "regulated" 99 | assert reg_df[reg_df["subject tax ids"] == 2371]["regulated"].iloc[0] == False 100 | assert reg_df[reg_df["subject tax ids"] == 644357]["regulated"].iloc[0] == True 101 | assert reg_df[reg_df["subject tax ids"] == 10760]["regulated"].iloc[0] == False 102 | -------------------------------------------------------------------------------- /commec/tools/fetch_nc_bits.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) 2021-2024 International Biosecurity and Biosafety Initiative for Science 3 | """ 4 | Fetch parts of a query that had no high-quality protein matches for use in nucloetide screening. 5 | 6 | Usage: 7 | fetch_nc_bits.py query_name fasta_file_path 8 | """ 9 | import argparse 10 | import logging 11 | import shutil 12 | import re 13 | import pandas as pd 14 | from Bio import SeqIO 15 | from commec.config.query import Query 16 | from commec.tools.blast_tools import get_high_identity_hits 17 | from commec.tools.search_handler import SearchHandler 18 | from commec.config.result import ScreenStatus 19 | 20 | logger = logging.getLogger(__name__) 21 | 22 | def _get_ranges_with_no_hits(input_df : pd.DataFrame): 23 | """ 24 | Get indices not covered by the query start / end ranges in the BLAST results. 
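A minimal illustrative sketch (hypothetical values, similar to the cases exercised in test_fetch_nc_bits.py):

    >>> example = pd.DataFrame({"q. start": [1, 140], "q. end": [40, 265], "query length": [300, 300]})
    >>> _get_ranges_with_no_hits(example)
    [(41, 139)]

Only gaps of at least 50 bases are reported, so the trailing 266-300 stretch above is not returned.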
25 | """ 26 | 27 | assert "q. start" in input_df.columns, ( 28 | "Column \"q. start\" does not exist for get_ranges_with_no_hits().\n" 29 | f"Existing columns: {', '.join(input_df.columns)}" 30 | ) 31 | 32 | assert "q. end" in input_df.columns, ( 33 | "Column \"q. end\" does not exist for get_ranges_with_no_hits().\n" 34 | f"Existing columns: {', '.join(input_df.columns)}" 35 | ) 36 | 37 | assert "query length" in input_df.columns, ( 38 | "Column \"query length\" does not exist for get_ranges_with_no_hits().\n" 39 | f"Existing columns: {', '.join(input_df.columns)}" 40 | ) 41 | 42 | assert not input_df.empty, "Input dataframe for get_ranges_with_no_hits() is empty." 43 | 44 | unique_hits = input_df.drop_duplicates(subset=["q. start", "q. end"]) 45 | hit_ranges = unique_hits[["q. start", "q. end"]].values.tolist() 46 | 47 | # Sort each pair to ensure that start < end, then sort entire list of ranges 48 | hit_ranges = sorted([sorted(pair) for pair in hit_ranges]) 49 | 50 | nc_ranges : list[tuple[int,int]] = [] 51 | 52 | # Include the start if the first hit begins more than 50 bp after the start 53 | if hit_ranges[0][0] > 50: 54 | nc_ranges.append((1, hit_ranges[0][0] - 1)) 55 | 56 | # Add ranges if there is a noncoding region of >=50 between hits 57 | for i in range(len(hit_ranges) - 1): 58 | nc_start = hit_ranges[i][1] + 1 # starts after this hit 59 | nc_end = hit_ranges[i + 1][0] - 1 # ends before next hit 60 | 61 | if nc_end - nc_start + 1 >= 50: 62 | nc_ranges.append((nc_start, nc_end)) 63 | 64 | # Include the end if the last hit ends more than 50 bp before the end 65 | query_length = input_df["query length"].iloc[0] 66 | if query_length - hit_ranges[-1][1] >= 50: 67 | nc_ranges.append((hit_ranges[-1][1] + 1, int(query_length))) 68 | 69 | return nc_ranges 70 | 71 | def _set_no_coding_regions(query : Query): 72 | """Set the query to be entirely non-coding (i.e. no high-quality protein hits).""" 73 | query.non_coding_regions.append((1, query.length)) 74 | 75 | def calculate_noncoding_regions_per_query( 76 | protein_search_handler : SearchHandler, 77 | queries : dict[str, Query] 78 | ): 79 | """ 80 | Fetch noncoding regions > 50bp for every query, and 81 | updates the Query dictionary to include non-coding meta-data. 82 | """ 83 | logger.debug("Checking protein hits in: %s", protein_search_handler.out_file) 84 | 85 | if not protein_search_handler.has_hits(): 86 | logger.info("No protein hits found, screening entire sequence.") 87 | for query in queries.values(): 88 | _set_no_coding_regions(query) 89 | return 90 | 91 | protein_hits = get_high_identity_hits(protein_search_handler.out_file) 92 | 93 | query_col = "query acc." 94 | 95 | for query in queries.values(): 96 | protein_hits_for_query = protein_hits[protein_hits[query_col] == query.name].copy() 97 | 98 | if protein_hits_for_query.empty: 99 | logger.info("No protein hits found for %s, screening entire sequence.", query.name) 100 | _set_no_coding_regions(query) 101 | continue 102 | 103 | # Correcting query length in nc coordinate output. 
104 | protein_hits_for_query.loc[:, "q.len"] = query.length 105 | 106 | logger.debug("\t --> Protein hits found for %s, fetching nt regions not covered by a 90%% ID hit or better", query.name) 107 | 108 | ranges_to_screen = _get_ranges_with_no_hits(protein_hits_for_query) 109 | # if the entire sequence, save regions <50 bases, is covered with protein, skip nt scan 110 | if not ranges_to_screen: 111 | logger.info("\t --> no noncoding regions >= 50 bases found for %s, skipping nt scan for query.", query.name) 112 | query.result.status.nucleotide_taxonomy = ScreenStatus.SKIP 113 | continue 114 | 115 | # Update the list of start and end non-coding tuples for query. 116 | query.non_coding_regions.extend(ranges_to_screen) 117 | 118 | ranges_str = ", ".join(f"{start}-{end}" for start, end in ranges_to_screen) 119 | logger.info("\t --> Identified noncoding regions for query %s: [%s]", query.name, ranges_str) 120 | -------------------------------------------------------------------------------- /commec/tests/test_fetch_nc_bits.py: -------------------------------------------------------------------------------- 1 | from io import StringIO 2 | import os 3 | import pandas as pd 4 | import pytest 5 | import textwrap 6 | from Bio import SeqIO 7 | from unittest.mock import patch 8 | 9 | from commec.tools.fetch_nc_bits import ( 10 | _get_ranges_with_no_hits, 11 | calculate_noncoding_regions_per_query, 12 | ) 13 | 14 | from commec.config.screen_io import ScreenIO 15 | from commec.config.result import QueryResult, ScreenStatus 16 | from commec.screen import add_args, ScreenArgumentParser 17 | from commec.tools.blastx import BlastXHandler 18 | 19 | DATABASE_DIRECTORY = os.path.join(os.path.dirname(__file__), "test_dbs") 20 | 21 | @pytest.mark.parametrize( 22 | "hits, nc_ranges", 23 | [ 24 | # Two protein hits, no noncoding regions > 50bp 25 | ([(1, 50), (100, 150), (175, 299)], []), 26 | # One protein hit, < 50bp nocoding regions on the ends 27 | ([(50, 251)], []), 28 | # One protein hit, > 50bp nocoding regions on the ends 29 | ([(51, 250)], [(1, 50), (251, 300)]), 30 | # Three protein hits, one noncoding region >50bp 31 | ( 32 | [(1, 40), (140, 265), (300, 349)], 33 | [(41, 139)], 34 | ), 35 | ], 36 | ) 37 | def test_get_ranges_with_no_hits(hits, nc_ranges): 38 | """ 39 | Test the BLAST hits are successfully converted into noncoding ranges. 40 | """ 41 | 42 | def _create_mock_blast_df_from(hits): 43 | data = { 44 | "q. start": [hit[0] for hit in hits], 45 | "q. 
end": [hit[1] for hit in hits], 46 | "query length": [300] * len(hits), 47 | } 48 | df = pd.DataFrame(data) 49 | return df.reset_index(drop=True) # This adds a numeric index 50 | 51 | blast_df = _create_mock_blast_df_from(hits) 52 | assert _get_ranges_with_no_hits(blast_df) == nc_ranges 53 | 54 | 55 | def test_fetch_nocoding_regions(tmp_path): 56 | """Full test, including file parsing.""" 57 | 58 | desc_1 = "NC_TEST01" 59 | desc_2 = "NC_TEST02" 60 | seq_1 = textwrap.dedent( 61 | """\ 62 | ggtagttccctaaacttatcattaagcgatcttcatcgtcaggtatctcgattggtgcagcaagagagcggtgattgt 63 | accgggaaattaagaggtaacgttgctgccaataaagaaactacctttcaaggtttgaccatagccagtggagccaga 64 | gagtcagaaaaagtatttgctcaaactgtactaagccacgtagcaaatgttgttctaactcaagaagataccgctaag 65 | ctattgcaaagtacggtaaagcataatttgaataattatgacttaagaagtgtcggcaatggtaat 66 | """ 67 | ) 68 | seq_2 = textwrap.dedent( 69 | """\ 70 | atggcacaagtcattaataccaacagcctctcgctgatcactcaaaataatatcaacaagaaccagtctgcgctgtcg 71 | agttctatcgagcgtctgtcttctggcttgcgtattaacagcgcgaaggatgacgcagcgggtcaggcgattgctaac 72 | cgtttcacctctaacattaaaggcctgactcaggcggcccgtaacgccaacgacggtatctccgttgcgcagaccacc 73 | gaaggcgcgctgtccgaaatcaacaacaacttacagcgtgtgcgtgaactgacggtacaggccact 74 | """ 75 | ) 76 | 77 | blast_to_parse = textwrap.dedent( 78 | """\ 79 | # BLASTX 2.15.0+ 80 | # Query: NC_TEST 81 | # Database: /root/commec-dbs/mock 82 | #query acc. subject title subject acc. subject tax ids evalue bit score % identity query length q. start q. end subject length s. start s. end 83 | # 3 hits found 84 | NC_TEST01 SUBJECT SUBJECT_ACC TAXID 0.0 BITSCORE 99.999 300 101 200 500 1 100 85 | NC_TEST02 SUBJECT SUBJECT_ACC TAXID 0.0 BITSCORE 99.999 300 25 80 500 1 100 86 | NC_TEST02 SUBJECT SUBJECT_ACC TAXID 0.0 BITSCORE 99.999 300 100 300 500 1 100 87 | """ 88 | ) 89 | 90 | expected_output = textwrap.dedent( 91 | """\ 92 | >NC_TEST01 (1-100) (201-300) 93 | ggtagttccctaaacttatcattaagcgatcttcatcgtcaggtatctcgattggtgcagcaagagagcggtgattgtaccgggaaattaagaggtaacgaaatgttgttctaactcaagaagataccgctaagctattgcaaagtacggtaaagcataatttgaataattatgacttaagaagtgtcggcaatggtaat 94 | """ 95 | ) 96 | 97 | # Setup Expected files 98 | input_fasta = tmp_path / "fetch_nc_input.fasta" 99 | input_fasta.write_text(f">{desc_1}\n{seq_1}\n>{desc_2}\n{seq_2}\n") 100 | input_blast = tmp_path / "fetch_nc_input.blastx" 101 | input_blast.write_text(blast_to_parse) 102 | 103 | # Create Dictionary of queries for funciton input. 104 | with patch( 105 | "sys.argv", 106 | ["test.py", "--skip-tx", str(input_fasta), "-d", str(DATABASE_DIRECTORY), "-o", str(tmp_path)], 107 | ): 108 | parser = ScreenArgumentParser() 109 | add_args(parser) 110 | screen_io = ScreenIO(parser.parse_args()) 111 | screen_io.setup() 112 | 113 | queries = screen_io.parse_input_fasta() 114 | for query in queries.values(): 115 | query.result = QueryResult() 116 | 117 | # Setup result handler for function input. 118 | db_file = os.path.join(DATABASE_DIRECTORY, "nr_blast/nr") 119 | handler = BlastXHandler(db_file, input_fasta, input_blast, force=True) 120 | 121 | calculate_noncoding_regions_per_query(handler, queries) 122 | 123 | # Generate the non-coding fasta text. 
124 | actual_output = "" 125 | for query in queries.values(): 126 | if query.result.status.nucleotide_taxonomy == ScreenStatus.SKIP: 127 | continue 128 | actual_output += query.get_non_coding_regions_as_fasta() 129 | 130 | assert actual_output.strip() == expected_output.strip() 131 | -------------------------------------------------------------------------------- /.github/workflows/automate_release.yml: -------------------------------------------------------------------------------- 1 | name: Automate Release 2 | 3 | on: 4 | workflow_dispatch: 5 | inputs: 6 | release_branch: 7 | description: 'Branch to release (e.g. release_v1.2.3)' 8 | required: true 9 | type: string 10 | version_string: 11 | description: 'Version string (e.g. 1.2.3)' 12 | required: true 13 | type: string 14 | 15 | permissions: 16 | contents: write 17 | pull-requests: write 18 | 19 | jobs: 20 | release: 21 | runs-on: ubuntu-latest 22 | 23 | steps: 24 | - name: Check token permissions 25 | run: gh auth status 26 | env: 27 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 28 | 29 | - name: Checkout release branch 30 | uses: actions/checkout@v4 31 | with: 32 | ref: ${{ github.event.inputs.release_branch }} 33 | 34 | #- name: Set up Python 35 | # uses: actions/setup-python@v5 36 | # with: 37 | # python-version: '3.11' 38 | 39 | - name: Check and update pyproject.toml version 40 | run: | 41 | ver="${{ github.event.inputs.version_string }}" 42 | current=$(grep "^version = '" pyproject.toml | head -n1 | cut -d"'" -f2) 43 | if [ "$current" != "$ver" ]; then 44 | sed -i "s/^version = '.*'/version = '$ver'/" pyproject.toml 45 | echo "updated_pyproject=true" >> $GITHUB_ENV 46 | fi 47 | 48 | - name: Check and update conda meta.yaml version 49 | run: | 50 | ver="${{ github.event.inputs.version_string }}" 51 | current=$(grep '{% set version = "' conda-recipe/meta.yaml | head -n1 | cut -d'"' -f2) 52 | if [ "$current" != "$ver" ]; then 53 | sed -i "s/{% set version = \".*\" %}/{% set version = \"$ver\" %}/" conda-recipe/meta.yaml 54 | echo "updated_meta=true" >> $GITHUB_ENV 55 | fi 56 | 57 | - name: Update functional test JSON with new version 58 | if: env.updated_pyproject == 'true' 59 | run: | 60 | ver="${{ github.event.inputs.version_string }}" 61 | json_file="commec/tests/test_data/functional.json" 62 | 63 | if [ -f "$json_file" ]; then 64 | # Use jq to replace the version field safely 65 | tmpfile=$(mktemp) 66 | jq --arg ver "$ver" '.commec_info.commec_version = $ver' "$json_file" > "$tmpfile" && mv "$tmpfile" "$json_file" 67 | echo "Updated commec_version in $json_file to $ver" 68 | else 69 | echo "File $json_file not found!" 70 | exit 1 71 | fi 72 | 73 | - name: Commit version changes (if needed) 74 | if: env.updated_pyproject == 'true' || env.updated_meta == 'true' 75 | run: | 76 | git config user.name "github-actions" 77 | git config user.email "github-actions@github.com" 78 | git add pyproject.toml conda-recipe/meta.yaml commec/tests/test_data/functional.json 79 | if git commit -m "Update version texts to ${{ github.event.inputs.version_string }}"; then 80 | git push origin HEAD:${{ github.event.inputs.release_branch }} 81 | else 82 | echo "No changes to commit, skipping push." 
83 | fi 84 | 85 | - name: Tag release 86 | run: | 87 | git tag -a "v${{ github.event.inputs.version_string }}" -m "Release ${{ github.event.inputs.version_string }}" 88 | git push origin "v${{ github.event.inputs.version_string }}" 89 | 90 | - name: Download tar.gz from GitHub release 91 | run: | 92 | curl -L -o source.tar.gz https://github.com/${{ github.repository }}/archive/refs/tags/v${{ github.event.inputs.version_string }}.tar.gz 93 | 94 | - name: Calculate SHA256 95 | id: hash 96 | run: | 97 | sha256=$(sha256sum source.tar.gz | cut -d ' ' -f1) 98 | echo "sha256=$sha256" >> $GITHUB_OUTPUT 99 | 100 | - name: Clone bioconda fork 101 | run: | 102 | git clone https://github.com/ibbis-bio/bioconda-recipes.git 103 | cd bioconda-recipes 104 | git remote add upstream https://github.com/bioconda/bioconda-recipes.git 105 | git fetch upstream 106 | git reset --hard upstream/master 107 | git push origin master --force 108 | 109 | - name: Update recipe with new meta.yaml 110 | run: | 111 | cp conda-recipe/meta.yaml bioconda-recipes/recipes/commec/meta.yaml 112 | cd bioconda-recipes 113 | sed -i '3s/.*/{% set sha256 = "'"${{ steps.hash.outputs.sha256 }}"'" %}/' recipes/commec/meta.yaml 114 | git config user.name "github-actions" 115 | git config user.email "github-actions@github.com" 116 | git add recipes/commec/meta.yaml 117 | git commit -m "Update commec recipe to v${{ github.event.inputs.version_string }}" 118 | git push origin master 119 | 120 | - name: Create PR to bioconda upstream 121 | env: 122 | GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} 123 | run: | 124 | gh pr create \ 125 | --repo bioconda/bioconda-recipes \ 126 | --head ibbis-bio:master \ 127 | --base master \ 128 | --title "Update commec to v${{ github.event.inputs.version_string }}" \ 129 | --body "This PR updates the commec recipe to version ${{ github.event.inputs.version_string }}." 
130 | -------------------------------------------------------------------------------- /.github/workflows/release-version-sha-update.yml: -------------------------------------------------------------------------------- 1 | name: Update version + SHA for release 2 | 3 | on: 4 | pull_request: 5 | types: [closed] 6 | branches: 7 | - main 8 | workflow_dispatch: 9 | inputs: 10 | release_version: 11 | description: 'Release version (without v prefix)' 12 | required: true 13 | default: '' 14 | 15 | jobs: 16 | update-version-sha: 17 | # Run if the PR was merged (not just closed) and it was from a release branch OR manually triggered 18 | if: (github.event_name == 'pull_request' && github.event.pull_request.merged == true && startsWith(github.event.pull_request.head.ref, 'release-v')) || 19 | github.event_name == 'workflow_dispatch' 20 | runs-on: ubuntu-latest 21 | permissions: 22 | contents: write 23 | 24 | steps: 25 | - name: Check out repository code 26 | uses: actions/checkout@v3 27 | with: 28 | ref: main # Ensure we're on the main branch after merge 29 | fetch-depth: 0 # Need full history 30 | 31 | - name: Extract version 32 | id: get_version 33 | run: | 34 | if [ "${{ github.event_name }}" == "pull_request" ]; then 35 | # Extract version from the release branch name (release-vX.Y.Z -> X.Y.Z) 36 | BRANCH_NAME="${{ github.event.pull_request.head.ref }}" 37 | VERSION=$(echo $BRANCH_NAME | sed 's/release-v//') 38 | else 39 | # Use the manually provided version 40 | VERSION="${{ github.event.inputs.release_version }}" 41 | fi 42 | echo "VERSION=$VERSION" >> $GITHUB_ENV 43 | echo "Version: $VERSION" 44 | 45 | - name: Find pyproject.toml and meta.yaml files 46 | id: find_files 47 | run: | 48 | PYPROJECT_PATH=$(find . -name "pyproject.toml" -type f | head -n 1) 49 | if [ -z "$PYPROJECT_PATH" ]; then 50 | echo "ERROR: pyproject.toml not found" 51 | exit 1 52 | fi 53 | echo "Found pyproject.toml at: $PYPROJECT_PATH" 54 | echo "PYPROJECT_PATH=$PYPROJECT_PATH" >> $GITHUB_ENV 55 | 56 | META_PATH=$(find . 
-name "meta.yaml" -type f | head -n 1) 57 | if [ -z "$META_PATH" ]; then 58 | echo "ERROR: meta.yaml not found" 59 | exit 1 60 | fi 61 | echo "Found meta.yaml at: $META_PATH" 62 | echo "META_PATH=$META_PATH" >> $GITHUB_ENV 63 | 64 | - name: Set up Python 65 | uses: actions/setup-python@v4 66 | with: 67 | python-version: '3.10' 68 | 69 | - name: Update version in files 70 | run: | 71 | # Update version in pyproject.toml 72 | sed -i "s/^version = \".*\"/version = \"${{ env.VERSION }}\"/" "${{ env.PYPROJECT_PATH }}" 73 | echo "Updated version to ${{ env.VERSION }} in ${{ env.PYPROJECT_PATH }}" 74 | 75 | # Update version in meta.yaml 76 | sed -i "s/{% set version = \".*\" %}/{% set version = \"${{ env.VERSION }}\" %}/" "${{ env.META_PATH }}" 77 | echo "Updated version to ${{ env.VERSION }} in ${{ env.META_PATH }}" 78 | 79 | - name: Calculate SHA256 hash 80 | id: calculate_sha 81 | run: | 82 | # Create source distribution 83 | python -m pip install build 84 | python -m build --sdist 85 | 86 | # Find the generated tar.gz file 87 | SDIST_FILE=$(find dist -name "*.tar.gz" | head -n 1) 88 | 89 | if [ -z "$SDIST_FILE" ]; then 90 | echo "ERROR: No tar.gz file found in dist directory" 91 | exit 1 92 | fi 93 | echo "Found sdist file: $SDIST_FILE" 94 | 95 | # Get the SHA256 hash of the generated tar.gz file 96 | SHA256=$(sha256sum "$SDIST_FILE" | cut -d ' ' -f 1) 97 | echo "SHA256=$SHA256" >> $GITHUB_ENV 98 | echo "SHA256 hash: $SHA256" 99 | 100 | - name: Update SHA in meta.yaml 101 | run: | 102 | # Update SHA256 in meta.yaml 103 | sed -i "s/{% set sha256 = \".*\" %}/{% set sha256 = \"${{ env.SHA256 }}\" %}/" "${{ env.META_PATH }}" 104 | echo "Updated SHA256 in ${{ env.META_PATH }}" 105 | 106 | - name: Check for changes 107 | id: check_changes 108 | run: | 109 | if git diff --quiet; then 110 | echo "No changes detected" 111 | echo "CHANGES_DETECTED=false" >> $GITHUB_ENV 112 | else 113 | echo "Changes detected" 114 | echo "CHANGES_DETECTED=true" >> $GITHUB_ENV 115 | fi 116 | 117 | - name: Commit changes 118 | if: env.CHANGES_DETECTED == 'true' 119 | run: | 120 | git config --local user.email "github-actions[bot]@users.noreply.github.com" 121 | git config --local user.name "github-actions[bot]" 122 | 123 | # Add the files 124 | git add "${{ env.PYPROJECT_PATH }}" 125 | git add "${{ env.META_PATH }}" 126 | 127 | git commit -m "Bump version to ${{ env.VERSION }} and update SHA [skip ci]" 128 | git push 129 | echo "Pushed changes to repository" 130 | 131 | - name: Create version tag 132 | run: | 133 | git tag "v${{ env.VERSION }}" 134 | git push origin "v${{ env.VERSION }}" 135 | echo "Created and pushed tag v${{ env.VERSION }}" 136 | -------------------------------------------------------------------------------- /commec/config/screen_tools.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) 2021-2024 International Biosecurity and Biosafety Initiative for Science 3 | 4 | """ 5 | Container for search handlers used throughout the Commec screen workflow. 6 | Sets and alters defaults based on input parameters. 
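A minimal usage sketch (assumes `screen_io` is an already-configured ScreenIO instance):

    tools = ScreenTools(screen_io)
    tools.biorisk.search()  # run the biorisk hmmscan via its search handler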
7 | """ 8 | 9 | import logging 10 | import os 11 | from commec.config.screen_io import ScreenIO 12 | from commec.tools.blastn import BlastNHandler 13 | from commec.tools.blastx import BlastXHandler 14 | from commec.tools.diamond import DiamondHandler 15 | from commec.tools.cmscan import CmscanHandler 16 | from commec.tools.hmmer import HmmerHandler 17 | 18 | logger = logging.getLogger(__name__) 19 | 20 | class ScreenTools: 21 | """ 22 | Using parameters and filenames in `ScreenIO`, set up the tools needed to search databases. 23 | """ 24 | 25 | def __init__(self, params: ScreenIO): 26 | self.biorisk: HmmerHandler = None 27 | self.regulated_protein : BlastXHandler | DiamondHandler = None 28 | self.regulated_nt: BlastNHandler = None 29 | self.low_concern_hmm: HmmerHandler = None 30 | self.low_concern_blastn: BlastNHandler = None 31 | self.low_concern_cmscan: CmscanHandler = None 32 | 33 | self.taxonomy_path: str | os.PathLike = None 34 | self.biorisk_taxid_path: str | os.PathLike = None 35 | self.low_concern_taxid_path: str | os.PathLike = None 36 | self.biorisk_annotations_csv: str | os.PathLike = None 37 | 38 | # Paths for vaxid, taxids, and taxonomy directory, used for check_regulated_pathogens 39 | # (Declared this way for backwards compatibility to old database structure at this stage) 40 | self.taxonomy_path = params.config["databases"]["taxonomy"]["path"] 41 | self.biorisk_taxid_path = params.config["databases"]["biorisk"]["taxids"] 42 | self.low_concern_taxid_path = params.config["databases"]["low_concern"]["taxids"] 43 | self.biorisk_annotations = params.config["databases"]["biorisk"]["annotations"] 44 | self.low_concern_annotations = params.config["databases"]["low_concern"]["annotations"] 45 | 46 | # Database tools for Biorisks / Protein and NT screens / Benign screen: 47 | self.biorisk = HmmerHandler( 48 | params.config["databases"]["biorisk"]["path"], 49 | params.aa_path, 50 | f"{params.output_prefix}.biorisk.hmmscan", 51 | threads=params.config["threads"], 52 | force=params.config["force"], 53 | ) 54 | 55 | if params.should_do_protein_screening: 56 | if params.config["protein_search_tool"] == "blastx": 57 | self.regulated_protein = BlastXHandler( 58 | params.config["databases"]["regulated_protein"]["blast"]["path"], 59 | input_file=params.nt_path, 60 | out_file=f"{params.output_prefix}.nr.blastx", 61 | threads=params.config["threads"], 62 | force=params.config["force"], 63 | ) 64 | elif params.config["protein_search_tool"] in ("nr.dmnd", "diamond"): 65 | self.regulated_protein = DiamondHandler( 66 | params.config["databases"]["regulated_protein"]["diamond"]["path"], 67 | input_file=params.nt_path, 68 | out_file=f"{params.output_prefix}.nr.dmnd", 69 | threads=params.config["threads"], 70 | force=params.config["force"], 71 | ) 72 | self.regulated_protein.jobs = params.config["diamond_jobs"] 73 | if params.config["protein_search_tool"] == "nr.dmnd": 74 | logger.info( 75 | "Using the old \"nr.dmnd\" keyword for the search tool will not be supported" 76 | " in future releases; consider using \"diamond\" instead."
77 | ) 78 | else: 79 | raise RuntimeError('Search tool not defined as "blastx" or "diamond"') 80 | 81 | if params.should_do_nucleotide_screening: 82 | self.regulated_nt = BlastNHandler( 83 | params.config["databases"]["regulated_nt"]["path"], 84 | input_file=params.nc_path, 85 | out_file=f"{params.output_prefix}.nt.blastn", 86 | threads=params.config["threads"], 87 | force=params.config["force"], 88 | ) 89 | 90 | if params.should_do_low_concern_screening: 91 | self.low_concern_hmm = HmmerHandler( 92 | params.config["databases"]["low_concern"]["protein"]["path"], 93 | input_file=params.aa_path, 94 | out_file=f"{params.output_prefix}.low_concern.hmmscan", 95 | threads=params.config["threads"], 96 | force=params.config["force"], 97 | ) 98 | self.low_concern_blastn = BlastNHandler( 99 | params.config["databases"]["low_concern"]["dna"]["path"], 100 | input_file=params.nt_path, 101 | out_file=f"{params.output_prefix}.low_concern.blastn", 102 | threads=params.config["threads"], 103 | force=params.config["force"], 104 | ) 105 | self.low_concern_cmscan = CmscanHandler( 106 | params.config["databases"]["low_concern"]["rna"]["path"], 107 | input_file=params.nt_path, 108 | out_file=f"{params.output_prefix}.low_concern.cmscan", 109 | threads=params.config["threads"], 110 | force=params.config["force"], 111 | ) -------------------------------------------------------------------------------- /example_data/output_commec-examples/commec-examples.nt.blastn: -------------------------------------------------------------------------------- 1 | # BLASTN 2.16.0+ 2 | # Query: encrypted (1-552) 3 | # Database: /mnt/data/home/ec2-user/cm-dbs/nt_blast/core_nt 4 | # 0 hits found 5 | # BLASTN 2.16.0+ 6 | # Query: xylanase_zero_shot_des31 (1-756) 7 | # Database: /mnt/data/home/ec2-user/cm-dbs/nt_blast/core_nt 8 | # Fields: query acc., subject title, subject acc., subject tax ids, evalue, bit score, % identity, query length, q. start, q. end, subject length, s. start, s. end 9 | # 5 hits found 10 | xylanase_zero_shot_des31 Escherichia albertii strain 205_2_TBG_B chromosome, complete genome CP099890 208962 2.75e-31 150 75.542 756 16 333 4752472 4352667 4352984 11 | xylanase_zero_shot_des31 Enterobacter hormaechei strain A26358 chromosome, complete genome CP163152 158836 1.28e-29 145 74.854 756 16 351 4839353 2207862 2207527 12 | xylanase_zero_shot_des31 Enterobacter hormaechei subsp. xiangfangensis strain HD2292 chromosome, complete genome CP130333 1296536 1.28e-29 145 74.854 756 16 351 4806739 3158310 3157975 13 | xylanase_zero_shot_des31 Enterobacter hormaechei strain 2020CK-00204 chromosome, complete genome CP115689 158836 1.28e-29 145 75.152 756 16 339 4935101 2693068 2693391 14 | xylanase_zero_shot_des31 Enterobacter hormaechei strain UCI161 chromosome CP060481 158836 1.28e-29 145 75.152 756 16 339 4802325 2306527 2306850 15 | # BLASTN 2.16.0+ 16 | # Query: RVFV_Rift_valley_fever (830.0-960.0) 17 | # Database: /mnt/data/home/ec2-user/cm-dbs/nt_blast/core_nt 18 | # Fields: query acc., subject title, subject acc., subject tax ids, evalue, bit score, % identity, query length, q. start, q. end, subject length, s. start, s. 
end 19 | # 5 hits found 20 | RVFV_Rift_valley_fever Rift Valley fever virus strain SA-75 segment S, complete sequence DQ380175 11588 5.95e-60 243 100.000 131 1 131 1691 830 960 21 | RVFV_Rift_valley_fever Rift Valley fever virus segment S nonstructural protein and nucleocapsid genes, complete cds OM744402 11588 5.95e-60 243 100.000 131 1 131 1691 830 960 22 | RVFV_Rift_valley_fever Rift Valley fever virus strain H1825RSA75 segment S, complete sequence EU312120 11588 5.95e-60 243 100.000 131 1 131 1691 830 960 23 | RVFV_Rift_valley_fever Rift Valley fever virus strain 35/74 segment S, complete sequence JF784388 11588 5.95e-60 243 100.000 131 1 131 1691 830 960 24 | RVFV_Rift_valley_fever Rift Valley fever virus isolate M57/74 nonstructural protein and nucleocapsid protein genes, complete cds KX944821 11588 5.95e-60 243 100.000 131 1 131 1654 801 931 25 | # BLASTN 2.16.0+ 26 | # Query: BBa_K209429_A_15261 (643.0-758.0) (1833.0-1933.0) (2633.0-2764.0) (3497.0-3550.0) 27 | # Database: /mnt/data/home/ec2-user/cm-dbs/nt_blast/core_nt 28 | # Fields: query acc., subject title, subject acc., subject tax ids, evalue, bit score, % identity, query length, q. start, q. end, subject length, s. start, s. end 29 | # 20 hits found 30 | BBa_K209429_A_15261 Mammalian expression vector pNBioSec, complete sequence EU082004 478810 6.39e-45 195 98.198 403 224 334 7032 1335 1445 31 | BBa_K209429_A_15261 Gateway entry vector pMpGE_En01 DNA, complete sequence LC090754 1740675 2.97e-43 189 100.000 403 116 217 6126 5326 5427 32 | BBa_K209429_A_15261 Transformation vector pGL193, complete sequence OK017460 2902901 2.97e-43 189 100.000 403 116 217 10319 9451 9552 33 | BBa_K209429_A_15261 Cloning vector pNRVL-caSAT1, complete sequence MN989861 2713480 2.97e-43 189 100.000 403 116 217 5220 4420 4521 34 | BBa_K209429_A_15261 Cloning vector p5E-CAGGS, complete sequence JN715850 1115837 2.97e-43 189 100.000 403 116 217 4511 3711 3812 35 | BBa_K209429_A_15261 Cloning vector pNRVL-N5L-ffDronpa, complete sequence PP986966 3231919 2.97e-43 189 100.000 403 116 217 4951 4151 4252 36 | BBa_K209429_A_15261 PREDICTED: Bos indicus x Bos taurus prolactin (PRL), transcript variant X2, mRNA XM_027524257 30522 1.39e-36 167 100.000 403 224 313 923 83 172 37 | BBa_K209429_A_15261 PREDICTED: Moschus berezovskii prolactin (LOC129543410), mRNA XM_055406509 68408 1.39e-36 167 100.000 403 224 313 690 1 90 38 | BBa_K209429_A_15261 PREDICTED: Bison bison bison prolactin (LOC104992619), mRNA XM_010845567 43346 1.39e-36 167 100.000 403 224 313 891 56 145 39 | BBa_K209429_A_15261 PREDICTED: Bos indicus prolactin (LOC109577296), transcript variant X2, mRNA XM_019986186 9915 1.39e-36 167 100.000 403 224 313 945 105 194 40 | BBa_K209429_A_15261 Homo sapiens cholinergic receptor, muscarinic 2, mRNA (cDNA clone MGC:111772 IMAGE:6519221), complete cds BC095547 9606 3.10e-13 89.8 100.000 403 356 403 1404 4 51 41 | BBa_K209429_A_15261 PREDICTED: Pan paniscus cholinergic receptor muscarinic 2 (CHRM2), transcript variant X4, mRNA XM_055115415 9597 3.10e-13 89.8 100.000 403 356 403 6013 386 433 42 | BBa_K209429_A_15261 PREDICTED: Pan troglodytes cholinergic receptor muscarinic 2 (CHRM2), transcript variant X1, mRNA XM_009454280 9598 3.10e-13 89.8 100.000 403 356 403 9169 766 813 43 | BBa_K209429_A_15261 PREDICTED: Pan paniscus cholinergic receptor muscarinic 2 (CHRM2), transcript variant X1, mRNA XM_055115412 9597 3.10e-13 89.8 100.000 403 356 403 11057 5430 5477 44 | BBa_K209429_A_15261 Homo sapiens cholinergic receptor muscarinic 2 (CHRM2), transcript 
variant 5, mRNA NM_001006631 9606 3.10e-13 89.8 100.000 403 356 403 5663 316 363 45 | BBa_K209429_A_15261 Cloning vector pE3n, complete sequence EU334818 490921 8.74e-04 58.4 88.000 403 1 48 2595 432 481 46 | BBa_K209429_A_15261 Vector pENTR1A-3Flag-WFS1-IRES2-mCherry, complete sequence OQ238874 3062755 0.003 56.5 97.059 403 1 33 6432 432 465 47 | BBa_K209429_A_15261 Cloning vector pIPKTA33, complete sequence EF622217 444601 0.003 56.5 97.059 403 1 33 2311 141 174 48 | BBa_K209429_A_15261 Vector pETNR1A-R-GECO, complete sequence OQ238871 3062760 0.003 56.5 97.059 403 1 33 3534 432 465 49 | BBa_K209429_A_15261 Cloning vector pE1c, complete sequence EU334822 490916 0.003 56.5 97.059 403 1 33 2336 432 465 50 | # BLAST processed 4 queries 51 | -------------------------------------------------------------------------------- /commec/tests/test_json.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from dataclasses import asdict 3 | from commec.config.json_io import * 4 | from commec.config.result import * 5 | from commec.tools.search_handler import SearchToolVersion 6 | 7 | @pytest.fixture 8 | def test_screendata(): 9 | '''Fixture to provide the ScreenResult for testing.''' 10 | return ScreenResult( 11 | #recommendation="PASS", 12 | commec_info = ScreenRunInfo( 13 | commec_version="0.1.2", 14 | json_output_version=JSON_COMMEC_FORMAT_VERSION, 15 | time_taken="00:00:00:00", 16 | date_run="1.1.2024", 17 | search_tool_info= SearchToolInfo( 18 | biorisk_search_info=SearchToolVersion("HMM 0.0.0","DB 0.0.0"), 19 | protein_search_info=SearchToolVersion("Blast 0.0.0","DB 0.0.0"), 20 | nucleotide_search_info=SearchToolVersion("Blast 0.0.0","DB 0.0.0"), 21 | low_concern_protein_search_info=SearchToolVersion("Blast 0.0.0","DB 0.0.0"), 22 | low_concern_rna_search_info=SearchToolVersion("Blast 0.0.0","DB 0.0.0"), 23 | low_concern_dna_search_info=SearchToolVersion("Blast 0.0.0","DB 0.0.0"), 24 | ) 25 | ), 26 | query_info = ScreenQueryInfo( 27 | file="no file", 28 | number_of_queries=1, 29 | total_query_length=10 30 | ), 31 | queries= { 32 | "Query1": 33 | QueryResult( 34 | query="Query1", 35 | length=10, 36 | status = QueryScreenStatus(), 37 | hits = { 38 | "ImportantProtein1": 39 | HitResult( 40 | recommendation=HitScreenStatus(ScreenStatus.WARN, ScreenStep.BIORISK), 41 | name="ImportantProtein1", 42 | annotations = {"domain" : ["Bacteria"]}, 43 | ranges = [ 44 | MatchRange( 45 | e_value = 0.0, 46 | match_start = 0, 47 | match_end = 10, 48 | query_start = 0, 49 | query_end = 10 50 | ) 51 | ] 52 | ) 53 | } 54 | ) 55 | }, 56 | ) 57 | 58 | @pytest.fixture 59 | def empty_screendata(): 60 | '''Fixture to provide the ScreenResult for testing.''' 61 | return ScreenResult() 62 | 63 | @pytest.mark.parametrize("test_data_fixture",["test_screendata", "empty_screendata"]) 64 | def test_json_io(tmp_path, request, test_data_fixture): 65 | ''' Test to ensure that read/write for JSON ScreenResult I/O is working correctly.''' 66 | test_data = request.getfixturevalue(test_data_fixture) 67 | json_filename1 = tmp_path / "testread1.json" 68 | json_filename2 = tmp_path / "testread2.json" 69 | encode_screen_data_to_json(test_data, json_filename1) 70 | test_data_retrieved = get_screen_data_from_json(json_filename1) 71 | encode_screen_data_to_json(test_data_retrieved, json_filename2) 72 | test_data_retrieved_twice = get_screen_data_from_json(json_filename2) 73 | 74 | # Convert both original and retrieved data to dictionaries and compare 75 | assert asdict(test_data) == 
asdict(test_data_retrieved), ( 76 | f"JSON Write/Read interpreter failed.\n" 77 | f"Test JSON Reference data: \n{asdict(test_data)}\n" 78 | f"Test JSON output data: \n{asdict(test_data_retrieved)}" 79 | ) 80 | 81 | # Convert both original and retrieved data to dictionaries and compare 82 | assert asdict(test_data) == asdict(test_data_retrieved_twice), ( 83 | f"JSON Write/Read/Write/Read interpreter failed.\n" 84 | f"Test JSON Reference data: \n{asdict(test_data)}\n" 85 | f"Test JSON output data: \n{asdict(test_data_retrieved)}" 86 | ) 87 | 88 | def test_erroneous_info(tmp_path, test_screendata): 89 | ''' Test to ensure that read/write for JSON ScreenResult I/O is working correctly.''' 90 | test_data = test_screendata 91 | json_filename3 = tmp_path / "testread3.json" 92 | json_filename4 = tmp_path / "testread4.json" 93 | 94 | encode_screen_data_to_json(test_data, json_filename3) 95 | test_data_retrieved = get_screen_data_from_json(json_filename3) 96 | 97 | # Add erroneous information 98 | test_data_dict = asdict(test_data_retrieved) 99 | test_data_dict["ExtraStuff1"] = "ExtraBitStuff1" 100 | test_data_dict["queries"]["Query1"]["ExtraStuff2"] = "ExtraBitStuff2" 101 | test_data_dict["queries"]["Query1"]["hits"]["ImportantProtein1"]["ranges"].append("ExtraStuff3") 102 | test_data_dict["queries"]["Query1"]["hits"]["ImportantProtein1"]["ranges"].append({"ExtraDictStuff4" : 9999}) 103 | test_data_dict2 = encode_dict_to_screen_data(test_data_dict) 104 | encode_screen_data_to_json(test_data_dict2, json_filename4) 105 | test_data_retrieved = get_screen_data_from_json(json_filename4) 106 | 107 | # Convert both original and retrieved data to dictionaries and compare 108 | assert asdict(test_data) == asdict(test_data_retrieved), ( 109 | f"JSON Write/Read interpreter failed.\n" 110 | f"Test JSON Reference data: \n{asdict(test_data)}\n\n\n\n" 111 | f"Test JSON output data: \n{asdict(test_data_retrieved)}\n\n\n\n" 112 | ) 113 | 114 | def test_recommendation_ordering(): 115 | assert ScreenStatus.PASS.importance < ScreenStatus.FLAG.importance 116 | assert compare(ScreenStatus.PASS, ScreenStatus.FLAG) == ScreenStatus.FLAG 117 | 118 | def test_adding_data_to_existing(): 119 | """ 120 | Tests to ensure the mutability of writing to queries is working as expected. 121 | """ 122 | def write_info(input_query : QueryResult): 123 | input_query.status.biorisk = ScreenStatus.PASS 124 | 125 | new_screen_data = ScreenResult() 126 | new_screen_data.queries["test01"] = QueryResult("test01", 10, ScreenStatus.FLAG) 127 | write_query = new_screen_data.get_query("test01") 128 | write_info(write_query) 129 | assert new_screen_data.queries["test01"].status.biorisk == ScreenStatus.PASS 130 | -------------------------------------------------------------------------------- /commec/tests/test_dbs.py: -------------------------------------------------------------------------------- 1 | """ 2 | Unit test for ensuring that the databases are being called without errors. 3 | Will fail if databases have not been installed as expected, with correct versions. 
4 | """ 5 | import os 6 | import pytest 7 | from commec.tools.diamond import DiamondHandler 8 | from commec.tools.blastn import BlastNHandler 9 | from commec.tools.blastx import BlastXHandler 10 | from commec.tools.hmmer import HmmerHandler 11 | from commec.tools.cmscan import CmscanHandler 12 | from commec.tools.search_handler import DatabaseValidationError 13 | 14 | INPUT_QUERY = os.path.join(os.path.dirname(__file__), "test_data/single_record.fasta") 15 | DATABASE_DIRECTORY = os.path.join(os.path.dirname(__file__), "test_dbs") 16 | 17 | databases_to_implement = [ 18 | [DiamondHandler, "nr_dmnd", "nr"], 19 | [BlastNHandler, "nt_blast", "core_nt"], 20 | [BlastXHandler, "nr_blast", "nr"], 21 | [HmmerHandler, "low_concern/protein", "benign.hmm"], 22 | [CmscanHandler, "low_concern/rna", "benign.cm"], 23 | ] 24 | 25 | def print_tmp_path_contents(tmp_path): 26 | print(f"Contents of {tmp_path}:") 27 | for path in tmp_path.rglob("*"): # Recursively list all files and directories 28 | print(path.relative_to(tmp_path), "->", "DIR" if path.is_dir() else "FILE") 29 | 30 | @pytest.mark.parametrize("input_db", databases_to_implement) 31 | def test_database_can_run(input_db): 32 | """ 33 | Opens a database object on a test database, and runs the test query on it. 34 | Fails if commec environment is not setup correctly, or if the database object 35 | defaults are invalid etc. 36 | 37 | Something similar to this would be useful to be run 38 | instead of --help during the conda recipe checks. 39 | """ 40 | 41 | db_dir = os.path.join(DATABASE_DIRECTORY, input_db[1]) 42 | db_file = os.path.join(db_dir, input_db[2]) 43 | 44 | output_file = "db.out" 45 | 46 | new_db = input_db[0](db_file, INPUT_QUERY, output_file, force=True) 47 | new_db.search() 48 | assert new_db.validate_output() 49 | 50 | version: str = new_db.get_version_information() 51 | assert version 52 | 53 | if os.path.isfile(output_file): 54 | os.remove(output_file) 55 | 56 | 57 | bad_databases = [ 58 | [DiamondHandler, "nr_dmnd", "bad"], 59 | [BlastNHandler, "nt_blast", "bad"], 60 | [BlastXHandler, "nr_blast", "bad"], 61 | [HmmerHandler, "low_concern_db", "bad.hmm"], 62 | [CmscanHandler, "low_concern_db", "bad.cmscan"], 63 | [DiamondHandler, "bad", "bad"], 64 | [BlastNHandler, "bad", "bad"], 65 | [BlastXHandler, "bad", "bad"], 66 | [HmmerHandler, "bad", "bad.hmm"], 67 | [CmscanHandler, "bad", "bad.cmscan"], 68 | ] 69 | 70 | 71 | @pytest.mark.parametrize("input_db", bad_databases) 72 | def test_database_no_file(input_db): 73 | """ 74 | Simply ensures that the input databases are failing there validation. 75 | """ 76 | db_dir = os.path.join(DATABASE_DIRECTORY, input_db[1]) 77 | db_file = os.path.join(db_dir, input_db[2]) 78 | output_file = "db.out" 79 | 80 | try: 81 | input_db[0](db_file, INPUT_QUERY, output_file) 82 | assert False 83 | except DatabaseValidationError: 84 | assert True 85 | 86 | n_jobs = [ 87 | None, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 88 | ] 89 | 90 | @pytest.mark.parametrize("input_jobs", n_jobs) 91 | def test_diamond_job_and_threads_calculations(input_jobs): 92 | """ 93 | Tests a range of threads, and diamond database sizes, 94 | for automatically calculating the optimum number of runs, 95 | and threads per run. Such that no CPU time is wasted. 96 | No specific expected outcomes, but we can check general expectations: 97 | - Never exceed max threads. 98 | No specific expected outcomes, but we check general expectations (e.g. 
never exceeding max_threads) 99 | """ 100 | handler = DiamondHandler( 101 | "commec/tests/test_dbs/nr_dmnd/nr", 102 | "commec/tests/test_data/single_record.fasta", 103 | "output.test", 104 | ) 105 | handler.jobs = input_jobs 106 | 107 | for max_threads in range(1, 25): 108 | for n_database_files in range(3, 9): 109 | concurrent_runs, threads_per_run = handler.determine_runs_and_threads( 110 | max_threads, n_database_files 111 | ) 112 | 113 | # If input jobs is provided, we should never exceed max threads. 114 | assert concurrent_runs * threads_per_run <= max_threads 115 | 116 | # If no number of input jobs is provided: 117 | # We should ALWAYS use all available threads. 118 | # We may use less than Max Threads if the remainder is 0 for the no. of database files 119 | if input_jobs is None: 120 | assert ((concurrent_runs * threads_per_run == max_threads) or 121 | (concurrent_runs * threads_per_run % n_database_files == 0)), f""" 122 | {concurrent_runs} runs with {threads_per_run} threads. Input settings: 123 | {max_threads} max threads, {n_database_files} dbs, {input_jobs} input jobs no. 124 | """ 125 | 126 | 127 | @pytest.mark.parametrize( 128 | "input_jobs, max_threads, n_database_files, expected_runs, expected_threads", 129 | [ 130 | (None, 20, 6, 2, 10), # jobs capped by db count, using all threads 131 | (None, 8, 5, 1, 5), # jobs capped by db count, not using all threads 132 | (3, 12, 6, 3, 4), # jobs=3 --> 3 runs with 4 threads each 133 | (10, 20, 5, 5, 4), # jobs=10 > db=5, capped to 5 runs with 4 threads each 134 | (20, 10, 5, 5, 2), # jobs=20 > threads=10, capped to 5 runs with 2 threads each 135 | (10, 4, 5, 4, 1), # jobs=10 > db, threads, capped to 4 runs with 1 thread each 136 | ] 137 | ) 138 | def test_diamond_job_and_threads_calculations_parametrized( 139 | input_jobs, max_threads, n_database_files, expected_runs, expected_threads 140 | ): 141 | """ 142 | Specific test cases for Diamond Jobs. 143 | """ 144 | handler = DiamondHandler( 145 | "commec/tests/test_dbs/nr_dmnd/nr", 146 | "commec/tests/test_data/single_record.fasta", 147 | "output.test", 148 | ) 149 | handler.jobs = input_jobs 150 | concurrent_runs, threads_per_run = handler.determine_runs_and_threads( 151 | max_threads, n_database_files 152 | ) 153 | 154 | assert concurrent_runs == expected_runs, f""" 155 | {input_jobs} jobs, {max_threads} threads failed 156 | {concurrent_runs} for expected ({expected_runs}) concurrent runs. 157 | """ 158 | assert threads_per_run == expected_threads, f""" 159 | {input_jobs} jobs, {max_threads} threads failed 160 | {threads_per_run} for expected ({expected_threads}) threads per run. 161 | """ -------------------------------------------------------------------------------- /commec/config/json_io.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) 2021-2024 International Biosecurity and Biosafety Initiative for Science 3 | ''' 4 | Set of tools for retrieving and storing information important to screen 5 | outputs. Information is stored as a structure of dataclasses (headed by ScreenResult), 6 | and converted between the dataclass / dict / json_file as required. The 7 | conversions are done dynamically, and it is recommended to use and 8 | interact with the dataclasses only, to maintain version format, and not 9 | create erroneous outputs to the JSON which won't be read back in. This 10 | ensures an expected i/o behaviour.
11 | 12 | The single exception to this is the "annotations" dictionary, present 13 | in the HitDescription, which contains non-structured information, and is 14 | populated with differing information under differing keys depending on 15 | which step the information is derived (Biorisk, Taxonomy etc) 16 | 17 | In this way, the JSON object serves as a common state, that can be updated 18 | whilst not being temporally appended like a log file i.e. .screen file. 19 | 20 | The JSON stores all pertinent information of a run. 21 | ''' 22 | 23 | # Consider whether this can get away with being part of config. rename to IO config? 24 | 25 | import json 26 | import string 27 | import os 28 | from dataclasses import asdict, fields, is_dataclass 29 | from typing import Dict, Type, get_origin, Any, get_args 30 | from enum import StrEnum 31 | from commec.config.result import ScreenResult, JSON_COMMEC_FORMAT_VERSION 32 | 33 | class IoVersionError(RuntimeError): 34 | """Custom exception when handling differing versions with Commec output JSON.""" 35 | 36 | def encode_screen_data_to_json(input_result: ScreenResult, 37 | 38 | output_json_filepath: string = "output.json") -> None: 39 | ''' Converts a ScreenResult class object into a JSON file at the given filepath.''' 40 | try: 41 | with open(output_json_filepath, "w", encoding="utf-8") as json_file: 42 | json.dump(asdict(input_result), json_file, indent=2) 43 | except TypeError as e: 44 | print("Error outputting JSON:", e) 45 | print(input_result) 46 | 47 | def encode_dict_to_screen_data(input_dict : dict) -> ScreenResult: 48 | ''' Converts a dictionary into a ScreenResult object, 49 | any keys within the dictionary not part of the ScreenResult format are lost. 50 | any missing information will be simple set as defaults.''' 51 | return dict_to_dataclass(ScreenResult, input_dict) 52 | 53 | # Convert the dictionary back to the dataclass or list of dataclass 54 | def dict_to_dataclass(cls: Type, data: Dict[str, Any]) -> Any: 55 | ''' 56 | Convert a dict, into appropriate dataclass, or list of dataclass, 57 | invalid keys to the dataclass structure are ignored. 
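A minimal illustrative sketch (`Point` is a hypothetical dataclass, not part of commec):

    >>> from dataclasses import dataclass
    >>> @dataclass
    ... class Point:
    ...     x: int = 0
    ...     y: int = 0
    >>> dict_to_dataclass(Point, {"x": 1, "unknown": 5})
    Point(x=1, y=0)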
58 | ''' 59 | # Prepare a dictionary for filtered data 60 | filtered_data = {} 61 | 62 | if data is None: 63 | return filtered_data 64 | 65 | for f in fields(cls): 66 | field_name = f.name 67 | field_type = f.type 68 | 69 | if field_name in data: 70 | field_value = data[field_name] 71 | 72 | # Check if the field is a dataclass 73 | if is_dataclass(field_type): 74 | filtered_data[field_name] = dict_to_dataclass(field_type, field_value) 75 | continue 76 | 77 | # Check if the field is a list 78 | if get_origin(field_type) is list: 79 | item_type = get_args(field_type)[0] 80 | 81 | # Handle lists of StrEnums 82 | if issubclass(item_type, StrEnum): 83 | filtered_data[field_name] = [item_type(item) for item in field_value] 84 | 85 | #Handles Dataclasses 86 | if is_dataclass(item_type) and isinstance(field_value, list): 87 | filtered_data[field_name] = [ 88 | dict_to_dataclass(item_type, item) for item in field_value 89 | if isinstance(item, dict) 90 | and any(key in {f.name for f in fields(item_type)} 91 | for key in item.keys()) or isinstance(item, item_type)] 92 | continue 93 | 94 | filtered_data[field_name] = field_value 95 | continue 96 | 97 | # Check if the field is a dict of dataclasses 98 | if get_origin(field_type) is dict: 99 | _key_type, value_type = get_args(field_type) 100 | 101 | # Handle dicts of dataclasses 102 | if is_dataclass(value_type): 103 | filtered_data[field_name] = { 104 | key: dict_to_dataclass(value_type, value) if isinstance(value, dict) 105 | else value for key, value in field_value.items() 106 | if isinstance(value, (dict, value_type)) 107 | } 108 | continue 109 | 110 | filtered_data[field_name] = field_value 111 | continue 112 | 113 | # Handle custom StrEnums 114 | if issubclass(field_type, StrEnum): 115 | try: 116 | filtered_data[field_name] = field_type(field_value) 117 | except ValueError: 118 | print(f"Invalid value '{field_value}' for " 119 | f"field '{field_name}' of type {field_type}.") 120 | continue 121 | 122 | # Handle other field types 123 | filtered_data[field_name] = field_value 124 | 125 | # Create an instance of the dataclass with the filtered data 126 | return cls(**filtered_data) 127 | 128 | def get_screen_data_from_json(input_json_filepath: string) -> ScreenResult: 129 | ''' Loads a JSON file from given filepath and returns 130 | a populated ScreenResult object from its contents. If the file does not 131 | exist, then returns a new screen data object.''' 132 | if not os.path.exists(input_json_filepath): 133 | return ScreenResult() 134 | 135 | json_string : str 136 | with open(input_json_filepath, "r", encoding="utf-8") as json_file: 137 | # Read the file contents as a string 138 | json_string = json_file.read() 139 | my_data : dict = json.loads(json_string) 140 | 141 | # Check version of imported json. 142 | input_version = my_data["commec_info"]["json_output_version"] 143 | if not input_version == JSON_COMMEC_FORMAT_VERSION: 144 | raise IoVersionError(f"Version difference between input (v.{input_version}) and" 145 | f" expected (v.{JSON_COMMEC_FORMAT_VERSION})" 146 | f": {input_json_filepath}") 147 | return encode_dict_to_screen_data(my_data) 148 | -------------------------------------------------------------------------------- /dev_scripts/summarize_screens.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) 2021-2024 International Biosecurity and Biosafety Initiative for Science 3 | """ 4 | Summarize the screening for all .screen files in a directory. 
This is intended to be useful for 5 | debugging the pipeline, rather than for interpreting the outputs. 6 | 7 | Produces a CSV (name set by -o, defaults to 'output.csv') which contains the outcome for each step 8 | of the pipeline: 9 | 10 | * flag the sequence was flagged in this step 11 | * pass the sequence passed in this step 12 | * skip this step was intentionally not run 13 | * error an error occurred during this step 14 | * - this step was not run due to an error, interrupt, or other unexpected outcome 15 | * mix (protein only) the best match is to a mix of regulated- and non-regulated organisms 16 | * warn (biorisk only) found a significant hit to a virulence not from a regulated pathogen 17 | 18 | Each line in the CSV corresponds to a .screen file. The full paths to the files are also provided. 19 | 20 | Additionally, it includes three columns indicating whether the sequence was flagged as a regulated 21 | virus, bacteria, or eukaryote. 22 | """ 23 | import os 24 | import csv 25 | import argparse 26 | import re 27 | 28 | 29 | def process_step(step_content, step_number): 30 | """ 31 | Process the .screen file output to determine the outcome of the step. 32 | """ 33 | step_processors = { 34 | 1: get_biorisk_outcome, 35 | 2: get_protein_outcome, 36 | 3: get_nucleotide_outcome, 37 | 4: process_benign_step, 38 | } 39 | return step_processors.get(step_number, lambda _: "-")(step_content) 40 | 41 | 42 | def get_biorisk_outcome(step_content): 43 | """Process biorisk scan step from .screen file.""" 44 | if "FLAG" in step_content: 45 | return "flag" 46 | if "Virulence factor found" in step_content: 47 | return "warn" 48 | if ( 49 | "Biorisks: no hits detected, PASS" in step_content 50 | or "Biorisks: no significant hits detected, PASS" in step_content 51 | ): 52 | return "pass" 53 | if "ERROR:" in step_content: 54 | return "error" 55 | return "-" 56 | 57 | 58 | def get_protein_outcome(step_content): 59 | """Process protein scan step from .screen file.""" 60 | if "Best match to sequence(s)" in step_content and "FLAG" in step_content: 61 | return "flag" 62 | if "found in both regulated and non-regulated organisms" in step_content: 63 | return "mix" 64 | if "no top hit exclusive to a regulated pathogen: PASS" in step_content: 65 | return "pass" 66 | if "ERROR:" in step_content: 67 | return "error" 68 | return "-" 69 | 70 | 71 | def get_nucleotide_outcome(step_content): 72 | """Process nucleotide scan step from .screen file.""" 73 | if "no noncoding regions >= 50 bases found, skipping nt scan" in step_content: 74 | return "skip" 75 | if "Best match to sequence(s)" in step_content and "FLAG" in step_content: 76 | return "flag" 77 | if "no top hit exclusive to a regulated pathogen: PASS" in step_content: 78 | return "pass" 79 | if "ERROR:" in step_content: 80 | return "error" 81 | return "-" 82 | 83 | 84 | def process_benign_step(step_content): 85 | """Process benign scan step from .screen file.""" 86 | if "no regulated regions to clear" in step_content: 87 | return "skip" 88 | if ( 89 | "Regulated region at bases" in step_content 90 | and "failed to clear: FLAG" in step_content 91 | ): 92 | return "flag" 93 | if "all regulated regions cleared: PASS" in step_content: 94 | return "pass" 95 | if "ERROR:" in step_content: 96 | return "error" 97 | return "-" 98 | 99 | 100 | def check_regulated_flags(content): 101 | """ 102 | Check for regulated virus, bacteria, and eukaryote flags in the content. 
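For example, content containing "FLAG (virus)" and no other organism flags yields {"virus_flag": "true", "bacteria_flag": "false", "eukaryote_flag": "false"}.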
103 | """ 104 | return { 105 | "virus_flag": "true" if "FLAG (virus)" in content else "false", 106 | "bacteria_flag": "true" if "FLAG (bacteria)" in content else "false", 107 | "eukaryote_flag": "true" if "FLAG (eukaryote)" in content else "false", 108 | } 109 | 110 | 111 | def process_file(file_path): 112 | """ 113 | Read input screen file, split into steps, and prepare dict of results for CSV output. 114 | """ 115 | with open(file_path, "r", encoding="utf-8") as file: 116 | content = file.read() 117 | 118 | filename = os.path.basename(file_path) 119 | filename_without_extension = os.path.splitext(filename)[0] 120 | 121 | steps = re.split(r">> STEP \d:", content) 122 | steps = [step.strip() for step in steps if step.strip()] 123 | 124 | results = { 125 | "filename": filename_without_extension, 126 | "location": file_path, 127 | "biorisk": process_step(steps[0] if len(steps) > 0 else "-", 1), 128 | "protein": process_step(steps[1] if len(steps) > 1 else "-", 2), 129 | "nucleotide": process_step(steps[2] if len(steps) > 2 else "-", 3), 130 | "benign": process_step(steps[3] if len(steps) > 3 else "-", 4), 131 | } 132 | 133 | # Add regulated flags 134 | results.update(check_regulated_flags(content)) 135 | 136 | return results 137 | 138 | 139 | def main(directory, output_file): 140 | """ 141 | Read all files that end with .screen in the input directory, then summarize their outcomes in a 142 | CSV. 143 | """ 144 | results = [] 145 | for root, _, files in os.walk(directory): 146 | for file in files: 147 | if file.endswith(".screen"): 148 | file_path = os.path.join(root, file) 149 | results.append(process_file(file_path)) 150 | 151 | # Write results to CSV 152 | with open(output_file, "w", newline="", encoding="utf-8") as csvfile: 153 | fieldnames = [ 154 | "filename", 155 | "location", 156 | "biorisk", 157 | "protein", 158 | "nucleotide", 159 | "virus_flag", 160 | "bacteria_flag", 161 | "eukaryote_flag", 162 | "benign", 163 | ] 164 | writer = csv.DictWriter(csvfile, fieldnames=fieldnames) 165 | 166 | writer.writeheader() 167 | for result in results: 168 | writer.writerow(result) 169 | 170 | print(f"Results written to {output_file}") 171 | 172 | 173 | if __name__ == "__main__": 174 | parser = argparse.ArgumentParser( 175 | description="Process .screen files and output results to CSV." 176 | ) 177 | parser.add_argument("directory", help="Directory to search for .screen files") 178 | parser.add_argument( 179 | "-o", 180 | "--output", 181 | default="output.csv", 182 | help="Output CSV file name (default: output.csv)", 183 | ) 184 | 185 | args = parser.parse_args() 186 | 187 | main(args.directory, args.output) 188 | -------------------------------------------------------------------------------- /dev_scripts/collate-screens.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Collate screen files located in any subdirectories of an input directory, then match description 4 | fields between FASTAs in adjacent _input directories and FASTAs in another directory, renaming 5 | collated screen files with the names of matched FASTA files. Created as a workaround for breaking 6 | changes introduced in the commec output format in v0.3. 
7 | 8 | Required inputs: 9 | -i, --input-dir input directory to recursively search for .screen files 10 | -o, --output-dir output directory where screen files should be collated 11 | -f, --fasta-dir directory to search recursively for FASTAs input to commec screen 12 | 13 | Example: 14 | $ python collate-screens.py -i . -o ./test-collate-pls -f ../functional-json-test/ 15 | """ 16 | import argparse 17 | import csv 18 | import os 19 | import shutil 20 | from pathlib import Path 21 | from typing import Dict, List, Tuple 22 | 23 | def clean_header(header: str) -> str: 24 | """ 25 | Clean a FASTA header by replacing whitespace and special characters with underscores. 26 | 27 | Args: 28 | header: Original FASTA header 29 | 30 | Returns: 31 | Cleaned header string 32 | """ 33 | return "".join( 34 | "_" if c.isspace() or c == "\xc2\xa0" or c == "#" else c 35 | for c in header 36 | ) 37 | 38 | def parse_fasta_header(fasta_path: Path) -> Tuple[str, str]: 39 | """ 40 | Extract filename and cleaned FASTA header from a FASTA file. 41 | 42 | Args: 43 | fasta_path: Path to the FASTA file 44 | 45 | Returns: 46 | Tuple of (filename, cleaned_header) 47 | """ 48 | filename = fasta_path.name 49 | with open(fasta_path) as f: 50 | for line in f: 51 | if line.startswith('>'): 52 | header = line.strip() 53 | cleaned_header = clean_header(header) 54 | return filename, cleaned_header 55 | raise ValueError(f"No valid FASTA header found in {fasta_path}") 56 | 57 | def build_fasta_mapping(fasta_dir: Path) -> Dict[str, str]: 58 | """ 59 | Build mapping of FASTA headers to filenames from all FASTA files in directory. 60 | 61 | Args: 62 | fasta_dir: Directory containing FASTA files 63 | 64 | Returns: 65 | Dictionary mapping FASTA headers to original filenames 66 | """ 67 | mapping = {} 68 | for root, _, files in os.walk(fasta_dir): 69 | for file in files: 70 | if (file.endswith('.fasta') and 71 | not file.endswith('.noncoding.fasta') and 72 | not file.endswith('.cleaned.fasta')): 73 | fasta_path = Path(root) / file 74 | try: 75 | filename, cleaned_header = parse_fasta_header(fasta_path) 76 | mapping[cleaned_header] = filename 77 | except (ValueError, IOError) as e: 78 | print(f"Warning: Could not process {fasta_path}: {e}") 79 | return mapping 80 | 81 | def find_screen_files(input_dir: Path) -> List[Path]: 82 | """ 83 | Find all .screen files in the input directory. 84 | """ 85 | screen_files = [] 86 | for root, _, files in os.walk(input_dir): 87 | for file in files: 88 | if file.endswith('.screen'): 89 | screen_files.append(Path(root) / file) 90 | return screen_files 91 | 92 | def get_matching_fasta_header(screen_path: Path) -> str: 93 | """ 94 | Get the FASTA header from the input screen file. 
95 | """ 96 | # For a file named "something.screen", look for "input_something/something.cleaned.fasta" 97 | screen_name = screen_path.stem # removes .screen extension 98 | fasta_path = screen_path.parent / f"input_{screen_name}" / f"{screen_name}.cleaned.fasta" 99 | if not fasta_path.exists(): 100 | raise FileNotFoundError(f"Expected FASTA not found at {fasta_path}") 101 | 102 | with open(fasta_path) as f: 103 | for line in f: 104 | if line.startswith('>'): 105 | header = line.strip() 106 | return clean_header(header) 107 | raise ValueError(f"No valid FASTA header found in {fasta_path}") 108 | 109 | def main(): 110 | parser = argparse.ArgumentParser(description='Collate and rename screen files based on FASTA headers') 111 | parser.add_argument('-i', '--input-dir', required=True, help='Input directory containing screen files') 112 | parser.add_argument('-o', '--output-dir', required=True, help='Output directory for renamed screen files') 113 | parser.add_argument('-f', '--fasta-dir', required=True, help='Directory containing original FASTA files') 114 | 115 | args = parser.parse_args() 116 | 117 | input_dir = Path(args.input_dir).resolve() 118 | output_dir = Path(args.output_dir).resolve() 119 | fasta_dir = Path(args.fasta_dir).resolve() 120 | 121 | # Create output directory if it doesn't exist 122 | output_dir.mkdir(parents=True, exist_ok=True) 123 | 124 | # Build mapping of FASTA headers to original filenames 125 | print(f"Building FASTA header mapping based on files found in {fasta_dir}...") 126 | header_to_filename = build_fasta_mapping(fasta_dir) 127 | num_fastas_mapped = len(header_to_filename) 128 | if num_fastas_mapped == 0: 129 | print("Could not find any FASTAs to map! Note that .cleaned and .noncoding are filtered out.\nExiting...") 130 | exit(0) 131 | print(f"Found {num_fastas_mapped} FASTAS for mapping...") 132 | 133 | # Find all screen files 134 | print(f"Finding screen files in {input_dir}...") 135 | screen_files = find_screen_files(input_dir) 136 | print(f"Processing {len(screen_files)} screen files...") 137 | 138 | mappings = [] 139 | for screen_path in screen_files: 140 | try: 141 | # Get the FASTA header for this screen file 142 | fasta_header = get_matching_fasta_header(screen_path) 143 | 144 | # Look up the matching filename 145 | if fasta_header not in header_to_filename: 146 | print(f"Warning: No matching FASTA file found for {screen_path}") 147 | continue 148 | 149 | matching_filename = header_to_filename[fasta_header] 150 | new_filename = f"{Path(matching_filename).stem}.screen" 151 | output_path = output_dir / new_filename 152 | 153 | # Copy the screen file with the new name 154 | shutil.copy2(screen_path, output_path) 155 | 156 | # Record the mapping 157 | mappings.append({ 158 | 'screen': str(screen_path), 159 | 'matched_fasta': matching_filename, 160 | 'renamed_screen': new_filename 161 | }) 162 | 163 | print(f"Processed: {screen_path} -> {output_path}") 164 | 165 | except (FileNotFoundError, ValueError, IOError) as e: 166 | print(f"Error processing {screen_path}: {e}") 167 | 168 | # Write mapping CSV 169 | csv_path = output_dir / 'screen_mappings.csv' 170 | with open(csv_path, 'w', newline='') as f: 171 | writer = csv.DictWriter(f, fieldnames=['screen', 'matched_fasta', 'renamed_screen']) 172 | writer.writeheader() 173 | writer.writerows(mappings) 174 | 175 | print(f"\nProcessed {len(mappings)} screen files") 176 | print(f"Mapping saved to {csv_path}") 177 | 178 | if __name__ == '__main__': 179 | main() 180 | 
-------------------------------------------------------------------------------- /commec/tests/test_query.py: -------------------------------------------------------------------------------- 1 | from io import StringIO 2 | import os 3 | import pandas as pd 4 | import pytest 5 | import textwrap 6 | from Bio.Seq import Seq 7 | from Bio.SeqRecord import SeqRecord 8 | from commec.config.query import Query, QueryTranslation 9 | 10 | INPUT_QUERY = os.path.join(os.path.dirname(__file__), "test_data/single_record.fasta") 11 | 12 | def test_get_frame_length(): 13 | # 11 nt query 14 | query = Query(SeqRecord(Seq("atgtgccatgg"), id="test")) 15 | assert 9 == query._get_frame_length(frame_offset=0) 16 | assert 9 == query._get_frame_length(frame_offset=1) 17 | assert 9 == query._get_frame_length(frame_offset=2) 18 | 19 | # 15 nt query 20 | query = Query(SeqRecord(Seq("atgtgccatggatgc"), id="test")) 21 | assert 15 == query._get_frame_length(frame_offset=0) 22 | assert 12 == query._get_frame_length(frame_offset=1) 23 | assert 12 == query._get_frame_length(frame_offset=2) 24 | 25 | # 16 nt query 26 | query = Query(SeqRecord(Seq("atgtgccatggatgca"), id="test")) 27 | assert 15 == query._get_frame_length(frame_offset=0) 28 | assert 15 == query._get_frame_length(frame_offset=1) 29 | assert 12 == query._get_frame_length(frame_offset=2) 30 | 31 | def test_translate_to_file(tmp_path): 32 | query = Query(SeqRecord(Seq("atgtgccatgg"), id="test")) 33 | 34 | expected_output = textwrap.dedent( 35 | """\ 36 | >test_1 37 | MCH 38 | >test_2 39 | CAM 40 | >test_3 41 | VPW 42 | >test_4 43 | MAH 44 | >test_5 45 | HGT 46 | >test_6 47 | PWH 48 | """ 49 | ) 50 | 51 | aa_output = tmp_path / "test_translated.faa" 52 | 53 | query.translate(aa_output) 54 | 55 | # Check if the output file exists 56 | assert aa_output.exists() 57 | 58 | actual_output = aa_output.read_text() 59 | assert expected_output.strip() == actual_output.strip() 60 | 61 | 62 | def test_translate(): 63 | """ 64 | Test translation from nucleotide to 6 frames of protein sequences. 
65 | """ 66 | # 11nt query 67 | query = Query(SeqRecord(Seq("atgtgccatgg"), id="test")) 68 | 69 | # Input sequence: atgtgccatgg 70 | # Translations: 71 | # Frame Pos Codon split Translation 72 | # 1 0 atg tgc cat gg MCH 73 | # 2 1 a tgt gcc atg g CAM 74 | # 3 2 at gtg cca tgg VPW 75 | # 4 -0 cc atg gca cat MAH 76 | # 5 -1 c cat ggc aca t HGT 77 | # 6 -2 cca tgg cac at PWH 78 | expected_translations = [ 79 | QueryTranslation(frame=1, sequence="MCH"), 80 | QueryTranslation(frame=2, sequence="CAM"), 81 | QueryTranslation(frame=3, sequence="VPW"), 82 | QueryTranslation(frame=4, sequence="MAH"), 83 | QueryTranslation(frame=5, sequence="HGT"), 84 | QueryTranslation(frame=6, sequence="PWH"), 85 | ] 86 | 87 | query._translate() 88 | assert expected_translations == query.translations 89 | 90 | # 15nt query 91 | query = Query(SeqRecord(Seq("acgcacctgatcgct"), id="test")) 92 | 93 | 94 | # Input sequence: acgcacctgatcgct 95 | # Translations: 96 | # Frame Pos Codon split Translation 97 | # 1 0 acg cac ctg atc gct THLIA 98 | # 2 1 a cgc acc tga tcg ct RTXS 99 | # 3 2 ac gca cct gat cgc t APDR 100 | # 4 -0 agc gat cag gtg cgt SDQVR 101 | # 5 -1 a gcg atc agg tgc gt RSGA 102 | # 6 -2 ag cga tca ggt gcg t AIRC 103 | expected_translations = [ 104 | QueryTranslation(frame=1, sequence="THLIA"), 105 | QueryTranslation(frame=2, sequence="RTXS"), 106 | QueryTranslation(frame=3, sequence="APDR"), 107 | QueryTranslation(frame=4, sequence="SDQVR"), 108 | QueryTranslation(frame=5, sequence="RSGA"), 109 | QueryTranslation(frame=6, sequence="AIRC"), 110 | ] 111 | 112 | query._translate() 113 | assert expected_translations == query.translations 114 | 115 | 116 | def test_ambigious(): 117 | """ 118 | Test translation from nucleotide to 6 frames of protein sequences using ambigious nts 119 | | --------------------------------------------------------------------- | 120 | | Code | Bases Represented | Meaning | 121 | | ---------------- | ----------------- | ------------------------------ | 122 | | **A** | A | Adenine | 123 | | **C** | C | Cytosine | 124 | | **G** | G | Guanine | 125 | | **T** (or **U**) | T (or U in RNA) | Thymine (or Uracil) | 126 | | **R** | A or G | puRine | 127 | | **Y** | C or T | pYrimidine | 128 | | **S** | G or C | Strong interaction (3 H-bonds) | 129 | | **W** | A or T | Weak interaction (2 H-bonds) | 130 | | **K** | G or T | Keto | 131 | | **M** | A or C | aMino | 132 | | **B** | C or G or T | not A | 133 | | **D** | A or G or T | not C | 134 | | **H** | A or C or T | not G | 135 | | **V** | A or C or G | not T | 136 | | **N** | A or C or G or T | any base (completely unknown) | 137 | | --------------------------------------------------------------------- | 138 | | Codon | Ambiguity | Expansions | Amino Acid | 139 | | ----- | --------- | ------------------ | ---------- | 140 | | AAR | R=A/G | AAA, AAG | Lys | 141 | | TAY | Y=C/T | TAC, TAT | Tyr | 142 | | GCN | N=A/C/G/T | GCA, GCC, GCG, GCT | Ala | 143 | | AAY | Y=C/T | AAC, AAT | Asn | 144 | | GAR | R=A/G | GAA, GAG | Glu | 145 | | ACM | M=A/C | ACA, ACC | Thr | 146 | | CCS | S=C/G | CCC, CCG | Pro | 147 | | GGW | W=A/T | GGA, GGT | Gly | 148 | | GTK | K=G/T | GTG, GTT | Val | 149 | | ATH | H=A/C/T | ATA, ATC, ATT | Ile | 150 | | GTD | D=A/G/T | GTA, GTG, GTT | Val | 151 | | CCB | B=C/G/T | CCC, CCG, CCT | Pro | 152 | | GTV | V=A/C/G | GTA, GTC, GTG | Val | 153 | """ 154 | # 11nt query 155 | #query = Query(SeqRecord(Seq("atntnccatgg"), id="test")) 156 | #query = Query(SeqRecord(Seq("ATGAARTAYGCNAAYGARACNABGGADCAHGAVACNTGG"), id="test")) 157 | 
query = Query(SeqRecord(Seq("ATGAARTAYGCNAAYGARACMCCSGGWGTKATHGTDCCBGTV"), id="test")) 158 | 159 | expected_translations = [ 160 | QueryTranslation(frame=1, sequence="MKYANETPGVIVPV"), 161 | QueryTranslation(frame=2, sequence="XXXXXXXXXXXXX"), 162 | QueryTranslation(frame=3, sequence="EXXXXBXXXXXXX"), 163 | QueryTranslation(frame=4, sequence="XXXXXXXXXXXXXH"), 164 | QueryTranslation(frame=5, sequence="XXXXXXXXXXXXS"), 165 | QueryTranslation(frame=6, sequence="TGTITPGVSXAYF"), 166 | ] 167 | 168 | query._translate() 169 | assert expected_translations == query.translations, query.translations[2:4] 170 | -------------------------------------------------------------------------------- /commec/utils/logger.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) 2021-2024 International Biosecurity and Biosafety Initiative for Science 3 | """ 4 | Utilities to set up commec package logging. 5 | """ 6 | 7 | import logging 8 | import sys 9 | import textwrap 10 | 11 | class TextWrapFormatter(logging.Formatter): 12 | """ 13 | Format multi-line log messages with proper vertical alignment, 14 | configurable styling, and text wrapping for longer messages. 15 | """ 16 | 17 | def __init__(self, *args, fmt=None, continuation_marker="│ ", line_width=120, **kwargs): 18 | if fmt is None: 19 | fmt = f"%(levelname)-8s{continuation_marker}%(message)s" 20 | super().__init__(fmt, *args, **kwargs) 21 | self.continuation_marker = continuation_marker 22 | self.line_width = line_width 23 | 24 | # String to prepended to all lines of wrapped output except the first 25 | self.indent_size = self._find_message_start() - len(self.continuation_marker) 26 | self.indent = " " * self.indent_size + self.continuation_marker 27 | 28 | def _find_message_start(self): 29 | """ 30 | Deterine how far to indent messages by formatting a dummy message. 31 | """ 32 | sample = logging.LogRecord( 33 | name="dummy", 34 | level=logging.INFO, 35 | pathname="./test", 36 | lineno=0, 37 | msg="DUMMY_MESSAGE", 38 | args=(), 39 | exc_info=None, 40 | ) 41 | sample.asctime = self.formatTime(sample) 42 | sample_formatted = super().format(sample) 43 | return sample_formatted.find(sample.msg) 44 | 45 | def format(self, record): 46 | """ 47 | Custom formatter for Commec logging. 48 | 49 | Accepts the following keywords in the `extra` dictionary: 50 | 51 | - **no_wrap**: 52 | Skips text wrapping. 53 | 54 | - **no_prefix**: 55 | Skips the `INFO │ ` prefixes. 56 | 57 | - **box**, **box_up**, **box_down**: 58 | Use Unicode box-drawing characters (e.g., `"─┘"` or `"─┐"`) to tie off 59 | formatted prefixes when switching to no-prefix lines. 
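A usage sketch (the logger name and message here are illustrative); elsewhere in commec this is used as, e.g.:

    logging.getLogger("commec").info("raw tool output", extra={"no_prefix": True, "cap": True})

where "cap" is handled below as shorthand for setting both box_up and box_down.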
60 | """ 61 | 62 | # Check extra options for format removal: 63 | if getattr(record, "no_prefix", False): 64 | box_up = getattr(record, "box_up", False) 65 | box_down = getattr(record, "box_down", False) 66 | if getattr(record, "cap", False): 67 | box_up = True 68 | box_down = True 69 | 70 | prefix = self.indent_size * "─" + "┘\n" if box_up else "" 71 | suffix = "\n" + self.indent_size * "─" + "┐" if box_down else "" 72 | 73 | return prefix + record.getMessage() + suffix # No formatting 74 | 75 | message = super().format(record) 76 | 77 | if getattr(record, "no_wrap", False): 78 | return message 79 | 80 | lines = message.splitlines() 81 | 82 | formatted_lines = [] 83 | # First line gets the levelname/timestamp/etc from super().format, then 84 | # long lines are wrapped with the indent 85 | wrapped_first = textwrap.wrap( 86 | lines[0], 87 | width=self.line_width, 88 | subsequent_indent=self.indent, 89 | break_long_words=False, 90 | break_on_hyphens=False, 91 | ) 92 | formatted_lines.extend(wrapped_first) 93 | 94 | # When a message has newlines, lines after the first should be indented even if short 95 | for line in lines[1:]: 96 | wrapped = textwrap.wrap( 97 | line, 98 | width=self.line_width, 99 | initial_indent=self.indent, 100 | subsequent_indent=self.indent, 101 | break_long_words=False, 102 | break_on_hyphens=False, 103 | ) 104 | formatted_lines.extend(wrapped) 105 | 106 | return "\n".join(formatted_lines) 107 | 108 | 109 | def setup_console_logging(log_level=logging.INFO): 110 | """Set up logging to console.""" 111 | commec_logger = logging.getLogger("commec") 112 | commec_logger.setLevel(log_level) 113 | 114 | # Check if the handler already exists to avoid duplicates 115 | if not any(isinstance(h, logging.StreamHandler) for h in commec_logger.handlers): 116 | console_handler = logging.StreamHandler() 117 | console_handler.setLevel(log_level) 118 | console_handler.setFormatter(TextWrapFormatter()) 119 | commec_logger.addHandler(console_handler) 120 | 121 | add_logging_to_excepthook() 122 | 123 | 124 | def setup_file_logging(filename, log_level=logging.INFO, log_mode="w"): 125 | """Set up logging to a file. Format determined based on level.""" 126 | commec_logger = logging.getLogger("commec") 127 | 128 | # Ensure the logger level is set to the lowest level of any handler 129 | current_level = commec_logger.level or logging.INFO 130 | commec_logger.setLevel(min(current_level, log_level)) 131 | 132 | # Log format has more detail if logging down to the debug level 133 | if log_level == logging.DEBUG: 134 | formatter = TextWrapFormatter( 135 | fmt="%(asctime)s│ %(levelname)-8s│ %(message)s", 136 | datefmt="%Y-%m-%d %H:%M:%S", # Full ISO-like format 137 | line_width = 300, # Longer lines for debug purposes. 138 | ) 139 | else: 140 | formatter = TextWrapFormatter("%(levelname)-8s│ %(message)s") 141 | 142 | # Update existing filehandlers, avoiding duplicates 143 | file_handler = None 144 | for handler in commec_logger.handlers: 145 | if ( 146 | isinstance(handler, logging.FileHandler) 147 | and getattr(handler, "baseFilename", None) == filename 148 | ): 149 | file_handler = handler 150 | break 151 | 152 | file_handler = file_handler or logging.FileHandler(filename, log_mode) 153 | file_handler.setLevel(log_level) 154 | file_handler.setFormatter(formatter) 155 | commec_logger.addHandler(file_handler) 156 | 157 | 158 | def add_logging_to_excepthook(): 159 | """ 160 | Ensure unhandled exceptions are logged to the commec package logger; 161 | original excepthook is still called. 
162 | """ 163 | original_excepthook = sys.excepthook 164 | 165 | def commec_exception_logger(exc_type, exc_value, exc_traceback): 166 | """Log exception to package logger.""" 167 | commec_logger = logging.getLogger("commec") 168 | 169 | if commec_logger.handlers: 170 | # Log the exception message at ERROR level 171 | error_message = f"Unhandled exception: {exc_type.__name__}: {exc_value}" 172 | commec_logger.error(error_message) 173 | 174 | # Log the full traceback at the DEBUG level 175 | commec_logger.debug( 176 | "Exception traceback:", exc_info=(exc_type, exc_value, exc_traceback) 177 | ) 178 | 179 | # Still call the original handler for console output 180 | original_excepthook(exc_type, exc_value, exc_traceback) 181 | 182 | sys.excepthook = commec_exception_logger 183 | 184 | 185 | def set_log_level(log_level, update_only_handler_type=None): 186 | """ 187 | Update the log level for the commec logger, as well as associated handlers. 188 | Optionally, restrict updates to only a particular class of handlers (e.g. StreamHandler). 189 | """ 190 | commec_logger = logging.getLogger("commec") 191 | commec_logger.setLevel(log_level) 192 | 193 | handlers_to_update = commec_logger.handlers 194 | if update_only_handler_type: 195 | handlers_to_update = [ 196 | h for h in handlers_to_update if isinstance(h, update_only_handler_type) 197 | ] 198 | 199 | for handler in handlers_to_update: 200 | handler.setLevel(log_level) 201 | -------------------------------------------------------------------------------- /commec/tools/search_handler.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) 2021-2024 International Biosecurity and Biosafety Initiative for Science 3 | """ 4 | Abstract base class defining a shared interface for search tools. 5 | """ 6 | from abc import ABC, abstractmethod 7 | import os 8 | from dataclasses import dataclass 9 | import subprocess 10 | import logging 11 | 12 | logger = logging.getLogger(__name__) 13 | 14 | @dataclass 15 | class SearchToolVersion: 16 | """Container class for outputting version related information from a database.""" 17 | 18 | tool_info: str = "x.x.x" 19 | database_info: str = "x.x.x" 20 | 21 | 22 | class DatabaseValidationError(Exception): 23 | """Custom exception for database validation errors.""" 24 | 25 | class SearchHandler(ABC): 26 | """ 27 | Abstract class defining tool interface including a database directory / file to search, an input 28 | query, and an output file to be used for screening. 29 | """ 30 | 31 | def __init__( 32 | self, 33 | database_file: str | os.PathLike, 34 | input_file: str | os.PathLike, 35 | out_file: str | os.PathLike, 36 | **kwargs, 37 | ): 38 | """ 39 | Initialise a Search Handler. 40 | 41 | Parameters 42 | ---------- 43 | database_file : str | os.PathLike 44 | Path to the database file. 45 | input_file : str | os.PathLike 46 | Path to the input file to be processed. 47 | out_file : str | os.PathLike 48 | Path where the output will be saved. 49 | 50 | Keyword Arguments 51 | ----------------- 52 | threads : int, optional 53 | Number of threads to use for processing. Default is 1. 54 | force : bool, optional 55 | Whether to force overwrite existing files. Default is False. 56 | 57 | Notes 58 | ----- 59 | - `database_file`, `input_file`, and `out_file` are validated on instantiation. 
60 | """ 61 | 62 | self.db_file = os.path.abspath(os.path.expanduser(database_file)) 63 | self.input_file = os.path.abspath(os.path.expanduser(input_file)) 64 | self.out_file = os.path.abspath(os.path.expanduser(out_file)) 65 | self.threads = kwargs.get('threads', 1) 66 | self.force = kwargs.get('force', False) 67 | self.arguments_dictionary = {} 68 | self.successful = True 69 | 70 | # Only validate database files if we actually intend on using them 71 | if not self.should_use_existing_output: 72 | self._validate_db() 73 | 74 | self.version_info = self.get_version_information() 75 | 76 | @property 77 | def db_directory(self): 78 | """Directory where databases to be searched are located.""" 79 | return os.path.dirname(self.db_file) 80 | 81 | @property 82 | def temp_log_file(self): 83 | """Temporary log file used for this search. Based on outfile name.""" 84 | return f"{self.out_file}.log.tmp" 85 | 86 | @property 87 | def should_use_existing_output(self) -> bool: 88 | """ 89 | True if (1) search is not forced and (2) output exists and is valid. 90 | """ 91 | return not self.force and self.validate_output() 92 | 93 | def search(self): 94 | """ 95 | Wrapper for _search, skipping if existing output should not be overwritten. 96 | """ 97 | if self.should_use_existing_output: 98 | logger.warning("%s expected output data already exists, " 99 | "will use existing data found in:", 100 | self.__class__.__name__) 101 | logger.warning(self.out_file, extra = {"no_prefix" : True, "cap":True}) 102 | else: 103 | self._search() 104 | 105 | @abstractmethod 106 | def _search(self): 107 | """ 108 | Use a tool to search the input query against a database. 109 | Should be implemented by all subclasses to perform the actual search against the database. 110 | """ 111 | 112 | @abstractmethod 113 | def read_output(self): 114 | """ 115 | Returns the output of the handler in the form of a pandas dataframe. 116 | """ 117 | 118 | @abstractmethod 119 | def get_version_information(self) -> SearchToolVersion: 120 | """ 121 | Provide version for the search tool used, to allow reproducibility. 122 | This method should be implemented by all subclasses to return tool-specific version info. 123 | """ 124 | 125 | def validate_output(self): 126 | """ 127 | Check the output file contains something, indicating that the search ran. 128 | Can be overridden if more complex checks for a particular tool are desired. 129 | Is overridden for Diamond outputs, which have no header information, and simply only 130 | checks for file-existance, rather than lack of content, for example. 131 | """ 132 | return not self.has_empty_output() 133 | 134 | def _validate_db(self): 135 | """ 136 | Validates that the database directory and file exists. Called on init. 137 | """ 138 | if not os.path.isdir(self.db_directory): 139 | raise DatabaseValidationError( 140 | f"Screening database directory not found at: {self.db_directory}." 141 | " Screening directory path can be set via --databases option or --config yaml." 142 | ) 143 | 144 | if not os.path.isfile(self.db_file): 145 | raise DatabaseValidationError( 146 | f"Provided database file not found: {self.db_file}." 147 | " File location can be set via --databases option or --config yaml." 
148 | ) 149 | 150 | def has_empty_output(self) -> bool: 151 | """Check if the output file is empty or non-existent.""" 152 | try: 153 | return os.path.getsize(self.out_file) == 0 154 | except OSError: 155 | # Errors such as FileNotFoundError considered empty 156 | return True 157 | 158 | def has_hits(self) -> bool: 159 | """Check if the output file has any hits (lines that do not start with '#').""" 160 | try: 161 | with open(self.out_file, "r", encoding="utf-8") as file: 162 | return any(not line.strip().startswith("#") for line in file) 163 | except FileNotFoundError: 164 | return False 165 | 166 | def format_args_for_cli(self) -> list: 167 | """ 168 | Format `self.arguments_dictionary` into a list of strings for use in the command line. 169 | """ 170 | formatted_args = [] 171 | for key, value in self.arguments_dictionary.items(): 172 | formatted_args.append(str(key)) 173 | if isinstance(value, list): 174 | formatted_args.append(" ".join(map(str, value))) 175 | elif value is not None: 176 | formatted_args.append(str(value)) 177 | return formatted_args 178 | 179 | def run_as_subprocess(self, command, out_file, raise_errors=False): 180 | """ 181 | Run a command using subprocess.run, piping stdout and stderr to `out_file`. 182 | """ 183 | self.successful = False 184 | 185 | logger.debug("SUBPROCESS: %s", " ".join(command)) 186 | logger.debug(" ".join(command), extra = {"no_prefix":True,"cap":True}) 187 | 188 | with open(out_file, "a", encoding="utf-8") as f: 189 | result = subprocess.run( 190 | command, stdout=f, stderr=subprocess.STDOUT, check=raise_errors 191 | ) 192 | 193 | if result.returncode != 0: 194 | command_str = " ".join(command) 195 | logger.error( 196 | "\t command '%s' failed with return code %s; stderr was piped to the output file", 197 | command_str, 198 | result.returncode, 199 | ) 200 | raise RuntimeError( 201 | f"subprocess.run of command '{command_str}' encountered error." 202 | f" Check {out_file} for logs." 203 | ) 204 | 205 | self.successful = True 206 | 207 | def __del__(self): 208 | if os.path.exists(self.temp_log_file) and self.successful: 209 | os.remove(self.temp_log_file) -------------------------------------------------------------------------------- /commec/tools/hmmer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) 2021-2024 International Biosecurity and Biosafety Initiative for Science 3 | """ 4 | Module for a hidden Markov model handler, specifically for calling the hmmscan command line interface. 5 | Additional methods for reading hmmscan output, such as readhmmer, which returns a pandas DataFrame. 6 | Instantiate a HmmerHandler with a local database, an input fasta, and an output file. 7 | Raises if inputs are invalid. Creates a temporary log file, which is deleted on completion. 
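A minimal usage sketch (paths are illustrative):

    handler = HmmerHandler("biorisk.hmm", "query.transeq.faa", "query.biorisk.hmmscan")
    handler.search()
    if handler.has_hits():
        hits = handler.read_output()  # pandas DataFrame with blast-style column names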
8 | """ 9 | import re 10 | import subprocess 11 | import pandas as pd 12 | import itertools 13 | from commec.config.query import Query 14 | from commec.tools.search_handler import SearchHandler, SearchToolVersion 15 | from commec.utils.coordinates import convert_protein_to_nucleotide_coords 16 | 17 | 18 | class HmmerHandler(SearchHandler): 19 | """A Database handler specifically for use with Hmmer files for commec screening.""" 20 | 21 | def _search(self): 22 | command = [ 23 | "hmmscan", 24 | "--cpu", 25 | str(self.threads), 26 | "--domtblout", 27 | self.out_file, 28 | self.db_file, 29 | self.input_file, 30 | ] 31 | self.run_as_subprocess(command, self.temp_log_file) 32 | 33 | def read_output(self): 34 | output_dataframe = readhmmer(self.out_file) 35 | # Standardize the output column names to be like blast: 36 | output_dataframe = output_dataframe.rename(columns={ 37 | #"ali from": "q. start", # These are no re-calculated to Query NT coordinates. 38 | #"ali to": "q. end", 39 | "coverage": "q. coverage", 40 | "target name": "subject title", 41 | "qlen":"query length", 42 | "hmm from":"s. start", 43 | "hmm to":"s. end", 44 | 'E-value': "evalue", 45 | }) 46 | return output_dataframe 47 | 48 | def get_version_information(self) -> SearchToolVersion: 49 | """ 50 | The first line of the HMM database typically contains creation date 51 | information, and some version information. 52 | """ 53 | database_info: str = None 54 | try: 55 | with open(self.db_file, "r", encoding="utf-8") as file: 56 | for line in file: 57 | if line.startswith("HMMER3/f"): 58 | database_info = line.split(";", maxsplit=1)[0].strip() 59 | continue 60 | # Early exit if data has been found 61 | if database_info: 62 | break 63 | 64 | tool_version_result = subprocess.run( 65 | ["hmmscan", "-h"], capture_output=True, text=True, check=True 66 | ) 67 | tool_info: str = tool_version_result.stdout.splitlines()[1].strip() 68 | return SearchToolVersion(tool_info, database_info) 69 | 70 | except (subprocess.CalledProcessError, FileNotFoundError): 71 | return None 72 | 73 | 74 | def readhmmer(fileh): 75 | """ 76 | Read in HMMER output files 77 | """ 78 | columns = [ 79 | "target name", 80 | "accession", 81 | "tlen", 82 | "query name", 83 | " accession", 84 | "qlen", 85 | "E-value", 86 | "score", 87 | "bias", 88 | "hit #", 89 | "of", 90 | "c-Evalue", 91 | "i-Evalue", 92 | "score2", 93 | "bias", 94 | "hmm from", 95 | "hmm to", 96 | "ali from", 97 | "ali to", 98 | "env from", 99 | "env to", 100 | "acc", 101 | "description of target", 102 | ] 103 | 104 | hmmer = [] 105 | 106 | with open(fileh, "r", encoding="utf-8") as f: 107 | for line in f: 108 | if "# Program: hmmscan" in line: 109 | break 110 | if "#" in line: 111 | continue 112 | bits = re.split(r"\s+", line) 113 | description = " ".join(bits[22:]) 114 | bits = bits[:22] 115 | bits.append(description) 116 | hmmer.append(bits) 117 | hmmer = pd.DataFrame(hmmer, columns=columns) 118 | hmmer["E-value"] = pd.to_numeric(hmmer["E-value"]) 119 | hmmer["score"] = pd.to_numeric(hmmer["score"]) 120 | hmmer["ali from"] = pd.to_numeric(hmmer["ali from"]) 121 | hmmer["ali to"] = pd.to_numeric(hmmer["ali to"]) 122 | hmmer["qlen"] = pd.to_numeric(hmmer["qlen"]) 123 | # Extract the frame information. 
124 | hmmer["frame"] = hmmer["query name"].str.split('_').str[-1].astype(int) 125 | return hmmer 126 | 127 | def remove_overlaps(hmmer : pd.DataFrame) -> pd.DataFrame: 128 | """ 129 | Trims verbosity of a HMMER output, 130 | by removing weaker hits which are 131 | encompassed in their extent by higher scoring hits. 132 | 133 | Note, works to trim nucleotide coordinates relative to the query, 134 | not ali from and ali to from the HMMER itself. 135 | 136 | This means it can be used on any DataFrame with the q. start and q. end NT headings. 137 | (Consider moving to a general coordinates tool function?) 138 | """ 139 | assert "q. start" in hmmer.columns, ("No \"q. start\" heading in HMMER output dataframe being " 140 | "passed to remove overlaps, ensure that the dataframe has " 141 | "been processed for converstion to nucleotide coordinates.") 142 | 143 | assert "q. end" in hmmer.columns, ("No \"q. end\" heading in HMMER output dataframe being " 144 | "passed to remove overlaps, ensure that the dataframe has " 145 | "been processed for converstion to nucleotide coordinates.") 146 | 147 | trimmed_hmmer = hmmer # Direct Assignment, reassigned later with .drop() for deep-copy. 148 | 149 | # Ensure all logic is performed per unique Query name. 150 | for query in hmmer["query name"].unique(): 151 | 152 | hmmer_for_query = hmmer[hmmer["query name"] == query] 153 | sorted_values = hmmer_for_query.sort_values(by=["score"], ascending = False) 154 | 155 | for i, j in itertools.combinations(sorted_values.index, 2): 156 | # If J is encapsulated: 157 | if (sorted_values.loc[i, "q. start"] <= sorted_values.loc[j, "q. start"] 158 | and sorted_values.loc[i, "q. end"] >= sorted_values.loc[j, "q. end"] 159 | and sorted_values.loc[i, "score"] >= sorted_values.loc[j, "score"]): 160 | if j in trimmed_hmmer.index: 161 | trimmed_hmmer = trimmed_hmmer.drop([j]) 162 | continue 163 | # If I is encapsulated: 164 | if (sorted_values.loc[i, "q. start"] >= sorted_values.loc[j, "q. start"] 165 | and sorted_values.loc[i, "q. end"] <= sorted_values.loc[j, "q. end"] 166 | and sorted_values.loc[i, "score"] <= sorted_values.loc[j, "score"]): 167 | if i in trimmed_hmmer.index: 168 | trimmed_hmmer = trimmed_hmmer.drop([i]) 169 | 170 | # Tidy the output indices. 171 | trimmed_hmmer = trimmed_hmmer.reset_index(drop=True) 172 | 173 | return trimmed_hmmer 174 | 175 | def recalculate_hmmer_query_coordinates(hmmer : pd.DataFrame): 176 | """ 177 | Recalculate the coordinates of the hmmer database , such that each translated frame 178 | reverts to original nucleotide coordinates. 179 | """ 180 | assert "nt_qlen" in hmmer.columns, ("No \"nt_qlen\" heading in HMMER output dataframe being " 181 | "passed to calculate nt coordinates, ensure that the dataframe has " 182 | "been processed to include nucleotide query length data.") 183 | hmmer["q. start"], hmmer["q. end"] = convert_protein_to_nucleotide_coords( 184 | hmmer["frame"].to_numpy(), 185 | hmmer["ali from"].to_numpy(), 186 | hmmer["ali to"].to_numpy(), 187 | hmmer["nt_qlen"].to_numpy()) 188 | 189 | def append_nt_querylength_info(hmmer : pd.DataFrame, queries : dict[str, Query]): 190 | """ 191 | Take the hmmer output, and add a series (nt_qlen) 192 | of the true nt length based on query name. 193 | """ 194 | hmmer["nt_qlen"] = [queries[q[:-2]].length for q in hmmer["query name"]] 195 | --------------------------------------------------------------------------------