├── testing ├── outputs_expected │ ├── logging │ │ └── checkpoints │ │ │ ├── finished_blast │ │ │ └── finished_heatmap │ ├── blast │ │ ├── blast_tables_combined.csv │ │ ├── intermediate │ │ │ ├── fix_blank_results │ │ │ │ ├── Chl_sp_N1.csv │ │ │ │ ├── Ignavibacterium_album_JCM_16511_outgroup.csv │ │ │ │ ├── Chl_luteolum_DSM_273.csv │ │ │ │ ├── Chl_ferrooxidans_KoFox.csv │ │ │ │ └── Chl_phaeoferrooxidans_KB01.csv │ │ │ ├── reciprocal_blast │ │ │ │ ├── Chl_sp_N1.csv │ │ │ │ ├── Ignavibacterium_album_JCM_16511_outgroup.csv │ │ │ │ ├── Chl_luteolum_DSM_273.csv │ │ │ │ ├── Chl_ferrooxidans_KoFox.csv │ │ │ │ └── Chl_phaeoferrooxidans_KB01.csv │ │ │ └── remove_duplicates │ │ │ │ ├── Chl_sp_N1.csv │ │ │ │ ├── Ignavibacterium_album_JCM_16511_outgroup.csv │ │ │ │ ├── Chl_luteolum_DSM_273.csv │ │ │ │ ├── Chl_ferrooxidans_KoFox.csv │ │ │ │ └── Chl_phaeoferrooxidans_KB01.csv │ │ └── combine_blast_tables │ │ │ └── blast_tables_combined.csv │ └── heatmap │ │ ├── BackBLAST_heatmap.pdf │ │ └── BackBLAST_heatmap.tsv └── inputs │ ├── riboprotein_tree.treefile │ ├── gene_metadata.tsv │ ├── genome_metadata.tsv │ ├── config.yaml │ ├── subjects │ ├── Chl_sp_N1.faa │ ├── Ignavibacterium_album_JCM_16511_outgroup.faa │ ├── Chl_luteolum_DSM_273.faa │ ├── Chl_ferrooxidans_KoFox.faa │ └── Chl_phaeoferrooxidans_KB01.faa │ ├── Chl_ferrooxidans_KoFox_gene_targets.faa │ └── Chl_ferrooxidans_KoFox.faa ├── envs └── gtotree.yml ├── images ├── ExampleResults.jpeg └── BackBLAST-Algorithm.gif ├── environment.yml ├── .gitignore ├── scripts ├── remove_duplicates.sh ├── combine_tables.R ├── create_blank_results.py ├── generate_run_templates.sh ├── search.py └── generate_heatmap.R ├── LICENSE ├── snakemake ├── template_config.yaml └── Snakefile ├── README.md └── backblast /testing/outputs_expected/logging/checkpoints/finished_blast: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- 
/testing/outputs_expected/logging/checkpoints/finished_heatmap: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /testing/outputs_expected/blast/blast_tables_combined.csv: -------------------------------------------------------------------------------- 1 | combine_blast_tables/blast_tables_combined.csv -------------------------------------------------------------------------------- /envs/gtotree.yml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | - bioconda 4 | - astrobiomike 5 | dependencies: 6 | - gtotree=1.8.7 7 | -------------------------------------------------------------------------------- /images/ExampleResults.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LeeBergstrand/BackBLAST_Reciprocal_BLAST/HEAD/images/ExampleResults.jpeg -------------------------------------------------------------------------------- /images/BackBLAST-Algorithm.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LeeBergstrand/BackBLAST_Reciprocal_BLAST/HEAD/images/BackBLAST-Algorithm.gif -------------------------------------------------------------------------------- /testing/outputs_expected/blast/intermediate/fix_blank_results/Chl_sp_N1.csv: -------------------------------------------------------------------------------- 1 | WP_006367388.1,WP_131354685.1,57.178,5.41e-158,98.0,442.0 2 | -------------------------------------------------------------------------------- /testing/outputs_expected/blast/intermediate/reciprocal_blast/Chl_sp_N1.csv: -------------------------------------------------------------------------------- 1 | WP_006367388.1,WP_131354685.1,57.178,5.41e-158,98.0,442.0 2 | 
-------------------------------------------------------------------------------- /testing/outputs_expected/blast/intermediate/remove_duplicates/Chl_sp_N1.csv: -------------------------------------------------------------------------------- 1 | WP_006367388.1,WP_131354685.1,57.178,5.41e-158,98.0,442.0 2 | -------------------------------------------------------------------------------- /testing/outputs_expected/blast/intermediate/fix_blank_results/Ignavibacterium_album_JCM_16511_outgroup.csv: -------------------------------------------------------------------------------- 1 | WP_006367320.1,WP_014560057.1,44.558,2.72e-82,79.0,246.0 2 | -------------------------------------------------------------------------------- /testing/outputs_expected/blast/intermediate/reciprocal_blast/Ignavibacterium_album_JCM_16511_outgroup.csv: -------------------------------------------------------------------------------- 1 | WP_006367320.1,WP_014560057.1,44.558,2.72e-82,79.0,246.0 2 | -------------------------------------------------------------------------------- /testing/outputs_expected/blast/intermediate/remove_duplicates/Ignavibacterium_album_JCM_16511_outgroup.csv: -------------------------------------------------------------------------------- 1 | WP_006367320.1,WP_014560057.1,44.558,2.72e-82,79.0,246.0 2 | -------------------------------------------------------------------------------- /testing/outputs_expected/heatmap/BackBLAST_heatmap.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LeeBergstrand/BackBLAST_Reciprocal_BLAST/HEAD/testing/outputs_expected/heatmap/BackBLAST_heatmap.pdf -------------------------------------------------------------------------------- /testing/inputs/riboprotein_tree.treefile: -------------------------------------------------------------------------------- 1 | 
(Chl_ferrooxidans_KoFox:0.0000021077,((Chl_luteolum_DSM_273:0.0214481784,Chl_sp_N1:0.0212690670)99:0.0613223103,Ignavibacterium_album_JCM_16511_outgroup:0.7595852914)100:0.0708451825,Chl_phaeoferrooxidans_KB01:0.0024207887); 2 | -------------------------------------------------------------------------------- /testing/inputs/gene_metadata.tsv: -------------------------------------------------------------------------------- 1 | qseqid gene_name 2 | WP_006367388.1 cyc2 3 | WP_006365810.1 bacterioferritin 4 | WP_006367305.1 cysI 5 | WP_006367307.1 cysH 6 | WP_006367308.1 cysD 7 | WP_006367309.1 cysN 8 | WP_006367310.1 cysG 9 | WP_006367323.1 cysP 10 | WP_006367322.1 cysT 11 | WP_006367321.1 cysW 12 | WP_006367320.1 cysA 13 | -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | - bioconda 4 | dependencies: 5 | - python=3.12 6 | - snakemake=7 7 | - biopython>=1.76,<=1.84 8 | - blast>=2.9,<=2.16 9 | - networkx=3.4 10 | - r-base=4 11 | - r-codetools 12 | - r-conflicted 13 | - r-getopt=1.20 14 | - r-argparser=0.7 15 | - r-futile.logger=1.4 16 | - r-glue=1 17 | - r-plyr=1.8 18 | - r-dplyr=1.1 19 | - r-tibble=3.2 20 | - r-reshape2=1.4 21 | - r-rcolorbrewer=1.1_3 22 | - r-ggplot2=3.5 23 | - r-ape=5.8 24 | - r-maps=3.4 25 | - r-phytools=2 26 | - r-tidytree=0.4 27 | - bioconductor-treeio=1.26 28 | - bioconductor-ggtree=3.10 29 | - r-gridextra=2.3 30 | - r-egg=0.4 31 | -------------------------------------------------------------------------------- /testing/outputs_expected/blast/intermediate/reciprocal_blast/Chl_luteolum_DSM_273.csv: -------------------------------------------------------------------------------- 1 | WP_006365810.1,WP_011356871.1,71.429,4.53e-83,99.0,233.0 2 | WP_006367305.1,WP_011358288.1,53.004,3.83e-113,96.0,321.0 3 | WP_006367307.1,WP_011358287.1,67.257,1.85e-116,95.0,324.0 4 | 
WP_006367308.1,WP_011358286.1,79.522,0.0,100.0,505.0 5 | WP_006367309.1,WP_011358285.1,71.953,0.0,100.0,887.0 6 | WP_006367310.1,WP_041463880.1,53.147,8.67e-59,95.0,171.0 7 | WP_006367320.1,WP_011358284.1,75.35,0.0,100.0,560.0 8 | WP_006367321.1,WP_011358283.1,71.831,5.83e-144,98.0,398.0 9 | WP_006367322.1,WP_011358282.1,77.256,3.78e-157,100.0,430.0 10 | WP_006367323.1,WP_011358281.1,76.347,0.0,98.0,528.0 11 | WP_006367388.1,WP_011358173.1,53.364,2.54e-149,100.0,421.0 12 | -------------------------------------------------------------------------------- /testing/outputs_expected/blast/intermediate/fix_blank_results/Chl_luteolum_DSM_273.csv: -------------------------------------------------------------------------------- 1 | WP_006365810.1,WP_011356871.1,71.429,4.53e-83,99.0,233.0 2 | WP_006367305.1,WP_011358288.1,53.004,3.83e-113,96.0,321.0 3 | WP_006367307.1,WP_011358287.1,67.257,1.85e-116,95.0,324.0 4 | WP_006367308.1,WP_011358286.1,79.522,0.0,100.0,505.0 5 | WP_006367309.1,WP_011358285.1,71.953,0.0,100.0,887.0 6 | WP_006367310.1,WP_041463880.1,53.147,8.67e-59,95.0,171.0 7 | WP_006367320.1,WP_011358284.1,75.35,0.0,100.0,560.0 8 | WP_006367321.1,WP_011358283.1,71.831,5.83e-144,98.0,398.0 9 | WP_006367322.1,WP_011358282.1,77.256,3.78e-157,100.0,430.0 10 | WP_006367323.1,WP_011358281.1,76.347,0.0,98.0,528.0 11 | WP_006367388.1,WP_011358173.1,53.364,2.54e-149,100.0,421.0 12 | -------------------------------------------------------------------------------- /testing/outputs_expected/blast/intermediate/remove_duplicates/Chl_luteolum_DSM_273.csv: -------------------------------------------------------------------------------- 1 | WP_006365810.1,WP_011356871.1,71.429,4.53e-83,99.0,233.0 2 | WP_006367305.1,WP_011358288.1,53.004,3.83e-113,96.0,321.0 3 | WP_006367307.1,WP_011358287.1,67.257,1.85e-116,95.0,324.0 4 | WP_006367308.1,WP_011358286.1,79.522,0.0,100.0,505.0 5 | WP_006367309.1,WP_011358285.1,71.953,0.0,100.0,887.0 6 | 
WP_006367310.1,WP_041463880.1,53.147,8.67e-59,95.0,171.0 7 | WP_006367320.1,WP_011358284.1,75.35,0.0,100.0,560.0 8 | WP_006367321.1,WP_011358283.1,71.831,5.83e-144,98.0,398.0 9 | WP_006367322.1,WP_011358282.1,77.256,3.78e-157,100.0,430.0 10 | WP_006367323.1,WP_011358281.1,76.347,0.0,98.0,528.0 11 | WP_006367388.1,WP_011358173.1,53.364,2.54e-149,100.0,421.0 12 | -------------------------------------------------------------------------------- /testing/outputs_expected/blast/intermediate/fix_blank_results/Chl_ferrooxidans_KoFox.csv: -------------------------------------------------------------------------------- 1 | WP_006365810.1,WP_006365810.1,100.0,5.58e-115,100.0,313.0 2 | WP_006367305.1,WP_006367305.1,100.0,0.0,100.0,602.0 3 | WP_006367307.1,WP_006367307.1,100.0,0.0,100.0,493.0 4 | WP_006367308.1,WP_006367308.1,100.0,0.0,100.0,611.0 5 | WP_006367309.1,WP_006367309.1,100.0,0.0,100.0,1232.0 6 | WP_006367310.1,WP_006367310.1,100.0,1.25e-114,100.0,312.0 7 | WP_006367320.1,WP_006367320.1,100.0,0.0,100.0,732.0 8 | WP_006367321.1,WP_006367321.1,100.0,0.0,100.0,572.0 9 | WP_006367322.1,WP_006367322.1,100.0,0.0,100.0,555.0 10 | WP_006367323.1,WP_006367323.1,100.0,0.0,100.0,690.0 11 | WP_006367388.1,WP_006367388.1,100.0,0.0,100.0,839.0 12 | WP_006367389.1,WP_006367389.1,100.0,3.82e-90,100.0,248.0 13 | -------------------------------------------------------------------------------- /testing/outputs_expected/blast/intermediate/reciprocal_blast/Chl_ferrooxidans_KoFox.csv: -------------------------------------------------------------------------------- 1 | WP_006365810.1,WP_006365810.1,100.0,5.58e-115,100.0,313.0 2 | WP_006367305.1,WP_006367305.1,100.0,0.0,100.0,602.0 3 | WP_006367307.1,WP_006367307.1,100.0,0.0,100.0,493.0 4 | WP_006367308.1,WP_006367308.1,100.0,0.0,100.0,611.0 5 | WP_006367309.1,WP_006367309.1,100.0,0.0,100.0,1232.0 6 | WP_006367310.1,WP_006367310.1,100.0,1.25e-114,100.0,312.0 7 | WP_006367320.1,WP_006367320.1,100.0,0.0,100.0,732.0 8 | 
WP_006367321.1,WP_006367321.1,100.0,0.0,100.0,572.0 9 | WP_006367322.1,WP_006367322.1,100.0,0.0,100.0,555.0 10 | WP_006367323.1,WP_006367323.1,100.0,0.0,100.0,690.0 11 | WP_006367388.1,WP_006367388.1,100.0,0.0,100.0,839.0 12 | WP_006367389.1,WP_006367389.1,100.0,3.82e-90,100.0,248.0 13 | -------------------------------------------------------------------------------- /testing/outputs_expected/blast/intermediate/remove_duplicates/Chl_ferrooxidans_KoFox.csv: -------------------------------------------------------------------------------- 1 | WP_006365810.1,WP_006365810.1,100.0,5.58e-115,100.0,313.0 2 | WP_006367305.1,WP_006367305.1,100.0,0.0,100.0,602.0 3 | WP_006367307.1,WP_006367307.1,100.0,0.0,100.0,493.0 4 | WP_006367308.1,WP_006367308.1,100.0,0.0,100.0,611.0 5 | WP_006367309.1,WP_006367309.1,100.0,0.0,100.0,1232.0 6 | WP_006367310.1,WP_006367310.1,100.0,1.25e-114,100.0,312.0 7 | WP_006367320.1,WP_006367320.1,100.0,0.0,100.0,732.0 8 | WP_006367321.1,WP_006367321.1,100.0,0.0,100.0,572.0 9 | WP_006367322.1,WP_006367322.1,100.0,0.0,100.0,555.0 10 | WP_006367323.1,WP_006367323.1,100.0,0.0,100.0,690.0 11 | WP_006367388.1,WP_006367388.1,100.0,0.0,100.0,839.0 12 | WP_006367389.1,WP_006367389.1,100.0,3.82e-90,100.0,248.0 13 | -------------------------------------------------------------------------------- /testing/outputs_expected/blast/intermediate/fix_blank_results/Chl_phaeoferrooxidans_KB01.csv: -------------------------------------------------------------------------------- 1 | WP_006365810.1,WP_076790319.1,98.71,6.57e-114,100.0,311.0 2 | WP_006367305.1,WP_076789648.1,93.151,0.0,100.0,571.0 3 | WP_006367307.1,WP_076790657.1,95.359,3.02e-175,100.0,473.0 4 | WP_006367308.1,WP_076790659.1,97.952,0.0,100.0,603.0 5 | WP_006367309.1,WP_076790661.1,97.496,0.0,100.0,1201.0 6 | WP_006367310.1,WP_076790662.1,82.667,6.27e-95,100.0,263.0 7 | WP_006367320.1,WP_076790666.1,96.078,0.0,100.0,704.0 8 | WP_006367321.1,WP_076790668.1,93.426,0.0,100.0,543.0 9 | 
WP_006367322.1,WP_076790669.1,98.917,0.0,100.0,552.0 10 | WP_006367323.1,WP_076790671.1,97.345,0.0,100.0,674.0 11 | WP_006367388.1,WP_076792910.1,89.538,0.0,100.0,744.0 12 | WP_006367389.1,WP_076793039.1,94.915,7.5e-87,100.0,239.0 13 | -------------------------------------------------------------------------------- /testing/outputs_expected/blast/intermediate/reciprocal_blast/Chl_phaeoferrooxidans_KB01.csv: -------------------------------------------------------------------------------- 1 | WP_006365810.1,WP_076790319.1,98.71,6.57e-114,100.0,311.0 2 | WP_006367305.1,WP_076789648.1,93.151,0.0,100.0,571.0 3 | WP_006367307.1,WP_076790657.1,95.359,3.02e-175,100.0,473.0 4 | WP_006367308.1,WP_076790659.1,97.952,0.0,100.0,603.0 5 | WP_006367309.1,WP_076790661.1,97.496,0.0,100.0,1201.0 6 | WP_006367310.1,WP_076790662.1,82.667,6.27e-95,100.0,263.0 7 | WP_006367320.1,WP_076790666.1,96.078,0.0,100.0,704.0 8 | WP_006367321.1,WP_076790668.1,93.426,0.0,100.0,543.0 9 | WP_006367322.1,WP_076790669.1,98.917,0.0,100.0,552.0 10 | WP_006367323.1,WP_076790671.1,97.345,0.0,100.0,674.0 11 | WP_006367388.1,WP_076792910.1,89.538,0.0,100.0,744.0 12 | WP_006367389.1,WP_076793039.1,94.915,7.5e-87,100.0,239.0 13 | -------------------------------------------------------------------------------- /testing/outputs_expected/blast/intermediate/remove_duplicates/Chl_phaeoferrooxidans_KB01.csv: -------------------------------------------------------------------------------- 1 | WP_006365810.1,WP_076790319.1,98.71,6.57e-114,100.0,311.0 2 | WP_006367305.1,WP_076789648.1,93.151,0.0,100.0,571.0 3 | WP_006367307.1,WP_076790657.1,95.359,3.02e-175,100.0,473.0 4 | WP_006367308.1,WP_076790659.1,97.952,0.0,100.0,603.0 5 | WP_006367309.1,WP_076790661.1,97.496,0.0,100.0,1201.0 6 | WP_006367310.1,WP_076790662.1,82.667,6.27e-95,100.0,263.0 7 | WP_006367320.1,WP_076790666.1,96.078,0.0,100.0,704.0 8 | WP_006367321.1,WP_076790668.1,93.426,0.0,100.0,543.0 9 | WP_006367322.1,WP_076790669.1,98.917,0.0,100.0,552.0 10 | 
WP_006367323.1,WP_076790671.1,97.345,0.0,100.0,674.0 11 | WP_006367388.1,WP_076792910.1,89.538,0.0,100.0,744.0 12 | WP_006367389.1,WP_076793039.1,94.915,7.5e-87,100.0,239.0 13 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.py[cod] 2 | 3 | # C extensions 4 | *.so 5 | 6 | # Packages 7 | *.egg 8 | *.egg-info 9 | dist 10 | build 11 | eggs 12 | parts 13 | bin 14 | var 15 | sdist 16 | develop-eggs 17 | .installed.cfg 18 | lib 19 | lib64 20 | __pycache__ 21 | 22 | # Installer logs 23 | pip-log.txt 24 | 25 | # Unit test / coverage reports 26 | .coverage 27 | .tox 28 | nosetests.xml 29 | 30 | # Translations 31 | *.mo 32 | 33 | # Mr Developer 34 | .mr.developer.cfg 35 | .project 36 | .pydevproject 37 | 38 | # R/RStudio 39 | .Rhistory 40 | .RData 41 | 42 | # OSX 43 | .DS_Store 44 | 45 | # NCBI BLAST 46 | *.phr 47 | *.pin 48 | *.pog 49 | *.psd 50 | *.psi 51 | *.psq 52 | *.nhr 53 | *.nin 54 | *.nog 55 | *.nsd 56 | *.nsi 57 | *.nsq 58 | 59 | # BackBLAST 60 | tempQuery.faa 61 | BackLog*.txt 62 | .idea 63 | 64 | # Snakemake 65 | .snakemake 66 | 67 | # Testing 68 | *.log 69 | *.benchmark.txt 70 | -------------------------------------------------------------------------------- /testing/inputs/genome_metadata.tsv: -------------------------------------------------------------------------------- 1 | subject_name plotting_name 2 | Cba_parvum_NCIB_8327 Cba. parvum NCIB8327 3 | Che_thalassium_ATCC_35110 Che. thalassium ATCC35110 4 | Chl_chlorochromatii_CaD3 Chl. chlorochromatii CaD3 5 | Chl_clathratiforme_BU_1 Chl. clathratiforme BU1 6 | Chl_ferrooxidans_KoFox Chl. ferrooxidans KoFox 7 | Chl_limicola_DSM_245 Chl. limicola DSM245 8 | Chl_luteolum_DSM_273 Chl. luteolum DSM273 9 | Chl_phaeobacteroides_BS1 Chl. phaeobacteroides BS1 10 | Chl_phaeobacteroides_DSM_266 Chl. phaeobacteroides DSM266 11 | Chl_phaeoferrooxidans_KB01 Chl. 
phaeoferrooxidans KB01 12 | Chl_phaeovibrioides_DSM_265 Chl. phaeovibrioides DSM265 13 | Chl_sp_N1 Chl. sp. N1 14 | Chl_tepidum_TLS Chl. tepidum TLS 15 | Ignavibacterium_album_JCM_16511_outgroup Ignavibacterium album JCM16511 (outgroup) 16 | L227_2013_bin22 L227 2013 bin22 17 | L227_2013_bin55 L227 2013 bin55 18 | L227_2013_bin56 L227 2013 bin56 19 | L227_enr_S_6D_bin01 L227 enr. S-6D bin01 20 | L304_enr_S_6D_bin01 L304 enr. S-6D bin01 21 | L442_2011_2014_bin64 L442 2011 2014 bin64 22 | Ptc_aestuarii_DSM_271 Ptc. aestuarii DSM271 23 | -------------------------------------------------------------------------------- /scripts/remove_duplicates.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -euo pipefail 3 | # remove_duplicates.sh 4 | # Copyright Lee H. Bergstrand and Jackson M. Tsuji, 2025 5 | # A simple script that removes multiple hits from BackBLAST results 6 | # Part of the BackBLAST pipeline 7 | 8 | SCRIPT_NAME=${0##*/} 9 | 10 | # Require exactly one argument; print the help statement for -h/--help 11 | if [[ $# -ne 1 ]]; then 12 | echo "Incorrect number of arguments provided. Please run '-h' or '--help' to see help. Exiting..." >&2 13 | exit 1 14 | elif [[ $1 = "-h" ]] || [[ $1 = "--help" ]]; then 15 | printf "${SCRIPT_NAME}: A simple script that removes multiple hits from BackBLAST results.\n" 16 | printf "Copyright Lee H. Bergstrand and Jackson M. Tsuji, 2025\n\n" 17 | printf "Usage: ${SCRIPT_NAME} input.csv > output.csv\n\n" 18 | printf "Note: log information is printed to STDERR.\n\n" 19 | exit 0 20 | fi 21 | 22 | # Trace each executed command to STDERR (xtrace) so it appears in the log 23 | set -x 24 | 25 | # Get input from user 26 | input=$1 27 | 28 | echo "[ $(date -u) ]: Removing duplicate hits from ${input}" >&2 29 | 30 | # Keep one line per unique first comma-separated field; prints output to STDOUT. The path is quoted so spaces or glob characters in it are not split or expanded. 31 | sort -k 1,1 -t , -u "${input}" 32 | 33 | echo "[ $(date -u) ]: ${SCRIPT_NAME}: done." 
>&2 34 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2025 Lee H. Bergstrand and Jackson M. Tsuji 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of 6 | this software and associated documentation files (the "Software"), to deal in 7 | the Software without restriction, including without limitation the rights to 8 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 9 | the Software, and to permit persons to whom the Software is furnished to do so, 10 | subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 17 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 18 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 19 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 20 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 21 | -------------------------------------------------------------------------------- /testing/outputs_expected/heatmap/BackBLAST_heatmap.tsv: -------------------------------------------------------------------------------- 1 | subject_name qseqid sseqid pident evalue qcovhsp bitscore 2 | Chl. ferrooxidans KoFox bacterioferritin WP_006365810.1 100 5.58e-115 100 313 3 | Chl. ferrooxidans KoFox cysI WP_006367305.1 100 0 100 602 4 | Chl. ferrooxidans KoFox cysH WP_006367307.1 100 0 100 493 5 | Chl. ferrooxidans KoFox cysD WP_006367308.1 100 0 100 611 6 | Chl. ferrooxidans KoFox cysN WP_006367309.1 100 0 100 1232 7 | Chl. 
ferrooxidans KoFox cysG WP_006367310.1 100 1.25e-114 100 312 8 | Chl. ferrooxidans KoFox cysA WP_006367320.1 100 0 100 732 9 | Chl. ferrooxidans KoFox cysW WP_006367321.1 100 0 100 572 10 | Chl. ferrooxidans KoFox cysT WP_006367322.1 100 0 100 555 11 | Chl. ferrooxidans KoFox cysP WP_006367323.1 100 0 100 690 12 | Chl. ferrooxidans KoFox cyc2 WP_006367388.1 100 0 100 839 13 | Chl. luteolum DSM273 bacterioferritin WP_011356871.1 71.429 4.53e-83 99 233 14 | Chl. luteolum DSM273 cysI WP_011358288.1 53.004 3.83e-113 96 321 15 | Chl. luteolum DSM273 cysH WP_011358287.1 67.257 1.85e-116 95 324 16 | Chl. luteolum DSM273 cysD WP_011358286.1 79.522 0 100 505 17 | Chl. luteolum DSM273 cysN WP_011358285.1 71.953 0 100 887 18 | Chl. luteolum DSM273 cysG WP_041463880.1 53.147 8.67e-59 95 171 19 | Chl. luteolum DSM273 cysA WP_011358284.1 75.35 0 100 560 20 | Chl. luteolum DSM273 cysW WP_011358283.1 71.831 5.83e-144 98 398 21 | Chl. luteolum DSM273 cysT WP_011358282.1 77.256 3.78e-157 100 430 22 | Chl. luteolum DSM273 cysP WP_011358281.1 76.347 0 98 528 23 | Chl. luteolum DSM273 cyc2 WP_011358173.1 53.364 2.54e-149 100 421 24 | Chl. phaeoferrooxidans KB01 bacterioferritin WP_076790319.1 98.71 6.57e-114 100 311 25 | Chl. phaeoferrooxidans KB01 cysI WP_076789648.1 93.151 0 100 571 26 | Chl. phaeoferrooxidans KB01 cysH WP_076790657.1 95.359 3.02e-175 100 473 27 | Chl. phaeoferrooxidans KB01 cysD WP_076790659.1 97.952 0 100 603 28 | Chl. phaeoferrooxidans KB01 cysN WP_076790661.1 97.496 0 100 1201 29 | Chl. phaeoferrooxidans KB01 cysG WP_076790662.1 82.667 6.27e-95 100 263 30 | Chl. phaeoferrooxidans KB01 cysA WP_076790666.1 96.078 0 100 704 31 | Chl. phaeoferrooxidans KB01 cysW WP_076790668.1 93.426 0 100 543 32 | Chl. phaeoferrooxidans KB01 cysT WP_076790669.1 98.917 0 100 552 33 | Chl. phaeoferrooxidans KB01 cysP WP_076790671.1 97.345 0 100 674 34 | Chl. phaeoferrooxidans KB01 cyc2 WP_076792910.1 89.538 0 100 744 35 | Chl. sp. 
N1 cyc2 WP_131354685.1 57.178 5.41e-158 98 442 36 | Ignavibacterium album JCM16511 (outgroup) cysA WP_014560057.1 44.558 2.72e-82 79 246 37 | -------------------------------------------------------------------------------- /testing/outputs_expected/blast/combine_blast_tables/blast_tables_combined.csv: -------------------------------------------------------------------------------- 1 | subject_name,qseqid,sseqid,pident,evalue,qcovhsp,bitscore 2 | Chl_ferrooxidans_KoFox,WP_006365810.1,WP_006365810.1,100,5.58e-115,100,313 3 | Chl_ferrooxidans_KoFox,WP_006367305.1,WP_006367305.1,100,0,100,602 4 | Chl_ferrooxidans_KoFox,WP_006367307.1,WP_006367307.1,100,0,100,493 5 | Chl_ferrooxidans_KoFox,WP_006367308.1,WP_006367308.1,100,0,100,611 6 | Chl_ferrooxidans_KoFox,WP_006367309.1,WP_006367309.1,100,0,100,1232 7 | Chl_ferrooxidans_KoFox,WP_006367310.1,WP_006367310.1,100,1.25e-114,100,312 8 | Chl_ferrooxidans_KoFox,WP_006367320.1,WP_006367320.1,100,0,100,732 9 | Chl_ferrooxidans_KoFox,WP_006367321.1,WP_006367321.1,100,0,100,572 10 | Chl_ferrooxidans_KoFox,WP_006367322.1,WP_006367322.1,100,0,100,555 11 | Chl_ferrooxidans_KoFox,WP_006367323.1,WP_006367323.1,100,0,100,690 12 | Chl_ferrooxidans_KoFox,WP_006367388.1,WP_006367388.1,100,0,100,839 13 | Chl_ferrooxidans_KoFox,WP_006367389.1,WP_006367389.1,100,3.82e-90,100,248 14 | Chl_luteolum_DSM_273,WP_006365810.1,WP_011356871.1,71.429,4.53e-83,99,233 15 | Chl_luteolum_DSM_273,WP_006367305.1,WP_011358288.1,53.004,3.83e-113,96,321 16 | Chl_luteolum_DSM_273,WP_006367307.1,WP_011358287.1,67.257,1.85e-116,95,324 17 | Chl_luteolum_DSM_273,WP_006367308.1,WP_011358286.1,79.522,0,100,505 18 | Chl_luteolum_DSM_273,WP_006367309.1,WP_011358285.1,71.953,0,100,887 19 | Chl_luteolum_DSM_273,WP_006367310.1,WP_041463880.1,53.147,8.67e-59,95,171 20 | Chl_luteolum_DSM_273,WP_006367320.1,WP_011358284.1,75.35,0,100,560 21 | Chl_luteolum_DSM_273,WP_006367321.1,WP_011358283.1,71.831,5.83e-144,98,398 22 | 
Chl_luteolum_DSM_273,WP_006367322.1,WP_011358282.1,77.256,3.78e-157,100,430 23 | Chl_luteolum_DSM_273,WP_006367323.1,WP_011358281.1,76.347,0,98,528 24 | Chl_luteolum_DSM_273,WP_006367388.1,WP_011358173.1,53.364,2.54e-149,100,421 25 | Chl_phaeoferrooxidans_KB01,WP_006365810.1,WP_076790319.1,98.71,6.57e-114,100,311 26 | Chl_phaeoferrooxidans_KB01,WP_006367305.1,WP_076789648.1,93.151,0,100,571 27 | Chl_phaeoferrooxidans_KB01,WP_006367307.1,WP_076790657.1,95.359,3.02e-175,100,473 28 | Chl_phaeoferrooxidans_KB01,WP_006367308.1,WP_076790659.1,97.952,0,100,603 29 | Chl_phaeoferrooxidans_KB01,WP_006367309.1,WP_076790661.1,97.496,0,100,1201 30 | Chl_phaeoferrooxidans_KB01,WP_006367310.1,WP_076790662.1,82.667,6.27e-95,100,263 31 | Chl_phaeoferrooxidans_KB01,WP_006367320.1,WP_076790666.1,96.078,0,100,704 32 | Chl_phaeoferrooxidans_KB01,WP_006367321.1,WP_076790668.1,93.426,0,100,543 33 | Chl_phaeoferrooxidans_KB01,WP_006367322.1,WP_076790669.1,98.917,0,100,552 34 | Chl_phaeoferrooxidans_KB01,WP_006367323.1,WP_076790671.1,97.345,0,100,674 35 | Chl_phaeoferrooxidans_KB01,WP_006367388.1,WP_076792910.1,89.538,0,100,744 36 | Chl_phaeoferrooxidans_KB01,WP_006367389.1,WP_076793039.1,94.915,7.5e-87,100,239 37 | Chl_sp_N1,WP_006367388.1,WP_131354685.1,57.178,5.41e-158,98,442 38 | Ignavibacterium_album_JCM_16511_outgroup,WP_006367320.1,WP_014560057.1,44.558,2.72e-82,79,246 39 | -------------------------------------------------------------------------------- /snakemake/template_config.yaml: -------------------------------------------------------------------------------- 1 | # Config file for running the BackBLAST pipeline 2 | 3 | ## Execution settings 4 | # Total threads to be used for all processes 5 | threads: 1 6 | 7 | 8 | ## Phylogenetic tree 9 | # Path to the phylogenetic tree file (newick format) corresponding to the subject genomes 10 | # Subject names in the tree must match subject names provided under 'subjects' below 11 | # OR specify 'subjects' to auto-generate a phylogenetic 
tree using the subject genomes via GToTree (see GToTree params below) 12 | # OR specify 'NA' to skip tree generation and plot the heatmap only 13 | phylogenetic_tree_newick: NA 14 | # Optional: set the cutoff for displayed bootstrap values. Anything greater than or equal to the cutoff will be shown. 15 | bootstrap_cutoff: 80 16 | # Optional: if you want to re-root your tree, then give the exact name of the tip you want to re-root by. 17 | # 'midpoint' will midpoint root the tree. 'NA' will keep the current tree topology. 18 | root_name: NA 19 | 20 | 21 | ## Options for GToTree, if you want to generate the phylogenetic tree within the BackBLAST pipeline 22 | ## For these options to be used, set 'phylogenetic_tree_newick' above to 'subjects' 23 | # Gene marker set to use for making the tree 24 | gtotree_phylogenetic_model: "Universal-Hug-et-al" 25 | # Keeps gene hits within this proportional threshold of the median sequence length 26 | gtotree_sequence_length_threshold: 0.2 27 | # Keeps genomes with this proportion of hits from the gene marker set 28 | gtotree_minimum_hit_fraction: 0.5 29 | 30 | 31 | ## Plot settings 32 | # Optional: Tab-separated file containing the 'subject_name' of each plotted genome, plus the 'plotting_name' that you desire to appear 33 | # on the final plot. 34 | # See testing/inputs/genome_metadata.tsv 35 | genome_metadata_tsv: NA 36 | # Optional: Tab-separated file containing the accession of each query gene and its corresponding gene name. 37 | # Should have two columns: 'qseqid' with the accession and 'gene_name' with the corresponding gene name. 38 | # Genes will be plotted in the order specified in this TSV file. 
39 | # See testing/inputs/gene_metadata.tsv 40 | gene_metadata_tsv: NA 41 | plot_width_mm: 400 42 | plot_height_mm: 200 43 | 44 | 45 | ## BLAST settings 46 | # Only hits with an e-value below this number will be kept 47 | e_value_cutoff: 1e-40 48 | # Only hits with percent amino acid identity above this number will be kept 49 | minimum_percent_identity: 25 50 | # Only hits with percent query coverage above this number will be kept 51 | minimum_query_coverage: 50 52 | 53 | ## BLAST input files 54 | # Query files 55 | # query_genes contains the genes you want to do a reciprocal BLAST search with. 56 | # query_genome_orfs contains all predicted ORFs from the query genome (for reciprocal BLAST purposes) 57 | query_genes: /path/to/query_gene_targets.faa 58 | query_genome_orfs: /path/to/query.faa 59 | 60 | # Subject files for BLAST (the name you specify will be plotted). 61 | # These should be ORF predictions from the genomes of the organisms of interest. 62 | # If you want to plot the query genome as well, then include it here. 63 | subjects: 64 | -------------------------------------------------------------------------------- /scripts/combine_tables.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | # combine_tables.R 4 | # Copyright Lee H. Bergstrand and Jackson M. 
Tsuji, 2025 5 | # Combines BLAST output (CSV) tables into a single output table with headers 6 | # Part of the BackBLAST pipeline 7 | 8 | # Load libraries 9 | library(getopt) 10 | library(futile.logger) 11 | library(tools) 12 | library(plyr) 13 | library(dplyr, warn.conflicts = FALSE) 14 | 15 | # Column names assigned positionally to each headerless input table 16 | HEADER_NAMES <- c("qseqid", "sseqid", "pident", "evalue", "qcovhsp", "bitscore") 17 | 18 | # Function to split command line arguments into input_filenames (all but the last) and output_filename (the last); prints usage and quits with status 1 if fewer than two arguments are given 19 | parse_command_line_input <- function(commandArgs) { 20 | 21 | if (length(commandArgs) < 2) { 22 | cat("combine_tables.R: Combines BLAST tables and makes new column for sample ID based on the filenames.\n") 23 | cat("Copyright Lee Bergstrand and Jackson M. Tsuji, 2025\n") 24 | cat("Part of the BackBLAST pipeline.\n\n") 25 | 26 | cat("Usage: combine_tables.R subject1.csv subject2.csv ... subjectN.csv combined_blast_tables.csv\n\n") 27 | 28 | cat("Details:\n", 29 | "subject1.csv, etc.: CSV BLAST tables for all individual samples (subjects for BLAST). Name should be the subject name [Required]\n", 30 | "combined_blast_tables.csv: the output CSV file. [Required]\n\n") 31 | 32 | quit(status = 1) 33 | } 34 | 35 | num_args <- length(commandArgs) 36 | 37 | # Return output params as a list 38 | params <- list() 39 | params$input_filenames <- commandArgs[1:(num_args-1)] 40 | params$output_filename <- commandArgs[num_args] 41 | 42 | return(params) 43 | } 44 | 45 | # Function to load one headerless CSV table, assign header_names as its column names, and add a subject_name column parsed from the filename 46 | load_individual_table <- function(table_filename, header_names) { 47 | flog.info(paste("Reading table '", table_filename, "'", sep = "")) 48 | 49 | # Read the headerless table and assign column names 50 | data_table <- read.table(table_filename, header = FALSE, sep = ",", stringsAsFactors = FALSE, comment.char = "") 51 | colnames(data_table) <- header_names 52 | 53 | # Remove CSV ending and folder path. Set to subject_name. 
54 | subject_name <- tools::file_path_sans_ext(basename(table_filename)) 55 | 56 | # Add new columns to table 57 | # TODO - consider also adding query_name 58 | # data_table$query_name <- query_name 59 | flog.info(paste("Adding subject_name of '", subject_name, "'", sep = "")) 60 | data_table$subject_name <- subject_name 61 | 62 | # Re-order columns to be more user friendly 63 | data_table <- dplyr::select(data_table, subject_name, everything()) 64 | 65 | return(data_table) 66 | 67 | } 68 | 69 | main <- function() { 70 | # Parse command line input 71 | params <- parse_command_line_input(commandArgs(trailingOnly = TRUE)) 72 | 73 | # Startup messages 74 | flog.info("Running combine_tables.R") 75 | flog.info(paste("Input tables: ", length(params$input_filenames), " total", sep = "")) 76 | flog.info(paste("Output table: '", params$output_filename, "'", sep = "")) 77 | 78 | # Load all tables 79 | flog.info("Loading BLAST tables") 80 | blast_tables <- lapply(params$input_filenames, function(x) { load_individual_table(x, HEADER_NAMES) }) 81 | 82 | flog.info("Combining BLAST tables") 83 | output_table <- dplyr::bind_rows(blast_tables) 84 | 85 | flog.info("Writing combining BLAST table to file (**with headers**)") 86 | write.table(output_table, file = params$output_filename, sep = ",", row.names = FALSE, 87 | col.names = TRUE, quote = FALSE) 88 | 89 | flog.info("combine_tables.R: Done.") 90 | } 91 | 92 | main() 93 | -------------------------------------------------------------------------------- /testing/inputs/config.yaml: -------------------------------------------------------------------------------- 1 | # Config file for running the BackBLAST pipeline 2 | 3 | ## Execution settings 4 | # Total threads to be used for all processes 5 | threads: 4 6 | 7 | 8 | ## Phylogenetic tree 9 | # Path to the phylogenetic tree file (newick format) corresponding to the subject genomes 10 | # Subject names in the tree must match subject names provided under 'subjects' below 11 | # OR 
specify 'subjects' to auto-generate a phylogenetic tree using the subject genomes via GToTree (see GToTree params below) 12 | # OR specify 'NA' to skip tree generation and plot the heatmap only 13 | phylogenetic_tree_newick: "../inputs/riboprotein_tree.treefile" 14 | # Optional: set the cutoff for displayed bootstrap values. Anything greater than or equal to the cutoff will be shown. 15 | bootstrap_cutoff: 50 16 | # Optional: if you want to re-root your tree, then give the exact name of the tip you want to re-root by. Otherwise, 'NA'. 17 | root_name: "Ignavibacterium_album_JCM_16511_outgroup" 18 | 19 | 20 | ## Options for GToTree, if you want to generate the phylogenetic tree within the BackBLAST pipeline 21 | ## For these options to be used, set 'phylogenetic_tree_newick' above to 'subjects' 22 | # Gene marker set to use for making the tree 23 | gtotree_phylogenetic_model: "Universal_Hug_et_al.hmm" 24 | # Keeps gene hits within this proportional threshold of the median sequence length 25 | gtotree_sequence_length_threshold: 0.2 26 | # Keeps genomes with this proportion of hits from the gene marker set 27 | gtotree_minimum_hit_fraction: 0.5 28 | 29 | 30 | ## Plot settings 31 | # Optional: Tab-separated file containing the 'subject_name' of each plotted genome, plus the 'plotting_name' that you desire to appear 32 | # on the final plot. 33 | # See ../testing/inputs/genome_metadata.tsv 34 | genome_metadata_tsv: "../inputs/genome_metadata.tsv" 35 | # Optional: Tab-separated file containing the accession of each query gene and its corresponding gene name. 36 | # Should have two columns: 'qseqid' with the accession and 'gene_name' with the corresponding gene name. 37 | # Genes will be plotted in the order specified in this TSV file. 
38 | # See ../testing/inputs/gene_metadata.tsv 39 | gene_metadata_tsv: "../inputs/gene_metadata.tsv" 40 | plot_width_mm: 250 41 | plot_height_mm: 70 42 | 43 | 44 | ## BLAST settings 45 | # Only hits with an e-value below this number will be kept 46 | e_value_cutoff: 1e-40 47 | # Only hits with percent amino acid identity above this number will be kept 48 | minimum_percent_identity: 25 49 | # Only hits with percent query coverage above this number will be kept 50 | minimum_query_coverage: 50 51 | 52 | 53 | ## BLAST input files 54 | # Query files 55 | # query_genes contains the genes you want to do a reciprocal BLAST search with. 56 | # query_genome_orfs contains all predicted ORFs from the query genome (for reciprocal BLAST purposes) 57 | query_genes: "../inputs/Chl_ferrooxidans_KoFox_gene_targets.faa" 58 | query_genome_orfs: "../inputs/Chl_ferrooxidans_KoFox.faa" 59 | 60 | # Subject files for BLAST (the name you specify will be plotted). 61 | # These should be ORF predictions from the genomes of the organisms of interest. 62 | # If you want to plot the query genome as well, then include it here. 63 | subjects: 64 | Chl_ferrooxidans_KoFox: "../inputs/subjects/Chl_ferrooxidans_KoFox.faa" 65 | Chl_luteolum_DSM_273: "../inputs/subjects/Chl_luteolum_DSM_273.faa" 66 | Chl_phaeoferrooxidans_KB01: "../inputs/subjects/Chl_phaeoferrooxidans_KB01.faa" 67 | Chl_sp_N1: "../inputs/subjects/Chl_sp_N1.faa" 68 | Ignavibacterium_album_JCM_16511_outgroup: "../inputs/subjects/Ignavibacterium_album_JCM_16511_outgroup.faa" 69 | -------------------------------------------------------------------------------- /testing/inputs/subjects/Chl_sp_N1.faa: -------------------------------------------------------------------------------- 1 | >WP_131353783.1 NAD-dependent epimerase [Chlorobium sp. 
N1] 2 | MHVLVTGAAGFIGSGVSRRLLERGDRVIGVDNMNDYYEVSLKEARLGKLTPHENFEFVKADISDRSAMEELFEKGKFDRVVNLAAQAGVRYSIENPHSYIESNIVGFINILEGCRHHNVEHLVYASSSSVYGANETMPFSVHDNVDHPLSLYAASKKANELMAHTYSHLYRLPTTGLRFFTVYGPWGRPDMALFLFTDAILKGKPIKVFNYGKHRRDFTYIDDIVEGVIRTLDHVARPNPDWSGLKPDPGSSRAPWRVYNIGNSNPVELMDYIGALERELGRTAEKEMLPLQPGDVPDTYADVEQLKEDVAYQPSTTVDEGVRRFVRWYREYYGIAE 3 | >WP_131353785.1 circularly permuted type 2 ATP-grasp protein [Chlorobium sp. N1] 4 | MKRFFDAYDAEASRYYDEVIAREGGARSHYEKLLRRFAMFSHEDVKTRREMANIFFRNQGITFTVYGLEEGIERIFPFDLIPRIIPSAEWSLIERGLTQRITALNLFLYDIYHTQKILRDKVIPAELVLGSPHFRREFVGVTPPLGIYVHVVGSDIIRDREGRYLVLEDNLRTPSGVSYMLQNRQAMKRAFPVLFSQYKVRPVEDYPQELLRTLQEASSSVPGDPNVVVLTPGIYNSAYFEHSFLARQMGVELTEGRDLVVNDNIVYKRTTRGLERVDVIYRRIDDAFLDPLCFRPDSKLGVAGLINAYRKGNVTLANAIGTGIADDKVIYSFVPKMIRYYLGEDPVLENVPTWLPGNPWEMKYILEHLGELVVKSANESGGYGMLVGPESTLEEQERFRELIVSNPRNYIAQPTISLSRHPSVLNDSELYGCHIDLRPYVLTGKKTTIVPGGLTRVALKRGSLVVNSSQGGGSKDTWVVDE 5 | >WP_131353787.1 alpha-E domain-containing protein [Chlorobium sp. N1] 6 | MLSRVAESLFWMSRYVERAENTARFLDVNFNLLLDLNRITTVESPECWIALILVTSDRERFESAYGDYGAEPVTDYLVFNRQNPNSIISCIGMARENARSVIESISSEMWEQINNLYHYLQSATPQSVQNDPFSFYREIKNASHLFQGITDSTFPRSEGWDFMQVAKYLERADNTARLVDVKYHMLETTRQEHEEVVSRSVDIIEWMAVLKSCSALEAFRKRYLARIEPRSIMEFLILDRSFPRSINFSVCAAKEALWRLSGGSGGRSLNSADRLIGKMEAELSFTTMDDISDTGLHHYLERLERGIREVGEEAHLTYFASHTPEIEPFPDKDRNLPATRRKAEWREAQQQQQQAYCP 7 | >WP_131353789.1 transglutaminase family protein [Chlorobium sp. N1] 8 | MILKVEHRTIFEYDAPIYETATEVRLHPHTGQGSTQRCTSFTLQVDPPAPIFEYTDFYGNRVHHFNLLRSHRRVAILGTSVVETGTARSGREEHEEILLLDFLAESRFAHFDPAVREFAAGIPSADGPEAQAEALCRKINDTFTYEPGATDVHSTSSVVMALGRGVCQDFAHVMLAGCRSLGLPARYVSGYLFGGSTPEGHDEASHAWCEVWCGTEKGWVGMDPTHSTIMADERYIRIGSGRDYGDVPPVRGTWKGSATETLGVSVRVSSLS 9 | >WP_131353791.1 cytochrome c5 family protein [Chlorobium sp. N1] 10 | MRILRASAAFALAFSLSQSFPLAEPALAAEHDLARGESIYGSTCSTCHNSGIMGAPKPGDKAAWVDRIAGGFDLMVAHSLNGFNGMPARGGRSSLTDTDVADAVAFMVSKSR 11 | >WP_131353793.1 tRNA guanosine(34) transglycosylase Tgt [Chlorobium sp. 
N1] 12 | MSFRLETTDPRSAARTGTLSTGHGEIRTPVFMPVGTRASVKSVEPHELLEQEAQIILANTYHLYLKPGNHIIAKAGGVHRFMDWQGPLLTDSGGYQVYSLSELRKITEEGVAFKSHLDGSMLQFTPENVVDTQRIIGSDIMMPLDECPSATAEREYIRKSGELTIRWAERARRRFLQTEPLYGHPQMLFAITQGGTHADLRTISAKALVELDFEGYAIGGMAVGEPAEEMYRILELSHTMLPEHKPRYLMGVGTPENILNAIERGVDMFDCVIPTREGRNGRVYTRKGKMNLRSAKYAEDFRPLDEGFESHVSGHYSRAYIRHLLNVGEILGLKLCTLQNIAFFMWLTRTARQEIEKGSFLEWKAAVLEEFNRGEA 13 | >WP_131353795.1 30S ribosomal protein S12 methylthiotransferase RimO [Chlorobium sp. N1] 14 | MDTGAKTTVFLLTLGCSKNTVDSERLQAQAEAAGVRFTEQAGEAEAILINTCGFIEDAKEESIAEILSAVELKEAGTVRKVYVMGCLTELYRNELQEELPEVDGFFGTRELPAVLEALGTRYRLELHDHRSISEPRHSSYLKIAEGCSRSCSFCSIPKIRGRYQSQSMDQLLREARLLQEKGVRELSLIAQDITLYGVDLYGRQMLNELLLRLSDMGFDWLRLLYAYPLGFPLEVIDTMRRRKNVCNYLDLPLQHASDPLLRSMNRGITGGESRRLIEEIRERNPEIRLRTTMIAGYPGESRREFEEMLRFVGDVGFDRLGCFPYRHEEHSPAYRLRDDVPDKEKEERVRELMELQEGIAEKKNAAFEGEELRILVDREEEGEEGLLLVGRTEYDAPEVDNDCLLDPAGLTALPGTFVKARISASDAYELHGTLTAAEG 15 | >WP_131353797.1 shikimate kinase [Chlorobium sp. N1] 16 | MERAVEQKEHSLIFLTGFSGSGKSTIGPLLANSLGYEFLDVDQEVERRTGKSITAIFSDEGEAAFRELELQAIRSVAGEREKVVSLGGGALEYDPSHEFISRAGTLVYLKSSPSSLARRLVNKRDRPLLRAEGGRRHSREELEEKIRLILEEREPRYLQAAVTVHTDQKRIGSTVEELTRKIERLVRKASAADAQHQQQQK 17 | >WP_131353799.1 3-dehydroquinate synthase [Chlorobium sp. N1] 18 | MSTIIVSGAATAGLRERFRAHGLKQKTVVLFDTNTRKLFGEKLLSAMKAEGFTVLELTVPSRENSKSFSTAFKLYGRMIEAGVDRSWNLLAVGGGVVGDLGGFIAASYYRGIPVIQLPTTLLAMTDSSIGGKVAVNHPLGKNLIGFFHLPELVLIDPSYLRTLPAREIYSGLSEVVKYGFIADRAMFERLDEHFEEVTALEEPYLTEAVRRSAEIKEAVVEADFREMSGLRATLNFGHTFAHGLEKLADYRFIRHGEAVTIGMACALSLSNHLGYLDEASLERGLALIRKFRFPRGLVEKRFLSIEAPVLLENMLSDKKKLDSRLRFVLLRALGEAFLLEEDVEDREVLRAIESAKDFFRNPRSC 19 | >WP_131354685.1 hypothetical protein [Chlorobium sp. 
N1] 20 | MQFKHLYRVLAGGAVSALLFATPTEVSATPVFARQTGMSCAACHFQRYPMLNAFGREFKVRGYTMKGAQETIEDKNLSIPSTLNFGLVSKLRYQKRNGSATTGTPGMLNDGEFQIPDEAFLSVGGRVAQNIGFQAELNLANVATNGVFAAFKMPFVFEPTPDIVANIVPFTTDAQGPSYGFELLNTGAVRFNRAFEHRNETSAQQYLGTATQATGASFALYNKLGFLTYAPYVQRASNADPAFSSGEMLSYLRGAVTPTVGGWDLAAGFQLWNGSSKLEVAGTETPTSADVWALDAQAQGNVGKLPLGVYVTYGSAAASSAGDTNVYNTNTAGDKTALTLAAELGVIPYKLTVGAAYRNASSGASASDAEDAVTLGLNYMPAKNVTLQVNHSFYSYDASLAAAGDNLTTLMLYSAF 21 | -------------------------------------------------------------------------------- /testing/inputs/Chl_ferrooxidans_KoFox_gene_targets.faa: -------------------------------------------------------------------------------- 1 | >WP_006365810.1 bacterioferritin [Chlorobium ferrooxidans] 2 | MKGNPKIIEKLNSLLADELTAINQYIVHSEMCSNWGYEKLHNADEKRAIDEMKHAEKLIARILFLEGIPVVSDLKKIKIGADVAAQHKNDRHSEDGAIASYNDGIRLAVEAGDNGTRELLESILRDEEAHIDWLEAQIDQISQMGIQNYLVEQLG 3 | >WP_006367305.1 4Fe-4S dicluster domain-containing protein [Chlorobium ferrooxidans] 4 | MSEHELDLAALSKVGMMRQKQPDYYVMRLKAVAGDMTASQLACVAGVAEKYGRGFVHLSTRQGIEIHYVHRDHLETARLELQGAGIEMGACGPRVRVIVACPGEETCRWGNIDTKKIARELDRRYFKQETPSKFKLAVTGCSNNCTKANENDIGVRGAIEPKWEAGECNDCGLCVSLCPVSAIERRDSDEGYCYAIDEQLCINCSVCTSLCPSNSWVIGRKGYTLYIGGTMGRVPRFASVLKKMVESEEELYLLIEKAIALYREKGRKKERFGHMIDRIGLEAVKEELLAKP 5 | >WP_006367307.1 phosphoadenylyl-sulfate reductase [Chlorobium ferrooxidans] 6 | MKREKIEQLNIELAAKSPEEILRAAVGCFGKENMAFASSFGAEDQVLTDMLHREKLPVPVFTLDTGRLHQETYDLFDATRKHYGIEIEALFPETAAVESMLALHGPNLFYESIEKRRECCRIRKVQPLTKKLSTVTAWICGLRREQSVTRTAVAPIEWDAAFGLYKINPLAEVSEAWVWEYIKQHNVPFNRLHNEGYPSIGCAPCTRAVKPGEDLRAGRWWWEIAEHRECGLHREKQ 7 | >WP_006367308.1 sulfate adenylyltransferase subunit 2 [Chlorobium ferrooxidans] 8 | MDHLDKLEAQSVYILREAYREFKSLCMLWSIGKDSTVMLWLARKAFFGHVPIPLVHVDTHYKIPEMIEYRDRLALEWNLNMIYGENREALERKLTFPDGNTDRLTCCKYLKSEALKHTLSGEWPRYRMDHELKHYVQDEGKEPYTGVIAGVRADEEGSRSKERYFSPRDKDNVWDVGDQPPEFWNQFKTDFAPGTHVRIHPLLDWTELNIWEYIEREKISVVSLYFNQGKGIRYRSLGCYPCTNPVESEARTLPEIIDELRSGKFSNIAERSGRAQDSEDGGGLETLRRDGYM 9 | >WP_006367309.1 GTP-binding 
protein [Chlorobium ferrooxidans] 10 | MNGRSQMNIVIVGHVDHGKSTVIGRLLADTGTLPQGKLESVRESCRKNSKPFEYAFLLDALKDEQAQGITIDMARCFFKTEKRDYIIIDAPGHIEFLKNMITGASRAEAALLVIDAHEGIQENSKRHGHMVSMLGVKHVTVLVNKMDLAGYSEEVFEQLKKEYTAFLAQIKVTPVSFIPVSAREGDNIASLSDRMEWYKGATVLEQLDRFVSSKELQELPFRFPVQDIYKFTRSDDDRRIVAGSVAAGSVQVGDEVLFLPSGKHSVIQSVESFNTLPKTKASAGEATGFTLSTQIYIRPGELMVKPSEPQPEVGSRFRVNIFWVGRAPMIRQKEYKLKLGSARASVKLAEISNTLDASDLSYSRSKQQLDCRDVGECILETARPIAFDPASASETTGRFVIVDNFEICGGGIVLENLSAGETLLQQHIRDRENNWEPGLVRFEERAEANRHKAKFIVFTGAPGTGKRAIAKALERGLFENGLNAYYLGVANIDRGLDADLGTHADSAGERLRRIGELARILTDAGLIFITTIDDADDYDIETLKQLNEPNDILVINTGENGFSRYQPDFELPAGADVSKAVGQVADILKTREIIIDYQI 11 | >WP_006367310.1 bifunctional precorrin-2 dehydrogenase/sirohydrochlorin ferrochelatase [Chlorobium ferrooxidans] 12 | MNYVPITLKVENKQLLILGGGKAALEKVQMLQRFGFTVTVIGEEIDEEILQSGCTCHLRPFRSEDFEDVLLVYACFADREVNRAIKEEANARGILVNTPDDPELCDFITPALFIDGPMMVAVSSGGTDVRKAVAWRNRIKTYFSSHDPIS 13 | >WP_006367320.1 sulfate ABC transporter ATP-binding protein [Chlorobium ferrooxidans] 14 | MGIELQNITKKFTDYTALDNISLNIASGELIALLGPSGCGKTTLLRIIAGLESADSGKIILEGKDTTNLPPREKNVGFVFQHYALFRRLNVYENVAFGLKVLPRSKRPSKSEIKDRVEHLLKLVQMEWALKRYPSQLSGGQRQRVALARALAVNPRVLLLDEPFSALDAKVRQELRRWLRKLHDEIHVTSIFVTHDQEEALELADRIVVINKGKIEQQGTPQEVYDHPANAFVYNFLGNVNVFHGRVHEGMVTLGGHSVDAPDELKNSAEKTSQTFVRPHEIGISKIRSEGNDLEGTIREIRLLGGQIGLNIECEGFDQPIDAEIPRELYLNLQLKKGDRVFVTFNKVKVFACDYEI 15 | >WP_006367321.1 sulfate ABC transporter permease subunit CysW [Chlorobium ferrooxidans] 16 | MSTPLLSQGTPPVVRKKISDPLPVRILLIGLTLLFFIGFVFLPLGLVFAQAFQKGWDFYLEAIREPYTIEAVKLTLTTVAIVVPLNAFFGVTAAWAITKFSFPGKAALKTLLDLPFAVSPVIAGLIFVLLLGSRTPFGSWLGEQGIKIIFSTPGIIIATLFVTFPFVARELIPLMEAQGRDEEEAALTLGAKGWQIFGKITLPNIRWGLLYGMILCSARAIGEFGAVSVVSGHIRGQTNTIPLHVEILYSEYNFTASFAVASLLVFLALTTVIVKAVMENRFQKNSSQH 17 | >WP_006367322.1 sulfate ABC transporter permease subunit CysT [Chlorobium ferrooxidans] 18 | 
MAIFQKKRRNILPGFGLSMGYTVFYLSAVVIVPLSMIFFNAIPMGWEPFLSAVTAPRVLASYKLSFSTAFLAALFDAVAGLLTAWVLVRYRFPGKAFLDALVDIPFALPTAVAGICFATLYSPVGWLGEITAKWNIEVINSPTGIVVALIFIGFPFVVRTLQPVLEELEPEIEESAHCLGATRLQTFRKILLPHLFPALLTGSTLAFARGIGEYGSVIFIAGNLPMKTEIAPLMIMSKLDQYDYNGASAVALVLLVISFSMILLLNAIQEWQQKEYR 19 | >WP_006367323.1 sulfate ABC transporter substrate-binding protein [Chlorobium ferrooxidans] 20 | MHLNWTKKITLAASLILASGLPGVVAEAVPAPAKLLNVSYDPTRELYKSENAAFINYWKAKTGQTVTIDQSHGGSGKQGRAVIDGLEADVVTLALAYDIDAIADSRLLDVNWQKRLANNSTPYTSTIVFVVRKGNPKRINGWDDLVKPGVSVITPNPKTSGGARWNYLAAWGFKQKQTGSALKAKAFVKALYKNVPVLDTGARGSTLTFAQRGLGDVFLSWENEAHLILKEFGSDKFEIVAPATSILAEPPVAVVDKNVAKHGTRKLAEAYLNFLYTPQSQEIIARNYYRPSSPAALKKYGANFPKIKLFTLKEFFGDWRTAQKTHFKDGGVFDQIYLP 21 | >WP_006367388.1 hypothetical protein [Chlorobium ferrooxidans] 22 | MTLKKTMLVGASCLSLMALSSPQEASATPVFARQTSQSCAACHFQRFPMLNAYGRSFKANGYTTVGKQGTIEDTNLSTPAILNAGLVSKFRYQKTNGTADTELNAGEFQVPDEATLFVGGKVAKNIGFQAEIGLTTTASMLGFKVPFVFPANDFTFSAIPFYSDAQGAAYGFELLNTGALRFSRTFENRTEASAQQYIGAAHKTTGVALVALHKYGYISYTPYLVKDVLTDGIAFSSGRPLSYIRGVVTPEKVGNWDLAAGAQIWTGSSEEGTASAPTRNHASAWAIDAQAQGKAGDMPLGVYVAYGSAAASKAGETANFYNASTVASKNALTIATELGIVPNRLGIGVAYRNANSGAGAADNDNALTLGANYQIARNMVFQLNHSFYSGDIKTDAVGKQLTTAMFYSAF 23 | >WP_006367389.1 cytochrome c5 family protein [Chlorobium ferrooxidans] 24 | MRRFVQVCSLMVVFICSSQIVNAEDIARATLRAKYDMVQGKDVYERACSVCHSSGVMDAPKFCDITAWKPRMAHGMEAMVKHAVEGFNNMPAKGGMDALTLTESANAVAYMVDQCLFD 25 | -------------------------------------------------------------------------------- /testing/inputs/subjects/Ignavibacterium_album_JCM_16511_outgroup.faa: -------------------------------------------------------------------------------- 1 | >WP_014560052.1 T9SS C-terminal target domain-containing protein [Ignavibacterium album] 2 | 
MKKLFINNLKSGSLFNVIMILSVFLFTKNGFGISDTIPPSTPQNVKAYGYERHIDLVWWDNEEPDLAGYKVYRKINNQFYYWTTVPKEKSYLMLSNLGMNYTLTMKVSAVDLSGNESPLSDSVVATTHIMTDEEFLDMTQRATFRYFWDYAHPVSGLARERLGSGETVTIGGSGFGIMALIVGAEREFKSRDLIKKRMLKILNFLTNQAQRFHGAYSHWLNGSTGQVIPFSQYDDGGDLVETAFLIQGLLAARQYFNGQDSIETEIRNLITQIWESVEWSWYRRYQASYTLYWHWSPNYGWQMNMPIIGPNETMIVYLLAIASPTFPIPAFCYRYGYASSSNYVNGKTFYGFKIYVGWDYGGPLFFAHYSFLGFDPRNKKDMFCNYFTNNRNITLIHKAYCTANPGGWTGYNENCWGLTASDDPSGYRVHEPTNDNGTISPTAALSSMPYTPNESIEVIKHLYRNYYSGLWGEYGFKDAFNPTQNWFANSYLAIDQGPIIVMIENYRTGLLWNLFMANSEIQPMLSAIGFVPDSATSVESDNNSVSDFELYQNYPNPFNHVTRIAFNIPSLGKGNSVYKVVLTVFDALGNEIEKLIDDEKSPGTYEVEFNAEDLSSGIYFYKLSVVSNSDNKPDLNKFNKVRKLILLK 3 | >WP_014560054.1 sugar ABC transporter permease [Ignavibacterium album] 4 | MKSNSLKIVFFFLTPAIGAIFIFFFIPVIAAFIISFTDFDIYTLGDISTLRFIGLDNYSKLLKDELFWTALKNTFYFVLVAGPLSIAVSLSVALLLNSKLTKFKSLFRLAYFLPVVTTLVAVAIVWRFIYHPNFGILNFFLGLLGINPIDWLGDPFWAMPSIIILAVWKNFGYNMIIFIAGLQNIPEELYEAADIEGANAFRKFIHITLPMLAPTTLFVSIITMIGYFQLFAEPYVMTQGGPLNKTLSIVQYMYQEGFRWWNMGYSASIAFILFVIIFIGTIIQFRIQKSQK 5 | >WP_014560055.1 carbohydrate ABC transporter permease [Ignavibacterium album] 6 | MKKIILHTVLILIAVVTLIPFLWMISASFMLDGHASVFPPRFLPDQFTLVQYERLFERLSIARNFFNSLVLSILVTFISLTFNSMAGYAFAKYKFKGKDKLFNLLISSMIIPAQVTMLPLFLMLKWMGFINTYMAIIIPGLANIFGIFLIRQYCFSIPDSLIEAARIDGADDFLIYRKIILPLLTPVLATLAIFTFLGTWNDFLWPLIVMTDDSMYTLPVALANLMLEHAKDPELMMAGSVVTILPVIIVFLALQKYYLKGIMMGSVKE 7 | >WP_014560056.1 hypothetical protein [Ignavibacterium album] 8 | 
MIEILINKFSGLTSFNQFNVNKGLIIFNLFVLFLYPINIFAQLESKPLAKVGNQIITVEEFKNRYEFMPHLNYSSDNKDTLRNEFLYSLIAEKLWALEGLEKRFDTLDVVKNSLKTLEKLFVKDELFRSEVESKILLTPEEISKGLLRVGRTLSIKIINSTDSTEIFRIYDYLLNSDNFDSLLATRPESNSQQKPFQIKLGNLSDEFAEDVVFNLKLNEVSEPIKSNDKWFIFKLVSEETDSSIVRANESAKNKTISILSERKRKKIAGNFLDKILGGRTISANSELFNHFADKLIDVLQKRIEAGNQESSQSIELTPDDLQKTLRLIDRNKLYSLFVEFDSTKLSLNDFIYYLMYQKVSFPSPKPNRIKVVLNSAVKQFIEDEVITQEGYKKGMNNLQSVKNDIEMWKKYYLSELMIQSYSDSITVSEKEIESYISQKLSHSSNTVMLNIIEIFNTQLDQMLIVLDELKKGTDFKHLAQKFNQREYTKKSNGEWGYFNANSAGQIGKIASDLEIGQIYGPIKVEGGYSIIKLIDKKFISDTLVNNNDEPKEYIRMKLSLTKMNEILNRQTARLALKYKISIDEQLLNTIELSDLNMFTYRLIGFGGKIAAVPVTIPIFEWYYLMKDKNEIP 9 | >WP_014560057.1 sugar ABC transporter ATP-binding protein [Ignavibacterium album] 10 | MAEVILKNVSKIYDDKTKAVDNVNITIADKEFIVLVGPSGCGKTTTLRMIAGLEDISEGEILIDGKVINNLPPKDRDIAMVFQNYALYPHMSVYENLAFGLKLRKLDKHEIDKRVNEAARILGLEKYLDRKPKALSGGQRQRVAVGRAIVRNPKVFLFDEPLSNLDAKLRVQMRTEISRLHKELGATMIYVTHDQIEAMTMGDRIVVMKDGCVQQIDTPLNLYNLPANKFVAGFIGSPSMNFIKGEIAEENFNLFFQSTFGNLKFIIPDEKKNSFIEKKIKKVWLGIRPENIHLTNKDKPSVSVASLKSEILLIEPLGNQTLIYFNFENNQLVSEHHGFISISKGTIHEFLIDLNKIHFFDEESEMRIT 11 | >WP_014560059.1 type III-A CRISPR-associated protein Cas10/Csm1 [Ignavibacterium album] 12 | 
MTAEEKTLILGALFHDIGKFKQRGVSQSERLKHQEYSSGFVNDLFKDKLLSDLVLYHHREDLNKSNLSGLNRILAEIVCEADNLASGERQPDPDVKTQHPLESILSKIDGVHKKKINPQLYFQDISELFYNEYIFPIKKDKYDLSSLENKYKAWWNKFEKEIKKVNKDEIETLFYLLKKYLWCVPSSSYKTRSDVSLFEHSKITAAIAISMLRFLLEKNGNDINKLKDFDNREEYRYQLVLGDITGIQSYIYNIGHRGAAKSLKGRSFFLQQMLENIAYYILDHKSIDLPITNLIYSSGGKFYLFVPNTNSVNNALEQIQKELEQKFLYDYNGALGIIFGKIELNGRDLEYNKDGKDHTISEKWDKLNSIVEIQKKRKFSKNWFYSFFEPSGIDGEIIKCSYTGIPLIKKEVLTNKQTLKQEVKLDEYSPVKFIKHSFEGELFYQVYDNENLTDDYISKEQFYSQKIGNDLKKNFETIVYQDVLEGYSVLDINSFTTSKDFNFVNKLNSKYPRQFLINSLKINDLQGDASKGYKFYGGDWRFGDTYEEVIKKGLGIERLGVLRLDVDNLGLIFKDGFGKHATFGRVVQLSSMLDFFFSHYLNKLKFFSWNPVKGLSEKISDYNYKVKDLIEIVYSGGDDVFIVGHWSVLPDVAIWINEEFRKFTANNDNFSISAGISLFDDKYPIYKAALEAGEYEDLAKRKERINKDKSKQKKNGICFLDKKTPVSWNDFDEIRSWVRKFYNWLEVGVEISKEEKKKLSKGLISRLYSIYYEYEEGKYQDWARWRWRASYSLARLAKQYQEPFGDNIRDFAAELFTSKKTKQELIQLLYIIANWTDLLTRKENKNDK 13 | >WP_014560060.1 type III-A CRISPR-associated protein Csm2 [Ignavibacterium album] 14 | MTNEKAIVYRNKIDTLENEVKKERDRFKKAKPNEKDEIKKKIDSLEKDIDKTYESLFKEFDEDIELKSIDEMNEQSILFSEFFGRYLVRLDLSTSQIRNVYGDVMRLKMKGFSSNELMLLKPRLAYTTERKGTDGSRKFREKIENALDKVIFIEDKSKQETLFQNFANFFEAILAYHRSFGGK 15 | >WP_014560061.1 type III-A CRISPR-associated RAMP protein Csm3 [Ignavibacterium album] 16 | MSNNNTNNGQLLKKILIEGTITAKTGLHIGGSSVGMSIGGADATVVRNPLTNEPYIPGSSLKGKMRSLLEKVIGQKAFQYMTDQVKYGPRTKAIKDDENLSEDEKKNISLIMGIFGTKPEDTKSKDEPVSRLIVRDCELEGIMEDNKLVTREEGGVKKLFESKNTDMPYTEVKTEVVIDRITSAATPRQLERVPAGTVFNMRMILNVYNGDDEKAMLNKIFEGLALVQNDYLGGKGTRGSGEVDIKITQLKYKDKEVYENAGNWADYSNDDFIVPEELKN 17 | >WP_014560062.1 type III-A CRISPR-associated RAMP protein Csm4 [Ignavibacterium album] 18 | MKAYKLKFQSSFHIDEGSAVDGPSETFIHSDTLFSALVSAARKFYGKEVAEKFLSPRAVVLSSAFPYYKEEFFLPKPLYFFPENLKEYEMIKVFKEAKFISKDLLFKILSGTQVEEKYYSKEHILNGCWRINKNIKSLEEEDKIFEVQEVPHIVMDRISNQTQIFYKTEVYFNKNAGLYFIADINEELLKQFETVLRFLGDEGIGADRTIGKGLFEVEEIKNFSLSTSNESGFYYLLSLYSPTKEEFEKIDPKKSFYDFKIRGGWISNNTLNRKNVRMFVEGSVLKFLSNNKPIGSIHKVLNANEYPDDLMYDIYRSGQSLFLPVTGGVNDN 19 | >WP_041294073.1 hypothetical protein [Ignavibacterium album] 
20 | MIHKIKNISLLLFIILFSSCANNQDEIKTIKFWAMGSEAEQISKILPEFEKRYPDIKVKVQQIPWTAAQEKLITAFASDNTPDICQLGNTWIPQFASLNAIIDLNDFIKTSSIVKPEKFFPGIWETNIIENRVYGIPWYVDTRLMFYRKDVFERAGFNLPPKNWNELYTLCKKIKELDKGKEKYPVFIPTNEWNSFIIFGLQAGAELLKEKNTRGNFSSREFKEAFDYLIRFHKEKLTPFGMMQVTNVYQAMADEYISIYFSGPWNIPEFKKWMTGNLADKWATAPMPGYKNEYPGLSLAGGSSLVIFKNSNHKNEVWKLIEFLSEPDIQLKVYNVTNNLPAVIDAWKDSSLSNDIYMKAFYQQLQNVTSAPKIPEWEQIVFSKLQQYAEFAARGVMTTDEALRKLDEDTDRILEKRRWLTFKQSK 21 | >WP_083832018.1 CRISPR-associated endoribonuclease Cas6 [Ignavibacterium album] 22 | MGGLVRLKIELHTNHTRSLPFNYHYQFSSAIYLLLKFGSPEFSDFLHNIGYKVNGRQYKLFSFAVKFEQYKTTQREIILESPRLNLTVTSPKIDEFIKNFVIGSFERTFFNISIGGSEHKFLIRNMELLPEPDFINEMSFTMTSPMVLSTLKEFNGKTSTYYLRPDDIDEINRILTQNLRNKFELLNGKTSEGEVTLEWNEDFVKRHPRITKKITINEFGKYPVDVIGIQAPFRITGDADLIKTGYQCGFGEKNSMGFGMVEVVKNH 23 | -------------------------------------------------------------------------------- /scripts/create_blank_results.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # ----------------------------------------------------------------------------------------------------------- 4 | # Copyright: Lee H. Bergstrand and Jackson M. Tsuji (2025) 5 | # Description: A simple program that takes a FASTA file query and makes a csv of blank BLAST results. Part of the BackBLAST pipeline. 6 | # 7 | # Requirements: - This script requires the Biopython module: http://biopython.org/wiki/Download 8 | # 9 | # Usage: create_blank_results.py -i blast_file.csv -q query_proteins.faa -o new_blast_output_file.csv 10 | # Will make a blank file if the input blast file is blank, and will otherwise return the original file. 
# -----------------------------------------------------------------------------------------------------------
# ===========================================================================================================

# Imports:
import sys
import os
import argparse
import logging
import time

from Bio import SeqIO
from shutil import copyfile

# Set up the logger
logging.basicConfig(level=logging.DEBUG, format="[ %(asctime)s UTC ]: %(levelname)s: %(module)s: %(message)s")
logging.Formatter.converter = time.gmtime
logger = logging.getLogger(__name__)

# ===========================================================================================================
# Functions:

# Reports whether the file at input_csv has a size of zero bytes.
# Returns True if the file is empty and False if it has any content.
def check_if_input_csv_is_empty(input_csv):
    return os.path.getsize(input_csv) == 0


# Logs a warning when filename does not end with '.' + extension; otherwise does nothing.
# Never raises for a mismatch and has no return value.
def file_extension_check(filename, extension):
    expected_suffix = '.' + extension
    if filename.endswith(expected_suffix):
        return

    logger.warning("'" + filename + "' may not be a '" + extension + "' file!")


# Builds a placeholder BLAST CSV table from the FASTA file query_proteins: one row per
# sequence, holding the sequence ID followed by 'NA' for every other BLAST column.
# Returns the rows joined by newlines as a single string (no trailing newline).
def generate_blank_results(query_proteins):
    with open(query_proteins, "r", newline=None) as fasta_handle:
        # Column order: qseqid sseqid pident evalue qcovhsp bitscore
        blank_rows = [record.id + ",NA,NA,NA,NA,NA" for record in SeqIO.parse(fasta_handle, "fasta")]

    return "\n".join(blank_rows)


# Writes a blank_results table to a file with name new_blast_file. No return.
62 | def write_blank_results(blank_results, new_blast_file): 63 | with open(new_blast_file, "w") as writeFile: 64 | writeFile.write(blank_results) 65 | 66 | 67 | def main(args): 68 | # Get user-provided variables 69 | original_blast_file = args.blast_results 70 | query_proteins = args.query_proteome 71 | new_blast_file = args.output_file 72 | 73 | # Report the input arguments 74 | logger.debug("Running " + os.path.basename(sys.argv[0])) 75 | logger.debug("Settings: Input BLAST CSV table: " + original_blast_file) 76 | logger.debug("Settings: Query protein FastA file: " + query_proteins) 77 | logger.debug("Settings: Output BLAST CSV table: " + new_blast_file) 78 | 79 | # Check file extensions 80 | file_extension_check(original_blast_file, "csv") 81 | file_extension_check(query_proteins, "faa") 82 | file_extension_check(new_blast_file, "csv") 83 | 84 | # If the CSV file is not empty, then just keep the file as is. 85 | if check_if_input_csv_is_empty(original_blast_file) == False: 86 | logger.debug("Provided BLAST file has content - no need to replace. Copying to output file.") 87 | copyfile(original_blast_file, new_blast_file) 88 | 89 | # If it is empty, then make a fake file. 90 | elif check_if_input_csv_is_empty(original_blast_file) == True: 91 | logger.debug("Generating blank BLAST table based on FAA file provided") 92 | blank_results = generate_blank_results(query_proteins) # Stores file one for input checking. 93 | 94 | logger.debug("Writing to CSV file '" + new_blast_file + "'") 95 | write_blank_results(blank_results, new_blast_file) 96 | else: 97 | logger.error("Something went wrong. Exiting...") 98 | sys.exit(1) 99 | 100 | logger.debug("Done") 101 | 102 | 103 | if __name__ == '__main__': 104 | """Command Line Interface Options""" 105 | 106 | parser = argparse.ArgumentParser(description = "A simple utility for working with BLAST results in batch. " 107 | "Creates a BLAST results template based on the query_proteome " 108 | "if the input blast file is blank. 
" 109 | "Returns the original file if not blank. " 110 | "Copyright Lee H. Bergstrand and Jackson M. Tsuji, 2025.") 111 | parser.add_argument('-i', '--blast_results', metavar='BLAST_IN', required=True, 112 | help='''The path to CSV-format BLAST results (to be checked by this script if empty or not).''') 113 | 114 | parser.add_argument('-q', '--query_proteome', metavar='FASTA', required=True, 115 | help='''The path to a protein FASTA file used as the original BLAST query.''') 116 | 117 | parser.add_argument('-o', '--output_file', metavar='BLAST_OUT', required=True, 118 | help='''The path to write the output CSV-format BLAST results to.''') 119 | 120 | cli_args = parser.parse_args() 121 | main(cli_args) 122 | -------------------------------------------------------------------------------- /snakemake/Snakefile: -------------------------------------------------------------------------------- 1 | # Snakemake rules for the BackBLAST pipeline 2 | # Copyright Lee H. Bergstrand and Jackson M. Tsuji, 2025 3 | 4 | import os 5 | from snakemake.utils import logger, min_version, update_config 6 | 7 | # Specify the minimum snakemake version allowable 8 | min_version("6.0") 9 | # Specify shell parameters 10 | shell.executable("/bin/bash") 11 | shell.prefix("set -o pipefail; ") 12 | 13 | rule all: 14 | input: 15 | "logging/checkpoints/finished_blast", 16 | "logging/checkpoints/finished_heatmap" 17 | 18 | # Run reciprocal BLAST for each subject genome against the target genes in the query genome 19 | rule run_reciprocal_blast: 20 | input: 21 | lambda wildcards: config["subjects"][wildcards.subject] 22 | output: 23 | temp("blast/intermediate/reciprocal_blast/{subject}.csv") 24 | log: 25 | "logging/logs/blast/intermediate/reciprocal_blast/{subject}.log" 26 | benchmark: 27 | "logging/benchmarks/{subject}.reciprocal_blast.benchmark.txt" 28 | threads: 1 29 | params: 30 | query_genes = config.get("query_genes"), 31 | query_genome_orfs = config.get("query_genome_orfs"), 32 | eval = 
config.get("e_value_cutoff"), 33 | pident = config.get("minimum_percent_identity"), 34 | qcovhsp = config.get("minimum_query_coverage") 35 | shell: 36 | "backblast search --gene_cluster {params.query_genes} --query_proteome {params.query_genome_orfs} --subject_proteome {input} " 37 | "--e_value {params.eval} --min_ident {params.pident} --min_query_cov {params.qcovhsp} --output_file {output} > {log} 2>&1" 38 | 39 | # Remove duplicate BLAST hits for each BLAST table 40 | rule remove_duplicates: 41 | input: 42 | "blast/intermediate/reciprocal_blast/{subject}.csv" 43 | output: 44 | temp("blast/intermediate/remove_duplicates/{subject}.csv") 45 | log: 46 | "logging/logs/blast/intermediate/remove_duplicates/{subject}.log" 47 | benchmark: 48 | "logging/benchmarks/{subject}.remove_duplicates.benchmark.txt" 49 | threads: 1 50 | shell: 51 | "backblast remove_duplicates {input} > {output} 2> {log}" 52 | 53 | # If BLAST CSV is empty, create a blank BLAST table 54 | rule create_blank_results: 55 | input: 56 | "blast/intermediate/remove_duplicates/{subject}.csv" 57 | output: 58 | temp("blast/intermediate/fix_blank_results/{subject}.csv") 59 | log: 60 | "logging/logs/blast/intermediate/create_blank_results/{subject}.log" 61 | benchmark: 62 | "logging/benchmarks/{subject}.create_blank_results.benchmark.txt" 63 | params: 64 | query_genes=config.get("query_genes") 65 | shell: 66 | "backblast create_blank_results -i {input} -q {params.query_genes} -o {output} > {log} 2>&1" 67 | 68 | # Combine the BLAST tables into a single table, and add a column for sample ID 69 | rule combine_blast_tables: 70 | input: 71 | blast_tables=expand("blast/intermediate/fix_blank_results/{subject}.csv", subject=config.get("subjects")) 72 | output: 73 | "blast/combine_blast_tables/blast_tables_combined.csv" 74 | log: 75 | "logging/logs/blast/combine_blast_tables/combine_blast_tables.log" 76 | benchmark: 77 | "logging/benchmarks/combine_blast_tables.benchmark.txt" 78 | shell: 79 | "backblast combine_tables 
{input} {output} > {log} 2>&1" 80 | 81 | # Create a symlink of the combined BLAST table to make it easier for the user to find 82 | rule symlink_combined_blast_table: 83 | input: 84 | "blast/combine_blast_tables/blast_tables_combined.csv" 85 | output: 86 | "blast/blast_tables_combined.csv" 87 | run: 88 | source_relpath = os.path.relpath(str(input), os.path.dirname(str(output))) 89 | os.symlink(source_relpath, str(output)) 90 | 91 | # Checkpoint that BLAST step is finished 92 | rule finished_blast: 93 | input: "blast/blast_tables_combined.csv" 94 | output: temp(touch("logging/checkpoints/finished_blast")) 95 | 96 | # Generate phylogenetic tree if desired by the user 97 | if config.get("phylogenetic_tree_newick") == "subjects": 98 | # Makes a list of filepaths to the FAA genome files for use by GToTree 99 | # Part 1 - generate for each sample 100 | rule generate_phylogenetic_tree_input_individual: 101 | input: 102 | lambda wildcards: config["subjects"][wildcards.subject] 103 | output: 104 | temp("phylogeny/input/{subject}.list") 105 | threads: 1 106 | shell: 107 | "echo {input} > {output}" 108 | 109 | # Part 2 of the above rule 110 | # TODO - check {output} does not already exist before run start or move output to a different folder 111 | # TODO - Is it possible to merge Parts 1 and 2 together into a single rule? 
112 | rule generate_phylogenetic_tree_input_grouped: 113 | input: 114 | blast_tables=expand("phylogeny/input/{subject}.list", subject=config.get("subjects")) 115 | output: 116 | "phylogeny/input/input_genomes_faa.list" 117 | threads: 1 118 | shell: 119 | "cat {input} >> {output}" 120 | 121 | # Run GToTree 122 | # TODO - note that it is possible to expose more params to the user 123 | rule generate_phylogenetic_tree: 124 | input: 125 | "phylogeny/input/input_genomes_faa.list" 126 | output: 127 | "phylogeny/iqtree_out.treefile" 128 | conda: 129 | "../envs/gtotree.yml" 130 | log: 131 | "logging/logs/phylogeny/gtotree.log" 132 | benchmark: 133 | "logging/benchmarks/gtotree.benchmark.txt" 134 | threads: config.get("threads", 1) 135 | params: 136 | phylogenetic_model = config.get("gtotree_phylogenetic_model", "Universal-Hug-et-al"), 137 | sequence_length_threshold = config.get("gtotree_sequence_length_threshold", "0.2"), 138 | minimum_hit_fraction = config.get("gtotree_minimum_hit_fraction", "0.5") 139 | shell: 140 | "GToTree -A {input} -H {params.phylogenetic_model} -o phylogeny/gtotree -T IQ-TREE " 141 | "-c {params.sequence_length_threshold} -G {params.minimum_hit_fraction} " 142 | "-n {threads} -j {threads} 2>&1 | tee {log} && " 143 | "ln phylogeny/gtotree/iqtree_out/iqtree_out.treefile phylogeny/iqtree_out.treefile" 144 | 145 | # Create a fake temp file to allow plotter to run if 'NA' is selected 146 | # TODO - this is a hack; is there a proper way to make this work? 
147 | if config.get("phylogenetic_tree_newick") == "NA": 148 | rule generate_fake_phylogenetic_tree: 149 | output: temp(touch("NA")) 150 | 151 | # Generate the final heatmap 152 | rule generate_heatmap: 153 | input: 154 | blast_table = "blast/combine_blast_tables/blast_tables_combined.csv", 155 | tree_file = "phylogeny/iqtree_out.treefile" if config.get("phylogenetic_tree_newick") == "subjects" else config.get("phylogenetic_tree_newick", "NA") 156 | output: 157 | "heatmap/BackBLAST_heatmap.pdf" 158 | log: 159 | "logging/logs/heatmap/generate_heatmap.log" 160 | benchmark: 161 | "logging/benchmarks/generate_heatmap.benchmark.txt" 162 | params: 163 | genome_metadata = config.get("genome_metadata_tsv", "NA"), 164 | gene_metadata = config.get("gene_metadata_tsv", "NA"), 165 | bootstrap_cutoff = config.get("bootstrap_cutoff", "NA"), 166 | root_name = config.get("root_name", "NA"), 167 | plot_width = config.get("plot_width_mm", 400), 168 | plot_height = config.get("plot_height_mm", 200) 169 | shell: 170 | "backblast generate_heatmap -m {params.genome_metadata} -g {params.gene_metadata} " 171 | "-b {params.bootstrap_cutoff} -r {params.root_name} -w {params.plot_width} -z {params.plot_height} -d " 172 | "{input.tree_file} {input.blast_table} {output} 2>&1 | tee {log} && " 173 | "if [[ -f Rplots.pdf ]]; then rm Rplots.pdf; fi" 174 | 175 | # Checkpoint that the plot is done 176 | # This is probably over-engineered, but it is helpful to mirror what is being done for the BLAST step and could help to one day separate this Snakefile into modules 177 | rule finished_heatmap: 178 | input: "heatmap/BackBLAST_heatmap.pdf" 179 | output: temp(touch("logging/checkpoints/finished_heatmap")) 180 | 181 | -------------------------------------------------------------------------------- /testing/inputs/subjects/Chl_luteolum_DSM_273.faa: -------------------------------------------------------------------------------- 1 | >WP_011356871.1 bacterioferritin [Pelodictyon luteolum] 2 | 
MKGQPKMIEKLNALLAEELTAINQYMVHSEMCDNWGYARLHGADEKRAIDEMKHAEQLIGRILFLDGTPDVSTLNKIMIGASVEKQHRSDLHLEGETIISYNEGIRLAAELGDNGSKDLLESILKEEEMHLDWIEAQLDQIGQMGVQTYLAEQLS 3 | >WP_011356874.1 DNA-binding response regulator [Pelodictyon luteolum] 4 | MSNSSRIEQPACQRIVVVVSDPDCRSRLAAKLRGDGHDVDIAGSALEFYGLLAQREFRLAVLEAELSDQNGLVVARFLRRNTPISSIMLAGSVDRKVRLAVYHAGSLACFCKPVDLGEFSVLVDNLLNQGQRTSAGAQRTTLEIPQGHAKWKILRNGWVLAGPRGTMVKLTINEFEFMSLLASSNQMAVSRKAILEHMGYRNDVHGNKALEAVVHRLRLKTQVAGGSLIETAHGVGWGFSSEVALV 5 | >WP_011356875.1 hypothetical protein [Pelodictyon luteolum] 6 | MVTPMQESFQVLVERPPTPCLSLYQETHRSYPDNRQDRLRYKNLVNALEESLRRKYPGRDALPLLEPFRVLGDDDGFWNHTHEGLAVFGAEGIFRTYRFPYVPRELAVSADSFHTKPLLRVMQSTERFHLLCLTRQNIKLYDADRYAVEEIELPPDMPRNMLQVLGSETTEPEITIASFGSAGAGGVIRHGHSSKRDEEEVDNEKFFRAVDTAVLERYSIPSGLPLMLAALPEHQGLFRKISRNASLLEGGISADPQSMTTDVLRVRAWDVMAPSIAARIRAAAESCAAAIALGTGTSDPEDAAVAAVGGRIERLLIDRETLIAGTVDPVTGAVRFVPLEDPAVDDVLDDIGEIVRQKGGEVLVLSATEMPSETGLAAIYRY 7 | >WP_011356876.1 acyl-CoA thioesterase [Pelodictyon luteolum] 8 | MQREHHSFTLEMDVRDYECDMQGIVNNSVYQNYLEHARHEYLKSVGIDFSEYARMGVNLVVVRAELDYRSPLKSGDRFLVNLQLQRESTLKFAFYQDILRLPDMKPALNAKVIGTALNGRGRPEIPTALAALMSAGDA 9 | >WP_011358168.1 serine O-acetyltransferase [Pelodictyon luteolum] 10 | MQSDADKIWKIIVSEASAECRRDPDISMFLEQHILQFSDFASSLAMLLAVKLGSKHFPPPVLQGLFEDFYRDCPDRVDYAVCDLVATQDRDPAAVYYFETMLFLKGYQALQAYRFSHWLWKNGRKTMSYFIQNRISEVFAVDIHPAAVIGKGILLDHATSLVIGETAVVDDNVSLLHEVTLGGTGKETGDRHPKVHKSVLIGAGAKILGNVVIGEGAKVGAGSVVLDDVPPHYTVAGVPAQIVGRTEVQEPSLDMNQRLVGRYNESDGNQPVKADRKASR 11 | >WP_011358170.1 cation transporter [Pelodictyon luteolum] 12 | MHSISRPKKSLKPFVLLSITAAIVTIVMKMLAWRLTGSVGLLSDALESFVNLAGAMMALAMITLAERPADDSHPFGHGKAEYFSSGFEGLLILIAALSIGFSAVDRLLHPRELEAVGFALLVSAGASMVNYFTARTLLTVGRREHSITLEADAHHLMTDVWTSVGVIAGVALAWWSGWTWLDPVIALAVAINILRTGWHLLQRTAEGMMDASLPPEQLSEIQTALKDISLDGVSCHNLKTRMGGSIVFITLDVMVPGNWSVQQGHDCCESIEAHLHKILPHTYVTTHLEPSDQ 13 | >WP_011358171.1 hypothetical protein [Pelodictyon luteolum] 14 | 
MKLVILVIAFFAIVVGLVLAFPDYLLNPGPLMQGHAHIEKNCLSCHRPFRGALAMQCTSCHKPEEIGIKNVKGMALLRKDSKISFHRGLPANSCIGCHTDHKGRDSTRAVRAFKHEGISFDLRANCNSCHDVQKPKDKLHDYATRSCGECHSTRAWKPASFDHKSLASGTNCLDCHKGDVPRDRLHSSAGANCSTCHGTSGWKPASFDHKTVASAQRCLDCHKGDVPRDRLHSSAGANCSTCHGTSGWKPASFDHKSLASGTNCIDCHKGDVPRDRLHSSAGANCSTCHGTSGWKPASFDHKTVASAQRCIDCHKGEVPKDRLHSSAGANCSTCHGTMAWKPASFDHDRYFRLDRDHRASCSTCHNDSSDFKKYTCYGCHEHTASNIASEHREEGIRNYQNCMRCHKSGSDED 15 | >WP_011358172.1 hypothetical protein [Pelodictyon luteolum] 16 | MYWNQKTSNLIRQCGQSILIRKALPLTASYLLLIVASMALDYLLHQLQIELAGRYLGVAGTIFFLLSFLYSARKKQIIPKGPLKRFLQLHCHAGWIATLLILVHSGIHFNAFLPWAATVLMMIVTASGHVGQHLVRRFRNEAKLKKQQLGILTPPDDALDRQLFWNAVTLKALNQWRMVHMPLVAFLIILTLVHVVSILFFSNRANWM 17 | >WP_011358173.1 hypothetical protein [Pelodictyon luteolum] 18 | MKFKNALGVLAGGAVSALLLALPQDAEATPVFARQTGMSCTACHFQRFPMLNAFGRDFKARGYVMKGSQQTIEDKDLSIPSSLNLGLVTKLRYQKKNGTTTTGPDGMLNDGEFQIPDEAFLSVGGRVAENIGFQTELNLANVSTGGVFAAFKMPFVFDLGKEFIANVIPFTTDTQGPAYGFELLNTGAVRFSRAFENRTETSAQQYIGTATAATGAALVAYHKYGYLNFSPYVKRGSNADPIFSSGQMFSYLRGAVTPTIGGWDLAAGFQVWNGDARYTEAVVDATTDPVTTTYPETSQSANAWAVDAQAQGNVGELPLGVYLTYGSADNAATNIYTKGAEALTLAAELGVIPYKLTLGAAYRSASNGKATLDGEDAVTLGMNYLVAKNVTVQINHSFYSYDANLDAKGDNLTTVMLYSAF 19 | >WP_011358175.1 DUF4282 domain-containing protein [Pelodictyon luteolum] 20 | MITESINNKKGATMQNGSLMGKLFDFSFKEFITLQIIKYLYIMGIVFAAITALSTVGAGFATMQYSFWSGLGGILMAPVVFFLLLLIVRLLLEALVATFRIAENTTILVERNAKL 21 | >WP_011358176.1 carbon starvation protein A [Pelodictyon luteolum] 22 | MMSRNGFPINYSPVLARQLGIDEAEETPAVKQNDGVDFVPAKHWLILFGHHFASIAGAAPIIGPVIACLYWGWLPALAWIVFGSIFMGAVHDLASLAISAKNEGRSIADLTENILDRNSKIVFSLFVFLTLILIVAVFAAIAGQTLANTPEVVLPTIGLIPVASLIGWLMYHRGFSIPLSSLLGVAMLFGLIVLGYRYPLSLPVPNPEQWWTVILIFYGMTASVTPVTLLLQPRDHLAALVLFAGMFFGFLGLLLSRPELHAPPVVAFSSDQGWMFPMLFITIGCGAVSGFHSLVASGTTAKQLPTMKDVRAVGYGAMVTESALAVLAIVSVTAGLYWKELPVGQHGFVYQEVFQKGGWIKTFGVGYGEVTKPVLGTFGALVGITMLKTFVMTTLDSATRITRYVGSELLGETFGLPGMKNKYGITLLIGFLAAMLALGNWQAIWPIFGSANQLVASVVFIVTSVWLFRRGGNWKLTAVPAVLMLLTTLSALVVKTWEFMTADPRKDLLAAIAIVLIALALFMLLQSIRVVRKKASA 
23 | >WP_011358177.1 Ni-Fe hydrogenase small subunit [Pelodictyon luteolum] 24 | MQCKQTFEEVLNERGISRRSFLKYCALTAAALGLSPLMASKIAHAIETGPRTPVLWLHGLECTCCSESFIRSSHPTIEDILFNMISLDYDDILSAAAGTQLEEVRRRIMKEYKGKYILAIEGNIPTKDDGVYCLVGGDSFLNTVKETAADAAAIIAWGNCASFGCVQNAHPNPTGAAPVSDIIKNKPIVKVPGCPPIAEVMTGVIAHFHTFGTLPELDRLGRPKAFYNTRIHDKCYRRAFFDAGMFVESFDDEATKKGWCLYKMGCKGPTTYNSCSKIQWNGGTSFPIGSGHPCIGCSEPGFWDNGPFYGRLAKVPFLGSDSNADKVGIVAVGAAAAGAAAHATVTALKKAKQGGEENKDNA 25 | >WP_011358278.1 O-acetylhomoserine aminocarboxypropyltransferase/cysteine synthase [Pelodictyon luteolum] 26 | MSKKPYQFETLALHAGQAVDATQSRGVPVYRTSSYLFKNTEHAANLFALKELGNIYTRLMNPTTDVLEQRVTQLEGGAASVALASGTSAIFYTVITLAEAGDNIVSANNLYGGTYTQFDAILPKLGIDVTFVDPKDPANFEKAINEKTRALYIETIGNPVLDYTDVKAIAEVAHRNCLPLIVDATFTTPYLLKTIELGADIVINSLTKWLGGHGAGIGGIITDAGRFDWKAGRHPLFTQPDANYHGLCWGIDLPEPLAPIAFALRVRTVPLRNLGACISPDNSWLFIQGIETLPVRMARHSENALKVAEHLKAHPKVAWVRYPGLKDDPSYPIASRDLKHGFGGMVVFGVKGGYDAAVKIIDTIDLFSHLANVGDAKSLILHPASTSHSQMTKEQQAQSGLTSDLIRLSVGMEHPDDLITALDEVLAKV 27 | >WP_011358279.1 DNA-binding response regulator [Pelodictyon luteolum] 28 | MKPKGSILIADTQFLTTEALRSMLDAEGYGMSVVHTRTELYEYLKANEPSLIITDYILFDFRSINDLRELGELHDGSPILVLCNSVNQMQIKELNQAGIRNIALKTDDRTELLQSVTTAMKGKKAYSGSVLDILLKEDGPAEDACLLTTSEIEIVRLISTGLTTKEIAVKKHVSFHTVTTHRKNIFRKLGVSSSPELMMFAIKAGLIYNIEYHI 29 | >WP_011358280.1 cysteine synthase A [Pelodictyon luteolum] 30 | MARIAKKLTDLIGNTPLLELGNFGLDQQALGTIIAKLEYFNPGGSVKDRIGFSMIEAAEKDGLLTRESVIVEPTSGNTGIALAFIAAAKGYRLILTMPETMSIERRNLLKALGAELVLTPGQEGMGGAIRKAEELNESLPNAFLPQQFKNPANPDIHRRTTAEEIWNDMDGKIDIFVSGIGTGGTITGVGEVLKQRNPNVRIVAVEPTESPVISGGKPGSHKIQGIGAGFIPDILNQGVIDEIITVSNENSFRTGQDLARTEGLIVGISSGAAVWAALQLAKRPENSGKRIIVILPDTGERYLSTLLYQFENEATAI 31 | >WP_011358281.1 sulfate ABC transporter substrate-binding protein [Pelodictyon luteolum] 32 | 
MKSTRFSVGLALFSSLLYSTAQATAPAGATLLNVSYDPTRELYQSENAAFASYWKQKTGQEVTINQSHGGSGKQARAVIYGLEADVVTLALAYDIDAIADSKLLATNWQQRLPQNSTPYTSTIVFVVRKGNPKGIRKWDDLIKTGISVITPNPKTSGGARWNYLAAWGYKQTQTGSADKAKAFVKAIYKNVPVLDTGARGSSLTFTQRGIGDVLITWENEAHLILKEFGNDRFEIVAPPVSILAEPPIAIVDKNVDRHGTRKVAEAYLNFLYTPQAQEIIVQNSYRPRNTATLRKHASNFPAIKLFTIKDLFGTWRQAQKTHFSDGGTFDQIYTP 33 | >WP_011358282.1 sulfate ABC transporter permease subunit CysT [Pelodictyon luteolum] 34 | MPPFNRNKRHILPGFGLSMGYTVFYLSAIVIVPLGTIVVSAIRMDPQAFFFTVTSPRVLASYSLSISTAVFAALFNALFGFLTAWTLVRYRFPGRMVLDALVDLPFALPTAVAGICFATLYSPVGWIGQFLDKTGIQVINAPSGIVIALIFIGFPFVVRTLQPVLAELEPEMEESAQSLGATRWQTFRRILLPHLFPALLTGVTLAFARAIGEYGSVIFIAGNLPFKTEIAPLMIMSKLDQFDYDGASAVALVLLLISFSMLLVLNAVQRWQQKSYR 35 | >WP_011358283.1 sulfate ABC transporter permease subunit CysW [Pelodictyon luteolum] 36 | MQTNSETAQQSRTPLTRNDPPTIQAVLILLTFLFFASFILIPLGLVFAEGFRKGAQVYLEAIREPFALAALKLTLLTTAIVVPVNAIFGVTAAWAISKFTFKGKSTLFSLLDLPFAVSPVIAGLIFVLLVGSRTSFGAWLGEHGIRIIFSTPGIVIATLFVTFPFVARELIPLMEAQGREEEEAAITLGARGWHVFTRITLPNIKWGLMYGMILCMARSIGEFGAVSVVSGHIRGMTNTVPLHVEILYSEYNYTAAFAVSSILVLLALGTVAIKALMEMNVHNRQHSQH 37 | >WP_011358284.1 sulfate ABC transporter ATP-binding protein [Pelodictyon luteolum] 38 | MSIEIKNVTKKYPGYTALDDITLKVQTGELIALLGPSGCGKTTLLRIIAGLETPDAGSVLLGDRDTTNLQPREKNVGFVFQHYALFRRLDVFENVAFGLNVRPSGSRPSKSEIRDRVEYLLKLVQMDWAMKRYPSQLSGGQRQRVALARALAVNPKVLLLDEPFSALDAKVRQELRRWLRKLHYDIHVTSIFVTHDQEEALELADRIVVINKGRIEQQGSPQEVYDHPANAFVYNFLGNVNVFHGRVREGKILVGEKLLEAGPASGIHEAFDGQAFIRPHEIGISRIRSSGNDLSGFIRDIRLMGGQIGLALETKAFDQPIEAEITREAWHNLQLSKGDEVFITLDRLKVFSGDYEI 39 | >WP_011358285.1 GTP-binding protein [Pelodictyon luteolum] 40 | 
MNGKEQMNIVIVGHVDHGKSTVIGRLLADTGSLPDGKLDAVKESCRRNHKPFEYAFLLDALKDEQAQGITIDMARCFFNTRKRDYIIIDAPGHIEFLKNMVTGASRAEAALLVIDAHEGIQENSRRHGHMVAMLGVKQVSVLVNKMDLAGYRQVDFDALKAEYTAYLRQIDVEPLGFIPVSAREGDNIAAVSASMPWYDGPTVLAQLDRFSTGGGLRDKPFRLPVQDVYKFTGSGDDRRIVAGTIEAGSVNIGDAVLFLPSGKRSSIASVESFNTAVKHSAKAGEAVGVTLATQIYIRPGELMVRPDDPQPEVGARFRANIFWVGRVPMSTEKEYKLKLGTAHATVRLAGILSTLDASDLRLSRTKQQLDNRDVGECILETTRPIAFDLASTSEATGRFVIVDNYEIAGGGIVLENLSGGESILERHIRDRERHWENGNVLVVDRERVHRHRAKFIVIVGPPLTGKRALAKELESMLFRNRYSTYYLGMAAIEHGLEADLRSDEGGAEEKLRRIGELARIMTDAGLIFITVIDDPDDYDIESLKLLNAPNEILVVNVGENRFYRTTPDVILPSVDDIANEISRIADLLLWKEVIDDYQI 41 | >WP_011358286.1 sulfate adenylyltransferase subunit 2 [Pelodictyon luteolum] 42 | MNHLDKLEAQSVYILREAYRDFRQLCMLWSIGKDSTVLLWLARKAFFGHVPLPLVHIDTHFKIPEMIQYRDRMAMEFELNMIYGENTDAVERKLSFPDGNTDRITCCRNLKSEALKRTLSGEWPRWRMEHPTGRYVLDESREPYTGVIVGVRADEEGSRSKERYFSPRDTENCWEVGDQPPEFWNHFKTDFAPGTHVRIHPLLDWTELNIWEYIEREEIPVVPLYFNRGQGTRYRSLGCWPCTTPIASESETVADIISELCEGRLANIAERSGRAQDREDAGGLETLRRVGYM 43 | >WP_011358287.1 phosphoadenylyl-sulfate reductase [Pelodictyon luteolum] 44 | MSSAAAGKLQQEWSKLQPEEVLSRAVRKFGISHLAFASSFGAEDQVIIDMICRHGLDIPVFTIDTGRLPQETYDLLDATRMHYGISMEVLFPERAPVEDMLQRFGANLFYDSVEKRRMCCRTRKVQPLERRLSSLDAWICGLRREQSVTRTGTGVVEWDEAFGLFKINPLAAFTEAEVFGYLLRHNVPSNALHRKGYPSIGCAPCTRAVEPGADIRSGRWWWEEAEHRECGLHVPNTNEKKSEHEPSRQA 45 | >WP_011358288.1 4Fe-4S dicluster domain-containing protein [Pelodictyon luteolum] 46 | MKSSVEETAIESRTGIMKQLQPGVFVMRLKSVAGDLDSRQLAAVAAVAEKYGSGSIHLTTRQGIEIHDVRRKDLEPAFLELQGSGVELGACGSRVRGIVACPGSTCCRLGVIDTKALGERLDAEFFGAEAPSKFKIGITGCASNCAKANENDVGIRGVVEPEWIGPQCNDCGKCLSYCPVDAISRSGRAARDDQFTYSVDEKRCIGCGLCASKCTSGGWAVRRSGFSILIGGTMGRKPRFADPLVWMAADADEVIRLVGRVLEFYRKYGWAKERFGATIERVGLAHARFVILGEVSIPPAGEGVPEAGSASGLRGGSAA 47 | >WP_011358289.1 cystathionine gamma-synthase [Pelodictyon luteolum] 48 | 
MEFETLAIHDGQAPDPVTGSVTVPIYQTSTFEREGMEFNGGFVYSRIGNPTRKALESTLALLENGRHGLAFASGVAAAMAALQVLRPGDHIVSSMDIYGGSYRIFKEVMQPWGVETTYASGKTTASYESCIRPETRMIWVESPSNPLMRIADLRALAAICGERGILLAVDNTFASPYFQRPLDLGADIVVHSTTKYLGGHSDVMGGAIITNDGKLHTAIRNYQATAGAIPAPWECWLIMRGLKTLKIRMKEHEANALHLARFLEQQPAVSRVLYPGLPSHPQHELAASQMEGFGGMITIELKGGMHAVEKLIKGLKLFILADSLGGVESLIASPSRMTLWALSPEEKAARECTEGLVRLSIGLENARDLEDDLSSALEGC 49 | >WP_011358290.1 thiamine biosynthesis protein ThiS [Pelodictyon luteolum] 50 | MLTIELNGKPEPVVEGSTVADMLAILGADGRKVATVVNEHIVRPPERTTAVLQDGDRVEILVFAGGG 51 | >WP_011358292.1 2-iminoacetate synthase ThiH [Pelodictyon luteolum] 52 | MKEVPAWLVDEGSTAEMRRMLSSDSPVDIETLAARARAITLRRFGRTISLYAPLYLSNHCPSGCAYCGFASDRTTLRRRLEEDEIRREIAAMKKLGIRDILLLTGERTAVAGFDYLRRAVEIAAEEMPRVSVETFPMSVEEYRELARCGCTGVTIYQETYDRGRYEELHRWGPKKDFLHRLETPERALEGGIKTVGIGALLGLSEPVEEALRLYRHARHLAKTWWRAGISASFPRMRPEQGGWQPPFNVSDHQLARMILAFRIGLPDMDLALSTRERASFRDGMAGLGVTRMSIASKTTVGGYDEGETGERGQFDISDERSAGEFCQALRNRGIEPVFKNWDGAYNGPATQIIPTGGLKETIP 53 | >WP_011358293.1 HesA/MoeB/ThiF family protein [Pelodictyon luteolum] 54 | MNLSAQEQERYSRHLALPEIGMEGQQRLRASRVLIVGAGGLGSPAALYLSAAGVGTIGLIDGDTVDLTNLQRQILHTTASVGIKKVDSARECLMALNPNQNLHIYPFRLSSENAGEIVRGYDFVVDATDSFGSKFLISRACHATRKPYSHAGITRFFGQAMTVIPGKTTCCHCIFHEDEAPPEAAPEGPLGAMPGVIGSIQAIETIKVLLAIGTPLYDTLLTCNTLTMEFRRVTVRRDPRCPLCGTL 55 | >WP_041463880.1 bifunctional precorrin-2 dehydrogenase/sirohydrochlorin ferrochelatase [Pelodictyon luteolum] 56 | MNCLPVCINLENRRVLVVGGGIAAFEKIELLKRFGADFTVVGESLDERIVDSIADCRLRRWRDEDLDGIWLTYACDEARESNARLKCAANARGILINTPDDPSLCDFLSPAIYRQGAMTVAVSSGGADVRRAIEWRDRVKLLFGDANDRD 57 | >WP_041463881.1 thiazole synthase [Pelodictyon luteolum] 58 | MDALHIGSHTFNSRLILGTGKFSSTDVMLRAVRASKAELVTVALRRFNREQLEDDLFGPLSQLEGITLMPNTSGASTAAEAVKAAHISRELSGSPFIKVEIHPNPQHLMPDPIETFEAAKILASEGFLVMPYISADPVLAKRLEEVGCASVMPLGAAIGSGRGLSTAGMLEIIIRESAIPVIVDAGLRAPSEAARAMEMGCSAVLVNSAIAAAENPEEMAAAFRDGTSAGRRAYRAGLMTESGDAVPTSPLTSFLGERR 59 | >WP_041463965.1 4a-hydroxytetrahydrobiopterin dehydratase [Pelodictyon luteolum] 60 | 
MGELNKTKCVSCSEGLPPLAERESEELLKEIPEWVIVSEDGVSRLVRTFTFENFREAMAFAGSVGELAESEQHHPKLVTEWGKVRVEWWTHAVHGLHMNDFVMAARTDELFQEL 61 | >WP_041464150.1 6-carboxytetrahydropterin synthase [Pelodictyon luteolum] 62 | MQISRKIEIDYGHTLPNHFSFCNQLHGHRGVVVATVEGPVIERSGDSEEGMVMDFRFLKEIMVEHIHNRLDHGFAVWREDHEDLEFILKRNTRVLVTDDPPTAEYLAKWAFNEIRTHLPEGISLHNVRWYETPNNWADYDGQIIK 63 | >WP_081423692.1 cytochrome c5 family protein [Pelodictyon luteolum] 64 | MEVIITSTLFTFRDPIHERHLLTCPEIIPMKRIFYSAVFACSLLLSQQPLQAASPDPAALKAQHDLVKGAAIYNSTCAVCHNNGIMGAPKPGDKVAWSARVAGGFAPVLANSIKGFKGMPARGGKQSLTDAQIADAVAFMVTKSL 65 | -------------------------------------------------------------------------------- /testing/inputs/Chl_ferrooxidans_KoFox.faa: -------------------------------------------------------------------------------- 1 | >WP_006365806.1 DNA polymerase V subunit UmuC [Chlorobium ferrooxidans] 2 | MFALIDCNNFYVSCERLFNPGLRGRPVVVLSNNDGCFISRSEEAKAIGLGMGEPAFKCRELLQQHRVALYSANFPLYGDLSARVMRTLADFAPDIEIYSIDECFLDLSGFGRFDLDRYARQIRQTVEKHTGIPVSIGIGATKTLAKAASRMAKKNASLGGVSLLRTDAGIRDALSGMAVEHIWGIGRRYGALLHRHRIHTALDFAAAPAPWVRRNLHITGARVQEELNGRSCLPLELVRPPKQSICTSRSFGTPVIAKEELQHAVVHFASKCAFKLRREESRASLVTVFACTSPFAPKEKRYWGTETLSLPAPSNDTLLLLKAATLALERLYRRGYEYKKAGVILSGIGSVSEPQQTTLSLFDTDPLPEKQGRAASLMEALDALNFRYGPGTLRLASDCSSGWKQRQEKLSPAYTTRWSDIIEVKP 3 | >WP_006365807.1 mitomycin resistance protein [Chlorobium ferrooxidans] 4 | MNPAKVSRERLKELTDLPNIGKSIAGDLRLIGISRPEQLVGRSALEMYHDLCRVTDTVQDPCVLDVFMSLTSFMSGSEPRTWWSFTEERKRLLSKEAPTGTSKNYHERFEDF 5 | >WP_006365808.1 DUF1848 domain-containing protein [Chlorobium ferrooxidans] 6 | MIISASRRTDIPAFYGEWLVNRLKAGEVLVRNPMQPKQVSRIMLTPATVDALVFWTKNPEPFLSRLPEIDALGYSYYFLFTLTPYDVTLEPGVPEKGSIVEVFRRLSRLVGPEKVVWRYDPVVITDQFTPAWHAASFPRLAEKLSGYTGRCIISFLDDYRKVRSRMRNLRYTLPDSAEMGELAALFADIAAKQDIALCTCSQEVDLSRYGIMHSRCIDGGLVERISGRRMRGAKKDGSQRHACGCIESRDIGSYDTCPHGCLYCYAVAGQAKAGKAFEAYDPSSPMLCDSLHGDETITTPAPRKRAASARTPSAPGLHQGELFS 7 | >WP_006365809.1 type II toxin-antitoxin system Phd/YefM family antitoxin [Chlorobium ferrooxidans] 8 | 
MHAIDTRNVTPLTDFRNNIKRYMEELSVSKKPLLLTQHGKSAAVLLDAEQYQKMLDQITFMQLVTEGLEEYRNNRTIPAEELFASLDKIIAEAEKQ 9 | >WP_006365810.1 bacterioferritin [Chlorobium ferrooxidans] 10 | MKGNPKIIEKLNSLLADELTAINQYIVHSEMCSNWGYEKLHNADEKRAIDEMKHAEKLIARILFLEGIPVVSDLKKIKIGADVAAQHKNDRHSEDGAIASYNDGIRLAVEAGDNGTRELLESILRDEEAHIDWLEAQIDQISQMGIQNYLVEQLG 11 | >WP_006365811.1 ArsR family transcriptional regulator [Chlorobium ferrooxidans] 12 | MLDQRAEGCQEECFHPDVVERVRKAMPGEPLQEELAQLFKVLGDHTRIRILNALCLSELCVCDLTSILGMNQSAVSHQLRVLRDAKLVKSRKQGKNVLYTLDDTHVSTLIRTGSEHIREGK 13 | >WP_006365812.1 permease [Chlorobium ferrooxidans] 14 | MNELLIVVSKVLLASWSVLLESAPFVLLGFFIAGVLKAFVPDDFVATHLGGGGMASIFKASLFGIPIPLCSCGVLPAAAGLKKQGSGNGPVAAFLISTPETGVDSIAVSWALLDPLMTVIRPVIALFMAIATGIAVSFAGKQSANEVAADIDKSGESACASSCCCSHKKQEKLTVAGKFREGMSFAFGDLLNDVGLWLFIGILLSGLISVFVSTEVVSRYLSNEYLSMLVMFIVSVPLYVCATASTPIVAALALKGISPGAALVFLLAGPATNAASLPVIFRMLGRKSALIYLASIVLISLIGGVVVNDLYSYLGYDITHWVSKGAHEEAGIVSIAASVVLLLLVLKSLLPKKQIGHPR 15 | >WP_006365814.1 cadmium-translocating P-type ATPase [Chlorobium ferrooxidans] 16 | MTTTGTYSVKGMHCASCAAIITKKLSKVEGITRADVNLATEKVTIAFDHETLPVEALNDAVSRYGYAIAAEPGTASAGESQKIAASGAIRLKEEKKQALLDQKAKVQFALPIALLVFILMMWDIGSRFIESVPNLPIPMELFNIISMVLATWFVFRIGAPFLHGIVMFLQSGAASMDTLIGIGTLTAYVYSALITLIPSLRELLRVPDYTYFDVTIVVIGFVLLGKYLEARSRMKTGEAIEKLINLQAKTAIVLREGKEIEIPSEAVRKGDTVIIKPGAKLPVDGIIIEGFSAIDESMVTGESIPADKKSGDEVIGGTINKQGSFTFRASNVGSDTVLARIIRMVEEAQGSKAPIQEIADKIAGIFVPVVLIIAAATFLIWITVGSSYLGFSVALSYAISGMVGILVIACPCALGLATPTAIIVGIGKGAEYGILIRNAESLEKLSSVDTVVFDKTGTITTGSPKVTDVTPLSEGTTPESLLQIAASVENRSEHPLAEAILDEAKRRSITLRDISEFKALEGMGVSATIAGLPVTIRKPLEAERELAEVKRLQAEGKTVVMIEENRIALGLIALSDTVKTEAKAAVAALHRRGLKVIMLTGDNRAAANYMAGQVGIDEVIAEVLPTEKAAKIKALQSEGRKVAMAGDGINDAPALAQADVGIAMATGTDIAIESAGITLLKGDINKVAQSITLSRATMNVIRQNLFWAFIYNIIGIPLAAGALYPFWGIFLNPVFSGLAMAGSSVSVVSNSLRLKAKKLDR 17 | >WP_006365815.1 DUF305 domain-containing protein [Chlorobium ferrooxidans] 18 | 
MSSNDFNHFHEKTMKNISLSLAVALSIVMAVIGIGTGYWLTPQYSLSMYDKNSMDLGQADKWVDLRYLDGMIAHHRGAILLAEKASISQRTEIRNLCKEILKNEPVAIAELYQWKKEWYGDRRKVRDPQVANLGSSDSKLDLRFLNALIAHHENGVRMTREIRLKSSRSEVLDNADAVELFLKGGLGMLREWRKEWYNVIDTNPYL 19 | >WP_006365940.1 type II toxin-antitoxin system RelE/ParE family toxin [Chlorobium ferrooxidans] 20 | MNVEFSERAARQITDVVRYIAADKPAAAKKWADSVKKAVRKLSDYPHLGRVVPEFVDASLRELLHGEYRIVYKIDEQFSRIVIVAVYHARRILMPD 21 | >WP_006366764.1 MULTISPECIES: F0F1 ATP synthase subunit beta [Chlorobium] 22 | MQEGKISQIIGPVVDVDFPEGRLPSILDALTITRPDGTKLVLETQQHLGEERVRTVSMESTDGLVRGTSVANTGMPIQVPVGPEVLSRMMNVVGEPIDGRGPVHTAKTYSIHRSAPKFDEISTKAEMFETGIKVIDLLEPYSRGGKTGLFGGAGVGKTVLIMELINNIAKQQSGFSVFAGVGERTREGNDLWHEMMESGVIDKTALVFGQMNEPPGARQRVALTGLSIAEYFRDEEGRDVLLFIDNIFRFTQAGSEVSALLGRMPSAVGYQPTLATEMGQLQDRIVSTKKGSVTSVQAIYVPADDLTDPAPATAFAHLDATTVLSRQIAELGIYPAVDPLDSTSRILDPNIVGDDHYDTAQAVKQLLQRYKDLQDIIAILGMDELSDEDKLVVSRARKVQRFLSQPFFVAEAFTGLAGKYVKLEETIKGFKEIIAGKHDNLPESAFYLVGTIEEAIEKAKTL 23 | >WP_006367304.1 hypothetical protein [Chlorobium ferrooxidans] 24 | MKKMLSLAAMFAVLSYASPASAELKISGDAATRLRDDFGKAKTAAGVTTNTQDQYFQYRVRLKPSADLGDGYFFKGLIQNEGVAGGWQTIAASNTERNSLEVSQFYFGRNLANSHYSLGRLPLGSLDNPILDLTLYAIPGTGPQGSKLYAVDTPIATNHFDRLFGFNYGVKLGGGELNAVLVNFDNLSSTATANQGLLNDGYGITANYKTTLGDVTVVPQAYYTITRISDGRRPYSFGGQAFIPVGKSKVALSGFYTANKNTYQATNWDYSGYIFRVKGESGPVTAWVDYNRTKDKSAATTVDYDNLFVWGSYKVNVHEAATGSFSITPTLRYRASGSQAVGATKATNDLLRTEVYATVTF 25 | >WP_006367305.1 4Fe-4S dicluster domain-containing protein [Chlorobium ferrooxidans] 26 | MSEHELDLAALSKVGMMRQKQPDYYVMRLKAVAGDMTASQLACVAGVAEKYGRGFVHLSTRQGIEIHYVHRDHLETARLELQGAGIEMGACGPRVRVIVACPGEETCRWGNIDTKKIARELDRRYFKQETPSKFKLAVTGCSNNCTKANENDIGVRGAIEPKWEAGECNDCGLCVSLCPVSAIERRDSDEGYCYAIDEQLCINCSVCTSLCPSNSWVIGRKGYTLYIGGTMGRVPRFASVLKKMVESEEELYLLIEKAIALYREKGRKKERFGHMIDRIGLEAVKEELLAKP 27 | >WP_006367307.1 phosphoadenylyl-sulfate reductase [Chlorobium ferrooxidans] 28 | 
MKREKIEQLNIELAAKSPEEILRAAVGCFGKENMAFASSFGAEDQVLTDMLHREKLPVPVFTLDTGRLHQETYDLFDATRKHYGIEIEALFPETAAVESMLALHGPNLFYESIEKRRECCRIRKVQPLTKKLSTVTAWICGLRREQSVTRTAVAPIEWDAAFGLYKINPLAEVSEAWVWEYIKQHNVPFNRLHNEGYPSIGCAPCTRAVKPGEDLRAGRWWWEIAEHRECGLHREKQ 29 | >WP_006367308.1 sulfate adenylyltransferase subunit 2 [Chlorobium ferrooxidans] 30 | MDHLDKLEAQSVYILREAYREFKSLCMLWSIGKDSTVMLWLARKAFFGHVPIPLVHVDTHYKIPEMIEYRDRLALEWNLNMIYGENREALERKLTFPDGNTDRLTCCKYLKSEALKHTLSGEWPRYRMDHELKHYVQDEGKEPYTGVIAGVRADEEGSRSKERYFSPRDKDNVWDVGDQPPEFWNQFKTDFAPGTHVRIHPLLDWTELNIWEYIEREKISVVSLYFNQGKGIRYRSLGCYPCTNPVESEARTLPEIIDELRSGKFSNIAERSGRAQDSEDGGGLETLRRDGYM 31 | >WP_006367309.1 GTP-binding protein [Chlorobium ferrooxidans] 32 | MNGRSQMNIVIVGHVDHGKSTVIGRLLADTGTLPQGKLESVRESCRKNSKPFEYAFLLDALKDEQAQGITIDMARCFFKTEKRDYIIIDAPGHIEFLKNMITGASRAEAALLVIDAHEGIQENSKRHGHMVSMLGVKHVTVLVNKMDLAGYSEEVFEQLKKEYTAFLAQIKVTPVSFIPVSAREGDNIASLSDRMEWYKGATVLEQLDRFVSSKELQELPFRFPVQDIYKFTRSDDDRRIVAGSVAAGSVQVGDEVLFLPSGKHSVIQSVESFNTLPKTKASAGEATGFTLSTQIYIRPGELMVKPSEPQPEVGSRFRVNIFWVGRAPMIRQKEYKLKLGSARASVKLAEISNTLDASDLSYSRSKQQLDCRDVGECILETARPIAFDPASASETTGRFVIVDNFEICGGGIVLENLSAGETLLQQHIRDRENNWEPGLVRFEERAEANRHKAKFIVFTGAPGTGKRAIAKALERGLFENGLNAYYLGVANIDRGLDADLGTHADSAGERLRRIGELARILTDAGLIFITTIDDADDYDIETLKQLNEPNDILVINTGENGFSRYQPDFELPAGADVSKAVGQVADILKTREIIIDYQI 33 | >WP_006367310.1 bifunctional precorrin-2 dehydrogenase/sirohydrochlorin ferrochelatase [Chlorobium ferrooxidans] 34 | MNYVPITLKVENKQLLILGGGKAALEKVQMLQRFGFTVTVIGEEIDEEILQSGCTCHLRPFRSEDFEDVLLVYACFADREVNRAIKEEANARGILVNTPDDPELCDFITPALFIDGPMMVAVSSGGTDVRKAVAWRNRIKTYFSSHDPIS 35 | >WP_006367312.1 uroporphyrinogen-III C-methyltransferase [Chlorobium ferrooxidans] 36 | 
MTPYPETLRLIAEHLDHSPAEPACFALDPFRLLEVTDDPLQALLDGRSEAALFPASALPEPLPDEFAVAALLSFRTVPDREDAHPLKDMNVIVVRHERADLKVLFAPLDIRRGYGKVYLTGAGAGSRDYLTLKADTLMQRAGVIFYDDLIDTGLLDAYRAEKVYVGKRKGLHHADQDAINRQLFAAALTKQIVVRLKGGDPLVFGRGGEELAWLSDRQIDVEVVPGVSSMMSAAASAGIPLTQRGVSGGVTLQSAHNVLAGDTPRTLVYFMCASRLKELQSTLLHEGIDGKTPVALIRKAGFFDEAMIMTTVEIMHTVELASPLLAIIGRTAALCRKRSKILHTGDDPYRCLLPGKIVPISDITAGGNPLGVVDLSLFSGIVFTGREQIDTLLAAFGTFPPHLVLYAEGRKTAELLRSRGYGRTVMEI 37 | >WP_006367320.1 sulfate ABC transporter ATP-binding protein [Chlorobium ferrooxidans] 38 | MGIELQNITKKFTDYTALDNISLNIASGELIALLGPSGCGKTTLLRIIAGLESADSGKIILEGKDTTNLPPREKNVGFVFQHYALFRRLNVYENVAFGLKVLPRSKRPSKSEIKDRVEHLLKLVQMEWALKRYPSQLSGGQRQRVALARALAVNPRVLLLDEPFSALDAKVRQELRRWLRKLHDEIHVTSIFVTHDQEEALELADRIVVINKGKIEQQGTPQEVYDHPANAFVYNFLGNVNVFHGRVHEGMVTLGGHSVDAPDELKNSAEKTSQTFVRPHEIGISKIRSEGNDLEGTIREIRLLGGQIGLNIECEGFDQPIDAEIPRELYLNLQLKKGDRVFVTFNKVKVFACDYEI 39 | >WP_006367321.1 sulfate ABC transporter permease subunit CysW [Chlorobium ferrooxidans] 40 | MSTPLLSQGTPPVVRKKISDPLPVRILLIGLTLLFFIGFVFLPLGLVFAQAFQKGWDFYLEAIREPYTIEAVKLTLTTVAIVVPLNAFFGVTAAWAITKFSFPGKAALKTLLDLPFAVSPVIAGLIFVLLLGSRTPFGSWLGEQGIKIIFSTPGIIIATLFVTFPFVARELIPLMEAQGRDEEEAALTLGAKGWQIFGKITLPNIRWGLLYGMILCSARAIGEFGAVSVVSGHIRGQTNTIPLHVEILYSEYNFTASFAVASLLVFLALTTVIVKAVMENRFQKNSSQH 41 | >WP_006367322.1 sulfate ABC transporter permease subunit CysT [Chlorobium ferrooxidans] 42 | MAIFQKKRRNILPGFGLSMGYTVFYLSAVVIVPLSMIFFNAIPMGWEPFLSAVTAPRVLASYKLSFSTAFLAALFDAVAGLLTAWVLVRYRFPGKAFLDALVDIPFALPTAVAGICFATLYSPVGWLGEITAKWNIEVINSPTGIVVALIFIGFPFVVRTLQPVLEELEPEIEESAHCLGATRLQTFRKILLPHLFPALLTGSTLAFARGIGEYGSVIFIAGNLPMKTEIAPLMIMSKLDQYDYNGASAVALVLLVISFSMILLLNAIQEWQQKEYR 43 | >WP_006367323.1 sulfate ABC transporter substrate-binding protein [Chlorobium ferrooxidans] 44 | 
MHLNWTKKITLAASLILASGLPGVVAEAVPAPAKLLNVSYDPTRELYKSENAAFINYWKAKTGQTVTIDQSHGGSGKQGRAVIDGLEADVVTLALAYDIDAIADSRLLDVNWQKRLANNSTPYTSTIVFVVRKGNPKRINGWDDLVKPGVSVITPNPKTSGGARWNYLAAWGFKQKQTGSALKAKAFVKALYKNVPVLDTGARGSTLTFAQRGLGDVFLSWENEAHLILKEFGSDKFEIVAPATSILAEPPVAVVDKNVAKHGTRKLAEAYLNFLYTPQSQEIIARNYYRPSSPAALKKYGANFPKIKLFTLKEFFGDWRTAQKTHFKDGGVFDQIYLP 45 | >WP_006367388.1 hypothetical protein [Chlorobium ferrooxidans] 46 | MTLKKTMLVGASCLSLMALSSPQEASATPVFARQTSQSCAACHFQRFPMLNAYGRSFKANGYTTVGKQGTIEDTNLSTPAILNAGLVSKFRYQKTNGTADTELNAGEFQVPDEATLFVGGKVAKNIGFQAEIGLTTTASMLGFKVPFVFPANDFTFSAIPFYSDAQGAAYGFELLNTGALRFSRTFENRTEASAQQYIGAAHKTTGVALVALHKYGYISYTPYLVKDVLTDGIAFSSGRPLSYIRGVVTPEKVGNWDLAAGAQIWTGSSEEGTASAPTRNHASAWAIDAQAQGKAGDMPLGVYVAYGSAAASKAGETANFYNASTVASKNALTIATELGIVPNRLGIGVAYRNANSGAGAADNDNALTLGANYQIARNMVFQLNHSFYSGDIKTDAVGKQLTTAMFYSAF 47 | >WP_006367389.1 cytochrome c5 family protein [Chlorobium ferrooxidans] 48 | MRRFVQVCSLMVVFICSSQIVNAEDIARATLRAKYDMVQGKDVYERACSVCHSSGVMDAPKFCDITAWKPRMAHGMEAMVKHAVEGFNNMPAKGGMDALTLTESANAVAYMVDQCLFD 49 | >WP_006367390.1 50S ribosomal protein L9 [Chlorobium ferrooxidans] 50 | MKVILRKDVATLGDTGEVVAVKNGYANNYLIPQGIAIRATEGTLKALETEKKQQAKKVELLRKHAREVARNIEQLALKVYAKAGESGKLFGTVTSADIAEALKAQGFEIDRRKITLDAPVKTLGKFEADARLFSDISVKVHFEVEAEGAGE 51 | >WP_006367391.1 MULTISPECIES: 30S ribosomal protein S18 [Chlorobium] 52 | MRQKFTQSQGHKSQGNKSLSNALASKKKVSKNQVVFFDYRDERKLKRFINDQGKIIPRRITGLSAKEQNLLTHSVKWARFLAVIPYVVDEYK 53 | >WP_006367391.1 MULTISPECIES: 30S ribosomal protein S18 [Chlorobium] 54 | MRQKFTQSQGHKSQGNKSLSNALASKKKVSKNQVVFFDYRDERKLKRFINDQGKIIPRRITGLSAKEQNLLTHSVKWARFLAVIPYVVDEYK 55 | >WP_006367392.1 single-stranded DNA-binding protein [Chlorobium ferrooxidans] 56 | MAELKMPEINSVINSVIIAGNLTKDPVFRQTNSGGTPVVNFSIACNRRFRDSNHMWQEDVCYVGVVAWNKLAESCRDNLKKSSAVLVDGELQSRTWKAQDGTSRTVVEIKARRIQFLNKRKKNGEDDEEGFIEDECHDIHHGEIPDDEASHIYEYKYLSSD 57 | >WP_006367393.1 MULTISPECIES: 30S ribosomal protein S6 [Chlorobium] 58 | 
MEKNKLYECTVIIDGGLQDEAITAAMELVQRVITEKGGSISGVLEIGRRKTAYPIKKRTIGYYAHIEFTGAPGVIAEIEKVLRYEEDLLRYLIIQLTGALLEMRKRVEKYSVVIGSPEDVAALEAAASEAAAK 59 | >WP_006367393.1 MULTISPECIES: 30S ribosomal protein S6 [Chlorobium] 60 | MEKNKLYECTVIIDGGLQDEAITAAMELVQRVITEKGGSISGVLEIGRRKTAYPIKKRTIGYYAHIEFTGAPGVIAEIEKVLRYEEDLLRYLIIQLTGALLEMRKRVEKYSVVIGSPEDVAALEAAASEAAAK 61 | >WP_006367394.1 glycosyltransferase family 9 protein [Chlorobium ferrooxidans] 62 | MSSSNRLIALLRSFRVQFLPNQNRVIKPGGSSGRNRMSVIKKILVIRLSSIGDIILTTPLLRRLNEAFPDAVIDYCTKAAFSTLLSSNPRISSHYTPEQPPFGAYDLIVDLQNNSRSRSLVRHLHAAKVVRYRKQNWKKWLLVQFKVNLYGKERHVVERYQDSLERFALPVDRKGCELYPSAEERGFAESFFITDQKTLALCFGAKHYTKRYPPERFAELLGLLLKEESLQVLLLGGEEDAPQASGIMHSLPDHYRKKVVNLSGNCSLMQTAAILERCDAVLCNDTGLMHMASAFGKKLFVLFGSSSSAFGFLPYHAPFDLFEVSGLRCRPCSHIGRERCPRGHFRCMNELSARDIASRILDYFRQAEL 63 | >WP_006367398.1 CTP synthase [Chlorobium ferrooxidans] 64 | MARPKNVKHIFVTGGVISSLGKGILSASLGMLLKSRGLRVAIQKYDPYINVDPGTMSPYQHGEVYVTDDGAETDLDLGHYERFLDESTSQSSNLTMGRVYKSVIDKERRGEYLGATVQVVPHVIDEIKDRMAELAKNGNLDVLITEIGGTIGDIESLPFLEAMRQMKLEMGDRNMLNIHLTFVPYIKAASELKTKPTQHSVKMLLETGIQPDILVCRSEKPLSREIKNKVGHFCNVNDLDVIGLNDCDTIYDVPLLLLKEKLDLRVLKKLGLKKYKEPNLEYWREFCNKVKFPEEGEVTIGICGKYTEYPDAYKSILEAFIHAGASNNVKVNIKLLRAEDAEINTFNLSKELEGVSGILVAPGFGDRGIEGKINFIRYAREKNVPFFGICLGMQCATIEFARNVCGLQDANSTEFNKRTRFPVIDLMEHQKKVKEKGGTMRLGSYPCIIKEGSMAHAVYQKFLINERHRHRFEFNNTYRNSFEESGLVFSGTSPNGELVEIIELKDHRWFVGVQFHPELKSRVQKVHPLFHGFVAAAKAYVQGGRQMELPVEQPSFMPVENEAGE 65 | >WP_006367399.1 cytochrome c551 [Chlorobium ferrooxidans] 66 | MEQKKPYIIKEQPGTKYYCACSSSKNLPYCDGSHKGSGLHPVKVEIDEEKTVAICSCGLSANPPYCDGSHKSAQ 67 | >WP_050770676.1 sulfurtransferase TusA family protein [Chlorobium ferrooxidans] 68 | MQGISCPANAMRVRAAVAELTGDELLEVLVDEGEAVLRVARTLKDCGHRIVKVVNRGEGVSVIVGK 69 | -------------------------------------------------------------------------------- /testing/inputs/subjects/Chl_ferrooxidans_KoFox.faa: -------------------------------------------------------------------------------- 1 | >WP_006365806.1 DNA polymerase V 
subunit UmuC [Chlorobium ferrooxidans] 2 | MFALIDCNNFYVSCERLFNPGLRGRPVVVLSNNDGCFISRSEEAKAIGLGMGEPAFKCRELLQQHRVALYSANFPLYGDLSARVMRTLADFAPDIEIYSIDECFLDLSGFGRFDLDRYARQIRQTVEKHTGIPVSIGIGATKTLAKAASRMAKKNASLGGVSLLRTDAGIRDALSGMAVEHIWGIGRRYGALLHRHRIHTALDFAAAPAPWVRRNLHITGARVQEELNGRSCLPLELVRPPKQSICTSRSFGTPVIAKEELQHAVVHFASKCAFKLRREESRASLVTVFACTSPFAPKEKRYWGTETLSLPAPSNDTLLLLKAATLALERLYRRGYEYKKAGVILSGIGSVSEPQQTTLSLFDTDPLPEKQGRAASLMEALDALNFRYGPGTLRLASDCSSGWKQRQEKLSPAYTTRWSDIIEVKP 3 | >WP_006365807.1 mitomycin resistance protein [Chlorobium ferrooxidans] 4 | MNPAKVSRERLKELTDLPNIGKSIAGDLRLIGISRPEQLVGRSALEMYHDLCRVTDTVQDPCVLDVFMSLTSFMSGSEPRTWWSFTEERKRLLSKEAPTGTSKNYHERFEDF 5 | >WP_006365808.1 DUF1848 domain-containing protein [Chlorobium ferrooxidans] 6 | MIISASRRTDIPAFYGEWLVNRLKAGEVLVRNPMQPKQVSRIMLTPATVDALVFWTKNPEPFLSRLPEIDALGYSYYFLFTLTPYDVTLEPGVPEKGSIVEVFRRLSRLVGPEKVVWRYDPVVITDQFTPAWHAASFPRLAEKLSGYTGRCIISFLDDYRKVRSRMRNLRYTLPDSAEMGELAALFADIAAKQDIALCTCSQEVDLSRYGIMHSRCIDGGLVERISGRRMRGAKKDGSQRHACGCIESRDIGSYDTCPHGCLYCYAVAGQAKAGKAFEAYDPSSPMLCDSLHGDETITTPAPRKRAASARTPSAPGLHQGELFS 7 | >WP_006365809.1 type II toxin-antitoxin system Phd/YefM family antitoxin [Chlorobium ferrooxidans] 8 | MHAIDTRNVTPLTDFRNNIKRYMEELSVSKKPLLLTQHGKSAAVLLDAEQYQKMLDQITFMQLVTEGLEEYRNNRTIPAEELFASLDKIIAEAEKQ 9 | >WP_006365810.1 bacterioferritin [Chlorobium ferrooxidans] 10 | MKGNPKIIEKLNSLLADELTAINQYIVHSEMCSNWGYEKLHNADEKRAIDEMKHAEKLIARILFLEGIPVVSDLKKIKIGADVAAQHKNDRHSEDGAIASYNDGIRLAVEAGDNGTRELLESILRDEEAHIDWLEAQIDQISQMGIQNYLVEQLG 11 | >WP_006365811.1 ArsR family transcriptional regulator [Chlorobium ferrooxidans] 12 | MLDQRAEGCQEECFHPDVVERVRKAMPGEPLQEELAQLFKVLGDHTRIRILNALCLSELCVCDLTSILGMNQSAVSHQLRVLRDAKLVKSRKQGKNVLYTLDDTHVSTLIRTGSEHIREGK 13 | >WP_006365812.1 permease [Chlorobium ferrooxidans] 14 | 
MNELLIVVSKVLLASWSVLLESAPFVLLGFFIAGVLKAFVPDDFVATHLGGGGMASIFKASLFGIPIPLCSCGVLPAAAGLKKQGSGNGPVAAFLISTPETGVDSIAVSWALLDPLMTVIRPVIALFMAIATGIAVSFAGKQSANEVAADIDKSGESACASSCCCSHKKQEKLTVAGKFREGMSFAFGDLLNDVGLWLFIGILLSGLISVFVSTEVVSRYLSNEYLSMLVMFIVSVPLYVCATASTPIVAALALKGISPGAALVFLLAGPATNAASLPVIFRMLGRKSALIYLASIVLISLIGGVVVNDLYSYLGYDITHWVSKGAHEEAGIVSIAASVVLLLLVLKSLLPKKQIGHPR 15 | >WP_006365814.1 cadmium-translocating P-type ATPase [Chlorobium ferrooxidans] 16 | MTTTGTYSVKGMHCASCAAIITKKLSKVEGITRADVNLATEKVTIAFDHETLPVEALNDAVSRYGYAIAAEPGTASAGESQKIAASGAIRLKEEKKQALLDQKAKVQFALPIALLVFILMMWDIGSRFIESVPNLPIPMELFNIISMVLATWFVFRIGAPFLHGIVMFLQSGAASMDTLIGIGTLTAYVYSALITLIPSLRELLRVPDYTYFDVTIVVIGFVLLGKYLEARSRMKTGEAIEKLINLQAKTAIVLREGKEIEIPSEAVRKGDTVIIKPGAKLPVDGIIIEGFSAIDESMVTGESIPADKKSGDEVIGGTINKQGSFTFRASNVGSDTVLARIIRMVEEAQGSKAPIQEIADKIAGIFVPVVLIIAAATFLIWITVGSSYLGFSVALSYAISGMVGILVIACPCALGLATPTAIIVGIGKGAEYGILIRNAESLEKLSSVDTVVFDKTGTITTGSPKVTDVTPLSEGTTPESLLQIAASVENRSEHPLAEAILDEAKRRSITLRDISEFKALEGMGVSATIAGLPVTIRKPLEAERELAEVKRLQAEGKTVVMIEENRIALGLIALSDTVKTEAKAAVAALHRRGLKVIMLTGDNRAAANYMAGQVGIDEVIAEVLPTEKAAKIKALQSEGRKVAMAGDGINDAPALAQADVGIAMATGTDIAIESAGITLLKGDINKVAQSITLSRATMNVIRQNLFWAFIYNIIGIPLAAGALYPFWGIFLNPVFSGLAMAGSSVSVVSNSLRLKAKKLDR 17 | >WP_006365815.1 DUF305 domain-containing protein [Chlorobium ferrooxidans] 18 | MSSNDFNHFHEKTMKNISLSLAVALSIVMAVIGIGTGYWLTPQYSLSMYDKNSMDLGQADKWVDLRYLDGMIAHHRGAILLAEKASISQRTEIRNLCKEILKNEPVAIAELYQWKKEWYGDRRKVRDPQVANLGSSDSKLDLRFLNALIAHHENGVRMTREIRLKSSRSEVLDNADAVELFLKGGLGMLREWRKEWYNVIDTNPYL 19 | >WP_006365940.1 type II toxin-antitoxin system RelE/ParE family toxin [Chlorobium ferrooxidans] 20 | MNVEFSERAARQITDVVRYIAADKPAAAKKWADSVKKAVRKLSDYPHLGRVVPEFVDASLRELLHGEYRIVYKIDEQFSRIVIVAVYHARRILMPD 21 | >WP_006366764.1 MULTISPECIES: F0F1 ATP synthase subunit beta [Chlorobium] 22 | 
MQEGKISQIIGPVVDVDFPEGRLPSILDALTITRPDGTKLVLETQQHLGEERVRTVSMESTDGLVRGTSVANTGMPIQVPVGPEVLSRMMNVVGEPIDGRGPVHTAKTYSIHRSAPKFDEISTKAEMFETGIKVIDLLEPYSRGGKTGLFGGAGVGKTVLIMELINNIAKQQSGFSVFAGVGERTREGNDLWHEMMESGVIDKTALVFGQMNEPPGARQRVALTGLSIAEYFRDEEGRDVLLFIDNIFRFTQAGSEVSALLGRMPSAVGYQPTLATEMGQLQDRIVSTKKGSVTSVQAIYVPADDLTDPAPATAFAHLDATTVLSRQIAELGIYPAVDPLDSTSRILDPNIVGDDHYDTAQAVKQLLQRYKDLQDIIAILGMDELSDEDKLVVSRARKVQRFLSQPFFVAEAFTGLAGKYVKLEETIKGFKEIIAGKHDNLPESAFYLVGTIEEAIEKAKTL 23 | >WP_006367304.1 hypothetical protein [Chlorobium ferrooxidans] 24 | MKKMLSLAAMFAVLSYASPASAELKISGDAATRLRDDFGKAKTAAGVTTNTQDQYFQYRVRLKPSADLGDGYFFKGLIQNEGVAGGWQTIAASNTERNSLEVSQFYFGRNLANSHYSLGRLPLGSLDNPILDLTLYAIPGTGPQGSKLYAVDTPIATNHFDRLFGFNYGVKLGGGELNAVLVNFDNLSSTATANQGLLNDGYGITANYKTTLGDVTVVPQAYYTITRISDGRRPYSFGGQAFIPVGKSKVALSGFYTANKNTYQATNWDYSGYIFRVKGESGPVTAWVDYNRTKDKSAATTVDYDNLFVWGSYKVNVHEAATGSFSITPTLRYRASGSQAVGATKATNDLLRTEVYATVTF 25 | >WP_006367305.1 4Fe-4S dicluster domain-containing protein [Chlorobium ferrooxidans] 26 | MSEHELDLAALSKVGMMRQKQPDYYVMRLKAVAGDMTASQLACVAGVAEKYGRGFVHLSTRQGIEIHYVHRDHLETARLELQGAGIEMGACGPRVRVIVACPGEETCRWGNIDTKKIARELDRRYFKQETPSKFKLAVTGCSNNCTKANENDIGVRGAIEPKWEAGECNDCGLCVSLCPVSAIERRDSDEGYCYAIDEQLCINCSVCTSLCPSNSWVIGRKGYTLYIGGTMGRVPRFASVLKKMVESEEELYLLIEKAIALYREKGRKKERFGHMIDRIGLEAVKEELLAKP 27 | >WP_006367307.1 phosphoadenylyl-sulfate reductase [Chlorobium ferrooxidans] 28 | MKREKIEQLNIELAAKSPEEILRAAVGCFGKENMAFASSFGAEDQVLTDMLHREKLPVPVFTLDTGRLHQETYDLFDATRKHYGIEIEALFPETAAVESMLALHGPNLFYESIEKRRECCRIRKVQPLTKKLSTVTAWICGLRREQSVTRTAVAPIEWDAAFGLYKINPLAEVSEAWVWEYIKQHNVPFNRLHNEGYPSIGCAPCTRAVKPGEDLRAGRWWWEIAEHRECGLHREKQ 29 | >WP_006367308.1 sulfate adenylyltransferase subunit 2 [Chlorobium ferrooxidans] 30 | MDHLDKLEAQSVYILREAYREFKSLCMLWSIGKDSTVMLWLARKAFFGHVPIPLVHVDTHYKIPEMIEYRDRLALEWNLNMIYGENREALERKLTFPDGNTDRLTCCKYLKSEALKHTLSGEWPRYRMDHELKHYVQDEGKEPYTGVIAGVRADEEGSRSKERYFSPRDKDNVWDVGDQPPEFWNQFKTDFAPGTHVRIHPLLDWTELNIWEYIEREKISVVSLYFNQGKGIRYRSLGCYPCTNPVESEARTLPEIIDELRSGKFSNIAERSGRAQDSEDGGGLETLRRDGYM 31 | 
>WP_006367309.1 GTP-binding protein [Chlorobium ferrooxidans] 32 | MNGRSQMNIVIVGHVDHGKSTVIGRLLADTGTLPQGKLESVRESCRKNSKPFEYAFLLDALKDEQAQGITIDMARCFFKTEKRDYIIIDAPGHIEFLKNMITGASRAEAALLVIDAHEGIQENSKRHGHMVSMLGVKHVTVLVNKMDLAGYSEEVFEQLKKEYTAFLAQIKVTPVSFIPVSAREGDNIASLSDRMEWYKGATVLEQLDRFVSSKELQELPFRFPVQDIYKFTRSDDDRRIVAGSVAAGSVQVGDEVLFLPSGKHSVIQSVESFNTLPKTKASAGEATGFTLSTQIYIRPGELMVKPSEPQPEVGSRFRVNIFWVGRAPMIRQKEYKLKLGSARASVKLAEISNTLDASDLSYSRSKQQLDCRDVGECILETARPIAFDPASASETTGRFVIVDNFEICGGGIVLENLSAGETLLQQHIRDRENNWEPGLVRFEERAEANRHKAKFIVFTGAPGTGKRAIAKALERGLFENGLNAYYLGVANIDRGLDADLGTHADSAGERLRRIGELARILTDAGLIFITTIDDADDYDIETLKQLNEPNDILVINTGENGFSRYQPDFELPAGADVSKAVGQVADILKTREIIIDYQI 33 | >WP_006367310.1 bifunctional precorrin-2 dehydrogenase/sirohydrochlorin ferrochelatase [Chlorobium ferrooxidans] 34 | MNYVPITLKVENKQLLILGGGKAALEKVQMLQRFGFTVTVIGEEIDEEILQSGCTCHLRPFRSEDFEDVLLVYACFADREVNRAIKEEANARGILVNTPDDPELCDFITPALFIDGPMMVAVSSGGTDVRKAVAWRNRIKTYFSSHDPIS 35 | >WP_006367312.1 uroporphyrinogen-III C-methyltransferase [Chlorobium ferrooxidans] 36 | MTPYPETLRLIAEHLDHSPAEPACFALDPFRLLEVTDDPLQALLDGRSEAALFPASALPEPLPDEFAVAALLSFRTVPDREDAHPLKDMNVIVVRHERADLKVLFAPLDIRRGYGKVYLTGAGAGSRDYLTLKADTLMQRAGVIFYDDLIDTGLLDAYRAEKVYVGKRKGLHHADQDAINRQLFAAALTKQIVVRLKGGDPLVFGRGGEELAWLSDRQIDVEVVPGVSSMMSAAASAGIPLTQRGVSGGVTLQSAHNVLAGDTPRTLVYFMCASRLKELQSTLLHEGIDGKTPVALIRKAGFFDEAMIMTTVEIMHTVELASPLLAIIGRTAALCRKRSKILHTGDDPYRCLLPGKIVPISDITAGGNPLGVVDLSLFSGIVFTGREQIDTLLAAFGTFPPHLVLYAEGRKTAELLRSRGYGRTVMEI 37 | >WP_006367320.1 sulfate ABC transporter ATP-binding protein [Chlorobium ferrooxidans] 38 | MGIELQNITKKFTDYTALDNISLNIASGELIALLGPSGCGKTTLLRIIAGLESADSGKIILEGKDTTNLPPREKNVGFVFQHYALFRRLNVYENVAFGLKVLPRSKRPSKSEIKDRVEHLLKLVQMEWALKRYPSQLSGGQRQRVALARALAVNPRVLLLDEPFSALDAKVRQELRRWLRKLHDEIHVTSIFVTHDQEEALELADRIVVINKGKIEQQGTPQEVYDHPANAFVYNFLGNVNVFHGRVHEGMVTLGGHSVDAPDELKNSAEKTSQTFVRPHEIGISKIRSEGNDLEGTIREIRLLGGQIGLNIECEGFDQPIDAEIPRELYLNLQLKKGDRVFVTFNKVKVFACDYEI 39 | >WP_006367321.1 sulfate ABC transporter permease subunit CysW [Chlorobium 
ferrooxidans] 40 | MSTPLLSQGTPPVVRKKISDPLPVRILLIGLTLLFFIGFVFLPLGLVFAQAFQKGWDFYLEAIREPYTIEAVKLTLTTVAIVVPLNAFFGVTAAWAITKFSFPGKAALKTLLDLPFAVSPVIAGLIFVLLLGSRTPFGSWLGEQGIKIIFSTPGIIIATLFVTFPFVARELIPLMEAQGRDEEEAALTLGAKGWQIFGKITLPNIRWGLLYGMILCSARAIGEFGAVSVVSGHIRGQTNTIPLHVEILYSEYNFTASFAVASLLVFLALTTVIVKAVMENRFQKNSSQH 41 | >WP_006367322.1 sulfate ABC transporter permease subunit CysT [Chlorobium ferrooxidans] 42 | MAIFQKKRRNILPGFGLSMGYTVFYLSAVVIVPLSMIFFNAIPMGWEPFLSAVTAPRVLASYKLSFSTAFLAALFDAVAGLLTAWVLVRYRFPGKAFLDALVDIPFALPTAVAGICFATLYSPVGWLGEITAKWNIEVINSPTGIVVALIFIGFPFVVRTLQPVLEELEPEIEESAHCLGATRLQTFRKILLPHLFPALLTGSTLAFARGIGEYGSVIFIAGNLPMKTEIAPLMIMSKLDQYDYNGASAVALVLLVISFSMILLLNAIQEWQQKEYR 43 | >WP_006367323.1 sulfate ABC transporter substrate-binding protein [Chlorobium ferrooxidans] 44 | MHLNWTKKITLAASLILASGLPGVVAEAVPAPAKLLNVSYDPTRELYKSENAAFINYWKAKTGQTVTIDQSHGGSGKQGRAVIDGLEADVVTLALAYDIDAIADSRLLDVNWQKRLANNSTPYTSTIVFVVRKGNPKRINGWDDLVKPGVSVITPNPKTSGGARWNYLAAWGFKQKQTGSALKAKAFVKALYKNVPVLDTGARGSTLTFAQRGLGDVFLSWENEAHLILKEFGSDKFEIVAPATSILAEPPVAVVDKNVAKHGTRKLAEAYLNFLYTPQSQEIIARNYYRPSSPAALKKYGANFPKIKLFTLKEFFGDWRTAQKTHFKDGGVFDQIYLP 45 | >WP_006367388.1 hypothetical protein [Chlorobium ferrooxidans] 46 | MTLKKTMLVGASCLSLMALSSPQEASATPVFARQTSQSCAACHFQRFPMLNAYGRSFKANGYTTVGKQGTIEDTNLSTPAILNAGLVSKFRYQKTNGTADTELNAGEFQVPDEATLFVGGKVAKNIGFQAEIGLTTTASMLGFKVPFVFPANDFTFSAIPFYSDAQGAAYGFELLNTGALRFSRTFENRTEASAQQYIGAAHKTTGVALVALHKYGYISYTPYLVKDVLTDGIAFSSGRPLSYIRGVVTPEKVGNWDLAAGAQIWTGSSEEGTASAPTRNHASAWAIDAQAQGKAGDMPLGVYVAYGSAAASKAGETANFYNASTVASKNALTIATELGIVPNRLGIGVAYRNANSGAGAADNDNALTLGANYQIARNMVFQLNHSFYSGDIKTDAVGKQLTTAMFYSAF 47 | >WP_006367389.1 cytochrome c5 family protein [Chlorobium ferrooxidans] 48 | MRRFVQVCSLMVVFICSSQIVNAEDIARATLRAKYDMVQGKDVYERACSVCHSSGVMDAPKFCDITAWKPRMAHGMEAMVKHAVEGFNNMPAKGGMDALTLTESANAVAYMVDQCLFD 49 | >WP_006367390.1 50S ribosomal protein L9 [Chlorobium ferrooxidans] 50 | 
MKVILRKDVATLGDTGEVVAVKNGYANNYLIPQGIAIRATEGTLKALETEKKQQAKKVELLRKHAREVARNIEQLALKVYAKAGESGKLFGTVTSADIAEALKAQGFEIDRRKITLDAPVKTLGKFEADARLFSDISVKVHFEVEAEGAGE 51 | >WP_006367391.1 MULTISPECIES: 30S ribosomal protein S18 [Chlorobium] 52 | MRQKFTQSQGHKSQGNKSLSNALASKKKVSKNQVVFFDYRDERKLKRFINDQGKIIPRRITGLSAKEQNLLTHSVKWARFLAVIPYVVDEYK 53 | >WP_006367391.1 MULTISPECIES: 30S ribosomal protein S18 [Chlorobium] 54 | MRQKFTQSQGHKSQGNKSLSNALASKKKVSKNQVVFFDYRDERKLKRFINDQGKIIPRRITGLSAKEQNLLTHSVKWARFLAVIPYVVDEYK 55 | >WP_006367392.1 single-stranded DNA-binding protein [Chlorobium ferrooxidans] 56 | MAELKMPEINSVINSVIIAGNLTKDPVFRQTNSGGTPVVNFSIACNRRFRDSNHMWQEDVCYVGVVAWNKLAESCRDNLKKSSAVLVDGELQSRTWKAQDGTSRTVVEIKARRIQFLNKRKKNGEDDEEGFIEDECHDIHHGEIPDDEASHIYEYKYLSSD 57 | >WP_006367393.1 MULTISPECIES: 30S ribosomal protein S6 [Chlorobium] 58 | MEKNKLYECTVIIDGGLQDEAITAAMELVQRVITEKGGSISGVLEIGRRKTAYPIKKRTIGYYAHIEFTGAPGVIAEIEKVLRYEEDLLRYLIIQLTGALLEMRKRVEKYSVVIGSPEDVAALEAAASEAAAK 59 | >WP_006367393.1 MULTISPECIES: 30S ribosomal protein S6 [Chlorobium] 60 | MEKNKLYECTVIIDGGLQDEAITAAMELVQRVITEKGGSISGVLEIGRRKTAYPIKKRTIGYYAHIEFTGAPGVIAEIEKVLRYEEDLLRYLIIQLTGALLEMRKRVEKYSVVIGSPEDVAALEAAASEAAAK 61 | >WP_006367394.1 glycosyltransferase family 9 protein [Chlorobium ferrooxidans] 62 | MSSSNRLIALLRSFRVQFLPNQNRVIKPGGSSGRNRMSVIKKILVIRLSSIGDIILTTPLLRRLNEAFPDAVIDYCTKAAFSTLLSSNPRISSHYTPEQPPFGAYDLIVDLQNNSRSRSLVRHLHAAKVVRYRKQNWKKWLLVQFKVNLYGKERHVVERYQDSLERFALPVDRKGCELYPSAEERGFAESFFITDQKTLALCFGAKHYTKRYPPERFAELLGLLLKEESLQVLLLGGEEDAPQASGIMHSLPDHYRKKVVNLSGNCSLMQTAAILERCDAVLCNDTGLMHMASAFGKKLFVLFGSSSSAFGFLPYHAPFDLFEVSGLRCRPCSHIGRERCPRGHFRCMNELSARDIASRILDYFRQAEL 63 | >WP_006367398.1 CTP synthase [Chlorobium ferrooxidans] 64 | 
MARPKNVKHIFVTGGVISSLGKGILSASLGMLLKSRGLRVAIQKYDPYINVDPGTMSPYQHGEVYVTDDGAETDLDLGHYERFLDESTSQSSNLTMGRVYKSVIDKERRGEYLGATVQVVPHVIDEIKDRMAELAKNGNLDVLITEIGGTIGDIESLPFLEAMRQMKLEMGDRNMLNIHLTFVPYIKAASELKTKPTQHSVKMLLETGIQPDILVCRSEKPLSREIKNKVGHFCNVNDLDVIGLNDCDTIYDVPLLLLKEKLDLRVLKKLGLKKYKEPNLEYWREFCNKVKFPEEGEVTIGICGKYTEYPDAYKSILEAFIHAGASNNVKVNIKLLRAEDAEINTFNLSKELEGVSGILVAPGFGDRGIEGKINFIRYAREKNVPFFGICLGMQCATIEFARNVCGLQDANSTEFNKRTRFPVIDLMEHQKKVKEKGGTMRLGSYPCIIKEGSMAHAVYQKFLINERHRHRFEFNNTYRNSFEESGLVFSGTSPNGELVEIIELKDHRWFVGVQFHPELKSRVQKVHPLFHGFVAAAKAYVQGGRQMELPVEQPSFMPVENEAGE 65 | >WP_006367399.1 cytochrome c551 [Chlorobium ferrooxidans] 66 | MEQKKPYIIKEQPGTKYYCACSSSKNLPYCDGSHKGSGLHPVKVEIDEEKTVAICSCGLSANPPYCDGSHKSAQ 67 | >WP_050770676.1 sulfurtransferase TusA family protein [Chlorobium ferrooxidans] 68 | MQGISCPANAMRVRAAVAELTGDELLEVLVDEGEAVLRVARTLKDCGHRIVKVVNRGEGVSVIVGK 69 | -------------------------------------------------------------------------------- /testing/inputs/subjects/Chl_phaeoferrooxidans_KB01.faa: -------------------------------------------------------------------------------- 1 | >WP_006366764.1 MULTISPECIES: F0F1 ATP synthase subunit beta [Chlorobium] 2 | MQEGKISQIIGPVVDVDFPEGRLPSILDALTITRPDGTKLVLETQQHLGEERVRTVSMESTDGLVRGTSVANTGMPIQVPVGPEVLSRMMNVVGEPIDGRGPVHTAKTYSIHRSAPKFDEISTKAEMFETGIKVIDLLEPYSRGGKTGLFGGAGVGKTVLIMELINNIAKQQSGFSVFAGVGERTREGNDLWHEMMESGVIDKTALVFGQMNEPPGARQRVALTGLSIAEYFRDEEGRDVLLFIDNIFRFTQAGSEVSALLGRMPSAVGYQPTLATEMGQLQDRIVSTKKGSVTSVQAIYVPADDLTDPAPATAFAHLDATTVLSRQIAELGIYPAVDPLDSTSRILDPNIVGDDHYDTAQAVKQLLQRYKDLQDIIAILGMDELSDEDKLVVSRARKVQRFLSQPFFVAEAFTGLAGKYVKLEETIKGFKEIIAGKHDNLPESAFYLVGTIEEAIEKAKTL 3 | >WP_006367391.1 MULTISPECIES: 30S ribosomal protein S18 [Chlorobium] 4 | MRQKFTQSQGHKSQGNKSLSNALASKKKVSKNQVVFFDYRDERKLKRFINDQGKIIPRRITGLSAKEQNLLTHSVKWARFLAVIPYVVDEYK 5 | >WP_006367391.1 MULTISPECIES: 30S ribosomal protein S18 [Chlorobium] 6 | MRQKFTQSQGHKSQGNKSLSNALASKKKVSKNQVVFFDYRDERKLKRFINDQGKIIPRRITGLSAKEQNLLTHSVKWARFLAVIPYVVDEYK 7 | >WP_006367393.1 
MULTISPECIES: 30S ribosomal protein S6 [Chlorobium] 8 | MEKNKLYECTVIIDGGLQDEAITAAMELVQRVITEKGGSISGVLEIGRRKTAYPIKKRTIGYYAHIEFTGAPGVIAEIEKVLRYEEDLLRYLIIQLTGALLEMRKRVEKYSVVIGSPEDVAALEAAASEAAAK 9 | >WP_006367393.1 MULTISPECIES: 30S ribosomal protein S6 [Chlorobium] 10 | MEKNKLYECTVIIDGGLQDEAITAAMELVQRVITEKGGSISGVLEIGRRKTAYPIKKRTIGYYAHIEFTGAPGVIAEIEKVLRYEEDLLRYLIIQLTGALLEMRKRVEKYSVVIGSPEDVAALEAAASEAAAK 11 | >WP_076789645.1 sulfurtransferase TusA family protein [Chlorobium sp. KB01] 12 | MPGPEITKTLDVQGISCPANAMRVRAAIAELKGDELLEVLVDEGEAVLRVARTLKDCGHRIVKVENRGEGVSVIVGK 13 | >WP_076789648.1 4Fe-4S dicluster domain-containing protein [Chlorobium sp. KB01] 14 | MSEHEIDLAALGKVGMMRQKQPDYYVMRLKAVAGDMTASQLACVAGVAEKYGRGFVHLSTRQGIEIHYVHRDHLETARLELQGAGIEMGACGPRVRVIVACPGEETCRWGSIDTKKIARELDRRYFKQETPSKFKLAVTGCSNNCTKANENDIGIRGAIEPEWAADNCNDCGLCISTCPVNAIDRKDSDEGFLYQIDEDLCINCSVCTSLCPSNSWVIGRKGYTLYIGGTMGRVPRFASVLKKMVESEEELYLLIDKSIALYREKGRKKERFGHMIDRIGLEAVKEELLARP 15 | >WP_076790309.1 peptidase S24 [Chlorobium sp. KB01] 16 | MKLTRIYQSRVIDFYSPDLSTELRLPFAETGVSAGFPSPADDYMELSLDLNKALVRHPAATFYARVKGSSMIDAGIMEGDILVIDKSIDPKDGDIAICFLDGEFTVKRIMQHADGLLLMPANEEFTPIRITEENDFLVWGVVTYIIHKAR 17 | >WP_076790311.1 Y-family DNA polymerase [Chlorobium sp. KB01] 18 | MFALIDCNNFYVSCERLFNPGLRGRPVVVLSNNDGCFISRSEEAKAIGLGMGEPAFKCRELLRRHQVAVYSANFPLYGDISGRVMRTLADFAPDIEIYSIDECFLDLSGFDRFDLDSYARQMRRTVEKHTGIPVSIGIGATKTLAKAASRMAKKNPALGGVFLLRTDAGIRDALSGMEVEHIWGIGRRYGVLLHRNGIHSALDFACVPAPWVRRNLHITGARVQEELNGRSCLSLELVRSPKQSICTSRSFGTSVTLKEELQHAVVHFASKCAFKLRREESRASLVTVFACTSPFAPKEKRYWGTETLSLPAPSNDTLLLLKAATLALERLYRRGYEYKKAGVILSGIGSVSEPQQTTLSLFDTDPLPEKQGRSASLMEALDALNSRYGPGTLRVASDSSSGWKQRQEKLSPAYTTRWSDIIEVRL 19 | >WP_076790314.1 DUF1848 domain-containing protein [Chlorobium sp. 
KB01] 20 | MIISASRRTDIPAFYGEWLVNRLKAGEVLVRNPMQPKQVSRIMLAPATVDALVFWTKNPEPFLSRLPEIDALGYSYYFLFTLTPYDATLEPGVPEKGSLVEVFRRLSRLIGPEKVVWRYDPVVITDQFTPAWHAASFQKLAERFSGYTERCIISFLDDYRKVRSRMRNLRFTLPDNAEMEELASIVADIAAKQNIELFTCSQEVDLSRYGIMHSRCIDGGLVERICGRRMTVTKKDSSQRHACGCVESRDIGSYDTCPHGCLYCYAVSSQTKSCKAFEAFDPASPMLCDHLNGDETITTPALRKRAASARTQSDSGLHQGELFG 21 | >WP_076790316.1 type II toxin-antitoxin system Phd/YefM family antitoxin [Chlorobium sp. KB01] 22 | MHAIDTRNVTPLTDFRNNIKRYMEELSVSKKPLLLTQHGKSAAVLLDAEQYQKMLDQITFMQLVTEGLEEYRNNRTIPAEELFPSLDKIIAEAEKQ 23 | >WP_076790319.1 bacterioferritin [Chlorobium sp. KB01] 24 | MKGNPKIIEKLNSLLADELTAINQYIVHSEMCSNWGYEKLHNADEKRAIDEMKHAEKLIARILFLEGIPVVTDLKKIKIGADVAAQHKNDRHSEDGAIASYNDGIRLAVEVGDNGTRELLESILRDEEAHIDWLEAQIDQISQMGIQNYLVEQLG 25 | >WP_076790324.1 ArsR family transcriptional regulator [Chlorobium sp. KB01] 26 | MQDHRAEGCQEECFHPDVVERVRTEMPGEPLQEELAQLFKVLGDHTRIRILNALCLSELCVCDLTSILGMNQSAVSHQLRVLRDAKLVKSRKQGKNVLYTLDDTHVSTLIRTGSEHIREGK 27 | >WP_076790327.1 permease [Chlorobium sp. KB01] 28 | MNELLIVVSKVLLASWSVLLESAPFVLLGFFIAGLLKAFVPDNFIATHLGGRGVVSIFKASLFGIPIPLCSCGVLPAAAGLKKQGSGNGPVAAFLISTPETGVDSIAVSWALLDPLMTVFRPVVALFMAVFTGIAVSFSEKQSESNVAADADKPGESACTSSCCCGNAKKEAEKKPTVVEKFRHGMSFAFGGLLKDVGLWLFIGILLSGLISVFVSTEMVSRYLSNEYLSMLLMFIVSIPLYVCATASTPIVAALALKGLSPGAALVFLLAGPATNAASLPVIFRMLGRKSALIYLVSIVLISLIAGVVVNDLYSYLGYDIMHWVSKGAHEGGGILSIAASVLLLLLVVKSLLPKKQSGHSR 29 | >WP_076790329.1 copper-translocating P-type ATPase [Chlorobium sp. 
KB01] 30 | MTTTGTYSVKGMHCASCAAIITKKLSKVEGITRADVNLATEKVTIAFDHETLPVEALNDAVSRYGYAIAAEPGAAGAGESQKIAASVAIRLKEEKEQALLDQKAKVQFALPIALLIFILMMWDIGSRFIESVPNLPIPMELFNIISMVLATWFVFRIGAPFLHGIVMFLQSGAASMDTLIGIGTLTAYVYSALITLIPSLRELLRVPDYTYFDVTIVVIGFVLLGKYLEARSRMKTGEAIEKLINLQAKTAIVLREGKEIEIPSEAVRKGDTVIIKPGAKLPVDGIIIEGFSAIDESMVTGESIPADKKSGDEVIGGTINKQGSFTFRASNVGSDTVLARIIRMVEEAQGSKAPIQEIADKIAGIFVPVVLIIAAATFLIWITVGSSYLGFSVALSYAISGMVGILVIACPCALGLATPTAIIVGIGKGAEYGILIRNAESLEKLSSIDTVVFDKTGTITTGSPKVTDVTPLSEGTTSESLLEIAASVENRSEHPLAEAILDEAKKRSITLRDISEFKALEGMGVSATIAGLPVTIRKPLETERELAEVKRLQAEGKTVVMIEENRIALGLIALSDTVKTEAKAAVAALHRRGLKVIMLTGDNRAAANYMAGQVGIDEVIAEVLPTEKAAKIKALQTEGRKVAMAGDGINDAPALAQADVGIAMATGTDIAIESAGITLLKGDINKVAQAITLSRSTMRVIRQNLFWAFIYNIIGIPLAAGALYPFWGIFLNPVFSGLAMAGSSVSVVSNSLRLKAKKLK 31 | >WP_076790332.1 DUF305 domain-containing protein [Chlorobium sp. KB01] 32 | MSSNDFNLFLEKTMKNISLFLAVALSIVMAVIGIGAGYWLTPQYSLSMYDKNSMDLGQPDKWVDLRYLDGMIAHHRGAILLAEKASISQRTEIRNLCKEILKNEPVAIAELYKWKKEWYGDQRKVRDPQVANLGSSDSKLDLRFLNALIAHHENGVRMTREIRLKSSRSEVLDNADAVELFLKGGLGMLREWRKEWYNVIDTNPYL 33 | >WP_076790653.1 hypothetical protein [Chlorobium sp. KB01] 34 | MSSTGLPIPDFLDARLLAYEEASIKTRRNGKPLVLGKKPHPGSIILQSNDYLSISTHPQILQAQISRLEQAHHEMVMSAVFLHEDSDKSLFEKAMAEFAGFEHAILCQSGWAANVGIMQTIADQNTPVYIDFFTHMSLWEGVKTSGAPFYTFMHNDAAHLENLVKQHGQGIIVVDSLYSTTGDIAPLAEIIDIANRHNCLSLVDESHSLGTHGPRGAGLVASLGLTDRVHFITGSLAKAFAGRAGIILCSKRFAEYYPYAAFPAIFSSALLPHEIAGLSATLEVLIKSDDRRKILHDNALFFRNGLESLGYFLASQSQIISLEPESEPQMEILRDALEDRNVFGSVFCAPATPKNRTLMRFSLNSDMEINELQQVLDVCASIRDEVGMREWKSTKRKKT 35 | >WP_076790655.1 IS200/IS605 family transposase [Chlorobium sp. KB01] 36 | MSEYIHKSHNVTVLMYHIVLPAKYRRVIFDDKVDEVLKDVCLDIENRYQIKFLEIGTDKDHVHFLVQSVPTYSLTKIVTMIKSISAREVFRRCPKVKKLLWGGEMWSDGYYAGSVGKHGNEDMIGKYVKGQGCTYQKLYSNYQLSLF 37 | >WP_076790657.1 phosphoadenylyl-sulfate reductase [Chlorobium sp. 
KB01] 38 | MSRESIEQLNIELAAKSPEEILRAAVGCFGAENMAFASSFGAEDQVLTDMLHREKLPVPVFTLDTGRLHQETYDLFDATRKHYGIEIEALFPETVAVEAMLAQHGPNLFYESIEKRRECCRIRKVQPLTKKLSTLKAWICGLRREQSVTRTAVAPIEWDAAFGLYKINPLAEVSEAWVWEYIKKHNVPFNRLHDEGYPSIGCAPCTRAVKQGEDLRAGRWWWEIAEHRECGLHREKQ 39 | >WP_076790659.1 sulfate adenylyltransferase subunit 2 [Chlorobium sp. KB01] 40 | MDHLDKLEAQSVYILREAYREFKSLCMLWSIGKDSTVMLWLARKAFFGHVPIPLVHVDTHYKIPEMIEYRDRLALEWNLNMIYGENREALDKKLTFPDGNTDRLTCCKYLKSEALKHTLSGEWPRYRMDHELKHYVQDEGKEPYTGVIAGVRADEEGSRSKERYFSPRDKDNVWDVGDQPPEFWNQFKTDFAPGTHVRIHPLLDWTELNIWEYIEREKIPVVSLYFNQGEGIRYRSLGCYPCTNPVESEARTLPEIIDELKSGKFSNVAERSGRAQDSEDGGGLETLRRDGYM 41 | >WP_076790661.1 GTP-binding protein [Chlorobium sp. KB01] 42 | MIGRSQMNIVIVGHVDHGKSTVIGRLLADTGTLPQGKLESVRESCRKNSKPFEYAFLLDALKDEQAQGITIDMARCFFKTEKRDYIIIDAPGHIEFLKNMITGASRAEAALLVIDAHEGIQENSKRHGHMVSMLGVKHVTVLVNKMDLAGYSEEVFEQLKTDYSAFLARIKVTPVSFIPVSAREGDNIASLSERMAWYKGATVLEQLDRFVSNKELQELPFRFPVQDIYKFTRSDDDRRIVAGSVAAGSVQVGDEVLFLPSGKQSVIQSVESFNTLPKTKASAGEATGFTLSTQIYIRPGELMVRPSEPQPEVGSRFRVNIFWVGRAPMIRQKEYKLKLGSARASVKLAEISNTLDASDLSYSRSKQQLDCRDVGECILETARPIAFDPTSASETTGRFVIVDNFEICGGGIVLENLSAGETLLQQHIRDRENNWEPGLVRFEERAAANRHKAKFIVFTGAPGTGKRAIAKALERGLFENSLNAYYLGVANIDRGLDADLGTHADSAGERLRRIGELARILTDAGLIFITTIDDADDYDIETLKQLNEPNDILVINTGENGFSRYHPDFELPAGADVSLAVGQVADILKTREIIIDYQI 43 | >WP_076790662.1 bifunctional precorrin-2 dehydrogenase/sirohydrochlorin ferrochelatase [Chlorobium sp. KB01] 44 | MNYVPITLKVENKKVLIAGGGKAALEKVQMLLRLAVAVTVIGEEIDEEILLSDATCHLRPFRSEDLEDVLLVYACSSDREVNRLVKAEANARNILVNTPDDPELCDFITPALYIEGSMTVAVSSGGSDVRKAVAWRNRIKTFFSNHDPVS 45 | >WP_076790664.1 uroporphyrinogen-III C-methyltransferase [Chlorobium sp. 
KB01] 46 | MTPYPETLRPLLEHLDHSPAEPACFALDPFRLLPVDDDPLQALLDGCGDLLFMAAPELPDPLPEEFAVAALLSLRTVPDRDDAHPLHDMNVIVVRYERADLRVLFAPLDIRRGYGKVYLTGAGAGSRDYLTLKADTLLQRAGVIFYDDLIDTGLLDAYSAEKVYVGKRKGRHHADQDAINRQLYLAAAKQQIVVRLKGGDPLVFGRGGEELAYLCERQIDVEVVPGVSSMLSAAASAAIPLTQRGVSGGVTLQSAHNVLAGGTPRTLVYFMCASRLKELQSIVLHEGIEREMPVALIRKAGFYDESVTMTTVGVMHTVELDSPLLAIIGRTAALGRQRSKILHTGDDPYRCMLSGKIVPLGDITTGGNLLGAVDLSLFSGIVFTGAGKVEALLAAFGPLPSHLVLYAEGAESAEELRNWGYASRVMEV 47 | >WP_076790666.1 sulfate ABC transporter ATP-binding protein [Chlorobium sp. KB01] 48 | MGIELQNITKNFTGYTALDNISLNVASGDLIALLGPSGCGKTTLLRIIAGLESADSGKIILEGKDTTNLPPREKNVGFVFQHYALFRRLNVYENVAFGLKVLPRSKRPSKSEIKDRVEHLLKLVQMEWALKRYPSQLSGGQRQRVALARALAVNPRVLLLDEPFSALDAKVRQELRRWLRKLHDEIHVTSIFVTHDQEEALELADRIVVINKGKIEQQGTPQEVYDHPANAFVYNFLGNVNVFHGRVHEGVVTLGGHTVDAPDELKSTEGKTSQTFVRPHEIGISKIRSEGNDLEGTVREIRLLGGQIGLNIDCEGFEQPIDAEIPRELYLNLQLKKGDRVFVTFNKVKVFAGDYEI 49 | >WP_076790668.1 sulfate ABC transporter permease subunit CysW [Chlorobium sp. KB01] 50 | MSTPLLSQGTPPVVRKKISDPLPVRMLLIALTLLFFIGFVFLPLGLVFAQAFQKGWEFYLEAVREPYTIEAVKLTLTTVTIVVPLNAFFGVSAAWAITKFSFPGKAALKTLLDLPFAVSPVIAGLIFVLLLGSRTPFGTWLGEQGIKIIFSTPGIVIATLFVTFPFVARELIPLMEAQGRDEEEAALTLGARGWQIFVKITLPNIRWGLLYGMILCGARAIGEFGAVSVVSGHIRGQTNTIPLHVEILYSEYNFAASFAVASLLVFLALTTVVVKSIIEKKFQKNSLQH 51 | >WP_076790669.1 sulfate ABC transporter permease subunit CysT [Chlorobium sp. KB01] 52 | MAIFQKKRRNILPGFGLSMGYTVFYLSAVVIVPLSMIFFNAIPMGWEPFLSAVTAPRVLASYKLSFSTAFFAAIFDAVAGLLTAWVLVRYRFPGKAFLDALVDIPFALPTAVAGICFATLYSPMGWLGEITAKWNIEVINSPTGIVVALIFIGFPFVVRTLQPVLEELEPEIEESAHCLGATRLQTFRKILLPHLFPALLTGSTLAFARGIGEYGSVIFIAGNLPMKTEIAPLMIMSKLDQYDYNGASAVALVLLVISFSMILLLNAIQEWQQKEYR 53 | >WP_076790671.1 sulfate ABC transporter substrate-binding protein [Chlorobium sp. 
KB01] 54 | MHLNWTKKITLAASLILASGLPGVVAEAVPAPAKLLNVSYDPTRELYQSENAAFIKFWKAKTGQTVTIDQSHGGSGKQGRAVIDGLEADVVTLALAYDIDAIADSKLIDPNWQKRLANNSTPYTSTIVFVVRKGNPKRITGWDDLVKPGVSVITPNPKTSGGARWNYLAAWGFKQKQTGSAEKAKAFVKALYKNVPVLDTGARGSTLTFAQRGLGDVFLSWENEAHLILKEFGSDKFEIVTPATSILAEPPVAVVDKNVAKHGTRKLAEAYLNFLYTPQSQEIIARNYYRPSSPAALKKYGANFPKIKLFTLKEFFGDWRTAQKTHFKDGGVFDQIYLP 55 | >WP_076790672.1 alkylhydroperoxidase [Chlorobium sp. KB01] 56 | MTIRYITSSAPADTLSEKVLGQIRREFGAEVEPFTLHLPVPELLAGVWIAVRETLLAGSGSRDAKEVVAAAVSSLNRCPYCLDAHSIMVLEASGHDYSDALADGAPEQIEDTFLRDVAEWAAATRTPGSPLLASPLFSSQDAPSFIGTAVVFHYINRMVTILLGSSPLPFTSGIPKKVAMQMAAWFFGGAIRIPKKPGASLELLPEAPLPDDLAWAEPSPVISAAFARFFQVIEKHGAMALSTEVRNAVHAAVQNWNGSDPDMHNLWCEEAIAHLTEKDKAAGRLALLTALAPWRVDKTIVRAFSSNFPGDDRLIAALAWSSFEAAKRTGSWL 57 | >WP_076790674.1 30S ribosomal protein S12 methylthiotransferase RimO [Chlorobium sp. KB01] 58 | MTMTTHNVFLLSLGCSKNTVDSERLMAQAEASGITFTETADEADTILINTCAFIEDAKEESIAETFAAIDKKTEGKVQRVYMMGCLTELYRKELGEEMEEVDGFFGTRELPEVLAALGAVYREELYDRRSLLTPPHYAYLKIAEGCNRSCSFCSIPKIRGRYISQPPEQLLREAALLKSTGVRELNVIAQDISLYGYDLEGKTLLNDLVLRLSDMEFDWIRLFYAYPVGFPLEVIDTMHQRANICNYLDIPLQHGNDRILRSMNRGINKAETIKLIETIREKNPDIRLRTTMIAGYPGETREEFEELLQFVEESRFDRLGCFPYCHEEHAPSFKLEESLTMEEKRERAAELMELQESISAENNRVFEGKTIRVLIDQIEGDTALGRTEYDAPEVDNECILSIEHQSVKAGSFCMAEITDSGAYELHGRVIEIL 59 | >WP_076790675.1 tRNA guanosine(34) transglycosylase Tgt [Chlorobium sp. KB01] 60 | MKFTLLQTDPHSAARCGVLSTAHGAIPTPVFMPVGTRASVKSVEPHELKENNVHIILANTYHLYLKPGNDILFKAGGVHRFMNWNGPLLTDSGGYQVYSLSELRKISEEGVIFKSHLDGSMQQFTPENVVETQRIIGSDIIMPLDECPPSMAEKEYIRKSGELTIRWAARAREAFAATTSLYGHEQYLFGITQGGIHDDLRAISIKALVDMNFDGYSIGGMAVGEPAPEMYRILELSHTLLPENKPRYLMGVGTPENILNAIERGVDMFDCVIPTREGRNGRVYTRHGKMNLRSAKYATDFTPIDEGFDNDVCRNYSRAYIRHLLNVGEILGLKLCTLQNISFFMWLTATARTQIQQGTFLEWKKDFLKRFNDHDNA 61 | >WP_076790677.1 penicillin-binding protein activator LpoB [Chlorobium sp. 
KB01] 62 | MRIKLILISLFLFSIVLFSQGYAGEKPRIGVLRFTNHTSAGWWSGSVGTDLQDMLIAELASTNSFRVLERKELDAVIQEQDLGASGRVNPKTRSKLGKITGAKYLVAATVSAFEQNTSGSGGGLSFGGISLGGKQDKAYMAVDLKVIEVETGEIFDARTVEATSKSSGLSLGVSSGGFNGNLGQYKNTPVGKAIRACIMEITEYLECSMVEGKDSDCMDKFNEKERSRKEKTKSAIQLD 63 | >WP_076790679.1 ribosome small subunit-dependent GTPase A [Chlorobium sp. KB01] 64 | MRKLSEPGFDEWFQAHIDDLPPGGQEIARVTAVDRNSFMVRSEHTEVPAELSGRLSFHIDSSIELPCVGDWVRVQYHNDGASAIIHGIFPRRTCLRRKRAGTEVKYQLIAANIDLAFIVQSCHFDFNLARLNRYLVISADGGVEPVVILAKTDLISPEELAEKLAAIKAAGITTRVIALSNVTGAGIEEFRELLHPGQTCCLIGSSGVGKTTLINNLTGKADLDTKAVSATGEGTHTTTRRQLIVLDNGVMFIDTPGMRELGLLGAGEGVNKGFEDIIALAESCRFADCSHTGESGCAVLAAIADGELGEEHYGNYMKLSKESEYHEMSYLERRNKDKAFGRFIKSAKKSMKR 65 | >WP_076792905.1 single-stranded DNA-binding protein [Chlorobium sp. KB01] 66 | MAELKMPEINSVIIAGNLTKDPVFRQTNSGGTPVVNFSIACNRRFRDSNHLWQEDVCYVGVVAWNKLAESCRDNLKKSSAVLVDGELQSRTWKAQDGTSRTVVEIKARRIQFLNKRKKNGEDDEEGFIEDECHDIHHGEIPDDEASHIYEYKYLSSD 67 | >WP_076792907.1 50S ribosomal protein L9 [Chlorobium sp. KB01] 68 | MKVILRKDVAALGDTGEVVAVKNGYANNYLIPQGIAIRATEGTLKALETEKKQQAKKVELLRKHAREVARNIEQLALKVYAKAGESGKLFGTVTSADIAEALKVQGFEIDRRKITLDAPVKTLGKFEADARLFSDISVKVHFEVEAEGAGE 69 | >WP_076792910.1 hypothetical protein [Chlorobium sp. KB01] 70 | MTLKKTMLVGASCLSLMALSSPQEASATPVFARQTNQSCAACHFQRFPLLNAYGRSFKANGYTTVGKQGTIEDNNLSTPAILNAGLVSKFRYQKTNGTADTELNAGQFQVPEEAFLSIGGKVAKNIGFQAEISLVSPASIVGFKVPFVFPASKDITFLTVPFYTDAQGAAFGYELLNTGALRFSRAFEHRNEASAQQYIGAAHQTTGVALVAQHKYGYLSYTPYLVKDVMTDGIAFSTGRPLSYIRGVVTPEKVGDWDLAAGAQIWTGSSAEGTAAAPTRNHAAAWAIDAQAQGKAGDMPLGVYVAYGSAAASKAGQTANYFNASTVGSKNALTIATELGVVPNRLGIGVAYRNANSGAGAADNDNALTLGANYQVARNMVFQLNHSFYSGDVKTDAAGKQLTTAMFYSAF 71 | >WP_076792912.1 DUF3570 domain-containing protein [Chlorobium sp. 
KB01] 72 | MKSVPTKKRTIIGAALFAAAMAMPSSHPAFAEAAPEKGSVAFKYLHYQDSQPSQDRIGVDAYTVTAMAPIAGKWSISTTYVNDSVSGASPDYHTHILSGASSHDTREEVDLGLTRYFSKGSLTLGTVYSTENDYISRGYSGQGSLQTEDKNTTFTLGGSYTTDTINPTAQGLQYDKRTIAGLFGVTKVLSKVDILQLNFGISRGRGYFSDPYKSYDSRPDRRESKTIMTRWNHHFDGTDGTTRLSYRYYYDTFGIKASTLGLEYVQPLPNDFTVIPSVRVHSQTAADFYRTVYPPVPRTITPNSLDQRLSAFGALTLGIKVEKRIAKDWLVDVKYENYEQRAGWCVTGGGDNNLAPFTATFLQLGVSRLF 73 | >WP_076792915.1 phenylalanine--tRNA ligase subunit alpha [Chlorobium sp. KB01] 74 | MENTIRSLQQEITDFEITTHADLEAFRLKYTVRKGLIAALFGQLKTVAPADKPRIGQLLNQLKLTADSRIDEEEAKLSANAGDCGKRIDLTLPGRRYFTGSEHPVQKVLGEMKQIFSAMGFGIATGPELELDRNNFDLLNFPPDHPARDMQDTFFVTTGDADTDLLLRTHTSPVQVRVMLDREPPIRVICPGKVYRNEAISSRSYCVFHQLEGLYIDKKVSFADLKATIYSFARQMFGTDVKLRFRPSFFPFTEPSAEVDVTCYLCGGKGCRVCKKSGWLEIMGCGMVHPNVLRNCGINPEEWSGYAFGMGVDRTVLLRYKIDDIRLLFENDLRMLRQFPA 75 | >WP_076793039.1 cytochrome c5 family protein [Chlorobium sp. KB01] 76 | MRRFVQVCSLMVVFICSSQIVKAEDIARATLRAKYDMVQGKDVYERVCAVCHSSGVMDAPKFCDITAWKPRMAHGMEAMVKHAVEGFNNMPAKGGMDSLSLTESANAVAYMIDQCLFD 77 | >WP_076793041.1 DUF4266 domain-containing protein [Chlorobium sp. KB01] 78 | MIQRPVHALLLLLMLGLSGCSIGQAVQPWEKETLARPEMTFEGDALDSKYTEHIYGSKEAASGGAGVGGGGCGCN 79 | >WP_083694690.1 TlpA family protein disulfide reductase [Chlorobium sp. KB01] 80 | MVNTVRKKLGTLLVGGLLCVGFSGTSYALEAGSKAPDFSLPGSQGSVTLSSTAGSVVYVDFWASWCGPCRQSFPWMNSIQEKYRAQGLKVIGVNVDGKNEDAKKFLSQNPAKFTVAFDSKGLTPKTFGVKGMPTSFLIGRDGKIISQHLGFKEADRDGLEKQIKAALEANK 81 | -------------------------------------------------------------------------------- /scripts/generate_run_templates.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -euo pipefail 3 | # generate_run_templates.sh 4 | # Copyright Lee H. Bergstrand and Jackson M. 
Tsuji, 2025 5 | # Script for generating and manipulating BackBLAST config templates 6 | # Part of the BackBLAST pipeline 7 | 8 | # GLOBAL variables 9 | # Assign ones that overlap with the main BackBLAST script only if this script is called independently 10 | if [[ ${BASH_SOURCE[0]} = ${0} ]]; then 11 | readonly SCRIPT_NAME="${0##*/}" 12 | readonly SCRIPT_DIR="$(realpath ${0%/*})" 13 | readonly TEMPLATE_CONFIG="${SCRIPT_DIR}/../snakemake/template_config.yaml" 14 | fi 15 | 16 | ####################################### 17 | # Add the user-provided subject files to the end of the template config file 18 | # Globals: (none) 19 | # Arguments: 20 | # output_config_filepath: the path to the config.yaml file to which the subject names will be added 21 | # subject_genome_directory: the path to the directory containing the protein files for subject genomes (FastA format) 22 | # genome_extension: the extension of the predicted protein files for subject genomes (e.g., faa) 23 | # Returns: 24 | # writes to output_config_filepath 25 | ####################################### 26 | function add_subjects_to_config_file() { 27 | # User-provided inputs 28 | local output_config_filepath 29 | output_config_filepath=$1 30 | local subject_genome_directory 31 | subject_genome_directory=$2 32 | local genome_extension 33 | genome_extension=$3 34 | 35 | # Find the subjects 36 | local subject_genome_files 37 | subject_genome_files=($(find ${subject_genome_directory} -maxdepth 1 -type f -name "*.${genome_extension}" | sort -h | xargs realpath)) 38 | 39 | echo "[ $(date -u) ]: Found ${#subject_genome_files[@]} subject genomes with extension '${genome_extension}'" 40 | 41 | # Append to the bottom of the file (a bit hacky) 42 | # Note that the arbitrary sample name is defined here for each sample as the basename of the file 43 | for subject_genome_file in ${subject_genome_files[@]}; do 44 | 45 | subject_genome_basename=${subject_genome_file%.${genome_extension}} 46 | 
subject_genome_basename=${subject_genome_basename##*/} 47 | echo " ${subject_genome_basename}: '${subject_genome_file}'" >> ${output_config_filepath} 48 | 49 | done 50 | } 51 | 52 | ####################################### 53 | # Generate the genome and gene metadata file templates 54 | # Globals: (none) 55 | # Arguments: 56 | # genome_metadata_tsv: the path to which the genome metadata TSV file is to be written 57 | # gene_metadata_tsv: the path to which the gene metadata TSV file is to be written 58 | # subject_genome_directory: the path to the directory containing the protein files for subject genomes (FastA format) 59 | # genome_extension: the extension of the predicted protein files for subject genomes (e.g., faa) 60 | # query_filepath: the path to the query protein sequences (FastA format) 61 | # Returns: 62 | # writes to genome_metadata_tsv and gene_metadata_tsv 63 | ####################################### 64 | function generate_metadata_templates() { 65 | # Assign variables from input 66 | local genome_metadata_tsv 67 | genome_metadata_tsv=$1 68 | local gene_metadata_tsv 69 | gene_metadata_tsv=$2 70 | local subject_genome_directory 71 | subject_genome_directory=$3 72 | local genome_extension 73 | genome_extension=$4 74 | local query_filepath 75 | query_filepath=$5 76 | 77 | # Generate genome metadata template 78 | echo "[ $(date -u) ]: Writing genome metadata template to '${genome_metadata_tsv}'" >&2 79 | printf "subject_name\tplotting_name\n" > ${genome_metadata_tsv} 80 | 81 | local subject_genome_files 82 | subject_genome_files=($(find ${subject_genome_directory} -maxdepth 1 -type f -name "*.${genome_extension}" | sort -h)) 83 | 84 | for subject_genome_file in ${subject_genome_files[@]}; do 85 | 86 | subject_genome_basename=${subject_genome_file%.${genome_extension}} 87 | subject_genome_basename=${subject_genome_basename##*/} 88 | printf "${subject_genome_basename}\t\n" >> ${genome_metadata_tsv} 89 | 90 | done 91 | 92 | # Generate gene metadata template 93 
| echo "[ $(date -u) ]: Writing gene metadata template to '${gene_metadata_tsv}'" >&2 94 | printf "qseqid\tgene_name\n" > ${gene_metadata_tsv} 95 | 96 | local query_accessions 97 | query_accessions=($(grep "^>" ${query_filepath} | cut -d ">" -f 2 | cut -d " " -f 1)) 98 | 99 | for query_accession in ${query_accessions[@]}; do 100 | 101 | printf "${query_accession}\t\n" >> ${gene_metadata_tsv} 102 | 103 | done 104 | } 105 | 106 | ####################################### 107 | # Generate template files for the BackBLAST run 108 | # Globals: (none) 109 | # Arguments: 110 | # template_config: the path to the template Snakemake configuration file (YAML format) 111 | # query_filepath: the path to the query protein sequences (FastA format) 112 | # query_genome_filepath: the path to the predicted protein sequences of the entire genome corresponding to the query protein sequences (FastA format) 113 | # subject_genome_directory: the path to the directory containing the protein files for subject genomes (FastA format) 114 | # genome_extension: the extension of the predicted protein files for subject genomes (e.g., faa) 115 | # output_directory: path to the directory where output files should be written 116 | # threads: maximum number of threads that any given task within the snakemake pipeline ought to use 117 | # phylogenetic_tree_newick: path to the phylogenetic tree file corresponding to the subject genomes (or type 'subjects' to have the tree auto-generated) 118 | # bootstrap_cutoff: numeric value; only bootstrap numbers above this value (e.g., 80) will be shown on the plot (or 'NA' to skip) 119 | # root_name: string; the exact label of the phylogenetic tree tip corresponding to the root (or 'NA' to skip) 120 | # evalue: numeric/scientific; e-value cutoff for reciprocal BLASTP 121 | # pident: numeric; percent identity cutoff for reciprocal BLASTP 122 | # qcov: numeric; percent query coverage cutoff for reciprocal BLASTP 123 | # Returns: 124 | # writes three template files 
to the output_directory: 'config.yaml', 'genome_metadata.tsv', and 'gene_metadata.tsv' 125 | ####################################### 126 | function make_run_templates() { 127 | # Assign input variables 128 | # TODO - is there a more elegant way of doing this? This is a ton of input variables to have to bring in one at a time 129 | local template_config 130 | template_config=$1 131 | local query_filepath 132 | query_filepath=$2 133 | local query_genome_filepath 134 | query_genome_filepath=$3 135 | local subject_genome_directory 136 | subject_genome_directory=$4 137 | local genome_extension 138 | genome_extension=$5 139 | local output_directory 140 | output_directory=$6 141 | local threads 142 | threads=$7 143 | local phylogenetic_tree_newick 144 | phylogenetic_tree_newick=$8 145 | local bootstrap_cutoff 146 | bootstrap_cutoff=$9 147 | local root_name 148 | root_name=${10} 149 | local evalue 150 | evalue=${11} 151 | local pident 152 | pident=${12} 153 | local qcov 154 | qcov=${13} 155 | 156 | # Check if output directory exists 157 | if [[ ! -d ${output_directory} ]]; then 158 | echo "[ $(date -u) ]: ERROR: output directory '${output_directory}' does not exist. Exiting..." 159 | exit 1 160 | fi 161 | 162 | # Check if desired output files already exist 163 | local output_config_filepath 164 | output_config_filepath=${output_directory}/config.yaml 165 | local genome_metadata_tsv 166 | genome_metadata_tsv=${output_directory}/genome_metadata.tsv 167 | local gene_metadata_tsv 168 | gene_metadata_tsv=${output_directory}/gene_metadata.tsv 169 | 170 | if [[ -f ${output_config_filepath} ]]; then 171 | echo "[ $(date -u) ]: Found existing 'config.yaml' at '${output_directory}'. Will not continue. Exiting..." >&2 172 | exit 1 173 | elif [[ -f ${genome_metadata_tsv} ]]; then 174 | echo "[ $(date -u) ]: Found existing 'genome_metadata.tsv' at '${output_directory}'. Will not continue. Exiting..." 
>&2 175 | exit 1 176 | elif [[ -f ${gene_metadata_tsv} ]]; then 177 | echo "[ $(date -u) ]: Found existing 'gene_metadata.tsv' at '${output_directory}'. Will not continue. Exiting..." >&2 178 | exit 1 179 | fi 180 | 181 | # Generate the metadata templates 182 | generate_metadata_templates ${genome_metadata_tsv} ${gene_metadata_tsv} ${subject_genome_directory} \ 183 | ${genome_extension} ${query_filepath} 184 | 185 | ### Generate config file and add variables 186 | echo "[ $(date -u) ]: Writing config info to '${output_config_filepath}'" >&2 187 | cp ${template_config} ${output_config_filepath} 188 | 189 | # Add subject info 190 | add_subjects_to_config_file ${output_config_filepath} ${subject_genome_directory} ${genome_extension} 191 | 192 | # Special check for the phylogenetic tree - if the entry is not a file (e.g., 'subjects' or 'NA'), then do not run realpath 193 | if [[ -f ${phylogenetic_tree_newick} ]]; then 194 | phylogenetic_tree_newick=$(realpath ${phylogenetic_tree_newick}) 195 | fi 196 | 197 | # Add other variables to config file 198 | # NOTE: I'm using the '|' symbol as the sed separator because some of the variables contain forward slashes (which is the normal separator) 199 | # TODO - is there a more elegant way of doing this? 
200 | sed -i "s|^query_genes: .*|query_genes: '$(realpath ${query_filepath})'|" ${output_config_filepath} 201 | sed -i "s|^query_genome_orfs: .*|query_genome_orfs: '$(realpath ${query_genome_filepath})'|" ${output_config_filepath} 202 | sed -i "s|^threads: .*|threads: ${threads}|" ${output_config_filepath} 203 | sed -i "s|^phylogenetic_tree_newick: .*|phylogenetic_tree_newick: '${phylogenetic_tree_newick}'|" ${output_config_filepath} 204 | sed -i "s|^genome_metadata_tsv: .*|genome_metadata_tsv: '${genome_metadata_tsv}'|" ${output_config_filepath} 205 | sed -i "s|^gene_metadata_tsv: .*|gene_metadata_tsv: '${gene_metadata_tsv}'|" ${output_config_filepath} 206 | sed -i "s|^bootstrap_cutoff: .*|bootstrap_cutoff: ${bootstrap_cutoff}|" ${output_config_filepath} 207 | sed -i "s|^root_name: .*|root_name: '${root_name}'|" ${output_config_filepath} 208 | sed -i "s|^e_value_cutoff: .*|e_value_cutoff: ${evalue}|" ${output_config_filepath} 209 | sed -i "s|^minimum_percent_identity: .*|minimum_percent_identity: ${pident}|" ${output_config_filepath} 210 | sed -i "s|^minimum_query_coverage: .*|minimum_query_coverage: ${qcov}|" ${output_config_filepath} 211 | } 212 | 213 | function main() { 214 | # If no input is provided, provide help and exit 215 | if [[ $# -eq 0 ]]; then 216 | echo "No arguments provided. Please run '-h' or '--help' to see help. Exiting..." >&2 217 | exit 1 218 | elif [[ $1 = "-h" ]] || [[ $1 = "--help" ]]; then 219 | 220 | # Help statement 221 | printf "${SCRIPT_NAME}: generate template config files for a BackBLAST 'setup'. Part of the BackBLAST suite.\n" 222 | printf "Copyright Lee H. Bergstrand and Jackson M. 
Tsuji, 2025\n\n" 223 | printf "Usage: ${SCRIPT_NAME} [OPTIONS] query_filepath query_genome_filepath subject_genome_directory output_directory\n\n" 224 | printf "Positional arguments (required):\n" 225 | printf " query_filepath: path to the query predicted protein sequences from the query genome, FastA format\n" 226 | printf " query_genome_filepath: path to the predicted proteins of the entire query genome, FastA format\n" 227 | printf " subject_genome_directory: directory containing predicted proteins of all subject genomes (FastA format).\n" 228 | printf " One genome per file, with extension 'faa' (or specify -x)\n" 229 | printf " output_directory: directory where config ('config.yaml') and metadata templates ('genome_metadata.tsv', 'gene_metadata.tsv') will be created\n\n" 230 | printf "Optional arguments:\n" 231 | printf " -t phylogenetic_tree_newick: path to the pre-calculated phylogenetic tree [default: 'subjects' - auto-calculate tree]\n" 232 | printf " Specify 'NA' to skip tree generation and plot the heatmap alone.\n" 233 | printf " -b bootstrap_cutoff: numerical value (e.g., 80) under which bootstrap values will not be displayed [default: NA]\n" 234 | printf " -r root_name: Exact name of the tree tip label at the desired root of the tree [default: NA; will skip rooting]\n" 235 | printf " -e evalue: e-value cutoff for reciprocal BLASTP [default: 1e-40]\n" 236 | printf " -p pident: percent identity cutoff for reciprocal BLASTP [default: 25]\n" 237 | printf " -c qcov: percent query coverage cutoff for reciprocal BLASTP [default: 50]\n" 238 | printf " -x genome_extension: extension for predicted protein files of subject genomes [default: faa]\n" 239 | printf " -@ threads: maximum threads to use for any process [default: 1]\n\n" 240 | printf "Advanced parameters (use with care):\n" 241 | printf " -T template_config: Path to the template config file used in setup [default: ${TEMPLATE_CONFIG}]\n\n" 242 | printf "Note: Currently does NOT support whitespaces in any 
input variables.\n\n" 243 | 244 | # Exit 245 | exit 0 246 | fi 247 | 248 | # Set defaults for options 249 | local threads 250 | threads=1 251 | local phylogenetic_tree_newick 252 | phylogenetic_tree_newick="subjects" 253 | local bootstrap_cutoff 254 | bootstrap_cutoff="NA" 255 | local root_name 256 | root_name="NA" 257 | local evalue 258 | evalue=1e-40 259 | local pident 260 | pident=25 261 | local qcov 262 | qcov=50 263 | local genome_extension 264 | genome_extension="faa" 265 | local template_config 266 | template_config=${TEMPLATE_CONFIG} 267 | 268 | # Set options (help from https://wiki.bash-hackers.org/howto/getopts_tutorial; accessed March 8th, 2019) 269 | OPTIND=1 # reset the OPTIND counter just in case 270 | while getopts ":@:t:b:r:e:p:c:x:T:" opt; do 271 | case ${opt} in 272 | \@) 273 | threads=${OPTARG} 274 | ;; 275 | t) 276 | phylogenetic_tree_newick=${OPTARG} 277 | ;; 278 | b) 279 | bootstrap_cutoff=${OPTARG} 280 | ;; 281 | r) 282 | root_name=${OPTARG} 283 | ;; 284 | e) 285 | evalue=${OPTARG} 286 | ;; 287 | p) 288 | pident=${OPTARG} 289 | ;; 290 | c) 291 | qcov=${OPTARG} 292 | ;; 293 | x) 294 | genome_extension=${OPTARG} 295 | ;; 296 | T) 297 | template_config=${OPTARG} 298 | ;; 299 | \?) 300 | echo "[ $(date -u) ]: ERROR: Invalid option: '-${OPTARG}'. Exiting..." >&2 301 | exit 1 302 | ;; 303 | :) 304 | echo "[ $(date -u) ]: ERROR: argument needed following '-${OPTARG}'. Exiting..." 
>&2 305 | exit 1 306 | ;; 307 | esac 308 | done 309 | 310 | # Set positional arguments 311 | local original_arguments 312 | original_arguments=${@} # save for reporting later 313 | shift $((OPTIND - 1)) # shift to avoid flags when assigning positional arguments 314 | local query_filepath 315 | query_filepath=$1 316 | local query_genome_filepath 317 | query_genome_filepath=$2 318 | local subject_genome_directory 319 | subject_genome_directory=$3 320 | local output_directory 321 | output_directory=$4 322 | 323 | echo "[ $(date -u) ]: Running ${SCRIPT_NAME}" >&2 324 | echo "[ $(date -u) ]: Command run: ${SCRIPT_NAME} ${original_arguments}" >&2 325 | 326 | make_run_templates ${template_config} ${query_filepath} ${query_genome_filepath} ${subject_genome_directory} \ 327 | ${genome_extension} ${output_directory} ${threads} ${phylogenetic_tree_newick} ${bootstrap_cutoff} ${root_name} \ 328 | ${evalue} ${pident} ${qcov} 329 | 330 | echo "[ $(date -u) ]: BackBLAST template generation finished." >&2 331 | } 332 | 333 | # Only run the script if it is called from the command line 334 | if [[ ${BASH_SOURCE[0]} = ${0} ]]; then 335 | main $@ 336 | fi 337 | 338 | -------------------------------------------------------------------------------- /scripts/search.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # ---------------------------------------------------------------------------------------- 4 | # Copyright: Lee H. Bergstrand and Jackson M. Tsuji, 2025 5 | # Description: A Biopython program that takes a list of query proteins and uses local BLASTP to search 6 | # for highly similar proteins within a local blast database (usually a local db of a target 7 | # proteome). The program then BLASTPs backwards from the found subject proteins to the query 8 | # proteome to confirm gene orthology. Part of the BackBLAST pipeline. 9 | # 10 | # Requirements: - This script requires BLAST+ 2.2.9 or later. 
#               - All operations are done with protein sequences.
#               - All query proteins should be from sequenced genomes in order to facilitate backwards BLAST.
#
# ===========================================================================================================

# Imports & Setup:
import argparse
import csv
import os
import subprocess
import sys
import uuid

from Bio import SeqIO
import networkx as nx

DEFAULT_E_VALUE_CUTOFF = 1e-25
DEFAULT_MINIMUM_IDENTITY_CUTOFF = 25
DEFAULT_MINIMUM_QUERY_COVERAGE = 50


def get_blast_hight_scoring_pairs(query_gene_cluster_path, subject_proteome_file, e_value_cutoff, minimum_identity,
                                  minimum_query_coverage):
    """
    Runs BLASTP on the given query and subject FASTA files and returns the filtered high scoring pairs.

    :param query_gene_cluster_path: The path to the query FASTA file.
    :param subject_proteome_file: The path to the subject FASTA file.
    :param e_value_cutoff: The e-value cutoff for BLASTP.
    :param minimum_identity: The minimum sequence identity that each HSP should have.
    :param minimum_query_coverage: The minimum query coverage (qcovhsp) that each HSP should have.
    :return: The blast output as a list of lists representing HSPs and their parameters.
    """
    raw_blast_output = run_blastp(query_gene_cluster_path, subject_proteome_file, e_value_cutoff=e_value_cutoff)
    return filter_blast_csv(raw_blast_output,
                            minimum_identity=minimum_identity,
                            minimum_query_coverage=minimum_query_coverage)


def run_blastp(query_file_path, subject_file_path, e_value_cutoff):
    """
    Runs BLASTP on the given query and subject FASTA files.

    :param query_file_path: The path to the query FASTA file.
    :param subject_file_path: The path to the subject FASTA file.
    :param e_value_cutoff: The e-value cutoff for BLASTP.
    :return: A csv formatted BLASTP output (query_sequence_id, subject_sequence_id, percent_identity, e-value,
             query coverage, bitscore)
    :raises subprocess.CalledProcessError: If blastp exits with a non-zero status.
    """
    blast_out = subprocess.check_output(
        ["blastp", "-query", query_file_path, "-subject", subject_file_path, "-evalue", str(e_value_cutoff),
         "-soft_masking", "true", "-seg", "yes", "-outfmt", "10 qseqid sseqid pident evalue qcovhsp bitscore"])

    # Decodes BLASTP output to text (in Py3 check_output returns raw bytes) and strips any spaces from the CSV
    return blast_out.decode().replace(' ', '')


def filter_blast_csv(raw_blast_output, minimum_identity, minimum_query_coverage):
    """
    Filters the blast results by minimum identity and minimum query coverage and creates a 2D array of HSPs.

    :param raw_blast_output: String containing the entire output from BLAST.
    :param minimum_identity: The minimum sequence identity that each HSP should have.
    :param minimum_query_coverage: The minimum query coverage (qcovhsp) that each HSP should have.
    :return: The filtered blast output as a list of lists representing HSPs and their parameters. Columns 2-5
             (percent identity, e-value, query coverage, bitscore) are converted to floats.
    """
    blast_results = csv.reader(raw_blast_output.splitlines(True))  # Reads BLAST csv rows as a csv

    filtered_blast_output = []
    for high_scoring_pair in blast_results:
        if not high_scoring_pair:
            # csv.reader yields an empty list for a blank line; there is nothing to filter in that case
            continue

        percent_identity = float(high_scoring_pair[2])
        query_coverage = float(high_scoring_pair[4])
        if percent_identity < minimum_identity or query_coverage < minimum_query_coverage:
            continue

        # Converts each high_scoring_pair parameter that should be a number to a number
        high_scoring_pair[2] = percent_identity
        high_scoring_pair[3] = float(high_scoring_pair[3])
        high_scoring_pair[4] = query_coverage
        high_scoring_pair[5] = float(high_scoring_pair[5])
        filtered_blast_output.append(high_scoring_pair)

    return filtered_blast_output


def create_fasta_cache(file_path):
    """
    Creates a cache of FASTA sequences from a FASTA file. Exits the program if the file cannot be read.

    :param file_path: A path to a FASTA file.
    :return: A dictionary of FASTA-formatted sequence records keyed by each sequence's ID.
    """
    proteome_hash = dict()
    try:
        # The context manager closes the handle even if parsing fails part-way through the file
        with open(file_path, "r") as handle:
            for record in SeqIO.parse(handle, "fasta"):
                proteome_hash[record.id] = record.format("fasta")
    except IOError:
        print("Failed to open " + file_path)
        sys.exit(1)

    return proteome_hash


def filter_forward_pairs_by_reverse_pairs(forward_blast_high_scoring_pairs, reverse_blast_high_scoring_pairs):
    """
    Takes the forward blast results and uses the reverse blast results to filter out non-orthologous HSPs.

    Uses the algorithm found here:

    https://github.com/LeeBergstrand/BackBLAST_Reciprocal_BLAST/wiki/BackBLAST-Algorithm

    :param forward_blast_high_scoring_pairs: The forward blast high scoring pairs.
    :param reverse_blast_high_scoring_pairs: The reverse blast high scoring pairs.
    :return: A new list with the forward blast results, in their original order, with non-orthologous HSPs removed.
    """
    blast_graph = create_blast_graph(forward_blast_high_scoring_pairs, reverse_blast_high_scoring_pairs)

    reciprocal_best_hits = []
    print(">> Checking if forward hit subjects have better reciprocal hits than query.")
    for hit in forward_blast_high_scoring_pairs:
        query_protein_id = hit[0]
        subject_protein_id = hit[1]
        subject_protein = blast_graph[subject_protein_id]

        # Find the top score of the best reciprocal BLAST hit.
        # If an edge in the graph has no reverse bitscore, it means that no BLASTP hit passed the search threshold
        # for that subject -> query pair. A filler value of -1 is used for those edges (via none_value) so that
        # bitscores can still be compared; it can never reach the starting threshold of 0, so it is never "best".
        top_back_hit_score = 0
        for back_hit_id in subject_protein:
            back_hit_score = get_edge_attribute(subject_protein, back_hit_id, 'bitscore-rev', none_value=-1)
            if back_hit_score >= top_back_hit_score:
                top_back_hit_score = back_hit_score

        # The query must be a neighbour of the subject to be a reciprocal hit at all
        if query_protein_id not in subject_protein:
            continue

        # Keep the forward hit only if the query is (one of) the best reciprocal BLAST hit(s) for the subject
        back_hit_to_query_score = get_edge_attribute(subject_protein, query_protein_id,
                                                     'bitscore-rev', none_value=-1)
        if back_hit_to_query_score >= top_back_hit_score:
            reciprocal_best_hits.append(hit)

    # Return reciprocal BLAST output
    return reciprocal_best_hits


def _set_best_bitscore(blast_graph, first_protein_id, second_protein_id, attribute_name, bitscore):
    """
    Adds the edge between two proteins (if needed) and stores the bitscore only if it beats the stored one.

    :param blast_graph: The nx.Graph() to update in place.
    :param first_protein_id: The ID of one protein of the pair.
    :param second_protein_id: The ID of the other protein of the pair.
    :param attribute_name: Name of the edge attribute that holds the bitscore.
    :param bitscore: The bitscore of the HSP being recorded.
    """
    blast_graph.add_edge(first_protein_id, second_protein_id)
    edge_attributes = blast_graph[first_protein_id][second_protein_id]
    if attribute_name not in edge_attributes or bitscore > edge_attributes[attribute_name]:
        edge_attributes[attribute_name] = bitscore


def create_blast_graph(forward_blast_high_scoring_pairs, reverse_blast_high_scoring_pairs):
    """
    Builds a graph network of forward and reverse hits.

    A query/subject pair can appear in several rows (one per HSP). The edge keeps the highest bitscore seen for
    the pair rather than whichever row happened to come last, so that a weaker secondary HSP cannot mask the
    best alignment when reciprocal hits are compared.

    :param forward_blast_high_scoring_pairs: The forward blast high scoring pairs.
    :param reverse_blast_high_scoring_pairs: The reverse blast high scoring pairs.
    :return: A graph representation of forward and reverse HSPs. Forward bitscores are stored in the
             'bitscore-fwd' edge attribute and reverse bitscores in 'bitscore-rev'.
    """
    blast_graph = nx.Graph()  # Creates graph to map BLAST hits
    print(">> Creating Graph...")
    # hit[0] = query ID; hit[1] = subject ID; hit[5] = bitscore

    for hit in forward_blast_high_scoring_pairs:
        _set_best_bitscore(blast_graph, hit[0], hit[1], 'bitscore-fwd', hit[5])

    for hit in reverse_blast_high_scoring_pairs:
        _set_best_bitscore(blast_graph, hit[1], hit[0], 'bitscore-rev', hit[5])

    return blast_graph


def get_edge_attribute(vertex, neighbour_id, attribute_name, none_value=None):
    """
    Get attribute of the edge of a nx.Graph()

    :param vertex: AtlasView of a graph centered on the vertex of interest (e.g., graph[vertex]).
    :param neighbour_id: The ID of the neighbouring vertex to form the edge.
    :param attribute_name: Name of the edge attribute.
    :param none_value: What to return if the attribute does not exist for the edge.
    :return: The value of the attribute for the desired edge.
    """
    edge_attributes = vertex[neighbour_id]
    return edge_attributes.get(attribute_name, none_value)


def _remove_file_if_present(file_path):
    """
    Deletes a file, ignoring the case where it was never created.

    :param file_path: The path of the file to delete.
    """
    if os.path.exists(file_path):
        os.remove(file_path)


def main(args):
    """
    The starting point of BackBLAST.

    :param args: The CLI arguments.
    """
    query_gene_cluster_path = args.gene_cluster
    query_proteome_path = args.query_proteome
    subject_proteome_file = args.subject_proteome
    input_e_value_cutoff = args.e_value
    input_min_ident_cutoff = args.min_ident
    input_min_query_cov_cutoff = args.min_query_cov
    out_file = args.output_file

    print("Opening " + subject_proteome_file + "...")

    # File extension checks
    if not query_gene_cluster_path.endswith(".faa"):
        print("[Warning] " + query_gene_cluster_path + " may not be a amino acid FASTA file!")
    if not query_proteome_path.endswith(".faa"):
        print("[Warning] " + query_proteome_path + " may not be a amino acid FASTA file!")
    if not subject_proteome_file.endswith(".faa"):
        print("[Warning] " + subject_proteome_file + " may not be a amino acid FASTA file!")

    print(">> Forward Blasting to subject proteome...")
    # Forward BLAST from query proteins to subject proteome and filter BLAST results by percent identity and query cov
    forward_blast_high_scoring_pairs = get_blast_hight_scoring_pairs(query_gene_cluster_path=query_gene_cluster_path,
                                                                     subject_proteome_file=subject_proteome_file,
                                                                     e_value_cutoff=input_e_value_cutoff,
                                                                     minimum_identity=input_min_ident_cutoff,
                                                                     minimum_query_coverage=input_min_query_cov_cutoff)

    if len(forward_blast_high_scoring_pairs) == 0:
        print(">> No Forward hits in subject proteome were found.")

        try:
            open(out_file, "w").close()  # Writes empty file for easier data processing.
        except IOError:
            print(">> Failed to create " + out_file)
            sys.exit(1)
        print(">> Exiting.\n\n")
        sys.exit(0)  # Aborts program. (exit(0) indicates that no error occurred)

    # Creates python dictionary with every protein FASTA sequence in the subject proteome
    subject_proteome_fasta_cache = create_fasta_cache(subject_proteome_file)

    print(">> Creating Back-Blasting Query from found subject proteins...")
    # Collect the FASTA record of each subject protein that was hit. A subject can be hit by several queries (or
    # by several HSPs), but it only needs to be BLASTed backwards once.
    back_blast_query_fastas = []
    seen_subject_proteins = set()
    for hit in forward_blast_high_scoring_pairs:
        subject_protein = hit[1]
        if subject_protein in seen_subject_proteins:
            continue
        seen_subject_proteins.add(subject_protein)

        subject_protein_fasta = subject_proteome_fasta_cache.get(subject_protein)
        if subject_protein_fasta is None:
            # Without the sequence there is nothing to BLAST backwards; fail loudly rather than with a TypeError
            print(">> Failed to find subject protein " + subject_protein + " in " + subject_proteome_file)
            sys.exit(1)
        back_blast_query_fastas.append(subject_protein_fasta)  # Adds current subject to overall protein list

    complete_back_blast_query = "".join(back_blast_query_fastas)

    # Attempt to write a temporary FASTA file for the reverse BLAST to use
    temp_filename = "temp_query_" + uuid.uuid4().hex + ".faa"
    print(">> Writing backBLASTing query to temporary file " + temp_filename)
    try:
        try:
            with open(temp_filename, "w") as write_file:
                write_file.write(complete_back_blast_query)
        except IOError:
            print("Failed to create " + temp_filename)
            sys.exit(1)

        print(">> BLASTing backwards from subject genome to query genome.")
        # Run backwards BLAST towards query proteome and filters BLAST results by percent identity
        reverse_blast_high_scoring_pairs = get_blast_hight_scoring_pairs(
            query_gene_cluster_path=temp_filename,
            subject_proteome_file=query_proteome_path,
            e_value_cutoff=input_e_value_cutoff,
            minimum_identity=input_min_ident_cutoff,
            minimum_query_coverage=input_min_query_cov_cutoff)
    finally:
        # The temporary query is only needed for the reverse BLAST; remove it even if that step failed
        _remove_file_if_present(temp_filename)

    filterable_forward_blast_results = filter_forward_pairs_by_reverse_pairs(forward_blast_high_scoring_pairs,
                                                                             reverse_blast_high_scoring_pairs)
    try:
        # newline="" lets the csv module control line endings itself, as recommended by the csv documentation
        with open(out_file, "w", newline="") as write_file:
            writer = csv.writer(write_file)
            print(">> Output file created.")
            print(">> Writing Data...")
            writer.writerows(filterable_forward_blast_results)
    except IOError:
        print(">> Failed to create " + out_file)
        sys.exit(1)
    print(">> Done\n")


if __name__ == '__main__':
    # Command Line Interface Options

    parser = argparse.ArgumentParser(description="Runs reciprocal BLASTP on a given set of query proteins. "
                                                 "Copyright Lee H. Bergstrand and Jackson M. Tsuji, 2025.")
    parser.add_argument('-q', '--gene_cluster', metavar='FASTA', required=True,
                        help='''The path to the protein FASTA file of the gene cluster to be used as a query.''')

    parser.add_argument('-r', '--query_proteome', metavar='FASTA', required=True,
                        help='''The path to a FASTA file containing all proteins from query organism.''')

    parser.add_argument('-s', '--subject_proteome', metavar='FASTA', required=True,
                        help='''The path to a FASTA file containing all proteins from subject organism.''')

    # Kept as a string: the value is passed straight through to the blastp command line
    parser.add_argument('-e', '--e_value', metavar='E-VALUE', default=DEFAULT_E_VALUE_CUTOFF, type=str,
                        help='''The Expect value (E) cutoff for removing high scoring pairs. ''' +
                             '''The smaller this number is, the stricter the BLAST search.''')

    parser.add_argument('-i', '--min_ident', metavar='IDENT', default=DEFAULT_MINIMUM_IDENTITY_CUTOFF, type=float,
                        help='''The minimum sequence identity cutoff for removing high scoring pairs. ''' +
                             '''The larger this number is, the stricter the BLAST search.''')

    parser.add_argument('-c', '--min_query_cov', metavar='QCOV', default=DEFAULT_MINIMUM_QUERY_COVERAGE, type=float,
                        help='''The minimum percent query coverage cutoff for removing high scoring pairs. ''' +
                             '''The larger this number is, the stricter the BLAST search.''')

    parser.add_argument('-o', '--output_file', metavar='OUTPUT', required=True,
                        help='''The path to write CSV-format BLAST results to.''')

    cli_args = parser.parse_args()
    main(cli_args)
''' + 324 | '''The smaller this number is, the stricter the BLAST search.''') 325 | 326 | parser.add_argument('-i', '--min_ident', metavar='IDENT', default=DEFAULT_MINIMUM_IDENTITY_CUTOFF, type=float, 327 | help='''The minimum sequence identify cutoff for removing high scoring pairs.''' + 328 | '''The larger this number is, the stricter the BLAST search.''') 329 | 330 | parser.add_argument('-c', '--min_query_cov', metavar='QCOV', default=DEFAULT_MINIMUM_QUERY_COVERAGE, type=float, 331 | help='''The minimum percent query coverage cutoff for removing high scoring pairs.''' + 332 | '''The larger this number is, the stricter the BLAST search.''') 333 | 334 | parser.add_argument('-o', '--output_file', metavar='OUTPUT', required=True, 335 | help='''The path to write CSV-format BLAST results to.''') 336 | 337 | cli_args = parser.parse_args() 338 | main(cli_args) 339 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | BackBLAST reciprocal BLAST workflow 2 | ========================== 3 | [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.3465954.svg)](https://doi.org/10.5281/zenodo.3465954) 4 | 5 | Copyright Lee H. Bergstrand and Jackson M. Tsuji, 2025 6 | 7 | # Software overview 8 | `backblast` automates the use of NCBI BLASTP to search for genes or gene clusters within bacterial genomes. 9 | Non-orthologous genes are filtered out by identifying and extracting only bidirectional best BLASTP hits using a graph-based algorithm. 10 | `backblast` then visualizes the results of bidirectional BLASTP in a convenient gene heatmap coupled to a genome phylogeny. 
11 | 12 | The bidirectional BLASTP-based filtering algorithm is illustrated below: 13 | 14 | ![BackBLAST Algorithm](images/BackBLAST-Algorithm.gif) 15 | 16 | Example gene heatmap visualization: 17 | 18 | ![Example Results](https://media.springernature.com/full/springer-static/image/art%3A10.1038%2Fs41396-020-0650-2/MediaObjects/41396_2020_650_Fig7_HTML.png) 19 | (From Spasov, Tsuji, _et al._, 2020, [doi:10.1038/s41396-020-0650-2](https://doi.org/10.1038/s41396-020-0650-2)) 20 | 21 | # Requirements and dependencies 22 | - OS: runs on linux (e.g., Ubuntu) and MacOS (tested on Sonoma 14 and Sequoia 15) 23 | - Hardware: most modern computers (e.g., with basic CPU, >=4 GB RAM, and >=4 GB of free storage) should be able to run BackBLAST without issue. 24 | The only exception is if you create a genome tree within the pipeline, in which case you'll need a fair amount of CPU and time to calculate large trees. 25 | - Software: miniconda or miniforge needs to be installed (must be set to the `osx-64` channel for MacOS). 26 | (Because conda is used during the run to install additional tools for the workflow, you will need an internet connection the first time you run BackBLAST.) 
27 | 28 | # Installation 29 | Installation is currently semi-manual. We hope to make an automated conda install in the future. 30 | 31 | Run the following code in your command line (e.g., Terminal) to install BackBLAST: 32 | ```bash 33 | # Download the repo 34 | git clone https://github.com/LeeBergstrand/BackBLAST_Reciprocal_BLAST.git 35 | cd BackBLAST_Reciprocal_BLAST 36 | # git checkout develop # optionally go to a specific branch 37 | # git checkout v2.0.0-beta1 # optionally go to a specific release 38 | 39 | # Create the conda env 40 | conda env create -n backblast --file=environment.yml 41 | 42 | # Copy the key repo contents into a conda share folder 43 | conda activate backblast 44 | mkdir -p ${CONDA_PREFIX}/share/backblast 45 | cp -r * ${CONDA_PREFIX}/share/backblast 46 | 47 | # Remove the original repo 48 | cd .. 49 | rm -rf BackBLAST_Reciprocal_BLAST 50 | 51 | # Add instructions to export the BackBLAST folder to the PATH when the repo activates 52 | mkdir -p ${CONDA_PREFIX}/etc/conda/activate.d 53 | 54 | if [[ ! -f ${CONDA_PREFIX}/etc/conda/activate.d/env_vars.sh ]]; then 55 | echo '#!/bin/sh' > ${CONDA_PREFIX}/etc/conda/activate.d/env_vars.sh 56 | fi 57 | echo "export PATH=\${PATH}:${CONDA_PREFIX}/share/backblast:${CONDA_PREFIX}/share/backblast/scripts" \ 58 | >> ${CONDA_PREFIX}/etc/conda/activate.d/env_vars.sh 59 | chmod 755 ${CONDA_PREFIX}/etc/conda/activate.d/env_vars.sh 60 | 61 | # Re-activate the repo to apply the changes 62 | conda deactivate 63 | conda activate backblast 64 | ``` 65 | Now you should be good to go! Run `backblast -h` to get started. 66 | 67 | # Usage: quick start 68 | ```bash 69 | # Set up the run 70 | backblast setup query.faa query_genome.faa subject_dir output_dir 71 | # Then edit output_dir/config.yaml 72 | # You can also edit output_dir/gene_metadata.tsv and output_dir/genome_metadata.tsv to make the plot look better 73 | 74 | # Start the run 75 | backblast run output_dir/config.yaml output_dir 76 | # All done! 
You can iteratively refine the plot from here as you'd like. 77 | ``` 78 | See more detailed instructions below. 79 | 80 | # Usage: step-by-step instructions 81 | Get beautiful heatmaps using BackBLAST by following the step-by-step instructions below. 82 | 83 | ## 1. Prepare your input files 84 | Prepare the following files and folders as input for BackBLAST: 85 | - `query.faa`: multi-fasta file containing the query proteins that you want to search for. 86 | These should be copied and pasted from the `query_genome.faa` file below (the FastA headers for each protein need to be the same as in that file). 87 | - `query_genome.faa`: multi-fasta file containing all proteins encoded by the query bacterial genome. 88 | - `subject_dir`: a folder containing the subjects for the search. 89 | Each subject should be the protein-coding genes (as amino acids) for a bacterial genome. 90 | Save each subject as multi-fasta file (like `query_genome.faa` above). 91 | By default, the extension of each file needs to be `.faa`, but this can be changed in the tool settings if desired. 92 | - `output_dir`: an empty folder for saving the output of BackBLAST. 93 | 94 | You are then ready to run BackBLAST, using either method 2a or 2b below. 95 | 96 | ## 2a. Recommended workflow 97 | Gives you the ability to customize the parameters of your run. 98 | 99 | ### 2a-i. Set up the run 100 | ```bash 101 | backblast setup query.faa query_genome.faa subject_dir output_dir 102 | ``` 103 | This makes several configuration files in the `output_dir`. 104 | 105 | ### 2a-ii. Configure the run settings and metadata 106 | Open and edit `output_dir/config.yaml` (e.g., in a text editor) to customize the settings for your run. Some key settings include: 107 | - Thresholds for bidirectional BLASTP: you can set the e-value, percent identity, and query coverage cutoffs for the BLASTP search. 
108 | - Phylogenetic tree: you can also set whether you want to auto-generate a genome-based phylogenetic tree during your run (takes time) 109 | or if you want to supply a custom pre-generated phylogenetic tree. 110 | - If you provide your own phylogenetic tree, the genome IDs in the tree need to match the file names of the genomes in the `subject_dir`. 111 | In addition, all genomes in the phylogenetic tree need to be present in the `subject_dir`, or else BackBLAST will fail. If some 112 | genomes are in the `subject_dir` but are missing in the tree, then BackBLAST will still proceed but will drop those genomes from the 113 | analysis. 114 | - Alternatively, you can also choose to not add a phylogenetic tree to the heatmap. 115 | - Tree rooting: one other helpful setting is related to tree rooting. You can choose to root the tree automatically by midpoint or provide 116 | the name of one of the genomes that you want to serve as the root of the tree. Alternatively, you can choose to use the current tree 117 | topology as-is. 118 | - Plot width and height: you can adjust the final width and height of the heatmap as well. It might take a few rounds of running BackBLAST 119 | to find the "perfect" width and height for your heatmap (see below). 120 | - Take a look at the full config file to see other advanced settings. 121 | 122 | You also have the opportunity (optionally) to edit the tab-separated tables `output_dir/gene_metadata.tsv` and `output_dir/genome_metadata.tsv` 123 | to improve data visualization. You can open these in a text editor or a table editor (like Excel). 124 | - In `gene_metadata.tsv`, you can provide human-readable names for the query genes you want to search for. You can also customize the order 125 | that the genes will be shown in the heatmap by changing how they are sorted in this file. If you delete rows, then those genes will be removed 126 | from the heatmap during the run. 
127 | - In `genome_metadata.tsv`, you can provide human-readable names for the genomes in the run. The sort order of the genomes in this file only 128 | impacts the order that the genomes are plotted in if you choose to not use a phylogenetic tree (in the config file above). Otherwise, the 129 | sort order is determined using the phylogenetic tree. 130 | 131 | ### 2a-iii. Start the run 132 | ```bash 133 | backblast run output_dir/config.yaml output_dir 134 | ``` 135 | This should generate a series of blast and heatmap-related files in the output folder. 136 | 137 | ### 2a-iv. Check and iteratively improve the run results 138 | Then, take a look at the key output files `heatmap/BackBLAST_heatmap.tsv` and `heatmap/BackBLAST_heatmap.pdf`. 139 | The first of these files shows the final bidirectional BLASTP results as a tab-separated table. The second is the final heatmap. 140 | 141 | If you aren't satisfied with the results, you can selectively delete run files from the output folder, tweak the settings files above, and 142 | then re-run BackBLAST (via step 2a-iii) to re-generate the desired files with the new run settings. 143 | 144 | Three common things to change are: 145 | - If your BLASTP results are too stringent or not stringent enough: delete the `blast` folder, tweak the key BLASTP settings (in the config file), 146 | and then re-run `backblast run`. This will redo everything from the `blast` step and will overwrite the old heatmap files. 147 | - If your visualization doesn't look right (e.g., the height and width aren't optimal, the genome/gene names aren't ideal, or the genes aren't sorted 148 | as you'd like), then delete the `heatmap` folder, tweak the config and metadata files (mentioned above), and re-run `backblast run`. 149 | BackBLAST will then use the old `blast` results and just generate a new heatmap. 150 | - If you want to use a different phylogenetic tree, then point to a new phylogenetic tree file in the config file, then re-run `backblast run`. 
151 | BackBLAST will use the old `blast` results but should overwrite the old heatmap files with new ones using the updated tree. 152 | 153 | ### 2a-v. Final tweaks to the heatmap 154 | The PDF heatmap will not be perfect (e.g., the dashed lines between the genome names and tree tips won't be perfectly aligned), but 155 | you can open the heatmap in a PDF editor like Inkscape and clean it up or customize it as desired. 156 | 157 | ## 2b. Alternative, speedy workflow 158 | Gets the job done without any custom settings. 159 | This command skips the setup step (above) and just runs the pipeline. You can set some of the key config settings via optional flags. 160 | ```bash 161 | backblast auto [OPTIONS] query.faa query_genome.faa subject_dir output_dir 162 | # See the different OPTIONS for customizing your run in the full settings at the bottom of this README. 163 | ``` 164 | 165 | The level of fine-tuning possible using this method is much less than in Method 2a, but it is a nice way to quickly see some initial 166 | bidirectional BLASTP results. 167 | 168 | In addition, you can iteratively tweak the run results and re-run using `backblast run` just as described in step 2a-iv above. 169 | 170 | Once done, you can then edit the PDF heatmap as described in step 2a-v above. 171 | 172 | # Caveats 173 | One disadvantage of BackBLAST is that it has trouble handling paralogous genes (e.g., gene duplicates). 174 | - If the __query genome__ contains paralogs of your query gene, this can sometimes cause BackBLAST to miss real functional gene hits 175 | for your query gene in some subject genomes. If the gene in the subject genome happens to be a closer match to one of the paralogs 176 | in the query genome than to your query, then even if the gene in the subject genome is a valid functional gene for the process you are 177 | interested in, it will be screened out by the BackBLAST algorithm and not reported. 
This can lead to an under-estimation of the true 178 | functional gene content of subject genomes. 179 | - As a partial workaround, we recommend to check for paralogs of your query genes in your query genome before running BackBLAST so 180 | you are aware if there could be paralog-related issues. One easy way to check for paralogs is to run BLASTP for each of your query genes against 181 | the query genome and look for high-scoring hits. You can then watch for unexpected results when running BackBLAST. If you are not hitting 182 | real functional genes that you want to detect in the subject genomes due to having paralogs in the query genome, then you can mask out 183 | (i.e., delete) the paralogs of your query gene in the query genome file and re-run BackBLAST. Make sure you know what you are doing if 184 | you do this, though... in some use cases, you really want the paralogs to be there to filter out non-true hits to your query. 185 | - If the __subject genomes__ contain paralogs (or multiple copies) of the query, then BackBLAST will not report this information. It will just 186 | show the best hit to your query gene and ignore the other hits. This is non-ideal if you want to know how many paralogs are in the subject genomes. 187 | - There is no built-in workaround for this in BackBLAST at the moment. 
188 | 189 | # Test data 190 | Try a test run from inside the repo with: 191 | ```bash 192 | mkdir -p testing/outputs 193 | # Make sure backblast is added to your PATH before running the test 194 | backblast run testing/inputs/config.yaml testing/outputs --notemp 195 | 196 | # See if the output files looks as expected 197 | cmp testing/outputs/blast/combine_blast_tables/blast_tables_combined.csv \ 198 | testing/outputs_expected/blast/combine_blast_tables/blast_tables_combined.csv 199 | cmp testing/outputs/heatmap/BackBLAST_heatmap.tsv \ 200 | testing/outputs_expected/heatmap/BackBLAST_heatmap.tsv 201 | 202 | # Clean up test if everything looks good 203 | rm -r testing/outputs 204 | ``` 205 | 206 | # Citation 207 | We hope that BackBLAST is helpful for you! If you use BackBLAST in your research, please cite the following paper, 208 | which describes the initial version of BackBLAST: 209 | 210 | > Bergstrand LH, Cardenas E, Holert J, Van Hamme JD, Mohn WW. Delineation of steroid-degrading microorganisms through 211 | > comparative genomic analysis. __mBio__ 7:[10.1128/mbio.00166-16](https://doi.org/10.1128/mbio.00166-16). 212 | 213 | # Appendix: full usage instructions 214 | Help messages (e.g., from running `backblast -h`) are pasted below. These show some of the advanced options possible in 215 | the command line (e.g., for `backblast auto`). 216 | 217 | `backblast` 218 | ``` 219 | backblast: pipeline to search for and visualize gene homologs across multiple genomes. 220 | 221 | Please specify a run mode for the main workflow for further usage instructions: 222 | 1. setup: for setting up pre-run configuration files 223 | 2. run: to start a run using configuration files 224 | 3. 
auto: to skip setup and run the pipeline end-to-end with default values 225 | 226 | Or run a specific step in the workflow manually: 227 | search: performs reciprocal BLASTP on given inputs 228 | remove_duplicates: removes duplicate hits from reciprocal BLASTP results 229 | create_blank_results: creates a blank output BLAST table for entries with no reciprocal BLASTP hits 230 | combine_tables: combines a set of input BLAST tables 231 | generate_heatmap: generates a heatmap from input BLAST and phylogeny data 232 | 233 | Advanced parameters (use with care): 234 | -U utils_dir: Path to the BackBLAST utility script directory for running specific workflow steps 235 | [default: /Users/jmtsuji/mambaforge/envs/backblast/share/backblast/scripts] 236 | ``` 237 | 238 | `backblast setup` 239 | ``` 240 | backblast setup: module for setting up a BackBLAST run. 241 | 242 | Usage: backblast setup [OPTIONS] query_filepath query_genome_filepath subject_genome_directory output_directory 243 | 244 | Positional arguments (required): 245 | query_filepath: path to the query predicted protein sequences from the query genome, FastA format 246 | query_genome_filepath: path to the predicted proteins of the entire query genome, FastA format 247 | subject_genome_directory: directory containing predicted proteins of all subject genomes (FastA format). 248 | One genome per file, with extension 'faa' (or specify -x) 249 | output_directory: directory where config ('config.yaml') and metadata templates ('genome_metadata.tsv', 'gene_metadata.tsv') will be created 250 | 251 | Optional arguments: 252 | -t phylogenetic_tree_newick: path to the pre-calculated phylogenetic tree [default: 'subjects' - auto-calculate tree] 253 | Specify 'NA' to skip tree generation and plot the heatmap alone. 
254 | -b bootstrap_cutoff: numerical value (e.g., 80) under which bootstrap values will not be displayed [default: NA] 255 | -r root_name: Exact name of the tree tip label at the desired root of the tree [default: NA; will skip rooting] 256 | -e evalue: e-value cutoff for reciprocal BLASTP [default: 1e-40] 257 | -p pident: percent identity cutoff for reciprocal BLASTP [default: 25] 258 | -c qcov: percent query coverage cutoff for reciprocal BLASTP [default: 50] 259 | -x genome_extension: extension for predicted protein files of subject genomes [default: faa] 260 | -@ threads: maximum threads to use for any process [default: 1] 261 | 262 | Advanced parameters (use with care): 263 | -T template_config: Path to the template config file used in setup [default: /Users/jmtsuji/mambaforge/envs/backblast/share/backblast/snakemake/template_config.yaml] 264 | -U utils_dir: Path to the BackBLAST utility script directory [default: /Users/jmtsuji/mambaforge/envs/backblast/share/backblast/scripts] 265 | 266 | Note: Currently does NOT support whitespaces in any input variables. 267 | ``` 268 | 269 | `backblast run` 270 | ``` 271 | backblast run: module for initializing a BackBLAST run. 
272 | 273 | Usage: backblast run [OPTIONS] config_filepath run_directory [SNAKEMAKE_ARGUMENTS] 274 | 275 | Positional arguments (required): 276 | config_filepath: path to the config file generated using the 'setup' module 277 | run_directory: the directory where BackBLAST results ought to be output 278 | 279 | Optional arguments: 280 | -P conda_prefix: path to where the conda envs should be stored [default: '.snakemake/conda' in the run_directory] 281 | -j jobs: Number of processing threads available for the run [default: 1] 282 | **Should be no lower than the 'threads' setting in the config file** 283 | 284 | Advanced parameters (use with care): 285 | -S snakefile: Path to the Snakefile used to run BackBLAST [default: /Users/jmtsuji/mambaforge/envs/backblast/share/backblast/snakemake/Snakefile] 286 | -C use_conda: specify either 'True' or 'False' for whether or not each job should be run in 287 | its own conda env [default: True] 288 | If 'False' is set, then all dependencies need to be installed in the main environment 289 | where BackBLAST is running. Could be tricky. 290 | 291 | Snakemake arguments: 292 | Any flags added at the end of the command will be passed directly to snakemake, e.g., --notemp 293 | 294 | Note: Currently does NOT support whitespaces in any input variables. 295 | ``` 296 | 297 | `backblast auto` 298 | ``` 299 | backblast auto: module for setting up a BackBLAST run AND starting with some defaults. 300 | 301 | Usage: backblast auto [OPTIONS] query_filepath query_genome_filepath subject_genome_directory output_directory [SNAKEMAKE_ARGUMENTS] 302 | 303 | Positional arguments (required): 304 | query_filepath: path to the query predicted protein sequences from the query genome, FastA format 305 | query_genome_filepath: path to the predicted proteins of the entire query genome, FastA format 306 | subject_genome_directory: directory containing predicted proteins of all subject genomes (FastA format).
307 | One genome per file, with extension 'faa' (or specify -x) 308 | output_directory: directory where config ('config.yaml') and metadata templates ('genome_metadata.tsv', 'gene_metadata.tsv') will be created 309 | 310 | Optional arguments: 311 | -t phylogenetic_tree_newick: path to the pre-calculated phylogenetic tree [default: 'subjects' - auto-calculate tree] 312 | Specify 'NA' to skip tree generation and plot the heatmap alone. 313 | -b bootstrap_cutoff: numerical value (e.g., 80) under which bootstrap values will not be displayed [default: NA] 314 | -r root_name: Exact name of the tree tip label at the desired root of the tree [default: NA; will skip rooting] 315 | -e evalue: e-value cutoff for reciprocal BLASTP [default: 1e-40] 316 | -p pident: percent identity cutoff for reciprocal BLASTP [default: 25] 317 | -c qcov: percent query coverage cutoff for reciprocal BLASTP [default: 50] 318 | -x genome_extension: extension for predicted protein files of subject genomes [default: faa] 319 | -P conda_prefix: path to where the conda envs should be stored [default: '.snakemake/conda' in the run_directory] -@ threads: maximum threads to use for any process [default: 1] 320 | -j jobs: Number of processing threads available for the run [default: 1] 321 | **Should be no lower than the 'threads' setting in the config file** 322 | 323 | Advanced parameters (use with care): 324 | -T template_config: Path to the template config file used in setup [default: /Users/jmtsuji/mambaforge/envs/backblast/share/backblast/snakemake/template_config.yaml] 325 | -S snakefile: Path to the Snakefile used to run BackBLAST [default: /Users/jmtsuji/mambaforge/envs/backblast/share/backblast/snakemake/Snakefile] 326 | -C use_conda: specify either 'True' or 'False' for whether or not each job should be run in 327 | its own conda env [default: True] 328 | If 'False' is set, then all dependencies need to be installed in the main environment 329 | where BackBLAST is running. Could be tricky. 
330 | 331 | Snakemake arguments: 332 | Any flags added at the end of the command will be passed directly to snakemake, e.g., --notemp 333 | 334 | Note: Currently does NOT support whitespaces in any input variables. 335 | ``` 336 | -------------------------------------------------------------------------------- /backblast: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -euo pipefail 3 | # BackBLAST 4 | # Copyright Lee H. Bergstrand and Jackson M. Tsuji, 2024 5 | # The entry script for running BackBLAST via command line 6 | 7 | # GLOBAL variables 8 | readonly VERSION="2.0.0-beta1" 9 | readonly COPYRIGHT_YEAR="2024" 10 | readonly SCRIPT_NAME="${0##*/}" 11 | readonly SCRIPT_DIR="$(realpath ${0%/*})" 12 | readonly UTILS_DIR="${SCRIPT_DIR}/scripts" 13 | readonly TEMPLATE_CONFIG="${SCRIPT_DIR}/snakemake/template_config.yaml" 14 | readonly SNAKEFILE="${SCRIPT_DIR}/snakemake/Snakefile" 15 | 16 | ####################################### 17 | # Run snakemake 18 | # Globals: (none) 19 | # Arguments: 20 | # snakefile: the path to the Snakefile used to run BackBLAST 21 | # config_file: path to the config.yaml file containing settings for the run 22 | # run_directory: path to the directory where output files should be written 23 | # conda_prefix: the path to where conda environments are stored 24 | # jobs: maximum number of parallel jobs for the snakemake scheduler to run 25 | # use_conda: character value of either 'True' or 'False' to specify whether each job should be run in its own conda environment. 26 | # If 'False' is specified, then all dependencies need to be installed on your machine (e.g., in a central conda env). 27 | # snakemake_arguments: everything after argument 6 are flags passed directly to snakemake. May contain spaces inbetween! 
28 | # Returns: 29 | # output files from the BackBLAST pipeline, in the run_directory 30 | ####################################### 31 | function run_snakemake() { 32 | # Get input variables 33 | local snakefile 34 | snakefile=$1 35 | local config_file 36 | config_file=$2 37 | local run_directory 38 | run_directory=$3 39 | local conda_prefix 40 | conda_prefix=$4 41 | local jobs 42 | jobs=$5 43 | local use_conda 44 | use_conda=$6 45 | local snakemake_arguments 46 | # Get everything after argument 6 47 | snakemake_arguments=${@:7} 48 | 49 | # Make sure conda_prefix is an absolute path if not the default 50 | if [[ "${conda_prefix}" != ".snakemake/conda" ]]; then 51 | conda_prefix=$(realpath "${conda_prefix}") 52 | fi 53 | 54 | # Run snakemake 55 | if [[ ${use_conda} == "True" ]]; then 56 | echo "[ $(date -u) ]: Command: snakemake --snakefile '${snakefile}' --configfile '${config_file}' --directory '${run_directory}' --conda-prefix '${conda_prefix}' --jobs ${jobs} --rerun-incomplete --reason --printshellcmds --use-conda ${snakemake_arguments}" >&2 57 | snakemake --snakefile "${snakefile}" --configfile "${config_file}" --directory "${run_directory}" --conda-prefix "${conda_prefix}" --jobs "${jobs}" --rerun-incomplete --reason --printshellcmds --use-conda ${snakemake_arguments} 58 | elif [[ ${use_conda} == "False" ]]; then 59 | # No need for --conda-prefix here; do not use 60 | echo "[ $(date -u) ]: Command: snakemake --snakefile '${snakefile}' --configfile '${config_file}' --directory '${run_directory}' --jobs ${jobs} --rerun-incomplete --reason --printshellcmds ${snakemake_arguments}" >&2 61 | snakemake --snakefile "${snakefile}" --configfile "${config_file}" --directory "${run_directory}" --jobs ${jobs} --rerun-incomplete --reason --printshellcmds ${snakemake_arguments} 62 | else 63 | echo "[ $(date -u) ]: the 'use_conda' variable must be either 'True' or 'False'; instead, '${use_conda}' was specified. Exiting..." 
>&2 64 | exit 1 65 | fi 66 | } 67 | 68 | ####################################### 69 | # Perform the 'setup' command 70 | # Globals: 71 | # SCRIPT_NAME: the name of this script 72 | # VERSION: the script version 73 | # TEMPLATE_CONFIG: the path to the template config file used in setup (YAML format) 74 | # OPTIND and OPTARG: used in command line parsing as part of 'getopts' 75 | # Arguments: 76 | # all command line inputs for the 'setup' module - see help statement below 77 | # Returns: 78 | # runs the 'setup' command end-to-end 79 | ####################################### 80 | function perform_setup() { 81 | if [[ $# -eq 0 ]]; then 82 | echo "setup: No arguments provided. Please run 'setup -h' or 'setup --help' for help. Exiting..." >&2 83 | exit 1 84 | elif [[ $1 = "-h" ]] || [[ $1 = "--help" ]]; then 85 | 86 | # Print help statement 87 | printf "${SCRIPT_NAME} setup: module for setting up a BackBLAST run.\n" 88 | printf "Copyright Lee H. Bergstrand and Jackson M. Tsuji, Neufeld Research Group, 2024\n" 89 | printf "Version: ${VERSION}\n\n" 90 | printf "Usage: ${SCRIPT_NAME} setup [OPTIONS] query_filepath query_genome_filepath subject_genome_directory output_directory\n\n" 91 | printf "Positional arguments (required):\n" 92 | printf " query_filepath: path to the query predicted protein sequences from the query genome, FastA format\n" 93 | printf " query_genome_filepath: path to the predicted proteins of the entire query genome, FastA format\n" 94 | printf " subject_genome_directory: directory containing predicted proteins of all subject genomes (FastA format).\n" 95 | printf " One genome per file, with extension 'faa' (or specify -x)\n" 96 | printf " output_directory: directory where config ('config.yaml') and metadata templates ('genome_metadata.tsv', 'gene_metadata.tsv') will be created\n\n" 97 | printf "Optional arguments:\n" 98 | printf " -t phylogenetic_tree_newick: path to the pre-calculated phylogenetic tree [default: 'subjects' - auto-calculate tree]\n" 99 
| printf " Specify 'NA' to skip tree generation and plot the heatmap alone.\n" 100 | printf " -b bootstrap_cutoff: numerical value (e.g., 80) under which bootstrap values will not be displayed [default: NA]\n" 101 | printf " -r root_name: Exact name of the tree tip label at the desired root of the tree [default: NA; will skip rooting]\n" 102 | printf " -e evalue: e-value cutoff for reciprocal BLASTP [default: 1e-40]\n" 103 | printf " -p pident: percent identity cutoff for reciprocal BLASTP [default: 25]\n" 104 | printf " -c qcov: percent query coverage cutoff for reciprocal BLASTP [default: 50]\n" 105 | printf " -x genome_extension: extension for predicted protein files of subject genomes [default: faa]\n" 106 | printf " -@ threads: maximum threads to use for any process [default: 1]\n\n" 107 | printf "Advanced parameters (use with care):\n" 108 | printf " -T template_config: Path to the template config file used in setup [default: ${TEMPLATE_CONFIG}]\n" 109 | printf " -U utils_dir: Path to the BackBLAST utility script directory [default: ${UTILS_DIR}]\n\n" 110 | printf "Note: Currently does NOT support whitespaces in any input variables.\n\n" 111 | 112 | # Exit 113 | exit 0 114 | fi 115 | 116 | # Set defaults for options 117 | local threads 118 | threads=1 119 | local phylogenetic_tree_newick 120 | phylogenetic_tree_newick="subjects" 121 | local bootstrap_cutoff 122 | bootstrap_cutoff="NA" 123 | local root_name 124 | root_name="NA" 125 | local evalue 126 | evalue=1e-40 127 | local pident 128 | pident=25 129 | local qcov 130 | qcov=50 131 | local genome_extension 132 | genome_extension="faa" 133 | local template_config 134 | template_config=${TEMPLATE_CONFIG} 135 | local utils_dir 136 | utils_dir=${UTILS_DIR} 137 | 138 | # Set options (help from https://wiki.bash-hackers.org/howto/getopts_tutorial; accessed March 8th, 2019) 139 | OPTIND=1 # reset the OPTIND counter just in case 140 | while getopts ":@:t:b:r:e:p:c:x:T:U:" opt; do 141 | case ${opt} in 142 | \@) 143 | 
threads=${OPTARG} 144 | ;; 145 | t) 146 | phylogenetic_tree_newick=${OPTARG} 147 | ;; 148 | b) 149 | bootstrap_cutoff=${OPTARG} 150 | ;; 151 | r) 152 | root_name=${OPTARG} 153 | ;; 154 | e) 155 | evalue=${OPTARG} 156 | ;; 157 | p) 158 | pident=${OPTARG} 159 | ;; 160 | c) 161 | qcov=${OPTARG} 162 | ;; 163 | x) 164 | genome_extension=${OPTARG} 165 | ;; 166 | T) 167 | template_config=${OPTARG} 168 | ;; 169 | U) 170 | utils_dir=${OPTARG} 171 | ;; 172 | \?) 173 | echo "[ $(date -u) ]: ERROR: Invalid option: '-${OPTARG}'. Exiting..." >&2 174 | exit 1 175 | ;; 176 | :) 177 | echo "[ $(date -u) ]: ERROR: argument needed following '-${OPTARG}'. Exiting..." >&2 178 | exit 1 179 | ;; 180 | esac 181 | done 182 | 183 | # Set positional arguments 184 | local original_arguments 185 | original_arguments=${@} # save for reporting later 186 | shift $((OPTIND - 1)) # shift to avoid flags when assigning positional arguments 187 | local query_filepath 188 | query_filepath=$1 189 | local query_genome_filepath 190 | query_genome_filepath=$2 191 | local subject_genome_directory 192 | subject_genome_directory=$3 193 | local output_directory 194 | output_directory=$4 195 | 196 | echo "[ $(date -u) ]: Running ${SCRIPT_NAME} in setup mode" >&2 197 | echo "[ $(date -u) ]: Command run: ${SCRIPT_NAME} setup ${original_arguments}" >&2 198 | 199 | # Make the template files for the run 200 | ${utils_dir}/generate_run_templates.sh \ 201 | -T ${template_config} \ 202 | -t ${phylogenetic_tree_newick} \ 203 | -b ${bootstrap_cutoff} \ 204 | -r ${root_name} \ 205 | -e ${evalue} \ 206 | -p ${pident} \ 207 | -c ${qcov} \ 208 | -x ${genome_extension} \ 209 | -@ ${threads} \ 210 | ${query_filepath} \ 211 | ${query_genome_filepath} \ 212 | ${subject_genome_directory} \ 213 | ${output_directory} 214 | 215 | echo "[ $(date -u) ]: Setup complete. After modifying the config.yaml file to your liking, run ${SCRIPT_NAME} using 'run' mode." >&2 216 | echo "[ $(date -u) ]: BackBLAST finished." 
>&2 217 | } 218 | 219 | ####################################### 220 | # Perform the 'run' command 221 | # Globals: 222 | # SCRIPT_NAME: the name of this script 223 | # VERSION: the script version 224 | # SNAKEFILE: the path to the Snakefile used to run BackBLAST 225 | # Arguments: 226 | # all command line inputs for the 'run' module - see help statement below 227 | # Returns: 228 | # runs the 'run' command end-to-end 229 | ####################################### 230 | function perform_run() { 231 | 232 | if [[ $# -eq 0 ]]; then 233 | echo "run: No arguments provided. Please run 'run -h' or 'run --help' for help. Exiting..." >&2 234 | exit 1 235 | elif [[ $1 = "-h" ]] || [[ $1 = "--help" ]]; then 236 | 237 | # Print help statement 238 | printf "${SCRIPT_NAME} run: module for initializing a BackBLAST run.\n" 239 | printf "Copyright Lee H. Bergstrand and Jackson M. Tsuji, Neufeld Research Group, 2024\n" 240 | printf "Version: ${VERSION}\n\n" 241 | printf "Usage: ${SCRIPT_NAME} run [OPTIONS] config_filepath run_directory [SNAKEMAKE_ARGUMENTS]\n\n" 242 | printf "Positional arguments (required):\n" 243 | printf " config_filepath: path to the config file generated using the 'setup' module\n" 244 | printf " run_directory: the directory where BackBLAST results ought to be output\n\n" 245 | printf "Optional arguments:\n" 246 | printf " -P conda_prefix: path to where the conda envs should be stored [default: '.snakemake/conda' in the run_directory]\n" 247 | printf " -j jobs: Number of processing threads available for the run [default: 1]\n" 248 | printf " **Should be no lower than the 'threads' setting in the config file**\n\n" 249 | printf "Advanced parameters (use with care):\n" 250 | printf " -S snakefile: Path to the Snakefile used to run BackBLAST [default: ${SNAKEFILE}]\n" 251 | printf " -C use_conda: specify either 'True' or 'False' for whether or not each job should be run in \n" 252 | printf " its own conda env [default: True]\n" 253 | printf " If 'False' is set,
then all dependencies need to be installed in the main environment\n" 254 | printf " where BackBLAST is running. Could be tricky.\n\n" 255 | printf "Snakemake arguments:\n" 256 | printf " Any flags added at the end of the command will be passed directly to snakemake, e.g., --notemp\n\n" 257 | printf "Note: Currently does NOT support whitespaces in any input variables.\n\n" 258 | 259 | # Exit 260 | exit 0 261 | fi 262 | 263 | # Set defaults for options 264 | local jobs 265 | jobs=1 266 | local conda_prefix 267 | conda_prefix=".snakemake/conda" 268 | local snakefile 269 | snakefile=${SNAKEFILE} 270 | local use_conda 271 | use_conda="True" 272 | 273 | # Set options (help from https://wiki.bash-hackers.org/howto/getopts_tutorial; accessed March 8th, 2019) 274 | OPTIND=1 # reset the OPTIND counter just in case 275 | while getopts ":j:P:S:C:" opt; do 276 | case ${opt} in 277 | j) 278 | jobs=${OPTARG} 279 | ;; 280 | P) 281 | conda_prefix=${OPTARG} 282 | ;; 283 | S) 284 | snakefile=${OPTARG} 285 | ;; 286 | C) 287 | use_conda=${OPTARG} 288 | ;; 289 | \?) 290 | echo "[ $(date -u) ]: ERROR: Invalid option: '-${OPTARG}'. Exiting..." >&2 291 | exit 1 292 | ;; 293 | :) 294 | echo "[ $(date -u) ]: ERROR: argument needed following '-${OPTARG}'. Exiting..." 
>&2 295 | exit 1 296 | ;; 297 | esac 298 | done 299 | 300 | # Set positional arguments 301 | local original_arguments 302 | original_arguments=${@} # save for reporting later 303 | shift $((OPTIND - 1)) # shift to avoid flags when assigning positional arguments 304 | local config_filepath 305 | config_filepath=$1 # config.yaml 306 | local run_directory 307 | run_directory=$2 308 | 309 | # Get Snakemake arguments; i.e., everything after argument 2 310 | local snakemake_arguments 311 | snakemake_arguments=${@:3} 312 | 313 | echo "[ $(date -u) ]: Running ${SCRIPT_NAME} in run mode" >&2 314 | echo "[ $(date -u) ]: Command run: ${SCRIPT_NAME} run ${original_arguments}" >&2 315 | 316 | # Start the run 317 | run_snakemake ${snakefile} ${config_filepath} ${run_directory} ${conda_prefix} ${jobs} ${use_conda} ${snakemake_arguments} 318 | 319 | echo "[ $(date -u) ]: BackBLAST finished." >&2 320 | } 321 | 322 | ####################################### 323 | # Perform the 'auto' command 324 | # Globals: 325 | # SCRIPT_NAME: the name of this script 326 | # VERSION: the script version 327 | # TEMPLATE_CONFIG: the path to the template config file used in setup (YAML format) 328 | # SNAKEFILE: the path to the Snakefile used to run BackBLAST 329 | # OPTIND and OPTARG: used in command line parsing as part of 'getopts' 330 | # Arguments: 331 | # all command line inputs for the 'auto' module - see help statement below 332 | # Returns: 333 | # runs the 'auto' command end-to-end (like running 'setup', then 'run') 334 | ####################################### 335 | # TODO - consider allowing the user to specify pre-made genome and gene metadata files 336 | function perform_auto() { 337 | if [[ $# -eq 0 ]]; then 338 | echo "auto: No arguments provided. Please run 'auto -h' or 'auto --help' for help. Exiting..." 
>&2 339 | exit 1 340 | elif [[ $1 = "-h" ]] || [[ $1 = "--help" ]]; then 341 | 342 | # Print help statement 343 | printf "${SCRIPT_NAME} auto: module for setting up a BackBLAST run AND starting with some defaults.\n" 344 | printf "Copyright Lee H. Bergstrand and Jackson M. Tsuji, Neufeld Research Group, 2024\n" 345 | printf "Version: ${VERSION}\n\n" 346 | printf "Usage: ${SCRIPT_NAME} auto [OPTIONS] query_filepath query_genome_filepath subject_genome_directory output_directory [SNAKEMAKE_ARGUMENTS]\n\n" 347 | printf "Positional arguments (required):\n" 348 | printf " query_filepath: path to the query predicted protein sequences from the query genome, FastA format\n" 349 | printf " query_genome_filepath: path to the predicted proteins of the entire query genome, FastA format\n" 350 | printf " subject_genome_directory: directory containing predicted proteins of all subject genomes (FastA format).\n" 351 | printf " One genome per file, with extension 'faa' (or specify -x)\n" 352 | printf " output_directory: directory where config ('config.yaml') and metadata templates ('genome_metadata.tsv', 'gene_metadata.tsv') will be created\n\n" 353 | printf "Optional arguments:\n" 354 | printf " -t phylogenetic_tree_newick: path to the pre-calculated phylogenetic tree [default: 'subjects' - auto-calculate tree]\n" 355 | printf " Specify 'NA' to skip tree generation and plot the heatmap alone.\n" 356 | printf " -b bootstrap_cutoff: numerical value (e.g., 80) under which bootstrap values will not be displayed [default: NA]\n" 357 | printf " -r root_name: Exact name of the tree tip label at the desired root of the tree [default: NA; will skip rooting]\n" 358 | printf " -e evalue: e-value cutoff for reciprocal BLASTP [default: 1e-40]\n" 359 | printf " -p pident: percent identity cutoff for reciprocal BLASTP [default: 25]\n" 360 | printf " -c qcov: percent query coverage cutoff for reciprocal BLASTP [default: 50]\n" 361 | printf " -x genome_extension: extension for predicted protein 
files of subject genomes [default: faa]\n" 362 | printf " -P conda_prefix: path to where the conda envs should be stored [default: '.snakemake/conda' in the run_directory]\n" 363 | printf " -@ threads: maximum threads to use for any process [default: 1]\n" 364 | printf " -j jobs: Number of processing threads available for the run [default: 1]\n" 365 | printf " **Should be no lower than the 'threads' setting in the config file**\n\n" 366 | printf "Advanced parameters (use with care):\n" 367 | printf " -T template_config: Path to the template config file used in setup [default: ${TEMPLATE_CONFIG}]\n" 368 | printf " -S snakefile: Path to the Snakefile used to run BackBLAST [default: ${SNAKEFILE}]\n" 369 | printf " -C use_conda: specify either 'True' or 'False' for whether or not each job should be run in \n" 370 | printf " its own conda env [default: True]\n" 371 | printf " If 'False' is set, then all dependencies need to be installed in the main environment\n" 372 | printf " where BackBLAST is running.
Could be tricky.\n\n" 373 | printf "Snakemake arguments:\n" 374 | printf " Any flags added at the end of the command will be passed directly to snakemake, e.g., --notemp\n\n" 375 | printf "Note: Currently does NOT support whitespaces in any input variables.\n\n" 376 | 377 | # Exit 378 | exit 0 379 | fi 380 | 381 | # Set defaults for options 382 | local jobs 383 | jobs=1 384 | local threads 385 | threads=1 386 | local conda_prefix 387 | conda_prefix=".snakemake/conda" 388 | local phylogenetic_tree_newick 389 | phylogenetic_tree_newick="subjects" 390 | local bootstrap_cutoff 391 | bootstrap_cutoff="NA" 392 | local root_name 393 | root_name="NA" 394 | local evalue 395 | evalue=1e-40 396 | local pident 397 | pident=25 398 | local qcov 399 | qcov=50 400 | local genome_extension 401 | genome_extension="faa" 402 | local template_config 403 | template_config=${TEMPLATE_CONFIG} 404 | local snakefile 405 | snakefile=${SNAKEFILE} 406 | local use_conda 407 | use_conda="True" 408 | 409 | # Set options (help from https://wiki.bash-hackers.org/howto/getopts_tutorial; accessed March 8th, 2019) 410 | OPTIND=1 # reset the OPTIND counter just in case 411 | while getopts ":j:@:P:t:b:r:e:p:c:x:T:S:C:" opt; do 412 | case ${opt} in 413 | j) 414 | jobs=${OPTARG} 415 | ;; 416 | \@) 417 | threads=${OPTARG} 418 | ;; 419 | P) 420 | conda_prefix=${OPTARG} 421 | ;; 422 | t) 423 | phylogenetic_tree_newick=${OPTARG} 424 | ;; 425 | b) 426 | bootstrap_cutoff=${OPTARG} 427 | ;; 428 | r) 429 | root_name=${OPTARG} 430 | ;; 431 | e) 432 | evalue=${OPTARG} 433 | ;; 434 | p) 435 | pident=${OPTARG} 436 | ;; 437 | c) 438 | qcov=${OPTARG} 439 | ;; 440 | x) 441 | genome_extension=${OPTARG} 442 | ;; 443 | T) 444 | template_config=${OPTARG} 445 | ;; 446 | S) 447 | snakefile=${OPTARG} 448 | ;; 449 | C) 450 | use_conda=${OPTARG} 451 | ;; 452 | \?) 453 | echo "[ $(date -u) ]: ERROR: Invalid option: '-${OPTARG}'. Exiting..." 
>&2 454 | exit 1 455 | ;; 456 | :) 457 | echo "[ $(date -u) ]: ERROR: argument needed following '-${OPTARG}'. Exiting..." >&2 458 | exit 1 459 | ;; 460 | esac 461 | done 462 | 463 | # Set positional arguments 464 | local original_arguments 465 | original_arguments=${@} # save for reporting later 466 | shift $((OPTIND - 1)) # shift to avoid flags when assigning positional arguments 467 | local query_filepath 468 | query_filepath=$1 469 | local query_genome_filepath 470 | query_genome_filepath=$2 471 | local subject_genome_directory 472 | subject_genome_directory=$3 473 | local output_directory 474 | output_directory=$4 475 | 476 | # Get Snakemake arguments; i.e., everything after argument 4 477 | local snakemake_arguments 478 | snakemake_arguments=${@:5} 479 | 480 | echo "[ $(date -u) ]: Running ${SCRIPT_NAME} in 'auto' mode" >&2 481 | echo "[ $(date -u) ]: Command run: ${SCRIPT_NAME} setup ${original_arguments}" >&2 482 | 483 | # Make the template files for the run 484 | ${utils_dir}/generate_run_templates.sh \ 485 | -T ${template_config} \ 486 | -t ${phylogenetic_tree_newick} \ 487 | -b ${bootstrap_cutoff} \ 488 | -r ${root_name} \ 489 | -e ${evalue} \ 490 | -p ${pident} \ 491 | -c ${qcov} \ 492 | -x ${genome_extension} \ 493 | -@ ${threads} \ 494 | ${query_filepath} \ 495 | ${query_genome_filepath} \ 496 | ${subject_genome_directory} \ 497 | ${output_directory} 498 | 499 | # Change metadata files to NA for default run 500 | local output_config_filepath 501 | output_config_filepath=${output_directory}/config.yaml 502 | sed -i "s|^genome_metadata_tsv: .*|genome_metadata_tsv: NA|" ${output_config_filepath} 503 | sed -i "s|^gene_metadata_tsv: .*|gene_metadata_tsv: NA|" ${output_config_filepath} 504 | 505 | # Start the run 506 | echo "[ $(date -u) ]: Default setup finished; starting pipeline" >&2 507 | run_snakemake ${snakefile} ${output_config_filepath} ${output_directory} ${conda_prefix} ${jobs} ${use_conda} \ 508 | ${snakemake_arguments} 509 | 510 | echo "[ $(date 
-u) ]: BackBLAST finished." >&2 511 | } 512 | 513 | function main() { 514 | # If no input is provided, provide help and exit 515 | if [[ $# -eq 0 ]]; then 516 | echo "No arguments provided. Please run '-h' or '--help' to see help. Exiting..." >&2 517 | exit 1 518 | elif [[ $1 = "-h" ]] || [[ $1 = "--help" ]]; then 519 | 520 | # Help statement 521 | printf "${SCRIPT_NAME}: pipeline to search for and visualize gene homologs across multiple genomes.\n" 522 | printf "Copyright Lee H. Bergstrand and Jackson M. Tsuji, Neufeld Research Group, ${COPYRIGHT_YEAR}\n" 523 | printf "Version: ${VERSION}\n\n" 524 | printf "Please specify a run mode for the main workflow for further usage instructions:\n" 525 | printf " 1. setup: for setting up pre-run configuration files\n" 526 | printf " 2. run: to start a run using configuration files\n" 527 | printf " 3. auto: to skip setup and run the pipeline end-to-end with default values\n\n" 528 | printf "Or run a specific step in the workflow manually:\n" 529 | printf " search: performs reciprocal BLASTP on given inputs\n" 530 | printf " remove_duplicates: removes duplicate hits from reciprocal BLASTP results\n" 531 | printf " create_blank_results: creates a blank output BLAST table for entries with no reciprocal BLASTP hits\n" 532 | printf " combine_tables: combines a set of input BLAST tables\n" 533 | printf " generate_heatmap: generates a heatmap from input BLAST and phylogeny data\n\n" 534 | printf "Advanced parameters (use with care):\n" 535 | printf " -U utils_dir: Path to the BackBLAST utility script directory for running specific workflow steps\n" 536 | printf " [default: ${UTILS_DIR}]\n\n" 537 | 538 | # Exit 539 | exit 0 540 | fi 541 | 542 | # Set defaults for options 543 | local utils_dir 544 | utils_dir="${UTILS_DIR}" 545 | 546 | # Set options (help from https://wiki.bash-hackers.org/howto/getopts_tutorial; accessed March 8th, 2019) 547 | OPTIND=1 # reset the OPTIND counter just in case 548 | while getopts ":U:" opt; do 549 | 
case ${opt} in 550 | U) 551 | utils_dir=${OPTARG} 552 | ;; 553 | \?) 554 | echo "[ $(date -u) ]: ERROR: Invalid option: '-${OPTARG}'. Exiting..." >&2 555 | exit 1 556 | ;; 557 | :) 558 | echo "[ $(date -u) ]: ERROR: argument needed following '-${OPTARG}'. Exiting..." >&2 559 | exit 1 560 | ;; 561 | esac 562 | done 563 | 564 | # Set positional arguments 565 | local original_arguments 566 | original_arguments=${@} # save for reporting later 567 | shift $((OPTIND - 1)) # shift to avoid flags when assigning positional arguments 568 | local run_mode 569 | run_mode=$1 570 | 571 | # Determine the run mode based on user input 572 | # Different variables are needed for the different run modes 573 | # Only pass the second and upward arguments via ${@:2} - see https://stackoverflow.com/a/9057392 (accessed July 31, 2019) 574 | if [[ ${run_mode} = "setup" ]]; then 575 | perform_setup ${@:2} 576 | elif [[ ${run_mode} = "run" ]]; then 577 | perform_run ${@:2} 578 | elif [[ ${run_mode} = "auto" ]]; then 579 | perform_auto ${@:2} 580 | elif [[ ${run_mode} = "search" ]]; then 581 | "${utils_dir}/search.py" ${@:2} 582 | elif [[ ${run_mode} = "remove_duplicates" ]]; then 583 | "${utils_dir}/remove_duplicates.sh" ${@:2} 584 | elif [[ ${run_mode} = "create_blank_results" ]]; then 585 | "${utils_dir}/create_blank_results.py" ${@:2} 586 | elif [[ ${run_mode} = "combine_tables" ]]; then 587 | "${utils_dir}/combine_tables.R" ${@:2} 588 | elif [[ ${run_mode} = "generate_heatmap" ]]; then 589 | "${utils_dir}/generate_heatmap.R" ${@:2} 590 | else 591 | echo "Provided run mode ('${run_mode}') does not match those available. Exiting..." 
>&2 592 | exit 1 593 | fi 594 | 595 | } 596 | 597 | # Only run the script if it is called from the command line 598 | if [[ ${BASH_SOURCE[0]} = ${0} ]]; then 599 | main $@ 600 | fi 601 | -------------------------------------------------------------------------------- /scripts/generate_heatmap.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | # generate_heatmap.R 3 | # Copyright Lee H. Bergstrand and Jackson M. Tsuji, 2025 4 | # Plots a newick treefile and BLAST table together as a phylogenetic tree and heatmap 5 | # Part of the BackBLAST pipeline 6 | 7 | # Load libraries 8 | library(conflicted) 9 | library(argparser) 10 | library(futile.logger) 11 | library(tools) 12 | library(glue) 13 | library(plyr) 14 | library(dplyr) 15 | library(tibble) 16 | library(reshape2) 17 | library(RColorBrewer) 18 | library(ggplot2) 19 | library(ape) 20 | library(maps) 21 | library(phytools) 22 | library(tidytree) 23 | suppressPackageStartupMessages(library(treeio)) 24 | suppressPackageStartupMessages(library(ggtree)) 25 | library(gridExtra) 26 | library(egg) 27 | 28 | #' Reads a data table as a tibble with several default parameters. All parameters below are the same as read.table() 29 | #' 30 | #' @param file Filepath to data table 31 | #' @param sep Field separator character 32 | #' @param header Logical; does the table have headers as the first row? 33 | #' @param stringsAsFactors Logical; should character vectors be converted to factors? 
#' @return Tibble of the data table
#' @export
read_tibble <- function(file, sep = "\t", header = TRUE, stringsAsFactors = FALSE) {
  # read.table() does the parsing; the result is only re-wrapped as a tibble
  data_table <- read.table(file, sep = sep, header = header, stringsAsFactors = stringsAsFactors) %>%
    tibble::as_tibble()
  return(data_table)
}

#' Convert character "NA" (from command line) into constant NA
#'
#' Safe to map over a whole parameter list: anything that is not a single, non-missing value
#' equal to "NA" (e.g., NULL, a longer vector, or a real NA) is returned untouched.
#'
#' @param entry single-length vector of any type
#' @return single-length vector; if it was "NA", it will now be NA; otherwise it will be the same as input
#' @export
convert_to_constant_NA <- function(entry) {
  # The length check must come first so that `if` never sees a zero-length or multi-length condition
  if (length(entry) == 1 && !is.na(entry) && entry == "NA") {
    futile.logger::flog.debug("Converting 'NA' input to constant NA value")
    return(NA)
  }

  return(entry)
}

#' Re-root a ggtree
#'
#' @param phylo_tree ggtree-format tree
#' @param root_name Character (length 1 vector); the exact name of the to-be root,
#'                  or "midpoint" to midpoint root the tree
#' @return ggtree-format tree, rooted
#' @export
reroot_ggtree <- function(phylo_tree, root_name) {

  # Midpoint root the tree if requested; otherwise root to a name
  if (root_name == "midpoint") {
    futile.logger::flog.info("Midpoint rooting the tree")
    tree_rooted <- phytools::midpoint.root(phylo_tree)
  } else {
    futile.logger::flog.info(glue::glue("Re-rooting tree to '", root_name, "'"))
    # Extract the data from the tree in tabular format
    tree_data <- ggtree::ggtree(phylo_tree)$data

    # Check that the root_name exists among the tree labels; the script exits if it does not
    if (!(root_name %in% tree_data$label)) {
      futile.logger::flog.error(glue::glue("Could not find the root_name '", root_name,
                                           "' in the provided tree. Cannot re-root. Exiting..."))
      quit(save = "no", status = 1)
    }

    # Re-root
    tree_rooted <- treeio::root(phylo_tree, outgroup = root_name)
  }

  return(tree_rooted)
}

#' Extracts tree label (bootstrap) data and applies bootstrap cutoff if desired
#'
#' @param phylo_tree ggtree-format tree
#' @param bootstrap_cutoff Numeric (length 1 vector); the minimum bootstrap cutoff, in percent. NA applies no cutoff.
#' @return extracted tree data as a data frame with labels adjusted to only contain bootstraps (numeric) (optionally) above bootstrap cutoff for branches.
#'         To be mapped onto the main tree during plotting.
#' @export
generate_bootstrap_labels <- function(phylo_tree, bootstrap_cutoff) {
  # For adding bootstrap labels like this, see https://guangchuangyu.github.io/software/ggtree/faq/ (accessed Sept 14, 2018)
  # Note that 'labels' encompasses tip labels and node labels; the tip labels are filtered out below.

  # Extract info from the tree into data frame format
  bootstrap_label_data <- ggtree::ggtree(phylo_tree)$data

  # Set the labels to numeric (instead of character); anything non-numeric becomes NA.
  # Only the coercion itself is wrapped in suppressWarnings(), so the expected
  # "NAs introduced by coercion" warning is silenced while other problems stay visible.
  bootstrap_label_data$label <- suppressWarnings(as.numeric(bootstrap_label_data$label))

  # Filter out tip labels and 'NA' labels
  bootstrap_label_data <- dplyr::filter(bootstrap_label_data, !is.na(label) & isTip == FALSE)

  # Set bootstrap cutoff if desired
  if (!is.na(bootstrap_cutoff)) {
    futile.logger::flog.info(glue::glue("Applying bootstrap display cutoff of ", bootstrap_cutoff))
    bootstrap_label_data <- dplyr::filter(bootstrap_label_data, label >= bootstrap_cutoff)
  }

  # Return whole tree data frame (helps for mapping the labels onto the existing tree later)
  return(bootstrap_label_data)
}

#' Makes an initial plot of the tree with overlaid bootstrap labels
#'
#' @param phylo_tree ggtree-format tree
#' @param bootstrap_label_data data frame extracted from tree and possibly modified, generated in 'generate_bootstrap_labels'
#' @return ggtree plot
#' @export
plot_ggtree <- function(phylo_tree, bootstrap_label_data) {

  tree_plot <- ggtree::ggtree(phylo_tree, size = 1, colour = "black", ladderize = TRUE,
                              branch.length = 0.1) +

    # Add the bootstrap labels from the external file
    geom_text(data = bootstrap_label_data, aes(label = label), nudge_x = -0.035, nudge_y = 0.35, size = 3) +

    # Add the dotted lines from the tree tips
    geom_tiplab(align = TRUE, linetype = "dotted", size = 0.1, offset = 0.1) +

    # Add scale bar
    geom_treescale(x = 0, y = 1.5, linesize = 1, fontsize = 3, offset = 0.1) +

    # Manually tune the y-axis boundaries to match the heatmap: https://stackoverflow.com/a/18718196 (accessed Sept. 15, 2018)
    # TODO - this might need to be a function of the number of entries
    scale_y_discrete(expand = c(0, 0.6)) +

    ## Manually set the righthand cutoff for the tree
    xlim_tree(1)
    # xlim etc.: https://guangchuangyu.github.io/2016/10/xlim_tree-set-x-axis-limits-for-only-tree-panel/ (accessed Sept 15, 2018)

  return(tree_plot)
}

#' Master function to load and plot the phylogenetic tree
#'
#' @param input_phylogenetic_tree_filepath Character (length 1 vector); the filepath to the phylogenetic tree
#' @param root_name Character (length 1 vector); the exact name of the to-be root ('midpoint' for midpoint rooting; NA to skip re-rooting)
#' @param bootstrap_cutoff Numeric (length 1 vector); the minimum bootstrap cutoff, in percent (NA for no cutoff)
#' @return list of two: 'phylo_tree' - ggtree unplotted object ; 'phylo_tree_fig' - ggtree figure
#' @export
load_and_plot_phylogenetic_tree <- function(input_phylogenetic_tree_filepath, root_name, bootstrap_cutoff) {
  # Read tree
  futile.logger::flog.info("Reading input phylogenetic tree")
  phylo_tree <- treeio::read.tree(input_phylogenetic_tree_filepath)

  # Optionally re-root tree
  if (!is.na(root_name)) {
    phylo_tree <- reroot_ggtree(phylo_tree, root_name)
  }

  # Set cutoff for bootstraps externally, to be overlaid onto the tree figure later
  # No cutoff is applied if bootstrap_cutoff is set to 'NA'
  futile.logger::flog.debug("Generating bootstrap labels")
  bootstrap_label_data <- generate_bootstrap_labels(phylo_tree, bootstrap_cutoff)

  # Generate tree plot
  futile.logger::flog.info("Generating ggtree plot")
  phylo_tree_fig <- plot_ggtree(phylo_tree, bootstrap_label_data)

  # Make list to return to user (element order is kept: tree first, figure second)
  tree_list <- list(phylo_tree = phylo_tree, phylo_tree_fig = phylo_tree_fig)

  return(tree_list)
}

#' Loads BLAST table and checks for
#' expected column names (HARD-CODED in function)
#'
#' @param input_blast_table_filepath Character (length 1 vector); path to the comma-separated BLAST table
#' @return tibble of BLAST data
#' @export
read_blast_results <- function(input_blast_table_filepath) {

  # HARD-CODED header names the combined BLAST table must carry, in this exact order
  required_columns <- c("subject_name", "qseqid", "sseqid", "pident", "evalue", "qcovhsp", "bitscore")

  blast_table <- read_tibble(input_blast_table_filepath, sep = ",")
  observed_columns <- colnames(blast_table)

  # Happy path first: headers match exactly, so hand the table back
  if (identical(observed_columns, required_columns)) {
    return(blast_table)
  }

  # Otherwise report what was expected versus what was found, then exit the script
  futile.logger::flog.error(glue::glue("BLAST data table did not have expected column names ('",
                                       glue::glue_collapse(required_columns, sep = "; "),
                                       "'). Instead, had: '", glue::glue_collapse(observed_columns, sep = "; "),
                                       "'. Exiting..."))
  quit(save = "no", status = 1)
}

#' Re-orders subject_name in the BLAST table so that it follows the ggtree tip order
#'
#' @param blast_results Tibble output of read_blast_results
#' @param tip_order Character vector of the exact order of the tip labels in the phylogenetic tree
#' @return tibble of BLAST data with subject_name as an ordered factor (levels are the reversed tip order),
#'         reduced to the subjects present in the phylogenetic tree (if needed)
#' @export
order_blast_subjects <- function(blast_results, tip_order) {

  table_subjects <- unique(blast_results$subject_name)
  subjects_match_tree <- identical(sort(table_subjects), sort(tip_order))

  # Only reconcile the two name sets when they differ
  if (!subjects_match_tree) {

    # Every tree tip must exist in the BLAST table; a tip without BLAST data is fatal
    tips_without_blast_data <- setdiff(tip_order, table_subjects)
    if (length(tips_without_blast_data) > 0) {
      futile.logger::flog.error(glue::glue("The provided BLAST table is missing some entries in the phylogenetic tree: '",
                                           glue::glue_collapse(tips_without_blast_data, sep = ", "),
                                           "'. Cannot continue -- exiting..."))
      quit(save = "no", status = 1)
    }

    # Subjects that are in the BLAST table but not in the tree are reported and dropped
    subjects_without_tip <- setdiff(table_subjects, tip_order)
    futile.logger::flog.info(glue::glue("Some entries in the BLAST table are missing in the phylogenetic tree ",
                                        "and will be removed in plotting: '",
                                        glue::glue_collapse(subjects_without_tip, sep = ", "), "'."))

    blast_results <- dplyr::filter(blast_results, subject_name %in% tip_order)
  }

  # Encode the tree order in the factor levels (reversed relative to tip_order)
  tree_levels <- rev(tip_order)
  blast_results$subject_name <- factor(blast_results$subject_name, levels = tree_levels, ordered = TRUE)

  return(blast_results)
}

#' Overlays user-given genome names for the heatmap y-axis
#'
#' @param blast_results Tibble output of read_blast_results
#' @param genome_metadata_filepath Character (length 1 vector); the filepath of the tab-separated genome metadata file.
#'                                 Must at least have the columns 'subject_name' and 'plotting_name' as the first and second columns, respectively.
#' @return tibble of BLAST data with subject_name renamed with the user-desired values
#' @export
overlay_genome_naming <- function(blast_results, genome_metadata_filepath) {

  # Load the metadata table
  futile.logger::flog.info("Loading genome metadata")
  genome_metadata_table <- read_tibble(genome_metadata_filepath)

  # Check that the column names match expected (for the first two columns; doesn't matter after that)
  # HARD-CODED
  genome_metadata_table_expected_headers <- c("subject_name", "plotting_name")

  if (!identical(colnames(genome_metadata_table)[1:2], genome_metadata_table_expected_headers)) {
    futile.logger::flog.error(glue::glue("The first two columns of the genome metadata table should be: ",
                                         glue::glue_collapse(genome_metadata_table_expected_headers, sep = ", "),
                                         ". However, you provided something else: ",
                                         glue::glue_collapse(colnames(genome_metadata_table)[1:2], sep = ", "), ". Exiting..."))
    quit(save = "no", status = 1)
  }

  # Reduce the metadata down to just the expected headers
  # (plain column indexing avoids passing a bare external vector to dplyr::select)
  genome_metadata_table <- genome_metadata_table[, genome_metadata_table_expected_headers]

  # Compare names as character vectors: subject_name in the BLAST table may already be an ordered factor
  # (set by order_blast_subjects), which would never be identical() to the character metadata column
  blast_subject_names <- unique(as.character(blast_results$subject_name))
  metadata_subject_names <- as.character(genome_metadata_table$subject_name)

  # All subject_name's in the BLAST table must be contained in the metadata; anything missing is a major issue
  missing_metadata_entries <- setdiff(blast_subject_names, metadata_subject_names)
  if (length(missing_metadata_entries) > 0) {
    futile.logger::flog.error(glue::glue("The provided genome metadata file is missing some entries in the BLAST table: '",
                                         glue::glue_collapse(missing_metadata_entries, sep = ", "),
                                         "'. Cannot continue -- exiting..."))
    quit(save = "no", status = 1)
  }

  # Report to the user which entries in the metadata are extra and will be ignored (only if there are any)
  extra_metadata_entries <- setdiff(metadata_subject_names, blast_subject_names)
  if (length(extra_metadata_entries) > 0) {
    futile.logger::flog.info(glue::glue("Some entries in the genome metadata are missing in the BLAST table ",
                                        "and will be removed in plotting: '",
                                        glue::glue_collapse(extra_metadata_entries, sep = ", "), "'."))

    # Filter down the metadata to match the BLAST table entries
    genome_metadata_table <- dplyr::filter(genome_metadata_table, subject_name %in% blast_subject_names)
  }

  # Change the subject_name to be the plotting_name in the BLAST table
  blast_results$subject_name <- plyr::mapvalues(blast_results$subject_name, from = genome_metadata_table$subject_name,
                                                to = genome_metadata_table$plotting_name, warn_missing = TRUE)

  return(blast_results)
}

#' Overlays user-given gene names for the heatmap x-axis
#'
#' @param blast_results Tibble output of read_blast_results
#' @param gene_metadata_filepath Character (length 1 vector); the filepath of the tab-separated gene metadata file.
#'                               Must at least have the columns 'qseqid' and 'gene_name' as the first and second columns, respectively.
#' @return tibble of BLAST data with qseqid renamed with the user-desired values
#' @export
overlay_gene_naming <- function(blast_results, gene_metadata_filepath) {

  # Load gene naming table
  gene_metadata_table <- read_tibble(gene_metadata_filepath)

  # Check that the column names match expected (for the first two columns; doesn't matter after that)
  # HARD-CODED
  gene_metadata_table_expected_headers <- c("qseqid", "gene_name")

  if (!identical(colnames(gene_metadata_table)[1:2], gene_metadata_table_expected_headers)) {
    futile.logger::flog.error(glue::glue("The first two columns of the gene table should be: ",
                                         glue::glue_collapse(gene_metadata_table_expected_headers, sep = ", "),
                                         ". However, you provided something else: ",
                                         glue::glue_collapse(colnames(gene_metadata_table)[1:2], sep = ", "), ". Exiting..."))
    quit(save = "no", status = 1)
  }

  # Remove any entries from the BLAST table that are missing in the metadata
  missing_metadata_entries <- setdiff(unique(blast_results$qseqid), gene_metadata_table$qseqid)
  if (length(missing_metadata_entries) > 0) {
    futile.logger::flog.warn(glue::glue("The provided gene metadata file is missing some entries in the BLAST table: '",
                                        glue::glue_collapse(missing_metadata_entries, sep = ", "),
                                        "'. These entries will be REMOVED from the BLAST table when plotting."))
    blast_results <- dplyr::filter(blast_results, !(qseqid %in% missing_metadata_entries))
  }

  # Remove any entries in the metadata that are missing in the (now possibly reduced) BLAST table
  extra_metadata_qseqids <- setdiff(gene_metadata_table$qseqid, unique(blast_results$qseqid))
  if (length(extra_metadata_qseqids) > 0) {
    extra_metadata_entries <- dplyr::filter(gene_metadata_table, qseqid %in% extra_metadata_qseqids)

    # Report each dropped entry as 'qseqid (gene_name)'
    user_reporting <- paste0(extra_metadata_entries$qseqid, " (", extra_metadata_entries$gene_name, ")")
    futile.logger::flog.info(glue::glue("Some entries in the gene metadata are missing in the BLAST table ",
                                        "and will be removed in plotting: '",
                                        glue::glue_collapse(user_reporting, sep = ", "), "'."))
    gene_metadata_table <- dplyr::filter(gene_metadata_table, qseqid %in% unique(blast_results$qseqid))
  }

  # Change qseqid to gene_name and order according to the gene_metadata_table
  blast_results$qseqid <- plyr::mapvalues(x = blast_results$qseqid, from = gene_metadata_table$qseqid,
                                          to = gene_metadata_table$gene_name)
  blast_results$qseqid <- factor(blast_results$qseqid, levels = gene_metadata_table$gene_name, ordered = TRUE)

  return(blast_results)
}

#' Plots the BLAST table as a heatmap in ggplot
#'
#' @param blast_results Tibble output of read_blast_results; the columns 'subject_name', 'qseqid',
#'                      and 'pident' are the ones used for plotting
#' @return ggplot heatmap
#' @export
plot_blast_heatmap <- function(blast_results) {

  # Just to be safe, only keep the top percent identity value for a query if there are multiple hits.
  # The table MUST be grouped by subject and query first: ranking the ungrouped table would keep only
  # the rows holding the single highest pident of the entire table.
  # row_number() (rather than a tie-keeping rank) guarantees at most one row per subject/query pair, so
  # dcast() below never has to aggregate several values into one cell. Rows with NA pident are dropped.
  blast_results_drop_duplicates <- blast_results %>%
    dplyr::group_by(subject_name, qseqid) %>%
    dplyr::filter(dplyr::row_number(dplyr::desc(pident)) == 1) %>%
    dplyr::ungroup()

  # Add NA values for missing grid values so that grid lines will appear in the final plot
  # TODO - A bit hacky
  blast_results_drop_duplicates <- reshape2::dcast(blast_results_drop_duplicates, subject_name ~ qseqid,
                                                   value.var = "pident") %>%
    reshape2::melt(na.rm = FALSE, id.vars = c("subject_name"), variable.name = "qseqid",
                   value.name = "pident") %>%
    tibble::as_tibble()

  # NOTE(review): newer ggplot2 releases prefer 'linewidth' over 'size' for line/rect elements;
  # 'size' is kept here so the installed ggplot2 version requirement does not change -- confirm before switching
  blast_heatmap <- ggplot2::ggplot(blast_results_drop_duplicates, aes(y = subject_name, x = qseqid)) +
    geom_tile(aes(fill = pident), colour = "black") +
    theme_bw() +
    theme(panel.grid = element_blank(), axis.title = element_text(size = 12),
          panel.border = element_rect(colour = "black", size = 1),
          axis.text = element_text(size = 10, colour = "black"),
          axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1),
          axis.ticks = element_line(size = 0.5), axis.line = element_line(colour = "black", size = 0.5),
          legend.text = element_text(size = 10, colour = "black"),
          legend.title = element_text(size = 10, face = "bold"),
          legend.key = element_rect(colour = "transparent")) +
    guides(fill = guide_legend(title = "Amino acid \nidentity (%)")) +
    scale_fill_gradientn(colours = RColorBrewer::brewer.pal(name = "Blues", n = 5),
                         na.value = "transparent") +
    xlab(NULL) +
    ylab(NULL)

  return(blast_heatmap)
}

#' Master function to load and plot the BLAST table metadata to produce a heatmap
#'
#' @param input_blast_table_filepath Character (length 1 vector); the filepath to the BLAST table (comma-separated)
#' @param tip_order Character vector of the exact order of the tip labels in the phylogenetic tree; provide 'NA' to skip alignment
#' @param genome_metadata_filepath Character (length 1 vector); the filepath of the tab-separated genome metadata file.
#'                                 Must at least have the columns 'subject_name' and 'plotting_name' as the first and second columns, respectively.
#' @param gene_metadata_filepath Character (length 1 vector); the filepath of the tab-separated gene metadata file.
#'                               Must at least have the columns 'qseqid' and 'gene_name' as the first and second columns, respectively.
#' @return list of two: 'blast_results' data frame; 'blast_heatmap' ggplot object
#' @export
load_and_plot_blast_results <- function(input_blast_table_filepath, tip_order = NA, gene_metadata_filepath,
                                        genome_metadata_filepath) {
  # Step 1: read the combined BLAST table
  futile.logger::flog.info("Loading the BLAST table")
  blast_table <- read_blast_results(input_blast_table_filepath)

  # Step 2: line the subjects up with the tree tips, unless no tip order was supplied
  skip_tree_alignment <- is.na(tip_order[1])
  if (skip_tree_alignment) {
    futile.logger::flog.debug("Skipping alignment of BLAST table to phylogenetic tree")
  } else {
    futile.logger::flog.debug("Aligning BLAST table's subject names to match order of the ggtree")
    blast_table <- order_blast_subjects(blast_table, tip_order)
  }

  # Step 3: swap in user-facing genome names when a genome metadata file is given
  if (!is.na(genome_metadata_filepath)) {
    futile.logger::flog.info("Overlaying genome naming and ordering onto the BLAST table")
    blast_table <- overlay_genome_naming(blast_table, genome_metadata_filepath)
  }

  # Step 4: swap in user-facing gene names (and their order) when a gene metadata file is given
  if (!is.na(gene_metadata_filepath)) {
    futile.logger::flog.info("Overlaying gene naming and ordering onto the BLAST table")
    blast_table <- overlay_gene_naming(blast_table, gene_metadata_filepath)
  }

  # Step 5: draw the heatmap from the finished table
  futile.logger::flog.info("Plotting BLAST heatmap")
  heatmap_plot <- plot_blast_heatmap(blast_table)

  # Table first, plot second
  return(list(blast_results = blast_table, blast_heatmap = heatmap_plot))
}

#' Runs the full workflow: optional tree plot, BLAST heatmap, optional data export, and PDF output
#'
#' @param params Named list of settings as produced by argparser::parse_args() in the entry section below;
#'               character "NA" values are converted to constant NA before use
#' @return Called for its side effects (writes the PDF and, if requested, a .tsv next to it)
main <- function(params) {
  # Startup messages
  futile.logger::flog.info("Running generate_heatmap.R")
  futile.logger::flog.info("######### Settings #########")
  futile.logger::flog.info(glue::glue("Input phylogenetic tree filepath (ignored if 'NA'): ", params$input_phylogenetic_tree_filepath))
  futile.logger::flog.info(glue::glue("Input BLAST table filepath: ", params$input_blast_table_filepath))
  futile.logger::flog.info(glue::glue("Output PDF filepath: ", params$output_pdf_filepath))
  futile.logger::flog.info(glue::glue("Bootstrap display cutoff (%; ignored if 'NA'): ", params$bootstrap_cutoff))
  futile.logger::flog.info(glue::glue("Root name (ignored if 'NA'): ", params$root_name))
  futile.logger::flog.info(glue::glue("Input genome metadata filepath (ignored if 'NA'): ",
                                      params$genome_metadata_filepath))
  futile.logger::flog.info(glue::glue("Input gene metadata filepath (ignored if 'NA'): ",
                                      params$gene_metadata_filepath))
  futile.logger::flog.info(glue::glue("Plot width (mm): ", params$plot_width))
  futile.logger::flog.info(glue::glue("Plot height (mm): ", params$plot_height))
  futile.logger::flog.info(glue::glue("Write data table: ", params$write_data))
  futile.logger::flog.info("############################")

  # Convert character "NA" (from command line) into true NA
  params <- lapply(params, convert_to_constant_NA)

  # The tree is optional; remember the decision once so both the loading and saving steps agree
  plot_tree <- !is.na(params$input_phylogenetic_tree_filepath)

  # Load and plot the tree
  if (plot_tree) {
    phylo_tree_list <- load_and_plot_phylogenetic_tree(params$input_phylogenetic_tree_filepath,
                                                       params$root_name, params$bootstrap_cutoff)

    # Get tip order of the tree, to match with heatmap later
    # Based on https://groups.google.com/forum/#!topic/bioc-ggtree/LqRDK78m3U4 (accessed Sept. 15, 2018)
    futile.logger::flog.debug("Exporting tip order of tree to correspond with heatmap")
    tip_data <- dplyr::filter(ggtree::ggtree(phylo_tree_list[[1]])$data, isTip == TRUE)
    tip_order <- tip_data[order(tip_data$y, decreasing = TRUE), ]$label # plotting_name
  } else {
    futile.logger::flog.info("Skipping plotting phylogenetic tree")
    tip_order <- NA
  }

  # Load and plot the BLAST table as a heatmap
  blast_results_list <- load_and_plot_blast_results(params$input_blast_table_filepath,
                                                    tip_order, params$gene_metadata_filepath,
                                                    params$genome_metadata_filepath)

  # Save the heatmap data
  if (isTRUE(params$write_data)) {
    futile.logger::flog.info("Saving raw heatmap data to file")
    output_table_filepath <- paste0(tools::file_path_sans_ext(params$output_pdf_filepath), ".tsv")
    write.table(blast_results_list[[1]], file = output_table_filepath, sep = "\t",
                col.names = TRUE, row.names = FALSE, quote = FALSE)
  }

  # Decide what goes into the PDF
  if (plot_tree) {
    # Combine the tree and heatmap
    # Got ggarrange ideas from https://cran.r-project.org/web/packages/egg/vignettes/Ecosystem.html (accessed Sept. 15, 2018)
    futile.logger::flog.info("Combining the ggtree and the heatmap")
    # draw = FALSE: only build the combined object here. Drawing at this point would happen before the
    # PDF device below is opened, i.e., on whatever default device R picks in a non-interactive session.
    output_plot <- egg::ggarrange(phylo_tree_list[[2]], blast_results_list[[2]],
                                  nrow = 1, widths = c(1, 1.5), heights = c(1), padding = grid::unit(0, "mm"),
                                  draw = FALSE)
    futile.logger::flog.info("Saving to PDF")
  } else {
    # Save heatmap alone
    futile.logger::flog.info("Saving heatmap to PDF")
    output_plot <- blast_results_list[[2]]
  }

  # Print a PDF of the plot
  # N.B., dimensions need to be input in inches (25.4 mm per inch)
  pdf(file = params$output_pdf_filepath, width = params$plot_width / 25.4,
      height = params$plot_height / 25.4)
  # Always close the device, even if printing fails, so the file handle is released
  tryCatch(print(output_plot), finally = dev.off())

  futile.logger::flog.info("generate_heatmap.R: done.")
}

# Only parse the command line and run when called as a script (not when sourced interactively)
if (!interactive()) {
  parser <- argparser::arg_parser(
    description = glue::glue("generate_heatmap.R: Binds a phylogenetic tree to a BLAST table heatmap.
                              Copyright Lee H. Bergstrand and Jackson M. Tsuji, 2025."))

  # Add required args
  parser <- argparser::add_argument(parser = parser, arg = "input_phylogenetic_tree_filepath",
                                    help = "Input phylogenetic tree filepath (set to 'NA' to plot heatmap only)",
                                    type = "character", default = NULL)
  parser <- argparser::add_argument(parser = parser, arg = "input_blast_table_filepath",
                                    help = "Input BLAST table filepath",
                                    type = "character", default = NULL)
  parser <- argparser::add_argument(parser = parser, arg = "output_pdf_filepath",
                                    help = "Output PDF filepath",
                                    type = "character", default = NULL)

  # Add optional args (set to 'NA' to ignore)
  parser <- argparser::add_argument(parser = parser, arg = "--genome_metadata_filepath", short = "-m",
                                    help = "Genome metadata filepath",
                                    type = "character", default = NA)
  parser <- argparser::add_argument(parser = parser, arg = "--gene_metadata_filepath", short = "-g",
                                    help = "Gene metadata filepath",
                                    type = "character", default = NA)
  parser <- argparser::add_argument(parser = parser, arg = "--bootstrap_cutoff", short = "-b",
                                    help = "Bootstrap cutoff value",
                                    type = "numeric", default = NA)
  parser <- argparser::add_argument(parser = parser, arg = "--root_name", short = "-r",
                                    help = "Root name ('midpoint' to midpoint root or NA to keep the existing root; default NA)",
                                    type = "character", default = NA)
  parser <- argparser::add_argument(parser = parser, arg = "--plot_width", short = "-w",
                                    help = "Plot width (mm)",
                                    type = "numeric", default = 400)
  parser <- argparser::add_argument(parser = parser, arg = "--plot_height", short = "-z",
                                    help = "Plot height (mm)",
                                    type = "numeric", default = 200)
  parser <- argparser::add_argument(parser = parser, arg = "--write_data", short = "-d",
                                    help = "Write raw plotting data to disk (same basepath as the PDF, but as a .tsv file)",
                                    flag = TRUE)

  params <- argparser::parse_args(parser)

  main(params)
}
--------------------------------------------------------------------------------