├── pyproject.toml ├── LICENSE ├── README.md └── reciprologs /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "reciprologs" 3 | version = "1.1.1" 4 | description = "" 5 | authors = ["Graham Larue "] 6 | license = "MIT" 7 | readme = "README.md" 8 | 9 | [tool.poetry.dependencies] 10 | python = "^3.8" 11 | biogl = "^2.3.0" 12 | networkx = "^3.3" 13 | palign = {git = "https://github.com/glarue/palign.git"} 14 | 15 | 16 | [build-system] 17 | requires = ["poetry-core"] 18 | build-backend = "poetry.core.masonry.api" 19 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Graham Larue 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ### Dependencies 2 | 3 | #### `biogl` 4 | 5 | This script needs the [biogl](https://github.com/glarue/biogl) module to function properly. If you use (or can get) `pip`, you can simply do 6 | 7 | ```python3 -m pip install biogl``` 8 | 9 | to add the package to a location reachable by your Python installation. 10 | 11 | Otherwise, you can clone the `biogl` repo and source it locally (to run from anywhere, you'll need to add it to your `PYTHONPATH` environment variable, a process that varies by OS): 12 | 13 | ```git clone https://github.com/glarue/biogl.git``` 14 | 15 | #### `palign` 16 | 17 | This script also requires [palign](https://github.com/glarue/palign) to run the alignment steps (using DIAMOND or BLAST), which you can clone and then add to your `PATH`: 18 | 19 | ```git clone https://github.com/glarue/palign.git``` 20 | 21 | #### Recommended: `networkx` 22 | 23 | In use cases with more than two files, `reciprologs` builds a graph representation of all of the reciprocal best hits (RBH), which allows construction of maximal cliques where every member is an RBH of every other member (e.g. `A-B, A-C, B-C, A-D --> A-B-C, A-D`). In order to do this _efficiently_, the Python package [networkx](https://networkx.org/) is needed (via `pip` or otherwise). If you want less-strict requirements for clustering (e.g. `A-B, A-C, A-D --> A-B-C-D`), see the `--chain` argument.
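As a rough illustration (not part of `reciprologs` itself, but mirroring the `networkx` calls it makes internally), the two clustering behaviors differ as follows:

```python
# Sketch: strict (maximal-clique) clustering vs. --chain (connected-component)
# clustering of the same reciprocal-best-hit pairs, using networkx directly.
import networkx as nx

# RBH pairs: A-B, A-C, B-C, A-D
graph = nx.Graph()
graph.add_edges_from([("A", "B"), ("A", "C"), ("B", "C"), ("A", "D")])

# default: maximal cliques -- every member is an RBH of every other member
print([sorted(c) for c in nx.find_cliques(graph)])
# e.g. [['A', 'B', 'C'], ['A', 'D']] (clique order may vary)

# --chain: connected components -- members may be linked only indirectly
print([sorted(c) for c in nx.connected_components(graph)])
# [['A', 'B', 'C', 'D']]
```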
24 | 25 | ### Usage info 26 | 27 | ``` 28 | usage: reciprologs [-h] [-p PARALLEL_PROCESSES] [-q PERCENTAGE] [--chain] 29 | [--subset subset_1 [subset_2 ...]] [--ignore_same_id] 30 | [--ignore_same_prefix ] [-o [OUTPUT]] 31 | [-d path [path ...]] [-b BLAST_FILE] [--overwrite] 32 | [--one_to_one] [--logging] [--no_hash_tag] 33 | file_1 file_2 ... [file_1 file_2 ... ...] 34 | {diamondp,diamondx,blastn,blastp,blastx,tblastn,tblastx} 35 | 36 | Find reciprocal best hits between two or more files. Any unrecognized 37 | arguments will be passed along to the chosen alignment program. 38 | 39 | positional arguments: 40 | file_1 file_2 ... files to use to build reciprolog sets (space 41 | separated) 42 | {diamondp,diamondx,blastn,blastp,blastx,tblastn,tblastx} 43 | type of alignment program to run 44 | 45 | optional arguments: 46 | -h, --help show this help message and exit 47 | -p PARALLEL_PROCESSES, --parallel_processes PARALLEL_PROCESSES 48 | run the alignment step using multiple parallel 49 | processes (default: 1) 50 | -q PERCENTAGE, --query_percentage_threshold PERCENTAGE 51 | require a specified fraction of the query length to 52 | match in order for a hit to qualify (lowest 53 | allowable percentage (default: None) 54 | --chain cluster reciprologs without requiring all-by-all 55 | pairwise relationships, e.g. A-B, A-C, A-D --> A-B- 56 | C-D (default: False) 57 | --subset subset_1 [subset_2 ...] 58 | Files containing subsets of headers to be used as 59 | queries for each input file. Supplied in the same 60 | order as the input files; one header per line. To 61 | omit a subset file for a given input file, provide 62 | "." as the argument, e.g. for three input files 63 | with only 1 & 3 with subsets: --subsets subset_1 . 64 | subset_2 (default: None) 65 | --ignore_same_id ignore hits where both query and subject have 66 | identical IDs (default: False) 67 | --ignore_same_prefix 68 | ignore hits where both query and subject have 69 | identical prefixes, where the prefix for each ID is 70 | delimited by the specified 71 | (default: None) 72 | -o [OUTPUT], --output [OUTPUT] 73 | output filename (use flag without argument for 74 | auto-naming; omit flag to use stdout) (default: 75 | stdout) 76 | -d path [path ...], --alignment_source_directory path [path ...] 77 | check for existing alignment files to use in this 78 | directory first (default: None) 79 | -b BLAST_FILE, --blast_file BLAST_FILE 80 | aggregated BLAST output to use (both directions) 81 | (default: None) 82 | --overwrite overwrite existing output files (instead of using 83 | them to bypass alignment step) (default: False) 84 | --one_to_one remove any many-to-one reciprolog relationships in 85 | each pairwise set, such that each member of each 86 | pairwise comparison is only present exactly one 87 | time in output (default: False) 88 | --logging output a log of best-hit choice criteria (default: 89 | False) 90 | --no_hash_tag do not auto-tag output files with MD5 hashes of 91 | source files (default: False) 92 | 93 | ``` 94 | -------------------------------------------------------------------------------- /reciprologs: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # the above sources Python from $PATH 3 | ##!/usr/local/bin/python3 4 | ##!/usr/bin/python3 5 | # the above uses specific Python version; allows script name in top 6 | 7 | # authorship information 8 | __author__ = 'Graham E. Larue' 9 | __maintainer__ = "Graham E. 
Larue" 10 | __email__ = 'egrahamlarue@gmail.com' 11 | __license__ = 'GPL' 12 | 13 | """ 14 | usage: reciprologs [-h] [-p PARALLEL_PROCESSES] [-q PERCENTAGE] [--chain] 15 | [--subset subset_1 [subset_2 ...]] [--ignore_same_id] 16 | [--ignore_same_prefix ] [-o [OUTPUT]] 17 | [-d path [path ...]] [-b BLAST_FILE] [--overwrite] 18 | [--one_to_one] [--logging] [--no_hash_tag] 19 | file_1 file_2 ... [file_1 file_2 ... ...] 20 | {diamondp,diamondx,blastn,blastp,blastx,tblastn,tblastx} 21 | 22 | Find reciprocal best hits between two or more files. Any unrecognized 23 | arguments will be passed along to the chosen alignment program. 24 | 25 | positional arguments: 26 | file_1 file_2 ... files to use to build reciprolog sets (space 27 | separated) 28 | {diamondp,diamondx,blastn,blastp,blastx,tblastn,tblastx} 29 | type of alignment program to run 30 | 31 | optional arguments: 32 | -h, --help show this help message and exit 33 | -p PARALLEL_PROCESSES, --parallel_processes PARALLEL_PROCESSES 34 | run the alignment step using multiple parallel 35 | processes (default: 1) 36 | -q PERCENTAGE, --query_percentage_threshold PERCENTAGE 37 | require a specified fraction of the query length to 38 | match in order for a hit to qualify (lowest 39 | allowable percentage (default: None) 40 | --chain cluster reciprologs without requiring all-by-all 41 | pairwise relationships, e.g. A-B, A-C, A-D --> A-B- 42 | C-D (default: False) 43 | --subset subset_1 [subset_2 ...] 44 | Files containing subsets of headers to be used as 45 | queries for each input file. Supplied in the same 46 | order as the input files; one header per line. To 47 | omit a subset file for a given input file, provide 48 | "." as the argument, e.g. for three input files 49 | with only 1 & 3 with subsets: --subsets subset_1 . 50 | subset_2 (default: None) 51 | --ignore_same_id ignore hits where both query and subject have 52 | identical IDs (default: False) 53 | --ignore_same_prefix 54 | ignore hits where both query and subject have 55 | identical prefixes, where the prefix for each ID is 56 | delimited by the specified 57 | (default: None) 58 | -o [OUTPUT], --output [OUTPUT] 59 | output filename (use flag without argument for 60 | auto-naming) (default: stdout) 61 | -d path [path ...], --alignment_source_directory path [path ...] 
62 | check for existing alignment files to use in this 63 | directory first (default: None) 64 | -b BLAST_FILE, --blast_file BLAST_FILE 65 | aggregated BLAST output to use (both directions) 66 | (default: None) 67 | --overwrite overwrite existing output files (instead of using 68 | them to bypass alignment step) (default: False) 69 | --one_to_one remove any many-to-one reciprolog relationships in 70 | each pairwise set, such that each member of each 71 | pairwise comparison is only present exactly one 72 | time in output (default: False) 73 | --logging output a log of best-hit choice criteria (default: 74 | False) 75 | --no_hash_tag do not auto-tag output files with MD5 hashes of 76 | source files (default: False) 77 | 78 | NOTE: Depends on palign 79 | 80 | """ 81 | import sys 82 | import subprocess 83 | import os 84 | import time 85 | import argparse 86 | import re 87 | import shutil 88 | from operator import itemgetter 89 | from multiprocessing import cpu_count 90 | from collections import defaultdict 91 | from itertools import combinations, permutations 92 | from biogl import fasta_parse, get_runtime 93 | from hashlib import md5 94 | 95 | # use networkx library for fast ortholog clustering if available 96 | try: 97 | import networkx as nx 98 | USE_GRAPH = True 99 | except ModuleNotFoundError: 100 | USE_GRAPH = False 101 | 102 | 103 | def parse_blast_line(bl, *args): 104 | """ 105 | Returns info from certain columns in a tab-separated BLAST 106 | output file. $args may be: query, subject, length, e, bitscore 107 | 108 | """ 109 | columns = bl.strip().split("\t") 110 | ( 111 | query, subject, length, 112 | e_value, bitscore 113 | ) = itemgetter(0, 1, 3, 10, 11)(columns) 114 | arg_map = { 115 | "query": query, 116 | "subject": subject, 117 | "length": int(length) - 1, # seem to be off by 1 in BLAST output 118 | "e": float(e_value), 119 | "bitscore": float(bitscore) 120 | } 121 | results = [] 122 | for a in args: 123 | results.append(arg_map[a]) 124 | if len(results) == 1: 125 | results = results[0] 126 | return results 127 | 128 | 129 | def is_better(challenger, defender, seq_lengths=None): 130 | """ 131 | Compares attributes of two dictionaries of BLAST 132 | hits for a given query to determine which is better. 133 | 134 | Returns the winning dictionary and reason if it's 135 | better, otherwise False. 
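    Tiebreaking order: bitscore, then e-value, then subject sequence
    length (the last only when seq_lengths is supplied).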
136 | 137 | """ 138 | cbs = challenger['score'] 139 | dbs = defender['score'] 140 | # criteria: bitscore 141 | if cbs < dbs: 142 | return False 143 | elif cbs > dbs: 144 | return challenger, 'bitscore' 145 | elif cbs == dbs: 146 | # criteria --> e-value 147 | cev = challenger['evalue'] 148 | dev = defender['evalue'] 149 | if cev < dev: # lower is better 150 | return challenger, 'e-value' 151 | elif seq_lengths is not None: 152 | # criteria --> length 153 | # if scores are equal, check if sequence lengths 154 | # have been provided as an additional tiebreaking 155 | # criteria and look up the subject length to 156 | # see if there's a difference 157 | dn = defender['name'] 158 | cn = challenger['name'] 159 | try: 160 | if seq_lengths[cn] > seq_lengths[dn]: 161 | return challenger, 'length' 162 | except KeyError: 163 | return False 164 | else: 165 | return False 166 | else: 167 | return False 168 | 169 | 170 | def get_prefix(seq_id, delimiter): 171 | split_list = re.split(delimiter, seq_id, maxsplit=1) 172 | split_list = [s for s in split_list if s] 173 | 174 | return split_list[0] 175 | 176 | 177 | def get_top_hits( 178 | blast, 179 | paralogs=False, 180 | query_match=None, 181 | seq_lengths=None, 182 | ignore_same_id=False, 183 | ignore_same_prefix=False, 184 | query_list=None): 185 | results = {} 186 | # dictionary to store tie-broken matches 187 | win_ledger = defaultdict(lambda: defaultdict(set)) 188 | with open(blast) as blst: 189 | for l in blst: 190 | new_best_hit = False 191 | if l.startswith("#"): 192 | continue 193 | (q, s, score, length, evalue) = parse_blast_line( 194 | l, "query", "subject", "bitscore", "length", "e") 195 | challenger = { 196 | 'name': s, 197 | 'score': score, 198 | 'evalue': evalue, 199 | 'length': length 200 | } 201 | if query_list and q not in query_list: 202 | continue 203 | 204 | # do not consider hits to self if BLASTing against self, 205 | # but allow query/subject names to be the same 206 | if q == s and (paralogs is True or ignore_same_id is True): 207 | continue 208 | if ignore_same_prefix is not None: 209 | prefix = ignore_same_prefix 210 | if get_prefix(q, prefix) == get_prefix(s, prefix): 211 | continue 212 | if query_match: 213 | # use query_match dictionary to compare query lengths to 214 | # match lengths to exclude matches where query percentage 215 | # is below query_match_threshold key 216 | fraction = (length / query_match[q]) * 100 217 | if fraction < query_match['query_match_threshold']: 218 | continue 219 | if q in results: 220 | defender = results[q] 221 | challenger_wins = is_better( 222 | challenger, defender, seq_lengths) 223 | if challenger_wins: # new hit is better 224 | new_best_hit = True 225 | defender_name = results[q]['name'] 226 | reason = challenger_wins[1] 227 | loser_info = (defender_name, reason) 228 | win_ledger[q]['losers'].add(loser_info) 229 | win_ledger[q]['best'] = s 230 | 231 | else: 232 | new_best_hit = True 233 | 234 | if new_best_hit is True: 235 | results[q] = { 236 | "name": s, 237 | "score": score, 238 | "evalue": evalue, 239 | "length": length 240 | } 241 | 242 | return results, win_ledger 243 | 244 | 245 | def get_reciprocals(d1, d2, tag1, tag2): 246 | """ 247 | Takes two dictionaries of top BLAST hits, 248 | returns a list of tuples of all pairs that were 249 | reciprocal best hits, along with their bitscore 250 | values. 
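    Each element has the form ((tag_a, id_a), (tag_b, id_b),
    (bitscore_1, bitscore_2)).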
251 | 252 | """ 253 | reciprologs = set() 254 | blast_dicts = [(d1, tag1), (d2, tag2)] 255 | blast_permuts = permutations(blast_dicts, 2) 256 | for (first, first_tag), (second, second_tag) in blast_permuts: 257 | for query, hit_info in first.items(): 258 | best_hit = hit_info["name"] 259 | score = hit_info["score"] 260 | if best_hit in second: 261 | reciprocal_hit = second[best_hit]["name"] 262 | if query == reciprocal_hit: # best hit refers back to query 263 | r_score = second[best_hit]["score"] 264 | query = (first_tag, query) 265 | best_hit = (second_tag, best_hit) 266 | hit_pair = sorted([query, best_hit]) 267 | score_tuple = tuple(sorted([score, r_score])) 268 | hit_pair.append(score_tuple) 269 | reciprologs.add(tuple(hit_pair)) 270 | 271 | return sorted(reciprologs) 272 | 273 | 274 | def clean_reciprologs(reciprologs, subset_index=None): 275 | cleaned = [] 276 | for group in reciprologs: 277 | if subset_index and not any(m[1] in subset_index[m[0]] for m in group): 278 | continue 279 | # remove file tags from tuples 280 | clean_group = [m[1] for m in group] 281 | cleaned.append(clean_group) 282 | 283 | return cleaned 284 | 285 | 286 | def file_md5(fn, buffer_size=65536): 287 | hash = md5() 288 | with open(fn, 'rb') as f: 289 | while True: 290 | data = f.read(buffer_size) 291 | if not data: 292 | break 293 | hash.update(data) 294 | 295 | return hash.hexdigest() 296 | 297 | 298 | def abbreviate(name, delimiter=".", use_hash=True, keep_path=False): 299 | local_name = os.path.basename(name) # in case of non-local file path 300 | abbreviation = local_name.split(delimiter)[0] 301 | if use_hash is True: # use shortened md5 hash to uniqueify name 302 | hash = file_md5(name)[:5] 303 | abbreviation = abbreviation + delimiter + hash 304 | 305 | if keep_path is True: 306 | file_path = os.path.dirname(os.path.abspath(name)) 307 | abbreviation = os.path.join(file_path, abbreviation) 308 | 309 | return abbreviation 310 | 311 | 312 | def unique_filenames(*file_list, skip=None, use_hash=True, keep_path=False): 313 | if skip is not None: 314 | abbreviated = [ 315 | os.path.basename(f) if f in skip 316 | else abbreviate(f, use_hash=use_hash) 317 | for f in file_list 318 | ] 319 | else: 320 | abbreviated = [os.path.basename(f) for f in file_list] 321 | if len(set(abbreviated)) < len(abbreviated): # not all are unique 322 | abbreviated = [os.path.basename(f) for f in file_list] 323 | 324 | if keep_path is True: # add parent directory paths 325 | dirpaths = [os.path.dirname(f) for f in file_list] 326 | abbreviated = [ 327 | os.path.join(p, f) for p, f in zip(dirpaths, abbreviated) 328 | ] 329 | 330 | return abbreviated 331 | 332 | 333 | def concatenate(outname, file_list, clean=True): 334 | with open(outname, 'w') as outfile: 335 | for fn in file_list: 336 | with open(fn) as f: 337 | for l in f: 338 | outfile.write(l) 339 | if clean: 340 | [os.remove(fn) for fn in file_list] 341 | 342 | 343 | def parse_run_type(align_type_arg): 344 | type_map = { 345 | 'diamondp': ('diamond', 'blastp'), 346 | 'diamondx': ('diamond', 'blastx'), 347 | 'blastn': ('blast', 'blastn'), 348 | 'blastp': ('blast', 'blastp'), 349 | 'blastx': ('blast', 'blastx'), 350 | 'tblastn': ('blast', 'tblastn'), 351 | 'tblastx': ('blast', 'tblastx'), 352 | } 353 | 354 | return type_map[align_type_arg] 355 | 356 | 357 | def aggregate_dict_chained(ortho_dict): 358 | """ 359 | IN: 360 | defaultdict(set, 361 | {'a': {'b', 'c', 'd'}, 362 | 'b': {'a', 'c', 'e', 'f'}, 363 | 'c': {'a', 'b', 'e', 'f', 'g'}, 364 | 'd': {'a'}, 365 | 'e': {'b', 'c'}, 366 | 
'f': {'b', 'c'}, 367 | 'g': {'c'}, 368 | 'w': {'z'}, 369 | 'x': {'z'}, 370 | 'y': {'z'}, 371 | 'z': {'w', 'x', 'y'}}) 372 | OUT: 373 | defaultdict(set, {'a': {'b', 'c', 'd', 'e', 'f', 'g'}, 'z': {'w', 'x', 'y'}}) 374 | 375 | """ 376 | changed = False 377 | processed = [] 378 | master = defaultdict(set) 379 | for k, v in ortho_dict.items(): 380 | if k in processed: 381 | continue 382 | processed.append(k) 383 | for v2 in v: 384 | if v2 == k: 385 | continue 386 | master[k].add(v2) 387 | processed.append(v2) 388 | if v2 not in ortho_dict: 389 | continue 390 | changed = True 391 | master[k].update(ortho_dict[v2]) 392 | if changed is True: 393 | master = aggregate_dict_chained(master) 394 | 395 | return master 396 | 397 | 398 | def aggregate_orthos_chained(orthos, use_graph=False): 399 | """ 400 | IN: 401 | [ 402 | [('a', 'b'), ('a', 'c'), ('a', 'd')], 403 | [('b', 'c'), ('b', 'e'), ('b', 'f')], 404 | [('c', 'e'), ('c', 'f'), ('c', 'g')], 405 | [('z', 'x'), ('z', 'y'), ('z', 'w')] 406 | ] 407 | OUT: 408 | [['a', 'b', 'c', 'd', 'e', 'f', 'g'], ['w', 'x', 'y', 'z']] 409 | 410 | """ 411 | if use_graph: 412 | ortho_groups = graph_cluster(orthos, chain=True) 413 | else: 414 | o_dict = make_ortho_dict(*orthos) 415 | aggregated = aggregate_dict_chained(o_dict) 416 | ortho_groups = [] 417 | for k, v in aggregated.items(): 418 | combined = tuple(v) + (k,) 419 | ortho_groups.append(sorted(combined)) 420 | 421 | return sorted(ortho_groups) 422 | 423 | 424 | def aggregate_orthos_strict(orthos, use_graph=False): 425 | """ 426 | IN: 427 | [ 428 | [('a', 'b'), ('a', 'c'), ('a', 'd')], 429 | [('b', 'c'), ('b', 'e'), ('b', 'f')], 430 | [('c', 'e'), ('c', 'f'), ('c', 'g')], 431 | [('z', 'x'), ('z', 'y'), ('z', 'w')] 432 | ] 433 | OUT: 434 | [ 435 | ['x', 'z'], 436 | ['y', 'z'], 437 | ['w', 'z'], 438 | ['a', 'd'], 439 | ['c', 'g'], 440 | ['b', 'c', 'f'], 441 | ['b', 'c', 'e'], 442 | ['a', 'b', 'c'] 443 | ] 444 | 445 | """ 446 | if use_graph is True: 447 | aggregated = graph_cluster(orthos) 448 | else: 449 | o_dict = make_ortho_dict(*orthos) 450 | aggregated = all_by_all_orthos(o_dict) 451 | 452 | return aggregated 453 | 454 | 455 | def all_by_all_orthos(ortho_dict): 456 | full_groups = [] 457 | for k, v in ortho_dict.items(): 458 | groups = [] 459 | max_n = len(v) 460 | # go backward in size and cull subsets as we go 461 | for i in range(max_n, 0, -1): 462 | for g in combinations(v, i): 463 | g = set(list(g) + [k]) 464 | if g in full_groups or len(g) == 1: 465 | continue 466 | if every_member_match(g, ortho_dict): 467 | if any(og.issuperset(g) for og in full_groups): 468 | continue 469 | full_groups.append(g) 470 | 471 | return sorted([sorted(g) for g in full_groups]) 472 | 473 | 474 | def every_member_match(members, m_dict): 475 | all_match = True 476 | for m in members: 477 | others = [e for e in members if e != m] 478 | if not others: 479 | return True 480 | if any(m not in m_dict[o] for o in others): 481 | return False 482 | 483 | return all_match 484 | 485 | 486 | def make_ortho_dict(*orthos): 487 | """ 488 | IN: 489 | [ 490 | [('a', 'b'), ('a', 'c'), ('a', 'd')], 491 | [('b', 'c'), ('b', 'e'), ('b', 'f')], 492 | [('c', 'e'), ('c', 'f'), ('c', 'g')], 493 | [('z', 'x'), ('z', 'y'), ('z', 'w')] 494 | ] 495 | OUT: 496 | defaultdict(set, 497 | {'a': {'b', 'c', 'd'}, 498 | 'b': {'a', 'c', 'e', 'f'}, 499 | 'c': {'a', 'b', 'e', 'f', 'g'}, 500 | 'd': {'a'}, 501 | 'e': {'b', 'c'}, 502 | 'f': {'b', 'c'}, 503 | 'g': {'c'}, 504 | 'w': {'z'}, 505 | 'x': {'z'}, 506 | 'y': {'z'}, 507 | 'z': {'w', 'x', 'y'}}) 508 | 509 | 
""" 510 | collector = defaultdict(set) 511 | for o_list in orthos: 512 | for pair in o_list: 513 | for a, b in permutations(pair, 2): 514 | collector[a].add(b) 515 | 516 | return collector 517 | 518 | 519 | def names_from_blastfile(blast_fn): 520 | file_pattern = r'(.+)-vs-(.+)\.t?blast[npx]' 521 | query_fn, subject_fn = re.findall(file_pattern, blast_fn)[0] 522 | 523 | return query_fn, subject_fn 524 | 525 | 526 | def make_subset(fasta, output_fn, keep_file=None, keep_list=None): 527 | if not (keep_file or keep_list): 528 | print( 529 | '[!] Cannot make subset for {} - aborting'.format(fasta), 530 | file=sys.stderr 531 | ) 532 | sys.exit(1) 533 | keep_list_set = set() 534 | keep_file_set = set() 535 | if keep_list is not None: 536 | for e in keep_list: 537 | keep_list_set.add(e) 538 | if keep_file is not None: 539 | with open(keep_file) as f: 540 | for l in f: 541 | keep_file_set.add(l.strip()) 542 | # get the union of the two sets of headers to include any new additions 543 | keep_set = keep_list_set | keep_file_set 544 | # write combined set to new if there are entries in 545 | # that were not in the original 546 | new_keeps = len(keep_set) - len(keep_file_set) 547 | if new_keeps > 0 and keep_file is not None: 548 | print( 549 | '[#] Updating {} with {} new entries'.format(keep_file, new_keeps) 550 | , file=sys.stderr) 551 | with open(keep_file, 'w') as new_keep_file: 552 | for h in keep_set: 553 | new_keep_file.write(h + '\n') 554 | kept = 0 555 | with open(output_fn, 'w') as out: 556 | for h, s in fasta_parse(fasta, trim_header=False): 557 | trunc_header = h.split()[0] 558 | if trunc_header in keep_set: 559 | record = '>{}\n{}\n'.format(h, s) 560 | out.write(record) 561 | kept += 1 562 | 563 | return output_fn, kept 564 | 565 | 566 | def list_hash(string_list, length=3, sort_first=True): 567 | hash = md5() 568 | if sort_first is True: 569 | string_list = sorted(string_list) 570 | for s in string_list: 571 | hash.update(s.encode()) 572 | 573 | return hash.hexdigest()[:length] 574 | 575 | 576 | def subset_name(fn, file_tag='subset', use_hash=True, keep_path=False): 577 | if type(file_tag) is not str or not file_tag: 578 | file_tag = '' 579 | else: 580 | file_tag = '_{}'.format(file_tag) 581 | out_fn = '{}{}.fa'.format( 582 | abbreviate(fn, use_hash=use_hash, keep_path=keep_path), file_tag 583 | ) 584 | 585 | return out_fn 586 | 587 | 588 | def align(aligner, query, subject, run_type, output_name, extra_args=None): 589 | if extra_args is None: 590 | extra_args = [] 591 | aligner_args = [ 592 | aligner, 593 | query, 594 | subject, 595 | run_type, 596 | '--output_name', 597 | output_name 598 | ] 599 | result = subprocess.run(aligner_args + extra_args) 600 | 601 | return result 602 | 603 | 604 | def alignment_filenames(query, subject, run_type, use_hash=True): 605 | run_files = {} 606 | 607 | # when subsetting is used, this hack will produce more obvious filenames 608 | # that reflect which alignments were of subsets and which weren't 609 | special_case_tags = ['_residual', '_subset'] 610 | 611 | no_abbrev = [] 612 | for f in [query, subject]: 613 | if any(t in f for t in special_case_tags): 614 | no_abbrev.append(f) 615 | 616 | fw_names = unique_filenames(query, subject, skip=no_abbrev, use_hash=use_hash) 617 | rv_names = unique_filenames(subject, query, skip=no_abbrev, use_hash=use_hash) 618 | 619 | run_files['forward'] = '{}-vs-{}.{}'.format(*fw_names, run_type) 620 | run_files['reverse'] = '{}-vs-{}.{}'.format(*rv_names, run_type) 621 | 622 | return run_files 623 | 624 | 625 | def 
seq_lengths(fasta): 626 | l = {} 627 | for h, s in fasta_parse(fasta): 628 | l[h] = len(s) 629 | 630 | return l 631 | 632 | 633 | def remove_many_to_one(pairs): 634 | """ 635 | Each element of >pairs< is a tuple: (hitA, hitB, (scoreX, scoreY)) 636 | 637 | Takes a list of paired reciprocal hits (plus scores) and filters it 638 | such that each member of each pair only occur once, i.e. it removes 639 | any many-to-one hits, using the bitscores in the last element of 640 | each tuple. 641 | 642 | """ 643 | uniques = {} 644 | to_remove = [] 645 | for index, (a, b, scores) in enumerate(pairs): 646 | avg_score = sum(scores) / 2 647 | for e in [a, b]: 648 | if e not in uniques: 649 | uniques[e] = {'score': avg_score, 'index': index} 650 | elif uniques[e]['score'] >= avg_score: 651 | to_remove.append(index) 652 | continue 653 | else: 654 | to_remove.append(uniques[e]['index']) 655 | uniques[e] = {'score': avg_score, 'index': index} 656 | 657 | filtered = [] 658 | for i, p in enumerate(pairs): 659 | if i in to_remove: 660 | names = p[0:2] 661 | print('Removed: {}'.format('\t'.join(names)), file=sys.stderr) 662 | else: 663 | filtered.append(p) 664 | 665 | return filtered 666 | 667 | 668 | def graph_cluster(pairwise_sets, chain=False): 669 | graph = nx.Graph() 670 | for p_set in pairwise_sets: 671 | for pair in p_set: 672 | graph.add_edge(*pair) 673 | if chain: 674 | clusters = nx.connected_components(graph) 675 | else: 676 | clusters = nx.find_cliques(graph) 677 | clusters = [sorted(c) for c in clusters] 678 | 679 | return sorted(clusters, key=len) 680 | 681 | 682 | def subset_size_check(input_file, subset, kept_n): 683 | if kept_n == 0: 684 | print('[!] Subset size of {} = 0; aborting.'.format(subset)) 685 | sys.exit(1) 686 | else: 687 | print( 688 | '[#] Subset size for {}: {}'.format(input_file, kept_n), 689 | file=sys.stderr 690 | ) 691 | 692 | 693 | def log_ledger(q, s, run_type, win_ledger, use_hash=True): 694 | ledger_file = '{}-{}.{}.log'.format( 695 | abbreviate(q, use_hash=use_hash), abbreviate(s, use_hash=use_hash), run_type) 696 | with open(ledger_file, 'w') as lf: 697 | for query, info in sorted(win_ledger.items()): 698 | winner = info['best'] 699 | loser_tuples = info['losers'] 700 | lf.write('>{}\t[{}]\n'.format(winner, query)) 701 | for loser in sorted(loser_tuples): 702 | lf.write('\t'.join(loser) + '\n') 703 | 704 | 705 | def get_alignments( 706 | pairs_index, 707 | aligner, 708 | run_type, 709 | extra, 710 | overwrite=False, 711 | file_dirs=None, 712 | blast_file=None, 713 | use_hash=True 714 | ): 715 | """ 716 | is a dictionary of the form {label: (q, s), ...} 717 | 718 | """ 719 | # get reciprologs for each pairwise permutation of files [(1, 2), (2, 1), ...] 720 | alignment_index = {} 721 | for alignment_key, pair in pairs_index.items(): 722 | q, s = pair 723 | # this block might want to move to above the preceding if block...? 724 | if blast_file: 725 | alignment_index[alignment_key] = blast_file 726 | continue 727 | align_fn = alignment_filenames(q, s, run_type, use_hash=use_hash)['forward'] 728 | if file_dirs: 729 | for d in file_dirs: 730 | file_list = os.listdir(d) 731 | if align_fn in file_list: 732 | align_fn = os.path.join( 733 | os.path.abspath(d), align_fn) 734 | break 735 | if not os.path.isfile(align_fn) or overwrite is True: 736 | alignment = align( 737 | aligner, q, s, run_type, align_fn, extra_args=extra 738 | ) 739 | if alignment.returncode != 0: 740 | sys.exit( 741 | '[!] 
ERROR: alignment failed: {} (return code: {})' 742 | .format(align_fn, alignment.returncode) 743 | ) 744 | else: 745 | print( 746 | '[#] Using existing output \'{}\''.format(align_fn), 747 | file=sys.stderr 748 | ) 749 | alignment_index[alignment_key] = align_fn 750 | 751 | return alignment_index 752 | 753 | 754 | def align_residuals(alignments, aligner, residual_index, args, overwrite=False, use_hash=True): 755 | alignment_index = {} 756 | run_type = args.run_type 757 | extra = args.extra 758 | for aln_key, aln in sorted(alignments.items(), key=lambda x: sorted(x[0])): 759 | q, s = aln_key 760 | if q not in residual_index: 761 | continue 762 | r_sub_set = residual_index[q] 763 | r_sub_fn = subset_name(q, file_tag='residual', use_hash=use_hash) 764 | r_sub, kept_n = make_subset(q, r_sub_fn, keep_list=r_sub_set) 765 | subset_size_check(q, r_sub, kept_n) 766 | align_fn = alignment_filenames(q, s, run_type, use_hash=use_hash)['forward'] 767 | if not os.path.isfile(align_fn) or overwrite is True: 768 | residual_aln = align( 769 | aligner, r_sub, s, run_type, align_fn, extra_args=extra) 770 | if residual_aln.returncode != 0: 771 | sys.exit( 772 | '[!] ERROR: alignment failed: {} (return code: {})' 773 | .format(align_fn, residual_aln.returncode) 774 | ) 775 | else: 776 | print( 777 | '[#] Using existing output \'{}\''.format(align_fn), 778 | file=sys.stderr 779 | ) 780 | alignment_index[aln_key] = align_fn 781 | 782 | return alignment_index 783 | 784 | 785 | def build_pair_index( 786 | input_files, 787 | subset_index, 788 | subset_tag=None, 789 | subset_only=False, 790 | use_hash=True 791 | ): 792 | pairs = {} 793 | made = set() 794 | for q, s in permutations(sorted(input_files), 2): 795 | alignment_key = (q, s) 796 | paralogs = q == s 797 | if subset_index and subset_index.get(q) and not paralogs: 798 | # subset files are getting tagged twice—probably need to move 799 | # tagging to a different function or something 800 | sub_set = subset_index[q] 801 | subset_hash = list_hash(sub_set) 802 | hash_tag = f'{subset_tag}_h{subset_hash}' 803 | sub_fn = subset_name( 804 | q, file_tag=hash_tag, use_hash=use_hash 805 | ) 806 | 807 | # check if subset file has already been made 808 | # during this loop to avoid remaking recent files 809 | # unnecessarily 810 | if q not in made: 811 | made.add(q) 812 | sub_q, kept_n = make_subset(q, sub_fn, keep_list=sub_set) 813 | subset_size_check(q, sub_q, kept_n) 814 | else: 815 | sub_q = sub_fn 816 | q = sub_q 817 | elif subset_only is True: 818 | continue 819 | p = (q, s) 820 | pairs[alignment_key] = p 821 | 822 | return pairs 823 | 824 | 825 | def get_residual_hits(hit_index, subset_index): 826 | residual_index = defaultdict(set) 827 | for pair, fwd_hits in hit_index.items(): 828 | q, s = pair 829 | if subset_index and subset_index[s] is not None: 830 | reverse_pair = (s, q) 831 | rv_query_names = set(hit_index[reverse_pair].keys()) 832 | s_hits = set(h['name'] for h in fwd_hits.values()) 833 | residual = s_hits - rv_query_names 834 | residual_index[s] |= residual 835 | print(f'residual length for {s}: {len(residual_index[s])}', file=sys.stderr) ###!!! 
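    # residual_index maps each subject file to IDs of its sequences that were
    # best hits in the forward direction but do not appear as queries in the
    # reverse alignment (e.g. because they were left out of that file's query
    # subset); these are realigned in a follow-up 'residual' run so that
    # reciprocity can still be checked for them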
836 | 837 | return residual_index 838 | 839 | 840 | def build_hit_indices(alignments, args): 841 | query_percentage = args.query_percentage_threshold 842 | blast_file = args.blast_file 843 | length_index = {} 844 | hit_index = {} 845 | residual_index = defaultdict(set) 846 | for (q, s), aln in sorted(alignments.items(), key=lambda x: sorted(x[0])): 847 | if blast_file: 848 | aln = blast_file 849 | # set flag if both files are the same 850 | paralogs = q == s 851 | if paralogs and (q, s) in hit_index: 852 | continue 853 | # add the lengths of all sequences to an index for later 854 | # tie-breaking of best hits 855 | for e in (q, s): 856 | if e not in length_index: 857 | length_index[e] = seq_lengths(e) 858 | # this is needed for the weird get_top_hits() API - might 859 | # be better as a separate arg in the future... 860 | length_index[e]['query_match_threshold'] = query_percentage 861 | # get sets of query IDs to filter alignment lines to 862 | # relevant hits (matters in case of aggregate alignment file) 863 | q_list = set(length_index[q].keys()) 864 | if query_percentage is not None: 865 | q_lengths = length_index[q] 866 | else: 867 | q_lengths = {} 868 | s_lengths = length_index[s] 869 | top_hits, win_ledger = get_top_hits( 870 | aln, 871 | paralogs, 872 | query_match=q_lengths, 873 | seq_lengths=s_lengths, 874 | ignore_same_id=args.ignore_same_id, 875 | ignore_same_prefix=args.ignore_same_prefix, 876 | query_list=q_list 877 | ) 878 | 879 | hit_index[(q, s)] = top_hits 880 | 881 | return hit_index 882 | 883 | 884 | def get_subset_set(subset_list_file): 885 | subset = set() 886 | with open(subset_list_file) as f: 887 | for l in f: 888 | subset.add(l.strip()) 889 | 890 | return subset 891 | 892 | 893 | parser = argparse.ArgumentParser( 894 | description=( 895 | 'Find reciprocal best hits between two or more files. ' 896 | 'Any unrecognized arguments will be passed along to the chosen ' 897 | 'alignment program.'), 898 | formatter_class=argparse.ArgumentDefaultsHelpFormatter) 899 | 900 | parser.add_argument( 901 | 'input_files', 902 | metavar='file_1 file_2 ...', 903 | help='files to use to build reciprolog sets (space separated)', 904 | nargs='+' 905 | ) 906 | parser.add_argument( 907 | 'run_type', 908 | choices=[ 909 | 'diamondp', 910 | 'diamondx', 911 | 'blastn', 912 | 'blastp', 913 | 'blastx', 914 | 'tblastn', 915 | 'tblastx' 916 | ], 917 | help='type of alignment program to run' 918 | ) 919 | parser.add_argument( 920 | '-p', 921 | '--parallel_processes', 922 | help=( 923 | 'run the alignment step using multiple parallel processes'), 924 | type=int, 925 | default=1 926 | ) 927 | parser.add_argument( 928 | '-q', 929 | '--query_percentage_threshold', 930 | metavar='PERCENTAGE', 931 | help=( 932 | 'require a specified fraction of the query length to match in ' 933 | 'order for a hit to qualify (lowest allowable percentage'), 934 | type=float, 935 | default=None 936 | ) 937 | parser.add_argument( 938 | '--chain', 939 | action='store_true', 940 | help=( 941 | 'cluster reciprologs without requiring all-by-all pairwise ' 942 | 'relationships, e.g. A-B, A-C, A-D --> A-B-C-D') 943 | ) 944 | parser.add_argument( 945 | '--subset', 946 | metavar=('subset_1', 'subset_2'), 947 | nargs='+', 948 | help=( 949 | 'Files containing subsets of headers to be used as queries for each ' 950 | 'input file. Supplied in the same order as the input files; one header ' 951 | 'per line. To omit a subset file for a given input file, ' 952 | 'provide "." as the argument, e.g. 
for three input files with only 1 & ' 953 | '3 with subsets: --subsets subset_1 . subset_2' 954 | ) 955 | ) 956 | parser.add_argument( 957 | '--ignore_same_id', 958 | action='store_true', 959 | help='ignore hits where both query and subject have identical IDs' 960 | ) 961 | parser.add_argument( 962 | '--ignore_same_prefix', 963 | metavar='', 964 | help=( 965 | 'ignore hits where both query and subject have identical prefixes, ' 966 | 'where the prefix for each ID is delimited by the specified ' 967 | '') 968 | ) 969 | parser.add_argument( 970 | '-o', 971 | '--output', 972 | help=( 973 | 'output filename (use flag without argument for auto-naming)' 974 | ), 975 | nargs='?', 976 | default='stdout' 977 | ) 978 | parser.add_argument( 979 | '-d', 980 | '--alignment_source_directory', 981 | help='check for existing alignment files to use in this directory first', 982 | nargs='+', 983 | metavar='path' 984 | ) 985 | parser.add_argument( 986 | '-b', 987 | '--blast_file', 988 | help='aggregated BLAST output to use (both directions)' 989 | ) 990 | parser.add_argument( 991 | '--overwrite', 992 | help=( 993 | 'overwrite existing output files ' 994 | '(instead of using them to bypass alignment step)' 995 | ), 996 | action='store_true' 997 | ) 998 | parser.add_argument( 999 | '--one_to_one', 1000 | help=( 1001 | 'remove any many-to-one reciprolog relationships in each pairwise ' 1002 | 'set, such that each member of each pairwise comparison is only ' 1003 | 'present exactly one time in output'), 1004 | action='store_true' 1005 | ) 1006 | parser.add_argument( 1007 | '--logging', 1008 | help='output a log of best-hit choice criteria', 1009 | action='store_true' 1010 | ) 1011 | parser.add_argument( 1012 | '--no_hash_tag', 1013 | help='do not auto-tag output files with MD5 hashes of source files', 1014 | action='store_true' 1015 | ) 1016 | 1017 | t_start = time.time() 1018 | 1019 | args, EXTRA_ARGS = parser.parse_known_args() 1020 | 1021 | RUN_TYPE = args.run_type 1022 | PARALLEL = args.parallel_processes 1023 | INPUT_FILES = args.input_files 1024 | if len(INPUT_FILES) < 2: 1025 | sys.exit('error: too few files specified (need >1)') 1026 | QUERY_PERCENTAGE = args.query_percentage_threshold 1027 | OVERWRITE = args.overwrite 1028 | ONE_TO_ONE = args.one_to_one 1029 | LOGGING = args.logging 1030 | CHAIN = args.chain 1031 | IGNORE_SAME_ID = args.ignore_same_id 1032 | IGNORE_SAME_PREFIX = args.ignore_same_prefix 1033 | BLAST_FILE = args.blast_file 1034 | OUTPUT_FILE = args.output 1035 | ALIGNMENT_SOURCE_DIRS = args.alignment_source_directory 1036 | SUBSET = args.subset 1037 | USE_HASHES = not args.no_hash_tag 1038 | 1039 | ALIGN_PROG = 'palign' 1040 | 1041 | if not USE_GRAPH: 1042 | print( 1043 | '[!] networkx library not found; will use brute-force method instead', 1044 | file=sys.stderr 1045 | ) 1046 | 1047 | # create a list with the flags/options to pass to the subsequence 1048 | # subprocess calls 1049 | call_options = { 1050 | '-p': PARALLEL 1051 | } 1052 | optional = EXTRA_ARGS 1053 | for k, v in call_options.items(): 1054 | if v: 1055 | optional.extend([str(k), str(v)]) 1056 | 1057 | # if --subset, generate list of subset files to match up with input files 1058 | if SUBSET and len(SUBSET) != len(INPUT_FILES): 1059 | print( 1060 | '[!] 
Must supply as many arguments to --subset as there are input files', 1061 | file=sys.stderr 1062 | ) 1063 | sys.exit(1) 1064 | 1065 | # the default None values here are used as a check later on to 1066 | # indicate which files have associated subsets 1067 | 1068 | if SUBSET: 1069 | subset_index = {f: set() for f in INPUT_FILES} 1070 | working_subsets = [] 1071 | for s in SUBSET: 1072 | if s != '.': 1073 | s = get_subset_set(s) 1074 | working_subsets.append(s) 1075 | # working_name = '{}.working'.format(s) 1076 | # shutil.copy(s, working_name) 1077 | # working_subsets.append(working_name) 1078 | for f, s in zip(INPUT_FILES, working_subsets): 1079 | if s != '.': 1080 | subset_index[f] = s 1081 | else: 1082 | subset_index = None 1083 | 1084 | 1085 | alignment_pairs = build_pair_index( 1086 | INPUT_FILES, subset_index, subset_tag='subset', subset_only=False, use_hash=USE_HASHES) 1087 | 1088 | alignments = get_alignments( 1089 | alignment_pairs, 1090 | ALIGN_PROG, 1091 | RUN_TYPE, 1092 | EXTRA_ARGS, 1093 | overwrite=OVERWRITE, 1094 | file_dirs=ALIGNMENT_SOURCE_DIRS, 1095 | blast_file=BLAST_FILE, 1096 | use_hash=USE_HASHES 1097 | ) 1098 | 1099 | # hit_index, residual_index = build_hit_indices(alignments, args, subset_index=subset_index) 1100 | hit_index = build_hit_indices(alignments, args) 1101 | residual_index = get_residual_hits(hit_index, subset_index) 1102 | 1103 | if residual_index: 1104 | residual_pairs = build_pair_index( 1105 | INPUT_FILES, 1106 | residual_index, 1107 | subset_tag='residual', 1108 | subset_only=True, 1109 | use_hash=USE_HASHES 1110 | ) 1111 | 1112 | residual_alignments = get_alignments( 1113 | residual_pairs, 1114 | ALIGN_PROG, 1115 | RUN_TYPE, 1116 | EXTRA_ARGS, 1117 | overwrite=OVERWRITE, 1118 | file_dirs=ALIGNMENT_SOURCE_DIRS, 1119 | blast_file=BLAST_FILE, 1120 | use_hash=USE_HASHES 1121 | ) 1122 | 1123 | residual_hit_index = build_hit_indices( 1124 | residual_alignments, args 1125 | ) 1126 | 1127 | for k, v in residual_hit_index.items(): 1128 | hit_index[k].update(v) 1129 | 1130 | # get all unique pairs (e.g. 
(0, 1) but not (1, 0)), 1131 | # to then iterate over only the forward pairs and get 1132 | # reciprocal hits for each pair 1133 | unique_pairs = set([tuple(sorted(e)) for e in hit_index]) 1134 | 1135 | pairwise_reciprolog_sets = [] 1136 | for p in unique_pairs: 1137 | rv_key = tuple(sorted(p, reverse=True)) 1138 | top_fwd = hit_index[p] 1139 | top_rv = hit_index[rv_key] 1140 | q_name, s_name = p 1141 | reciprolog_set = get_reciprocals(top_fwd, top_rv, q_name, s_name) 1142 | if ONE_TO_ONE is True: 1143 | reciprolog_set = remove_many_to_one(reciprolog_set) 1144 | 1145 | # remove score info 1146 | reciprolog_set = [tuple(x[0:2]) for x in reciprolog_set] 1147 | pairwise_reciprolog_sets.append(reciprolog_set) 1148 | 1149 | if CHAIN: 1150 | reciprologs = aggregate_orthos_chained(pairwise_reciprolog_sets, USE_GRAPH) 1151 | else: 1152 | reciprologs = aggregate_orthos_strict(pairwise_reciprolog_sets, USE_GRAPH) 1153 | 1154 | reciprologs = clean_reciprologs(reciprologs, subset_index=subset_index) 1155 | 1156 | basename = '-'.join(sorted([abbreviate(f, use_hash=USE_HASHES) for f in INPUT_FILES])) 1157 | 1158 | if len(reciprologs) == 0: 1159 | out_string = '' 1160 | else: 1161 | if OUTPUT_FILE: 1162 | if OUTPUT_FILE == 'stdout': 1163 | out = sys.stdout 1164 | out_string = '' 1165 | else: 1166 | out = open(OUTPUT_FILE, 'w') 1167 | out_string = ': {}'.format(OUTPUT_FILE) 1168 | else: 1169 | OUTPUT_FILE = '{}.{}.reciprologs'.format(basename, RUN_TYPE) 1170 | out = open(OUTPUT_FILE, 'w') 1171 | out_string = ': {}'.format(OUTPUT_FILE) 1172 | 1173 | for group in reciprologs: 1174 | out.write('\t'.join(group) + '\n') 1175 | 1176 | if OUTPUT_FILE != 'stdout': 1177 | out.close() 1178 | 1179 | runtime = get_runtime(t_start) 1180 | 1181 | print( 1182 | '[#] Job finished in {}; {} reciprolog sets found{}' 1183 | .format(runtime, len(reciprologs), out_string), 1184 | file=sys.stderr) 1185 | 1186 | sys.exit(0) 1187 | --------------------------------------------------------------------------------