├── pyproject.toml ├── LICENSE ├── README.md └── reciprologs /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "reciprologs" 3 | version = "1.1.1" 4 | description = "" 5 | authors = ["Graham Larue "] 6 | license = "MIT" 7 | readme = "README.md" 8 | 9 | [tool.poetry.dependencies] 10 | python = "^3.8" 11 | biogl = "^2.3.0" 12 | networkx = "^3.3" 13 | palign = {git = "https://github.com/glarue/palign.git"} 14 | 15 | 16 | [build-system] 17 | requires = ["poetry-core"] 18 | build-backend = "poetry.core.masonry.api" 19 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Graham Larue 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ### Dependencies 2 | 3 | #### `biogl` 4 | 5 | This script needs the [biogl](https://github.com/glarue/biogl) module to function properly. If you use (or can get) `pip`, you can simply do 6 | 7 | ```python3 -m pip install biogl``` 8 | 9 | to add the package to a location reachable by your Python installation. 10 | 11 | Otherwise, you can clone the `biogl` repo and source it locally (to run from anywhere, you'll need to add it to your `PYTHONPATH` environment variable, a process that varies by OS): 12 | 13 | ```git clone https://github.com/glarue/biogl.git``` 14 | 15 | #### `palign` 16 | 17 | This script also requires [palign](https://github.com/glarue/palign) to run the alignment steps (using DIAMOND or BLAST), which you can clone and then add to your `PATH`: 18 | 19 | ```git clone https://github.com/glarue/palign.git``` 20 | 21 | #### Recommended: `networkx` 22 | 23 | In use cases with more than two files, `reciprologs` builds a graph representation of all of the reciprocal best hits (RBH), which allows construction of maximal cliques where every member is an RBH of every other member (e.g. `A-B, A-C, B-C, A-D --> A-B-C, A-D`). In order to do this _efficiently_, the Python package [networkx](https://networkx.org/) is needed (via `pip` or otherwise). If you want less-strict requirements for clustering (e.g. `A-B, A-C, A-D --> A-B-C-D`), see the `--chain` argument.
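As a rough illustration (not part of `reciprologs` itself, but mirroring the `networkx` calls it makes internally), the two clustering behaviors differ as follows:

```python
# Sketch: strict (maximal-clique) clustering vs. --chain (connected-component)
# clustering of the same reciprocal-best-hit pairs, using networkx directly.
import networkx as nx

# RBH pairs: A-B, A-C, B-C, A-D
graph = nx.Graph()
graph.add_edges_from([("A", "B"), ("A", "C"), ("B", "C"), ("A", "D")])

# default: maximal cliques -- every member is an RBH of every other member
print([sorted(c) for c in nx.find_cliques(graph)])
# e.g. [['A', 'B', 'C'], ['A', 'D']] (clique order may vary)

# --chain: connected components -- members may be linked only indirectly
print([sorted(c) for c in nx.connected_components(graph)])
# [['A', 'B', 'C', 'D']]
```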
24 | 25 | ### Usage info 26 | 27 | ``` 28 | usage: reciprologs [-h] [-p PARALLEL_PROCESSES] [-q PERCENTAGE] [--chain] 29 | [--subset subset_1 [subset_2 ...]] [--ignore_same_id] 30 | [--ignore_same_prefix ] [-o [OUTPUT]] 31 | [-d path [path ...]] [-b BLAST_FILE] [--overwrite] 32 | [--one_to_one] [--logging] [--no_hash_tag] 33 | file_1 file_2 ... [file_1 file_2 ... ...] 34 | {diamondp,diamondx,blastn,blastp,blastx,tblastn,tblastx} 35 | 36 | Find reciprocal best hits between two or more files. Any unrecognized 37 | arguments will be passed along to the chosen alignment program. 38 | 39 | positional arguments: 40 | file_1 file_2 ... files to use to build reciprolog sets (space 41 | separated) 42 | {diamondp,diamondx,blastn,blastp,blastx,tblastn,tblastx} 43 | type of alignment program to run 44 | 45 | optional arguments: 46 | -h, --help show this help message and exit 47 | -p PARALLEL_PROCESSES, --parallel_processes PARALLEL_PROCESSES 48 | run the alignment step using multiple parallel 49 | processes (default: 1) 50 | -q PERCENTAGE, --query_percentage_threshold PERCENTAGE 51 | require a specified fraction of the query length to 52 | match in order for a hit to qualify (lowest 53 | allowable percentage (default: None) 54 | --chain cluster reciprologs without requiring all-by-all 55 | pairwise relationships, e.g. A-B, A-C, A-D --> A-B- 56 | C-D (default: False) 57 | --subset subset_1 [subset_2 ...] 58 | Files containing subsets of headers to be used as 59 | queries for each input file. Supplied in the same 60 | order as the input files; one header per line. To 61 | omit a subset file for a given input file, provide 62 | "." as the argument, e.g. for three input files 63 | with only 1 & 3 with subsets: --subsets subset_1 . 64 | subset_2 (default: None) 65 | --ignore_same_id ignore hits where both query and subject have 66 | identical IDs (default: False) 67 | --ignore_same_prefix 68 | ignore hits where both query and subject have 69 | identical prefixes, where the prefix for each ID is 70 | delimited by the specified 71 | (default: None) 72 | -o [OUTPUT], --output [OUTPUT] 73 | output filename (use flag without argument for 74 | auto-naming; omit flag to use stdout) (default: 75 | stdout) 76 | -d path [path ...], --alignment_source_directory path [path ...] 77 | check for existing alignment files to use in this 78 | directory first (default: None) 79 | -b BLAST_FILE, --blast_file BLAST_FILE 80 | aggregated BLAST output to use (both directions) 81 | (default: None) 82 | --overwrite overwrite existing output files (instead of using 83 | them to bypass alignment step) (default: False) 84 | --one_to_one remove any many-to-one reciprolog relationships in 85 | each pairwise set, such that each member of each 86 | pairwise comparison is only present exactly one 87 | time in output (default: False) 88 | --logging output a log of best-hit choice criteria (default: 89 | False) 90 | --no_hash_tag do not auto-tag output files with MD5 hashes of 91 | source files (default: False) 92 | 93 | ``` 94 | -------------------------------------------------------------------------------- /reciprologs: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # the above sources Python from $PATH 3 | ##!/usr/local/bin/python3 4 | ##!/usr/bin/python3 5 | # the above uses specific Python version; allows script name in top 6 | 7 | # authorship information 8 | __author__ = 'Graham E. Larue' 9 | __maintainer__ = "Graham E. 
Larue" 10 | __email__ = 'egrahamlarue@gmail.com' 11 | __license__ = 'GPL' 12 | 13 | """ 14 | usage: reciprologs [-h] [-p PARALLEL_PROCESSES] [-q PERCENTAGE] [--chain] 15 | [--subset subset_1 [subset_2 ...]] [--ignore_same_id] 16 | [--ignore_same_prefix ] [-o [OUTPUT]] 17 | [-d path [path ...]] [-b BLAST_FILE] [--overwrite] 18 | [--one_to_one] [--logging] [--no_hash_tag] 19 | file_1 file_2 ... [file_1 file_2 ... ...] 20 | {diamondp,diamondx,blastn,blastp,blastx,tblastn,tblastx} 21 | 22 | Find reciprocal best hits between two or more files. Any unrecognized 23 | arguments will be passed along to the chosen alignment program. 24 | 25 | positional arguments: 26 | file_1 file_2 ... files to use to build reciprolog sets (space 27 | separated) 28 | {diamondp,diamondx,blastn,blastp,blastx,tblastn,tblastx} 29 | type of alignment program to run 30 | 31 | optional arguments: 32 | -h, --help show this help message and exit 33 | -p PARALLEL_PROCESSES, --parallel_processes PARALLEL_PROCESSES 34 | run the alignment step using multiple parallel 35 | processes (default: 1) 36 | -q PERCENTAGE, --query_percentage_threshold PERCENTAGE 37 | require a specified fraction of the query length to 38 | match in order for a hit to qualify (lowest 39 | allowable percentage (default: None) 40 | --chain cluster reciprologs without requiring all-by-all 41 | pairwise relationships, e.g. A-B, A-C, A-D --> A-B- 42 | C-D (default: False) 43 | --subset subset_1 [subset_2 ...] 44 | Files containing subsets of headers to be used as 45 | queries for each input file. Supplied in the same 46 | order as the input files; one header per line. To 47 | omit a subset file for a given input file, provide 48 | "." as the argument, e.g. for three input files 49 | with only 1 & 3 with subsets: --subsets subset_1 . 50 | subset_2 (default: None) 51 | --ignore_same_id ignore hits where both query and subject have 52 | identical IDs (default: False) 53 | --ignore_same_prefix 54 | ignore hits where both query and subject have 55 | identical prefixes, where the prefix for each ID is 56 | delimited by the specified 57 | (default: None) 58 | -o [OUTPUT], --output [OUTPUT] 59 | output filename (use flag without argument for 60 | auto-naming) (default: stdout) 61 | -d path [path ...], --alignment_source_directory path [path ...] 
62 | check for existing alignment files to use in this 63 | directory first (default: None) 64 | -b BLAST_FILE, --blast_file BLAST_FILE 65 | aggregated BLAST output to use (both directions) 66 | (default: None) 67 | --overwrite overwrite existing output files (instead of using 68 | them to bypass alignment step) (default: False) 69 | --one_to_one remove any many-to-one reciprolog relationships in 70 | each pairwise set, such that each member of each 71 | pairwise comparison is only present exactly one 72 | time in output (default: False) 73 | --logging output a log of best-hit choice criteria (default: 74 | False) 75 | --no_hash_tag do not auto-tag output files with MD5 hashes of 76 | source files (default: False) 77 | 78 | NOTE: Depends on palign 79 | 80 | """ 81 | import sys 82 | import subprocess 83 | import os 84 | import time 85 | import argparse 86 | import re 87 | import shutil 88 | from operator import itemgetter 89 | from multiprocessing import cpu_count 90 | from collections import defaultdict 91 | from itertools import combinations, permutations 92 | from biogl import fasta_parse, get_runtime 93 | from hashlib import md5 94 | 95 | # use networkx library for fast ortholog clustering if available 96 | try: 97 | import networkx as nx 98 | USE_GRAPH = True 99 | except ModuleNotFoundError: 100 | USE_GRAPH = False 101 | 102 | 103 | def parse_blast_line(bl, *args): 104 | """ 105 | Returns info from certain columns in a tab-separated BLAST 106 | output file. $args may be: query, subject, length, e, bitscore 107 | 108 | """ 109 | columns = bl.strip().split("\t") 110 | ( 111 | query, subject, length, 112 | e_value, bitscore 113 | ) = itemgetter(0, 1, 3, 10, 11)(columns) 114 | arg_map = { 115 | "query": query, 116 | "subject": subject, 117 | "length": int(length) - 1, # seem to be off by 1 in BLAST output 118 | "e": float(e_value), 119 | "bitscore": float(bitscore) 120 | } 121 | results = [] 122 | for a in args: 123 | results.append(arg_map[a]) 124 | if len(results) == 1: 125 | results = results[0] 126 | return results 127 | 128 | 129 | def is_better(challenger, defender, seq_lengths=None): 130 | """ 131 | Compares attributes of two dictionaries of BLAST 132 | hits for a given query to determine which is better. 133 | 134 | Returns the winning dictionary and reason if it's 135 | better, otherwise False. 
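    Tiebreaking order: bitscore, then e-value, then subject sequence
    length (the last only when seq_lengths is supplied).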
136 | 137 | """ 138 | cbs = challenger['score'] 139 | dbs = defender['score'] 140 | # criteria: bitscore 141 | if cbs < dbs: 142 | return False 143 | elif cbs > dbs: 144 | return challenger, 'bitscore' 145 | elif cbs == dbs: 146 | # criteria --> e-value 147 | cev = challenger['evalue'] 148 | dev = defender['evalue'] 149 | if cev < dev: # lower is better 150 | return challenger, 'e-value' 151 | elif seq_lengths is not None: 152 | # criteria --> length 153 | # if scores are equal, check if sequence lengths 154 | # have been provided as an additional tiebreaking 155 | # criteria and look up the subject length to 156 | # see if there's a difference 157 | dn = defender['name'] 158 | cn = challenger['name'] 159 | try: 160 | if seq_lengths[cn] > seq_lengths[dn]: 161 | return challenger, 'length' 162 | except KeyError: 163 | return False 164 | else: 165 | return False 166 | else: 167 | return False 168 | 169 | 170 | def get_prefix(seq_id, delimiter): 171 | split_list = re.split(delimiter, seq_id, maxsplit=1) 172 | split_list = [s for s in split_list if s] 173 | 174 | return split_list[0] 175 | 176 | 177 | def get_top_hits( 178 | blast, 179 | paralogs=False, 180 | query_match=None, 181 | seq_lengths=None, 182 | ignore_same_id=False, 183 | ignore_same_prefix=False, 184 | query_list=None): 185 | results = {} 186 | # dictionary to store tie-broken matches 187 | win_ledger = defaultdict(lambda: defaultdict(set)) 188 | with open(blast) as blst: 189 | for l in blst: 190 | new_best_hit = False 191 | if l.startswith("#"): 192 | continue 193 | (q, s, score, length, evalue) = parse_blast_line( 194 | l, "query", "subject", "bitscore", "length", "e") 195 | challenger = { 196 | 'name': s, 197 | 'score': score, 198 | 'evalue': evalue, 199 | 'length': length 200 | } 201 | if query_list and q not in query_list: 202 | continue 203 | 204 | # do not consider hits to self if BLASTing against self, 205 | # but allow query/subject names to be the same 206 | if q == s and (paralogs is True or ignore_same_id is True): 207 | continue 208 | if ignore_same_prefix is not None: 209 | prefix = ignore_same_prefix 210 | if get_prefix(q, prefix) == get_prefix(s, prefix): 211 | continue 212 | if query_match: 213 | # use query_match dictionary to compare query lengths to 214 | # match lengths to exclude matches where query percentage 215 | # is below query_match_threshold key 216 | fraction = (length / query_match[q]) * 100 217 | if fraction < query_match['query_match_threshold']: 218 | continue 219 | if q in results: 220 | defender = results[q] 221 | challenger_wins = is_better( 222 | challenger, defender, seq_lengths) 223 | if challenger_wins: # new hit is better 224 | new_best_hit = True 225 | defender_name = results[q]['name'] 226 | reason = challenger_wins[1] 227 | loser_info = (defender_name, reason) 228 | win_ledger[q]['losers'].add(loser_info) 229 | win_ledger[q]['best'] = s 230 | 231 | else: 232 | new_best_hit = True 233 | 234 | if new_best_hit is True: 235 | results[q] = { 236 | "name": s, 237 | "score": score, 238 | "evalue": evalue, 239 | "length": length 240 | } 241 | 242 | return results, win_ledger 243 | 244 | 245 | def get_reciprocals(d1, d2, tag1, tag2): 246 | """ 247 | Takes two dictionaries of top BLAST hits, 248 | returns a list of tuples of all pairs that were 249 | reciprocal best hits, along with their bitscore 250 | values. 
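    Each element has the form ((tag_a, id_a), (tag_b, id_b),
    (bitscore_1, bitscore_2)).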
251 | 252 | """ 253 | reciprologs = set() 254 | blast_dicts = [(d1, tag1), (d2, tag2)] 255 | blast_permuts = permutations(blast_dicts, 2) 256 | for (first, first_tag), (second, second_tag) in blast_permuts: 257 | for query, hit_info in first.items(): 258 | best_hit = hit_info["name"] 259 | score = hit_info["score"] 260 | if best_hit in second: 261 | reciprocal_hit = second[best_hit]["name"] 262 | if query == reciprocal_hit: # best hit refers back to query 263 | r_score = second[best_hit]["score"] 264 | query = (first_tag, query) 265 | best_hit = (second_tag, best_hit) 266 | hit_pair = sorted([query, best_hit]) 267 | score_tuple = tuple(sorted([score, r_score])) 268 | hit_pair.append(score_tuple) 269 | reciprologs.add(tuple(hit_pair)) 270 | 271 | return sorted(reciprologs) 272 | 273 | 274 | def clean_reciprologs(reciprologs, subset_index=None): 275 | cleaned = [] 276 | for group in reciprologs: 277 | if subset_index and not any(m[1] in subset_index[m[0]] for m in group): 278 | continue 279 | # remove file tags from tuples 280 | clean_group = [m[1] for m in group] 281 | cleaned.append(clean_group) 282 | 283 | return cleaned 284 | 285 | 286 | def file_md5(fn, buffer_size=65536): 287 | hash = md5() 288 | with open(fn, 'rb') as f: 289 | while True: 290 | data = f.read(buffer_size) 291 | if not data: 292 | break 293 | hash.update(data) 294 | 295 | return hash.hexdigest() 296 | 297 | 298 | def abbreviate(name, delimiter=".", use_hash=True, keep_path=False): 299 | local_name = os.path.basename(name) # in case of non-local file path 300 | abbreviation = local_name.split(delimiter)[0] 301 | if use_hash is True: # use shortened md5 hash to uniqueify name 302 | hash = file_md5(name)[:5] 303 | abbreviation = abbreviation + delimiter + hash 304 | 305 | if keep_path is True: 306 | file_path = os.path.dirname(os.path.abspath(name)) 307 | abbreviation = os.path.join(file_path, abbreviation) 308 | 309 | return abbreviation 310 | 311 | 312 | def unique_filenames(*file_list, skip=None, use_hash=True, keep_path=False): 313 | if skip is not None: 314 | abbreviated = [ 315 | os.path.basename(f) if f in skip 316 | else abbreviate(f, use_hash=use_hash) 317 | for f in file_list 318 | ] 319 | else: 320 | abbreviated = [os.path.basename(f) for f in file_list] 321 | if len(set(abbreviated)) < len(abbreviated): # not all are unique 322 | abbreviated = [os.path.basename(f) for f in file_list] 323 | 324 | if keep_path is True: # add parent directory paths 325 | dirpaths = [os.path.dirname(f) for f in file_list] 326 | abbreviated = [ 327 | os.path.join(p, f) for p, f in zip(dirpaths, abbreviated) 328 | ] 329 | 330 | return abbreviated 331 | 332 | 333 | def concatenate(outname, file_list, clean=True): 334 | with open(outname, 'w') as outfile: 335 | for fn in file_list: 336 | with open(fn) as f: 337 | for l in f: 338 | outfile.write(l) 339 | if clean: 340 | [os.remove(fn) for fn in file_list] 341 | 342 | 343 | def parse_run_type(align_type_arg): 344 | type_map = { 345 | 'diamondp': ('diamond', 'blastp'), 346 | 'diamondx': ('diamond', 'blastx'), 347 | 'blastn': ('blast', 'blastn'), 348 | 'blastp': ('blast', 'blastp'), 349 | 'blastx': ('blast', 'blastx'), 350 | 'tblastn': ('blast', 'tblastn'), 351 | 'tblastx': ('blast', 'tblastx'), 352 | } 353 | 354 | return type_map[align_type_arg] 355 | 356 | 357 | def aggregate_dict_chained(ortho_dict): 358 | """ 359 | IN: 360 | defaultdict(set, 361 | {'a': {'b', 'c', 'd'}, 362 | 'b': {'a', 'c', 'e', 'f'}, 363 | 'c': {'a', 'b', 'e', 'f', 'g'}, 364 | 'd': {'a'}, 365 | 'e': {'b', 'c'}, 366 | 
'f': {'b', 'c'}, 367 | 'g': {'c'}, 368 | 'w': {'z'}, 369 | 'x': {'z'}, 370 | 'y': {'z'}, 371 | 'z': {'w', 'x', 'y'}}) 372 | OUT: 373 | defaultdict(set, {'a': {'b', 'c', 'd', 'e', 'f', 'g'}, 'z': {'w', 'x', 'y'}}) 374 | 375 | """ 376 | changed = False 377 | processed = [] 378 | master = defaultdict(set) 379 | for k, v in ortho_dict.items(): 380 | if k in processed: 381 | continue 382 | processed.append(k) 383 | for v2 in v: 384 | if v2 == k: 385 | continue 386 | master[k].add(v2) 387 | processed.append(v2) 388 | if v2 not in ortho_dict: 389 | continue 390 | changed = True 391 | master[k].update(ortho_dict[v2]) 392 | if changed is True: 393 | master = aggregate_dict_chained(master) 394 | 395 | return master 396 | 397 | 398 | def aggregate_orthos_chained(orthos, use_graph=False): 399 | """ 400 | IN: 401 | [ 402 | [('a', 'b'), ('a', 'c'), ('a', 'd')], 403 | [('b', 'c'), ('b', 'e'), ('b', 'f')], 404 | [('c', 'e'), ('c', 'f'), ('c', 'g')], 405 | [('z', 'x'), ('z', 'y'), ('z', 'w')] 406 | ] 407 | OUT: 408 | [['a', 'b', 'c', 'd', 'e', 'f', 'g'], ['w', 'x', 'y', 'z']] 409 | 410 | """ 411 | if use_graph: 412 | ortho_groups = graph_cluster(orthos, chain=True) 413 | else: 414 | o_dict = make_ortho_dict(*orthos) 415 | aggregated = aggregate_dict_chained(o_dict) 416 | ortho_groups = [] 417 | for k, v in aggregated.items(): 418 | combined = tuple(v) + (k,) 419 | ortho_groups.append(sorted(combined)) 420 | 421 | return sorted(ortho_groups) 422 | 423 | 424 | def aggregate_orthos_strict(orthos, use_graph=False): 425 | """ 426 | IN: 427 | [ 428 | [('a', 'b'), ('a', 'c'), ('a', 'd')], 429 | [('b', 'c'), ('b', 'e'), ('b', 'f')], 430 | [('c', 'e'), ('c', 'f'), ('c', 'g')], 431 | [('z', 'x'), ('z', 'y'), ('z', 'w')] 432 | ] 433 | OUT: 434 | [ 435 | ['x', 'z'], 436 | ['y', 'z'], 437 | ['w', 'z'], 438 | ['a', 'd'], 439 | ['c', 'g'], 440 | ['b', 'c', 'f'], 441 | ['b', 'c', 'e'], 442 | ['a', 'b', 'c'] 443 | ] 444 | 445 | """ 446 | if use_graph is True: 447 | aggregated = graph_cluster(orthos) 448 | else: 449 | o_dict = make_ortho_dict(*orthos) 450 | aggregated = all_by_all_orthos(o_dict) 451 | 452 | return aggregated 453 | 454 | 455 | def all_by_all_orthos(ortho_dict): 456 | full_groups = [] 457 | for k, v in ortho_dict.items(): 458 | groups = [] 459 | max_n = len(v) 460 | # go backward in size and cull subsets as we go 461 | for i in range(max_n, 0, -1): 462 | for g in combinations(v, i): 463 | g = set(list(g) + [k]) 464 | if g in full_groups or len(g) == 1: 465 | continue 466 | if every_member_match(g, ortho_dict): 467 | if any(og.issuperset(g) for og in full_groups): 468 | continue 469 | full_groups.append(g) 470 | 471 | return sorted([sorted(g) for g in full_groups]) 472 | 473 | 474 | def every_member_match(members, m_dict): 475 | all_match = True 476 | for m in members: 477 | others = [e for e in members if e != m] 478 | if not others: 479 | return True 480 | if any(m not in m_dict[o] for o in others): 481 | return False 482 | 483 | return all_match 484 | 485 | 486 | def make_ortho_dict(*orthos): 487 | """ 488 | IN: 489 | [ 490 | [('a', 'b'), ('a', 'c'), ('a', 'd')], 491 | [('b', 'c'), ('b', 'e'), ('b', 'f')], 492 | [('c', 'e'), ('c', 'f'), ('c', 'g')], 493 | [('z', 'x'), ('z', 'y'), ('z', 'w')] 494 | ] 495 | OUT: 496 | defaultdict(set, 497 | {'a': {'b', 'c', 'd'}, 498 | 'b': {'a', 'c', 'e', 'f'}, 499 | 'c': {'a', 'b', 'e', 'f', 'g'}, 500 | 'd': {'a'}, 501 | 'e': {'b', 'c'}, 502 | 'f': {'b', 'c'}, 503 | 'g': {'c'}, 504 | 'w': {'z'}, 505 | 'x': {'z'}, 506 | 'y': {'z'}, 507 | 'z': {'w', 'x', 'y'}}) 508 | 509 | 
""" 510 | collector = defaultdict(set) 511 | for o_list in orthos: 512 | for pair in o_list: 513 | for a, b in permutations(pair, 2): 514 | collector[a].add(b) 515 | 516 | return collector 517 | 518 | 519 | def names_from_blastfile(blast_fn): 520 | file_pattern = r'(.+)-vs-(.+)\.t?blast[npx]' 521 | query_fn, subject_fn = re.findall(file_pattern, blast_fn)[0] 522 | 523 | return query_fn, subject_fn 524 | 525 | 526 | def make_subset(fasta, output_fn, keep_file=None, keep_list=None): 527 | if not (keep_file or keep_list): 528 | print( 529 | '[!] Cannot make subset for {} - aborting'.format(fasta), 530 | file=sys.stderr 531 | ) 532 | sys.exit(1) 533 | keep_list_set = set() 534 | keep_file_set = set() 535 | if keep_list is not None: 536 | for e in keep_list: 537 | keep_list_set.add(e) 538 | if keep_file is not None: 539 | with open(keep_file) as f: 540 | for l in f: 541 | keep_file_set.add(l.strip()) 542 | # get the union of the two sets of headers to include any new additions 543 | keep_set = keep_list_set | keep_file_set 544 | # write combined set to new if there are entries in 545 | # that were not in the original 546 | new_keeps = len(keep_set) - len(keep_file_set) 547 | if new_keeps > 0 and keep_file is not None: 548 | print( 549 | '[#] Updating {} with {} new entries'.format(keep_file, new_keeps) 550 | , file=sys.stderr) 551 | with open(keep_file, 'w') as new_keep_file: 552 | for h in keep_set: 553 | new_keep_file.write(h + '\n') 554 | kept = 0 555 | with open(output_fn, 'w') as out: 556 | for h, s in fasta_parse(fasta, trim_header=False): 557 | trunc_header = h.split()[0] 558 | if trunc_header in keep_set: 559 | record = '>{}\n{}\n'.format(h, s) 560 | out.write(record) 561 | kept += 1 562 | 563 | return output_fn, kept 564 | 565 | 566 | def list_hash(string_list, length=3, sort_first=True): 567 | hash = md5() 568 | if sort_first is True: 569 | string_list = sorted(string_list) 570 | for s in string_list: 571 | hash.update(s.encode()) 572 | 573 | return hash.hexdigest()[:length] 574 | 575 | 576 | def subset_name(fn, file_tag='subset', use_hash=True, keep_path=False): 577 | if type(file_tag) is not str or not file_tag: 578 | file_tag = '' 579 | else: 580 | file_tag = '_{}'.format(file_tag) 581 | out_fn = '{}{}.fa'.format( 582 | abbreviate(fn, use_hash=use_hash, keep_path=keep_path), file_tag 583 | ) 584 | 585 | return out_fn 586 | 587 | 588 | def align(aligner, query, subject, run_type, output_name, extra_args=None): 589 | if extra_args is None: 590 | extra_args = [] 591 | aligner_args = [ 592 | aligner, 593 | query, 594 | subject, 595 | run_type, 596 | '--output_name', 597 | output_name 598 | ] 599 | result = subprocess.run(aligner_args + extra_args) 600 | 601 | return result 602 | 603 | 604 | def alignment_filenames(query, subject, run_type, use_hash=True): 605 | run_files = {} 606 | 607 | # when subsetting is used, this hack will produce more obvious filenames 608 | # that reflect which alignments were of subsets and which weren't 609 | special_case_tags = ['_residual', '_subset'] 610 | 611 | no_abbrev = [] 612 | for f in [query, subject]: 613 | if any(t in f for t in special_case_tags): 614 | no_abbrev.append(f) 615 | 616 | fw_names = unique_filenames(query, subject, skip=no_abbrev, use_hash=use_hash) 617 | rv_names = unique_filenames(subject, query, skip=no_abbrev, use_hash=use_hash) 618 | 619 | run_files['forward'] = '{}-vs-{}.{}'.format(*fw_names, run_type) 620 | run_files['reverse'] = '{}-vs-{}.{}'.format(*rv_names, run_type) 621 | 622 | return run_files 623 | 624 | 625 | def 
seq_lengths(fasta): 626 | l = {} 627 | for h, s in fasta_parse(fasta): 628 | l[h] = len(s) 629 | 630 | return l 631 | 632 | 633 | def remove_many_to_one(pairs): 634 | """ 635 | Each element of >pairs< is a tuple: (hitA, hitB, (scoreX, scoreY)) 636 | 637 | Takes a list of paired reciprocal hits (plus scores) and filters it 638 | such that each member of each pair only occur once, i.e. it removes 639 | any many-to-one hits, using the bitscores in the last element of 640 | each tuple. 641 | 642 | """ 643 | uniques = {} 644 | to_remove = [] 645 | for index, (a, b, scores) in enumerate(pairs): 646 | avg_score = sum(scores) / 2 647 | for e in [a, b]: 648 | if e not in uniques: 649 | uniques[e] = {'score': avg_score, 'index': index} 650 | elif uniques[e]['score'] >= avg_score: 651 | to_remove.append(index) 652 | continue 653 | else: 654 | to_remove.append(uniques[e]['index']) 655 | uniques[e] = {'score': avg_score, 'index': index} 656 | 657 | filtered = [] 658 | for i, p in enumerate(pairs): 659 | if i in to_remove: 660 | names = p[0:2] 661 | print('Removed: {}'.format('\t'.join(names)), file=sys.stderr) 662 | else: 663 | filtered.append(p) 664 | 665 | return filtered 666 | 667 | 668 | def graph_cluster(pairwise_sets, chain=False): 669 | graph = nx.Graph() 670 | for p_set in pairwise_sets: 671 | for pair in p_set: 672 | graph.add_edge(*pair) 673 | if chain: 674 | clusters = nx.connected_components(graph) 675 | else: 676 | clusters = nx.find_cliques(graph) 677 | clusters = [sorted(c) for c in clusters] 678 | 679 | return sorted(clusters, key=len) 680 | 681 | 682 | def subset_size_check(input_file, subset, kept_n): 683 | if kept_n == 0: 684 | print('[!] Subset size of {} = 0; aborting.'.format(subset)) 685 | sys.exit(1) 686 | else: 687 | print( 688 | '[#] Subset size for {}: {}'.format(input_file, kept_n), 689 | file=sys.stderr 690 | ) 691 | 692 | 693 | def log_ledger(q, s, run_type, win_ledger, use_hash=True): 694 | ledger_file = '{}-{}.{}.log'.format( 695 | abbreviate(q, use_hash=use_hash), abbreviate(s, use_hash=use_hash), run_type) 696 | with open(ledger_file, 'w') as lf: 697 | for query, info in sorted(win_ledger.items()): 698 | winner = info['best'] 699 | loser_tuples = info['losers'] 700 | lf.write('>{}\t[{}]\n'.format(winner, query)) 701 | for loser in sorted(loser_tuples): 702 | lf.write('\t'.join(loser) + '\n') 703 | 704 | 705 | def get_alignments( 706 | pairs_index, 707 | aligner, 708 | run_type, 709 | extra, 710 | overwrite=False, 711 | file_dirs=None, 712 | blast_file=None, 713 | use_hash=True 714 | ): 715 | """ 716 | is a dictionary of the form {label: (q, s), ...} 717 | 718 | """ 719 | # get reciprologs for each pairwise permutation of files [(1, 2), (2, 1), ...] 720 | alignment_index = {} 721 | for alignment_key, pair in pairs_index.items(): 722 | q, s = pair 723 | # this block might want to move to above the preceding if block...? 724 | if blast_file: 725 | alignment_index[alignment_key] = blast_file 726 | continue 727 | align_fn = alignment_filenames(q, s, run_type, use_hash=use_hash)['forward'] 728 | if file_dirs: 729 | for d in file_dirs: 730 | file_list = os.listdir(d) 731 | if align_fn in file_list: 732 | align_fn = os.path.join( 733 | os.path.abspath(d), align_fn) 734 | break 735 | if not os.path.isfile(align_fn) or overwrite is True: 736 | alignment = align( 737 | aligner, q, s, run_type, align_fn, extra_args=extra 738 | ) 739 | if alignment.returncode != 0: 740 | sys.exit( 741 | '[!] 
ERROR: alignment failed: {} (return code: {})' 742 | .format(align_fn, alignment.returncode) 743 | ) 744 | else: 745 | print( 746 | '[#] Using existing output \'{}\''.format(align_fn), 747 | file=sys.stderr 748 | ) 749 | alignment_index[alignment_key] = align_fn 750 | 751 | return alignment_index 752 | 753 | 754 | def align_residuals(alignments, aligner, residual_index, args, overwrite=False, use_hash=True): 755 | alignment_index = {} 756 | run_type = args.run_type 757 | extra = args.extra 758 | for aln_key, aln in sorted(alignments.items(), key=lambda x: sorted(x[0])): 759 | q, s = aln_key 760 | if q not in residual_index: 761 | continue 762 | r_sub_set = residual_index[q] 763 | r_sub_fn = subset_name(q, file_tag='residual', use_hash=use_hash) 764 | r_sub, kept_n = make_subset(q, r_sub_fn, keep_list=r_sub_set) 765 | subset_size_check(q, r_sub, kept_n) 766 | align_fn = alignment_filenames(q, s, run_type, use_hash=use_hash)['forward'] 767 | if not os.path.isfile(align_fn) or overwrite is True: 768 | residual_aln = align( 769 | aligner, r_sub, s, run_type, align_fn, extra_args=extra) 770 | if residual_aln.returncode != 0: 771 | sys.exit( 772 | '[!] ERROR: alignment failed: {} (return code: {})' 773 | .format(align_fn, residual_aln.returncode) 774 | ) 775 | else: 776 | print( 777 | '[#] Using existing output \'{}\''.format(align_fn), 778 | file=sys.stderr 779 | ) 780 | alignment_index[aln_key] = align_fn 781 | 782 | return alignment_index 783 | 784 | 785 | def build_pair_index( 786 | input_files, 787 | subset_index, 788 | subset_tag=None, 789 | subset_only=False, 790 | use_hash=True 791 | ): 792 | pairs = {} 793 | made = set() 794 | for q, s in permutations(sorted(input_files), 2): 795 | alignment_key = (q, s) 796 | paralogs = q == s 797 | if subset_index and subset_index.get(q) and not paralogs: 798 | # subset files are getting tagged twice—probably need to move 799 | # tagging to a different function or something 800 | sub_set = subset_index[q] 801 | subset_hash = list_hash(sub_set) 802 | hash_tag = f'{subset_tag}_h{subset_hash}' 803 | sub_fn = subset_name( 804 | q, file_tag=hash_tag, use_hash=use_hash 805 | ) 806 | 807 | # check if subset file has already been made 808 | # during this loop to avoid remaking recent files 809 | # unnecessarily 810 | if q not in made: 811 | made.add(q) 812 | sub_q, kept_n = make_subset(q, sub_fn, keep_list=sub_set) 813 | subset_size_check(q, sub_q, kept_n) 814 | else: 815 | sub_q = sub_fn 816 | q = sub_q 817 | elif subset_only is True: 818 | continue 819 | p = (q, s) 820 | pairs[alignment_key] = p 821 | 822 | return pairs 823 | 824 | 825 | def get_residual_hits(hit_index, subset_index): 826 | residual_index = defaultdict(set) 827 | for pair, fwd_hits in hit_index.items(): 828 | q, s = pair 829 | if subset_index and subset_index[s] is not None: 830 | reverse_pair = (s, q) 831 | rv_query_names = set(hit_index[reverse_pair].keys()) 832 | s_hits = set(h['name'] for h in fwd_hits.values()) 833 | residual = s_hits - rv_query_names 834 | residual_index[s] |= residual 835 | print(f'residual length for {s}: {len(residual_index[s])}', file=sys.stderr) ###!!! 
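    # residual_index maps each subject file to IDs of its sequences that were
    # best hits in the forward direction but do not appear as queries in the
    # reverse alignment (e.g. because they were left out of that file's query
    # subset); these are realigned in a follow-up 'residual' run so that
    # reciprocity can still be checked for them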
836 | 837 | return residual_index 838 | 839 | 840 | def build_hit_indices(alignments, args): 841 | query_percentage = args.query_percentage_threshold 842 | blast_file = args.blast_file 843 | length_index = {} 844 | hit_index = {} 845 | residual_index = defaultdict(set) 846 | for (q, s), aln in sorted(alignments.items(), key=lambda x: sorted(x[0])): 847 | if blast_file: 848 | aln = blast_file 849 | # set flag if both files are the same 850 | paralogs = q == s 851 | if paralogs and (q, s) in hit_index: 852 | continue 853 | # add the lengths of all sequences to an index for later 854 | # tie-breaking of best hits 855 | for e in (q, s): 856 | if e not in length_index: 857 | length_index[e] = seq_lengths(e) 858 | # this is needed for the weird get_top_hits() API - might 859 | # be better as a separate arg in the future... 860 | length_index[e]['query_match_threshold'] = query_percentage 861 | # get sets of query IDs to filter alignment lines to 862 | # relevant hits (matters in case of aggregate alignment file) 863 | q_list = set(length_index[q].keys()) 864 | if query_percentage is not None: 865 | q_lengths = length_index[q] 866 | else: 867 | q_lengths = {} 868 | s_lengths = length_index[s] 869 | top_hits, win_ledger = get_top_hits( 870 | aln, 871 | paralogs, 872 | query_match=q_lengths, 873 | seq_lengths=s_lengths, 874 | ignore_same_id=args.ignore_same_id, 875 | ignore_same_prefix=args.ignore_same_prefix, 876 | query_list=q_list 877 | ) 878 | 879 | hit_index[(q, s)] = top_hits 880 | 881 | return hit_index 882 | 883 | 884 | def get_subset_set(subset_list_file): 885 | subset = set() 886 | with open(subset_list_file) as f: 887 | for l in f: 888 | subset.add(l.strip()) 889 | 890 | return subset 891 | 892 | 893 | parser = argparse.ArgumentParser( 894 | description=( 895 | 'Find reciprocal best hits between two or more files. ' 896 | 'Any unrecognized arguments will be passed along to the chosen ' 897 | 'alignment program.'), 898 | formatter_class=argparse.ArgumentDefaultsHelpFormatter) 899 | 900 | parser.add_argument( 901 | 'input_files', 902 | metavar='file_1 file_2 ...', 903 | help='files to use to build reciprolog sets (space separated)', 904 | nargs='+' 905 | ) 906 | parser.add_argument( 907 | 'run_type', 908 | choices=[ 909 | 'diamondp', 910 | 'diamondx', 911 | 'blastn', 912 | 'blastp', 913 | 'blastx', 914 | 'tblastn', 915 | 'tblastx' 916 | ], 917 | help='type of alignment program to run' 918 | ) 919 | parser.add_argument( 920 | '-p', 921 | '--parallel_processes', 922 | help=( 923 | 'run the alignment step using multiple parallel processes'), 924 | type=int, 925 | default=1 926 | ) 927 | parser.add_argument( 928 | '-q', 929 | '--query_percentage_threshold', 930 | metavar='PERCENTAGE', 931 | help=( 932 | 'require a specified fraction of the query length to match in ' 933 | 'order for a hit to qualify (lowest allowable percentage'), 934 | type=float, 935 | default=None 936 | ) 937 | parser.add_argument( 938 | '--chain', 939 | action='store_true', 940 | help=( 941 | 'cluster reciprologs without requiring all-by-all pairwise ' 942 | 'relationships, e.g. A-B, A-C, A-D --> A-B-C-D') 943 | ) 944 | parser.add_argument( 945 | '--subset', 946 | metavar=('subset_1', 'subset_2'), 947 | nargs='+', 948 | help=( 949 | 'Files containing subsets of headers to be used as queries for each ' 950 | 'input file. Supplied in the same order as the input files; one header ' 951 | 'per line. To omit a subset file for a given input file, ' 952 | 'provide "." as the argument, e.g. 
for three input files with only 1 & ' 953 | '3 with subsets: --subsets subset_1 . subset_2' 954 | ) 955 | ) 956 | parser.add_argument( 957 | '--ignore_same_id', 958 | action='store_true', 959 | help='ignore hits where both query and subject have identical IDs' 960 | ) 961 | parser.add_argument( 962 | '--ignore_same_prefix', 963 | metavar='', 964 | help=( 965 | 'ignore hits where both query and subject have identical prefixes, ' 966 | 'where the prefix for each ID is delimited by the specified ' 967 | '') 968 | ) 969 | parser.add_argument( 970 | '-o', 971 | '--output', 972 | help=( 973 | 'output filename (use flag without argument for auto-naming)' 974 | ), 975 | nargs='?', 976 | default='stdout' 977 | ) 978 | parser.add_argument( 979 | '-d', 980 | '--alignment_source_directory', 981 | help='check for existing alignment files to use in this directory first', 982 | nargs='+', 983 | metavar='path' 984 | ) 985 | parser.add_argument( 986 | '-b', 987 | '--blast_file', 988 | help='aggregated BLAST output to use (both directions)' 989 | ) 990 | parser.add_argument( 991 | '--overwrite', 992 | help=( 993 | 'overwrite existing output files ' 994 | '(instead of using them to bypass alignment step)' 995 | ), 996 | action='store_true' 997 | ) 998 | parser.add_argument( 999 | '--one_to_one', 1000 | help=( 1001 | 'remove any many-to-one reciprolog relationships in each pairwise ' 1002 | 'set, such that each member of each pairwise comparison is only ' 1003 | 'present exactly one time in output'), 1004 | action='store_true' 1005 | ) 1006 | parser.add_argument( 1007 | '--logging', 1008 | help='output a log of best-hit choice criteria', 1009 | action='store_true' 1010 | ) 1011 | parser.add_argument( 1012 | '--no_hash_tag', 1013 | help='do not auto-tag output files with MD5 hashes of source files', 1014 | action='store_true' 1015 | ) 1016 | 1017 | t_start = time.time() 1018 | 1019 | args, EXTRA_ARGS = parser.parse_known_args() 1020 | 1021 | RUN_TYPE = args.run_type 1022 | PARALLEL = args.parallel_processes 1023 | INPUT_FILES = args.input_files 1024 | if len(INPUT_FILES) < 2: 1025 | sys.exit('error: too few files specified (need >1)') 1026 | QUERY_PERCENTAGE = args.query_percentage_threshold 1027 | OVERWRITE = args.overwrite 1028 | ONE_TO_ONE = args.one_to_one 1029 | LOGGING = args.logging 1030 | CHAIN = args.chain 1031 | IGNORE_SAME_ID = args.ignore_same_id 1032 | IGNORE_SAME_PREFIX = args.ignore_same_prefix 1033 | BLAST_FILE = args.blast_file 1034 | OUTPUT_FILE = args.output 1035 | ALIGNMENT_SOURCE_DIRS = args.alignment_source_directory 1036 | SUBSET = args.subset 1037 | USE_HASHES = not args.no_hash_tag 1038 | 1039 | ALIGN_PROG = 'palign' 1040 | 1041 | if not USE_GRAPH: 1042 | print( 1043 | '[!] networkx library not found; will use brute-force method instead', 1044 | file=sys.stderr 1045 | ) 1046 | 1047 | # create a list with the flags/options to pass to the subsequence 1048 | # subprocess calls 1049 | call_options = { 1050 | '-p': PARALLEL 1051 | } 1052 | optional = EXTRA_ARGS 1053 | for k, v in call_options.items(): 1054 | if v: 1055 | optional.extend([str(k), str(v)]) 1056 | 1057 | # if --subset, generate list of subset files to match up with input files 1058 | if SUBSET and len(SUBSET) != len(INPUT_FILES): 1059 | print( 1060 | '[!] 
Must supply as many arguments to --subset as there are input files', 1061 | file=sys.stderr 1062 | ) 1063 | sys.exit(1) 1064 | 1065 | # the default None values here are used as a check later on to 1066 | # indicate which files have associated subsets 1067 | 1068 | if SUBSET: 1069 | subset_index = {f: set() for f in INPUT_FILES} 1070 | working_subsets = [] 1071 | for s in SUBSET: 1072 | if s != '.': 1073 | s = get_subset_set(s) 1074 | working_subsets.append(s) 1075 | # working_name = '{}.working'.format(s) 1076 | # shutil.copy(s, working_name) 1077 | # working_subsets.append(working_name) 1078 | for f, s in zip(INPUT_FILES, working_subsets): 1079 | if s != '.': 1080 | subset_index[f] = s 1081 | else: 1082 | subset_index = None 1083 | 1084 | 1085 | alignment_pairs = build_pair_index( 1086 | INPUT_FILES, subset_index, subset_tag='subset', subset_only=False, use_hash=USE_HASHES) 1087 | 1088 | alignments = get_alignments( 1089 | alignment_pairs, 1090 | ALIGN_PROG, 1091 | RUN_TYPE, 1092 | EXTRA_ARGS, 1093 | overwrite=OVERWRITE, 1094 | file_dirs=ALIGNMENT_SOURCE_DIRS, 1095 | blast_file=BLAST_FILE, 1096 | use_hash=USE_HASHES 1097 | ) 1098 | 1099 | # hit_index, residual_index = build_hit_indices(alignments, args, subset_index=subset_index) 1100 | hit_index = build_hit_indices(alignments, args) 1101 | residual_index = get_residual_hits(hit_index, subset_index) 1102 | 1103 | if residual_index: 1104 | residual_pairs = build_pair_index( 1105 | INPUT_FILES, 1106 | residual_index, 1107 | subset_tag='residual', 1108 | subset_only=True, 1109 | use_hash=USE_HASHES 1110 | ) 1111 | 1112 | residual_alignments = get_alignments( 1113 | residual_pairs, 1114 | ALIGN_PROG, 1115 | RUN_TYPE, 1116 | EXTRA_ARGS, 1117 | overwrite=OVERWRITE, 1118 | file_dirs=ALIGNMENT_SOURCE_DIRS, 1119 | blast_file=BLAST_FILE, 1120 | use_hash=USE_HASHES 1121 | ) 1122 | 1123 | residual_hit_index = build_hit_indices( 1124 | residual_alignments, args 1125 | ) 1126 | 1127 | for k, v in residual_hit_index.items(): 1128 | hit_index[k].update(v) 1129 | 1130 | # get all unique pairs (e.g. 
(0, 1) but not (1, 0)), 1131 | # to then iterate over only the forward pairs and get 1132 | # reciprocal hits for each pair 1133 | unique_pairs = set([tuple(sorted(e)) for e in hit_index]) 1134 | 1135 | pairwise_reciprolog_sets = [] 1136 | for p in unique_pairs: 1137 | rv_key = tuple(sorted(p, reverse=True)) 1138 | top_fwd = hit_index[p] 1139 | top_rv = hit_index[rv_key] 1140 | q_name, s_name = p 1141 | reciprolog_set = get_reciprocals(top_fwd, top_rv, q_name, s_name) 1142 | if ONE_TO_ONE is True: 1143 | reciprolog_set = remove_many_to_one(reciprolog_set) 1144 | 1145 | # remove score info 1146 | reciprolog_set = [tuple(x[0:2]) for x in reciprolog_set] 1147 | pairwise_reciprolog_sets.append(reciprolog_set) 1148 | 1149 | if CHAIN: 1150 | reciprologs = aggregate_orthos_chained(pairwise_reciprolog_sets, USE_GRAPH) 1151 | else: 1152 | reciprologs = aggregate_orthos_strict(pairwise_reciprolog_sets, USE_GRAPH) 1153 | 1154 | reciprologs = clean_reciprologs(reciprologs, subset_index=subset_index) 1155 | 1156 | basename = '-'.join(sorted([abbreviate(f, use_hash=USE_HASHES) for f in INPUT_FILES])) 1157 | 1158 | if len(reciprologs) == 0: 1159 | out_string = '' 1160 | else: 1161 | if OUTPUT_FILE: 1162 | if OUTPUT_FILE == 'stdout': 1163 | out = sys.stdout 1164 | out_string = '' 1165 | else: 1166 | out = open(OUTPUT_FILE, 'w') 1167 | out_string = ': {}'.format(OUTPUT_FILE) 1168 | else: 1169 | OUTPUT_FILE = '{}.{}.reciprologs'.format(basename, RUN_TYPE) 1170 | out = open(OUTPUT_FILE, 'w') 1171 | out_string = ': {}'.format(OUTPUT_FILE) 1172 | 1173 | for group in reciprologs: 1174 | out.write('\t'.join(group) + '\n') 1175 | 1176 | if OUTPUT_FILE != 'stdout': 1177 | out.close() 1178 | 1179 | runtime = get_runtime(t_start) 1180 | 1181 | print( 1182 | '[#] Job finished in {}; {} reciprolog sets found{}' 1183 | .format(runtime, len(reciprologs), out_string), 1184 | file=sys.stderr) 1185 | 1186 | sys.exit(0) 1187 | --------------------------------------------------------------------------------