├── CHANGELOG.md ├── LICENSE ├── README.md ├── dehumanizer ├── __init__.py ├── dehumanizer.py └── version.py └── setup.py /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | All notable changes to this project will be documented in this file. 3 | 4 | The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), 5 | and this project adheres to [ZeroVer Versioning](https://0ver.org/). 6 | 7 | ## [0.9.0] - 2021-09-02 8 | ### Added 9 | * CHANGELOG.md will document notable changes 10 | * `dehumanise --version` will report version number 11 | ### Changed 12 | * `--bam` based dehumanising checks for BAM index with pysam `AlignmentFile.has_index` to skip count pass and use `idxstats` if index is available for modest improvement 13 | * Merged the default humref, decoy and HLA references into one FASTA available for download as most users do not care about which ref causes a read to be discarded 14 | ### Fixed 15 | * Cleared up confusion surrounding pre-built mmi indexes by reintroducing manifest syntax to map each input file to a minimap2 preset 16 | * `dehumanise` will exit 65 to indicate that no references for a given preset are listed in the manifest 17 | * `dehumanise` will exit 78 to indicate that a reference in the manifest is not mapped to a minimap2 preset that matches `--preset` 18 | * `dehumanise --help` now works 19 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Sam Nicholls 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # dehumanizer 2 | Human DNA where it shouldn't be? Expunge it from your samples with the `dehumanizer`. Just point at a FASTQ or BAM that you suspect is contaminated with human DNA, and `dehumanizer` will rifle through your file, throwing your reads at as many aligning processes as you will allow, to yield a clean file, free of uninvited humans. 3 | 4 | Currently I am only supporting use of this tool on the CLIMB-COVID platform as I am in the process of making breaking changes to dehumanizer to improve performance. 5 | -------------------------------------------------------------------------------- /dehumanizer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SamStudio8/dehumanizer/2f3d3db1bb578afd33c219c9417d2e9d3b775599/dehumanizer/__init__.py -------------------------------------------------------------------------------- /dehumanizer/dehumanizer.py: -------------------------------------------------------------------------------- 1 | from multiprocessing import Process, Queue, Array 2 | import ctypes 3 | import sys 4 | import os 5 | import mappy as mp 6 | import numpy as np 7 | import pysam 8 | from datetime import datetime 9 | from datetime import date 10 | 11 | from . import version 12 | 13 | def load_manifest(path, preset): 14 | manifest = { 15 | "preset": preset, 16 | "references": [ 17 | ], 18 | } 19 | manifest_fh = open(path) 20 | for line_i, line in enumerate(manifest_fh): 21 | fields = line.strip().split() # split on any whitespace if you have whitespace in your ref name you have bigger problems 22 | 23 | if line[0] == '#': 24 | continue 25 | 26 | if len(fields) < 3: 27 | sys.stderr.write("[FAIL] Manifest did not contain a third column mapping a reference to a preset\n") 28 | sys.stderr.write(" Consult the README to ensure you are using a manifest suitable for dehumaniser >= 0.9.0\n") 29 | sys.exit(78) # EX_CONFIG 30 | 31 | if fields[2] != preset: 32 | continue 33 | 34 | manifest["references"].append({ 35 | "name": fields[0], 36 | "path": fields[1], 37 | }) 38 | 39 | if len(manifest["references"]) == 0: 40 | sys.stderr.write("[FAIL] Manifest did not contain any references for preset=%s\n" % preset) 41 | sys.stderr.write(" Consult the README to ensure your manifest is correctly configured and for\n") 42 | sys.stderr.write(" instructions on how to build your own indexes if needed\n") 43 | sys.exit(65) # EX_DATAERR 44 | else: 45 | sys.stderr.write("[NOTE] Detected %d references in manifest for preset=%s\n" % (len(manifest["references"]), preset)) 46 | manifest_fh.close() 47 | return manifest 48 | 49 | 50 | def dh_bam(log, manifest, bad_set, args): 51 | dirty_bam = pysam.AlignmentFile(args.dirty) 52 | 53 | dirty_header = dirty_bam.header.as_dict() 54 | 55 | pg_date = date.today().strftime("%Y%m%d") 56 | if args.pg_date: 57 | if len(args.pg_date) > 0: 58 | pg_date = args.pg_date 59 | 60 | if "PG" not in dirty_header: 61 | dirty_header["PG"] = [] 62 | dirty_header["PG"].append({ 63 | "ID": 'dehumanizer.%s' % pg_date, 64 | "PN": 'dehumanizer', 65 | "VN": version.__version__, 66 | "CL": " ".join(sys.argv), 67 | }) 68 | clean_header = pysam.AlignmentHeader.from_dict(dirty_header) 69 | clean_bam = pysam.AlignmentFile(args.clean, "wb", header=clean_header) 70 | break_first = not args.nobreak # break on first hit, otherwise we can use this to 'survey' hits to different databases 71 | 72 | aligners = [] 73 | each_dropped = [] 74 | for ref_i, ref_manifest in enumerate(manifest["references"]): 75 | sys.stderr.write("[INFO] Init minimap2 aligner: %s (%s)\n" % (ref_manifest["path"], manifest["preset"])) 76 | aligners.append( mp.Aligner(ref_manifest["path"], preset=manifest["preset"]) ) 77 | each_dropped.append(0) 78 | sys.stderr.write("[INFO] minimap2 aligners ready.\n") 79 | 80 | 81 | n_seqs = 0 82 | n_good = 0 83 | n_trash = 0 84 | n_known = 0 85 | n_collateral = 0 86 | n_baddies = 0 87 | 88 | bad_seen = set([]) 89 | 90 | if dirty_bam.has_index(): 91 | n_seqs = dirty_bam.mapped + dirty_bam.unmapped 92 | else: 93 | # First pass to get the number of sequences without an index 94 | for read in dirty_bam.fetch(until_eof=True): 95 | n_seqs += 1 96 | dirty_bam.close() 97 | 98 | bad_mask = np.zeros(n_seqs, dtype=np.bool) 99 | 100 | # Second pass to establish a bit mask of what to keep 101 | dirty_bam = pysam.AlignmentFile(args.dirty) 102 | for r_i, read in enumerate(dirty_bam.fetch(until_eof=True)): 103 | 104 | if not read.query_sequence: 105 | continue # supp alignment or something, its up to the user to trash these 106 | 107 | read_is_bad = False 108 | 109 | for ref_i, ref_manifest in enumerate(manifest["references"]): 110 | for hit in aligners[ref_i].map(read.query_sequence): 111 | 112 | if not args.minlen or not args.minid: 113 | # a hit is a hit 114 | read_is_bad = True 115 | else: 116 | if args.minlen: 117 | st = min(hit.q_st, hit.q_en) 118 | en = max(hit.q_st, hit.q_en) 119 | if ((en - st) / len(read.query_sequence)) * 100 >= args.minlen: 120 | read_is_bad = True 121 | 122 | if args.minid: 123 | # http://lh3.github.io/2018/11/25/on-the-definition-of-sequence-identity 124 | # "In the PAF format, column 10 divived by column 11 gives the BLAST identity." 125 | bscore = hit.mlen / hit.blen 126 | if bscore * 100 >= args.minid: 127 | read_is_bad = True 128 | 129 | # Criteria satisifed 130 | if read_is_bad: 131 | each_dropped[ref_i] += 1 132 | if break_first: 133 | break 134 | 135 | else: 136 | # Continue the outer loop to the next aligner, as no hit was found 137 | continue 138 | # Break the aligner loop as we've already break'ed a hit 139 | break 140 | 141 | if read_is_bad: 142 | n_baddies += 1 143 | 144 | 145 | # Check if the read is trash instead 146 | if not read_is_bad: 147 | if args.trash_minalen: 148 | try: 149 | if (read.reference_length/read.query_length)*100.0 < args.trash_minalen: 150 | read_is_bad = True 151 | n_trash += 1 152 | except ZeroDivisionError: 153 | read_is_bad = True 154 | n_trash += 1 155 | 156 | # Check if the read is on the shitlist 157 | if not read_is_bad: 158 | if read.query_name in bad_set: 159 | read_is_bad = True 160 | n_known += 1 161 | 162 | if read_is_bad: 163 | bad_mask[r_i] = 1 164 | bad_seen.add(read.query_name) 165 | 166 | dirty_bam.close() 167 | 168 | # Third and final pass to write 169 | dirty_bam = pysam.AlignmentFile(args.dirty) 170 | for r_i, read in enumerate(dirty_bam.fetch(until_eof=True)): 171 | 172 | # If the read really is good, write it out 173 | if not bad_mask[r_i]: 174 | # Finally, check if the QNAME has been tossed out already 175 | if read.query_name in bad_seen: 176 | n_collateral += 1 177 | continue 178 | 179 | n_good += 1 180 | clean_bam.write(read) 181 | 182 | sys.stderr.write("[INFO] %d sequences in, %d sequences out\n" % (n_seqs, n_good)) 183 | log.write("\t".join([str(x) for x in [ 184 | os.path.basename(args.clean), 185 | n_seqs, 186 | n_seqs - n_good, 187 | n_good, 188 | n_baddies, 189 | n_trash, 190 | n_known, 191 | n_collateral, 192 | "-" 193 | ]] + [str(x) for x in each_dropped]) + '\n') 194 | 195 | dirty_bam.close() 196 | clean_bam.close() 197 | 198 | 199 | #TODO FUTURE Would be good to have another layer of multiproc that poured reads from multiple files to any available aligners 200 | # Need to think carefully about this however; as the mp.Aligner is primed to a particular reference and shared 201 | def dh_fastx(log, manifest, args): 202 | 203 | fastx_path = args.dirty 204 | break_first = not args.nobreak # break on first hit, otherwise we can use this to 'survey' hits to different databases 205 | 206 | n_seqs = 0 207 | if args.n: 208 | n_seqs = args.n 209 | else: 210 | for name, seq, qual in mp.fastx_read(fastx_path): 211 | n_seqs += 1 212 | 213 | sys.stderr.write("[INFO] Preparing memory for flags.\n") 214 | super_flag_matrix = np.frombuffer(Array(ctypes.c_bool, n_seqs*len(manifest["references"]), lock=False), dtype=ctypes.c_bool) 215 | super_flag_matrix = super_flag_matrix.reshape(n_seqs, len(manifest["references"])) 216 | sys.stderr.write("[INFO] Raised %d x %d flags.\n" % (n_seqs, len(manifest["references"]))) 217 | 218 | 219 | #aligners = [] 220 | #for ref_i, ref_manifest in enumerate(manifest["references"]): 221 | # aligners.append([]) 222 | # sys.stderr.write("[%d/%d] Booting minimap2 aligners.\n" % (ref_i+1, len(manifest["references"]))) 223 | # 224 | # for _ in range(args.threads): 225 | # aligners[ref_i].append( mp.Aligner(ref_manifest["path"], preset=manifest["preset"]) ) 226 | 227 | def map_seqs(work_q, manifest, break_first, block_i): 228 | aligners = [] 229 | for ref_i, ref_manifest in enumerate(manifest["references"]): 230 | #sys.stderr.write("[%d:%d/%d] Booting minimap2 aligners.\n" % (block_i, ref_i+1, len(manifest["references"]))) 231 | aligners.append( mp.Aligner(ref_manifest["path"], preset=manifest["preset"]) ) 232 | sys.stderr.write("[%d:] minimap2 aligners ready.\n" % (block_i)) 233 | 234 | while True: 235 | work = work_q.get() 236 | if work is None: 237 | return 238 | 239 | for ref_i, ref_manifest in enumerate(manifest["references"]): 240 | super_flag_matrix[ work["i"] ][ref_i] = 0 241 | 242 | for ref_i, ref_manifest in enumerate(manifest["references"]): 243 | for hit in aligners[ref_i].map(work["seq"]): 244 | 245 | if args.minlen: 246 | st = min(hit.q_st, hit.q_en) 247 | en = max(hit.q_st, hit.q_en) 248 | if ((en - st) / len(work["seq"])) * 100 < args.minlen: 249 | continue 250 | 251 | if args.minid: 252 | # http://lh3.github.io/2018/11/25/on-the-definition-of-sequence-identity 253 | # "In the PAF format, column 10 divived by column 11 gives the BLAST identity." 254 | bscore = hit.mlen / hit.blen 255 | if bscore * 100 < args.minid: 256 | continue 257 | 258 | # Criteria satisifed 259 | super_flag_matrix[ work["i"] ][ref_i] = 1 260 | if break_first: 261 | break 262 | else: 263 | # Continue the outer loop to the next aligner, as no hit was found 264 | continue 265 | # Break the aligner loop as we've already seen a hit 266 | break 267 | 268 | sys.stderr.write("[INFO] Counted %d sequences\n" % (n_seqs)) 269 | sys.stderr.write("[INFO] %s\n" % (fastx_path)) 270 | 271 | work_queue = Queue(maxsize=args.threads*5000) # Queue N seqs per process 272 | processes = [] 273 | 274 | for _ in range(args.threads): 275 | p = Process(target=map_seqs, args=(work_queue,manifest,break_first,_)) 276 | processes.append(p) 277 | 278 | for p in processes: 279 | p.start() 280 | 281 | # Begin adding seqs 282 | sys.stderr.write("[INFO] Feeding sequences to queue\n") 283 | start_clock = datetime.now() 284 | for read_i, read_tuple in enumerate(mp.fastx_read(fastx_path)): 285 | if read_i % args.blockrep == 0: 286 | end_clock = datetime.now() 287 | sys.stderr.write("[NOTE] Queued Read#%d. Last block pushed in %s (%s pseq.)\n" % (read_i, str(end_clock - start_clock), str((end_clock-start_clock)/args.blockrep) )) 288 | start_clock = datetime.now() 289 | if args.n: 290 | if read_i+1 > args.n: 291 | break 292 | 293 | # Align 294 | # queue will block until there's room 295 | work_queue.put({"i": read_i, "seq": read_tuple[1]}) 296 | 297 | sys.stderr.write("[INFO] Finished feeding sequences\n") 298 | 299 | # Add sentinels to kill off processes 300 | sys.stderr.write("[INFO] Wait for queues to empty... be patient\n") 301 | for _ in range(args.threads): 302 | work_queue.put(None) 303 | 304 | # Wait for processes to complete work 305 | for p in processes: 306 | p.join() 307 | 308 | 309 | 310 | flat_dropped = ( super_flag_matrix.sum(axis=1) > 0 ) 311 | total_dropped = flat_dropped.sum() 312 | sys.stderr.write("[INFO] Dropped %d sequences\n" % (flat_dropped.sum())) 313 | 314 | # Now... 315 | clean_fq_p = args.clean 316 | if args.clean == "-": 317 | clean_fq = sys.stdout 318 | else: 319 | fp = os.path.basename(fastx_path).split(".") 320 | clean_fq = open(clean_fq_p, 'w') 321 | sys.stderr.write("[INFO] Writing FASTX %s\n" % (clean_fq_p)) 322 | 323 | 324 | # Output FASTX 325 | n_good = 0 326 | for read_i, read_tuple in enumerate(mp.fastx_read(fastx_path)): 327 | if not flat_dropped[read_i]: 328 | n_good += 1 329 | if read_tuple[2] is None: 330 | out_read = ">%s\n%s\n" % (read_tuple[0], 331 | read_tuple[1]) 332 | else: 333 | out_read = "@%s\n%s\n+\n%s\n" % (read_tuple[0], 334 | read_tuple[1], 335 | read_tuple[2]) 336 | clean_fq.write(out_read) 337 | clean_fq.close() 338 | 339 | each_dropped = list( super_flag_matrix.sum(axis=0) ) 340 | log.write("\t".join([str(x) for x in [ 341 | os.path.basename(clean_fq_p), 342 | n_seqs, 343 | n_seqs - n_good, 344 | n_good, 345 | total_dropped, 346 | 0, 347 | 0, 348 | 0, 349 | "-" 350 | ]] + [str(x) for x in each_dropped]) + '\n') 351 | 352 | 353 | def cli(): 354 | import argparse 355 | 356 | parser = argparse.ArgumentParser() 357 | 358 | parser.add_argument("manifest", help="reference manifest") 359 | parser.add_argument("dirty", help="input dirty file") 360 | 361 | parser.add_argument("--known", help="new-line delimited list of reads known to be dirty") 362 | 363 | type_p = parser.add_mutually_exclusive_group(required=True) 364 | type_p.add_argument("--bam", action="store_true") 365 | type_p.add_argument("--fastx", action="store_true") 366 | 367 | parser.add_argument("--preset", help="mappy aligner preset", required=True) 368 | 369 | parser.add_argument("-o", "--clean", help="output clean file [default -]", default="-") 370 | parser.add_argument("--log", help="log path [default .dehumanizer.log.txt]", default=None) 371 | 372 | parser.add_argument("-t", "--threads", help="number of minimap2 process queues to spawn PER REFERENCE [1]", default=1, type=int) 373 | parser.add_argument("-n", help="number of reads (prevents having to count)", type=int) 374 | parser.add_argument("--minid", help="min %%proportion of (L-NM)/L to determine a hit [use all hits]", type=float, default=None) 375 | parser.add_argument("--minlen", help="min %%proportion of read aligned to accept a hit [use all hits]", type=float, default=None) 376 | 377 | parser.add_argument("--nobreak", help="dont break on the first database hit [False]", action="store_true", default=False) 378 | parser.add_argument("--blockrep", help="report progress after a block of N sequences [100000]", default=100000, type=int) 379 | 380 | # Not really the place for it, but whatever 381 | parser.add_argument("--trash-minalen", help="trash reads whose alignment length is less than this %%proportion of their size [keep everything] ignored if not BAM", type=float, default=None) 382 | 383 | parser.add_argument("--pg-date", help="datestamp to insert into BAM PG header [default today in format YYYYMMDD]", default="") 384 | 385 | parser.add_argument("--version", action="version", version="%(prog)s " + version.__version__) 386 | 387 | args = parser.parse_args() 388 | 389 | #if not args.minid and not args.minlen: 390 | # sys.stderr.write("You must set a minimum identity (--minid) and/or minimum length (--minlen).\n") 391 | # sys.exit(1) 392 | 393 | if not args.log: 394 | log = open(args.dirty + ".dehumanizer.log.txt", 'w') 395 | else: 396 | log = open(args.log, 'w') 397 | 398 | manifest = load_manifest(args.manifest, args.preset) 399 | 400 | log.write("\t".join([ 401 | "name", 402 | "seqs_in", 403 | "seqs_total_dropped", 404 | "seqs_out", 405 | "n_hits", 406 | "n_clipped", 407 | "n_known", 408 | "n_collateral", 409 | "-" 410 | ] + [x["name"] for x in manifest["references"]]) + '\n') 411 | 412 | if args.fastx: 413 | dh_fastx(log, manifest, args) 414 | elif args.bam: 415 | bad_set = set([]) 416 | if args.known: 417 | bad_set = set([x.strip() for x in open(args.known)]) 418 | dh_bam(log, manifest, bad_set, args) 419 | 420 | log.close() 421 | 422 | if __name__ == "__main__": 423 | cli() 424 | -------------------------------------------------------------------------------- /dehumanizer/version.py: -------------------------------------------------------------------------------- 1 | __version__ = "0.9.0" 2 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import setuptools 5 | from dehumanizer import version 6 | 7 | setuptools.setup( 8 | name="dehumanizer", 9 | version=version.__version__, 10 | url="https://github.com/samstudio8/dehumanizer", 11 | 12 | description="A command line tool for rapidly ridding reads of horrid humans", 13 | long_description="", 14 | 15 | author="Sam Nicholls", 16 | author_email="sam@samnicholls.net", 17 | 18 | maintainer="Sam Nicholls", 19 | maintainer_email="sam@samnicholls.net", 20 | 21 | packages=setuptools.find_packages(), 22 | include_package_data=True, 23 | 24 | install_requires=[ 25 | "mappy", 26 | "numpy", 27 | "pysam", 28 | ], 29 | 30 | entry_points = { 31 | "console_scripts": [ 32 | "dehumanize=dehumanizer.dehumanizer:cli", 33 | "dehumanise=dehumanizer.dehumanizer:cli", 34 | ] 35 | }, 36 | 37 | classifiers = [ 38 | 'Development Status :: 2 - Pre-Alpha', 39 | 'Intended Audience :: Science/Research', 40 | 'Topic :: Scientific/Engineering', 41 | 'Topic :: Scientific/Engineering :: Bio-Informatics', 42 | 'License :: OSI Approved :: MIT License', 43 | ], 44 | 45 | ) 46 | --------------------------------------------------------------------------------