├── CHANGELOG.md
├── LICENSE
├── README.md
├── dehumanizer
    ├── __init__.py
    ├── dehumanizer.py
    └── version.py
└── setup.py


/CHANGELOG.md:
--------------------------------------------------------------------------------
 1 | # Changelog
 2 | All notable changes to this project will be documented in this file.
 3 | 
 4 | The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 5 | and this project adheres to [ZeroVer Versioning](https://0ver.org/).
 6 | 
 7 | ## [0.9.0] - 2021-09-02
 8 | ### Added
 9 | * CHANGELOG.md will document notable changes
10 | * `dehumanise --version` will report version number
11 | ### Changed
12 | * `--bam` based dehumanising checks for BAM index with pysam `AlignmentFile.has_index` to skip count pass and use `idxstats` if index is available for modest improvement
13 | * Merged the default humref, decoy and HLA references into one FASTA available for download as most users do not care about which ref causes a read to be discarded
14 | ### Fixed
15 | * Cleared up confusion surrounding pre-built mmi indexes by reintroducing manifest syntax to map each input file to a minimap2 preset
16 |     * `dehumanise` will exit 65 to indicate that no references for a given preset are listed in the manifest
17 |     * `dehumanise` will exit 78 to indicate that a reference in the manifest is not mapped to a minimap2 preset that matches `--preset`
18 | * `dehumanise --help` now works
19 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2020 Sam Nicholls
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # dehumanizer
2 | Human DNA where it shouldn't be? Expunge it from your samples with the `dehumanizer`. Just point at a FASTQ or BAM that you suspect is contaminated with human DNA, and `dehumanizer` will rifle through your file, throwing your reads at as many aligning processes as you will allow, to yield a clean file, free of uninvited humans.
3 | 
4 | Currently I am only supporting use of this tool on the CLIMB-COVID platform as I am in the process of making breaking changes to dehumanizer to improve performance.
5 | 


--------------------------------------------------------------------------------
/dehumanizer/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SamStudio8/dehumanizer/2f3d3db1bb578afd33c219c9417d2e9d3b775599/dehumanizer/__init__.py


--------------------------------------------------------------------------------
/dehumanizer/dehumanizer.py:
--------------------------------------------------------------------------------
  1 | from multiprocessing import Process, Queue, Array
  2 | import ctypes
  3 | import sys
  4 | import os
  5 | import mappy as mp
  6 | import numpy as np
  7 | import pysam
  8 | from datetime import datetime
  9 | from datetime import date
 10 | 
 11 | from . import version
 12 | 
 13 | def load_manifest(path, preset):
 14 |     manifest = {
 15 |         "preset": preset,
 16 |         "references": [
 17 |         ],
 18 |     }
 19 |     manifest_fh = open(path)
 20 |     for line_i, line in enumerate(manifest_fh):
 21 |         fields = line.strip().split() # split on any whitespace if you have whitespace in your ref name you have bigger problems 
 22 | 
 23 |         if line[0] == '#':
 24 |             continue
 25 | 
 26 |         if len(fields) < 3:
 27 |             sys.stderr.write("[FAIL] Manifest did not contain a third column mapping a reference to a preset\n")
 28 |             sys.stderr.write("       Consult the README to ensure you are using a manifest suitable for dehumaniser >= 0.9.0\n")
 29 |             sys.exit(78) # EX_CONFIG
 30 | 
 31 |         if fields[2] != preset:
 32 |             continue
 33 | 
 34 |         manifest["references"].append({
 35 |             "name": fields[0],
 36 |             "path": fields[1],
 37 |         })
 38 | 
 39 |     if len(manifest["references"]) == 0:
 40 |         sys.stderr.write("[FAIL] Manifest did not contain any references for preset=%s\n" % preset)
 41 |         sys.stderr.write("       Consult the README to ensure your manifest is correctly configured and for\n")
 42 |         sys.stderr.write("       instructions on how to build your own indexes if needed\n")
 43 |         sys.exit(65) # EX_DATAERR
 44 |     else:
 45 |         sys.stderr.write("[NOTE] Detected %d references in manifest for preset=%s\n" % (len(manifest["references"]), preset))
 46 |     manifest_fh.close()
 47 |     return manifest
 48 | 
 49 | 
 50 | def dh_bam(log, manifest, bad_set, args):
 51 |     dirty_bam = pysam.AlignmentFile(args.dirty)
 52 | 
 53 |     dirty_header = dirty_bam.header.as_dict()
 54 | 
 55 |     pg_date = date.today().strftime("%Y%m%d")
 56 |     if args.pg_date:
 57 |         if len(args.pg_date) > 0:
 58 |             pg_date = args.pg_date
 59 | 
 60 |     if "PG" not in dirty_header:
 61 |         dirty_header["PG"] = []
 62 |     dirty_header["PG"].append({
 63 |         "ID": 'dehumanizer.%s' % pg_date,
 64 |         "PN": 'dehumanizer',
 65 |         "VN": version.__version__,
 66 |         "CL": " ".join(sys.argv),
 67 |     })
 68 |     clean_header = pysam.AlignmentHeader.from_dict(dirty_header)
 69 |     clean_bam = pysam.AlignmentFile(args.clean, "wb", header=clean_header)
 70 |     break_first = not args.nobreak # break on first hit, otherwise we can use this to 'survey' hits to different databases
 71 | 
 72 |     aligners = []
 73 |     each_dropped = []
 74 |     for ref_i, ref_manifest in enumerate(manifest["references"]):
 75 |         sys.stderr.write("[INFO] Init minimap2 aligner: %s (%s)\n" % (ref_manifest["path"], manifest["preset"]))
 76 |         aligners.append( mp.Aligner(ref_manifest["path"], preset=manifest["preset"]) )
 77 |         each_dropped.append(0)
 78 |     sys.stderr.write("[INFO] minimap2 aligners ready.\n")
 79 | 
 80 | 
 81 |     n_seqs = 0
 82 |     n_good = 0
 83 |     n_trash = 0
 84 |     n_known = 0
 85 |     n_collateral = 0
 86 |     n_baddies = 0
 87 | 
 88 |     bad_seen = set([])
 89 | 
 90 |     if dirty_bam.has_index():
 91 |         n_seqs = dirty_bam.mapped + dirty_bam.unmapped
 92 |     else:
 93 |         # First pass to get the number of sequences without an index
 94 |         for read in dirty_bam.fetch(until_eof=True):
 95 |             n_seqs += 1
 96 |     dirty_bam.close()
 97 | 
 98 |     bad_mask = np.zeros(n_seqs, dtype=np.bool)
 99 | 
100 |     # Second pass to establish a bit mask of what to keep
101 |     dirty_bam = pysam.AlignmentFile(args.dirty)
102 |     for r_i, read in enumerate(dirty_bam.fetch(until_eof=True)):
103 | 
104 |         if not read.query_sequence:
105 |             continue # supp alignment or something, its up to the user to trash these
106 | 
107 |         read_is_bad = False
108 | 
109 |         for ref_i, ref_manifest in enumerate(manifest["references"]):
110 |             for hit in aligners[ref_i].map(read.query_sequence):
111 | 
112 |                 if not args.minlen or not args.minid:
113 |                     # a hit is a hit
114 |                     read_is_bad = True
115 |                 else:
116 |                     if args.minlen:
117 |                         st = min(hit.q_st, hit.q_en)
118 |                         en = max(hit.q_st, hit.q_en)
119 |                         if ((en - st) / len(read.query_sequence)) * 100 >= args.minlen:
120 |                             read_is_bad = True
121 | 
122 |                     if args.minid:
123 |                         # http://lh3.github.io/2018/11/25/on-the-definition-of-sequence-identity
124 |                         # "In the PAF format, column 10 divived by column 11 gives the BLAST identity."
125 |                         bscore = hit.mlen / hit.blen
126 |                         if bscore * 100 >= args.minid:
127 |                             read_is_bad = True
128 | 
129 |                 # Criteria satisifed
130 |                 if read_is_bad:
131 |                     each_dropped[ref_i] += 1
132 |                     if break_first:
133 |                         break
134 | 
135 |             else:
136 |                 # Continue the outer loop to the next aligner, as no hit was found
137 |                 continue
138 |             # Break the aligner loop as we've already break'ed a hit
139 |             break
140 | 
141 |         if read_is_bad:
142 |             n_baddies += 1
143 | 
144 | 
145 |         # Check if the read is trash instead
146 |         if not read_is_bad:
147 |             if args.trash_minalen:
148 |                 try:
149 |                     if (read.reference_length/read.query_length)*100.0 < args.trash_minalen:
150 |                         read_is_bad = True
151 |                         n_trash += 1
152 |                 except ZeroDivisionError: 
153 |                     read_is_bad = True
154 |                     n_trash += 1
155 | 
156 |         # Check if the read is on the shitlist
157 |         if not read_is_bad:
158 |             if read.query_name in bad_set:
159 |                 read_is_bad = True
160 |                 n_known += 1
161 | 
162 |         if read_is_bad:
163 |             bad_mask[r_i] = 1
164 |             bad_seen.add(read.query_name)
165 | 
166 |     dirty_bam.close()
167 | 
168 |     # Third and final pass to write
169 |     dirty_bam = pysam.AlignmentFile(args.dirty)
170 |     for r_i, read in enumerate(dirty_bam.fetch(until_eof=True)):
171 | 
172 |         # If the read really is good, write it out
173 |         if not bad_mask[r_i]:
174 |             # Finally, check if the QNAME has been tossed out already
175 |             if read.query_name in bad_seen:
176 |                 n_collateral += 1
177 |                 continue
178 | 
179 |             n_good += 1
180 |             clean_bam.write(read)
181 | 
182 |     sys.stderr.write("[INFO] %d sequences in, %d sequences out\n" % (n_seqs, n_good))
183 |     log.write("\t".join([str(x) for x in [
184 |         os.path.basename(args.clean),
185 |         n_seqs,
186 |         n_seqs - n_good,
187 |         n_good,
188 |         n_baddies,
189 |         n_trash,
190 |         n_known,
191 |         n_collateral,
192 |         "-"
193 |         ]] + [str(x) for x in each_dropped]) + '\n')
194 | 
195 |     dirty_bam.close()
196 |     clean_bam.close()
197 | 
198 | 
199 | #TODO FUTURE Would be good to have another layer of multiproc that poured reads from multiple files to any available aligners
200 | #               Need to think carefully about this however; as the mp.Aligner is primed to a particular reference and shared
201 | def dh_fastx(log, manifest, args):
202 | 
203 |     fastx_path = args.dirty
204 |     break_first = not args.nobreak # break on first hit, otherwise we can use this to 'survey' hits to different databases
205 | 
206 |     n_seqs = 0
207 |     if args.n:
208 |         n_seqs = args.n
209 |     else:
210 |         for name, seq, qual in mp.fastx_read(fastx_path):
211 |             n_seqs += 1
212 | 
213 |     sys.stderr.write("[INFO] Preparing memory for flags.\n")
214 |     super_flag_matrix = np.frombuffer(Array(ctypes.c_bool, n_seqs*len(manifest["references"]), lock=False), dtype=ctypes.c_bool)
215 |     super_flag_matrix = super_flag_matrix.reshape(n_seqs, len(manifest["references"]))
216 |     sys.stderr.write("[INFO] Raised %d x %d flags.\n" % (n_seqs, len(manifest["references"])))
217 | 
218 | 
219 |     #aligners = []
220 |     #for ref_i, ref_manifest in enumerate(manifest["references"]):
221 |     #    aligners.append([])
222 |     #    sys.stderr.write("[%d/%d] Booting minimap2 aligners.\n" % (ref_i+1, len(manifest["references"])))
223 |     #
224 |     #    for _ in range(args.threads):
225 |     #        aligners[ref_i].append( mp.Aligner(ref_manifest["path"], preset=manifest["preset"]) )
226 | 
227 |     def map_seqs(work_q, manifest, break_first, block_i):
228 |         aligners = []
229 |         for ref_i, ref_manifest in enumerate(manifest["references"]):
230 |             #sys.stderr.write("[%d:%d/%d] Booting minimap2 aligners.\n" % (block_i, ref_i+1, len(manifest["references"])))
231 |             aligners.append( mp.Aligner(ref_manifest["path"], preset=manifest["preset"]) )
232 |         sys.stderr.write("[%d:] minimap2 aligners ready.\n" % (block_i))
233 | 
234 |         while True:
235 |             work = work_q.get()
236 |             if work is None:
237 |                 return
238 | 
239 |             for ref_i, ref_manifest in enumerate(manifest["references"]):
240 |                 super_flag_matrix[ work["i"] ][ref_i] = 0
241 | 
242 |             for ref_i, ref_manifest in enumerate(manifest["references"]):
243 |                 for hit in aligners[ref_i].map(work["seq"]):
244 | 
245 |                     if args.minlen:
246 |                         st = min(hit.q_st, hit.q_en)
247 |                         en = max(hit.q_st, hit.q_en)
248 |                         if ((en - st) / len(work["seq"])) * 100 < args.minlen:
249 |                             continue
250 | 
251 |                     if args.minid:
252 |                         # http://lh3.github.io/2018/11/25/on-the-definition-of-sequence-identity
253 |                         # "In the PAF format, column 10 divived by column 11 gives the BLAST identity."
254 |                         bscore = hit.mlen / hit.blen
255 |                         if bscore * 100 < args.minid:
256 |                             continue
257 | 
258 |                     # Criteria satisifed
259 |                     super_flag_matrix[ work["i"] ][ref_i] = 1
260 |                     if break_first:
261 |                         break
262 |                 else:
263 |                     # Continue the outer loop to the next aligner, as no hit was found
264 |                     continue
265 |                 # Break the aligner loop as we've already seen a hit
266 |                 break
267 | 
268 |     sys.stderr.write("[INFO] Counted %d sequences\n" % (n_seqs))
269 |     sys.stderr.write("[INFO] %s\n" % (fastx_path))
270 | 
271 |     work_queue = Queue(maxsize=args.threads*5000) # Queue N seqs per process
272 |     processes = []
273 | 
274 |     for _ in range(args.threads):
275 |         p = Process(target=map_seqs, args=(work_queue,manifest,break_first,_))
276 |         processes.append(p)
277 | 
278 |     for p in processes:
279 |         p.start()
280 | 
281 |     # Begin adding seqs
282 |     sys.stderr.write("[INFO] Feeding sequences to queue\n")
283 |     start_clock = datetime.now()
284 |     for read_i, read_tuple in enumerate(mp.fastx_read(fastx_path)):
285 |         if read_i % args.blockrep == 0:
286 |             end_clock = datetime.now()
287 |             sys.stderr.write("[NOTE] Queued Read#%d. Last block pushed in %s (%s pseq.)\n" % (read_i, str(end_clock - start_clock), str((end_clock-start_clock)/args.blockrep) ))
288 |             start_clock = datetime.now()
289 |         if args.n:
290 |             if read_i+1 > args.n:
291 |                 break
292 | 
293 |         # Align
294 |         # queue will block until there's room
295 |         work_queue.put({"i": read_i, "seq": read_tuple[1]})
296 | 
297 |     sys.stderr.write("[INFO] Finished feeding sequences\n")
298 | 
299 |     # Add sentinels to kill off processes
300 |     sys.stderr.write("[INFO] Wait for queues to empty... be patient\n")
301 |     for _ in range(args.threads):
302 |         work_queue.put(None)
303 | 
304 |     # Wait for processes to complete work
305 |     for p in processes:
306 |         p.join()
307 | 
308 | 
309 | 
310 |     flat_dropped = ( super_flag_matrix.sum(axis=1) > 0 )
311 |     total_dropped = flat_dropped.sum()
312 |     sys.stderr.write("[INFO] Dropped %d sequences\n" % (flat_dropped.sum()))
313 | 
314 |     # Now...
315 |     clean_fq_p = args.clean
316 |     if args.clean == "-":
317 |         clean_fq = sys.stdout
318 |     else:
319 |         fp = os.path.basename(fastx_path).split(".")
320 |         clean_fq = open(clean_fq_p, 'w')
321 |         sys.stderr.write("[INFO] Writing FASTX %s\n" % (clean_fq_p))
322 | 
323 | 
324 |     # Output FASTX
325 |     n_good = 0
326 |     for read_i, read_tuple in enumerate(mp.fastx_read(fastx_path)):
327 |         if not flat_dropped[read_i]:
328 |             n_good += 1
329 |             if read_tuple[2] is None:
330 |                 out_read = ">%s\n%s\n" % (read_tuple[0],
331 |                                           read_tuple[1])
332 |             else:
333 |                 out_read = "@%s\n%s\n+\n%s\n" % (read_tuple[0],
334 |                                                  read_tuple[1],
335 |                                                  read_tuple[2])
336 |             clean_fq.write(out_read)
337 |     clean_fq.close()
338 | 
339 |     each_dropped = list( super_flag_matrix.sum(axis=0) )
340 |     log.write("\t".join([str(x) for x in [
341 |         os.path.basename(clean_fq_p),
342 |         n_seqs,
343 |         n_seqs - n_good,
344 |         n_good,
345 |         total_dropped,
346 |         0,
347 |         0,
348 |         0,
349 |         "-"
350 |         ]] + [str(x) for x in each_dropped]) + '\n')
351 | 
352 | 
353 | def cli():
354 |     import argparse
355 | 
356 |     parser = argparse.ArgumentParser()
357 | 
358 |     parser.add_argument("manifest", help="reference manifest")
359 |     parser.add_argument("dirty", help="input dirty file")
360 | 
361 |     parser.add_argument("--known", help="new-line delimited list of reads known to be dirty")
362 | 
363 |     type_p = parser.add_mutually_exclusive_group(required=True)
364 |     type_p.add_argument("--bam", action="store_true")
365 |     type_p.add_argument("--fastx", action="store_true")
366 | 
367 |     parser.add_argument("--preset", help="mappy aligner preset", required=True)
368 | 
369 |     parser.add_argument("-o", "--clean", help="output clean file [default -]", default="-")
370 |     parser.add_argument("--log", help="log path [default <dirty>.dehumanizer.log.txt]", default=None)
371 | 
372 |     parser.add_argument("-t", "--threads", help="number of minimap2 process queues to spawn PER REFERENCE [1]", default=1, type=int)
373 |     parser.add_argument("-n", help="number of reads (prevents having to count)", type=int)
374 |     parser.add_argument("--minid", help="min %%proportion of (L-NM)/L to determine a hit [use all hits]", type=float, default=None)
375 |     parser.add_argument("--minlen", help="min %%proportion of read aligned to accept a hit [use all hits]", type=float, default=None)
376 | 
377 |     parser.add_argument("--nobreak", help="dont break on the first database hit [False]", action="store_true", default=False)
378 |     parser.add_argument("--blockrep", help="report progress after a block of N sequences [100000]", default=100000, type=int)
379 | 
380 |     # Not really the place for it, but whatever
381 |     parser.add_argument("--trash-minalen", help="trash reads whose alignment length is less than this %%proportion of their size [keep everything] ignored if not BAM", type=float, default=None)
382 | 
383 |     parser.add_argument("--pg-date", help="datestamp to insert into BAM PG header [default today in format YYYYMMDD]", default="")
384 | 
385 |     parser.add_argument("--version", action="version", version="%(prog)s " + version.__version__)
386 | 
387 |     args = parser.parse_args()
388 | 
389 |     #if not args.minid and not args.minlen:
390 |     #    sys.stderr.write("You must set a minimum identity (--minid) and/or minimum length (--minlen).\n")
391 |     #    sys.exit(1)
392 | 
393 |     if not args.log:
394 |         log = open(args.dirty + ".dehumanizer.log.txt", 'w')
395 |     else:
396 |         log = open(args.log, 'w')
397 | 
398 |     manifest = load_manifest(args.manifest, args.preset)
399 | 
400 |     log.write("\t".join([
401 |         "name",
402 |         "seqs_in",
403 |         "seqs_total_dropped",
404 |         "seqs_out",
405 |         "n_hits",
406 |         "n_clipped",
407 |         "n_known",
408 |         "n_collateral",
409 |         "-"
410 |     ] + [x["name"] for x in manifest["references"]]) + '\n')
411 | 
412 |     if args.fastx:
413 |         dh_fastx(log, manifest, args)
414 |     elif args.bam:
415 |         bad_set = set([])
416 |         if args.known:
417 |             bad_set = set([x.strip() for x in open(args.known)])
418 |         dh_bam(log, manifest, bad_set, args)
419 | 
420 |     log.close()
421 | 
422 | if __name__ == "__main__":
423 |     cli()
424 | 


--------------------------------------------------------------------------------
/dehumanizer/version.py:
--------------------------------------------------------------------------------
1 | __version__ = "0.9.0"
2 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # -*- coding: utf-8 -*-
 3 | 
 4 | import setuptools
 5 | from dehumanizer import version
 6 | 
 7 | setuptools.setup(
 8 |     name="dehumanizer",
 9 |     version=version.__version__,
10 |     url="https://github.com/samstudio8/dehumanizer",
11 | 
12 |     description="A command line tool for rapidly ridding reads of horrid humans",
13 |     long_description="",
14 | 
15 |     author="Sam Nicholls",
16 |     author_email="sam@samnicholls.net",
17 | 
18 |     maintainer="Sam Nicholls",
19 |     maintainer_email="sam@samnicholls.net",
20 | 
21 |     packages=setuptools.find_packages(),
22 |     include_package_data=True,
23 | 
24 |     install_requires=[
25 |         "mappy",
26 |         "numpy",
27 |         "pysam",
28 |     ],
29 | 
30 |     entry_points = {
31 |         "console_scripts": [
32 |             "dehumanize=dehumanizer.dehumanizer:cli",
33 |             "dehumanise=dehumanizer.dehumanizer:cli",
34 |         ]
35 |     },
36 | 
37 |     classifiers = [
38 |         'Development Status :: 2 - Pre-Alpha',
39 |         'Intended Audience :: Science/Research',
40 |         'Topic :: Scientific/Engineering',
41 |         'Topic :: Scientific/Engineering :: Bio-Informatics',
42 |         'License :: OSI Approved :: MIT License',
43 |     ],
44 | 
45 | )
46 | 


--------------------------------------------------------------------------------