├── Dockerfile ├── LICENSE.txt ├── README.md ├── buffer.lisp ├── clean-sam.lisp ├── demo ├── README.md ├── run-large-gatk.sh ├── run-large.sh ├── run-small-gatk.sh ├── run-small.sh ├── run-wes-gatk.sh └── ucsc.hg19.dict ├── elprep-package.lisp ├── elprep-sfm-gnupar.py ├── elprep-sfm.py ├── elprep-utils.lisp ├── elprep.asd ├── elprep.py ├── elprep_entrypoint.py ├── elprep_im.py ├── elprep_io_wrapper.py ├── elprep_sfm.py ├── elprep_sfm_gnupar.py ├── filter-pipeline.lisp ├── io-utils.lisp ├── lisp-utils.lisp ├── make-lispworks-binary.sh ├── make-sbcl-binary.sh ├── mark-duplicates.lisp ├── sam-files.lisp ├── sam-types.lisp ├── save-elprep-script.lisp ├── simple-filters.lisp ├── simple-trees.lisp └── user-interface.lisp /Dockerfile: -------------------------------------------------------------------------------- 1 | From ubuntu:latest 2 | 3 | From python:2.7.10 4 | 5 | MAINTAINER Charlotte Herzeel 6 | 7 | RUN apt-get -y update && apt-get install -y wget && apt-get install -y gcc && apt-get install -y make && apt-get install -y zlib1g-dev && apt-get -y install ncurses-dev && apt-get -y install parallel && apt-get clean 8 | 9 | RUN wget https://github.com/samtools/samtools/releases/download/1.2/samtools-1.2.tar.bz2 && bunzip2 samtools-1.2.tar.bz2 && tar -xvf samtools-1.2.tar && cd samtools-1.2 && make 10 | 11 | RUN wget https://github.com/ExaScience/elprep/releases/download/2.5/elprep-v2.5.tar.bz2 && bunzip2 elprep-v2.5.tar.bz2 && tar -xvf elprep-v2.5.tar 12 | 13 | ENV PATH ./samtools-1.2:./elprep-v2.5:${PATH} 14 | 15 | CMD [] 16 | 17 | ENTRYPOINT ["elprep_entrypoint.py"] 18 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2014, 2015, 2016 Imec and Intel Corporation. 2 | Copyright (c) 2017 Imec. 3 | All rights reserved. 4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions are 7 | met: 8 | 9 | * Redistributions of source code must retain the above copyright 10 | notice, this list of conditions and the following disclaimer. 11 | * Redistributions in binary form must reproduce the above copyright 12 | notice, this list of conditions and the following disclaimer in the 13 | documentation and/or other materials provided with the distribution. 14 | * Neither the name of Imec or Intel Corporation nor the names of its 15 | contributors may be used to endorse or promote products derived from 16 | this software without specific prior written permission. 17 | 18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 19 | ''AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 20 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 21 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 22 | HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 23 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 24 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 25 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 26 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 27 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 28 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29 | 30 | ---------------------------------------------------------------------- 31 | 32 | Additional license for binaries created with LispWorks. 33 | ------------------------------------------------------- 34 | 35 | This Software contains or has been developed using software supplied 36 | by LispWorks Ltd (''LispWorks Application Software''). All LispWorks 37 | Application Software supplied under this Agreement is additionally 38 | licensed in accordance with the following terms of the LispWorks 39 | Software License Agreement: 40 | 41 | (1) You may not copy the LispWorks Application Software except as 42 | necessary to exercise your rights under this Agreement and to make one 43 | (1) copy of the LispWorks Application Software in machine readable 44 | form for back-up or archival purposes only. You may not rent, lease, 45 | loan, translate, reverse engineer, decompile, disassemble, modify or 46 | create derivative works based on the materials, except as expressly 47 | permitted by the law of this Agreement. This license does not permit 48 | the distribution of the LispWorks Application Software, or any part 49 | thereof, to any third party. 50 | 51 | (2) Ownership; Confidentiality. You acknowledge and agree that the 52 | LispWorks Application Software contains the confidential and 53 | proprietary information of LispWorks Ltd and its licensors and is 54 | provided solely under the terms and conditions of this Agreement. All 55 | right, title and interest in and to the LispWorks Application 56 | Software, including, but not limited to, patent copyright, trade 57 | secret, trademark and all other intellectual property rights are and 58 | shall remain vested in LispWorks Ltd or its licensor. You shall not 59 | remove any product identification, copyright notices, or other legends 60 | set forth on the LispWorks Application Software and shall reproduce 61 | all such notices on any copies. You shall have no right in LispWorks 62 | Ltd's or its third party licensors' trademarks in connection with the 63 | LispWorks Application Software, or with its promotion or publication, 64 | without LispWorks Ltd's prior written approval. 65 | 66 | (3) No Warranty. LispWorks Ltd provides no warranty that (a) the 67 | LispWorks Application Software will perform substantially in 68 | accordance with the accompanying user documentation when used as 69 | described therein; or (b) that the media containing the LispWorks 70 | Application Software will not prove defective under normal use. 71 | LispWorks Ltd does not warrant that the LispWorks Application Software 72 | will be error free or that the LispWorks Application Software will 73 | meet your requirements. Your sole remedy for non performance of the 74 | LispWorks Application Software will lie with the provider of the 75 | software in which the LispWorks Application Software is included. TO 76 | THE MAXIMUM EXTENT PERMITTED BY LAW, LISPWORKS LTD DISCLAIMS ALL 77 | WARRANTIES OR REPRESENTATIONS, EXPRESS OR IMPLIED, INCLUDING, BUT NOT 78 | LIMITED TO, THE IMPLIED WARRANTIES OF QUALITY AND FITNESS FOR A 79 | PARTICULAR PURPOSE. 80 | 81 | (4) Limitation of Liability. 82 | 83 | (a) If you are a business. EXCEPT IN RESPECT OF PERSONAL INJURY OR 84 | DEATH CAUSED BY THE NEGLIGENCE OF LISPWORKS LTD, FRAUDULANT 85 | MISREPRESENTATION OR ANY OTHER LIABILITY WHICH MAY NOT BE EXCLUDED BY 86 | LAW IN NO EVENT SHALL LISPWORKS LTD OR ITS LICENSORS HAVE ANY 87 | LIABILITY TO YOU WHETHER UNDER THIS AGREEMENT OR OTHERWISE. 88 | 89 | (b) If you are a consumer. YOU ACKNOWLEDGE THAT THE LISPWORKS 90 | APPLICATION SOFTWARE HAS NOT BEEN DEVELOPED TO MEET YOUR INDIVIDUAL 91 | REQUIREMENTS, AND THAT IT IS THEREFORE YOUR RESPONSIBILITY TO ENSURE 92 | THAT THE FACILITIES AND FUNCTIONS OF THE SOFTWARE AS DESCRIBED IN THE 93 | RELEVANT DOCUMENTATION MEET YOUR REQUIREMENTS. YOU AGREE NOT TO USE 94 | THE SOFTWARE FOR ANY COMMERCIAL OR BUSINESS PURPOSES, AND LISPWORKS 95 | LTD AND ITS LICENSORS HAVE NO LIABILITY TO YOU FOR ANY LOSS OF PROFIT, 96 | LOSS OF BUSINESS, BUSINESS INTERRUPTION, OR LOSS OF BUSINESS 97 | OPPORTUNITY. NOTHING IN THIS AGREEMENT SHALL LIMIT OR EXCLUDE OUR 98 | LIABILITY FOR: (A) DEATH OR PERSONAL INJURY RESULTING FROM OUR 99 | NEGLIGENCE; (B) FRAUD OR FRAUDULENT MISREPRESENTATION; (C) ANY OTHER 100 | LIABILITY THAT CANNOT BE EXCLUDED OR LIMITED BY ENGLISH LAW. 101 | 102 | (5) Export Control. You may not export or re-export the LispWorks 103 | Application Software or any underlying information or technology 104 | except in full compliance with all applicable laws and regulations of 105 | all applicable countries. 106 | 107 | (6) Compliance with Applicable Laws. You shall be responsible for 108 | obtaining any import licenses or permits necessary for your use or 109 | distribution of the LispWorks Application Software in accordance with 110 | the terms of this Agreement. You will be responsible for any and all 111 | duties, charges, taxes and other amounts payable in connection with 112 | use of the Licensed Program in accordance with this Agreement. 113 | 114 | (7) Third Party Rights. A person who is not a party to this Agreement 115 | shall not have any rights under the Contracts (Rights of Third 116 | Parties) Act 1999 to enforce any of its terms. 117 | 118 | (8) Termination. LispWorks Ltd reserves the right to terminate your 119 | use of the LispWorks Application Software on written notice if you 120 | fail to comply with any of the above terms and conditions. Upon 121 | termination, you shall cease using the LispWorks Application Software 122 | and shall destroy all copies of the LispWorks Application Software. 123 | 124 | In the event of any conflict between this Agreement and the above 125 | terms and to the extent that such conflict relates to the use of 126 | LispWorks Application Software contained in any software supplied 127 | under this Agreement, the clauses as set out above shall prevail. 128 | -------------------------------------------------------------------------------- /buffer.lisp: -------------------------------------------------------------------------------- 1 | (in-package :elprep) 2 | (in-simple-base-string-syntax) 3 | 4 | (defconstant +buffer-chunk-size+ 1024 5 | "Size of buffer chunks.") 6 | 7 | (declaim (inline %make-buffer buffer-p)) 8 | 9 | (defstruct (buffer (:constructor %make-buffer ()) 10 | (:copier nil)) 11 | "A buffer can be used to create simple-base-strings that can grow arbitrarily large in size. 12 | It is somewhat similar to adjustable strings in Common Lisp, but guarantees to allocate only as much as necessary, 13 | and to never copy contents when growing. Buffers are also easy to reuse, and efficiently reuse already allocated memory. 14 | The struct buffer has a default constructor %make-buffer, and a defaul predicate. 15 | Accessor buffer-pos refers to the current active size of the buffer. 16 | Accessor buffer-str refers to an array of string chunks. 17 | Accessor buffer-hash-value refers to the hash value of the active contents of the buffer. If -1, the hash-value hasn't been computed yet." 18 | (pos 0 :type fixnum) 19 | (str #() :type simple-vector) 20 | (hash-value -1 :type fixnum)) 21 | 22 | (setf (documentation '%make-buffer 'function) 23 | "Default constructor for struct buffer." 24 | (documentation 'buffer-p 'function) 25 | "Default predicate for struct buffer." 26 | (documentation 'buffer-pos 'function) 27 | "The current active size of a buffer." 28 | (documentation 'buffer-str 'function) 29 | "An array of string chunks holding the contents of a buffer." 30 | (documentation 'buffer-hash-value 'function) 31 | "The hash-value for the active contents of a buffer. If -1, the hash-value hasn't been computed yet.") 32 | 33 | (declaim (inline reinitialize-buffer)) 34 | 35 | (defun reinitialize-buffer (buf) 36 | "Reset buffer-pos and buffer-hash-value, so this buffer can be reused." 37 | (declare (buffer buf) #.*optimization*) 38 | (setf (buffer-pos buf) 0) 39 | (setf (buffer-hash-value buf) -1) 40 | buf) 41 | 42 | (declaim (inline buffer-emptyp)) 43 | 44 | (defun buffer-emptyp (buf) 45 | "Returns true if the active size of the buffer is 0." 46 | (declare (buffer buf) #.*optimization*) 47 | (= (buffer-pos buf) 0)) 48 | 49 | (defun ensure-str (buf old-str n) 50 | "Ensure that buffer-str holds enough chunks (internal)." 51 | (declare (buffer buf) (simple-vector old-str) (fixnum n) #.*optimization*) 52 | (let ((new-str (make-array n #+lispworks :single-thread #+lispworks t))) 53 | (declare (simple-vector new-str)) 54 | (loop for i of-type fixnum below (length old-str) 55 | do (setf (svref new-str i) (svref old-str i))) 56 | (loop for i of-type fixnum from (length old-str) below (length new-str) 57 | do (setf (svref new-str i) 58 | (make-array +buffer-chunk-size+ :element-type 'base-char 59 | #+lispworks :single-thread #+lispworks t))) 60 | (setf (buffer-str buf) new-str))) 61 | 62 | (declaim (inline ensure-chunk)) 63 | 64 | (defun ensure-chunk (buf hi) 65 | "Ensure that buffer-str has enough chunks (internal)." 66 | (declare (buffer buf) (fixnum hi) #.*optimization*) 67 | (let ((str (buffer-str buf))) 68 | (declare (simple-vector str)) 69 | (svref (if (< hi (length str)) str 70 | (ensure-str buf str (the fixnum (1+ hi)))) hi))) 71 | 72 | (defmethod print-object ((buf buffer) stream) 73 | (print-unreadable-object (buf stream :type t :identity t) 74 | (format stream ":POS ~S :STR ~S" 75 | (buffer-pos buf) 76 | (unless (zerop (length (buffer-str buf))) "...")))) 77 | 78 | (declaim (inline buffer-push)) 79 | 80 | (defun buffer-push (buf char) 81 | "Add a single character to a buffer." 82 | (declare (buffer buf) (base-char char) #.*optimization*) 83 | (let ((pos (buffer-pos buf))) 84 | (declare (fixnum pos)) 85 | (multiple-value-bind (hi lo) (floor pos +buffer-chunk-size+) 86 | (declare (fixnum hi lo)) 87 | (let ((chunk (ensure-chunk buf hi))) 88 | (declare (simple-base-string chunk)) 89 | (setf (schar chunk lo) char) 90 | (setf (buffer-pos buf) (the fixnum (1+ pos))))))) 91 | 92 | (declaim (notinline slow-buffer-extend)) 93 | 94 | (defun slow-buffer-extend (buf pos hi lo chunk string start end length) 95 | "Add a simple-base-string to a buffer (internal slow path)." 96 | (declare (buffer buf) (fixnum pos hi lo) (simple-base-string chunk) 97 | (simple-base-string string) (fixnum start end length) #.*optimization*) 98 | (loop with source of-type fixnum = start do 99 | (loop for target of-type fixnum from lo below +buffer-chunk-size+ do 100 | (setf (schar chunk target) 101 | (schar string source)) 102 | (when (= (incf source) end) 103 | (setf (buffer-pos buf) (the fixnum (+ pos length))) 104 | (return-from slow-buffer-extend))) 105 | (incf hi) (setq lo 0) 106 | (setq chunk (ensure-chunk buf hi)))) 107 | 108 | (defun buffer-extend (buf string &optional (start 0) end) 109 | "Add a base-string to a buffer." 110 | (declare (buffer buf) (base-string string) (fixnum start) #.*optimization*) 111 | (let* ((end (or end (length string))) 112 | (length (the fixnum (- end start))) 113 | (pos (buffer-pos buf))) 114 | (declare (fixnum end length pos)) 115 | (multiple-value-bind (string offset) (unwrap-displaced-array string) 116 | (declare (simple-base-string string) (fixnum offset)) 117 | (setq start (the fixnum (+ start offset)) 118 | end (the fixnum (+ end offset))) 119 | (multiple-value-bind (hi lo) (floor pos +buffer-chunk-size+) 120 | (declare (fixnum hi lo)) 121 | (let ((chunk (ensure-chunk buf hi))) 122 | (declare (simple-base-string chunk)) 123 | (if (<= (the fixnum (+ lo length)) +buffer-chunk-size+) 124 | (loop for i of-type fixnum from start below end 125 | for j of-type fixnum from lo do 126 | (setf (schar chunk j) 127 | (schar string i)) 128 | finally 129 | (setf (buffer-pos buf) (the fixnum (+ pos length))) 130 | (return (values))) 131 | (slow-buffer-extend buf pos hi lo chunk string start end length))))))) 132 | 133 | #+sbcl 134 | (progn 135 | (declaim (notinline slow-io-buffer-extend)) 136 | 137 | (defun slow-io-buffer-extend (buf pos hi lo chunk string start end length) 138 | "Add a region of a buffered-ascii-input-stream buffer to a buffer (internal slow path)." 139 | (declare (buffer buf) (fixnum pos hi lo) (simple-base-string chunk) 140 | (fixnum start end length) #.*optimization*) 141 | (with-buffer-dispatch string 142 | (loop with source of-type fixnum = start do 143 | (loop for target of-type fixnum from lo below +buffer-chunk-size+ do 144 | (setf (schar chunk target) 145 | (bchar string source)) 146 | (when (= (incf source) end) 147 | (setf (buffer-pos buf) (the fixnum (+ pos length))) 148 | (return-from slow-io-buffer-extend))) 149 | (incf hi) (setq lo 0) 150 | (setq chunk (ensure-chunk buf hi))))) 151 | 152 | (defun io-buffer-extend (buf string &optional (start 0) end) 153 | "Add a region of a buffered-ascii-input-stream buffer to a buffer." 154 | (declare (buffer buf) (fixnum start) #.*optimization*) 155 | (with-buffer-dispatch string 156 | (let* ((end (or end (length string))) 157 | (length (the fixnum (- end start))) 158 | (pos (buffer-pos buf))) 159 | (declare (fixnum end length pos)) 160 | (multiple-value-bind (hi lo) (floor pos +buffer-chunk-size+) 161 | (declare (fixnum hi lo)) 162 | (let ((chunk (ensure-chunk buf hi))) 163 | (declare (simple-base-string chunk)) 164 | (if (<= (the fixnum (+ lo length)) +buffer-chunk-size+) 165 | (loop for i of-type fixnum from start below end 166 | for j of-type fixnum from lo do 167 | (setf (schar chunk j) 168 | (bchar string i)) 169 | finally 170 | (setf (buffer-pos buf) (the fixnum (+ pos length))) 171 | (return (values))) 172 | (slow-io-buffer-extend buf pos hi lo chunk string start end length)))))))) 173 | 174 | (declaim (inline make-buffer)) 175 | 176 | (defun make-buffer (&optional initial-string) 177 | "Create a buffer with an optional initial string" 178 | (let ((buf (%make-buffer))) 179 | (when initial-string (buffer-extend buf initial-string)) 180 | buf)) 181 | 182 | (defun write-buffer (buf stream) 183 | "Write the active contents of a buffer to a stream" 184 | (declare (buffer buf) (stream stream) #.*optimization*) 185 | (let ((pos (buffer-pos buf)) 186 | (str (buffer-str buf))) 187 | (declare (fixnum pos) (simple-vector str)) 188 | (multiple-value-bind (hi lo) (floor pos +buffer-chunk-size+) 189 | (declare (fixnum hi lo)) 190 | (loop for i of-type fixnum below hi do 191 | (write-string (svref str i) stream :end +buffer-chunk-size+)) 192 | (when (> lo 0) 193 | (write-string (svref str hi) stream :end lo)))) 194 | (values)) 195 | 196 | (defun buffer-copy (source target) 197 | "Copy the active contents of one buffer to another." 198 | (declare (buffer source target) #.*fixnum-optimization*) 199 | (let ((pos (buffer-pos source)) 200 | (str (buffer-str source))) 201 | (declare (fixnum pos) (simple-vector str)) 202 | (multiple-value-bind (hi lo) (floor pos +buffer-chunk-size+) 203 | (declare (fixnum hi lo)) 204 | (loop for i of-type fixnum below hi 205 | do (buffer-extend target (svref str i) 0 +buffer-chunk-size+)) 206 | (when (> lo 0) (buffer-extend target (svref str hi) 0 lo)))) 207 | (values)) 208 | 209 | #+lispworks 210 | (defun read-line-into-buffer (stream buf) 211 | "Read a line from a stream into a buffer, after reinitializing it." 212 | (declare (buffered-stream stream) (buffer buf) #.*fixnum-optimization*) 213 | (reinitialize-buffer buf) 214 | (loop (with-stream-input-buffer (buffer index limit) stream 215 | (declare (simple-base-string buffer) (fixnum index limit)) 216 | (loop for end of-type fixnum from index below limit do 217 | (when (char= (lw:sbchar buffer end) #\Newline) 218 | (buffer-extend buf buffer index end) 219 | (setq index (1+ end)) 220 | (return-from read-line-into-buffer buf)) 221 | finally 222 | (buffer-extend buf buffer index limit) 223 | (setq index limit))) 224 | (unless (stream-fill-buffer stream) 225 | (return-from read-line-into-buffer buf)))) 226 | 227 | #+sbcl 228 | (defun read-line-into-buffer (stream buf) 229 | "Read a line from a stream into a buffer, after reinitializing it." 230 | (declare (buffered-ascii-input-stream stream) (buffer buf) #.*optimization*) 231 | (reinitialize-buffer buf) 232 | (with-ascii-stream-input-buffer buffer stream 233 | (loop (let ((index (buffered-ascii-input-stream-index stream)) 234 | (limit (buffered-ascii-input-stream-limit stream))) 235 | (declare (fixnum index limit)) 236 | (loop for end of-type fixnum from index below limit do 237 | (when (char= (bchar buffer end) #\Newline) 238 | (io-buffer-extend buf buffer index end) 239 | (setf (buffered-ascii-input-stream-index stream) (the fixnum (1+ end))) 240 | (return-from read-line-into-buffer buf)) 241 | finally 242 | (io-buffer-extend buf buffer index limit) 243 | (setf (buffered-ascii-input-stream-index stream) limit))) 244 | (unless (stream-fill-buffer stream) 245 | (return-from read-line-into-buffer buf))))) 246 | 247 | (defun buffer-partition (buf separator &rest targets) 248 | "Get substrings from a buffer and feed them to target buffers after reinitializing them; 249 | separator is a character, like #\Tab; 250 | targets is a property list with numbers as keys and buffers as values; 251 | the targets need to be sorted by key; 252 | for example (buffer-partition buf #\Tab 3 buf1 6 buf2)" 253 | (declare (buffer buf) (base-char separator) (dynamic-extent targets) #.*optimization*) 254 | (loop for (nil buffer) on targets by #'cddr do (reinitialize-buffer buffer)) 255 | (let ((current-target 0)) 256 | (declare (fixnum current-target)) 257 | (flet ((get-target-buf () 258 | (if targets 259 | (when (= current-target (the fixnum (car targets))) 260 | (pop targets) 261 | (pop targets)) 262 | (return-from buffer-partition (values))))) 263 | (declare (inline get-target-buf)) 264 | (let ((target-buf (get-target-buf))) 265 | (declare ((or buffer null) target-buf)) 266 | (flet ((next-target () 267 | (incf current-target) 268 | (setq target-buf (get-target-buf)))) 269 | (declare (inline next-target)) 270 | (let ((pos (buffer-pos buf)) 271 | (str (buffer-str buf))) 272 | (declare (fixnum pos) (simple-vector str)) 273 | (multiple-value-bind (hi lo) (floor pos +buffer-chunk-size+) 274 | (declare (fixnum hi lo)) 275 | (loop for i of-type fixnum below hi 276 | for chunk of-type simple-base-string = (svref str i) 277 | for start of-type fixnum = 0 do 278 | (loop for end of-type fixnum below +buffer-chunk-size+ do 279 | (when (char= (schar chunk end) separator) 280 | (when target-buf 281 | (buffer-extend target-buf chunk start end)) 282 | (next-target) 283 | (setq start (the fixnum (1+ end)))) 284 | finally 285 | (when target-buf 286 | (buffer-extend target-buf chunk start +buffer-chunk-size+)))) 287 | (when (> lo 0) 288 | (loop with chunk of-type simple-base-string = (svref str hi) 289 | with start of-type fixnum = 0 290 | for end of-type fixnum below +buffer-chunk-size+ do 291 | (when (char= (schar chunk end) separator) 292 | (when target-buf 293 | (buffer-extend target-buf chunk start end)) 294 | (next-target) 295 | (setq start (the fixnum (1+ end)))) 296 | finally 297 | (when target-buf 298 | (buffer-extend target-buf chunk start lo)))))))))) 299 | (values)) 300 | 301 | (defun buffer-string (buf) 302 | "Return a string representation of the active contents of a buffer. 303 | Use this only for debugging. When writing to a stream, use write-buffer instead." 304 | (declare (buffer buf) #.*optimization*) 305 | (let ((pos (buffer-pos buf)) 306 | (str (buffer-str buf))) 307 | (declare (fixnum pos) (simple-vector str)) 308 | (multiple-value-bind (hi lo) (floor pos +buffer-chunk-size+) 309 | (declare (fixnum hi lo)) 310 | (let ((result (make-array pos :element-type 'base-char 311 | #+lispworks :single-thread #+lispworks t)) 312 | (target -1)) 313 | (declare (simple-base-string result) (fixnum target)) 314 | (loop for i of-type fixnum below hi 315 | for chunk of-type simple-base-string = (svref str i) do 316 | (loop for j of-type fixnum below +buffer-chunk-size+ do 317 | (setf (schar result (incf target)) 318 | (schar chunk j)))) 319 | (when (> lo 0) 320 | (loop with chunk of-type simple-base-string = (svref str hi) 321 | for j of-type fixnum below lo do 322 | (setf (schar result (incf target)) 323 | (schar chunk j)))) 324 | result)))) 325 | 326 | (defun buffer= (buf1 buf2) 327 | "Compare the active contents of two buffers." 328 | (declare (buffer buf1 buf2) #.*optimization*) 329 | (or (eq buf1 buf2) 330 | (let ((pos1 (buffer-pos buf1)) 331 | (str1 (buffer-str buf1)) 332 | (pos2 (buffer-pos buf2)) 333 | (str2 (buffer-str buf2))) 334 | (declare (fixnum pos1 pos2) (simple-vector str1 str2)) 335 | (when (= pos1 pos2) 336 | (multiple-value-bind (hi lo) (floor pos1 +buffer-chunk-size+) 337 | (declare (fixnum hi lo)) 338 | (loop for i of-type fixnum below hi 339 | for chunk1 of-type simple-base-string = (svref str1 i) 340 | for chunk2 of-type simple-base-string = (svref str2 i) do 341 | (loop for j of-type fixnum below +buffer-chunk-size+ do 342 | (when (char/= (schar chunk1 j) 343 | (schar chunk2 j)) 344 | (return-from buffer= nil)))) 345 | (when (> lo 0) 346 | (loop with chunk1 of-type simple-base-string = (svref str1 hi) 347 | with chunk2 of-type simple-base-string = (svref str2 hi) 348 | for j of-type fixnum below lo do 349 | (when (char/= (schar chunk1 j) 350 | (schar chunk2 j)) 351 | (return-from buffer= nil))))) 352 | t)))) 353 | 354 | (defun buffer-parse-integer (buf) 355 | "Parse a buffer as an integer." 356 | (declare (buffer buf) #.*optimization*) 357 | (let ((pos (buffer-pos buf)) 358 | (str (buffer-str buf)) 359 | (sign +1) 360 | (result 0)) 361 | (declare (fixnum pos) (simple-vector str) (fixnum sign) (integer result)) 362 | (flet ((update-result (char) 363 | (declare (base-char char)) 364 | (assert (and (char<= #\0) (char<= #\9))) 365 | (let ((digit (- (char-code char) #.(char-code #\0)))) 366 | (declare (fixnum digit)) 367 | (if (and (typep result 'fixnum) (< (the fixnum result) #.(floor most-positive-fixnum 10))) 368 | (setq result (the fixnum (+ (the fixnum (* (the fixnum result) 10)) digit))) 369 | (setq result (+ (* result 10) digit)))))) 370 | (declare (inline update-result)) 371 | (multiple-value-bind (hi lo) (floor pos +buffer-chunk-size+) 372 | (declare (fixnum hi lo)) 373 | (cond ((= hi 0) 374 | (assert (> lo 0)) 375 | (let* ((chunk (svref str 0)) (char (schar chunk 0)) (start 0)) 376 | (declare (simple-base-string chunk) (base-char char) (fixnum start)) 377 | (cond ((char= char #\+) (setq start 1) (assert (> lo 1))) 378 | ((char= char #\-) (setq sign -1) (setq start 1) (assert (> lo 1)))) 379 | (loop for j of-type fixnum from start below lo 380 | do (update-result (schar chunk j))))) 381 | (t (let* ((chunk (svref str 0)) (char (schar chunk 0)) (start 0)) 382 | (declare (simple-base-string chunk) (base-char char) (fixnum start)) 383 | (cond ((char= char #\+) (setq start 1)) 384 | ((char= char #\-) (setq sign -1) (setq start 1))) 385 | (loop for j of-type fixnum from start below +buffer-chunk-size+ 386 | do (update-result (schar chunk j)))) 387 | (loop for i of-type fixnum from 1 below hi 388 | for chunk of-type simple-base-string = (svref str i) do 389 | (loop for j of-type fixnum below +buffer-chunk-size+ 390 | do (update-result (schar chunk j)))) 391 | (when (> lo 0) 392 | (loop with chunk of-type simple-base-string = (svref str hi) 393 | for j of-type fixnum below lo 394 | do (update-result (schar chunk j)))))))) 395 | (* sign result))) 396 | 397 | (declaim (inline rotate-1)) 398 | 399 | (defun rotate-1 (n) 400 | "Rotate a fixnum by one position." 401 | (declare (fixnum n) #.*optimization*) 402 | (the fixnum (logior (ash n -1) (the fixnum (ash (logand n 1) #.(1- (integer-length most-positive-fixnum))))))) 403 | 404 | (defun buffer-hash (buf) 405 | "Get the hash code for a buffer; once a hash code is computed, the buffer shouldn't change anymore. 406 | This can be used for hash tables, like in (make-hash-table :test #'buffer= :hash-function #'buffer-hash)." 407 | (declare (buffer buf) #.*optimization*) 408 | (let ((pos (buffer-pos buf)) 409 | (str (buffer-str buf)) 410 | (hash (buffer-hash-value buf))) 411 | (declare (fixnum pos) (simple-vector str) (fixnum hash)) 412 | (if (> hash -1) 413 | (return-from buffer-hash hash) 414 | (setq hash 0)) 415 | (multiple-value-bind (hi lo) (floor pos +buffer-chunk-size+) 416 | (declare (fixnum hi lo)) 417 | (loop for i of-type fixnum below hi 418 | for chunk of-type simple-base-string = (svref str i) do 419 | (loop for j of-type fixnum below +buffer-chunk-size+ do 420 | (setq hash (logxor (rotate-1 hash) (char-code (schar chunk j)))))) 421 | (when (> lo 0) 422 | (loop with chunk of-type simple-base-string = (svref str hi) 423 | for j of-type fixnum below lo do 424 | (setq hash (logxor (rotate-1 hash) (char-code (schar chunk j))))))) 425 | (setf (buffer-hash-value buf) hash))) 426 | -------------------------------------------------------------------------------- /clean-sam.lisp: -------------------------------------------------------------------------------- 1 | (in-package :elprep) 2 | (in-simple-base-string-syntax) 3 | 4 | (eval-when (#+sbcl :compile-toplevel :load-toplevel :execute) 5 | (defun make-cigar-operations-table-consumes-reference-bases () 6 | "Map CIGAR operations to boolean flags indicating whether they consume reference bases." 7 | (let ((table (make-array (1+ (- +max-cigar-operation+ 8 | +min-cigar-operation+)) 9 | :initial-element nil 10 | #+lispworks :allocation #+lispworks :long-lived 11 | #+lispworks :single-thread #+lispworks t))) 12 | (loop for char across "MmDdNn=Xx" do (setf (svref table (- (char-code char) +min-cigar-operation+)) t)) 13 | table))) 14 | 15 | (defglobal *cigar-consumes-reference-bases-table* 16 | (make-cigar-operations-table-consumes-reference-bases) 17 | "Map CIGAR operations to boolean flags indicating whether they consume reference bases.") 18 | 19 | (defun sam-alignment-end (aln cigar-list) 20 | "Sums the lengths of all CIGAR operations that consume reference bases." 21 | (declare (sam-alignment aln) (list cigar-list) #.*optimization*) 22 | (let ((cigar-reference-length 23 | (loop with table = *cigar-consumes-reference-bases-table* 24 | for (key . val) of-type (base-char . int32) in cigar-list 25 | when (svref table (- (char-code key) +min-cigar-operation+)) 26 | ;when (or (eq key :M) (eq key :D) (eq key :N) (eq key :=) (eq key :X)) 27 | sum val))) 28 | (declare (int32 cigar-reference-length)) 29 | (+ (sam-alignment-pos aln) cigar-reference-length -1))) 30 | 31 | (declaim (inline reference-sequence-length)) 32 | 33 | (defun reference-sequence-length (aln reference-sequence-table) 34 | "Get reference sequence length for the given sam-alignment." 35 | (declare (sam-alignment aln) (hash-table reference-sequence-table) #.*optimization*) 36 | (gethash (sam-alignment-rname aln) reference-sequence-table)) 37 | 38 | (declaim (inline operator-consumes-read-bases-p)) 39 | 40 | (defun operator-consumes-read-bases-p (operator) 41 | "Does the CIGAR operation consume read bases?" 42 | (declare (base-char operator) #.*optimization*) 43 | (or (char= operator #\M) (char= operator #\I) (char= operator #\S) (char= operator #\=) (char= operator #\X))) 44 | 45 | (declaim (inline operator-consumes-reference-bases-p)) 46 | 47 | (defun operator-consumes-reference-bases-p (operator) 48 | "Does the CIGAR operation consume reference bases?" 49 | (declare (base-char operator) #.*optimization*) 50 | (or (char= operator #\M) (char= operator #\D) (char= operator #\N) (char= operator #\=) (char= operator #\X))) 51 | 52 | (defun get-read-length-from-cigar (cigar-list) 53 | "Sums the lengths of all CIGAR operations that consume read bases." 54 | (declare (list cigar-list) #.*optimization*) 55 | (loop for (operator . val) of-type (base-char . int32) in cigar-list 56 | when (operator-consumes-read-bases-p operator) 57 | sum val)) 58 | 59 | (declaim (inline element-stradless-clipped-read)) 60 | 61 | (defun element-stradless-clipped-read (new-cigar operator relative-clipping-position clipped-bases) 62 | (declare (stream new-cigar) (base-char operator) (int32 relative-clipping-position clipped-bases) #.*optimization*) 63 | (if (operator-consumes-read-bases-p operator) 64 | (if (operator-consumes-reference-bases-p operator) 65 | (when (> relative-clipping-position 0) 66 | (format new-cigar "~C~D" operator relative-clipping-position)) 67 | (setf clipped-bases (+ clipped-bases relative-clipping-position))) 68 | (when (/= relative-clipping-position 0) 69 | (error "Unexpected non-0 relative clipping position: ~s" relative-clipping-position))) 70 | (format new-cigar "S~D" clipped-bases)) 71 | 72 | (defun soft-clip-end-of-read (clip-from cigar-list) 73 | (declare (int32 clip-from) (list cigar-list) #.*optimization*) 74 | (let ((pos 0)) 75 | (declare (int32 pos)) 76 | (decf clip-from) 77 | (with-output-to-string (new-cigar nil :element-type 'base-char) 78 | (loop named cigar-loop for (operator . val) of-type (base-char . int32) in cigar-list 79 | do (let ((end-pos (+ pos (if (operator-consumes-read-bases-p operator) val 0)))) 80 | (declare (int32 end-pos)) 81 | (if (< end-pos clip-from) 82 | (format new-cigar "~C~D" operator val) 83 | (let ((clipped-bases (+ (get-read-length-from-cigar cigar-list) clip-from))) 84 | (declare (int32 clipped-bases)) 85 | (element-stradless-clipped-read new-cigar operator (- clip-from pos) clipped-bases) 86 | (return-from cigar-loop))) 87 | (setf pos (+ end-pos pos))))))) 88 | 89 | (defun clean-sam (header) 90 | "A filter for soft-clipping a sam-alignment at the end of a reference sequence, and set MAPQ to 0 if unmapped." 91 | (let ((reference-sequence-table ; create a reference dictionary that maps reference name onto the length of that reference 92 | (make-single-thread-hash-table :test #'equal))) 93 | (loop for sn-form in (sam-header-sq header) 94 | do (setf (gethash (getf sn-form :SN) reference-sequence-table) (getf sn-form :LN))) 95 | (lambda () 96 | (lambda (alignment) 97 | (declare (sam-alignment alignment) #.*optimization*) 98 | (if (sam-alignment-unmapped-p alignment) 99 | (setf (sam-alignment-mapq alignment) 0) 100 | (let ((ref-seq-length (reference-sequence-length alignment reference-sequence-table)) 101 | (scanned-cigar (scan-cigar-string 'list (sam-alignment-cigar alignment)))) 102 | (declare (int32 ref-seq-length) (list scanned-cigar)) 103 | (when (> (sam-alignment-end alignment scanned-cigar) ref-seq-length) 104 | (let ((clip-from (+ (- ref-seq-length (sam-alignment-pos alignment)) 1))) 105 | (setf (sam-alignment-cigar alignment) (soft-clip-end-of-read clip-from scanned-cigar)))))) 106 | t)))) 107 | -------------------------------------------------------------------------------- /demo/README.md: -------------------------------------------------------------------------------- 1 | # Overview 2 | 3 | This repository contains the elPrep demo files: 4 | 5 | Demo 1: 6 | 7 | * [run-large.sh]: a bash script that runs a 3-step preparation pipeline using elPrep on "NA12878-chr22", a subset of [whole-genome sequencing of NA12878](http://www.ebi.ac.uk/ena/data/view/ERS189474) that maps to chromosome 22 8 | * [run-small.sh]: a bash script that runs a 3-step pipeline using elPrep on 10% of the reads of NA12878-chr22 9 | 10 | Demo 2: 11 | 12 | * [run-large-gatk.sh]: a bash script that runs a 5-step preparation pipeline on NA12878-chr22 to transform it into something that can be used as input for GATK 13 | * [run-small-gatk.sh]: a bash script that runs the same 5-step preparation pipeline using elPrep on 10% of the reads of NA12878-chr22 14 | 15 | Demo 3: 16 | 17 | * [run-wes-gatk.sh]: a bash script that runs the 5-step preparation pipeline using elPrep split/filter/merge on "SRR1611184", a high-coverage [whole-exome sequencing of NA12878](http://www.ncbi.nlm.nih.gov/sra/SRX731649[accn]) 18 | 19 | Other files and scripts: 20 | 21 | * [clean.sh]: a bash script for deleting the output files generated by executing the pipeline scripts 22 | * [ucsc.hg19.dict]: a .sam/.bam header file compatible with GATK 23 | 24 | Upon executing one of the run scripts for the first time, the following input .bam files are downloaded: 25 | 26 | * [NA12878-chr22.bam]: a subset of [whole-genome sequencing of NA12878](http://www.ebi.ac.uk/ena/data/view/ERS189474) that maps to chromosome 22, created using BWA (1.2GB). 27 | * [NA12878-chr22-10pct.bam]: the first 10% entries of NA12878-chr22.bam (120MB). 28 | * [SRR1611184.bam]: a [whole-exome sequencing of NA12878](http://www.ncbi.nlm.nih.gov/sra/SRX731649[accn]) mapped using BWA (13GB). This is a complete, real-world workload. 29 | 30 | 31 | Alternatively, you can also download these files manually [here](http://www.exascience.com/public-files/elprep-demo/). 32 | 33 | # System requirements 34 | 35 | ## Operating system 36 | 37 | elPrep is developed for Linux and has been tested with the following distributions: 38 | 39 | * Ubuntu 14.04.5 LTS 40 | * Ubuntu 12.04.3 LTS 41 | * Manjaro Linux 42 | * Red Hat Enterprise Linux 6.4 and 6.5 43 | 44 | ## Workloads 45 | 46 | We recommend using NA12878-chr22-10pct.bam for quick testing and checking that the installation of elPrep is correct. NA12878-chr22.bam can be used for performance checks when running elPrep on a server. Finally, SRR1611184.bam is a complete, real-world workload. 47 | 48 | ### NA12878-chr22-10pct.bam (120MB) 49 | 50 | The minimal system requirements are: 51 | 52 | * For demo 1: 53 | 54 | * RAM: 867 MB 55 | * Disk space: 241.0 MB 56 | 57 | * For demo 2: 58 | 59 | * RAM: 3.1 GB 60 | * DisK space: 241.0 MB 61 | 62 | On our test machine, a server with two 18-core Intel Xeon Haswell processors clocked at 2.3 Ghz running Ubuntu 14.04.5 LTS, the observed runtimes for elPrep 2.61 are: 63 | 64 | * For demo 1: 65 | * with 72 threads: 5s 66 | 67 | * For demo 2: 68 | * with 72 threads: 5s 69 | 70 | ### NA12878-chr22.bam (1.2GB) 71 | 72 | The minimal system requirements are: 73 | 74 | * For demo 1: 75 | 76 | * RAM: 2 GB 77 | * Disk space: 2.4 GB 78 | 79 | * For demo 2: 80 | 81 | * RAM: 30.51 GB 82 | * Disk space: 2.4 GB 83 | 84 | On our test machine, a server with two 18-core Intel Xeon Haswell processors clocked at 2.3 Ghz running Ubuntu 14.04.5 LTS, the observed runtimes for elPrep 2.61 are: 85 | 86 | * For demo 1: 87 | * with 72 threads: 45s 88 | 89 | * For demo 2: 90 | * with 72 threads: 47s 91 | 92 | 93 | ### SRR1611184.bam (13GB) 94 | 95 | The minimal system requirements are: 96 | 97 | * For demo 3: 98 | 99 | * RAM: 21.57 GB 100 | * Disk space: 26 GB 101 | 102 | On our test machine, a server with two 18-core Intel Xeon Haswell processors clocked at 2.3 Ghz running Ubuntu 14.04.5 LTS, the observed runtime for elPrep 2.61 is: 103 | 104 | For demo 3: 105 | * with 72 threads: 16m 12s 106 | 107 | 108 | # Running the demos 109 | 110 | ## Path setup 111 | 112 | Install SAMtools and elPrep and add them to your path. For example, fill in your username and execute: 113 | 114 | export PATH=$PATH:/home/username/tools/samtools-1.5:/home/username/tools/elprep-v2.61: 115 | 116 | ## Demo 1: a simple preparation pipeline 117 | 118 | The scripts run-large.sh and run-small.sh execute a prepartion pipeline that consists of removing the unmapped reads, replacing the reference dictionary, and adding read groups, respectively for the large (NA12878-chr22) and small (NA12878-chr22-10pct) input files. 119 | 120 | 1) By default, the scripts use the maximum number of available threads, based on your processor's capabilities. If you want to use a different number of threads, edit the scripts to do so (cf. 2nd line). 121 | 122 | 2) Run the scripts by executing: 123 | 124 | sh run-small.sh 125 | 126 | for the small .bam file 127 | 128 | or 129 | 130 | sh run-large.sh 131 | 132 | for the large .bam file. 133 | 134 | Executing these scripts will print the following feedback for the small .bam file: 135 | 136 | elPrep version 2.61. See http://github.com/exascience/elprep for more information. 137 | 138 | Executing command: 139 | 140 | elprep NA12878-chr22-10pct.bam NA12878-chr22-10pct.only_mapped.reordered-contigs.read-group.bam --filter-unmapped-reads --replace-reference-sequences ucsc.hg19.dict --replace-read-group "ID:group1 LB:lib1 PL:illumina PU:unit1 SM:sample1" --sorting-order unknown --gc-on 2 --nr-of-threads 72 141 | 142 | or the following feedback for the large .bam file: 143 | 144 | elPrep version 2.61. See http://github.com/exascience/elprep for more information. 145 | 146 | Executing command: 147 | 148 | elprep NA12878-chr22.bam NA12878-chr22.only_mapped.reordered-contigs.read-group.bam --filter-unmapped-reads --replace-reference-sequences ucsc.hg19.dict --replace-read-group "ID:group1 LB:lib1 PL:illumina PU:unit1 SM:sample1" --sorting-order unsorted --gc-on 2 --nr-of-threads 72 149 | 150 | The elPrep commands that are printed in the feedback are the actual elPrep commands that are executed by those scripts. Hence you can also copy-paste these commands directly into your terminal instead of running the bash scripts. 151 | 152 | ## Demo 2: a preparation pipeline for making the data processable by GATK 153 | 154 | The scripts run-large-gatk.sh and run-small-gatk.sh execute a 5-step preparation pipeline that consists of removing the unmapped reads, replacing the reference sequence dictionary, adding read groups, marking duplicates, and sorting by coordinate order. 155 | 156 | 1) By default, the scripts use the maximum number of available threads, based on your processor's capabilities. If you want to use a different number of threads, edit the scripts to do so (cf. 2nd line). 157 | 158 | 2) Run the scripts by executing: 159 | 160 | sh run-small-gatk.sh 161 | 162 | for the small .bam file 163 | 164 | or 165 | 166 | sh run-large-gatk.sh 167 | 168 | Executing these scripts will print the following feedback for the small .bam file: 169 | 170 | elPrep version 2.61. See http://github.com/exascience/elprep for more information. 171 | 172 | Executing command: 173 | 174 | elprep NA12878-chr22-10pct.bam NA12878-chr22-10pct.only_mapped.reordered-contigs.sorted.deduplicated.read-group.bam --filter-unmapped-reads --replace-reference-sequences ucsc.hg19.dict --replace-read-group "ID:group1 LB:lib1 PL:illumina PU:unit1 SM:sample1" --mark-duplicates --sorting-order coordinate --gc-on 0 --nr-of-threads 72 --split-file 175 | 176 | or the following for the large .bam file: 177 | 178 | elPrep version 2.61. See http://github.com/exascience/elprep for more information. 179 | 180 | Executing command: 181 | 182 | elprep NA12878-chr22.bam NA12878-chr22.only_mapped.reordered-contigs.sorted.deduplicated.read-group.bam --filter-unmapped-reads --replace-reference-sequences ucsc.hg19.dict --replace-read-group "ID:group1 LB:lib1 PL:illumina PU:unit1 SM:sample1" --mark-duplicates --sorting-order coordinate --gc-on 0 --nr-of-threads 72 --split-file 183 | 184 | The elPrep commands that are printed in the feedback are the actual elPrep commands that are executed by those scripts. Hence you can also copy-paste these commands directly into your terminal instead of running the bash scripts. 185 | 186 | ## Demo 3: a preparation pipeline using elprep split/filter/merge 187 | 188 | The script run-wes-gatk.sh executes a 5-step pipeline that consists of removing the unmapped reads, replacing the reference seuqence dictionary, adding read groups, marking duplicates, and sorting by coordinate order. The script executes this pipeline using the elprep split/filter/merge approach. This script is only executable for the whole-exome data set (SRR1611184.bam). 189 | 190 | 1) By default, the scripts use the maximum number of available threads, based on your processor's capabilities. If you want to use a different number of threads, edit the scripts to do so (cf. 2nd line). 191 | 192 | 2) Run the script by executing: 193 | 194 | sh run-wes-gatk.sh 195 | 196 | Executing this script will print the following feedback: 197 | 198 | First, a call to the elprep split command is shown: 199 | 200 | elPrep version 2.61. See http://github.com/exascience/elprep for more information. 201 | 202 | Executing command: 203 | 204 | elprep split /dev/stdin /scratch/username/elprep-old-demo/temp-1506520452.55/ --output-prefix SRR1611184 --output-type sam --nr-of-threads 72 205 | elPrep version 2.61. See http://github.com/exascience/elprep for more information. 206 | 207 | Afterwards, a series of regular elprep commands is performed, one for each split file: 208 | 209 | elPrep version 2.61. See http://github.com/exascience/elprep for more information. 210 | 211 | Executing command: 212 | 213 | elprep /scratch/username/elprep-old-demo/temp-1506520452.55/splits/SRR1611184-chr6_cox_hap2.sam /scratch/username/elprep-old-demo/temp-processed-1506520452.55/SRR1611184-chr6_cox_hap2.sam --filter-unmapped-reads --replace-reference-sequences ucsc.hg19.dict --replace-read-group "ID:group1 LB:lib1 PL:illumina PU:unit1 SM:sample1" --mark-duplicates --sorting-order coordinate --gc-on 0 --nr-of-threads 72 --split-file 214 | 215 | ... 216 | 217 | Finally, an elprep merge command is performed to combine the results of the split files into a single output file: 218 | 219 | elPrep version 2.61. See http://github.com/exascience/elprep for more information. 220 | 221 | Executing command: 222 | 223 | elprep merge /scratch/username/elprep-old-demo/temp-processed-1506520452.55/ /dev/stdout --nr-of-threads 72 224 | 225 | ## Resetting the demos 226 | 227 | To remove the output files generated by executing the demos, execute the clean script: 228 | 229 | sh clean.sh 230 | -------------------------------------------------------------------------------- /demo/run-large-gatk.sh: -------------------------------------------------------------------------------- 1 | input=NA12878-chr22 #without extension 2 | threads=`grep -c processor /proc/cpuinfo` 3 | 4 | if [ ! -f $input.bam ] 5 | then 6 | echo "Input file not present." 7 | if command -v curl > /dev/null 8 | then 9 | echo "Downloading input file." 10 | curl -O http://www.exascience.com/public-files/elprep-demo/$input.bam 11 | elif command -v wget > /dev/null 12 | then 13 | echo "Downloading input file." 14 | wget http://www.exascience.com/public-files/elprep-demo/$input.bam 15 | else 16 | echo "Please download the input file from http://www.exascience.com/public-files/elprep-demo/$input.bam before running this script." 17 | exit 18 | fi 19 | fi 20 | 21 | if command -v elprep > /dev/null 22 | then 23 | if command -v samtools > /dev/null 24 | then 25 | elprep $input.bam $input.only_mapped.reordered-contigs.sorted.deduplicated.read-group.bam --filter-unmapped-reads --replace-reference-sequences ucsc.hg19.dict --replace-read-group "ID:group1 LB:lib1 PL:illumina PU:unit1 SM:sample1" --mark-duplicates --sorting-order coordinate --nr-of-threads $threads --split-file 26 | else 27 | echo 'samtools not found. Please download it from http://samtools.sourceforge.net and make sure that its binary is present in your PATH.' 28 | fi 29 | else 30 | echo 'elprep not found. Please download it from https://github.com/exascience/elprep and make sure that its binary is present in your PATH.' 31 | fi 32 | -------------------------------------------------------------------------------- /demo/run-large.sh: -------------------------------------------------------------------------------- 1 | input=NA12878-chr22 #without extension 2 | threads=`grep -c processor /proc/cpuinfo` 3 | 4 | if [ ! -f $input.bam ] 5 | then 6 | echo "Input file not present." 7 | if command -v curl > /dev/null 8 | then 9 | echo "Downloading input file." 10 | curl -O http://www.exascience.com/public-files/elprep-demo/$input.bam 11 | elif command -v wget > /dev/null 12 | then 13 | echo "Downloading input file." 14 | wget http://www.exascience.com/public-files/elprep-demo/$input.bam 15 | else 16 | echo "Please download the input file from http://www.exascience.com/public-files/elprep-demo/$input.bam before running this script." 17 | exit 18 | fi 19 | fi 20 | 21 | if command -v elprep > /dev/null 22 | then 23 | if command -v samtools > /dev/null 24 | then 25 | elprep $input.bam $input.only_mapped.reordered-contigs.read-group.bam --filter-unmapped-reads --replace-reference-sequences ucsc.hg19.dict --replace-read-group "ID:group1 LB:lib1 PL:illumina PU:unit1 SM:sample1" --sorting-order unsorted --nr-of-threads $threads --gc-on 2 26 | else 27 | echo 'samtools not found. Please download it from http://samtools.sourceforge.net and make sure that its binary is present in your PATH.' 28 | fi 29 | else 30 | echo 'elprep not found. Please download it from https://github.com/exascience/elprep and make sure that its binary is present in your PATH.' 31 | fi 32 | -------------------------------------------------------------------------------- /demo/run-small-gatk.sh: -------------------------------------------------------------------------------- 1 | input=NA12878-chr22-10pct #without extension 2 | threads=`grep -c processor /proc/cpuinfo` 3 | 4 | if [ ! -f $input.bam ] 5 | then 6 | echo "Input file not present." 7 | if command -v curl > /dev/null 8 | then 9 | echo "Downloading input file." 10 | curl -O http://www.exascience.com/public-files/elprep-demo/$input.bam 11 | elif command -v wget > /dev/null 12 | then 13 | echo "Downloading input file." 14 | wget http://www.exascience.com/public-files/elprep-demo/$input.bam 15 | else 16 | echo "Please download the input file from http://www.exascience.com/public-files/elprep-demo/$input.bam before running this script." 17 | exit 18 | fi 19 | fi 20 | 21 | if command -v elprep > /dev/null 22 | then 23 | if command -v samtools > /dev/null 24 | then 25 | elprep $input.bam $input.only_mapped.reordered-contigs.sorted.deduplicated.read-group.bam --filter-unmapped-reads --replace-reference-sequences ucsc.hg19.dict --replace-read-group "ID:group1 LB:lib1 PL:illumina PU:unit1 SM:sample1" --mark-duplicates --sorting-order coordinate --nr-of-threads $threads --split-file 26 | else 27 | echo 'samtools not found. Please download it from http://samtools.sourceforge.net and make sure that its binary is present in your PATH.' 28 | fi 29 | else 30 | echo 'elprep not found. Please download it from https://github.com/exascience/elprep and make sure that its binary is present in your PATH.' 31 | fi 32 | -------------------------------------------------------------------------------- /demo/run-small.sh: -------------------------------------------------------------------------------- 1 | input=NA12878-chr22-10pct #without extension 2 | threads=`grep -c processor /proc/cpuinfo` 3 | 4 | if [ ! -f $input.bam ] 5 | then 6 | echo "Input file not present." 7 | if command -v curl > /dev/null 8 | then 9 | echo "Downloading input file." 10 | curl -O http://www.exascience.com/public-files/elprep-demo/$input.bam 11 | elif command -v wget > /dev/null 12 | then 13 | echo "Downloading input file." 14 | wget http://www.exascience.com/public-files/elprep-demo/$input.bam 15 | else 16 | echo "Please download the input file from http://www.exascience.com/public-files/elprep-demo/$input.bam before running this script." 17 | exit 18 | fi 19 | fi 20 | 21 | if command -v elprep > /dev/null 22 | then 23 | if command -v samtools > /dev/null 24 | then 25 | elprep $input.bam $input.only_mapped.reordered-contigs.read-group.bam --filter-unmapped-reads --replace-reference-sequences ucsc.hg19.dict --replace-read-group "ID:group1 LB:lib1 PL:illumina PU:unit1 SM:sample1" --sorting-order "unknown" --nr-of-threads $threads --gc-on 2 26 | else 27 | echo 'samtools not found. Please download it from http://samtools.sourceforge.net and make sure that its binary is present in your PATH.' 28 | fi 29 | else 30 | echo 'elprep not found. Please download it from https://github.com/exascience/elprep and make sure that its binary is present in your PATH.' 31 | fi 32 | -------------------------------------------------------------------------------- /demo/run-wes-gatk.sh: -------------------------------------------------------------------------------- 1 | input=SRR1611184 #without extension 2 | threads=`grep -c processor /proc/cpuinfo` 3 | 4 | if [ ! -f $input.bam ] 5 | then 6 | echo "Input file not present." 7 | if command -v curl > /dev/null 8 | then 9 | echo "Downloading input file." 10 | curl -O http://www.exascience.com/public-files/elprep-demo/$input.bam 11 | elif command -v wget > /dev/null 12 | then 13 | echo "Downloading input file." 14 | wget http://www.exascience.com/public-files/elprep-demo/$input.bam 15 | else 16 | echo "Please download the input file from http://www.exascience.com/public-files/elprep-demo/$input.bam before running this script." 17 | exit 18 | fi 19 | fi 20 | 21 | if command -v elprep > /dev/null 22 | then 23 | if command -v samtools > /dev/null 24 | then 25 | elprep-sfm.py $input.bam $input.only_mapped.reordered-contigs.sorted.deduplicated.read-group.bam --filter-unmapped-reads --replace-reference-sequences ucsc.hg19.dict --replace-read-group "ID:group1 LB:lib1 PL:illumina PU:unit1 SM:sample1" --mark-duplicates --sorting-order coordinate --nr-of-threads $threads 26 | else 27 | echo 'samtools not found. Please download it from http://samtools.sourceforge.net and make sure that its binary is present in your PATH.' 28 | fi 29 | else 30 | echo 'elprep not found. Please download it from https://github.com/exascience/elprep and make sure that its binary is present in your PATH.' 31 | fi 32 | -------------------------------------------------------------------------------- /demo/ucsc.hg19.dict: -------------------------------------------------------------------------------- 1 | @HD VN:1.0 SO:unsorted 2 | @SQ SN:chrM LN:16571 UR:file:/humgen/gsa-hpprojects/GATK/bundle/ucsc.hg19/ucsc.hg19.fasta M5:d2ed829b8a1628d16cbeee88e88e39eb 3 | @SQ SN:chr1 LN:249250621 UR:file:/humgen/gsa-hpprojects/GATK/bundle/ucsc.hg19/ucsc.hg19.fasta M5:1b22b98cdeb4a9304cb5d48026a85128 4 | @SQ SN:chr2 LN:243199373 UR:file:/humgen/gsa-hpprojects/GATK/bundle/ucsc.hg19/ucsc.hg19.fasta M5:a0d9851da00400dec1098a9255ac712e 5 | @SQ SN:chr3 LN:198022430 UR:file:/humgen/gsa-hpprojects/GATK/bundle/ucsc.hg19/ucsc.hg19.fasta M5:641e4338fa8d52a5b781bd2a2c08d3c3 6 | @SQ SN:chr4 LN:191154276 UR:file:/humgen/gsa-hpprojects/GATK/bundle/ucsc.hg19/ucsc.hg19.fasta M5:23dccd106897542ad87d2765d28a19a1 7 | @SQ SN:chr5 LN:180915260 UR:file:/humgen/gsa-hpprojects/GATK/bundle/ucsc.hg19/ucsc.hg19.fasta M5:0740173db9ffd264d728f32784845cd7 8 | @SQ SN:chr6 LN:171115067 UR:file:/humgen/gsa-hpprojects/GATK/bundle/ucsc.hg19/ucsc.hg19.fasta M5:1d3a93a248d92a729ee764823acbbc6b 9 | @SQ SN:chr7 LN:159138663 UR:file:/humgen/gsa-hpprojects/GATK/bundle/ucsc.hg19/ucsc.hg19.fasta M5:618366e953d6aaad97dbe4777c29375e 10 | @SQ SN:chr8 LN:146364022 UR:file:/humgen/gsa-hpprojects/GATK/bundle/ucsc.hg19/ucsc.hg19.fasta M5:96f514a9929e410c6651697bded59aec 11 | @SQ SN:chr9 LN:141213431 UR:file:/humgen/gsa-hpprojects/GATK/bundle/ucsc.hg19/ucsc.hg19.fasta M5:3e273117f15e0a400f01055d9f393768 12 | @SQ SN:chr10 LN:135534747 UR:file:/humgen/gsa-hpprojects/GATK/bundle/ucsc.hg19/ucsc.hg19.fasta M5:988c28e000e84c26d552359af1ea2e1d 13 | @SQ SN:chr11 LN:135006516 UR:file:/humgen/gsa-hpprojects/GATK/bundle/ucsc.hg19/ucsc.hg19.fasta M5:98c59049a2df285c76ffb1c6db8f8b96 14 | @SQ SN:chr12 LN:133851895 UR:file:/humgen/gsa-hpprojects/GATK/bundle/ucsc.hg19/ucsc.hg19.fasta M5:51851ac0e1a115847ad36449b0015864 15 | @SQ SN:chr13 LN:115169878 UR:file:/humgen/gsa-hpprojects/GATK/bundle/ucsc.hg19/ucsc.hg19.fasta M5:283f8d7892baa81b510a015719ca7b0b 16 | @SQ SN:chr14 LN:107349540 UR:file:/humgen/gsa-hpprojects/GATK/bundle/ucsc.hg19/ucsc.hg19.fasta M5:98f3cae32b2a2e9524bc19813927542e 17 | @SQ SN:chr15 LN:102531392 UR:file:/humgen/gsa-hpprojects/GATK/bundle/ucsc.hg19/ucsc.hg19.fasta M5:e5645a794a8238215b2cd77acb95a078 18 | @SQ SN:chr16 LN:90354753 UR:file:/humgen/gsa-hpprojects/GATK/bundle/ucsc.hg19/ucsc.hg19.fasta M5:fc9b1a7b42b97a864f56b348b06095e6 19 | @SQ SN:chr17 LN:81195210 UR:file:/humgen/gsa-hpprojects/GATK/bundle/ucsc.hg19/ucsc.hg19.fasta M5:351f64d4f4f9ddd45b35336ad97aa6de 20 | @SQ SN:chr18 LN:78077248 UR:file:/humgen/gsa-hpprojects/GATK/bundle/ucsc.hg19/ucsc.hg19.fasta M5:b15d4b2d29dde9d3e4f93d1d0f2cbc9c 21 | @SQ SN:chr19 LN:59128983 UR:file:/humgen/gsa-hpprojects/GATK/bundle/ucsc.hg19/ucsc.hg19.fasta M5:1aacd71f30db8e561810913e0b72636d 22 | @SQ SN:chr20 LN:63025520 UR:file:/humgen/gsa-hpprojects/GATK/bundle/ucsc.hg19/ucsc.hg19.fasta M5:0dec9660ec1efaaf33281c0d5ea2560f 23 | @SQ SN:chr21 LN:48129895 UR:file:/humgen/gsa-hpprojects/GATK/bundle/ucsc.hg19/ucsc.hg19.fasta M5:2979a6085bfe28e3ad6f552f361ed74d 24 | @SQ SN:chr22 LN:51304566 UR:file:/humgen/gsa-hpprojects/GATK/bundle/ucsc.hg19/ucsc.hg19.fasta M5:a718acaa6135fdca8357d5bfe94211dd 25 | @SQ SN:chrX LN:155270560 UR:file:/humgen/gsa-hpprojects/GATK/bundle/ucsc.hg19/ucsc.hg19.fasta M5:7e0e2e580297b7764e31dbc80c2540dd 26 | @SQ SN:chrY LN:59373566 UR:file:/humgen/gsa-hpprojects/GATK/bundle/ucsc.hg19/ucsc.hg19.fasta M5:1e86411d73e6f00a10590f976be01623 27 | @SQ SN:chr1_gl000191_random LN:106433 UR:file:/humgen/gsa-hpprojects/GATK/bundle/ucsc.hg19/ucsc.hg19.fasta M5:d75b436f50a8214ee9c2a51d30b2c2cc 28 | @SQ SN:chr1_gl000192_random LN:547496 UR:file:/humgen/gsa-hpprojects/GATK/bundle/ucsc.hg19/ucsc.hg19.fasta M5:325ba9e808f669dfeee210fdd7b470ac 29 | @SQ SN:chr4_ctg9_hap1 LN:590426 UR:file:/humgen/gsa-hpprojects/GATK/bundle/ucsc.hg19/ucsc.hg19.fasta M5:fa24f81b680df26bcfb6d69b784fbe36 30 | @SQ SN:chr4_gl000193_random LN:189789 UR:file:/humgen/gsa-hpprojects/GATK/bundle/ucsc.hg19/ucsc.hg19.fasta M5:dbb6e8ece0b5de29da56601613007c2a 31 | @SQ SN:chr4_gl000194_random LN:191469 UR:file:/humgen/gsa-hpprojects/GATK/bundle/ucsc.hg19/ucsc.hg19.fasta M5:6ac8f815bf8e845bb3031b73f812c012 32 | @SQ SN:chr6_apd_hap1 LN:4622290 UR:file:/humgen/gsa-hpprojects/GATK/bundle/ucsc.hg19/ucsc.hg19.fasta M5:fe71bc63420d666884f37a3ad79f3317 33 | @SQ SN:chr6_cox_hap2 LN:4795371 UR:file:/humgen/gsa-hpprojects/GATK/bundle/ucsc.hg19/ucsc.hg19.fasta M5:18c17e1641ef04873b15f40f6c8659a4 34 | @SQ SN:chr6_dbb_hap3 LN:4610396 UR:file:/humgen/gsa-hpprojects/GATK/bundle/ucsc.hg19/ucsc.hg19.fasta M5:2a3c677c426a10e137883ae1ffb8da3f 35 | @SQ SN:chr6_mann_hap4 LN:4683263 UR:file:/humgen/gsa-hpprojects/GATK/bundle/ucsc.hg19/ucsc.hg19.fasta M5:9d51d4152174461cd6715c7ddc588dc8 36 | @SQ SN:chr6_mcf_hap5 LN:4833398 UR:file:/humgen/gsa-hpprojects/GATK/bundle/ucsc.hg19/ucsc.hg19.fasta M5:efed415dd8742349cb7aaca054675b9a 37 | @SQ SN:chr6_qbl_hap6 LN:4611984 UR:file:/humgen/gsa-hpprojects/GATK/bundle/ucsc.hg19/ucsc.hg19.fasta M5:094d037050cad692b57ea12c4fef790f 38 | @SQ SN:chr6_ssto_hap7 LN:4928567 UR:file:/humgen/gsa-hpprojects/GATK/bundle/ucsc.hg19/ucsc.hg19.fasta M5:3b6d666200e72bcc036bf88a4d7e0749 39 | @SQ SN:chr7_gl000195_random LN:182896 UR:file:/humgen/gsa-hpprojects/GATK/bundle/ucsc.hg19/ucsc.hg19.fasta M5:5d9ec007868d517e73543b005ba48535 40 | @SQ SN:chr8_gl000196_random LN:38914 UR:file:/humgen/gsa-hpprojects/GATK/bundle/ucsc.hg19/ucsc.hg19.fasta M5:d92206d1bb4c3b4019c43c0875c06dc0 41 | @SQ SN:chr8_gl000197_random LN:37175 UR:file:/humgen/gsa-hpprojects/GATK/bundle/ucsc.hg19/ucsc.hg19.fasta M5:6f5efdd36643a9b8c8ccad6f2f1edc7b 42 | @SQ SN:chr9_gl000198_random LN:90085 UR:file:/humgen/gsa-hpprojects/GATK/bundle/ucsc.hg19/ucsc.hg19.fasta M5:868e7784040da90d900d2d1b667a1383 43 | @SQ SN:chr9_gl000199_random LN:169874 UR:file:/humgen/gsa-hpprojects/GATK/bundle/ucsc.hg19/ucsc.hg19.fasta M5:569af3b73522fab4b40995ae4944e78e 44 | @SQ SN:chr9_gl000200_random LN:187035 UR:file:/humgen/gsa-hpprojects/GATK/bundle/ucsc.hg19/ucsc.hg19.fasta M5:75e4c8d17cd4addf3917d1703cacaf25 45 | @SQ SN:chr9_gl000201_random LN:36148 UR:file:/humgen/gsa-hpprojects/GATK/bundle/ucsc.hg19/ucsc.hg19.fasta M5:dfb7e7ec60ffdcb85cb359ea28454ee9 46 | @SQ SN:chr11_gl000202_random LN:40103 UR:file:/humgen/gsa-hpprojects/GATK/bundle/ucsc.hg19/ucsc.hg19.fasta M5:06cbf126247d89664a4faebad130fe9c 47 | @SQ SN:chr17_ctg5_hap1 LN:1680828 UR:file:/humgen/gsa-hpprojects/GATK/bundle/ucsc.hg19/ucsc.hg19.fasta M5:d89517b400226d3b56e753972a7cad67 48 | @SQ SN:chr17_gl000203_random LN:37498 UR:file:/humgen/gsa-hpprojects/GATK/bundle/ucsc.hg19/ucsc.hg19.fasta M5:96358c325fe0e70bee73436e8bb14dbd 49 | @SQ SN:chr17_gl000204_random LN:81310 UR:file:/humgen/gsa-hpprojects/GATK/bundle/ucsc.hg19/ucsc.hg19.fasta M5:efc49c871536fa8d79cb0a06fa739722 50 | @SQ SN:chr17_gl000205_random LN:174588 UR:file:/humgen/gsa-hpprojects/GATK/bundle/ucsc.hg19/ucsc.hg19.fasta M5:d22441398d99caf673e9afb9a1908ec5 51 | @SQ SN:chr17_gl000206_random LN:41001 UR:file:/humgen/gsa-hpprojects/GATK/bundle/ucsc.hg19/ucsc.hg19.fasta M5:43f69e423533e948bfae5ce1d45bd3f1 52 | @SQ SN:chr18_gl000207_random LN:4262 UR:file:/humgen/gsa-hpprojects/GATK/bundle/ucsc.hg19/ucsc.hg19.fasta M5:f3814841f1939d3ca19072d9e89f3fd7 53 | @SQ SN:chr19_gl000208_random LN:92689 UR:file:/humgen/gsa-hpprojects/GATK/bundle/ucsc.hg19/ucsc.hg19.fasta M5:aa81be49bf3fe63a79bdc6a6f279abf6 54 | @SQ SN:chr19_gl000209_random LN:159169 UR:file:/humgen/gsa-hpprojects/GATK/bundle/ucsc.hg19/ucsc.hg19.fasta M5:f40598e2a5a6b26e84a3775e0d1e2c81 55 | @SQ SN:chr21_gl000210_random LN:27682 UR:file:/humgen/gsa-hpprojects/GATK/bundle/ucsc.hg19/ucsc.hg19.fasta M5:851106a74238044126131ce2a8e5847c 56 | @SQ SN:chrUn_gl000211 LN:166566 UR:file:/humgen/gsa-hpprojects/GATK/bundle/ucsc.hg19/ucsc.hg19.fasta M5:7daaa45c66b288847b9b32b964e623d3 57 | @SQ SN:chrUn_gl000212 LN:186858 UR:file:/humgen/gsa-hpprojects/GATK/bundle/ucsc.hg19/ucsc.hg19.fasta M5:563531689f3dbd691331fd6c5730a88b 58 | @SQ SN:chrUn_gl000213 LN:164239 UR:file:/humgen/gsa-hpprojects/GATK/bundle/ucsc.hg19/ucsc.hg19.fasta M5:9d424fdcc98866650b58f004080a992a 59 | @SQ SN:chrUn_gl000214 LN:137718 UR:file:/humgen/gsa-hpprojects/GATK/bundle/ucsc.hg19/ucsc.hg19.fasta M5:46c2032c37f2ed899eb41c0473319a69 60 | @SQ SN:chrUn_gl000215 LN:172545 UR:file:/humgen/gsa-hpprojects/GATK/bundle/ucsc.hg19/ucsc.hg19.fasta M5:5eb3b418480ae67a997957c909375a73 61 | @SQ SN:chrUn_gl000216 LN:172294 UR:file:/humgen/gsa-hpprojects/GATK/bundle/ucsc.hg19/ucsc.hg19.fasta M5:642a232d91c486ac339263820aef7fe0 62 | @SQ SN:chrUn_gl000217 LN:172149 UR:file:/humgen/gsa-hpprojects/GATK/bundle/ucsc.hg19/ucsc.hg19.fasta M5:6d243e18dea1945fb7f2517615b8f52e 63 | @SQ SN:chrUn_gl000218 LN:161147 UR:file:/humgen/gsa-hpprojects/GATK/bundle/ucsc.hg19/ucsc.hg19.fasta M5:1d708b54644c26c7e01c2dad5426d38c 64 | @SQ SN:chrUn_gl000219 LN:179198 UR:file:/humgen/gsa-hpprojects/GATK/bundle/ucsc.hg19/ucsc.hg19.fasta M5:f977edd13bac459cb2ed4a5457dba1b3 65 | @SQ SN:chrUn_gl000220 LN:161802 UR:file:/humgen/gsa-hpprojects/GATK/bundle/ucsc.hg19/ucsc.hg19.fasta M5:fc35de963c57bf7648429e6454f1c9db 66 | @SQ SN:chrUn_gl000221 LN:155397 UR:file:/humgen/gsa-hpprojects/GATK/bundle/ucsc.hg19/ucsc.hg19.fasta M5:3238fb74ea87ae857f9c7508d315babb 67 | @SQ SN:chrUn_gl000222 LN:186861 UR:file:/humgen/gsa-hpprojects/GATK/bundle/ucsc.hg19/ucsc.hg19.fasta M5:6fe9abac455169f50470f5a6b01d0f59 68 | @SQ SN:chrUn_gl000223 LN:180455 UR:file:/humgen/gsa-hpprojects/GATK/bundle/ucsc.hg19/ucsc.hg19.fasta M5:399dfa03bf32022ab52a846f7ca35b30 69 | @SQ SN:chrUn_gl000224 LN:179693 UR:file:/humgen/gsa-hpprojects/GATK/bundle/ucsc.hg19/ucsc.hg19.fasta M5:d5b2fc04f6b41b212a4198a07f450e20 70 | @SQ SN:chrUn_gl000225 LN:211173 UR:file:/humgen/gsa-hpprojects/GATK/bundle/ucsc.hg19/ucsc.hg19.fasta M5:63945c3e6962f28ffd469719a747e73c 71 | @SQ SN:chrUn_gl000226 LN:15008 UR:file:/humgen/gsa-hpprojects/GATK/bundle/ucsc.hg19/ucsc.hg19.fasta M5:1c1b2cd1fccbc0a99b6a447fa24d1504 72 | @SQ SN:chrUn_gl000227 LN:128374 UR:file:/humgen/gsa-hpprojects/GATK/bundle/ucsc.hg19/ucsc.hg19.fasta M5:a4aead23f8053f2655e468bcc6ecdceb 73 | @SQ SN:chrUn_gl000228 LN:129120 UR:file:/humgen/gsa-hpprojects/GATK/bundle/ucsc.hg19/ucsc.hg19.fasta M5:c5a17c97e2c1a0b6a9cc5a6b064b714f 74 | @SQ SN:chrUn_gl000229 LN:19913 UR:file:/humgen/gsa-hpprojects/GATK/bundle/ucsc.hg19/ucsc.hg19.fasta M5:d0f40ec87de311d8e715b52e4c7062e1 75 | @SQ SN:chrUn_gl000230 LN:43691 UR:file:/humgen/gsa-hpprojects/GATK/bundle/ucsc.hg19/ucsc.hg19.fasta M5:b4eb71ee878d3706246b7c1dbef69299 76 | @SQ SN:chrUn_gl000231 LN:27386 UR:file:/humgen/gsa-hpprojects/GATK/bundle/ucsc.hg19/ucsc.hg19.fasta M5:ba8882ce3a1efa2080e5d29b956568a4 77 | @SQ SN:chrUn_gl000232 LN:40652 UR:file:/humgen/gsa-hpprojects/GATK/bundle/ucsc.hg19/ucsc.hg19.fasta M5:3e06b6741061ad93a8587531307057d8 78 | @SQ SN:chrUn_gl000233 LN:45941 UR:file:/humgen/gsa-hpprojects/GATK/bundle/ucsc.hg19/ucsc.hg19.fasta M5:7fed60298a8d62ff808b74b6ce820001 79 | @SQ SN:chrUn_gl000234 LN:40531 UR:file:/humgen/gsa-hpprojects/GATK/bundle/ucsc.hg19/ucsc.hg19.fasta M5:93f998536b61a56fd0ff47322a911d4b 80 | @SQ SN:chrUn_gl000235 LN:34474 UR:file:/humgen/gsa-hpprojects/GATK/bundle/ucsc.hg19/ucsc.hg19.fasta M5:118a25ca210cfbcdfb6c2ebb249f9680 81 | @SQ SN:chrUn_gl000236 LN:41934 UR:file:/humgen/gsa-hpprojects/GATK/bundle/ucsc.hg19/ucsc.hg19.fasta M5:fdcd739913efa1fdc64b6c0cd7016779 82 | @SQ SN:chrUn_gl000237 LN:45867 UR:file:/humgen/gsa-hpprojects/GATK/bundle/ucsc.hg19/ucsc.hg19.fasta M5:e0c82e7751df73f4f6d0ed30cdc853c0 83 | @SQ SN:chrUn_gl000238 LN:39939 UR:file:/humgen/gsa-hpprojects/GATK/bundle/ucsc.hg19/ucsc.hg19.fasta M5:131b1efc3270cc838686b54e7c34b17b 84 | @SQ SN:chrUn_gl000239 LN:33824 UR:file:/humgen/gsa-hpprojects/GATK/bundle/ucsc.hg19/ucsc.hg19.fasta M5:99795f15702caec4fa1c4e15f8a29c07 85 | @SQ SN:chrUn_gl000240 LN:41933 UR:file:/humgen/gsa-hpprojects/GATK/bundle/ucsc.hg19/ucsc.hg19.fasta M5:445a86173da9f237d7bcf41c6cb8cc62 86 | @SQ SN:chrUn_gl000241 LN:42152 UR:file:/humgen/gsa-hpprojects/GATK/bundle/ucsc.hg19/ucsc.hg19.fasta M5:ef4258cdc5a45c206cea8fc3e1d858cf 87 | @SQ SN:chrUn_gl000242 LN:43523 UR:file:/humgen/gsa-hpprojects/GATK/bundle/ucsc.hg19/ucsc.hg19.fasta M5:2f8694fc47576bc81b5fe9e7de0ba49e 88 | @SQ SN:chrUn_gl000243 LN:43341 UR:file:/humgen/gsa-hpprojects/GATK/bundle/ucsc.hg19/ucsc.hg19.fasta M5:cc34279a7e353136741c9fce79bc4396 89 | @SQ SN:chrUn_gl000244 LN:39929 UR:file:/humgen/gsa-hpprojects/GATK/bundle/ucsc.hg19/ucsc.hg19.fasta M5:0996b4475f353ca98bacb756ac479140 90 | @SQ SN:chrUn_gl000245 LN:36651 UR:file:/humgen/gsa-hpprojects/GATK/bundle/ucsc.hg19/ucsc.hg19.fasta M5:89bc61960f37d94abf0df2d481ada0ec 91 | @SQ SN:chrUn_gl000246 LN:38154 UR:file:/humgen/gsa-hpprojects/GATK/bundle/ucsc.hg19/ucsc.hg19.fasta M5:e4afcd31912af9d9c2546acf1cb23af2 92 | @SQ SN:chrUn_gl000247 LN:36422 UR:file:/humgen/gsa-hpprojects/GATK/bundle/ucsc.hg19/ucsc.hg19.fasta M5:7de00226bb7df1c57276ca6baabafd15 93 | @SQ SN:chrUn_gl000248 LN:39786 UR:file:/humgen/gsa-hpprojects/GATK/bundle/ucsc.hg19/ucsc.hg19.fasta M5:5a8e43bec9be36c7b49c84d585107776 94 | @SQ SN:chrUn_gl000249 LN:38502 UR:file:/humgen/gsa-hpprojects/GATK/bundle/ucsc.hg19/ucsc.hg19.fasta M5:1d78abec37c15fe29a275eb08d5af236 95 | -------------------------------------------------------------------------------- /elprep-package.lisp: -------------------------------------------------------------------------------- 1 | (in-package :cl-user) 2 | 3 | (defpackage #:elprep 4 | (:documentation "A library for efficiently executing sequencing pipelines on SAM/BAM/CRAM files, taking advantage of modern multi-core processors. 5 | 6 | Modifications to headers and alignments are expressed as filters. The library comes with a number of commonly used pre-defined filters, but you can also define and use your own filters. A pipeline can be executed with run-pipeline, which accepts SAM/BAM/CRAM files as input and/or output sources, but can also operate on an in-memory representation of such files. run-pipeline can be extended to also operate on other input/output sources, such as databases. 7 | 8 | Refer to the documentation of run-pipeline as a starting point.") 9 | (:use 10 | #:cl-date-time-parser 11 | #:common-lisp 12 | #+sbcl #:sb-gray 13 | #+lispworks #:stream 14 | #:string-case) 15 | (:shadow #:byte #:make-hash-table) 16 | #+lispworks 17 | (:import-from #:hcl #:get-working-directory #:modify-hash #:with-hash-table-locked) 18 | #+lispworks 19 | (:import-from #:mp #:mailbox) 20 | #+lispworks 21 | (:import-from #:sys #:compare-and-swap) 22 | #+sbcl 23 | (:import-from #:sb-concurrency #:mailbox) 24 | #+sbcl 25 | (:import-from #:sb-ext 26 | #:compare-and-swap 27 | #:run-program 28 | #:process-p 29 | #:process-input 30 | #:process-output 31 | #:process-error 32 | #:process-alive-p 33 | #:process-wait 34 | #:process-exit-code 35 | #:process-close) 36 | #+sbcl 37 | (:import-from #:named-readtables #:defreadtable #:in-readtable) 38 | 39 | (:export 40 | 41 | #:*number-of-threads* 42 | 43 | ;; sam-types 44 | 45 | #:octet #:uint16 #:int32 46 | 47 | #:*sam-file-format-version* #:*sam-file-format-date* 48 | 49 | #:sam-header #:make-sam-header #:sam-header-p 50 | #:sam-header-hd 51 | #:sam-header-sq 52 | #:sam-header-rg 53 | #:sam-header-pg 54 | #:sam-header-co 55 | #:sam-header-user-tags 56 | 57 | #:sam-header-ensure-hd 58 | 59 | #:sam-header-user-tag 60 | #:sam-header-user-tag-p 61 | 62 | #:sam-alignment #:make-sam-alignment #:sam-alignment-p 63 | #:sam-alignment-qname 64 | #:sam-alignment-flag 65 | #:sam-alignment-rname 66 | #:sam-alignment-pos 67 | #:sam-alignment-mapq 68 | #:sam-alignment-cigar 69 | #:sam-alignment-rnext 70 | #:sam-alignment-pnext 71 | #:sam-alignment-tlen 72 | #:sam-alignment-seq 73 | #:sam-alignment-qual 74 | #:sam-alignment-tags 75 | #:sam-alignment-temps 76 | 77 | #:sam-alignment-tag 78 | #:sam-alignment-temp 79 | #:sam-alignment-refid 80 | 81 | #:check-refid-type 82 | #:coordinate< 83 | 84 | #:+multiple+ 85 | #:+proper+ 86 | #:+unmapped+ 87 | #:+next-unmapped+ 88 | #:+reversed+ 89 | #:+next-reversed+ 90 | #:+first+ 91 | #:+last+ 92 | #:+secondary+ 93 | #:+qc-failed+ 94 | #:+duplicate+ 95 | #:+supplementary+ 96 | 97 | #:sam-alignment-multiple-p 98 | #:sam-alignment-proper-p 99 | #:sam-alignment-unmapped-p 100 | #:sam-alignment-next-unmapped-p 101 | #:sam-alignment-reversed-p 102 | #:sam-alignment-next-reversed-p 103 | #:sam-alignment-first-p 104 | #:sam-alignment-last-p 105 | #:sam-alignment-secondary-p 106 | #:sam-alignment-qc-failed-p 107 | #:sam-alignment-duplicate-p 108 | #:sam-alignment-supplementary-p 109 | 110 | #:sam-alignment-flag-every 111 | #:sam-alignment-flag-some 112 | #:sam-alignment-flag-notevery 113 | #:sam-alignment-flag-notany 114 | 115 | #:sam #:make-sam #:sam-p 116 | #:sam-header 117 | #:sam-alignments 118 | #:map-sam-alignments #:do-sam-alignments 119 | 120 | #:scan-cigar-string 121 | 122 | ;; sam-files 123 | 124 | #:make-scanner 125 | 126 | #:scan-string 127 | #:scan-stringn 128 | #:scan-integer 129 | #:scan-float 130 | #:parse-sam-byte-array 131 | #:parse-sam-numeric-array 132 | 133 | #:parse-sam-header-line 134 | #:parse-sam-reference-sequence-dictionary-entry 135 | #:parse-sam-read-group 136 | #:parse-sam-program 137 | #:parse-sam-comment 138 | #:parse-sam-header #:skip-sam-header 139 | #:parse-sam-alignment 140 | #:parse-sam 141 | 142 | #:format-sam-string 143 | #:format-sam-integer 144 | #:format-sam-byte-array 145 | #:format-sam-datetime 146 | #:format-sam-header-user-tag 147 | 148 | #:format-sam-header-line 149 | #:format-sam-reference-sequence-dictionary 150 | #:format-sam-read-groups 151 | #:format-sam-programs 152 | #:format-sam-comments 153 | #:format-sam-user-tags 154 | #:format-sam-header 155 | 156 | #:format-sam-tag 157 | #:format-sam-alignment 158 | 159 | #:format-sam 160 | 161 | #:stderr* 162 | #:setup-standard-streams 163 | 164 | #:get-samtools 165 | #:check-stdout 166 | #:check-stdin 167 | #:sam-file-kind 168 | #:open-sam 169 | #:open-temporary-sam 170 | #:close-sam 171 | #:invoke-with-open-sam #:with-open-sam 172 | 173 | #:*reference-fasta* 174 | #:*reference-fai* 175 | 176 | ;; filter-pipeline 177 | 178 | #:+default-chunk-size+ 179 | 180 | #:run-pipeline 181 | #:run-pipeline-in-situ 182 | #:get-output-functions 183 | 184 | 185 | #:compose-global-filters #:with-thread-filters 186 | #:compose-thread-filters #:with-alignment-filters 187 | #:create-chunk-filter 188 | #:with-output-functions 189 | #:chunk-output-loop #:with-chunk-output 190 | 191 | #:effective-sorting-order 192 | 193 | #:call-with-processes 194 | 195 | ;; filters 196 | 197 | #:replace-reference-sequence-dictionary 198 | #:replace-reference-sequence-dictionary-from-sam-file 199 | #:filter-unmapped-reads 200 | #:filter-unmapped-reads-strict 201 | #:filter-duplicate-reads 202 | #:filter-optional-reads 203 | #:add-or-replace-read-group 204 | #:parse-read-group-from-string 205 | #:add-pg-line 206 | #:clean-sam 207 | #:add-refid 208 | 209 | #:sam-alignment-rg 210 | #:compute-phred-score 211 | #:compute-unclipped-position 212 | #:mark-duplicates 213 | 214 | ;; elprep utils 215 | 216 | #:explain-flag 217 | #:merge-sorted-files-split-per-chromosome 218 | #:split-file-per-chromosome 219 | 220 | ;; ui 221 | 222 | #:elprep-script 223 | )) 224 | -------------------------------------------------------------------------------- /elprep-sfm-gnupar.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Example script for using elPrep split/merge tools. 4 | 5 | import sys 6 | import elprep_sfm_gnupar 7 | 8 | # actual script 9 | 10 | def elprep_sfm_gnupar_script(): 11 | elprep_sfm_gnupar.elprep_sfm_gnupar(sys.argv) 12 | 13 | elprep_sfm_gnupar_script() 14 | -------------------------------------------------------------------------------- /elprep-sfm.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Example script for using elPrep split/merge tools. 4 | 5 | import sys 6 | import elprep_sfm 7 | 8 | # actual script 9 | 10 | def elprep_sfm_script(): 11 | elprep_sfm.elprep_sfm(sys.argv) 12 | 13 | elprep_sfm_script() 14 | -------------------------------------------------------------------------------- /elprep-utils.lisp: -------------------------------------------------------------------------------- 1 | (in-package :elprep) 2 | (in-simple-base-string-syntax) 3 | 4 | (defun explain-flag (flag) 5 | "Return a symbolic representation of the FLAG entry in a SAM file alignment line. 6 | This is primarily for debugging purposes. 7 | See http://samtools.github.io/hts-specs/SAMv1.pdf - Section 1.4." 8 | (let ((result '())) 9 | (macrolet ((test (&rest bits) 10 | `(progn ,@(loop for bit in bits 11 | for bitn = (symbol-name bit) 12 | for bitk = (intern (subseq bitn 1 (1- (length bitn))) :keyword) 13 | collect `(when (/= (logand flag ,bit) 0) 14 | (push ,bitk result)))))) 15 | (test +supplementary+ 16 | +duplicate+ 17 | +qc-failed+ 18 | +secondary+ 19 | +last+ 20 | +first+ 21 | +next-reversed+ 22 | +reversed+ 23 | +next-unmapped+ 24 | +unmapped+ 25 | +proper+ 26 | +multiple+)) 27 | result)) 28 | 29 | (defun sam-alignment-differ (aln1 aln2) 30 | "Return false if the two sam-alignments have the same mandatory fields. 31 | Otherwise, return a symbol indicating which field differs." 32 | (declare (sam-alignment aln1 aln2) #.*optimization*) 33 | ; check that all mandatory fields are = 34 | (or (when (string/= (the base-string (sam-alignment-qname aln1)) (the base-string (sam-alignment-qname aln2))) 'qname) 35 | (when (/= (the fixnum (sam-alignment-flag aln1)) (the fixnum (sam-alignment-flag aln2))) 'flag) 36 | (when (string/= (the base-string (sam-alignment-rname aln1)) (the base-string (sam-alignment-rname aln2))) 'rname) 37 | (when (/= (the int32 (sam-alignment-pos aln1)) (the int32 (sam-alignment-pos aln2))) 'pos) 38 | (when (/= (sam-alignment-mapq aln1) (sam-alignment-mapq aln2)) 'mapq) 39 | (when (string/= (the base-string (sam-alignment-cigar aln1)) (the base-string (sam-alignment-cigar aln2))) 'cigar) 40 | (when (string/= (the base-string (sam-alignment-rnext aln1)) (the base-string (sam-alignment-rnext aln2))) 'rnext) 41 | (when (string/= (the base-string (sam-alignment-qual aln1)) (the base-string (sam-alignment-qual aln2))) 'qual))) 42 | 43 | (defun sam-alignment-same (aln1 aln2) 44 | "Return true if the two sam-alignments have the same mandatory fields, false otherwise." 45 | (declare (sam-alignment aln1 aln2) #.*optimization*) 46 | (and (string= (the base-string (sam-alignment-qname aln1)) (the base-string (sam-alignment-qname aln2))) 47 | (= (the fixnum (sam-alignment-flag aln1)) (the fixnum (sam-alignment-flag aln2))) 48 | (string= (the base-string (sam-alignment-rname aln1)) (the base-string (sam-alignment-rname aln2))) 49 | (= (the int32 (sam-alignment-pos aln1)) (the int32 (sam-alignment-pos aln2))) 50 | (= (sam-alignment-mapq aln1) (sam-alignment-mapq aln2)) 51 | (string= (the base-string (sam-alignment-cigar aln1)) (the base-string (sam-alignment-cigar aln2))) 52 | (string= (the base-string (sam-alignment-rnext aln1)) (the base-string (sam-alignment-rnext aln2))) 53 | (string= (the base-string (sam-alignment-qual aln1)) (the base-string (sam-alignment-qual aln2))))) 54 | 55 | (defun real-diffs (alns1 alns2) 56 | "Return a list of sam-alignments in alns1 for which no alignments in alns2 exist that have the same mandatory fields." 57 | (loop for aln1 in alns1 58 | unless (find aln1 alns2 :test #'sam-alignment-same) 59 | collect aln1)) 60 | 61 | (defun compare-sams (sam1-file sam2-file) 62 | "Parse both SAM files, then compare the mandatory fields of all alignments one by one." 63 | (let ((sam1 (make-sam)) 64 | (sam2 (make-sam)) 65 | (working-directory (get-working-directory))) 66 | (run-pipeline (merge-pathnames sam1-file working-directory) sam1 :sorting-order :queryname) 67 | (run-pipeline (merge-pathnames sam2-file working-directory) sam2 :sorting-order :queryname) 68 | (format t "sam1:~s alns sam2:~s alns ~%" (length (sam-alignments sam1)) (length (sam-alignments sam2))) 69 | (let ((differences1 nil) 70 | (differences2 nil)) 71 | (loop for aln1 in (sam-alignments sam1) ; filter diffs 72 | for aln2 in (sam-alignments sam2) 73 | do (let ((d (sam-alignment-differ aln1 aln2))) 74 | (when d 75 | (push aln1 differences1) 76 | (push aln2 differences2)))) 77 | (real-diffs differences1 differences2)))) ; sort slightly different order in elprep so get out real diffs 78 | 79 | (defun verify-order-kept (sam-file) 80 | "Assume the SAM file is sorted by coordinate order. Verify that this is stil the case." 81 | (format t "verifying order kept ~%") 82 | (let ((sam (make-sam)) 83 | (working-directory (get-working-directory))) 84 | (run-pipeline (merge-pathnames sam-file working-directory) sam) 85 | (let (pos rname qname ctr) 86 | (do-sam-alignments (aln sam) 87 | (if (null pos) 88 | ;; first alignment 89 | (setq pos (sam-alignment-pos aln) 90 | rname (sam-alignment-rname aln) 91 | qname (sam-alignment-qname aln) 92 | ctr 1) 93 | ;; remaining alignments 94 | (let ((new-pos (sam-alignment-pos aln)) 95 | (new-rname (sam-alignment-rname aln)) 96 | (new-qname (sam-alignment-qname aln))) 97 | (cond ((and (< new-pos pos) (string= rname new-rname )) 98 | (format t "Not sorted: previous pos: ~s,~s,~s current pos: ~s,~s,~s. ~s reads were in the right order. ~%" qname rname pos new-qname new-rname new-pos ctr) 99 | (return-from verify-order-kept nil)) 100 | (t 101 | (incf ctr) 102 | (setf rname new-rname) 103 | (setf pos new-pos)))))) 104 | (return-from verify-order-kept t)))) 105 | 106 | (defun count-duplicates (sam-file) 107 | "Return the number of alignments in the SAM file that are marked as duplicates." 108 | (let ((sam (make-sam))) 109 | (run-pipeline (merge-pathnames sam-file (get-working-directory)) sam) 110 | (let ((count 0)) 111 | (do-sam-alignments (aln sam) 112 | (when (sam-alignment-duplicate-p aln) (incf count))) 113 | count))) 114 | 115 | ; code for splitting up sam files into chromosomes 116 | 117 | (define-symbol-macro optional-data-tag "sr:i:1") 118 | 119 | (defun split-file-per-chromosome (input output-path output-prefix output-extension) 120 | "A function for splitting a sam file into : a file containing all unmapped reads, a file containing all pairs where reads map to different chromosomes, a file per chromosome containing all pairs where the reads map to that chromosome. There are no requirements on the input file for splitting." 121 | (let ((files (directory input))) 122 | (multiple-value-bind 123 | (first-in first-program) 124 | (open-sam (first files) :direction :input) 125 | (let ((header (parse-sam-header first-in)) 126 | (splits-path (merge-pathnames (make-pathname :directory '(:relative "splits")) output-path)) 127 | (chroms-encountered (make-single-thread-hash-table :test #'buffer= :hash-function #'buffer-hash)) ; chr -> file name 128 | (buf-unmapped (make-buffer "*"))) 129 | ; create a directory for split files 130 | (ensure-directories-exist splits-path) 131 | ; tag the header as one created with elPrep split 132 | (setf (sam-header-user-tag header :|@sr|) (list (list :|co| "This file was created using elprep split."))) 133 | ; fill in a file for unmapped reads 134 | (setf (gethash buf-unmapped chroms-encountered) 135 | (multiple-value-bind 136 | (file program) ; create a new headerless file 137 | (open-sam (merge-pathnames splits-path (make-pathname :name (format nil "~a-unmapped" output-prefix) :type output-extension)) :direction :output) 138 | (format-sam-header file header) 139 | (cons file program))) 140 | ; create split files for each chromosome 141 | (loop for sn-form in (sam-header-sq header) 142 | do (assert (not (null sn-form))) 143 | (let* ((chrom (getf sn-form :SN)) 144 | (buf-chrom (make-buffer chrom))) 145 | (setf (gethash buf-chrom chroms-encountered) 146 | (multiple-value-bind 147 | (file program) 148 | (open-sam (merge-pathnames splits-path (make-pathname :name (format nil "~a-~a" output-prefix chrom) :type output-extension)) :direction :output) 149 | (format-sam-header file header) 150 | (cons file program))))) 151 | (unwind-protect 152 | (with-open-sam (spread-reads-stream (merge-pathnames output-path (make-pathname :name (format nil "~a-spread" output-prefix) :type output-extension)) :direction :output) 153 | (format-sam-header spread-reads-stream header) 154 | (let ((buf-= (make-buffer "="))) 155 | (let ((rname (make-buffer)) 156 | (rnext (make-buffer)) 157 | (aln-string (make-buffer))) 158 | (flet ((process-file (in) 159 | (loop do (read-line-into-buffer in aln-string) 160 | until (buffer-emptyp aln-string) 161 | do (progn (buffer-partition aln-string #\Tab 2 rname 6 rnext) 162 | (let* ((file (car (gethash rname chroms-encountered)))) 163 | (cond ((or (or (buffer= buf-= rnext) (buffer= rname rnext)) ; read and mate map to the same chromosome; 164 | (buffer= buf-unmapped rname)) ; read unmapped 165 | (write-buffer aln-string file) 166 | (write-newline file)) 167 | (t ; the read is part of a pair mapping to two different chromosomes 168 | (write-buffer aln-string spread-reads-stream) 169 | (write-newline spread-reads-stream) 170 | ; duplicate the info in the chromosome file so it can be used; mark the read as duplicate info 171 | (write-buffer aln-string file) 172 | (write-tab file) 173 | (writestr file optional-data-tag) 174 | (write-newline file)))))) 175 | (reinitialize-buffer rname) 176 | (reinitialize-buffer rnext) 177 | (reinitialize-buffer aln-string))) 178 | (process-file first-in) 179 | (close-sam first-in first-program) 180 | (loop for in-file in (rest files) 181 | do (with-open-sam (in in-file :direction :input) 182 | (skip-sam-header in) 183 | (process-file in))))))) 184 | (loop for (file . program) being each hash-value of chroms-encountered 185 | do (close-sam file program))))))) 186 | 187 | (defun merge-sorted-files-split-per-chromosome (input-path output input-prefix input-extension header) 188 | "A function for merging files that were split with elPrep and sorted in coordinate order." 189 | ; Extract the header to identify the files names. 190 | ; Assume that all files are sorted per cooordinate order, i.e. first sorted on refid entry according to sequence dictionary, then sorted on position entry. 191 | ; There is a file per chromosome in the sequence dictionary. These contain all reads that map to that chromosome. 192 | ; On top of that, there is a file that contains the unmapped (or *) reads and a file that contains the reads that map to different chromosomes. 193 | ; Merge these files in the order of the sequence dictionary. Put the unmapped reads as the last entries. 194 | ; When merging a particular chromosome file into the merged file, make sure that reads that map to different chromosomes are merged in correctly. 195 | ; So while mergin a particular chromosome file, pop and compare against reads in the file for reads that map to different chromosomes until the next chromosome 196 | ; is encountered on the refid position. 197 | ; when a file is empty, close it and remove it from the list of files to merge 198 | ; loop for identifying and opening the files to merge 199 | ; extract min/max from the header 200 | ; merge the files 201 | (with-open-sam (spread-reads-file (merge-pathnames input-path (make-pathname :name (format nil "~a-spread" input-prefix) :type input-extension)) :direction :input) 202 | (skip-sam-header spread-reads-file) 203 | ; merge loop 204 | (with-open-sam (out output :direction :output) 205 | (format-sam-header out header) 206 | (let ((spread-read (make-buffer)) ; for storing entries from the spread-read file 207 | (spread-read-refid (make-buffer)) 208 | (spread-read-pos (make-buffer)) 209 | (chromosome-read (make-buffer)) ; for storing reads from the chromsome file we are currently merging 210 | (chromosome-read-refid (make-buffer)) 211 | (chromosome-read-pos (make-buffer)) 212 | (common-read-refid (make-buffer))) 213 | ; then merge the rest of the files 214 | (loop for sn-form in (sam-header-sq header) 215 | for chrom = (getf sn-form :SN) 216 | for file-name = (merge-pathnames input-path (make-pathname :name (format nil "~a-~a" input-prefix chrom) :type input-extension)) 217 | when (probe-file file-name) do 218 | (reinitialize-buffer common-read-refid) 219 | (buffer-extend common-read-refid chrom) 220 | (block chromosome-loop 221 | (with-open-sam (file file-name :direction :input) 222 | (skip-sam-header file) 223 | (read-line-into-buffer file chromosome-read) 224 | (when (buffer-emptyp chromosome-read) (return-from chromosome-loop)) 225 | (buffer-partition chromosome-read #\Tab 2 chromosome-read-refid 3 chromosome-read-pos) 226 | (assert (buffer= chromosome-read-refid common-read-refid)) 227 | (when (buffer-emptyp spread-read) ; if the buffer is not empty, the current entry is potentially an entry for this file and it should not be overwritten 228 | (read-line-into-buffer spread-reads-file spread-read) 229 | (buffer-partition spread-read #\Tab 2 spread-read-refid 3 spread-read-pos)) 230 | (unless (buffer-emptyp spread-read) 231 | (when (buffer= spread-read-refid chromosome-read-refid) 232 | (let ((pos1 (buffer-parse-integer spread-read-pos)) 233 | (pos2 (buffer-parse-integer chromosome-read-pos))) 234 | (loop do (cond ((< pos1 pos2) 235 | (write-buffer spread-read out) 236 | (write-newline out) 237 | (read-line-into-buffer spread-reads-file spread-read) 238 | (cond ((buffer-emptyp spread-read) 239 | (loop-finish)) 240 | (t (buffer-partition spread-read #\Tab 2 spread-read-refid 3 spread-read-pos) 241 | (setq pos1 (buffer-parse-integer spread-read-pos))))) 242 | (t (write-buffer chromosome-read out) 243 | (write-newline out) 244 | (read-line-into-buffer file chromosome-read) 245 | (cond ((buffer-emptyp chromosome-read) 246 | (loop-finish)) 247 | (t (buffer-partition chromosome-read #\Tab 3 chromosome-read-pos) 248 | (setq pos2 (buffer-parse-integer chromosome-read-pos)))))) 249 | while (buffer= chromosome-read-refid spread-read-refid))))) 250 | ; copy remaining reads in the file, if any 251 | (when (not (buffer-emptyp chromosome-read)) 252 | (write-buffer chromosome-read out) 253 | (write-newline out)) 254 | (copy-stream file out))) 255 | ; copy remaining reads in the spread file, if any, that are one the same chromosome as the file was 256 | (when (not (buffer-emptyp spread-read)) 257 | (loop while (buffer= spread-read-refid common-read-refid) 258 | do 259 | (write-buffer spread-read out) 260 | (write-newline out) 261 | (read-line-into-buffer spread-reads-file spread-read) 262 | (when (buffer-emptyp spread-read) (return)) 263 | (buffer-partition spread-read #\Tab 2 spread-read-refid) 264 | finally 265 | (buffer-partition spread-read #\Tab 3 spread-read-pos)))) 266 | ; merge the remaining reads in the spread-reads file 267 | (when (not (buffer-emptyp spread-read)) 268 | (write-buffer spread-read out) 269 | (write-newline out)) 270 | (copy-stream spread-reads-file out)) 271 | ; merge the unmapped reads 272 | (with-open-sam (unmapped-file (merge-pathnames input-path (make-pathname :name (format nil "~a-unmapped" input-prefix) :type input-extension)) :direction :input) 273 | (skip-sam-header unmapped-file) 274 | (copy-stream unmapped-file out))))) 275 | 276 | (defun merge-unsorted-files-split-per-chromosome (input-path output input-prefix input-extension header) 277 | "A function for merging files that were split with elPrep and are unsorted" 278 | ; merge loop 279 | (with-open-sam (out output :direction :output) 280 | (format-sam-header out header) 281 | ; first merge the unmapped reads 282 | (with-open-sam (unmapped-file (merge-pathnames input-path (make-pathname :name (format nil "~a-unmapped" input-prefix) :type input-extension)) :direction :input) 283 | (skip-sam-header unmapped-file) 284 | (copy-stream unmapped-file out)) 285 | ; merge spread reads 286 | (with-open-sam (spread-reads-file (merge-pathnames input-path (make-pathname :name (format nil "~a-spread" input-prefix) :type input-extension)) :direction :input) 287 | (skip-sam-header spread-reads-file) 288 | (copy-stream spread-reads-file out)) 289 | ; merge the rest of the files 290 | (loop for sn-form in (sam-header-sq header) 291 | for chrom = (getf sn-form :SN) 292 | for file-name = (merge-pathnames input-path (make-pathname :name (format nil "~a-~a" input-prefix chrom) :type input-extension)) 293 | when (probe-file file-name) do 294 | (with-open-sam (file file-name :direction :input) 295 | (skip-sam-header file) 296 | (copy-stream file out))))) 297 | 298 | (declaim (inline parse-sam-alignment-from-stream)) 299 | 300 | (defun parse-sam-alignment-from-stream (stream) 301 | "Read a line from a stream and parse it as a SAM alignment line. 302 | Return NIL if stream is at end of file. 303 | See http://samtools.github.io/hts-specs/SAMv1.pdf - Section 1.4." 304 | (let ((line (read-line stream nil))) 305 | (when line (parse-sam-alignment line)))) 306 | 307 | (defun compare-sam-files (file1 file2 &optional (output "/dev/stdout")) 308 | "A function for comparing two sam files. The input files must be sorted by coordinate order." 309 | (labels ((get-alns (stream next-aln) 310 | (loop with group-aln = (or next-aln (parse-sam-alignment-from-stream stream)) 311 | with alns = (list group-aln) 312 | for aln = (parse-sam-alignment-from-stream stream) 313 | while (and aln 314 | (= (sam-alignment-pos group-aln) 315 | (sam-alignment-pos aln)) 316 | (string= (sam-alignment-rname group-aln) 317 | (sam-alignment-rname aln))) 318 | do (push aln alns) 319 | finally (return (values (sort alns (lambda (aln1 aln2) 320 | (or (string< (sam-alignment-qname aln1) 321 | (sam-alignment-qname aln2)) 322 | (when (string= (sam-alignment-qname aln1) 323 | (sam-alignment-qname aln2)) 324 | (< (sam-alignment-flag aln1) 325 | (sam-alignment-flag aln2)))))) 326 | aln)))) 327 | (plist-to-sorted-alist (plist) 328 | (sort (loop for (key value) on plist by #'cddr collect (cons key value)) 329 | #'string< :key (lambda (object) (string (car object))))) 330 | (compare-alns (out alns1 alns2) 331 | (loop for aln1 in alns1 332 | for aln2 in alns2 333 | for difference = (cond ((string/= (sam-alignment-qname aln1) 334 | (sam-alignment-qname aln2)) "qname (1)") 335 | ((/= (sam-alignment-flag aln1) 336 | (sam-alignment-flag aln2)) "flag (2)") 337 | ((string/= (sam-alignment-rname aln1) 338 | (sam-alignment-rname aln2)) "rname (3)") 339 | ((/= (sam-alignment-pos aln1) 340 | (sam-alignment-pos aln2)) "pos (4)") 341 | ((/= (sam-alignment-mapq aln1) 342 | (sam-alignment-mapq aln2)) "mapq (5)") 343 | ((string/= (sam-alignment-cigar aln1) 344 | (sam-alignment-cigar aln2)) "cigar (6)") 345 | ((string/= (sam-alignment-rnext aln1) 346 | (sam-alignment-rnext aln2)) "rnext (7)") 347 | ((/= (sam-alignment-pnext aln1) 348 | (sam-alignment-pnext aln2)) "pnext (8)") 349 | ((/= (sam-alignment-tlen aln1) 350 | (sam-alignment-tlen aln2)) "tlen (9)") 351 | ((string/= (sam-alignment-seq aln1) 352 | (sam-alignment-seq aln2)) "seq (10)") 353 | ((string/= (sam-alignment-qual aln1) 354 | (sam-alignment-qual aln2)) "qual (11)") 355 | (t (let ((tags1 (plist-to-sorted-alist (sam-alignment-tags aln1))) 356 | (tags2 (plist-to-sorted-alist (sam-alignment-tags aln2)))) 357 | (when (or (/= (length tags1) (length tags2)) 358 | (loop for (nil . val1) in tags1 359 | for (nil . val2) in tags2 360 | thereis (or (not (eq (type-of val1) (type-of val2))) 361 | (etypecase val1 362 | (character (char/= val1 val2)) 363 | (number (/= val1 val2)) 364 | (string (string/= val1 val2)) 365 | (array (not (equalp val1 val2))))))) 366 | "optional tags")))) 367 | when difference do 368 | (format t "alignments differ for ~a entry: ~%" difference) 369 | (format-sam-alignment *standard-output* aln1) 370 | (format-sam-alignment *standard-output* aln2) 371 | (format-sam-alignment out aln1) 372 | (format-sam-alignment out aln2)))) 373 | (with-open-sam (in1 file1 :direction :input) 374 | (skip-sam-header in1) 375 | (with-open-sam (in2 file2 :direction :input) 376 | (skip-sam-header in2) 377 | (with-open-sam (out output :direction :output) 378 | (loop 379 | for prev-aln1 = nil 380 | for prev-aln2 = nil 381 | for alns1 = (multiple-value-bind (alns next) (get-alns in1 prev-aln1) (setf prev-aln1 next) alns) 382 | for alns2 = (multiple-value-bind (alns next) (get-alns in2 prev-aln2) (setf prev-aln2 next) alns) 383 | for l1 = (length alns1) 384 | for l2 = (length alns2) 385 | for index from 1 386 | sum l1 into nr-of-reads-matched 387 | while (or alns1 alns2) do 388 | (if (= l1 l2) 389 | (compare-alns out alns1 alns2) 390 | (let ((pos (sam-alignment-pos (or (first alns1) (first alns2)))) 391 | (rname (sam-alignment-rname (or (first alns1) (first alns2))))) 392 | (format t "Files contain an unequal number of read entries at the same position.~%") 393 | (format t "File ~a has ~a reads at position ~a ~a.~%" file1 l1 rname pos) 394 | (format t "File ~a has ~a reads at position ~a ~a.~%" file2 l2 rname pos))) 395 | (when (zerop (mod index 1000000)) 396 | (format t "~a reads compared and matched.~%" nr-of-reads-matched)) 397 | finally (format t "~a reads compared and matched.~%" nr-of-reads-matched))))))) 398 | -------------------------------------------------------------------------------- /elprep.asd: -------------------------------------------------------------------------------- 1 | (asdf:defsystem #:elprep 2 | :version "2.61" 3 | :author "Charlotte Herzeel, Pascal Costanza" 4 | :licence 5 | "Copyright (c) 2014, 2015, 2016 Imec and Intel Corporation. 6 | Copyright (c) 2017 Imec. 7 | All rights reserved. 8 | 9 | Redistribution and use in source and binary forms, with or without 10 | modification, are permitted provided that the following conditions are 11 | met: 12 | 13 | * Redistributions of source code must retain the above copyright 14 | notice, this list of conditions and the following disclaimer. 15 | * Redistributions in binary form must reproduce the above copyright 16 | notice, this list of conditions and the following disclaimer in the 17 | documentation and/or other materials provided with the distribution. 18 | * Neither the name of Imec or Intel Corporation nor the names of its 19 | contributors may be used to endorse or promote products derived from 20 | this software without specific prior written permission." 21 | :components ((:file "elprep-package") 22 | (:file "lisp-utils" :depends-on ("elprep-package")) 23 | (:file "io-utils" :depends-on ("lisp-utils")) 24 | (:file "buffer" :depends-on ("lisp-utils" "io-utils")) 25 | (:file "sam-types" :depends-on ("lisp-utils")) 26 | (:file "sam-files" :depends-on ("io-utils" "sam-types")) 27 | (:file "simple-trees" :depends-on ("lisp-utils")) 28 | (:file "filter-pipeline" :depends-on ("sam-files" "simple-trees")) 29 | (:file "simple-filters" :depends-on ("filter-pipeline")) 30 | (:file "clean-sam" :depends-on ("filter-pipeline")) 31 | (:file "mark-duplicates" :depends-on ("filter-pipeline")) 32 | (:file "elprep-utils" :depends-on ("filter-pipeline" "buffer")) 33 | (:file "user-interface" :depends-on ("filter-pipeline" "mark-duplicates" "clean-sam" "simple-filters" "elprep-utils"))) 34 | :depends-on ("cl-date-time-parser" 35 | "cl-fad" 36 | "claws" 37 | #+sbcl "named-readtables" 38 | "string-case" 39 | #+sbcl (:require :sb-concurrency))) 40 | -------------------------------------------------------------------------------- /elprep.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | import elprep_im 5 | 6 | def elprep(): 7 | elprep_im.elprep_im(sys.argv[]) 8 | 9 | elprep() 10 | -------------------------------------------------------------------------------- /elprep_entrypoint.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Wrapper script for using elPrep, elPrep sfm, and elPrep gnupar 4 | # Can be used as an entrypoint for Docker 5 | 6 | import sys 7 | import os 8 | import subprocess 9 | import elprep_im 10 | import elprep_sfm 11 | import elprep_sfm_gnupar 12 | 13 | # actual script 14 | 15 | def elprep_entrypoint(): 16 | if len(sys.argv) >= 2: 17 | script_selector = sys.argv[1] 18 | if (script_selector == "sfm"): 19 | elprep_sfm.elprep_sfm(sys.argv[1:]) 20 | elif (script_selector == "sfm-gnupar"): 21 | elprep_sfm_gnupar.elprep_sfm_gnupar(sys.argv[1:]) 22 | else: 23 | elprep_im.elprep_im(sys.argv) 24 | else: 25 | ret = subprocess.call(["elprep"]) 26 | if ret != 0: raise SystemExit, ret 27 | 28 | elprep_entrypoint() 29 | -------------------------------------------------------------------------------- /elprep_im.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import elprep_io_wrapper 4 | 5 | def elprep_im(argv): 6 | elprep_io_wrapper.cmd_wrap_io(["elprep"], argv[1], argv[2], argv[3:]) 7 | -------------------------------------------------------------------------------- /elprep_io_wrapper.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | import subprocess 5 | import os 6 | 7 | # utilities 8 | 9 | def member(el,l): 10 | i = iter(l) 11 | for val in i: 12 | if val == el: 13 | return list(i) 14 | 15 | def cmd_option(opt, cmdl): 16 | val = member(opt, cmdl) 17 | return [opt,val[0]] if val else [] 18 | 19 | def remove_cmd_option(cmd_l, name): 20 | "removes a command with its option value from a list of commands" 21 | nl = [] 22 | i = iter(cmd_l) 23 | val = next(i, None) 24 | while(val): 25 | if not(val == name): 26 | nl = nl + [val] 27 | val = next(i, None) 28 | else: 29 | next(i, None) 30 | val = next(i, None) 31 | return nl 32 | 33 | def cmd_wrap_input (cmd_list, file_in, file_out, cmd_opts): 34 | input = os.path.basename(file_in) 35 | input_prefix, input_extension = os.path.splitext(input) 36 | nr_of_threads_opt = cmd_option("--nr-of-threads", sys.argv) 37 | nr_of_threads = str(nr_of_threads_opt[1]) if nr_of_threads_opt else "1" 38 | if (input_extension == ".bam" or input_extension == ".cram"): 39 | p1 = subprocess.Popen(["samtools", "view", "-h", "-@", nr_of_threads, file_in], bufsize=-1, stdout=subprocess.PIPE) 40 | p2 = subprocess.Popen(cmd_list + ["/dev/stdin", file_out] + cmd_opts, bufsize=-1, stdin=p1.stdout) 41 | p2.communicate() 42 | if p2.returncode != 0: raise SystemExit, p2.returncode 43 | else: 44 | ret = subprocess.call(cmd_list + [file_in, file_out] + cmd_opts) 45 | if ret != 0: raise SystemExit, ret 46 | 47 | def cmd_wrap_output (cmd_list, file_in, file_out, cmd_opts): 48 | output = os.path.basename(file_out) 49 | output_prefix, output_extension = os.path.splitext(output) 50 | nr_of_threads_opt = cmd_option("--nr-of-threads", sys.argv) 51 | nr_of_threads = str(nr_of_threads_opt[1]) if nr_of_threads_opt else "1" 52 | if (output_extension == ".bam" or output_extension == ".cram"): 53 | p1 = subprocess.Popen(cmd_list + [file_in, "/dev/stdout"] + cmd_opts, bufsize=-1, stdout=subprocess.PIPE) 54 | p2 = subprocess.Popen(["samtools", "view", "-bS", "-@", nr_of_threads, "-o", file_out, "-"], bufsize=-1, stdin=p1.stdout) 55 | p2.communicate() 56 | if p2.returncode != 0: raise SystemExit, p2.returncode 57 | else: 58 | ret = subprocess.call(cmd_list + [file_in, file_out] + cmd_opts) 59 | if ret != 0: raise SystemExit, ret 60 | 61 | def cmd_wrap_io(cmd_list, file_in, file_out, cmd_opts): 62 | input = os.path.basename(file_in) 63 | output = os.path.basename(file_out) 64 | output_prefix, output_extension = os.path.splitext(output) 65 | input_prefix, input_extension = os.path.splitext(input) 66 | nr_of_threads_opt = cmd_option("--nr-of-threads", sys.argv) 67 | nr_of_threads = str(nr_of_threads_opt[1]) if nr_of_threads_opt else "1" 68 | if (input_extension == ".bam" or input_extension == ".cram"): 69 | p1 = subprocess.Popen(["samtools", "view", "-h", "-@", nr_of_threads, file_in], bufsize=-1, stdout=subprocess.PIPE) 70 | if (output_extension == ".bam"): 71 | p2 = subprocess.Popen(cmd_list + ["/dev/stdin", "/dev/stdout"] + cmd_opts, bufsize=-1, stdin=p1.stdout, stdout=subprocess.PIPE) 72 | p3 = subprocess.Popen(["samtools", "view", "-bS", "-@", nr_of_threads, "-o", file_out, "-"], bufsize=-1, stdin=p2.stdout) 73 | p3.communicate() 74 | if p3.returncode != 0: raise SystemExit, p3.returncode 75 | elif (output_extension == ".cram"): 76 | reference_t_opt = cmd_option("--reference-t", cmd_opts) 77 | reference_bigT_opt = cmd_option("--reference-T", cmd_opts) 78 | if not(reference_t_opt) and not(reference_bigT_opt): return "Converting to .cram. Need to pass reference-t or reference-T" 79 | opt_to_delete = "--reference-t" if reference_t_opt else "--reference-T" 80 | p2 = subprocess.Popen(cmd_list + ["dev/stdin", "/dev/stdout"] + remove_cmd_option(cmd_opts, opt_to_delete), bufsize=-1, stdin=p1.stdout, stdout=subprocess.PIPE) 81 | t_opt = ["-t", reference_t_opt[1]] if reference_t_opt else ["-T", reference_bigT_opt[1]] 82 | p3 = subprocess.Popen(["samtools", "view", "-C", "-@", nr_of_threads] + t_opt + ["-o", file_out, "-"], bufsize=-1, stdin=p2.stdout) 83 | p3.communicate() 84 | if p3.returncode != 0: raise SystemExit, p3.returncode 85 | else: # output_extension == ".sam" 86 | p2 = subprocess.Popen(cmd_list + ["/dev/stdin", file_out] + cmd_opts, bufsize=-1, stdin=p1.stdout) 87 | p2.communicate() 88 | if p2.returncode != 0: raise SystemExit, p2.returncode 89 | else: # input_extension == ".sam" 90 | if (output_extension == ".bam"): 91 | p1 = subprocess.Popen(cmd_list + [file_in, "/dev/stdout"] + cmd_opts, bufsize=-1, stdout=subprocess.PIPE) 92 | p2 = subprocess.Popen(["samtools", "view", "-bS", "-@", nr_of_threads, "-o", file_out, "-"], bufsize=-1, stdin=p1.stdout) 93 | p2.communicate() 94 | if p2.returncode != 0: raise SystemExit, p2.returncode 95 | elif (output_extension == ".cram"): 96 | reference_t_opt = cmd_option("--reference-t", sys.argv) 97 | reference_bigT_opt = cmd_option("--reference-T", sys.argv) 98 | if not(reference_t_opt) and not(reference_bigT_opt): return "Converting to .cram. Need to pass reference-t or reference-T" 99 | opt_to_delete = "--reference-t" if reference_t_opt else "--reference-T" 100 | p1 = subprocess.Popen(cmd_list + [file_in, "/dev/stdout"] + remove_cmd_option(cmd_opts, opt_to_delete), bufsize=-1, stdout=subprocess.PIPE) 101 | t_opt = ["-t", reference_t_opt[1]] if reference_t_opt else ["-T", reference_bigT_opt[1]] 102 | p2 = subprocess.Popen(["samtools", "view", "-C", "-@", nr_of_threads] + t_opt + ["-o", file_out, "-"], bufsize=-1, stdin=p1.stdout) 103 | p2.communicate() 104 | if p2.returncode != 0: raise SystemExit, p2.returncode 105 | else: # output_extension == ".sam" 106 | ret = subprocess.call(cmd_list + [file_in, file_out] + cmd_opts) 107 | if ret != 0: raise SystemExit, ret 108 | -------------------------------------------------------------------------------- /elprep_sfm.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Example script for using elPrep split/merge tools. 4 | 5 | import subprocess 6 | import os 7 | import time 8 | import elprep_io_wrapper 9 | 10 | # actual script 11 | 12 | def elprep_sfm (argv): 13 | # set up directories for intermediate results 14 | file_in = argv[1] 15 | file_out = argv[2] 16 | input = os.path.basename(file_in) 17 | output_prefix, output_extension = os.path.splitext(input) 18 | stamp = str(time.time()) 19 | split_dir = os.path.join(os.getcwd(), "temp-" + stamp + os.sep) 20 | result_dir = os.path.join(os.getcwd(), "temp-processed-" + stamp + os.sep) 21 | os.mkdir(split_dir) 22 | os.mkdir(result_dir) 23 | # split command 24 | nr_of_threads_opt = elprep_io_wrapper.cmd_option("--nr-of-threads", argv) 25 | intermediate_files_opt = elprep_io_wrapper.cmd_option("--intermediate-files-output-type", argv) 26 | if intermediate_files_opt: 27 | intermediate_files_output_type = intermediate_files_opt[1] 28 | else: 29 | intermediate_files_output_type = "sam" 30 | given_cmd_opts = elprep_io_wrapper.remove_cmd_option(argv[3:], "--intermediate-files-output-type") 31 | cmd_opts = given_cmd_opts 32 | elprep_io_wrapper.cmd_wrap_input(["elprep", "split"], file_in, split_dir, ["--output-prefix", output_prefix, "--output-type", intermediate_files_output_type] + nr_of_threads_opt) 33 | spread_file = os.path.join(split_dir, output_prefix + "-spread." + intermediate_files_output_type) 34 | splits_path = os.path.join(split_dir, "splits" + os.sep) 35 | # run filter command for split files 36 | for root, dirs, files in os.walk(splits_path): 37 | for file in files: 38 | ext = os.path.splitext(file)[1] 39 | if (ext == ".sam" or ext == ".bam" or ext == ".cram"): 40 | ffile = os.path.join(root, file) 41 | processed_file = os.path.join(result_dir, os.path.basename(file)) 42 | elprep_io_wrapper.cmd_wrap_io(["elprep"], ffile, processed_file, cmd_opts + ["--split-file"]) 43 | os.remove(ffile) 44 | os.rmdir(splits_path) 45 | # command for spread file 46 | spread_out_file = os.path.join(result_dir, output_prefix + "-spread." + intermediate_files_output_type) 47 | elprep_io_wrapper.cmd_wrap_io(["elprep"], spread_file, spread_out_file , cmd_opts) 48 | os.remove(spread_file) 49 | os.rmdir(split_dir) 50 | # merge command 51 | elprep_io_wrapper.cmd_wrap_output(["elprep", "merge"], result_dir, file_out, nr_of_threads_opt) 52 | # remove directories for intermediate results 53 | for root, dirs, files in os.walk(result_dir): 54 | for file in files: 55 | ffile = os.path.join(root, file) 56 | os.remove(ffile) 57 | os.rmdir(result_dir) 58 | -------------------------------------------------------------------------------- /elprep_sfm_gnupar.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Example script for using elPrep split/merge tools. 4 | 5 | import subprocess 6 | import os 7 | import time 8 | import elprep_io_wrapper 9 | import operator 10 | 11 | # actual script 12 | 13 | def append_cmd(cmd1, cmd2): 14 | return cmd1 + " " + cmd2 15 | 16 | def elprep_sfm_gnupar (argv): 17 | # set up directories for intermediate results 18 | file_in = argv[1] 19 | file_out = argv[2] 20 | input = os.path.basename(file_in) 21 | output_prefix, output_extension = os.path.splitext(input) 22 | stamp = str(time.time()) 23 | split_dir = os.path.join(os.getcwd(), "temp-" + stamp + os.sep) 24 | result_dir = os.path.join(os.getcwd(), "temp-processed-" + stamp + os.sep) 25 | os.mkdir(split_dir) 26 | os.mkdir(result_dir) 27 | # split command 28 | nr_of_threads_opt = elprep_io_wrapper.cmd_option('--nr-of-threads', argv) 29 | intermediate_files_opt = elprep_io_wrapper.cmd_option('--intermediate-files-output-type', argv) 30 | if intermediate_files_opt: 31 | intermediate_files_output_type = intermediate_files_opt[1] 32 | else: 33 | intermediate_files_output_type = "sam" 34 | elprep_io_wrapper.cmd_wrap_input(["elprep", "split"], file_in, split_dir, ["--output-prefix", output_prefix, "--output-type", intermediate_files_output_type] + nr_of_threads_opt) 35 | spread_file = os.path.join(split_dir, output_prefix + "-spread." + intermediate_files_output_type) 36 | splits_path = os.path.join(split_dir, "splits" + os.sep) 37 | # gnu parallel command 38 | nr_of_jobs_opt = elprep_io_wrapper.cmd_option('--nr-of-jobs', argv) 39 | read_group_string = elprep_io_wrapper.cmd_option('--replace-read-group', argv) 40 | given_cmd_opts = elprep_io_wrapper.remove_cmd_option(argv[3:], '--nr-of-jobs') 41 | given_cmd_opts = elprep_io_wrapper.remove_cmd_option(given_cmd_opts, '--intermediate-files-output-type') 42 | cmd_opts = given_cmd_opts 43 | if read_group_string: 44 | cmd_opts = elprep_io_wrapper.remove_cmd_option(cmd_opts, '--replace-read-group') 45 | cmd_opts = cmd_opts + ['--replace-read-group', '\"' + read_group_string[1] + '\"'] 46 | cmd_opts = cmd_opts + ['--split-file'] 47 | cmd_list = ["elprep"] 48 | elprep_cmd = '\'' + reduce(append_cmd, cmd_list + ['{}', result_dir + '{/.}.' + intermediate_files_output_type ] + cmd_opts) + '\'' 49 | gnu_cmd = 'parallel --gnu -j ' + str(nr_of_jobs_opt[1]) + ' ' + elprep_cmd + ' ::: ' + splits_path + '*.' + intermediate_files_output_type 50 | ret = subprocess.check_call(gnu_cmd, shell=True) 51 | if ret != 0: raise SystemExit, ret 52 | # command for spread file 53 | spread_out_file = os.path.join(result_dir, output_prefix + "-spread." + intermediate_files_output_type) 54 | elprep_io_wrapper.cmd_wrap_io(["elprep"], spread_file, spread_out_file , given_cmd_opts) 55 | # merge command 56 | elprep_io_wrapper.cmd_wrap_output(["elprep", "merge"], result_dir, file_out, nr_of_threads_opt) 57 | # remove directories for intermediate results 58 | for root, dirs, files in os.walk(splits_path): 59 | for file in files: 60 | ffile = os.path.join(root, file) 61 | os.remove(ffile) 62 | os.rmdir(splits_path) 63 | os.remove(spread_file) 64 | os.rmdir(split_dir) 65 | for root, dirs, files in os.walk(result_dir): 66 | for file in files: 67 | ffile = os.path.join(root, file) 68 | os.remove(ffile) 69 | os.rmdir(result_dir) 70 | -------------------------------------------------------------------------------- /lisp-utils.lisp: -------------------------------------------------------------------------------- 1 | (in-package :elprep) 2 | 3 | #+sbcl 4 | (eval-when (:compile-toplevel :load-toplevel :execute) 5 | (defun read-simple-base-string (stream c) 6 | (declare (ignore c)) 7 | (coerce (loop for char = (read-char stream t nil t) 8 | until (char= char #\") 9 | collect (if (char= char #\\) (read-char stream t nil t) char)) 10 | 'simple-base-string)) 11 | 12 | (defreadtable simple-base-string-syntax 13 | (:merge :standard) 14 | (:macro-char #\" #'read-simple-base-string nil)) 15 | 16 | (defmacro in-simple-base-string-syntax () 17 | "Make literal strings produce simple-base-string instead of (array character (*)) in SBCL." 18 | '(in-readtable simple-base-string-syntax))) 19 | 20 | #+lispworks 21 | (defmacro in-simple-base-string-syntax () 22 | "Make literal strings produce simple-base-string instead of (array character (*)) in SBCL." 23 | ()) 24 | 25 | (in-simple-base-string-syntax) 26 | 27 | (eval-when (:compile-toplevel :load-toplevel :execute) 28 | (defparameter *optimization* 29 | '(optimize (speed 3) (space 0) (debug 1) (safety 0) 30 | (compilation-speed 0)) 31 | "Standard optimization settings without fixnum optimizations.") 32 | 33 | (defparameter *fixnum-optimization* 34 | '(optimize (speed 3) (space 0) (debug 1) (safety 0) 35 | (compilation-speed 0) #+lispworks (hcl:fixnum-safety 0)) 36 | "Standard optimizations settings with fixnum optimizations.")) 37 | 38 | ;;; low-level types 39 | 40 | (deftype octet () "uint8" '(unsigned-byte 8)) 41 | (deftype uint16 () "uint16" '(unsigned-byte 16)) 42 | (deftype int32 () "int32" '(signed-byte 32)) 43 | 44 | ;;; portability 45 | 46 | (defmacro defglobal (var value &optional (doc nil docp)) 47 | "Define a global variable." 48 | #+lispworks 49 | (if docp 50 | `(hcl:defglobal-variable ,var ,value ,doc) 51 | `(hcl:defglobal-variable ,var ,value)) 52 | #+sbcl 53 | (if docp 54 | `(sb-ext:defglobal ,var ,value ,doc) 55 | `(sb-ext:defglobal ,var ,value))) 56 | 57 | ;;; utilities 58 | 59 | (defglobal *keyword* (find-package :keyword) 60 | "The :keyword package.") 61 | 62 | (declaim (inline intern-key)) 63 | 64 | (defun intern-key (string) 65 | "Intern a string in the :keyword package." 66 | (declare (simple-base-string string) #.*optimization*) 67 | (intern string *keyword*)) 68 | 69 | (defun intern-key/copy (string) 70 | "Find a symbol in the :keyword package or, if not found, intern a copy of the string in the :keyword package. 71 | Can be used for mutable or stack-allocated strings, etc." 72 | (declare (simple-base-string string) #.*optimization*) 73 | (let ((keyword *keyword*)) 74 | (or (find-symbol string keyword) 75 | (intern (copy-seq string) keyword)))) 76 | 77 | (defglobal *unique-value* (gensym) 78 | "A unique value for use in the functions presentp and unique.") 79 | 80 | (declaim (inline presentp)) 81 | 82 | (defun presentp (indicator plist) 83 | "Is the indicator present in the property list?" 84 | (declare (list plist) #.*optimization*) 85 | (let ((unique-value *unique-value*)) 86 | (not (eq (getf plist indicator unique-value) unique-value)))) 87 | 88 | (declaim (inline unique)) 89 | 90 | (defun unique (indicator plist) 91 | "Assert that the indicator is not present in the property list." 92 | (assert (not (presentp indicator plist))) 93 | indicator) 94 | 95 | (defmacro unwind-protectn (&body forms) 96 | "Like unwind-protect, except that all forms but the last are protected, and only the last form is used for cleanup" 97 | `(unwind-protect (progn ,@(butlast forms)) ,@(last forms))) 98 | 99 | (defun get-function (object) 100 | "Get a function object from a function (returned as is) or a function name." 101 | (etypecase object 102 | (function object) 103 | (symbol (symbol-function object)) 104 | (cons (fdefinition object)))) 105 | 106 | ;;; multi-threading 107 | 108 | (defvar *number-of-threads* 1 109 | "The number of threads used by run-pipeline and run-pipeline-in-situ to parallelize the filtering process. 110 | Default is 1, which results in sequential execution. Usually, parallelization only occurs when this value is greater than 3. 111 | Also used as the number of threads to use in samtools when piping from/to BAM/CRAM files.") 112 | 113 | (declaim (inline thread-run thread-join)) 114 | 115 | (defun thread-run (name function &rest arguments) 116 | "Wrapper around mp:process-run-function in LispWorks, and sb-thread:make-thread in SBCL." 117 | (declare (dynamic-extent arguments)) 118 | #+lispworks (apply #'mp:process-run-function name '() function arguments) 119 | #+sbcl (sb-thread:make-thread function :name name :arguments (copy-list arguments))) 120 | 121 | (defun thread-join (thread) 122 | "Similar to LispWorks's mp:process-join." 123 | #+lispworks (mp:process-join thread) 124 | #+sbcl (sb-thread:join-thread thread)) 125 | 126 | (defstruct (bounded-mailbox (:constructor make-bounded-mailbox 127 | (capacity &aux (semaphore 128 | #+lispworks (mp:make-semaphore :count capacity) 129 | #+sbcl (sb-thread:make-semaphore :count capacity))))) 130 | "A mailbox with an upper bound on the number of objects that can be simultaneously present. 131 | Adding new elements may block when that number is reached. 132 | The struct bounded-mailbox has a constructor make-bounded-mailbox that takes a capacity as a parameter. 133 | Read-only accessor bounded-mailbox-semaphore refers to a semaphore that controls the number of elements that are present in the mailbox. 134 | Read-only accessor bounded-mailbox-mailbox is the underlying actual mailbox." 135 | (semaphore nil :read-only t) 136 | (mailbox #+lispworks (mp:make-mailbox) #+sbcl (sb-concurrency:make-mailbox) :read-only t)) 137 | 138 | (setf (documentation 'make-bounded-mailbox 'function) 139 | "Constructor for struct bounded-mailbox that takes a capacity as a parameter." 140 | (documentation 'bounded-mailbox-p 'function) 141 | "Default predicate for struct bounded-mailbox." 142 | (documentation 'copy-bounded-mailbox 'function) 143 | "Default copier function for struct bounded-mailbox." 144 | (documentation 'bounded-mailbox-semaphore 'function) 145 | "Read the bounded-mailbox semaphore that controls the number of elements that are present in the mailbox." 146 | (documentation 'bounded-mailbox-mailbox 'function) 147 | "Read the bounded-mailbox underlying actual mailbox.") 148 | 149 | (defun make-mailbox (&optional (capacity nil)) 150 | "Create a mailbox. If the optional capacity parameter is provided and is not nil, then a bounded-mailbox is created, otherwise a plain mailbox." 151 | (if (null capacity) 152 | #+lispworks (mp:make-mailbox) 153 | #+sbcl (sb-concurrency:make-mailbox) 154 | (make-bounded-mailbox capacity))) 155 | 156 | (defgeneric mailbox-send (mailbox object) 157 | (:documentation "Send an object to a mailbox.") 158 | (:method ((mailbox mailbox) object) 159 | "Send an object to a mailbox." 160 | #+lispworks (mp:mailbox-send mailbox object) 161 | #+sbcl (sb-concurrency:send-message mailbox object)) 162 | (:method ((mailbox bounded-mailbox) object) 163 | "Send an object to a bounded mailbox. May block if the maximum capacity is reached." 164 | #+lispworks 165 | (progn 166 | (assert (mp:semaphore-acquire (bounded-mailbox-semaphore mailbox))) 167 | (mp:mailbox-send (bounded-mailbox-mailbox mailbox) object)) 168 | #+sbcl 169 | (progn 170 | (assert (sb-thread:wait-on-semaphore (bounded-mailbox-semaphore mailbox))) 171 | (sb-concurrency:send-message (bounded-mailbox-mailbox mailbox) object)))) 172 | 173 | (defgeneric mailbox-read (mailbox) 174 | (:documentation "Read an object from a mailbox. May block if there are no objects in the mailbox.") 175 | (:method ((mailbox mailbox)) 176 | "Read an object from a mailbox. May block if there are no objects in the mailbox." 177 | #+lispworks (mp:mailbox-read mailbox) 178 | #+sbcl (sb-concurrency:receive-message mailbox)) 179 | (:method ((mailbox bounded-mailbox)) 180 | "Read an object from a mailbox. May block if there are no objects in the mailbox." 181 | #+lispworks 182 | (multiple-value-prog1 183 | (mp:mailbox-read (bounded-mailbox-mailbox mailbox)) 184 | (mp:semaphore-release (bounded-mailbox-semaphore mailbox))) 185 | #+sbcl 186 | (multiple-value-prog1 187 | (sb-concurrency:receive-message (bounded-mailbox-mailbox mailbox)) 188 | (sb-thread:signal-semaphore (bounded-mailbox-semaphore mailbox))))) 189 | 190 | (declaim (inline make-single-thread-hash-table make-synchronized-hash-table)) 191 | 192 | (defun make-single-thread-hash-table (&rest args &key test size rehash-size rehash-treshold hash-function) 193 | "Like make-hash-table, but ensure it is single-thread, not synchronized." 194 | (declare (dynamic-extent args) (ignore test size rehash-size rehash-treshold hash-function)) 195 | #+lispworks (apply #'cl:make-hash-table :single-thread t args) 196 | #+sbcl (apply #'cl:make-hash-table :synchronized nil args)) 197 | 198 | (defun make-synchronized-hash-table (&rest args &key test size rehash-size rehash-treshold hash-function) 199 | "Like make-hash-table, but ensure it is synchronized, not single-thread." 200 | (declare (dynamic-extent args) (ignore test size rehash-size rehash-treshold hash-function)) 201 | #+lispworks (apply #'cl:make-hash-table :single-thread nil args) 202 | #+sbcl (apply #'cl:make-hash-table :synchronized t args)) 203 | 204 | #+sbcl 205 | (defmacro with-hash-table-locked (hash-table &body body) 206 | "Renamed sb-ext:with-locked-hash-table." 207 | `(sb-ext:with-locked-hash-table (,hash-table) ,@body)) 208 | 209 | #+sbcl 210 | (defun modify-hash (hash-table key function) 211 | "Similar to LispWorks's hcl:modify-hash." 212 | (if (sb-ext:hash-table-synchronized-p hash-table) 213 | (sb-ext:with-locked-hash-table (hash-table) 214 | (multiple-value-bind (value foundp) 215 | (gethash key hash-table) 216 | (values (setf (gethash key hash-table) 217 | (locally (declare #.*optimization*) 218 | (funcall (the function function) key value foundp))) 219 | key))) 220 | (multiple-value-bind (value foundp) 221 | (gethash key hash-table) 222 | (values (setf (gethash key hash-table) 223 | (locally (declare #.*optimization*) 224 | (funcall (the function function) key value foundp))) 225 | key)))) 226 | 227 | (defmacro with-modify-hash ((key value found) (hash-table form) &body body) 228 | "Macro version of LispWorks's modify-hash function." 229 | `(modify-hash ,hash-table ,form (lambda (,key ,value ,found) 230 | (declare (ignorable ,key ,value ,found)) 231 | (block nil ,@body)))) 232 | 233 | ;;; higher-order functions 234 | 235 | (defun compose-thunks (thunks) 236 | "Return a single thunk that executes the given thunks in sequence." 237 | (declare (list thunks)) 238 | (cond ((null thunks) (constantly nil)) 239 | ((null (cdr thunks)) (car thunks)) 240 | (t (lambda () 241 | (declare #.*optimization*) 242 | (loop for fun in thunks do (funcall (the function fun))))))) 243 | 244 | (declaim (inline mapfiltermap)) 245 | 246 | (defun mapfiltermap (inmap filter outmap list &optional tail) 247 | "Apply the following steps to each element in the list, optionally bounded by tail: 248 | - Apply inmap. 249 | - Filter out each element for which filter returns nil. 250 | - Apply outmap." 251 | (declare (function inmap filter outmap) #.*optimization*) 252 | (if (eq list tail) tail 253 | (locally (declare (cons list)) 254 | (loop for car = (funcall inmap (car list)) 255 | for cdr = (cdr list) 256 | for filtered = (funcall filter car) 257 | when filtered collect (funcall outmap car) 258 | until (eq cdr tail) 259 | do (setq list cdr))))) 260 | 261 | (declaim (inline nmapfiltermap)) 262 | 263 | (defun nmapfiltermap (inmap filter outmap list &optional tail) 264 | "Destructively apply the following steps to each element in the list, optionally bounded by tail: 265 | - Apply inmap. 266 | - Filter out each element for which filter returns nil. 267 | - Apply outmap." 268 | (declare (function inmap filter outmap) #.*optimization*) 269 | (if (eq list tail) tail 270 | (let ((head (cons nil list))) 271 | (declare (cons list head) (dynamic-extent head)) 272 | (loop with prev of-type cons = head 273 | for car = (funcall inmap (car list)) 274 | for cdr = (cdr list) 275 | for filtered = (funcall filter car) 276 | until (eq cdr tail) do 277 | (locally (declare (cons cdr)) 278 | (if filtered 279 | (setf (car list) (funcall outmap car) prev list list cdr) 280 | (setf (car list) (car cdr) (cdr list) (cdr cdr)))) 281 | finally (if filtered 282 | (setf (car list) (funcall outmap car)) 283 | (setf (cdr prev) tail))) 284 | (cdr head)))) 285 | 286 | (declaim (inline mapfilter)) 287 | 288 | (defun mapfilter (map filter list &optional tail) 289 | "Apply the following steps to each element in the list, optionally bounded by tail: 290 | - Apply map. 291 | - Filter out each element for which filter returns nil." 292 | (declare (function map filter) #.*optimization*) 293 | (if (eq list tail) tail 294 | (locally (declare (cons list)) 295 | (loop for car = (funcall map (car list)) 296 | for cdr = (cdr list) 297 | for filtered = (funcall filter car) 298 | when filtered collect car 299 | until (eq cdr tail) 300 | do (setq list cdr))))) 301 | 302 | (declaim (inline nmapfilter)) 303 | 304 | (defun nmapfilter (map filter list &optional tail) 305 | "Destructively apply the following steps to each element in the list, optionally bounded by tail: 306 | - Apply map. 307 | - Filter out each element for which filter returns nil." 308 | (declare (function map filter) #.*optimization*) 309 | (if (eq list tail) tail 310 | (let ((head (cons nil list))) 311 | (declare (cons list head) (dynamic-extent head)) 312 | (loop with prev of-type cons = head 313 | for car = (funcall map (car list)) 314 | for cdr = (cdr list) 315 | for filtered = (funcall filter car) 316 | until (eq cdr tail) do 317 | (locally (declare (cons cdr)) 318 | (if filtered 319 | (setf (car list) car prev list list cdr) 320 | (setf (car list) (car cdr) (cdr list) (cdr cdr)))) 321 | finally (if filtered 322 | (setf (car list) car) 323 | (setf (cdr prev) tail))) 324 | (cdr head)))) 325 | 326 | (declaim (inline filtermap)) 327 | 328 | (defun filtermap (filter map list &optional tail) 329 | "Apply the following steps to each element in the list, optionally bounded by tail: 330 | - Filter out each element for which filter returns nil. 331 | - Apply map." 332 | (declare (function filter map) #.*optimization*) 333 | (if (eq list tail) tail 334 | (locally (declare (cons list)) 335 | (loop for car = (car list) 336 | for cdr = (cdr list) 337 | for filtered = (funcall filter car) 338 | when filtered collect (funcall map car) 339 | until (eq cdr tail) 340 | do (setq list cdr))))) 341 | 342 | (declaim (inline nfiltermap)) 343 | 344 | (defun nfiltermap (filter map list &optional tail) 345 | "Destructively apply the following steps to each element in the list, optionally bounded by tail: 346 | - Filter out each element for which filter returns nil. 347 | - Apply map." 348 | (declare (function filter map) #.*optimization*) 349 | (if (eq list tail) tail 350 | (let ((head (cons nil list))) 351 | (declare (cons list head) (dynamic-extent head)) 352 | (loop with prev of-type cons = head 353 | for car = (car list) 354 | for cdr = (cdr list) 355 | for filtered = (funcall filter car) 356 | until (eq cdr tail) do 357 | (locally (declare (cons cdr)) 358 | (if filtered 359 | (setf (car list) (funcall map car) prev list list cdr) 360 | (setf (car list) (car cdr) (cdr list) (cdr cdr)))) 361 | finally (if filtered 362 | (setf (car list) (funcall map car)) 363 | (setf (cdr prev) tail))) 364 | (cdr head)))) 365 | 366 | (declaim (inline filter)) 367 | 368 | (defun filter (filter list &optional tail) 369 | "Filter out each element from the list, optionally bounded by tail, for which filter returns nil." 370 | (declare (function filter) #.*optimization*) 371 | (if (eq list tail) tail 372 | (locally (declare (cons list)) 373 | (loop for car = (car list) 374 | for cdr = (cdr list) 375 | when (funcall filter car) collect car 376 | until (eq cdr tail) 377 | do (setq list cdr))))) 378 | 379 | (declaim (inline nfilter)) 380 | 381 | (defun nfilter (filter list &optional tail) 382 | "Destructively filter out each element from the list, optionally bounded by tail, for which filter returns nil." 383 | (declare (function filter) #.*optimization*) 384 | (if (eq list tail) tail 385 | (let ((head (cons nil list))) 386 | (declare (cons list head) (dynamic-extent head)) 387 | (loop with prev of-type cons = head 388 | for car = (car list) 389 | for cdr = (cdr list) 390 | for filtered = (funcall filter car) 391 | until (eq cdr tail) do 392 | (locally (declare (cons cdr)) 393 | (if filtered 394 | (setf prev list list cdr) 395 | (setf (car list) (car cdr) (cdr list) (cdr cdr)))) 396 | finally (if filtered 397 | () 398 | (setf (cdr prev) tail))) 399 | (cdr head)))) 400 | 401 | (declaim (inline mapcar*)) 402 | 403 | (defun mapcar* (map list &optional tail) 404 | "Like mapcar, except operates on only one list, optionally bounded by tail." 405 | (declare (function map) #.*optimization*) 406 | (if (eq list tail) tail 407 | (locally (declare (cons list)) 408 | (loop for car = (car list) 409 | for cdr = (cdr list) 410 | collect (funcall map car) 411 | until (eq cdr tail) 412 | do (setq list cdr))))) 413 | 414 | (declaim (inline nmapcar*)) 415 | 416 | (defun nmapcar* (map list &optional tail) 417 | "Like mapcar, except destructively operates on only one list, optionally bounded by tail." 418 | (declare (function map) #.*optimization*) 419 | (if (eq list tail) tail 420 | (loop with cur of-type cons = list 421 | for car = (car cur) 422 | for cdr = (cdr cur) 423 | do (setf (car cur) (funcall map car)) 424 | until (eq cdr tail) 425 | do (setq cur cdr) 426 | finally (return list)))) 427 | 428 | (defun nthdiff (n list) 429 | "Return a copy of the first n elements of list, and the nth cdr of the list." 430 | (declare (fixnum n) (list list) #.*optimization*) 431 | (loop for tail on list repeat n 432 | collect (car tail) into head 433 | finally (return (values head tail)))) 434 | 435 | ;;; split hash tables 436 | 437 | (defstruct (split-hash-table (:constructor %make-split-hash-table (hash-function vector))) 438 | "A collection of hash tables distributing the entries based on a permutation of the same hash function used for the splits. 439 | The struct split-hash-table has a constructor %make-split-hash-table that takes a hash function and a vector of splits as parameters. 440 | Read-only accessor split-hash-table-hash-function refers to the hash function. 441 | Read-only accessor split-hash-table-vector of type simple-vector refers to the splits. 442 | Primary use of this struct is to allow for locking splits separately to avoid lock contention when operating on a hash table in parallel." 443 | (hash-function nil :type function :read-only t) 444 | (vector #() :type simple-vector :read-only t)) 445 | 446 | (setf (documentation '%make-split-hash-table 'function) 447 | "Constructor for struct split-hash-table that takes a hash function and a vector of splits as parameters." 448 | (documentation 'split-hash-table-p 'function) 449 | "Default predicate for struct split-hash-table." 450 | (documentation 'copy-split-hash-table 'function) 451 | "Default copier function for struct split-hash-table." 452 | (documentation 'split-hash-table-hash-function 'function) 453 | "Read the split-hash-table hash function." 454 | (documentation 'split-hash-table-vector 'function) 455 | "Read the split-hash-table vector of splits of type simple-vector.") 456 | 457 | (defun make-split-hash-table (splits &rest args &key 458 | test size rehash-size rehash-threshold 459 | (hash-function (error "No hash function passed to make-split-hash-table."))) 460 | "Constructor for split-hash-table that takes the number of splits and initialization arguments as for make-hash-table as parameters. 461 | The :hash-function initialization argument must be explicitly provided." 462 | (declare (dynamic-extent args) (ignore size rehash-size rehash-threshold)) 463 | (setq test (get-function test) 464 | hash-function (get-function hash-function)) 465 | (loop with vector = (make-array splits #+lispworks :single-thread #+lispworks t) 466 | for i below splits do 467 | (setf (svref vector i) (apply #'make-synchronized-hash-table :test test :hash-function hash-function args)) 468 | finally (return (%make-split-hash-table hash-function vector)))) 469 | 470 | (defconstant +total-bits+ #.(integer-length most-positive-fixnum) 471 | "Number of bits that make up a fixnum in the current Common Lisp implementation.") 472 | (defconstant +low-bits+ 15 473 | "An arbitrary number of bits in the low portion of a fixnum, used for split hash tables.") 474 | (defconstant +high-bits+ (- +total-bits+ +low-bits+) 475 | "An arbitrary number of bits in the high portion of a fixnum, used for split hash tables.") 476 | 477 | (defconstant +lowest-bits+ (1- (ash 1 +low-bits+)) 478 | "Bit mask for the lowest bits of a fixnum, used for split hash tables.") 479 | (defconstant +highest-bits+ (ash (1- (ash 1 +high-bits+)) +low-bits+) 480 | "Bit mask for the highest bits of a fixnum, used for split hash tables.") 481 | 482 | (declaim (inline rotate-15)) 483 | 484 | (defun rotate-15 (n) 485 | "Rotate a fixnum by 15 bits, used for split hash tables." 486 | (declare (fixnum n) #.*optimization*) 487 | (logior (the fixnum (ash (logand +lowest-bits+ n) +high-bits+)) 488 | (the fixnum (ash (logand +highest-bits+ n) (- +low-bits+))))) 489 | 490 | (declaim (inline hash-table-split)) 491 | 492 | (defun hash-table-split (key table) 493 | "Return a split of a split-hash-table for a given key." 494 | (declare (split-hash-table table) #.*optimization*) 495 | (let ((hash-function (split-hash-table-hash-function table)) 496 | (vector (split-hash-table-vector table))) 497 | (declare (function hash-function) (vector vector)) 498 | (svref vector (rem (rotate-15 (funcall hash-function key)) (length vector))))) 499 | 500 | (declaim (inline unwrap-displaced-array)) 501 | 502 | (defun unwrap-displaced-array (array) 503 | "Unwrap a displaced array and return the underlying actual array and an offset into that array which the displaced array is referring to." 504 | (declare (array array) #.*fixnum-optimization*) 505 | (let ((displaced array) (index 0)) 506 | (declare (array displaced) (fixnum index)) 507 | (loop (multiple-value-bind 508 | (displaced* index*) 509 | (array-displacement displaced) 510 | (if displaced* 511 | (setq displaced displaced* 512 | index (the fixnum (+ index index*))) 513 | (return-from unwrap-displaced-array (values displaced index))))))) 514 | -------------------------------------------------------------------------------- /make-lispworks-binary.sh: -------------------------------------------------------------------------------- 1 | lwcon -build save-elprep-script.lisp 2 | -------------------------------------------------------------------------------- /make-sbcl-binary.sh: -------------------------------------------------------------------------------- 1 | sbcl --dynamic-space-size 262144 --load save-elprep-script.lisp --no-userinit --no-sysinit --end-toplevel-options 2 | -------------------------------------------------------------------------------- /mark-duplicates.lisp: -------------------------------------------------------------------------------- 1 | (in-package :elprep) 2 | (in-simple-base-string-syntax) 3 | 4 | (declaim (inline sam-alignment-rg (setf sam-alignment-rg))) 5 | 6 | (defun sam-alignment-rg (aln) 7 | "Access the read group optional field of a sam-alignment." 8 | (sam-alignment-tag aln :rg)) 9 | 10 | (defun (setf sam-alignment-rg) (new-value aln) 11 | "Access the read group optional field of a sam-alignment." 12 | (setf (sam-alignment-tag aln :rg) new-value)) 13 | 14 | (defun make-phred-score-table () 15 | "Map Phred qualities to a reasonable range and an error flag indicating if it is outside a valid range." 16 | (let ((score-table (make-array 512 :element-type 'octet 17 | #+lispworks :allocation #+lispworks :long-lived 18 | #+lispworks :single-thread #+lispworks t))) 19 | (loop for char below 256 20 | for pos = (ash char 1) do 21 | (if (or (< char 33) (> char 126)) 22 | (setf (aref score-table pos) 0 23 | (aref score-table (1+ pos)) 1) 24 | (let ((qual (- char 33))) 25 | (setf (aref score-table pos) (if (>= qual 15) qual 0) 26 | (aref score-table (1+ pos)) 0)))) 27 | score-table)) 28 | 29 | (declaim (notinline compute-phred-score)) 30 | 31 | (defun compute-phred-score (aln) 32 | "Sum the adapted Phred qualities of a sam-alignment." 33 | (declare (sam-alignment aln) #.*optimization*) 34 | (let ((string (sam-alignment-qual aln)) 35 | (score-table (load-time-value (make-phred-score-table) t)) 36 | (score 0) (error 0)) 37 | (declare (base-string string) ((simple-array octet (512)) score-table) (fixnum score error)) 38 | (multiple-value-bind (string* offset) (unwrap-displaced-array string) 39 | (declare (simple-base-string string*) (fixnum offset)) 40 | (let ((end (the fixnum (+ (length string) offset)))) 41 | (declare (fixnum end)) 42 | (loop for i of-type fixnum from offset below end 43 | for pos of-type fixnum = (ash (char-code (schar string* i)) 1) do 44 | (setq score (+ score (aref score-table pos)) 45 | error (logior error (aref score-table (the fixnum (1+ pos)))))) 46 | (assert (= error 0)))) 47 | score)) 48 | 49 | (define-symbol-macro upcase-cigar-operations "MIDNSHPX=") 50 | 51 | (defconstant +min-upcase-cigar-operation+ (reduce #'min upcase-cigar-operations :key #'char-code) 52 | "The smallest CIGAR operation, upper case only. 53 | See http://samtools.github.io/hts-specs/SAMv1.pdf - Section 1.4.6.") 54 | 55 | (defconstant +max-upcase-cigar-operation+ (reduce #'max upcase-cigar-operations :key #'char-code) 56 | "The largest CIGAR operation, upper case only. 57 | See http://samtools.github.io/hts-specs/SAMv1.pdf - Section 1.4.6.") 58 | 59 | (declaim (inline cigar-aux-pos)) 60 | 61 | (defun cigar-aux-pos (char) 62 | "Position in a vector of CIGAR operations, starting with the smallest CIGAR operation, upper case only." 63 | (declare (base-char char) #.*optimization*) 64 | (- (char-code char) +min-upcase-cigar-operation+)) 65 | 66 | (defun make-unclipped-aux-tables () 67 | "Map CIGAR operations to flags indicating whether they are clipped and/or reference operations, upper case only." 68 | (let* ((tablesize (1+ (- +max-upcase-cigar-operation+ 69 | +min-upcase-cigar-operation+))) 70 | (clipped (make-array tablesize 71 | :initial-element 0 72 | :element-type 'octet 73 | #+lispworks :allocation #+lispworks :long-lived 74 | #+lispworks :single-thread #+lispworks t)) 75 | (reference (make-array tablesize 76 | :initial-element 0 77 | :element-type 'octet 78 | #+lispworks :allocation #+lispworks :long-lived 79 | #+lispworks :single-thread #+lispworks t))) 80 | (flet ((set-clipped-flag (char) 81 | (setf (aref clipped (cigar-aux-pos char)) 1)) 82 | (set-reference-flag (char) 83 | (setf (aref reference (cigar-aux-pos char)) 1))) 84 | (set-clipped-flag #\S) 85 | (set-clipped-flag #\H) 86 | (set-reference-flag #\M) 87 | (set-reference-flag #\D) 88 | (set-reference-flag #\N) 89 | (set-reference-flag #\=) 90 | (set-reference-flag #\X)) 91 | (cons clipped reference))) 92 | 93 | (declaim (notinline compute-unclipped-position)) 94 | 95 | (defun compute-unclipped-position (aln) 96 | "Compute unclipped position of a sam-alignment, based on its FLAG, POS, and CIGAR string." 97 | (declare (sam-alignment aln) #.*fixnum-optimization*) 98 | (let ((cigar (scan-cigar-string 'vector (sam-alignment-cigar aln)))) 99 | (declare (simple-vector cigar)) 100 | (if (= 0 (length cigar)) 101 | (sam-alignment-pos aln) 102 | (let* ((tables (load-time-value (make-unclipped-aux-tables) t)) 103 | (clipped-table (car tables)) 104 | (reference-table (cdr tables))) 105 | (declare (cons tables) ((simple-array octet) clipped-table reference-table)) 106 | (the int32 107 | (if (sam-alignment-reversed-p aln) 108 | (1- (+ (sam-alignment-pos aln) 109 | (loop for i of-type fixnum = (the fixnum (1- (length cigar))) then (the fixnum (1- i)) 110 | for (key . value) of-type (base-char . fixnum) = (svref cigar i) 111 | for p of-type fixnum = (cigar-aux-pos key) 112 | for c of-type fixnum = (aref clipped-table p) 113 | for r of-type fixnum = (aref reference-table p) 114 | for clipped of-type fixnum = c then (the fixnum (* c clipped)) 115 | sum (the fixnum (* (logior r clipped) value)) fixnum 116 | until (= i 0)))) 117 | (- (sam-alignment-pos aln) 118 | (loop for i of-type fixnum below (length cigar) 119 | for (key . value) of-type (base-char . fixnum) = (svref cigar i) 120 | for p of-type fixnum = (cigar-aux-pos key) 121 | until (= 0 (aref clipped-table p)) 122 | sum value fixnum)))))))) 123 | 124 | (declaim (inline sam-alignment-adapted-pos (setf sam-alignment-adapted-pos) 125 | sam-alignment-adapted-score (setf sam-alignment-adapted-score))) 126 | 127 | (defun sam-alignment-adapted-pos (aln) 128 | "Access the unclipped position temporary field in the sam-alignment." 129 | (sam-alignment-temp aln :pos)) 130 | 131 | (defun (setf sam-alignment-adapted-pos) (val aln) 132 | "Access the unclipped position temporary field in the sam-alignment." 133 | (setf (sam-alignment-temp aln :pos) val)) 134 | 135 | (defun sam-alignment-adapted-score (aln) 136 | "Access adapted Phred score temporary field in the sam-alignment." 137 | (sam-alignment-temp aln :score)) 138 | 139 | (defun (setf sam-alignment-adapted-score) (val aln) 140 | "Access adapted Phred score temporary field in the sam-alignment." 141 | (setf (sam-alignment-temp aln :score) val)) 142 | 143 | (declaim (inline adapt-alignment)) 144 | 145 | (defun adapt-alignment (rg-table alignment) 146 | "Adapt the sam-alignment: Make read group unique; fill in unclipped position; fill in Phred score." 147 | ; make rg unique 148 | (let ((rg (sam-alignment-rg alignment))) 149 | (when rg 150 | (setf (sam-alignment-rg alignment) 151 | (or (gethash rg rg-table) 152 | (with-modify-hash (key value found) (rg-table rg) 153 | (if found value rg)))))) 154 | ; adapt position 155 | (setf (sam-alignment-adapted-pos alignment) 156 | (compute-unclipped-position alignment)) 157 | ; adapt score 158 | (setf (sam-alignment-adapted-score alignment) 159 | (compute-phred-score alignment)) 160 | alignment) 161 | 162 | (declaim (inline mark-as-duplicate)) 163 | 164 | (defun mark-as-duplicate (aln) 165 | "Set the PCR/optical duplicate FLAG in the sam-alignment." 166 | (declare (sam-alignment aln) #.*optimization*) 167 | (setf (sam-alignment-flag aln) (logior (sam-alignment-flag aln) +duplicate+)) 168 | t) 169 | 170 | (declaim (inline make-handle)) 171 | 172 | (defstruct (handle (:constructor make-handle (object hash))) 173 | "An indirection for compare-and-swap operations. 174 | The struct handle has a constructor make-handle that takes an object and a hash value as parameters. 175 | Read-only accessor handle-hash of type fixnum refers to the hash value. 176 | Accessor handle-object referes to the object. 177 | Primary use of this struct is to enable compare-and-swap operations in classify-fragment and classify-pair." 178 | (hash 0 :type fixnum :read-only t) 179 | (object nil)) 180 | 181 | (setf (documentation 'make-handle 'function) 182 | "Constructor for struct handle that takes an object and a hash value as parameters." 183 | (documentation 'handle-p 'function) 184 | "Default predicate for struct handle." 185 | (documentation 'copy-handle 'function) 186 | "Default copier function for struct handle." 187 | (documentation 'handle-hash 'function) 188 | "Read the handle hash value of type fixnum." 189 | (documentation 'handle-object 'function) 190 | "Access the handle object.") 191 | 192 | (defun handle-fragment= (f1 f2) 193 | "Do the two handles refer to sam-alignment instances at the same unclipped position and with the same direction?" 194 | (declare (handle f1 f2) #.*optimization*) 195 | (let ((f1 (handle-object f1)) 196 | (f2 (handle-object f2))) 197 | (declare (sam-alignment f1 f2)) 198 | (and (eq (sam-alignment-rg f1) (sam-alignment-rg f2)) 199 | (= (the int32 (sam-alignment-refid f1)) (the int32 (sam-alignment-refid f2))) 200 | (= (the int32 (sam-alignment-adapted-pos f1)) (the int32 (sam-alignment-adapted-pos f2))) 201 | (eq (sam-alignment-reversed-p f1) (sam-alignment-reversed-p f2))))) 202 | 203 | (defun fragment-hash (f) 204 | "Hash function that corresponds to handle-fragment=, but operates an a sam-alignment directly." 205 | (declare (sam-alignment f) #.*optimization*) 206 | (logxor (sxhash (the base-string (sam-alignment-rg f))) 207 | (sxhash (the int32 (sam-alignment-refid f))) 208 | (sxhash (the int32 (sam-alignment-adapted-pos f))) 209 | (sxhash (the boolean (sam-alignment-reversed-p f))))) 210 | 211 | (declaim (inline true-fragment-p true-pair-p)) 212 | 213 | (defun true-fragment-p (aln) 214 | "Is this sam-alignment definitely not part of a pair?" 215 | (declare (sam-alignment aln) #.*optimization*) 216 | (/= (logand (sam-alignment-flag aln) #.(logior +multiple+ +next-unmapped+)) +multiple+)) 217 | 218 | (defun true-pair-p (aln) 219 | "Is this sam-alignment definitely part of a pair?" 220 | (declare (sam-alignment aln) #.*optimization*) 221 | (= (logand (sam-alignment-flag aln) #.(logior +multiple+ +next-unmapped+)) +multiple+)) 222 | 223 | (defun classify-fragment (aln fragments deterministic) ; fragments is the hash table to store intermediate "best" alns, passed as a parameter 224 | "For each list of sam-alignment instances with the same unclipped position and direction, all except the one with the highest score are marked as duplicates. 225 | If there are fragments in such a list that are actually part of pairs, all the true fragments are marked as duplicates and the pairs are left untouched." 226 | (let* ((hash (fragment-hash aln)) 227 | (key (make-handle aln hash)) 228 | (split (hash-table-split key fragments)) 229 | (best (or (gethash key split) 230 | (let ((entry (make-handle aln hash))) 231 | (with-hash-table-locked split 232 | (let ((value (gethash key split))) 233 | (cond (value value) 234 | (t (setf (gethash entry split) entry) 235 | (return-from classify-fragment))))))))) 236 | (declare (dynamic-extent key)) 237 | (cond ((true-fragment-p aln) 238 | (loop with aln-score = (sam-alignment-adapted-score aln) 239 | for best-aln = (handle-object best) 240 | until (if (true-pair-p best-aln) 241 | (mark-as-duplicate aln) 242 | (let ((best-aln-score (sam-alignment-adapted-score best-aln))) 243 | (cond ((> best-aln-score aln-score) 244 | (mark-as-duplicate aln)) 245 | ((= best-aln-score aln-score) 246 | (if deterministic 247 | (if (string> (sam-alignment-qname aln) (sam-alignment-qname best-aln)) 248 | (mark-as-duplicate aln) 249 | (when (compare-and-swap (handle-object best) best-aln aln) 250 | (mark-as-duplicate best-aln))) 251 | (mark-as-duplicate aln))) 252 | ((compare-and-swap (handle-object best) best-aln aln) 253 | (mark-as-duplicate best-aln))))))) 254 | (t ; the aln is a true pair object, there may be a true fragment stored in the hash table which we then need to mark and swap out 255 | (loop for best-aln = (handle-object best) 256 | until (cond ((true-pair-p best-aln) t) ; stop, the best in the hash tab is a pair, this is marked via mark-duplicates 257 | ((compare-and-swap (handle-object best) best-aln aln) 258 | (mark-as-duplicate best-aln)))))))) 259 | 260 | (defun sam-alignment-pair= (aln1 aln2) 261 | "Are the two sam-alignment fragments part of the same pair?" 262 | (declare (sam-alignment aln1 aln2) #.*optimization*) 263 | (and (eq (sam-alignment-rg aln1) (sam-alignment-rg aln2)) 264 | (string= (sam-alignment-qname aln1) (sam-alignment-qname aln2)))) 265 | 266 | (defun sam-alignment-pair-hash (aln) 267 | "Hash function that corresponds to sam-alignment-pair=." 268 | (declare (sam-alignment aln) #.*optimization*) 269 | (or (sam-alignment-temp aln :pair-hash) 270 | (let ((hash (logxor (sxhash (the base-string (sam-alignment-rg aln))) 271 | (sxhash (the base-string (sam-alignment-qname aln)))))) 272 | (setf (sam-alignment-temps aln) 273 | (list* :pair-hash hash (sam-alignment-temps aln))) 274 | hash))) 275 | 276 | (declaim (inline make-pair)) 277 | 278 | (defstruct (pair (:constructor make-pair (score aln1 aln2))) 279 | "A pair consisting of two fragment sam-alignment instances. 280 | The struct pair has a constructor make-pair that takes a score and two sam-alignment instances as parameters. 281 | Read-only accessor pair-score of type fixnum refers to the score. 282 | Read-only accessor pair-aln1 of type sam-alignment refers to the first fragment. 283 | Read-only accessor pair-aln2 of type sam-alignment refers to the second fragment." 284 | (score 0 :type fixnum :read-only t) 285 | (aln1 nil :type (or null sam-alignment) :read-only t) 286 | (aln2 nil :type (or null sam-alignment) :read-only t)) 287 | 288 | (setf (documentation 'make-pair 'function) 289 | "Constructor for struct pair that takes a score and two sam-alignment instances as parameters." 290 | (documentation 'pair-p 'function) 291 | "Default predicate for struct pair." 292 | (documentation 'copy-pair 'function) 293 | "Default copier function for struct pair." 294 | (documentation 'pair-score 'function) 295 | "Read the pair score of type fixnum." 296 | (documentation 'pair-aln1 'function) 297 | "Read the first fragment of type sam-alignment from a pair." 298 | (documentation 'pair-aln2 'function) 299 | "Read the second fragment of type sam-alignment from a pair.") 300 | 301 | (declaim (inline pair-rg pair-refid1 pair-pos1 pair-reversed1-p pair-refid2 pair-pos2 pair-reversed2-p)) 302 | 303 | (defun pair-rg (pair) 304 | "Access the read group of a pair." 305 | (declare (pair pair) #.*optimization*) 306 | (sam-alignment-rg (the sam-alignment (pair-aln1 pair)))) 307 | 308 | (defun pair-refid1 (pair) 309 | "Access the first REFID of a pair." 310 | (declare (pair pair) #.*optimization*) 311 | (sam-alignment-refid (the sam-alignment (pair-aln1 pair)))) 312 | 313 | (defun pair-pos1 (pair) 314 | "Access the first unclipped position of a pair." 315 | (declare (pair pair) #.*optimization*) 316 | (sam-alignment-adapted-pos (the sam-alignment (pair-aln1 pair)))) 317 | 318 | (defun pair-reversed1-p (pair) 319 | "Access the first direction of a pair." 320 | (declare (pair pair) #.*optimization*) 321 | (sam-alignment-reversed-p (the sam-alignment (pair-aln1 pair)))) 322 | 323 | (defun pair-refid2 (pair) 324 | "Access the second REFID of a pair." 325 | (declare (pair pair) #.*optimization*) 326 | (sam-alignment-refid (the sam-alignment (pair-aln2 pair)))) 327 | 328 | (defun pair-pos2 (pair) 329 | "Access the second unclipped position of a pair." 330 | (declare (pair pair) #.*optimization*) 331 | (sam-alignment-adapted-pos (the sam-alignment (pair-aln2 pair)))) 332 | 333 | (defun pair-reversed2-p (pair) 334 | "Access the second direction of a pair." 335 | (declare (pair pair) #.*optimization*) 336 | (sam-alignment-reversed-p (the sam-alignment (pair-aln2 pair)))) 337 | 338 | (defun handle-pair= (p1 p2) 339 | "Do the two handles refer to pair instances at the same unclipped positions and with the same directions?" 340 | (declare (handle p1 p2) #.*optimization*) 341 | (let ((p1 (handle-object p1)) 342 | (p2 (handle-object p2))) 343 | (declare (pair p1 p2)) 344 | (and (eq (pair-rg p1) (pair-rg p2)) 345 | (= (the int32 (pair-refid1 p1)) (the int32 (pair-refid1 p2))) 346 | (= (the int32 (pair-pos1 p1)) (the int32 (pair-pos1 p2))) 347 | (eq (pair-reversed1-p p1) (pair-reversed1-p p2)) 348 | (= (the int32 (pair-refid2 p1)) (the int32 (pair-refid2 p2))) 349 | (= (the int32 (pair-pos2 p1)) (the int32 (pair-pos2 p2))) 350 | (eq (pair-reversed2-p p1) (pair-reversed2-p p2))))) 351 | 352 | (defun pair-hash (p) 353 | "Hash function that corresponds to handle-pair=, but operates on a pair directly." 354 | (declare (pair p) #.*optimization* #-(or (not lispworks) lispworks6) (optimize (float 0))) 355 | (logxor (sxhash (the base-string (pair-rg p))) 356 | (sxhash (the int32 (pair-refid1 p))) 357 | (sxhash (the int32 (pair-refid2 p))) 358 | #+(or (not lispworks) lispworks6) (sxhash (+ (ash (the int32 (pair-pos1 p)) 32) (the int32 (pair-pos2 p)))) 359 | #-(or (not lispworks) lispworks6) (sxhash (sys:int64-to-integer (sys:int64+ (sys:int64<< (the int32 (pair-pos1 p)) 32) (the int32 (pair-pos2 p))))) 360 | (sxhash (the boolean (pair-reversed1-p p))) 361 | (sxhash (the boolean (pair-reversed2-p p))))) 362 | 363 | (defun classify-pair (aln fragments pairs deterministic) 364 | "For each list of pairs with the same unclipped positions and directions, all except the one with the highest score are marked as duplicates." 365 | (when (true-pair-p aln) 366 | (let ((aln1 aln) 367 | (aln2 (let ((split (hash-table-split aln fragments))) 368 | (with-hash-table-locked split 369 | (let ((value (gethash aln split))) 370 | (cond (value (remhash aln split) value) 371 | (t (setf (gethash aln split) aln) 372 | (return-from classify-pair)))))))) 373 | (when (> (the int32 (sam-alignment-adapted-pos aln1)) 374 | (the int32 (sam-alignment-adapted-pos aln2))) 375 | (rotatef aln1 aln2)) 376 | (let* ((score (+ (sam-alignment-adapted-score aln1) 377 | (sam-alignment-adapted-score aln2))) 378 | (keypair (make-pair score aln1 aln2)) 379 | (hash (pair-hash keypair)) 380 | (pairkey (make-handle keypair hash)) 381 | (entry nil) 382 | (best (let ((split (hash-table-split pairkey pairs))) 383 | (or (gethash pairkey split) 384 | (progn 385 | (setq entry (make-pair score aln1 aln2)) 386 | (let ((handle (make-handle entry hash))) 387 | (with-hash-table-locked split 388 | (let ((value (gethash pairkey split))) 389 | (cond (value value) 390 | (t (setf (gethash handle split) handle) 391 | (return-from classify-pair))))))))))) 392 | (declare (dynamic-extent keypair pairkey)) 393 | (loop for best-pair = (handle-object best) 394 | for best-pair-score = (pair-score best-pair) 395 | until (cond ((> best-pair-score score) 396 | (mark-as-duplicate aln1) 397 | (mark-as-duplicate aln2)) 398 | ((= best-pair-score score) 399 | (cond (deterministic ; code for correctness checks 400 | (cond ((string> (sam-alignment-qname aln1) (sam-alignment-qname (pair-aln1 best-pair))) 401 | (mark-as-duplicate aln1) 402 | (mark-as-duplicate aln2)) 403 | ((compare-and-swap (handle-object best) best-pair 404 | (or entry (setq entry (make-pair score aln1 aln2)))) 405 | (mark-as-duplicate (pair-aln1 best-pair)) 406 | (mark-as-duplicate (pair-aln2 best-pair))))) 407 | (t (mark-as-duplicate aln1) 408 | (mark-as-duplicate aln2)))) 409 | ((compare-and-swap (handle-object best) best-pair 410 | (or entry (setq entry (make-pair score aln1 aln2)))) 411 | (mark-as-duplicate (pair-aln1 best-pair)) 412 | (mark-as-duplicate (pair-aln2 best-pair))))))))) 413 | 414 | (defun mark-duplicates (deterministic) 415 | "A filter for marking duplicate sam-alignment instances. Depends on the add-refid filter being called before to fill in the refid." 416 | (lambda (header) 417 | (declare (ignore header)) 418 | (let ((splits (* 16 *number-of-threads*))) 419 | ; set up tables once header is parsed, tables will serve for all alignments 420 | (let ((fragments (make-split-hash-table splits :test #'handle-fragment= :hash-function #'handle-hash)) 421 | (pairs-fragments (make-split-hash-table splits :test #'sam-alignment-pair= :hash-function #'sam-alignment-pair-hash)) 422 | (pairs (make-split-hash-table splits :test #'handle-pair= :hash-function #'handle-hash)) 423 | (rg-table (make-synchronized-hash-table :test #'string= :hash-function #'sxhash))) 424 | (lambda () 425 | (lambda (alignment) 426 | (when (sam-alignment-flag-notany alignment #.(+ +unmapped+ +secondary+ +duplicate+ +supplementary+)) 427 | ; mark duplicate checks 428 | (adapt-alignment rg-table alignment) 429 | (classify-fragment alignment fragments deterministic) 430 | (classify-pair alignment pairs-fragments pairs deterministic)) 431 | t)))))) 432 | -------------------------------------------------------------------------------- /sam-types.lisp: -------------------------------------------------------------------------------- 1 | (in-package :elprep) 2 | (in-simple-base-string-syntax) 3 | 4 | (defvar *sam-file-format-version* "1.5" 5 | "The SAM file format version string supported by this library. 6 | This is entered by default in a @HD line in the header section of a SAM file, unless user code explicitly asks for a different version number. 7 | See http://samtools.github.io/hts-specs/SAMv1.pdf - Section 1.3. 8 | Default is \"1.5\".") 9 | 10 | (defvar *sam-file-format-date* "3 Mar 2015" 11 | "The date of the SAM file format version supported by this library. 12 | See http://samtools.github.io/hts-specs/SAMv1.pdf - Section 1.3. 13 | Current date is 3 Mar 2015.") 14 | 15 | (defstruct sam-header 16 | "The information stored in the header section of a SAM file. 17 | See http://samtools.github.io/hts-specs/SAMv1.pdf - Section 1.3. 18 | The struct sam-header has a default constructor make-sam-header. 19 | Accessor sam-header-hd of type property list refers to the @HD line. 20 | Accessor sam-header-sq of type list of property lists refers to the @SQ lines. 21 | Accessor sam-header-rg of type list of property lists refers to the @RG lines. 22 | Accessor sam-header-pg of type list of property lists refers to the @PG lines. 23 | Accessor sam-header-co of type list of strings refers to the @CO lines. 24 | Accessor sam-header-user-tags of type property list refers to lines defined by end users. 25 | All information in property lists are stored in the same order as they occur in a SAM file." 26 | (hd '() :type list) 27 | (sq '() :type list) 28 | (rg '() :type list) 29 | (pg '() :type list) 30 | (co '() :type list) 31 | (user-tags '() :type list)) 32 | 33 | (setf (documentation 'make-sam-header 'function) 34 | "Default constructor for struct sam-header." 35 | (documentation 'sam-header-p 'function) 36 | "Default predicate for struct sam-header." 37 | (documentation 'copy-sam-header 'function) 38 | "Default copier function for struct sam-header." 39 | (documentation 'sam-header-hd 'function) 40 | "Access the sam-header @HD line of type property list." 41 | (documentation 'sam-header-sq 'function) 42 | "Access the sam-header @SQ lines of type list of property lists." 43 | (documentation 'sam-header-rg 'function) 44 | "Access the sam-header @RG lines of type list of property lists." 45 | (documentation 'sam-header-pg 'function) 46 | "Access the sam-header @PG lines of type list of property lists." 47 | (documentation 'sam-header-co 'function) 48 | "Access the sam-header @CO lines of type list of strings." 49 | (documentation 'sam-header-user-tags 'function) 50 | "Access the sam-header user-defined header lines of type property list.") 51 | 52 | (defun sam-header-ensure-hd (hdr &key (vn *sam-file-format-version*) so go) 53 | "Ensure an @HD line is present in the given sam-header. 54 | See http://samtools.github.io/hts-specs/SAMv1.pdf - Section 1.3, Tag @HD." 55 | (declare (sam-header hdr) #.*optimization*) 56 | (setf (getf (sam-header-hd hdr) :VN) vn) 57 | (assert (not (and so go))) 58 | (cond (so (setf (getf (sam-header-hd hdr) :SO) 59 | (string-downcase (string so))) 60 | (loop while (remf (sam-header-hd hdr) :GO))) 61 | (go (setf (getf (sam-header-hd hdr) :GO) 62 | (string-downcase (string go))) 63 | (loop while (remf (sam-header-hd hdr) :SO)))) 64 | hdr) 65 | 66 | (declaim (inline sam-header-user-tag (setf sam-header-user-tag))) 67 | 68 | (defun sam-header-user-tag (hdr tag) 69 | "Access a sam-header user tag." 70 | (declare (sam-header hdr) (symbol tag) #.*optimization*) 71 | (getf (sam-header-user-tags hdr) tag)) 72 | 73 | (defun (setf sam-header-user-tag) (value hdr tag) 74 | "Access a sam-header user tag." 75 | (declare (sam-header hdr) (symbol tag) #.*optimization*) 76 | (setf (getf (sam-header-user-tags hdr) tag) value)) 77 | 78 | (declaim (inline sam-header-user-tag-p)) 79 | 80 | (defun sam-header-user-tag-p (code) 81 | "Does this tag string represent a user-defined tag?" 82 | (declare (simple-string code) #.*optimization*) 83 | (loop for c of-type character across code 84 | thereis (and (char<= #\a c) (char<= c #\z)))) 85 | 86 | (defstruct sam-alignment 87 | "A single read alignment with mandatory and optional fields that can be contained in a SAM file alignment line. 88 | See http://samtools.github.io/hts-specs/SAMv1.pdf - Sections 1.4 and 1.5. 89 | The struct sam-alignment has a default constructor make-sam-alignment. 90 | Accessor sam-alignment-qname of type base-string refers to the Query template NAME. 91 | Accessor sam-alignment-flag of type fixnum refers to the bitwise FLAG. 92 | Accessor sam-alignment-rname of type base-string refers to the Reference sequence NAME. 93 | Accessor sam-alignment-pos of type int32 refers to the 1-based leftmost mapping POSition. 94 | Accessor sam-alignment-mapq of type fixnum refers to the MAPping quality. 95 | Accessor sam-alignment-cigar of type base-string refers to the CIGAR string. 96 | Accessor sam-alignment-rnext of type base-string refers to the Reference sequence name of the mate/NEXT read. 97 | Accessor sam-alignment-pnext of type int32 refers to the 1-based leftmost mapping Position of the mate/NEXT read. 98 | Accessor sam-alignment-tlen of type int32 refers to the observed Template LENgth. 99 | Accessor sam-alignment-seq of type base-string refers to the segment SEQuence. 100 | Accessor sam-alignment-qual of type base-string refers to the ASCII of Phred-scaled base QUALity+33. 101 | Accessor sam-alignment-tags of type property list refers to the optional fields in a read alignment. 102 | Accessor sam-alignment-temps of type property list refers to additional optional fields not stored in any storage format, but reserved for temporary values in filters." 103 | (qname "" :type base-string) 104 | (flag 0 :type uint16) 105 | (rname "" :type base-string) 106 | (pos 0 :type int32) 107 | (mapq 0 :type octet) 108 | (cigar "" :type base-string) 109 | (rnext "" :type base-string) 110 | (pnext 0 :type int32) 111 | (tlen 0 :type int32) 112 | (seq "" :type base-string) 113 | (qual "" :type base-string) 114 | (tags '() :type list) 115 | (temps '() :type list)) 116 | 117 | (setf (documentation 'make-sam-alignment 'function) 118 | "Default constructor for struct sam-alignment." 119 | (documentation 'sam-alignment-p 'function) 120 | "Default predicate for struct sam-alignment." 121 | (documentation 'copy-sam-alignment 'function) 122 | "Default copier function for struct sam-header." 123 | (documentation 'sam-alignment-qname 'function) 124 | "Access the sam-alignment Query template NAME of type base-string." 125 | (documentation 'sam-alignment-flag 'function) 126 | "Access the sam-alignment bitwise FLAG of type fixnum." 127 | (documentation 'sam-alignment-rname 'function) 128 | "Access the sam-alignment Reference sequence NAME of type base-string." 129 | (documentation 'sam-alignment-pos 'function) 130 | "Access the sam-alignment 1-based leftmost mapping POSition of type int32." 131 | (documentation 'sam-alignment-mapq 'function) 132 | "Access the sam-alignment MAPping quality of type fixnum." 133 | (documentation 'sam-alignment-cigar 'function) 134 | "Access the sam-alignment CIGAR string of type base-string." 135 | (documentation 'sam-alignment-rnext 'function) 136 | "Access the sam-alignment Reference sequence name of the mate/NEXT read of type base-string." 137 | (documentation 'sam-alignment-pnext 'function) 138 | "Access the sam-alignment 1-based leftmost mapping Position of the mate/NEXT read of type int32." 139 | (documentation 'sam-alignment-tlen 'function) 140 | "Access the sam-alignment observed Template LENgth of type int32." 141 | (documentation 'sam-alignment-seq 'function) 142 | "Access the sam-alignment segment SEQuence of type base-string." 143 | (documentation 'sam-alignment-qual 'function) 144 | "Access the sam-alignment ASCII of Phred-scaled base QUALity+33 of type base-string." 145 | (documentation 'sam-alignment-tags 'function) 146 | "Access the sam-alignment optional fields of type property list." 147 | (documentation 'sam-alignment-temps 'function) 148 | "Access the sam-alignment temporary values of type property list.") 149 | 150 | (declaim (inline sam-alignment-tag (setf sam-alignment-tag))) 151 | 152 | (defun sam-alignment-tag (aln tag) 153 | "Access a sam-alignment optional field in the sam-alignment." 154 | (declare (sam-alignment aln) (symbol tag) #.*optimization*) 155 | (getf (sam-alignment-tags aln) tag)) 156 | 157 | (defun (setf sam-alignment-tag) (value aln tag) 158 | "Access a sam-alignment optional field in the sam-alignment." 159 | (declare (sam-alignment aln) (symbol tag) #.*optimization*) 160 | (setf (getf (sam-alignment-tags aln) tag) value)) 161 | 162 | (declaim (inline sam-alignment-temp (setf sam-alignment-temp))) 163 | 164 | (defun sam-alignment-temp (aln tag) 165 | "Access a sam-alignment temporary value in the sam-alignment." 166 | (declare (sam-alignment aln) (symbol tag) #.*optimization*) 167 | (getf (sam-alignment-temps aln) tag)) 168 | 169 | (defun (setf sam-alignment-temp) (value aln tag) 170 | "Access a sam-alignment temporary value in the sam-alignment." 171 | (declare (sam-alignment aln) (symbol tag) #.*optimization*) 172 | (setf (getf (sam-alignment-temps aln) tag) value)) 173 | 174 | (declaim (inline sam-alignment-refid (setf sam-alignment-refid))) 175 | 176 | (defun sam-alignment-refid (aln) 177 | "Access the commonly used REFID temporary field in the sam-alignment." 178 | (sam-alignment-temp aln :refid)) 179 | 180 | (defun (setf sam-alignment-refid) (new-value aln) 181 | "Access the commonly used REFID temporary field in the sam-alignment." 182 | (setf (sam-alignment-temp aln :refid) new-value)) 183 | 184 | (declaim (inline check-refid-type)) 185 | 186 | (defun check-refid-type (value) 187 | "Ensure that value is probably an int32." 188 | (check-type value 189 | #+x86 integer 190 | #+x86-64 fixnum 191 | "a 32-bit integer") 192 | value) 193 | 194 | (defun coordinate< (aln1 aln2) 195 | "Compare two alignments according to their coordinate. 196 | See http://samtools.github.io/hts-specs/SAMv1.pdf - Section 1.3, Tag @HD, SO." 197 | (declare (sam-alignment aln1 aln2) #.*optimization*) 198 | (let ((refid1 (check-refid-type (sam-alignment-refid aln1))) 199 | (refid2 (check-refid-type (sam-alignment-refid aln2)))) 200 | (declare (int32 refid1 refid2)) 201 | (cond ((< refid1 refid2) (>= refid1 0)) 202 | ((< refid2 refid1) (< refid2 0)) 203 | (t (< (sam-alignment-pos aln1) 204 | (sam-alignment-pos aln2)))))) 205 | 206 | (defconstant +multiple+ #x1 207 | "Bit value for sam-alignment-flag: template having multiple segments in sequencing. 208 | See http://samtools.github.io/hts-specs/SAMv1.pdf - Section 1.4.2.") 209 | (defconstant +proper+ #x2 210 | "Bit value for sam-alignment-flag: each segment properly aligned according to the aligner. 211 | See http://samtools.github.io/hts-specs/SAMv1.pdf - Section 1.4.2.") 212 | (defconstant +unmapped+ #x4 213 | "Bit value for sam-alignment-flag: segment unmapped. 214 | See http://samtools.github.io/hts-specs/SAMv1.pdf - Section 1.4.2.") 215 | (defconstant +next-unmapped+ #x8 216 | "Bit value for sam-alignment-flag: next segment in the template unmapped. 217 | See http://samtools.github.io/hts-specs/SAMv1.pdf - Section 1.4.2.") 218 | (defconstant +reversed+ #x10 219 | "Bit value for sam-alignment-flag: SEQ being reversed complemented. 220 | See http://samtools.github.io/hts-specs/SAMv1.pdf - Section 1.4.2.") 221 | (defconstant +next-reversed+ #x20 222 | "Bit value for sam-alignment-flag: SEQ of the next segment in the template being reversed. 223 | See http://samtools.github.io/hts-specs/SAMv1.pdf - Section 1.4.2.") 224 | (defconstant +first+ #x40 225 | "Bit value for sam-alignment-flag: the first segment in the template. 226 | See http://samtools.github.io/hts-specs/SAMv1.pdf - Section 1.4.2.") 227 | (defconstant +last+ #x80 228 | "Bit value for sam-alignment-flag: the last segment in the template. 229 | See http://samtools.github.io/hts-specs/SAMv1.pdf - Section 1.4.2.") 230 | (defconstant +secondary+ #x100 231 | "Bit value for sam-alignment-flag: secondary alignment. 232 | See http://samtools.github.io/hts-specs/SAMv1.pdf - Section 1.4.2.") 233 | (defconstant +qc-failed+ #x200 234 | "Bit value for sam-alignment-flag: not passing quality controls. 235 | See http://samtools.github.io/hts-specs/SAMv1.pdf - Section 1.4.2.") 236 | (defconstant +duplicate+ #x400 237 | "Bit value for sam-alignment-flag: PCR or optical duplicate. 238 | See http://samtools.github.io/hts-specs/SAMv1.pdf - Section 1.4.2.") 239 | (defconstant +supplementary+ #x800 240 | "Bit value for sam-alignment-flag: supplementary alignment. 241 | See http://samtools.github.io/hts-specs/SAMv1.pdf - Section 1.4.2.") 242 | 243 | (declaim (inline sam-alignment-multiple-p 244 | sam-alignment-proper-p 245 | sam-alignment-unmapped-p 246 | sam-alignment-next-unmapped-p 247 | sam-alignment-reversed-p 248 | sam-alignment-next-reversed-p 249 | sam-alignment-first-p 250 | sam-alignment-last-p 251 | sam-alignment-secondary-p 252 | sam-alignment-qc-failed-p 253 | sam-alignment-duplicate-p 254 | sam-alignment-supplementary-p)) 255 | 256 | (defun sam-alignment-multiple-p (aln) 257 | "Check for template having multiple segments in sequencing in sam-alignment-flag. 258 | See http://samtools.github.io/hts-specs/SAMv1.pdf - Section 1.4.2." 259 | (declare (sam-alignment aln) #.*optimization*) 260 | (/= (logand (sam-alignment-flag aln) +multiple+) 0)) 261 | 262 | (defun sam-alignment-proper-p (aln) 263 | "Check for each segment being properly aligned according to the aligner in sam-alignment-flag. 264 | See http://samtools.github.io/hts-specs/SAMv1.pdf - Section 1.4.2." 265 | (declare (sam-alignment aln) #.*optimization*) 266 | (/= (logand (sam-alignment-flag aln) +proper+) 0)) 267 | 268 | (defun sam-alignment-unmapped-p (aln) 269 | "Check for segment unmapped in sam-alignment-flag. 270 | See http://samtools.github.io/hts-specs/SAMv1.pdf - Section 1.4.2." 271 | (declare (sam-alignment aln) #.*optimization*) 272 | (/= (logand (sam-alignment-flag aln) +unmapped+) 0)) 273 | 274 | (defun sam-alignment-next-unmapped-p (aln) 275 | "Check for next segment in the template unmapped in sam-alignment-flag. 276 | See http://samtools.github.io/hts-specs/SAMv1.pdf - Section 1.4.2." 277 | (declare (sam-alignment aln) #.*optimization*) 278 | (/= (logand (sam-alignment-flag aln) +next-unmapped+) 0)) 279 | 280 | (defun sam-alignment-reversed-p (aln) 281 | "Check for SEQ being reversed complemented in sam-alignment-flag. 282 | See http://samtools.github.io/hts-specs/SAMv1.pdf - Section 1.4.2." 283 | (declare (sam-alignment aln) #.*optimization*) 284 | (/= (logand (sam-alignment-flag aln) +reversed+) 0)) 285 | 286 | (defun sam-alignment-next-reversed-p (aln) 287 | "Check for SEQ of the next segment in the template being reversed in sam-alignment-flag. 288 | See http://samtools.github.io/hts-specs/SAMv1.pdf - Section 1.4.2." 289 | (declare (sam-alignment aln) #.*optimization*) 290 | (/= (logand (sam-alignment-flag aln) +next-reversed+) 0)) 291 | 292 | (defun sam-alignment-first-p (aln) 293 | "Check for being the first segment in the template in sam-alignment-flag. 294 | See http://samtools.github.io/hts-specs/SAMv1.pdf - Section 1.4.2." 295 | (declare (sam-alignment aln) #.*optimization*) 296 | (/= (logand (sam-alignment-flag aln) +first+) 0)) 297 | 298 | (defun sam-alignment-last-p (aln) 299 | "Check for being the last segment in the template in sam-alignment-flag. 300 | See http://samtools.github.io/hts-specs/SAMv1.pdf - Section 1.4.2." 301 | (declare (sam-alignment aln) #.*optimization*) 302 | (/= (logand (sam-alignment-flag aln) +last+) 0)) 303 | 304 | (defun sam-alignment-secondary-p (aln) 305 | "Check for secondary alignment in sam-alignment-flag. 306 | See http://samtools.github.io/hts-specs/SAMv1.pdf - Section 1.4.2." 307 | (declare (sam-alignment aln) #.*optimization*) 308 | (/= (logand (sam-alignment-flag aln) +secondary+) 0)) 309 | 310 | (defun sam-alignment-qc-failed-p (aln) 311 | "Check for not passing quality controls in sam-alignment-flag. 312 | See http://samtools.github.io/hts-specs/SAMv1.pdf - Section 1.4.2." 313 | (declare (sam-alignment aln) #.*optimization*) 314 | (/= (logand (sam-alignment-flag aln) +qc-failed+) 0)) 315 | 316 | (defun sam-alignment-duplicate-p (aln) 317 | "Check for PCR or optical duplicate in sam-alignment-flag. 318 | See http://samtools.github.io/hts-specs/SAMv1.pdf - Section 1.4.2." 319 | (declare (sam-alignment aln) #.*optimization*) 320 | (/= (logand (sam-alignment-flag aln) +duplicate+) 0)) 321 | 322 | (defun sam-alignment-supplementary-p (aln) 323 | "Check for supplementary alignment in sam-alignment-flag. 324 | See http://samtools.github.io/hts-specs/SAMv1.pdf - Section 1.4.2." 325 | (declare (sam-alignment aln) #.*optimization*) 326 | (/= (logand (sam-alignment-flag aln) +supplementary+) 0)) 327 | 328 | (declaim (inline sam-alignment-flag-every 329 | sam-alignment-flag-some 330 | sam-alignment-flag-notevery 331 | sam-alignment-flag-notany)) 332 | 333 | (defun sam-alignment-flag-every (aln flag) 334 | "Check for every bit in flag being set in sam-alignment-flag." 335 | (declare (sam-alignment aln) (fixnum flag) #.*optimization*) 336 | (= (logand (sam-alignment-flag aln) flag) flag)) 337 | 338 | (defun sam-alignment-flag-some (aln flag) 339 | "Check for some bits in flag being set in sam-alignment-flag." 340 | (declare (sam-alignment aln) (fixnum flag) #.*optimization*) 341 | (/= (logand (sam-alignment-flag aln) flag) 0)) 342 | 343 | (defun sam-alignment-flag-notevery (aln flag) 344 | "Check for not every bit in flag being set in sam-alignment-flag." 345 | (declare (sam-alignment aln) (fixnum flag) #.*optimization*) 346 | (/= (logand (sam-alignment-flag aln) flag) flag)) 347 | 348 | (defun sam-alignment-flag-notany (aln flag) 349 | "Check for not any bit in flag being set in sam-alignment-flag." 350 | (declare (sam-alignment aln) (fixnum flag) #.*optimization*) 351 | (= (logand (sam-alignment-flag aln) flag) 0)) 352 | 353 | (declaim (inline make-sam)) 354 | 355 | (defstruct sam 356 | "A complete SAM data set that can be contained in a SAM file. 357 | See http://samtools.github.io/hts-specs/SAMv1.pdf - Section 1. 358 | The struct sam has a default constructor make-sam. 359 | Accessor sam-header of type sam-header refers to the header. 360 | Accessor sam-alignments of type list of sam-alignment or sequence of lists of sam-alignment refers to the read alignments." 361 | (header (make-sam-header) :type sam-header) 362 | (alignments '() :type sequence)) 363 | 364 | (setf (documentation 'make-sam 'function) 365 | "Default constructor for struct sam." 366 | (documentation 'sam-p 'function) 367 | "Default predicate for struct sam." 368 | (documentation 'copy-sam 'function) 369 | "Default copier function for struct sam." 370 | (documentation 'sam-header 'function) 371 | "Access the sam header of type sam-header." 372 | (documentation 'sam-alignments 'function) 373 | "Access the sam list of sam-alignment instances.") 374 | 375 | (defun map-sam-alignments (alns function) 376 | (if (listp (elt alns 0)) 377 | (map nil (lambda (chunk) (mapc function chunk)) alns) 378 | (map nil function alns))) 379 | 380 | (defmacro do-sam-alignments ((var sam) &body body) 381 | `(map-sam-alignments ,sam (lambda (,var) ,@body))) 382 | 383 | ;;; mapping cigar strings to alists or avectors 384 | 385 | (define-symbol-macro cigar-operations "MmIiDdNnSsHhPpXx=") 386 | 387 | (defconstant +min-cigar-operation+ (reduce #'min cigar-operations :key #'char-code) 388 | "The smallest CIGAR operation. 389 | See http://samtools.github.io/hts-specs/SAMv1.pdf - Section 1.4.6.") 390 | 391 | (defconstant +max-cigar-operation+ (reduce #'max cigar-operations :key #'char-code) 392 | "The largest CIGAR operation. 393 | See http://samtools.github.io/hts-specs/SAMv1.pdf - Section 1.4.6.") 394 | 395 | (eval-when (#+sbcl :compile-toplevel :load-toplevel :execute) 396 | (defun make-cigar-operations-table () 397 | "Map CIGAR operations represented as characters to upcase characters. 398 | See http://samtools.github.io/hts-specs/SAMv1.pdf - Section 1.4.6." 399 | (let ((table (make-array (1+ (- +max-cigar-operation+ 400 | +min-cigar-operation+)) 401 | :initial-element nil 402 | #+lispworks :allocation #+lispworks :long-lived 403 | #+lispworks :single-thread #+lispworks t))) 404 | (loop for char across cigar-operations do 405 | (setf (svref table (- (char-code char) +min-cigar-operation+)) 406 | (char-upcase char))) 407 | table))) 408 | 409 | (defglobal *cigar-operations* 410 | (make-cigar-operations-table) 411 | "Map CIGAR operations represented as characters to upcase characters. 412 | See http://samtools.github.io/hts-specs/SAMv1.pdf - Section 1.4.6.") 413 | 414 | (declaim (inline make-cigar-operation)) 415 | 416 | (defun make-cigar-operation (table cigar i) 417 | "Parse a CIGAR length + operation from position i in the cigar string. 418 | Return a cons cell with upcase character + length, and a next position in the cigar string. 419 | See http://samtools.github.io/hts-specs/SAMv1.pdf - Section 1.4.6." 420 | (declare (simple-base-string cigar) (fixnum i) #.*optimization*) 421 | (loop for j of-type fixnum = i then (1+ j) 422 | for char of-type fixnum = (char-code (schar cigar j)) 423 | when (or (< char #.(char-code #\0)) (> char #.(char-code #\9))) 424 | return (values (let ((length (parse-integer cigar :start i :end j)) 425 | (operation (svref table (the fixnum (- char +min-cigar-operation+))))) 426 | (if operation (cons operation length) 427 | (error "Invalid CIGAR operation ~S." (code-char char)))) 428 | (the fixnum (1+ j))))) 429 | 430 | (defglobal *cigar-list-cache* 431 | (let ((table (make-synchronized-hash-table :test #'equal))) 432 | (setf (gethash "*" table) '()) 433 | table) 434 | "Cache CIGAR Strings to association lists. 435 | See http://samtools.github.io/hts-specs/SAMv1.pdf - Section 1.4.6.") 436 | 437 | (declaim (notinline slow-scan-cigar-string-to-list)) 438 | 439 | (defun slow-scan-cigar-string-to-list (cigar) 440 | "Convert a cigar string to an association list, slow path. 441 | See http://samtools.github.io/hts-specs/SAMv1.pdf - Section 1.4.6." 442 | (declare (base-string cigar) #.*optimization*) 443 | (let ((end (length cigar))) 444 | (declare (fixnum end)) 445 | (multiple-value-bind (cigar i) (unwrap-displaced-array cigar) 446 | (declare (simple-base-string cigar) (fixnum i)) 447 | (setq end (the fixnum (+ end i))) 448 | (loop with table = *cigar-operations* 449 | with cigar-operation do 450 | (setf (values cigar-operation i) (make-cigar-operation table cigar i)) 451 | collect cigar-operation into list 452 | until (= i end) 453 | finally (return (with-modify-hash (key value found) 454 | ((the hash-table *cigar-list-cache*) cigar) 455 | (if found value list))))))) 456 | 457 | (declaim (inline fast-scan-cigar-string-to-list)) 458 | 459 | (defun fast-scan-cigar-string-to-list (cigar) 460 | "Convert a cigar string to an association list, fast path. 461 | See http://samtools.github.io/hts-specs/SAMv1.pdf - Section 1.4.6." 462 | (declare (base-string cigar) #.*optimization*) 463 | (multiple-value-bind (value found) 464 | (gethash cigar (the hash-table *cigar-list-cache*)) 465 | (if found value (slow-scan-cigar-string-to-list cigar)))) 466 | 467 | (defglobal *cigar-vector-cache* 468 | (let ((table (make-synchronized-hash-table :test #'equal))) 469 | (setf (gethash "*" table) #()) 470 | table) 471 | "Cache CIGAR strings to assocation vectors. 472 | See http://samtools.github.io/hts-specs/SAMv1.pdf - Section 1.4.6.") 473 | 474 | (declaim (notinline slow-scan-cigar-string-to-vector)) 475 | 476 | (defun slow-scan-cigar-string-to-vector (cigar) 477 | "Convert a cigar string to an assocation vector, slow path. 478 | See http://samtools.github.io/hts-specs/SAMv1.pdf - Section 1.4.6." 479 | (declare (base-string cigar) #.*optimization*) 480 | (let ((end (length cigar))) 481 | (declare (fixnum end)) 482 | (multiple-value-bind (cigar i) (unwrap-displaced-array cigar) 483 | (declare (simple-base-string cigar) (fixnum i)) 484 | (setq end (the fixnum (+ end i))) 485 | (loop with table = *cigar-operations* 486 | with vector = (make-array (loop for j of-type fixnum = i then (1+ j) 487 | until (= j end) 488 | count (let ((char (char-code (schar cigar j)))) 489 | (declare (fixnum char)) 490 | (or (< char #.(char-code #\0)) (> char #.(char-code #\9)))))) 491 | with cigar-operation 492 | for index of-type fixnum from 0 do 493 | (setf (values cigar-operation i) (make-cigar-operation table cigar i)) 494 | (setf (svref vector index) cigar-operation) 495 | until (= i end) 496 | finally (return (with-modify-hash (key value found) 497 | ((the hash-table *cigar-vector-cache*) cigar) 498 | (if found value vector))))))) 499 | 500 | (declaim (inline fast-scan-cigar-string-to-vector)) 501 | 502 | (defun fast-scan-cigar-string-to-vector (cigar) 503 | "Convert a cigar string to an association vector, fast path. 504 | See http://samtools.github.io/hts-specs/SAMv1.pdf - Section 1.4.6." 505 | (declare (base-string cigar) #.*optimization*) 506 | (or (gethash cigar (the hash-table *cigar-vector-cache*)) 507 | (slow-scan-cigar-string-to-vector cigar))) 508 | 509 | (declaim (inline scan-cigar-string)) 510 | 511 | (defun scan-cigar-string (type cigar) 512 | "Convert a cigar string to an association 'list or 'vector. 513 | See http://samtools.github.io/hts-specs/SAMv1.pdf - Section 1.4.6." 514 | (declare (symbol type)) 515 | (ecase type 516 | (list (fast-scan-cigar-string-to-list cigar)) 517 | (vector (fast-scan-cigar-string-to-vector cigar)))) 518 | 519 | (define-compiler-macro scan-cigar-string (&whole form type cigar) 520 | (case type 521 | (list `(fast-scan-cigar-string-to-list ,cigar)) 522 | (vector `(fast-scan-cigar-string-to-vector ,cigar)) 523 | (t form))) 524 | -------------------------------------------------------------------------------- /save-elprep-script.lisp: -------------------------------------------------------------------------------- 1 | (in-package :cl-user) 2 | 3 | (let ((quicklisp-init (print (merge-pathnames "quicklisp/setup.lisp" (user-homedir-pathname))))) 4 | (when (probe-file quicklisp-init) 5 | (load quicklisp-init))) 6 | 7 | (asdf:load-system :elprep) 8 | 9 | #+lispworks 10 | (deliver #'elprep:elprep-script "elprep" 4 :multiprocessing t) 11 | 12 | #+sbcl 13 | (sb-ext:save-lisp-and-die "elprep" :toplevel #'elprep:elprep-script :executable t :save-runtime-options t) 14 | -------------------------------------------------------------------------------- /simple-filters.lisp: -------------------------------------------------------------------------------- 1 | (in-package :elprep) 2 | (in-simple-base-string-syntax) 3 | 4 | ;; filters 5 | 6 | (defun replace-reference-sequence-dictionary (reference-sequence-dictionary) 7 | "A filter for replacing the reference sequence dictionary in a sam-header." 8 | (lambda (header) 9 | ;; changing the reference-sequence-dictionary may destroy the sorting order 10 | (let ((sorting-order (getf (sam-header-hd header) :so))) 11 | (when sorting-order 12 | (when (string= sorting-order "coordinate") 13 | (unless (loop with previous-pos = -1 14 | with old-dictionary = (sam-header-sq header) 15 | for entry in reference-sequence-dictionary 16 | for sname = (getf entry :sn) 17 | for pos = (position sname old-dictionary :test #'string= :key (lambda (entry) (getf entry :sn))) 18 | do (cond ((null pos)) 19 | ((> pos previous-pos) 20 | (setq previous-pos pos)) 21 | (t (return nil))) 22 | finally (return t)) 23 | ; "unknown", not "unsorted", because the offending snames may not occur in the reads 24 | (sam-header-ensure-hd header :so "unknown"))))) 25 | (let ((reference-sequence-table 26 | (make-single-thread-hash-table :test #'equal))) 27 | (declare (hash-table reference-sequence-table)) 28 | (loop for entry in reference-sequence-dictionary 29 | do (setf (gethash (getf entry :sn) reference-sequence-table) t)) 30 | (setf (sam-header-sq header) reference-sequence-dictionary) 31 | (lambda () 32 | (lambda (alignment) 33 | (declare (sam-alignment alignment) #.*optimization*) 34 | (gethash (sam-alignment-rname alignment) reference-sequence-table)))))) 35 | 36 | (defun replace-reference-sequence-dictionary-from-sam-file (sam-file) 37 | "A filter for replacing the reference sequence dictionary in a sam-header with one parsed from the given SAM/DICT file." 38 | (with-open-sam (in sam-file :direction :input :header-only t) 39 | (let* ((header (parse-sam-header in)) 40 | (dictionary (sam-header-sq header))) 41 | (replace-reference-sequence-dictionary dictionary)))) 42 | 43 | (defun filter-unmapped-reads (header) 44 | "A filter for removing unmapped sam-alignment instances, based on FLAG." 45 | (declare (ignore header)) 46 | (lambda () 47 | (lambda (alignment) 48 | (declare (sam-alignment alignment) #.*optimization*) 49 | (= 0 (logand (sam-alignment-flag alignment) +unmapped+))))) 50 | 51 | (defun filter-unmapped-reads-strict (header) 52 | "A filter for removing unmapped sam-alignment instances, based on FLAG, or POS=0, or RNAME=*." 53 | (declare (ignore header)) 54 | (lambda () 55 | (lambda (alignment) 56 | (declare (sam-alignment alignment) #.*optimization*) 57 | (and (= 0 (logand (sam-alignment-flag alignment) +unmapped+)) 58 | (/= (sam-alignment-pos alignment) 0) 59 | (string/= (sam-alignment-rname alignment) "*"))))) 60 | 61 | (defun filter-duplicate-reads (header) 62 | "A filter for removing duplicate sam-alignment instances, based on FLAG." 63 | (declare (ignore header)) 64 | (lambda () 65 | (lambda (alignment) 66 | (declare (sam-alignment alignment) #.*optimization*) 67 | (= 0 (logand (sam-alignment-flag alignment) +duplicate+))))) 68 | 69 | (defun filter-optional-reads (header) 70 | "A filter for removing sam-alignment instances that represent optional information in elPrep." 71 | (when (sam-header-user-tag header :|@sr|) 72 | (remf (sam-header-user-tags header) :|@sr|) 73 | (lambda () 74 | (lambda (alignment) 75 | (declare (sam-alignment alignment) #.*optimization*) 76 | (not (sam-alignment-tag alignment :|sr|)))))) 77 | 78 | (defun add-or-replace-read-group (read-group) 79 | "A filter for adding or replacing the read group both in sam-header and each sam-alignment." 80 | (lambda (header) 81 | (setf (sam-header-rg header) (list read-group)) 82 | (let ((id (getf read-group :ID))) 83 | (lambda () 84 | (lambda (alignment) 85 | (declare (sam-alignment alignment) #.*optimization*) 86 | (setf (sam-alignment-tag alignment :rg) id) 87 | t))))) 88 | 89 | (defun parse-read-group-from-string (string) 90 | "Parse an @RG line in a string. 91 | See http://samtools.github.io/hts-specs/SAMv1.pdf - Section 1.3, 92 | except that entries are separated by white space instead of tabulators." 93 | (declare (string string) #.*optimization*) 94 | (let ((sbstring (coerce string 'simple-base-string)) 95 | (record '()) 96 | (string-buffer (make-array 80 97 | :element-type 'base-char 98 | :adjustable t 99 | :fill-pointer 0 100 | #+lispworks :single-thread #+lispworks t)) 101 | (tag-string (make-array 2 :element-type 'base-char))) 102 | (declare (simple-base-string sbstring tag-string) (dynamic-extent tag-string)) 103 | (with-input-from-string (stream sbstring) 104 | (labels ((read-char-or-error () 105 | (or (read-char stream nil nil) 106 | (error "Unexpected end of read group string ~S." string))) 107 | (scan-tag () 108 | (unless (peek-char t stream nil nil) 109 | (assert (getf record :ID)) 110 | (return-from parse-read-group-from-string record)) 111 | (setf (schar tag-string 0) (char-upcase (read-char-or-error)) 112 | (schar tag-string 1) (char-upcase (read-char-or-error))) 113 | (when (char/= (read-char-or-error) #\:) 114 | (error "Expected tag separator for tag ~A in read group string ~S." tag-string string)) 115 | tag-string) 116 | (scan-string (tag) 117 | (setf (fill-pointer string-buffer) 0) 118 | (when (char= (peek-char nil stream nil #\Space) #\Space) 119 | (error "Expected tag value for tag ~S in read group string ~S." tag string)) 120 | (loop do (vector-push-extend (read-char-or-error) string-buffer) 121 | until (char= (peek-char nil stream nil #\Space) #\Space)) 122 | (subseq string-buffer 0 (fill-pointer string-buffer)))) 123 | (loop for tag = (scan-tag) 124 | for val = (scan-string tag) do 125 | (setf record (nconc record (string-case (tag :default (if (sam-header-user-tag-p tag) 126 | (list (unique (intern-key/copy tag) record) val) 127 | (error "Unknown tag ~A in read group string ~S." tag string))) 128 | ("ID" (list (unique :ID record) val)) 129 | ("CN" (list (unique :CN record) val)) 130 | ("DS" (list (unique :DS record) val)) 131 | ("DT" (list (unique :DT record) (parse-date-time val))) 132 | ("FO" (list (unique :FO record) val)) 133 | ("KS" (list (unique :KS record) val)) 134 | ("LB" (list (unique :LB record) val)) 135 | ("PG" (list (unique :PG record) val)) 136 | ("PI" (list (unique :PI record) (parse-integer val))) 137 | ("PL" (list (unique :PL record) val)) 138 | ("PU" (list (unique :PU record) val)) 139 | ("SM" (list (unique :SM record) val)))))))))) 140 | 141 | (defun add-pg-line (id &rest args &key pn cl ds vn) 142 | "A filter for adding a @PG tag to a sam-header, and ensuring that it is the first one in the chain." 143 | (declare (dynamic-extent args) (ignore pn cl ds vn)) 144 | (let ((line (copy-list args))) 145 | (lambda (header) 146 | (let ((pgs (sam-header-pg header)) 147 | (new-line line)) 148 | ; ensure id is unique 149 | (loop while (find id pgs :key (lambda (pg) (getf pg :id)) :test #'equal) 150 | do (setq id (format nil "~A ~4,'0X" id (random #x10000)))) 151 | ; determine PP tag 152 | (let ((next-id (loop for pg in pgs 153 | for id = (getf pg :id) 154 | unless (find id pgs :key (lambda (pg) (getf pg :pp)) :test #'equal) 155 | return id))) 156 | (when next-id (setq new-line (list* :pp next-id new-line)))) 157 | ; add ID tag 158 | (setq new-line (list* :id id new-line)) 159 | ; add @PG line 160 | (push new-line (sam-header-pg header)) 161 | ; no next-level filters necessary 162 | nil)))) 163 | 164 | (defun rename-chromosomes (header) 165 | "A filter for prepending \"chr\" to the reference sequence names in a sam-header, and in RNAME and RNEXT in each sam-alignment." 166 | (flet ((string-append (s1 s2) 167 | #+lispworks (lw:string-append s1 s2) 168 | #+sbcl (concatenate 'simple-base-string s1 s2))) 169 | (declare (inline string-append)) 170 | (loop for plist in (sam-header-sq header) 171 | for sn = (getf plist :SN) 172 | when sn do (setf (getf plist :SN) (string-append "chr" sn))) 173 | (lambda () 174 | (lambda (alignment) 175 | (declare (sam-alignment alignment) #.*optimization*) 176 | (let ((rname (sam-alignment-rname alignment))) 177 | (cond ((string= rname "=")) 178 | ((string= rname "*")) 179 | (t (setf (sam-alignment-rname alignment) (string-append "chr" rname))))) 180 | (let ((rnext (sam-alignment-rnext alignment))) 181 | (cond ((string= rnext "=")) 182 | ((string= rnext "*")) 183 | (t (setf (sam-alignment-rnext alignment) (string-append "chr" rnext))))) 184 | t)))) 185 | 186 | (defun add-refid (header) 187 | "A filter for adding the refid (index in the reference sequence dictionary) to sam-alignment instances." 188 | (let ((reference-sequence-table (make-single-thread-hash-table :test #'equal))) 189 | (loop with ctr = -1 190 | for sn-form in (sam-header-sq header) 191 | do (setf (gethash (getf sn-form :SN) reference-sequence-table) (incf ctr))) 192 | (lambda () 193 | (lambda (alignment) 194 | ; fill in refid 195 | (setf (sam-alignment-refid alignment) 196 | (gethash (sam-alignment-rname alignment) reference-sequence-table -1)))))) 197 | -------------------------------------------------------------------------------- /simple-trees.lisp: -------------------------------------------------------------------------------- 1 | (in-package :elprep) 2 | (in-simple-base-string-syntax) 3 | 4 | (declaim (inline make-simple-tree)) 5 | 6 | (defstruct (simple-tree 7 | (:constructor make-simple-tree 8 | (rank &optional depth &aux (nodes (make-array rank :initial-element nil 9 | #+lispworks :single-thread #+lispworks t))))) 10 | "A simple balanced tree. 11 | The struct simple-tree has a constructor that takes the rank and optionally the depth as parameters. 12 | Accessor simple-tree-index of type fixnum points to the current node in simple-tree-nodes. 13 | Read-only accessor simple-tree-nodes of type simple-vector refers to the nodes or leaves. 14 | Read-only accessor simple-tree-depth of type fixnum refers to the depth of the tree. 15 | Read-only accessor simple-tree-rank of type fixnum refers to the rank of the tree." 16 | (index 0 :type fixnum) 17 | (nodes #() :type simple-vector :read-only t) 18 | (depth 0 :type fixnum :read-only t) 19 | (rank 0 :type fixnum :read-only t)) 20 | 21 | (setf (documentation 'make-simple-tree 'function) 22 | "Constructor for struct simple-tree that takes the rank and optionally the depth as parameters." 23 | (documentation 'simple-tree-p 'function) 24 | "Default predicate for struct simple-tree." 25 | (documentation 'copy-simple-tree 'function) 26 | "Default copier function for struct simple-tree." 27 | (documentation 'simple-tree-index 'function) 28 | "Access the simple-tree index of type fixnum." 29 | (documentation 'simple-tree-nodes 'function) 30 | "Read the simple-tree nodes of type simple-vector." 31 | (documentation 'simple-tree-depth 'function) 32 | "Read the simple-tree depth of type fixnum." 33 | (documentation 'simple-tree-rank 'function) 34 | "Read the simple-tree rank of type fixnum.") 35 | 36 | (declaim (inline make-subtree)) 37 | 38 | (defun make-subtree (tree) 39 | "Create a subtree for the given simple-tree." 40 | (declare (simple-tree tree) #.*optimization*) 41 | (make-simple-tree 42 | (simple-tree-rank tree) 43 | (the fixnum (1- (simple-tree-depth tree))))) 44 | 45 | (declaim (inline make-super-tree)) 46 | 47 | (defun make-super-tree (tree) 48 | "Create a super tree for the given simple-tree, and add the given tree to the new super tree as its first node." 49 | (declare (simple-tree tree) #.*optimization*) 50 | (let ((super-tree 51 | (make-simple-tree 52 | (simple-tree-rank tree) 53 | (the fixnum (1+ (simple-tree-depth tree)))))) 54 | (declare (simple-tree super-tree)) 55 | (setf (simple-tree-index super-tree) 1) 56 | (setf (svref (simple-tree-nodes super-tree) 0) tree) 57 | super-tree)) 58 | 59 | (defun insert-node (top-tree node) 60 | "Insert a node into the given simple-tree. Returns the tree unless it is fully occupied. 61 | If it is fully occupied, create a fresh super tree, insert both the given tree and the node there, 62 | and return that super tree." 63 | (declare (simple-tree top-tree) #.*optimization*) 64 | (labels ((recur (tree) 65 | (declare (simple-tree tree)) 66 | (let ((index (simple-tree-index tree)) 67 | (nodes (simple-tree-nodes tree))) 68 | (declare (fixnum index) (simple-vector nodes)) 69 | (when (< index (length nodes)) 70 | (when (= (simple-tree-depth tree) 0) 71 | (setf (svref nodes index) node) 72 | (setf (simple-tree-index tree) (the fixnum (1+ index))) 73 | ;; done: jump out of the recursion! 74 | (return-from insert-node top-tree)) 75 | ;; depth /= 0 76 | (recur (or (svref nodes index) (setf (svref nodes index) (make-subtree tree)))) 77 | ;; tree is full, try to create a new subtree 78 | (setf (simple-tree-index tree) (incf index)) 79 | (when (< index (length nodes)) 80 | (recur (setf (svref nodes index) (make-subtree tree)))))))) 81 | (recur top-tree) 82 | ;; tree is full, try from a new top tree 83 | (recur (setq top-tree (make-super-tree top-tree))) 84 | ;; new top tree is also full, which should never happen 85 | (error "This code should not be reached."))) 86 | 87 | (defun tree-reduce (tree threads map reduce) 88 | "Perform a parallel map/reduce traversal over the given simple-tree." 89 | (declare (simple-tree tree) (fixnum threads) (function map reduce) #.*optimization*) 90 | (claws:reset-workers threads) 91 | (unwind-protect 92 | (labels ((reduce-vector (vector start end map) 93 | (declare (simple-vector vector) (fixnum start end) (function map)) 94 | (let ((length (the fixnum (- end start)))) 95 | (declare (fixnum length)) 96 | (cond ((= length 0) '()) 97 | ((= length 1) 98 | (funcall map (svref vector start))) 99 | (t (let* ((half (the fixnum (ash length -1))) 100 | (middle (the fixnum (+ start half))) 101 | left right) 102 | (declare (fixnum half middle)) 103 | (claws:spawn (left) (reduce-vector vector start middle map)) 104 | (setq right (reduce-vector vector middle end map)) 105 | (claws:sync) 106 | (funcall reduce left right)))))) 107 | (recur-tree (tree) 108 | (declare (simple-tree tree)) 109 | (let ((index (simple-tree-index tree)) 110 | (nodes (simple-tree-nodes tree))) 111 | (declare (fixnum index) (simple-vector nodes)) 112 | (if (= (simple-tree-depth tree) 0) 113 | (reduce-vector nodes 0 index map) 114 | (reduce-vector nodes 0 (if (and (< index (length nodes)) 115 | (svref nodes index)) 116 | (the fixnum (1+ index)) index) 117 | #'recur-tree))))) 118 | (recur-tree tree)) 119 | (claws:reset-workers 1))) 120 | --------------------------------------------------------------------------------