├── .dockerignore
├── .gitignore
├── Dockerfile
├── LICENSE
├── Makefile
├── README.md
├── src
    ├── Makefile
    ├── arch.cc
    ├── arch.h
    ├── bloompat.cc
    ├── bloompat.h
    ├── cluster.cc
    ├── cluster.h
    ├── compairr.cc
    ├── compairr.h
    ├── db.cc
    ├── db.h
    ├── dedup.cc
    ├── dedup.h
    ├── hashtable.cc
    ├── hashtable.h
    ├── overlap.cc
    ├── overlap.h
    ├── threads.h
    ├── util.cc
    ├── util.h
    ├── variants.cc
    ├── variants.h
    ├── zobrist.cc
    └── zobrist.h
└── test
    ├── Makefile
    ├── expected.tsv
    ├── seta.tsv
    ├── setb.tsv
    ├── setc.tsv
    └── test.sh


/.dockerignore:
--------------------------------------------------------------------------------
1 | .git
2 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .DS_Store
2 | 


--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
 # Image recipe for the compairr command-line tool: builds from source on
 # Alpine, runs the test suite during the build, installs the binary and
 # then strips the build toolchain again.
 #
 # Base image is pinned to a specific Alpine release.
 1 | FROM alpine:3.13
 # All relative paths in the following COPY/RUN instructions resolve here.
 2 | WORKDIR /opt/compairr
 # Only the top-level Makefile plus the src/ and test/ trees are copied into
 # the build context directory; nothing else from the repository is used.
 3 | COPY Makefile .
 4 | COPY src ./src
 5 | COPY test ./test
 # One RUN instruction, chained with `&&`, so any failing step (including
 # `make test`) aborts the image build:
 #   1. install libstdc++ together with the build tools make and g++
 #   2. clean, build, test, install, clean again
 #   3. remove make and g++ in the same instruction that installed them
 # libstdc++ is requested explicitly and is not in the `apk del` list, so it
 # stays installed (presumably as the runtime dependency of the binary).
 6 | RUN apk add --no-cache libstdc++ make g++ && \
 7 |     make clean && make && make test && make install && make clean && \
 8 |     apk del make g++
 # Exec-form entrypoint: the container always runs this binary.
 # NOTE(review): assumes `make install` places the executable at this path;
 # the Makefile is not visible here - confirm.
 9 | ENTRYPOINT ["/usr/local/bin/compairr"]
 # Default argument passed to the entrypoint when `docker run` is given no
 # arguments; any arguments supplied on the command line replace it.
10 | CMD ["--help"]
11 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 |                     GNU AFFERO GENERAL PUBLIC LICENSE
  2 |                        Version 3, 19 November 2007
  3 | 
  4 |  Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/>
  5 |  Everyone is permitted to copy and distribute verbatim copies
  6 |  of this license document, but changing it is not allowed.
  7 | 
  8 |                             Preamble
  9 | 
 10 |   The GNU Affero General Public License is a free, copyleft license for
 11 | software and other kinds of works, specifically designed to ensure
 12 | cooperation with the community in the case of network server software.
 13 | 
 14 |   The licenses for most software and other practical works are designed
 15 | to take away your freedom to share and change the works.  By contrast,
 16 | our General Public Licenses are intended to guarantee your freedom to
 17 | share and change all versions of a program--to make sure it remains free
 18 | software for all its users.
 19 | 
 20 |   When we speak of free software, we are referring to freedom, not
 21 | price.  Our General Public Licenses are designed to make sure that you
 22 | have the freedom to distribute copies of free software (and charge for
 23 | them if you wish), that you receive source code or can get it if you
 24 | want it, that you can change the software or use pieces of it in new
 25 | free programs, and that you know you can do these things.
 26 | 
 27 |   Developers that use our General Public Licenses protect your rights
 28 | with two steps: (1) assert copyright on the software, and (2) offer
 29 | you this License which gives you legal permission to copy, distribute
 30 | and/or modify the software.
 31 | 
 32 |   A secondary benefit of defending all users' freedom is that
 33 | improvements made in alternate versions of the program, if they
 34 | receive widespread use, become available for other developers to
 35 | incorporate.  Many developers of free software are heartened and
 36 | encouraged by the resulting cooperation.  However, in the case of
 37 | software used on network servers, this result may fail to come about.
 38 | The GNU General Public License permits making a modified version and
 39 | letting the public access it on a server without ever releasing its
 40 | source code to the public.
 41 | 
 42 |   The GNU Affero General Public License is designed specifically to
 43 | ensure that, in such cases, the modified source code becomes available
 44 | to the community.  It requires the operator of a network server to
 45 | provide the source code of the modified version running there to the
 46 | users of that server.  Therefore, public use of a modified version, on
 47 | a publicly accessible server, gives the public access to the source
 48 | code of the modified version.
 49 | 
 50 |   An older license, called the Affero General Public License and
 51 | published by Affero, was designed to accomplish similar goals.  This is
 52 | a different license, not a version of the Affero GPL, but Affero has
 53 | released a new version of the Affero GPL which permits relicensing under
 54 | this license.
 55 | 
 56 |   The precise terms and conditions for copying, distribution and
 57 | modification follow.
 58 | 
 59 |                        TERMS AND CONDITIONS
 60 | 
 61 |   0. Definitions.
 62 | 
 63 |   "This License" refers to version 3 of the GNU Affero General Public License.
 64 | 
 65 |   "Copyright" also means copyright-like laws that apply to other kinds of
 66 | works, such as semiconductor masks.
 67 | 
 68 |   "The Program" refers to any copyrightable work licensed under this
 69 | License.  Each licensee is addressed as "you".  "Licensees" and
 70 | "recipients" may be individuals or organizations.
 71 | 
 72 |   To "modify" a work means to copy from or adapt all or part of the work
 73 | in a fashion requiring copyright permission, other than the making of an
 74 | exact copy.  The resulting work is called a "modified version" of the
 75 | earlier work or a work "based on" the earlier work.
 76 | 
 77 |   A "covered work" means either the unmodified Program or a work based
 78 | on the Program.
 79 | 
 80 |   To "propagate" a work means to do anything with it that, without
 81 | permission, would make you directly or secondarily liable for
 82 | infringement under applicable copyright law, except executing it on a
 83 | computer or modifying a private copy.  Propagation includes copying,
 84 | distribution (with or without modification), making available to the
 85 | public, and in some countries other activities as well.
 86 | 
 87 |   To "convey" a work means any kind of propagation that enables other
 88 | parties to make or receive copies.  Mere interaction with a user through
 89 | a computer network, with no transfer of a copy, is not conveying.
 90 | 
 91 |   An interactive user interface displays "Appropriate Legal Notices"
 92 | to the extent that it includes a convenient and prominently visible
 93 | feature that (1) displays an appropriate copyright notice, and (2)
 94 | tells the user that there is no warranty for the work (except to the
 95 | extent that warranties are provided), that licensees may convey the
 96 | work under this License, and how to view a copy of this License.  If
 97 | the interface presents a list of user commands or options, such as a
 98 | menu, a prominent item in the list meets this criterion.
 99 | 
100 |   1. Source Code.
101 | 
102 |   The "source code" for a work means the preferred form of the work
103 | for making modifications to it.  "Object code" means any non-source
104 | form of a work.
105 | 
106 |   A "Standard Interface" means an interface that either is an official
107 | standard defined by a recognized standards body, or, in the case of
108 | interfaces specified for a particular programming language, one that
109 | is widely used among developers working in that language.
110 | 
111 |   The "System Libraries" of an executable work include anything, other
112 | than the work as a whole, that (a) is included in the normal form of
113 | packaging a Major Component, but which is not part of that Major
114 | Component, and (b) serves only to enable use of the work with that
115 | Major Component, or to implement a Standard Interface for which an
116 | implementation is available to the public in source code form.  A
117 | "Major Component", in this context, means a major essential component
118 | (kernel, window system, and so on) of the specific operating system
119 | (if any) on which the executable work runs, or a compiler used to
120 | produce the work, or an object code interpreter used to run it.
121 | 
122 |   The "Corresponding Source" for a work in object code form means all
123 | the source code needed to generate, install, and (for an executable
124 | work) run the object code and to modify the work, including scripts to
125 | control those activities.  However, it does not include the work's
126 | System Libraries, or general-purpose tools or generally available free
127 | programs which are used unmodified in performing those activities but
128 | which are not part of the work.  For example, Corresponding Source
129 | includes interface definition files associated with source files for
130 | the work, and the source code for shared libraries and dynamically
131 | linked subprograms that the work is specifically designed to require,
132 | such as by intimate data communication or control flow between those
133 | subprograms and other parts of the work.
134 | 
135 |   The Corresponding Source need not include anything that users
136 | can regenerate automatically from other parts of the Corresponding
137 | Source.
138 | 
139 |   The Corresponding Source for a work in source code form is that
140 | same work.
141 | 
142 |   2. Basic Permissions.
143 | 
144 |   All rights granted under this License are granted for the term of
145 | copyright on the Program, and are irrevocable provided the stated
146 | conditions are met.  This License explicitly affirms your unlimited
147 | permission to run the unmodified Program.  The output from running a
148 | covered work is covered by this License only if the output, given its
149 | content, constitutes a covered work.  This License acknowledges your
150 | rights of fair use or other equivalent, as provided by copyright law.
151 | 
152 |   You may make, run and propagate covered works that you do not
153 | convey, without conditions so long as your license otherwise remains
154 | in force.  You may convey covered works to others for the sole purpose
155 | of having them make modifications exclusively for you, or provide you
156 | with facilities for running those works, provided that you comply with
157 | the terms of this License in conveying all material for which you do
158 | not control copyright.  Those thus making or running the covered works
159 | for you must do so exclusively on your behalf, under your direction
160 | and control, on terms that prohibit them from making any copies of
161 | your copyrighted material outside their relationship with you.
162 | 
163 |   Conveying under any other circumstances is permitted solely under
164 | the conditions stated below.  Sublicensing is not allowed; section 10
165 | makes it unnecessary.
166 | 
167 |   3. Protecting Users' Legal Rights From Anti-Circumvention Law.
168 | 
169 |   No covered work shall be deemed part of an effective technological
170 | measure under any applicable law fulfilling obligations under article
171 | 11 of the WIPO copyright treaty adopted on 20 December 1996, or
172 | similar laws prohibiting or restricting circumvention of such
173 | measures.
174 | 
175 |   When you convey a covered work, you waive any legal power to forbid
176 | circumvention of technological measures to the extent such circumvention
177 | is effected by exercising rights under this License with respect to
178 | the covered work, and you disclaim any intention to limit operation or
179 | modification of the work as a means of enforcing, against the work's
180 | users, your or third parties' legal rights to forbid circumvention of
181 | technological measures.
182 | 
183 |   4. Conveying Verbatim Copies.
184 | 
185 |   You may convey verbatim copies of the Program's source code as you
186 | receive it, in any medium, provided that you conspicuously and
187 | appropriately publish on each copy an appropriate copyright notice;
188 | keep intact all notices stating that this License and any
189 | non-permissive terms added in accord with section 7 apply to the code;
190 | keep intact all notices of the absence of any warranty; and give all
191 | recipients a copy of this License along with the Program.
192 | 
193 |   You may charge any price or no price for each copy that you convey,
194 | and you may offer support or warranty protection for a fee.
195 | 
196 |   5. Conveying Modified Source Versions.
197 | 
198 |   You may convey a work based on the Program, or the modifications to
199 | produce it from the Program, in the form of source code under the
200 | terms of section 4, provided that you also meet all of these conditions:
201 | 
202 |     a) The work must carry prominent notices stating that you modified
203 |     it, and giving a relevant date.
204 | 
205 |     b) The work must carry prominent notices stating that it is
206 |     released under this License and any conditions added under section
207 |     7.  This requirement modifies the requirement in section 4 to
208 |     "keep intact all notices".
209 | 
210 |     c) You must license the entire work, as a whole, under this
211 |     License to anyone who comes into possession of a copy.  This
212 |     License will therefore apply, along with any applicable section 7
213 |     additional terms, to the whole of the work, and all its parts,
214 |     regardless of how they are packaged.  This License gives no
215 |     permission to license the work in any other way, but it does not
216 |     invalidate such permission if you have separately received it.
217 | 
218 |     d) If the work has interactive user interfaces, each must display
219 |     Appropriate Legal Notices; however, if the Program has interactive
220 |     interfaces that do not display Appropriate Legal Notices, your
221 |     work need not make them do so.
222 | 
223 |   A compilation of a covered work with other separate and independent
224 | works, which are not by their nature extensions of the covered work,
225 | and which are not combined with it such as to form a larger program,
226 | in or on a volume of a storage or distribution medium, is called an
227 | "aggregate" if the compilation and its resulting copyright are not
228 | used to limit the access or legal rights of the compilation's users
229 | beyond what the individual works permit.  Inclusion of a covered work
230 | in an aggregate does not cause this License to apply to the other
231 | parts of the aggregate.
232 | 
233 |   6. Conveying Non-Source Forms.
234 | 
235 |   You may convey a covered work in object code form under the terms
236 | of sections 4 and 5, provided that you also convey the
237 | machine-readable Corresponding Source under the terms of this License,
238 | in one of these ways:
239 | 
240 |     a) Convey the object code in, or embodied in, a physical product
241 |     (including a physical distribution medium), accompanied by the
242 |     Corresponding Source fixed on a durable physical medium
243 |     customarily used for software interchange.
244 | 
245 |     b) Convey the object code in, or embodied in, a physical product
246 |     (including a physical distribution medium), accompanied by a
247 |     written offer, valid for at least three years and valid for as
248 |     long as you offer spare parts or customer support for that product
249 |     model, to give anyone who possesses the object code either (1) a
250 |     copy of the Corresponding Source for all the software in the
251 |     product that is covered by this License, on a durable physical
252 |     medium customarily used for software interchange, for a price no
253 |     more than your reasonable cost of physically performing this
254 |     conveying of source, or (2) access to copy the
255 |     Corresponding Source from a network server at no charge.
256 | 
257 |     c) Convey individual copies of the object code with a copy of the
258 |     written offer to provide the Corresponding Source.  This
259 |     alternative is allowed only occasionally and noncommercially, and
260 |     only if you received the object code with such an offer, in accord
261 |     with subsection 6b.
262 | 
263 |     d) Convey the object code by offering access from a designated
264 |     place (gratis or for a charge), and offer equivalent access to the
265 |     Corresponding Source in the same way through the same place at no
266 |     further charge.  You need not require recipients to copy the
267 |     Corresponding Source along with the object code.  If the place to
268 |     copy the object code is a network server, the Corresponding Source
269 |     may be on a different server (operated by you or a third party)
270 |     that supports equivalent copying facilities, provided you maintain
271 |     clear directions next to the object code saying where to find the
272 |     Corresponding Source.  Regardless of what server hosts the
273 |     Corresponding Source, you remain obligated to ensure that it is
274 |     available for as long as needed to satisfy these requirements.
275 | 
276 |     e) Convey the object code using peer-to-peer transmission, provided
277 |     you inform other peers where the object code and Corresponding
278 |     Source of the work are being offered to the general public at no
279 |     charge under subsection 6d.
280 | 
281 |   A separable portion of the object code, whose source code is excluded
282 | from the Corresponding Source as a System Library, need not be
283 | included in conveying the object code work.
284 | 
285 |   A "User Product" is either (1) a "consumer product", which means any
286 | tangible personal property which is normally used for personal, family,
287 | or household purposes, or (2) anything designed or sold for incorporation
288 | into a dwelling.  In determining whether a product is a consumer product,
289 | doubtful cases shall be resolved in favor of coverage.  For a particular
290 | product received by a particular user, "normally used" refers to a
291 | typical or common use of that class of product, regardless of the status
292 | of the particular user or of the way in which the particular user
293 | actually uses, or expects or is expected to use, the product.  A product
294 | is a consumer product regardless of whether the product has substantial
295 | commercial, industrial or non-consumer uses, unless such uses represent
296 | the only significant mode of use of the product.
297 | 
298 |   "Installation Information" for a User Product means any methods,
299 | procedures, authorization keys, or other information required to install
300 | and execute modified versions of a covered work in that User Product from
301 | a modified version of its Corresponding Source.  The information must
302 | suffice to ensure that the continued functioning of the modified object
303 | code is in no case prevented or interfered with solely because
304 | modification has been made.
305 | 
306 |   If you convey an object code work under this section in, or with, or
307 | specifically for use in, a User Product, and the conveying occurs as
308 | part of a transaction in which the right of possession and use of the
309 | User Product is transferred to the recipient in perpetuity or for a
310 | fixed term (regardless of how the transaction is characterized), the
311 | Corresponding Source conveyed under this section must be accompanied
312 | by the Installation Information.  But this requirement does not apply
313 | if neither you nor any third party retains the ability to install
314 | modified object code on the User Product (for example, the work has
315 | been installed in ROM).
316 | 
317 |   The requirement to provide Installation Information does not include a
318 | requirement to continue to provide support service, warranty, or updates
319 | for a work that has been modified or installed by the recipient, or for
320 | the User Product in which it has been modified or installed.  Access to a
321 | network may be denied when the modification itself materially and
322 | adversely affects the operation of the network or violates the rules and
323 | protocols for communication across the network.
324 | 
325 |   Corresponding Source conveyed, and Installation Information provided,
326 | in accord with this section must be in a format that is publicly
327 | documented (and with an implementation available to the public in
328 | source code form), and must require no special password or key for
329 | unpacking, reading or copying.
330 | 
331 |   7. Additional Terms.
332 | 
333 |   "Additional permissions" are terms that supplement the terms of this
334 | License by making exceptions from one or more of its conditions.
335 | Additional permissions that are applicable to the entire Program shall
336 | be treated as though they were included in this License, to the extent
337 | that they are valid under applicable law.  If additional permissions
338 | apply only to part of the Program, that part may be used separately
339 | under those permissions, but the entire Program remains governed by
340 | this License without regard to the additional permissions.
341 | 
342 |   When you convey a copy of a covered work, you may at your option
343 | remove any additional permissions from that copy, or from any part of
344 | it.  (Additional permissions may be written to require their own
345 | removal in certain cases when you modify the work.)  You may place
346 | additional permissions on material, added by you to a covered work,
347 | for which you have or can give appropriate copyright permission.
348 | 
349 |   Notwithstanding any other provision of this License, for material you
350 | add to a covered work, you may (if authorized by the copyright holders of
351 | that material) supplement the terms of this License with terms:
352 | 
353 |     a) Disclaiming warranty or limiting liability differently from the
354 |     terms of sections 15 and 16 of this License; or
355 | 
356 |     b) Requiring preservation of specified reasonable legal notices or
357 |     author attributions in that material or in the Appropriate Legal
358 |     Notices displayed by works containing it; or
359 | 
360 |     c) Prohibiting misrepresentation of the origin of that material, or
361 |     requiring that modified versions of such material be marked in
362 |     reasonable ways as different from the original version; or
363 | 
364 |     d) Limiting the use for publicity purposes of names of licensors or
365 |     authors of the material; or
366 | 
367 |     e) Declining to grant rights under trademark law for use of some
368 |     trade names, trademarks, or service marks; or
369 | 
370 |     f) Requiring indemnification of licensors and authors of that
371 |     material by anyone who conveys the material (or modified versions of
372 |     it) with contractual assumptions of liability to the recipient, for
373 |     any liability that these contractual assumptions directly impose on
374 |     those licensors and authors.
375 | 
376 |   All other non-permissive additional terms are considered "further
377 | restrictions" within the meaning of section 10.  If the Program as you
378 | received it, or any part of it, contains a notice stating that it is
379 | governed by this License along with a term that is a further
380 | restriction, you may remove that term.  If a license document contains
381 | a further restriction but permits relicensing or conveying under this
382 | License, you may add to a covered work material governed by the terms
383 | of that license document, provided that the further restriction does
384 | not survive such relicensing or conveying.
385 | 
386 |   If you add terms to a covered work in accord with this section, you
387 | must place, in the relevant source files, a statement of the
388 | additional terms that apply to those files, or a notice indicating
389 | where to find the applicable terms.
390 | 
391 |   Additional terms, permissive or non-permissive, may be stated in the
392 | form of a separately written license, or stated as exceptions;
393 | the above requirements apply either way.
394 | 
395 |   8. Termination.
396 | 
397 |   You may not propagate or modify a covered work except as expressly
398 | provided under this License.  Any attempt otherwise to propagate or
399 | modify it is void, and will automatically terminate your rights under
400 | this License (including any patent licenses granted under the third
401 | paragraph of section 11).
402 | 
403 |   However, if you cease all violation of this License, then your
404 | license from a particular copyright holder is reinstated (a)
405 | provisionally, unless and until the copyright holder explicitly and
406 | finally terminates your license, and (b) permanently, if the copyright
407 | holder fails to notify you of the violation by some reasonable means
408 | prior to 60 days after the cessation.
409 | 
410 |   Moreover, your license from a particular copyright holder is
411 | reinstated permanently if the copyright holder notifies you of the
412 | violation by some reasonable means, this is the first time you have
413 | received notice of violation of this License (for any work) from that
414 | copyright holder, and you cure the violation prior to 30 days after
415 | your receipt of the notice.
416 | 
417 |   Termination of your rights under this section does not terminate the
418 | licenses of parties who have received copies or rights from you under
419 | this License.  If your rights have been terminated and not permanently
420 | reinstated, you do not qualify to receive new licenses for the same
421 | material under section 10.
422 | 
423 |   9. Acceptance Not Required for Having Copies.
424 | 
425 |   You are not required to accept this License in order to receive or
426 | run a copy of the Program.  Ancillary propagation of a covered work
427 | occurring solely as a consequence of using peer-to-peer transmission
428 | to receive a copy likewise does not require acceptance.  However,
429 | nothing other than this License grants you permission to propagate or
430 | modify any covered work.  These actions infringe copyright if you do
431 | not accept this License.  Therefore, by modifying or propagating a
432 | covered work, you indicate your acceptance of this License to do so.
433 | 
434 |   10. Automatic Licensing of Downstream Recipients.
435 | 
436 |   Each time you convey a covered work, the recipient automatically
437 | receives a license from the original licensors, to run, modify and
438 | propagate that work, subject to this License.  You are not responsible
439 | for enforcing compliance by third parties with this License.
440 | 
441 |   An "entity transaction" is a transaction transferring control of an
442 | organization, or substantially all assets of one, or subdividing an
443 | organization, or merging organizations.  If propagation of a covered
444 | work results from an entity transaction, each party to that
445 | transaction who receives a copy of the work also receives whatever
446 | licenses to the work the party's predecessor in interest had or could
447 | give under the previous paragraph, plus a right to possession of the
448 | Corresponding Source of the work from the predecessor in interest, if
449 | the predecessor has it or can get it with reasonable efforts.
450 | 
451 |   You may not impose any further restrictions on the exercise of the
452 | rights granted or affirmed under this License.  For example, you may
453 | not impose a license fee, royalty, or other charge for exercise of
454 | rights granted under this License, and you may not initiate litigation
455 | (including a cross-claim or counterclaim in a lawsuit) alleging that
456 | any patent claim is infringed by making, using, selling, offering for
457 | sale, or importing the Program or any portion of it.
458 | 
459 |   11. Patents.
460 | 
461 |   A "contributor" is a copyright holder who authorizes use under this
462 | License of the Program or a work on which the Program is based.  The
463 | work thus licensed is called the contributor's "contributor version".
464 | 
465 |   A contributor's "essential patent claims" are all patent claims
466 | owned or controlled by the contributor, whether already acquired or
467 | hereafter acquired, that would be infringed by some manner, permitted
468 | by this License, of making, using, or selling its contributor version,
469 | but do not include claims that would be infringed only as a
470 | consequence of further modification of the contributor version.  For
471 | purposes of this definition, "control" includes the right to grant
472 | patent sublicenses in a manner consistent with the requirements of
473 | this License.
474 | 
475 |   Each contributor grants you a non-exclusive, worldwide, royalty-free
476 | patent license under the contributor's essential patent claims, to
477 | make, use, sell, offer for sale, import and otherwise run, modify and
478 | propagate the contents of its contributor version.
479 | 
480 |   In the following three paragraphs, a "patent license" is any express
481 | agreement or commitment, however denominated, not to enforce a patent
482 | (such as an express permission to practice a patent or covenant not to
483 | sue for patent infringement).  To "grant" such a patent license to a
484 | party means to make such an agreement or commitment not to enforce a
485 | patent against the party.
486 | 
487 |   If you convey a covered work, knowingly relying on a patent license,
488 | and the Corresponding Source of the work is not available for anyone
489 | to copy, free of charge and under the terms of this License, through a
490 | publicly available network server or other readily accessible means,
491 | then you must either (1) cause the Corresponding Source to be so
492 | available, or (2) arrange to deprive yourself of the benefit of the
493 | patent license for this particular work, or (3) arrange, in a manner
494 | consistent with the requirements of this License, to extend the patent
495 | license to downstream recipients.  "Knowingly relying" means you have
496 | actual knowledge that, but for the patent license, your conveying the
497 | covered work in a country, or your recipient's use of the covered work
498 | in a country, would infringe one or more identifiable patents in that
499 | country that you have reason to believe are valid.
500 | 
501 |   If, pursuant to or in connection with a single transaction or
502 | arrangement, you convey, or propagate by procuring conveyance of, a
503 | covered work, and grant a patent license to some of the parties
504 | receiving the covered work authorizing them to use, propagate, modify
505 | or convey a specific copy of the covered work, then the patent license
506 | you grant is automatically extended to all recipients of the covered
507 | work and works based on it.
508 | 
509 |   A patent license is "discriminatory" if it does not include within
510 | the scope of its coverage, prohibits the exercise of, or is
511 | conditioned on the non-exercise of one or more of the rights that are
512 | specifically granted under this License.  You may not convey a covered
513 | work if you are a party to an arrangement with a third party that is
514 | in the business of distributing software, under which you make payment
515 | to the third party based on the extent of your activity of conveying
516 | the work, and under which the third party grants, to any of the
517 | parties who would receive the covered work from you, a discriminatory
518 | patent license (a) in connection with copies of the covered work
519 | conveyed by you (or copies made from those copies), or (b) primarily
520 | for and in connection with specific products or compilations that
521 | contain the covered work, unless you entered into that arrangement,
522 | or that patent license was granted, prior to 28 March 2007.
523 | 
524 |   Nothing in this License shall be construed as excluding or limiting
525 | any implied license or other defenses to infringement that may
526 | otherwise be available to you under applicable patent law.
527 | 
528 |   12. No Surrender of Others' Freedom.
529 | 
530 |   If conditions are imposed on you (whether by court order, agreement or
531 | otherwise) that contradict the conditions of this License, they do not
532 | excuse you from the conditions of this License.  If you cannot convey a
533 | covered work so as to satisfy simultaneously your obligations under this
534 | License and any other pertinent obligations, then as a consequence you may
535 | not convey it at all.  For example, if you agree to terms that obligate you
536 | to collect a royalty for further conveying from those to whom you convey
537 | the Program, the only way you could satisfy both those terms and this
538 | License would be to refrain entirely from conveying the Program.
539 | 
540 |   13. Remote Network Interaction; Use with the GNU General Public License.
541 | 
542 |   Notwithstanding any other provision of this License, if you modify the
543 | Program, your modified version must prominently offer all users
544 | interacting with it remotely through a computer network (if your version
545 | supports such interaction) an opportunity to receive the Corresponding
546 | Source of your version by providing access to the Corresponding Source
547 | from a network server at no charge, through some standard or customary
548 | means of facilitating copying of software.  This Corresponding Source
549 | shall include the Corresponding Source for any work covered by version 3
550 | of the GNU General Public License that is incorporated pursuant to the
551 | following paragraph.
552 | 
553 |   Notwithstanding any other provision of this License, you have
554 | permission to link or combine any covered work with a work licensed
555 | under version 3 of the GNU General Public License into a single
556 | combined work, and to convey the resulting work.  The terms of this
557 | License will continue to apply to the part which is the covered work,
558 | but the work with which it is combined will remain governed by version
559 | 3 of the GNU General Public License.
560 | 
561 |   14. Revised Versions of this License.
562 | 
563 |   The Free Software Foundation may publish revised and/or new versions of
564 | the GNU Affero General Public License from time to time.  Such new versions
565 | will be similar in spirit to the present version, but may differ in detail to
566 | address new problems or concerns.
567 | 
568 |   Each version is given a distinguishing version number.  If the
569 | Program specifies that a certain numbered version of the GNU Affero General
570 | Public License "or any later version" applies to it, you have the
571 | option of following the terms and conditions either of that numbered
572 | version or of any later version published by the Free Software
573 | Foundation.  If the Program does not specify a version number of the
574 | GNU Affero General Public License, you may choose any version ever published
575 | by the Free Software Foundation.
576 | 
577 |   If the Program specifies that a proxy can decide which future
578 | versions of the GNU Affero General Public License can be used, that proxy's
579 | public statement of acceptance of a version permanently authorizes you
580 | to choose that version for the Program.
581 | 
582 |   Later license versions may give you additional or different
583 | permissions.  However, no additional obligations are imposed on any
584 | author or copyright holder as a result of your choosing to follow a
585 | later version.
586 | 
587 |   15. Disclaimer of Warranty.
588 | 
589 |   THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
590 | APPLICABLE LAW.  EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
591 | HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
592 | OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
593 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
594 | PURPOSE.  THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
595 | IS WITH YOU.  SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
596 | ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
597 | 
598 |   16. Limitation of Liability.
599 | 
600 |   IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
601 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
602 | THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
603 | GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
604 | USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
605 | DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
606 | PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
607 | EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
608 | SUCH DAMAGES.
609 | 
610 |   17. Interpretation of Sections 15 and 16.
611 | 
612 |   If the disclaimer of warranty and limitation of liability provided
613 | above cannot be given local legal effect according to their terms,
614 | reviewing courts shall apply local law that most closely approximates
615 | an absolute waiver of all civil liability in connection with the
616 | Program, unless a warranty or assumption of liability accompanies a
617 | copy of the Program in return for a fee.
618 | 
619 |                      END OF TERMS AND CONDITIONS
620 | 
621 |             How to Apply These Terms to Your New Programs
622 | 
623 |   If you develop a new program, and you want it to be of the greatest
624 | possible use to the public, the best way to achieve this is to make it
625 | free software which everyone can redistribute and change under these terms.
626 | 
627 |   To do so, attach the following notices to the program.  It is safest
628 | to attach them to the start of each source file to most effectively
629 | state the exclusion of warranty; and each file should have at least
630 | the "copyright" line and a pointer to where the full notice is found.
631 | 
632 |     <one line to give the program's name and a brief idea of what it does.>
633 |     Copyright (C) <year>  <name of author>
634 | 
635 |     This program is free software: you can redistribute it and/or modify
636 |     it under the terms of the GNU Affero General Public License as published by
637 |     the Free Software Foundation, either version 3 of the License, or
638 |     (at your option) any later version.
639 | 
640 |     This program is distributed in the hope that it will be useful,
641 |     but WITHOUT ANY WARRANTY; without even the implied warranty of
642 |     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
643 |     GNU Affero General Public License for more details.
644 | 
645 |     You should have received a copy of the GNU Affero General Public License
646 |     along with this program.  If not, see <http://www.gnu.org/licenses/>.
647 | 
648 | Also add information on how to contact you by electronic and paper mail.
649 | 
650 |   If your software can interact with users remotely through a computer
651 | network, you should also make sure that it provides a way for users to
652 | get its source.  For example, if your program is a web application, its
653 | interface could display a "Source" link that leads users to an archive
654 | of the code.  There are many ways you could offer source, and different
655 | solutions will be better for different programs; see section 13 for the
656 | specific requirements.
657 | 
658 |   You should also get your employer (if you work as a programmer) or school,
659 | if any, to sign a "copyright disclaimer" for the program, if necessary.
660 | For more information on this, and how to apply and follow the GNU AGPL, see
661 | <http://www.gnu.org/licenses/>.
662 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | # Makefile for CompAIRR
 2 | 
 3 | ifndef PREFIX
 4 | 	PREFIX=/usr/local
 5 | endif
 6 | 
 7 | all : compairr
 8 | 
 9 | compairr:
10 | 	make -C src compairr
11 | 
12 | test: compairr
13 | 	make -C test
14 | 
15 | install: compairr test
16 | 	/usr/bin/install -d $(PREFIX)/bin
17 | 	/usr/bin/install -c src/compairr $(PREFIX)/bin/compairr
18 | 
19 | clean:
20 | 	make -C src clean
21 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | [![](https://img.shields.io/static/v1?label=AIRR-C%20sw-tools%20v1&message=compliant&color=008AFF&labelColor=000000&style=plastic)](https://docs.airr-community.org/en/stable/swtools/airr_swtools_standard.html)
  2 | 
  3 | # CompAIRR
  4 | 
  5 | CompAIRR (`compairr`) is a command line tool to compare two sets of
  6 | adaptive immune receptor repertoires and compute their overlap. It can
  7 | also identify which sequences are present in which repertoires.
  8 | Furthermore, CompAIRR can cluster the sequences in a repertoire
  9 | set. Sequence comparisons can be exact or approximate. CompAIRR has
 10 | been shown to be very fast and to have a small memory footprint
 11 | compared to similar tools, when up to 2 differences are allowed.
 12 | 
 13 | 
 14 | ## Installation
 15 | 
 16 | The code is C++11 standard compliant and should compile easily using
 17 | `make` and a modern C++ compiler (e.g. GNU GCC or LLVM Clang). Run
 18 | `make clean`, `make`, `make test` and `make install` in the main
 19 | folder to clean, build, test and install the tool. There are no
 20 | dependencies except for the C and C++ standard libraries.
 21 | 
 22 | Binaries for Linux (x86_64) and macOS (x86_64 and Arm64) are also
 23 | distributed with each
 24 | [release](https://github.com/uio-bmi/compairr/releases/latest).
 25 | 
 26 | A `Dockerfile` is included if you want to make a Docker image.  A
 27 | docker image may be built with the following command:
 28 | 
 29 | ```sh
 30 | docker build -t compairr .
 31 | ```
 32 | 
 33 | Ready-made Docker images for CompAIRR can be found on the
 34 | [Docker Hub](https://hub.docker.com/r/torognes/compairr).
 35 | 
 36 | CompAIRR can be installed on macOS using homebrew with
 37 | `brew install torognes/bioinf/compairr`.
 38 | 
 39 | 
 40 | ## Tutorial
 41 | 
 42 | For an introduction to how to use CompAIRR, please have a look at the
 43 | [CompAIRR tutorial](https://github.com/LonnekeScheffer/compairr-tutorial).
 44 | 
 45 | 
 46 | ## General options
 47 | 
 48 | Use the `-h` or `--help` option to show some help information.
 49 | 
 50 | Run the program with `-v` or `--version` for version information.
 51 | 
 52 | The type of operation that should be performed is specified with one
 53 | of the options `-m`, `-x`, `-c` or `-z` (or the corresponding long option
 54 | forms `--matrix`, `--existence`, `--cluster`, or `--deduplicate`).
 55 | 
 56 | The code is multi-threaded. The number of threads may be specified
 57 | with the `-t` or `--threads` option.
 58 | 
 59 | The results will be written to standard out (stdout) unless a file
 60 | name has been specified with the `-o` or `--output` option.
 61 | 
 62 | While the program is running it will print some status and progress
 63 | information to standard error (stderr) unless a log file has been
 64 | specified with the `-l` or `--log` option. Error messages and warnings
 65 | will also be written here.
 66 | 
 67 | The default is to compare amino acid sequences, but nucleotide
 68 | sequences are compared if the `-n` or `--nucleotides` option is given.
 69 | The accepted amino acid symbols are `ACDEFGHIKLMNPQRSTVWY`, while the
 70 | accepted nucleotide symbols are `ACGTU`. Lower case letters are also
 71 | accepted. The program will abort with an error message if any other
 72 | symbol is encountered in a sequence, unless one specifies the `-u` or
 73 | `--ignore-unknown` option, in which case CompAIRR will simply ignore
 74 | that sequence. If the program encounters an empty sequence it will
 75 | also abort with an error message, unless the `-e` or `--ignore-empty`
 76 | option is given.
 77 | 
 78 | By default, the sequences should be given in the `junction` or
 79 | `junction_aa` column of the input file, for nucleotide and amino acid
 80 | sequences, respectively. Alternatively, the sequences may be present
 81 | in the `cdr3` or `cdr3_aa` column, if the `--cdr3` option is given.
 82 | 
 83 | The user can specify how many differences are allowed when comparing
 84 | sequences, using the option `-d` or `--differences`. To allow indels
 85 | (insertions or deletions) the option `-i` or `--indels` may be
 86 | specified, otherwise only substitutions are allowed. By default, no
 87 | differences are allowed. The `-i` option is allowed only when d=1. The
 88 | number of differences allowed strongly influences the speed of
 89 | CompAIRR. The program will be slower as more differences
 90 | are allowed. When d=0 or d=1 it is very fast, but it will be relatively
 91 | slow with d=2 and even slower when d>2. See the section on performance
 92 | below for an example.
 93 | 
 94 | The V and J gene alleles specified for each sequence must also match,
 95 | unless the `-g` or `--ignore-genes` option is in effect.
 96 | 
 97 | 
 98 | ## Computing overlap between two repertoire sets
 99 | 
100 | To compute the overlap between two repertoire sets, use the `-m` or
101 | `--matrix` option.
102 | 
103 | For each of the two repertoire sets there must be an input file of
104 | tab-separated values formatted according to [the AIRR standard for
105 | rearrangements](https://docs.airr-community.org/en/stable/datarep/rearrangements.html).
106 | The two input files are specified on the command line without any
107 | preceding option letter. If only one filename is specified on the
108 | command line, or the same filename is specified twice, it is assumed
109 | that the set should be compared to itself. Each file must contain the
110 | repertoire ID and either the nucleotide or the amino acid sequence of
111 | the rearrangement. If the repertoire ID column is missing, all
112 | sequences are assumed to belong to the same repertoire (with ID 1 or
113 | 2, respectively, for the two sets). A sequence ID may also be
114 | included. Unless they should be ignored, the V gene, the J gene, and
115 | the duplicate count are also needed.
116 | 
117 | Each set can contain many repertoires and each repertoire can contain
118 | many sequences. The tool will find the sequences in the two sets that
119 | are similar and output a matrix with results.
120 | 
121 | CompAIRR assumes that all sequences within each repertoire are
122 | distinct, and that the abundance of each sequence is indicated in the
123 | `duplicate_count` field in the input file. Duplicated sequences,
124 | i.e. identical sequences (with the same V and J genes) within the same
125 | repertoire, may lead to unexpected results. CompAIRR will warn if it
126 | detects duplicates. Duplicates may be merged with the `--deduplicate`
127 | command.
128 | 
129 | The similar sequences of each repertoire in each set are found by
130 | comparing the sequences and their V and J genes.  The duplicate count
131 | of each sequence is taken into account and a matrix is output
132 | containing a value for each combination of repertoires in the two
133 | sets. The value is usually the sum of the products of the duplicate
134 | counts of all pairs of sequences in the two repertoires that match. If
135 | the option `-f` or `--ignore-counts` is specified, the duplicate count
136 | information is ignored and all counts are treated as 1. Instead of
137 | summing the product of the counts, the ratio, min, max, or mean may be
138 | used if specified with the `-s` or `--score` option. The Morisita-Horn
139 | index or Jaccard index will be calculated if `MH` or `Jaccard` is
140 | specified with the `-s` option. These indices can only be computed
141 | when d=0.
142 | 
143 | The output will be a matrix of values in a tab-separated plain text
144 | file. Two different formats can be selected. In the default format,
145 | the first line contains the hash character (`#`) followed by the
146 | repertoire IDs from the second set. The following lines contain the
147 | repertoire ID from the first set, followed by the values corresponding
148 | to the comparison of this repertoire with each of the repertoires in
149 | the second set.
150 | 
151 | An alternative output format is used when the `-a` or `--alternative`
152 | option is specified. It will write the results in a three column
153 | format with the repertoire ID from set 1 and set 2 in the two first
154 | columns, respectively, and the value in the third column. There will
155 | be one line for each combination of repertoires in the sets. The very
156 | first line will contain a hash character (`#`) followed by the field
157 | names separated by tabs.
158 | 
159 | If the `-p` or `--pairs` option is specified, CompAIRR will write
160 | information about all pairs of matching sequences to a specified TSV
161 | file. Please note that such files may grow very large when there are
162 | many matches. Use of multithreading may be of little use in this
163 | case. The order of the lines in the file is unspecified. The following
164 | columns from both input files will be included in the output:
165 | `repertoire_id`, `sequence_id`, `duplicate_count`, `v_call`, `j_call`,
166 | and `junction`. The term `junction` will be replaced with
167 | `junction_aa`, `cdr3`, or `cdr3_aa` as appropriate. Additional columns
168 | from the input files may be copied to the pairs file using the `-k` or
169 | `--keep-columns` option. Multiple columns, separated by commas (but no
170 | spaces), may be given. A warning will be given if any of the specified
171 | columns are missing. In the header, columns from the first and second
172 | input file will be suffixed by `_1` and `_2`, respectively. The
173 | distance between the sequences will be included if the `--distance`
174 | option is included. This is usually the Hamming distance (minimum
175 | number of substitutions), unless the `--indels` (or `-i`) option is
176 | specified, in which case the distance is the Levenshtein distance
177 | (minimum number of substitutions or indels). If only the information
178 | in the pairs file is required, and not the information in the matrix,
179 | the storage and output of the matrix can be avoided with the
180 | `--no-matrix` option. This may save some memory and time if there are
181 | many repertoires in the sets.
182 | 
183 | 
184 | ## Analysing in which repertoires a set of sequences are present
185 | 
186 | Use the option `-x` or `--existence` to analyse in which repertoires a
187 | set of sequences are present, and create a sequence presence matrix.
188 | 
189 | Two input files with repertoire sets in standard format must be
190 | specified on the command line. The first file should contain the
191 | different sequences to analyse. The `sequence_id` column must be
192 | present in this file. If the optional `repertoire_id` column is
193 | present, all those identifiers must be identical. The second file must
194 | contain the repertoires to match. The `repertoire_id` column must be
195 | present in the second file, otherwise the ID will be set to 2 for all
196 | sequences.
197 | 
198 | CompAIRR will identify in which repertoires each sequence is present
199 | and will output the results either as a matrix or as a three-column
200 | table (if the `-a` option is specified). The options `-d`, `-i`, `-g`,
201 | and `-n` (and the corresponding long option names `--differences`,
202 | `--indels`, `--ignore-genes`, and `--nucleotides`) will be taken into
203 | account when comparing sequences.
204 | 
205 | The output will be in a similar format as when computing the overlap
206 | (above), but the first column will contain the `sequence_id` from the
207 | first file instead of the `repertoire_id`.
208 | 
209 | The `-p` or `--pairs` option may be specified to output all pairs of
210 | matching sequences in the same way as for the overlap computation.
211 | 
212 | 
213 | ## Clustering the sequences in a repertoire
214 | 
215 | To cluster the sequences in one repertoire, use the `-c` or
216 | `--cluster` option.
217 | 
218 | One input file in tab-separated format must be specified on the
219 | command line.
220 | 
221 | The tool will cluster the sequences using single linkage hierarchical
222 | clustering, according to the specified distance and indel options
223 | (`-d`, `--differences`, `-i`, `--indels`). The V and J gene alleles will
224 | be taken into account unless the `-g` or `--ignore-genes` option is
225 | specified. The options `-n` or `--nucleotides` indicate that the
226 | comparison should be performed with nucleotide sequences, not amino
227 | acid sequences. If the repertoire ID column is missing, all
228 | sequences are assumed to belong to the same repertoire (with ID 1).
229 | 
230 | The output will be in a similar TSV format as the input file, but
231 | preceded with two additional columns. The first column will contain a
232 | cluster number, starting at 1. The second column will contain the size
233 | of the cluster. The subsequent columns are `repertoire_id`,
234 | `sequence_id`, `duplicate_count`, `v_call`, `j_call`, and `junction`
235 | (or `junction_aa`, `cdr3` or `cdr3_aa`, as appropriate).
236 | 
237 | The clusters are sorted by size, in descending order.
238 | 
239 | 
240 | ## Deduplication
241 | 
242 | The `--deduplicate` command may be used to deduplicate a data set by
243 | merging entries in the same repertoire with identical sequences and
244 | identical V and J genes. This may be necessary to get correct results
245 | when computing overlaps between repertoires. Duplicates may be present
246 | for instance in cases where the data set contains both nucleotide and
247 | amino acid sequences from the same rearrangement, where the nucleotide
248 | sequences may be distinct while the amino acid sequences may not be,
249 | due to the degeneracy of the genetic code.
250 | 
251 | One input file in TSV format must be specified on the command line.
252 | 
253 | Strictly identical sequences in the same repertoire will be merged and
254 | their counts will be added together. If the `-g` or `--ignore-genes`
255 | option is specified, the V and J genes are ignored. The `-n` or
256 | `--nucleotides` option may be specified if the input is nucleotide
257 | sequences, otherwise amino acid sequences will be assumed. If the `-f`
258 | or `--ignore-counts` option is specified, the counts in the input file
259 | will be ignored, and just the number of identical sequences will be
260 | counted. If the repertoire ID column is missing, all sequences are
261 | assumed to belong to the same repertoire (with ID 1).
262 | 
263 | The output will be in a similar TSV format as the input file, with the
264 | following columns: `repertoire_id`, `duplicate_count`, `v_call`,
265 | `j_call`, and `junction` (or `junction_aa`, `cdr3` or `cdr3_aa`, as
266 | appropriate). If the `-g` or `--ignore-genes` option is specified, the
267 | `v_call` and `j_call` columns will not be included.
268 | 
269 | 
270 | ## Input files
271 | 
272 | The input files must be in tab-separated value (TSV) format according
273 | to the [Rearrangement
274 | Schema](https://docs.airr-community.org/en/stable/datarep/rearrangements.html)
275 | of the [AIRR standards 1.3
276 | documentation](https://docs.airr-community.org/en/stable/).
277 | 
278 | The first line must contain the header. The rest of the file must
279 | contain one line per sequence. The following fields should be included:
280 | 
281 | * `repertoire_id`: identifier of the repertoire
282 | * `sequence_id`: identifier of the sequence (optional except for the first file when using `-x` or `--existence`)
283 | * `duplicate_count`: number of identical copies of the same rearrangement (required unless `-f` option given)
284 | * `v_call`: V gene name with allele (required unless `-g` option given)
285 | * `j_call`: J gene name with allele (required unless `-g` option given)
286 | * `junction`: nucleotide sequence (required if `-n` option given and `--cdr3` option not given)
287 | * `junction_aa`: amino acid sequence (single letter code) (required unless `-n` or `--cdr3` options given)
288 | * `cdr3`: nucleotide sequence (required if both `-n` and `--cdr3` options given)
289 | * `cdr3_aa`: amino acid sequence (single letter code) (required if `--cdr3` option given and `-n` option not given)
290 | 
291 | See below for an example. Other fields may be included, but will be
292 | ignored.
293 | 
294 | 
295 | ## Command line option overview
296 | 
297 | The command line should look like this:
298 | 
299 | ```
300 | compairr OPTIONS TSVFILE1 [TSVFILE2]
301 | ```
302 | 
303 | Exactly one of the command options `-m`, `-x`, `-c` or `-z` (or their long forms) must be specified. Other options as indicated in the table below could also be included. With the `-m` and `-x` command options, the names of two tab-separated value files with repertoires must also be specified on the command line; with the `-c` and `-z` command options, only one such file should be specified.
304 | 
305 | Short | Long               | Argument | Default  | Description
306 | ------|--------------------|----------|----------|-------------
307 | `-a`  | `--alternative`    |          |          | Output results in three-column format, not matrix
308 | `  `  | `--cdr3`           |          |          | Use the `cdr3` or `cdr3_aa` column instead of `junction` or `junction_aa`
309 | `-c`  | `--cluster`        |          |          | Cluster sequences in one repertoire
310 | `-d`  | `--differences`    | INTEGER  | 0        | Number of differences accepted
311 | `  `  | `--distance`       |          |          | Include sequence distance in pairs file
312 | `-e`  | `--ignore-empty`   |          |          | Ignore empty sequences
313 | `-f`  | `--ignore-counts`  |          |          | Ignore duplicate count information
314 | `-g`  | `--ignore-genes`   |          |          | Ignore V and J gene information
315 | `-h`  | `--help`           |          |          | Display help text and exit
316 | `-i`  | `--indels`         |          |          | Allow insertions or deletions
317 | `-k`  | `--keep-columns`   | STRING   |          | Copy given comma-separated columns to pairs file
318 | `-l`  | `--log`            | FILENAME | (stderr) | Log to specified file instead of stderr
319 | `-m`  | `--matrix`         |          |          | Compute overlap matrix between two sets
320 | `  `  | `--no-matrix`      |          |          | Do not keep or output any matrix
321 | `-n`  | `--nucleotides`    |          |          | Compare nucleotides, not amino acids
322 | `-o`  | `--output`         | FILENAME | (stdout) | Output results to specified file instead of stdout
323 | `-p`  | `--pairs`          | FILENAME | (none)   | Output matching pairs to specified file
324 | `-s`  | `--score`          | STRING   | product  | Sum `product`, `ratio`, `min`, `max`, or `mean`; or compute `MH` or `Jaccard` index
325 | `-t`  | `--threads`        | INTEGER  | 1        | Number of threads to use (1-256)
326 | `-u`  | `--ignore-unknown` |          |          | Ignore sequences including unknown residue symbols
327 | `-v`  | `--version`        |          |          | Display version information
328 | `-x`  | `--existence`      |          |          | Check existence of sequences in repertoires
329 | `-z`  | `--deduplicate`    |          |          | Deduplicate sequences
330 | 
331 | 
332 | ## Example 1: Repertoire overlap
333 | 
334 | In this example we will compute the overlap of two repertoire sets.
335 | 
336 | Let's use two simple input files. The first is `seta.tsv`:
337 | 
338 | ```tsv
339 | repertoire_id	sequence_id	duplicate_count	v_call	j_call	junction	junction_aa	sequence	rev_comp	productive	d_call	sequence_alignment	germline_alignment	v_cigar	d_cigar	j_cigar
340 | A1	R	1	TCRBV07-06	TCRBJ02-01	tgcgcgagcagcaccagccatgaacagtatttt	CASSTSHEQYF									
341 | A2	S	3	TCRBV07-09	TCRBJ01-02	tgcgcgagcagcctgcgcgtgggcggctatggctataccttt	CASSLRVGGYGYTF									
342 | ```
343 | 
344 | 
345 | The second is `setb.tsv`:
346 | 
347 | ```tsv
348 | repertoire_id	sequence_id	duplicate_count	v_call	j_call	junction	junction_aa	sequence	rev_comp	productive	d_call	sequence_alignment	germline_alignment	v_cigar	d_cigar	j_cigar
349 | B1	T	5	TCRBV07-09	TCRBJ01-02	tgcgcgagcagcctgcgcgtgggcggctatggctataccttt	CASSLRVGGYGYTF									
350 | B1	U	10	TCRBV07-09	TCRBJ01-02	tgcgcgagcagcctgcgcgtgggcggctttggctataccttt	CASSLRVGGFGYTF									
351 | B2	V	7	TCRBV07-06	TCRBJ02-01	tgcgcgagcagcaccagccatcagcagtatttt	CASSTSHQQYF									
352 | ```
353 | 
354 | We run the following command:
355 | 
356 | `compairr -m seta.tsv setb.tsv -d 1 -o output.tsv -p pairs.tsv`
357 | 
358 | Here is the output to the console:
359 | 
360 | ```
361 | CompAIRR 1.7.0 - Comparison of Adaptive Immune Receptor Repertoires
362 | https://github.com/uio-bmi/compairr
363 | 
364 | Start time:        Thu Mar 03 12:29:32 CET 2022
365 | Command (m/c/x):   Overlap (-m)
366 | Repertoire set 1:  seta.tsv
367 | Repertoire set 2:  setb.tsv
368 | Nucleotides (n):   No
369 | Differences (d):   1
370 | Indels (i):        No
371 | Ignore counts (f): No
372 | Ignore genes (g):  No
373 | Ign. unknown (u):  No
374 | Threads (t):       1
375 | Output file (o):   output.tsv
376 | Output format (a): Matrix
377 | Score (s):         Sum of products of counts
378 | Pairs file (p):    pairs.tsv
379 | Log file (l):      (stderr)
380 | 
381 | Immune receptor repertoire set 1
382 | 
383 | Reading sequences: 100% (0s)
384 | Repertoires:       2
385 | Sequences:         2
386 | Residues:          25
387 | Shortest:          11
388 | Longest:           14
389 | Average length:    12.5
390 | Total dupl. count: 4
391 | Indexing:          100% (0s)
392 | 
393 | Repertoires in set:
394 | # Sequences Count Repertoire ID
395 | 1         1     1 A1
396 | 2         1     3 A2
397 | 
398 | Immune receptor repertoire set 2
399 | 
400 | Reading sequences: 100% (0s)
401 | Repertoires:       2
402 | Sequences:         3
403 | Residues:          39
404 | Shortest:          11
405 | Longest:           14
406 | Average length:    13.0
407 | Total dupl. count: 22
408 | Indexing:          100% (0s)
409 | 
410 | Repertoires in set:
411 | # Sequences Count Repertoire ID
412 | 1         2    15 B1
413 | 2         1     7 B2
414 | 
415 | Unique V genes:    2
416 | Unique J genes:    2
417 | Computing hashes:  100% (0s)
418 | Computing hashes:  100% (0s)
419 | Hashing sequences: 100% (0s)
420 | Analysing:         100% (0s)
421 | Writing results:   100% (0s)
422 | 
423 | End time:          Thu Mar 03 12:29:32 CET 2022
424 | ```
425 | 
426 | Repertoires will be sorted alphabetically by ID. The program gives some
427 | statistics on the input files after reading them.
428 | 
429 | Here is the result in the `output.tsv` file:
430 | 
431 | ```tsv
432 | #	B1	B2
433 | A1	0	7
434 | A2	45	0
435 | ```
436 | 
437 | And here is the result in the `pairs.tsv` file:
438 | 
439 | ```tsv
440 | #repertoire_id_1	sequence_id_1	duplicate_count_1	v_call_1	j_call_1	junction_aa_1	repertoire_id_2	sequence_id_2	duplicate_count_2	v_call_2	j_call_2	junction_aa_2
441 | A1	R	1	TCRBV07-06	TCRBJ02-01	CASSTSHEQYF	B2	V	7	TCRBV07-06	TCRBJ02-01	CASSTSHQQYF
442 | A2	S	3	TCRBV07-09	TCRBJ01-02	CASSLRVGGYGYTF	B1	T	5	TCRBV07-09	TCRBJ01-02	CASSLRVGGYGYTF
443 | A2	S	3	TCRBV07-09	TCRBJ01-02	CASSLRVGGYGYTF	B1	U	10	TCRBV07-09	TCRBJ01-02	CASSLRVGGFGYTF
444 | ```
445 | 
446 | Here, sequence R in repertoire A1 is similar to sequence V in
447 | repertoire B2. The only difference is the E and Q in the 8th
448 | position. The gene allele names are also the same. They have duplicate
449 | counts of 1 and 7, respectively. The product is 7. That value is found
450 | in the third column on the second line in the main output file.
451 | 
452 | Sequence S in repertoire A2 with duplicate count 3 is similar to both
453 | sequences T and U in repertoire B1, with duplicate counts of 5 and 10,
454 | respectively. Sequence T in B1 is identical, while sequence U in B1
455 | has an F instead of a Y in the 10th position. The result is 3 * (5 +
456 | 10) = 3 * 15 = 45. That value is found in the second column on the
457 | third line of the main output file.
458 | 
459 | Since there are no sequences from repertoire A1 similar to B1 or from
460 | A2 similar to B2, the other values in the matrix are zero.
461 | 
462 | This small dataset is included in the test folder and the tool can
463 | automatically be tested by running `make test`.
464 | 
465 | 
466 | ## Example 2: Sequence existence
467 | 
468 | In this example we will use the `-x` or `--existence` option to find
469 | out in which repertoires a set of sequences are present.
470 | 
471 | The file `setc.tsv` contains the sequences that we will analyse:
472 | 
473 | ```tsv
474 | repertoire_id	sequence_id	duplicate_count	v_call	j_call	junction	junction_aa	sequence	rev_comp	productive	d_call	sequence_alignment	germline_alignment	v_cigar	d_cigar	j_cigar
475 | C	X	1	TCRBV07-09	TCRBJ01-02	tgcgcgagcagcctgcgcgtgggcggctttggctataccttt	CASSLRVGGFGYTF									
476 | C	Y	1	TCRBV07-06	TCRBJ02-01	tgcgcgagcagcaccagccatcagcagtatttt	CASSTSHQQYF									
477 | ```
478 | 
479 | The file above is included in the folder `test` in the distribution.
480 | 
481 | We will compare it to repertoire sets in the file `setb.tsv` described
482 | earlier.
483 | 
484 | We run the following command:
485 | 
486 | `compairr -x setc.tsv setb.tsv -d 1 -f -o output.tsv -p pairs.tsv`
487 | 
488 | Here is the output to the console:
489 | 
490 | ```
491 | CompAIRR 1.7.0 - Comparison of Adaptive Immune Receptor Repertoires
492 | https://github.com/uio-bmi/compairr
493 | 
494 | Start time:        Thu Mar 03 12:31:16 CET 2022
495 | Command (m/c/x):   Existence (-x)
496 | Repertoire:        setc.tsv
497 | Repertoire set:    setb.tsv
498 | Nucleotides (n):   No
499 | Differences (d):   1
500 | Indels (i):        No
501 | Ignore counts (f): Yes
502 | Ignore genes (g):  No
503 | Ign. unknown (u):  No
504 | Threads (t):       1
505 | Output file (o):   output.tsv
506 | Output format (a): Matrix
507 | Score (s):         Sum of products of counts
508 | Pairs file (p):    pairs.tsv
509 | Log file (l):      (stderr)
510 | 
511 | Immune receptor repertoire set 1
512 | 
513 | Reading sequences: 100% (0s)
514 | Repertoires:       1
515 | Sequences:         2
516 | Residues:          25
517 | Shortest:          11
518 | Longest:           14
519 | Average length:    12.5
520 | Total dupl. count: 2
521 | Indexing:          100% (0s)
522 | 
523 | Repertoires in set:
524 | # Sequences Count Repertoire ID
525 | 1         2     2 C
526 | 
527 | Immune receptor repertoire set 2
528 | 
529 | Reading sequences: 100% (0s)
530 | Repertoires:       2
531 | Sequences:         3
532 | Residues:          39
533 | Shortest:          11
534 | Longest:           14
535 | Average length:    13.0
536 | Total dupl. count: 22
537 | Indexing:          100% (0s)
538 | 
539 | Repertoires in set:
540 | # Sequences Count Repertoire ID
541 | 1         2    15 B1
542 | 2         1     7 B2
543 | 
544 | Unique V genes:    2
545 | Unique J genes:    2
546 | Computing hashes:  100% (0s)
547 | Computing hashes:  100% (0s)
548 | Hashing sequences: 100% (0s)
549 | Analysing:         100% (0s)
550 | Writing results:   100% (0s)
551 | 
552 | End time:          Thu Mar 03 12:31:16 CET 2022
553 | ```
554 | 
555 | Here is the result in the `output.tsv` file:
556 | 
557 | ```tsv
558 | #	B1	B2
559 | X	2	0
560 | Y	0	1
561 | ```
562 | 
563 | Please note that the `-f` option was used to ignore the duplicate
564 | counts.
565 | 
566 | And here is the result in the `pairs.tsv` file:
567 | 
568 | ```tsv
569 | #repertoire_id_1	sequence_id_1	duplicate_count_1	v_call_1	j_call_1	junction_aa_1	repertoire_id_2	sequence_id_2	duplicate_count_2	v_call_2	j_call_2	junction_aa_2
570 | C	X	1	TCRBV07-09	TCRBJ01-02	CASSLRVGGFGYTF	B1	U	10	TCRBV07-09	TCRBJ01-02	CASSLRVGGFGYTF
571 | C	X	1	TCRBV07-09	TCRBJ01-02	CASSLRVGGFGYTF	B1	T	5	TCRBV07-09	TCRBJ01-02	CASSLRVGGYGYTF
572 | C	Y	1	TCRBV07-06	TCRBJ02-01	CASSTSHQQYF	B2	V	7	TCRBV07-06	TCRBJ02-01	CASSTSHQQYF
573 | ```
574 | 
575 | The results indicate that sequence X was found (twice) in repertoire
576 | B1 (matching sequences T and U) and that sequence Y was found in
577 | repertoire B2 (matching sequence V).
578 | 
579 | 
580 | ## Example 3: Clustering sequences
581 | 
582 | This time we will cluster the nucleotide sequences in the file
583 | `setb.tsv` using the `-c` or `--cluster` option.
584 | 
585 | The command line to run is:
586 | 
587 | `compairr -c setb.tsv -d 1 -n -o output.tsv`
588 | 
589 | The output during the clustering is as follows:
590 | 
591 | ```
592 | CompAIRR 1.7.0 - Comparison of Adaptive Immune Receptor Repertoires
593 | https://github.com/uio-bmi/compairr
594 | 
595 | Start time:        Thu Mar 03 12:33:05 CET 2022
596 | Command (m/c/x):   Cluster (-c)
597 | Repertoire:        setb.tsv
598 | Nucleotides (n):   Yes
599 | Differences (d):   1
600 | Indels (i):        No
601 | Ignore counts (f): No
602 | Ignore genes (g):  No
603 | Ign. unknown (u):  No
604 | Threads (t):       1
605 | Output file (o):   output.tsv
606 | Log file (l):      (stderr)
607 | 
608 | Immune receptor repertoire clustering
609 | 
610 | Reading sequences: 100% (0s)
611 | Repertoires:       2
612 | Sequences:         3
613 | Residues:          117
614 | Shortest:          33
615 | Longest:           42
616 | Average length:    39.0
617 | Total dupl. count: 22
618 | Indexing:          100% (0s)
619 | 
620 | Unique V genes:    2
621 | Unique J genes:    2
622 | 
623 | Computing hashes:  100% (0s)
624 | Hashing sequences: 100% (0s)
625 | Building network:  100% (0s)
626 | Clustering:        100% (0s)
627 | Sorting clusters:  100% (0s)
628 | Writing clusters:  100% (0s)
629 | 
630 | Clusters:          2
631 | End time:          Thu Mar 03 12:33:05 CET 2022
632 | ```
633 | 
634 | The result in the file `output.tsv` looks like this:
635 | 
636 | ```tsv
637 | #cluster_no	cluster_size	repertoire_id	sequence_id	duplicate_count	v_call	j_call	junction
638 | 1	2	B1	T	5	TCRBV07-09	TCRBJ01-02	tgcgcgagcagcctgcgcgtgggcggctatggctataccttt
639 | 1	2	B1	U	10	TCRBV07-09	TCRBJ01-02	tgcgcgagcagcctgcgcgtgggcggctttggctataccttt
640 | 2	1	B2	V	7	TCRBV07-06	TCRBJ02-01	tgcgcgagcagcaccagccatcagcagtatttt
641 | ```
642 | 
643 | In this case, there are 2 clusters. The first contains 2 sequences (T
644 | and U from B1), while the second cluster contains 1 sequence (V from
645 | B2). The sequences are clustered across repertoires.
646 | 
647 | 
648 | ## Example 4: Deduplication
649 | 
650 | This time we will deduplicate the amino acid sequences in the file
651 | `setb.tsv` using the `-z` or `--deduplicate` option.
652 | 
653 | The command line to run is:
654 | 
655 | `compairr -z setb.tsv -o output.tsv`
656 | 
657 | The output will look like this:
658 | 
659 | ```
660 | CompAIRR 1.8.0 - Comparison of Adaptive Immune Receptor Repertoires
661 | https://github.com/uio-bmi/compairr
662 | 
663 | Start time:        Thu Sep 15 17:10:51 CEST 2022
664 | Command:           Deduplicate (--deduplicate)
665 | Repertoire:        setb.tsv
666 | Nucleotides (n):   No
667 | Differences (d):   0
668 | Indels (i):        No
669 | Ignore counts (f): No
670 | Ignore genes (g):  No
671 | Ign. unknown (u):  No
672 | Threads (t):       1
673 | Output file (o):   output.tsv
674 | Log file (l):      (stderr)
675 | 
676 | Reading sequences: 100% (0s)
677 | Repertoires:       2
678 | Sequences:         3
679 | Residues:          39
680 | Shortest:          11
681 | Longest:           14
682 | Average length:    13.0
683 | Total dupl. count: 22
684 | Indexing:          100% (0s)
685 | Unique V genes:    2
686 | Unique J genes:    2
687 | Computing hashes:  100% (0s)
688 | Deduplicating:     100% (0s)
689 | Duplicates merged: 0
690 | Writing output:    100% (0s)
691 | 
692 | End time:          Thu Sep 15 17:10:51 CEST 2022
693 | ```
694 | 
695 | The result in the file `output.tsv` looks like this:
696 | 
697 | ```tsv
698 | repertoire_id	duplicate_count	v_call	j_call	junction_aa
699 | B1	5	TCRBV07-09	TCRBJ01-02	CASSLRVGGYGYTF
700 | B1	10	TCRBV07-09	TCRBJ01-02	CASSLRVGGFGYTF
701 | B2	7	TCRBV07-06	TCRBJ02-01	CASSTSHQQYF
702 | ```
703 | 
704 | There were no duplicates in this dataset so the output is essentially
705 | identical to the input data, but does not include all the original
706 | columns. If the two sequences in repertoire B1 had been identical, the
707 | two lines would have been merged and the new `duplicate_count` would
708 | have been 15.
709 | 
710 | 
711 | ## Implementation
712 | 
713 | The program is written in C++. The strategy for finding similar
714 | sequences is based on a similar concept developed for the tool
715 | [Swarm](https://github.com/torognes/swarm) (Mahé et al.
716 | 2021). Basically, a 64-bit hash is computed for all sequences in the
717 | sets. All hashes for one set are stored in a Bloom filter and in a
718 | hash table. We then look for matches to sequences in the second set by
719 | looking them up in the Bloom filter and then, if there was a match, in
720 | the hash table. To find matches with 1 or 2 substitutions or indels,
721 | the hashes of all these variant sequences are generated and looked
722 | up. When d>2, a different strategy is used where all sequences are
723 | compared against each other and the number of differences is found.
724 | 
725 | 
726 | ## Performance
727 | 
728 | As a preliminary performance test, Cohort 2 ("Keck") of [the
729 | dataset](https://s3-us-west-2.amazonaws.com/publishedproject-supplements/emerson-2017-natgen/emerson-2017-natgen.zip)
730 | by Emerson et al. (2017) was compared to itself. It contains 120 repertoires
731 | with a total of 24 205 557 extracted sequences. The test was performed
732 | with CompAIRR version 1.3.1. The timing results are shown below.
733 | 
734 | Distance | Indels | Threads | Time (s) | Time (mm:ss)
735 | -------: | :----: | ------: | -------: | -----------:
736 | 0 | no | 1 | 18 | 0:18
737 | 0 | no | 4 | 12 | 0:12
738 | 1 | no | 1 | 224 | 3:44
739 | 1 | no | 4 | 72 | 1:12
740 | 1 | yes | 1 | 367 | 6:07
741 | 1 | yes | 4 | 111 | 1:51
742 | 2 | no | 4 | 3200 | 53:20
743 | 
744 | When the distance is zero almost all of the time was used to read
745 | files.
746 | 
747 | Memory usage was 2.5GB, corresponding to an average of about 100 bytes
748 | per sequence.
749 | 
750 | Since this is a comparison of a repertoire set to itself, the dataset
751 | is only read once, and the memory needed is also reduced as compared
752 | to a situation where two different repertoire sets are compared.
753 | 
754 | Wall time and memory usage were measured by `/usr/bin/time`. The
755 | analysis was performed on an Apple Mac Mini M1 (2020) with 16GB RAM.
756 | 
757 | 
758 | ## Benchmarking
759 | 
760 | The AIRR overlap functionality of CompAIRR has been thoroughly
761 | benchmarked against similar tools. All data, scripts, and results are
762 | available in a separate [CompAIRR benchmarking
763 | repository](https://github.com/uio-bmi/compairr-benchmarking).
764 | 
765 | 
766 | ## Tips
767 | 
768 | If computer memory is limited, the dataset may be split into blocks
769 | before running CompAIRR on each block separately. Results then need
770 | to be merged together again afterwards. This may be achieved with a
771 | simple script. We will consider providing such a script.
772 | 
773 | 
774 | ## Development team
775 | 
776 | The code has been developed by Torbjørn Rognes based on code from
777 | Swarm where Frédéric Mahé and Lucas Czech made important
778 | contributions. Geir Kjetil Sandve had the idea of developing a tool
779 | for rapid repertoire set comparison. Lonneke Scheffer has tested and
780 | benchmarked the tool, and suggested new features. Milena Pavlovic and
781 | Victor Greiff have also contributed to the project.
782 | 
783 | 
784 | ## Support
785 | 
786 | We will prioritize fixing important bugs. We will also try to answer
787 | questions, improve documentation and implement suggested enhancements
788 | as time permits. As we have no dedicated funding for this project we
789 | cannot make any guarantees on the level of support.
790 | 
791 | To report a potential bug, suggest enhancements or ask questions,
792 | please use one of the following means:
793 | 
794 | * [Submit an issue on GitHub](https://github.com/uio-bmi/compairr/issues) (preferred)
795 | 
796 | * Send an email to [`torognes@ifi.uio.no`](mailto:torognes@ifi.uio.no)
797 | 
798 | If you would like to contribute with code you are most welcome to
799 | [submit a pull request](https://github.com/uio-bmi/compairr/pulls).
800 | 
801 | 
802 | ## Citing CompAIRR
803 | 
804 | Please cite the following if you use CompAIRR in any published work:
805 | 
806 | * Rognes T, Scheffer L, Greiff V, Sandve GK (2022) **CompAIRR: ultra-fast comparison of adaptive immune receptor repertoires by exact and approximate sequence matching.** *Bioinformatics*, btac505. doi: [10.1093/bioinformatics/btac505](https://doi.org/10.1093/bioinformatics/btac505)
807 | 
808 | The article is also available in preprint form:
809 | 
810 | * Rognes T, Scheffer L, Greiff V, Sandve GK (2021) **CompAIRR: ultra-fast comparison of adaptive immune receptor repertoires by exact and approximate sequence matching.** *bioRxiv*, 2021.10.30.466600. doi: [10.1101/2021.10.30.466600](https://doi.org/10.1101/2021.10.30.466600)
811 | 
812 | 
813 | ## References
814 | 
815 | * Emerson RO, DeWitt WS, Vignali M, Gravley J, Hu JK, Osborne EJ, Desmarais C, Klinger M, Carlson CS, Hansen JA, Rieder M, Robins HS (2017) **Immunosequencing identifies signatures of cytomegalovirus exposure history and HLA-mediated effects on the T cell repertoire.** *Nature Genetics*, 49 (5): 659-665. doi: [10.1038/ng.3822](https://doi.org/10.1038/ng.3822)
816 | 
817 | * Mahé F, Czech L, Stamatakis A, Quince C, de Vargas C, Dunthorn M, Rognes T (2021) **Swarm v3: Towards Tera-Scale Amplicon Clustering.** *Bioinformatics*, btab493. doi: [10.1093/bioinformatics/btab493](https://doi.org/10.1093/bioinformatics/btab493)
818 | 


--------------------------------------------------------------------------------
/src/Makefile:
--------------------------------------------------------------------------------
  1 | #    Copyright (C) 2012-2021 Torbjorn Rognes and Frederic Mahe
  2 | #
  3 | #    This program is free software: you can redistribute it and/or modify
  4 | #    it under the terms of the GNU Affero General Public License as
  5 | #    published by the Free Software Foundation, either version 3 of the
  6 | #    License, or (at your option) any later version.
  7 | #
  8 | #    This program is distributed in the hope that it will be useful,
  9 | #    but WITHOUT ANY WARRANTY; without even the implied warranty of
 10 | #    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 11 | #    GNU Affero General Public License for more details.
 12 | #
 13 | #    You should have received a copy of the GNU Affero General Public License
 14 | #    along with this program.  If not, see <http://www.gnu.org/licenses/>.
 15 | #
 16 | #    Contact: Torbjorn Rognes <torognes@ifi.uio.no>,
 17 | #    Department of Informatics, University of Oslo,
 18 | #    PO Box 1080 Blindern, NO-0316 Oslo, Norway
 19 | 
 20 | # Makefile for CompAIRR
 21 | 
 22 | COMMON = -g -std=c++11
 23 | EXTRAOBJ =
 24 | LINKOPT =
 25 | LIBS = -lpthread
 26 | WARNINGS = -Wall -Wextra
 27 | 
 28 | # Run "make RELEASE=1" to compile for release
 29 | ifdef RELEASE
 30 | 	COMMON += -DNDEBUG
 31 | endif
 32 | 
 33 | # Run "make PROFILE=1" to compile for profiling
 34 | ifdef PROFILE
 35 | 	COMMON += -pg
 36 | endif
 37 | 
 38 | # Run "make COVERAGE=1" to compile for coverage tests
 39 | ifdef COVERAGE
 40 | 	COMMON += -fprofile-arcs -ftest-coverage -O0
 41 | 	LIBS += -lgcov
 42 | 	LINKOPT += --coverage
 43 | else
 44 | 	COMMON += -flto -O3
 45 | endif
 46 | 
 47 | # Identify Machine
 48 | ifeq ($(CXX), aarch64-linux-gnu-g++)
 49 |         MACHINE = aarch64
 50 | else ifeq ($(CXX), x86_64-linux-gnu-g++)
 51 |         MACHINE = x86_64
 52 | else ifeq ($(CXX), powerpc64le-linux-gnu-g++)
 53 |         MACHINE = ppc64le
 54 | else
 55 |         MACHINE = $(shell uname -m)
 56 | endif
 57 | 
 58 | # Machine specific
 59 | ifeq ($(MACHINE), x86_64)
 60 | 	COMMON += -march=x86-64 -mtune=generic
 61 | else ifeq ($(MACHINE), arm64)
 62 | 	COMMON += -march=armv8-a+simd -mtune=generic
 63 | else ifeq ($(MACHINE), aarch64)
 64 | 	COMMON += -march=armv8-a+simd -mtune=generic
 65 | else ifeq ($(MACHINE), ppc64le)
 66 | 	COMMON += -mcpu=power8
 67 | endif
 68 | 
 69 | # OS specific
 70 | ifeq ($(CXX), x86_64-w64-mingw32-g++)
 71 | 	LIBS += -lpsapi
 72 | 	LINKOPT += -static
 73 | else
 74 | 	WARNINGS += -pedantic
 75 | endif
 76 | # LINKFLAGS is assigned (not appended) here: extra link options go in LINKOPT
 77 | LINKFLAGS = $(COMMON) $(LINKOPT)
 78 | 
 79 | CXXFLAGS = $(COMMON) $(WARNINGS)
 80 | 
 81 | PROG = compairr
 82 | 
 83 | OBJS = arch.o bloompat.o cluster.o compairr.o db.o dedup.o hashtable.o \
 84 | 	overlap.o util.o variants.o zobrist.o
 85 | 
 86 | DEPS = Makefile threads.h \
 87 | 	arch.h bloompat.h cluster.h compairr.h db.h dedup.h hashtable.h \
 88 | 	overlap.h util.h variants.h zobrist.h
 89 | 
 90 | # "all" and "clean" name actions, not files
 91 | .PHONY : all clean
 92 | 
 93 | all : $(PROG)
 94 | 
 95 | compairr : $(OBJS) $(DEPS)
 96 | 	$(CXX) $(LINKFLAGS) -o $@ $(OBJS) $(LIBS)
 97 | 
 98 | clean :
 99 | 	rm -f compairr *.o *~ gmon.out *.gcno *.gcda *.gcov
100 | 
101 | # Pattern rule: rebuild an object file when its source file or any file
102 | # listed in DEPS changes. A rule written as ".o : .cc" only declares a
103 | # target literally named ".o"; make then compiles with its built-in rule
104 | # and never looks at DEPS.
105 | %.o : %.cc $(DEPS)
106 | 	$(CXX) $(CXXFLAGS) -c -o $@ $<


--------------------------------------------------------------------------------
/src/arch.cc:
--------------------------------------------------------------------------------
  1 | /*
  2 |     Copyright (C) 2012-2021 Torbjorn Rognes and Frederic Mahe
  3 | 
  4 |     This program is free software: you can redistribute it and/or modify
  5 |     it under the terms of the GNU Affero General Public License as
  6 |     published by the Free Software Foundation, either version 3 of the
  7 |     License, or (at your option) any later version.
  8 | 
  9 |     This program is distributed in the hope that it will be useful,
 10 |     but WITHOUT ANY WARRANTY; without even the implied warranty of
 11 |     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 12 |     GNU Affero General Public License for more details.
 13 | 
 14 |     You should have received a copy of the GNU Affero General Public License
 15 |     along with this program.  If not, see <http://www.gnu.org/licenses/>.
 16 | 
 17 |     Contact: Torbjorn Rognes <torognes@ifi.uio.no>,
 18 |     Department of Informatics, University of Oslo,
 19 |     PO Box 1080 Blindern, NO-0316 Oslo, Norway
 20 | */
 21 | 
 22 | #include "compairr.h"
 23 | 
 24 | uint64_t arch_get_memused()
 25 | {
 26 |   /* Peak resident memory of this process in bytes, or 0 if unavailable */
 27 | #ifdef _WIN32
 28 |   PROCESS_MEMORY_COUNTERS pmc;
 29 |   if (! GetProcessMemoryInfo(GetCurrentProcess(),
 30 |                              &pmc,
 31 |                              sizeof(PROCESS_MEMORY_COUNTERS)))
 32 |     return 0; /* pmc holds no valid data when the call fails */
 33 |   return pmc.PeakWorkingSetSize;
 34 | #else
 35 |   struct rusage r_usage;
 36 |   if (getrusage(RUSAGE_SELF, & r_usage) != 0)
 37 |     return 0; /* r_usage holds no valid data when the call fails */
 38 | 
 39 | # ifdef __APPLE__
 40 |   /* Mac: ru_maxrss gives the size in bytes */
 41 |   return static_cast<uint64_t>(r_usage.ru_maxrss);
 42 | # else
 43 |   /* Linux: ru_maxrss gives the size in kilobytes; convert to uint64_t
 44 |      before scaling so a 32-bit long cannot overflow */
 45 |   return static_cast<uint64_t>(r_usage.ru_maxrss) * 1024;
 46 | # endif
 47 | #endif
 48 | }
 49 | 
 50 | uint64_t arch_get_memtotal()
 51 | {
 52 |   /* Amount of physical RAM in bytes. Each branch calls fatal() when the
 53 |      operating system cannot report the value. */
 54 | #ifdef _WIN32
 55 |   MEMORYSTATUSEX ms;
 56 |   ms.dwLength = sizeof(MEMORYSTATUSEX);
 57 |   if (! GlobalMemoryStatusEx(&ms))
 58 |     fatal("Cannot determine amount of RAM");
 59 |   return ms.ullTotalPhys;
 60 | 
 61 | #elif defined(__APPLE__)
 62 |   int mib [] = { CTL_HW, HW_MEMSIZE };
 63 |   int64_t ram = 0;
 64 |   size_t length = sizeof(ram);
 65 |   if(sysctl(mib, 2, &ram, &length, nullptr, 0) == -1)
 66 |     fatal("Cannot determine amount of RAM");
 67 |   return static_cast<uint64_t>(ram);
 68 | 
 69 | #elif defined(_SC_PHYS_PAGES) && defined(_SC_PAGESIZE)
 70 |   int64_t phys_pages = sysconf(_SC_PHYS_PAGES);
 71 |   int64_t pagesize = sysconf(_SC_PAGESIZE);
 72 |   if ((phys_pages == -1) || (pagesize == -1))
 73 |     fatal("Cannot determine amount of RAM");
 74 |   return static_cast<uint64_t>(pagesize * phys_pages);
 75 | 
 76 | #else
 77 |   struct sysinfo si;
 78 |   if (sysinfo(&si))
 79 |     fatal("Cannot determine amount of RAM");
 80 |   /* widen first: totalram * mem_unit can overflow a 32-bit unsigned long */
 81 |   return static_cast<uint64_t>(si.totalram) * si.mem_unit;
 82 | 
 83 | #endif
 84 | }
 85 | 
 86 | void arch_srandom(unsigned int seed)
 87 | {
 88 |   /* Seed the generator read by arch_random(): srand() in Windows
 89 |      builds, srandom() in all other builds. */
 90 | #ifndef _WIN32
 91 |   srandom(seed);
 92 | #else
 93 |   srand(seed);
 94 | #endif
 95 | }
 96 | 
 97 | uint64_t arch_random()
 98 | {
 99 | #ifndef _WIN32 /* random() outside Windows builds, rand() in them */
100 |   return static_cast<uint64_t>(random());
101 | #else
102 |   return static_cast<uint64_t>(rand());
103 | #endif
104 | }
105 | 


--------------------------------------------------------------------------------
/src/arch.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |     Copyright (C) 2012-2021 Torbjorn Rognes and Frederic Mahe
 3 | 
 4 |     This program is free software: you can redistribute it and/or modify
 5 |     it under the terms of the GNU Affero General Public License as
 6 |     published by the Free Software Foundation, either version 3 of the
 7 |     License, or (at your option) any later version.
 8 | 
 9 |     This program is distributed in the hope that it will be useful,
10 |     but WITHOUT ANY WARRANTY; without even the implied warranty of
11 |     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12 |     GNU Affero General Public License for more details.
13 | 
14 |     You should have received a copy of the GNU Affero General Public License
15 |     along with this program.  If not, see <http://www.gnu.org/licenses/>.
16 | 
17 |     Contact: Torbjorn Rognes <torognes@ifi.uio.no>,
18 |     Department of Informatics, University of Oslo,
19 |     PO Box 1080 Blindern, NO-0316 Oslo, Norway
20 | */
21 | 
22 | /* functions in arch.cc */
23 | 
24 | uint64_t arch_get_memused();          /* peak memory use of this process, in bytes */
25 | uint64_t arch_get_memtotal();         /* physical RAM of the machine, in bytes */
26 | void arch_srandom(unsigned int seed); /* seed the generator used by arch_random() */
27 | uint64_t arch_random();               /* next value from random() (rand() on Windows) */
28 | 


--------------------------------------------------------------------------------
/src/bloompat.cc:
--------------------------------------------------------------------------------
 1 | /*
 2 |     Copyright (C) 2012-2021 Torbjorn Rognes and Frederic Mahe
 3 | 
 4 |     This program is free software: you can redistribute it and/or modify
 5 |     it under the terms of the GNU Affero General Public License as
 6 |     published by the Free Software Foundation, either version 3 of the
 7 |     License, or (at your option) any later version.
 8 | 
 9 |     This program is distributed in the hope that it will be useful,
10 |     but WITHOUT ANY WARRANTY; without even the implied warranty of
11 |     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12 |     GNU Affero General Public License for more details.
13 | 
14 |     You should have received a copy of the GNU Affero General Public License
15 |     along with this program.  If not, see <http://www.gnu.org/licenses/>.
16 | 
17 |     Contact: Torbjorn Rognes <torognes@ifi.uio.no>,
18 |     Department of Informatics, University of Oslo,
19 |     PO Box 1080 Blindern, NO-0316 Oslo, Norway
20 | */
21 | 
22 | /*
23 |   Blocked bloom filter with precomputed bit patterns
24 |   as described in
25 | 
26 |   Putze F, Sanders P, Singler J (2009)
27 |   Cache-, Hash- and Space-Efficient Bloom Filters
28 |   Journal of Experimental Algorithmics, 14, 4
29 |   https://doi.org/10.1145/1498698.1594230
30 | */
31 | 
32 | #include "compairr.h"
33 | 
34 | void bloom_patterns_generate(struct bloom_s * b);
35 | 
36 | void bloom_patterns_generate(struct bloom_s * b)
37 | { /* give each entry of b->patterns exactly 8 distinct random bits */
38 |   const unsigned int bits_per_pattern = 8;
39 |   for (unsigned int p = 0; p < BLOOM_PATTERN_COUNT; p++)
40 |     {
41 |       uint64_t word = 0;
42 |       for (unsigned int n = 0; n < bits_per_pattern; n++)
43 |         {
44 |           uint64_t bit;
45 |           do /* draw again while the chosen bit is already taken */
46 |             bit = 1ULL << (arch_random() & 63);
47 |           while (word & bit);
48 |           word |= bit;
49 |         }
50 |       b->patterns[p] = word;
51 |     }
52 | }
53 | 
54 | void bloom_zap(struct bloom_s * b)
55 | {
56 |   memset(b->bitmap, 0xff, b->size); /* all bits 1 = empty: bloom_set() clears bits */
57 | }
58 | 
59 | struct bloom_s * bloom_init(uint64_t size)
60 | {
61 |   /* Create a filter with a bitmap of "size" bytes (raised to 8 when
62 |      smaller; the caller is expected to pass a power of 2). The bitmap
63 |      is reset with bloom_zap() and the bit patterns are generated. */
64 |   const uint64_t bytes = MAX(size, 8);
65 |   const uint64_t words = bytes >> 3; /* number of 64-bit bitmap words */
66 | 
67 |   struct bloom_s * filter
68 |     = static_cast<struct bloom_s *>(xmalloc(sizeof(struct bloom_s)));
69 | 
70 |   filter->size = bytes;
71 |   filter->mask = words - 1;
72 |   filter->bitmap = static_cast<uint64_t *>(xmalloc(bytes));
73 | 
74 |   bloom_zap(filter);
75 |   bloom_patterns_generate(filter);
76 | 
77 |   return filter;
78 | }
79 | 
80 | void bloom_exit(struct bloom_s * b)
81 | {
82 |   xfree(b->bitmap); /* release the bitmap first: it is only reachable through b */
83 |   xfree(b);
84 | }
85 | 


--------------------------------------------------------------------------------
/src/bloompat.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |     Copyright (C) 2012-2021 Torbjorn Rognes and Frederic Mahe
 3 | 
 4 |     This program is free software: you can redistribute it and/or modify
 5 |     it under the terms of the GNU Affero General Public License as
 6 |     published by the Free Software Foundation, either version 3 of the
 7 |     License, or (at your option) any later version.
 8 | 
 9 |     This program is distributed in the hope that it will be useful,
10 |     but WITHOUT ANY WARRANTY; without even the implied warranty of
11 |     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12 |     GNU Affero General Public License for more details.
13 | 
14 |     You should have received a copy of the GNU Affero General Public License
15 |     along with this program.  If not, see <http://www.gnu.org/licenses/>.
16 | 
17 |     Contact: Torbjorn Rognes <torognes@ifi.uio.no>,
18 |     Department of Informatics, University of Oslo,
19 |     PO Box 1080 Blindern, NO-0316 Oslo, Norway
20 | */
21 | 
22 | #define BLOOM_PATTERN_SHIFT 10
23 | #define BLOOM_PATTERN_COUNT (1 << BLOOM_PATTERN_SHIFT)
24 | #define BLOOM_PATTERN_MASK (BLOOM_PATTERN_COUNT - 1)
25 | 
26 | struct bloom_s
27 | {
28 |   uint64_t size;     /* size of the bitmap in bytes */
29 |   uint64_t mask;     /* mask applied to the word index in bloom_adr() */
30 |   uint64_t * bitmap; /* filter words; bloom_set() clears bits, bloom_get() tests for cleared bits */
31 |   uint64_t patterns[BLOOM_PATTERN_COUNT]; /* bit patterns selected by bloom_pat() */
32 | };
33 | 
34 | void bloom_zap(struct bloom_s * b); /* reset the whole bitmap to the empty state */
35 | 
36 | struct bloom_s * bloom_init(uint64_t size); /* size: bitmap size in bytes */
37 | 
38 | void bloom_exit(struct bloom_s * b); /* free the bitmap and the filter itself */
39 | 
40 | inline uint64_t * bloom_adr(struct bloom_s * b, uint64_t h)
41 | {
42 |   return & b->bitmap[(h >> BLOOM_PATTERN_SHIFT) & b->mask]; /* bitmap word for h */
43 | }
44 | 
45 | inline uint64_t bloom_pat(struct bloom_s * b, uint64_t h)
46 | {
47 |   return * (b->patterns + (h & BLOOM_PATTERN_MASK)); /* low bits of h pick the pattern */
48 | }
49 | 
50 | inline void bloom_set(struct bloom_s * b, uint64_t h)
51 | {
52 |   * bloom_adr(b, h) = * bloom_adr(b, h) & ~ bloom_pat(b, h); /* clear the pattern bits */
53 | }
54 | 
55 | inline bool bloom_get(struct bloom_s * b, uint64_t h)
56 | {
57 |   return (* bloom_adr(b, h) & bloom_pat(b, h)) == 0; /* true if all pattern bits are cleared */
58 | }
59 | 


--------------------------------------------------------------------------------
/src/cluster.cc:
--------------------------------------------------------------------------------
  1 | /*
  2 |     Copyright (C) 2012-2021 Torbjorn Rognes and Frederic Mahe
  3 | 
  4 |     This program is free software: you can redistribute it and/or modify
  5 |     it under the terms of the GNU Affero General Public License as
  6 |     published by the Free Software Foundation, either version 3 of the
  7 |     License, or (at your option) any later version.
  8 | 
  9 |     This program is distributed in the hope that it will be useful,
 10 |     but WITHOUT ANY WARRANTY; without even the implied warranty of
 11 |     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 12 |     GNU Affero General Public License for more details.
 13 | 
 14 |     You should have received a copy of the GNU Affero General Public License
 15 |     along with this program.  If not, see <http://www.gnu.org/licenses/>.
 16 | 
 17 |     Contact: Torbjorn Rognes <torognes@ifi.uio.no>,
 18 |     Department of Informatics, University of Oslo,
 19 |     PO Box 1080 Blindern, NO-0316 Oslo, Norway
 20 | */
 21 | 
 22 | #include "compairr.h"
 23 | 
 24 | const unsigned int no_cluster = UINT_MAX; /* NOTE(review): presumably marks an item without a cluster - confirm */
 25 | 
 26 | static struct iteminfo_s
 27 | {
 28 |   unsigned int clusterid;
 29 |   unsigned int next;
 30 |   unsigned int network_start;
 31 |   unsigned int network_count;
 32 | } * iteminfo = 0;
 33 | 
 34 | static struct clusterinfo_s
 35 | {
 36 |   unsigned int seed;
 37 |   unsigned int size; /* sort key of compare_cluster(): larger sizes order first */
 38 | } * clusterinfo = 0;
 39 | 
 40 | static uint64_t clusterinfo_alloc = 0;
 41 | 
 42 | static pthread_mutex_t network_mutex;
 43 | static unsigned int * network = 0;
 44 | static unsigned int network_count = 0;
 45 | static unsigned int network_seq = 0;
 46 | static uint64_t network_alloc = 0;
 47 | static uint64_t seqcount = 0;
 48 | 
 49 | static struct db * d; /* sequence database read through the db_get*() accessors */
 50 | static struct bloom_s * bloom = 0; /* receives every hash stored by hash_insert_cluster() */
 51 | static hashtable_s * hashtable = 0; /* probed by hash_insert_cluster() and find_variant_matches() */
 52 | 
 53 | static int compare_cluster(const void * a, const void * b)
 54 | {
 55 |   /* comparator with the qsort() signature: larger size orders first */
 56 |   const clusterinfo_s * lhs = static_cast<const clusterinfo_s *>(a);
 57 |   const clusterinfo_s * rhs = static_cast<const clusterinfo_s *>(b);
 58 | 
 59 |   if (lhs->size != rhs->size)
 60 |     return (lhs->size > rhs->size) ? -1 : +1;
 61 | 
 62 |   return 0;
 63 | }
 64 | 
 65 | static inline void hash_insert_cluster(uint64_t seq)
 66 | {
 67 |   /* put seq into the first unoccupied bucket of its probe sequence */
 68 |   const uint64_t seqhash = db_gethash(d, seq);
 69 |   uint64_t bucket = hash_getindex(hashtable, seqhash);
 70 |   for (; hash_is_occupied(hashtable, bucket);
 71 |        bucket = hash_getnextindex(hashtable, bucket))
 72 |     ; /* skip buckets that are already taken */
 73 |   hash_set_occupied(hashtable, bucket);
 74 |   hash_set_value(hashtable, bucket, seqhash);
 75 |   hash_set_data(hashtable, bucket, seq);
 76 |   bloom_set(bloom, seqhash); /* also record the hash in the bloom filter */
 77 | }
 78 | 
 79 | static void find_variant_matches(uint64_t seed,
 80 |                                  var_s * var,
 81 |                                  unsigned int * * hits_data,
 82 |                                  unsigned int * hits_count,
 83 |                                  uint64_t * hits_alloc)
 84 | {
 85 |   /* Visit the buckets probed for var->hash and append to *hits_data each
 86 |      stored sequence, other than the seed itself, whose hash equals
 87 |      var->hash, whose genes pass the test below and which
 88 |      check_variant() accepts. *hits_data grows by 1024 entries at a
 89 |      time; *hits_count and *hits_alloc are kept up to date. */
 90 |   for (uint64_t bucket = hash_getindex(hashtable, var->hash);
 91 |        hash_is_occupied(hashtable, bucket);
 92 |        bucket = hash_getnextindex(hashtable, bucket))
 93 |     {
 94 |       if (! hash_compare_value(hashtable, bucket, var->hash))
 95 |         continue;
 96 | 
 97 |       const uint64_t hit = hash_get_data(hashtable, bucket);
 98 | 
 99 |       /* look up the gene identifiers of both sequences */
100 | 
101 |       const unsigned int seed_v = db_get_v_gene(d, seed);
102 |       const unsigned int seed_j = db_get_j_gene(d, seed);
103 |       const unsigned int hit_v = db_get_v_gene(d, hit);
104 |       const unsigned int hit_j = db_get_j_gene(d, hit);
105 | 
106 |       /* the seed is never reported as its own hit */
107 | 
108 |       if (seed == hit)
109 |         continue;
110 | 
111 |       if ((! opt_ignore_genes) && ((seed_v != hit_v) || (seed_j != hit_j)))
112 |         continue;
113 | 
114 |       /* double check the candidate against the actual sequences */
115 | 
116 |       unsigned char * seed_seq
117 |         = (unsigned char *) db_getsequence(d, seed);
118 |       const unsigned int seed_len = db_getsequencelen(d, seed);
119 |       unsigned char * hit_seq
120 |         = (unsigned char *) db_getsequence(d, hit);
121 |       const unsigned int hit_len = db_getsequencelen(d, hit);
122 | 
123 |       if (! check_variant(seed_seq, seed_len, var, hit_seq, hit_len))
124 |         continue;
125 | 
126 |       if (*hits_count >= *hits_alloc)
127 |         {
128 |           *hits_alloc += 1024;
129 |           *hits_data = static_cast<unsigned int *>
130 |             (xrealloc(*hits_data, *hits_alloc * sizeof(unsigned int)));
131 |         }
132 | 
133 |       (*hits_data)[*hits_count] = hit;
134 |       (*hits_count)++;
135 |     }
136 | }
137 | 
138 | static void process_variants(uint64_t seed,
139 |                              var_s * variant_list,
140 |                              unsigned int * * hits_data,
141 |                              unsigned int * hits_count,
142 |                              uint64_t * hits_alloc)
143 | {
144 |   unsigned int variant_count = 0;
145 |   * hits_count = 0;
146 | 
147 |   unsigned char * sequence = (unsigned char *) db_getsequence(d, seed);
148 |   unsigned int seqlen = db_getsequencelen(d, seed);
149 |   uint64_t hash = db_gethash(d, seed);
150 |   uint64_t v_gene = db_get_v_gene(d, seed);
151 |   uint64_t j_gene = db_get_j_gene(d, seed);
152 | 
153 |   generate_variants(hash,
154 |                     sequence, seqlen, v_gene, j_gene,
155 |                     variant_list, & variant_count);
156 | 
157 |   for(unsigned int i = 0; i < variant_count; i++)
158 |     {
159 |       var_s * var = variant_list + i;
160 |       if (bloom_get(bloom, var->hash))
161 |         find_variant_matches(seed, var, hits_data, hits_count, hits_alloc);
162 |     }
163 | }
164 | 
165 | static void process_trad(uint64_t seed,
166 |                          unsigned int * * hits_data,
167 |                          unsigned int * hits_count,
168 |                          uint64_t * hits_alloc)
169 | {
170 |   /* Only to be used with no indels (and d >= 3) */
171 | 
172 |   for (uint64_t hit = 0; hit < seqcount; hit++)
173 |     if (seed != hit)
174 |       {
175 |         /* check if everything matches */
176 | 
177 |         unsigned int seed_v_gene = db_get_v_gene(d, seed);
178 |         unsigned int seed_j_gene = db_get_j_gene(d, seed);
179 | 
180 |         unsigned int hit_v_gene = db_get_v_gene(d, hit);
181 |         unsigned int hit_j_gene = db_get_j_gene(d, hit);
182 | 
183 |         if (opt_ignore_genes ||
184 |             ((seed_v_gene == hit_v_gene) && (seed_j_gene == hit_j_gene)))
185 |           {
186 |             unsigned int seed_seqlen = db_getsequencelen(d, seed);
187 |             unsigned int hit_seqlen = db_getsequencelen(d, hit);
188 | 
189 |             if (seed_seqlen == hit_seqlen)
190 |               {
191 |                 unsigned char * seed_sequence
192 |                   = (unsigned char *) db_getsequence(d, seed);
193 |                 unsigned char * hit_sequence
194 |                   = (unsigned char *) db_getsequence(d, hit);
195 | 
196 |                 if (seq_diff(seed_sequence, hit_sequence, seed_seqlen)
197 |                     <= opt_differences)
198 |                   {
199 |                     if (*hits_alloc <= *hits_count)
200 |                       {
201 |                         *hits_alloc += 1024;
202 |                         *hits_data = static_cast<unsigned int *>
203 |                           (xrealloc((*hits_data),
204 |                                     (*hits_alloc) * sizeof(unsigned int)));
205 |                       }
206 |                     (*hits_data)[(*hits_count)++] = hit;
207 |                   }
208 |               }
209 |           }
210 |       }
211 | }
212 | 
213 | static void process_seq(uint64_t seed,
214 |                         var_s * variant_list,
215 |                         unsigned int * * hits_data,
216 |                         unsigned int * hits_count,
217 |                         uint64_t * hits_alloc)
218 | {
219 |   if (opt_differences <= MAXDIFF_HASH)
220 |     process_variants(seed, variant_list, hits_data, hits_count, hits_alloc);
221 |   else
222 |     process_trad(seed, hits_data, hits_count, hits_alloc);
223 | }
224 | 
225 | static void network_thread(int64_t t)
226 | {
227 |   (void) t;
228 | 
229 |   unsigned int longest = db_getlongestsequence(d);
230 |   uint64_t maxvar = max_variants(longest);
231 | 
232 |   uint64_t hits_alloc = 1024;
233 |   auto * hits_data = static_cast<unsigned int *>
234 |     (xmalloc(hits_alloc * sizeof(unsigned int)));
235 | 
236 |   auto * variant_list = static_cast<struct var_s *>
237 |     (xmalloc(maxvar * sizeof(struct var_s)));
238 | 
239 |   pthread_mutex_lock(&network_mutex);
240 | 
241 |   while (network_seq < seqcount)
242 |     {
243 |       unsigned int seed = network_seq++;
244 |       progress_update(seed);
245 | 
246 |       pthread_mutex_unlock(&network_mutex);
247 | 
248 |       unsigned int hits_count = 0;
249 |       process_seq(seed, variant_list,
250 |                   & hits_data, & hits_count, & hits_alloc);
251 | 
252 |       pthread_mutex_lock(&network_mutex);
253 | 
254 |       iteminfo[seed].network_start = network_count;
255 |       iteminfo[seed].network_count = hits_count;
256 | 
257 |       if (network_count + hits_count > network_alloc)
258 |         {
259 |           while (network_count + hits_count > network_alloc)
260 |             network_alloc += 1024 * 1024;
261 | 
262 |           network = static_cast<unsigned int*>
263 |             (xrealloc(network, network_alloc * sizeof(unsigned int)));
264 |         }
265 | 
266 |       for(unsigned int k = 0; k < hits_count; k++)
267 |         network[network_count++] = hits_data[k];
268 |     }
269 | 
270 |   pthread_mutex_unlock(&network_mutex);
271 | 
272 |   xfree(variant_list);
273 |   xfree(hits_data);
274 | }
275 | 
276 | static unsigned int clustersize = 0;
277 | static unsigned int current_cluster_tail = 0;
278 | 
279 | static void process_seed(unsigned int seed)
280 | {
281 |   clustersize++;
282 | 
283 |   unsigned int s = iteminfo[seed].network_start;
284 |   unsigned int c = iteminfo[seed].network_count;
285 | 
286 |   unsigned int clusterid = iteminfo[seed].clusterid;
287 | 
288 |   for(unsigned int i = 0; i < c; i++)
289 |     {
290 |       unsigned int hit = network[s + i];
291 |       if (iteminfo[hit].clusterid == no_cluster)
292 |         {
293 |           /* add hit to cluster, update linked chain */
294 |           iteminfo[hit].clusterid = clusterid;
295 |           iteminfo[current_cluster_tail].next = hit;
296 |           current_cluster_tail = hit;
297 |         }
298 |     }
299 | }
300 | 
301 | void cluster(char * filename)
302 | {
303 |   fprintf(logfile, "Immune receptor repertoire clustering\n\n");
304 | 
305 |   db_init();
306 | 
307 |   d = db_create();
308 |   db_read(d, filename, false, "1");
309 | 
310 |   unsigned int longest = db_getlongestsequence(d);
311 |   seqcount = db_getsequencecount(d);
312 | 
313 |   fprintf(logfile, "\n");
314 |   fprintf(logfile, "Unique V genes:    %" PRIu64 "\n",
315 |           db_get_v_gene_count());
316 |   fprintf(logfile, "Unique J genes:    %" PRIu64 "\n",
317 |           db_get_j_gene_count());
318 |   fprintf(logfile, "\n");
319 | 
320 |   if (opt_differences <= MAXDIFF_HASH)
321 |     {
322 |       zobrist_init(longest + MAX_INSERTS,
323 |                    db_get_v_gene_count(),
324 |                    db_get_j_gene_count());
325 | 
326 |       db_hash(d);
327 | 
328 |       hashtable = hash_init(seqcount);
329 |       bloom = bloom_init(hash_get_tablesize(hashtable) * 2);
330 |     }
331 | 
332 |   iteminfo = static_cast<struct iteminfo_s *>
333 |     (xmalloc(seqcount * sizeof(struct iteminfo_s)));
334 | 
335 |   progress_init("Hashing sequences:", seqcount);
336 |   for(uint64_t i=0; i < seqcount; i++)
337 |     {
338 |       iteminfo[i].clusterid = no_cluster;
339 |       iteminfo[i].next = no_cluster;
340 |       if (opt_differences <= MAXDIFF_HASH)
341 |         hash_insert_cluster(i);
342 |       progress_update(i);
343 |     }
344 |   progress_done();
345 | 
346 |   network = static_cast<unsigned int*>
347 |     (xmalloc(network_alloc * sizeof(unsigned int)));
348 |   network_count = 0;
349 |   network_seq = 0;
350 | 
351 |   pthread_mutex_init(&network_mutex, nullptr);
352 |   progress_init("Building network: ", seqcount);
353 | 
354 |   if (opt_threads == 1)
355 |     {
356 |       network_thread(0);
357 |     }
358 |   else
359 |     {
360 |       ThreadRunner * sim_tr = new ThreadRunner(static_cast<int>(opt_threads),
361 |                                                network_thread);
362 |       sim_tr->run();
363 |       delete sim_tr;
364 |     }
365 | 
366 |   progress_done();
367 |   pthread_mutex_destroy(&network_mutex);
368 | 
369 | 
370 |   unsigned int clustercount = 0;
371 | 
372 |   progress_init("Clustering:       ", seqcount);
373 | 
374 |   /* for each non-clustered item, look for subseeds ... */
375 |   uint64_t x = 0;
376 |   for(unsigned int seed = 0; seed < seqcount; seed++)
377 |     {
378 |       struct iteminfo_s * ap = iteminfo + seed;
379 | 
380 |       if (ap->clusterid == no_cluster)
381 |         {
382 |           /* start a new cluster with a new initial seed */
383 | 
384 |           ap->clusterid = clustercount;
385 |           ap->next = no_cluster;
386 |           current_cluster_tail = seed;
387 |           clustersize = 0;
388 | 
389 |           /* find initial matches */
390 |           process_seed(seed);
391 |           progress_update(++x);
392 | 
393 |           unsigned int subseed = ap->next;
394 | 
395 |           /* process all subseeds */
396 |           while(subseed != no_cluster)
397 |             {
398 |               process_seed(subseed);
399 |               progress_update(++x);
400 |               subseed = iteminfo[subseed].next;
401 |             }
402 | 
403 |           if (clustercount >= clusterinfo_alloc)
404 |             {
405 |               /* allocate memory for more clusters... */
406 |               clusterinfo_alloc += 1024;
407 |               clusterinfo = static_cast<struct clusterinfo_s *>
408 |                 (xrealloc(clusterinfo,
409 |                           clusterinfo_alloc * sizeof(clusterinfo_s)));
410 |             }
411 | 
412 |           struct clusterinfo_s * sp = clusterinfo + clustercount;
413 |           sp->seed = seed;
414 |           sp->size = clustersize;
415 |           clustercount++;
416 |         }
417 |     }
418 | 
419 |   progress_done();
420 | 
421 |   progress_init("Sorting clusters: ", clustercount);
422 |   qsort(clusterinfo, clustercount, sizeof(clusterinfo_s), compare_cluster);
423 |   progress_done();
424 | 
425 |   /* dump clusters */
426 | 
427 |   uint64_t j = 0;
428 |   progress_init("Writing clusters: ", seqcount);
429 |   fprintf(outfile,
430 |           "#cluster_no\tcluster_size\trepertoire_id\tsequence_id\t"
431 |           "duplicate_count\tv_call\tj_call\t%s\n", seq_header);
432 |   for(unsigned int i = 0; i < clustercount; i++)
433 |     {
434 |       unsigned int seed = clusterinfo[i].seed;
435 |       unsigned int size = clusterinfo[i].size;
436 |       for(unsigned int a = seed; a != no_cluster; a = iteminfo[a].next)
437 |         {
438 |           fprintf(outfile,
439 |                   "%u\t%u\t",
440 |                   i + 1,
441 |                   size);
442 |           fprintf(outfile,
443 |                   "%s\t%s\t%" PRIu64 "\t%s\t%s\t",
444 |                   db_get_repertoire_id(d, db_get_repertoire_id_no(d, a)),
445 |                   db_get_sequence_id(d, a),
446 |                   db_get_count(d, a),
447 |                   db_get_v_gene_name(d, a),
448 |                   db_get_j_gene_name(d, a));
449 |           db_fprint_sequence(outfile, d, a);
450 |           fprintf(outfile, "\n");
451 |           j++;
452 |         }
453 |       progress_update(j);
454 |     }
455 |   progress_done();
456 | 
457 |   fprintf(logfile, "\n");
458 |   fprintf(logfile, "Clusters:          %u\n", clustercount);
459 | 
460 |   xfree(network);
461 |   if (clusterinfo)
462 |     xfree(clusterinfo);
463 |   if (iteminfo)
464 |     xfree(iteminfo);
465 | 
466 |   if (opt_differences <= MAXDIFF_HASH)
467 |     {
468 |       bloom_exit(bloom);
469 |       hash_exit(hashtable);
470 |       zobrist_exit();
471 |     }
472 | 
473 |   db_free(d);
474 |   db_exit();
475 | }
476 | 


--------------------------------------------------------------------------------
/src/cluster.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |     Copyright (C) 2012-2021 Torbjorn Rognes and Frederic Mahe
 3 | 
 4 |     This program is free software: you can redistribute it and/or modify
 5 |     it under the terms of the GNU Affero General Public License as
 6 |     published by the Free Software Foundation, either version 3 of the
 7 |     License, or (at your option) any later version.
 8 | 
 9 |     This program is distributed in the hope that it will be useful,
10 |     but WITHOUT ANY WARRANTY; without even the implied warranty of
11 |     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12 |     GNU Affero General Public License for more details.
13 | 
14 |     You should have received a copy of the GNU Affero General Public License
15 |     along with this program.  If not, see <http://www.gnu.org/licenses/>.
16 | 
17 |     Contact: Torbjorn Rognes <torognes@ifi.uio.no>,
18 |     Department of Informatics, University of Oslo,
19 |     PO Box 1080 Blindern, NO-0316 Oslo, Norway
20 | */
21 | 
 22 | /* Entry point of the cluster command (-c, --cluster) */
23 | 
24 | void cluster(char * filename);
25 | 


--------------------------------------------------------------------------------
/src/compairr.cc:
--------------------------------------------------------------------------------
  1 | /*
  2 |     Copyright (C) 2012-2021 Torbjorn Rognes and Frederic Mahe
  3 | 
  4 |     This program is free software: you can redistribute it and/or modify
  5 |     it under the terms of the GNU Affero General Public License as
  6 |     published by the Free Software Foundation, either version 3 of the
  7 |     License, or (at your option) any later version.
  8 | 
  9 |     This program is distributed in the hope that it will be useful,
 10 |     but WITHOUT ANY WARRANTY; without even the implied warranty of
 11 |     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 12 |     GNU Affero General Public License for more details.
 13 | 
 14 |     You should have received a copy of the GNU Affero General Public License
 15 |     along with this program.  If not, see <http://www.gnu.org/licenses/>.
 16 | 
 17 |     Contact: Torbjorn Rognes <torognes@ifi.uio.no>,
 18 |     Department of Informatics, University of Oslo,
 19 |     PO Box 1080 Blindern, NO-0316 Oslo, Norway
 20 | */
 21 | 
 22 | /*
 23 | 
 24 |   This program uses Frederic Mahe's idea for swarm (d=1) to
 25 |   enumerate all variants of a sequence containing a single
 26 |   change (substitution, deletion or insertion) to quickly
 27 |   identify neighbour sequences using a hashing strategy.
 28 | 
 29 |   Please see the following publications for details:
 30 | 
 31 |   Mahe F, Rognes T, Quince C, de Vargas C, Dunthorn M (2014)
 32 |   Swarm: robust and fast clustering method for amplicon-based studies
 33 |   PeerJ 2:e593 https://doi.org/10.7717/peerj.593
 34 | 
 35 |   Mahe F, Rognes T, Quince C, de Vargas C, Dunthorn M (2015)
 36 |   Swarm v2: highly-scalable and high-resolution amplicon clustering
 37 |   PeerJ 3:e1420 https://doi.org/10.7717/peerj.1420
 38 | 
 39 | */
 40 | 
 41 | #include "compairr.h"
 42 | 
/* OPTIONS */

/* Program name (set to argv[0] in args_init) and the names of the input
   files as reported by args_show */
static char * progname;
static char * input1_filename;
static char * input2_filename;

/* Values of the command line options. args_init assigns the defaults
   and then updates them from the command line; the short descriptions
   below follow the help text printed by args_usage. */
bool opt_alternative;     /* -a: three-column output format, not matrix */
bool opt_cdr3;            /* --cdr3: use cdr3(_aa) instead of junction(_aa) */
bool opt_cluster;         /* -c: cluster command */
bool opt_distance;        /* --distance: include distance in pairs file */
bool opt_existence;       /* -x: existence command */
bool opt_help;            /* -h: show help */
bool opt_ignore_counts;   /* -f: ignore duplicate_count information */
bool opt_ignore_empty;    /* -e: ignore empty sequences */
bool opt_ignore_genes;    /* -g: ignore V and J gene information */
bool opt_ignore_unknown;  /* -u: ignore sequences with unknown symbols */
bool opt_indels;          /* -i: allow insertions or deletions */
bool opt_matrix;          /* -m: overlap matrix command */
bool opt_nucleotides;     /* -n: compare nucleotides, not amino acids */
bool opt_no_matrix;       /* --no-matrix: do not keep or output a matrix */
bool opt_version;         /* -v: show version information */
bool opt_deduplicate;     /* -z: deduplicate command */
char * opt_keep_columns;  /* -k: comma-separated column names, or null */
char * opt_log;           /* -l: log file name, or null for stderr */
char * opt_output;        /* -o: output file name, "-" by default */
char * opt_pairs;         /* -p: pairs file name, or null for none */
char * opt_score_string;  /* -s: score name as given, or null */
int64_t opt_differences;  /* -d: number of differences accepted */
int64_t opt_score_int;    /* index into score_descr (see args_show) */
int64_t opt_threads;      /* -t: number of threads to use */

/* Other variables */

/* Header of the sequence column; printed as the last column name of the
   cluster output */
const char * seq_header = nullptr;

/* Output streams; logfile receives the header, settings and timing */
FILE * outfile = nullptr;
FILE * logfile = nullptr;
FILE * pairsfile = nullptr;

/* Columns requested with --keep-columns. parse_keep_columns sets the
   count, duplicates the column names, zeroes keep_columns_no and
   allocates (but does not fill) keep_columns_strings. */
int keep_columns_count = 0;
int * keep_columns_no = nullptr;
char ** keep_columns_names = nullptr;
char ** keep_columns_strings = nullptr;

int alphabet_size;

/* "-" is the default value of opt_output */
static char dash[] = "-";
static char * DASH_FILENAME = dash;

/* Names of the available scores.
   NOTE(review): presumably matched against opt_score_string where the
   options are validated, in the same order as score_descr -- confirm. */
static const char * score_options[] =
  { "Product", "Ratio", "Min", "Max", "Mean", "MH", "Jaccard" };

/* Descriptions of the scores, indexed by opt_score_int in args_show */
static const char * score_descr[] =
  {
    "Sum of products of counts",
    "Sum of ratios of counts",
    "Sum of minimum of counts",
    "Sum of maximum of counts",
    "Sum of mean of counts",
    "Morisita-Horn index",
    "Jaccard index"
  };

/* Forward declarations of functions defined in this file */
int64_t args_long(char * str, const char * option);
void args_show();
void args_usage();
void show_header();
void args_init(int argc, char **argv);
void open_files();
void close_files();
113 | 
114 | bool parse_keep_columns()
115 | {
116 |   unsigned int len = strlen(opt_keep_columns);
117 |   keep_columns_count = 1;
118 |   for (unsigned int i = 0; i < len; i++)
119 |     if (opt_keep_columns[i] == ',')
120 |       keep_columns_count++;
121 | 
122 |   keep_columns_no = (int *) xmalloc
123 |     (keep_columns_count * sizeof(int));
124 | 
125 |   keep_columns_names = (char **) xmalloc
126 |     (keep_columns_count * sizeof(char *));
127 | 
128 |   keep_columns_strings = (char **) xmalloc
129 |     (keep_columns_count * sizeof(char *));
130 | 
131 |   for (int j = 0; j < keep_columns_count; j++)
132 |     keep_columns_no[j] = 0;
133 | 
134 |   keep_columns_count = 0;
135 |   unsigned int curlen = 0;
136 |   for (unsigned int i = 0; i < len; i++)
137 |     {
138 |       char c = opt_keep_columns[i];
139 |       if (c == ',')
140 |         {
141 |           if (curlen == 0)
142 |             return false;
143 |           else
144 |             {
145 |               opt_keep_columns[i] = 0;
146 |               keep_columns_names[keep_columns_count] =
147 |                 xstrdup(opt_keep_columns + i - curlen);
148 |               opt_keep_columns[i] = ',';
149 |               keep_columns_count++;
150 |               curlen = 0;
151 |             }
152 |         }
153 |       else if (((c >= 'A') && (c <= 'Z')) ||
154 |                ((c >= 'a') && (c <= 'z')) ||
155 |                ((c >= '0') && (c <= '9')) ||
156 |                (c == '_'))
157 |         {
158 |           curlen++;
159 |         }
160 |       else
161 |         {
162 |           return false;
163 |         }
164 |     }
165 | 
166 |   if (curlen == 0)
167 |     return false;
168 | 
169 |   keep_columns_names[keep_columns_count] =
170 |     xstrdup(opt_keep_columns + len - curlen);
171 |   keep_columns_count++;
172 |   return true;
173 | }
174 | 
/* Convert the option argument `str` to an integer.

   str:    text given for the option; must be a complete decimal number
   option: option name to show in the error message

   Returns the value. If `str` contains no digits at all (for instance
   an empty string) or has trailing characters after the number, an
   error message is written to stderr and the program exits with
   status 1.

   strtoll is used so that the full 64-bit range of the result type is
   available even where `long` is only 32 bits wide. */
int64_t args_long(char * str, const char * option)
{
  char * endptr = nullptr;
  const long long parsed = strtoll(str, & endptr, 10);

  /* endptr == str: nothing was converted; *endptr != 0: trailing junk */
  if ((endptr == str) || (*endptr != 0))
    {
      fprintf(stderr, "\nInvalid numeric argument for option %s\n", option);
      exit(1);
    }

  return static_cast<int64_t>(parsed);
}
186 | 
187 | void show_time(const char * prompt)
188 | {
189 |   const int time_string_max = 100;
190 |   char time_string[time_string_max];
191 |   const time_t clock = time(nullptr);
192 |   const struct tm * timeptr = localtime(& clock);
193 |   size_t time_string_len = strftime(time_string,
194 |                                     time_string_max,
195 |                                     "%a %b %d %T %Z %Y",
196 |                                     timeptr);
197 |   fprintf(logfile, "%s%s\n", prompt, time_string_len > 0 ? time_string : "?");
198 | }
199 | 
200 | void args_show()
201 | {
202 |   if (opt_matrix)
203 |     fprintf(logfile, "Command:           Overlap (-m)\n");
204 |   if (opt_cluster)
205 |     fprintf(logfile, "Command:           Cluster (-c)\n");
206 |   if (opt_existence)
207 |     fprintf(logfile, "Command:           Existence (-x)\n");
208 |   if (opt_deduplicate)
209 |     fprintf(logfile, "Command:           Deduplicate (--deduplicate)\n");
210 | 
211 |   if (opt_matrix)
212 |     fprintf(logfile, "Repertoire set 1:  %s\n", input1_filename);
213 |   else
214 |     fprintf(logfile, "Repertoire:        %s\n", input1_filename);
215 |   if (opt_matrix)
216 |     fprintf(logfile, "Repertoire set 2:  %s\n", input2_filename ? input2_filename : "(same as set 1)");
217 |   if (opt_existence)
218 |     fprintf(logfile, "Repertoire set:    %s\n", input2_filename);
219 | 
220 |   fprintf(logfile, "Nucleotides (n):   %s\n", opt_nucleotides ? "Yes" : "No");
221 |   fprintf(logfile, "Differences (d):   %" PRId64 "\n", opt_differences);
222 |   fprintf(logfile, "Indels (i):        %s\n", opt_indels ? "Yes" : "No");
223 |   fprintf(logfile, "Ignore counts (f): %s\n",
224 |           opt_ignore_counts ? "Yes" : "No");
225 |   fprintf(logfile, "Ignore genes (g):  %s\n",
226 |           opt_ignore_genes ? "Yes" : "No");
227 |   fprintf(logfile, "Ign. unknown (u):  %s\n",
228 |           opt_ignore_unknown ? "Yes" : "No");
229 |   fprintf(logfile, "Ignore empty (e):  %s\n",
230 |           opt_ignore_empty ? "Yes" : "No");
231 |   fprintf(logfile, "Use cdr3 column:   %s\n",
232 |           opt_cdr3 ? "Yes" : "No");
233 |   fprintf(logfile, "Threads (t):       %" PRId64 "\n", opt_threads);
234 |   if (opt_no_matrix)
235 |     fprintf(logfile, "Output file (o):   (none)\n");
236 |   else
237 |     fprintf(logfile, "Output file (o):   %s\n", opt_output);
238 |   if (opt_matrix || opt_existence)
239 |     {
240 |       fprintf(logfile, "Output format (a): %s\n", opt_alternative ? "Column" : "Matrix");
241 |       fprintf(logfile, "Score (s):         %s\n", score_descr[opt_score_int]);
242 |       fprintf(logfile, "Pairs file (p):    %s\n", opt_pairs ? opt_pairs : "(none)");
243 |       fprintf(logfile, "Keep columns:      %s\n", opt_keep_columns ? opt_keep_columns : "");
244 |     }
245 |   fprintf(logfile, "Log file (l):      %s\n", opt_log ? opt_log : "(stderr)");
246 | }
247 | 
248 | void args_usage()
249 | {
250 |   fprintf(stderr, "Usage: %s [OPTIONS] TSVFILE1 [TSVFILE2]\n", PROG_CMD);
251 |   fprintf(stderr, "\n");
252 |   fprintf(stderr, "Commands:\n");
253 |   fprintf(stderr, " -h, --help                  display this help and exit\n");
254 |   fprintf(stderr, " -v, --version               display version information\n");
255 |   fprintf(stderr, " -m, --matrix                compute overlap matrix between two sets\n");
256 |   fprintf(stderr, " -x, --existence             check existence of sequences in repertoires\n");
257 |   fprintf(stderr, " -c, --cluster               cluster sequences in one repertoire\n");
258 |   fprintf(stderr, " -z, --deduplicate           deduplicate sequences in repertoires\n");
259 |   fprintf(stderr, "\n");
260 |   fprintf(stderr, "General options:\n");
261 |   fprintf(stderr, " -d, --differences INTEGER   number of differences accepted (0*)\n");
262 |   fprintf(stderr, " -i, --indels                allow insertions or deletions when d=1\n");
263 |   fprintf(stderr, " -f, --ignore-counts         ignore duplicate_count information\n");
264 |   fprintf(stderr, " -g, --ignore-genes          ignore V and J gene information\n");
265 |   fprintf(stderr, " -n, --nucleotides           compare nucleotides, not amino acids\n");
266 |   fprintf(stderr, " -s, --score STRING          MH, Jaccard, product*, ratio, min, max, or mean\n");
267 |   fprintf(stderr, " -t, --threads INTEGER       number of threads to use (1*-256)\n");
268 |   fprintf(stderr, " -u, --ignore-unknown        ignore sequences with unknown symbols\n");
269 |   fprintf(stderr, " -e, --ignore-empty          ignore empty sequences\n");
270 |   fprintf(stderr, "\n");
271 |   fprintf(stderr, "Input/output options:\n");
272 |   fprintf(stderr, " -a, --alternative           output results in three-column format, not matrix\n");
273 |   fprintf(stderr, "     --cdr3                  use the cdr3(_aa) column instead of junction(_aa)\n");
274 |   fprintf(stderr, "     --distance              include sequence distance in pairs file\n");
275 |   fprintf(stderr, " -k, --keep-columns STRING   comma-separated columns to copy to pairs file\n");
276 |   fprintf(stderr, " -l, --log FILENAME          log to file (stderr*)\n");
277 |   fprintf(stderr, " -o, --output FILENAME       output results to file (stdout*)\n");
278 |   fprintf(stderr, "     --no-matrix             do not keep or output any matrix\n");
279 |   fprintf(stderr, " -p, --pairs FILENAME        output matching pairs to file (none*)\n");
280 |   fprintf(stderr, "\n");
281 |   fprintf(stderr, "                             * default value\n");
282 |   fprintf(stderr, "\n");
283 | }
284 | 
285 | void show_header()
286 | {
287 |   fprintf(logfile, "%s %s - %s\n", PROG_NAME, PROG_VERSION, PROG_BRIEF);
288 |   fprintf(logfile, "https://github.com/uio-bmi/compairr\n");
289 |   fprintf(logfile, "\n");
290 | }
291 | 
292 | void args_init(int argc, char **argv)
293 | {
294 |   /* Set defaults */
295 | 
296 |   progname = argv[0];
297 |   input1_filename = nullptr;
298 |   input2_filename = nullptr;
299 | 
300 |   opt_alternative = false;
301 |   opt_cdr3 = false;
302 |   opt_cluster = false;
303 |   opt_deduplicate = false;
304 |   opt_distance = false;
305 |   opt_differences = 0;
306 |   opt_existence = false;
307 |   opt_help = false;
308 |   opt_ignore_counts = false;
309 |   opt_ignore_genes = false;
310 |   opt_ignore_unknown = false;
311 |   opt_ignore_empty = false;
312 |   opt_indels = false;
313 |   opt_keep_columns = nullptr;
314 |   opt_log = nullptr;
315 |   opt_matrix = false;
316 |   opt_nucleotides = false;
317 |   opt_no_matrix = false;
318 |   opt_output = DASH_FILENAME;
319 |   opt_pairs = nullptr;
320 |   opt_score_int = 0;
321 |   opt_score_string = NULL;
322 |   opt_threads = 1;
323 |   opt_version = false;
324 | 
325 |   opterr = 1;
326 | 
327 |   char short_options[] = "acd:efghik:l:mno:p:s:t:uvxz";
328 | 
329 |   /* unused short option letters: bjqrwy */
330 | 
331 |   static struct option long_options[] =
332 |   {
333 |     {"alternative",      no_argument,       nullptr, 'a' },
334 |     {"cdr3",             no_argument,       nullptr, 0   },
335 |     {"cluster",          no_argument,       nullptr, 'c' },
336 |     {"differences",      required_argument, nullptr, 'd' },
337 |     {"distance",         no_argument,       nullptr, 0   },
338 |     {"ignore-empty",     no_argument,       nullptr, 'e' },
339 |     {"ignore-counts",    no_argument,       nullptr, 'f' },
340 |     {"ignore-genes",     no_argument,       nullptr, 'g' },
341 |     {"help",             no_argument,       nullptr, 'h' },
342 |     {"indels",           no_argument,       nullptr, 'i' },
343 |     {"keep-columns",     required_argument, nullptr, 'k' },
344 |     {"log",              required_argument, nullptr, 'l' },
345 |     {"matrix",           no_argument,       nullptr, 'm' },
346 |     {"nucleotides",      no_argument,       nullptr, 'n' },
347 |     {"no-matrix",        no_argument,       nullptr, 0   },
348 |     {"output",           required_argument, nullptr, 'o' },
349 |     {"pairs",            required_argument, nullptr, 'p' },
350 |     {"score",            required_argument, nullptr, 's' },
351 |     {"summands",         required_argument, nullptr, 's' },
352 |     {"threads",          required_argument, nullptr, 't' },
353 |     {"ignore-unknown",   no_argument,       nullptr, 'u' },
354 |     {"version",          no_argument,       nullptr, 'v' },
355 |     {"existence",        no_argument,       nullptr, 'x' },
356 |     {"deduplicate",      no_argument,       nullptr, 'z' },
357 |     {nullptr,            0,                 nullptr, 0   }
358 |   };
359 | 
360 |   enum
361 |     {
362 |       option_alternative,
363 |       option_cdr3,
364 |       option_cluster,
365 |       option_differences,
366 |       option_distance,
367 |       option_ignore_empty,
368 |       option_ignore_counts,
369 |       option_ignore_genes,
370 |       option_help,
371 |       option_indels,
372 |       option_keep_columns,
373 |       option_log,
374 |       option_matrix,
375 |       option_nucleotides,
376 |       option_no_matrix,
377 |       option_output,
378 |       option_pairs,
379 |       option_score,
380 |       option_summands,
381 |       option_threads,
382 |       option_ignore_unknown,
383 |       option_version,
384 |       option_existence,
385 |       option_deduplicate
386 |     };
387 | 
388 |   int used_options[26] = { 0, 0, 0, 0, 0,
389 |                            0, 0, 0, 0, 0,
390 |                            0, 0, 0, 0, 0,
391 |                            0, 0, 0, 0, 0,
392 |                            0, 0, 0, 0, 0,
393 |                            0 };
394 | 
395 |   int option_index = 0;
396 |   int c;
397 | 
398 |   while ((c = getopt_long(argc, argv, short_options, long_options, &option_index)) != -1)
399 |   {
400 | 
401 |     /* check if any option is specified more than once */
402 | 
403 |     if ((c >= 'a') && (c <= 'z'))
404 |       {
405 |         int optindex = c - 'a';
406 |         if (used_options[optindex] == 1)
407 |           {
408 |             int longoptindex = 0;
409 |             while (long_options[longoptindex].name)
410 |               {
411 |                 if (long_options[longoptindex].val == c)
412 |                   break;
413 |                 longoptindex++;
414 |               }
415 | 
416 |             fprintf(stderr,
417 |                     "Error: Option -%c or --%s specified more than once.\n",
418 |                     c,
419 |                     long_options[longoptindex].name);
420 |             exit(1);
421 |           }
422 |         used_options[optindex] = 1;
423 |       }
424 | 
425 |     switch(c)
426 |       {
427 |       case 'a':
428 |         /* alternative */
429 |         opt_alternative = true;
430 |         break;
431 | 
432 |       case 'c':
433 |         /* cluster */
434 |         opt_cluster = true;
435 |         break;
436 | 
437 |       case 'd':
438 |         /* differences */
439 |         opt_differences = args_long(optarg, "-d or --differences");
440 |         break;
441 | 
442 |       case 'e':
443 |         /* ignore-empty */
444 |         opt_ignore_empty = true;
445 |         break;
446 | 
447 |       case 'f':
448 |         /* ignore-counts */
449 |         opt_ignore_counts = true;
450 |         break;
451 | 
452 |       case 'g':
453 |         /* ignore-genes */
454 |         opt_ignore_genes = true;
455 |         break;
456 | 
457 |       case 'h':
458 |         /* help */
459 |         opt_help = true;
460 |         break;
461 | 
462 |       case 'i':
463 |         /* indels */
464 |         opt_indels = true;
465 |         break;
466 | 
467 |       case 'k':
468 |         /* keep_columns */
469 |         opt_keep_columns = optarg;
470 |         break;
471 | 
472 |       case 'l':
473 |         /* log */
474 |         opt_log = optarg;
475 |         break;
476 | 
477 |       case 'm':
478 |         /* matrix */
479 |         opt_matrix = true;
480 |         break;
481 | 
482 |       case 'n':
483 |         /* nucleotides */
484 |         opt_nucleotides = true;
485 |         break;
486 | 
487 |       case 'o':
488 |         /* output-file */
489 |         opt_output = optarg;
490 |         break;
491 | 
492 |       case 'p':
493 |         /* pairs-file */
494 |         opt_pairs = optarg;
495 |         break;
496 | 
497 |       case 's':
498 |         /* score, summands */
499 |         opt_score_string = optarg;
500 |         break;
501 | 
502 |       case 't':
503 |         /* threads */
504 |         opt_threads = args_long(optarg, "-t or --threads");
505 |         break;
506 | 
507 |       case 'u':
508 |         /* ignore-unknown */
509 |         opt_ignore_unknown = true;
510 |         break;
511 | 
512 |       case 'v':
513 |         /* version */
514 |         opt_version = true;
515 |         break;
516 | 
517 |       case 'x':
518 |         /* existence */
519 |         opt_existence = true;
520 |         break;
521 | 
522 |       case 'z':
523 |         /* deduplicate */
524 |         opt_deduplicate = true;
525 |         break;
526 | 
527 |       case 0:
528 |         /* long options only */
529 | 
530 |         switch (option_index)
531 |           {
532 |           case option_cdr3:
533 |             /* cdr3 */
534 |             opt_cdr3 = true;
535 |             break;
536 | 
537 |           case option_distance:
538 |             /* distance */
539 |             opt_distance = true;
540 |             break;
541 | 
542 |           case option_no_matrix:
543 |             /* no_matrix */
544 |             opt_no_matrix = true;
545 |             break;
546 | 
547 |           default:
548 |             show_header();
549 |             args_usage();
550 |             exit(1);
551 |           }
552 |         break;
553 | 
554 |       default:
555 |         show_header();
556 |         args_usage();
557 |         exit(1);
558 |     }
559 |   }
560 | 
561 |   int cmd_count = opt_help + opt_version + opt_matrix + opt_cluster + opt_existence + opt_deduplicate;
562 |   if (cmd_count == 0)
563 |     fatal("Please specify a command (--help, --version, --matrix, --existence, --cluster, or --deduplicate)");
564 |   if (cmd_count > 1)
565 |     fatal("Please specify just one command (--help, --version, --matrix, --existence, --cluster, or --deduplicate)");
566 | 
567 |   if (opt_help || opt_version)
568 |     {
569 |       if (optind != argc)
570 |         fatal("Incorrect number of arguments");
571 |     }
572 |   else if (opt_matrix)
573 |     {
574 |       if (optind + 2 == argc)
575 |         {
576 |           input1_filename = argv[optind];
577 |           input2_filename = argv[optind + 1];
578 |         }
579 |       else if (optind + 1 == argc)
580 |         {
581 |           input1_filename = argv[optind];
582 |           input2_filename = 0;
583 |         }
584 |       else
585 |         {
586 |           fatal("Incorrect number of arguments. One or two input files must be specified.");
587 |         }
588 |     }
589 |   else if (opt_existence)
590 |     {
591 |       if (optind + 2 == argc)
592 |         {
593 |           input1_filename = argv[optind];
594 |           input2_filename = argv[optind + 1];
595 |         }
596 |       else
597 |         {
598 |           fatal("Incorrect number of arguments. Two input files must be specified.");
599 |         }
600 |     }
601 |   else if (opt_cluster || opt_deduplicate)
602 |     {
603 |       if (optind + 1 == argc)
604 |         {
605 |           input1_filename = argv[optind];
606 |         }
607 |       else
608 |         {
609 |           fatal("Incorrect number of arguments. One input file must be specified.");
610 |         }
611 |     }
612 | 
613 |   if (opt_deduplicate)
614 |     {
615 |       if (opt_differences != 0)
616 |         fatal("Option -d or --differences must be 0 for deduplication.");
617 |       if (opt_indels)
618 |         fatal("Option -i or --indels is not allowed for deduplication.");
619 |     }
620 | 
621 |   if (opt_keep_columns)
622 |     {
623 |       if (! opt_pairs)
624 |         fatal("Option --keep-columns only allowed with --pairs options.");
625 |       if (! parse_keep_columns())
626 |         fatal("Illegal list of columns with --keep-columns option. It must be a comma-separated list of column names. Allowed symbols: A-Z, a-z, _, and 0-9.");
627 |     }
628 | 
629 |   if ((opt_threads < 1) || (opt_threads > MAX_THREADS))
630 |     {
631 |       fprintf(stderr, "\nError: Illegal number of threads specified with "
632 |               "-t or --threads, must be in the range 1 to %u.\n", MAX_THREADS);
633 |       exit(1);
634 |     }
635 | 
636 |   if (opt_differences < 0)
637 |     fatal("Differences specified with -d or -differences cannot be negative.");
638 | 
639 |   if (opt_indels && (opt_differences != 1))
640 |     fatal("Indels are only allowed when d=1");
641 | 
642 |   if (opt_cluster)
643 |     {
644 |       if (opt_pairs)
645 |         fatal("Option -p or --pairs is not allowed with -c or --cluster");
646 |       if (opt_alternative)
647 |         fatal("Option -a or --alternative is not allowed with -c or --cluster");
648 |       if (opt_score_string)
649 |         fatal("Option -s or --score is not allowed with -c or --cluster");
650 |     }
651 | 
652 |   if (opt_score_string)
653 |     {
654 |       opt_score_int = -1;
655 |       for(int i = 0; i < score_end; i++)
656 |         if (strcasecmp(opt_score_string, score_options[i]) == 0)
657 |           {
658 |             opt_score_int = i;
659 |             break;
660 |           }
661 |       if (opt_score_int < 0)
662 |         {
663 |           fatal("Argument to -s or --score must be MH, Jaccard, product, ratio, min, max or mean");
664 |         }
665 |     }
666 | 
667 |   if (! opt_matrix)
668 |     {
669 |       if (opt_score_int == score_mh)
670 |         {
671 |           fatal("The Morisita-Horn index is only allowed when computing repertoire overlap");
672 |         }
673 |       if (opt_score_int == score_jaccard)
674 |         {
675 |           fatal("The Jaccard index is only allowed when computing repertoire overlap");
676 |         }
677 |     }
678 | 
679 |   if (opt_differences > 0)
680 |     {
681 |       if (opt_score_int == score_mh)
682 |         {
683 |           fatal("The Morisita-Horn index is not defined when d>0");
684 |         }
685 |       if (opt_score_int == score_jaccard)
686 |         {
687 |           fatal("The Jaccard index is not defined when d>0");
688 |         }
689 |     }
690 | 
691 |   if (opt_nucleotides)
692 |     alphabet_size = 4;
693 |   else
694 |     alphabet_size = 20;
695 | 
696 |   if (opt_cdr3)
697 |     if (opt_nucleotides)
698 |       seq_header = "cdr3";
699 |     else
700 |       seq_header = "cdr3_aa";
701 |   else
702 |     if (opt_nucleotides)
703 |       seq_header = "junction";
704 |     else
705 |       seq_header = "junction_aa";
706 | }
707 | 
708 | void open_files()
709 | {
710 |   /* Open the streams used for writing. The log file is opened only   */
711 |   /* when opt_log is set (logfile is left untouched otherwise), the   */
712 |   /* output file always, and the pairs file only when opt_pairs is    */
713 |   /* set. A stream that cannot be opened is reported through fatal(). */
714 | 
715 |   if (opt_log && ! (logfile = fopen_output(opt_log)))
716 |     {
717 |       fatal("Unable to open log file for writing.");
718 |     }
719 | 
720 |   if (! (outfile = fopen_output(opt_output)))
721 |     {
722 |       fatal("Unable to open output file for writing.");
723 |     }
724 | 
725 |   if (opt_pairs && ! (pairsfile = fopen_output(opt_pairs)))
726 |     {
727 |       fatal("Unable to open pairs file for writing.");
728 |     }
729 | }
730 | 
731 | void close_files()
732 | {
733 |   /* close the streams that are open; stderr, the default log stream, */
734 |   /* is left open so that late diagnostics can still be written       */
735 |   if (pairsfile)
736 |     fclose(pairsfile);
737 |   if (outfile)
738 |     fclose(outfile);
739 |   if (logfile && (logfile != stderr))
740 |     fclose(logfile);
741 | }
742 | 
743 | int main(int argc, char** argv)
744 | {
745 |   /* Program entry point: parse the command line, open the files, run */
746 |   /* the selected command and release the keep-columns arrays.        */
747 | 
748 |   logfile = stderr;   /* the default; open_files() may replace it */
749 |   arch_srandom(1);
750 |   args_init(argc, argv);
751 |   open_files();
752 | 
753 |   /* the header is shown for every command, the usage for help only */
754 |   show_header();
755 |   if (opt_help)
756 |     args_usage();
757 | 
758 |   /* nothing more to do for the help and version commands */
759 |   if (opt_help || opt_version)
760 |     {
761 |       close_files();
762 |       exit(0);
763 |     }
764 | 
765 |   show_time("Start time:        ");
766 |   args_show();
767 |   fprintf(logfile, "\n");
768 | 
769 |   /* overlap handles matrix and existence; otherwise dedup or cluster */
770 |   if (! (opt_matrix || opt_existence))
771 |     {
772 |       if (opt_deduplicate)
773 |         dedup(input1_filename);
774 |       else
775 |         cluster(input1_filename);
776 |     }
777 |   else
778 |     {
779 |       overlap(input1_filename, input2_filename);
780 |     }
781 | 
782 |   show_time("End time:          ");
783 | 
784 |   /* release the keep-columns arrays that were allocated */
785 |   if (keep_columns_no)
786 |     xfree(keep_columns_no);
787 |   keep_columns_no = nullptr;
788 | 
789 |   if (keep_columns_names)
790 |     xfree(keep_columns_names);
791 |   keep_columns_names = nullptr;
792 | 
793 |   if (keep_columns_strings)
794 |     xfree(keep_columns_strings);
795 |   keep_columns_strings = nullptr;
796 | 
797 |   close_files();
798 | }
799 | 


--------------------------------------------------------------------------------
/src/compairr.h:
--------------------------------------------------------------------------------
  1 | /*
  2 |     Copyright (C) 2012-2021 Torbjorn Rognes and Frederic Mahe
  3 | 
  4 |     This program is free software: you can redistribute it and/or modify
  5 |     it under the terms of the GNU Affero General Public License as
  6 |     published by the Free Software Foundation, either version 3 of the
  7 |     License, or (at your option) any later version.
  8 | 
  9 |     This program is distributed in the hope that it will be useful,
 10 |     but WITHOUT ANY WARRANTY; without even the implied warranty of
 11 |     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 12 |     GNU Affero General Public License for more details.
 13 | 
 14 |     You should have received a copy of the GNU Affero General Public License
 15 |     along with this program.  If not, see <http://www.gnu.org/licenses/>.
 16 | 
 17 |     Contact: Torbjorn Rognes <torognes@ifi.uio.no>,
 18 |     Department of Informatics, University of Oslo,
 19 |     PO Box 1080 Blindern, NO-0316 Oslo, Norway
 20 | */
 21 | 
 22 | #include <inttypes.h>
 23 | 
 24 | #ifndef PRIu64
 25 | #ifdef _WIN32
 26 | #define PRIu64 "I64u"
 27 | #else
 28 | #define PRIu64 "lu"
 29 | #endif
 30 | #endif
 31 | 
 32 | #ifndef PRId64
 33 | #ifdef _WIN32
 34 | #define PRId64 "I64d"
 35 | #else
 36 | #define PRId64 "ld"
 37 | #endif
 38 | #endif
 39 | 
 40 | #include <assert.h>
 41 | #include <stdio.h>
 42 | #include <string.h>
 43 | #include <pthread.h>
 44 | #include <getopt.h>
 45 | #include <stdlib.h>
 46 | #include <regex.h>
 47 | #include <limits.h>
 48 | #include <stdarg.h>
 49 | #include <fcntl.h>
 50 | #include <unistd.h>
 51 | #include <math.h>
 52 | #include <sys/types.h>
 53 | #include <sys/stat.h>
 54 | 
 55 | #include <chrono>
 56 | #include <string>
 57 | #include <vector>
 58 | #include <map>
 59 | 
 60 | #ifdef __APPLE__
 61 | #include <sys/resource.h>
 62 | #include <sys/sysctl.h>
 63 | #elif defined _WIN32
 64 | #include <windows.h>
 65 | #include <psapi.h>
 66 | #else
 67 | #include <sys/resource.h>
 68 | #include <sys/sysinfo.h>
 69 | #endif
 70 | 
 71 | #ifdef __aarch64__
 72 | 
 73 | #include <arm_neon.h>
 74 | 
 75 | #elif defined __x86_64__
 76 | 
 77 | #ifdef __SSE2__
 78 | #include <emmintrin.h>
 79 | #endif
 80 | 
 81 | #ifdef __SSSE3__
 82 | #include <tmmintrin.h>
 83 | #endif
 84 | 
 85 | #define CAST_m128i_ptr(x) (reinterpret_cast<__m128i*>(x))
 86 | 
 87 | #elif defined __PPC__
 88 | 
 89 | #ifdef __LITTLE_ENDIAN__
 90 | #include <altivec.h>
 91 | #else
 92 | #error Big endian ppc64 CPUs not supported
 93 | #endif
 94 | 
 95 | #else
 96 | 
 97 | #error Unknown architecture
 98 | #endif
 99 | 
100 | static_assert(INT_MAX > 32767, "Your compiler uses very short integers.");
101 | 
102 | /* constants: program identification, limits and helper macros */
103 | 
104 | #define PROG_CMD "compairr"
105 | #define PROG_NAME "CompAIRR"
106 | #define PROG_VERSION "1.13.0"
107 | #define PROG_BRIEF "Comparison of Adaptive Immune Receptor Repertoires"
108 | 
109 | const unsigned int MAX_THREADS = 256; /* upper limit for -t or --threads */
110 | 
111 | const int MAX_INSERTS = 3;
112 | 
113 | const int MAXDIFF_HASH = 2;
114 | /* MIN and MAX evaluate one argument twice: avoid side effects in them */
115 | #ifndef MIN
116 | #define MIN(x,y) ((x)<(y)?(x):(y))
117 | #endif
118 | 
119 | #ifndef MAX
120 | #define MAX(x,y) ((x)>(y)?(x):(y))
121 | #endif
122 | 
123 | extern int alphabet_size; /* 4 when opt_nucleotides is set, otherwise 20 */
124 | 
125 | enum /* scoring methods; opt_score_int holds one of these values */
126 |   { /* order must match the score_options array used to parse -s / --score */
127 |     score_product,
128 |     score_ratio,
129 |     score_min,
130 |     score_max,
131 |     score_mean,
132 |     score_mh,      /* Morisita-Horn index */
133 |     score_jaccard, /* Jaccard index */
134 |     score_end      /* number of methods; not a method itself */
135 |   };
136 | 
137 | /* common data: option values, shared file handles and columns to keep */
138 | 
139 | extern bool opt_alternative;
140 | extern bool opt_cluster;
141 | extern bool opt_cdr3;
142 | extern bool opt_distance;
143 | extern bool opt_existence;
144 | extern bool opt_help;
145 | extern bool opt_ignore_counts;
146 | extern bool opt_ignore_genes;
147 | extern bool opt_ignore_unknown;
148 | extern bool opt_ignore_empty;
149 | extern bool opt_indels;
150 | extern bool opt_matrix;
151 | extern bool opt_nucleotides;
152 | extern bool opt_no_matrix;
153 | extern bool opt_version;
154 | extern bool opt_deduplicate;
155 | extern char * opt_keep_columns; /* argument of --keep-columns */
156 | extern char * opt_log; /* name of the log file, if any */
157 | extern char * opt_output_file; /* NOTE(review): compairr.cc uses opt_output; confirm this name is defined */
158 | extern char * opt_pairs; /* name of the pairs file, if any */
159 | extern char * opt_score_string; /* argument of -s or --score */
160 | extern int64_t opt_differences; /* argument of -d or --differences */
161 | extern int64_t opt_score_int; /* number of the selected scoring method */
162 | extern int64_t opt_threads; /* argument of -t or --threads */
163 | 
164 | extern const char * seq_header; /* junction, junction_aa, cdr3 or cdr3_aa */
165 | 
166 | extern FILE * outfile;
167 | extern FILE * logfile; /* stderr unless a log file is opened */
168 | extern FILE * pairsfile;
169 | 
170 | extern int keep_columns_count;
171 | extern int * keep_columns_no; /* header column number of each column to keep */
172 | extern char ** keep_columns_names;
173 | extern char ** keep_columns_strings; /* values of these columns on the line being parsed */
174 | 
175 | /* header files */
176 | 
177 | #include "util.h"
178 | #include "arch.h"
179 | #include "bloompat.h"
180 | #include "cluster.h"
181 | #include "db.h"
182 | #include "hashtable.h"
183 | #include "overlap.h"
184 | #include "threads.h"
185 | #include "variants.h"
186 | #include "zobrist.h"
187 | #include "dedup.h"
188 | 


--------------------------------------------------------------------------------
/src/db.cc:
--------------------------------------------------------------------------------
   1 | /*
   2 |     Copyright (C) 2012-2021 Torbjorn Rognes and Frederic Mahe
   3 | 
   4 |     This program is free software: you can redistribute it and/or modify
   5 |     it under the terms of the GNU Affero General Public License as
   6 |     published by the Free Software Foundation, either version 3 of the
   7 |     License, or (at your option) any later version.
   8 | 
   9 |     This program is distributed in the hope that it will be useful,
  10 |     but WITHOUT ANY WARRANTY; without even the implied warranty of
  11 |     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12 |     GNU Affero General Public License for more details.
  13 | 
  14 |     You should have received a copy of the GNU Affero General Public License
  15 |     along with this program.  If not, see <http://www.gnu.org/licenses/>.
  16 | 
  17 |     Contact: Torbjorn Rognes <torognes@ifi.uio.no>,
  18 |     Department of Informatics, University of Oslo,
  19 |     PO Box 1080 Blindern, NO-0316 Oslo, Norway
  20 | */
  21 | 
  22 | #include "compairr.h"
  23 | 
  24 | #include <string>
  25 | #include <map>
  26 | #include <vector>
  27 | 
  28 | /* How much memory for residues and sequences should we allocate each time? */
  29 | 
  30 | #define MEMCHUNK 1048576 /* bytes added to the residue buffer when it is full */
  31 | #define SEQCHUNK 65536 /* entries added to the sequence index when it is full */
  32 | 
  33 | static signed char map_aa[256] = /* character code to amino acid number */
  34 |   { /* letters of either case in ACDEFGHIKLMNPQRSTVWY give 0..19, all else -1 */
  35 |     -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
  36 |     -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
  37 |     -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
  38 |     -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
  39 |     -1,  0, -1,  1,  2,  3,  4,  5,  6,  7, -1,  8,  9, 10, 11, -1,
  40 |     12, 13, 14, 15, 16, -1, 17, 18, -1, 19, -1, -1, -1, -1, -1, -1,
  41 |     -1,  0, -1,  1,  2,  3,  4,  5,  6,  7, -1,  8,  9, 10, 11, -1,
  42 |     12, 13, 14, 15, 16, -1, 17, 18, -1, 19, -1, -1, -1, -1, -1, -1,
  43 |     -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
  44 |     -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
  45 |     -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
  46 |     -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
  47 |     -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
  48 |     -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
  49 |     -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
  50 |     -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
  51 |   };
  52 | 
  53 | static signed char map_nt[256] = /* character code to nucleotide number */
  54 |   { /* either case: A=0, C=1, G=2, T=3 and U=3; all other characters -1 */
  55 |     -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
  56 |     -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
  57 |     -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
  58 |     -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
  59 |     -1,  0, -1,  1, -1, -1, -1,  2, -1, -1, -1, -1, -1, -1, -1, -1,
  60 |     -1, -1, -1, -1,  3,  3, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
  61 |     -1,  0, -1,  1, -1, -1, -1,  2, -1, -1, -1, -1, -1, -1, -1, -1,
  62 |     -1, -1, -1, -1,  3,  3, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
  63 |     -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
  64 |     -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
  65 |     -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
  66 |     -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
  67 |     -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
  68 |     -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
  69 |     -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
  70 |     -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
  71 |   };
  72 | 
  73 | const static char * aa_chars = "ACDEFGHIKLMNPQRSTVWY"; /* letter for each map_aa number */
  74 | const static char * nt_chars = "acgt"; /* letter for each map_nt number */
  75 | const char * EMPTYSTRING = "";
  76 | 
  77 | struct seqinfo_s /* one entry of the sequence index of a database */
  78 | {
  79 |   uint64_t hash;
  80 |   uint64_t count; /* duplicate_count value, or 1 when it is absent and counts are ignored */
  81 |   char * sequence_id; /* copy of the sequence_id value, or nullptr */
  82 |   char * seq;
  83 |   char * keep; /* extra columns to keep, tab separated */
  84 |   unsigned int seqlen; /* number of residues stored for the sequence */
  85 |   int repertoire_id_no; /* index into repertoire_id_vector of the database */
  86 |   int v_gene_no;
  87 |   int j_gene_no;
  88 | };
  89 | 
  90 | typedef struct seqinfo_s seqinfo_t;
  91 | 
  92 | struct db /* sequences, repertoire names and header columns of a data set */
  93 | {
  94 |   seqinfo_t * seqindex; /* array with one entry per stored sequence */
  95 |   uint64_t seqindex_alloc; /* entries allocated in seqindex */
  96 |   uint64_t sequences; /* entries used in seqindex */
  97 |   unsigned int longest; /* largest seqlen stored */
  98 |   unsigned int shortest; /* smallest seqlen stored; UINT_MAX before any */
  99 |   char * residues_p; /* buffer with the residue numbers of the sequences */
 100 |   uint64_t residues_alloc; /* bytes allocated in residues_p */
 101 |   uint64_t residues_count; /* bytes used in residues_p */
 102 |   uint64_t total_duplicate_count; /* sum of the counts of the stored sequences */
 103 |   uint64_t repertoire_count;
 104 |   uint64_t ignored_unknown; /* NOTE(review): raised once per illegal character, not per sequence */
 105 |   uint64_t ignored_empty; /* sequences without residues that were skipped */
 106 |   std::vector<std::string> repertoire_id_vector; /* repertoire names by number */
 107 |   std::map<std::string, int> repertoire_id_map; /* repertoire numbers by name */
 108 |   int col_junction; /* col_*: number of the column in the header, from 1; 0 if absent */
 109 |   int col_junction_aa;
 110 |   int col_cdr3;
 111 |   int col_cdr3_aa;
 112 |   int col_duplicate_count;
 113 |   int col_v_call;
 114 |   int col_j_call;
 115 |   int col_repertoire_id;
 116 |   int col_sequence_id;
 117 | };
 118 | 
 119 | /* v and j gene names are kept in file-level tables, not in struct db, */
 120 | /* so they are common to all databases */
 121 | static std::vector<std::string> v_gene_vector;
 122 | static std::map<std::string, int> v_gene_map;
 123 | 
 124 | static std::vector<std::string> j_gene_vector;
 125 | static std::map<std::string, int> j_gene_map;
 126 | 
 127 | void db_init() /* start out with empty V and J gene tables */
 128 | {
 129 |   for (auto * names : { & v_gene_vector, & j_gene_vector })
 130 |     names->clear();
 131 |   for (auto * numbers : { & v_gene_map, & j_gene_map })
 132 |     numbers->clear();
 133 | }
 134 | 
 135 | void db_exit() /* forget all V and J gene names */
 136 | {
 137 |   for (auto * names : { & v_gene_vector, & j_gene_vector })
 138 |     names->clear();
 139 |   for (auto * numbers : { & v_gene_map, & j_gene_map })
 140 |     numbers->clear();
 141 | }
 142 | 
 143 | struct db * db_create()
 144 | {
 145 |   /* Create an empty database. Every scalar member is assigned here   */
 146 |   /* as "new db" leaves it indeterminate; the containers start empty. */
 147 |   struct db * d = new db;
 148 |   d->seqindex = nullptr;
 149 |   d->seqindex_alloc = 0;
 150 |   d->sequences = 0;
 151 |   d->longest = 0;
 152 |   d->shortest = UINT_MAX;
 153 |   d->residues_p = nullptr;
 154 |   d->residues_alloc = 0;
 155 |   d->residues_count = 0;
 156 |   d->total_duplicate_count = 0;
 157 |   d->repertoire_count = 0;
 158 |   d->ignored_unknown = 0; /* incremented while parsing: must start at 0 */
 159 |   d->ignored_empty = 0;   /* incremented while parsing: must start at 0 */
 160 |   d->col_junction = 0;
 161 |   d->col_junction_aa = 0;
 162 |   d->col_cdr3 = 0;
 163 |   d->col_cdr3_aa = 0;
 164 |   d->col_duplicate_count = 0;
 165 |   d->col_v_call = 0;
 166 |   d->col_j_call = 0;
 167 |   d->col_repertoire_id = 0;
 168 |   d->col_sequence_id = 0;
 169 |   return d;
 170 | }
 171 | 
 172 | void parse_airr_tsv_header(char * line,
 173 |                            struct db * d,
 174 |                            bool require_sequence_id)
 175 | {
 176 |   /* Parse the header line of an AIRR TSV file.
 177 | 
 178 |      line                 the tab-separated header; modified by strsep
 179 |      d                    database receiving the column numbers (from 1)
 180 |                           of the recognised columns in its col_* members
 181 |      require_sequence_id  whether a sequence_id column is essential
 182 | 
 183 |      The numbers of the columns to keep are stored in keep_columns_no.
 184 |      A name that occurs more than once takes the number of its last
 185 |      occurrence. When an essential column is missing, its name is
 186 |      written to the log and the program exits with status 1. A missing
 187 |      column to keep only results in a warning. */
 188 | 
 189 |   /* Forget the column numbers found in any header parsed earlier.
 190 |      Otherwise a column to keep that is absent from this header would
 191 |      silently retain the position it had in another file. */
 192 | 
 193 |   for (int j = 0; j < keep_columns_count; j++)
 194 |     keep_columns_no[j] = 0;
 195 | 
 196 |   /* the recognised column names and where their numbers are stored */
 197 | 
 198 |   const struct { const char * name; int * column; } known[] =
 199 |     {
 200 |       { "repertoire_id",   & d->col_repertoire_id   },
 201 |       { "sequence_id",     & d->col_sequence_id     },
 202 |       { "duplicate_count", & d->col_duplicate_count },
 203 |       { "v_call",          & d->col_v_call          },
 204 |       { "j_call",          & d->col_j_call          },
 205 |       { "junction",        & d->col_junction        },
 206 |       { "junction_aa",     & d->col_junction_aa     },
 207 |       { "cdr3",            & d->col_cdr3            },
 208 |       { "cdr3_aa",         & d->col_cdr3_aa         }
 209 |     };
 210 | 
 211 |   char delim[] = "\t";
 212 |   char * string = line;
 213 |   char * token = nullptr;
 214 |   int i = 1;
 215 | 
 216 |   while ((token = strsep(& string, delim)) != nullptr)
 217 |     {
 218 |       for (const auto & k : known)
 219 |         if (strcmp(token, k.name) == 0)
 220 |           {
 221 |             * k.column = i;
 222 |             break;
 223 |           }
 224 | 
 225 |       for (int j = 0; j < keep_columns_count; j++)
 226 |         if (strcmp(token, keep_columns_names[j]) == 0)
 227 |           keep_columns_no[j] = i;
 228 | 
 229 |       i++;
 230 |     }
 231 | 
 232 |   /* which sequence column is needed depends on the options */
 233 | 
 234 |   const char * seq_name = "junction_aa";
 235 |   int seq_column = d->col_junction_aa;
 236 |   if (opt_cdr3 && opt_nucleotides)
 237 |     {
 238 |       seq_name = "cdr3";
 239 |       seq_column = d->col_cdr3;
 240 |     }
 241 |   else if (opt_cdr3)
 242 |     {
 243 |       seq_name = "cdr3_aa";
 244 |       seq_column = d->col_cdr3_aa;
 245 |     }
 246 |   else if (opt_nucleotides)
 247 |     {
 248 |       seq_name = "junction";
 249 |       seq_column = d->col_junction;
 250 |     }
 251 | 
 252 |   /* the essential columns, in the order they are reported */
 253 | 
 254 |   const struct { const char * name; bool missing; } essential[] =
 255 |     {
 256 |       { "sequence_id",     require_sequence_id && ! d->col_sequence_id     },
 257 |       { "duplicate_count", ! opt_ignore_counts && ! d->col_duplicate_count },
 258 |       { "v_call",          ! opt_ignore_genes  && ! d->col_v_call          },
 259 |       { "j_call",          ! opt_ignore_genes  && ! d->col_j_call          },
 260 |       { seq_name,          ! seq_column                                    }
 261 |     };
 262 | 
 263 |   bool essential_missing = false;
 264 |   for (const auto & e : essential)
 265 |     if (e.missing)
 266 |       essential_missing = true;
 267 | 
 268 |   if (essential_missing)
 269 |     {
 270 |       fprintf(logfile,
 271 |         "\nMissing essential column(s) in header of AIRR TSV input file:");
 272 |       for (const auto & e : essential)
 273 |         if (e.missing)
 274 |           fprintf(logfile, " %s", e.name);
 275 |       fprintf(logfile, "\n");
 276 |       exit(1);
 277 |     }
 278 | 
 279 |   /* columns to keep that are missing only give a warning */
 280 | 
 281 |   bool keep_missing = false;
 282 |   for (int j = 0; j < keep_columns_count; j++)
 283 |     if (keep_columns_no[j] < 1)
 284 |       keep_missing = true;
 285 | 
 286 |   if (keep_missing)
 287 |     {
 288 |       fprintf(logfile,
 289 |               "\nWarning: missing column(s) to keep in header:");
 290 |       for (int j = 0; j < keep_columns_count; j++)
 291 |         if (keep_columns_no[j] < 1)
 292 |           fprintf(logfile, " %s", keep_columns_names[j]);
 293 |       fprintf(logfile, "\n");
 294 |     }
 295 | }
 296 | 
 297 | 
 298 | void parse_airr_tsv_line(char * line,
 299 |                          uint64_t lineno,
 300 |                          struct db * d,
 301 |                          bool require_sequence_id,
 302 |                          const char * default_repertoire_id)
 303 | {
 304 |   const char * repertoire_id = nullptr;
 305 |   const char * sequence_id = nullptr;
 306 |   const char * duplicate_count = nullptr;
 307 |   const char * v_call = nullptr;
 308 |   const char * j_call = nullptr;
 309 |   const char * junction = nullptr;
 310 |   const char * junction_aa = nullptr;
 311 |   const char * cdr3 = nullptr;
 312 |   const char * cdr3_aa = nullptr;
 313 | 
 314 |   for (int k = 0; k < keep_columns_count; k++)
 315 |     keep_columns_strings[k] = nullptr;
 316 | 
 317 |   char delim[] = "\t";
 318 |   char * string = line;
 319 |   char * token = nullptr;
 320 | 
 321 |   int i = 1;
 322 | 
 323 |   while ((token = strsep(& string, delim)) != nullptr)
 324 |     {
 325 |       if (i == d->col_repertoire_id)
 326 |         {
 327 |           repertoire_id = token;
 328 |         }
 329 |       else if (i == d->col_sequence_id)
 330 |         {
 331 |           sequence_id = token;
 332 |         }
 333 |       else if (i == d->col_duplicate_count)
 334 |         {
 335 |           duplicate_count = token;
 336 |         }
 337 |       else if (i == d->col_v_call)
 338 |         {
 339 |           v_call = token;
 340 |         }
 341 |       else if (i == d->col_j_call)
 342 |         {
 343 |           j_call = token;
 344 |         }
 345 |       else if (i == d->col_junction)
 346 |         {
 347 |           junction = token;
 348 |         }
 349 |       else if (i == d->col_junction_aa)
 350 |         {
 351 |           junction_aa = token;
 352 |         }
 353 |       else if (i == d->col_cdr3)
 354 |         {
 355 |           cdr3 = token;
 356 |         }
 357 |       else if (i == d->col_cdr3_aa)
 358 |         {
 359 |           cdr3_aa = token;
 360 |         }
 361 | 
 362 |       for (int k = 0; k < keep_columns_count; k++)
 363 |         if (i == keep_columns_no[k])
 364 |           keep_columns_strings[k] = token;
 365 | 
 366 |       i++;
 367 |     }
 368 | 
 369 | 
 370 |   /* make room for another entry */
 371 | 
 372 |   if (d->sequences >= d->seqindex_alloc)
 373 |     {
 374 |       d->seqindex_alloc += SEQCHUNK;
 375 |       d->seqindex = static_cast<seqinfo_t *>
 376 |         (xrealloc(d->seqindex, d->seqindex_alloc * sizeof(seqinfo_s)));
 377 |     }
 378 | 
 379 |   seqinfo_t * p = d->seqindex + d->sequences;
 380 | 
 381 | 
 382 |   /* make room for more residues */
 383 | 
 384 |   unsigned int len_estimate = 0;
 385 |   if (opt_cdr3)
 386 |     {
 387 |       if (opt_nucleotides)
 388 |         len_estimate = strlen(cdr3);
 389 |       else
 390 |         len_estimate = strlen(cdr3_aa);
 391 |     }
 392 |   else
 393 |     {
 394 |       if (opt_nucleotides)
 395 |         len_estimate = strlen(junction);
 396 |       else
 397 |         len_estimate = strlen(junction_aa);
 398 |     }
 399 | 
 400 |   if (d->residues_count + len_estimate > d->residues_alloc)
 401 |     {
 402 |       d->residues_alloc += MEMCHUNK;
 403 |       d->residues_p = static_cast<char *>
 404 |         (xrealloc(d->residues_p, d->residues_alloc));
 405 |     }
 406 | 
 407 | 
 408 |   /* scan and store sequence */
 409 | 
 410 |   char * q = d->residues_p + d->residues_count;
 411 |   unsigned int seqlen = 0;
 412 |   bool ignore_seq = false;
 413 | 
 414 |   for(unsigned int i = 0; i < len_estimate; i++)
 415 |     {
 416 |       unsigned char c;
 417 |       signed char m;
 418 |       if (opt_nucleotides)
 419 |         {
 420 |           if (opt_cdr3)
 421 |             c = cdr3[i];
 422 |           else
 423 |             c = junction[i];
 424 |           m = map_nt[static_cast<unsigned int>(c)];
 425 |         }
 426 |       else
 427 |         {
 428 |           if (opt_cdr3)
 429 |             c = cdr3_aa[i];
 430 |           else
 431 |             c = junction_aa[i];
 432 |           m = map_aa[static_cast<unsigned int>(c)];
 433 |         }
 434 | 
 435 |       if (m >= 0)
 436 |         {
 437 |           *q++ = m;
 438 |           seqlen++;
 439 |         }
 440 |       else
 441 |         {
 442 |           if ((c >= 32) && (c <= 126))
 443 |             {
 444 |               if (opt_ignore_unknown)
 445 |                 {
 446 |                   ignore_seq = true;
 447 |                   d->ignored_unknown++;
 448 |                 }
 449 |               else
 450 |                 {
 451 |                   fprintf(logfile,
 452 |                           "\n\nError: Illegal character '%c' in sequence "
 453 |                           "on line %" PRIu64 ". Use -u to ignore.\n",
 454 |                           c,
 455 |                           lineno);
 456 |                   exit(1);
 457 |                 }
 458 |             }
 459 |           else
 460 |             {
 461 |               fprintf(logfile,
 462 |                       "\n\nError: Illegal character (ascii no %d) in sequence "
 463 |                       "on line %" PRIu64 "\n",
 464 |                       c,
 465 |                       lineno);
 466 |               exit(1);
 467 |             }
 468 |         }
 469 |     }
 470 | 
 471 |   if (seqlen == 0)
 472 |     {
 473 |       if (opt_ignore_empty)
 474 |         {
 475 |           ignore_seq = true;
 476 |           d->ignored_empty++;
 477 |         }
 478 |       else
 479 |         {
 480 |           fprintf(logfile,
 481 |                   "\n\nError: Empty sequence in sequence "
 482 |                   "on line %" PRIu64 ". Use -e to ignore.\n",
 483 |                   lineno);
 484 |           exit(1);
 485 |         }
 486 |     }
 487 | 
 488 |   if (ignore_seq)
 489 |     {
 490 |       return;
 491 |     }
 492 |   else
 493 |     {
 494 |       d->residues_count += seqlen;
 495 |       p->seqlen = seqlen;
 496 |       if (seqlen > d->longest)
 497 |         d->longest = seqlen;
 498 |       if (seqlen < d->shortest)
 499 |         d->shortest = seqlen;
 500 |     }
 501 | 
 502 | 
 503 |   /* handle repertoire_id */
 504 | 
 505 |   if (! repertoire_id)
 506 |     {
 507 |       repertoire_id = default_repertoire_id;
 508 |     }
 509 | 
 510 |   auto r_it = d->repertoire_id_map.find(repertoire_id);
 511 |   if (r_it != d->repertoire_id_map.end())
 512 |     {
 513 |       p->repertoire_id_no = r_it->second;
 514 |     }
 515 |   else
 516 |     {
 517 |       p->repertoire_id_no = d->repertoire_id_vector.size();
 518 |       d->repertoire_id_vector.push_back(repertoire_id);
 519 |       d->repertoire_id_map.insert({repertoire_id, p->repertoire_id_no});
 520 |     }
 521 | 
 522 | 
 523 |   /* handle sequence_id */
 524 | 
 525 |   if (sequence_id && *sequence_id)
 526 |     {
 527 |       p->sequence_id = xstrdup(sequence_id);
 528 |     }
 529 |   else if (require_sequence_id)
 530 |     {
 531 |       fprintf(logfile,
 532 |               "\n\nError: missing or empty sequence_id value on line %"
 533 |               PRIu64 "\n",
 534 |               lineno);
 535 |       exit(1);
 536 |     }
 537 |   else
 538 |     {
 539 |       p->sequence_id = nullptr;
 540 |     }
 541 | 
 542 | 
 543 |   /* handle duplicate_count */
 544 | 
 545 |   if (duplicate_count && *duplicate_count)
 546 |     {
 547 |       char * endptr = nullptr;
 548 |       long count = strtol(duplicate_count, &endptr, 10);
 549 |       if (endptr && (*endptr == 0) && (count >= 1))
 550 |         {
 551 |           p->count = count;
 552 |         }
 553 |       else
 554 |         {
 555 |           fprintf(logfile, "\n\nError: Illegal duplicate_count on line %"
 556 |                   PRIu64 ": %s\n", lineno, duplicate_count);
 557 |           exit(1);
 558 |         }
 559 |     }
 560 |   else if (opt_ignore_counts)
 561 |     {
 562 |       p->count = 1;
 563 |     }
 564 |   else
 565 |     {
 566 |       fprintf(logfile,
 567 |               "\n\nError: missing or empty duplicate_count on line %"
 568 |               PRIu64 "\n",
 569 |               lineno);
 570 |       exit(1);
 571 |     }
 572 | 
 573 |   d->total_duplicate_count += p->count;
 574 | 
 575 | 
 576 |   /* handle v_call */
 577 | 
 578 |   if (! opt_ignore_genes && ! (v_call && *v_call))
 579 |     {
 580 |       fprintf(logfile,
 581 |               "\n\nError: missing or empty v_call value on line %"
 582 |               PRIu64 "\n",
 583 |               lineno);
 584 |       exit(1);
 585 |     }
 586 | 
 587 |   if (! v_call)
 588 |     {
 589 |       v_call = EMPTYSTRING;
 590 |     }
 591 | 
 592 |   auto v_it = v_gene_map.find(v_call);
 593 |   if (v_it != v_gene_map.end())
 594 |     {
 595 |       p->v_gene_no = v_it->second;
 596 |     }
 597 |   else
 598 |     {
 599 |       p->v_gene_no = v_gene_vector.size();
 600 |       v_gene_vector.push_back(v_call);
 601 |       v_gene_map.insert({v_call, p->v_gene_no});
 602 |     }
 603 | 
 604 | 
 605 |   /* handle j_call */
 606 | 
 607 |   if (! opt_ignore_genes && ! (j_call && *j_call))
 608 |     {
 609 |       fprintf(logfile,
 610 |               "\n\nError: missing or empty j_call value on line %"
 611 |               PRIu64 "\n",
 612 |               lineno);
 613 |       exit(1);
 614 |     }
 615 | 
 616 |   if (! j_call)
 617 |     {
 618 |       j_call = EMPTYSTRING;
 619 |     }
 620 | 
 621 |   auto j_it = j_gene_map.find(j_call);
 622 |   if (j_it != j_gene_map.end())
 623 |     {
 624 |       p->j_gene_no = j_it->second;
 625 |     }
 626 |   else
 627 |     {
 628 |       p->j_gene_no = j_gene_vector.size();
 629 |       j_gene_vector.push_back(j_call);
 630 |       j_gene_map.insert({j_call, p->j_gene_no});
 631 |     }
 632 | 
 633 | 
 634 |   /* handle junction(_aa) or cdr3(_aa) */
 635 | 
 636 |   bool seq_ok = false;
 637 |   if (opt_nucleotides)
 638 |     {
 639 |       if (opt_cdr3)
 640 |         {
 641 |           seq_ok = cdr3 && *cdr3;
 642 |         }
 643 |       else
 644 |         {
 645 |           seq_ok = junction && *junction;
 646 |         }
 647 |     }
 648 |   else
 649 |     {
 650 |       if (opt_cdr3)
 651 |         {
 652 |           seq_ok = cdr3_aa && *cdr3_aa;
 653 |         }
 654 |       else
 655 |         {
 656 |           seq_ok = junction_aa && *junction_aa;
 657 |         }
 658 |     }
 659 | 
 660 |   if (! seq_ok)
 661 |     {
 662 |       fprintf(logfile,
 663 |               "\n\nError: missing or empty %s value on line %"
 664 |               PRIu64 "\n",
 665 |               seq_header,
 666 |               lineno);
 667 |       exit(1);
 668 |     }
 669 | 
 670 | 
 671 |   /* handle keep_columns */
 672 | 
 673 |   unsigned int len = 0;
 674 |   for (int k = 0; k < keep_columns_count; k++)
 675 |     {
 676 |       if (keep_columns_strings[k])
 677 |         len += strlen(keep_columns_strings[k]);
 678 |       len++;
 679 |     }
 680 |   if (len > 0)
 681 |     p->keep = (char *) xmalloc(len);
 682 |   else
 683 |     p->keep = nullptr;
 684 | 
 685 |   len = 0;
 686 |   bool first = true;
 687 |   for (int k = 0; k < keep_columns_count; k++)
 688 |     {
 689 |       if (first)
 690 |         first = false;
 691 |       else
 692 |         p->keep[len++] = '\t';
 693 |       if (keep_columns_strings[k])
 694 |         {
 695 |           strcpy(p->keep + len, keep_columns_strings[k]);
 696 |           len += strlen(keep_columns_strings[k]);
 697 |           keep_columns_strings[k] = nullptr;
 698 |         }
 699 |     }
 700 |   if (p->keep)
 701 |     p->keep[len] = 0;
 702 | 
 703 |   p->hash = 0;
 704 | 
 705 |   d->sequences++;
 706 | }
 707 | 
 708 | void db_read(struct db * d,
 709 |              const char * filename,
 710 |              bool require_sequence_id,
 711 |              const char * default_repertoire_id)
 712 | {
 713 |   FILE * fp = nullptr;
 714 |   if (filename)
 715 |     {
 716 |       fp = fopen_input(filename);
 717 |       if (!fp)
 718 |         {
 719 |           fprintf(logfile,
 720 |                   "\nError: Unable to open input data file (%s).\n",
 721 |                   filename);
 722 |           exit(1);
 723 |         }
 724 |     }
 725 |   else
 726 |     fp = stdin;
 727 | 
 728 |   /* get file size */
 729 | 
 730 |   struct stat fs;
 731 | 
 732 |   if (fstat(fileno(fp), & fs))
 733 |     {
 734 |       fprintf(logfile, "\nUnable to fstat on input file (%s)\n", filename);
 735 |       exit(1);
 736 |     }
 737 |   bool is_regular = S_ISREG(fs.st_mode);
 738 |   uint64_t filesize = is_regular ? (uint64_t)(fs.st_size) : 0;
 739 |   uint64_t fileread = 0;
 740 | 
 741 |   if (! is_regular)
 742 |     fprintf(logfile, "Waiting for data from standard input...\n");
 743 | 
 744 |   size_t line_alloc = 4096;
 745 |   char * line = (char *) xmalloc(line_alloc);
 746 |   uint64_t lineno = 0;
 747 |   ssize_t linelen = 0;
 748 | 
 749 |   d->longest = 0;
 750 |   d->shortest = UINT_MAX;
 751 |   d->ignored_unknown = 0;
 752 |   d->ignored_empty = 0;
 753 | 
 754 |   int state = 0;
 755 | 
 756 |   progress_init("Reading sequences:", filesize);
 757 | 
 758 |   linelen = getline(& line, & line_alloc, fp);
 759 | 
 760 |   if (linelen < 0)
 761 |     fatal("Unable to read from the input file");
 762 | 
 763 |   fileread += linelen;
 764 | 
 765 |   if ((linelen > 0) && (line[linelen-1] == '\n'))
 766 |     {
 767 |       line[linelen-1] = 0;
 768 |       linelen--;
 769 |     }
 770 | 
 771 |   if ((linelen > 0) && (line[linelen-1] == '\r'))
 772 |     {
 773 |       line[linelen-1] = 0;
 774 |       linelen--;
 775 |     }
 776 | 
 777 |   while (linelen >= 0)
 778 |     {
 779 |       lineno++;
 780 | 
 781 |       if (state == 0)
 782 |         {
 783 |           if (line[0] == '#')
 784 |             {
 785 |               /* ignore initial comment section */
 786 |             }
 787 |           else if (line[0] == '@')
 788 |             {
 789 |               /* ignore initial comment section */
 790 |             }
 791 |           else
 792 |             {
 793 |               parse_airr_tsv_header(line,
 794 |                                     d,
 795 |                                     require_sequence_id);
 796 |               state = 1;
 797 |             }
 798 |         }
 799 |       else
 800 |         {
 801 |           parse_airr_tsv_line(line,
 802 |                               lineno,
 803 |                               d,
 804 |                               require_sequence_id,
 805 |                               default_repertoire_id);
 806 |         }
 807 | 
 808 |       /* update progress */
 809 | 
 810 |       if (is_regular)
 811 |         progress_update(fileread);
 812 | 
 813 |       /* get next line */
 814 | 
 815 |       linelen = getline(& line, & line_alloc, fp);
 816 | 
 817 |       if (linelen < 0)
 818 |         break;
 819 | 
 820 |       fileread += linelen;
 821 | 
 822 |       /* remove LF at end of line */
 823 | 
 824 |       if ((linelen > 0) && (line[linelen-1] == '\n'))
 825 |         {
 826 |           line[linelen-1] = 0;
 827 |           linelen--;
 828 |         }
 829 | 
 830 |       /* remove CR at end of line if from DOS/Windows */
 831 | 
 832 |       if ((linelen > 0) && (line[linelen-1] == '\r'))
 833 |         {
 834 |           line[linelen-1] = 0;
 835 |           linelen--;
 836 |         }
 837 |     }
 838 | 
 839 |   progress_done();
 840 | 
 841 |   if (line)
 842 |     xfree(line);
 843 |   line = nullptr;
 844 | 
 845 |   fclose(fp);
 846 | 
 847 |   d->repertoire_count = d->repertoire_id_vector.size();
 848 | 
 849 |   if (d->ignored_unknown > 0)
 850 |     fprintf(logfile, "%" PRIu64 " sequences with unknown symbols ignored.\n", d->ignored_unknown);
 851 | 
 852 |   if (d->ignored_empty > 0)
 853 |     fprintf(logfile, "%" PRIu64 " empty sequences ignored.\n", d->ignored_empty);
 854 | 
 855 |   if (d->sequences > 0)
 856 |     {
 857 |       fprintf(logfile,
 858 |               "Repertoires:       %" PRIu64 "\n"
 859 |               "Sequences:         %" PRIu64 "\n"
 860 |               "Residues:          %" PRIu64 "\n"
 861 |               "Shortest:          %u\n"
 862 |               "Longest:           %u\n"
 863 |               "Average length:    %.1lf\n"
 864 |               "Total dupl. count: %" PRIu64 "\n",
 865 |               d->repertoire_count,
 866 |               d->sequences,
 867 |               d->residues_count,
 868 |               d->shortest,
 869 |               d->longest,
 870 |               1.0 * d->residues_count / d->sequences,
 871 |               d->total_duplicate_count);
 872 |     }
 873 |   else
 874 |     {
 875 |       fprintf(logfile,
 876 |               "Repertoires:       %" PRIu64 "\n"
 877 |               "Sequences:         %" PRIu64 "\n"
 878 |               "Residues:          %" PRIu64 "\n"
 879 |               "Shortest:          -\n"
 880 |               "Longest:           -\n"
 881 |               "Average length:    -\n"
 882 |               "Total dupl. count: %" PRIu64 "\n",
 883 |               d->repertoire_count,
 884 |               d->sequences,
 885 |               d->residues_count,
 886 |               d->total_duplicate_count);
 887 |     }
 888 | 
 889 |   /* add sequence pointers to index table */
 890 | 
 891 |   progress_init("Indexing:         ", d->sequences);
 892 |   char * r = d->residues_p;
 893 |   for(uint64_t i = 0; i < d->sequences; i++)
 894 |     {
 895 |       seqinfo_s * p = d->seqindex + i;
 896 |       p->seq = r;
 897 |       r += p->seqlen;
 898 |       progress_update(i+1);
 899 |     }
 900 |   progress_done();
 901 | }
 902 | 
 903 | void db_hash(struct db * d)
 904 | {
 905 |   progress_init("Computing hashes: ", d->sequences);
 906 |   for(uint64_t i = 0; i < d->sequences; i++)
 907 |     {
 908 |       seqinfo_s * p = d->seqindex + i;
 909 |       d->seqindex[i].hash = zobrist_hash((unsigned char *)(p->seq),
 910 |                                          p->seqlen,
 911 |                                          p->v_gene_no,
 912 |                                          p->j_gene_no);
 913 |       progress_update(i+1);
 914 |     }
 915 |   progress_done();
 916 | }
 917 | 
 918 | void db_free(struct db * d)
 919 | {
 920 |   if (d->residues_p)
 921 |     xfree(d->residues_p);
 922 |   if (d->seqindex)
 923 |     {
 924 |       for (uint64_t i = 0; i < d->sequences; i++)
 925 |         {
 926 |           if (d->seqindex[i].sequence_id)
 927 |             {
 928 |               xfree(d->seqindex[i].sequence_id);
 929 |               d->seqindex[i].sequence_id = nullptr;
 930 |             }
 931 |           if (d->seqindex[i].keep)
 932 |             {
 933 |               xfree(d->seqindex[i].keep);
 934 |               d->seqindex[i].keep = nullptr;
 935 |             }
 936 |         }
 937 |       xfree(d->seqindex);
 938 |     }
 939 |   d->repertoire_id_vector.clear();
 940 |   d->repertoire_id_map.clear();
 941 |   delete d;
 942 | }
 943 | 
 944 | uint64_t db_getsequencecount(struct db * d)
 945 | {
 946 |   return d->sequences;
 947 | }
 948 | 
 949 | uint64_t db_getresiduescount(struct db * d)
 950 | {
 951 |   return d->residues_count;
 952 | }
 953 | 
 954 | unsigned int db_getlongestsequence(struct db * d)
 955 | {
 956 |   return d->longest;
 957 | }
 958 | 
 959 | uint64_t db_get_repertoire_count(struct db * d)
 960 | {
 961 |   return d->repertoire_count;
 962 | }
 963 | 
 964 | uint64_t db_gethash(struct db * d, uint64_t seqno)
 965 | {
 966 |   return d->seqindex[seqno].hash;
 967 | }
 968 | 
 969 | char * db_getsequence(struct db * d, uint64_t seqno)
 970 | {
 971 |   return d->seqindex[seqno].seq;
 972 | }
 973 | 
 974 | unsigned int db_getsequencelen(struct db * d, uint64_t seqno)
 975 | {
 976 |   return d->seqindex[seqno].seqlen;
 977 | }
 978 | 
 979 | uint64_t db_get_v_gene(struct db * d, uint64_t seqno)
 980 | {
 981 |   return d->seqindex[seqno].v_gene_no;
 982 | }
 983 | 
 984 | uint64_t db_get_j_gene(struct db * d, uint64_t seqno)
 985 | {
 986 |   return d->seqindex[seqno].j_gene_no;
 987 | }
 988 | 
 989 | uint64_t db_get_count(struct db * d, uint64_t seqno)
 990 | {
 991 |   return d->seqindex[seqno].count;
 992 | }
 993 | 
 994 | int db_get_repertoire_id_no(struct db * d, uint64_t seqno)
 995 | {
 996 |   return d->seqindex[seqno].repertoire_id_no;
 997 | }
 998 | 
 999 | const char * db_get_repertoire_id(struct db * d, int repertoire_id_no)
1000 | {
1001 |   return d->repertoire_id_vector[repertoire_id_no].c_str();
1002 | }
1003 | 
1004 | char * db_get_sequence_id(struct db * d, uint64_t seqno)
1005 | {
1006 |   char * sid = d->seqindex[seqno].sequence_id;
1007 |   if (sid)
1008 |     return sid;
1009 |   else
1010 |     return (char *) EMPTYSTRING;
1011 | }
1012 | 
1013 | uint64_t db_get_v_gene_count()
1014 | {
1015 |   return v_gene_vector.size();
1016 | }
1017 | 
1018 | uint64_t db_get_j_gene_count()
1019 | {
1020 |   return j_gene_vector.size();
1021 | }
1022 | 
1023 | const char * db_get_v_gene_name(struct db * d, uint64_t seqno)
1024 | {
1025 |   int v_gene_no = d->seqindex[seqno].v_gene_no;
1026 |   return v_gene_vector[v_gene_no].c_str();
1027 | }
1028 | 
1029 | const char * db_get_j_gene_name(struct db * d, uint64_t seqno)
1030 | {
1031 |   int j_gene_no = d->seqindex[seqno].j_gene_no;
1032 |   return j_gene_vector[j_gene_no].c_str();
1033 | }
1034 | 
1035 | void db_fprint_sequence(FILE * f, struct db * d, uint64_t seqno)
1036 | {
1037 |   char * seq = db_getsequence(d, seqno);
1038 |   unsigned int len = db_getsequencelen(d, seqno);
1039 |   if (opt_nucleotides)
1040 |     {
1041 |       for (unsigned int i = 0; i < len; i++)
1042 |         fputc(nt_chars[(int)(seq[i])], f);
1043 |     }
1044 |   else
1045 |     {
1046 |       for (unsigned int i = 0; i < len; i++)
1047 |         fputc(aa_chars[(int)(seq[i])], f);
1048 |     }
1049 | }
1050 | 
1051 | char * db_get_keep_columns(struct db * d, uint64_t seqno)
1052 | {
1053 |   char * keep = d->seqindex[seqno].keep;
1054 |   if (keep)
1055 |     return keep;
1056 |   else
1057 |     return (char *) EMPTYSTRING;
1058 | }
1059 | 


--------------------------------------------------------------------------------
/src/db.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |     Copyright (C) 2012-2021 Torbjorn Rognes and Frederic Mahe
 3 | 
 4 |     This program is free software: you can redistribute it and/or modify
 5 |     it under the terms of the GNU Affero General Public License as
 6 |     published by the Free Software Foundation, either version 3 of the
 7 |     License, or (at your option) any later version.
 8 | 
 9 |     This program is distributed in the hope that it will be useful,
10 |     but WITHOUT ANY WARRANTY; without even the implied warranty of
11 |     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12 |     GNU Affero General Public License for more details.
13 | 
14 |     You should have received a copy of the GNU Affero General Public License
15 |     along with this program.  If not, see <http://www.gnu.org/licenses/>.
16 | 
17 |     Contact: Torbjorn Rognes <torognes@ifi.uio.no>,
18 |     Department of Informatics, University of Oslo,
19 |     PO Box 1080 Blindern, NO-0316 Oslo, Norway
20 | */
21 | 
/* structures and data types */

/* Opaque database handle; the structure itself is defined in db.cc */
struct db;



/* functions in db.cc */

/* NOTE(review): definition not reviewed here; dedup() calls this once
   before db_create() */
void db_init();

/* NOTE(review): definition not reviewed here; dedup() calls this once
   after db_free() */
void db_exit();

/* NOTE(review): definition not reviewed here; the returned handle is
   later released with db_free() */
struct db * db_create();

/* Free all data owned by d and d itself */
void db_free(struct db * d);

/* Read an AIRR TSV file into d. A null filename means standard input.
   Initial lines starting with '#' or '@' are skipped, the next line is
   the header. Terminates the program on input errors. */
void db_read(struct db * d,
             const char * filename,
             bool require_sequence_id,
             const char * default_repertoire_id);

/* Number of sequences stored in d */
uint64_t db_getsequencecount(struct db * d);

/* Number of distinct repertoires in d (set by db_read) */
uint64_t db_get_repertoire_count(struct db * d);

/* Total number of residues over all sequences in d */
uint64_t db_getresiduescount(struct db * d);

/* Length of the longest sequence in d */
unsigned int db_getlongestsequence(struct db * d);

/* Pointer to the stored residues of sequence seqno (no bounds check) */
char * db_getsequence(struct db * d, uint64_t seqno);

/* Length of sequence seqno (no bounds check) */
unsigned int db_getsequencelen(struct db * d, uint64_t seqno);

/* Hash of sequence seqno as stored by db_hash() (no bounds check) */
uint64_t db_gethash(struct db * d, uint64_t seqno);

/* V gene number of sequence seqno (no bounds check) */
uint64_t db_get_v_gene(struct db * d, uint64_t seqno);

/* J gene number of sequence seqno (no bounds check) */
uint64_t db_get_j_gene(struct db * d, uint64_t seqno);

/* Duplicate count of sequence seqno (no bounds check) */
uint64_t db_get_count(struct db * d, uint64_t seqno);

/* Repertoire number of sequence seqno (no bounds check) */
int db_get_repertoire_id_no(struct db * d, uint64_t seqno);

/* Name of repertoire number repertoire_id_no (no bounds check) */
const char * db_get_repertoire_id(struct db * d, int repertoire_id_no);

/* sequence_id of sequence seqno, or an empty string if none is stored */
char * db_get_sequence_id(struct db * d, uint64_t seqno);

/* Compute and store the hash of every sequence in d using
   zobrist_hash() */
void db_hash(struct db * d);

/* Number of distinct V gene names seen */
uint64_t db_get_v_gene_count();

/* Number of distinct J gene names seen */
uint64_t db_get_j_gene_count();

/* V gene name of sequence seqno (no bounds check) */
const char * db_get_v_gene_name(struct db * d, uint64_t seqno);

/* J gene name of sequence seqno (no bounds check) */
const char * db_get_j_gene_name(struct db * d, uint64_t seqno);

/* Print sequence seqno to f, translating the stored residue values to
   characters (nucleotides or amino acids depending on opt_nucleotides) */
void db_fprint_sequence(FILE * f, struct db * d, uint64_t seqno);

/* Kept-column text of sequence seqno, or an empty string if none */
char * db_get_keep_columns(struct db * d, uint64_t seqno);
82 | 


--------------------------------------------------------------------------------
/src/dedup.cc:
--------------------------------------------------------------------------------
  1 | /*
  2 |     Copyright (C) 2012-2022 Torbjorn Rognes and Frederic Mahe
  3 | 
  4 |     This program is free software: you can redistribute it and/or modify
  5 |     it under the terms of the GNU Affero General Public License as
  6 |     published by the Free Software Foundation, either version 3 of the
  7 |     License, or (at your option) any later version.
  8 | 
  9 |     This program is distributed in the hope that it will be useful,
 10 |     but WITHOUT ANY WARRANTY; without even the implied warranty of
 11 |     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 12 |     GNU Affero General Public License for more details.
 13 | 
 14 |     You should have received a copy of the GNU Affero General Public License
 15 |     along with this program.  If not, see <http://www.gnu.org/licenses/>.
 16 | 
 17 |     Contact: Torbjorn Rognes <torognes@ifi.uio.no>,
 18 |     Department of Informatics, University of Oslo,
 19 |     PO Box 1080 Blindern, NO-0316 Oslo, Norway
 20 | */
 21 | 
 22 | #include "compairr.h"
 23 | 
 24 | const uint64_t terminal = -1;
 25 | const uint64_t done = -2;
 26 | 
 27 | static void report(struct db * d,
 28 |                    uint64_t seed,
 29 |                    uint64_t * next_seq)
 30 | {
 31 |   if (next_seq[seed] == done)
 32 |     return;
 33 | 
 34 |   uint64_t count = opt_ignore_counts ? 1 : db_get_count(d, seed);
 35 |   uint64_t link = next_seq[seed];
 36 |   next_seq[seed] = done;
 37 |   while (link != terminal)
 38 |     {
 39 |       count += opt_ignore_counts ? 1 : db_get_count(d, link);
 40 |       uint64_t temp = next_seq[link];
 41 |       next_seq[link] = done;
 42 |       link = temp;
 43 |     }
 44 | 
 45 |   unsigned int seed_rep_id_no = db_get_repertoire_id_no(d, seed);
 46 |   const char * seed_rep_id = db_get_repertoire_id(d, seed_rep_id_no);
 47 |   const char * seed_v_gene_name = db_get_v_gene_name(d, seed);
 48 |   const char * seed_j_gene_name = db_get_j_gene_name(d, seed);
 49 | 
 50 |   fprintf(outfile, "%s", seed_rep_id);
 51 |   fprintf(outfile, "\t%" PRIu64, count);
 52 |   if (! opt_ignore_genes)
 53 |     fprintf(outfile, "\t%s\t%s", seed_v_gene_name, seed_j_gene_name);
 54 |   fprintf(outfile, "\t");
 55 |   db_fprint_sequence(outfile, d, seed);
 56 |   fprintf(outfile, "\n");
 57 | }
 58 | 
 59 | 
 60 | static bool process(struct db * d,
 61 |                     hashtable_s * ht,
 62 |                     struct bloom_s * b,
 63 |                     uint64_t seed,
 64 |                     uint64_t * next_seq)
 65 | {
 66 |   unsigned int seed_rep_id_no = db_get_repertoire_id_no(d, seed);
 67 |   uint64_t seed_v_gene = db_get_v_gene(d, seed);
 68 |   uint64_t seed_j_gene = db_get_j_gene(d, seed);
 69 |   unsigned char * seed_sequence
 70 |     = (unsigned char *) db_getsequence(d, seed);
 71 |   unsigned int seed_seqlen
 72 |     = db_getsequencelen(d, seed);
 73 | 
 74 |   uint64_t last = terminal;
 75 | 
 76 |   /* find the first empty bucket */
 77 |   uint64_t hash = db_gethash(d, seed);
 78 |   uint64_t j = hash_getindex(ht, hash);
 79 |   while (hash_is_occupied(ht, j))
 80 |     {
 81 | #if 1
 82 |       if (((b == nullptr) || bloom_get(b, hash)) &&
 83 |           (hash_compare_value(ht, j, hash)))
 84 | #else
 85 |       if (hash_compare_value(ht, j, hash))
 86 | #endif
 87 |         {
 88 |           uint64_t hit = hash_get_data(ht, j);
 89 | 
 90 |           /* check repertoire id match */
 91 |           unsigned int hit_rep_id_no = db_get_repertoire_id_no(d, hit);
 92 | 
 93 |           if (seed_rep_id_no == hit_rep_id_no)
 94 |             {
 95 |               /* double check that everything matches */
 96 |               unsigned int hit_v_gene = db_get_v_gene(d, hit);
 97 |               unsigned int hit_j_gene = db_get_j_gene(d, hit);
 98 | 
 99 |               if (opt_ignore_genes ||
100 |                   ((seed_v_gene == hit_v_gene) && (seed_j_gene == hit_j_gene)))
101 |                 {
102 |                   unsigned char * hit_sequence
103 |                     = (unsigned char *) db_getsequence(d, hit);
104 |                   unsigned int hit_seqlen
105 |                     = db_getsequencelen(d, hit);
106 | 
107 |                   if ((seed_seqlen == hit_seqlen) &&
108 |                       ! memcmp(seed_sequence, hit_sequence, seed_seqlen))
109 |                     {
110 |                       last = hit;
111 |                     }
112 |                 }
113 |             }
114 |         }
115 |       j = hash_getnextindex(ht, j);
116 |     }
117 | 
118 |   hash_set_occupied(ht, j);
119 |   hash_set_value(ht, j, hash);
120 |   hash_set_data(ht, j, seed);
121 | 
122 |   if (b)
123 |     bloom_set(b, hash);
124 | 
125 |   if (last != terminal)
126 |     {
127 |       next_seq[last] = seed;
128 |       return true;
129 |     }
130 |   else
131 |     return false;
132 | }
133 | 
134 | void dedup(char * filename)
135 | {
136 |   /* deduplicate a repertoire set */
137 | 
138 |   db_init();
139 | 
140 |   struct db * d1 = db_create();
141 | 
142 |   db_read(d1, filename, false, "1");
143 | 
144 |   unsigned int longestsequence = db_getlongestsequence(d1);
145 |   uint64_t sequences = db_getsequencecount(d1);
146 | 
147 |   fprintf(logfile, "Unique V genes:    %" PRIu64 "\n",
148 |           db_get_v_gene_count());
149 | 
150 |   fprintf(logfile, "Unique J genes:    %" PRIu64 "\n",
151 |           db_get_j_gene_count());
152 | 
153 | 
154 |   /* compute hashes for each sequence in database */
155 | 
156 |   zobrist_init(longestsequence,
157 |                db_get_v_gene_count(),
158 |                db_get_j_gene_count());
159 | 
160 |   db_hash(d1);
161 | 
162 |   /* store sequences in a hash table */
163 |   /* use an additional bloom filter for increased speed */
164 |   /* hashing into hash table & bloom filter */
165 | 
166 |   /* alloc and init array of flags indicating processed sequences */
167 | 
168 |   uint64_t * next_seq = (uint64_t *) xmalloc(sequences * sizeof(uint64_t));
169 |   for (uint64_t i = 0; i < sequences; i++)
170 |     next_seq[i] = terminal;
171 | 
172 |   uint64_t dup_seq = 0;
173 | 
174 |   hashtable_s * hashtable = hash_init(sequences);
175 |   struct bloom_s * bloom = bloom_init(hash_get_tablesize(hashtable));
176 | 
177 |   fprintf(outfile, "repertoire_id");
178 |   fprintf(outfile, "\tduplicate_count");
179 |   if (! opt_ignore_genes)
180 |     fprintf(outfile, "\tv_call\tj_call");
181 |   fprintf(outfile, "\t%s\n", seq_header);
182 | 
183 |   progress_init("Deduplicating:    ", sequences);
184 |   for(uint64_t i=0; i < sequences; i++)
185 |     {
186 |       if (process(d1, hashtable, bloom, i, next_seq))
187 |         dup_seq++;
188 |       progress_update(i);
189 |     }
190 |   progress_done();
191 | 
192 |   fprintf(logfile, "Duplicates merged: %" PRIu64 "\n", dup_seq);
193 | 
194 |   progress_init("Writing output:   ", sequences);
195 |   for(uint64_t i=0; i < sequences; i++)
196 |     {
197 |       report(d1, i, next_seq);
198 |       progress_update(i);
199 |     }
200 |   progress_done();
201 | 
202 |   fprintf(logfile, "\n");
203 | 
204 |   bloom_exit(bloom);
205 |   hash_exit(hashtable);
206 |   hashtable = nullptr;
207 | 
208 |   xfree(next_seq);
209 |   next_seq = nullptr;
210 | 
211 |   zobrist_exit();
212 | 
213 |   db_free(d1);
214 |   db_exit();
215 | }
216 | 


--------------------------------------------------------------------------------
/src/dedup.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |     Copyright (C) 2012-2021 Torbjorn Rognes and Frederic Mahe
 3 | 
 4 |     This program is free software: you can redistribute it and/or modify
 5 |     it under the terms of the GNU Affero General Public License as
 6 |     published by the Free Software Foundation, either version 3 of the
 7 |     License, or (at your option) any later version.
 8 | 
 9 |     This program is distributed in the hope that it will be useful,
10 |     but WITHOUT ANY WARRANTY; without even the implied warranty of
11 |     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12 |     GNU Affero General Public License for more details.
13 | 
14 |     You should have received a copy of the GNU Affero General Public License
15 |     along with this program.  If not, see <http://www.gnu.org/licenses/>.
16 | 
17 |     Contact: Torbjorn Rognes <torognes@ifi.uio.no>,
18 |     Department of Informatics, University of Oslo,
19 |     PO Box 1080 Blindern, NO-0316 Oslo, Norway
20 | */
21 | 
/* Deduplicate the repertoire set in filename: identical sequences within
   the same repertoire are merged and written once to the output with
   their summed duplicate count (defined in dedup.cc) */
void dedup(char * filename);
23 | 


--------------------------------------------------------------------------------
/src/hashtable.cc:
--------------------------------------------------------------------------------
 1 | /*
 2 |     Copyright (C) 2012-2021 Torbjorn Rognes and Frederic Mahe
 3 | 
 4 |     This program is free software: you can redistribute it and/or modify
 5 |     it under the terms of the GNU Affero General Public License as
 6 |     published by the Free Software Foundation, either version 3 of the
 7 |     License, or (at your option) any later version.
 8 | 
 9 |     This program is distributed in the hope that it will be useful,
10 |     but WITHOUT ANY WARRANTY; without even the implied warranty of
11 |     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12 |     GNU Affero General Public License for more details.
13 | 
14 |     You should have received a copy of the GNU Affero General Public License
15 |     along with this program.  If not, see <http://www.gnu.org/licenses/>.
16 | 
17 |     Contact: Torbjorn Rognes <torognes@ifi.uio.no>,
18 |     Department of Informatics, University of Oslo,
19 |     PO Box 1080 Blindern, NO-0316 Oslo, Norway
20 | */
21 | 
22 | #include "compairr.h"
23 | 
/* Maximum fill percentage of the hash table: hash_init() doubles the
   table size until the given number of sequences is at most this
   percentage of the number of buckets */
#define HASHFILLPCT 70
25 | 
26 | void hash_zap(struct hashtable_s * ht)
27 | {
28 |   memset(ht->hash_occupied, 0, (ht->hash_tablesize + 63) / 8);
29 | }
30 | 
31 | struct hashtable_s * hash_init(uint64_t sequences)
32 | {
33 |   struct hashtable_s * ht = (struct hashtable_s *)
34 |     xmalloc(sizeof(struct hashtable_s));
35 | 
36 |   ht->hash_tablesize = 1;
37 |   while (HASHFILLPCT * ht->hash_tablesize < 100 * sequences)
38 |     ht->hash_tablesize <<= 1;
39 | 
40 |   ht->hash_mask = ht->hash_tablesize - 1;
41 | 
42 |   ht->hash_occupied = static_cast<unsigned char *>
43 |     (xmalloc((ht->hash_tablesize + 63) / 8));
44 | 
45 |   hash_zap(ht);
46 | 
47 |   ht->hash_values = static_cast<uint64_t *>
48 |     (xmalloc(ht->hash_tablesize * sizeof(uint64_t)));
49 | 
50 |   ht->hash_data = static_cast<uint64_t *>
51 |     (xmalloc(ht->hash_tablesize * sizeof(uint64_t)));
52 | 
53 |   return ht;
54 | }
55 | 
56 | void hash_exit(struct hashtable_s * ht)
57 | {
58 |   xfree(ht->hash_occupied);
59 |   xfree(ht->hash_values);
60 |   xfree(ht->hash_data);
61 |   xfree(ht);
62 | }
63 | 


--------------------------------------------------------------------------------
/src/hashtable.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |     Copyright (C) 2012-2021 Torbjorn Rognes and Frederic Mahe
 3 | 
 4 |     This program is free software: you can redistribute it and/or modify
 5 |     it under the terms of the GNU Affero General Public License as
 6 |     published by the Free Software Foundation, either version 3 of the
 7 |     License, or (at your option) any later version.
 8 | 
 9 |     This program is distributed in the hope that it will be useful,
10 |     but WITHOUT ANY WARRANTY; without even the implied warranty of
11 |     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12 |     GNU Affero General Public License for more details.
13 | 
14 |     You should have received a copy of the GNU Affero General Public License
15 |     along with this program.  If not, see <http://www.gnu.org/licenses/>.
16 | 
17 |     Contact: Torbjorn Rognes <torognes@ifi.uio.no>,
18 |     Department of Informatics, University of Oslo,
19 |     PO Box 1080 Blindern, NO-0316 Oslo, Norway
20 | */
21 | 
/* Open-addressing hash table with linear probing; created by hash_init()
   and released by hash_exit() */
struct hashtable_s
{
  /* hash_tablesize - 1; used to wrap bucket indices */
  uint64_t hash_mask;
  /* full hash value stored in each bucket (hash_tablesize entries) */
  uint64_t * hash_values;
  /* data word stored in each bucket (hash_tablesize entries) */
  uint64_t * hash_data;
  /* number of buckets; a power of two */
  uint64_t hash_tablesize;
  /* bitmap with one bit per bucket, set when the bucket is in use */
  unsigned char * hash_occupied;
};
30 | 
31 | inline uint64_t hash_get_tablesize(struct hashtable_s * ht)
32 | {
33 |   return ht->hash_tablesize;
34 | }
35 | 
36 | inline uint64_t hash_getindex(struct hashtable_s * ht, uint64_t hash)
37 | {
38 |   // Shift bits right to get independence from the simple Bloom filter hash
39 |   hash = hash >> 32;
40 |   return hash & ht->hash_mask;
41 | }
42 | 
43 | inline uint64_t hash_getnextindex(struct hashtable_s * ht, uint64_t j)
44 | {
45 |   return (j+1) & ht->hash_mask;
46 | }
47 | 
48 | inline void hash_set_occupied(struct hashtable_s * ht, uint64_t j)
49 | {
50 |   ht->hash_occupied[j >> 3] |= (1 << (j & 7));
51 | }
52 | 
53 | /* Tell whether the bitmap bit belonging to slot j is set. */
54 | inline bool hash_is_occupied(struct hashtable_s * ht, uint64_t j)
55 | {
56 |   const unsigned char cell = ht->hash_occupied[j >> 3];
57 |   const int bit = 1 << (j & 7);
58 |   return (cell & bit) != 0;
59 | }
57 | 
58 | /* Store hash as the hash value of slot j. */
59 | inline void hash_set_value(struct hashtable_s * ht, uint64_t j, uint64_t hash)
60 | {
61 |   uint64_t * slot = ht->hash_values + j;
62 |   *slot = hash;
63 | }
62 | 
63 | /* Return true when the hash value stored in slot j equals hash. */
64 | inline bool hash_compare_value(struct hashtable_s * ht,
65 |                                uint64_t j, uint64_t hash)
66 | {
67 |   const uint64_t stored = ht->hash_values[j];
68 |   return stored == hash;
69 | }
68 | 
69 | /* Fetch the data word stored in slot j. */
70 | inline uint64_t hash_get_data(struct hashtable_s * ht, uint64_t j)
71 | {
72 |   const uint64_t * slot = ht->hash_data + j;
73 |   return *slot;
74 | }
73 | 
74 | /* Store x as the data word of slot j. */
75 | inline void hash_set_data(struct hashtable_s * ht, uint64_t j, uint64_t x)
76 | {
77 |   uint64_t * slot = ht->hash_data + j;
78 |   *slot = x;
79 | }
78 | /* NOTE(review): the three functions below are defined outside this header; the descriptions are assumptions based on their names - confirm against hashtable.cc */
79 | void hash_zap(struct hashtable_s * ht); /* presumably resets the contents of the table */
80 | 
81 | struct hashtable_s * hash_init(uint64_t sequences); /* presumably creates a table dimensioned for `sequences` entries */
82 | 
83 | void hash_exit(struct hashtable_s * ht); /* presumably releases a table obtained from hash_init() */
84 | 


--------------------------------------------------------------------------------
/src/overlap.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |     Copyright (C) 2012-2021 Torbjorn Rognes and Frederic Mahe
 3 | 
 4 |     This program is free software: you can redistribute it and/or modify
 5 |     it under the terms of the GNU Affero General Public License as
 6 |     published by the Free Software Foundation, either version 3 of the
 7 |     License, or (at your option) any later version.
 8 | 
 9 |     This program is distributed in the hope that it will be useful,
10 |     but WITHOUT ANY WARRANTY; without even the implied warranty of
11 |     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12 |     GNU Affero General Public License for more details.
13 | 
14 |     You should have received a copy of the GNU Affero General Public License
15 |     along with this program.  If not, see <http://www.gnu.org/licenses/>.
16 | 
17 |     Contact: Torbjorn Rognes <torognes@ifi.uio.no>,
18 |     Department of Informatics, University of Oslo,
19 |     PO Box 1080 Blindern, NO-0316 Oslo, Norway
20 | */
21 | 
22 | /* other */
23 | /* Takes the file names of two sets. NOTE(review): the implementation is not visible from this header - confirm behaviour in overlap.cc */
24 | void overlap(char * set1_filename, char * set2_filename);
25 | 


--------------------------------------------------------------------------------
/src/threads.h:
--------------------------------------------------------------------------------
  1 | /*
  2 |     Copyright (C) 2012-2021 Torbjorn Rognes and Frederic Mahe
  3 | 
  4 |     This program is free software: you can redistribute it and/or modify
  5 |     it under the terms of the GNU Affero General Public License as
  6 |     published by the Free Software Foundation, either version 3 of the
  7 |     License, or (at your option) any later version.
  8 | 
  9 |     This program is distributed in the hope that it will be useful,
 10 |     but WITHOUT ANY WARRANTY; without even the implied warranty of
 11 |     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 12 |     GNU Affero General Public License for more details.
 13 | 
 14 |     You should have received a copy of the GNU Affero General Public License
 15 |     along with this program.  If not, see <http://www.gnu.org/licenses/>.
 16 | 
 17 |     Contact: Torbjorn Rognes <torognes@ifi.uio.no>,
 18 |     Department of Informatics, University of Oslo,
 19 |     PO Box 1080 Blindern, NO-0316 Oslo, Norway
 20 | */
 21 | 
 22 | /*
 23 |   ThreadRunner keeps a fixed set of worker threads that all execute the
 24 |   same function f(t), where t is the worker index 0..thread_count-1.
 25 |   run() starts one execution of f on every worker and returns when all
 26 |   of them have finished. The destructor tells the workers to quit and
 27 |   joins them.
 28 | 
 29 |   Each worker has its own mutex and condition variable. The condition
 30 |   is signalled in both directions: by run() and the destructor when
 31 |   `work` is set to 1 or -1, and by the worker when it resets `work`
 32 |   to 0. The worker holds its mutex while f is executing.
 33 | */
 34 | class ThreadRunner
 35 | {
 36 | private:
 37 | 
 38 |   int64_t thread_count;
 39 | 
 40 |   pthread_attr_t attr;
 41 | 
 42 |   struct thread_s
 43 |   {
 44 |     int64_t t;                 /* worker index, passed to fun */
 45 |     void (*fun)(int64_t t);    /* function executed once per run() */
 46 |     pthread_t pthread;
 47 |     pthread_mutex_t workmutex; /* protects work */
 48 |     pthread_cond_t workcond;   /* signalled whenever work changes */
 49 |     int64_t work; /* 1: work available, 0: wait, -1: quit */
 50 |   } * thread_array;
 51 | 
 52 |   /* thread entry point; vp points to the worker's thread_s record */
 53 |   static void * worker(void * vp)
 54 |   {
 55 |     struct thread_s * tip = static_cast<struct thread_s *>(vp);
 56 | 
 57 |     pthread_mutex_lock(&tip->workmutex);
 58 | 
 59 |     /* loop until signalled to quit */
 60 |     while (tip->work >= 0)
 61 |       {
 62 |         /* wait for work available; re-check after every wakeup */
 63 |         while (tip->work == 0)
 64 |           pthread_cond_wait(&tip->workcond, &tip->workmutex);
 65 | 
 66 |         if (tip->work > 0)
 67 |           {
 68 |             (*tip->fun)(tip->t);
 69 |             tip->work = 0;
 70 |             /* tell run() that this worker is done */
 71 |             pthread_cond_signal(&tip->workcond);
 72 |           }
 73 |       }
 74 | 
 75 |     pthread_mutex_unlock(&tip->workmutex);
 76 |     return nullptr;
 77 |   }
 78 | 
 79 | public:
 80 | 
 81 |   /* create t worker threads that will run f; calls fatal() on failure */
 82 |   ThreadRunner(int t, void (*f)(int64_t t))
 83 |   {
 84 |     thread_count = t;
 85 | 
 86 |     if (pthread_attr_init(&attr))
 87 |       fatal("Cannot initialize thread attributes");
 88 |     pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
 89 | 
 90 |     /* allocate memory for thread data */
 91 |     thread_array = static_cast<struct thread_s *>
 92 |       (xmalloc(static_cast<uint64_t>(thread_count) * sizeof(struct thread_s)));
 93 | 
 94 |     /* init and create worker threads */
 95 |     for(int64_t i=0; i<thread_count; i++)
 96 |       {
 97 |         struct thread_s * tip = thread_array + i;
 98 |         tip->t = i;
 99 |         tip->work = 0;
100 |         tip->fun = f;
101 |         if (pthread_mutex_init(&tip->workmutex, nullptr))
102 |           fatal("Cannot initialize mutex");
103 |         if (pthread_cond_init(&tip->workcond, nullptr))
104 |           fatal("Cannot initialize condition variable");
105 |         if (pthread_create(&tip->pthread,
106 |                            &attr,
107 |                            worker,
108 |                            static_cast<void*>(tip)))
109 |           fatal("Cannot create thread");
110 |       }
111 |   }
112 | 
113 |   /* the object owns its threads and thread_array; a copy would make
114 |      two destructors join and free the same resources */
115 |   ThreadRunner(const ThreadRunner &) = delete;
116 |   ThreadRunner & operator=(const ThreadRunner &) = delete;
117 | 
118 |   /* ask every worker to quit, join it and release its resources */
119 |   ~ThreadRunner()
120 |   {
121 |     for(int64_t i=0; i<thread_count; i++)
122 |       {
123 |         struct thread_s * tip = thread_array + i;
124 | 
125 |         /* tell worker to quit */
126 |         pthread_mutex_lock(&tip->workmutex);
127 |         tip->work = -1;
128 |         pthread_cond_signal(&tip->workcond);
129 |         pthread_mutex_unlock(&tip->workmutex);
130 | 
131 |         /* wait for worker to quit */
132 |         if (pthread_join(tip->pthread, nullptr))
133 |           fatal("Cannot join thread");
134 | 
135 |         pthread_cond_destroy(&tip->workcond);
136 |         pthread_mutex_destroy(&tip->workmutex);
137 |       }
138 | 
139 |     xfree(thread_array);
140 |     pthread_attr_destroy(&attr);
141 |   }
142 | 
143 |   /* run the function once on every worker and wait until all are done */
144 |   void run()
145 |   {
146 |     /* wake up threads */
147 |     for(int64_t i=0; i<thread_count; i++)
148 |       {
149 |         struct thread_s * tip = thread_array + i;
150 |         pthread_mutex_lock(&tip->workmutex);
151 |         tip->work = 1;
152 |         pthread_cond_signal(&tip->workcond);
153 |         pthread_mutex_unlock(&tip->workmutex);
154 |       }
155 | 
156 |     /* wait for threads to finish their work */
157 |     for(int64_t i=0; i<thread_count; i++)
158 |       {
159 |         struct thread_s * tip = thread_array + i;
160 |         pthread_mutex_lock(&tip->workmutex);
161 |         while (tip->work > 0)
162 |           pthread_cond_wait(&tip->workcond, &tip->workmutex);
163 |         pthread_mutex_unlock(&tip->workmutex);
164 |       }
165 |   }
166 | };
147 | 
148 | 


--------------------------------------------------------------------------------
/src/util.cc:
--------------------------------------------------------------------------------
  1 | /*
  2 |     Copyright (C) 2012-2021 Torbjorn Rognes and Frederic Mahe
  3 | 
  4 |     This program is free software: you can redistribute it and/or modify
  5 |     it under the terms of the GNU Affero General Public License as
  6 |     published by the Free Software Foundation, either version 3 of the
  7 |     License, or (at your option) any later version.
  8 | 
  9 |     This program is distributed in the hope that it will be useful,
 10 |     but WITHOUT ANY WARRANTY; without even the implied warranty of
 11 |     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 12 |     GNU Affero General Public License for more details.
 13 | 
 14 |     You should have received a copy of the GNU Affero General Public License
 15 |     along with this program.  If not, see <http://www.gnu.org/licenses/>.
 16 | 
 17 |     Contact: Torbjorn Rognes <torognes@ifi.uio.no>,
 18 |     Department of Informatics, University of Oslo,
 19 |     PO Box 1080 Blindern, NO-0316 Oslo, Norway
 20 | */
 21 | 
 22 | #include "compairr.h"
 23 | 
 24 | static const char * progress_prompt;              /* label saved by progress_init() and reprinted on each update */
 25 | static uint64_t progress_next;                    /* progress value at which progress_update() prints again */
 26 | static uint64_t progress_size;                    /* total amount of work; denominator of the percentage */
 27 | static uint64_t progress_chunk;                   /* step added to progress_next after each printed update */
 28 | static const uint64_t progress_granularity = 200; /* the size is divided by this to obtain progress_chunk */
 29 | const size_t memalignment = 16;                   /* alignment passed to posix_memalign / _aligned_malloc / _aligned_realloc */
 30 | static std::chrono::time_point<std::chrono::steady_clock> time_point_start; /* set in progress_init(), read in progress_done() */
 31 | 
 32 | /*
 33 |   Start a progress report: remember the prompt and the total amount of
 34 |   work, decide how often progress_update() should print, write the
 35 |   initial text to the log file and start the timer.
 36 | */
 37 | void progress_init(const char * prompt, uint64_t size)
 38 | {
 39 |   progress_prompt = prompt;
 40 |   progress_size = size;
 41 | 
 42 |   /* step by size / progress_granularity, but never by less than 1 */
 43 |   if (size >= progress_granularity)
 44 |     progress_chunk = size / progress_granularity;
 45 |   else
 46 |     progress_chunk = 1;
 47 |   progress_next = progress_chunk;
 48 | 
 49 |   /* with opt_log set only the prompt is written, without a percentage */
 50 |   if (! opt_log)
 51 |     fprintf(logfile, "%s %.0f%%", prompt, 0.0);
 52 |   else
 53 |     fprintf(logfile, "%s", prompt);
 54 |   fflush(logfile);
 55 | 
 56 |   time_point_start = std::chrono::steady_clock::now();
 57 | }
 46 | 
 47 | /*
 48 |   Rewrite the progress line with the current percentage. Nothing is
 49 |   printed when opt_log is set or when progress has not yet reached the
 50 |   next threshold.
 51 | */
 52 | void progress_update(uint64_t progress)
 53 | {
 54 |   if (opt_log || (progress < progress_next))
 55 |     return;
 56 | 
 57 |   const double percent = 100.0 * static_cast<double>(progress)
 58 |     / static_cast<double>(progress_size);
 59 |   fprintf(logfile, "  \r%s %.0f%%", progress_prompt, percent);
 60 |   progress_next = progress + progress_chunk;
 61 |   fflush(logfile);
 62 | }
 58 | 
 59 | /*
 60 |   Finish a progress report: print 100% together with the time in
 61 |   seconds elapsed since progress_init().
 62 | */
 63 | void progress_done()
 64 | {
 65 |   const auto elapsed = std::chrono::steady_clock::now() - time_point_start;
 66 |   const double seconds = 0.000000001 * elapsed
 67 |     / std::chrono::nanoseconds(1);
 68 | 
 69 |   /* without opt_log the line is redrawn from the start, prompt included */
 70 |   if (! opt_log)
 71 |     fprintf(logfile, "  \r%s %.0f%% (%.9lfs)\n", progress_prompt, 100.0,
 72 |             seconds);
 73 |   else
 74 |     fprintf(logfile, " %.0f%% (%.9lfs)\n", 100.0, seconds);
 75 |   fflush(logfile);
 76 | }
 71 | 
 72 | /*
 73 |   Greatest common divisor of a and b by Euclid's algorithm: replace
 74 |   (a, b) by (b, a % b) until b is zero, then a is the result.
 75 | */
 76 | int64_t gcd(int64_t a, int64_t b)
 77 | {
 78 |   while (b != 0)
 79 |     {
 80 |       const int64_t remainder = a % b;
 81 |       a = b;
 82 |       b = remainder;
 83 |     }
 84 |   return a;
 85 | }
 83 | 
 84 | [[ noreturn ]] void fatal(const char * msg)
 85 | {
 86 |   fprintf(stderr, "\nError: %s\n", msg);
 87 |   exit(1);
 88 | }
 89 | 
 90 | /*
 91 |   Allocate size bytes aligned to memalignment. A request for zero
 92 |   bytes is treated as a request for one byte. A null pointer is never
 93 |   returned: fatal() is called when the allocation fails.
 94 | */
 95 | void * xmalloc(size_t size)
 96 | {
 97 |   const size_t wanted = (size == 0) ? 1 : size;
 98 |   void * block = nullptr;
 99 | #ifdef _WIN32
100 |   block = _aligned_malloc(wanted, memalignment);
101 | #else
102 |   const int status = posix_memalign(& block, memalignment, wanted);
103 |   if (status != 0)
104 |     block = nullptr;
105 | #endif
106 |   if (block == nullptr)
107 |     fatal("Unable to allocate enough memory.");
108 |   return block;
109 | }
105 | 
106 | /*
107 |   Resize the block ptr to size bytes (at least one byte). A null
108 |   pointer is never returned: fatal() is called when the reallocation
109 |   fails.
110 |   NOTE(review): outside Windows plain realloc() is used, which does
111 |   not promise the memalignment alignment requested by xmalloc().
112 | */
113 | void * xrealloc(void *ptr, size_t size)
114 | {
115 |   const size_t wanted = (size == 0) ? 1 : size;
116 |   void * resized = nullptr;
117 | #ifdef _WIN32
118 |   resized = _aligned_realloc(ptr, wanted, memalignment);
119 | #else
120 |   resized = realloc(ptr, wanted);
121 | #endif
122 |   if (resized == nullptr)
123 |     fatal("Unable to reallocate enough memory.");
124 |   return resized;
125 | }
119 | 
120 | /*
121 |   Release a block obtained from xmalloc() or xrealloc(). Passing a
122 |   null pointer is treated as a programming error and ends the program
123 |   through fatal().
124 | */
125 | void xfree(void * ptr)
126 | {
127 |   if (ptr == nullptr)
128 |     fatal("Trying to free a null pointer");
129 | 
130 | #ifdef _WIN32
131 |   _aligned_free(ptr);
132 | #else
133 |   free(ptr);
134 | #endif
135 | }
133 | 
134 | /* Duplicate the string s with strdup(); calls fatal() if that fails. */
135 | char * xstrdup(const char * s)
136 | {
137 |   char * copy = strdup(s);
138 |   if (! copy)
139 |     fatal("Out of memory");
140 |   return copy;
141 | }
141 | 
142 | /*
143 |   Open the input stream given by filename for binary reading. The
144 |   name "-" selects standard input: its descriptor is duplicated so
145 |   that closing the returned stream leaves stdin itself open.
146 |   Returns a null pointer on failure.
147 | */
148 | FILE * fopen_input(const char * filename)
149 | {
150 |   if (strcmp(filename, "-") != 0)
151 |     return fopen(filename, "rb");
152 | 
153 |   int fd = dup(STDIN_FILENO);
154 |   if (fd < 0)
155 |     return nullptr;
156 | 
157 |   FILE * stream = fdopen(fd, "rb");
158 |   if (stream == nullptr)
159 |     close(fd); /* fdopen failed: do not leak the duplicated descriptor */
160 |   return stream;
161 | }
156 | 
157 | /*
158 |   Open the output stream given by filename for writing. The name "-"
159 |   selects standard output: its descriptor is duplicated so that
160 |   closing the returned stream leaves stdout itself open.
161 |   Returns a null pointer on failure.
162 | */
163 | FILE * fopen_output(const char * filename)
164 | {
165 |   if (strcmp(filename, "-") != 0)
166 |     return fopen(filename, "w");
167 | 
168 |   int fd = dup(STDOUT_FILENO);
169 |   if (fd < 0)
170 |     return nullptr;
171 | 
172 |   FILE * stream = fdopen(fd, "w");
173 |   if (stream == nullptr)
174 |     close(fd); /* fdopen failed: do not leak the duplicated descriptor */
175 |   return stream;
176 | }
171 | 
172 | /*
173 |   Count the positions at which a and b, both of length len, differ.
174 |   Counting stops as soon as the count exceeds opt_differences, so the
175 |   result is never larger than opt_differences + 1.
176 | */
177 | int64_t seq_diff(unsigned char * a, unsigned char * b, int64_t len)
178 | {
179 |   int64_t mismatches = 0;
180 |   int64_t pos = 0;
181 |   while (pos < len)
182 |     {
183 |       if (a[pos] != b[pos])
184 |         {
185 |           ++mismatches;
186 |           if (mismatches > opt_differences)
187 |             return mismatches;
188 |         }
189 |       ++pos;
190 |     }
191 |   return mismatches;
192 | }
185 | 


--------------------------------------------------------------------------------
/src/util.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |     Copyright (C) 2012-2021 Torbjorn Rognes and Frederic Mahe
 3 | 
 4 |     This program is free software: you can redistribute it and/or modify
 5 |     it under the terms of the GNU Affero General Public License as
 6 |     published by the Free Software Foundation, either version 3 of the
 7 |     License, or (at your option) any later version.
 8 | 
 9 |     This program is distributed in the hope that it will be useful,
10 |     but WITHOUT ANY WARRANTY; without even the implied warranty of
11 |     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12 |     GNU Affero General Public License for more details.
13 | 
14 |     You should have received a copy of the GNU Affero General Public License
15 |     along with this program.  If not, see <http://www.gnu.org/licenses/>.
16 | 
17 |     Contact: Torbjorn Rognes <torognes@ifi.uio.no>,
18 |     Department of Informatics, University of Oslo,
19 |     PO Box 1080 Blindern, NO-0316 Oslo, Norway
20 | */
21 | 
22 | /* functions in util.cc */
23 | 
24 | int64_t gcd(int64_t a, int64_t b);                      /* greatest common divisor (Euclid's algorithm) */
25 | [[ noreturn ]] void fatal(const char * msg);            /* print "Error: msg" to stderr and exit with status 1 */
26 | void * xmalloc(size_t size);                            /* aligned allocation; calls fatal() on failure */
27 | void * xrealloc(void * ptr, size_t size);               /* reallocation; calls fatal() on failure */
28 | void xfree(void * ptr);                                 /* free a block; calls fatal() if ptr is null */
29 | char * xstrdup(const char * s);                         /* strdup; calls fatal() on failure */
30 | void progress_init(const char * prompt, uint64_t size); /* start a progress report for `size` units of work */
31 | void progress_update(uint64_t progress);                /* print the percentage once the next step is reached */
32 | void progress_done();                                   /* print 100% and the elapsed time in seconds */
33 | FILE * fopen_input(const char * filename);              /* open for reading; "-" means stdin */
34 | FILE * fopen_output(const char * filename);             /* open for writing; "-" means stdout */
35 | int64_t seq_diff(unsigned char * a, unsigned char * b, int64_t len); /* count mismatches, stopping once above opt_differences */
36 | 


--------------------------------------------------------------------------------
/src/variants.cc:
--------------------------------------------------------------------------------
  1 | /*
  2 |     Copyright (C) 2012-2021 Torbjorn Rognes and Frederic Mahe
  3 | 
  4 |     This program is free software: you can redistribute it and/or modify
  5 |     it under the terms of the GNU Affero General Public License as
  6 |     published by the Free Software Foundation, either version 3 of the
  7 |     License, or (at your option) any later version.
  8 | 
  9 |     This program is distributed in the hope that it will be useful,
 10 |     but WITHOUT ANY WARRANTY; without even the implied warranty of
 11 |     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 12 |     GNU Affero General Public License for more details.
 13 | 
 14 |     You should have received a copy of the GNU Affero General Public License
 15 |     along with this program.  If not, see <http://www.gnu.org/licenses/>.
 16 | 
 17 |     Contact: Torbjorn Rognes <torognes@ifi.uio.no>,
 18 |     Department of Informatics, University of Oslo,
 19 |     PO Box 1080 Blindern, NO-0316 Oslo, Norway
 20 | */
 21 | 
 22 | #include "compairr.h"
 23 | 
 24 | /*
 25 |   Print a sequence of seqlen residue codes to stdout, translating each
 26 |   code through the 20-letter table below.
 27 | */
 28 | void ps(unsigned int seqlen, unsigned char * sequence)
 29 | {
 30 |   const char symbols[] = "ACDEFGHIKLMNPQRSTVWY";
 31 |   const unsigned char * end = sequence + seqlen;
 32 |   for (const unsigned char * p = sequence; p != end; p++)
 33 |     printf("%c", symbols[*p]);
 34 | }
 30 | 
 31 | /*
 32 |   Copy length residues from b, starting at b_start, into a, starting
 33 |   at a_start. The two ranges must not overlap (memcpy is used).
 34 | */
 35 | inline void seq_copy(unsigned char * a,
 36 |                      unsigned int a_start,
 37 |                      unsigned char * b,
 38 |                      unsigned int b_start,
 39 |                      unsigned int length)
 40 | {
 41 |   unsigned char * destination = a + a_start;
 42 |   const unsigned char * source = b + b_start;
 43 |   memcpy(destination, source, length);
 44 | }
 40 | 
 41 | /*
 42 |   Compare length residues of a, starting at a_start, with length
 43 |   residues of b, starting at b_start. Returns true if they are
 44 |   identical (always true for length 0), false otherwise.
 45 | */
 46 | inline bool seq_identical(unsigned char * a,
 47 |                           unsigned int a_start,
 48 |                           unsigned char * b,
 49 |                           unsigned int b_start,
 50 |                           unsigned int length)
 51 | {
 52 |   const unsigned char * left = a + a_start;
 53 |   const unsigned char * right = b + b_start;
 54 |   return memcmp(left, right, length) == 0;
 55 | }
 52 | 
 53 | /*
 54 |   Upper limit on the number of variants generated for a sequence of
 55 |   length longest, given opt_differences and opt_indels:
 56 | 
 57 |     d=0  identical:     1
 58 |     d=1  substitutions: (ab_size - 1) * longest
 59 |          deletions:     longest                            (indels only)
 60 |          insertions:    (ab_size - 1) * (longest + 1) + 1  (indels only)
 61 |     d=2  substitutions: (ab_size - 1) * (ab_size - 1)
 62 |                         * longest * (longest - 1) / 2
 63 | 
 64 |   Indels combined with d >= 2 are not supported and end in fatal().
 65 | */
 66 | uint64_t max_variants(uint64_t longest)
 67 | {
 68 |   /* number of alternative residues at a single position */
 69 |   const uint64_t others = alphabet_size - 1;
 70 | 
 71 |   /* d = 0: the identical non-variant */
 72 |   uint64_t total = 1;
 73 | 
 74 |   if (opt_differences < 1)
 75 |     return total;
 76 | 
 77 |   /* d = 1: substitutions */
 78 |   total += longest * others;
 79 | 
 80 |   if (opt_indels)
 81 |     {
 82 |       /* d = 1: deletions, then insertions */
 83 |       total += longest;
 84 |       total += (longest + 1) * others + 1;
 85 |     }
 86 | 
 87 |   if (opt_differences < 2)
 88 |     return total;
 89 | 
 90 |   /* d = 2: two substitutions at different positions */
 91 |   total += longest * (longest - 1) / 2 * others * others;
 92 | 
 93 |   if (opt_indels)
 94 |     fatal("Indels not supported for d>1");
 95 | 
 96 |   return total;
 97 | }
108 | 
109 | /*
110 |   Build the actual sequence of a variant: apply the mutation described
111 |   by var to the seed sequence, write the result into seq and store its
112 |   length in *seqlen. An unknown mutation kind ends in fatal().
113 | */
114 | void generate_variant_sequence(unsigned char * seed_sequence,
115 |                                unsigned int seed_seqlen,
116 |                                struct var_s * var,
117 |                                unsigned char * seq,
118 |                                unsigned int * seqlen)
119 | {
120 |   const enum mutation_kind_enum kind = var->kind;
121 | 
122 |   if ((kind == identical) || (kind == substitution) || (kind == sub_sub))
123 |     {
124 |       /* same length as the seed; overwrite zero, one or two residues */
125 |       seq_copy(seq, 0, seed_sequence, 0, seed_seqlen);
126 |       if (kind != identical)
127 |         seq[var->pos1] = var->residue1;
128 |       if (kind == sub_sub)
129 |         seq[var->pos2] = var->residue2;
130 |       * seqlen = seed_seqlen;
131 |     }
132 |   else if (kind == deletion)
133 |     {
134 |       /* everything before pos1, then everything after pos1 */
135 |       const unsigned int head = var->pos1;
136 |       const unsigned int tail = seed_seqlen - head - 1;
137 |       seq_copy(seq, 0, seed_sequence, 0, head);
138 |       seq_copy(seq, head, seed_sequence, head + 1, tail);
139 |       * seqlen = seed_seqlen - 1;
140 |     }
141 |   else if (kind == insertion)
142 |     {
143 |       /* everything before pos1, the new residue, then the rest */
144 |       const unsigned int head = var->pos1;
145 |       const unsigned int tail = seed_seqlen - head;
146 |       seq_copy(seq, 0, seed_sequence, 0, head);
147 |       seq[head] = var->residue1;
148 |       seq_copy(seq, head + 1, seed_sequence, head, tail);
149 |       * seqlen = seed_seqlen + 1;
150 |     }
151 |   else
152 |     {
153 |       fatal("Internal error");
154 |     }
155 | }
164 | 
165 | 
166 | /*
167 |   Make sure that the seed with the given variant applied is really
168 |   identical to amp_sequence (the hashes are already known to match).
169 |   The length is tested first, then the new residue(s), then the
170 |   unchanged stretches before, between and after the mutated positions.
171 |   An unknown mutation kind ends in fatal().
172 | */
173 | bool check_variant(unsigned char * seed_sequence,
174 |                    unsigned int seed_seqlen,
175 |                    var_s * var,
176 |                    unsigned char * amp_sequence,
177 |                    unsigned int amp_seqlen)
178 | {
179 |   const unsigned int p1 = var->pos1;
180 | 
181 |   switch (var->kind)
182 |     {
183 |     case identical:
184 |       if (seed_seqlen != amp_seqlen)
185 |         return false;
186 |       return seq_identical(seed_sequence, 0, amp_sequence, 0, seed_seqlen);
187 | 
188 |     case substitution:
189 |       if (seed_seqlen != amp_seqlen)
190 |         return false;
191 |       if (amp_sequence[p1] != var->residue1)
192 |         return false;
193 |       if (! seq_identical(seed_sequence, 0, amp_sequence, 0, p1))
194 |         return false;
195 |       return seq_identical(seed_sequence, p1 + 1,
196 |                            amp_sequence, p1 + 1,
197 |                            seed_seqlen - p1 - 1);
198 | 
199 |     case deletion:
200 |       if ((seed_seqlen - 1) != amp_seqlen)
201 |         return false;
202 |       if (! seq_identical(seed_sequence, 0, amp_sequence, 0, p1))
203 |         return false;
204 |       return seq_identical(seed_sequence, p1 + 1,
205 |                            amp_sequence, p1,
206 |                            seed_seqlen - p1 - 1);
207 | 
208 |     case insertion:
209 |       if ((seed_seqlen + 1) != amp_seqlen)
210 |         return false;
211 |       if (amp_sequence[p1] != var->residue1)
212 |         return false;
213 |       if (! seq_identical(seed_sequence, 0, amp_sequence, 0, p1))
214 |         return false;
215 |       return seq_identical(seed_sequence, p1,
216 |                            amp_sequence, p1 + 1,
217 |                            seed_seqlen - p1);
218 | 
219 |     case sub_sub:
220 |       {
221 |         const unsigned int p2 = var->pos2;
222 |         if (seed_seqlen != amp_seqlen)
223 |           return false;
224 |         if (amp_sequence[p1] != var->residue1)
225 |           return false;
226 |         if (amp_sequence[p2] != var->residue2)
227 |           return false;
228 |         if (! seq_identical(seed_sequence, 0, amp_sequence, 0, p1))
229 |           return false;
230 |         if (! seq_identical(seed_sequence, p1 + 1,
231 |                             amp_sequence, p1 + 1,
232 |                             p2 - p1 - 1))
233 |           return false;
234 |         return seq_identical(seed_sequence, p2 + 1,
235 |                              amp_sequence, p2 + 1,
236 |                              seed_seqlen - p2 - 1);
237 |       }
238 | 
239 |     default:
240 |       break;
241 |     }
242 | 
243 |   fatal("Internal error");
244 | }
241 | 
242 | /*
243 |   Append one variant record to variant_list at index *variant_count
244 |   and increase *variant_count by one. No bounds checking is done here.
245 | */
246 | inline void add_variant(uint64_t hash,
247 |                         var_s * variant_list,
248 |                         unsigned int * variant_count,
249 |                         enum mutation_kind_enum kind,
250 |                         unsigned int pos1,
251 |                         unsigned char residue1,
252 |                         unsigned int pos2,
253 |                         unsigned char residue2)
254 | {
255 |   const unsigned int slot = *variant_count;
256 |   var_s & entry = variant_list[slot];
257 | 
258 |   entry.hash = hash;
259 |   entry.kind = kind;
260 |   entry.pos1 = pos1;
261 |   entry.residue1 = residue1;
262 |   entry.pos2 = pos2;
263 |   entry.residue2 = residue2;
264 | 
265 |   *variant_count = slot + 1;
266 | }
259 | 
260 | /*
261 |   Add the d = 0 entry: the sequence itself, recorded with the
262 |   unchanged hash and kind `identical`.
263 | */
264 | void generate_variants_0(uint64_t hash,
265 |                          var_s * variant_list,
266 |                          unsigned int * variant_count)
267 | {
268 |   /* positions and residues are not used by this kind; all are zero */
269 |   const unsigned int no_position = 0;
270 |   const unsigned char no_residue = 0;
271 | 
272 |   add_variant(hash,
273 |               variant_list, variant_count,
274 |               identical,
275 |               no_position, no_residue,
276 |               no_position, no_residue);
277 | }
269 | /* Append every d = 1 variant of sequence to variant_list: all substitutions, plus deletions and insertions when opt_indels is set. */
270 | void generate_variants_1(uint64_t hash,
271 |                          unsigned char * sequence,
272 |                          unsigned int seqlen,
273 |                          uint64_t v_gene,
274 |                          uint64_t d_gene,
275 |                          var_s * variant_list,
276 |                          unsigned int * variant_count)
277 | {
278 |   /* substitutions: at each position i, XOR out the term of the current residue and XOR in the term of every other residue v */
279 |   /* NOTE(review): assumes the hash is an XOR of zobrist_value(position, residue) terms - confirm in zobrist.cc */
280 |   for(unsigned int i = 0; i < seqlen; i++)
281 |     {
282 |       unsigned char residue1 = sequence[i];
283 |       uint64_t hash1 = hash ^ zobrist_value(i, residue1);
284 |       for (unsigned char v = 0; v < alphabet_size; v++)
285 |         if (v != residue1)
286 |           {
287 |             uint64_t hash2 = hash1 ^ zobrist_value(i, v);
288 | 
289 |             add_variant(hash2,
290 |                         variant_list, variant_count,
291 |                         substitution, i, v, 0, 0);
292 |           }
293 |     }
294 | 
295 |   /* indels */
296 | 
297 |   if (opt_indels)
298 |     {
299 |       /* deletions: not generated for sequences of length 1 */
300 |       /* NOTE(review): zobrist_hash_delete_first presumably hashes the sequence without its first residue - confirm */
301 |       if (seqlen > 1)
302 |         {
303 |           hash = zobrist_hash_delete_first
304 |             (reinterpret_cast<unsigned char *> (sequence),
305 |              seqlen,
306 |              v_gene,
307 |              d_gene);
308 |           add_variant(hash,
309 |                       variant_list, variant_count,
310 |                       deletion, 0, 0, 0, 0);
311 |           unsigned char deleted = sequence[0]; /* residue removed by the most recently added deletion */
312 |           for(unsigned int i = 1; i < seqlen; i++)
313 |             {
314 |               unsigned char v = sequence[i];
315 |               if (v != deleted) /* equal residues: deleting i gives the same sequence as the previous deletion */
316 |                 {
317 |                   hash ^= zobrist_value(i - 1, deleted) /* position i - 1 now holds `deleted` ... */
318 |                     ^ zobrist_value(i - 1, v);          /* ... instead of v */
319 |                   add_variant(hash,
320 |                               variant_list, variant_count,
321 |                               deletion, i, 0, 0, 0);
322 |                   deleted = v;
323 |                 }
324 |             }
325 |         }
326 | 
327 |       /* insertions */
328 |       /* NOTE(review): zobrist_hash_insert_first presumably hashes the sequence shifted one position to the right - confirm */
329 |       hash = zobrist_hash_insert_first
330 |         (reinterpret_cast<unsigned char *>(sequence),
331 |          seqlen,
332 |          v_gene,
333 |          d_gene);
334 |       for (unsigned char v = 0; v < alphabet_size; v++) /* every residue may be inserted at position 0 */
335 |         {
336 |           uint64_t hash1 = hash ^ zobrist_value(0, v);
337 |           add_variant(hash1,
338 |                       variant_list, variant_count,
339 |                       insertion, 0, v, 0, 0);
340 |         }
341 |       for (unsigned int i = 0; i < seqlen; i++)
342 |         {
343 |           unsigned char inserted = sequence[i]; /* the seed residue just before the insertion point i + 1 */
344 |           hash ^= zobrist_value(i, inserted) ^ zobrist_value(i+1, inserted); /* move that residue from position i + 1 back to i */
345 |           for (unsigned char v = 0; v < alphabet_size; v++)
346 |             if (v != inserted) /* inserting a copy after residue i equals inserting it before, already added */
347 |               {
348 |                 uint64_t hash1 = hash ^ zobrist_value(i + 1, v);
349 |                 add_variant(hash1,
350 |                             variant_list, variant_count,
351 |                             insertion, i + 1, v, 0, 0);
352 |               }
353 |         }
354 |     }
355 | }
356 | 
357 | /*
358 |   Append every double substitution of sequence to variant_list: two
359 |   different positions first < second, each given a residue that
360 |   differs from the one in the seed. The hash of each variant is
361 |   derived from hash by XORing out the zobrist value of the old residue
362 |   and XORing in that of the new one, at both positions.
363 |   v_gene and d_gene are not used.
364 | */
365 | void generate_variants_2(uint64_t hash,
366 |                          unsigned char * sequence,
367 |                          unsigned int seqlen,
368 |                          uint64_t v_gene,
369 |                          uint64_t d_gene,
370 |                          var_s * variant_list,
371 |                          unsigned int * variant_count)
372 | {
373 |   (void) v_gene;
374 |   (void) d_gene;
375 | 
376 |   for (unsigned int first = 0; first < seqlen; first++)
377 |     {
378 |       const unsigned char old1 = sequence[first];
379 |       const uint64_t without1 = hash ^ zobrist_value(first, old1);
380 | 
381 |       for (unsigned char new1 = 0; new1 < alphabet_size; new1++)
382 |         {
383 |           if (new1 == old1)
384 |             continue;
385 | 
386 |           const uint64_t with1 = without1 ^ zobrist_value(first, new1);
387 | 
388 |           for (unsigned int second = first + 1; second < seqlen; second++)
389 |             {
390 |               const unsigned char old2 = sequence[second];
391 |               const uint64_t without2 = with1 ^ zobrist_value(second, old2);
392 | 
393 |               for (unsigned char new2 = 0; new2 < alphabet_size; new2++)
394 |                 {
395 |                   if (new2 == old2)
396 |                     continue;
397 | 
398 |                   const uint64_t with2
399 |                     = without2 ^ zobrist_value(second, new2);
400 |                   add_variant(with2,
401 |                               variant_list, variant_count,
402 |                               sub_sub, first, new1, second, new2);
403 |                 }
404 |             }
405 |         }
406 |     }
407 | }
401 | 
402 | /*
403 |   Fill variant_list with the variants of sequence: always the
404 |   identical entry, the d = 1 variants when opt_differences >= 1 and
405 |   the d = 2 variants when opt_differences >= 2. *variant_count is
406 |   advanced by the number of entries added.
407 | */
408 | void generate_variants(uint64_t hash,
409 |                        unsigned char * sequence,
410 |                        unsigned int seqlen,
411 |                        uint64_t v_gene,
412 |                        uint64_t d_gene,
413 |                        var_s * variant_list,
414 |                        unsigned int * variant_count)
415 | {
416 |   generate_variants_0(hash, variant_list, variant_count);
417 | 
418 |   if (opt_differences < 1)
419 |     return;
420 | 
421 |   generate_variants_1(hash, sequence, seqlen, v_gene, d_gene,
422 |                       variant_list, variant_count);
423 | 
424 |   if (opt_differences < 2)
425 |     return;
426 | 
427 |   generate_variants_2(hash, sequence, seqlen, v_gene, d_gene,
428 |                       variant_list, variant_count);
429 | }
429 | 


--------------------------------------------------------------------------------
/src/variants.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |     Copyright (C) 2012-2021 Torbjorn Rognes and Frederic Mahe
 3 | 
 4 |     This program is free software: you can redistribute it and/or modify
 5 |     it under the terms of the GNU Affero General Public License as
 6 |     published by the Free Software Foundation, either version 3 of the
 7 |     License, or (at your option) any later version.
 8 | 
 9 |     This program is distributed in the hope that it will be useful,
10 |     but WITHOUT ANY WARRANTY; without even the implied warranty of
11 |     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12 |     GNU Affero General Public License for more details.
13 | 
14 |     You should have received a copy of the GNU Affero General Public License
15 |     along with this program.  If not, see <http://www.gnu.org/licenses/>.
16 | 
17 |     Contact: Torbjorn Rognes <torognes@ifi.uio.no>,
18 |     Department of Informatics, University of Oslo,
19 |     PO Box 1080 Blindern, NO-0316 Oslo, Norway
20 | */
21 | 
22 | /* Variant information */
23 | 
24 | enum mutation_kind_enum
25 |   {
26 |    identical,    /* no change: the variant equals the seed */
27 |    substitution, /* one residue replaced, at pos1 */
28 |    deletion,     /* the residue at pos1 removed */
29 |    insertion,    /* residue1 inserted at pos1 */
30 |    sub_sub       /* two substitutions, at pos1 and pos2 */
31 |   };
32 | 
33 | /*
34 |   Info about the variants:
35 | 
36 |   For single mutations, only position1 and residue1 are used.
37 | 
38 |   For double mutations, position1 and residue1 apply to the first
39 |   kind of mutation, while position2 and residue2 apply to the second
40 |   kind of mutation. It is the order of the two mutations that matters,
41 |   not their positions in the sequence.
42 | 
43 |   The residues residue1 and residue2 are the new residues that are
44 |   present in the target sequence after insertion or substitution.
45 |   For deletions, the residue is undefined.
46 | 
47 |   The positions position1 and position2 are the positions of the
48 |   deletions, insertions or substitutions. The position1 value is
49 |   the position of the first kind of mutation, while position2 is the
50 |   position of the second kind of mutation. Note that
51 |   position1 > position2 is not unusual.
52 | 
53 |   Positions start at zero.
54 | 
55 |   Positions of insertions refer to the new sequence.
56 |   Positions of deletions refer to the original sequence.
57 |   Positions of substitutions refer to the new sequence.
58 | 
59 |   For mutations of the same kind (del_del, sub_sub, ins_ins), the
60 |   positions (position1 and position2) will always be in order;
61 |   position1 < position2.
62 | 
63 | */
64 | 
65 | struct var_s
66 | {
67 |   uint64_t hash;
68 |   enum mutation_kind_enum kind;
69 |   unsigned int pos1;
70 |   unsigned int pos2;
71 |   unsigned char residue1;
72 |   unsigned char residue2;
73 | };
74 | 
75 | void generate_variant_sequence(unsigned char * seed_sequence,
76 |                                unsigned int seed_seqlen,
77 |                                struct var_s * var,
78 |                                unsigned char * seq,
79 |                                unsigned int * seqlen);
80 | 
81 | bool check_variant(unsigned char * seed_sequence,
82 |                    unsigned int seed_seqlen,
83 |                    struct var_s * var,
84 |                    unsigned char * amp_sequence,
85 |                    unsigned int amp_seqlen);
86 | 
87 | void generate_variants(uint64_t hash,
88 |                        unsigned char * sequence,
89 |                        unsigned int seqlen,
90 |                        uint64_t v_gene,
91 |                        uint64_t d_gene,
92 |                        struct var_s * variant_list,
93 |                        unsigned int * variant_count);
94 | 
95 | uint64_t max_variants(uint64_t longest);
96 | 


--------------------------------------------------------------------------------
/src/zobrist.cc:
--------------------------------------------------------------------------------
  1 | /*
  2 |     Copyright (C) 2012-2021 Torbjorn Rognes and Frederic Mahe
  3 | 
  4 |     This program is free software: you can redistribute it and/or modify
  5 |     it under the terms of the GNU Affero General Public License as
  6 |     published by the Free Software Foundation, either version 3 of the
  7 |     License, or (at your option) any later version.
  8 | 
  9 |     This program is distributed in the hope that it will be useful,
 10 |     but WITHOUT ANY WARRANTY; without even the implied warranty of
 11 |     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 12 |     GNU Affero General Public License for more details.
 13 | 
 14 |     You should have received a copy of the GNU Affero General Public License
 15 |     along with this program.  If not, see <http://www.gnu.org/licenses/>.
 16 | 
 17 |     Contact: Torbjorn Rognes <torognes@ifi.uio.no>,
 18 |     Department of Informatics, University of Oslo,
 19 |     PO Box 1080 Blindern, NO-0316 Oslo, Norway
 20 | */
 21 | 
 22 | #include "compairr.h"
 23 | 
 24 | uint64_t * zobrist_tab_base = nullptr;       /* start of the whole table (residue values first) */
 25 | static uint64_t * zobrist_v_base = nullptr;  /* V gene values; set by zobrist_init to point into the table */
 26 | static uint64_t * zobrist_d_base = nullptr;  /* D gene values; set by zobrist_init to follow the V gene values */
 27 | 
 28 | void zobrist_init(unsigned int n, unsigned int v_genes, unsigned int d_genes)
 29 | {
 30 |   /*
 31 |     Generate 4n or 20n random 64-bit numbers. They will represent the
 32 |     four or twenty different residues in any position (1 to n) of a
 33 |     sequence.  They will be XOR'ed together to form the hash of that
 34 |     sequence.  The number n should be the length of the longest
 35 |     sequence to be hashed including potential additional insertions.
 36 | 
 37 |     Each number is generated by xor'ing together four shifted
 38 |     31-bit random numbers.
 39 | 
 40 |     Also make one random value for each of the v_genes V genes and
 41 |     d_genes D genes; they are stored right after the residue values.  */
 42 | 
 43 |   /* allocate memory for table; the sizes are computed in 64 bits so
 44 |      that the product alphabet_size * n cannot wrap at 32 bits */
 45 |   const uint64_t residues = static_cast<uint64_t>(alphabet_size) * n;
 46 |   const uint64_t numbers = residues + v_genes + d_genes;
 47 |   zobrist_tab_base = static_cast<uint64_t *>
 48 |     (xmalloc(numbers * sizeof(uint64_t)));
 49 | 
 50 |   /* fill table with random 64 bit numbers (index as wide as the bound) */
 51 | 
 52 |   for (uint64_t i = 0; i < numbers; i++)
 53 |     {
 54 |       uint64_t z;
 55 |       z = arch_random();
 56 |       z <<= 16;
 57 |       z ^= arch_random();
 58 |       z <<= 16;
 59 |       z ^= arch_random();
 60 |       z <<= 16;
 61 |       z ^= arch_random();
 62 |       zobrist_tab_base[i] = z;
 63 |     }
 64 | 
 65 |   zobrist_v_base = zobrist_tab_base + residues;
 66 |   zobrist_d_base = zobrist_v_base + v_genes;
 67 | }
 68 | 
 69 | void zobrist_exit()  /* release the table allocated by zobrist_init() */
 70 | {
 71 |   xfree(zobrist_tab_base);  /* the V and D gene values live in this same allocation */
 72 | }  /* note: none of the three base pointers is reset here */
 73 | 
 74 | /*
 75 |   Shared worker for the zobrist_hash* functions below.
 76 | 
 77 |   XORs together the table values of the residues s[skip] .. s[len-1],
 78 |   where residue s[p] is hashed as if it were located at position
 79 |   p - skip + shift.  Unless opt_ignore_genes is set, the values of
 80 |   the V gene and the D gene are XOR'ed into the hash as well.
 81 | */
 82 | 
 83 | static uint64_t zobrist_hash_shifted(unsigned char * s,
 84 |                                      unsigned int len,
 85 |                                      int v_gene,
 86 |                                      int d_gene,
 87 |                                      unsigned int skip,
 88 |                                      unsigned int shift)
 89 | {
 90 |   uint64_t z = 0;
 91 |   if (! opt_ignore_genes)
 92 |     z ^= zobrist_v_base[v_gene] ^ zobrist_d_base[d_gene];
 93 |   for (unsigned int p = skip; p < len; p++)
 94 |     z ^= zobrist_value(p - skip + shift, s[p]);
 95 |   return z;
 96 | }
 97 | 
 98 | uint64_t zobrist_hash(unsigned char * s,
 99 |                       unsigned int len,
100 |                       int v_gene,
101 |                       int d_gene)
102 | {
103 |   /* compute the Zobrist hash function of sequence s of length len. */
104 |   /* len is the actual number of residues in the sequence */
105 | 
106 |   return zobrist_hash_shifted(s, len, v_gene, d_gene, 0, 0);
107 | }
108 | 
109 | uint64_t zobrist_hash_delete_first(unsigned char * s,
110 |                                    unsigned int len,
111 |                                    int v_gene,
112 |                                    int d_gene)
113 | {
114 |   /* compute the Zobrist hash function of sequence s,
115 |      but delete the first base: s[p] is hashed at position p - 1 */
116 | 
117 |   return zobrist_hash_shifted(s, len, v_gene, d_gene, 1, 0);
118 | }
119 | 
120 | uint64_t zobrist_hash_delete_first_two(unsigned char * s,
121 |                                        unsigned int len,
122 |                                        int v_gene,
123 |                                        int d_gene)
124 | {
125 |   /* compute the Zobrist hash function of sequence s,
126 |      but delete the first two bases: s[p] is hashed at position p - 2 */
127 | 
128 |   return zobrist_hash_shifted(s, len, v_gene, d_gene, 2, 0);
129 | }
130 | 
131 | uint64_t zobrist_hash_insert_first(unsigned char * s,
132 |                                    unsigned int len,
133 |                                    int v_gene,
134 |                                    int d_gene)
135 | {
136 |   /* compute the Zobrist hash function of sequence s,
137 |      but insert a gap (no value) before the first base */
138 | 
139 |   return zobrist_hash_shifted(s, len, v_gene, d_gene, 0, 1);
140 | }
141 | 
142 | uint64_t zobrist_hash_insert_first_two(unsigned char * s,
143 |                                        unsigned int len,
144 |                                        int v_gene,
145 |                                        int d_gene)
146 | {
147 |   /* compute the Zobrist hash function of sequence s,
148 |      but insert two gaps (no value) before the first base */
149 | 
150 |   return zobrist_hash_shifted(s, len, v_gene, d_gene, 0, 2);
151 | }
152 | 


--------------------------------------------------------------------------------
/src/zobrist.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |     Copyright (C) 2012-2021 Torbjorn Rognes and Frederic Mahe
 3 | 
 4 |     This program is free software: you can redistribute it and/or modify
 5 |     it under the terms of the GNU Affero General Public License as
 6 |     published by the Free Software Foundation, either version 3 of the
 7 |     License, or (at your option) any later version.
 8 | 
 9 |     This program is distributed in the hope that it will be useful,
10 |     but WITHOUT ANY WARRANTY; without even the implied warranty of
11 |     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12 |     GNU Affero General Public License for more details.
13 | 
14 |     You should have received a copy of the GNU Affero General Public License
15 |     along with this program.  If not, see <http://www.gnu.org/licenses/>.
16 | 
17 |     Contact: Torbjorn Rognes <torognes@ifi.uio.no>,
18 |     Department of Informatics, University of Oslo,
19 |     PO Box 1080 Blindern, NO-0316 Oslo, Norway
20 | */
21 | 
22 | extern uint64_t * zobrist_tab_base;
23 | 
24 | inline uint64_t zobrist_value(unsigned int pos, unsigned char x)
25 | { /* return the table entry for value x at sequence position pos */
26 |   return zobrist_tab_base[alphabet_size * pos + x]; /* alphabet_size entries per position; no bounds check */
27 | }
28 | 
29 | void zobrist_init(unsigned int longest,
30 |                   unsigned int v_genes,
31 |                   unsigned int d_genes);
32 | 
33 | void zobrist_exit();
34 | 
35 | uint64_t zobrist_hash(unsigned char * s,
36 |                       unsigned int len,
37 |                       int v_gene,
38 |                       int d_gene);
39 | 
40 | uint64_t zobrist_hash_delete_first(unsigned char * s,
41 |                                    unsigned int len,
42 |                                    int v_gene,
43 |                                    int d_gene);
44 | 
45 | uint64_t zobrist_hash_delete_first_two(unsigned char * s,
46 |                                        unsigned int len,
47 |                                        int v_gene,
48 |                                        int d_gene);
49 | 
50 | uint64_t zobrist_hash_insert_first(unsigned char * s,
51 |                                    unsigned int len,
52 |                                    int v_gene,
53 |                                    int d_gene);
54 | 
55 | uint64_t zobrist_hash_insert_first_two(unsigned char * s,
56 |                                        unsigned int len,
57 |                                        int v_gene,
58 |                                        int d_gene);
59 | 


--------------------------------------------------------------------------------
/test/Makefile:
--------------------------------------------------------------------------------
1 | # "test" names an action, not a file: declare it phony so that it
2 | # always runs, even if a file called "test" exists in this directory.
3 | .PHONY : test
4 | 
5 | test :
6 | 	sh test.sh
7 | 


--------------------------------------------------------------------------------
/test/expected.tsv:
--------------------------------------------------------------------------------
1 | #	B1	B2
2 | A1	0	7
3 | A2	45	0
4 | 


--------------------------------------------------------------------------------
/test/seta.tsv:
--------------------------------------------------------------------------------
1 | repertoire_id	sequence_id	duplicate_count	v_call	j_call	junction	junction_aa	sequence	rev_comp	productive	d_call	sequence_alignment	germline_alignment	v_cigar	d_cigar	j_cigar
2 | A1	R	1	TCRBV07-06	TCRBJ02-01	tgcgcgagcagcaccagccatgaacagtatttt	CASSTSHEQYF									
3 | A2	S	3	TCRBV07-09	TCRBJ01-02	tgcgcgagcagcctgcgcgtgggcggctatggctataccttt	CASSLRVGGYGYTF									
4 | 


--------------------------------------------------------------------------------
/test/setb.tsv:
--------------------------------------------------------------------------------
1 | repertoire_id	sequence_id	duplicate_count	v_call	j_call	junction	junction_aa	sequence	rev_comp	productive	d_call	sequence_alignment	germline_alignment	v_cigar	d_cigar	j_cigar
2 | B1	T	5	TCRBV07-09	TCRBJ01-02	tgcgcgagcagcctgcgcgtgggcggctatggctataccttt	CASSLRVGGYGYTF									
3 | B1	U	10	TCRBV07-09	TCRBJ01-02	tgcgcgagcagcctgcgcgtgggcggctttggctataccttt	CASSLRVGGFGYTF									
4 | B2	V	7	TCRBV07-06	TCRBJ02-01	tgcgcgagcagcaccagccatcagcagtatttt	CASSTSHQQYF									
5 | 


--------------------------------------------------------------------------------
/test/setc.tsv:
--------------------------------------------------------------------------------
1 | repertoire_id	sequence_id	duplicate_count	v_call	j_call	junction	junction_aa	sequence	rev_comp	productive	d_call	sequence_alignment	germline_alignment	v_cigar	d_cigar	j_cigar
2 | C	X	1	TCRBV07-09	TCRBJ01-02	tgcgcgagcagcctgcgcgtgggcggctttggctataccttt	CASSLRVGGFGYTF									
3 | C	Y	1	TCRBV07-06	TCRBJ02-01	tgcgcgagcagcaccagccatcagcagtatttt	CASSTSHQQYF									
4 | 


--------------------------------------------------------------------------------
/test/test.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | 
 3 | # Smoke test: run compairr on seta.tsv and setb.tsv and compare the
 4 | # resulting output.tsv with expected.tsv.
 5 | # All paths are relative, so this must be run from the test directory.
 6 | 
 7 | if ! [ -e ../src/compairr ] ; then
 8 |     echo The compairr binary is missing
 9 |     echo Test failed.
10 |     exit 1
11 | fi
12 | 
13 | # Remove output left over from an earlier run, so that a run that fails
14 | # to write output.tsv cannot pass by comparing a stale file.
15 | rm -f output.tsv
16 | 
17 | # A non-zero exit status is a failure, whatever ended up in the output.
18 | if ! ../src/compairr -m seta.tsv setb.tsv -d 1 -i -l compairr.log -o output.tsv ; then
19 |     echo The compairr run failed
20 |     echo Test failed.
21 |     exit 1
22 | fi
23 | 
24 | if diff -q output.tsv expected.tsv; then
25 |     echo Test completed successfully.
26 | else
27 |     echo Test failed.
28 |     exit 1
29 | fi
30 | 


--------------------------------------------------------------------------------