├── .dockerignore
├── .gitignore
├── Dockerfile
├── LICENSE
├── Makefile
├── README.md
├── src
│   ├── Makefile
│   ├── arch.cc
│   ├── arch.h
│   ├── bloompat.cc
│   ├── bloompat.h
│   ├── cluster.cc
│   ├── cluster.h
│   ├── compairr.cc
│   ├── compairr.h
│   ├── db.cc
│   ├── db.h
│   ├── dedup.cc
│   ├── dedup.h
│   ├── hashtable.cc
│   ├── hashtable.h
│   ├── overlap.cc
│   ├── overlap.h
│   ├── threads.h
│   ├── util.cc
│   ├── util.h
│   ├── variants.cc
│   ├── variants.h
│   ├── zobrist.cc
│   └── zobrist.h
└── test
    ├── Makefile
    ├── expected.tsv
    ├── seta.tsv
    ├── setb.tsv
    ├── setc.tsv
    └── test.sh
/.dockerignore:
--------------------------------------------------------------------------------
1 | .git
2 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .DS_Store
2 |
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM alpine:3.13
2 | WORKDIR /opt/compairr
3 | COPY Makefile .
4 | COPY src ./src
5 | COPY test ./test
6 | RUN apk add --no-cache libstdc++ make g++ && \
7 | make clean && make && make test && make install && make clean && \
8 | apk del make g++
9 | ENTRYPOINT ["/usr/local/bin/compairr"]
10 | CMD ["--help"]
11 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | GNU AFFERO GENERAL PUBLIC LICENSE
2 | Version 3, 19 November 2007
3 |
4 | Copyright (C) 2007 Free Software Foundation, Inc.
5 | Everyone is permitted to copy and distribute verbatim copies
6 | of this license document, but changing it is not allowed.
7 |
8 | Preamble
9 |
10 | The GNU Affero General Public License is a free, copyleft license for
11 | software and other kinds of works, specifically designed to ensure
12 | cooperation with the community in the case of network server software.
13 |
14 | The licenses for most software and other practical works are designed
15 | to take away your freedom to share and change the works. By contrast,
16 | our General Public Licenses are intended to guarantee your freedom to
17 | share and change all versions of a program--to make sure it remains free
18 | software for all its users.
19 |
20 | When we speak of free software, we are referring to freedom, not
21 | price. Our General Public Licenses are designed to make sure that you
22 | have the freedom to distribute copies of free software (and charge for
23 | them if you wish), that you receive source code or can get it if you
24 | want it, that you can change the software or use pieces of it in new
25 | free programs, and that you know you can do these things.
26 |
27 | Developers that use our General Public Licenses protect your rights
28 | with two steps: (1) assert copyright on the software, and (2) offer
29 | you this License which gives you legal permission to copy, distribute
30 | and/or modify the software.
31 |
32 | A secondary benefit of defending all users' freedom is that
33 | improvements made in alternate versions of the program, if they
34 | receive widespread use, become available for other developers to
35 | incorporate. Many developers of free software are heartened and
36 | encouraged by the resulting cooperation. However, in the case of
37 | software used on network servers, this result may fail to come about.
38 | The GNU General Public License permits making a modified version and
39 | letting the public access it on a server without ever releasing its
40 | source code to the public.
41 |
42 | The GNU Affero General Public License is designed specifically to
43 | ensure that, in such cases, the modified source code becomes available
44 | to the community. It requires the operator of a network server to
45 | provide the source code of the modified version running there to the
46 | users of that server. Therefore, public use of a modified version, on
47 | a publicly accessible server, gives the public access to the source
48 | code of the modified version.
49 |
50 | An older license, called the Affero General Public License and
51 | published by Affero, was designed to accomplish similar goals. This is
52 | a different license, not a version of the Affero GPL, but Affero has
53 | released a new version of the Affero GPL which permits relicensing under
54 | this license.
55 |
56 | The precise terms and conditions for copying, distribution and
57 | modification follow.
58 |
59 | TERMS AND CONDITIONS
60 |
61 | 0. Definitions.
62 |
63 | "This License" refers to version 3 of the GNU Affero General Public License.
64 |
65 | "Copyright" also means copyright-like laws that apply to other kinds of
66 | works, such as semiconductor masks.
67 |
68 | "The Program" refers to any copyrightable work licensed under this
69 | License. Each licensee is addressed as "you". "Licensees" and
70 | "recipients" may be individuals or organizations.
71 |
72 | To "modify" a work means to copy from or adapt all or part of the work
73 | in a fashion requiring copyright permission, other than the making of an
74 | exact copy. The resulting work is called a "modified version" of the
75 | earlier work or a work "based on" the earlier work.
76 |
77 | A "covered work" means either the unmodified Program or a work based
78 | on the Program.
79 |
80 | To "propagate" a work means to do anything with it that, without
81 | permission, would make you directly or secondarily liable for
82 | infringement under applicable copyright law, except executing it on a
83 | computer or modifying a private copy. Propagation includes copying,
84 | distribution (with or without modification), making available to the
85 | public, and in some countries other activities as well.
86 |
87 | To "convey" a work means any kind of propagation that enables other
88 | parties to make or receive copies. Mere interaction with a user through
89 | a computer network, with no transfer of a copy, is not conveying.
90 |
91 | An interactive user interface displays "Appropriate Legal Notices"
92 | to the extent that it includes a convenient and prominently visible
93 | feature that (1) displays an appropriate copyright notice, and (2)
94 | tells the user that there is no warranty for the work (except to the
95 | extent that warranties are provided), that licensees may convey the
96 | work under this License, and how to view a copy of this License. If
97 | the interface presents a list of user commands or options, such as a
98 | menu, a prominent item in the list meets this criterion.
99 |
100 | 1. Source Code.
101 |
102 | The "source code" for a work means the preferred form of the work
103 | for making modifications to it. "Object code" means any non-source
104 | form of a work.
105 |
106 | A "Standard Interface" means an interface that either is an official
107 | standard defined by a recognized standards body, or, in the case of
108 | interfaces specified for a particular programming language, one that
109 | is widely used among developers working in that language.
110 |
111 | The "System Libraries" of an executable work include anything, other
112 | than the work as a whole, that (a) is included in the normal form of
113 | packaging a Major Component, but which is not part of that Major
114 | Component, and (b) serves only to enable use of the work with that
115 | Major Component, or to implement a Standard Interface for which an
116 | implementation is available to the public in source code form. A
117 | "Major Component", in this context, means a major essential component
118 | (kernel, window system, and so on) of the specific operating system
119 | (if any) on which the executable work runs, or a compiler used to
120 | produce the work, or an object code interpreter used to run it.
121 |
122 | The "Corresponding Source" for a work in object code form means all
123 | the source code needed to generate, install, and (for an executable
124 | work) run the object code and to modify the work, including scripts to
125 | control those activities. However, it does not include the work's
126 | System Libraries, or general-purpose tools or generally available free
127 | programs which are used unmodified in performing those activities but
128 | which are not part of the work. For example, Corresponding Source
129 | includes interface definition files associated with source files for
130 | the work, and the source code for shared libraries and dynamically
131 | linked subprograms that the work is specifically designed to require,
132 | such as by intimate data communication or control flow between those
133 | subprograms and other parts of the work.
134 |
135 | The Corresponding Source need not include anything that users
136 | can regenerate automatically from other parts of the Corresponding
137 | Source.
138 |
139 | The Corresponding Source for a work in source code form is that
140 | same work.
141 |
142 | 2. Basic Permissions.
143 |
144 | All rights granted under this License are granted for the term of
145 | copyright on the Program, and are irrevocable provided the stated
146 | conditions are met. This License explicitly affirms your unlimited
147 | permission to run the unmodified Program. The output from running a
148 | covered work is covered by this License only if the output, given its
149 | content, constitutes a covered work. This License acknowledges your
150 | rights of fair use or other equivalent, as provided by copyright law.
151 |
152 | You may make, run and propagate covered works that you do not
153 | convey, without conditions so long as your license otherwise remains
154 | in force. You may convey covered works to others for the sole purpose
155 | of having them make modifications exclusively for you, or provide you
156 | with facilities for running those works, provided that you comply with
157 | the terms of this License in conveying all material for which you do
158 | not control copyright. Those thus making or running the covered works
159 | for you must do so exclusively on your behalf, under your direction
160 | and control, on terms that prohibit them from making any copies of
161 | your copyrighted material outside their relationship with you.
162 |
163 | Conveying under any other circumstances is permitted solely under
164 | the conditions stated below. Sublicensing is not allowed; section 10
165 | makes it unnecessary.
166 |
167 | 3. Protecting Users' Legal Rights From Anti-Circumvention Law.
168 |
169 | No covered work shall be deemed part of an effective technological
170 | measure under any applicable law fulfilling obligations under article
171 | 11 of the WIPO copyright treaty adopted on 20 December 1996, or
172 | similar laws prohibiting or restricting circumvention of such
173 | measures.
174 |
175 | When you convey a covered work, you waive any legal power to forbid
176 | circumvention of technological measures to the extent such circumvention
177 | is effected by exercising rights under this License with respect to
178 | the covered work, and you disclaim any intention to limit operation or
179 | modification of the work as a means of enforcing, against the work's
180 | users, your or third parties' legal rights to forbid circumvention of
181 | technological measures.
182 |
183 | 4. Conveying Verbatim Copies.
184 |
185 | You may convey verbatim copies of the Program's source code as you
186 | receive it, in any medium, provided that you conspicuously and
187 | appropriately publish on each copy an appropriate copyright notice;
188 | keep intact all notices stating that this License and any
189 | non-permissive terms added in accord with section 7 apply to the code;
190 | keep intact all notices of the absence of any warranty; and give all
191 | recipients a copy of this License along with the Program.
192 |
193 | You may charge any price or no price for each copy that you convey,
194 | and you may offer support or warranty protection for a fee.
195 |
196 | 5. Conveying Modified Source Versions.
197 |
198 | You may convey a work based on the Program, or the modifications to
199 | produce it from the Program, in the form of source code under the
200 | terms of section 4, provided that you also meet all of these conditions:
201 |
202 | a) The work must carry prominent notices stating that you modified
203 | it, and giving a relevant date.
204 |
205 | b) The work must carry prominent notices stating that it is
206 | released under this License and any conditions added under section
207 | 7. This requirement modifies the requirement in section 4 to
208 | "keep intact all notices".
209 |
210 | c) You must license the entire work, as a whole, under this
211 | License to anyone who comes into possession of a copy. This
212 | License will therefore apply, along with any applicable section 7
213 | additional terms, to the whole of the work, and all its parts,
214 | regardless of how they are packaged. This License gives no
215 | permission to license the work in any other way, but it does not
216 | invalidate such permission if you have separately received it.
217 |
218 | d) If the work has interactive user interfaces, each must display
219 | Appropriate Legal Notices; however, if the Program has interactive
220 | interfaces that do not display Appropriate Legal Notices, your
221 | work need not make them do so.
222 |
223 | A compilation of a covered work with other separate and independent
224 | works, which are not by their nature extensions of the covered work,
225 | and which are not combined with it such as to form a larger program,
226 | in or on a volume of a storage or distribution medium, is called an
227 | "aggregate" if the compilation and its resulting copyright are not
228 | used to limit the access or legal rights of the compilation's users
229 | beyond what the individual works permit. Inclusion of a covered work
230 | in an aggregate does not cause this License to apply to the other
231 | parts of the aggregate.
232 |
233 | 6. Conveying Non-Source Forms.
234 |
235 | You may convey a covered work in object code form under the terms
236 | of sections 4 and 5, provided that you also convey the
237 | machine-readable Corresponding Source under the terms of this License,
238 | in one of these ways:
239 |
240 | a) Convey the object code in, or embodied in, a physical product
241 | (including a physical distribution medium), accompanied by the
242 | Corresponding Source fixed on a durable physical medium
243 | customarily used for software interchange.
244 |
245 | b) Convey the object code in, or embodied in, a physical product
246 | (including a physical distribution medium), accompanied by a
247 | written offer, valid for at least three years and valid for as
248 | long as you offer spare parts or customer support for that product
249 | model, to give anyone who possesses the object code either (1) a
250 | copy of the Corresponding Source for all the software in the
251 | product that is covered by this License, on a durable physical
252 | medium customarily used for software interchange, for a price no
253 | more than your reasonable cost of physically performing this
254 | conveying of source, or (2) access to copy the
255 | Corresponding Source from a network server at no charge.
256 |
257 | c) Convey individual copies of the object code with a copy of the
258 | written offer to provide the Corresponding Source. This
259 | alternative is allowed only occasionally and noncommercially, and
260 | only if you received the object code with such an offer, in accord
261 | with subsection 6b.
262 |
263 | d) Convey the object code by offering access from a designated
264 | place (gratis or for a charge), and offer equivalent access to the
265 | Corresponding Source in the same way through the same place at no
266 | further charge. You need not require recipients to copy the
267 | Corresponding Source along with the object code. If the place to
268 | copy the object code is a network server, the Corresponding Source
269 | may be on a different server (operated by you or a third party)
270 | that supports equivalent copying facilities, provided you maintain
271 | clear directions next to the object code saying where to find the
272 | Corresponding Source. Regardless of what server hosts the
273 | Corresponding Source, you remain obligated to ensure that it is
274 | available for as long as needed to satisfy these requirements.
275 |
276 | e) Convey the object code using peer-to-peer transmission, provided
277 | you inform other peers where the object code and Corresponding
278 | Source of the work are being offered to the general public at no
279 | charge under subsection 6d.
280 |
281 | A separable portion of the object code, whose source code is excluded
282 | from the Corresponding Source as a System Library, need not be
283 | included in conveying the object code work.
284 |
285 | A "User Product" is either (1) a "consumer product", which means any
286 | tangible personal property which is normally used for personal, family,
287 | or household purposes, or (2) anything designed or sold for incorporation
288 | into a dwelling. In determining whether a product is a consumer product,
289 | doubtful cases shall be resolved in favor of coverage. For a particular
290 | product received by a particular user, "normally used" refers to a
291 | typical or common use of that class of product, regardless of the status
292 | of the particular user or of the way in which the particular user
293 | actually uses, or expects or is expected to use, the product. A product
294 | is a consumer product regardless of whether the product has substantial
295 | commercial, industrial or non-consumer uses, unless such uses represent
296 | the only significant mode of use of the product.
297 |
298 | "Installation Information" for a User Product means any methods,
299 | procedures, authorization keys, or other information required to install
300 | and execute modified versions of a covered work in that User Product from
301 | a modified version of its Corresponding Source. The information must
302 | suffice to ensure that the continued functioning of the modified object
303 | code is in no case prevented or interfered with solely because
304 | modification has been made.
305 |
306 | If you convey an object code work under this section in, or with, or
307 | specifically for use in, a User Product, and the conveying occurs as
308 | part of a transaction in which the right of possession and use of the
309 | User Product is transferred to the recipient in perpetuity or for a
310 | fixed term (regardless of how the transaction is characterized), the
311 | Corresponding Source conveyed under this section must be accompanied
312 | by the Installation Information. But this requirement does not apply
313 | if neither you nor any third party retains the ability to install
314 | modified object code on the User Product (for example, the work has
315 | been installed in ROM).
316 |
317 | The requirement to provide Installation Information does not include a
318 | requirement to continue to provide support service, warranty, or updates
319 | for a work that has been modified or installed by the recipient, or for
320 | the User Product in which it has been modified or installed. Access to a
321 | network may be denied when the modification itself materially and
322 | adversely affects the operation of the network or violates the rules and
323 | protocols for communication across the network.
324 |
325 | Corresponding Source conveyed, and Installation Information provided,
326 | in accord with this section must be in a format that is publicly
327 | documented (and with an implementation available to the public in
328 | source code form), and must require no special password or key for
329 | unpacking, reading or copying.
330 |
331 | 7. Additional Terms.
332 |
333 | "Additional permissions" are terms that supplement the terms of this
334 | License by making exceptions from one or more of its conditions.
335 | Additional permissions that are applicable to the entire Program shall
336 | be treated as though they were included in this License, to the extent
337 | that they are valid under applicable law. If additional permissions
338 | apply only to part of the Program, that part may be used separately
339 | under those permissions, but the entire Program remains governed by
340 | this License without regard to the additional permissions.
341 |
342 | When you convey a copy of a covered work, you may at your option
343 | remove any additional permissions from that copy, or from any part of
344 | it. (Additional permissions may be written to require their own
345 | removal in certain cases when you modify the work.) You may place
346 | additional permissions on material, added by you to a covered work,
347 | for which you have or can give appropriate copyright permission.
348 |
349 | Notwithstanding any other provision of this License, for material you
350 | add to a covered work, you may (if authorized by the copyright holders of
351 | that material) supplement the terms of this License with terms:
352 |
353 | a) Disclaiming warranty or limiting liability differently from the
354 | terms of sections 15 and 16 of this License; or
355 |
356 | b) Requiring preservation of specified reasonable legal notices or
357 | author attributions in that material or in the Appropriate Legal
358 | Notices displayed by works containing it; or
359 |
360 | c) Prohibiting misrepresentation of the origin of that material, or
361 | requiring that modified versions of such material be marked in
362 | reasonable ways as different from the original version; or
363 |
364 | d) Limiting the use for publicity purposes of names of licensors or
365 | authors of the material; or
366 |
367 | e) Declining to grant rights under trademark law for use of some
368 | trade names, trademarks, or service marks; or
369 |
370 | f) Requiring indemnification of licensors and authors of that
371 | material by anyone who conveys the material (or modified versions of
372 | it) with contractual assumptions of liability to the recipient, for
373 | any liability that these contractual assumptions directly impose on
374 | those licensors and authors.
375 |
376 | All other non-permissive additional terms are considered "further
377 | restrictions" within the meaning of section 10. If the Program as you
378 | received it, or any part of it, contains a notice stating that it is
379 | governed by this License along with a term that is a further
380 | restriction, you may remove that term. If a license document contains
381 | a further restriction but permits relicensing or conveying under this
382 | License, you may add to a covered work material governed by the terms
383 | of that license document, provided that the further restriction does
384 | not survive such relicensing or conveying.
385 |
386 | If you add terms to a covered work in accord with this section, you
387 | must place, in the relevant source files, a statement of the
388 | additional terms that apply to those files, or a notice indicating
389 | where to find the applicable terms.
390 |
391 | Additional terms, permissive or non-permissive, may be stated in the
392 | form of a separately written license, or stated as exceptions;
393 | the above requirements apply either way.
394 |
395 | 8. Termination.
396 |
397 | You may not propagate or modify a covered work except as expressly
398 | provided under this License. Any attempt otherwise to propagate or
399 | modify it is void, and will automatically terminate your rights under
400 | this License (including any patent licenses granted under the third
401 | paragraph of section 11).
402 |
403 | However, if you cease all violation of this License, then your
404 | license from a particular copyright holder is reinstated (a)
405 | provisionally, unless and until the copyright holder explicitly and
406 | finally terminates your license, and (b) permanently, if the copyright
407 | holder fails to notify you of the violation by some reasonable means
408 | prior to 60 days after the cessation.
409 |
410 | Moreover, your license from a particular copyright holder is
411 | reinstated permanently if the copyright holder notifies you of the
412 | violation by some reasonable means, this is the first time you have
413 | received notice of violation of this License (for any work) from that
414 | copyright holder, and you cure the violation prior to 30 days after
415 | your receipt of the notice.
416 |
417 | Termination of your rights under this section does not terminate the
418 | licenses of parties who have received copies or rights from you under
419 | this License. If your rights have been terminated and not permanently
420 | reinstated, you do not qualify to receive new licenses for the same
421 | material under section 10.
422 |
423 | 9. Acceptance Not Required for Having Copies.
424 |
425 | You are not required to accept this License in order to receive or
426 | run a copy of the Program. Ancillary propagation of a covered work
427 | occurring solely as a consequence of using peer-to-peer transmission
428 | to receive a copy likewise does not require acceptance. However,
429 | nothing other than this License grants you permission to propagate or
430 | modify any covered work. These actions infringe copyright if you do
431 | not accept this License. Therefore, by modifying or propagating a
432 | covered work, you indicate your acceptance of this License to do so.
433 |
434 | 10. Automatic Licensing of Downstream Recipients.
435 |
436 | Each time you convey a covered work, the recipient automatically
437 | receives a license from the original licensors, to run, modify and
438 | propagate that work, subject to this License. You are not responsible
439 | for enforcing compliance by third parties with this License.
440 |
441 | An "entity transaction" is a transaction transferring control of an
442 | organization, or substantially all assets of one, or subdividing an
443 | organization, or merging organizations. If propagation of a covered
444 | work results from an entity transaction, each party to that
445 | transaction who receives a copy of the work also receives whatever
446 | licenses to the work the party's predecessor in interest had or could
447 | give under the previous paragraph, plus a right to possession of the
448 | Corresponding Source of the work from the predecessor in interest, if
449 | the predecessor has it or can get it with reasonable efforts.
450 |
451 | You may not impose any further restrictions on the exercise of the
452 | rights granted or affirmed under this License. For example, you may
453 | not impose a license fee, royalty, or other charge for exercise of
454 | rights granted under this License, and you may not initiate litigation
455 | (including a cross-claim or counterclaim in a lawsuit) alleging that
456 | any patent claim is infringed by making, using, selling, offering for
457 | sale, or importing the Program or any portion of it.
458 |
459 | 11. Patents.
460 |
461 | A "contributor" is a copyright holder who authorizes use under this
462 | License of the Program or a work on which the Program is based. The
463 | work thus licensed is called the contributor's "contributor version".
464 |
465 | A contributor's "essential patent claims" are all patent claims
466 | owned or controlled by the contributor, whether already acquired or
467 | hereafter acquired, that would be infringed by some manner, permitted
468 | by this License, of making, using, or selling its contributor version,
469 | but do not include claims that would be infringed only as a
470 | consequence of further modification of the contributor version. For
471 | purposes of this definition, "control" includes the right to grant
472 | patent sublicenses in a manner consistent with the requirements of
473 | this License.
474 |
475 | Each contributor grants you a non-exclusive, worldwide, royalty-free
476 | patent license under the contributor's essential patent claims, to
477 | make, use, sell, offer for sale, import and otherwise run, modify and
478 | propagate the contents of its contributor version.
479 |
480 | In the following three paragraphs, a "patent license" is any express
481 | agreement or commitment, however denominated, not to enforce a patent
482 | (such as an express permission to practice a patent or covenant not to
483 | sue for patent infringement). To "grant" such a patent license to a
484 | party means to make such an agreement or commitment not to enforce a
485 | patent against the party.
486 |
487 | If you convey a covered work, knowingly relying on a patent license,
488 | and the Corresponding Source of the work is not available for anyone
489 | to copy, free of charge and under the terms of this License, through a
490 | publicly available network server or other readily accessible means,
491 | then you must either (1) cause the Corresponding Source to be so
492 | available, or (2) arrange to deprive yourself of the benefit of the
493 | patent license for this particular work, or (3) arrange, in a manner
494 | consistent with the requirements of this License, to extend the patent
495 | license to downstream recipients. "Knowingly relying" means you have
496 | actual knowledge that, but for the patent license, your conveying the
497 | covered work in a country, or your recipient's use of the covered work
498 | in a country, would infringe one or more identifiable patents in that
499 | country that you have reason to believe are valid.
500 |
501 | If, pursuant to or in connection with a single transaction or
502 | arrangement, you convey, or propagate by procuring conveyance of, a
503 | covered work, and grant a patent license to some of the parties
504 | receiving the covered work authorizing them to use, propagate, modify
505 | or convey a specific copy of the covered work, then the patent license
506 | you grant is automatically extended to all recipients of the covered
507 | work and works based on it.
508 |
509 | A patent license is "discriminatory" if it does not include within
510 | the scope of its coverage, prohibits the exercise of, or is
511 | conditioned on the non-exercise of one or more of the rights that are
512 | specifically granted under this License. You may not convey a covered
513 | work if you are a party to an arrangement with a third party that is
514 | in the business of distributing software, under which you make payment
515 | to the third party based on the extent of your activity of conveying
516 | the work, and under which the third party grants, to any of the
517 | parties who would receive the covered work from you, a discriminatory
518 | patent license (a) in connection with copies of the covered work
519 | conveyed by you (or copies made from those copies), or (b) primarily
520 | for and in connection with specific products or compilations that
521 | contain the covered work, unless you entered into that arrangement,
522 | or that patent license was granted, prior to 28 March 2007.
523 |
524 | Nothing in this License shall be construed as excluding or limiting
525 | any implied license or other defenses to infringement that may
526 | otherwise be available to you under applicable patent law.
527 |
528 | 12. No Surrender of Others' Freedom.
529 |
530 | If conditions are imposed on you (whether by court order, agreement or
531 | otherwise) that contradict the conditions of this License, they do not
532 | excuse you from the conditions of this License. If you cannot convey a
533 | covered work so as to satisfy simultaneously your obligations under this
534 | License and any other pertinent obligations, then as a consequence you may
535 | not convey it at all. For example, if you agree to terms that obligate you
536 | to collect a royalty for further conveying from those to whom you convey
537 | the Program, the only way you could satisfy both those terms and this
538 | License would be to refrain entirely from conveying the Program.
539 |
540 | 13. Remote Network Interaction; Use with the GNU General Public License.
541 |
542 | Notwithstanding any other provision of this License, if you modify the
543 | Program, your modified version must prominently offer all users
544 | interacting with it remotely through a computer network (if your version
545 | supports such interaction) an opportunity to receive the Corresponding
546 | Source of your version by providing access to the Corresponding Source
547 | from a network server at no charge, through some standard or customary
548 | means of facilitating copying of software. This Corresponding Source
549 | shall include the Corresponding Source for any work covered by version 3
550 | of the GNU General Public License that is incorporated pursuant to the
551 | following paragraph.
552 |
553 | Notwithstanding any other provision of this License, you have
554 | permission to link or combine any covered work with a work licensed
555 | under version 3 of the GNU General Public License into a single
556 | combined work, and to convey the resulting work. The terms of this
557 | License will continue to apply to the part which is the covered work,
558 | but the work with which it is combined will remain governed by version
559 | 3 of the GNU General Public License.
560 |
561 | 14. Revised Versions of this License.
562 |
563 | The Free Software Foundation may publish revised and/or new versions of
564 | the GNU Affero General Public License from time to time. Such new versions
565 | will be similar in spirit to the present version, but may differ in detail to
566 | address new problems or concerns.
567 |
568 | Each version is given a distinguishing version number. If the
569 | Program specifies that a certain numbered version of the GNU Affero General
570 | Public License "or any later version" applies to it, you have the
571 | option of following the terms and conditions either of that numbered
572 | version or of any later version published by the Free Software
573 | Foundation. If the Program does not specify a version number of the
574 | GNU Affero General Public License, you may choose any version ever published
575 | by the Free Software Foundation.
576 |
577 | If the Program specifies that a proxy can decide which future
578 | versions of the GNU Affero General Public License can be used, that proxy's
579 | public statement of acceptance of a version permanently authorizes you
580 | to choose that version for the Program.
581 |
582 | Later license versions may give you additional or different
583 | permissions. However, no additional obligations are imposed on any
584 | author or copyright holder as a result of your choosing to follow a
585 | later version.
586 |
587 | 15. Disclaimer of Warranty.
588 |
589 | THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
590 | APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
591 | HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
592 | OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
593 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
594 | PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
595 | IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
596 | ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
597 |
598 | 16. Limitation of Liability.
599 |
600 | IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
601 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
602 | THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
603 | GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
604 | USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
605 | DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
606 | PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
607 | EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
608 | SUCH DAMAGES.
609 |
610 | 17. Interpretation of Sections 15 and 16.
611 |
612 | If the disclaimer of warranty and limitation of liability provided
613 | above cannot be given local legal effect according to their terms,
614 | reviewing courts shall apply local law that most closely approximates
615 | an absolute waiver of all civil liability in connection with the
616 | Program, unless a warranty or assumption of liability accompanies a
617 | copy of the Program in return for a fee.
618 |
619 | END OF TERMS AND CONDITIONS
620 |
621 | How to Apply These Terms to Your New Programs
622 |
623 | If you develop a new program, and you want it to be of the greatest
624 | possible use to the public, the best way to achieve this is to make it
625 | free software which everyone can redistribute and change under these terms.
626 |
627 | To do so, attach the following notices to the program. It is safest
628 | to attach them to the start of each source file to most effectively
629 | state the exclusion of warranty; and each file should have at least
630 | the "copyright" line and a pointer to where the full notice is found.
631 |
632 | <one line to give the program's name and a brief idea of what it does.>
633 | Copyright (C) <year> <name of author>
634 |
635 | This program is free software: you can redistribute it and/or modify
636 | it under the terms of the GNU Affero General Public License as published by
637 | the Free Software Foundation, either version 3 of the License, or
638 | (at your option) any later version.
639 |
640 | This program is distributed in the hope that it will be useful,
641 | but WITHOUT ANY WARRANTY; without even the implied warranty of
642 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
643 | GNU Affero General Public License for more details.
644 |
645 | You should have received a copy of the GNU Affero General Public License
646 | along with this program. If not, see <https://www.gnu.org/licenses/>.
647 |
648 | Also add information on how to contact you by electronic and paper mail.
649 |
650 | If your software can interact with users remotely through a computer
651 | network, you should also make sure that it provides a way for users to
652 | get its source. For example, if your program is a web application, its
653 | interface could display a "Source" link that leads users to an archive
654 | of the code. There are many ways you could offer source, and different
655 | solutions will be better for different programs; see section 13 for the
656 | specific requirements.
657 |
658 | You should also get your employer (if you work as a programmer) or school,
659 | if any, to sign a "copyright disclaimer" for the program, if necessary.
660 | For more information on this, and how to apply and follow the GNU AGPL, see
661 | <https://www.gnu.org/licenses/>.
662 |
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | # Makefile for CompAIRR
2 | #
3 | # Top-level wrapper that delegates to the Makefiles in src/ and test/.
4 | # PREFIX selects the installation root; the binary goes to $(PREFIX)/bin.
5 |
6 | ifndef PREFIX
7 | PREFIX=/usr/local
8 | endif
9 |
10 | # None of these targets produce a file of the same name, and `test` is
11 | # also a directory in this repository, so declare them all as phony.
12 | .PHONY: all compairr test install clean
13 |
14 | all : compairr
15 |
16 | # Use $(MAKE) rather than a literal `make` so that command line flags
17 | # (e.g. -j, -n) and the jobserver are passed on to the sub-make.
18 | compairr:
19 | 	$(MAKE) -C src compairr
20 |
21 | test: compairr
22 | 	$(MAKE) -C test
23 |
24 | # Installation runs the test suite first.
25 | install: compairr test
26 | 	/usr/bin/install -d $(PREFIX)/bin
27 | 	/usr/bin/install -c src/compairr $(PREFIX)/bin/compairr
28 |
29 | clean:
30 | 	$(MAKE) -C src clean
31 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | [AIRR-compliant](https://docs.airr-community.org/en/stable/swtools/airr_swtools_standard.html)
2 |
3 | # CompAIRR
4 |
5 | CompAIRR (`compairr`) is a command line tool to compare two sets of
6 | adaptive immune receptor repertoires and compute their overlap. It can
7 | also identify which sequences are present in which repertoires.
8 | Furthermore, CompAIRR can cluster the sequences in a repertoire
9 | set. Sequence comparisons can be exact or approximate. CompAIRR has
10 | been shown to be very fast and to have a small memory footprint
11 | compared to similar tools, when up to 2 differences are allowed.
12 |
13 |
14 | ## Installation
15 |
16 | The code is C++11 standard compliant and should compile easily using
17 | `make` and a modern C++ compiler (e.g. GNU GCC or LLVM Clang). Run
18 | `make clean`, `make`, `make test` and `make install` in the main
19 | folder to clean, build, test and install the tool. There are no
20 | dependencies except for the C and C++ standard libraries.
21 |
22 | Binaries for Linux (x86_64) and macOS (x86_64 and Arm64) are also
23 | distributed with each
24 | [release](https://github.com/uio-bmi/compairr/releases/latest).
25 |
26 | A `Dockerfile` is included if you want to make a Docker image. A
27 | docker image may be built with the following command:
28 |
29 | ```sh
30 | docker build -t compairr .
31 | ```
32 |
33 | Ready-made Docker images for CompAIRR can be found on the
34 | [Docker Hub](https://hub.docker.com/r/torognes/compairr).
35 |
36 | CompAIRR can be installed on macOS using homebrew with
37 | `brew install torognes/bioinf/compairr`.
38 |
39 |
40 | ## Tutorial
41 |
42 | For an introduction to how to use CompAIRR, please have a look at the
43 | [CompAIRR tutorial](https://github.com/LonnekeScheffer/compairr-tutorial).
44 |
45 |
46 | ## General options
47 |
48 | Use the `-h` or `--help` option to show some help information.
49 |
50 | Run the program with `-v` or `--version` for version information.
51 |
52 | The type of operation that should be performed is specified with one
53 | of the options `-m`, `-x`, `-c` or `-z` (or the corresponding long option
54 | forms `--matrix`, `--existence`, `--cluster`, or `--deduplicate`).
55 |
56 | The code is multi-threaded. The number of threads may be specified
57 | with the `-t` or `--threads` option.
58 |
59 | The results will be written to standard out (stdout) unless a file
60 | name has been specified with the `-o` or `--output` option.
61 |
62 | While the program is running it will print some status and progress
63 | information to standard error (stderr) unless a log file has been
64 | specified with the `-l` or `--log` option. Error messages and warnings
65 | will also be written here.
66 |
67 | The default is to compare amino acid sequences, but nucleotide
68 | sequences are compared if the `-n` or `--nucleotides` option is given.
69 | The accepted amino acid symbols are `ACDEFGHIKLMNPQRSTVWY`, while the
70 | accepted nucleotide symbols are `ACGTU`. Lower case letters are also
71 | accepted. The program will abort with an error message if any other
72 | symbol is encountered in a sequence, unless one specifies the `-u` or
73 | `--ignore-unknown` option, in which case CompAIRR will simply ignore
74 | that sequence. If the program encounters an empty sequence it will
75 | also abort with an error message, unless the `-e` or `--ignore-empty`
76 | option is given.
77 |
78 | By default, the sequences should be given in the `junction` or
79 | `junction_aa` column of the input file, for nucleotide and amino acid
80 | sequences, respectively. Alternatively, the sequences may be present
81 | in the `cdr3` or `cdr3_aa` column, if the `--cdr3` option is given.
82 |
83 | The user can specify how many differences are allowed when comparing
84 | sequences, using the option `-d` or `--differences`. To allow indels
85 | (insertions or deletions) the option `-i` or `--indels` may be
86 | specified, otherwise only substitutions are allowed. By default, no
87 | differences are allowed. The `-i` option is allowed only when d=1. The
88 | number of differences allowed strongly influences the speed of
89 | CompAIRR. The program will be slower as more differences
90 | are allowed. When d=0 or d=1 it is very fast, but it will be relatively
91 | slow with d=2 and even slower when d>2. See the section on performance
92 | below for an example.
93 |
94 | The V and J gene alleles specified for each sequence must also match,
95 | unless the `-g` or `--ignore-genes` option is in effect.
96 |
97 |
98 | ## Computing overlap between two repertoire sets
99 |
100 | To compute the overlap between two repertoire sets, use the `-m` or
101 | `--matrix` option.
102 |
103 | For each of the two repertoire sets there must be an input file of
104 | tab-separated values formatted according to [the AIRR standard for
105 | rearrangements](https://docs.airr-community.org/en/stable/datarep/rearrangements.html).
106 | The two input files are specified on the command line without any
107 | preceding option letter. If only one filename is specified on the
108 | command line, or the same filename is specified twice, it is assumed
109 | that the set should be compared to itself. Each file must contain the
110 | repertoire ID and either the nucleotide or the amino acid sequence of
111 | the rearrangement. If the repertoire ID column is missing, all
112 | sequences are assumed to belong to the same repertoire (with ID 1 or
113 | 2, respectively, for the two sets). A sequence ID may also be
114 | included. Unless they should be ignored, the V gene, the J gene, and
115 | the duplicate count are also needed.
116 |
117 | Each set can contain many repertoires and each repertoire can contain
118 | many sequences. The tool will find the sequences in the two sets that
119 | are similar and output a matrix with results.
120 |
121 | CompAIRR assumes that all sequences within each repertoire are
122 | distinct, and that the abundance of each sequence is indicated in the
123 | `duplicate_count` field in the input file. Duplicated sequences,
124 | i.e. identical sequences (with the same V and J genes) within the same
125 | repertoire, may lead to unexpected results. CompAIRR will warn if it
126 | detects duplicates. Duplicates may be merged with the `--deduplicate`
127 | command.
128 |
129 | The similar sequences of each repertoire in each set are found by
130 | comparing the sequences and their V and J genes. The duplicate count
131 | of each sequence is taken into account and a matrix is output
132 | containing a value for each combination of repertoires in the two
133 | sets. The value is usually the sum of the products of the duplicate
134 | counts of all pairs of sequences in the two repertoires that match. If
135 | the option `-f` or `--ignore-counts` is specified, the duplicate count
136 | information is ignored and all counts are treated as 1. Instead of
137 | summing the product of the counts, the ratio, min, max, or mean may be
138 | used if specified with the `-s` or `--score` option. The Morisita-Horn
139 | index or Jaccard index will be calculated if `MH` or `Jaccard` is
140 | specified with the `-s` option. These indices can only be computed
141 | when d=0.
142 |
143 | The output will be a matrix of values in a tab-separated plain text
144 | file. Two different formats can be selected. In the default format,
145 | the first line contains the hash character (`#`) followed by the
146 | repertoire IDs from the second set. The following lines contain the
147 | repertoire ID from the first set, followed by the values corresponding
148 | to the comparison of this repertoire with each of the repertoires in
149 | the second set.
150 |
151 | An alternative output format is used when the `-a` or `--alternative`
152 | option is specified. It will write the results in a three column
153 | format with the repertoire ID from set 1 and set 2 in the two first
154 | columns, respectively, and the value in the third column. There will
155 | be one line for each combination of repertoires in the sets. The very
156 | first line will contain a hash character (`#`) followed by the field
157 | names separated by tabs.
158 |
159 | If the `-p` or `--pairs` option is specified, CompAIRR will write
160 | information about all pairs of matching sequences to a specified TSV
161 | file. Please note that such files may grow very large when there are
162 | many matches. Use of multithreading may be of little use in this
163 | case. The order of the lines in the file is unspecified. The following
164 | columns from both input files will be included in the output:
165 | `repertoire_id`, `sequence_id`, `duplicate_count`, `v_call`, `j_call`,
166 | and `junction`. The term `junction` will be replaced with
167 | `junction_aa`, `cdr3`, or `cdr3_aa` as appropriate. Additional columns
168 | from the input files may be copied to the pairs file using the `-k` or
169 | `--keep-columns` option. Multiple columns, separated by commas (but no
170 | spaces), may be given. A warning will be given if any of the specified
171 | columns are missing. In the header, columns from the first and second
172 | input file will be suffixed by `_1` and `_2`, respectively. The
173 | distance between the sequences will be included if the `--distance`
174 | option is included. This is usually the Hamming distance (minimum
175 | number of substitutions), unless the `--indels` (or `-i`) option is
176 | specified, in which case the distance is the Levenshtein distance
177 | (minimum number of substitutions or indels). If only the information
178 | in the pairs file is required, and not the information in the matrix,
179 | the storage and output of the matrix can be avoided with the
180 | `--no-matrix` option. This may save some memory and time if there are
181 | many repertoires in the sets.
182 |
183 |
184 | ## Analysing in which repertoires a set of sequences are present
185 |
186 | Use the option `-x` or `--existence` to analyse in which repertoires a
187 | set of sequences are present, and create a sequence presence matrix.
188 |
189 | Two input files with repertoire sets in standard format must be
190 | specified on the command line. The first file should contain the
191 | different sequences to analyse. The `sequence_id` column must be
192 | present in this file. If the optional `repertoire_id` column is
193 | present, all those identifiers must be identical. The second file must
194 | contain the repertoires to match. The `repertoire_id` column should be
195 | present in the second file; otherwise the ID will be set to 2 for all
196 | sequences.
197 |
198 | CompAIRR will identify in which repertoires each sequence is present
199 | and will output the results either as a matrix or as a three-column
200 | table (if the `-a` option is specified). The options `-d`, `-i`, `-g`,
201 | and `-n` (and the corresponding long option names `--differences`,
202 | `--indels`, `--ignore-genes`, and `--nucleotides`) will be taken into
203 | account when comparing sequences.
204 |
205 | The output will be in a similar format as when computing the overlap
206 | (above), but the first column will contain the `sequence_id` from the
207 | first file instead of the `repertoire_id`.
208 |
209 | The `-p` or `--pairs` option may be specified to output all pairs of
210 | matching sequences in the same way as for the overlap computation.
211 |
212 |
213 | ## Clustering the sequences in a repertoire
214 |
215 | To cluster the sequences in one repertoire, use the `-c` or
216 | `--cluster` option.
217 |
218 | One input file in tab-separated format must be specified on the
219 | command line.
220 |
221 | The tool will cluster the sequences using single linkage hierarchical
222 | clustering, according to the specified distance and indel options
223 | (`-d`, `--differences`, `-i`, `--indels`). The V and J gene alleles will
224 | be taken into account unless the `-g` or `--ignore-genes` option is
225 | specified. The options `-n` or `--nucleotides` indicate that the
226 | comparison should be performed with nucleotide sequences, not amino
227 | acid sequences. If the repertoire ID column is missing, all
228 | sequences are assumed to belong to the same repertoire (with ID 1).
229 |
230 | The output will be in a similar TSV format as the input file, but
231 | preceded with two additional columns. The first column will contain a
232 | cluster number, starting at 1. The second column will contain the size
233 | of the cluster. The subsequent columns are `repertoire_id`,
234 | `sequence_id`, `duplicate_count`, `v_call`, `j_call`, and `junction`
235 | (or `junction_aa`, `cdr3` or `cdr3_aa`, as appropriate).
236 |
237 | The clusters are sorted by size, in descending order.
238 |
239 |
240 | ## Deduplication
241 |
242 | The `--deduplicate` command may be used to deduplicate a data set by
243 | merging entries in the same repertoire with identical sequences and
244 | identical V and J genes. This may be necessary to get correct results
245 | when computing overlaps between repertoires. Duplicates may be present
246 | for instance in cases where the data set contains both nucleotide and
247 | amino acid sequences from the same rearrangement, where the nucleotide
248 | sequences may be distinct while the amino acid sequences may not be,
249 | due to the degeneracy of the genetic code.
250 |
251 | One input file in TSV format must be specified on the command line.
252 |
253 | Strictly identical sequences in the same repertoire will be merged and
254 | their counts will be added together. If the `-g` or `--ignore-genes`
255 | option is specified, the V and J genes are ignored. The `-n` or
256 | `--nucleotides` option may be specified if the input is nucleotide
257 | sequences, otherwise amino acid sequences will be assumed. If the `-f`
258 | or `--ignore-counts` option is specified, the counts in the input file
259 | will be ignored, and just the number of identical sequences will be
260 | counted. If the repertoire ID column is missing, all sequences are
261 | assumed to belong to the same repertoire (with ID 1).
262 |
263 | The output will be in a similar TSV format as the input file, with the
264 | following columns: `repertoire_id`, `duplicate_count`, `v_call`,
265 | `j_call`, and `junction` (or `junction_aa`, `cdr3` or `cdr3_aa`, as
266 | appropriate). If the `-g` or `--ignore-genes` option is specified, the
267 | `v_call` and `j_call` columns will not be included.
268 |
269 |
270 | ## Input files
271 |
272 | The input files must be in tab-separated value (TSV) format according
273 | to the [Rearrangement
274 | Schema](https://docs.airr-community.org/en/stable/datarep/rearrangements.html)
275 | of the [AIRR standards 1.3
276 | documentation](https://docs.airr-community.org/en/stable/).
277 |
278 | The first line must contain the header. The rest of the file must
279 | contain one line per sequence. The following fields should be included:
280 |
281 | * `repertoire_id`: identifier of the repertoire
282 | * `sequence_id`: identifier of the sequence (optional except for the first file when using `-x` or `--existence`)
283 | * `duplicate_count`: number of identical copies of the same rearrangement (required unless `-f` option given)
284 | * `v_call`: V gene name with allele (required unless `-g` option given)
285 | * `j_call`: J gene name with allele (required unless `-g` option given)
286 | * `junction`: nucleotide sequence (required if `-n` option given and `--cdr3` option not given)
287 | * `junction_aa`: amino acid sequence (single letter code) (required unless `-n` or `--cdr3` options given)
288 | * `cdr3`: nucleotide sequence (required if both `-n` and `--cdr3` options given)
289 | * `cdr3_aa`: amino acid sequence (single letter code) (required if `--cdr3` option given and `-n` option not given)
290 |
291 | See below for an example. Other fields may be included, but will be
292 | ignored.
293 |
294 |
295 | ## Command line option overview
296 |
297 | The command line should look like this:
298 |
299 | ```
300 | compairr OPTIONS TSVFILE1 [TSVFILE2]
301 | ```
302 |
303 | Exactly one of the command options `-m`, `-x`, `-c` or `-z` (or their long forms) must be specified. Other options as indicated in the table below could also be included. With the `-m` and `-x` command options, the names of two tab-separated value files with repertoires must also be specified on the command line; with the `-c` and `-z` command options, only one such file should be specified.
304 |
305 | Short | Long | Argument | Default | Description
306 | ------|--------------------|----------|----------|-------------
307 | `-a` | `--alternative` | | | Output results in three-column format, not matrix
308 | ` ` | `--cdr3` | | | Use the `cdr3` or `cdr3_aa` column instead of `junction` or `junction_aa`
309 | `-c` | `--cluster` | | | Cluster sequences in one repertoire
310 | `-d` | `--differences` | INTEGER | 0 | Number of differences accepted
311 | ` ` | `--distance` | | | Include sequence distance in pairs file
312 | `-e` | `--ignore-empty` | | | Ignore empty sequences
313 | `-f` | `--ignore-counts` | | | Ignore duplicate count information
314 | `-g` | `--ignore-genes` | | | Ignore V and J gene information
315 | `-h` | `--help` | | | Display help text and exit
316 | `-i` | `--indels` | | | Allow insertions or deletions
317 | `-k` | `--keep-columns` | STRING | | Copy given comma-separated columns to pairs file
318 | `-l` | `--log` | FILENAME | (stderr) | Log to specified file instead of stderr
319 | `-m` | `--matrix` | | | Compute overlap matrix between two sets
320 | ` ` | `--no-matrix` | | | Do not keep or output any matrix
321 | `-n` | `--nucleotides` | | | Compare nucleotides, not amino acids
322 | `-o` | `--output` | FILENAME | (stdout) | Output results to specified file instead of stdout
323 | `-p` | `--pairs` | FILENAME | (none) | Output matching pairs to specified file
324 | `-s` | `--score` | STRING | product | Sum `product`, `ratio`, `min`, `max`, or `mean`; or compute `MH` or `Jaccard` index
325 | `-t` | `--threads` | INTEGER | 1 | Number of threads to use (1-256)
326 | `-u` | `--ignore-unknown` | | | Ignore sequences including unknown residue symbols
327 | `-v` | `--version` | | | Display version information
328 | `-x` | `--existence` | | | Check existence of sequences in repertoires
329 | `-z` | `--deduplicate` | | | Deduplicate sequences
330 |
331 |
332 | ## Example 1: Repertoire overlap
333 |
334 | In this example we will compute the overlap of two repertoire sets.
335 |
336 | Let's use two simple input files. The first is `seta.tsv`:
337 |
338 | ```tsv
339 | repertoire_id sequence_id duplicate_count v_call j_call junction junction_aa sequence rev_comp productive d_call sequence_alignment germline_alignment v_cigar d_cigar j_cigar
340 | A1 R 1 TCRBV07-06 TCRBJ02-01 tgcgcgagcagcaccagccatgaacagtatttt CASSTSHEQYF
341 | A2 S 3 TCRBV07-09 TCRBJ01-02 tgcgcgagcagcctgcgcgtgggcggctatggctataccttt CASSLRVGGYGYTF
342 | ```
343 |
344 |
345 | The second is `setb.tsv`:
346 |
347 | ```tsv
348 | repertoire_id sequence_id duplicate_count v_call j_call junction junction_aa sequence rev_comp productive d_call sequence_alignment germline_alignment v_cigar d_cigar j_cigar
349 | B1 T 5 TCRBV07-09 TCRBJ01-02 tgcgcgagcagcctgcgcgtgggcggctatggctataccttt CASSLRVGGYGYTF
350 | B1 U 10 TCRBV07-09 TCRBJ01-02 tgcgcgagcagcctgcgcgtgggcggctttggctataccttt CASSLRVGGFGYTF
351 | B2 V 7 TCRBV07-06 TCRBJ02-01 tgcgcgagcagcaccagccatcagcagtatttt CASSTSHQQYF
352 | ```
353 |
354 | We run the following command:
355 |
356 | `compairr -m seta.tsv setb.tsv -d 1 -o output.tsv -p pairs.tsv`
357 |
358 | Here is the output to the console:
359 |
360 | ```
361 | CompAIRR 1.7.0 - Comparison of Adaptive Immune Receptor Repertoires
362 | https://github.com/uio-bmi/compairr
363 |
364 | Start time: Thu Mar 03 12:29:32 CET 2022
365 | Command (m/c/x): Overlap (-m)
366 | Repertoire set 1: seta.tsv
367 | Repertoire set 2: setb.tsv
368 | Nucleotides (n): No
369 | Differences (d): 1
370 | Indels (i): No
371 | Ignore counts (f): No
372 | Ignore genes (g): No
373 | Ign. unknown (u): No
374 | Threads (t): 1
375 | Output file (o): output.tsv
376 | Output format (a): Matrix
377 | Score (s): Sum of products of counts
378 | Pairs file (p): pairs.tsv
379 | Log file (l): (stderr)
380 |
381 | Immune receptor repertoire set 1
382 |
383 | Reading sequences: 100% (0s)
384 | Repertoires: 2
385 | Sequences: 2
386 | Residues: 25
387 | Shortest: 11
388 | Longest: 14
389 | Average length: 12.5
390 | Total dupl. count: 4
391 | Indexing: 100% (0s)
392 |
393 | Repertoires in set:
394 | # Sequences Count Repertoire ID
395 | 1 1 1 A1
396 | 2 1 3 A2
397 |
398 | Immune receptor repertoire set 2
399 |
400 | Reading sequences: 100% (0s)
401 | Repertoires: 2
402 | Sequences: 3
403 | Residues: 39
404 | Shortest: 11
405 | Longest: 14
406 | Average length: 13.0
407 | Total dupl. count: 22
408 | Indexing: 100% (0s)
409 |
410 | Repertoires in set:
411 | # Sequences Count Repertoire ID
412 | 1 2 15 B1
413 | 2 1 7 B2
414 |
415 | Unique V genes: 2
416 | Unique J genes: 2
417 | Computing hashes: 100% (0s)
418 | Computing hashes: 100% (0s)
419 | Hashing sequences: 100% (0s)
420 | Analysing: 100% (0s)
421 | Writing results: 100% (0s)
422 |
423 | End time: Thu Mar 03 12:29:32 CET 2022
424 | ```
425 |
426 | Repertoires will be sorted alphabetically by ID. The program gives some
427 | statistics on the input files after reading them.
428 |
429 | Here is the result in the `output.tsv` file:
430 |
431 | ```tsv
432 | # B1 B2
433 | A1 0 7
434 | A2 45 0
435 | ```
436 |
437 | And here is the result in the `pairs.tsv` file:
438 |
439 | ```tsv
440 | #repertoire_id_1 sequence_id_1 duplicate_count_1 v_call_1 j_call_1 junction_aa_1 repertoire_id_2 sequence_id_2 duplicate_count_2 v_call_2 j_call_2 junction_aa_2
441 | A1 R 1 TCRBV07-06 TCRBJ02-01 CASSTSHEQYF B2 V 7 TCRBV07-06 TCRBJ02-01 CASSTSHQQYF
442 | A2 S 3 TCRBV07-09 TCRBJ01-02 CASSLRVGGYGYTF B1 T 5 TCRBV07-09 TCRBJ01-02 CASSLRVGGYGYTF
443 | A2 S 3 TCRBV07-09 TCRBJ01-02 CASSLRVGGYGYTF B1 U 10 TCRBV07-09 TCRBJ01-02 CASSLRVGGFGYTF
444 | ```
445 |
446 | Here, sequence R in repertoire A1 is similar to sequence V in
447 | repertoire B2. The only difference is the E and Q in the 8th
448 | position. The gene allele names are also the same. They have duplicate
449 | counts of 1 and 7, respectively. The product is 7. That value is found
450 | in the third column on the second line in the main output file.
451 |
452 | Sequence S in repertoire A2 with duplicate count 3 is similar to both
453 | sequence T and U in repertoire B1, with duplicate counts of 5 and 10,
454 | respectively. Sequence T in B1 is identical, while sequence U in B1
455 | has an F instead of a Y in the 10th position. The result is 3 * (5 +
456 | 10) = 3 * 15 = 45. That value is found in the second column on the
457 | third line of the main output file.
458 |
459 | Since there are no sequences from repertoire A1 similar to B1 or from
460 | A2 similar to B2, the other values in the matrix are zero.
461 |
462 | This small dataset is included in the test folder and the tool can
463 | automatically be tested by running `make test`.
464 |
465 |
466 | ## Example 2: Sequence existence
467 |
468 | In this example we will use the `-x` or `--existence` option to find
469 | out in which repertoires a set of sequences are present.
470 |
471 | The file `setc.tsv` contains the sequences that we will analyse:
472 |
473 | ```tsv
474 | repertoire_id sequence_id duplicate_count v_call j_call junction junction_aa sequence rev_comp productive d_call sequence_alignment germline_alignment v_cigar d_cigar j_cigar
475 | C X 1 TCRBV07-09 TCRBJ01-02 tgcgcgagcagcctgcgcgtgggcggctttggctataccttt CASSLRVGGFGYTF
476 | C Y 1 TCRBV07-06 TCRBJ02-01 tgcgcgagcagcaccagccatcagcagtatttt CASSTSHQQYF
477 | ```
478 |
479 | The file above is included in the folder `test` in the distribution.
480 |
481 | We will compare it to repertoire sets in the file `setb.tsv` described
482 | earlier.
483 |
484 | We run the following command:
485 |
486 | `compairr -x setc.tsv setb.tsv -d 1 -f -o output.tsv -p pairs.tsv`
487 |
488 | Here is the output to the console:
489 |
490 | ```
491 | CompAIRR 1.7.0 - Comparison of Adaptive Immune Receptor Repertoires
492 | https://github.com/uio-bmi/compairr
493 |
494 | Start time: Thu Mar 03 12:31:16 CET 2022
495 | Command (m/c/x): Existence (-x)
496 | Repertoire: setc.tsv
497 | Repertoire set: setb.tsv
498 | Nucleotides (n): No
499 | Differences (d): 1
500 | Indels (i): No
501 | Ignore counts (f): Yes
502 | Ignore genes (g): No
503 | Ign. unknown (u): No
504 | Threads (t): 1
505 | Output file (o): output.tsv
506 | Output format (a): Matrix
507 | Score (s): Sum of products of counts
508 | Pairs file (p): pairs.tsv
509 | Log file (l): (stderr)
510 |
511 | Immune receptor repertoire set 1
512 |
513 | Reading sequences: 100% (0s)
514 | Repertoires: 1
515 | Sequences: 2
516 | Residues: 25
517 | Shortest: 11
518 | Longest: 14
519 | Average length: 12.5
520 | Total dupl. count: 2
521 | Indexing: 100% (0s)
522 |
523 | Repertoires in set:
524 | # Sequences Count Repertoire ID
525 | 1 2 2 C
526 |
527 | Immune receptor repertoire set 2
528 |
529 | Reading sequences: 100% (0s)
530 | Repertoires: 2
531 | Sequences: 3
532 | Residues: 39
533 | Shortest: 11
534 | Longest: 14
535 | Average length: 13.0
536 | Total dupl. count: 22
537 | Indexing: 100% (0s)
538 |
539 | Repertoires in set:
540 | # Sequences Count Repertoire ID
541 | 1 2 15 B1
542 | 2 1 7 B2
543 |
544 | Unique V genes: 2
545 | Unique J genes: 2
546 | Computing hashes: 100% (0s)
547 | Computing hashes: 100% (0s)
548 | Hashing sequences: 100% (0s)
549 | Analysing: 100% (0s)
550 | Writing results: 100% (0s)
551 |
552 | End time: Thu Mar 03 12:31:16 CET 2022
553 | ```
554 |
555 | Here is the result in the `output.tsv` file:
556 |
557 | ```tsv
558 | # B1 B2
559 | X 2 0
560 | Y 0 1
561 | ```
562 |
563 | Please note that the `-f` option was used to ignore the duplicate
564 | counts.
565 |
566 | And here is the result in the `pairs.tsv` file:
567 |
568 | ```tsv
569 | #repertoire_id_1 sequence_id_1 duplicate_count_1 v_call_1 j_call_1 junction_aa_1 repertoire_id_2 sequence_id_2 duplicate_count_2 v_call_2 j_call_2 junction_aa_2
570 | C X 1 TCRBV07-09 TCRBJ01-02 CASSLRVGGFGYTF B1 U 10 TCRBV07-09 TCRBJ01-02 CASSLRVGGFGYTF
571 | C X 1 TCRBV07-09 TCRBJ01-02 CASSLRVGGFGYTF B1 T 5 TCRBV07-09 TCRBJ01-02 CASSLRVGGYGYTF
572 | C Y 1 TCRBV07-06 TCRBJ02-01 CASSTSHQQYF B2 V 7 TCRBV07-06 TCRBJ02-01 CASSTSHQQYF
573 | ```
574 |
575 | The results indicate that sequence X was found (twice) in repertoire
576 | B1 (matching sequences T and U) and that sequence Y was found in
577 | repertoire B2 (matching sequence V).
578 |
579 |
580 | ## Example 3: Clustering sequences
581 |
582 | This time we will cluster the nucleotide sequences in the file
583 | `setb.tsv` using the `-c` or `--cluster` option.
584 |
585 | The command line to run is:
586 |
587 | `compairr -c setb.tsv -d 1 -n -o output.tsv`
588 |
589 | The output during the clustering is as follows:
590 |
591 | ```
592 | CompAIRR 1.7.0 - Comparison of Adaptive Immune Receptor Repertoires
593 | https://github.com/uio-bmi/compairr
594 |
595 | Start time: Thu Mar 03 12:33:05 CET 2022
596 | Command (m/c/x): Cluster (-c)
597 | Repertoire: setb.tsv
598 | Nucleotides (n): Yes
599 | Differences (d): 1
600 | Indels (i): No
601 | Ignore counts (f): No
602 | Ignore genes (g): No
603 | Ign. unknown (u): No
604 | Threads (t): 1
605 | Output file (o): output.tsv
606 | Log file (l): (stderr)
607 |
608 | Immune receptor repertoire clustering
609 |
610 | Reading sequences: 100% (0s)
611 | Repertoires: 2
612 | Sequences: 3
613 | Residues: 117
614 | Shortest: 33
615 | Longest: 42
616 | Average length: 39.0
617 | Total dupl. count: 22
618 | Indexing: 100% (0s)
619 |
620 | Unique V genes: 2
621 | Unique J genes: 2
622 |
623 | Computing hashes: 100% (0s)
624 | Hashing sequences: 100% (0s)
625 | Building network: 100% (0s)
626 | Clustering: 100% (0s)
627 | Sorting clusters: 100% (0s)
628 | Writing clusters: 100% (0s)
629 |
630 | Clusters: 2
631 | End time: Thu Mar 03 12:33:05 CET 2022
632 | ```
633 |
634 | The result in the file `output.tsv` looks like this:
635 |
636 | ```tsv
637 | #cluster_no cluster_size repertoire_id sequence_id duplicate_count v_call j_call junction
638 | 1 2 B1 T 5 TCRBV07-09 TCRBJ01-02 tgcgcgagcagcctgcgcgtgggcggctatggctataccttt
639 | 1 2 B1 U 10 TCRBV07-09 TCRBJ01-02 tgcgcgagcagcctgcgcgtgggcggctttggctataccttt
640 | 2 1 B2 V 7 TCRBV07-06 TCRBJ02-01 tgcgcgagcagcaccagccatcagcagtatttt
641 | ```
642 |
643 | In this case, there are 2 clusters. The first contains 2 sequences (T
644 | and U from B1), while the second cluster contains 1 sequence (V from
645 | B2). The sequences are clustered across repertoires.
646 |
647 |
648 | ## Example 4: Deduplication
649 |
650 | This time we will deduplicate the amino acid sequences in the file
651 | `setb.tsv` using the `-z` or `--deduplicate` option.
652 |
653 | The command line to run is:
654 |
655 | `compairr -z setb.tsv -o output.tsv`
656 |
657 | The output will look like this:
658 |
659 | ```
660 | CompAIRR 1.8.0 - Comparison of Adaptive Immune Receptor Repertoires
661 | https://github.com/uio-bmi/compairr
662 |
663 | Start time: Thu Sep 15 17:10:51 CEST 2022
664 | Command: Deduplicate (--deduplicate)
665 | Repertoire: setb.tsv
666 | Nucleotides (n): No
667 | Differences (d): 0
668 | Indels (i): No
669 | Ignore counts (f): No
670 | Ignore genes (g): No
671 | Ign. unknown (u): No
672 | Threads (t): 1
673 | Output file (o): output.tsv
674 | Log file (l): (stderr)
675 |
676 | Reading sequences: 100% (0s)
677 | Repertoires: 2
678 | Sequences: 3
679 | Residues: 39
680 | Shortest: 11
681 | Longest: 14
682 | Average length: 13.0
683 | Total dupl. count: 22
684 | Indexing: 100% (0s)
685 | Unique V genes: 2
686 | Unique J genes: 2
687 | Computing hashes: 100% (0s)
688 | Deduplicating: 100% (0s)
689 | Duplicates merged: 0
690 | Writing output: 100% (0s)
691 |
692 | End time: Thu Sep 15 17:10:51 CEST 2022
693 | ```
694 |
695 | The result in the file `output.tsv` looks like this:
696 |
697 | ```tsv
698 | repertoire_id duplicate_count v_call j_call junction_aa
699 | B1 5 TCRBV07-09 TCRBJ01-02 CASSLRVGGYGYTF
700 | B1 10 TCRBV07-09 TCRBJ01-02 CASSLRVGGFGYTF
701 | B2 7 TCRBV07-06 TCRBJ02-01 CASSTSHQQYF
702 | ```
703 |
704 | There were no duplicates in this dataset so the output is essentially
705 | identical to the input data, but does not include all the original
706 | columns. If the two sequences in repertoire B1 had been identical, the
707 | two lines would have been merged and the new `duplicate_count` would
708 | have been 15.
709 |
710 |
711 | ## Implementation
712 |
713 | The program is written in C++. The strategy for finding similar
714 | sequences is based on a similar concept developed for the tool
715 | [Swarm](https://github.com/torognes/swarm) (Mahé et al.
716 | 2021). Basically, a 64-bit hash is computed for all sequences in the
717 | sets. All hashes for one set are stored in a Bloom filter and in a
718 | hash table. We then look for matches to sequences in the second set by
719 | looking them up in the Bloom filter and then, if there was a match, in
720 | the hash table. To find matches with 1 or 2 substitutions or indels,
721 | the hashes of all these variant sequences are generated and looked
722 | up. When d>2, a different strategy is used where all sequences are
723 | compared against each other and the number of differences is found.
724 |
725 |
726 | ## Performance
727 |
728 | As a preliminary performance test, Cohort 2 ("Keck") of [the
729 | dataset](https://s3-us-west-2.amazonaws.com/publishedproject-supplements/emerson-2017-natgen/emerson-2017-natgen.zip)
730 | by Emerson et al. (2017) was compared to itself. It contains 120 repertoires
731 | with a total of 24 205 557 extracted sequences. The test was performed
732 | with CompAIRR version 1.3.1. The timing results are shown below.
733 |
734 | Distance | Indels | Threads | Time (s) | Time (mm:ss)
735 | -------: | :----: | ------: | -------: | -----------:
736 | 0 | no | 1 | 18 | 0:18
737 | 0 | no | 4 | 12 | 0:12
738 | 1 | no | 1 | 224 | 3:44
739 | 1 | no | 4 | 72 | 1:12
740 | 1 | yes | 1 | 367 | 6:07
741 | 1 | yes | 4 | 111 | 1:51
742 | 2 | no | 4 | 3200 | 53:20
743 |
744 | When the distance is zero, almost all of the time is spent reading
745 | files.
746 |
747 | Memory usage was 2.5GB, corresponding to an average of about 100 bytes
748 | per sequence.
749 |
750 | Since this is a comparison of a repertoire set to itself, the dataset
751 | is only read once, and the memory needed is also reduced as compared
752 | to a situation where two different repertoire sets are compared.
753 |
754 | Wall time and memory usage were measured with `/usr/bin/time`. The
755 | analysis was performed on an Apple Mac Mini M1 (2020) with 16GB RAM.
756 |
757 |
758 | ## Benchmarking
759 |
760 | The AIRR overlap functionality of CompAIRR has been thoroughly
761 | benchmarked against similar tools. All data, scripts, and results are
762 | available in a separate [CompAIRR benchmarking
763 | repository](https://github.com/uio-bmi/compairr-benchmarking).
764 |
765 |
766 | ## Tips
767 |
768 | If computer memory is limited, the dataset may be split into blocks
769 | before running CompAIRR on each block separately. The results then need
770 | to be merged together again afterwards. This may be achieved with a
771 | simple script. We will consider providing such a script.
772 |
773 |
774 | ## Development team
775 |
776 | The code has been developed by Torbjørn Rognes based on code from
777 | Swarm where Frédéric Mahé and Lucas Czech made important
778 | contributions. Geir Kjetil Sandve had the idea of developing a tool
779 | for rapid repertoire set comparison. Lonneke Scheffer has tested and
780 | benchmarked the tool, and suggested new features. Milena Pavlovic and
781 | Victor Greiff have also contributed to the project.
782 |
783 |
784 | ## Support
785 |
786 | We will prioritize fixing important bugs. We will also try to answer
787 | questions, improve documentation and implement suggested enhancements
788 | as time permits. As we have no dedicated funding for this project we
789 | cannot make any guarantees on the level of support.
790 |
791 | To report a potential bug, suggest enhancements or ask questions,
792 | please use one of the following means:
793 |
794 | * [Submit an issue on GitHub](https://github.com/uio-bmi/compairr/issues) (preferred)
795 |
796 | * Send an email to [`torognes@ifi.uio.no`](mailto:torognes@ifi.uio.no)
797 |
798 | If you would like to contribute with code you are most welcome to
799 | [submit a pull request](https://github.com/uio-bmi/compairr/pulls).
800 |
801 |
802 | ## Citing CompAIRR
803 |
804 | Please cite the following if you use CompAIRR in any published work:
805 |
806 | * Rognes T, Scheffer L, Greiff V, Sandve GK (2022) **CompAIRR: ultra-fast comparison of adaptive immune receptor repertoires by exact and approximate sequence matching.** *Bioinformatics*, btac505. doi: [10.1093/bioinformatics/btac505](https://doi.org/10.1093/bioinformatics/btac505)
807 |
808 | The article is also available in preprint form:
809 |
810 | * Rognes T, Scheffer L, Greiff V, Sandve GK (2021) **CompAIRR: ultra-fast comparison of adaptive immune receptor repertoires by exact and approximate sequence matching.** *bioRxiv*, 2021.10.30.466600. doi: [10.1101/2021.10.30.466600](https://doi.org/10.1101/2021.10.30.466600)
811 |
812 |
813 | ## References
814 |
815 | * Emerson RO, DeWitt WS, Vignali M, Gravley J, Hu JK, Osborne EJ, Desmarais C, Klinger M, Carlson CS, Hansen JA, Rieder M, Robins HS (2017) **Immunosequencing identifies signatures of cytomegalovirus exposure history and HLA-mediated effects on the T cell repertoire.** *Nature Genetics*, 49 (5): 659-665. doi: [10.1038/ng.3822](https://doi.org/10.1038/ng.3822)
816 |
817 | * Mahé F, Czech L, Stamatakis A, Quince C, de Vargas C, Dunthorn M, Rognes T (2021) **Swarm v3: Towards Tera-Scale Amplicon Clustering.** *Bioinformatics*, btab493. doi: [10.1093/bioinformatics/btab493](https://doi.org/10.1093/bioinformatics/btab493)
818 |
--------------------------------------------------------------------------------
/src/Makefile:
--------------------------------------------------------------------------------
1 | # Copyright (C) 2012-2021 Torbjorn Rognes and Frederic Mahe
2 | #
3 | # This program is free software: you can redistribute it and/or modify
4 | # it under the terms of the GNU Affero General Public License as
5 | # published by the Free Software Foundation, either version 3 of the
6 | # License, or (at your option) any later version.
7 | #
8 | # This program is distributed in the hope that it will be useful,
9 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
10 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 | # GNU Affero General Public License for more details.
12 | #
13 | # You should have received a copy of the GNU Affero General Public License
14 | # along with this program. If not, see <https://www.gnu.org/licenses/>.
15 | #
16 | # Contact: Torbjorn Rognes <torognes@ifi.uio.no>,
17 | # Department of Informatics, University of Oslo,
18 | # PO Box 1080 Blindern, NO-0316 Oslo, Norway
19 |
# Makefile for CompAIRR
#
# Builds the `compairr` executable from the C++ sources in this directory.
# Optional switches, given on the make command line:
#   RELEASE=1   define NDEBUG
#   PROFILE=1   build with -pg
#   COVERAGE=1  build with coverage instrumentation and without optimisation

COMMON = -g -std=c++11
EXTRAOBJ =
LINKOPT =
LIBS = -lpthread
WARNINGS = -Wall -Wextra

# Run "make RELEASE=1" to compile for release
ifdef RELEASE
  COMMON += -DNDEBUG
endif

# Run "make PROFILE=1" to compile for profiling
ifdef PROFILE
  COMMON += -pg
endif

# Run "make COVERAGE=1" to compile for coverage tests
ifdef COVERAGE
  COMMON += -fprofile-arcs -ftest-coverage -O0
  LIBS += -lgcov
  # LINKFLAGS is assigned with "=" further down, so anything appended to it
  # here would be thrown away; the flag goes into LINKOPT instead, which is
  # part of LINKFLAGS.
  LINKOPT += --coverage
else
  COMMON += -flto -O3
endif

# Identify Machine
ifeq ($(CXX), aarch64-linux-gnu-g++)
  MACHINE = aarch64
else ifeq ($(CXX), x86_64-linux-gnu-g++)
  MACHINE = x86_64
else ifeq ($(CXX), powerpc64le-linux-gnu-g++)
  MACHINE = ppc64le
else
  MACHINE = $(shell uname -m)
endif

# Machine specific
ifeq ($(MACHINE), x86_64)
  COMMON += -march=x86-64 -mtune=generic
else ifeq ($(MACHINE), arm64)
  COMMON += -march=armv8-a+simd -mtune=generic
else ifeq ($(MACHINE), aarch64)
  COMMON += -march=armv8-a+simd -mtune=generic
else ifeq ($(MACHINE), ppc64le)
  COMMON += -mcpu=power8
endif

# OS specific
ifeq ($(CXX), x86_64-w64-mingw32-g++)
  LIBS += -lpsapi
  LINKOPT += -static
else
  WARNINGS += -pedantic
endif

LINKFLAGS = $(COMMON) $(LINKOPT)

CXXFLAGS = $(COMMON) $(WARNINGS)

PROG = compairr

OBJS = arch.o bloompat.o cluster.o compairr.o db.o dedup.o hashtable.o \
	overlap.o util.o variants.o zobrist.o

DEPS = Makefile threads.h \
	arch.h bloompat.h cluster.h compairr.h db.h dedup.h hashtable.h \
	overlap.h util.h variants.h zobrist.h

# "all" and "clean" are commands, not files
.PHONY : all clean

all : $(PROG)

compairr : $(OBJS) $(DEPS)
	$(CXX) $(LINKFLAGS) -o $@ $(OBJS) $(LIBS)

clean :
	rm -f compairr *.o *~ gmon.out *.gcno *.gcda *.gcov

# Pattern rule: each object depends on its own source file and on every
# header (and this Makefile), so editing a header triggers a rebuild.
%.o : %.cc $(DEPS)
	$(CXX) $(CXXFLAGS) -c -o $@ $<
100 |
--------------------------------------------------------------------------------
/src/arch.cc:
--------------------------------------------------------------------------------
1 | /*
2 | Copyright (C) 2012-2021 Torbjorn Rognes and Frederic Mahe
3 |
4 | This program is free software: you can redistribute it and/or modify
5 | it under the terms of the GNU Affero General Public License as
6 | published by the Free Software Foundation, either version 3 of the
7 | License, or (at your option) any later version.
8 |
9 | This program is distributed in the hope that it will be useful,
10 | but WITHOUT ANY WARRANTY; without even the implied warranty of
11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 | GNU Affero General Public License for more details.
13 |
14 | You should have received a copy of the GNU Affero General Public License
15 | along with this program. If not, see <https://www.gnu.org/licenses/>.
16 |
17 | Contact: Torbjorn Rognes <torognes@ifi.uio.no>,
18 | Department of Informatics, University of Oslo,
19 | PO Box 1080 Blindern, NO-0316 Oslo, Norway
20 | */
21 |
22 | #include "compairr.h"
23 |
/*
  Return the peak memory use of the current process, in bytes.

  Windows: the PeakWorkingSetSize reported by GetProcessMemoryInfo.
  Other systems: ru_maxrss from getrusage(RUSAGE_SELF); that field is
  in bytes on macOS and in kilobytes on Linux.
*/
uint64_t arch_get_memused()
{
#ifdef _WIN32

  PROCESS_MEMORY_COUNTERS pmc;
  GetProcessMemoryInfo(GetCurrentProcess(),
                       &pmc,
                       sizeof(PROCESS_MEMORY_COUNTERS));
  return pmc.PeakWorkingSetSize;

#else

  struct rusage r_usage;
  getrusage(RUSAGE_SELF, & r_usage);

# ifdef __APPLE__
  /* Mac: ru_maxrss gives the size in bytes */
  return static_cast<uint64_t>(r_usage.ru_maxrss);
# else
  /* Linux: ru_maxrss gives the size in kilobytes.
     Widen first so the scaling cannot overflow a 32-bit long. */
  return static_cast<uint64_t>(r_usage.ru_maxrss) * 1024;
# endif

#endif
}
49 |
50 | uint64_t arch_get_memtotal()
51 | {
52 | #ifdef _WIN32
53 |
54 | MEMORYSTATUSEX ms;
55 | ms.dwLength = sizeof(MEMORYSTATUSEX);
56 | GlobalMemoryStatusEx(&ms);
57 | return ms.ullTotalPhys;
58 |
59 | #elif defined(__APPLE__)
60 |
61 | int mib [] = { CTL_HW, HW_MEMSIZE };
62 | int64_t ram = 0;
63 | size_t length = sizeof(ram);
64 | if(sysctl(mib, 2, &ram, &length, nullptr, 0) == -1)
65 | fatal("Cannot determine amount of RAM");
66 | return static_cast(ram);
67 |
68 | #elif defined(_SC_PHYS_PAGES) && defined(_SC_PAGESIZE)
69 |
70 | int64_t phys_pages = sysconf(_SC_PHYS_PAGES);
71 | int64_t pagesize = sysconf(_SC_PAGESIZE);
72 | if ((phys_pages == -1) || (pagesize == -1))
73 | fatal("Cannot determine amount of RAM");
74 | return static_cast(pagesize * phys_pages);
75 |
76 | #else
77 |
78 | struct sysinfo si;
79 | if (sysinfo(&si))
80 | fatal("Cannot determine amount of RAM");
81 | return si.totalram * si.mem_unit;
82 |
83 | #endif
84 | }
85 |
/*
  Seed the pseudo-random number generator used by arch_random():
  srand() on Windows, srandom() everywhere else.
*/
void arch_srandom(unsigned int seed)
{
#ifndef _WIN32
  srandom(seed);
#else
  srand(seed);
#endif
}
96 |
/*
  Return the next pseudo-random number as an unsigned 64-bit value.
  The value comes from rand() on Windows and random() elsewhere, so its
  range is that of the underlying generator, not the full 64 bits.
*/
uint64_t arch_random()
{
#ifdef _WIN32
  return static_cast<uint64_t>(rand());
#else
  return static_cast<uint64_t>(random());
#endif
}
105 |
--------------------------------------------------------------------------------
/src/arch.h:
--------------------------------------------------------------------------------
1 | /*
2 | Copyright (C) 2012-2021 Torbjorn Rognes and Frederic Mahe
3 |
4 | This program is free software: you can redistribute it and/or modify
5 | it under the terms of the GNU Affero General Public License as
6 | published by the Free Software Foundation, either version 3 of the
7 | License, or (at your option) any later version.
8 |
9 | This program is distributed in the hope that it will be useful,
10 | but WITHOUT ANY WARRANTY; without even the implied warranty of
11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 | GNU Affero General Public License for more details.
13 |
14 | You should have received a copy of the GNU Affero General Public License
15 | along with this program. If not, see <https://www.gnu.org/licenses/>.
16 |
17 | Contact: Torbjorn Rognes <torognes@ifi.uio.no>,
18 | Department of Informatics, University of Oslo,
19 | PO Box 1080 Blindern, NO-0316 Oslo, Norway
20 | */
21 |
/* Platform-dependent helpers implemented in arch.cc */

/* Peak memory use of this process, in bytes */
uint64_t arch_get_memused();

/* Total physical RAM of the machine, in bytes */
uint64_t arch_get_memtotal();

/* Seed the pseudo-random number generator */
void arch_srandom(unsigned int seed);

/* Next pseudo-random number from the generator seeded by arch_srandom */
uint64_t arch_random();
28 |
--------------------------------------------------------------------------------
/src/bloompat.cc:
--------------------------------------------------------------------------------
1 | /*
2 | Copyright (C) 2012-2021 Torbjorn Rognes and Frederic Mahe
3 |
4 | This program is free software: you can redistribute it and/or modify
5 | it under the terms of the GNU Affero General Public License as
6 | published by the Free Software Foundation, either version 3 of the
7 | License, or (at your option) any later version.
8 |
9 | This program is distributed in the hope that it will be useful,
10 | but WITHOUT ANY WARRANTY; without even the implied warranty of
11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 | GNU Affero General Public License for more details.
13 |
14 | You should have received a copy of the GNU Affero General Public License
15 | along with this program. If not, see <https://www.gnu.org/licenses/>.
16 |
17 | Contact: Torbjorn Rognes <torognes@ifi.uio.no>,
18 | Department of Informatics, University of Oslo,
19 | PO Box 1080 Blindern, NO-0316 Oslo, Norway
20 | */
21 |
22 | /*
23 | Blocked bloom filter with precomputed bit patterns
24 | as described in
25 |
26 | Putze F, Sanders P, Singler J (2009)
27 | Cache-, Hash- and Space-Efficient Bloom Filters
28 | Journal of Experimental Algorithmics, 14, 4
29 | https://doi.org/10.1145/1498698.1594230
30 | */
31 |
32 | #include "compairr.h"
33 |
34 | void bloom_patterns_generate(struct bloom_s * b);
35 |
36 | void bloom_patterns_generate(struct bloom_s * b)
37 | {
38 | const unsigned int k = 8;
39 | for (unsigned int i = 0; i < BLOOM_PATTERN_COUNT; i++)
40 | {
41 | uint64_t pattern = 0;
42 | for (unsigned int j = 0; j < k; j++)
43 | {
44 | uint64_t onebit;
45 | onebit = 1ULL << (arch_random() & 63);
46 | while (pattern & onebit)
47 | onebit = 1ULL << (arch_random() & 63);
48 | pattern |= onebit;
49 | }
50 | b->patterns[i] = pattern;
51 | }
52 | }
53 |
54 | void bloom_zap(struct bloom_s * b)
55 | {
56 | memset(b->bitmap, 0xff, b->size);
57 | }
58 |
59 | struct bloom_s * bloom_init(uint64_t size)
60 | {
61 | // Size is in bytes for full bitmap, must be power of 2
62 | // at least 8
63 | size = MAX(size, 8);
64 |
65 | struct bloom_s * b = static_cast(xmalloc(sizeof(struct bloom_s)));
66 |
67 | b->size = size;
68 |
69 | b->mask = (size >> 3) - 1;
70 |
71 | b->bitmap = static_cast(xmalloc(size));
72 |
73 | bloom_zap(b);
74 |
75 | bloom_patterns_generate(b);
76 |
77 | return b;
78 | }
79 |
80 | void bloom_exit(struct bloom_s * b)
81 | {
82 | xfree(b->bitmap);
83 | xfree(b);
84 | }
85 |
--------------------------------------------------------------------------------
/src/bloompat.h:
--------------------------------------------------------------------------------
1 | /*
2 | Copyright (C) 2012-2021 Torbjorn Rognes and Frederic Mahe
3 |
4 | This program is free software: you can redistribute it and/or modify
5 | it under the terms of the GNU Affero General Public License as
6 | published by the Free Software Foundation, either version 3 of the
7 | License, or (at your option) any later version.
8 |
9 | This program is distributed in the hope that it will be useful,
10 | but WITHOUT ANY WARRANTY; without even the implied warranty of
11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 | GNU Affero General Public License for more details.
13 |
14 | You should have received a copy of the GNU Affero General Public License
15 | along with this program. If not, see <https://www.gnu.org/licenses/>.
16 |
17 | Contact: Torbjorn Rognes <torognes@ifi.uio.no>,
18 | Department of Informatics, University of Oslo,
19 | PO Box 1080 Blindern, NO-0316 Oslo, Norway
20 | */
21 |
/* The lowest BLOOM_PATTERN_SHIFT bits of a hash select one of the
   BLOOM_PATTERN_COUNT precomputed bit patterns */
#define BLOOM_PATTERN_SHIFT 10
#define BLOOM_PATTERN_COUNT (1 << BLOOM_PATTERN_SHIFT)
#define BLOOM_PATTERN_MASK (BLOOM_PATTERN_COUNT - 1)

/* Bloom filter state */
struct bloom_s
{
  uint64_t size;     /* bitmap size in bytes */
  uint64_t mask;     /* index mask over the 64-bit words of the bitmap */
  uint64_t * bitmap; /* the bitmap itself */
  uint64_t patterns[BLOOM_PATTERN_COUNT]; /* precomputed bit patterns */
};

/* Set all bits of the bitmap to one */
void bloom_zap(struct bloom_s * b);

/* Create a filter with a bitmap of the given size in bytes */
struct bloom_s * bloom_init(uint64_t size);

/* Free a filter created by bloom_init */
void bloom_exit(struct bloom_s * b);
39 |
40 | inline uint64_t * bloom_adr(struct bloom_s * b, uint64_t h)
41 | {
42 | return b->bitmap + ((h >> BLOOM_PATTERN_SHIFT) & b->mask);
43 | }
44 |
45 | inline uint64_t bloom_pat(struct bloom_s * b, uint64_t h)
46 | {
47 | return b->patterns[h & BLOOM_PATTERN_MASK];
48 | }
49 |
50 | inline void bloom_set(struct bloom_s * b, uint64_t h)
51 | {
52 | * bloom_adr(b, h) &= ~ bloom_pat(b, h);
53 | }
54 |
55 | inline bool bloom_get(struct bloom_s * b, uint64_t h)
56 | {
57 | return ! (* bloom_adr(b, h) & bloom_pat(b, h));
58 | }
59 |
--------------------------------------------------------------------------------
/src/cluster.cc:
--------------------------------------------------------------------------------
1 | /*
2 | Copyright (C) 2012-2021 Torbjorn Rognes and Frederic Mahe
3 |
4 | This program is free software: you can redistribute it and/or modify
5 | it under the terms of the GNU Affero General Public License as
6 | published by the Free Software Foundation, either version 3 of the
7 | License, or (at your option) any later version.
8 |
9 | This program is distributed in the hope that it will be useful,
10 | but WITHOUT ANY WARRANTY; without even the implied warranty of
11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 | GNU Affero General Public License for more details.
13 |
14 | You should have received a copy of the GNU Affero General Public License
15 | along with this program. If not, see <https://www.gnu.org/licenses/>.
16 |
17 | Contact: Torbjorn Rognes <torognes@ifi.uio.no>,
18 | Department of Informatics, University of Oslo,
19 | PO Box 1080 Blindern, NO-0316 Oslo, Norway
20 | */
21 |
22 | #include "compairr.h"
23 |
24 | const unsigned int no_cluster = UINT_MAX;
25 |
26 | static struct iteminfo_s
27 | {
28 | unsigned int clusterid;
29 | unsigned int next;
30 | unsigned int network_start;
31 | unsigned int network_count;
32 | } * iteminfo = 0;
33 |
34 | static struct clusterinfo_s
35 | {
36 | unsigned int seed;
37 | unsigned int size;
38 | } * clusterinfo = 0;
39 |
40 | static uint64_t clusterinfo_alloc = 0;
41 |
42 | static pthread_mutex_t network_mutex;
43 | static unsigned int * network = 0;
44 | static unsigned int network_count = 0;
45 | static unsigned int network_seq = 0;
46 | static uint64_t network_alloc = 0;
47 | static uint64_t seqcount = 0;
48 |
49 | static struct db * d;
50 | static struct bloom_s * bloom = 0;
51 | static hashtable_s * hashtable = 0;
52 |
53 | static int compare_cluster(const void * a, const void * b)
54 | {
55 | clusterinfo_s * x = (clusterinfo_s *) a;
56 | clusterinfo_s * y = (clusterinfo_s *) b;
57 | if (x->size > y->size)
58 | return -1;
59 | else if (x->size < y->size)
60 | return +1;
61 | else
62 | return 0;
63 | }
64 |
65 | static inline void hash_insert_cluster(uint64_t seq)
66 | {
67 | /* find the first empty bucket */
68 | uint64_t hash = db_gethash(d, seq);
69 | uint64_t j = hash_getindex(hashtable, hash);
70 | while (hash_is_occupied(hashtable, j))
71 | j = hash_getnextindex(hashtable, j);
72 |
73 | hash_set_occupied(hashtable, j);
74 | hash_set_value(hashtable, j, hash);
75 | hash_set_data(hashtable, j, seq);
76 | bloom_set(bloom, hash);
77 | }
78 |
79 | static void find_variant_matches(uint64_t seed,
80 | var_s * var,
81 | unsigned int * * hits_data,
82 | unsigned int * hits_count,
83 | uint64_t * hits_alloc)
84 | {
85 | /* compute hash table index */
86 |
87 | uint64_t j = hash_getindex(hashtable, var->hash);
88 |
89 | /* find matching buckets */
90 |
91 | while (hash_is_occupied(hashtable, j))
92 | {
93 | if (hash_compare_value(hashtable, j, var->hash))
94 | {
95 | uint64_t hit = hash_get_data(hashtable, j);
96 |
97 | /* double check that everything matches */
98 |
99 | unsigned int seed_v_gene = db_get_v_gene(d, seed);
100 | unsigned int seed_j_gene = db_get_j_gene(d, seed);
101 |
102 | unsigned int hit_v_gene = db_get_v_gene(d, hit);
103 | unsigned int hit_j_gene = db_get_j_gene(d, hit);
104 |
105 | if ((seed != hit) &&
106 | (opt_ignore_genes ||
107 | ((seed_v_gene == hit_v_gene) &&
108 | (seed_j_gene == hit_j_gene))))
109 | {
110 | unsigned char * seed_sequence
111 | = (unsigned char *) db_getsequence(d, seed);
112 | unsigned int seed_seqlen
113 | = db_getsequencelen(d, seed);
114 | unsigned char * hit_sequence
115 | = (unsigned char *) db_getsequence(d, hit);
116 | unsigned int hit_seqlen
117 | = db_getsequencelen(d, hit);
118 |
119 | if (check_variant(seed_sequence, seed_seqlen,
120 | var,
121 | hit_sequence, hit_seqlen))
122 | {
123 | if (*hits_alloc <= *hits_count)
124 | {
125 | *hits_alloc += 1024;
126 | *hits_data = static_cast
127 | (xrealloc((*hits_data),
128 | (*hits_alloc) * sizeof(unsigned int)));
129 | }
130 | (*hits_data)[(*hits_count)++] = hit;
131 | }
132 | }
133 | }
134 | j = hash_getnextindex(hashtable, j);
135 | }
136 | }
137 |
138 | static void process_variants(uint64_t seed,
139 | var_s * variant_list,
140 | unsigned int * * hits_data,
141 | unsigned int * hits_count,
142 | uint64_t * hits_alloc)
143 | {
144 | unsigned int variant_count = 0;
145 | * hits_count = 0;
146 |
147 | unsigned char * sequence = (unsigned char *) db_getsequence(d, seed);
148 | unsigned int seqlen = db_getsequencelen(d, seed);
149 | uint64_t hash = db_gethash(d, seed);
150 | uint64_t v_gene = db_get_v_gene(d, seed);
151 | uint64_t j_gene = db_get_j_gene(d, seed);
152 |
153 | generate_variants(hash,
154 | sequence, seqlen, v_gene, j_gene,
155 | variant_list, & variant_count);
156 |
157 | for(unsigned int i = 0; i < variant_count; i++)
158 | {
159 | var_s * var = variant_list + i;
160 | if (bloom_get(bloom, var->hash))
161 | find_variant_matches(seed, var, hits_data, hits_count, hits_alloc);
162 | }
163 | }
164 |
165 | static void process_trad(uint64_t seed,
166 | unsigned int * * hits_data,
167 | unsigned int * hits_count,
168 | uint64_t * hits_alloc)
169 | {
170 | /* Only to be used with no indels (and d >= 3) */
171 |
172 | for (uint64_t hit = 0; hit < seqcount; hit++)
173 | if (seed != hit)
174 | {
175 | /* check if everything matches */
176 |
177 | unsigned int seed_v_gene = db_get_v_gene(d, seed);
178 | unsigned int seed_j_gene = db_get_j_gene(d, seed);
179 |
180 | unsigned int hit_v_gene = db_get_v_gene(d, hit);
181 | unsigned int hit_j_gene = db_get_j_gene(d, hit);
182 |
183 | if (opt_ignore_genes ||
184 | ((seed_v_gene == hit_v_gene) && (seed_j_gene == hit_j_gene)))
185 | {
186 | unsigned int seed_seqlen = db_getsequencelen(d, seed);
187 | unsigned int hit_seqlen = db_getsequencelen(d, hit);
188 |
189 | if (seed_seqlen == hit_seqlen)
190 | {
191 | unsigned char * seed_sequence
192 | = (unsigned char *) db_getsequence(d, seed);
193 | unsigned char * hit_sequence
194 | = (unsigned char *) db_getsequence(d, hit);
195 |
196 | if (seq_diff(seed_sequence, hit_sequence, seed_seqlen)
197 | <= opt_differences)
198 | {
199 | if (*hits_alloc <= *hits_count)
200 | {
201 | *hits_alloc += 1024;
202 | *hits_data = static_cast
203 | (xrealloc((*hits_data),
204 | (*hits_alloc) * sizeof(unsigned int)));
205 | }
206 | (*hits_data)[(*hits_count)++] = hit;
207 | }
208 | }
209 | }
210 | }
211 | }
212 |
213 | static void process_seq(uint64_t seed,
214 | var_s * variant_list,
215 | unsigned int * * hits_data,
216 | unsigned int * hits_count,
217 | uint64_t * hits_alloc)
218 | {
219 | if (opt_differences <= MAXDIFF_HASH)
220 | process_variants(seed, variant_list, hits_data, hits_count, hits_alloc);
221 | else
222 | process_trad(seed, hits_data, hits_count, hits_alloc);
223 | }
224 |
225 | static void network_thread(int64_t t)
226 | {
227 | (void) t;
228 |
229 | unsigned int longest = db_getlongestsequence(d);
230 | uint64_t maxvar = max_variants(longest);
231 |
232 | uint64_t hits_alloc = 1024;
233 | auto * hits_data = static_cast
234 | (xmalloc(hits_alloc * sizeof(unsigned int)));
235 |
236 | auto * variant_list = static_cast
237 | (xmalloc(maxvar * sizeof(struct var_s)));
238 |
239 | pthread_mutex_lock(&network_mutex);
240 |
241 | while (network_seq < seqcount)
242 | {
243 | unsigned int seed = network_seq++;
244 | progress_update(seed);
245 |
246 | pthread_mutex_unlock(&network_mutex);
247 |
248 | unsigned int hits_count = 0;
249 | process_seq(seed, variant_list,
250 | & hits_data, & hits_count, & hits_alloc);
251 |
252 | pthread_mutex_lock(&network_mutex);
253 |
254 | iteminfo[seed].network_start = network_count;
255 | iteminfo[seed].network_count = hits_count;
256 |
257 | if (network_count + hits_count > network_alloc)
258 | {
259 | while (network_count + hits_count > network_alloc)
260 | network_alloc += 1024 * 1024;
261 |
262 | network = static_cast
263 | (xrealloc(network, network_alloc * sizeof(unsigned int)));
264 | }
265 |
266 | for(unsigned int k = 0; k < hits_count; k++)
267 | network[network_count++] = hits_data[k];
268 | }
269 |
270 | pthread_mutex_unlock(&network_mutex);
271 |
272 | xfree(variant_list);
273 | xfree(hits_data);
274 | }
275 |
276 | static unsigned int clustersize = 0;
277 | static unsigned int current_cluster_tail = 0;
278 |
279 | static void process_seed(unsigned int seed)
280 | {
281 | clustersize++;
282 |
283 | unsigned int s = iteminfo[seed].network_start;
284 | unsigned int c = iteminfo[seed].network_count;
285 |
286 | unsigned int clusterid = iteminfo[seed].clusterid;
287 |
288 | for(unsigned int i = 0; i < c; i++)
289 | {
290 | unsigned int hit = network[s + i];
291 | if (iteminfo[hit].clusterid == no_cluster)
292 | {
293 | /* add hit to cluster, update linked chain */
294 | iteminfo[hit].clusterid = clusterid;
295 | iteminfo[current_cluster_tail].next = hit;
296 | current_cluster_tail = hit;
297 | }
298 | }
299 | }
300 |
301 | void cluster(char * filename)
302 | {
303 | fprintf(logfile, "Immune receptor repertoire clustering\n\n");
304 |
305 | db_init();
306 |
307 | d = db_create();
308 | db_read(d, filename, false, "1");
309 |
310 | unsigned int longest = db_getlongestsequence(d);
311 | seqcount = db_getsequencecount(d);
312 |
313 | fprintf(logfile, "\n");
314 | fprintf(logfile, "Unique V genes: %" PRIu64 "\n",
315 | db_get_v_gene_count());
316 | fprintf(logfile, "Unique J genes: %" PRIu64 "\n",
317 | db_get_j_gene_count());
318 | fprintf(logfile, "\n");
319 |
320 | if (opt_differences <= MAXDIFF_HASH)
321 | {
322 | zobrist_init(longest + MAX_INSERTS,
323 | db_get_v_gene_count(),
324 | db_get_j_gene_count());
325 |
326 | db_hash(d);
327 |
328 | hashtable = hash_init(seqcount);
329 | bloom = bloom_init(hash_get_tablesize(hashtable) * 2);
330 | }
331 |
332 | iteminfo = static_cast
333 | (xmalloc(seqcount * sizeof(struct iteminfo_s)));
334 |
335 | progress_init("Hashing sequences:", seqcount);
336 | for(uint64_t i=0; i < seqcount; i++)
337 | {
338 | iteminfo[i].clusterid = no_cluster;
339 | iteminfo[i].next = no_cluster;
340 | if (opt_differences <= MAXDIFF_HASH)
341 | hash_insert_cluster(i);
342 | progress_update(i);
343 | }
344 | progress_done();
345 |
346 | network = static_cast
347 | (xmalloc(network_alloc * sizeof(unsigned int)));
348 | network_count = 0;
349 | network_seq = 0;
350 |
351 | pthread_mutex_init(&network_mutex, nullptr);
352 | progress_init("Building network: ", seqcount);
353 |
354 | if (opt_threads == 1)
355 | {
356 | network_thread(0);
357 | }
358 | else
359 | {
360 | ThreadRunner * sim_tr = new ThreadRunner(static_cast(opt_threads),
361 | network_thread);
362 | sim_tr->run();
363 | delete sim_tr;
364 | }
365 |
366 | progress_done();
367 | pthread_mutex_destroy(&network_mutex);
368 |
369 |
370 | unsigned int clustercount = 0;
371 |
372 | progress_init("Clustering: ", seqcount);
373 |
374 | /* for each non-clustered item, look for subseeds ... */
375 | uint64_t x = 0;
376 | for(unsigned int seed = 0; seed < seqcount; seed++)
377 | {
378 | struct iteminfo_s * ap = iteminfo + seed;
379 |
380 | if (ap->clusterid == no_cluster)
381 | {
382 | /* start a new cluster with a new initial seed */
383 |
384 | ap->clusterid = clustercount;
385 | ap->next = no_cluster;
386 | current_cluster_tail = seed;
387 | clustersize = 0;
388 |
389 | /* find initial matches */
390 | process_seed(seed);
391 | progress_update(++x);
392 |
393 | unsigned int subseed = ap->next;
394 |
395 | /* process all subseeds */
396 | while(subseed != no_cluster)
397 | {
398 | process_seed(subseed);
399 | progress_update(++x);
400 | subseed = iteminfo[subseed].next;
401 | }
402 |
403 | if (clustercount >= clusterinfo_alloc)
404 | {
405 | /* allocate memory for more clusters... */
406 | clusterinfo_alloc += 1024;
407 | clusterinfo = static_cast
408 | (xrealloc(clusterinfo,
409 | clusterinfo_alloc * sizeof(clusterinfo_s)));
410 | }
411 |
412 | struct clusterinfo_s * sp = clusterinfo + clustercount;
413 | sp->seed = seed;
414 | sp->size = clustersize;
415 | clustercount++;
416 | }
417 | }
418 |
419 | progress_done();
420 |
421 | progress_init("Sorting clusters: ", clustercount);
422 | qsort(clusterinfo, clustercount, sizeof(clusterinfo_s), compare_cluster);
423 | progress_done();
424 |
425 | /* dump clusters */
426 |
427 | uint64_t j = 0;
428 | progress_init("Writing clusters: ", seqcount);
429 | fprintf(outfile,
430 | "#cluster_no\tcluster_size\trepertoire_id\tsequence_id\t"
431 | "duplicate_count\tv_call\tj_call\t%s\n", seq_header);
432 | for(unsigned int i = 0; i < clustercount; i++)
433 | {
434 | unsigned int seed = clusterinfo[i].seed;
435 | unsigned int size = clusterinfo[i].size;
436 | for(unsigned int a = seed; a != no_cluster; a = iteminfo[a].next)
437 | {
438 | fprintf(outfile,
439 | "%u\t%u\t",
440 | i + 1,
441 | size);
442 | fprintf(outfile,
443 | "%s\t%s\t%" PRIu64 "\t%s\t%s\t",
444 | db_get_repertoire_id(d, db_get_repertoire_id_no(d, a)),
445 | db_get_sequence_id(d, a),
446 | db_get_count(d, a),
447 | db_get_v_gene_name(d, a),
448 | db_get_j_gene_name(d, a));
449 | db_fprint_sequence(outfile, d, a);
450 | fprintf(outfile, "\n");
451 | j++;
452 | }
453 | progress_update(j);
454 | }
455 | progress_done();
456 |
457 | fprintf(logfile, "\n");
458 | fprintf(logfile, "Clusters: %u\n", clustercount);
459 |
460 | xfree(network);
461 | if (clusterinfo)
462 | xfree(clusterinfo);
463 | if (iteminfo)
464 | xfree(iteminfo);
465 |
466 | if (opt_differences <= MAXDIFF_HASH)
467 | {
468 | bloom_exit(bloom);
469 | hash_exit(hashtable);
470 | zobrist_exit();
471 | }
472 |
473 | db_free(d);
474 | db_exit();
475 | }
476 |
--------------------------------------------------------------------------------
/src/cluster.h:
--------------------------------------------------------------------------------
1 | /*
2 | Copyright (C) 2012-2021 Torbjorn Rognes and Frederic Mahe
3 |
4 | This program is free software: you can redistribute it and/or modify
5 | it under the terms of the GNU Affero General Public License as
6 | published by the Free Software Foundation, either version 3 of the
7 | License, or (at your option) any later version.
8 |
9 | This program is distributed in the hope that it will be useful,
10 | but WITHOUT ANY WARRANTY; without even the implied warranty of
11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 | GNU Affero General Public License for more details.
13 |
14 | You should have received a copy of the GNU Affero General Public License
15 | along with this program. If not, see <https://www.gnu.org/licenses/>.
16 |
17 | Contact: Torbjorn Rognes ,
18 | Department of Informatics, University of Oslo,
19 | PO Box 1080 Blindern, NO-0316 Oslo, Norway
20 | */
21 |
22 | /* other */
23 |
24 | void cluster(char * filename);
25 |
--------------------------------------------------------------------------------
/src/compairr.cc:
--------------------------------------------------------------------------------
1 | /*
2 | Copyright (C) 2012-2021 Torbjorn Rognes and Frederic Mahe
3 |
4 | This program is free software: you can redistribute it and/or modify
5 | it under the terms of the GNU Affero General Public License as
6 | published by the Free Software Foundation, either version 3 of the
7 | License, or (at your option) any later version.
8 |
9 | This program is distributed in the hope that it will be useful,
10 | but WITHOUT ANY WARRANTY; without even the implied warranty of
11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 | GNU Affero General Public License for more details.
13 |
14 | You should have received a copy of the GNU Affero General Public License
15 | along with this program. If not, see <https://www.gnu.org/licenses/>.
16 |
17 | Contact: Torbjorn Rognes ,
18 | Department of Informatics, University of Oslo,
19 | PO Box 1080 Blindern, NO-0316 Oslo, Norway
20 | */
21 |
22 | /*
23 |
24 | This program uses Frederic Mahe's idea for swarm (d=1) to
25 | enumerate all variants of a sequence containing a single
26 | change (substitution, deletion or insertion) to quickly
27 | identify neighbour sequences using a hashing strategy.
28 |
29 | Please see the following publications for details:
30 |
31 | Mahe F, Rognes T, Quince C, de Vargas C, Dunthorn M (2014)
32 | Swarm: robust and fast clustering method for amplicon-based studies
33 | PeerJ 2:e593 https://doi.org/10.7717/peerj.593
34 |
35 | Mahe F, Rognes T, Quince C, de Vargas C, Dunthorn M (2015)
36 | Swarm v2: highly-scalable and high-resolution amplicon clustering
37 | PeerJ 3:e1420 https://doi.org/10.7717/peerj.1420
38 |
39 | */
40 |
41 | #include "compairr.h"
42 |
43 | /* OPTIONS */
44 |
/* Program name and positional arguments; assigned in args_init(). */
static char * progname = nullptr;
static char * input1_filename = nullptr;
static char * input2_filename = nullptr;

/* Boolean command line switches. */
bool opt_alternative = false;
bool opt_cdr3 = false;
bool opt_cluster = false;
bool opt_distance = false;
bool opt_existence = false;
bool opt_help = false;
bool opt_ignore_counts = false;
bool opt_ignore_empty = false;
bool opt_ignore_genes = false;
bool opt_ignore_unknown = false;
bool opt_indels = false;
bool opt_matrix = false;
bool opt_nucleotides = false;
bool opt_no_matrix = false;
bool opt_version = false;
bool opt_deduplicate = false;

/* String valued options; they point at command line arguments. */
char * opt_keep_columns = nullptr;
char * opt_log = nullptr;
char * opt_output = nullptr;
char * opt_pairs = nullptr;
char * opt_score_string = nullptr;

/* Numeric options. */
int64_t opt_differences = 0;
int64_t opt_score_int = 0;
int64_t opt_threads = 0;

/* Other variables */

/* Name of the sequence column in use; chosen in args_init(). */
const char * seq_header = nullptr;

/* Output streams; main() and open_files() assign them. */
FILE * outfile = nullptr;
FILE * logfile = nullptr;
FILE * pairsfile = nullptr;

/* Column list of the --keep-columns option; see parse_keep_columns(). */
int keep_columns_count = 0;
int * keep_columns_no = nullptr;
char ** keep_columns_names = nullptr;
char ** keep_columns_strings = nullptr;

/* 4 for nucleotides, 20 for amino acids; set in args_init(). */
int alphabet_size = 0;

/* Default output file name. */
static char dash[] = "-";
static char * DASH_FILENAME = dash;

/* Accepted arguments to --score, in the order of the score indices. */
static const char * score_options[] =
  {
    "Product",
    "Ratio",
    "Min",
    "Max",
    "Mean",
    "MH",
    "Jaccard"
  };

/* Descriptions of the scores, in the same order as score_options. */
static const char * score_descr[] =
  {
    "Sum of products of counts",
    "Sum of ratios of counts",
    "Sum of minimum of counts",
    "Sum of maximum of counts",
    "Sum of mean of counts",
    "Morisita-Horn index",
    "Jaccard index"
  };

/* Functions defined further down in this file. */
int64_t args_long(char * str, const char * option);
void args_show();
void args_usage();
void show_header();
void args_init(int argc, char **argv);
void open_files();
void close_files();
113 |
114 | bool parse_keep_columns()
115 | {
116 | unsigned int len = strlen(opt_keep_columns);
117 | keep_columns_count = 1;
118 | for (unsigned int i = 0; i < len; i++)
119 | if (opt_keep_columns[i] == ',')
120 | keep_columns_count++;
121 |
122 | keep_columns_no = (int *) xmalloc
123 | (keep_columns_count * sizeof(int));
124 |
125 | keep_columns_names = (char **) xmalloc
126 | (keep_columns_count * sizeof(char *));
127 |
128 | keep_columns_strings = (char **) xmalloc
129 | (keep_columns_count * sizeof(char *));
130 |
131 | for (int j = 0; j < keep_columns_count; j++)
132 | keep_columns_no[j] = 0;
133 |
134 | keep_columns_count = 0;
135 | unsigned int curlen = 0;
136 | for (unsigned int i = 0; i < len; i++)
137 | {
138 | char c = opt_keep_columns[i];
139 | if (c == ',')
140 | {
141 | if (curlen == 0)
142 | return false;
143 | else
144 | {
145 | opt_keep_columns[i] = 0;
146 | keep_columns_names[keep_columns_count] =
147 | xstrdup(opt_keep_columns + i - curlen);
148 | opt_keep_columns[i] = ',';
149 | keep_columns_count++;
150 | curlen = 0;
151 | }
152 | }
153 | else if (((c >= 'A') && (c <= 'Z')) ||
154 | ((c >= 'a') && (c <= 'z')) ||
155 | ((c >= '0') && (c <= '9')) ||
156 | (c == '_'))
157 | {
158 | curlen++;
159 | }
160 | else
161 | {
162 | return false;
163 | }
164 | }
165 |
166 | if (curlen == 0)
167 | return false;
168 |
169 | keep_columns_names[keep_columns_count] =
170 | xstrdup(opt_keep_columns + len - curlen);
171 | keep_columns_count++;
172 | return true;
173 | }
174 |
/*
  Convert the argument of a numeric command line option to an integer.

  str     the text to convert (base 10)
  option  description of the option, used in the error message

  Returns the converted value.  If str contains no digits at all (for
  instance an empty string) or is followed by trailing characters, an
  error message is printed to stderr and the program exits with
  status 1.
*/
int64_t args_long(char * str, const char * option)
{
  char * endptr = str;

  /* strtoll gives a 64-bit result also where long is only 32 bits */
  const long long value = strtoll(str, & endptr, 10);

  /* endptr == str: nothing was converted; *endptr != 0: trailing junk */
  if ((endptr == str) || (*endptr != '\0'))
    {
      fprintf(stderr, "\nInvalid numeric argument for option %s\n", option);
      exit(1);
    }

  return value;
}
186 |
187 | void show_time(const char * prompt)
188 | {
189 | const int time_string_max = 100;
190 | char time_string[time_string_max];
191 | const time_t clock = time(nullptr);
192 | const struct tm * timeptr = localtime(& clock);
193 | size_t time_string_len = strftime(time_string,
194 | time_string_max,
195 | "%a %b %d %T %Z %Y",
196 | timeptr);
197 | fprintf(logfile, "%s%s\n", prompt, time_string_len > 0 ? time_string : "?");
198 | }
199 |
200 | void args_show()
201 | {
202 | if (opt_matrix)
203 | fprintf(logfile, "Command: Overlap (-m)\n");
204 | if (opt_cluster)
205 | fprintf(logfile, "Command: Cluster (-c)\n");
206 | if (opt_existence)
207 | fprintf(logfile, "Command: Existence (-x)\n");
208 | if (opt_deduplicate)
209 | fprintf(logfile, "Command: Deduplicate (--deduplicate)\n");
210 |
211 | if (opt_matrix)
212 | fprintf(logfile, "Repertoire set 1: %s\n", input1_filename);
213 | else
214 | fprintf(logfile, "Repertoire: %s\n", input1_filename);
215 | if (opt_matrix)
216 | fprintf(logfile, "Repertoire set 2: %s\n", input2_filename ? input2_filename : "(same as set 1)");
217 | if (opt_existence)
218 | fprintf(logfile, "Repertoire set: %s\n", input2_filename);
219 |
220 | fprintf(logfile, "Nucleotides (n): %s\n", opt_nucleotides ? "Yes" : "No");
221 | fprintf(logfile, "Differences (d): %" PRId64 "\n", opt_differences);
222 | fprintf(logfile, "Indels (i): %s\n", opt_indels ? "Yes" : "No");
223 | fprintf(logfile, "Ignore counts (f): %s\n",
224 | opt_ignore_counts ? "Yes" : "No");
225 | fprintf(logfile, "Ignore genes (g): %s\n",
226 | opt_ignore_genes ? "Yes" : "No");
227 | fprintf(logfile, "Ign. unknown (u): %s\n",
228 | opt_ignore_unknown ? "Yes" : "No");
229 | fprintf(logfile, "Ignore empty (e): %s\n",
230 | opt_ignore_empty ? "Yes" : "No");
231 | fprintf(logfile, "Use cdr3 column: %s\n",
232 | opt_cdr3 ? "Yes" : "No");
233 | fprintf(logfile, "Threads (t): %" PRId64 "\n", opt_threads);
234 | if (opt_no_matrix)
235 | fprintf(logfile, "Output file (o): (none)\n");
236 | else
237 | fprintf(logfile, "Output file (o): %s\n", opt_output);
238 | if (opt_matrix || opt_existence)
239 | {
240 | fprintf(logfile, "Output format (a): %s\n", opt_alternative ? "Column" : "Matrix");
241 | fprintf(logfile, "Score (s): %s\n", score_descr[opt_score_int]);
242 | fprintf(logfile, "Pairs file (p): %s\n", opt_pairs ? opt_pairs : "(none)");
243 | fprintf(logfile, "Keep columns: %s\n", opt_keep_columns ? opt_keep_columns : "");
244 | }
245 | fprintf(logfile, "Log file (l): %s\n", opt_log ? opt_log : "(stderr)");
246 | }
247 |
248 | void args_usage()
249 | {
250 | fprintf(stderr, "Usage: %s [OPTIONS] TSVFILE1 [TSVFILE2]\n", PROG_CMD);
251 | fprintf(stderr, "\n");
252 | fprintf(stderr, "Commands:\n");
253 | fprintf(stderr, " -h, --help display this help and exit\n");
254 | fprintf(stderr, " -v, --version display version information\n");
255 | fprintf(stderr, " -m, --matrix compute overlap matrix between two sets\n");
256 | fprintf(stderr, " -x, --existence check existence of sequences in repertoires\n");
257 | fprintf(stderr, " -c, --cluster cluster sequences in one repertoire\n");
258 | fprintf(stderr, " -z, --deduplicate deduplicate sequences in repertoires\n");
259 | fprintf(stderr, "\n");
260 | fprintf(stderr, "General options:\n");
261 | fprintf(stderr, " -d, --differences INTEGER number of differences accepted (0*)\n");
262 | fprintf(stderr, " -i, --indels allow insertions or deletions when d=1\n");
263 | fprintf(stderr, " -f, --ignore-counts ignore duplicate_count information\n");
264 | fprintf(stderr, " -g, --ignore-genes ignore V and J gene information\n");
265 | fprintf(stderr, " -n, --nucleotides compare nucleotides, not amino acids\n");
266 | fprintf(stderr, " -s, --score STRING MH, Jaccard, product*, ratio, min, max, or mean\n");
267 | fprintf(stderr, " -t, --threads INTEGER number of threads to use (1*-256)\n");
268 | fprintf(stderr, " -u, --ignore-unknown ignore sequences with unknown symbols\n");
269 | fprintf(stderr, " -e, --ignore-empty ignore empty sequences\n");
270 | fprintf(stderr, "\n");
271 | fprintf(stderr, "Input/output options:\n");
272 | fprintf(stderr, " -a, --alternative output results in three-column format, not matrix\n");
273 | fprintf(stderr, " --cdr3 use the cdr3(_aa) column instead of junction(_aa)\n");
274 | fprintf(stderr, " --distance include sequence distance in pairs file\n");
275 | fprintf(stderr, " -k, --keep-columns STRING comma-separated columns to copy to pairs file\n");
276 | fprintf(stderr, " -l, --log FILENAME log to file (stderr*)\n");
277 | fprintf(stderr, " -o, --output FILENAME output results to file (stdout*)\n");
278 | fprintf(stderr, " --no-matrix do not keep or output any matrix\n");
279 | fprintf(stderr, " -p, --pairs FILENAME output matching pairs to file (none*)\n");
280 | fprintf(stderr, "\n");
281 | fprintf(stderr, " * default value\n");
282 | fprintf(stderr, "\n");
283 | }
284 |
285 | void show_header()
286 | {
287 | fprintf(logfile, "%s %s - %s\n", PROG_NAME, PROG_VERSION, PROG_BRIEF);
288 | fprintf(logfile, "https://github.com/uio-bmi/compairr\n");
289 | fprintf(logfile, "\n");
290 | }
291 |
292 | void args_init(int argc, char **argv)
293 | {
294 | /* Set defaults */
295 |
296 | progname = argv[0];
297 | input1_filename = nullptr;
298 | input2_filename = nullptr;
299 |
300 | opt_alternative = false;
301 | opt_cdr3 = false;
302 | opt_cluster = false;
303 | opt_deduplicate = false;
304 | opt_distance = false;
305 | opt_differences = 0;
306 | opt_existence = false;
307 | opt_help = false;
308 | opt_ignore_counts = false;
309 | opt_ignore_genes = false;
310 | opt_ignore_unknown = false;
311 | opt_ignore_empty = false;
312 | opt_indels = false;
313 | opt_keep_columns = nullptr;
314 | opt_log = nullptr;
315 | opt_matrix = false;
316 | opt_nucleotides = false;
317 | opt_no_matrix = false;
318 | opt_output = DASH_FILENAME;
319 | opt_pairs = nullptr;
320 | opt_score_int = 0;
321 | opt_score_string = NULL;
322 | opt_threads = 1;
323 | opt_version = false;
324 |
325 | opterr = 1;
326 |
327 | char short_options[] = "acd:efghik:l:mno:p:s:t:uvxz";
328 |
329 | /* unused short option letters: bejqrwy */
330 |
331 | static struct option long_options[] =
332 | {
333 | {"alternative", no_argument, nullptr, 'a' },
334 | {"cdr3", no_argument, nullptr, 0 },
335 | {"cluster", no_argument, nullptr, 'c' },
336 | {"differences", required_argument, nullptr, 'd' },
337 | {"distance", no_argument, nullptr, 0 },
338 | {"ignore-empty", no_argument, nullptr, 'e' },
339 | {"ignore-counts", no_argument, nullptr, 'f' },
340 | {"ignore-genes", no_argument, nullptr, 'g' },
341 | {"help", no_argument, nullptr, 'h' },
342 | {"indels", no_argument, nullptr, 'i' },
343 | {"keep-columns", required_argument, nullptr, 'k' },
344 | {"log", required_argument, nullptr, 'l' },
345 | {"matrix", no_argument, nullptr, 'm' },
346 | {"nucleotides", no_argument, nullptr, 'n' },
347 | {"no-matrix", no_argument, nullptr, 0 },
348 | {"output", required_argument, nullptr, 'o' },
349 | {"pairs", required_argument, nullptr, 'p' },
350 | {"score", required_argument, nullptr, 's' },
351 | {"summands", required_argument, nullptr, 's' },
352 | {"threads", required_argument, nullptr, 't' },
353 | {"ignore-unknown", no_argument, nullptr, 'u' },
354 | {"version", no_argument, nullptr, 'v' },
355 | {"existence", no_argument, nullptr, 'x' },
356 | {"deduplicate", no_argument, nullptr, 'z' },
357 | {nullptr, 0, nullptr, 0 }
358 | };
359 |
360 | enum
361 | {
362 | option_alternative,
363 | option_cdr3,
364 | option_cluster,
365 | option_differences,
366 | option_distance,
367 | option_ignore_empty,
368 | option_ignore_counts,
369 | option_ignore_genes,
370 | option_help,
371 | option_indels,
372 | option_keep_columns,
373 | option_log,
374 | option_matrix,
375 | option_nucleotides,
376 | option_no_matrix,
377 | option_output,
378 | option_pairs,
379 | option_score,
380 | option_summands,
381 | option_threads,
382 | option_ignore_unknown,
383 | option_version,
384 | option_existence,
385 | option_deduplicate
386 | };
387 |
388 | int used_options[26] = { 0, 0, 0, 0, 0,
389 | 0, 0, 0, 0, 0,
390 | 0, 0, 0, 0, 0,
391 | 0, 0, 0, 0, 0,
392 | 0, 0, 0, 0, 0,
393 | 0 };
394 |
395 | int option_index = 0;
396 | int c;
397 |
398 | while ((c = getopt_long(argc, argv, short_options, long_options, &option_index)) != -1)
399 | {
400 |
401 | /* check if any option is specified more than once */
402 |
403 | if ((c >= 'a') && (c <= 'z'))
404 | {
405 | int optindex = c - 'a';
406 | if (used_options[optindex] == 1)
407 | {
408 | int longoptindex = 0;
409 | while (long_options[longoptindex].name)
410 | {
411 | if (long_options[longoptindex].val == c)
412 | break;
413 | longoptindex++;
414 | }
415 |
416 | fprintf(stderr,
417 | "Error: Option -%c or --%s specified more than once.\n",
418 | c,
419 | long_options[longoptindex].name);
420 | exit(1);
421 | }
422 | used_options[optindex] = 1;
423 | }
424 |
425 | switch(c)
426 | {
427 | case 'a':
428 | /* alternative */
429 | opt_alternative = true;
430 | break;
431 |
432 | case 'c':
433 | /* cluster */
434 | opt_cluster = true;
435 | break;
436 |
437 | case 'd':
438 | /* differences */
439 | opt_differences = args_long(optarg, "-d or --differences");
440 | break;
441 |
442 | case 'e':
443 | /* ignore-empty */
444 | opt_ignore_empty = true;
445 | break;
446 |
447 | case 'f':
448 | /* ignore-counts */
449 | opt_ignore_counts = true;
450 | break;
451 |
452 | case 'g':
453 | /* ignore-genes */
454 | opt_ignore_genes = true;
455 | break;
456 |
457 | case 'h':
458 | /* help */
459 | opt_help = true;
460 | break;
461 |
462 | case 'i':
463 | /* indels */
464 | opt_indels = true;
465 | break;
466 |
467 | case 'k':
468 | /* keep_columns */
469 | opt_keep_columns = optarg;
470 | break;
471 |
472 | case 'l':
473 | /* log */
474 | opt_log = optarg;
475 | break;
476 |
477 | case 'm':
478 | /* matrix */
479 | opt_matrix = true;
480 | break;
481 |
482 | case 'n':
483 | /* nucleotides */
484 | opt_nucleotides = true;
485 | break;
486 |
487 | case 'o':
488 | /* output-file */
489 | opt_output = optarg;
490 | break;
491 |
492 | case 'p':
493 | /* pairs-file */
494 | opt_pairs = optarg;
495 | break;
496 |
497 | case 's':
498 | /* score, summands */
499 | opt_score_string = optarg;
500 | break;
501 |
502 | case 't':
503 | /* threads */
504 | opt_threads = args_long(optarg, "-t or --threads");
505 | break;
506 |
507 | case 'u':
508 | /* ignore-unknown */
509 | opt_ignore_unknown = true;
510 | break;
511 |
512 | case 'v':
513 | /* version */
514 | opt_version = true;
515 | break;
516 |
517 | case 'x':
518 | /* existence */
519 | opt_existence = true;
520 | break;
521 |
522 | case 'z':
523 | /* deduplicate */
524 | opt_deduplicate = true;
525 | break;
526 |
527 | case 0:
528 | /* long options only */
529 |
530 | switch (option_index)
531 | {
532 | case option_cdr3:
533 | /* cdr3 */
534 | opt_cdr3 = true;
535 | break;
536 |
537 | case option_distance:
538 | /* distance */
539 | opt_distance = true;
540 | break;
541 |
542 | case option_no_matrix:
543 | /* no_matrix */
544 | opt_no_matrix = true;
545 | break;
546 |
547 | default:
548 | show_header();
549 | args_usage();
550 | exit(1);
551 | }
552 | break;
553 |
554 | default:
555 | show_header();
556 | args_usage();
557 | exit(1);
558 | }
559 | }
560 |
561 | int cmd_count = opt_help + opt_version + opt_matrix + opt_cluster + opt_existence + opt_deduplicate;
562 | if (cmd_count == 0)
563 | fatal("Please specify a command (--help, --version, --matrix, --existence, --cluster, or --deduplicate)");
564 | if (cmd_count > 1)
565 | fatal("Please specify just one command (--help, --version, --matrix, --existence, --cluster, or --deduplicate)");
566 |
567 | if (opt_help || opt_version)
568 | {
569 | if (optind != argc)
570 | fatal("Incorrect number of arguments");
571 | }
572 | else if (opt_matrix)
573 | {
574 | if (optind + 2 == argc)
575 | {
576 | input1_filename = argv[optind];
577 | input2_filename = argv[optind + 1];
578 | }
579 | else if (optind + 1 == argc)
580 | {
581 | input1_filename = argv[optind];
582 | input2_filename = 0;
583 | }
584 | else
585 | {
586 | fatal("Incorrect number of arguments. One or two input files must be specified.");
587 | }
588 | }
589 | else if (opt_existence)
590 | {
591 | if (optind + 2 == argc)
592 | {
593 | input1_filename = argv[optind];
594 | input2_filename = argv[optind + 1];
595 | }
596 | else
597 | {
598 | fatal("Incorrect number of arguments. Two input files must be specified.");
599 | }
600 | }
601 | else if (opt_cluster || opt_deduplicate)
602 | {
603 | if (optind + 1 == argc)
604 | {
605 | input1_filename = argv[optind];
606 | }
607 | else
608 | {
609 | fatal("Incorrect number of arguments. One input file must be specified.");
610 | }
611 | }
612 |
613 | if (opt_deduplicate)
614 | {
615 | if (opt_differences != 0)
616 | fatal("Option -d or --differences must be 0 for deduplication.");
617 | if (opt_indels)
618 | fatal("Option -i or --indels is not allowed for deduplication.");
619 | }
620 |
621 | if (opt_keep_columns)
622 | {
623 | if (! opt_pairs)
624 | fatal("Option --keep-columns only allowed with --pairs options.");
625 | if (! parse_keep_columns())
626 | fatal("Illegal list of columns with --keep-columns option. It must be a comma-separated list of column names. Allowed symbols: A-Z, a-z, _, and 0-9.");
627 | }
628 |
629 | if ((opt_threads < 1) || (opt_threads > MAX_THREADS))
630 | {
631 | fprintf(stderr, "\nError: Illegal number of threads specified with "
632 | "-t or --threads, must be in the range 1 to %u.\n", MAX_THREADS);
633 | exit(1);
634 | }
635 |
636 | if (opt_differences < 0)
637 | fatal("Differences specified with -d or -differences cannot be negative.");
638 |
639 | if (opt_indels && (opt_differences != 1))
640 | fatal("Indels are only allowed when d=1");
641 |
642 | if (opt_cluster)
643 | {
644 | if (opt_pairs)
645 | fatal("Option -p or --pairs is not allowed with -c or --cluster");
646 | if (opt_alternative)
647 | fatal("Option -a or --alternative is not allowed with -c or --cluster");
648 | if (opt_score_string)
649 | fatal("Option -s or --score is not allowed with -c or --cluster");
650 | }
651 |
652 | if (opt_score_string)
653 | {
654 | opt_score_int = -1;
655 | for(int i = 0; i < score_end; i++)
656 | if (strcasecmp(opt_score_string, score_options[i]) == 0)
657 | {
658 | opt_score_int = i;
659 | break;
660 | }
661 | if (opt_score_int < 0)
662 | {
663 | fatal("Argument to -s or --score must be MH, Jaccard, product, ratio, min, max or mean");
664 | }
665 | }
666 |
667 | if (! opt_matrix)
668 | {
669 | if (opt_score_int == score_mh)
670 | {
671 | fatal("The Morisita-Horn index is only allowed when computing repertoire overlap");
672 | }
673 | if (opt_score_int == score_jaccard)
674 | {
675 | fatal("The Jaccard index is only allowed when computing repertoire overlap");
676 | }
677 | }
678 |
679 | if (opt_differences > 0)
680 | {
681 | if (opt_score_int == score_mh)
682 | {
683 | fatal("The Morisita-Horn index is not defined when d>0");
684 | }
685 | if (opt_score_int == score_jaccard)
686 | {
687 | fatal("The Jaccard index is not defined when d>0");
688 | }
689 | }
690 |
691 | if (opt_nucleotides)
692 | alphabet_size = 4;
693 | else
694 | alphabet_size = 20;
695 |
696 | if (opt_cdr3)
697 | if (opt_nucleotides)
698 | seq_header = "cdr3";
699 | else
700 | seq_header = "cdr3_aa";
701 | else
702 | if (opt_nucleotides)
703 | seq_header = "junction";
704 | else
705 | seq_header = "junction_aa";
706 | }
707 |
708 | void open_files()
709 | {
710 | /* open files */
711 |
712 | if (opt_log)
713 | {
714 | logfile = fopen_output(opt_log);
715 | if (! logfile)
716 | fatal("Unable to open log file for writing.");
717 | }
718 |
719 | outfile = fopen_output(opt_output);
720 | if (! outfile)
721 | fatal("Unable to open output file for writing.");
722 |
723 | if (opt_pairs)
724 | {
725 | pairsfile = fopen_output(opt_pairs);
726 | if (! pairsfile)
727 | fatal("Unable to open pairs file for writing.");
728 | }
729 | }
730 |
731 | void close_files()
732 | {
733 | if (pairsfile)
734 | fclose(pairsfile);
735 |
736 | if (outfile)
737 | fclose(outfile);
738 |
739 | if (logfile)
740 | fclose(logfile);
741 | }
742 |
743 | int main(int argc, char** argv)
744 | {
745 | logfile = stderr;
746 |
747 | arch_srandom(1);
748 |
749 | args_init(argc, argv);
750 |
751 | open_files();
752 |
753 | if (opt_version || opt_help)
754 | {
755 | show_header();
756 | if (opt_help)
757 | args_usage();
758 | close_files();
759 | exit(0);
760 | }
761 |
762 | show_header();
763 |
764 | show_time("Start time: ");
765 |
766 | args_show();
767 |
768 | fprintf(logfile, "\n");
769 |
770 | if (opt_matrix || opt_existence)
771 | overlap(input1_filename, input2_filename);
772 | else if (opt_deduplicate)
773 | dedup(input1_filename);
774 | else
775 | cluster(input1_filename);
776 |
777 | show_time("End time: ");
778 |
779 | if (keep_columns_no)
780 | {
781 | xfree(keep_columns_no);
782 | keep_columns_no = nullptr;
783 | }
784 |
785 | if (keep_columns_names)
786 | {
787 | xfree(keep_columns_names);
788 | keep_columns_names = nullptr;
789 | }
790 |
791 | if (keep_columns_strings)
792 | {
793 | xfree(keep_columns_strings);
794 | keep_columns_strings = nullptr;
795 | }
796 |
797 | close_files();
798 | }
799 |
--------------------------------------------------------------------------------
/src/compairr.h:
--------------------------------------------------------------------------------
1 | /*
2 | Copyright (C) 2012-2021 Torbjorn Rognes and Frederic Mahe
3 |
4 | This program is free software: you can redistribute it and/or modify
5 | it under the terms of the GNU Affero General Public License as
6 | published by the Free Software Foundation, either version 3 of the
7 | License, or (at your option) any later version.
8 |
9 | This program is distributed in the hope that it will be useful,
10 | but WITHOUT ANY WARRANTY; without even the implied warranty of
11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 | GNU Affero General Public License for more details.
13 |
14 | You should have received a copy of the GNU Affero General Public License
15 | along with this program. If not, see <https://www.gnu.org/licenses/>.
16 |
17 | Contact: Torbjorn Rognes ,
18 | Department of Informatics, University of Oslo,
19 | PO Box 1080 Blindern, NO-0316 Oslo, Norway
20 | */
21 |
22 | #include
23 |
/* Fallback printf format macros for 64-bit integers, used only when
   the headers included above did not define them. */

#if !defined(PRIu64)
#  if defined(_WIN32)
#    define PRIu64 "I64u"
#  else
#    define PRIu64 "lu"
#  endif
#endif

#if !defined(PRId64)
#  if defined(_WIN32)
#    define PRId64 "I64d"
#  else
#    define PRId64 "ld"
#  endif
#endif
39 |
40 | #include
41 | #include
42 | #include
43 | #include
44 | #include
45 | #include
46 | #include
47 | #include
48 | #include
49 | #include
50 | #include
51 | #include
52 | #include
53 | #include
54 |
55 | #include
56 | #include
57 | #include
58 | #include