├── .editorconfig
├── .gitignore
├── LICENSE
├── MANIFEST.in
├── Makefile
├── README.rst
├── docs
│   ├── Makefile
│   ├── conf.py
│   └── index.rst
├── requirements.txt
├── setup.py
├── src
│   ├── .ignore
│   ├── _arrayops.h
│   ├── arrayops.pxi
│   ├── bitcount.h
│   ├── bitops.pxi
│   ├── block.pxi
│   ├── immutablerb.pxi
│   ├── macros.h
│   ├── multirb.pxi
│   ├── rbbinaryops.pxi
│   └── roaringbitmap.pyx
└── tests
    ├── benchmarks.py
    └── unittests.py
/.editorconfig:
--------------------------------------------------------------------------------
1 | # http://editorconfig.org/
2 | root = true
3 |
4 | [*]
5 | end_of_line = lf
6 | insert_final_newline = true
7 |
8 | [*.{py,pyx,pxd,pxi,c,h,cpp,css,js}]
9 | charset = utf-8
10 | indent_style = tab
11 | indent_size = 4
12 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | MANIFEST
2 | src/.*.swp
3 |
4 | # Byte-compiled / optimized / DLL files
5 | __pycache__/
6 | *.py[cod]
7 |
8 | # Cython-generated files
9 | src/*.c
10 | src/*.html
11 |
12 | # C extensions
13 | *.so
14 |
15 | # Distribution / packaging
16 | .Python
17 | env/
18 | build/
19 | develop-eggs/
20 | dist/
21 | downloads/
22 | eggs/
23 | lib/
24 | lib64/
25 | parts/
26 | sdist/
27 | var/
28 | *.egg-info/
29 | .installed.cfg
30 | *.egg
31 |
32 | # PyInstaller
33 | # Usually these files are written by a python script from a template
34 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
35 | *.manifest
36 | *.spec
37 |
38 | # Installer logs
39 | pip-log.txt
40 | pip-delete-this-directory.txt
41 |
42 | # Unit test / coverage reports
43 | htmlcov/
44 | .tox/
45 | .coverage
46 | .cache
47 | nosetests.xml
48 | coverage.xml
49 |
50 | # Translations
51 | *.mo
52 | *.pot
53 |
54 | # Django stuff:
55 | *.log
56 |
57 | # Sphinx documentation
58 | docs/_build/
59 |
60 | # PyBuilder
61 | target/
62 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | GNU GENERAL PUBLIC LICENSE
2 | Version 2, June 1991
3 |
4 | Copyright (C) 1989, 1991 Free Software Foundation, Inc.,
5 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
6 | Everyone is permitted to copy and distribute verbatim copies
7 | of this license document, but changing it is not allowed.
8 |
9 | Preamble
10 |
11 | The licenses for most software are designed to take away your
12 | freedom to share and change it. By contrast, the GNU General Public
13 | License is intended to guarantee your freedom to share and change free
14 | software--to make sure the software is free for all its users. This
15 | General Public License applies to most of the Free Software
16 | Foundation's software and to any other program whose authors commit to
17 | using it. (Some other Free Software Foundation software is covered by
18 | the GNU Lesser General Public License instead.) You can apply it to
19 | your programs, too.
20 |
21 | When we speak of free software, we are referring to freedom, not
22 | price. Our General Public Licenses are designed to make sure that you
23 | have the freedom to distribute copies of free software (and charge for
24 | this service if you wish), that you receive source code or can get it
25 | if you want it, that you can change the software or use pieces of it
26 | in new free programs; and that you know you can do these things.
27 |
28 | To protect your rights, we need to make restrictions that forbid
29 | anyone to deny you these rights or to ask you to surrender the rights.
30 | These restrictions translate to certain responsibilities for you if you
31 | distribute copies of the software, or if you modify it.
32 |
33 | For example, if you distribute copies of such a program, whether
34 | gratis or for a fee, you must give the recipients all the rights that
35 | you have. You must make sure that they, too, receive or can get the
36 | source code. And you must show them these terms so they know their
37 | rights.
38 |
39 | We protect your rights with two steps: (1) copyright the software, and
40 | (2) offer you this license which gives you legal permission to copy,
41 | distribute and/or modify the software.
42 |
43 | Also, for each author's protection and ours, we want to make certain
44 | that everyone understands that there is no warranty for this free
45 | software. If the software is modified by someone else and passed on, we
46 | want its recipients to know that what they have is not the original, so
47 | that any problems introduced by others will not reflect on the original
48 | authors' reputations.
49 |
50 | Finally, any free program is threatened constantly by software
51 | patents. We wish to avoid the danger that redistributors of a free
52 | program will individually obtain patent licenses, in effect making the
53 | program proprietary. To prevent this, we have made it clear that any
54 | patent must be licensed for everyone's free use or not licensed at all.
55 |
56 | The precise terms and conditions for copying, distribution and
57 | modification follow.
58 |
59 | GNU GENERAL PUBLIC LICENSE
60 | TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
61 |
62 | 0. This License applies to any program or other work which contains
63 | a notice placed by the copyright holder saying it may be distributed
64 | under the terms of this General Public License. The "Program", below,
65 | refers to any such program or work, and a "work based on the Program"
66 | means either the Program or any derivative work under copyright law:
67 | that is to say, a work containing the Program or a portion of it,
68 | either verbatim or with modifications and/or translated into another
69 | language. (Hereinafter, translation is included without limitation in
70 | the term "modification".) Each licensee is addressed as "you".
71 |
72 | Activities other than copying, distribution and modification are not
73 | covered by this License; they are outside its scope. The act of
74 | running the Program is not restricted, and the output from the Program
75 | is covered only if its contents constitute a work based on the
76 | Program (independent of having been made by running the Program).
77 | Whether that is true depends on what the Program does.
78 |
79 | 1. You may copy and distribute verbatim copies of the Program's
80 | source code as you receive it, in any medium, provided that you
81 | conspicuously and appropriately publish on each copy an appropriate
82 | copyright notice and disclaimer of warranty; keep intact all the
83 | notices that refer to this License and to the absence of any warranty;
84 | and give any other recipients of the Program a copy of this License
85 | along with the Program.
86 |
87 | You may charge a fee for the physical act of transferring a copy, and
88 | you may at your option offer warranty protection in exchange for a fee.
89 |
90 | 2. You may modify your copy or copies of the Program or any portion
91 | of it, thus forming a work based on the Program, and copy and
92 | distribute such modifications or work under the terms of Section 1
93 | above, provided that you also meet all of these conditions:
94 |
95 | a) You must cause the modified files to carry prominent notices
96 | stating that you changed the files and the date of any change.
97 |
98 | b) You must cause any work that you distribute or publish, that in
99 | whole or in part contains or is derived from the Program or any
100 | part thereof, to be licensed as a whole at no charge to all third
101 | parties under the terms of this License.
102 |
103 | c) If the modified program normally reads commands interactively
104 | when run, you must cause it, when started running for such
105 | interactive use in the most ordinary way, to print or display an
106 | announcement including an appropriate copyright notice and a
107 | notice that there is no warranty (or else, saying that you provide
108 | a warranty) and that users may redistribute the program under
109 | these conditions, and telling the user how to view a copy of this
110 | License. (Exception: if the Program itself is interactive but
111 | does not normally print such an announcement, your work based on
112 | the Program is not required to print an announcement.)
113 |
114 | These requirements apply to the modified work as a whole. If
115 | identifiable sections of that work are not derived from the Program,
116 | and can be reasonably considered independent and separate works in
117 | themselves, then this License, and its terms, do not apply to those
118 | sections when you distribute them as separate works. But when you
119 | distribute the same sections as part of a whole which is a work based
120 | on the Program, the distribution of the whole must be on the terms of
121 | this License, whose permissions for other licensees extend to the
122 | entire whole, and thus to each and every part regardless of who wrote it.
123 |
124 | Thus, it is not the intent of this section to claim rights or contest
125 | your rights to work written entirely by you; rather, the intent is to
126 | exercise the right to control the distribution of derivative or
127 | collective works based on the Program.
128 |
129 | In addition, mere aggregation of another work not based on the Program
130 | with the Program (or with a work based on the Program) on a volume of
131 | a storage or distribution medium does not bring the other work under
132 | the scope of this License.
133 |
134 | 3. You may copy and distribute the Program (or a work based on it,
135 | under Section 2) in object code or executable form under the terms of
136 | Sections 1 and 2 above provided that you also do one of the following:
137 |
138 | a) Accompany it with the complete corresponding machine-readable
139 | source code, which must be distributed under the terms of Sections
140 | 1 and 2 above on a medium customarily used for software interchange; or,
141 |
142 | b) Accompany it with a written offer, valid for at least three
143 | years, to give any third party, for a charge no more than your
144 | cost of physically performing source distribution, a complete
145 | machine-readable copy of the corresponding source code, to be
146 | distributed under the terms of Sections 1 and 2 above on a medium
147 | customarily used for software interchange; or,
148 |
149 | c) Accompany it with the information you received as to the offer
150 | to distribute corresponding source code. (This alternative is
151 | allowed only for noncommercial distribution and only if you
152 | received the program in object code or executable form with such
153 | an offer, in accord with Subsection b above.)
154 |
155 | The source code for a work means the preferred form of the work for
156 | making modifications to it. For an executable work, complete source
157 | code means all the source code for all modules it contains, plus any
158 | associated interface definition files, plus the scripts used to
159 | control compilation and installation of the executable. However, as a
160 | special exception, the source code distributed need not include
161 | anything that is normally distributed (in either source or binary
162 | form) with the major components (compiler, kernel, and so on) of the
163 | operating system on which the executable runs, unless that component
164 | itself accompanies the executable.
165 |
166 | If distribution of executable or object code is made by offering
167 | access to copy from a designated place, then offering equivalent
168 | access to copy the source code from the same place counts as
169 | distribution of the source code, even though third parties are not
170 | compelled to copy the source along with the object code.
171 |
172 | 4. You may not copy, modify, sublicense, or distribute the Program
173 | except as expressly provided under this License. Any attempt
174 | otherwise to copy, modify, sublicense or distribute the Program is
175 | void, and will automatically terminate your rights under this License.
176 | However, parties who have received copies, or rights, from you under
177 | this License will not have their licenses terminated so long as such
178 | parties remain in full compliance.
179 |
180 | 5. You are not required to accept this License, since you have not
181 | signed it. However, nothing else grants you permission to modify or
182 | distribute the Program or its derivative works. These actions are
183 | prohibited by law if you do not accept this License. Therefore, by
184 | modifying or distributing the Program (or any work based on the
185 | Program), you indicate your acceptance of this License to do so, and
186 | all its terms and conditions for copying, distributing or modifying
187 | the Program or works based on it.
188 |
189 | 6. Each time you redistribute the Program (or any work based on the
190 | Program), the recipient automatically receives a license from the
191 | original licensor to copy, distribute or modify the Program subject to
192 | these terms and conditions. You may not impose any further
193 | restrictions on the recipients' exercise of the rights granted herein.
194 | You are not responsible for enforcing compliance by third parties to
195 | this License.
196 |
197 | 7. If, as a consequence of a court judgment or allegation of patent
198 | infringement or for any other reason (not limited to patent issues),
199 | conditions are imposed on you (whether by court order, agreement or
200 | otherwise) that contradict the conditions of this License, they do not
201 | excuse you from the conditions of this License. If you cannot
202 | distribute so as to satisfy simultaneously your obligations under this
203 | License and any other pertinent obligations, then as a consequence you
204 | may not distribute the Program at all. For example, if a patent
205 | license would not permit royalty-free redistribution of the Program by
206 | all those who receive copies directly or indirectly through you, then
207 | the only way you could satisfy both it and this License would be to
208 | refrain entirely from distribution of the Program.
209 |
210 | If any portion of this section is held invalid or unenforceable under
211 | any particular circumstance, the balance of the section is intended to
212 | apply and the section as a whole is intended to apply in other
213 | circumstances.
214 |
215 | It is not the purpose of this section to induce you to infringe any
216 | patents or other property right claims or to contest validity of any
217 | such claims; this section has the sole purpose of protecting the
218 | integrity of the free software distribution system, which is
219 | implemented by public license practices. Many people have made
220 | generous contributions to the wide range of software distributed
221 | through that system in reliance on consistent application of that
222 | system; it is up to the author/donor to decide if he or she is willing
223 | to distribute software through any other system and a licensee cannot
224 | impose that choice.
225 |
226 | This section is intended to make thoroughly clear what is believed to
227 | be a consequence of the rest of this License.
228 |
229 | 8. If the distribution and/or use of the Program is restricted in
230 | certain countries either by patents or by copyrighted interfaces, the
231 | original copyright holder who places the Program under this License
232 | may add an explicit geographical distribution limitation excluding
233 | those countries, so that distribution is permitted only in or among
234 | countries not thus excluded. In such case, this License incorporates
235 | the limitation as if written in the body of this License.
236 |
237 | 9. The Free Software Foundation may publish revised and/or new versions
238 | of the General Public License from time to time. Such new versions will
239 | be similar in spirit to the present version, but may differ in detail to
240 | address new problems or concerns.
241 |
242 | Each version is given a distinguishing version number. If the Program
243 | specifies a version number of this License which applies to it and "any
244 | later version", you have the option of following the terms and conditions
245 | either of that version or of any later version published by the Free
246 | Software Foundation. If the Program does not specify a version number of
247 | this License, you may choose any version ever published by the Free Software
248 | Foundation.
249 |
250 | 10. If you wish to incorporate parts of the Program into other free
251 | programs whose distribution conditions are different, write to the author
252 | to ask for permission. For software which is copyrighted by the Free
253 | Software Foundation, write to the Free Software Foundation; we sometimes
254 | make exceptions for this. Our decision will be guided by the two goals
255 | of preserving the free status of all derivatives of our free software and
256 | of promoting the sharing and reuse of software generally.
257 |
258 | NO WARRANTY
259 |
260 | 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
261 | FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN
262 | OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
263 | PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
264 | OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
265 | MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS
266 | TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE
267 | PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
268 | REPAIR OR CORRECTION.
269 |
270 | 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
271 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
272 | REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
273 | INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
274 | OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
275 | TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
276 | YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
277 | PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
278 | POSSIBILITY OF SUCH DAMAGES.
279 |
280 | END OF TERMS AND CONDITIONS
281 |
282 | How to Apply These Terms to Your New Programs
283 |
284 | If you develop a new program, and you want it to be of the greatest
285 | possible use to the public, the best way to achieve this is to make it
286 | free software which everyone can redistribute and change under these terms.
287 |
288 | To do so, attach the following notices to the program. It is safest
289 | to attach them to the start of each source file to most effectively
290 | convey the exclusion of warranty; and each file should have at least
291 | the "copyright" line and a pointer to where the full notice is found.
292 |
293 | {description}
294 | Copyright (C) {year} {fullname}
295 |
296 | This program is free software; you can redistribute it and/or modify
297 | it under the terms of the GNU General Public License as published by
298 | the Free Software Foundation; either version 2 of the License, or
299 | (at your option) any later version.
300 |
301 | This program is distributed in the hope that it will be useful,
302 | but WITHOUT ANY WARRANTY; without even the implied warranty of
303 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
304 | GNU General Public License for more details.
305 |
306 | You should have received a copy of the GNU General Public License along
307 | with this program; if not, write to the Free Software Foundation, Inc.,
308 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
309 |
310 | Also add information on how to contact you by electronic and paper mail.
311 |
312 | If the program is interactive, make it output a short notice like this
313 | when it starts in an interactive mode:
314 |
315 | Gnomovision version 69, Copyright (C) year name of author
316 | Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
317 | This is free software, and you are welcome to redistribute it
318 | under certain conditions; type `show c' for details.
319 |
320 | The hypothetical commands `show w' and `show c' should show the appropriate
321 | parts of the General Public License. Of course, the commands you use may
322 | be called something other than `show w' and `show c'; they could even be
323 | mouse-clicks or menu items--whatever suits your program.
324 |
325 | You should also get your employer (if you work as a programmer) or your
326 | school, if any, to sign a "copyright disclaimer" for the program, if
327 | necessary. Here is a sample; alter the names:
328 |
329 | Yoyodyne, Inc., hereby disclaims all copyright interest in the program
330 | `Gnomovision' (which makes passes at compilers) written by James Hacker.
331 |
332 | {signature of Ty Coon}, 1 April 1989
333 | Ty Coon, President of Vice
334 |
335 | This General Public License does not permit incorporating your program into
336 | proprietary programs. If your program is a subroutine library, you may
337 | consider it more useful to permit linking proprietary applications with the
338 | library. If this is what you want to do, use the GNU Lesser General
339 | Public License instead of this License.
340 |
341 |
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include README.rst LICENSE setup.py
2 | recursive-include src *.h *.c
3 | recursive-exclude src *.pyx *.pxi *.pxd
4 |
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | all:
2 | python3 setup.py install --user
3 |
4 | clean:
5 | rm -rf build/ src/roaringbitmap.h
6 | find src/ -name '*.c' -delete
7 | find src/ -name '*.so' -delete
8 | find src/ -name '*.pyc' -delete
9 | find src/ -name '*.html' -delete
10 | find tests/ -name '*.pyc' -delete
11 | rm -rf src/__pycache__ tests/__pycache__
12 |
13 | test: all
14 | ulimit -Sv 500000; python3 -m pytest tests/unittests.py
15 |
16 | bench: all
17 | ulimit -Sv 500000; python3 tests/benchmarks.py
18 |
19 | lint:
20 | pycodestyle --ignore=E1,W1,W503 tests/*.py \
21 | && pycodestyle --ignore=E1,W1,F,E901,E225,E227,E211,W503 \
22 | src/*.pyx src/*.pxi
23 |
24 | py2:
25 | python2 setup.py install --user
26 |
27 | test2: py2
28 | python2 -m pytest tests/unittests.py
29 |
30 | bench2: all
31 | ulimit -Sv 500000; python2 tests/benchmarks.py
32 |
33 | debug:
34 | python3-dbg setup.py install --user --debug
35 |
36 | debug2:
37 | python2-dbg setup.py install --user --debug
38 |
39 | testdebug: debug
40 | gdb -ex run --args python3-dbg -m pytest tests/unittests.py -v
41 |
42 | testdebug2: debug2
43 | gdb -ex run --args python2-dbg -m pytest tests/unittests.py -v
44 |
45 | valgrind:
46 | python3-dbg setup.py install --user --debug
47 | valgrind --tool=memcheck --suppressions=valgrind-python.supp \
48 | --leak-check=full --show-leak-kinds=definite \
49 | python3.5-dbg -m pytest tests/unittests.py -v
50 |
--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
1 | Roaring Bitmap in Cython
2 | ========================
3 |
4 | A roaring bitmap is an efficient compressed data structure to store a set
5 | of integers. A roaring bitmap stores a set of 32-bit integers in a series of
6 | arrays and bitmaps, whichever takes the least space (which is always
7 | ``2 ** 16`` bits or less).
8 |
9 | This data structure is useful for storing a large number of integers, e.g., for
10 | an inverted index used by search engines and databases. In particular, it is
11 | possible to quickly compute the intersection of a series of sets, which can be
12 | used to implement a query as the conjunction of subqueries.
13 |
14 | This implementation is based on the Java and C implementations at
15 | https://github.com/lemire/RoaringBitmap
16 | and https://github.com/lemire/CRoaring
17 |
18 | Additional features of this implementation:
19 |
20 | - Inverted list representation: blocks that are mostly full are stored
21 | compactly as an array of non-members (instead of as an array of members or a
22 | fixed-size bitmap).
23 | - Collections of immutable roaring bitmaps can be efficiently serialized with
24 | ``mmap`` in a single file.
25 |
26 | Missing features w.r.t. CRoaring:
27 |
28 | - Run-length encoded blocks
29 | - Various AVX2 / SSE optimizations
30 |
31 | See also PyRoaringBitMap, a Python wrapper of CRoaring:
32 | https://github.com/Ezibenroc/PyRoaringBitMap
33 |
34 | License, requirements
35 | ---------------------
36 | The code is licensed under GNU GPL v2, or any later version at your option.
37 |
38 | - Python 2.7+/3.3+ http://www.python.org (headers required, e.g. python-dev package)
39 | - Cython 0.20+ http://www.cython.org
40 |
41 | Installation, usage
42 | -------------------
43 |
44 | ::
45 |
46 | $ git clone https://github.com/andreasvc/roaringbitmap.git
47 | $ cd roaringbitmap
48 | $ make
49 |
50 | (or ``make py2`` for Python 2)
51 |
52 | A ``RoaringBitmap()`` can be used as a replacement for a normal (mutable)
53 | Python set containing (unsigned) 32-bit integers:
54 |
55 | .. code-block:: python
56 |
57 | >>> from roaringbitmap import RoaringBitmap
58 | >>> RoaringBitmap(range(10)) & RoaringBitmap(range(5, 15))
59 | RoaringBitmap({5, 6, 7, 8, 9})
60 |
61 | ``ImmutableRoaringBitmap`` is an immutable variant (analogous to ``frozenset``)
62 | which is stored compactly as a contiguous block of memory.
63 |
64 | A sequence of immutable RoaringBitmaps can be stored in a single file and
65 | accessed efficiently with ``mmap``, without needing to copy or deserialize:
66 |
67 | .. code-block:: python
68 |
69 | >>> from roaringbitmap import MultiRoaringBitmap
70 | >>> mrb = MultiRoaringBitmap([range(n, n + 5) for n in range(10)], filename='index')
71 |
72 | >>> mrb = MultiRoaringBitmap.fromfile('index')
73 | >>> mrb[5]
74 | ImmutableRoaringBitmap({5, 6, 7, 8, 9})
75 |
76 | For API documentation cf. http://roaringbitmap.readthedocs.io
77 |
78 | Benchmarks
79 | ----------
80 | Output of ``$ make bench``::
81 |
82 | small sparse set
83 | 100 runs with sets of 200 random elements n s.t. 0 <= n < 40000
84 | set() RoaringBitmap() ratio
85 | init 0.000834 0.00138 0.603
86 | initsort 0.00085 0.000394 2.16
87 | and 0.00102 8.49e-05 12.1
88 | or 0.00171 0.000169 10.1
89 | xor 0.00152 0.000213 7.11
90 | sub 0.000934 0.000197 4.74
91 | iand 1.29e-05 2.97e-06 4.35
92 | ior 9.7e-06 3.26e-06 2.98
93 | ixor 8.98e-06 3.43e-06 2.62
94 | isub 6.83e-06 3.3e-06 2.07
95 | eq 0.000438 1.17e-05 37.6
96 | neq 6.37e-06 7.81e-06 0.816
97 | jaccard 0.0029 0.000126 23.1
98 |
99 | medium load factor
100 | 100 runs with sets of 59392 random elements n s.t. 0 <= n < 118784
101 | set() RoaringBitmap() ratio
102 | init 0.564 0.324 1.74
103 | initsort 0.696 0.273 2.55
104 | and 0.613 0.000418 1466
105 | or 0.976 0.000292 3344
106 | xor 0.955 0.000294 3250
107 | sub 0.346 0.000316 1092
108 | iand 0.00658 1.14e-05 575
109 | ior 0.00594 1.08e-05 548
110 | ixor 0.00434 1.12e-05 385
111 | isub 0.00431 1.09e-05 397
112 | eq 0.0991 0.000116 851
113 | neq 9.62e-06 1.29e-05 0.743
114 | jaccard 1.62 0.00025 6476
115 |
116 | dense set / high load factor
117 | 100 runs with sets of 39800 random elements n s.t. 0 <= n < 40000
118 | set() RoaringBitmap() ratio
119 | init 0.33 0.0775 4.26
120 | initsort 0.352 0.148 2.38
121 | and 0.24 0.000223 1078
122 | or 0.45 0.000165 2734
123 | xor 0.404 0.000161 2514
124 | sub 0.169 0.000173 973
125 | iand 0.00287 6.02e-06 477
126 | ior 0.00179 6.34e-06 282
127 | ixor 0.00195 5.53e-06 353
128 | isub 0.0017 6.35e-06 267
129 | eq 0.0486 4.65e-05 1045
130 | neq 1.01e-05 1.13e-05 0.888
131 | jaccard 0.722 0.000118 6136
132 |
133 | See https://github.com/Ezibenroc/roaring_analysis/ for a performance comparison
134 | of PyRoaringBitMap and this library.
135 |
136 | References
137 | ----------
138 | - http://roaringbitmap.org/
139 | - Chambi, S., Lemire, D., Kaser, O., & Godin, R. (2016). Better bitmap
140 | performance with Roaring bitmaps. Software: practice and experience, 46(5),
141 | pp. 709-719. http://arxiv.org/abs/1402.6407
142 | - The idea of using the inverted list representation is based on
143 | https://issues.apache.org/jira/browse/LUCENE-5983
144 |
--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
1 | # Makefile for Sphinx documentation
2 | #
3 |
4 | # You can set these variables from the command line.
5 | SPHINXOPTS =
6 | SPHINXBUILD = python3 `which sphinx-build`
7 | PAPER =
8 | BUILDDIR = _build
9 |
10 | # Internal variables.
11 | PAPEROPT_a4 = -D latex_paper_size=a4
12 | PAPEROPT_letter = -D latex_paper_size=letter
13 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
14 | # the i18n builder cannot share the environment and doctrees with the others
15 | I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
16 |
17 | .PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext
18 |
19 | help:
20 | @echo "Please use \`make <target>' where <target> is one of"
21 | @echo " html to make standalone HTML files"
22 | @echo " dirhtml to make HTML files named index.html in directories"
23 | @echo " singlehtml to make a single large HTML file"
24 | @echo " pickle to make pickle files"
25 | @echo " json to make JSON files"
26 | @echo " htmlhelp to make HTML files and a HTML help project"
27 | @echo " qthelp to make HTML files and a qthelp project"
28 | @echo " devhelp to make HTML files and a Devhelp project"
29 | @echo " epub to make an epub"
30 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
31 | @echo " latexpdf to make LaTeX files and run them through pdflatex"
32 | @echo " text to make text files"
33 | @echo " man to make manual pages"
34 | @echo " texinfo to make Texinfo files"
35 | @echo " info to make Texinfo files and run them through makeinfo"
36 | @echo " gettext to make PO message catalogs"
37 | @echo " changes to make an overview of all changed/added/deprecated items"
38 | @echo " linkcheck to check all external links for integrity"
39 | @echo " doctest to run all doctests embedded in the documentation (if enabled)"
40 |
41 | clean:
42 | -rm -rf $(BUILDDIR)/*
43 |
44 | html:
45 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
46 | @echo
47 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
48 |
49 | dirhtml:
50 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
51 | @echo
52 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."
53 |
54 | singlehtml:
55 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml
56 | @echo
57 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml."
58 |
59 | pickle:
60 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle
61 | @echo
62 | @echo "Build finished; now you can process the pickle files."
63 |
64 | json:
65 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json
66 | @echo
67 | @echo "Build finished; now you can process the JSON files."
68 |
69 | htmlhelp:
70 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp
71 | @echo
72 | @echo "Build finished; now you can run HTML Help Workshop with the" \
73 | ".hhp project file in $(BUILDDIR)/htmlhelp."
74 |
75 | qthelp:
76 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp
77 | @echo
78 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \
79 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:"
80 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/roaringbitmap.qhcp"
81 | @echo "To view the help file:"
82 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/roaringbitmap.qhc"
83 |
84 | devhelp:
85 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp
86 | @echo
87 | @echo "Build finished."
88 | @echo "To view the help file:"
89 | @echo "# mkdir -p $$HOME/.local/share/devhelp/roaringbitmap"
90 | @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/roaringbitmap"
91 | @echo "# devhelp"
92 |
93 | epub:
94 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub
95 | @echo
96 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub."
97 |
98 | latex:
99 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
100 | @echo
101 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex."
102 | @echo "Run \`make' in that directory to run these through (pdf)latex" \
103 | "(use \`make latexpdf' here to do that automatically)."
104 |
105 | latexpdf:
106 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
107 | @echo "Running LaTeX files through pdflatex..."
108 | $(MAKE) -C $(BUILDDIR)/latex all-pdf
109 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
110 |
111 | text:
112 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text
113 | @echo
114 | @echo "Build finished. The text files are in $(BUILDDIR)/text."
115 |
116 | man:
117 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man
118 | @echo
119 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man."
120 |
121 | texinfo:
122 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
123 | @echo
124 | @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo."
125 | @echo "Run \`make' in that directory to run these through makeinfo" \
126 | "(use \`make info' here to do that automatically)."
127 |
# Build the Texinfo sources, then convert them to Info files with makeinfo.
# The recursive invocation goes through $(MAKE) (as in the latexpdf target)
# so that the same make program and its command line flags are used for the
# sub-make.
info:
	$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
	@echo "Running Texinfo files through makeinfo..."
	$(MAKE) -C $(BUILDDIR)/texinfo info
	@echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo."
133 |
134 | gettext:
135 | $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale
136 | @echo
137 | @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale."
138 |
139 | changes:
140 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes
141 | @echo
142 | @echo "The overview file is in $(BUILDDIR)/changes."
143 |
144 | linkcheck:
145 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck
146 | @echo
147 | @echo "Link check complete; look for any errors in the above output " \
148 | "or in $(BUILDDIR)/linkcheck/output.txt."
149 |
150 | doctest:
151 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest
152 | @echo "Testing of doctests in the sources finished, look at the " \
153 | "results in $(BUILDDIR)/doctest/output.txt."
154 |
--------------------------------------------------------------------------------
/docs/conf.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | #
3 | # This file is execfile()d with the current directory set to its containing dir.
4 | #
5 | # Note that not all possible configuration values are present in this
6 | # autogenerated file.
7 | #
8 | # All configuration values have a default; values that are commented out
9 | # serve to show the default.
10 |
11 | import sys, os
12 |
13 | # If extensions (or modules to document with autodoc) are in another directory,
14 | # add these directories to sys.path here. If the directory is relative to the
15 | # documentation root, use os.path.abspath to make it absolute, like shown here.
16 | #sys.path.insert(0, os.path.abspath('.'))
17 |
18 | # -- General configuration ----------------------------------------------------
19 |
20 | # If your documentation needs a minimal Sphinx version, state it here.
21 | #needs_sphinx = '1.0'
22 |
23 | # Add any Sphinx extension module names here, as strings. They can be
24 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom ones.
25 | extensions = ['sphinx.ext.autodoc', 'sphinx.ext.viewcode']
26 |
27 | # Add any paths that contain templates here, relative to this directory.
28 | templates_path = ['_templates']
29 |
30 | # The suffix of source filenames.
31 | source_suffix = '.rst'
32 |
33 | # The encoding of source files.
34 | #source_encoding = 'utf-8-sig'
35 |
36 | # The master toctree document.
37 | master_doc = 'index'
38 |
39 | # General information about the project.
40 | project = u'roaringbitmap'
41 | copyright = u'2022, Andreas van Cranenburgh'
42 |
43 | # The version info for the project you're documenting, acts as replacement for
44 | # |version| and |release|, also used in various other places throughout the
45 | # built documents.
46 | #
47 | # The short X.Y version.
48 | version = '0.7'
49 | # The full version, including alpha/beta/rc tags.
50 | release = '0.7.2'
51 |
52 | # The language for content autogenerated by Sphinx. Refer to documentation
53 | # for a list of supported languages.
54 | #language = None
55 |
56 | # There are two options for replacing |today|: either, you set today to some
57 | # non-false value, then it is used:
58 | #today = ''
59 | # Else, today_fmt is used as the format for a strftime call.
60 | #today_fmt = '%B %d, %Y'
61 |
62 | # List of patterns, relative to source directory, that match files and
63 | # directories to ignore when looking for source files.
64 | exclude_patterns = ['_build']
65 |
66 | # The reST default role (used for this markup: `text`) to use for all documents
67 | #default_role = None
68 |
69 | # If true, '()' will be appended to :func: etc. cross-reference text.
70 | #add_function_parentheses = True
71 |
72 | # If true, the current module name will be prepended to all description
73 | # unit titles (such as .. function::).
74 | #add_module_names = True
75 |
76 | # If true, sectionauthor and moduleauthor directives will be shown in the
77 | # output. They are ignored by default.
78 | #show_authors = False
79 |
80 | # The name of the Pygments (syntax highlighting) style to use.
81 | pygments_style = 'sphinx'
82 |
83 | # A list of ignored prefixes for module index sorting.
84 | #modindex_common_prefix = []
85 |
86 | autodoc_member_order = 'bysource'
87 | autodoc_default_flags = ['members']
88 |
89 | # -- Options for HTML output --------------------------------------------------
90 |
91 | ## on_rtd is whether we are on readthedocs.org, this line of code grabbed from docs.readthedocs.org
92 | #on_rtd = os.environ.get('READTHEDOCS', None) == 'True'
93 | #
94 | #if not on_rtd: # only import and set the theme if we're building docs locally
95 | # import sphinx_rtd_theme
96 | # html_theme = 'sphinx_rtd_theme'
97 | # html_theme_path = [sphinx_rtd_theme.get_html_theme_path()]
98 | ## otherwise, readthedocs.org uses their theme by default, so no need to specify it
99 |
100 | html_theme = 'nature'
101 |
102 | # The name for this set of Sphinx documents. If None, it defaults to
103 | # "<project> v<release> documentation".
104 | #html_title = None
105 |
106 | # A shorter title for the navigation bar. Default is the same as html_title.
107 | #html_short_title = None
108 |
109 | # The name of an image file (relative to this directory) to place at the top
110 | # of the sidebar.
111 | #html_logo = None
112 |
113 | # The name of an image file (within the static path) to use as favicon of the
114 | # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32
115 | # pixels large.
116 | #html_favicon = None
117 |
118 | # Add any paths that contain custom static files (such as style sheets) here,
119 | # relative to this directory. They are copied after the builtin static files,
120 | # so a file named "default.css" will overwrite the builtin "default.css".
121 | html_static_path = []
122 |
123 | # If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
124 | # using the given strftime format.
125 | #html_last_updated_fmt = '%b %d, %Y'
126 |
127 | # If true, SmartyPants will be used to convert quotes and dashes to
128 | # typographically correct entities.
129 | #html_use_smartypants = True
130 |
131 | # Custom sidebar templates, maps document names to template names.
132 | html_sidebars = {'**': [
133 | 'globaltoc.html',
134 | 'searchbox.html',
135 | #'localtoc.html',
136 | #'relations.html',
137 | #'sourcelink.html',
138 | ], }
139 |
140 | # Additional templates that should be rendered to pages, maps page names to
141 | # template names.
142 | #html_additional_pages = {}
143 |
144 | # If false, no module index is generated.
145 | html_domain_indices = False
146 |
147 | # If false, no index is generated.
148 | html_use_index = False
149 |
150 | # If true, the index is split into individual pages for each letter.
151 | #html_split_index = False
152 |
153 | # If true, links to the reST sources are added to the pages.
154 | html_show_sourcelink = False
155 |
156 | # If true, "Created using Sphinx" is shown in the HTML footer. Default is True.
157 | #html_show_sphinx = True
158 |
159 | # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True.
160 | #html_show_copyright = True
161 |
162 | # If true, an OpenSearch description file will be output, and all pages will
163 | # contain a <link> tag referring to it. The value of this option must be the
164 | # base URL from which the finished HTML is served.
165 | #html_use_opensearch = ''
166 |
167 | # This is the file name suffix for HTML files (e.g. ".xhtml").
168 | #html_file_suffix = None
169 |
170 | # Output file base name for HTML help builder.
171 | htmlhelp_basename = 'roaringbitmapdoc'
172 |
173 | # append __init__ docstring to docstring of class
174 | autoclass_content = 'both'
175 |
--------------------------------------------------------------------------------
/docs/index.rst:
--------------------------------------------------------------------------------
1 | RoaringBitmap API documentation
2 | ===============================
3 | .. automodule:: roaringbitmap
4 | :members:
5 | :undoc-members:
6 | :show-inheritance:
7 |
8 |
9 | Indices and tables
10 | ==================
11 |
12 | * :ref:`genindex`
13 | * :ref:`modindex`
14 | * :ref:`search`
15 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | cython>=0.21
2 | sphinx>=1.6.2
3 | pytest>=3.0.0
4 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
"""Generic setup.py for Cython code.

Two extra command line flags are recognized; both are removed from
``sys.argv`` before the remaining arguments are handed to ``setup()``:

``--debug``
	compile without optimization and with debug symbols; when cythonizing,
	additionally enable the ``wraparound`` and ``boundscheck`` directives.
``--with-mtune``
	pass ``-mtune=native`` instead of ``-march=native`` in optimized builds.
"""
import io
import os
import sys
try:
	# distutils is deprecated (PEP 632) and no longer part of the standard
	# library as of Python 3.12; prefer setuptools when it is available.
	from setuptools import setup, Extension
except ImportError:
	from distutils.core import setup
	from distutils.extension import Extension

PY2 = sys.version_info[0] == 2

# In releases, include C sources but not Cython sources; otherwise, use cython
# to figure out which files may need to be re-cythonized.
USE_CYTHON = os.path.exists('src/roaringbitmap.pyx')
if USE_CYTHON:
	try:
		from Cython.Build import cythonize
		from Cython.Distutils import build_ext
		from Cython.Compiler import Options
		Options.fast_fail = True
	except ImportError:
		raise RuntimeError('could not import Cython.')
	cmdclass = dict(build_ext=build_ext)
else:
	cmdclass = dict()

DEBUG = '--debug' in sys.argv
if DEBUG:
	sys.argv.remove('--debug')

MTUNE = '--with-mtune' in sys.argv
if MTUNE:
	sys.argv.remove('--with-mtune')

# io.open accepts an encoding on both Python 2 and 3; decode explicitly so
# that the result does not depend on the locale of the build machine.
with io.open('README.rst', encoding='utf-8') as inp:
	README = inp.read()

METADATA = dict(name='roaringbitmap',
		version='0.7.2',
		description='Roaring Bitmap',
		long_description=README,
		author='Andreas van Cranenburgh',
		author_email='A.W.van.Cranenburgh@rug.nl',
		url='http://roaringbitmap.readthedocs.io',
		license='GPL',
		platforms=['Many'],
		classifiers=[
				'Development Status :: 4 - Beta',
				'Intended Audience :: Science/Research',
				'License :: OSI Approved :: GNU General Public License (GPL)',
				'Operating System :: POSIX',
				'Programming Language :: Python :: 2.7',
				'Programming Language :: Python :: 3.3',
				'Programming Language :: Cython',
		],
)

# some of these directives increase performance,
# but at the cost of failing in mysterious ways.
directives = {
		'profile': False,
		'cdivision': True,
		'nonecheck': False,
		'wraparound': False,
		'boundscheck': False,
		'infer_types': None,
		'embedsignature': True,
		'warn.unused': True,
		'warn.unreachable': True,
		'warn.maybe_uninitialized': True,
		'warn.undeclared': False,
		'warn.unused_arg': False,
		'warn.unused_result': False,
}

if __name__ == '__main__':
	if sys.version_info[:2] < (2, 7) or (3, 0) <= sys.version_info[:2] < (3, 3):
		raise RuntimeError('Python version 2.7 or >= 3.3 required.')
	os.environ['GCC_COLORS'] = 'auto'
	# NB: could also use Cython compile-time definition,
	# but this would lead to different C output for Python 2/3.
	extra_compile_args = ['-DPY2=%d' % PY2]  # '-fopt-info-vec-missed',
	if sys.platform == 'win32':
		# https://docs.microsoft.com/en-us/cpp/intrinsics/bitscanforward-bitscanforward64?view=vs-2017
		extra_compile_args += ['-EHsc']
	else:
		extra_compile_args += [
				'-Wno-strict-prototypes', '-Wno-unreachable-code', '-Wextra']
	extra_link_args = []
	if DEBUG:
		# The debug flags apply to the C compiler and linker, so they are set
		# regardless of whether the C source is generated by Cython here or
		# taken from a release tarball.
		if sys.platform == 'win32':
			extra_compile_args += ['-DDEBUG', '-Od', '-Zi']
			extra_link_args += ['-DEBUG']
		else:
			extra_compile_args += ['-g', '-O0',
					# '-fsanitize=address', '-fsanitize=undefined',
					'-fno-omit-frame-pointer']
			extra_link_args += ['-g']
	elif sys.platform != 'win32':
		extra_compile_args += ['-O3', '-DNDEBUG']
		extra_compile_args += ['-mtune=native'] if MTUNE else ['-march=native']
		extra_link_args += ['-DNDEBUG']
	if USE_CYTHON:
		if DEBUG:
			# re-enable the safety checks that are disabled by default above
			directives.update(wraparound=True, boundscheck=True)
		ext_modules = cythonize(
				[Extension(
					'*',
					sources=['src/*.pyx'],
					extra_compile_args=extra_compile_args,
					extra_link_args=extra_link_args)],
				annotate=True,
				compiler_directives=directives,
				language_level=3)
	else:
		ext_modules = [Extension(
				'roaringbitmap',
				sources=['src/roaringbitmap.c'],
				extra_compile_args=extra_compile_args,
				extra_link_args=extra_link_args)]
	setup(
			cmdclass=cmdclass,
			ext_modules=ext_modules,
			**METADATA)
121 |
--------------------------------------------------------------------------------
/src/.ignore:
--------------------------------------------------------------------------------
1 | *.c
2 | *.html
3 |
--------------------------------------------------------------------------------
/src/_arrayops.h:
--------------------------------------------------------------------------------
1 | #include <stdint.h>
2 | #include <string.h>
3 |
4 | #if defined(__SSE4_2__)
5 | #if defined(_MSC_VER)
6 | #include <nmmintrin.h>
7 | #else
8 | #include <x86intrin.h>
9 | #endif
10 | #endif
11 |
/**
 * Scalar intersection of two uint16_t arrays by a two-pointer merge;
 * intended for arrays sorted in ascending order.
 *
 * Algorithm from CRoaring, array_util.c
 * cf. https://github.com/RoaringBitmap/CRoaring/blob/master/src/array_util.c
 *
 * A, lenA: first array and its number of elements.
 * B, lenB: second array and its number of elements.
 * out: receives the elements found in both arrays; at most
 *     min(lenA, lenB) elements are written.
 * Returns the number of elements written to out.
 */
int32_t intersect_general16(const uint16_t *A, const size_t lenA,
		const uint16_t *B, const size_t lenB, uint16_t *out) {
	const uint16_t *const lastA = A + lenA;
	const uint16_t *const lastB = B + lenB;
	uint16_t *cursor = out;

	while (A != lastA && B != lastB) {
		const uint16_t a = *A;
		const uint16_t b = *B;
		if (a < b) {
			A++;
		} else if (b < a) {
			B++;
		} else {
			/* common element: emit it and step past it in both inputs */
			*cursor++ = a;
			A++;
			B++;
		}
	}
	return (int32_t)(cursor - out);
}
44 |
45 |
#if defined(__SSE4_2__)

/**
 * Intersect two uint16_t arrays (intended for ascending sorted input) using
 * SSE4.2 string comparison instructions.
 *
 * Blocks of 8 elements of a and b are compared with _mm_cmpestrm; the
 * elements of the b block that also occur in the a block are packed to the
 * front with a byte shuffle and stored in result. Whatever remains after the
 * last complete block is handled by intersect_general16().
 *
 * NB: every vector step stores a full 16 bytes at result[count], so result
 * needs room for 8 elements beyond the size of the intersection.
 *
 * from https://highlyscalable.wordpress.com/2012/06/05/fast-intersection-sorted-lists-sse/
 *
 * Returns the number of elements written to result.
 */
static inline int32_t intersect_uint16(
		const uint16_t* __restrict a, size_t a_size,
		const uint16_t* __restrict b, size_t b_size,
		uint16_t* __restrict result) {
	size_t count = 0;
	/* shuffle_mask16[m] moves the 16-bit lanes selected by bit mask m to
	   the front of the vector; built lazily on first use. */
	static __m128i shuffle_mask16[256];
	static int built_shuffle_mask = 0;
	int i, j;
	size_t i_a = 0, i_b = 0;
	/* lengths rounded down to a multiple of the block size (8) */
	size_t st_a = (a_size / 8) * 8;
	size_t st_b = (b_size / 8) * 8;

	if (!built_shuffle_mask) {
		for (i = 0; i < 256; i++) {
			uint8_t mask[16];
			int counter = 0;
			/* 0xFF has the high bit set, which makes _mm_shuffle_epi8
			   write a zero byte for unused positions */
			memset(mask, 0xFF, sizeof(mask));
			for (j = 0; j < 16; j++) {
				if (i & (1 << j)) {
					mask[counter++] = 2 * j;
					mask[counter++] = 2 * j + 1;
				}
			}
			shuffle_mask16[i] = _mm_loadu_si128((const __m128i *)mask);
		}
		/* Mark the table as built only after it has been filled. This
		   function is called from nogil code; setting the flag first would
		   allow a concurrent caller to skip initialization and read entries
		   that have not been written yet. */
		built_shuffle_mask = 1;
	}

	while (i_a < st_a && i_b < st_b) {
		__m128i v_a = _mm_loadu_si128((__m128i *)&a[i_a]);
		__m128i v_b = _mm_loadu_si128((__m128i *)&b[i_b]);
		/* bit k of r is set iff element k of v_b occurs anywhere in v_a */
		__m128i v_cmp = _mm_cmpestrm(v_a, 8, v_b, 8,
				_SIDD_UWORD_OPS|_SIDD_CMP_EQUAL_ANY|_SIDD_BIT_MASK);
		int r = _mm_extract_epi32(v_cmp, 0);
		__m128i v_shuf = _mm_shuffle_epi8(v_b, shuffle_mask16[r]);
		uint16_t a_max, b_max;
		_mm_storeu_si128((__m128i *)&result[count], v_shuf);
		count += _mm_popcnt_u32(r);
		/* move on in the array whose block ends with the smaller value;
		   in both when the last elements are equal */
		a_max = _mm_extract_epi16(v_a, 7);
		b_max = _mm_extract_epi16(v_b, 7);
		i_a += (a_max <= b_max) * 8;
		i_b += (a_max >= b_max) * 8;
	}
	/* scalar intersection of the remaining elements */
	a += i_a;
	a_size -= i_a;
	b += i_b;
	b_size -= i_b;
	result += count;
	return (int32_t)count + intersect_general16(a, a_size, b, b_size, result);
}

#else /* __SSE4_2__ */

/**
 * Fallback without SSE4.2: delegates to intersect_general16().
 * Returns the number of elements written to C.
 */
int32_t intersect_uint16(const uint16_t *A, size_t s_a,
		const uint16_t *B, size_t s_b, uint16_t *C) {
	return intersect_general16(A, s_a, B, s_b, C);
}

#endif /* __SSE4_2__ */
107 |
--------------------------------------------------------------------------------
/src/arrayops.pxi:
--------------------------------------------------------------------------------
1 | # Set / search operations on integer arrays
2 |
cdef inline int binarysearch(uint16_t *data, int begin, int end,
		uint16_t elem) nogil:
	"""Binary search for ``elem`` in ``data[begin:end]`` (sorted ascending).

	:returns: the index of ``elem`` if it is found (a value ``>= 0``);
		otherwise a negative value ``i`` such that ``-i - 1`` is the index
		where ``elem`` should be inserted."""
	cdef int lo = begin, hi = end - 1
	cdef int mid
	cdef uint16_t probe
	# shortcut for the possibly common case that elem is larger than the
	# last element, i.e., would be appended.
	if end > 0 and data[end - 1] < elem:
		return -(end + 1)
	while lo <= hi:
		mid = (lo + hi) >> 1
		probe = data[mid]
		if probe == elem:
			return mid
		elif probe < elem:
			lo = mid + 1
		else:
			hi = mid - 1
	return -lo - 1
27 |
28 |
cdef inline int advance(uint16_t *data, int pos, int length,
		uint16_t minitem) nogil:
	"""Galloping search in a sorted array for the first element after
	index ``pos`` that is ``>= minitem``.

	:returns: ``pos + 1`` if that index is already past the end or holds a
		large enough element; otherwise the smallest index ``i > pos`` with
		``data[i] >= minitem``, or ``length`` if all remaining elements are
		smaller than ``minitem``."""
	cdef int lo = pos + 1
	cdef int span = 1
	cdef int hi, probe
	if lo >= length or data[lo] >= minitem:
		return lo
	# keep doubling the step while the element at lo + span is too small
	while lo + span < length and data[lo + span] < minitem:
		span <<= 1
	hi = lo + span
	if hi >= length:
		hi = length - 1
	if data[hi] == minitem:
		return hi
	elif data[hi] < minitem:
		return length
	# data[lo + span / 2] < minitem < data[hi]: bisect in between
	lo += span >> 1
	while lo + 1 != hi:
		probe = (lo + hi) >> 1
		if data[probe] < minitem:
			lo = probe
		elif data[probe] > minitem:
			hi = probe
		else:
			return probe
	return hi
53 |
54 |
cdef uint32_t intersect2by2(uint16_t *data1, uint16_t *data2,
		int length1, int length2, uint16_t *dest) nogil:
	"""Intersect two arrays, picking a strategy based on their lengths.

	- one array more than 64 times longer than the other: galloping search;
	- ``dest`` is NULL: only count the common elements;
	- ``dest`` is one of the inputs: scalar merge;
	- otherwise: ``intersect_uint16``.

	:returns: the number of elements in the intersection."""
	cdef bint inplace
	if length1 * 64 < length2:
		return intersectgalloping(data1, length1, data2, length2, dest)
	if length2 * 64 < length1:
		return intersectgalloping(data2, length2, data1, length1, dest)
	if dest is NULL:
		return intersectcard(data1, data2, length1, length2)
	inplace = data1 is dest or data2 is dest
	if inplace:
		return intersect_general16(data1, length1, data2, length2, dest)
	# NB: dest must have 8 elements extra capacity
	return intersect_uint16(data1, length1, data2, length2, dest)
68 |
69 |
cdef inline int intersectlocal2by2(uint16_t *data1, int length1,
		uint16_t *data2, int length2, uint16_t *dest) nogil:
	"""Intersect two sorted arrays with a linear two-pointer merge.

	:param dest: receives the elements that occur in both arrays.
	:returns: the number of elements written to ``dest``."""
	cdef int i = 0, j = 0, n = 0
	cdef uint16_t a, b
	while i < length1 and j < length2:
		a = data1[i]
		b = data2[j]
		if a < b:
			i += 1
		elif b < a:
			j += 1
		else:
			dest[n] = a
			n += 1
			i += 1
			j += 1
	return n
99 |
100 |
cdef inline int intersectcard(uint16_t *data1, uint16_t *data2,
		int length1, int length2) nogil:
	"""Count the elements that two sorted arrays have in common, using a
	linear two-pointer merge; nothing is stored.

	:returns: the number of common elements."""
	cdef int i = 0, j = 0, n = 0
	cdef uint16_t a, b
	while i < length1 and j < length2:
		a = data1[i]
		b = data2[j]
		if a < b:
			i += 1
		elif b < a:
			j += 1
		else:
			n += 1
			i += 1
			j += 1
	return n
129 |
130 |
cdef inline int intersectgalloping(
		uint16_t *small, int lensmall,
		uint16_t *large, int lenlarge,
		uint16_t *dest) nogil:
	"""Intersect a short sorted array with a long sorted array.

	Elements of ``small`` are visited one by one; the matching position in
	``large`` is located with ``advance`` (galloping search) instead of a
	linear scan.

	:param dest: receives the common elements; may be NULL, in which case
		only the number of common elements is computed.
	:returns: the number of elements in the intersection."""
	# k1 is the position in large, k2 the position in small
	cdef int k1 = 0, k2 = 0, pos = 0
	# An empty operand gives an empty intersection; the check on lenlarge
	# also prevents reading large[0] of an empty array below.
	if lensmall == 0 or lenlarge == 0:
		return 0
	if dest is NULL:  # cardinality only
		while True:
			if large[k1] < small[k2]:
				k1 = advance(large, k1, lenlarge, small[k2])
				if k1 == lenlarge:
					return pos
			if small[k2] < large[k1]:
				k2 += 1
				if k2 == lensmall:
					return pos
			else:  # small[k2] == large[k1]
				pos += 1
				k2 += 1
				if k2 == lensmall:
					return pos
				k1 = advance(large, k1, lenlarge, small[k2])
				if k1 == lenlarge:
					return pos
	else:  # store result
		while True:
			if large[k1] < small[k2]:
				k1 = advance(large, k1, lenlarge, small[k2])
				if k1 == lenlarge:
					return pos
			if small[k2] < large[k1]:
				k2 += 1
				if k2 == lensmall:
					return pos
			else:  # small[k2] == large[k1]
				dest[pos] = small[k2]
				pos += 1
				k2 += 1
				if k2 == lensmall:
					return pos
				k1 = advance(large, k1, lenlarge, small[k2])
				if k1 == lenlarge:
					return pos
175 |
176 |
cdef int union2by2(uint16_t *data1, uint16_t *data2,
		int length1, int length2, uint16_t *dest) nogil:
	"""Merge two sorted arrays into their union.

	:param dest: receives the merged elements, with elements present in
		both inputs stored once; may be NULL, in which case only the size
		of the union is computed.
	:returns: the number of elements in the union."""
	cdef int i = 0, j = 0, n = 0, rest
	cdef uint16_t a, b
	# trivial cases: one of the inputs is empty
	if length2 == 0:
		if dest is not NULL:
			memcpy(dest, data1, length1 * sizeof(uint16_t))
		return length1
	if length1 == 0:
		if dest is not NULL:
			memcpy(dest, data2, length2 * sizeof(uint16_t))
		return length2
	if length1 > length2:
		# swap the operands so that data1 is the shorter array
		return union2by2(data2, data1, length2, length1, dest)
	if dest is NULL:  # cardinality only
		while i < length1 and j < length2:
			a = data1[i]
			b = data2[j]
			n += 1
			if a < b:
				i += 1
			elif a > b:
				j += 1
			else:
				i += 1
				j += 1
	else:  # store result
		while i < length1 and j < length2:
			a = data1[i]
			b = data2[j]
			if a < b:
				dest[n] = a
				i += 1
			elif a > b:
				dest[n] = b
				j += 1
			else:
				dest[n] = a
				i += 1
				j += 1
			n += 1
	# at most one of the inputs has elements left over; append them
	if i < length1:
		rest = length1 - i
		if dest is not NULL:
			memcpy(&(dest[n]), &(data1[i]), rest * sizeof(uint16_t))
		n += rest
	elif j < length2:
		rest = length2 - j
		if dest is not NULL:
			memcpy(&(dest[n]), &(data2[j]), rest * sizeof(uint16_t))
		n += rest
	return n
240 |
241 |
cdef int union2by2bitmap(uint16_t *data1, uint16_t *data2,
		int length1, int length2, uint64_t *dest) nogil:
	"""Like union2by2, but write result to bitmap.

	``dest`` is cleared (``BITMAPSIZE`` bytes), after which a bit is set for
	every element of both arrays.

	:returns: ``length1`` plus the number of elements of ``data2`` whose bit
		was not yet set."""
	cdef int n
	cdef int card = length1
	memset(dest, 0, BITMAPSIZE)
	for n in range(length1):
		SETBIT(dest, data1[n])
	for n in range(length2):
		# only elements that are not already present add to the count
		if TESTBIT(dest, data2[n]) == 0:
			card += 1
			SETBIT(dest, data2[n])
	return card
254 |
255 |
cdef int difference(uint16_t *data1, uint16_t *data2,
		int length1, int length2, uint16_t *dest) nogil:
	"""Set difference of two sorted arrays: elements of ``data1`` that do
	not occur in ``data2``.

	:param dest: receives the result; may be NULL, in which case only the
		size of the result is computed.
	:returns: the number of elements in the difference."""
	cdef int i = 0, j = 0, n = 0
	cdef uint16_t a, b
	if length2 == 0:
		# nothing to remove: the result is data1 itself
		if dest is not NULL:
			memcpy(dest, data1, length1 * sizeof(uint16_t))
		return length1
	if length1 == 0:
		return 0
	if dest is NULL:  # cardinality only
		while i < length1 and j < length2:
			a = data1[i]
			b = data2[j]
			if a < b:
				n += 1
				i += 1
			elif a == b:
				i += 1
				j += 1
			else:
				j += 1
		# everything left in data1 is larger than all of data2
		return n + (length1 - i)
	# store result
	while i < length1 and j < length2:
		a = data1[i]
		b = data2[j]
		if a < b:
			dest[n] = a
			n += 1
			i += 1
		elif a == b:
			i += 1
			j += 1
		else:
			j += 1
	# copy whatever is left in data1, element by element
	while i < length1:
		dest[n] = data1[i]
		n += 1
		i += 1
	return n
310 |
311 |
cdef int xor2by2(uint16_t *data1, uint16_t *data2,
		int length1, int length2, uint16_t *dest) nogil:
	"""Symmetric difference of two sorted arrays: elements that occur in
	exactly one of them.

	:param dest: receives the result; may be NULL, in which case only the
		size of the result is computed.
	:returns: the number of elements in the symmetric difference."""
	cdef int i = 0, j = 0, n = 0
	cdef uint16_t a, b
	# trivial cases: one of the inputs is empty
	if length2 == 0:
		if dest is not NULL:
			memcpy(dest, data1, length1 * sizeof(uint16_t))
		return length1
	if length1 == 0:
		if dest is not NULL:
			memcpy(dest, data2, length2 * sizeof(uint16_t))
		return length2
	if dest is NULL:  # cardinality only
		while i < length1 and j < length2:
			a = data1[i]
			b = data2[j]
			if a == b:
				i += 1
				j += 1
			elif a < b:
				n += 1
				i += 1
			else:
				n += 1
				j += 1
		# the leftover of either input does not occur in the other one
		return n + (length1 - i) + (length2 - j)
	# store result
	while i < length1 and j < length2:
		a = data1[i]
		b = data2[j]
		if a == b:
			i += 1
			j += 1
		elif a < b:
			dest[n] = a
			n += 1
			i += 1
		else:
			dest[n] = b
			n += 1
			j += 1
	# at most one of the following two loops has anything to copy
	while j < length2:
		dest[n] = data2[j]
		n += 1
		j += 1
	while i < length1:
		dest[n] = data1[i]
		n += 1
		i += 1
	return n
378 |
379 |
cdef inline int selectinvertedbinarysearch(
		uint16_t *data, int begin, int end, uint16_t i) nogil:
	"""Find the i'th member (counting from 0) given a sorted array of
	non-members.

	Example: with non-members ``3 7 11`` the members are
	``0 1 2 4 5 6 8 9 10 12 13 ...``, so ``i=3`` gives 4 and ``i=9`` gives 12.

	:returns: the value of the i'th member."""
	cdef int lo = begin, hi = end - 1
	cdef int mid
	cdef uint16_t before
	# no non-members at all, or none small enough to shift the i'th member
	if end == 0 or data[0] > i:
		return i
	# data[k] - k is the number of members smaller than non-member data[k];
	# if that does not exceed i for the last one, all non-members precede
	# the result.
	if data[hi] - hi <= i:
		return i + hi + 1
	# find the first non-member that has more than i members before it
	while lo < hi:
		mid = (lo + hi) >> 1
		before = data[mid] - mid
		if before <= i:
			lo = mid + 1
		else:
			hi = mid
	# lo non-members precede the i'th member
	return i + lo
405 |
--------------------------------------------------------------------------------
/src/bitcount.h:
--------------------------------------------------------------------------------
1 | /* Fast cross-platform bit counting using intrinsic functions
2 | *
3 | * This code is based on https://github.com/Noctune/bitcount
4 | * Adapted for 64-bit integers instead of 32 bits.
5 | */
6 |
7 | #ifndef BITCOUNT_H_
8 | #define BITCOUNT_H_
9 |
10 | #ifdef __cplusplus
11 | extern "C" {
12 | #endif
13 |
14 | #if !defined(BITCOUNT_NO_AUTODETECT)
15 | #if defined(__GNUC__) || defined(__clang__)
16 | #define BITCOUNT_GCC
17 | // FIXME: disabled for debugging
18 | // #elif defined(_MSC_VER) && defined(_M_X64)
19 | // #define BITCOUNT_VS_X64
20 | // #elif defined(_MSC_VER) && defined(_M_IX86)
21 | // #define BITCOUNT_VS_X86
22 | #endif
23 | #endif
24 |
25 | #ifdef _MSC_VER
26 | #define BITCOUNT_INLINE static __inline
27 | #else
28 | #define BITCOUNT_INLINE static inline
29 | #endif
30 |
31 | #ifdef BITCOUNT_VS_X64
32 | #include <intrin.h>
33 | #pragma intrinsic(_BitScanForward64,_BitScanReverse64,__popcnt64)
34 | #endif
35 |
36 | #ifdef BITCOUNT_VS_X86
37 | #include <intrin.h>
38 | #pragma intrinsic(_BitScanForward,_BitScanReverse,__popcnt)
39 | #endif
40 |
41 | #include <limits.h>
42 | #include <stdint.h>
43 | #define BITCOUNT_BITS (sizeof(uint64_t) * CHAR_BIT)
44 |
45 | /* General implementations for systems without intrinsics */
46 | unsigned int bit_clz_general(uint64_t);
47 | unsigned int bit_ctz_general(uint64_t);
48 | unsigned int bit_popcount_general(uint64_t);
49 |
50 | /* Returns the number of leading 0-bits in x, starting at the most significant
51 | bit position. If v is 0, the result is undefined. */
52 | BITCOUNT_INLINE unsigned int bit_clz(uint64_t v) {
53 | #if defined(BITCOUNT_GCC)
54 | return __builtin_clzll(v);
55 | #elif defined(BITCOUNT_VS_X64)
56 | unsigned long result;
57 | _BitScanReverse64(&result, v);
58 | return BITCOUNT_BITS - 1 - result;
59 | #elif defined(BITCOUNT_VS_X86)
60 | unsigned long result;
61 | if ((uint32_t)(v >> 32) != 0) {
62 | _BitScanReverse(&result, (uint32_t)(v >> 32));
63 | } else {
64 | _BitScanReverse(&result, (uint32_t)v);
65 | result += 32;
66 | }
67 | return BITCOUNT_BITS - 1 - result;
68 | #else
69 | return bit_clz_general(v);
70 | #endif
71 | }
72 |
/* Returns the number of trailing 0-bits in v, starting at the least significant
   bit position. If v is 0, the result is undefined.
   The implementation is selected at compile time by the BITCOUNT_* macros. */
BITCOUNT_INLINE unsigned int bit_ctz(uint64_t v) {
#if defined(BITCOUNT_GCC)
	return __builtin_ctzll(v);
#elif defined(BITCOUNT_VS_X64)
	unsigned long result;
	/* result receives the index of the lowest set bit */
	_BitScanForward64(&result, v);
	return result;
#elif defined(BITCOUNT_VS_X86)
	unsigned long result;
	/* https://github.com/google/re2/commit/35febd432d9e6d8630845285c7f29eabd1df7beb */
	if ((uint32_t)v != 0) {
		/* lowest set bit is in the lower 32-bit word */
		_BitScanForward(&result, (uint32_t)v);
		return (unsigned int)result;
	} else {
		/* lower word is empty: scan the upper word and offset by 32 */
		_BitScanForward(&result, (uint32_t)(v >> 32));
		return (unsigned int)(result) + 32;
	}
#else
	return bit_ctz_general(v);
#endif
}
96 |
/* Returns the number of 1-bits in v.
   The implementation is selected at compile time by the BITCOUNT_* macros;
   without intrinsics it falls back to bit_popcount_general(). */
BITCOUNT_INLINE unsigned int bit_popcount(uint64_t v) {
#if defined(BITCOUNT_GCC)
	return __builtin_popcountll(v);
#elif defined(BITCOUNT_VS_X64)
	return __popcnt64(v);
#elif defined(BITCOUNT_VS_X86)
	/* 32-bit target: count the two halves separately and add them */
	return (__popcnt((uint32_t)v) + __popcnt((uint32_t)(v >> 32)));
#else
	return bit_popcount_general(v);
#endif
}
109 |
/* Portable count of leading zero bits; returns 64 when v is 0.
   Uses the halving scheme described at
   http://www.codeproject.com/Tips/784635/UInt-Bit-Operations :
   test the top 32, 16, 8, 4, 2 and 1 bits in turn and shift them out
   whenever they are all zero. */
unsigned int bit_clz_general(uint64_t v) {
	unsigned int width;
	unsigned int zeros = 0;

	if (v == 0)
		return 64;
	for (width = 32; width != 0; width >>= 1) {
		/* are the `width` most significant bits all clear? */
		if ((v >> (64 - width)) == 0) {
			zeros += width;
			v <<= width;
		}
	}
	return zeros;
}
141 |
/* Portable count of trailing zero bits; returns 64 when v is 0.
   Uses the halving scheme described at
   http://www.codeproject.com/Tips/784635/UInt-Bit-Operations :
   test the low 32, 16, 8, 4, 2 and 1 bits in turn and shift them out
   whenever they are all zero. */
unsigned int bit_ctz_general(uint64_t v) {
	unsigned int width;
	unsigned int zeros = 0;

	if (v == 0)
		return 64;
	for (width = 32; width != 0; width >>= 1) {
		/* are the `width` least significant bits all clear? */
		if ((v & (((uint64_t)1 << width) - 1)) == 0) {
			zeros += width;
			v >>= width;
		}
	}
	return zeros;
}
171 |
/* Portable population count (number of 1-bits in v).
   see http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel */
unsigned int bit_popcount_general(uint64_t v) {
	const uint64_t pairs = 0x5555555555555555ULL;
	const uint64_t quads = 0x3333333333333333ULL;
	const uint64_t bytes = 0x0F0F0F0F0F0F0F0FULL;
	const uint64_t ones = 0x0101010101010101ULL;
	uint64_t acc;

	/* per 2-bit field: number of set bits in that field */
	acc = v - ((v >> 1) & pairs);
	/* per 4-bit field */
	acc = (acc & quads) + ((acc >> 2) & quads);
	/* per byte */
	acc = (acc + (acc >> 4)) & bytes;
	/* the multiplication sums all byte counts into the top byte */
	return (unsigned int)((acc * ones) >> 56);
}
178 |
179 | #ifdef __cplusplus
180 | }
181 | #endif
182 |
183 | #endif /* BITCOUNT_H_ */
184 |
--------------------------------------------------------------------------------
/src/bitops.pxi:
--------------------------------------------------------------------------------
1 | """Operations on fixed-size bitvectors.
2 |
3 | All bitvector operands are assumed to have ``BLOCKSIZE`` elements (bits).
4 | """
5 |
6 | # Store result, return cardinality
cdef inline uint32_t bitsetintersect(uint64_t *dest,
		uint64_t *src1, uint64_t *src2) noexcept nogil:
	"""Store the intersection of ``src1`` and ``src2`` in ``dest``.

	Declared ``noexcept`` like the other ``nogil`` helpers in this file:
	nothing here can raise, so callers need no error check on the result.

	:returns: number of set bits in result."""
	cdef size_t n
	cdef uint64_t res1, res2
	cdef uint32_t result = 0
	# two 64-bit words per iteration
	for n in range(0, (BLOCKSIZE // BITSIZE), 2):
		res1 = src1[n] & src2[n]
		res2 = src1[n + 1] & src2[n + 1]
		dest[n] = res1
		dest[n + 1] = res2
		result += bit_popcount(res1)
		result += bit_popcount(res2)
	return result
23 |
24 |
cdef inline uint32_t bitsetunion(uint64_t *dest,
		uint64_t *src1, uint64_t *src2) noexcept nogil:
	"""Store the union of ``src1`` and ``src2`` in ``dest``.

	Declared ``noexcept`` like the other ``nogil`` helpers in this file:
	nothing here can raise, so callers need no error check on the result.

	:returns: number of set bits in result."""
	cdef size_t n
	cdef uint64_t res1, res2
	cdef uint32_t result = 0
	# two 64-bit words per iteration
	for n in range(0, (BLOCKSIZE // BITSIZE), 2):
		res1 = src1[n] | src2[n]
		res2 = src1[n + 1] | src2[n + 1]
		dest[n] = res1
		dest[n + 1] = res2
		result += bit_popcount(res1)
		result += bit_popcount(res2)
	return result
41 |
42 |
cdef inline uint32_t bitsetxor(uint64_t *dest,
		uint64_t *src1, uint64_t *src2) noexcept nogil:
	"""Store the symmetric difference (xor) of ``src1`` and ``src2`` in
	``dest``.

	Declared ``noexcept`` like the other ``nogil`` helpers in this file:
	nothing here can raise, so callers need no error check on the result.

	:returns: number of set bits in result."""
	cdef size_t n
	cdef uint64_t res1, res2
	cdef uint32_t result = 0
	# two 64-bit words per iteration
	for n in range(0, (BLOCKSIZE // BITSIZE), 2):
		res1 = src1[n] ^ src2[n]
		res2 = src1[n + 1] ^ src2[n + 1]
		dest[n] = res1
		dest[n + 1] = res2
		result += bit_popcount(res1)
		result += bit_popcount(res2)
	return result
59 |
60 |
cdef inline uint32_t bitsetsubtract(uint64_t *dest,
		uint64_t *src1, uint64_t *src2) noexcept nogil:
	"""Store ``src1 - src2`` in ``dest``, i.e., the bits that are set in
	``src1`` but not in ``src2``.

	Declared ``noexcept`` like the other ``nogil`` helpers in this file:
	nothing here can raise, so callers need no error check on the result.

	:returns: number of set bits in result."""
	cdef size_t n
	cdef uint64_t res1, res2
	cdef uint32_t result = 0
	# two 64-bit words per iteration
	for n in range(0, (BLOCKSIZE // BITSIZE), 2):
		res1 = src1[n] & ~src2[n]
		res2 = src1[n + 1] & ~src2[n + 1]
		dest[n] = res1
		dest[n + 1] = res2
		result += bit_popcount(res1)
		result += bit_popcount(res2)
	return result
77 |
78 |
79 | # Only store result, no cardinality
cdef inline void bitsetintersectnocard(uint64_t *dest,
		uint64_t *src1, uint64_t *src2) noexcept nogil:
	"""Write ``src1 & src2`` into ``dest``; the cardinality is not
	computed."""
	cdef size_t slot
	cdef uint64_t even, odd
	for slot in range(0, (BLOCKSIZE // BITSIZE), 2):
		# both words are computed before either one is stored
		even = src1[slot] & src2[slot]
		odd = src1[slot + 1] & src2[slot + 1]
		dest[slot] = even
		dest[slot + 1] = odd
90 |
91 |
cdef inline void bitsetunionnocard(uint64_t *dest,
		uint64_t *src1, uint64_t *src2) noexcept nogil:
	"""Write ``src1 | src2`` into ``dest``; the cardinality is not
	computed."""
	cdef size_t slot
	cdef uint64_t even, odd
	for slot in range(0, (BLOCKSIZE // BITSIZE), 2):
		# both words are computed before either one is stored
		even = src1[slot] | src2[slot]
		odd = src1[slot + 1] | src2[slot + 1]
		dest[slot] = even
		dest[slot + 1] = odd
102 |
103 |
cdef inline void bitsetxornocard(uint64_t *dest,
		uint64_t *src1, uint64_t *src2) noexcept nogil:
	"""Write ``src1 ^ src2`` into ``dest``; the cardinality is not
	computed."""
	cdef size_t slot
	cdef uint64_t even, odd
	for slot in range(0, (BLOCKSIZE // BITSIZE), 2):
		# both words are computed before either one is stored
		even = src1[slot] ^ src2[slot]
		odd = src1[slot + 1] ^ src2[slot + 1]
		dest[slot] = even
		dest[slot + 1] = odd
114 |
115 |
cdef inline void bitsetsubtractnocard(uint64_t *dest,
		uint64_t *src1, uint64_t *src2) noexcept nogil:
	"""Write ``src1 & ~src2`` into ``dest`` (the bits set in ``src1`` but not
	in ``src2``); the cardinality is not computed."""
	cdef size_t slot
	cdef uint64_t even, odd
	for slot in range(0, (BLOCKSIZE // BITSIZE), 2):
		# both words are computed before either one is stored
		even = src1[slot] & ~src2[slot]
		odd = src1[slot + 1] & ~src2[slot + 1]
		dest[slot] = even
		dest[slot + 1] = odd
126 |
127 |
128 | # Count cardinality only
cdef inline uint32_t bitsetintersectcount(
		uint64_t *src1, uint64_t *src2) noexcept nogil:
	"""Count the bits that ``src1`` and ``src2`` have in common, without
	storing the intersection.

	Both operands are assumed to have a fixed number of bits ``BLOCKSIZE``.

	:returns: cardinality of the intersection."""
	cdef size_t slot
	cdef uint32_t common = 0
	for slot in range((BLOCKSIZE // BITSIZE)):
		common += bit_popcount(src1[slot] & src2[slot])
	return common
140 |
141 |
142 | # Other operations
cdef inline int iteratesetbits(uint64_t *vec,
		uint64_t *cur, int *idx) noexcept nogil:
	"""Return the next set bit of a ``BLOCKSIZE``-bit vector, in ascending
	order.

	:param vec: the bit array.
	:param cur: pointer to variable to maintain state; should be
		initialized to the first element of ``vec``, i.e.,
		``cur = vec[idx]``.
	:param idx: pointer to variable to maintain state; should be
		initialized to 0.
	:returns: the index of a set bit, or -1 if there are no more set
		bits. The result of calling a stopped iterator is undefined.

	e.g., when only bits 0 and 4 of ``vec[0]`` are set::

		int idx = 0
		uint64_t cur = vec[idx]
		iteratesetbits(vec, &cur, &idx)  # returns 0
		iteratesetbits(vec, &cur, &idx)  # returns 4
		iteratesetbits(vec, &cur, &idx)  # returns -1
	"""
	cdef int bit
	# advance to the next word that still has unreported bits
	while cur[0] == 0:
		idx[0] += 1
		if idx[0] >= (BLOCKSIZE // BITSIZE):
			return -1
		cur[0] = vec[idx[0]]
	bit = bit_ctz(cur[0])  # position of the lowest 1-bit in this word
	cur[0] ^= 1ULL << bit  # mark it as reported
	return idx[0] * BITSIZE + bit
172 |
173 |
cdef inline int iterateunsetbits(uint64_t *vec,
		uint64_t *cur, int *idx) noexcept nogil:
	"""Like ``iteratesetbits``, but return indices of zero bits.

	The state word holds the complement of the current element of ``vec``,
	so that zero bits of ``vec`` show up as one bits in ``cur``.

	:param cur: should be initialized as: ``cur = ~vec[idx]``.
	:param idx: pointer to variable to maintain state; should be
		initialized to 0.
	:returns: the index of a zero bit, or -1 if there are none left.
	"""
	cdef int bit
	# advance to the next word that still has unreported zero bits
	while cur[0] == 0:
		idx[0] += 1
		if idx[0] >= (BLOCKSIZE // BITSIZE):
			return -1
		cur[0] = ~vec[idx[0]]
	bit = bit_ctz(cur[0])  # position of the lowest 0-bit of vec[idx]
	cur[0] ^= 1ULL << bit  # mark it as reported
	return idx[0] * BITSIZE + bit
191 |
192 |
cdef inline int reviteratesetbits(uint64_t *vec, uint64_t *cur,
		int *idx) noexcept nogil:
	"""Return the next set bit of a bit array, in descending order.

	:param vec: the bit array.
	:param cur: pointer to variable to maintain state; should be
		initialized to the last element of ``vec``, i.e.,
		``cur = vec[idx]``.
	:param idx: pointer to variable to maintain state; should be
		initialized to ``slots - 1``, where slots is the number of
		elements in ``vec``.
	:returns: the index of a set bit, or -1 if there are no more set
		bits. The result of calling a stopped iterator is undefined.

	e.g., when only bits 0 and 4 of ``vec[0]`` are set::

		int idx = slots - 1
		uint64_t cur = vec[idx]
		reviteratesetbits(vec, &cur, &idx)  # returns 4
		reviteratesetbits(vec, &cur, &idx)  # returns 0
		reviteratesetbits(vec, &cur, &idx)  # returns -1
	"""
	cdef int bit
	# step back to the previous word that still has unreported bits
	while cur[0] == 0:
		idx[0] -= 1
		if idx[0] < 0:
			return -1
		cur[0] = vec[idx[0]]
	bit = BITSIZE - bit_clz(cur[0]) - 1  # position of the highest 1-bit
	cur[0] &= ~(1ULL << bit)  # mark it as reported
	return idx[0] * BITSIZE + bit
223 |
224 |
cdef inline uint32_t extractsetbits(uint16_t *dest,
		uint64_t *src) noexcept nogil:
	"""Write the positions of all set bits of ``src`` to the preallocated
	array ``dest``, in ascending order.

	:returns: number of elements in result."""
	cdef size_t slot, count = 0, offset = 0
	cdef uint64_t word
	for slot in range((BLOCKSIZE // BITSIZE)):
		word = src[slot]
		while word:
			dest[count] = offset + bit_ctz(word)
			count += 1
			word &= word - 1  # clear the lowest set bit
		offset += 64
	return count
240 |
241 |
cdef inline uint32_t extractunsetbits(uint16_t *dest,
		uint64_t *src) noexcept nogil:
	"""Write the positions of all zero bits of ``src`` to the preallocated
	array ``dest``, in ascending order.

	:returns: number of elements in result."""
	cdef size_t slot, count = 0, offset = 0
	cdef uint64_t word
	for slot in range((BLOCKSIZE // BITSIZE)):
		# complement, so that zero bits of src become one bits
		word = ~src[slot]
		while word:
			dest[count] = offset + bit_ctz(word)
			count += 1
			word &= word - 1  # clear the lowest set bit
		offset += 64
	return count
257 |
258 |
cdef inline uint32_t extractintersection(
		uint16_t *dest, uint64_t *src1, uint64_t *src2) noexcept nogil:
	"""Write the positions of the bits set in both ``src1`` and ``src2`` to
	the preallocated array ``dest``, in ascending order.

	:returns: number of elements in result."""
	cdef size_t slot, count = 0, offset = 0
	cdef uint64_t word
	for slot in range((BLOCKSIZE // BITSIZE)):
		word = src1[slot] & src2[slot]
		while word:
			dest[count] = offset + bit_ctz(word)
			count += 1
			word &= word - 1  # clear the lowest set bit
		offset += 64
	return count
274 |
275 |
cdef inline bint bitsubset(uint64_t *vec1, uint64_t *vec2) noexcept nogil:
	"""Test whether vec1 is a subset of vec2.

	i.e., there is no bit that is set in vec1 but not in vec2."""
	cdef size_t slot
	for slot in range(0, (BLOCKSIZE // BITSIZE), 2):
		# a bit of vec1 that is missing from vec2 disproves the relation
		if (vec1[slot] & ~vec2[slot]) or (vec1[slot + 1] & ~vec2[slot + 1]):
			return False
	return True
286 |
287 |
cdef inline bint bitdisjoint(uint64_t *vec1, uint64_t *vec2) noexcept nogil:
	"""Test whether vec1 and vec2 have no set bit in common.

	i.e., len(vec1 & vec2) = 0."""
	cdef size_t slot
	cdef uint64_t shared
	for slot in range(0, (BLOCKSIZE // BITSIZE), 2):
		# combine the overlap of two consecutive words into one test
		shared = (vec1[slot] & vec2[slot]) | (vec1[slot + 1] & vec2[slot + 1])
		if shared != 0:
			return False
	return True
297 |
298 |
cdef inline int select64(uint64_t w, int i) except -1:
	"""Given a 64-bit int w, return the position of the ith 1-bit.

	The lower 32 bits are searched when they contain more than ``i`` set
	bits; otherwise the search continues in the upper 32 bits."""
	cdef uint64_t lower = w & 0xFFFFFFFFUL
	cdef int lowercount = bit_popcount(lower)
	if i < lowercount:
		return select32(lower, i)
	return select32((w >> 32), i - lowercount) + 32
307 |
308 |
cdef inline int select32(uint32_t w, int i) except -1:
	"""Given a 32-bit int w, return the position of the ith 1-bit.

	The lower 16 bits are searched when they contain more than ``i`` set
	bits; otherwise the search continues in the upper 16 bits."""
	cdef uint64_t lower = w & 0xFFFFUL
	cdef int lowercount = bit_popcount(lower)
	if i < lowercount:
		return select16(lower, i)
	return select16(w >> 16, i - lowercount) + 16
317 |
318 |
cdef inline int select16(uint16_t w, int i) except -1:
	"""Given a 16-bit int w, return the position of the ith 1-bit.

	:raises IndexError: if w has no more than ``i`` set bits."""
	cdef int seen = 0, pos
	# linear scan, counting the 1-bits encountered so far
	for pos in range(16):
		if (w >> pos) & 1:
			seen += 1
		if seen > i:
			return pos
	raise IndexError('select16: index %d out of range 0..%d.' % (
			i, bit_popcount(w)))
328 |
329 |
cdef inline void setbitcard(uint64_t *bitmap, uint16_t elem,
		uint32_t *cardinality) noexcept nogil:
	"""Set bit ``elem`` and increment ``cardinality`` if it was not yet set,
	without branching."""
	cdef uint32_t slot = BITSLOT(elem)
	cdef uint64_t before = bitmap[slot]
	cdef uint64_t after = before | BITMASK(elem)
	# before ^ after has only bit `elem` set if the word changed, and is 0
	# otherwise; shifting it down gives 1 or 0.
	cardinality[0] += (before ^ after) >> (elem % BITSIZE)
	bitmap[slot] = after
340 |
341 |
cdef inline void clearbitcard(uint64_t *bitmap, uint16_t elem,
		uint32_t *cardinality) noexcept nogil:
	"""Clear bit ``elem`` and decrement ``cardinality`` if it was set,
	without branching."""
	cdef uint32_t slot = BITSLOT(elem)
	cdef uint64_t before = bitmap[slot]
	cdef uint64_t after = before & ~BITMASK(elem)
	# before ^ after has only bit `elem` set if the word changed, and is 0
	# otherwise; shifting it down gives 1 or 0.
	cardinality[0] -= (before ^ after) >> (elem % BITSIZE)
	bitmap[slot] = after
352 |
353 |
cdef inline void togglebitcard(uint64_t *bitmap, uint16_t elem,
		uint32_t *cardinality) noexcept nogil:
	"""Flip bit ``elem`` and adjust ``cardinality`` by +1 or -1 accordingly,
	without branching."""
	cdef uint32_t slot = BITSLOT(elem)
	cdef uint64_t before = bitmap[slot]
	cdef uint64_t after = before ^ BITMASK(elem)
	# the bits above `elem` are the same in both words, so the shifted
	# values differ by exactly one; unsigned wraparound makes the addition
	# act as a decrement when the bit was cleared.
	cardinality[0] += (after >> (elem % BITSIZE)) - (before >> (elem % BITSIZE))
	bitmap[slot] = after
364 |
--------------------------------------------------------------------------------
/src/immutablerb.pxi:
--------------------------------------------------------------------------------
cdef class ImmutableRoaringBitmap(RoaringBitmap):
	"""A roaring bitmap that does not allow mutation operations.

	Any operation resulting in a new roaring bitmap is returned as a mutable
	RoaringBitmap (except for ``freeze()`` and the ``ImmutableRoaringBitmap``
	constructor). Stores data in one contiguous block of memory for efficient
	serialization.
	"""
	cdef readonly object _ob  # object to be kept for ptr to remain valid
	cdef char *ptr  # the data
	cdef size_t bufsize  # length in bytes of data
	cdef long _hash  # cached hash value, computed as needed

	def __init__(self, iterable=None):
		"""Return a new ImmutableRoaringBitmap with elements from
		``iterable``.

		The elements ``x`` of a RoaringBitmap must be ``0 <= x < 2 ** 32``.
		If ``iterable`` is not specified, a new empty bitmap is
		returned. Note that a sorted iterable will significantly speed up the
		construction.
		``iterable`` may be a generator, in which case the generator is
		consumed incrementally.
		``iterable`` may be a ``range`` (Python 3) or ``xrange`` (Python 2)
		object, which will be constructed efficiently."""
		cdef RoaringBitmap ob
		cdef ImmutableRoaringBitmap iob
		if isinstance(iterable, ImmutableRoaringBitmap):
			iob = iterable
			self.__setstate__(iob.__getstate__())
		else:
			# build a regular bitmap first and adopt its serialized form
			ob = ensurerb(iterable or ())
			self.__setstate__(ob.__getstate__())

	def __getstate__(self):
		"""Return a serialized representation (Python array) for pickling."""
		if self._ob is None:
			# no owning object is kept for this data; hand out a copy
			state = array.clone(chararray, self.bufsize, False)
			memcpy(state.data.as_chars, self.ptr, self.bufsize)
			return state
		return self._ob

	def __setstate__(self, array.array state):
		"""Initialize this object with a serialized representation.

		:param state: a char array with the pickle format of RoaringBitmap.
			Instead of copying this data, it will be used directly.
		"""
		self._ob = state
		# FIXME: 32 byte alignment depends on state.data being aligned.
		self._setptr(state.data.as_chars, len(state))

	cdef void _setptr(self, char *ptr, size_t size) noexcept nogil:
		"""Point this object at ``size`` bytes of serialized data at ``ptr``.

		The layout read here is: a ``uint32_t`` with the number of keys,
		followed by that many ``uint16_t`` keys, followed by the block
		data. The caller is responsible for keeping the memory alive."""
		self.ptr = ptr
		self.offset = <size_t>ptr
		self.bufsize = size
		self._hash = -1  # -1 marks the hash as not computed yet
		self.size = (<uint32_t *>ptr)[0]
		self.capacity = self.size
		self.keys = <uint16_t *>&(ptr[sizeof(uint32_t)])
		# pointers will be adjusted on the fly with self.offset
		self.data = <Block *>&(ptr[
				sizeof(uint32_t) + self.size * (sizeof(uint16_t))])

	def __hash__(self):
		"""Return a hash over all bytes of the data; the value is cached."""
		cdef size_t n
		if self._hash == -1:
			self._hash = 5381
			for n in range(self.bufsize):
				self._hash = ((self._hash << 5) + self._hash) + self.ptr[n]
				# i.e., self._hash = self._hash * 33 + self.ptr[n]
		return self._hash

	def __richcmp__(x, y, int op):
		"""Compare two bitmaps; (in)equality of two immutable bitmaps is
		decided on their raw bytes, everything else is delegated to
		``richcmp``."""
		cdef ImmutableRoaringBitmap iob1, iob2
		if (isinstance(x, ImmutableRoaringBitmap)
				and isinstance(y, ImmutableRoaringBitmap)):
			if op == 2:  # ==
				iob1, iob2 = x, y
				# cheap rejections first: length, then cached hash
				if (iob1.bufsize != iob2.bufsize
						or iob1.__hash__() != iob2.__hash__()):
					return False
				return memcmp(iob1.ptr, iob2.ptr, iob1.bufsize) == 0
			elif op == 3:  # !=
				return not (x == y)
		return richcmp(x, y, op)

	def __sizeof__(self):
		"""Return memory usage in bytes."""
		# bufsize equals len(self._ob) when an owning array is kept, and is
		# also valid when self._ob is None (data set through _setptr).
		return self.bufsize

	def freeze(self):
		"""Already immutable, return self."""
		return self

	def __repr__(self):
		return 'ImmutableRoaringBitmap(%s)' % str(self)

	def copy(self):
		"""Return a copy of this ImmutableRoaringBitmap, backed by its own
		copy of the data."""
		cdef ImmutableRoaringBitmap result = ImmutableRoaringBitmap.__new__(
				ImmutableRoaringBitmap)
		result.__setstate__(array.copy(self.__getstate__()))
		return result

	def __iand__(self, x):
		"""Unsupported method."""
		raise ValueError('ImmutableRoaringBitmap cannot be modified.')

	def __isub__(self, x):
		"""Unsupported method."""
		raise ValueError('ImmutableRoaringBitmap cannot be modified.')

	def __ior__(self, x):
		"""Unsupported method."""
		raise ValueError('ImmutableRoaringBitmap cannot be modified.')

	def __ixor__(self, x):
		"""Unsupported method."""
		raise ValueError('ImmutableRoaringBitmap cannot be modified.')

	def add(self, uint32_t elem):
		"""Unsupported method."""
		raise ValueError('ImmutableRoaringBitmap cannot be modified.')

	def discard(self, uint32_t elem):
		"""Unsupported method."""
		raise ValueError('ImmutableRoaringBitmap cannot be modified.')

	def remove(self, uint32_t elem):
		"""Unsupported method."""
		raise ValueError('ImmutableRoaringBitmap cannot be modified.')

	def pop(self):
		"""Unsupported method."""
		raise ValueError('ImmutableRoaringBitmap cannot be modified.')

	def update(self, *bitmaps):
		"""Unsupported method."""
		raise ValueError('ImmutableRoaringBitmap cannot be modified.')

	def intersection_update(self, *bitmaps):
		"""Unsupported method."""
		raise ValueError('ImmutableRoaringBitmap cannot be modified.')

	def difference_update(self, *other):
		"""Unsupported method."""
		raise ValueError('ImmutableRoaringBitmap cannot be modified.')

	def symmetric_difference_update(self, other):
		"""Unsupported method."""
		raise ValueError('ImmutableRoaringBitmap cannot be modified.')

	def flip_range(self, start, stop):
		"""Unsupported method."""
		raise ValueError('ImmutableRoaringBitmap cannot be modified.')

	def clear(self):
		"""Unsupported method."""
		raise ValueError('ImmutableRoaringBitmap cannot be modified.')
160 |
--------------------------------------------------------------------------------
/src/macros.h:
--------------------------------------------------------------------------------
1 | /* http://c-faq.com/misc/bitsets.html */
2 | /* Original, any word size:
3 | #define BITSIZE (8 * sizeof(uint64_t))
4 | #define BITSLOT(b) ((b) / BITSIZE)
5 | #define BITMASK(b) (1ULL << ((b) % BITSIZE))
6 | #define TESTBIT(a, b) ((a)[BITSLOT(b)] & BITMASK(b))
7 | NB: TESTBIT returns 0 or a value with bit b set
8 | Fix word size at 64 bits:
9 | */
/* Number of bits per word, and the mask that extracts a bit offset. */
#define BITSIZE (64)
#define BITSIZE1 (BITSIZE - 1)
/* Index of the word holding bit b, and the mask selecting b in that word. */
#define BITSLOT(b) ((b) >> 6)
#define BITMASK(b) (1ULL << ((b) & BITSIZE1))
#define SETBIT(a, b) ((a)[BITSLOT(b)] |= BITMASK(b))
#define TOGGLEBIT(a, b) ((a)[BITSLOT(b)] ^= BITMASK(b))
#define CLEARBIT(a, b) ((a)[BITSLOT(b)] &= ~BITMASK(b))
/* Number of words needed to hold nb bits. */
#define BITNSLOTS(nb) (((nb) + BITSIZE1) / BITSIZE)
/* b is parenthesized so that expression arguments such as `x | y` are masked
   as a whole. */
#define TESTBIT(a, b) (((a)[BITSLOT(b)] >> ((b) & BITSIZE1)) & 1)
/* NB: TESTBIT returns 0 or 1*/
20 |
/* `static` gives every translation unit its own copy. A plain C99 `inline`
   definition provides no external definition, so the link fails whenever the
   compiler decides not to inline a call. */
#ifdef _MSC_VER
#define ALIGNED_INLINE static __inline
#else
#define ALIGNED_INLINE static inline
#endif

/* https://stackoverflow.com/q/16376942 */
/* Allocate `size` bytes whose address is a multiple of `align`.
   Returns NULL on failure. The result must be released with aligned_free(). */
ALIGNED_INLINE void* aligned_malloc(size_t size, size_t align) {
	void *result;
#ifdef _MSC_VER
	result = _aligned_malloc(size, align);
#else
	/* posix_memalign reports failure through a nonzero return value and
	   leaves `result` unspecified in that case */
	if (posix_memalign(&result, align, size))
		result = NULL;
#endif
	return result;
}

/* Release memory obtained from aligned_malloc(). */
ALIGNED_INLINE void aligned_free(void *ptr) {
#ifdef _MSC_VER
	_aligned_free(ptr);
#else
	free(ptr);
#endif
}
46 |
--------------------------------------------------------------------------------
/src/multirb.pxi:
--------------------------------------------------------------------------------
1 | @cython.no_gc_clear
2 | cdef class MultiRoaringBitmap(object):
3 | """A sequence of immutable roaring bitmaps.
4 |
5 | Bitmaps are addressed with 32-bit indices.
6 | Everything is stored in a single contiguous block of memory.
7 |
8 | >>> mrb = MultiRoaringBitmap([
9 | ... RoaringBitmap({0, 1, 2}),
10 | ... RoaringBitmap({1, 6, 8}),
11 | ... RoaringBitmap({1, 7, 2})])
12 | >>> mrb.intersection(list(range(len(mrb))))
13 | RoaringBitmap({1})
14 | >>> mrb[0] | mrb[1]
15 | RoaringBitmap({0, 1, 2, 6, 8})
16 | """
17 | cdef uint32_t size # the number of roaring bitmaps
18 | cdef uint32_t *offsets # byte offset in ptr for each roaring bitmap
19 | cdef uint32_t *sizes # the size in bytes of each roaring bitmap
20 | cdef uint32_t *ptr # the data
21 | cdef object _ob # array or mmap which should be kept alive for ptr
22 | cdef object _file # optionally, file with mmap to be kept open
23 |
24 | def __init__(self, list init, filename=None):
25 | """
26 | :param init: a list of set-like objects (e.g., RoaringBitmaps).
27 | May contain ``None`` elements, which are treated as empty
28 | sets.
29 | :param filename: if given, result is stored in an mmap'd file.
30 | File is overwritten if it already exists."""
31 | cdef ImmutableRoaringBitmap irb
32 | cdef uint32_t alloc, offset
33 | cdef int alignment = 32
34 | cdef Py_buffer buffer
35 | cdef Py_ssize_t size = 0
36 | cdef char *ptr = NULL
37 | cdef int result
38 |
39 | if filename is not None:
40 | flags = os.O_CREAT | os.O_RDWR
41 | if sys.platform == 'win32':
42 | flags |= os.O_BINARY
43 | self._file = os.open(filename, flags)
44 |
45 | tmp = [None if a is None else ImmutableRoaringBitmap(a) for a in init]
46 | self.size = len(tmp)
47 | alloc = sizeof(uint32_t) + 2 * self.size * sizeof(uint32_t)
48 | extra = alignment - alloc % alignment
49 | alloc += extra
50 | offset = alloc
51 | for irb in tmp:
52 | if irb is not None:
53 | alloc += irb.bufsize
54 |
55 | if filename is not None:
56 | os.ftruncate(self._file, alloc)
57 | self._ob = mmap.mmap(
58 | -1 if filename is None else self._file,
59 | alloc, access=mmap.ACCESS_WRITE)
60 | result = getbufptr(self._ob, &ptr, &size, &buffer)
61 | self.ptr = ptr
62 | if result != 0:
63 | raise ValueError('could not get buffer from mmap.')
64 |
65 | self.ptr[0] = self.size
66 | self.offsets = &(self.ptr[1])
67 | self.sizes = &(self.ptr[1 + self.size])
68 | for n in range(1 + 2 * self.size,
69 | 1 + 2 * self.size + extra // sizeof(uint32_t)):
70 | self.ptr[n] = 0
71 | for n, irb in enumerate(tmp):
72 | # offset
73 | self.ptr[1 + n] = offset
74 | # size
75 | if irb is None or irb.size == 0:
76 | self.ptr[1 + n + self.size] = 0
77 | continue
78 | self.ptr[1 + n + self.size] = irb.bufsize
79 | # copy data
80 | memcpy(&((self.ptr)[offset]), irb.ptr, irb.bufsize)
81 | offset += irb.bufsize
82 | if filename is not None:
83 | self._ob.flush()
84 | releasebuf(&buffer)
85 |
86 | def __richcmp__(x, y, int op):
87 | if x is None or y is None:
88 | if op == 2 or op == 3:
89 | return op == 3
90 | raise TypeError
91 | if (not isinstance(x, (MultiRoaringBitmap, list))
92 | or not isinstance(y, (MultiRoaringBitmap, list))):
93 | raise TypeError
94 | if op == 2: # ==
95 | if len(x) != len(y):
96 | return False
97 | return all(a == b for a, b in zip(x, y))
98 | elif op == 3: # !=
99 | if len(x) != len(y):
100 | return True
101 | return not all(a == b for a, b in zip(x, y))
102 | return NotImplemented
103 |
	def close(self):
		"""Close opened file, if any.

		Closes the backing object when it has a ``close`` method, and then
		the file descriptor."""
		if hasattr(self._ob, 'close'):
			self._ob.close()
			self._ob = None
		if self._file is not None:
			os.close(self._file)
			self._file = None
112 |
	def __enter__(self):
		"""Context manager entry; returns this object."""
		return self
115 |
	def __exit__(self, _type, _value, _traceback):
		"""Context manager exit; calls ``close()``. Exceptions raised inside
		the ``with`` block are not suppressed."""
		self.close()
118 |
	def __getstate__(self):
		"""Return a serialized representation (a ``bytes`` copy of the
		backing object) for pickling."""
		return bytes(self._ob)
122 |
123 | def __setstate__(self, state):
124 | """Initialize this object with a serialized representation."""
125 | self._ob = state
126 | self.ptr = state
127 | self.size = self.ptr[0]
128 | self.offsets = &(self.ptr[1])
129 | self.sizes = &(self.ptr[1 + self.size])
130 |
131 | @classmethod
132 | def fromfile(cls, filename):
133 | """Load a MultiRoaringBitmap from a file using mmap."""
134 | cdef MultiRoaringBitmap ob
135 | cdef Py_buffer buffer
136 | cdef char *ptr = NULL
137 | cdef Py_ssize_t size = 0
138 | ob = MultiRoaringBitmap.__new__(MultiRoaringBitmap)
139 | flags = os.O_RDONLY
140 | if sys.platform == 'win32':
141 | flags |= os.O_BINARY
142 | ob._file = os.open(filename, flags)
143 | ob._ob = mmap.mmap(ob._file, 0, access=mmap.ACCESS_READ)
144 | result = getbufptr(ob._ob, &ptr, &size, &buffer)
145 | ob.ptr = ptr
146 | if result != 0:
147 | raise ValueError('could not get buffer from mmap.')
148 | ob.size = ob.ptr[0]
149 | ob.offsets = &(ob.ptr[1])
150 | ob.sizes = &(ob.ptr[1 + ob.size])
151 | # rest is data
152 | releasebuf(&buffer)
153 | return ob
154 |
155 | @classmethod
156 | def frombuffer(cls, data, int offset):
157 | """Load a MultiRoaringBitmap from a Python object using the buffer
158 | interface (e.g. bytes or mmap object), starting at ``offset``."""
159 | cdef MultiRoaringBitmap ob = MultiRoaringBitmap.__new__(
160 | MultiRoaringBitmap)
161 | cdef char *ptr = NULL
162 | cdef Py_buffer buffer
163 | cdef Py_ssize_t size = 0
164 | result = getbufptr(data, &ptr, &size, &buffer)
165 | ob.ptr = &ptr[offset]
166 | if result != 0:
167 | raise ValueError('could not get buffer from mmap.')
168 | ob.size = ob.ptr[0]
169 | ob.offsets = &(ob.ptr[1])
170 | ob.sizes = &(ob.ptr[1 + ob.size])
171 | # rest is data
172 | releasebuf(&buffer)
173 | return ob
174 |
175 | def bufsize(self):
176 | """Return size in number of bytes."""
177 | return self.offsets[self.size - 1] + self.sizes[self.size - 1]
178 |
	def __len__(self):
		"""Return the number of roaring bitmaps in this collection."""
		return self.size
181 |
182 | def __getitem__(self, i):
183 | """Like self.get(), but handle negative indices, slices and raise
184 | IndexError for invalid index."""
185 | if isinstance(i, slice):
186 | return [self[n] for n in range(*i.indices(self.size))]
187 | elif not isinstance(i, (int, long)):
188 | raise TypeError('Expected integer index or slice object.')
189 | elif i < 0:
190 | i += self.size
191 | result = self.get(i)
192 | if result is None:
193 | raise IndexError
194 | return result
195 |
196 | cpdef get(self, long i):
197 | """Return bitmap `i` as an ``ImmutableRoaringBitmap``, or ``None`` if
198 | `i` is an invalid index."""
199 | cdef ImmutableRoaringBitmap ob1
200 | if i < 0 or i >= self.size:
201 | return None
202 | if self.sizes[i] == 0:
203 | return EMPTYIRB
204 | ob1 = ImmutableRoaringBitmap.__new__(ImmutableRoaringBitmap)
205 | ob1._setptr(&(self.ptr)[self.offsets[i]], self.sizes[i])
206 | return ob1
207 |
208 | def getsize(self, long i):
209 | return self.sizes[i]
210 |
211 | def intersection(self, list indices,
212 | uint32_t start=0, uint32_t stop=0xffffffffUL):
213 | """Compute intersection of given a list of indices of roaring bitmaps
214 | in this collection.
215 |
216 | :param start: optional start index.
217 | :param stop: optional end index;
218 | if given, only return elements ``n`` s.t. ``start <= n < stop``.
219 | :returns: the intersection as a mutable RoaringBitmap.
220 | Returns ``None`` when an invalid index is encountered or an empty
221 | result is obtained.
222 | """
223 | cdef ImmutableRoaringBitmap ob1, ob2
224 | cdef RoaringBitmap result
225 | cdef char *ptr = self.ptr
226 | cdef long i, j, numindices = len(indices)
227 | if numindices == 0:
228 | return None
229 | for i in range(numindices):
230 | j = indices[i]
231 | if j < 0 or j >= self.size or self.sizes[j] == 0:
232 | return None
233 | ob1 = ImmutableRoaringBitmap.__new__(ImmutableRoaringBitmap)
234 | if numindices == 1:
235 | i = indices[0]
236 | ob1._setptr(&(ptr[self.offsets[i]]), self.sizes[i])
237 | if start or stop < 0xffffffffUL:
238 | return rb_clamp(ob1, start, stop)
239 | return ob1
240 | indices.sort(key=self.getsize)
241 | ob2 = ImmutableRoaringBitmap.__new__(ImmutableRoaringBitmap)
242 | # TODO with nogil?:
243 | i, j = indices[0], indices[1]
244 | ob1._setptr(&(ptr[self.offsets[i]]), self.sizes[i])
245 | ob2._setptr(&(ptr[self.offsets[j]]), self.sizes[j])
246 | if start or stop < 0xffffffffUL:
247 | result = rb_clamp(ob1, start, stop)
248 | rb_iand(result, ob2)
249 | else:
250 | result = rb_and(ob1, ob2)
251 | for i in range(2, numindices):
252 | j = indices[i]
253 | # swap out contents of ImmutableRoaringBitmap object
254 | ob1._setptr(&(ptr[self.offsets[j]]), self.sizes[j])
255 | rb_iand(result, ob1)
256 | if result.size == 0:
257 | return None
258 | return result
259 |
def andor_len_pairwise(self, array.array indices1, array.array indices2,
        array.array resultand, array.array resultor):
    """Pairwise intersection/union cardinality for pairs of roaring bitmaps
    in this collection given by ``zip(indices1, indices2)``.

    :param indices1: input array
    :param indices2: input array
    :param resultand: result array
    :param resultor: result array
    :raises ValueError: if ``indices2`` or a result array is shorter
        than ``indices1``.

    All parameters should be Python arrays of type 'L', all preallocated
    with the same length; result arrays need not be initialized.
    The indices themselves are not range checked; ensure that all
    indices ``i`` satisfy ``0 <= i < len(self)``.

    >>> result1 = array.array('L', [0] * 3)
    >>> result2 = array.array('L', [0] * 3)
    >>> mrb.andor_len_pairwise(array.array('L', [0, 6, 8]),
    ...		array.array('L', [1, 7, 6]), result1, result2)
    >>> result1
    array.array('L', [3, 2, 56])
    >>> result2
    array.array('L', [6, 4, 123])
    """
    cdef char *ptr = self.ptr
    cdef int i, j, n, lenindices1 = len(indices1)
    cdef ImmutableRoaringBitmap ob1, ob2
    # The loop below indexes all four arrays with the same n without the
    # GIL; reject arrays that are too short before touching raw memory.
    if (len(indices2) < lenindices1 or len(resultand) < lenindices1
            or len(resultor) < lenindices1):
        raise ValueError('all arrays must be at least as long as indices1')
    ob1 = ImmutableRoaringBitmap.__new__(ImmutableRoaringBitmap)
    ob2 = ImmutableRoaringBitmap.__new__(ImmutableRoaringBitmap)
    with nogil:
        for n in range(lenindices1):
            i, j = indices1.data.as_ulongs[n], indices2.data.as_ulongs[n]
            ob1._setptr(&(ptr[self.offsets[i]]), self.sizes[i])
            ob2._setptr(&(ptr[self.offsets[j]]), self.sizes[j])
            if self.sizes[i] and self.sizes[j]:
                rb_andor_len(ob1, ob2, &(resultand.data.as_ulongs[n]),
                        &(resultor.data.as_ulongs[n]))
            else:
                # a pair with a zero-size member yields 0 for both results
                resultand.data.as_ulongs[n] = 0
                resultor.data.as_ulongs[n] = 0
298 |
def jaccard_dist(self, array.array indices1, array.array indices2):
    """Compute the Jaccard distances for pairs of roaring bitmaps
    in this collection given by ``zip(indices1, indices2)``.

    >>> mrb.jaccard_dist(array.array('L', [0, 6, 8]),
    ...		array.array('L', [1, 7, 6]))
    array.array('d', [0.3, 0.2, 0.56])

    :param indices1: input array
    :param indices2: input array
    :returns: a Python array of floats with the jaccard distances;
        a pair with a zero-size member gets distance 1.
    :raises ValueError: if ``indices2`` is shorter than ``indices1``.

    ``indices1`` and ``indices2`` should be arrays of unsigned long
    integers, created with ``array.array('L')``. Ensure that all indices
    `i` are in the range ``0 <= i < len(self)``.
    """
    cdef ImmutableRoaringBitmap ob1, ob2
    cdef array.array result = array.clone(dblarray, len(indices1), False)
    cdef char *ptr = self.ptr
    cdef int i, j, n, lenindices1 = len(indices1)
    # both index arrays are read with the same n inside the nogil loop
    if len(indices2) < lenindices1:
        raise ValueError('indices2 must be at least as long as indices1')
    ob1 = ImmutableRoaringBitmap.__new__(ImmutableRoaringBitmap)
    ob2 = ImmutableRoaringBitmap.__new__(ImmutableRoaringBitmap)
    with nogil:
        for n in range(lenindices1):
            i, j = indices1.data.as_ulongs[n], indices2.data.as_ulongs[n]
            ob1._setptr(&(ptr[self.offsets[i]]), self.sizes[i])
            ob2._setptr(&(ptr[self.offsets[j]]), self.sizes[j])
            result.data.as_doubles[n] = (rb_jaccard_dist(ob1, ob2)
                    if self.sizes[i] and self.sizes[j] else 1)
    return result
329 |
def jaccard_dist_single(self, RoaringBitmap rb):
    """Compute the Jaccard distances for `rb` with all roaring bitmaps
    in this collection.

    >>> mrb.jaccard_dist_single(RoaringBitmap([1, 6, 19, 22]))
    array.array('d', [0.3, 0.2, 0.56])

    :param rb: a roaring bitmap.
    :returns: a Python array of floats with the jaccard distances with
        length equal to `len(self)`; members with a zero size get
        distance 1.
    """
    cdef ImmutableRoaringBitmap ob1, ob2
    cdef array.array result = array.clone(dblarray, len(self), False)
    cdef char *ptr = self.ptr
    cdef uint32_t n
    ob1 = ImmutableRoaringBitmap.__new__(ImmutableRoaringBitmap)
    ob2 = ImmutableRoaringBitmap(rb)
    with nogil:
        for n in range(self.size):
            if self.sizes[n] == 0:
                # Treat zero-size members like the other methods of this
                # class do, without pointing ob1 at them.
                result.data.as_doubles[n] = 1
                continue
            ob1._setptr(&(ptr[self.offsets[n]]), self.sizes[n])
            result.data.as_doubles[n] = rb_jaccard_dist(ob1, ob2)
    return result
352 |
--------------------------------------------------------------------------------
/src/rbbinaryops.pxi:
--------------------------------------------------------------------------------
cdef inline richcmp(x, y, int op):
    """Rich comparison between RoaringBitmaps and sets.

    Considers comparisons to RoaringBitmaps and sets;
    other types raise a TypeError. Comparison with ``None`` is only
    defined for ``==`` (False) and ``!=`` (True).

    :param op: comparison code as handled below:
        0 ``<``, 1 ``<=``, 2 ``==``, 3 ``!=``, 4 ``>``, 5 ``>=``."""
    cdef RoaringBitmap ob1, ob2
    cdef size_t n
    if x is None or y is None:
        if op == 2 or op == 3:
            return op == 3
        raise TypeError
    if (not isinstance(x, (RoaringBitmap, set))
            or not isinstance(y, (RoaringBitmap, set))):
        raise TypeError
    if op == 2:  # ==
        ob1, ob2 = ensurerb(x), ensurerb(y)
        # cheap checks first: number of blocks, keys, cardinalities
        if ob1.size != ob2.size:
            return False
        if memcmp(ob1.keys, ob2.keys, ob1.size * sizeof(uint16_t)) != 0:
            return False
        for n in range(ob1.size):
            if ob1.data[n].cardinality != ob2.data[n].cardinality:
                return False
        for n in range(ob1.size):
            # The address of a block's buffer is the bitmap's base offset
            # plus the buffer offset; memcmp takes pointers, so the
            # computed integer address is cast explicitly.
            if memcmp(
                    <void *>(ob1.offset + ob1.data[n].buf.offset),
                    <void *>(ob2.offset + ob2.data[n].buf.offset),
                    getsize(&(ob1.data[n])) * sizeof(uint16_t)) != 0:
                return False
        return True
    elif op == 3:  # !=
        return not richcmp(x, y, 2)
    elif op == 1:  # <=
        return ensurerb(x).issubset(y)
    elif op == 5:  # >=
        return ensurerb(x).issuperset(y)
    elif op == 0:  # <
        return len(x) < len(y) and ensurerb(x).issubset(y)
    elif op == 4:  # >
        return len(x) > len(y) and ensurerb(x).issuperset(y)
    return NotImplemented
40 |
41 |
cdef inline RoaringBitmap rb_iand(RoaringBitmap ob1, RoaringBitmap ob2):
    """In-place intersection: keep in ``ob1`` only elements also in ``ob2``.

    Blocks of ``ob1`` that drop out of the result are freed here; the
    surviving blocks are moved to freshly allocated key/data arrays.

    :returns: ``ob1``."""
    cdef uint32_t pos1 = 0, pos2 = 0, res = 0
    cdef uint16_t *keys = NULL
    cdef Block *data = NULL
    cdef Block b2
    if ob2.size == 0:
        # intersection with the empty set: release every block
        for pos1 in range(ob1.size):
            aligned_free(ob1.data[pos1].buf.ptr)
        ob1._resize(0)
    elif ob1.size > 0:
        ob1.capacity = min(ob1.size, ob2.size)
        ob1._tmpalloc(ob1.capacity, &keys, &data)
        while True:
            if ob1.keys[pos1] < ob2.keys[pos2]:
                # key only in ob1: block is not part of the result
                aligned_free(ob1.data[pos1].buf.ptr)
                pos1 += 1
                if pos1 == ob1.size:
                    break
            elif ob1.keys[pos1] > ob2.keys[pos2]:
                pos2 += 1
                if pos2 == ob2.size:
                    break
            else:  # ob1.keys[pos1] == ob2.keys[pos2]:
                block_iand(&(ob1.data[pos1]), ob2._getblk(pos2, &b2))
                if ob1.data[pos1].cardinality > 0:
                    keys[res] = ob1.keys[pos1]
                    data[res] = ob1.data[pos1]
                    res += 1
                else:
                    aligned_free(ob1.data[pos1].buf.ptr)
                pos1 += 1
                pos2 += 1
                if pos1 == ob1.size or pos2 == ob2.size:
                    break
        # When ob2 runs out first, the blocks of ob1 from pos1 onwards
        # have neither been moved nor freed; they are not part of the
        # result, so release them before the old arrays are replaced.
        while pos1 < ob1.size:
            aligned_free(ob1.data[pos1].buf.ptr)
            pos1 += 1
        ob1._replacearrays(keys, data, res)
    return ob1
78 |
79 |
cdef inline RoaringBitmap rb_isub(RoaringBitmap ob1, RoaringBitmap ob2):
    """In-place difference: remove from ``ob1`` all elements of ``ob2``.

    Nothing is done when either operand has no blocks. Otherwise the
    surviving blocks of ``ob1`` are moved to new key/data arrays and
    blocks that become empty are freed.

    :returns: ``ob1``."""
    cdef uint32_t idx1 = 0, idx2 = 0, kept = 0
    cdef uint16_t *newkeys = NULL
    cdef Block *newdata = NULL
    cdef Block tmp
    if ob1.size == 0 or ob2.size == 0:
        return ob1
    ob1.capacity = ob1.size
    ob1._tmpalloc(ob1.capacity, &newkeys, &newdata)
    # merge over the sorted keys of both operands
    while idx1 < ob1.size and idx2 < ob2.size:
        if ob1.keys[idx1] < ob2.keys[idx2]:
            # key only in ob1: block is kept unchanged
            newkeys[kept] = ob1.keys[idx1]
            newdata[kept] = ob1.data[idx1]
            kept += 1
            idx1 += 1
        elif ob1.keys[idx1] > ob2.keys[idx2]:
            idx2 += 1
        else:
            # key in both: subtract block contents
            block_isub(&(ob1.data[idx1]), ob2._getblk(idx2, &tmp))
            if ob1.data[idx1].cardinality > 0:
                newkeys[kept] = ob1.keys[idx1]
                newdata[kept] = ob1.data[idx1]
                kept += 1
            else:
                aligned_free(ob1.data[idx1].buf.ptr)
            idx1 += 1
            idx2 += 1
    # whatever remains of ob1 after ob2 is exhausted is kept as is
    while idx1 < ob1.size:
        newkeys[kept] = ob1.keys[idx1]
        newdata[kept] = ob1.data[idx1]
        kept += 1
        idx1 += 1
    ob1._replacearrays(newkeys, newdata, kept)
    return ob1
119 |
120 |
cdef inline RoaringBitmap rb_ior(RoaringBitmap ob1, RoaringBitmap ob2):
    """In-place union: add all elements of ``ob2`` to ``ob1``.

    Blocks of ``ob1`` are moved to new key/data arrays; blocks that only
    occur in ``ob2`` are copied.

    :returns: ``ob1``."""
    cdef uint32_t idx1 = 0, idx2 = 0, count = 0
    cdef uint16_t *newkeys = NULL
    cdef Block *newdata = NULL
    cdef Block tmp
    if ob2.size == 0:
        return ob1
    ob1.capacity = ob1.size + ob2.size
    ob1._tmpalloc(ob1.capacity, &newkeys, &newdata)
    # merge over the sorted keys of both operands
    while idx1 < ob1.size and idx2 < ob2.size:
        if ob1.keys[idx1] < ob2.keys[idx2]:
            newkeys[count] = ob1.keys[idx1]
            newdata[count] = ob1.data[idx1]
            count += 1
            idx1 += 1
        elif ob1.keys[idx1] > ob2.keys[idx2]:
            newkeys[count] = ob2.keys[idx2]
            block_copy(&(newdata[count]), ob2._getblk(idx2, &tmp))
            count += 1
            idx2 += 1
        else:
            block_ior(&(ob1.data[idx1]), ob2._getblk(idx2, &tmp))
            newkeys[count] = ob1.keys[idx1]
            newdata[count] = ob1.data[idx1]
            count += 1
            idx1 += 1
            idx2 += 1
    # at this point at most one operand has blocks left
    while idx2 < ob2.size:
        newkeys[count] = ob2.keys[idx2]
        block_copy(&(newdata[count]), ob2._getblk(idx2, &tmp))
        count += 1
        idx2 += 1
    while idx1 < ob1.size:
        newkeys[count] = ob1.keys[idx1]
        newdata[count] = ob1.data[idx1]
        count += 1
        idx1 += 1
    ob1._replacearrays(newkeys, newdata, count)
    return ob1
167 |
168 |
cdef inline RoaringBitmap rb_ixor(RoaringBitmap ob1, RoaringBitmap ob2):
    """In-place symmetric difference of ``ob1`` with ``ob2``.

    The keys of both operands are merged in order into temporary
    key/data arrays, which then replace the arrays of ``ob1``.

    :returns: ``ob1``."""
    cdef uint32_t pos1 = 0, pos2 = 0, res = 0
    cdef uint16_t *keys = NULL
    cdef Block *data = NULL
    cdef Block b2
    # the result has at most one block per key of either operand
    ob1.capacity = ob1.size + ob2.size
    ob1._tmpalloc(ob1.capacity, &keys, &data)
    if pos1 < ob1.size and pos2 < ob2.size:
        while True:
            if ob1.keys[pos1] < ob2.keys[pos2]:
                # key only in ob1: move block as is
                keys[res] = ob1.keys[pos1]
                data[res] = ob1.data[pos1]
                res += 1
                pos1 += 1
                if pos1 == ob1.size:
                    break
            elif ob1.keys[pos1] > ob2.keys[pos2]:
                # key only in ob2: copy block
                keys[res] = ob2.keys[pos2]
                block_copy(&(data[res]), ob2._getblk(pos2, &b2))
                res += 1
                pos2 += 1
                if pos2 == ob2.size:
                    break
            else:  # ob1.keys[pos1] == ob2.keys[pos2]:
                block_ixor(&(ob1.data[pos1]), ob2._getblk(pos2, &b2))
                if ob1.data[pos1].cardinality > 0:
                    keys[res] = ob1.keys[pos1]
                    data[res] = ob1.data[pos1]
                    res += 1
                else:
                    # block became empty: drop it from the result
                    aligned_free(ob1.data[pos1].buf.ptr)
                pos1 += 1
                pos2 += 1
                if pos1 == ob1.size or pos2 == ob2.size:
                    break
    # append the remaining blocks of whichever operand is not exhausted
    if pos1 == ob1.size:
        for pos2 in range(pos2, ob2.size):
            keys[res] = ob2.keys[pos2]
            block_copy(&(data[res]), ob2._getblk(pos2, &b2))
            res += 1
    elif pos2 == ob2.size:
        for pos1 in range(pos1, ob1.size):
            keys[res] = ob1.keys[pos1]
            data[res] = ob1.data[pos1]
            res += 1
    ob1._replacearrays(keys, data, res)
    return ob1
216 |
217 |
cdef inline RoaringBitmap rb_and(RoaringBitmap ob1, RoaringBitmap ob2):
    """Return a new RoaringBitmap with the intersection of ``ob1`` and
    ``ob2``; neither operand is modified."""
    cdef RoaringBitmap result = RoaringBitmap()
    cdef uint32_t pos1 = 0, pos2 = 0, capacity
    cdef Block b1, b2
    if pos1 < ob1.size and pos2 < ob2.size:
        capacity = min(ob1.size, ob2.size)
        # initialize to zero so that unallocated blocks can be detected
        result._initarray(capacity)
        while True:
            if ob1.keys[pos1] < ob2.keys[pos2]:
                pos1 += 1
                if pos1 == ob1.size:
                    break
            elif ob1.keys[pos1] > ob2.keys[pos2]:
                pos2 += 1
                if pos2 == ob2.size:
                    break
            else:  # ob1.keys[pos1] == ob2.keys[pos2]:
                # The slot at result.size is only claimed when the block
                # intersection is non-empty; otherwise it is reused for
                # the next common key.
                block_and(&(result.data[result.size]),
                        ob1._getblk(pos1, &b1), ob2._getblk(pos2, &b2))
                if result.data[result.size].cardinality:
                    result.keys[result.size] = ob1.keys[pos1]
                    result.size += 1
                pos1 += 1
                pos2 += 1
                if pos1 == ob1.size or pos2 == ob2.size:
                    break
        # An unclaimed slot may still hold a buffer from an empty block
        # intersection. Only look at it when it lies within the
        # requested capacity; when every slot was claimed there is
        # nothing left over and the index would be past the array.
        if result.size < capacity:
            aligned_free(result.data[result.size].buf.ptr)
        result._resize(result.size)
    return result
247 |
248 |
cdef inline RoaringBitmap rb_sub(RoaringBitmap ob1, RoaringBitmap ob2):
    """Return a new RoaringBitmap with the elements of ``ob1`` that are
    not in ``ob2``; neither operand is modified."""
    cdef RoaringBitmap result = RoaringBitmap()
    cdef uint32_t pos1 = 0, pos2 = 0
    cdef Block b1, b2
    # the result has at most one block per block of ob1
    result._initarray(ob1.size)
    if pos1 < ob1.size and pos2 < ob2.size:
        while True:
            if ob1.keys[pos1] < ob2.keys[pos2]:
                # key only in ob1: copy block
                result._insertcopy(
                        result.size, ob1.keys[pos1], ob1._getblk(pos1, &b1))
                pos1 += 1
                if pos1 == ob1.size:
                    break
            elif ob1.keys[pos1] > ob2.keys[pos2]:
                pos2 += 1
                if pos2 == ob2.size:
                    break
            else:  # ob1.keys[pos1] == ob2.keys[pos2]:
                block_sub(&(result.data[result.size]),
                        ob1._getblk(pos1, &b1), ob2._getblk(pos2, &b2))
                if result.data[result.size].cardinality > 0:
                    result.keys[result.size] = ob1.keys[pos1]
                    result.size += 1
                pos1 += 1
                pos2 += 1
                if pos1 == ob1.size or pos2 == ob2.size:
                    break
    # Once ob2 is exhausted (or was empty to begin with), the remaining
    # blocks of ob1 are copied exactly once. A while loop is used so that
    # pos1 ends at ob1.size; a ``for ... in range`` loop would leave it
    # at the last index.
    if pos2 == ob2.size:
        while pos1 < ob1.size:
            result._insertcopy(
                    result.size, ob1.keys[pos1], ob1._getblk(pos1, &b1))
            pos1 += 1
    # Release a buffer left in an unclaimed slot by an empty block
    # difference; skip this when all ob1.size slots are in use, since
    # the index would then be past the array.
    if result.size < ob1.size:
        aligned_free(result.data[result.size].buf.ptr)
    result._resize(result.size)
    return result
288 |
289 |
cdef inline RoaringBitmap rb_or(RoaringBitmap ob1, RoaringBitmap ob2):
    """Return a new RoaringBitmap with the union of ``ob1`` and ``ob2``;
    neither operand is modified."""
    cdef RoaringBitmap union = RoaringBitmap()
    cdef uint32_t idx1 = 0, idx2 = 0
    cdef Block tmp1, tmp2
    if ob1.size > 0 and ob2.size > 0:
        union._initarray(ob1.size + ob2.size)
    # merge over the sorted keys while both operands have blocks left
    while idx1 < ob1.size and idx2 < ob2.size:
        if ob1.keys[idx1] < ob2.keys[idx2]:
            union._insertcopy(
                    union.size, ob1.keys[idx1], ob1._getblk(idx1, &tmp1))
            idx1 += 1
        elif ob1.keys[idx1] > ob2.keys[idx2]:
            union._insertcopy(
                    union.size, ob2.keys[idx2], ob2._getblk(idx2, &tmp2))
            idx2 += 1
        else:
            # same key in both: combine the blocks in the next free slot
            block_or(&(union.data[union.size]),
                    ob1._getblk(idx1, &tmp1), ob2._getblk(idx2, &tmp2))
            union.keys[union.size] = ob1.keys[idx1]
            union.size += 1
            idx1 += 1
            idx2 += 1
    # copy the tail of the operand that still has blocks left
    if idx1 == ob1.size:
        union._extendarray(ob2.size - idx2)
        while idx2 < ob2.size:
            union._insertcopy(
                    union.size, ob2.keys[idx2], ob2._getblk(idx2, &tmp2))
            idx2 += 1
    elif idx2 == ob2.size:
        union._extendarray(ob1.size - idx1)
        while idx1 < ob1.size:
            union._insertcopy(
                    union.size, ob1.keys[idx1], ob1._getblk(idx1, &tmp1))
            idx1 += 1
    union._resize(union.size)
    return union
330 |
331 |
cdef inline RoaringBitmap rb_xor(RoaringBitmap ob1, RoaringBitmap ob2):
    """Return a new RoaringBitmap with the symmetric difference of ``ob1``
    and ``ob2``; neither operand is modified."""
    cdef RoaringBitmap result = RoaringBitmap()
    cdef uint32_t pos1 = 0, pos2 = 0
    cdef Block b1, b2
    if pos1 < ob1.size and pos2 < ob2.size:
        result._initarray(ob1.size + ob2.size)
        while True:
            if ob1.keys[pos1] < ob2.keys[pos2]:
                # key only in ob1: copy block
                result._insertcopy(
                        result.size, ob1.keys[pos1], ob1._getblk(pos1, &b1))
                pos1 += 1
                if pos1 == ob1.size:
                    break
            elif ob1.keys[pos1] > ob2.keys[pos2]:
                # key only in ob2: copy block
                result._insertcopy(
                        result.size, ob2.keys[pos2], ob2._getblk(pos2, &b2))
                pos2 += 1
                if pos2 == ob2.size:
                    break
            else:  # ob1.keys[pos1] == ob2.keys[pos2]:
                # the slot at result.size is only claimed when the
                # combined block is non-empty
                block_xor(&(result.data[result.size]),
                        ob1._getblk(pos1, &b1), ob2._getblk(pos2, &b2))
                if result.data[result.size].cardinality > 0:
                    result.keys[result.size] = ob1.keys[pos1]
                    result.size += 1
                pos1 += 1
                pos2 += 1
                if pos1 == ob1.size or pos2 == ob2.size:
                    break
        # release whatever buffer is left in the first unclaimed slot
        # NOTE(review): the tail copies below write to this same slot;
        # confirm _insertcopy does not rely on its previous contents.
        aligned_free(result.data[result.size].buf.ptr)
    # copy the tail of the operand that still has blocks left
    if pos1 == ob1.size:
        result._extendarray(ob2.size - pos2)
        for pos2 in range(pos2, ob2.size):
            result._insertcopy(
                    result.size, ob2.keys[pos2], ob2._getblk(pos2, &b2))
    elif pos2 == ob2.size:
        result._extendarray(ob1.size - pos1)
        for pos1 in range(pos1, ob1.size):
            result._insertcopy(
                    result.size, ob1.keys[pos1], ob1._getblk(pos1, &b1))
    result._resize(result.size)
    return result
374 |
375 |
cdef bint rb_isdisjoint(RoaringBitmap self, RoaringBitmap ob):
    """Return True when ``self`` and ``ob`` have no element in common."""
    cdef Block blk1, blk2
    cdef size_t idx
    cdef int found = 0
    if self.size == 0 or ob.size == 0:
        return True
    for idx in range(self.size):
        # continue the search in ob from where the previous key ended up
        found = ob._binarysearch(found, ob.size, self.keys[idx])
        if found >= 0:
            if not block_isdisjoint(
                    self._getblk(idx, &blk1), ob._getblk(found, &blk2)):
                return False
        else:
            # key is absent; a negative result encodes the insertion point
            found = -found - 1
            if found >= ob.size:
                # all remaining keys of self lie beyond the last key of ob
                return True
    return True
391 |
392 |
cdef inline bint rb_issubset(RoaringBitmap self, RoaringBitmap ob):
    """Return True when every element of ``self`` is also in ``ob``."""
    cdef Block blk1, blk2
    cdef size_t idx
    cdef int found = 0
    if self.size == 0:
        return True
    if ob.size == 0:
        return False
    # first pass: every key of self must be present in ob
    for idx in range(self.size):
        found = ob._binarysearch(found, ob.size, self.keys[idx])
        if found < 0:
            return False
    # second pass: compare the contents of the corresponding blocks
    found = 0
    for idx in range(self.size):
        found = ob._binarysearch(found, ob.size, self.keys[idx])
        if not block_issubset(
                self._getblk(idx, &blk1), ob._getblk(found, &blk2)):
            return False
    return True
411 |
412 |
cdef inline RoaringBitmap rb_clamp(RoaringBitmap self,
        uint32_t start, uint32_t stop):
    """Return a new RoaringBitmap with the elements ``n`` of ``self`` for
    which ``start <= n < stop``; ``self`` is not modified.

    An empty RoaringBitmap is returned when no block of ``self`` falls
    within the requested range."""
    cdef Block b1
    cdef RoaringBitmap result = RoaringBitmap()
    cdef int n
    # ii, jj: index of the block with the high bits of start / stop;
    # negative (encoding the insertion point) when there is no such block.
    cdef int ii = self._getindex(highbits(start))
    cdef int jj = ii
    # i: first candidate block; when start's block is absent, round up to
    # the following block.
    cdef int i = -ii - 1 if ii < 0 else ii
    cdef int j
    if highbits(start) != highbits(stop):
        jj = self._getindex(highbits(stop))
    # j: last candidate block; when stop's block was not found, round
    # down to the preceding block. This also applies when start and stop
    # share the same (absent) high bits, in which case j ends up before i.
    j = -jj - 2 if jj < 0 else jj
    if i >= self.size or j < i:
        return result
    result._initarray(j - i + 1)
    # First block: the lower bound only applies when this is start's own
    # block; the upper bound only when this is also stop's own block.
    block_clamp(
            &(result.data[0]), self._getblk(i, &b1),
            lowbits(start) if i == ii else 0,
            lowbits(stop) if i == j and jj >= 0 else BLOCKSIZE)
    if result.data[result.size].cardinality:
        result.keys[result.size] = self.keys[i]
        result.size += 1
    else:
        aligned_free(result.data[0].buf.ptr)
        # The slot is reused below; put it back in the zeroed state of an
        # untouched slot so that no stale pointer is left in it.
        memset(&(result.data[0]), 0, sizeof(Block))
    # blocks strictly between the first and the last are copied whole
    for n in range(i + 1, j):
        block_copy(&(result.data[result.size]), self._getblk(n, &b1))
        result.keys[result.size] = self.keys[n]
        result.size += 1
    if i != j:
        # last block: bounded above only when it is stop's own block
        block_clamp(
                &(result.data[result.size]), self._getblk(j, &b1),
                0, lowbits(stop) if jj >= 0 else BLOCKSIZE)
        if result.data[result.size].cardinality:
            result.keys[result.size] = self.keys[j]
            result.size += 1
        else:
            aligned_free(result.data[result.size].buf.ptr)
    result._resize(result.size)
    return result
452 |
453 |
cdef inline void rb_andor_len(RoaringBitmap ob1, RoaringBitmap ob2,
        unsigned long *intersection_result,
        unsigned long *union_result) noexcept nogil:
    """Compute the cardinality of both the intersection and the union of
    ``ob1`` and ``ob2`` in a single pass, without building either set.

    Both output locations are reset to 0 before counting."""
    cdef Block blk1, blk2
    cdef uint32_t idx1 = 0, idx2 = 0, both, either
    intersection_result[0] = 0
    union_result[0] = 0
    # merge over the sorted keys while both operands have blocks left
    while idx1 < ob1.size and idx2 < ob2.size:
        if ob1.keys[idx1] < ob2.keys[idx2]:
            # block only in ob1 contributes to the union only
            union_result[0] += ob1.data[idx1].cardinality
            idx1 += 1
        elif ob1.keys[idx1] > ob2.keys[idx2]:
            union_result[0] += ob2.data[idx2].cardinality
            idx2 += 1
        else:
            both = 0
            either = 0
            block_andorlen(
                    ob1._getblk(idx1, &blk1),
                    ob2._getblk(idx2, &blk2),
                    &both, &either)
            intersection_result[0] += both
            union_result[0] += either
            idx1 += 1
            idx2 += 1
    # the remaining blocks of either operand only count for the union
    while idx2 < ob2.size:
        union_result[0] += ob2.data[idx2].cardinality
        idx2 += 1
    while idx1 < ob1.size:
        union_result[0] += ob1.data[idx1].cardinality
        idx1 += 1
490 |
491 |
cdef inline double rb_jaccard_dist(RoaringBitmap ob1,
        RoaringBitmap ob2) noexcept nogil:
    """Return the Jaccard distance ``1 - |A & B| / |A | B|`` of two roaring
    bitmaps; the distance is 1 when the union is empty."""
    cdef unsigned long union_result = 0, intersection_result = 0
    rb_andor_len(ob1, ob2, &intersection_result, &union_result)
    if union_result == 0:
        return 1
    # Convert to double before dividing: both counts are C integers, and
    # an integer division would truncate the ratio to 0 or 1.
    return 1 - (<double>intersection_result / <double>union_result)
499 |
--------------------------------------------------------------------------------
/src/roaringbitmap.pyx:
--------------------------------------------------------------------------------
1 | """Roaring bitmap in Cython.
2 |
3 | A Roaring bitmap stores a set of 32 bit integers compactly while allowing for
4 | efficient set operations. The space of integers is partitioned into blocks
5 | of ``2 ** 16`` integers. The representation for a block depends on the number
6 | of elements it contains:
7 |
8 | <= 4096 elements:
9 | an array of up to ``1 << 12`` 16-bit integers that are part of the set.
10 |
11 | >= 61440 elements:
12 | an array of up to ``1 << 12`` 16-bit integers that are not part of the set.
13 |
14 | otherwise:
15 | a fixed bitmap of ``1 << 16`` (65536) bits with a 1-bit for each element.
16 |
17 | A ``RoaringBitmap`` can be used as a replacement for a mutable
18 | Python ``set`` containing unsigned 32-bit integers:
19 |
20 | >>> from roaringbitmap import RoaringBitmap
21 | >>> RoaringBitmap(range(10)) & RoaringBitmap(range(5, 15))
22 | RoaringBitmap({5, 6, 7, 8, 9})
23 |
24 | ``ImmutableRoaringBitmap`` is an immutable variant (analogous to ``frozenset``)
25 | which is stored compactly as a contiguous block of memory.
26 |
27 | ``MultiRoaringBitmap`` stores a sequence of immutable roaring bitmaps
28 | in an efficiently serializable, contiguous block of memory.
29 | """
30 | # TODOs
31 | # [ ] SSE/AVX2 intrinsics:
32 | # array intersection [x] SSE; [ ] AVX
33 | # bitmap=>array [ ] SSE; [ ] AVX
34 | # [ ] separate cardinality & binary ops for bitmaps
35 | # [ ] and; [-] or; [ ] xor; [ ] sub
36 | # slower in benchmarks
37 | # [ ] check growth strategy of arrays
38 | # [ ] more operations:
39 | # [ ] efficient shifts
40 | # [ ] operate on slices without instantiating range as temp object
41 | # [ ] subclass Set ABC?
42 | # [ ] error checking, robustness
43 |
44 | import io
45 | import os
46 | import sys
47 | import mmap
48 | import heapq
49 | import array
50 |
51 | from libc.stdint cimport uint8_t, uint16_t, uint32_t, uint64_t, int32_t
52 | from libc.stdio cimport printf
53 | from libc.stdlib cimport free, malloc, calloc, realloc, abort
54 | from libc.string cimport memset, memcpy, memcmp, memmove
55 | from cpython.buffer cimport PyBUF_SIMPLE, Py_buffer, PyObject_CheckBuffer, \
56 | PyObject_GetBuffer, PyBuffer_Release
57 | from cpython cimport array
58 | cimport cython
59 |
# PY2 is declared without a header; it is used further down in this module
# to choose between Python 2 and Python 3 variants (e.g. xrange vs range).
# NOTE(review): presumably defined by the build configuration -- confirm.
cdef extern from *:
    cdef bint PY2


# Read-buffer functions declared from the CPython headers.
cdef extern from "Python.h":
    int PyObject_CheckReadBuffer(object)
    int PyObject_AsReadBuffer(object, const void **, Py_ssize_t *)


# Bit manipulation on arrays of 64-bit words and aligned memory allocation;
# all callable without the GIL.
cdef extern from "macros.h":
    int BITSIZE
    int BITSLOT(int b) nogil
    int BITNSLOTS(int nb) nogil
    void SETBIT(uint64_t a[], int b) nogil
    void CLEARBIT(uint64_t a[], int b) nogil
    uint64_t TESTBIT(uint64_t a[], int b) nogil
    uint64_t BITMASK(int b) nogil
    void *aligned_malloc(size_t size, size_t align) nogil
    void aligned_free(void *ptr) nogil

# Bit counting primitives on a single 64-bit word.
cdef extern from "bitcount.h":
    unsigned int bit_clz(uint64_t) nogil
    unsigned int bit_ctz(uint64_t) nogil
    unsigned int bit_popcount(uint64_t) nogil
    size_t BITCOUNT_BITS
    size_t UINT64_MAX


# Intersection routines on uint16_t arrays; each takes two input arrays with
# their lengths and an output array.
# NOTE(review): presumably the return value is the output length -- confirm.
cdef extern from "_arrayops.h":
    int32_t intersect_uint16(uint16_t *A, size_t lenA,
            uint16_t *B, size_t lenB, uint16_t *out) nogil
    int32_t intersect_general16(uint16_t *A, size_t lenA,
            uint16_t *B, size_t lenB, uint16_t *out) nogil
93 |
94 |
# The storage of a block, viewed in different ways: as an untyped pointer
# (e.g. for freeing), as an array of 16-bit values, as a bitmap of 64-bit
# words, or as an integer offset (see RoaringBitmap.offset, used for
# immutable bitmaps with relative pointers).
cdef union Buffer:
    void *ptr
    uint16_t *sparse
    uint64_t *dense
    size_t offset
    uint64_t _padding  # ensure that this union takes at least 64 bits.
101 |
102 |
cdef struct Block:
    # A set of 2**16 integers, stored as bitmap or array.
    #
    # This block may contain a bitvector (DENSE) or a sparse array;
    # The array can contain elements corresponding to 0-bits (INVERTED)
    # or 1-bits (POSITIVE).
    #
    # Blocks are copied by plain struct assignment in the in-place
    # operations, which transfers the buffer pointer along with the
    # counters.
    Buffer buf  # data: sparse array or fixed-size bitvector
    uint32_t cardinality  # the number of elements
    uint16_t capacity  # number of allocated uint16_t elements
    uint16_t state  # either DENSE, INVERTED, or POSITIVE
    # NB: make state uint16_t so that the struct is 16 bytes without padding.
114 |
# The maximum number of elements in a block
DEF BLOCKSIZE = 1 << 16

# The number of bytes to store a bitmap of 2**16 bits:
DEF BITMAPSIZE = BLOCKSIZE // 8

# Maximum length of positive/inverted sparse arrays:
DEF MAXARRAYLENGTH = 1 << 12

# Capacity (elements) to allocate for an empty array
DEF INITCAPACITY = 4

# Extra elements in result to accommodate SSE/AVX vector operations
DEF OVERALLOC = 8

# The different ways a block may store its elements:
DEF DENSE = 0
DEF POSITIVE = 1
DEF INVERTED = 2

# The implementation is split over several files that are textually
# included here, after the declarations above that they rely on.
include "bitops.pxi"
include "arrayops.pxi"
include "block.pxi"
include "rbbinaryops.pxi"
include "immutablerb.pxi"
include "multirb.pxi"

# Empty arrays of each typecode; dblarray serves as template for
# array.clone() when result arrays are created.
chararray = array.array(b'B' if PY2 else 'B')
dblarray = array.array(b'd' if PY2 else 'd')
longarray = array.array(b'L' if PY2 else 'L')
# the lazy range type of the running Python version
RANGE = xrange if PY2 else range
# a shared empty immutable bitmap
EMPTYIRB = ImmutableRoaringBitmap()
147 |
148 |
149 | cdef class RoaringBitmap(object):
150 | """A compact, mutable set of 32-bit integers."""
151 | cdef Block *data # pointer and size of array/bitmap with elements
152 | cdef uint16_t *keys # the high bits of elements in each block
153 | cdef uint32_t size # the number of blocks
154 | cdef uint32_t capacity # the allocated capacity for blocks
155 | cdef size_t offset # used for immutable bitmaps with relative pointers
156 |
def __cinit__(self, *args, **kwargs):
    # Start out without any allocated arrays; arguments are handled
    # by __init__.
    self.keys = NULL
    self.data = NULL
    self.size = 0
    self.capacity = 0
    self.offset = 0
160 |
def __init__(self, iterable=None):
    """Create a RoaringBitmap, optionally filled from ``iterable``.

    Elements ``x`` must satisfy ``0 <= x < 2 ** 32``. Without
    ``iterable``, the new RoaringBitmap is empty. Construction is
    considerably faster when the iterable is sorted.
    A generator is consumed incrementally; a ``range`` (Python 3) or
    ``xrange`` (Python 2) object is constructed efficiently."""
    cdef size_t idx
    cdef Block tmp
    cdef RoaringBitmap other
    if iterable is None:
        return
    if isinstance(iterable, RANGE):
        _, (start, stop, step) = iterable.__reduce__()
        if 0 <= start < stop and step >= 1:
            self._initrange(start, stop, step)
            return
        # any other range falls through to the generic code below
    if isinstance(iterable, (list, tuple, set, dict, RANGE)):
        self._init2pass(iterable)
    elif isinstance(iterable, RoaringBitmap):
        # copy the other bitmap block by block
        other = iterable
        self._extendarray(other.size)
        for idx in range(other.size):
            self._insertcopy(
                    self.size, other.keys[idx], other._getblk(idx, &tmp))
    else:
        self._inititerator(iterable)
190 |
def __dealloc__(self):
    cdef size_t idx
    # Nothing to release when no arrays were allocated, or when offset is
    # set (used for immutable bitmaps with relative pointers).
    if self.data is NULL or self.offset != 0:
        return
    for idx in range(self.size):
        aligned_free(self.data[idx].buf.ptr)
    free(self.keys)
    free(self.data)
    self.keys = NULL
    self.data = NULL
    self.size = 0
199 |
def copy(self):
    """Return a new RoaringBitmap with copies of all blocks of this one."""
    cdef RoaringBitmap clone = RoaringBitmap()
    cdef size_t idx
    clone._extendarray(self.size)
    for idx in range(self.size):
        clone._insertcopy(clone.size, self.keys[idx], &(self.data[idx]))
    return clone
208 |
def freeze(self):
    """Return an ImmutableRoaringBitmap with the contents of this
    RoaringBitmap, created from its serialized state."""
    cdef ImmutableRoaringBitmap frozen
    frozen = ImmutableRoaringBitmap.__new__(ImmutableRoaringBitmap)
    state = self.__getstate__()
    frozen.__setstate__(state)
    return frozen
215 |
def __contains__(self, uint32_t elem):
    # look up the block by the high bits, then the low bits within it
    cdef int idx = self._getindex(highbits(elem))
    cdef Block tmp
    if idx < 0:
        return False
    return block_contains(self._getblk(idx, &tmp), lowbits(elem))
223 |
def __richcmp__(x, y, int op):
    # all comparison operators are implemented by the shared helper
    outcome = richcmp(x, y, op)
    return outcome
226 |
def isdisjoint(self, other):
    """Return True if this RoaringBitmap and ``other`` have no elements
    in common."""
    cdef RoaringBitmap ob = ensurerb(other)
    return rb_isdisjoint(self, ob)
230 |
def issubset(self, other):
    """Return True if every element of this RoaringBitmap is in
    ``other``."""
    cdef RoaringBitmap ob = ensurerb(other)
    return rb_issubset(self, ob)
234 |
def issuperset(self, other):
    """Report whether this RoaringBitmap contains another set.

    ``other`` is converted with ``ensurerb``, like the argument of
    ``issubset``, instead of requiring it to have an ``issubset`` method
    of its own."""
    return rb_issubset(ensurerb(other), self)
238 |
def min(self):
    """Return the smallest element, i.e., the element selected at
    position 0."""
    smallest = self.select(0)
    return smallest
242 |
def max(self):
    """Return the largest element, i.e., the first element when
    iterating in reverse."""
    backwards = reversed(self)
    return next(backwards)
246 |
def __and__(x, y):
    # ``x & y``: intersection as a new RoaringBitmap
    return rb_and(ensurerb(x), ensurerb(y))
250 |
def __sub__(x, y):
    # ``x - y``: difference as a new RoaringBitmap
    return rb_sub(ensurerb(x), ensurerb(y))
254 |
def __or__(x, y):
    # ``x | y``: union as a new RoaringBitmap
    return rb_or(ensurerb(x), ensurerb(y))
258 |
def __xor__(x, y):
    # ``x ^ y``: symmetric difference as a new RoaringBitmap
    return rb_xor(ensurerb(x), ensurerb(y))
262 |
def __iand__(self, x):
    # ``self &= x``: in-place intersection
    return rb_iand(self, ensurerb(x))
266 |
def __isub__(self, x):
    # ``self -= x``: in-place difference
    return rb_isub(self, ensurerb(x))
270 |
def __ior__(self, x):
    # ``self |= x``: in-place union
    return rb_ior(self, ensurerb(x))
274 |
def __ixor__(self, x):
    # ``self ^= x``: in-place symmetric difference
    return rb_ixor(self, ensurerb(x))
278 |
def add(self, uint32_t elem):
    """Add ``elem`` to the set; adding an element that is already
    present changes nothing."""
    cdef uint16_t key = highbits(elem)
    cdef int idx = self._getindex(key)
    cdef Block *target
    if idx < 0:
        # no block for this key yet: insert an empty sparse block at the
        # position encoded in the negative index
        target = self._insertempty(-idx - 1, key)
        target.state = POSITIVE
        target.cardinality = 0
        target.buf.sparse = allocsparse(INITCAPACITY)
        target.capacity = INITCAPACITY
    else:
        target = &(self.data[idx])
    block_add(target, lowbits(elem))
    block_convert(target)
296 |
def clamp(self, uint32_t start, uint32_t stop):
    """Return a new set containing only the values of this set that fall
    within the range given by ``start`` and ``stop``."""
    cdef RoaringBitmap clamped = rb_clamp(self, start, stop)
    return clamped
301 |
def discard(self, uint32_t elem):
    """Remove ``elem`` from the set if present; otherwise do nothing."""
    cdef int idx = self._getindex(highbits(elem))
    if idx < 0:
        return
    block_discard(&(self.data[idx]), lowbits(elem))
    # a block that became empty is removed altogether
    if self.data[idx].cardinality == 0:
        self._removeatidx(idx)
311 |
def remove(self, uint32_t elem):
    """Remove ``elem`` from the set.

    :raises KeyError: if ``elem`` is not a member."""
    cdef int idx = self._getindex(highbits(elem))
    cdef uint32_t before
    if idx < 0:
        raise KeyError(elem)
    before = self.data[idx].cardinality
    block_discard(&(self.data[idx]), lowbits(elem))
    # an unchanged cardinality means the element was not in the block
    if self.data[idx].cardinality == before:
        raise KeyError(elem)
    if self.data[idx].cardinality == 0:
        self._removeatidx(idx)
327 |
def pop(self):
    """Remove the largest element from the set and return it.

    :raises ValueError: if the set is empty."""
    cdef uint32_t high, low, last
    if self.size == 0:
        raise ValueError('pop from empty roaringbitmap')
    # the largest element lives in the last block
    last = self.size - 1
    high = self.keys[last]
    low = block_pop(&(self.data[last]))
    if self.data[last].cardinality == 0:
        self._removeatidx(last)
    return high << 16 | low
338 |
def clear(self):
    """Remove all elements from this RoaringBitmap.

    All blocks are freed and small, empty key and block arrays are
    allocated in their place.

    :raises MemoryError: if the new arrays cannot be allocated; the
        bitmap is then left empty without any allocated arrays."""
    cdef size_t n
    for n in range(self.size):
        aligned_free(self.data[n].buf.ptr)
    free(self.keys)
    free(self.data)
    self.size = self.capacity = 0
    # malloc returns an untyped pointer; cast to the array element types
    self.keys = <uint16_t *>malloc(INITCAPACITY * sizeof(uint16_t))
    self.data = <Block *>malloc(INITCAPACITY * sizeof(Block))
    if self.keys is NULL or self.data is NULL:
        # do not keep one array without the other
        free(self.keys)
        free(self.data)
        self.keys = NULL
        self.data = NULL
        raise MemoryError(INITCAPACITY)
    self.capacity = INITCAPACITY
352 |
def __lshift__(self, other):
	"""Return a new set with ``other`` subtracted from every element.

	Implemented as a right shift by the negated amount."""
	negated = -other
	return self.__rshift__(negated)
355 |
def __rshift__(self, int other):
	"""Return a new set with ``other`` added to every element.

	Results that fall outside ``0 <= value < 2**32`` are dropped."""
	# FIXME: replace with optimized implementation
	shifted = []
	for elem in self:
		moved = elem + other
		if 0 <= moved < 1 << 32:
			shifted.append(moved)
	return RoaringBitmap(shifted)
360 |
361 | # def __ilshift__(self, other):
362 | # raise NotImplementedError
363 |
364 | # def __irshift__(self, other):
365 | # raise NotImplementedError
366 |
def __invert__(self):
	"""Return a copy with membership flipped between the smallest and
	largest element (both inclusive)."""
	lowest = self.min()
	highest = self.max()
	return self.symmetric_difference(RANGE(lowest, highest + 1))
371 |
def __iter__(self):
	"""Yield every element of the set, block by block in key order.

	Each block is decoded according to its representation: a completely
	full block, a dense bitmap, a positive array of present values, or an
	inverted array of absent values."""
	cdef Block *block
	cdef Block b1
	cdef uint32_t high, i
	cdef uint64_t cur
	cdef int n, idx, low
	for i in range(self.size):
		block = self._getblk(i, &b1)
		# Widen the 16-bit key before shifting so that the shift happens in
		# unsigned 32-bit arithmetic rather than in a promoted signed int.
		high = (<uint32_t>(self.keys[i])) << 16
		if block.cardinality == BLOCKSIZE:
			# full block: every low value is present
			for low in range(BLOCKSIZE):
				yield high | low
		elif block.state == DENSE:
			idx = 0
			cur = block.buf.dense[idx]
			n = iteratesetbits(block.buf.dense, &cur, &idx)
			while n != -1:
				yield high | n
				n = iteratesetbits(block.buf.dense, &cur, &idx)
		elif block.state == POSITIVE:
			for n in range(block.cardinality):
				low = block.buf.sparse[n]
				yield high | low
		elif block.state == INVERTED:
			# buf.sparse lists the BLOCKSIZE - cardinality absent values;
			# yield the gaps before, between and after them.
			for low in range(block.buf.sparse[0]):
				yield high | low
			if block.cardinality < BLOCKSIZE - 1:
				for n in range(BLOCKSIZE - block.cardinality - 1):
					for low in range(
							block.buf.sparse[n] + 1,
							block.buf.sparse[n + 1]):
						yield high | low
			for low in range(block.buf.sparse[
					BLOCKSIZE - block.cardinality - 1] + 1, BLOCKSIZE):
				yield high | low
407 |
def __reversed__(self):
	"""Yield every element of the set, visiting blocks from the last key to
	the first and the values within each block in reverse order."""
	cdef Block *block
	cdef Block b1
	cdef uint32_t high
	cdef uint64_t cur
	cdef int pos, n, idx, low
	# A signed index is used because the loop counts down to a stop bound
	# of -1; casting size first also keeps ``size - 1`` signed when empty.
	for pos in range(<int>self.size - 1, -1, -1):
		block = self._getblk(pos, &b1)
		# Widen the 16-bit key before shifting so that the shift happens in
		# unsigned 32-bit arithmetic rather than in a promoted signed int.
		high = (<uint32_t>(self.keys[pos])) << 16
		if block.cardinality == BLOCKSIZE:
			# full block: every low value is present
			for low in reversed(range(BLOCKSIZE)):
				yield high | low
		elif block.state == POSITIVE:
			for n in reversed(range(block.cardinality)):
				low = block.buf.sparse[n]
				yield high | low
		elif block.state == DENSE:
			idx = BITNSLOTS(BLOCKSIZE) - 1
			cur = block.buf.dense[idx]
			n = reviteratesetbits(block.buf.dense, &cur, &idx)
			while n != -1:
				low = n
				yield high | low
				n = reviteratesetbits(block.buf.dense, &cur, &idx)
		elif block.state == INVERTED:
			# buf.sparse lists the BLOCKSIZE - cardinality absent values;
			# yield the gaps after, between and before them, descending.
			for low in reversed(range(block.buf.sparse[
					BLOCKSIZE - block.cardinality - 1] + 1, BLOCKSIZE)):
				yield high | low
			if block.cardinality < BLOCKSIZE - 1:
				for n in reversed(range(BLOCKSIZE - block.cardinality - 1)):
					for low in reversed(range(
							block.buf.sparse[n] + 1,
							block.buf.sparse[n + 1])):
						yield high | low
			for low in reversed(range(block.buf.sparse[0])):
				yield high | low
444 |
def __len__(self):
	"""Return the number of elements: the sum of all block cardinalities."""
	cdef size_t total = 0
	cdef size_t pos
	for pos in range(self.size):
		total += self.data[pos].cardinality
	return total
450 |
def __sizeof__(self):
	"""Return memory usage in bytes (incl. overallocation).

	Per block this counts its key, its ``Block`` struct and its buffer
	capacity measured in ``uint16_t`` units."""
	cdef uint32_t total = 0
	for pos in range(self.size):
		total += sizeof(uint16_t) + sizeof(Block)
		total += self.data[pos].capacity * sizeof(uint16_t)
	return total
458 |
def numelem(self):
	"""Return total number of uint16_t elements stored.

	Each block contributes one (its key) plus ``getsize`` of the block."""
	cdef uint32_t total = 0
	for pos in range(self.size):
		total += getsize(&(self.data[pos]))
		total += 1
	return total
465 |
def __bool__(self):
	"""Return whether the set holds at least one block."""
	return self.size != 0
468 |
def __str__(self):
	"""Return the elements formatted like a set literal, e.g. ``{1, 2}``."""
	parts = []
	for elem in self:
		parts.append(str(elem))
	return '{%s}' % ', '.join(parts)
471 |
def __repr__(self):
	"""Return ``RoaringBitmap(...)`` wrapped around the ``str`` form."""
	contents = str(self)
	return 'RoaringBitmap(%s)' % contents
474 |
def debuginfo(self, verbose=False):
	"""Return a string describing the internal representation of this set.

	The string reports the number of blocks, the array capacity and the
	``block_repr`` of every block; ``verbose`` is passed to ``block_repr``.
	"""
	cdef Block b1
	blocks = [
			block_repr(self.keys[n], self._getblk(n, &b1), verbose)
			for n in range(self.size)]
	return 'keys=%d, cap=%d, data={%s}' % (
			self.size, self.capacity, ', '.join(blocks))
483 |
def _keys(self):
	"""Return the block keys (high 16 bits) as a list, in stored order."""
	result = []
	for pos in range(self.size):
		result.append(self.keys[pos])
	return result
486 |
def __getstate__(self):
	"""Return a serialized representation (Python array) for pickling.

	Layout of the returned char array: a ``uint32_t`` block count, the
	keys, the ``Block`` structs, and finally each block's buffer. Padding
	is inserted before the first buffer and after every buffer so that the
	running offset is a multiple of 32. In each serialized ``Block`` the
	capacity is set to ``getsize`` of the block and the buffer pointer is
	replaced by the buffer's offset within the array."""
	cdef array.array state
	cdef Block *ob
	cdef uint32_t extra, alignment = 32
	cdef size_t n, size
	cdef size_t alloc  # total allocated bytes for pickle
	cdef size_t offset1 = sizeof(uint32_t)  # keys, data
	cdef size_t offset2  # buffers
	# compute total size to allocate
	# add padding to ensure bitmaps are 32-byte aligned
	alloc = offset1 + self.size * (sizeof(uint16_t) + sizeof(Block))
	alloc += alignment - alloc % alignment
	for n in range(self.size):
		alloc += getsize(&(self.data[n])) * sizeof(uint16_t)
		alloc += alignment - alloc % alignment
	state = array.clone(chararray, alloc, False)
	# header: number of blocks
	(<uint32_t *>state.data.as_chars)[0] = self.size
	size = self.size * sizeof(uint16_t)
	memcpy(&(state.data.as_chars[offset1]), self.keys, size)
	offset1 += size
	offset2 = offset1 + self.size * sizeof(Block)
	# add zero padding bytes
	extra = alignment - offset2 % alignment
	memset(&(state.data.as_chars[offset2]), 0, extra)
	offset2 += extra
	for n in range(self.size):
		# copy block
		ob = <Block *>&(state.data.as_chars[offset1])
		ob[0] = self.data[n]
		ob.capacity = getsize(&(self.data[n]))
		# store the buffer's position in the array instead of a pointer
		ob.buf.ptr = <void *>offset2
		offset1 += sizeof(Block)
		# copy buffer of block
		size = ob.capacity * sizeof(uint16_t)
		memcpy(&(state.data.as_chars[offset2]), self.data[n].buf.ptr, size)
		offset2 += size
		# add zero padding bytes
		extra = alignment - offset2 % alignment
		memset(&(state.data.as_chars[offset2]), 0, extra)
		offset2 += extra
	return state
529 |
def __setstate__(self, array.array state):
	"""Initialize this object with a serialized representation.

	``state`` is read as: a ``uint32_t`` block count, the keys, the
	``Block`` structs, and block buffers located at the offsets recorded
	in each struct. Every buffer is copied into freshly allocated memory.

	NOTE: the contents of ``state`` are trusted; the block count and the
	buffer offsets are not validated against the length of the array.

	:raises MemoryError: if the key/data arrays cannot be resized."""
	cdef char *buf = state.data.as_chars
	cdef void *tmp
	cdef Block *data
	cdef uint32_t newsize, newcapacity
	cdef size_t n, size, offset = sizeof(uint32_t)
	self.clear()
	newsize = (<uint32_t *>buf)[0]
	newcapacity = max(newsize, INITCAPACITY)
	# Store each pointer as soon as its realloc succeeds: a successful
	# realloc may already have released the old array, so the old pointer
	# must not be kept when the other call fails.
	tmp = realloc(self.keys, newcapacity * sizeof(uint16_t))
	if tmp is NULL:
		raise MemoryError(newsize)
	self.keys = <uint16_t *>tmp
	tmp = realloc(self.data, newcapacity * sizeof(Block))
	if tmp is NULL:
		raise MemoryError(newsize)
	self.data = <Block *>tmp
	self.capacity = newcapacity
	memcpy(self.keys, &(buf[offset]), newsize * sizeof(uint16_t))
	offset += newsize * sizeof(uint16_t)
	data = <Block *>&(buf[offset])
	for n in range(newsize):
		self.data[n] = data[n]
		# the serialized block stores the position of its buffer in ``buf``
		offset = data[n].buf.offset
		if data[n].state == DENSE:
			self.data[n].buf.dense = allocdense()
			size = BITMAPSIZE
		else:
			self.data[n].buf.sparse = allocsparse(data[n].capacity)
			size = data[n].capacity * sizeof(uint16_t)
		memcpy(self.data[n].buf.ptr, &(buf[offset]), size)
		# Grow size one block at a time, so that it never covers a block
		# whose buffer has not been allocated yet.
		self.size = n + 1
559 |
def intersection(self, *other):
	"""Return the intersection of two or more sets as a new RoaringBitmap.

	(i.e. elements that are common to all of the sets.)

	Without arguments this set itself is returned. With two or more
	arguments the operands are processed in increasing ``numelem`` order
	and the loop stops as soon as the result is empty."""
	cdef RoaringBitmap result
	if not other:
		return self
	if len(other) == 1:
		return self & other[0]
	operands = [self]
	operands.extend(ensurerb(a) for a in other)
	operands.sort(key=RoaringBitmap.numelem)
	result = operands[0] & operands[1]
	for ob in operands[2:]:
		result &= ob
		if result.size == 0:
			break
	return result
577 |
def union(self, *other):
	"""Return the union of two or more sets as a new set.

	(i.e. all elements that are in at least one of the sets.)

	Without arguments this set itself is returned. With two or more
	arguments the two operands with the lowest ``numelem`` are merged
	repeatedly, using a heap, until a single result remains."""
	if not other:
		return self
	if len(other) == 1:
		return self | other[0]
	heap = [(ob.numelem(), ob) for ob in map(ensurerb, other)]
	heap.append((self.numelem(), self))
	heapq.heapify(heap)
	while len(heap) > 1:
		_, left = heapq.heappop(heap)
		_, right = heapq.heappop(heap)
		merged = left | right
		heapq.heappush(heap, (merged.numelem(), merged))
	_, result = heapq.heappop(heap)
	return result
596 |
def difference(self, *other):
	"""Return the difference of two or more sets as a new RoaringBitmap.

	(i.e, self - other[0] - other[1] - ...)

	Without arguments this set itself is returned. The other sets are
	subtracted in decreasing ``numelem`` order and the loop stops as soon
	as the result is empty."""
	cdef RoaringBitmap result
	if not other:
		return self
	subtrahends = sorted(
			map(ensurerb, other),
			key=RoaringBitmap.numelem, reverse=True)
	result = self - subtrahends[0]
	for ob in subtrahends[1:]:
		result -= ob
		if result.size == 0:
			break
	return result
612 |
def symmetric_difference(self, other):
	"""Return the symmetric difference of two sets as a new RoaringBitmap.

	(i.e. all elements that are in exactly one of the sets.)
	Equivalent to ``self ^ other``."""
	result = self ^ other
	return result
618 |
def update(self, *other):
	"""In-place union update of this RoaringBitmap.

	With one argument, add items from the iterable to this set;
	with more arguments: add the union of given ``RoaringBitmap`` objects.

	NB: since range objects are recognized by the constructor, this
	provides an efficient way to set a range of bits:

	>>> rb = RoaringBitmap(range(5))
	>>> rb.update(range(3, 7))
	>>> rb
	RoaringBitmap({0, 1, 2, 3, 4, 5, 6})
	"""
	cdef RoaringBitmap left, right
	if not other:
		return
	if len(other) == 1:
		self |= other[0]
		return
	# merge the arguments smallest-first, then fold the result into self
	heap = [(left.numelem(), left) for left in map(ensurerb, other)]
	heapq.heapify(heap)
	while len(heap) > 1:
		_, left = heapq.heappop(heap)
		_, right = heapq.heappop(heap)
		merged = left | right
		heapq.heappush(heap, (merged.numelem(), merged))
	_, merged = heapq.heappop(heap)
	self |= merged
648 |
def intersection_update(self, *other):
	"""Intersect this set in-place with one or more ``RoaringBitmap``
	objects.

	NB: since range objects are recognized by the constructor, this
	provides an efficient way to restrict the set to a range of elements:

	>>> rb = RoaringBitmap(range(5))
	>>> rb.intersection_update(range(3, 7))
	>>> rb
	RoaringBitmap({3, 4})
	"""
	if not other:
		return
	if len(other) == 1:
		self &= other[0]
		return
	# apply the operands in increasing numelem order; stop once empty
	operands = sorted(map(ensurerb, other), key=RoaringBitmap.numelem)
	for ob in operands:
		self &= ob
		if self.size == 0:
			break
671 |
def difference_update(self, *other):
	"""Remove all elements of other RoaringBitmaps from this one.

	The arguments are subtracted in the given order; processing stops as
	soon as this set has no blocks left."""
	for subtrahend in other:
		self -= subtrahend
		if self.size == 0:
			return
678 |
def symmetric_difference_update(self, other):
	"""Update set to symmetric difference of itself and another.

	Equivalent to ``self ^= other``."""
	self ^= other
	return None
682 |
def flip_range(self, uint32_t start, uint32_t stop):
	"""In-place negation for range(start, stop).

	Implemented as a symmetric difference update with that range."""
	flipped = RANGE(start, stop)
	self.symmetric_difference_update(flipped)
686 |
def intersection_len(self, other):
	"""Return the cardinality of the intersection.

	Optimized version of ``len(self & other)``: both key arrays are walked
	in parallel and only blocks with equal keys are compared."""
	cdef RoaringBitmap ob1 = ensurerb(self)
	cdef RoaringBitmap ob2 = ensurerb(other)
	cdef Block b1, b2
	cdef uint32_t pos1 = 0, pos2 = 0
	cdef size_t total = 0
	while pos1 < ob1.size and pos2 < ob2.size:
		if ob1.keys[pos1] < ob2.keys[pos2]:
			pos1 += 1
		elif ob1.keys[pos1] > ob2.keys[pos2]:
			pos2 += 1
		else:
			# matching keys: count the elements common to both blocks
			total += block_andlen(
					ob1._getblk(pos1, &b1),
					ob2._getblk(pos2, &b2))
			pos1 += 1
			pos2 += 1
	return total
715 |
def union_len(self, other):
	"""Return the cardinality of the union.

	Optimized version of ``len(self | other)``: both key arrays are walked
	in parallel; blocks present on one side only contribute their own
	cardinality, blocks with equal keys are combined with ``block_orlen``.
	"""
	cdef RoaringBitmap ob1 = ensurerb(self)
	cdef RoaringBitmap ob2 = ensurerb(other)
	cdef Block b1, b2
	cdef uint32_t pos1 = 0, pos2 = 0
	cdef size_t total = 0
	while pos1 < ob1.size and pos2 < ob2.size:
		if ob1.keys[pos1] < ob2.keys[pos2]:
			total += ob1.data[pos1].cardinality
			pos1 += 1
		elif ob1.keys[pos1] > ob2.keys[pos2]:
			total += ob2.data[pos2].cardinality
			pos2 += 1
		else:
			total += block_orlen(
					ob1._getblk(pos1, &b1),
					ob2._getblk(pos2, &b2))
			pos1 += 1
			pos2 += 1
	# at most one of the operands still has blocks left over
	while pos2 < ob2.size:
		total += ob2.data[pos2].cardinality
		pos2 += 1
	while pos1 < ob1.size:
		total += ob1.data[pos1].cardinality
		pos1 += 1
	return total
752 |
def jaccard_dist(self, other):
	"""Return the Jaccard distance.

	Optimized version of ``1 - len(self & other) / len(self | other)``.
	Counts of union and intersection are performed simultaneously.
	Both operands are converted with ``ensurerb`` and handed to
	``rb_jaccard_dist``."""
	cdef RoaringBitmap left = ensurerb(self)
	cdef RoaringBitmap right = ensurerb(other)
	distance = rb_jaccard_dist(left, right)
	return distance
761 |
def rank(self, uint32_t x):
	"""Return the number of elements ``<= x`` that are in this set."""
	cdef Block b1
	cdef size_t total = 0, pos
	cdef uint16_t xhigh = highbits(x)
	for pos in range(self.size):
		if self.keys[pos] > xhigh:
			# every remaining block holds larger values only
			break
		if self.keys[pos] == xhigh:
			return total + block_rank(
					self._getblk(pos, &b1),
					lowbits(x))
		# block lies entirely below x: all of its elements count
		total += self.data[pos].cardinality
	return total
777 |
def select(self, int i):
	"""Return the ith element that is in this set.

	:param i: a 0-based index.
	:raises IndexError: if ``i`` is negative or not smaller than the
		number of elements."""
	cdef Block b1
	cdef uint32_t leftover = i
	cdef uint32_t n, keycontrib, lowcontrib
	if i < 0:
		raise IndexError('select: index %d out of range 0..%d.' % (
				i, len(self)))
	for n in range(self.size):
		if self.data[n].cardinality > leftover:
			# Widen the 16-bit key before shifting so that the shift
			# happens in unsigned 32-bit arithmetic rather than in a
			# promoted signed int.
			keycontrib = (<uint32_t>self.keys[n]) << 16
			lowcontrib = block_select(
					self._getblk(n, &b1),
					leftover)
			return keycontrib | lowcontrib
		# skip this block: the wanted element lies further on
		leftover -= self.data[n].cardinality
	raise IndexError('select: index %d out of range 0..%d.' % (
			i, len(self)))
798 |
def index(self, uint32_t x):
	"""Return the 0-based index of `x` in this set.

	Equivalent to ``sorted(self).index(x)``.

	:raises IndexError: if ``x`` is not in the set."""
	if x not in self:
		raise IndexError
	return self.rank(x) - 1
806 |
def _ridx(self, i):
	"""Translate a negative index into one counted from the end; other
	indices are returned unchanged."""
	return len(self) + i if i < 0 else i
811 |
def _slice(self, i):
	"""Return the range of values for a given range of indices i.

	Only ``i.start`` and ``i.stop`` are used; they are clamped to the
	number of elements following the usual slice rules, so out-of-range
	bounds and empty sets yield an empty range instead of an IndexError.

	:param i: a slice object (its step is ignored).
	:returns: a range from the element at index ``start`` up to and
		including the element at index ``stop - 1``."""
	start, stop, _ = slice(i.start, i.stop).indices(len(self))
	if start >= stop:
		# nothing selected; avoid calling select() with an invalid index
		return RANGE(0, 0)
	return RANGE(
			self.select(start), self.select(stop - 1) + 1)
818 |
def __getitem__(self, i):
	"""Get element with rank `i`, or a slice.

	In the case of a slice, a new roaringbitmap is returned.

	:raises ValueError: for a slice with a step of zero or less.
	:raises TypeError: if ``i`` is neither an integer nor a slice."""
	if isinstance(i, (int, long)):
		return self.select(self._ridx(i))
	if not isinstance(i, slice):
		raise TypeError('Expected integer index or slice object.')
	if i.step is None or i.step == 1:
		return self.intersection(self._slice(i))
	if i.step <= 0:
		raise ValueError
	# i.step > 1 FIXME we could do better
	start, stop, step = i.indices(len(self))
	picked = [self[x] for x in RANGE(start, stop, step)]
	return RoaringBitmap(picked)
836 |
def __delitem__(self, i):
	"""Discard element with rank `i`, or a slice.

	:raises ValueError: for a slice with a step of zero or less.
	:raises TypeError: if ``i`` is neither an integer nor a slice."""
	if isinstance(i, (int, long)):
		self.discard(self.select(self._ridx(i)))
		return
	if not isinstance(i, slice):
		raise TypeError('Expected integer index or slice object.')
	if i.step is None or i.step == 1:
		self.difference_update(self._slice(i))
		return
	if i.step <= 0:
		raise ValueError
	# i.step > 1 FIXME we could do better
	start, stop, step = i.indices(len(self))
	doomed = RoaringBitmap([
			self[x] for x in RANGE(start, stop, step)])
	self.difference_update(doomed)
852 |
def _initrange(self, uint32_t start, uint32_t stop, uint32_t step):
	"""Fill this bitmap with the values of ``range(start, stop, step)``.

	For steps of at least ``1 << 16`` the elements are added one at a
	time; otherwise one block per key is appended and filled with
	``block_initrange``."""
	cdef Block *block = NULL
	cdef uint32_t key, blockstart, blockstop, gap
	cdef uint32_t cursor = start
	cdef uint64_t value
	if step >= (1 << 16):
		value = start
		while value < stop:
			self.add(value)
			value += step
		return
	while True:
		key = highbits(cursor)
		blockstart = lowbits(cursor)
		# this block ends at ``stop`` or at the end of the 16-bit range
		blockstop = min(stop - (key << 16), 1 << 16)
		block = self._insertempty(self.size, key)
		block_initrange(block, blockstart, blockstop, step)
		# advance the cursor by a multiple of step that passes this block
		gap = blockstop - blockstart + step - 1
		cursor += gap - (gap % step)
		if cursor >= stop:
			break
874 |
def _init2pass(self, iterable):
	"""Fill this bitmap from ``iterable``, which is traversed twice.

	The first traversal registers the keys and counts the elements of each
	block; the blocks are then allocated as sparse arrays or dense
	bitmaps, and the second traversal adds the elements."""
	cdef Block *block = NULL
	cdef uint32_t elem
	cdef uint16_t key
	cdef int i, prev = -1
	# gather keys and count elements for each block
	for elem in iterable:
		key = highbits(elem)
		if key != prev:
			i = self._getindex(key)
			if i >= 0:
				block = &(self.data[i])
			else:
				block = self._insertempty(-i - 1, key)
				block.cardinality = 0
				block.capacity = 0
			prev = key
		block.capacity += 1  # NB: wraps to 0 for block with all elements set
	# allocate blocks
	for i in range(self.size):
		block = &(self.data[i])
		if 0 < block.capacity < MAXARRAYLENGTH:
			block.buf.sparse = allocsparse(block.capacity)
			block.state = POSITIVE
		else:  # if necessary, will convert to inverted later
			block.capacity = BITMAPSIZE // sizeof(uint16_t)
			block.buf.dense = allocdense()
			memset(block.buf.dense, 0, BITMAPSIZE)
			block.state = DENSE
	# second pass, add elements for each block
	prev = -1
	for elem in iterable:
		key = highbits(elem)
		if key != prev:
			i = self._getindex(key)
			# the previous block is complete once the key changes
			if prev != -1:
				block_convert(block)
			block = &(self.data[i])
			prev = key
		block_add(block, lowbits(elem))
	# convert the block that was filled last
	if prev != -1:
		block_convert(block)
916 |
def _inititerator(self, iterable):
	"""Fill this bitmap from an arbitrary iterable of integers.

	The elements are first grouped by key into Python sets; afterwards one
	block per key is appended in sorted key order, using the representation
	that fits the number of values of that key."""
	cdef Block *block = NULL
	cdef uint32_t elem
	cdef uint16_t key
	cdef int n
	cdef dict tmp = {}
	cdef list values
	# group the low bits of all elements by their high bits
	for elem in iterable:
		key = highbits(elem)
		tmp.setdefault(key, set()).add(lowbits(elem))
	for key in sorted(tmp):
		values = sorted(tmp[key])
		block = self._insertempty(self.size, key)
		block.cardinality = len(values)
		if block.cardinality < MAXARRAYLENGTH:
			# few values: store them as a sorted array
			block.capacity = block.cardinality
			block.buf.sparse = allocsparse(block.capacity)
			block.state = POSITIVE
			for n, elem in enumerate(values):
				block.buf.sparse[n] = elem
		elif block.cardinality == BLOCKSIZE:
			block_initrange(block, 0, BLOCKSIZE, 1)
		else:
			# many values: set bits in a dense bitmap, then let
			# block_convert pick the final representation
			block.capacity = BITMAPSIZE // sizeof(uint16_t)
			block.buf.dense = allocdense()
			memset(block.buf.dense, 0, BITMAPSIZE)
			block.state = DENSE
			for elem in values:
				SETBIT(block.buf.dense, elem)
			block_convert(block)
949 |
950 | # def _inititerator(self, iterable):
951 | # cdef Block *block = NULL
952 | # cdef uint32_t elem
953 | # cdef uint16_t key
954 | # cdef int i, prev = -1
955 | # for elem in iterable:
956 | # key = highbits(elem)
957 | # if key != prev:
958 | # i = self._getindex(key)
959 | # if i >= 0:
960 | # block = &(self.data[i])
961 | # else:
962 | # block = self._insertempty(-i - 1, key)
963 | # block.state = POSITIVE
964 | # block.cardinality = 0
965 | # block.buf.sparse = allocsparse(INITCAPACITY)
966 | # block.capacity = INITCAPACITY
967 | # prev = key
968 | # block_add(block, lowbits(elem))
969 | # block_convert(block)
970 |
cdef _initarray(self, int k):
	"""Allocate k elements and initialize pointers to zero.

	After extending the arrays, the whole data array (``capacity``
	entries) is overwritten with zero bytes."""
	cdef size_t nbytes
	self._extendarray(k)
	nbytes = self.capacity * sizeof(Block)
	memset(self.data, 0, nbytes)
975 |
cdef _extendarray(self, int k):
	"""Extend allocation with k extra elements + amortization.

	Nothing happens while ``size + k`` is below the current capacity.
	Otherwise the arrays grow to twice the desired number of entries when
	fewer than 1024 blocks are in use, and to 5/4 of it beyond that.

	:raises MemoryError: if one of the arrays cannot be reallocated; the
		arrays that are in place remain valid for the old capacity."""
	cdef size_t desired = self.size + k
	cdef size_t newcapacity
	cdef void *tmp
	if desired < self.capacity:
		return
	newcapacity = 2 * desired if self.size < 1024 else 5 * desired // 4
	# Store each pointer as soon as its realloc succeeds: a successful
	# realloc may already have released the old array, so the old pointer
	# must not be kept when the other call fails.
	tmp = realloc(self.keys, newcapacity * sizeof(uint16_t))
	if tmp is NULL:
		raise MemoryError(newcapacity)
	self.keys = <uint16_t *>tmp
	tmp = realloc(self.data, newcapacity * sizeof(Block))
	if tmp is NULL:
		raise MemoryError(newcapacity)
	self.data = <Block *>tmp
	self.capacity = newcapacity
992 |
cdef _resize(self, int k):
	"""Set size and if necessary reduce array allocation to k elements.

	The arrays are only shrunk when ``k`` exceeds ``INITCAPACITY`` and is
	less than half of the current capacity.

	:raises MemoryError: if one of the arrays cannot be reallocated."""
	cdef void *tmp
	if k > INITCAPACITY and k * 2 < self.capacity:
		# Store each pointer as soon as its realloc succeeds, so that no
		# pointer to a released array is kept when the other call fails.
		tmp = realloc(self.keys, k * sizeof(uint16_t))
		if tmp is NULL:
			raise MemoryError((k, self.size, self.capacity))
		self.keys = <uint16_t *>tmp
		tmp = realloc(self.data, k * sizeof(Block))
		if tmp is NULL:
			# keys now only has room for k entries; record that bound so
			# the capacity never overstates the smaller of the two arrays
			self.capacity = k
			raise MemoryError((k, self.size, self.capacity))
		self.data = <Block *>tmp
		self.capacity = k
	self.size = k
1006 |
cdef _tmpalloc(self, int size, uint16_t **keys, Block **data):
	"""Allocate temporary key and data arrays with ``size`` entries each.

	The data array is zero-initialized (``calloc``); the key array is not.

	:param keys: receives the pointer to the new key array.
	:param data: receives the pointer to the new data array.
	:raises MemoryError: if either allocation fails; in that case nothing
		stays allocated and both output pointers are set to NULL."""
	keys[0] = <uint16_t *>malloc(size * sizeof(uint16_t))
	data[0] = <Block *>calloc(size, sizeof(Block))
	if keys[0] is NULL or data[0] is NULL:
		# release whichever allocation did succeed (free(NULL) is a no-op)
		free(keys[0])
		free(data[0])
		keys[0] = NULL
		data[0] = NULL
		raise MemoryError(size)
1012 |
cdef _replacearrays(self, uint16_t *keys, Block *data, int size):
	"""Free the current key/data arrays and adopt the given ones.

	Only the two arrays themselves are freed here, not the block buffers.
	Afterwards ``_resize`` is called with the new size to truncate."""
	free(self.data)
	free(self.keys)
	self.data = data
	self.keys = keys
	self.size = size
	self._resize(self.size)  # truncate
1020 |
cdef _removeatidx(self, int i):
	"""Remove the i'th element.

	The block's buffer is freed and all later keys and blocks move one
	position towards the front."""
	cdef size_t trailing = self.size - i - 1  # entries after position i
	aligned_free(self.data[i].buf.ptr)
	memmove(&(self.keys[i]), &(self.keys[i + 1]),
			trailing * sizeof(uint16_t))
	memmove(&(self.data[i]), &(self.data[i + 1]),
			trailing * sizeof(Block))
	self.size -= 1
1029 |
cdef Block *_insertempty(self, int i, uint16_t key):
	"""Insert a new, uninitialized block.

	Room is made at position ``i`` by moving later entries one position
	up. Only the key and a NULL buffer pointer are set; all other fields
	of the returned block are left for the caller to initialize."""
	cdef Block *fresh
	self._extendarray(1)
	if i < self.size:
		memmove(&(self.keys[i + 1]), &(self.keys[i]),
				(self.size - i) * sizeof(uint16_t))
		memmove(&(self.data[i + 1]), &(self.data[i]),
				(self.size - i) * sizeof(Block))
	self.size += 1
	self.keys[i] = key
	fresh = &(self.data[i])
	fresh.buf.ptr = NULL
	return fresh
1042 |
cdef _insertcopy(self, int i, uint16_t key, Block *block):
	"""Insert a copy of given block.

	Room is made at position ``i``, the block struct is copied and a new
	buffer is allocated that receives the contents of the original one."""
	cdef size_t nelem = getsize(block)
	cdef Block *target
	self._extendarray(1)
	if i < self.size:
		memmove(&(self.keys[i + 1]), &(self.keys[i]),
				(self.size - i) * sizeof(uint16_t))
		memmove(&(self.data[i + 1]), &(self.data[i]),
				(self.size - i) * sizeof(Block))
	self.keys[i] = key
	target = &(self.data[i])
	target[0] = block[0]
	# the struct copy shares the source buffer; give the copy its own
	if target.state == DENSE:
		target.buf.dense = allocdense()
	elif target.state == POSITIVE or target.state == INVERTED:
		target.buf.sparse = allocsparse(nelem)
		target.capacity = nelem
	memcpy(target.buf.ptr, block.buf.ptr, nelem * sizeof(uint16_t))
	self.size += 1
1062 |
cdef int _getindex(self, uint16_t key):
	"""Return the index of ``key`` in the key array.

	When the key is absent the result is negative, as described for
	``_binarysearch``; for an empty array it is -1."""
	cdef int last
	if self.size == 0:
		return -1
	# Common case of appending in last block:
	last = self.size - 1
	if self.keys[last] == key:
		return last
	return self._binarysearch(0, self.size, key)
1070 |
cdef int _binarysearch(self, int begin, int end, uint16_t key):
	"""Binary search for key in ``self.keys[begin:end]``.

	:returns: positive index ``i`` if ``key`` is found;
		negative value ``i`` if ``elem`` is not found,
		but would fit at ``-i - 1``."""
	cdef int lo = begin, hi = end - 1
	cdef int mid, current
	while lo <= hi:
		mid = (lo + hi) >> 1
		current = self.keys[mid]
		if current == key:
			return mid
		if current < key:
			lo = mid + 1
		else:
			hi = mid - 1
	return -(lo + 1)
1089 |
def _checkconsistency(self):
	"""Verify that arrays are sorted and free of duplicates.

	Also asserts, for every block, that its state is a known one, that its
	cardinality lies in the range allowed for that state and that its
	capacity covers its size."""
	cdef Block b1
	cdef Block *b2
	cdef size_t n, m
	for n in range(self.size):
		assert self.data[n].state in (DENSE, POSITIVE, INVERTED)
		assert 1 <= self.data[n].cardinality < 1 << 16
		assert getsize(&(self.data[n])) <= self.data[n].capacity
		if self.data[n].state == POSITIVE:
			assert 1 <= self.data[n].cardinality < MAXARRAYLENGTH
		elif self.data[n].state == DENSE:
			assert (MAXARRAYLENGTH <= self.data[n].cardinality
					<= BLOCKSIZE - MAXARRAYLENGTH)
		elif self.data[n].state == INVERTED:
			assert (BLOCKSIZE - MAXARRAYLENGTH < self.data[n].cardinality
					< BLOCKSIZE)
		# keys must be strictly increasing
		if n + 1 < self.size:
			assert self.keys[n] < self.keys[n + 1], (
					n, self.keys[n], self.keys[n + 1])
		# array-based blocks must hold strictly increasing values
		if self.data[n].state != DENSE:
			b2 = self._getblk(n, &b1)
			for m in range(getsize(&(self.data[n])) - 1):
				assert b2.buf.sparse[m] < b2.buf.sparse[m + 1], (
						m, b2.buf.sparse[m], b2.buf.sparse[m + 1])
1115 |
cdef inline Block *_getblk(self, int i, Block *tmp) noexcept nogil:
	"""Get pointer to block `i`. If there is an offset, copy this block
	to ``tmp`` and add offset to its pointer, otherwise return block itself.

	An index outside ``0 <= i < size`` prints a message and aborts the
	process.
	"""
	# a bit unelegant, but this makes it possible to use the same code
	# for mutable & immutable variants.
	if not 0 <= i < self.size:
		printf('illegal index %d; size=%d\n', i, self.size)
		abort()
	if self.offset:
		tmp[0] = self.data[i]
		# the stored buffer field is an offset here; turn it into a pointer
		tmp.buf.ptr = <void *>(tmp.buf.offset + self.offset)
		return tmp
	return &(self.data[i])
1130 |
1131 |
cdef inline RoaringBitmap ensurerb(obj):
	"""Convert set-like ``obj`` to RoaringBitmap if necessary.

	An object that already is a RoaringBitmap is returned as is."""
	if not isinstance(obj, RoaringBitmap):
		return RoaringBitmap(obj)
	return obj
1137 |
1138 |
cdef inline uint16_t highbits(uint32_t x) noexcept nogil:
	"""Return the upper 16 bits of ``x``."""
	cdef uint16_t upper = x >> 16
	return upper
1141 |
1142 |
cdef inline uint16_t lowbits(uint32_t x) noexcept nogil:
	"""Return the lower 16 bits of ``x``."""
	cdef uint16_t lower = x & 0xFFFF
	return lower
1145 |
1146 |
cdef inline uint32_t min(uint32_t a, uint32_t b) noexcept nogil:
	"""Return the smaller of two unsigned 32-bit values."""
	if a <= b:
		return a
	return b
1149 |
1150 |
cdef inline uint32_t max(uint32_t a, uint32_t b) noexcept nogil:
	"""Return the larger of two unsigned 32-bit values."""
	if a >= b:
		return a
	return b
1153 |
1154 |
cdef inline int getbufptr(
		object obj, char ** ptr, Py_ssize_t * size, Py_buffer * buf):
	"""Get a pointer from bytes/buffer object ``obj``.

	On success, return 0, and set ``ptr``, ``size``, and possibly ``buf``.
	On failure a non-zero value is returned; ``ptr`` is then NULL and
	``size`` is 0, unless the old-style call already changed them.
	``buf`` is only filled in when the new-style buffer interface is
	used."""
	cdef int result = -1
	ptr[0] = NULL
	size[0] = 0
	if PY2:
		# Although the new-style buffer interface was backported to Python 2.6,
		# some modules, notably mmap, only support the old buffer interface.
		# Cf. http://bugs.python.org/issue9229
		if PyObject_CheckReadBuffer(obj) == 1:
			result = PyObject_AsReadBuffer(
					obj, <const void **>ptr, size)
	elif PyObject_CheckBuffer(obj) == 1:  # new-style Buffer interface
		result = PyObject_GetBuffer(obj, buf, PyBUF_SIMPLE)
		if result == 0:
			ptr[0] = <char *>buf.buf
			size[0] = buf.len
	return result
1176 |
1177 |
cdef inline void releasebuf(Py_buffer *buf):
	"""Release buffer if necessary.

	Under ``PY2`` nothing is done; otherwise ``PyBuffer_Release`` is
	called on ``buf``."""
	if PY2:
		return
	PyBuffer_Release(buf)
1182 |
1183 |
def bitcounttests():
	"""Check ``bit_ctz``, ``bit_clz`` and ``bit_popcount`` against known
	values; return True when every check passes."""
	# count trailing zeros
	for value, expected in (
			(2, 1), (3, 0), (0x80000000, 31), (0x1000, 12),
			(UINT64_MAX, 0)):
		assert bit_ctz(value) == expected
	# count leading zeros
	for value, expected in (
			(1, BITCOUNT_BITS - 1), (4, BITCOUNT_BITS - 3),
			(0x80000000, BITCOUNT_BITS - 32),
			(0x1000, BITCOUNT_BITS - 13), (UINT64_MAX, 0)):
		assert bit_clz(value) == expected
	# population count
	for value, expected in (
			(0x1, 1), (0x10, 1), (0x101001, 3), (3, 2),
			(UINT64_MAX, BITCOUNT_BITS), (0, 0)):
		assert bit_popcount(value) == expected
	return True
1202 |
1203 |
def aligned_malloc_tests():
	"""Check that ``aligned_malloc`` returns usable memory that
	``aligned_free`` accepts; return True on success."""
	cdef void *ptr = NULL
	ptr = aligned_malloc(1024, sizeof(void *))
	assert ptr is not NULL
	# write through a typed pointer; a ``void *`` cannot be indexed
	(<int *>ptr)[0] = 1234
	aligned_free(ptr)
	return True
1211 |
1212 |
def mmaptests():
	"""Check that ``getbufptr`` yields a writable pointer into an anonymous
	mmap; return True on success.

	:raises ValueError: if no buffer can be obtained from the mmap."""
	cdef Py_buffer buffer
	cdef Py_ssize_t size = 0
	cdef char *ptr = NULL
	cdef uint32_t *uptr
	cdef int result

	alignment = 32
	alloc = sizeof(uint32_t) + 8 * sizeof(uint32_t)
	extra = alignment - alloc % alignment
	alloc += extra + 1024

	ob = mmap.mmap(-1, alloc, access=mmap.ACCESS_WRITE)
	result = getbufptr(ob, &ptr, &size, &buffer)
	if result != 0:
		ob.close()
		raise ValueError('could not get buffer from mmap.')
	try:
		uptr = <uint32_t *>ptr
		uptr[0] = 1234
	finally:
		# give the buffer back before closing the mapping it points into
		releasebuf(&buffer)
		ob.close()
	return True
1232 |
1233 |
1234 | __all__ = ['RoaringBitmap', 'ImmutableRoaringBitmap', 'MultiRoaringBitmap']
1235 |
--------------------------------------------------------------------------------
/tests/benchmarks.py:
--------------------------------------------------------------------------------
1 | """Benchmarks for roaringbitmap"""
2 | from __future__ import division, print_function, absolute_import, \
3 | unicode_literals
4 | import random
5 | import timeit
6 |
7 | N = 1 << 17 # number of random elements
8 | M = 100 # number of test runs
9 | MAX = 1 << 20 # range of elements
10 | DATA1, DATA2 = None, None
11 |
12 |
def pair():
	"""Return two lists of ``N`` pseudo-random integers from ``0..MAX``.

	The generator is seeded with 42. The second list starts with the first
	half of the first list, followed by ``N // 2`` newly drawn values."""
	random.seed(42)
	data1 = []
	for _ in range(N):
		data1.append(random.randint(0, MAX))
	data2 = list(data1[:len(data1) // 2])
	for _ in range(N // 2):
		data2.append(random.randint(0, MAX))
	return data1, data2
19 |
20 |
def bench_init():
	"""Time construction from ``DATA1`` for ``set`` and ``RoaringBitmap``.

	:returns: tuple ``(set_time, roaringbitmap_time)`` for ``M`` runs."""
	set_timer = timeit.Timer('set(DATA1)',
			setup='from __main__ import DATA1')
	rb_timer = timeit.Timer('rb = RoaringBitmap(DATA1)',
			setup='from __main__ import DATA1; '
				'from roaringbitmap import RoaringBitmap; ')
	return set_timer.timeit(number=M), rb_timer.timeit(number=M)
29 |
30 |
def bench_initsort():
	"""Time construction from sorted ``DATA1`` for ``set`` and
	``RoaringBitmap``; sorting happens in the untimed setup.

	:returns: tuple ``(set_time, roaringbitmap_time)`` for ``M`` runs."""
	set_timer = timeit.Timer('set(data)',
			setup='from __main__ import DATA1; '
				'data = sorted(DATA1)')
	rb_timer = timeit.Timer('rb = RoaringBitmap(data)',
			setup='from __main__ import DATA1; '
				'from roaringbitmap import RoaringBitmap; '
				'data = sorted(DATA1)')
	return set_timer.timeit(number=M), rb_timer.timeit(number=M)
41 |
42 |
def bench_eq():
	"""Time ``==`` on two equal operands, both built from ``DATA1``.

	:returns: tuple ``(set_time, roaringbitmap_time)`` for ``M`` runs."""
	# benchmark equality with equal operands
	set_timer = timeit.Timer('ref == ref2',
			setup='from __main__ import DATA1; '
				'ref = set(DATA1); ref2 = set(DATA1)')
	rb_timer = timeit.Timer('rb == rb2',
			setup='from __main__ import DATA1; '
				'from roaringbitmap import RoaringBitmap; '
				'rb = RoaringBitmap(DATA1); '
				'rb2 = RoaringBitmap(DATA1)')
	return set_timer.timeit(number=M), rb_timer.timeit(number=M)
54 |
55 |
56 | def bench_neq():
57 | # benchmark non-equality with non-equal operands
58 | a = timeit.Timer('ref != ref2',
59 | setup='from __main__ import DATA1, DATA2; '
60 | 'ref = set(DATA1); ref2 = set(DATA2)').timeit(number=M)
61 | b = timeit.Timer('rb != rb2',
62 | setup='from __main__ import DATA1, DATA2; '
63 | 'from roaringbitmap import RoaringBitmap; '
64 | 'rb = RoaringBitmap(DATA1); '
65 | 'rb2 = RoaringBitmap(DATA2)').timeit(number=M)
66 | return a, b
67 |
68 |
69 | def bench_and():
70 | a = timeit.Timer('ref & ref2',
71 | setup='from __main__ import DATA1, DATA2; '
72 | 'ref = set(DATA1); ref2 = set(DATA2)').timeit(number=M)
73 | b = timeit.Timer('rb & rb2',
74 | setup='from __main__ import DATA1, DATA2; '
75 | 'from roaringbitmap import RoaringBitmap; '
76 | 'rb = RoaringBitmap(DATA1); '
77 | 'rb2 = RoaringBitmap(DATA2)').timeit(number=M)
78 | return a, b
79 |
80 |
81 | def bench_or():
82 | a = timeit.Timer('ref | ref2',
83 | setup='from __main__ import DATA1, DATA2; '
84 | 'ref = set(DATA1); ref2 = set(DATA2)').timeit(number=M)
85 | b = timeit.Timer('rb | rb2',
86 | setup='from __main__ import DATA1, DATA2; '
87 | 'from roaringbitmap import RoaringBitmap; '
88 | 'rb = RoaringBitmap(DATA1); '
89 | 'rb2 = RoaringBitmap(DATA2)').timeit(number=M)
90 | return a, b
91 |
92 |
93 | def bench_xor():
94 | a = timeit.Timer('ref ^ ref2',
95 | setup='from __main__ import DATA1, DATA2; '
96 | 'ref = set(DATA1); ref2 = set(DATA2)').timeit(number=M)
97 | b = timeit.Timer('rb ^ rb2',
98 | setup='from __main__ import DATA1, DATA2; '
99 | 'from roaringbitmap import RoaringBitmap; '
100 | 'rb = RoaringBitmap(DATA1); '
101 | 'rb2 = RoaringBitmap(DATA2)').timeit(number=M)
102 | return a, b
103 |
104 |
105 | def bench_sub():
106 | a = timeit.Timer('ref - ref2',
107 | setup='from __main__ import DATA1, DATA2; '
108 | 'ref = set(DATA1); ref2 = set(DATA2)').timeit(number=M)
109 | b = timeit.Timer('rb - rb2',
110 | setup='from __main__ import DATA1, DATA2; '
111 | 'from roaringbitmap import RoaringBitmap; '
112 | 'rb = RoaringBitmap(DATA1); '
113 | 'rb2 = RoaringBitmap(DATA2)').timeit(number=M)
114 | return a, b
115 |
116 |
117 | def bench_iand():
118 | aa = [timeit.Timer('ref &= ref2',
119 | setup='from __main__ import DATA1, DATA2; '
120 | 'ref = set(DATA1); ref2 = set(DATA2)').timeit(number=1)
121 | for _ in range(M)]
122 | bb = [timeit.Timer('rb &= rb2',
123 | setup='from __main__ import DATA1, DATA2; '
124 | 'from roaringbitmap import RoaringBitmap; '
125 | 'rb = RoaringBitmap(DATA1); '
126 | 'rb2 = RoaringBitmap(DATA2)').timeit(number=1)
127 | for _ in range(M)]
128 | return sum(aa) / M, sum(bb) / M
129 |
130 |
131 | def bench_ior():
132 | aa = [timeit.Timer('ref |= ref2',
133 | setup='from __main__ import DATA1, DATA2; '
134 | 'ref = set(DATA1); ref2 = set(DATA2)').timeit(number=1)
135 | for _ in range(M)]
136 | bb = [timeit.Timer('rb |= rb2',
137 | setup='from __main__ import DATA1, DATA2; '
138 | 'from roaringbitmap import RoaringBitmap; '
139 | 'rb = RoaringBitmap(DATA1); '
140 | 'rb2 = RoaringBitmap(DATA2)').timeit(number=1)
141 | for _ in range(M)]
142 | return sum(aa) / M, sum(bb) / M
143 |
144 |
145 | def bench_ixor():
146 | aa = [timeit.Timer('ref ^= ref2',
147 | setup='from __main__ import DATA1, DATA2; '
148 | 'ref = set(DATA1); ref2 = set(DATA2)').timeit(number=1)
149 | for _ in range(M)]
150 | bb = [timeit.Timer('rb ^= rb2',
151 | setup='from __main__ import DATA1, DATA2; '
152 | 'from roaringbitmap import RoaringBitmap; '
153 | 'rb = RoaringBitmap(DATA1); '
154 | 'rb2 = RoaringBitmap(DATA2)').timeit(number=1)
155 | for _ in range(M)]
156 | return sum(aa) / M, sum(bb) / M
157 |
158 |
159 | def bench_isub():
160 | aa = [timeit.Timer('ref -= ref2',
161 | setup='from __main__ import DATA1, DATA2; '
162 | 'ref = set(DATA1); ref2 = set(DATA2)').timeit(number=1)
163 | for _ in range(M)]
164 | bb = [timeit.Timer('rb -= rb2',
165 | setup='from __main__ import DATA1, DATA2; '
166 | 'from roaringbitmap import RoaringBitmap; '
167 | 'rb = RoaringBitmap(DATA1); '
168 | 'rb2 = RoaringBitmap(DATA2)').timeit(number=1)
169 | for _ in range(M)]
170 | return sum(aa) / M, sum(bb) / M
171 |
172 |
173 | def bench_andlen():
174 | a = timeit.Timer('len(ref & ref2)',
175 | setup='from __main__ import DATA1, DATA2; '
176 | 'ref = set(DATA1); ref2 = set(DATA2)').timeit(number=M)
177 | b = timeit.Timer('rb.intersection_len(rb2)',
178 | setup='from __main__ import DATA1, DATA2; '
179 | 'from roaringbitmap import RoaringBitmap; '
180 | 'rb = RoaringBitmap(DATA1); '
181 | 'rb2 = RoaringBitmap(DATA2)').timeit(number=M)
182 | return a, b
183 |
184 |
185 | def bench_orlen():
186 | a = timeit.Timer('len(ref | ref2)',
187 | setup='from __main__ import DATA1, DATA2; '
188 | 'ref = set(DATA1); ref2 = set(DATA2)').timeit(number=M)
189 | b = timeit.Timer('rb.union_len(rb2)',
190 | setup='from __main__ import DATA1, DATA2; '
191 | 'from roaringbitmap import RoaringBitmap; '
192 | 'rb = RoaringBitmap(DATA1); '
193 | 'rb2 = RoaringBitmap(DATA2)').timeit(number=M)
194 | return a, b
195 |
196 |
197 | def bench_jaccard():
198 | a = timeit.Timer('1 - (len(ref & ref2) / len(ref | ref2))',
199 | setup='from __main__ import DATA1, DATA2; '
200 | 'ref = set(DATA1); ref2 = set(DATA2)').timeit(number=M)
201 | b = timeit.Timer('rb.jaccard_dist(rb2)',
202 | setup='from __main__ import DATA1, DATA2; '
203 | 'from roaringbitmap import RoaringBitmap; '
204 | 'rb = RoaringBitmap(DATA1); '
205 | 'rb2 = RoaringBitmap(DATA2)').timeit(number=M)
206 | return a, b
207 |
208 |
def main():
    """Run every benchmark on several data distributions and print a table.

    For each scenario the module globals N and MAX are set, DATA1 and DATA2
    are regenerated with pair(), and for every benchmark the time for set(),
    the time for RoaringBitmap() and their ratio (set / bitmap) are printed.
    """
    global N, MAX, DATA1, DATA2
    # (label, number of random elements, largest element value)
    scenarios = (
        ('small sparse set', 200, 40000),  # positive blocks
        ('medium load factor', 59392, 118784),  # bitmap blocks
        ('dense set / high load factor', 40000 - 200, 40000),  # inverted blocks
        # Deliberately not run: 'large sparse set' (N = 1 << 12,
        # MAX = 1 << 31) produces a large number of small blocks; don't use
        # RoaringBitmap for this case.
    )
    benchmarks = (
        bench_init, bench_initsort,
        bench_and, bench_or, bench_xor, bench_sub,
        bench_iand, bench_ior, bench_ixor, bench_isub,
        bench_eq, bench_neq,
        # bench_andlen, bench_orlen,
        bench_jaccard)
    fmt = '%12s %8s %16s %8s'
    numfmt = '%8.3g'
    for label, count, upper in scenarios:
        print(label)
        N, MAX = count, upper
        DATA1, DATA2 = pair()

        # pair() draws with randint(0, MAX), so MAX itself can occur.
        print('%d runs with sets of %d random elements n s.t. 0 <= n <= %d' % (
                M, N, MAX))
        print(fmt % ('', 'set()', 'RoaringBitmap()', 'ratio'))
        for func in benchmarks:
            a, b = func()
            ratio = a / b
            print(fmt % (func.__name__.split('_', 1)[1].ljust(12),
                    numfmt % a, numfmt % b,
                    # large ratios are shown as plain integers
                    (numfmt % ratio) if ratio < 100 else int(ratio)))
        print()
247 |
248 |
# The timeit setup strings import DATA1/DATA2 from ``__main__``, so the
# benchmarks only work when this file is executed directly as a script.
if __name__ == '__main__':
    main()
251 |
--------------------------------------------------------------------------------
/tests/unittests.py:
--------------------------------------------------------------------------------
1 | """Unit tests for roaringbitmap"""
2 | from __future__ import division, absolute_import, unicode_literals
3 | import sys
4 | import array
5 | import pytest
6 | import pickle
7 | import tempfile
8 | from random import seed, choice, sample, randint
9 | try:
10 | import faulthandler
11 | faulthandler.enable()
12 | except ImportError:
13 | pass
14 | from roaringbitmap import (RoaringBitmap, ImmutableRoaringBitmap,
15 | MultiRoaringBitmap, bitcounttests, aligned_malloc_tests, mmaptests)
# Python 2 compatibility: rebind ``range`` to ``xrange`` and import
# ``izip_longest`` under its Python 3 name ``zip_longest``.
PY2 = sys.version_info[0] == 2
if PY2:
    range = xrange
    from itertools import izip_longest as zip_longest
else:
    from itertools import zip_longest
# (name, numitems, maxnum): one entry per dataset generated by _single().
# Values are drawn with randint(0, maxnum).  For 'inverted', the drawn values
# are *removed* from the full range [0, 1 << 16) instead of being kept.
PARAMS = [
        ('empty', 0, (1 << 16) - 1),
        ('positive', 200, (1 << 16) - 1),
        ('dense', 5000, (1 << 16) - 1),
        ('inverted', 4000, (1 << 16) - 1),
        ('many keys', 4000, (1 << 25) - 1)
        ]
31 |
32 |
33 | def _single():
34 | seed(42)
35 | result = []
36 | for name, elements, maxnum in PARAMS:
37 | if name == 'inverted':
38 | result.append((name, list(set(range(1 << 16))
39 | - {randint(0, maxnum) for _ in range(elements)})))
40 | else:
41 | result.append((name, sorted(
42 | randint(0, maxnum) for _ in range(elements))))
43 | return result
44 |
45 |
@pytest.fixture(scope='module')
def single():
    """Module-scoped fixture returning the (name, data) list from _single()."""
    return _single()
49 |
50 |
@pytest.fixture(scope='module')
def pair():
    """Module-scoped fixture with every ordered pairing of the datasets.

    Each item is ``(name1 + ':' + name2, a, b)``.  Unless the second dataset
    is the 'empty' one, ``b`` is replaced by the sorted concatenation of its
    own first half and the second half of ``a``, so the operands overlap.
    """
    combos = []
    for left_name, left in _single():
        for right_name, right in _single():
            if right_name == 'empty':
                other = right
            else:
                own_half = right[:len(right) // 2]
                borrowed = left[len(left) // 2:]
                other = sorted(own_half + borrowed)
            label = left_name + ':' + right_name
            combos.append((label, left, other))
    return combos
60 |
61 |
@pytest.fixture(scope='module')
def multi():
    """Module-scoped fixture with 100 sorted lists that share common values.

    One base sample is drawn first; each of the 100 lists is a fresh sample
    plus that base sample, sorted.  A sample consists of randint(100, 2000)
    values, each drawn with randint(0, 2000).  The generator is not reseeded
    here.
    """
    def draw():
        # the size is drawn before the values, as in a comprehension over
        # range(randint(...))
        size = randint(100, 2000)
        return [randint(0, 2000) for _ in range(size)]

    base = sorted(draw())
    datasets = []
    for _ in range(100):
        datasets.append(sorted(draw() + base))
    return datasets
70 |
71 |
def abbr(a):
    """Return the first and last 500 characters of ``a`` joined by '...'."""
    head = a[:500]
    tail = a[-500:]
    return '...'.join((head, tail))
74 |
75 |
def test_fixtures(single):
    """Check that every generated dataset yields the kind of bitmap its name
    promises: no elements for 'empty', more than 100 keys for 'many keys',
    and otherwise the capitalised first letter of the name somewhere in
    debuginfo()."""
    for name, data in single:
        bitmap = RoaringBitmap(data)
        if name == 'empty':
            assert len(bitmap) == 0
            continue
        if name == 'many keys':
            assert len(bitmap._keys()) > 100
            continue
        assert name[0].upper() in bitmap.debuginfo()
85 |
86 |
def test_bitcount():
    """Run bitcounttests(), imported from roaringbitmap; it must return a
    true value."""
    assert bitcounttests()
89 |
90 |
def test_aligned_malloc():
    """Run aligned_malloc_tests(), imported from roaringbitmap; it must
    return a true value."""
    assert aligned_malloc_tests()
93 |
94 |
def test_mmap():
    """Run mmaptests(), imported from roaringbitmap; it must return a true
    value."""
    assert mmaptests()
97 |
98 |
class Test_roaringbitmap(object):
    """Tests for the mutable RoaringBitmap, using builtin set as reference."""

    def test_inittrivial(self):
        """A tiny list round-trips into an equal bitmap."""
        data = list(range(5))
        ref = set(data)
        rb = RoaringBitmap(data)
        rb._checkconsistency()
        assert ref == rb

    def test_initsorted(self, single):
        """Construction from sorted input."""
        for name, data in single:
            ref = set(sorted(data))
            rb = RoaringBitmap(sorted(data))
            rb._checkconsistency()
            assert ref == rb, name

        
    def test_initunsorted(self, single):
        """Construction from the datasets as generated."""
        for name, data in single:
            ref = set(data)
            rb = RoaringBitmap(data)
            rb._checkconsistency()
            assert ref == rb, name

    def test_inititerator(self, single):
        """Construction from a generator instead of a sequence."""
        for name, data in single:
            ref = set(a for a in data)
            rb = RoaringBitmap(a for a in data)
            rb._checkconsistency()
            assert ref == rb, name

    def test_initrange(self):
        """Construction from a contiguous range object."""
        # creates a positive, dense, and inverted block, respectively
        for n in [400, 6000, 61241]:
            ref = set(range(23, n))
            rb = RoaringBitmap(range(23, n))
            rb._checkconsistency()
            assert ref == rb, ('range(23, %d)' % n)

    def test_initrangestep(self):
        """Construction from range objects with a step."""
        # creates a positive, dense, and inverted block, respectively
        for n in [400, 6000, 61241]:
            for step in (2, 7, 113):
                ref = set(range(23, n * step, step))
                rb = RoaringBitmap(range(23, n * step, step))
                rb._checkconsistency()
                # report the stop value that was actually used
                assert ref == rb, ('range(23, %d, %d)' % (n * step, step))
        # a step larger than 1 << 16
        n = 100 * (1 << 16)
        step = (1 << 16) + 7
        ref = set(range(23, n, step))
        rb = RoaringBitmap(range(23, n, step))
        rb._checkconsistency()
        assert ref == rb, ('range(23, %d, %d)' % (n, step))

    def test_inititerableallset(self):
        """A list covering all of 0..0xffff keeps every element."""
        rb = RoaringBitmap(list(range(0, 0xffff + 1)))
        assert len(rb) == 0xffff + 1

    def test_add(self, single):
        """Element-wise add(), including out-of-range and boundary values."""
        for name, data in single:
            ref = set()
            rb = RoaringBitmap()
            for n in sorted(data):
                ref.add(n)
                rb.add(n)
            assert rb == ref, name
            # Each out-of-range value needs its own context manager: the
            # first exception leaves the ``with`` block, so a second call in
            # the same block would never be executed.
            with pytest.raises(OverflowError):
                rb.add(-1)
            with pytest.raises(OverflowError):
                rb.add(1 << 32)
            # smallest and largest accepted values
            rb.add(0)
            rb.add((1 << 32) - 1)
            rb._checkconsistency()

    def test_discard(self, single):
        """Discarding every added element leaves an empty bitmap."""
        for name, data in single:
            ref = set()
            rb = RoaringBitmap()
            for n in sorted(data):
                ref.add(n)
                rb.add(n)
            for n in sorted(data):
                ref.discard(n)
                rb.discard(n)
            rb._checkconsistency()
            assert len(ref) == 0, name
            assert len(rb) == 0, name
            assert rb == ref, name

    def test_pop(self):
        """pop() until empty, checking consistency after every removal."""
        rb = RoaringBitmap([60748, 28806, 54664, 28597, 58922, 75684, 56364,
            67421, 52608, 55686, 10427, 48506, 64363, 14506, 73077, 59035,
            70246, 19875, 73145, 40225, 58664, 6597, 65554, 73102, 26636,
            74227, 59566, 19023])
        while rb:
            rb.pop()
            rb._checkconsistency()
        assert len(rb) == 0

    def test_contains(self, single):
        """Membership agrees with set for present and absent values."""
        for name, data in single:
            ref = set(data)
            rb = RoaringBitmap(data)
            for a in data:
                assert a in ref, name
                assert a in rb, name
            for a in set(range(20000)) - set(data):
                assert a not in ref, name
                assert a not in rb, name
            rb._checkconsistency()

    def test_eq(self, single):
        """``==`` on two bitmaps built from the same data."""
        for name, data in single:
            ref, ref2 = set(data), set(data)
            rb, rb2 = RoaringBitmap(data), RoaringBitmap(data)
            assert (ref == ref2) == (rb == rb2), name

    def test_neq(self, pair):
        """``!=`` agrees with set on every dataset pairing."""
        for name, data1, data2 in pair:
            ref, ref2 = set(data1), set(data2)
            rb, rb2 = RoaringBitmap(data1), RoaringBitmap(data2)
            assert (ref != ref2) == (rb != rb2), name

    def test_iter(self, single):
        """Iteration yields the distinct elements in ascending order."""
        for name, data in single:
            rb = RoaringBitmap(data)
            assert list(iter(rb)) == sorted(set(data)), name

    def test_reversed(self, single):
        """reversed() yields the distinct elements in descending order."""
        for name, data in single:
            rb = RoaringBitmap(data)
            # zip_longest pads with None, so a length mismatch fails too
            for a, b in zip_longest(reversed(rb), reversed(sorted(set(data)))):
                assert a == b, name

    def test_iand(self, pair):
        """In-place intersection (``&=``)."""
        for name, data1, data2 in pair:
            ref, ref2 = set(data1), set(data2)
            rb, rb2 = RoaringBitmap(data1), RoaringBitmap(data2)
            ref &= ref2
            rb &= rb2
            rb._checkconsistency()
            assert rb == ref, name

    def test_ior(self, pair):
        """In-place union (``|=``)."""
        for name, data1, data2 in pair:
            ref, ref2 = set(data1), set(data2)
            rb, rb2 = RoaringBitmap(data1), RoaringBitmap(data2)
            ref |= ref2
            rb |= rb2
            rb._checkconsistency()
            assert rb == ref, name

    def test_ixor(self, pair):
        """In-place symmetric difference (``^=``)."""
        for name, data1, data2 in pair:
            ref, ref2 = set(data1), set(data2)
            rb, rb2 = RoaringBitmap(data1), RoaringBitmap(data2)
            ref ^= ref2
            rb ^= rb2
            rb._checkconsistency()
            assert len(ref) == len(rb), name
            assert ref == rb, name

    def test_isub(self, pair):
        """In-place difference (``-=``)."""
        for name, data1, data2 in pair:
            ref, ref2 = set(data1), set(data2)
            rb, rb2 = RoaringBitmap(data1), RoaringBitmap(data2)
            ref -= ref2
            rb -= rb2
            rb._checkconsistency()
            assert len(ref) <= len(set(data1))
            assert len(rb) <= len(set(data1)), name
            assert len(ref) == len(rb), name
            assert ref == rb, name

    def test_and(self, pair):
        """Binary ``&``."""
        for name, data1, data2 in pair:
            ref, ref2 = set(data1), set(data2)
            rb, rb2 = RoaringBitmap(data1), RoaringBitmap(data2)
            assert ref & ref2 == set(rb & rb2), name

    def test_or(self, pair):
        """Binary ``|``."""
        for name, data1, data2 in pair:
            ref, ref2 = set(data1), set(data2)
            rb, rb2 = RoaringBitmap(data1), RoaringBitmap(data2)
            assert ref | ref2 == set(rb | rb2), name

    def test_xor(self, pair):
        """Binary ``^``."""
        for name, data1, data2 in pair:
            ref, ref2 = set(data1), set(data2)
            rb, rb2 = RoaringBitmap(data1), RoaringBitmap(data2)
            assert ref ^ ref2 == set(rb ^ rb2), name

    def test_sub(self, pair):
        """Binary ``-``."""
        for name, data1, data2 in pair:
            ref, ref2 = set(data1), set(data2)
            rb, rb2 = RoaringBitmap(data1), RoaringBitmap(data2)
            assert ref - ref2 == set(rb - rb2), name

    def test_subset(self, pair):
        """``<=`` for arbitrary pairs and for a prefix of the right operand."""
        for name, data1, data2 in pair:
            ref, ref2 = set(data1), set(data2)
            rb, rb2 = RoaringBitmap(data1), RoaringBitmap(data2)
            refans = ref <= ref2
            assert (set(rb) <= ref2) == refans, name
            assert (rb <= rb2) == refans, name
            # second round: left operand is the first half of data2
            k = len(data2) // 2
            ref, rb = set(data2[:k]), RoaringBitmap(data2[:k])
            refans = ref <= ref2
            assert (set(rb) <= ref2) == refans, name
            assert (ref <= set(rb2)) == refans, name
            assert (rb <= rb2) == refans, (name, rb.debuginfo())

    def test_disjoint(self, pair):
        """isdisjoint() for overlapping and for constructed disjoint pairs."""
        for name, data1, data2 in pair:
            ref, ref2 = set(data1), set(data2)
            rb, rb2 = RoaringBitmap(data1), RoaringBitmap(data2)
            refans = ref.isdisjoint(ref2)
            assert rb.isdisjoint(rb2) == refans, name
            # data3 keeps only the elements of data2 that are not in data1
            data3 = [a for a in data2 if a not in ref]
            ref3, rb3 = set(data3), RoaringBitmap(data3)
            refans2 = ref.isdisjoint(ref3)
            assert rb.isdisjoint(rb3) == refans2, name

    def test_clamp(self, single):
        """clamp(a, b) equals intersection with range(a, b)."""
        for name, data in single:
            if len(data) == 0:
                continue
            a, b = sorted(sample(data, 2))
            ref = set(data).intersection(range(a, b))
            rb = RoaringBitmap(data).intersection(range(a, b))
            rb2 = RoaringBitmap(data).clamp(a, b)
            assert a <= rb2.min() and rb2.max() < b, name
            assert ref == rb2, (name, a, b)
            assert rb == rb2, (name, a, b)

    def test_clamp_issue12(self):
        """Regression test for issue 12: stop values around 1 << 16."""
        b = RoaringBitmap([1, 2, 3])
        assert b.clamp(0, 65536) == b
        assert b.clamp(0, 65537) == b
        assert b.clamp(0, 65538) == b
        assert b.clamp(0, 65539) == b

    def test_clamp2(self):
        """clamp() boundaries on elements spread over several high words."""
        a = RoaringBitmap([0x00010001])
        b = RoaringBitmap([0x00030003, 0x00050005])
        c = RoaringBitmap([0x00070007])
        x = a | b | c
        assert x.clamp(0, 0x000FFFFF) == x
        assert x.clamp(0x000200FF, 0x000FFFFF) == b | c
        assert x.clamp(0x00030003, 0x000FFFFF) == b | c
        assert x.clamp(0, 0x00060006) == a | b
        assert x.clamp(0, 0x00050006) == a | b
        assert x.clamp(0, 0x00050005) == a | RoaringBitmap([0x00030003])

    def test_aggregateand(self, multi):
        """intersection_update() with many operands at once."""
        ref = set(multi[0])
        ref.intersection_update(*[set(a) for a in multi[1:]])
        rb = RoaringBitmap(multi[0])
        rb.intersection_update(*[RoaringBitmap(a) for a in multi[1:]])
        rb._checkconsistency()
        assert rb == ref

    def test_aggregateor(self, multi):
        """update() with many operands at once."""
        ref = set(multi[0])
        ref.update(*[set(a) for a in multi[1:]])
        rb = RoaringBitmap(multi[0])
        rb.update(*[RoaringBitmap(a) for a in multi[1:]])
        rb._checkconsistency()
        assert rb == ref

    def test_andlen(self, pair):
        """intersection_len() equals the length of the intersection."""
        for name, data1, data2 in pair:
            ref, ref2 = set(data1), set(data2)
            rb, rb2 = RoaringBitmap(data1), RoaringBitmap(data2)
            assert len(rb & rb2) == rb.intersection_len(rb2), name
            assert len(ref & ref2) == rb.intersection_len(rb2), name

    def test_orlen(self, pair):
        """union_len() equals the length of the union."""
        for name, data1, data2 in pair:
            ref, ref2 = set(data1), set(data2)
            rb, rb2 = RoaringBitmap(data1), RoaringBitmap(data2)
            assert len(ref | ref2) == rb.union_len(rb2), name
            assert len(rb | rb2) == rb.union_len(rb2), name

    def test_jaccard_dist(self, pair):
        """jaccard_dist() equals 1 - |a & b| / |a | b|."""
        for name, data1, data2 in pair:
            # skipped: the reference formula would divide by zero
            if len(data1) == 0 and len(data2) == 0:
                continue
            ref, ref2 = set(data1), set(data2)
            rb, rb2 = RoaringBitmap(data1), RoaringBitmap(data2)
            assert len(ref & ref2) / float(len(ref | ref2)) == pytest.approx(
                    rb.intersection_len(rb2) / float(rb.union_len(rb2))), name
            assert (1 - (len(ref & ref2) / float(len(ref | ref2)))
                    == pytest.approx(rb.jaccard_dist(rb2))), name

    def test_rank(self, single):
        """rank(x) is the 1-based position of x among the sorted elements."""
        for name, data in single:
            if len(data) == 0:
                continue
            ref = sorted(set(data))
            rb = RoaringBitmap(data)
            for _ in range(10):
                x = choice(ref)
                assert x in rb, name
                assert rb.rank(x) == ref.index(x) + 1, name

    def test_select(self, single):
        """select(i) and rb[i] return the i-th smallest element."""
        for name, data in single:
            if len(data) == 0:
                continue
            ref = sorted(set(data))
            rb = RoaringBitmap(data)
            lrb = list(rb)
            idx = [randint(0, len(ref) - 1) for _ in range(10)]
            for i in idx:
                assert lrb[i] == ref[i], (name, i, len(ref))
                assert rb.select(i) in rb, name
                assert rb.select(i) == ref[i], name
                assert rb[i] == ref[i], name
                assert rb.rank(rb.select(i)) - 1 == i, name
                # rank of the successor value depends on whether it is present
                if rb.select(i) + 1 in rb:
                    assert rb.rank(rb.select(i) + 1) - 1 == i + 1, name
                else:
                    assert rb.rank(rb.select(i) + 1) - 1 == i, name

    def test_rank2(self):
        """rank() for every value, present or not, over two stepped ranges."""
        rb = RoaringBitmap(range(0, 100000, 7))
        rb.update(range(100000, 200000, 1000))
        for k in range(100000):
            assert rb.rank(k) == 1 + k // 7
        for k in range(100000, 200000):
            assert rb.rank(k) == 1 + 100000 // 7 + 1 + (k - 100000) // 1000

    def test_select2(self):
        """select() over evenly spaced elements with gaps 1, 2, ..., 1024."""
        gap = 1
        while gap <= 1024:
            rb = RoaringBitmap(range(0, 100000, gap))
            for k in range(0, 100000 // gap):
                assert rb.select(k) == k * gap
            gap *= 2

    def test_select_issue15(self):
        """Regression test for issue 15: indexing into large ranges."""
        rb = RoaringBitmap(range(0x10000, 0x1ffff + 1))
        assert rb[0] == 0x10000
        rb.discard(0x10010)
        assert rb[0] == 0x10000
        rb = RoaringBitmap(range(0x10, 0xffff + 1))
        assert rb[0] == 0x10, rb.debuginfo(True)
        rb = RoaringBitmap(range(0x10010, 0x1ffff + 1))
        assert rb[0] == 0x10010, rb.debuginfo(True)
        lst = list(range(1, 0xccbb))
        lst.extend(range(0xcccc, 0xfffc))
        rb = RoaringBitmap(lst)
        for n in (0, 0xcccc, -1):
            assert lst[n] == rb[n], (n, lst[n], rb[n])

    def test_pickle(self, single):
        """A pickle round-trip yields an equal, consistent bitmap."""
        for name, data in single:
            rb = RoaringBitmap(data)
            rb_pickled = pickle.dumps(rb, protocol=-1)
            rb_unpickled = pickle.loads(rb_pickled)
            rb._checkconsistency()
            # the reconstructed object is the one that needs verifying
            rb_unpickled._checkconsistency()
            assert rb_unpickled == rb, name

    def test_invalid(self):
        """Non-integer elements and comparison with a list raise TypeError."""
        with pytest.raises(TypeError):
            RoaringBitmap([1, 2, 'a'])
        with pytest.raises(TypeError):
            RoaringBitmap([1, 2]) < [1, 2, 3]

    def test_slices(self):  # issue 20
        """Slicing with a positive step; negative and zero steps raise."""
        ref = list(range(10))
        rb = RoaringBitmap(range(10))
        assert list(rb[::2]) == ref[::2]
        with pytest.raises(ValueError):
            _ = rb[::-2]
        with pytest.raises(ValueError):
            _ = rb[::0]
        del rb[::2]
        del ref[::2]
        assert list(rb) == ref

    def test_minmax(self):
        """min(), max() and indexing on ranges of 61440 and 61441 elements."""
        rb = RoaringBitmap(range(0, 61440))
        assert rb.min() == 0
        assert rb.max() == 61439
        rb1 = RoaringBitmap(range(0, 61441))
        assert rb1.min() == 0
        assert rb1.max() == 61440
        assert rb1[61440] == 61440
        assert list(rb1)[61440] == 61440

    def test_issue19(self):
        """Regression test for issue 19: difference after repeated union."""
        a = RoaringBitmap()
        b = RoaringBitmap(range(4095))
        c = RoaringBitmap(range(2))
        a |= b
        a |= c
        assert len(a - b - c) == 0
        assert len((b | c) - b - c) == 0

    def test_issue22(self):
        """Regression test for issue 22: operations of a bitmap with itself."""
        rb = RoaringBitmap(range(0, 61440))
        rb1 = RoaringBitmap(range(0, 61441))
        assert len(rb ^ rb) == 0
        assert len(rb - rb) == 0
        assert len(rb1 ^ rb1) == 0
        assert len(rb1 - rb1) == 0
        assert len(~rb) == 0
        assert len(~rb1) == 0

        rb1 = RoaringBitmap(range(0, 61441))
        assert len(rb ^ rb) == 0
        rb1 ^= rb1
        assert len(rb1) == 0

        rb1 = RoaringBitmap(range(0, 61441))
        rb1 -= rb1
        assert len(rb1) == 0

    def test_issue24(self):
        """Regression test for issue 24: pop() after a difference."""
        r = RoaringBitmap(range(131071))
        assert r.pop() == 131070
        assert r.pop() == 131069

        rr = r - RoaringBitmap([130752])
        assert 130752 not in rr
        assert rr.pop() == 131068

        r.difference_update(RoaringBitmap([130752]))
        assert 130752 not in r
        assert r.pop() == 131068

    def test_issue25(self):
        """Regression test for issue 25: intersecting with an empty bitmap."""
        r = RoaringBitmap({1})
        r.intersection_update(RoaringBitmap([]))
        assert len(r) == 0

    def test_issue28(self):
        """Regression test for issue 28: clamp() below the only element."""
        rbm = RoaringBitmap()
        rbm.add(3995084765)
        r = rbm.clamp(0, 8388607)
        assert len(r) == 0

    def test_issue34(self):
        """Regression test for issue 34: difference of two large samples."""
        seed(232992)
        set_a = sample(range(235342), k=169308)
        set_b = sample(range(255999), k=255713)
        rba = RoaringBitmap(set_a)
        rbb = RoaringBitmap(set_b)
        assert rba - rbb == set(set_a) - set(set_b)
        rba -= rbb
        assert rba == set(set_a) - set(set_b)
550 |
class Test_immutablerb(object):
    """Tests for ImmutableRoaringBitmap, using builtin set as reference."""

    def test_inittrivial(self):
        """A tiny list round-trips into an equal immutable bitmap."""
        data = list(range(5))
        ref = set(data)
        rb = ImmutableRoaringBitmap(data)
        rb._checkconsistency()
        assert ref == rb
        assert type(rb) == ImmutableRoaringBitmap

    def test_initsorted(self, single):
        """Construction from sorted input."""
        for name, data in single:
            ref = set(sorted(data))
            rb = ImmutableRoaringBitmap(sorted(data))
            rb._checkconsistency()
            assert ref == rb, name

    def test_initunsorted(self, single):
        """Construction from the datasets as generated."""
        for name, data in single:
            ref = set(data)
            rb = ImmutableRoaringBitmap(data)
            rb._checkconsistency()
            assert ref == rb, name

    def test_inititerator(self, single):
        """Construction from a generator instead of a sequence."""
        for name, data in single:
            ref = set(a for a in data)
            rb = ImmutableRoaringBitmap(a for a in data)
            rb._checkconsistency()
            assert ref == rb, name

    def test_initrange(self):
        """Construction from a contiguous range object."""
        # creates a positive, dense, and inverted block, respectively
        for n in [400, 6000, 61241]:
            ref = set(range(23, n))
            rb = ImmutableRoaringBitmap(range(23, n))
            rb._checkconsistency()
            assert ref == rb, n

    def test_initrb(self):
        """Conversion between mutable and immutable bitmaps."""
        r = RoaringBitmap(range(5))
        i = ImmutableRoaringBitmap(r)
        r = RoaringBitmap(i)
        assert r == i

        i = ImmutableRoaringBitmap(range(5))
        r = RoaringBitmap(i)
        assert r == i

    def test_pickle(self, single):
        """A pickle round-trip yields an equal bitmap of the same type."""
        for name, data in single:
            rb = ImmutableRoaringBitmap(data)
            rb_pickled = pickle.dumps(rb, protocol=-1)
            rb_unpickled = pickle.loads(rb_pickled)
            rb._checkconsistency()
            # the reconstructed object is the one that needs verifying
            rb_unpickled._checkconsistency()
            assert rb_unpickled == rb, name
            assert type(rb) == ImmutableRoaringBitmap, name
            assert type(rb_unpickled) == ImmutableRoaringBitmap, name

    def test_and(self, pair):
        """Binary ``&``; the result is a mutable RoaringBitmap."""
        for name, data1, data2 in pair:
            ref, ref2 = set(data1), set(data2)
            rb = ImmutableRoaringBitmap(data1)
            rb2 = ImmutableRoaringBitmap(data2)
            assert ref & ref2 == set(rb & rb2), name
            assert type(rb & rb2) == RoaringBitmap, name

    def test_or(self, pair):
        """Binary ``|``."""
        for name, data1, data2 in pair:
            ref, ref2 = set(data1), set(data2)
            rb, rb2 = ImmutableRoaringBitmap(data1), ImmutableRoaringBitmap(data2)
            assert ref | ref2 == set(rb | rb2), name

    def test_xor(self, pair):
        """Binary ``^``."""
        for name, data1, data2 in pair:
            ref, ref2 = set(data1), set(data2)
            rb, rb2 = ImmutableRoaringBitmap(data1), ImmutableRoaringBitmap(data2)
            assert ref ^ ref2 == set(rb ^ rb2), name

    def test_sub(self, pair):
        """Binary ``-``."""
        for name, data1, data2 in pair:
            ref, ref2 = set(data1), set(data2)
            rb, rb2 = ImmutableRoaringBitmap(data1), ImmutableRoaringBitmap(data2)
            assert ref - ref2 == set(rb - rb2), name

    def test_aggregateand(self, multi):
        """intersection() with many operands at once."""
        ref = set(multi[0])
        res1 = ref.intersection(*[set(a) for a in multi[1:]])
        rb = ImmutableRoaringBitmap(multi[0])
        res2 = rb.intersection(*[ImmutableRoaringBitmap(a) for a in multi[1:]])
        res2._checkconsistency()
        assert res1 == res2

    def test_aggregateor(self, multi):
        """union() with many operands at once."""
        ref = set(multi[0])
        res1 = ref.union(*[set(a) for a in multi[1:]])
        rb = ImmutableRoaringBitmap(multi[0])
        res2 = rb.union(*[ImmutableRoaringBitmap(a) for a in multi[1:]])
        res2._checkconsistency()
        assert res1 == res2

    def test_andlen(self, pair):
        """intersection_len() equals the length of the intersection."""
        for name, data1, data2 in pair:
            ref, ref2 = set(data1), set(data2)
            rb = ImmutableRoaringBitmap(data1)
            rb2 = ImmutableRoaringBitmap(data2)
            assert len(rb & rb2) == rb.intersection_len(rb2), name
            assert len(ref & ref2) == rb.intersection_len(rb2), name

    def test_orlen(self, pair):
        """union_len() equals the length of the union."""
        for name, data1, data2 in pair:
            ref, ref2 = set(data1), set(data2)
            rb = ImmutableRoaringBitmap(data1)
            rb2 = ImmutableRoaringBitmap(data2)
            assert len(ref | ref2) == rb.union_len(rb2), name
            assert len(rb | rb2) == rb.union_len(rb2), name

    def test_jaccard_dist(self, pair):
        """jaccard_dist() equals 1 - |a & b| / |a | b|."""
        for name, data1, data2 in pair:
            # skipped: the reference formula would divide by zero
            if len(data1) == 0 and len(data2) == 0:
                continue
            ref, ref2 = set(data1), set(data2)
            rb = ImmutableRoaringBitmap(data1)
            rb2 = ImmutableRoaringBitmap(data2)
            assert len(ref & ref2) / float(len(ref | ref2)) == pytest.approx(
                    rb.intersection_len(rb2) / float(rb.union_len(rb2))), name
            assert (1 - (len(ref & ref2) / float(len(ref | ref2)))
                    == pytest.approx(rb.jaccard_dist(rb2))), name

    def test_rank(self, single):
        """rank(x) is the 1-based position of x among the sorted elements."""
        for name, data in single:
            if len(data) == 0:
                continue
            ref = sorted(set(data))
            rb = ImmutableRoaringBitmap(data)
            for _ in range(10):
                x = choice(ref)
                assert x in rb, name
                assert rb.rank(x) == ref.index(x) + 1, name

    def test_select(self, single):
        """select(i) returns the i-th smallest element, incl. both ends."""
        for name, data in single:
            if len(data) == 0:
                continue
            ref = sorted(set(data))
            rb = ImmutableRoaringBitmap(data)
            lrb = list(rb)
            # fixed indices at both ends plus ten random ones
            idx = [0, 1, 2] + [
                    randint(0, len(ref) - 1) for _ in range(10)] + [
                    len(ref) - 1, len(ref) - 2]
            for i in idx:
                assert lrb[i] == ref[i], name
                assert rb.select(i) in rb, name
                assert rb.select(i) == ref[i], name
                assert rb.rank(rb.select(i)) - 1 == i, name
                # rank of the successor value depends on whether it is present
                if rb.select(i) + 1 in rb:
                    assert rb.rank(rb.select(i) + 1) - 1 == i + 1, name
                else:
                    assert rb.rank(rb.select(i) + 1) - 1 == i, name

    def test_rank2(self):
        """rank() for every value, present or not, over two stepped ranges."""
        rb = ImmutableRoaringBitmap(range(0, 100000, 7))
        rb = rb.union(range(100000, 200000, 1000))
        for k in range(100000):
            assert rb.rank(k) == 1 + k // 7
        for k in range(100000, 200000):
            assert rb.rank(k) == 1 + 100000 // 7 + 1 + (k - 100000) // 1000

    def test_select2(self):
        """select() over evenly spaced elements with gaps 1, 2, ..., 1024."""
        gap = 1
        while gap <= 1024:
            rb = ImmutableRoaringBitmap(range(0, 100000, gap))
            for k in range(0, 100000 // gap):
                assert rb.select(k) == k * gap
            gap *= 2
725 |
class Test_multirb(object):
	"""Checks MultiRoaringBitmap against plain bitmaps and Python sets.

	Most tests take a ``multi`` argument (presumably a pytest fixture
	defined elsewhere in the test module -- verify); the tests index it and
	iterate over it as a sequence of integer collections.
	"""

	def test_init(self, multi):
		"""Building from a list keeps the length and every bitmap."""
		bitmaps = [RoaringBitmap(members) for members in multi]
		mrb = MultiRoaringBitmap(bitmaps)
		assert len(bitmaps) == len(mrb)
		for expected, stored in zip(bitmaps, mrb):
			assert expected == stored

	def test_none(self, multi):
		"""An empty bitmap is kept in place; intersecting with it gives None."""
		bitmaps = [RoaringBitmap(members) for members in multi]
		bitmaps[4:4] = [RoaringBitmap()]  # put an empty bitmap at index 4
		mrb = MultiRoaringBitmap(bitmaps)
		assert len(bitmaps) == len(mrb)
		for expected, stored in zip(bitmaps, mrb):
			assert expected == stored
		# Index 4 is the empty bitmap inserted above.
		result = mrb.intersection([4, 5])
		assert result is None

	def test_aggregateand(self, multi):
		"""Intersecting all bitmaps matches the intersection of plain sets."""
		others = [set(members) for members in multi[1:]]
		expected = set(multi[0]).intersection(*others)
		mrb = MultiRoaringBitmap(
				[ImmutableRoaringBitmap(members) for members in multi])
		every_index = list(range(len(mrb)))
		actual = mrb.intersection(every_index)
		assert expected == actual

	def test_jaccard(self, multi):
		"""Batched jaccard_dist equals the per-pair jaccard_dist values."""
		mrb = MultiRoaringBitmap(
				[ImmutableRoaringBitmap(members) for members in multi])
		index_code = b'L' if PY2 else 'L'
		float_code = b'd' if PY2 else 'd'
		indices1 = array.array(index_code, [0, 6, 8])
		indices2 = array.array(index_code, [1, 7, 6])
		res = mrb.jaccard_dist(indices1, indices2)
		pairwise = [mrb[i].jaccard_dist(mrb[j])
				for i, j in zip(indices1, indices2)]
		ref = array.array(float_code, pairwise)
		assert res == ref

	def test_andor_len_pairwise(self, multi):
		"""Batched AND/OR cardinalities equal len() of the pairwise results."""
		mrb = MultiRoaringBitmap(
				[ImmutableRoaringBitmap(members) for members in multi])
		index_code = b'L' if PY2 else 'L'
		indices1 = array.array(index_code, [0, 6, 8])
		indices2 = array.array(index_code, [1, 7, 6])
		# Zero-filled output arrays, passed to andor_len_pairwise below.
		count = len(indices1)
		res1 = array.array(index_code, [0] * count)
		res2 = array.array(index_code, [0] * count)
		mrb.andor_len_pairwise(indices1, indices2, res1, res2)
		pairs = list(zip(indices1, indices2))
		ref1 = array.array(index_code,
				[len(mrb[i] & mrb[j]) for i, j in pairs])
		ref2 = array.array(index_code,
				[len(mrb[i] | mrb[j]) for i, j in pairs])
		assert res1 == ref1
		assert res2 == ref2

	def test_clamp(self, multi):
		"""intersection(start=a, stop=b) is limited to the range [a, b)."""
		a, b = sorted(sample(multi[0], 2))
		common = set.intersection(*[set(members) for members in multi])
		ref = common & set(range(a, b))
		mrb = MultiRoaringBitmap([RoaringBitmap(members) for members in multi])
		every_index = list(range(len(mrb)))
		rb = mrb.intersection(every_index, start=a, stop=b)
		assert a <= rb.min()
		assert rb.max() < b
		assert ref == rb

	def test_serialize(self, multi):
		"""A collection written to a file reads back equal and immutable."""
		bitmaps = [RoaringBitmap(members) for members in multi]
		mrb = MultiRoaringBitmap(bitmaps)
		with tempfile.NamedTemporaryFile(delete=False) as tmp:
			writer = MultiRoaringBitmap(bitmaps, filename=tmp.name)
			# Drop the file-backed instance before reading the file back
			# (presumably so its data is flushed and released -- verify).
			del writer
			mrb_deserialized = MultiRoaringBitmap.fromfile(tmp.name)
			assert len(bitmaps) == len(mrb)
			assert len(bitmaps) == len(mrb_deserialized)
			triples = zip(bitmaps, mrb, mrb_deserialized)
			for expected, in_memory, from_file in triples:
				assert expected == in_memory
				assert expected == from_file
				from_file._checkconsistency()
				assert type(from_file) is ImmutableRoaringBitmap

	def test_multi1(self):
		"""Length, positive/negative indexing, bounds and iteration."""
		for_multi = [RoaringBitmap(sample(range(99999), 200))
				for _ in range(5)]
		mrb = MultiRoaringBitmap(for_multi)
		assert len(mrb) == 5
		assert mrb[4] == for_multi[4]
		# One past the last valid index must be rejected.
		with pytest.raises(IndexError):
			mrb[5]
		assert mrb[-1] == for_multi[-1]
		list(mrb)
		for n, rb in enumerate(mrb):
			expected = for_multi[n]
			assert rb == expected, n

	def test_multi2(self):
		"""Items taken from successive collections can still be accessed.

		Only exercises access to the collected items without comparing
		values; presumably a regression check that items outlive the
		collection they were taken from -- verify.
		"""
		for_multi_pre = []
		for _round in range(3):
			for_multi = [RoaringBitmap(sample(range(99999), 200))
					for _ in range(5)]
			mrb = MultiRoaringBitmap(for_multi)
			for_multi_pre.extend([mrb[0], mrb[1]])

		assert type(for_multi_pre) is list
		for_multi_pre[-1]
		list(for_multi_pre)

	def test_eq(self, multi):
		"""Equality and inequality against lists and other collections."""
		bitmaps = [RoaringBitmap(members) for members in multi]
		mrb = MultiRoaringBitmap(bitmaps)
		same = MultiRoaringBitmap(bitmaps)
		shorter = MultiRoaringBitmap(bitmaps[1:])
		assert mrb == bitmaps
		assert mrb == same
		assert mrb != bitmaps[1:]
		assert mrb != shorter
--------------------------------------------------------------------------------