├── .editorconfig
├── .gitignore
├── LICENSE
├── MANIFEST.in
├── Makefile
├── README.rst
├── docs
    ├── Makefile
    ├── conf.py
    └── index.rst
├── requirements.txt
├── setup.py
├── src
    ├── .ignore
    ├── _arrayops.h
    ├── arrayops.pxi
    ├── bitcount.h
    ├── bitops.pxi
    ├── block.pxi
    ├── immutablerb.pxi
    ├── macros.h
    ├── multirb.pxi
    ├── rbbinaryops.pxi
    └── roaringbitmap.pyx
└── tests
    ├── benchmarks.py
    └── unittests.py


/.editorconfig:
--------------------------------------------------------------------------------
 1 | # http://editorconfig.org/
 2 | root = true
 3 | 
 4 | [*]
 5 | end_of_line = lf
 6 | insert_final_newline = true
 7 | 
 8 | [*.{py,pyx,pxd,pxi,c,h,cpp,css,js}]
 9 | charset = utf-8
10 | indent_style = tab
11 | indent_size = 4
12 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | MANIFEST
 2 | src/.*.swp
 3 | 
 4 | # Byte-compiled / optimized / DLL files
 5 | __pycache__/
 6 | *.py[cod]
 7 | 
 8 | # Cython-generated files
 9 | src/*.c
10 | src/*.html
11 | 
12 | # C extensions
13 | *.so
14 | 
15 | # Distribution / packaging
16 | .Python
17 | env/
18 | build/
19 | develop-eggs/
20 | dist/
21 | downloads/
22 | eggs/
23 | lib/
24 | lib64/
25 | parts/
26 | sdist/
27 | var/
28 | *.egg-info/
29 | .installed.cfg
30 | *.egg
31 | 
32 | # PyInstaller
33 | #  Usually these files are written by a python script from a template
34 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
35 | *.manifest
36 | *.spec
37 | 
38 | # Installer logs
39 | pip-log.txt
40 | pip-delete-this-directory.txt
41 | 
42 | # Unit test / coverage reports
43 | htmlcov/
44 | .tox/
45 | .coverage
46 | .cache
47 | nosetests.xml
48 | coverage.xml
49 | 
50 | # Translations
51 | *.mo
52 | *.pot
53 | 
54 | # Django stuff:
55 | *.log
56 | 
57 | # Sphinx documentation
58 | docs/_build/
59 | 
60 | # PyBuilder
61 | target/
62 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 | GNU GENERAL PUBLIC LICENSE
  2 |                        Version 2, June 1991
  3 | 
  4 |  Copyright (C) 1989, 1991 Free Software Foundation, Inc., <http://fsf.org/>
  5 |  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  6 |  Everyone is permitted to copy and distribute verbatim copies
  7 |  of this license document, but changing it is not allowed.
  8 | 
  9 |                             Preamble
 10 | 
 11 |   The licenses for most software are designed to take away your
 12 | freedom to share and change it.  By contrast, the GNU General Public
 13 | License is intended to guarantee your freedom to share and change free
 14 | software--to make sure the software is free for all its users.  This
 15 | General Public License applies to most of the Free Software
 16 | Foundation's software and to any other program whose authors commit to
 17 | using it.  (Some other Free Software Foundation software is covered by
 18 | the GNU Lesser General Public License instead.)  You can apply it to
 19 | your programs, too.
 20 | 
 21 |   When we speak of free software, we are referring to freedom, not
 22 | price.  Our General Public Licenses are designed to make sure that you
 23 | have the freedom to distribute copies of free software (and charge for
 24 | this service if you wish), that you receive source code or can get it
 25 | if you want it, that you can change the software or use pieces of it
 26 | in new free programs; and that you know you can do these things.
 27 | 
 28 |   To protect your rights, we need to make restrictions that forbid
 29 | anyone to deny you these rights or to ask you to surrender the rights.
 30 | These restrictions translate to certain responsibilities for you if you
 31 | distribute copies of the software, or if you modify it.
 32 | 
 33 |   For example, if you distribute copies of such a program, whether
 34 | gratis or for a fee, you must give the recipients all the rights that
 35 | you have.  You must make sure that they, too, receive or can get the
 36 | source code.  And you must show them these terms so they know their
 37 | rights.
 38 | 
 39 |   We protect your rights with two steps: (1) copyright the software, and
 40 | (2) offer you this license which gives you legal permission to copy,
 41 | distribute and/or modify the software.
 42 | 
 43 |   Also, for each author's protection and ours, we want to make certain
 44 | that everyone understands that there is no warranty for this free
 45 | software.  If the software is modified by someone else and passed on, we
 46 | want its recipients to know that what they have is not the original, so
 47 | that any problems introduced by others will not reflect on the original
 48 | authors' reputations.
 49 | 
 50 |   Finally, any free program is threatened constantly by software
 51 | patents.  We wish to avoid the danger that redistributors of a free
 52 | program will individually obtain patent licenses, in effect making the
 53 | program proprietary.  To prevent this, we have made it clear that any
 54 | patent must be licensed for everyone's free use or not licensed at all.
 55 | 
 56 |   The precise terms and conditions for copying, distribution and
 57 | modification follow.
 58 | 
 59 |                     GNU GENERAL PUBLIC LICENSE
 60 |    TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
 61 | 
 62 |   0. This License applies to any program or other work which contains
 63 | a notice placed by the copyright holder saying it may be distributed
 64 | under the terms of this General Public License.  The "Program", below,
 65 | refers to any such program or work, and a "work based on the Program"
 66 | means either the Program or any derivative work under copyright law:
 67 | that is to say, a work containing the Program or a portion of it,
 68 | either verbatim or with modifications and/or translated into another
 69 | language.  (Hereinafter, translation is included without limitation in
 70 | the term "modification".)  Each licensee is addressed as "you".
 71 | 
 72 | Activities other than copying, distribution and modification are not
 73 | covered by this License; they are outside its scope.  The act of
 74 | running the Program is not restricted, and the output from the Program
 75 | is covered only if its contents constitute a work based on the
 76 | Program (independent of having been made by running the Program).
 77 | Whether that is true depends on what the Program does.
 78 | 
 79 |   1. You may copy and distribute verbatim copies of the Program's
 80 | source code as you receive it, in any medium, provided that you
 81 | conspicuously and appropriately publish on each copy an appropriate
 82 | copyright notice and disclaimer of warranty; keep intact all the
 83 | notices that refer to this License and to the absence of any warranty;
 84 | and give any other recipients of the Program a copy of this License
 85 | along with the Program.
 86 | 
 87 | You may charge a fee for the physical act of transferring a copy, and
 88 | you may at your option offer warranty protection in exchange for a fee.
 89 | 
 90 |   2. You may modify your copy or copies of the Program or any portion
 91 | of it, thus forming a work based on the Program, and copy and
 92 | distribute such modifications or work under the terms of Section 1
 93 | above, provided that you also meet all of these conditions:
 94 | 
 95 |     a) You must cause the modified files to carry prominent notices
 96 |     stating that you changed the files and the date of any change.
 97 | 
 98 |     b) You must cause any work that you distribute or publish, that in
 99 |     whole or in part contains or is derived from the Program or any
100 |     part thereof, to be licensed as a whole at no charge to all third
101 |     parties under the terms of this License.
102 | 
103 |     c) If the modified program normally reads commands interactively
104 |     when run, you must cause it, when started running for such
105 |     interactive use in the most ordinary way, to print or display an
106 |     announcement including an appropriate copyright notice and a
107 |     notice that there is no warranty (or else, saying that you provide
108 |     a warranty) and that users may redistribute the program under
109 |     these conditions, and telling the user how to view a copy of this
110 |     License.  (Exception: if the Program itself is interactive but
111 |     does not normally print such an announcement, your work based on
112 |     the Program is not required to print an announcement.)
113 | 
114 | These requirements apply to the modified work as a whole.  If
115 | identifiable sections of that work are not derived from the Program,
116 | and can be reasonably considered independent and separate works in
117 | themselves, then this License, and its terms, do not apply to those
118 | sections when you distribute them as separate works.  But when you
119 | distribute the same sections as part of a whole which is a work based
120 | on the Program, the distribution of the whole must be on the terms of
121 | this License, whose permissions for other licensees extend to the
122 | entire whole, and thus to each and every part regardless of who wrote it.
123 | 
124 | Thus, it is not the intent of this section to claim rights or contest
125 | your rights to work written entirely by you; rather, the intent is to
126 | exercise the right to control the distribution of derivative or
127 | collective works based on the Program.
128 | 
129 | In addition, mere aggregation of another work not based on the Program
130 | with the Program (or with a work based on the Program) on a volume of
131 | a storage or distribution medium does not bring the other work under
132 | the scope of this License.
133 | 
134 |   3. You may copy and distribute the Program (or a work based on it,
135 | under Section 2) in object code or executable form under the terms of
136 | Sections 1 and 2 above provided that you also do one of the following:
137 | 
138 |     a) Accompany it with the complete corresponding machine-readable
139 |     source code, which must be distributed under the terms of Sections
140 |     1 and 2 above on a medium customarily used for software interchange; or,
141 | 
142 |     b) Accompany it with a written offer, valid for at least three
143 |     years, to give any third party, for a charge no more than your
144 |     cost of physically performing source distribution, a complete
145 |     machine-readable copy of the corresponding source code, to be
146 |     distributed under the terms of Sections 1 and 2 above on a medium
147 |     customarily used for software interchange; or,
148 | 
149 |     c) Accompany it with the information you received as to the offer
150 |     to distribute corresponding source code.  (This alternative is
151 |     allowed only for noncommercial distribution and only if you
152 |     received the program in object code or executable form with such
153 |     an offer, in accord with Subsection b above.)
154 | 
155 | The source code for a work means the preferred form of the work for
156 | making modifications to it.  For an executable work, complete source
157 | code means all the source code for all modules it contains, plus any
158 | associated interface definition files, plus the scripts used to
159 | control compilation and installation of the executable.  However, as a
160 | special exception, the source code distributed need not include
161 | anything that is normally distributed (in either source or binary
162 | form) with the major components (compiler, kernel, and so on) of the
163 | operating system on which the executable runs, unless that component
164 | itself accompanies the executable.
165 | 
166 | If distribution of executable or object code is made by offering
167 | access to copy from a designated place, then offering equivalent
168 | access to copy the source code from the same place counts as
169 | distribution of the source code, even though third parties are not
170 | compelled to copy the source along with the object code.
171 | 
172 |   4. You may not copy, modify, sublicense, or distribute the Program
173 | except as expressly provided under this License.  Any attempt
174 | otherwise to copy, modify, sublicense or distribute the Program is
175 | void, and will automatically terminate your rights under this License.
176 | However, parties who have received copies, or rights, from you under
177 | this License will not have their licenses terminated so long as such
178 | parties remain in full compliance.
179 | 
180 |   5. You are not required to accept this License, since you have not
181 | signed it.  However, nothing else grants you permission to modify or
182 | distribute the Program or its derivative works.  These actions are
183 | prohibited by law if you do not accept this License.  Therefore, by
184 | modifying or distributing the Program (or any work based on the
185 | Program), you indicate your acceptance of this License to do so, and
186 | all its terms and conditions for copying, distributing or modifying
187 | the Program or works based on it.
188 | 
189 |   6. Each time you redistribute the Program (or any work based on the
190 | Program), the recipient automatically receives a license from the
191 | original licensor to copy, distribute or modify the Program subject to
192 | these terms and conditions.  You may not impose any further
193 | restrictions on the recipients' exercise of the rights granted herein.
194 | You are not responsible for enforcing compliance by third parties to
195 | this License.
196 | 
197 |   7. If, as a consequence of a court judgment or allegation of patent
198 | infringement or for any other reason (not limited to patent issues),
199 | conditions are imposed on you (whether by court order, agreement or
200 | otherwise) that contradict the conditions of this License, they do not
201 | excuse you from the conditions of this License.  If you cannot
202 | distribute so as to satisfy simultaneously your obligations under this
203 | License and any other pertinent obligations, then as a consequence you
204 | may not distribute the Program at all.  For example, if a patent
205 | license would not permit royalty-free redistribution of the Program by
206 | all those who receive copies directly or indirectly through you, then
207 | the only way you could satisfy both it and this License would be to
208 | refrain entirely from distribution of the Program.
209 | 
210 | If any portion of this section is held invalid or unenforceable under
211 | any particular circumstance, the balance of the section is intended to
212 | apply and the section as a whole is intended to apply in other
213 | circumstances.
214 | 
215 | It is not the purpose of this section to induce you to infringe any
216 | patents or other property right claims or to contest validity of any
217 | such claims; this section has the sole purpose of protecting the
218 | integrity of the free software distribution system, which is
219 | implemented by public license practices.  Many people have made
220 | generous contributions to the wide range of software distributed
221 | through that system in reliance on consistent application of that
222 | system; it is up to the author/donor to decide if he or she is willing
223 | to distribute software through any other system and a licensee cannot
224 | impose that choice.
225 | 
226 | This section is intended to make thoroughly clear what is believed to
227 | be a consequence of the rest of this License.
228 | 
229 |   8. If the distribution and/or use of the Program is restricted in
230 | certain countries either by patents or by copyrighted interfaces, the
231 | original copyright holder who places the Program under this License
232 | may add an explicit geographical distribution limitation excluding
233 | those countries, so that distribution is permitted only in or among
234 | countries not thus excluded.  In such case, this License incorporates
235 | the limitation as if written in the body of this License.
236 | 
237 |   9. The Free Software Foundation may publish revised and/or new versions
238 | of the General Public License from time to time.  Such new versions will
239 | be similar in spirit to the present version, but may differ in detail to
240 | address new problems or concerns.
241 | 
242 | Each version is given a distinguishing version number.  If the Program
243 | specifies a version number of this License which applies to it and "any
244 | later version", you have the option of following the terms and conditions
245 | either of that version or of any later version published by the Free
246 | Software Foundation.  If the Program does not specify a version number of
247 | this License, you may choose any version ever published by the Free Software
248 | Foundation.
249 | 
250 |   10. If you wish to incorporate parts of the Program into other free
251 | programs whose distribution conditions are different, write to the author
252 | to ask for permission.  For software which is copyrighted by the Free
253 | Software Foundation, write to the Free Software Foundation; we sometimes
254 | make exceptions for this.  Our decision will be guided by the two goals
255 | of preserving the free status of all derivatives of our free software and
256 | of promoting the sharing and reuse of software generally.
257 | 
258 |                             NO WARRANTY
259 | 
260 |   11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
261 | FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW.  EXCEPT WHEN
262 | OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
263 | PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
264 | OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
265 | MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.  THE ENTIRE RISK AS
266 | TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU.  SHOULD THE
267 | PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
268 | REPAIR OR CORRECTION.
269 | 
270 |   12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
271 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
272 | REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
273 | INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
274 | OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
275 | TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
276 | YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
277 | PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
278 | POSSIBILITY OF SUCH DAMAGES.
279 | 
280 |                      END OF TERMS AND CONDITIONS
281 | 
282 |             How to Apply These Terms to Your New Programs
283 | 
284 |   If you develop a new program, and you want it to be of the greatest
285 | possible use to the public, the best way to achieve this is to make it
286 | free software which everyone can redistribute and change under these terms.
287 | 
288 |   To do so, attach the following notices to the program.  It is safest
289 | to attach them to the start of each source file to most effectively
290 | convey the exclusion of warranty; and each file should have at least
291 | the "copyright" line and a pointer to where the full notice is found.
292 | 
293 |     {description}
294 |     Copyright (C) {year}  {fullname}
295 | 
296 |     This program is free software; you can redistribute it and/or modify
297 |     it under the terms of the GNU General Public License as published by
298 |     the Free Software Foundation; either version 2 of the License, or
299 |     (at your option) any later version.
300 | 
301 |     This program is distributed in the hope that it will be useful,
302 |     but WITHOUT ANY WARRANTY; without even the implied warranty of
303 |     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
304 |     GNU General Public License for more details.
305 | 
306 |     You should have received a copy of the GNU General Public License along
307 |     with this program; if not, write to the Free Software Foundation, Inc.,
308 |     51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
309 | 
310 | Also add information on how to contact you by electronic and paper mail.
311 | 
312 | If the program is interactive, make it output a short notice like this
313 | when it starts in an interactive mode:
314 | 
315 |     Gnomovision version 69, Copyright (C) year name of author
316 |     Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
317 |     This is free software, and you are welcome to redistribute it
318 |     under certain conditions; type `show c' for details.
319 | 
320 | The hypothetical commands `show w' and `show c' should show the appropriate
321 | parts of the General Public License.  Of course, the commands you use may
322 | be called something other than `show w' and `show c'; they could even be
323 | mouse-clicks or menu items--whatever suits your program.
324 | 
325 | You should also get your employer (if you work as a programmer) or your
326 | school, if any, to sign a "copyright disclaimer" for the program, if
327 | necessary.  Here is a sample; alter the names:
328 | 
329 |   Yoyodyne, Inc., hereby disclaims all copyright interest in the program
330 |   `Gnomovision' (which makes passes at compilers) written by James Hacker.
331 | 
332 |   {signature of Ty Coon}, 1 April 1989
333 |   Ty Coon, President of Vice
334 | 
335 | This General Public License does not permit incorporating your program into
336 | proprietary programs.  If your program is a subroutine library, you may
337 | consider it more useful to permit linking proprietary applications with the
338 | library.  If this is what you want to do, use the GNU Lesser General
339 | Public License instead of this License.
340 | 
341 | 


--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include README.rst LICENSE setup.py
2 | recursive-include src *.h *.c
3 | recursive-exclude src *.pyx *.pxi *.pxd
4 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | all:
 2 | 	python3 setup.py install --user
 3 | 
 4 | clean:
 5 | 	rm -rf build/ src/roaringbitmap.h
 6 | 	find src/ -name '*.c' -delete
 7 | 	find src/ -name '*.so' -delete
 8 | 	find src/ -name '*.pyc' -delete
 9 | 	find src/ -name '*.html' -delete
10 | 	find tests/ -name '*.pyc' -delete
11 | 	rm -rf src/__pycache__ tests/__pycache__
12 | 
13 | test: all
14 | 	ulimit -Sv 500000; python3 -m pytest tests/unittests.py
15 | 
16 | bench: all
17 | 	ulimit -Sv 500000; python3 tests/benchmarks.py
18 | 
19 | lint:
20 | 	pycodestyle --ignore=E1,W1,W503 tests/*.py \
21 | 	&& pycodestyle --ignore=E1,W1,F,E901,E225,E227,E211,W503 \
22 | 			src/*.pyx src/*.pxi
23 | 
24 | py2:
25 | 	python2 setup.py install --user
26 | 
27 | test2: py2
28 | 	python2 -m pytest tests/unittests.py
29 | 
30 | bench2: all
31 | 	ulimit -Sv 500000; python2 tests/benchmarks.py
32 | 
33 | debug:
34 | 	python3-dbg setup.py install --user --debug
35 | 
36 | debug2:
37 | 	python2-dbg setup.py install --user --debug
38 | 
39 | testdebug: debug
40 | 	gdb -ex run --args python3-dbg -m pytest tests/unittests.py -v
41 | 
42 | testdebug2: debug2
43 | 	gdb -ex run --args python2-dbg -m pytest tests/unittests.py -v
44 | 
45 | valgrind:
46 | 	python3-dbg setup.py install --user --debug
47 | 	valgrind --tool=memcheck --suppressions=valgrind-python.supp \
48 | 		--leak-check=full --show-leak-kinds=definite \
49 | 		python3.5-dbg -m pytest tests/unittests.py -v
50 | 


--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
  1 | Roaring Bitmap in Cython
  2 | ========================
  3 | 
  4 | A roaring bitmap is an efficient compressed data structure to store a set
  5 | of integers. A roaring bitmap stores a set of 32-bit integers in a series of
  6 | arrays and bitmaps, whichever takes the least space (which is always
  7 | ``2 ** 16`` bits or less).
  8 | 
  9 | This data structure is useful for storing a large number of integers, e.g., for
 10 | an inverted index used by search engines and databases. In particular, it is
 11 | possible to quickly compute the intersection of a series of sets, which can be
 12 | used to implement a query as the conjunction of subqueries.
 13 | 
 14 | This implementation is based on the Java and C implementations at
 15 | https://github.com/lemire/RoaringBitmap
 16 | and https://github.com/lemire/CRoaring
 17 | 
 18 | Additional features of this implementation:
 19 | 
 20 | - Inverted list representation: blocks that are mostly full are stored
 21 |   compactly as an array of non-members (instead of as an array of members or a
 22 |   fixed-size bitmap).
 23 | - Collections of immutable roaring bitmaps can be efficiently serialized with
 24 |   ``mmap`` in a single file.
 25 | 
 26 | Missing features w.r.t. CRoaring:
 27 | 
 28 | - Run-length encoded blocks
 29 | - Various AVX2 / SSE optimizations
 30 | 
 31 | See also PyRoaringBitMap, a Python wrapper of CRoaring:
 32 | https://github.com/Ezibenroc/PyRoaringBitMap
 33 | 
 34 | License, requirements
 35 | ---------------------
 36 | The code is licensed under GNU GPL v2, or any later version at your option.
 37 | 
 38 | - Python 2.7+/3.3+  http://www.python.org (headers required, e.g. python-dev package)
 39 | - Cython 0.20+      http://www.cython.org
 40 | 
 41 | Installation, usage
 42 | -------------------
 43 | 
 44 | ::
 45 | 
 46 |     $ git clone https://github.com/andreasvc/roaringbitmap.git
 47 |     $ cd roaringbitmap
 48 |     $ make
 49 | 
 50 | (or ``make py2`` for Python 2)
 51 | 
 52 | A ``RoaringBitmap()`` can be used as a replacement for a normal (mutable)
 53 | Python set containing (unsigned) 32-bit integers:
 54 | 
 55 | .. code-block:: python
 56 | 
 57 |     >>> from roaringbitmap import RoaringBitmap
 58 |     >>> RoaringBitmap(range(10)) & RoaringBitmap(range(5, 15))
 59 |     RoaringBitmap({5, 6, 7, 8, 9})
 60 | 
 61 | ``ImmutableRoaringBitmap`` is an immutable variant (analogous to ``frozenset``)
 62 | which is stored compactly as a contiguous block of memory.
 63 | 
 64 | A sequence of immutable RoaringBitmaps can be stored in a single file and
 65 | accessed efficiently with ``mmap``, without needing to copy or deserialize:
 66 | 
 67 | .. code-block:: python
 68 | 
 69 |     >>> from roaringbitmap import MultiRoaringBitmap
 70 |     >>> mrb = MultiRoaringBitmap([range(n, n + 5) for n in range(10)], filename='index')
 71 | 
 72 |     >>> mrb = MultiRoaringBitmap.fromfile('index')
 73 |     >>> mrb[5]
 74 |     ImmutableRoaringBitmap({5, 6, 7, 8, 9})
 75 | 
 76 | For API documentation cf. http://roaringbitmap.readthedocs.io
 77 | 
 78 | Benchmarks
 79 | ----------
 80 | Output of ``$ make bench``::
 81 | 
 82 |     small sparse set
 83 |     100 runs with sets of 200 random elements n s.t. 0 <= n < 40000
 84 |                     set()  RoaringBitmap()    ratio
 85 |     init         0.000834          0.00138    0.603
 86 |     initsort      0.00085         0.000394     2.16
 87 |     and           0.00102         8.49e-05     12.1
 88 |     or            0.00171         0.000169     10.1
 89 |     xor           0.00152         0.000213     7.11
 90 |     sub          0.000934         0.000197     4.74
 91 |     iand         1.29e-05         2.97e-06     4.35
 92 |     ior           9.7e-06         3.26e-06     2.98
 93 |     ixor         8.98e-06         3.43e-06     2.62
 94 |     isub         6.83e-06          3.3e-06     2.07
 95 |     eq           0.000438         1.17e-05     37.6
 96 |     neq          6.37e-06         7.81e-06    0.816
 97 |     jaccard        0.0029         0.000126     23.1
 98 | 
 99 |     medium load factor
100 |     100 runs with sets of 59392 random elements n s.t. 0 <= n < 118784
101 |                     set()  RoaringBitmap()    ratio
102 |     init            0.564            0.324     1.74
103 |     initsort        0.696            0.273     2.55
104 |     and             0.613         0.000418     1466
105 |     or              0.976         0.000292     3344
106 |     xor             0.955         0.000294     3250
107 |     sub             0.346         0.000316     1092
108 |     iand          0.00658         1.14e-05      575
109 |     ior           0.00594         1.08e-05      548
110 |     ixor          0.00434         1.12e-05      385
111 |     isub          0.00431         1.09e-05      397
112 |     eq             0.0991         0.000116      851
113 |     neq          9.62e-06         1.29e-05    0.743
114 |     jaccard          1.62          0.00025     6476
115 | 
116 |     dense set / high load factor
117 |     100 runs with sets of 39800 random elements n s.t. 0 <= n < 40000
118 |                     set()  RoaringBitmap()    ratio
119 |     init             0.33           0.0775     4.26
120 |     initsort        0.352            0.148     2.38
121 |     and              0.24         0.000223     1078
122 |     or               0.45         0.000165     2734
123 |     xor             0.404         0.000161     2514
124 |     sub             0.169         0.000173      973
125 |     iand          0.00287         6.02e-06      477
126 |     ior           0.00179         6.34e-06      282
127 |     ixor          0.00195         5.53e-06      353
128 |     isub           0.0017         6.35e-06      267
129 |     eq             0.0486         4.65e-05     1045
130 |     neq          1.01e-05         1.13e-05    0.888
131 |     jaccard         0.722         0.000118     6136
132 | 
133 | See https://github.com/Ezibenroc/roaring_analysis/ for a performance comparison
134 | of PyRoaringBitMap and this library.
135 | 
136 | References
137 | ----------
138 | - http://roaringbitmap.org/
139 | - Chambi, S., Lemire, D., Kaser, O., & Godin, R. (2016). Better bitmap
140 |   performance with Roaring bitmaps. Software: Practice and Experience, 46(5),
141 |   pp. 709-719. http://arxiv.org/abs/1402.6407
142 | - The idea of using the inverted list representation is based on
143 |   https://issues.apache.org/jira/browse/LUCENE-5983
144 | 


--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
  1 | # Makefile for Sphinx documentation
  2 | #
  3 | 
  4 | # You can set these variables from the command line.
  5 | SPHINXOPTS    =
  6 | SPHINXBUILD   = python3 `which sphinx-build`
  7 | PAPER         =
  8 | BUILDDIR      = _build
  9 | 
 10 | # Internal variables.
 11 | PAPEROPT_a4     = -D latex_paper_size=a4
 12 | PAPEROPT_letter = -D latex_paper_size=letter
 13 | ALLSPHINXOPTS   = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
 14 | # the i18n builder cannot share the environment and doctrees with the others
 15 | I18NSPHINXOPTS  = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
 16 | 
 17 | .PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext
 18 | 
 19 | help:
 20 | 	@echo "Please use \`make <target>' where <target> is one of"
 21 | 	@echo "  html       to make standalone HTML files"
 22 | 	@echo "  dirhtml    to make HTML files named index.html in directories"
 23 | 	@echo "  singlehtml to make a single large HTML file"
 24 | 	@echo "  pickle     to make pickle files"
 25 | 	@echo "  json       to make JSON files"
 26 | 	@echo "  htmlhelp   to make HTML files and a HTML help project"
 27 | 	@echo "  qthelp     to make HTML files and a qthelp project"
 28 | 	@echo "  devhelp    to make HTML files and a Devhelp project"
 29 | 	@echo "  epub       to make an epub"
 30 | 	@echo "  latex      to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
 31 | 	@echo "  latexpdf   to make LaTeX files and run them through pdflatex"
 32 | 	@echo "  text       to make text files"
 33 | 	@echo "  man        to make manual pages"
 34 | 	@echo "  texinfo    to make Texinfo files"
 35 | 	@echo "  info       to make Texinfo files and run them through makeinfo"
 36 | 	@echo "  gettext    to make PO message catalogs"
 37 | 	@echo "  changes    to make an overview of all changed/added/deprecated items"
 38 | 	@echo "  linkcheck  to check all external links for integrity"
 39 | 	@echo "  doctest    to run all doctests embedded in the documentation (if enabled)"
 40 | 
 41 | clean:
 42 | 	-rm -rf $(BUILDDIR)/*
 43 | 
 44 | html:
 45 | 	$(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
 46 | 	@echo
 47 | 	@echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
 48 | 
 49 | dirhtml:
 50 | 	$(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
 51 | 	@echo
 52 | 	@echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."
 53 | 
 54 | singlehtml:
 55 | 	$(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml
 56 | 	@echo
 57 | 	@echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml."
 58 | 
 59 | pickle:
 60 | 	$(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle
 61 | 	@echo
 62 | 	@echo "Build finished; now you can process the pickle files."
 63 | 
 64 | json:
 65 | 	$(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json
 66 | 	@echo
 67 | 	@echo "Build finished; now you can process the JSON files."
 68 | 
 69 | htmlhelp:
 70 | 	$(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp
 71 | 	@echo
 72 | 	@echo "Build finished; now you can run HTML Help Workshop with the" \
 73 | 	      ".hhp project file in $(BUILDDIR)/htmlhelp."
 74 | 
 75 | qthelp:
 76 | 	$(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp
 77 | 	@echo
 78 | 	@echo "Build finished; now you can run "qcollectiongenerator" with the" \
 79 | 	      ".qhcp project file in $(BUILDDIR)/qthelp, like this:"
 80 | 	@echo "# qcollectiongenerator $(BUILDDIR)/qthelp/roaringbitmap.qhcp"
 81 | 	@echo "To view the help file:"
 82 | 	@echo "# assistant -collectionFile $(BUILDDIR)/qthelp/roaringbitmap.qhc"
 83 | 
 84 | devhelp:
 85 | 	$(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp
 86 | 	@echo
 87 | 	@echo "Build finished."
 88 | 	@echo "To view the help file:"
 89 | 	@echo "# mkdir -p $$HOME/.local/share/devhelp/roaringbitmap"
 90 | 	@echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/roaringbitmap"
 91 | 	@echo "# devhelp"
 92 | 
 93 | epub:
 94 | 	$(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub
 95 | 	@echo
 96 | 	@echo "Build finished. The epub file is in $(BUILDDIR)/epub."
 97 | 
 98 | latex:
 99 | 	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
100 | 	@echo
101 | 	@echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex."
102 | 	@echo "Run \`make' in that directory to run these through (pdf)latex" \
103 | 	      "(use \`make latexpdf' here to do that automatically)."
104 | 
105 | latexpdf:
106 | 	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
107 | 	@echo "Running LaTeX files through pdflatex..."
108 | 	$(MAKE) -C $(BUILDDIR)/latex all-pdf
109 | 	@echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
110 | 
111 | text:
112 | 	$(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text
113 | 	@echo
114 | 	@echo "Build finished. The text files are in $(BUILDDIR)/text."
115 | 
116 | man:
117 | 	$(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man
118 | 	@echo
119 | 	@echo "Build finished. The manual pages are in $(BUILDDIR)/man."
120 | 
121 | texinfo:
122 | 	$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
123 | 	@echo
124 | 	@echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo."
125 | 	@echo "Run \`make' in that directory to run these through makeinfo" \
126 | 	      "(use \`make info' here to do that automatically)."
127 | 
# Build the Texinfo sources, then re-invoke make in the output directory.
# $(MAKE) is used (as in the latexpdf target) so that command-line flags and
# the jobserver of the parent make are passed on to the sub-make.
info:
	$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
	@echo "Running Texinfo files through makeinfo..."
	$(MAKE) -C $(BUILDDIR)/texinfo info
	@echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo."
133 | 
134 | gettext:
135 | 	$(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale
136 | 	@echo
137 | 	@echo "Build finished. The message catalogs are in $(BUILDDIR)/locale."
138 | 
139 | changes:
140 | 	$(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes
141 | 	@echo
142 | 	@echo "The overview file is in $(BUILDDIR)/changes."
143 | 
144 | linkcheck:
145 | 	$(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck
146 | 	@echo
147 | 	@echo "Link check complete; look for any errors in the above output " \
148 | 	      "or in $(BUILDDIR)/linkcheck/output.txt."
149 | 
150 | doctest:
151 | 	$(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest
152 | 	@echo "Testing of doctests in the sources finished, look at the " \
153 | 	      "results in $(BUILDDIR)/doctest/output.txt."
154 | 


--------------------------------------------------------------------------------
/docs/conf.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | #
  3 | # This file is execfile()d with the current directory set to its containing dir.
  4 | #
  5 | # Note that not all possible configuration values are present in this
  6 | # autogenerated file.
  7 | #
  8 | # All configuration values have a default; values that are commented out
  9 | # serve to show the default.
 10 | 
 11 | import sys, os
 12 | 
 13 | # If extensions (or modules to document with autodoc) are in another directory,
 14 | # add these directories to sys.path here. If the directory is relative to the
 15 | # documentation root, use os.path.abspath to make it absolute, like shown here.
 16 | #sys.path.insert(0, os.path.abspath('.'))
 17 | 
 18 | # -- General configuration ----------------------------------------------------
 19 | 
 20 | # If your documentation needs a minimal Sphinx version, state it here.
 21 | #needs_sphinx = '1.0'
 22 | 
 23 | # Add any Sphinx extension module names here, as strings. They can be
 24 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom ones.
 25 | extensions = ['sphinx.ext.autodoc', 'sphinx.ext.viewcode']
 26 | 
 27 | # Add any paths that contain templates here, relative to this directory.
 28 | templates_path = ['_templates']
 29 | 
 30 | # The suffix of source filenames.
 31 | source_suffix = '.rst'
 32 | 
 33 | # The encoding of source files.
 34 | #source_encoding = 'utf-8-sig'
 35 | 
 36 | # The master toctree document.
 37 | master_doc = 'index'
 38 | 
 39 | # General information about the project.
 40 | project = u'roaringbitmap'
 41 | copyright = u'2022, Andreas van Cranenburgh'
 42 | 
 43 | # The version info for the project you're documenting, acts as replacement for
 44 | # |version| and |release|, also used in various other places throughout the
 45 | # built documents.
 46 | #
 47 | # The short X.Y version.
 48 | version = '0.7'
 49 | # The full version, including alpha/beta/rc tags.
 50 | release = '0.7.2'
 51 | 
 52 | # The language for content autogenerated by Sphinx. Refer to documentation
 53 | # for a list of supported languages.
 54 | #language = None
 55 | 
 56 | # There are two options for replacing |today|: either, you set today to some
 57 | # non-false value, then it is used:
 58 | #today = ''
 59 | # Else, today_fmt is used as the format for a strftime call.
 60 | #today_fmt = '%B %d, %Y'
 61 | 
 62 | # List of patterns, relative to source directory, that match files and
 63 | # directories to ignore when looking for source files.
 64 | exclude_patterns = ['_build']
 65 | 
 66 | # The reST default role (used for this markup: `text`) to use for all documents
 67 | #default_role = None
 68 | 
 69 | # If true, '()' will be appended to :func: etc. cross-reference text.
 70 | #add_function_parentheses = True
 71 | 
 72 | # If true, the current module name will be prepended to all description
 73 | # unit titles (such as .. function::).
 74 | #add_module_names = True
 75 | 
 76 | # If true, sectionauthor and moduleauthor directives will be shown in the
 77 | # output. They are ignored by default.
 78 | #show_authors = False
 79 | 
 80 | # The name of the Pygments (syntax highlighting) style to use.
 81 | pygments_style = 'sphinx'
 82 | 
 83 | # A list of ignored prefixes for module index sorting.
 84 | #modindex_common_prefix = []
 85 | 
 86 | autodoc_member_order = 'bysource'
 87 | autodoc_default_flags = ['members']
 88 | 
 89 | # -- Options for HTML output --------------------------------------------------
 90 | 
 91 | ## on_rtd is whether we are on readthedocs.org, this line of code grabbed from docs.readthedocs.org
 92 | #on_rtd = os.environ.get('READTHEDOCS', None) == 'True'
 93 | #
 94 | #if not on_rtd:  # only import and set the theme if we're building docs locally
 95 | #	import sphinx_rtd_theme
 96 | #	html_theme = 'sphinx_rtd_theme'
 97 | #	html_theme_path = [sphinx_rtd_theme.get_html_theme_path()]
 98 | ## otherwise, readthedocs.org uses their theme by default, so no need to specify it
 99 | 
100 | html_theme = 'nature'
101 | 
102 | # The name for this set of Sphinx documents.  If None, it defaults to
103 | # "<project> v<release> documentation".
104 | #html_title = None
105 | 
106 | # A shorter title for the navigation bar.  Default is the same as html_title.
107 | #html_short_title = None
108 | 
109 | # The name of an image file (relative to this directory) to place at the top
110 | # of the sidebar.
111 | #html_logo = None
112 | 
113 | # The name of an image file (within the static path) to use as favicon of the
114 | # docs.  This file should be a Windows icon file (.ico) being 16x16 or 32x32
115 | # pixels large.
116 | #html_favicon = None
117 | 
118 | # Add any paths that contain custom static files (such as style sheets) here,
119 | # relative to this directory. They are copied after the builtin static files,
120 | # so a file named "default.css" will overwrite the builtin "default.css".
121 | html_static_path = []
122 | 
123 | # If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
124 | # using the given strftime format.
125 | #html_last_updated_fmt = '%b %d, %Y'
126 | 
127 | # If true, SmartyPants will be used to convert quotes and dashes to
128 | # typographically correct entities.
129 | #html_use_smartypants = True
130 | 
131 | # Custom sidebar templates, maps document names to template names.
132 | html_sidebars = {'**': [
133 | 		'globaltoc.html',
134 | 		'searchbox.html',
135 | 		#'localtoc.html',
136 | 		#'relations.html',
137 | 		#'sourcelink.html',
138 | 		], }
139 | 
140 | # Additional templates that should be rendered to pages, maps page names to
141 | # template names.
142 | #html_additional_pages = {}
143 | 
144 | # If false, no module index is generated.
145 | html_domain_indices = False
146 | 
147 | # If false, no index is generated.
148 | html_use_index = False
149 | 
150 | # If true, the index is split into individual pages for each letter.
151 | #html_split_index = False
152 | 
153 | # If true, links to the reST sources are added to the pages.
154 | html_show_sourcelink = False
155 | 
156 | # If true, "Created using Sphinx" is shown in the HTML footer. Default is True.
157 | #html_show_sphinx = True
158 | 
159 | # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True.
160 | #html_show_copyright = True
161 | 
162 | # If true, an OpenSearch description file will be output, and all pages will
163 | # contain a <link> tag referring to it.  The value of this option must be the
164 | # base URL from which the finished HTML is served.
165 | #html_use_opensearch = ''
166 | 
167 | # This is the file name suffix for HTML files (e.g. ".xhtml").
168 | #html_file_suffix = None
169 | 
170 | # Output file base name for HTML help builder.
171 | htmlhelp_basename = 'roaringbitmapdoc'
172 | 
173 | # append __init__ docstring to docstring of class
174 | autoclass_content = 'both'
175 | 


--------------------------------------------------------------------------------
/docs/index.rst:
--------------------------------------------------------------------------------
 1 | RoaringBitmap API documentation
 2 | ===============================
 3 | .. automodule:: roaringbitmap
 4 |     :members:
 5 |     :undoc-members:
 6 |     :show-inheritance:
 7 | 
 8 | 
 9 | Indices and tables
10 | ==================
11 | 
12 | * :ref:`genindex`
13 | * :ref:`modindex`
14 | * :ref:`search`
15 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | cython>=0.21
2 | sphinx>=1.6.2
3 | pytest>=3.0.0
4 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
  1 | """Generic setup.py for Cython code."""
  2 | import os
  3 | import sys
  4 | from distutils.core import setup
  5 | from distutils.extension import Extension
  6 | 
  7 | PY2 = sys.version_info[0] == 2
  8 | 
  9 | # In releases, include C sources but not Cython sources; otherwise, use cython
 10 | # to figure out which files may need to be re-cythonized.
 11 | USE_CYTHON = os.path.exists('src/roaringbitmap.pyx')
 12 | if USE_CYTHON:
 13 | 	try:
 14 | 		from Cython.Build import cythonize
 15 | 		from Cython.Distutils import build_ext
 16 | 		from Cython.Compiler import Options
 17 | 		Options.fast_fail = True
 18 | 	except ImportError:
 19 | 		raise RuntimeError('could not import Cython.')
 20 | 	cmdclass = dict(build_ext=build_ext)
 21 | else:
 22 | 	cmdclass = dict()
 23 | 
 24 | DEBUG = '--debug' in sys.argv
 25 | if DEBUG:
 26 | 	sys.argv.remove('--debug')
 27 | 
 28 | MTUNE = '--with-mtune' in sys.argv
 29 | if MTUNE:
 30 |         sys.argv.remove('--with-mtune')
 31 | 
 32 | with open('README.rst') as inp:
 33 | 	README = inp.read()
 34 | 
 35 | METADATA = dict(name='roaringbitmap',
 36 | 		version='0.7.2',
 37 | 		description='Roaring Bitmap',
 38 | 		long_description=README,
 39 | 		author='Andreas van Cranenburgh',
 40 | 		author_email='A.W.van.Cranenburgh@rug.nl',
 41 | 		url='http://roaringbitmap.readthedocs.io',
 42 | 		license='GPL',
 43 | 		platforms=['Many'],
 44 | 		classifiers=[
 45 | 				'Development Status :: 4 - Beta',
 46 | 				'Intended Audience :: Science/Research',
 47 | 				'License :: OSI Approved :: GNU General Public License (GPL)',
 48 | 				'Operating System :: POSIX',
 49 | 				'Programming Language :: Python :: 2.7',
 50 | 				'Programming Language :: Python :: 3.3',
 51 | 				'Programming Language :: Cython',
 52 | 		],
 53 | )
 54 | 
 55 | # some of these directives increase performance,
 56 | # but at the cost of failing in mysterious ways.
 57 | directives = {
 58 | 		'profile': False,
 59 | 		'cdivision': True,
 60 | 		'nonecheck': False,
 61 | 		'wraparound': False,
 62 | 		'boundscheck': False,
 63 | 		'infer_types': None,
 64 | 		'embedsignature': True,
 65 | 		'warn.unused': True,
 66 | 		'warn.unreachable': True,
 67 | 		'warn.maybe_uninitialized': True,
 68 | 		'warn.undeclared': False,
 69 | 		'warn.unused_arg': False,
 70 | 		'warn.unused_result': False,
 71 | 		}
 72 | 
 73 | if __name__ == '__main__':
 74 | 	if sys.version_info[:2] < (2, 7) or (3, 0) <= sys.version_info[:2] < (3, 3):
 75 | 		raise RuntimeError('Python version 2.7 or >= 3.3 required.')
 76 | 	os.environ['GCC_COLORS'] = 'auto'
 77 | 	# NB: could also use Cython compile-time definition,
 78 | 	# but this would lead to different C output for Python 2/3.
 79 | 	extra_compile_args = ['-DPY2=%d' % PY2]  # '-fopt-info-vec-missed',
 80 | 	if sys.platform == 'win32':
 81 | 		# https://docs.microsoft.com/en-us/cpp/intrinsics/bitscanforward-bitscanforward64?view=vs-2017
 82 | 		extra_compile_args += ['-EHsc']
 83 | 	else:
 84 | 		extra_compile_args += [
 85 | 				'-Wno-strict-prototypes', '-Wno-unreachable-code', '-Wextra']
 86 | 	extra_link_args = []
 87 | 	if not DEBUG and sys.platform != 'win32':
 88 | 		extra_compile_args += ['-O3', '-DNDEBUG']
 89 | 		extra_compile_args += ['-mtune=native'] if MTUNE else ['-march=native']
 90 | 		extra_link_args += ['-DNDEBUG']
 91 | 	if USE_CYTHON:
 92 | 		if DEBUG:
 93 | 			directives.update(wraparound=True, boundscheck=True)
 94 | 			if sys.platform == 'win32':
 95 | 				extra_compile_args += ['-DDEBUG', '-Od', '-Zi']
 96 | 				extra_link_args += ['-DEBUG']
 97 | 			else:
 98 | 				extra_compile_args += ['-g', '-O0',
 99 | 						# '-fsanitize=address', '-fsanitize=undefined',
100 | 						'-fno-omit-frame-pointer']
101 | 				extra_link_args += ['-g']
102 | 		ext_modules = cythonize(
103 | 				[Extension(
104 | 					'*',
105 | 					sources=['src/*.pyx'],
106 | 					extra_compile_args=extra_compile_args,
107 | 					extra_link_args=extra_link_args)],
108 | 				annotate=True,
109 | 				compiler_directives=directives,
110 | 				language_level=3)
111 | 	else:
112 | 		ext_modules = [Extension(
113 | 				'roaringbitmap',
114 | 				sources=['src/roaringbitmap.c'],
115 | 				extra_compile_args=extra_compile_args,
116 | 				extra_link_args=extra_link_args)]
117 | 	setup(
118 | 			cmdclass=cmdclass,
119 | 			ext_modules=ext_modules,
120 | 			**METADATA)
121 | 


--------------------------------------------------------------------------------
/src/.ignore:
--------------------------------------------------------------------------------
1 | *.c
2 | *.html
3 | 


--------------------------------------------------------------------------------
/src/_arrayops.h:
--------------------------------------------------------------------------------
#include <stddef.h>
#include <stdint.h>
#include <string.h>

#if defined(__SSE4_2__)
    #if defined(_MSC_VER)
        #include <intrin.h>
    #else
        #include <nmmintrin.h>
    #endif
#endif
 11 | 
 12 | /**
 13 |  * Generic intersection function. Passes unit tests.
 14 |  *
 15 |  * From CRoaring, array_util.c
 16 |  * cf. https://github.com/RoaringBitmap/CRoaring/blob/master/src/array_util.c
 17 |  */
 18 | int32_t intersect_general16(const uint16_t *A, const size_t lenA,
 19 |         const uint16_t *B, const size_t lenB, uint16_t *out) {
 20 |     const uint16_t *initout = out;
 21 |     const uint16_t *endA;
 22 |     const uint16_t *endB;
 23 |     if (lenA == 0 || lenB == 0) return 0;
 24 |     endA = A + lenA;
 25 |     endB = B + lenB;
 26 | 
 27 |     while (1) {
 28 |         while (*A < *B) {
 29 | SKIP_FIRST_COMPARE:
 30 |             if (++A == endA) return (int32_t)(out - initout);
 31 |         }
 32 |         while (*A > *B) {
 33 |             if (++B == endB) return (int32_t)(out - initout);
 34 |         }
 35 |         if (*A == *B) {
 36 |             *out++ = *A;
 37 |             if (++A == endA || ++B == endB) return (int32_t)(out - initout);
 38 |         } else {
 39 |             goto SKIP_FIRST_COMPARE;
 40 |         }
 41 |     }
 42 |     return (int32_t)(out - initout);  /* NOTREACHED */
 43 | }
 44 | 
 45 | 
 46 | #if defined(__SSE4_2__)
 47 | 
 48 | static inline int32_t intersect_uint16(
 49 |         const uint16_t* __restrict a, size_t a_size,
 50 |        const uint16_t* __restrict b, size_t b_size,
 51 |        uint16_t* __restrict result) {
 52 |     /* from https://highlyscalable.wordpress.com/2012/06/05/fast-intersection-sorted-lists-sse/ */
 53 |     size_t count = 0;
 54 |     static __m128i shuffle_mask16[256];
 55 |     static int built_shuffle_mask = 0;
 56 |     int i, j;
 57 |     if (!built_shuffle_mask) {
 58 |         built_shuffle_mask = 1;
 59 |         for (i = 0; i < 256; i++) {
 60 |             uint8_t mask[16];
 61 |             memset(mask, 0xFF, sizeof(mask));
 62 |             int counter = 0;
 63 |             for (j = 0; j < 16; j++) {
 64 |                 if (i & (1 << j)) {
 65 |                     mask[counter++] = 2 * j;
 66 |                     mask[counter++] = 2 * j + 1;
 67 |                 }
 68 |             }
 69 |             __m128i v_mask = _mm_loadu_si128((const __m128i *)mask);
 70 |             shuffle_mask16[i] = v_mask;
 71 |         }
 72 |     }
 73 |     size_t i_a = 0, i_b = 0;
 74 |     size_t st_a = (a_size / 8) * 8;
 75 |     size_t st_b = (b_size / 8) * 8;
 76 | 
 77 |     while(i_a < st_a && i_b < st_b) {
 78 |         __m128i v_a = _mm_loadu_si128((__m128i *)&a[i_a]);
 79 |         __m128i v_b = _mm_loadu_si128((__m128i *)&b[i_b]);
 80 |         __m128i v_cmp = _mm_cmpestrm(v_a, 8, v_b, 8,
 81 |                 _SIDD_UWORD_OPS|_SIDD_CMP_EQUAL_ANY|_SIDD_BIT_MASK);
 82 |         int r = _mm_extract_epi32(v_cmp, 0);
 83 |         __m128i v_shuf = _mm_shuffle_epi8(v_b, shuffle_mask16[r]);
 84 |         _mm_storeu_si128((__m128i *)&result[count], v_shuf);
 85 |         count += _mm_popcnt_u32(r);
 86 |         uint16_t a_max = _mm_extract_epi16(v_a, 7);
 87 |         uint16_t b_max = _mm_extract_epi16(v_b, 7);
 88 |         i_a += (a_max <= b_max) * 8;
 89 |         i_b += (a_max >= b_max) * 8;
 90 |     }
 91 |     a += i_a;
 92 |     a_size -= i_a;
 93 |     b += i_b;
 94 |     b_size -= i_b;
 95 |     result += count;
 96 |     return count + intersect_general16(a, a_size, b, b_size, result);
 97 | }
 98 | 
 99 | #else  /* __SSE4_2__ */
100 | 
/* Portable variant for builds without SSE4.2: defer to the scalar merge. */
int32_t intersect_uint16(const uint16_t *A, size_t s_a,
        const uint16_t *B, size_t s_b, uint16_t *C) {
    const int32_t n_common = intersect_general16(A, s_a, B, s_b, C);
    return n_common;
}
105 | 
106 | #endif  /* __SSE4_2__ */
107 | 


--------------------------------------------------------------------------------
/src/arrayops.pxi:
--------------------------------------------------------------------------------
  1 | # Set / search operations on integer arrays
  2 | 
  3 | cdef inline int binarysearch(uint16_t *data, int begin, int end,
  4 | 		uint16_t elem) nogil:
  5 | 	"""Binary search for short `elem` in array `data`.
  6 | 
  7 | 	:returns: positive index ``i`` if ``elem`` is found; otherwise return a
  8 | 		negative value ``i`` such that ``-i - 1`` is the index where ``elem``
  9 | 		should be inserted."""
 10 | 	cdef int low = begin
 11 | 	cdef int high = end - 1
 12 | 	cdef int middleidx
 13 | 	cdef uint16_t middleval
 14 | 	# accelerate the possibly common case of a just appended value
 15 | 	if end > 0 and data[end - 1] < elem:
 16 | 		return -end - 1
 17 | 	while low <= high:
 18 | 		middleidx = (low + high) >> 1
 19 | 		middleval = data[middleidx]
 20 | 		if middleval < elem:
 21 | 			low = middleidx + 1
 22 | 		elif middleval > elem:
 23 | 			high = middleidx - 1
 24 | 		else:
 25 | 			return middleidx
 26 | 	return -(low + 1)
 27 | 
 28 | 
 29 | cdef inline int advance(uint16_t *data, int pos, int length,
 30 | 		uint16_t minitem) nogil:
 31 | 	cdef int lower = pos + 1
 32 | 	cdef int spansize = 1
 33 | 	cdef int upper, mid
 34 | 	if lower >= length or data[lower] >= minitem:
 35 | 		return lower
 36 | 	while lower + spansize < length and data[lower + spansize] < minitem:
 37 | 		spansize *= 2
 38 | 	upper = (lower + spansize) if lower + spansize < length else (length - 1)
 39 | 	if data[upper] == minitem:
 40 | 		return upper
 41 | 	if data[upper] < minitem:
 42 | 		return length
 43 | 	lower += spansize >> 1
 44 | 	while lower + 1 != upper:
 45 | 		mid = (<unsigned int>lower + <unsigned int>upper) >> 1
 46 | 		if data[mid] == minitem:
 47 | 			return mid
 48 | 		elif data[mid] < minitem:
 49 | 			lower = mid
 50 | 		else:
 51 | 			upper = mid
 52 | 	return upper
 53 | 
 54 | 
 55 | cdef uint32_t intersect2by2(uint16_t *data1, uint16_t *data2,
 56 | 		int length1, int length2, uint16_t *dest) nogil:
 57 | 	if length1 * 64 < length2:
 58 | 		return intersectgalloping(data1, length1, data2, length2, dest)
 59 | 	elif length2 * 64 < length1:
 60 | 		return intersectgalloping(data2, length2, data1, length1, dest)
 61 | 	if dest is NULL:
 62 | 		return intersectcard(data1, data2, length1, length2)
 63 | 	elif data1 is not dest and data2 is not dest:
 64 | 		# NB: dest must have 8 elements extra capacity
 65 | 		return intersect_uint16(data1, length1, data2, length2, dest)
 66 | 	return intersect_general16(data1, length1, data2, length2, dest)
 67 | 	# return intersectlocal2by2(data1, length1, data2, length2, dest)
 68 | 
 69 | 
 70 | cdef inline int intersectlocal2by2(uint16_t *data1, int length1,
 71 | 		uint16_t *data2, int length2, uint16_t *dest) nogil:
 72 | 	cdef int k1 = 0, k2 = 0, pos = 0
 73 | 	if length1 == 0 or length2 == 0:
 74 | 		return 0
 75 | 	while True:
 76 | 		if data2[k2] < data1[k1]:
 77 | 			while True:
 78 | 				k2 += 1
 79 | 				if k2 == length2:
 80 | 					return pos
 81 | 				elif data2[k2] >= data1[k1]:
 82 | 					break
 83 | 		elif data1[k1] < data2[k2]:
 84 | 			while True:
 85 | 				k1 += 1
 86 | 				if k1 == length1:
 87 | 					return pos
 88 | 				elif data1[k1] >= data2[k2]:
 89 | 					break
 90 | 		else:  # data1[k1] == data2[k2]
 91 | 			dest[pos] = data1[k1]
 92 | 			pos += 1
 93 | 			k1 += 1
 94 | 			if k1 == length1:
 95 | 				return pos
 96 | 			k2 += 1
 97 | 			if k2 == length2:
 98 | 				return pos
 99 | 
100 | 
cdef inline int intersectcard(uint16_t *data1, uint16_t *data2,
		int length1, int length2) nogil:
	"""Count the elements two sorted arrays have in common.

	Same merge as an intersection, but nothing is stored.

	:returns: the cardinality of the intersection."""
	cdef int i = 0, j = 0, common = 0
	cdef uint16_t a, b
	if length1 == 0 or length2 == 0:
		return 0
	while True:
		a = data1[i]
		b = data2[j]
		if b < a:
			j += 1
			if j == length2:
				return common
		elif a < b:
			i += 1
			if i == length1:
				return common
		else:  # a == b
			common += 1
			i += 1
			if i == length1:
				return common
			j += 1
			if j == length2:
				return common
129 | 
130 | 
cdef inline int intersectgalloping(
		uint16_t *small, int lensmall,
		uint16_t *large, int lenlarge,
		uint16_t *dest) nogil:
	"""Intersect a short sorted array with a much longer sorted array.

	Walks ``small`` one element at a time and uses ``advance`` to gallop
	through ``large``. When ``dest`` is NULL only the cardinality is
	computed; otherwise the common elements are also written to ``dest``.

	:returns: the number of common elements."""
	cdef int ilarge = 0, ismall = 0, pos = 0
	# An empty side means an empty intersection; checking both also avoids
	# reading large[0] when the large array has no elements.
	if lensmall == 0 or lenlarge == 0:
		return 0
	if dest is NULL:  # cardinality only
		while True:
			if large[ilarge] < small[ismall]:
				ilarge = advance(large, ilarge, lenlarge, small[ismall])
				if ilarge == lenlarge:
					return pos
			# from here on large[ilarge] >= small[ismall]
			if small[ismall] < large[ilarge]:
				ismall += 1
				if ismall == lensmall:
					return pos
			else:  # large[ilarge] == small[ismall]
				pos += 1
				ismall += 1
				if ismall == lensmall:
					return pos
				ilarge = advance(large, ilarge, lenlarge, small[ismall])
				if ilarge == lenlarge:
					return pos
	else:  # store result
		while True:
			if large[ilarge] < small[ismall]:
				ilarge = advance(large, ilarge, lenlarge, small[ismall])
				if ilarge == lenlarge:
					return pos
			# from here on large[ilarge] >= small[ismall]
			if small[ismall] < large[ilarge]:
				ismall += 1
				if ismall == lensmall:
					return pos
			else:  # large[ilarge] == small[ismall]
				dest[pos] = small[ismall]
				pos += 1
				ismall += 1
				if ismall == lensmall:
					return pos
				ilarge = advance(large, ilarge, lenlarge, small[ismall])
				if ilarge == lenlarge:
					return pos
175 | 
176 | 
cdef int union2by2(uint16_t *data1, uint16_t *data2,
		int length1, int length2, uint16_t *dest) nogil:
	"""Merge two sorted arrays into their union.

	When ``dest`` is NULL only the size of the union is computed; otherwise
	the merged values are written to ``dest``.

	:returns: the number of elements in the union."""
	cdef int i = 0, j = 0, n = 0, rest
	cdef uint16_t a, b
	if length2 == 0:
		if dest is not NULL:
			memcpy(dest, data1, length1 * sizeof(uint16_t))
		return length1
	if length1 == 0:
		if dest is not NULL:
			memcpy(dest, data2, length2 * sizeof(uint16_t))
		return length2
	if length1 > length2:
		# swap the arguments so that the first array is the shorter one
		return union2by2(data2, data1, length2, length1, dest)
	if dest is NULL:  # cardinality only
		while True:
			a = data1[i]
			b = data2[j]
			if a < b:
				i += 1
			elif a > b:
				j += 1
			else:  # a == b: counted once
				i += 1
				j += 1
			n += 1
			if i >= length1 or j >= length2:
				break
	else:  # store result
		while True:
			a = data1[i]
			b = data2[j]
			if a < b:
				dest[n] = a
				i += 1
			elif a > b:
				dest[n] = b
				j += 1
			else:  # a == b: stored once
				dest[n] = a
				i += 1
				j += 1
			n += 1
			if i >= length1 or j >= length2:
				break
	# one input is exhausted; the remainder of the other goes in unchanged
	if i < length1:
		rest = length1 - i
		if dest is not NULL:
			memcpy(&(dest[n]), &(data1[i]), rest * sizeof(uint16_t))
		n += rest
	elif j < length2:
		rest = length2 - j
		if dest is not NULL:
			memcpy(&(dest[n]), &(data2[j]), rest * sizeof(uint16_t))
		n += rest
	return n
240 | 
241 | 
cdef int union2by2bitmap(uint16_t *data1, uint16_t *data2,
		int length1, int length2, uint64_t *dest) nogil:
	"""Variant of union2by2 that stores the result as a bitmap.

	``dest`` is zeroed over ``BITMAPSIZE`` bytes, after which the bit of
	every value in both arrays is set.

	:returns: ``length1`` plus the number of values in ``data2`` whose bit
		was not set yet."""
	cdef int idx = 0
	cdef int total = length1
	memset(dest, 0, BITMAPSIZE)
	for idx in range(length1):
		SETBIT(dest, data1[idx])
	for idx in range(length2):
		# only values not seen before add to the cardinality
		if TESTBIT(dest, data2[idx]) == 0:
			total += 1
		SETBIT(dest, data2[idx])
	return total
254 | 
255 | 
cdef int difference(uint16_t *data1, uint16_t *data2,
		int length1, int length2, uint16_t *dest) nogil:
	"""Elements of sorted ``data1`` that do not occur in sorted ``data2``.

	When ``dest`` is NULL only the size of the difference is computed;
	otherwise the surviving values are written to ``dest``.

	:returns: the number of elements in the difference."""
	cdef int i = 0, j = 0, n = 0
	cdef uint16_t a, b
	if length2 == 0:
		if dest is not NULL:
			memcpy(<void *>dest, <void *>data1, length1 * sizeof(uint16_t))
		return length1
	if length1 == 0:
		return 0
	if dest is NULL:  # cardinality only
		while True:
			a = data1[i]
			b = data2[j]
			if a > b:
				j += 1
				if j >= length2:
					break
			elif a < b:
				n += 1
				i += 1
				if i >= length1:
					return n
			else:  # a == b: dropped from the result
				i += 1
				j += 1
				if i >= length1:
					return n
				if j >= length2:
					break
		# data2 is exhausted, so everything left in data1 survives
		n += length1 - i
	else:  # store result
		while True:
			a = data1[i]
			b = data2[j]
			if a > b:
				j += 1
				if j >= length2:
					break
			elif a < b:
				dest[n] = a
				n += 1
				i += 1
				if i >= length1:
					return n
			else:  # a == b: dropped from the result
				i += 1
				j += 1
				if i >= length1:
					return n
				if j >= length2:
					break
		# data2 is exhausted, so everything left in data1 survives
		while i < length1:
			dest[n] = data1[i]
			n += 1
			i += 1
	return n
310 | 
311 | 
cdef int xor2by2(uint16_t *data1, uint16_t *data2,
		int length1, int length2, uint16_t *dest) nogil:
	"""Symmetric difference of two sorted arrays.

	When ``dest`` is NULL only the size of the result is computed;
	otherwise the values found in exactly one input are written to ``dest``.

	:returns: the number of elements in the symmetric difference."""
	cdef int i = 0, j = 0, n = 0
	cdef uint16_t a, b
	if length2 == 0:
		if dest is not NULL:
			memcpy(<void *>dest, <void *>data1, length1 * sizeof(uint16_t))
		return length1
	if length1 == 0:
		if dest is not NULL:
			memcpy(<void *>dest, <void *>data2, length2 * sizeof(uint16_t))
		return length2
	if dest is NULL:  # cardinality only
		while True:
			a = data1[i]
			b = data2[j]
			if a == b:  # present in both: not part of the result
				i += 1
				j += 1
			elif a < b:
				n += 1
				i += 1
			else:  # a > b
				n += 1
				j += 1
			if i >= length1 or j >= length2:
				break
		# whatever remains of the other array belongs to the result
		if i >= length1:
			n += length2 - j
		elif j >= length2:
			n += length1 - i
	else:  # store result
		while True:
			a = data1[i]
			b = data2[j]
			if a == b:  # present in both: not part of the result
				i += 1
				j += 1
			elif a < b:
				dest[n] = a
				n += 1
				i += 1
			else:  # a > b
				dest[n] = b
				n += 1
				j += 1
			if i >= length1 or j >= length2:
				break
		# whatever remains of the other array belongs to the result
		if i >= length1:
			while j < length2:
				dest[n] = data2[j]
				n += 1
				j += 1
		elif j >= length2:
			while i < length1:
				dest[n] = data1[i]
				n += 1
				i += 1
	return n
378 | 
379 | 
cdef inline int selectinvertedbinarysearch(
		uint16_t *data, int begin, int end, uint16_t i) nogil:
	"""Find the i'th member, given a sorted array of non-members.

	``data[k] - k`` is the number of members smaller than the k'th
	non-member; the search looks for the first non-member for which this
	count exceeds ``i``.

	Illustration::

		0 1 2   3 4 5   6 7  8    9 10 ... indices
		      0       1         2      ... inverted: indices
		      3       7        11      ... inverted: non-members
		0 1 2   4 5 6   8 9 10   12 13 ... members
	"""
	cdef int lo = begin
	cdef int hi = end - 1
	cdef int mid
	cdef uint16_t below
	# no non-members at all, or none before the requested member
	if end == 0 or data[0] > i:
		return i
	# requested member lies beyond the last non-member
	if data[hi] - hi <= i:
		return i + hi + 1
	# find the pair of non-members between which the i'th member lies
	while lo < hi:
		mid = (lo + hi) >> 1
		below = data[mid] - mid
		if below <= i:
			lo = mid + 1
		else:
			hi = mid
	# lo non-members precede the i'th member
	return i + lo
405 | 


--------------------------------------------------------------------------------
/src/bitcount.h:
--------------------------------------------------------------------------------
  1 | /* Fast cross-platform bit counting using intrinsic functions
  2 |  *
  3 |  * This code is based on https://github.com/Noctune/bitcount
  4 |  * Adapted for 64-bit integers instead of 32 bits.
  5 |  */
  6 | 
  7 | #ifndef BITCOUNT_H_
  8 | #define BITCOUNT_H_
  9 | 
 10 | #ifdef __cplusplus
 11 | extern "C" {
 12 | #endif
 13 | 
 14 | #if !defined(BITCOUNT_NO_AUTODETECT)
 15 | 	#if defined(__GNUC__) || defined(__clang__)
 16 | 		#define BITCOUNT_GCC
 17 | 	// FIXME: disabled for debugging
 18 | 	// #elif defined(_MSC_VER) && defined(_M_X64)
 19 | 	// 	#define BITCOUNT_VS_X64
 20 | 	// #elif defined(_MSC_VER) && defined(_M_IX86)
 21 | 	// 	#define BITCOUNT_VS_X86
 22 | 	#endif
 23 | #endif
 24 | 
 25 | #ifdef _MSC_VER
 26 | #define BITCOUNT_INLINE static __inline
 27 | #else
 28 | #define BITCOUNT_INLINE static inline
 29 | #endif
 30 | 
 31 | #ifdef BITCOUNT_VS_X64
 32 | #include <intrin.h>
 33 | #pragma intrinsic(_BitScanForward64,_BitScanReverse64,__popcnt64)
 34 | #endif
 35 | 
 36 | #ifdef BITCOUNT_VS_X86
 37 | #include <intrin.h>
 38 | #pragma intrinsic(_BitScanForward,_BitScanReverse,__popcnt)
 39 | #endif
 40 | 
 41 | #include <limits.h>
 42 | #include <stdint.h>
 43 | #define BITCOUNT_BITS (sizeof(uint64_t) * CHAR_BIT)
 44 | 
 45 | /* General implementations for systems without intrinsics */
 46 | unsigned int bit_clz_general(uint64_t);
 47 | unsigned int bit_ctz_general(uint64_t);
 48 | unsigned int bit_popcount_general(uint64_t);
 49 | 
 50 | /* Returns the number of leading 0-bits in x, starting at the most significant
 51 |    bit position. If v is 0, the result is undefined. */
 52 | BITCOUNT_INLINE unsigned int bit_clz(uint64_t v) {
 53 | 	#if defined(BITCOUNT_GCC)
 54 | 	return __builtin_clzll(v);
 55 | 	#elif defined(BITCOUNT_VS_X64)
 56 | 	unsigned long result;
 57 | 	_BitScanReverse64(&result, v);
 58 | 	return BITCOUNT_BITS - 1 - result;
 59 | 	#elif defined(BITCOUNT_VS_X86)
 60 | 	unsigned long result;
 61 | 	if ((uint32_t)(v >> 32) != 0) {
 62 | 		_BitScanReverse(&result, (uint32_t)(v >> 32));
 63 | 	} else {
 64 | 		_BitScanReverse(&result, (uint32_t)v);
 65 | 		result += 32;
 66 | 	}
 67 | 	return BITCOUNT_BITS - 1 - result;
 68 | 	#else
 69 | 	return bit_clz_general(v);
 70 | 	#endif
 71 | }
 72 | 
/* Returns the number of trailing 0-bits in v, starting at the least significant
   bit position. If v is 0, the result is undefined.

   Dispatches on the BITCOUNT_* macro selected at the top of this header and
   falls back to bit_ctz_general() when no intrinsic is available. */
BITCOUNT_INLINE unsigned int bit_ctz(uint64_t v) {
	#if defined(BITCOUNT_GCC)
	return __builtin_ctzll(v);
	#elif defined(BITCOUNT_VS_X64)
	unsigned long result;
	_BitScanForward64(&result, v);
	return result;
	#elif defined(BITCOUNT_VS_X86)
	unsigned long result;
	/* https://github.com/google/re2/commit/35febd432d9e6d8630845285c7f29eabd1df7beb */
	/* scan the lower word first; only if it is empty, scan the upper word
	   and offset the index by 32 */
	if ((uint32_t)v != 0) {
		_BitScanForward(&result, (uint32_t)v);
		return (unsigned int)result;
	} else {
		_BitScanForward(&result, (uint32_t)(v >> 32));
		return (unsigned int)(result) + 32;
	}
	#else
	return bit_ctz_general(v);
	#endif
}
 96 | 
/* Returns the number of 1-bits in v.

   Dispatches on the BITCOUNT_* macro selected at the top of this header and
   falls back to bit_popcount_general() when no intrinsic is available. */
BITCOUNT_INLINE unsigned int bit_popcount(uint64_t v) {
	#if defined(BITCOUNT_GCC)
	return __builtin_popcountll(v);
	#elif defined(BITCOUNT_VS_X64)
	return __popcnt64(v);
	#elif defined(BITCOUNT_VS_X86)
	/* count the two 32-bit halves separately and add them up */
	return (__popcnt((uint32_t)v) + __popcnt((uint32_t)(v >> 32)));
	#else
	return bit_popcount_general(v);
	#endif
}
109 | 
/* Portable, branch-free count of leading 0-bits in v.
   Performs a binary search over the word: each step tests whether the upper
   32/16/8/4/2 bits are all zero and, if so, adds that width to the count and
   shifts v left by the same amount. For v == 0 the total comes out as 64. */
unsigned int bit_clz_general(uint64_t v) {
	/* From http://www.codeproject.com/Tips/784635/UInt-Bit-Operations */
	uint64_t i, c;

	/* c = 1 only when v == 0 (~v is all ones), otherwise 0; this supplies
	   the extra 1 needed to reach 64 for a zero input */
	i = ~v;
	c = ((i ^ (i + 1)) & i) >> 63;

	/* adding 0xffffffff carries into bit 32 iff the upper 32 bits are
	   nonzero; i becomes 32 when they are all zero, else 0 */
	i = (v >> 32) + 0xffffffff;
	i = ((i & 0x100000000) ^ 0x100000000) >> 27;
	c += i;  v <<= i;

	/* same test for the upper 16 bits: i is 16 or 0 */
	i = (v >> 48) + 0xffff;
	i = ((i & 0x10000) ^ 0x10000) >> 12;
	c += i;  v <<= i;

	/* upper 8 bits: i is 8 or 0 */
	i = (v >> 56) + 0xff;
	i = ((i & 0x100) ^ 0x100) >> 5;
	c += i;  v <<= i;

	/* upper 4 bits: i is 4 or 0 */
	i = (v >> 60) + 0xf;
	i = ((i & 0x10) ^ 0x10) >> 2;
	c += i;  v <<= i;

	/* upper 2 bits: i is 2 or 0 */
	i = (v >> 62) + 3;
	i = ((i & 4) ^ 4) >> 1;
	c += i;  v <<= i;

	/* final bit: add 1 if the top bit is still clear */
	c += (v >> 63) ^ 1;

	return (unsigned int)c;
}
141 | 
/* Portable, branch-free count of trailing 0-bits in v.
   Performs a binary search over the word: each step tests whether the lower
   32/16/8/4/2 bits are all zero and, if so, adds that width to the count and
   shifts v right by the same amount. For v == 0 the total comes out as 64. */
unsigned int bit_ctz_general(uint64_t v) {
	/* From http://www.codeproject.com/Tips/784635/UInt-Bit-Operations */
	/* c = 1 only when v == 0 (~v is all ones), otherwise 0; this supplies
	   the extra 1 needed to reach 64 for a zero input */
	uint64_t i = ~v;
	uint64_t c = ((i ^ (i + 1)) & i) >> 63;

	/* adding 0xffffffff carries into bit 32 iff the lower 32 bits are
	   nonzero; i becomes 32 when they are all zero, else 0 */
	i = (v & 0xffffffff) + 0xffffffff;
	i = ((i & 0x100000000) ^ 0x100000000) >> 27;
	c += i;  v >>= i;

	/* same test for the lower 16 bits: i is 16 or 0 */
	i = (v & 0xffff) + 0xffff;
	i = ((i & 0x10000) ^ 0x10000) >> 12;
	c += i;  v >>= i;

	/* lower 8 bits: i is 8 or 0 */
	i = (v & 0xff) + 0xff;
	i = ((i & 0x100) ^ 0x100) >> 5;
	c += i;  v >>= i;

	/* lower 4 bits: i is 4 or 0 */
	i = (v & 0xf) + 0xf;
	i = ((i & 0x10) ^ 0x10) >> 2;
	c += i;  v >>= i;

	/* lower 2 bits: i is 2 or 0 */
	i = (v & 3) + 3;
	i = ((i & 4) ^ 4) >> 1;
	c += i;  v >>= i;

	/* final bit: add 1 if the lowest bit is still clear */
	c += ((v & 1) ^ 1);

	return (unsigned int)c;
}
171 | 
/* Portable count of the 1-bits in v (parallel / SWAR bit counting). */
unsigned int bit_popcount_general(uint64_t v) {
	/* see http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel */
	const uint64_t mask_pairs = 0x5555555555555555;
	const uint64_t mask_nibbles = 0x3333333333333333;
	const uint64_t mask_bytes = 0xF0F0F0F0F0F0F0F;
	const uint64_t byte_ones = 0x101010101010101;
	uint64_t sums;

	/* each 2-bit field holds the number of bits set in that field */
	sums = v - ((v >> 1) & mask_pairs);
	/* each 4-bit field holds the count for its nibble */
	sums = (sums & mask_nibbles) + ((sums >> 2) & mask_nibbles);
	/* each byte holds the count for that byte */
	sums = (sums + (sums >> 4)) & mask_bytes;
	/* the multiplication accumulates all byte counts in the top byte */
	return (unsigned int)((sums * byte_ones) >> 56);
}
178 | 
179 | #ifdef __cplusplus
180 | }
181 | #endif
182 | 
183 | #endif /* BITCOUNT_H_ */
184 | 


--------------------------------------------------------------------------------
/src/bitops.pxi:
--------------------------------------------------------------------------------
"""Operations on fixed-size bitvectors.
  2 | 
  3 | All bitvector operands are assumed to have ``BLOCKSIZE`` elements (bits).
  4 | """
  5 | 
  6 | # Store result, return cardinality
  7 | cdef inline uint32_t bitsetintersect(uint64_t *dest,
  8 | 		uint64_t *src1, uint64_t *src2) nogil:
  9 | 	"""dest gets the intersection of src1 and src2.
 10 | 
 11 | 	:returns: number of set bits in result."""
 12 | 	cdef size_t n
 13 | 	cdef uint64_t res1, res2
 14 | 	cdef uint32_t result = 0
 15 | 	for n in range(0, <size_t>(BLOCKSIZE // BITSIZE), 2):
 16 | 		res1 = src1[n] & src2[n]
 17 | 		res2 = src1[n + 1] & src2[n + 1]
 18 | 		dest[n] = res1
 19 | 		dest[n + 1] = res2
 20 | 		result += bit_popcount(res1)
 21 | 		result += bit_popcount(res2)
 22 | 	return result
 23 | 
 24 | 
 25 | cdef inline uint32_t bitsetunion(uint64_t *dest,
 26 | 		uint64_t *src1, uint64_t *src2) nogil:
 27 | 	"""dest gets the union of src1 and src2.
 28 | 
 29 | 	:returns: number of set bits in result."""
 30 | 	cdef size_t n
 31 | 	cdef uint64_t res1, res2
 32 | 	cdef uint32_t result = 0
 33 | 	for n in range(0, <size_t>(BLOCKSIZE // BITSIZE), 2):
 34 | 		res1 = src1[n] | src2[n]
 35 | 		res2 = src1[n + 1] | src2[n + 1]
 36 | 		dest[n] = res1
 37 | 		dest[n + 1] = res2
 38 | 		result += bit_popcount(res1)
 39 | 		result += bit_popcount(res2)
 40 | 	return result
 41 | 
 42 | 
 43 | cdef inline uint32_t bitsetxor(uint64_t *dest,
 44 | 		uint64_t *src1, uint64_t *src2) nogil:
 45 | 	"""dest gets the xor of src1 and src2.
 46 | 
 47 | 	:returns: number of set bits in result."""
 48 | 	cdef size_t n
 49 | 	cdef uint64_t res1, res2
 50 | 	cdef uint32_t result = 0
 51 | 	for n in range(0, <size_t>(BLOCKSIZE // BITSIZE), 2):
 52 | 		res1 = src1[n] ^ src2[n]
 53 | 		res2 = src1[n + 1] ^ src2[n + 1]
 54 | 		dest[n] = res1
 55 | 		dest[n + 1] = res2
 56 | 		result += bit_popcount(res1)
 57 | 		result += bit_popcount(res2)
 58 | 	return result
 59 | 
 60 | 
 61 | cdef inline uint32_t bitsetsubtract(uint64_t *dest,
 62 | 		uint64_t *src1, uint64_t *src2) nogil:
 63 | 	"""dest gets the src2 - src1.
 64 | 
 65 | 	:returns: number of set bits in result."""
 66 | 	cdef size_t n
 67 | 	cdef uint64_t res1, res2
 68 | 	cdef uint32_t result = 0
 69 | 	for n in range(0, <size_t>(BLOCKSIZE // BITSIZE), 2):
 70 | 		res1 = src1[n] & ~src2[n]
 71 | 		res2 = src1[n + 1] & ~src2[n + 1]
 72 | 		dest[n] = res1
 73 | 		dest[n + 1] = res2
 74 | 		result += bit_popcount(res1)
 75 | 		result += bit_popcount(res2)
 76 | 	return result
 77 | 
 78 | 
 79 | # Only store result, no cardinality
 80 | cdef inline void bitsetintersectnocard(uint64_t *dest,
 81 | 		uint64_t *src1, uint64_t *src2) noexcept nogil:
 82 | 	"""dest gets the intersection of src1 and src2."""
 83 | 	cdef size_t n
 84 | 	cdef uint64_t res1, res2
 85 | 	for n in range(0, <size_t>(BLOCKSIZE // BITSIZE), 2):
 86 | 		res1 = src1[n] & src2[n]
 87 | 		res2 = src1[n + 1] & src2[n + 1]
 88 | 		dest[n] = res1
 89 | 		dest[n + 1] = res2
 90 | 
 91 | 
 92 | cdef inline void bitsetunionnocard(uint64_t *dest,
 93 | 		uint64_t *src1, uint64_t *src2) noexcept nogil:
 94 | 	"""dest gets the union of src1 and src2."""
 95 | 	cdef size_t n
 96 | 	cdef uint64_t res1, res2
 97 | 	for n in range(0, <size_t>(BLOCKSIZE // BITSIZE), 2):
 98 | 		res1 = src1[n] | src2[n]
 99 | 		res2 = src1[n + 1] | src2[n + 1]
100 | 		dest[n] = res1
101 | 		dest[n + 1] = res2
102 | 
103 | 
cdef inline void bitsetxornocard(uint64_t *dest,
		uint64_t *src1, uint64_t *src2) noexcept nogil:
	"""Store the symmetric difference (``src1 ^ src2``) in dest.

	Words are processed two at a time; no cardinality is computed."""
	cdef size_t slot
	cdef uint64_t first, second
	for slot in range(0, <size_t>(BLOCKSIZE // BITSIZE), 2):
		first = src1[slot] ^ src2[slot]
		second = src1[slot + 1] ^ src2[slot + 1]
		dest[slot] = first
		dest[slot + 1] = second
114 | 
115 | 
cdef inline void bitsetsubtractnocard(uint64_t *dest,
		uint64_t *src1, uint64_t *src2) noexcept nogil:
	"""Store the difference ``src1 - src2`` (``src1 & ~src2``) in dest.

	Words are processed two at a time; no cardinality is computed."""
	cdef size_t slot
	cdef uint64_t first, second
	for slot in range(0, <size_t>(BLOCKSIZE // BITSIZE), 2):
		first = src1[slot] & ~src2[slot]
		second = src1[slot + 1] & ~src2[slot + 1]
		dest[slot] = first
		dest[slot + 1] = second
126 | 
127 | 
128 | # Count cardinality only
cdef inline uint32_t bitsetintersectcount(
		uint64_t *src1, uint64_t *src2) noexcept nogil:
	"""Count the elements in the intersection of src1 and src2.

	Nothing is stored; both operands are assumed to have a fixed number of
	bits ``BLOCKSIZE``.

	:returns: number of bits set in both src1 and src2."""
	cdef size_t slot
	cdef uint32_t card = 0
	for slot in range(<size_t>(BLOCKSIZE // BITSIZE)):
		card += bit_popcount(src1[slot] & src2[slot])
	return card
140 | 
141 | 
142 | # Other operations
cdef inline int iteratesetbits(uint64_t *vec,
		uint64_t *cur, int *idx) noexcept nogil:
	"""Iterate over set bits in a bitvector of ``BLOCKSIZE`` bits.

	:param vec: the bitvector, an array of 64-bit words.
	:param cur: pointer to variable to maintain state; holds the bits of
		the current word that have not been returned yet. Should be
		initialized to the first word, i.e., ``cur = vec[idx]``.
	:param idx: pointer to variable to maintain state; the index of the
		current word. Should be initialized to 0.
	:returns: the index of a set bit, or -1 if there are no more set
		bits.

	e.g.::

		idx = 0
		cur = vec[idx]
		n = iteratesetbits(vec, &cur, &idx)
		while n != -1:
			...  # n is the index of a set bit, in ascending order
			n = iteratesetbits(vec, &cur, &idx)
	"""
	cdef int bit
	# advance to the next word that still has bits left
	while cur[0] == 0:
		idx[0] += 1
		if idx[0] >= <int>(BLOCKSIZE // BITSIZE):
			return -1
		cur[0] = vec[idx[0]]
	bit = bit_ctz(cur[0])  # index of right-most 1-bit in current slot
	cur[0] &= cur[0] - 1  # clear that bit so it is not returned again
	return idx[0] * BITSIZE + bit
172 | 
173 | 
cdef inline int iterateunsetbits(uint64_t *vec,
		uint64_t *cur, int *idx) noexcept nogil:
	"""Like ``iteratesetbits``, but return indices of zero bits.

	:param vec: the bitvector, an array of 64-bit words.
	:param cur: pointer to variable to maintain state; holds the
		complement of the current word, minus the bits already returned.
		Should be initialized as: ``cur = ~vec[idx]``.
	:param idx: pointer to variable to maintain state; the index of the
		current word. Should be initialized to 0.
	:returns: the index of a zero bit, or -1 if there are no more.
	"""
	cdef int bit
	# advance to the next word that still has zero bits left
	while cur[0] == 0:
		idx[0] += 1
		if idx[0] >= <int>(BLOCKSIZE // BITSIZE):
			return -1
		cur[0] = ~vec[idx[0]]
	bit = bit_ctz(cur[0])  # index of right-most 0-bit in current slot
	cur[0] &= cur[0] - 1  # mark that position as handled
	return idx[0] * BITSIZE + bit
191 | 
192 | 
cdef inline int reviteratesetbits(uint64_t *vec, uint64_t *cur,
		int *idx) noexcept nogil:
	"""Iterate in reverse over set bits in a bitvector.

	:param vec: the bitvector, an array of 64-bit words.
	:param cur: pointer to variable to maintain state; holds the bits of
		the current word that have not been returned yet. Should be
		initialized to the last word, i.e., ``cur = vec[idx]``.
	:param idx: pointer to variable to maintain state; the index of the
		current word. Should be initialized to ``slots - 1``, where slots
		is the number of words in ``vec``.
	:returns: the index of a set bit, or -1 if there are no more set
		bits.

	e.g.::

		idx = slots - 1
		cur = vec[idx]
		n = reviteratesetbits(vec, &cur, &idx)
		while n != -1:
			...  # n is the index of a set bit, in descending order
			n = reviteratesetbits(vec, &cur, &idx)
	"""
	cdef int bit
	# step back to the previous word that still has bits left
	while cur[0] == 0:
		idx[0] -= 1
		if idx[0] < 0:
			return -1
		cur[0] = vec[idx[0]]
	bit = BITSIZE - bit_clz(cur[0]) - 1  # index of left-most 1-bit in cur
	cur[0] ^= 1ULL << bit  # the bit is set, so toggling clears it
	return idx[0] * BITSIZE + bit
223 | 
224 | 
cdef inline uint32_t extractsetbits(uint16_t *dest,
		uint64_t *src) noexcept nogil:
	"""Write the indices of the set bits of src to a preallocated array.

	Indices are produced in ascending order.

	:returns: number of elements in result."""
	cdef size_t slot, count = 0, offset = 0
	cdef uint64_t word
	for slot in range(<size_t>(BLOCKSIZE // BITSIZE)):
		word = src[slot]
		while word:
			dest[count] = offset + bit_ctz(word)
			count += 1
			word &= word - 1  # drop the lowest set bit
		offset += 64  # index of the first bit of the next word
	return count
240 | 
241 | 
cdef inline uint32_t extractunsetbits(uint16_t *dest,
		uint64_t *src) noexcept nogil:
	"""Write the indices of the zero bits of src to a preallocated array.

	Indices are produced in ascending order.

	:returns: number of elements in result."""
	cdef size_t slot, count = 0, offset = 0
	cdef uint64_t word
	for slot in range(<size_t>(BLOCKSIZE // BITSIZE)):
		word = ~src[slot]  # zero bits of src become set bits here
		while word:
			dest[count] = offset + bit_ctz(word)
			count += 1
			word &= word - 1  # drop the lowest set bit
		offset += 64  # index of the first bit of the next word
	return count
257 | 
258 | 
cdef inline uint32_t extractintersection(
		uint16_t *dest, uint64_t *src1, uint64_t *src2) noexcept nogil:
	"""Write the indices of bits set in both src1 and src2 to an array.

	The array is preallocated by the caller; indices are produced in
	ascending order.

	:returns: number of elements in result."""
	cdef size_t slot, count = 0, offset = 0
	cdef uint64_t word
	for slot in range(<size_t>(BLOCKSIZE // BITSIZE)):
		word = src1[slot] & src2[slot]
		while word:
			dest[count] = offset + bit_ctz(word)
			count += 1
			word &= word - 1  # drop the lowest set bit
		offset += 64  # index of the first bit of the next word
	return count
274 | 
275 | 
cdef inline bint bitsubset(uint64_t *vec1, uint64_t *vec2) noexcept nogil:
	"""Test whether vec1 is a subset of vec2.

	i.e., no bit may be set in vec1 without being set in vec2.
	Words are inspected two at a time."""
	cdef size_t slot
	for slot in range(0, <size_t>(BLOCKSIZE // BITSIZE), 2):
		# a bit left over after masking out vec2 is missing from vec2
		if (vec1[slot] & ~vec2[slot]) != 0:
			return False
		if (vec1[slot + 1] & ~vec2[slot + 1]) != 0:
			return False
	return True
286 | 
287 | 
cdef inline bint bitdisjoint(uint64_t *vec1, uint64_t *vec2) noexcept nogil:
	"""Test whether vec1 and vec2 have no set bits in common.

	i.e., len(vec1 & vec2) = 0. Words are inspected two at a time."""
	cdef size_t slot
	for slot in range(0, <size_t>(BLOCKSIZE // BITSIZE), 2):
		if (vec1[slot] & vec2[slot]) != 0:
			return False
		if (vec1[slot + 1] & vec2[slot + 1]) != 0:
			return False
	return True
297 | 
298 | 
cdef inline int select64(uint64_t w, int i) except -1:
	"""Given a 64-bit int w, return the position of the ith 1-bit.

	The lower 32 bits are searched if they contain more than ``i`` set
	bits; otherwise the search continues in the upper 32 bits."""
	cdef uint64_t lower = w & 0xFFFFFFFFUL
	cdef int lowercount = bit_popcount(lower)
	if i < lowercount:
		return select32(lower, i)
	return select32(<uint32_t>(w >> 32), i - lowercount) + 32
307 | 
308 | 
cdef inline int select32(uint32_t w, int i) except -1:
	"""Given a 32-bit int w, return the position of the ith 1-bit.

	The lower 16 bits are searched if they contain more than ``i`` set
	bits; otherwise the search continues in the upper 16 bits."""
	cdef uint64_t lower = w & 0xFFFFUL
	cdef int lowercount = bit_popcount(lower)
	if i < lowercount:
		return select16(lower, i)
	return select16(w >> 16, i - lowercount) + 16
317 | 
318 | 
cdef inline int select16(uint16_t w, int i) except -1:
	"""Given a 16-bit int w, return the position of the ith 1-bit.

	:raises IndexError: if w does not have more than ``i`` set bits."""
	cdef int seen = 0, position
	for position in range(16):
		# running count of set bits at positions 0..position
		seen += (w >> position) & 1
		if seen > i:
			return position
	raise IndexError('select16: index %d out of range 0..%d.' % (
			i, bit_popcount(w)))
328 | 
329 | 
cdef inline void setbitcard(uint64_t *bitmap, uint16_t elem,
		uint32_t *cardinality) noexcept nogil:
	"""Set bit ``elem`` and increment cardinality if it was not yet set.

	The update is done without a branch."""
	cdef uint32_t slot = BITSLOT(elem)
	cdef uint64_t before = bitmap[slot]
	cdef uint64_t after = before | BITMASK(elem)
	# before ^ after has the bit of elem set iff it changed;
	# shifting it down gives 1 or 0
	cardinality[0] += (before ^ after) >> (elem % BITSIZE)
	bitmap[slot] = after
340 | 
341 | 
cdef inline void clearbitcard(uint64_t *bitmap, uint16_t elem,
		uint32_t *cardinality) noexcept nogil:
	"""Clear bit ``elem`` and decrement cardinality if it was set.

	The update is done without a branch."""
	cdef uint32_t slot = BITSLOT(elem)
	cdef uint64_t before = bitmap[slot]
	cdef uint64_t after = before & ~BITMASK(elem)
	# before ^ after has the bit of elem set iff it changed;
	# shifting it down gives 1 or 0
	cardinality[0] -= (before ^ after) >> (elem % BITSIZE)
	bitmap[slot] = after
352 | 
353 | 
cdef inline void togglebitcard(uint64_t *bitmap, uint16_t elem,
		uint32_t *cardinality) noexcept nogil:
	"""Flip bit ``elem`` and adjust cardinality accordingly.

	The update is done without a branch."""
	cdef uint32_t slot = BITSLOT(elem)
	cdef uint64_t before = bitmap[slot]
	cdef uint64_t after = before ^ BITMASK(elem)
	# the shifted words differ only in their lowest bit, so the difference
	# is +1 if the bit got set and -1 (by unsigned wraparound) if it got
	# cleared
	cardinality[0] += (
			(after >> (elem % BITSIZE)) - (before >> (elem % BITSIZE)))
	bitmap[slot] = after
364 | 


--------------------------------------------------------------------------------
/src/immutablerb.pxi:
--------------------------------------------------------------------------------
  1 | cdef class ImmutableRoaringBitmap(RoaringBitmap):
  2 | 	"""A roaring bitmap that does not allow mutation operations.
  3 | 
  4 | 	Any operation resulting in a new roaring bitmap is returned as a mutable
  5 | 	RoaringBitmap (except for ``freeze()`` and the ``ImmutableRoaringBitmap``
  6 | 	constructor). Stores data in one contiguous block of memory for efficient
  7 | 	serialization.
  8 | 	"""
  9 | 	cdef readonly object _ob  # object to be kept for ptr to remain valid
 10 | 	cdef char *ptr  # the data
 11 | 	cdef size_t bufsize  # length in bytes of data
 12 | 	cdef long _hash  # cached hash value, computed as needed
 13 | 
 14 | 	def __init__(self, iterable=None):
 15 | 		"""Return a new RoaringBitmap with elements from ``iterable``.
 16 | 
 17 | 		The elements ``x`` of a RoaringBitmap must be ``0 <= x < 2 ** 32``.
 18 | 		If ``iterable`` is not specified, a new empty RoaringBitmap is
 19 | 		returned. Note that a sorted iterable will significantly speed up the
 20 | 		construction.
 21 | 		``iterable`` may be a generator, in which case the generator is
 22 | 		consumed incrementally.
 23 | 		``iterable`` may be a ``range`` (Python 3) or ``xrange`` (Python 2)
 24 | 		object, which will be constructed efficiently."""
 25 | 		cdef RoaringBitmap ob
 26 | 		cdef ImmutableRoaringBitmap iob
 27 | 		if isinstance(iterable, ImmutableRoaringBitmap):
 28 | 			iob = iterable
 29 | 			self.__setstate__(iob.__getstate__())
 30 | 		else:
 31 | 			ob = ensurerb(iterable or ())
 32 | 			self.__setstate__(ob.__getstate__())
 33 | 
 34 | 	def __getstate__(self):
 35 | 		"""Return a serialized representation (Python array) for pickling."""
 36 | 		if self._ob is None:
 37 | 			state = array.clone(chararray, self.bufsize, False)
 38 | 			memcpy(state.data.as_chars, self.ptr, self.bufsize)
 39 | 			return state
 40 | 		return self._ob
 41 | 
 42 | 	def __setstate__(self, array.array state):
 43 | 		"""Initialize this object with a serialized representation.
 44 | 
 45 | 		:param state: a char array with the pickle format of RoaringBitmap.
 46 | 			Instead of copying this data, it will be used directly.
 47 | 		"""
 48 | 		self._ob = state
 49 | 		# FIXME: 32 byte alignment depends on state.data being aligned.
 50 | 		self._setptr(state.data.as_chars, len(state))
 51 | 
 52 | 	cdef void _setptr(self, char *ptr, size_t size) noexcept nogil:
 53 | 		self.ptr = ptr
 54 | 		self.offset = <size_t>ptr
 55 | 		self.bufsize = size
 56 | 		self._hash = -1
 57 | 		self.size = (<uint32_t *>ptr)[0]
 58 | 		self.capacity = self.size
 59 | 		self.keys = <uint16_t *>&(ptr[sizeof(uint32_t)])
 60 | 		# pointers will be adjusted on the fly with self.offset
 61 | 		self.data = <Block *>&(ptr[
 62 | 				sizeof(uint32_t) + self.size * (sizeof(uint16_t))])
 63 | 
 64 | 	def __hash__(self):
 65 | 		cdef size_t n
 66 | 		if self._hash == -1:
 67 | 			self._hash = 5381
 68 | 			for n in range(self.bufsize):
 69 | 				self._hash = ((self._hash << 5) + self._hash) + self.ptr[n]
 70 | 				# i.e., self._hash *= 33 ^ self.ptr[n]
 71 | 		return self._hash
 72 | 
 73 | 	def __richcmp__(x, y, int op):
 74 | 		cdef ImmutableRoaringBitmap iob1, iob2
 75 | 		if (isinstance(x, ImmutableRoaringBitmap)
 76 | 				and isinstance(y, ImmutableRoaringBitmap)):
 77 | 			if op == 2:  # ==
 78 | 				iob1, iob2 = x, y
 79 | 				if (iob1.bufsize != iob2.bufsize
 80 | 						or iob1.__hash__() != iob2.__hash__()):
 81 | 					return False
 82 | 				return memcmp(iob1.ptr, iob2.ptr, iob1.bufsize) == 0
 83 | 			elif op == 3:  # !=
 84 | 				return not (x == y)
 85 | 		return richcmp(x, y, op)
 86 | 
 87 | 	def __sizeof__(self):
 88 | 		"""Return memory usage in bytes."""
 89 | 		return len(self._ob)
 90 | 
 91 | 	def freeze(self):
 92 | 		"""Already immutable, return self."""
 93 | 		return self
 94 | 
 95 | 	def __repr__(self):
 96 | 		return 'ImmutableRoaringBitmap(%s)' % str(self)
 97 | 
 98 | 	def copy(self):
 99 | 		"""Return a copy of this RoaringBitmap."""
100 | 		cdef ImmutableRoaringBitmap result = ImmutableRoaringBitmap.__new__(
101 | 				ImmutableRoaringBitmap)
102 | 		result.__setstate__(array.copy(self.__getstate__()))
103 | 		return result
104 | 
105 | 	def __iand__(self, x):
106 | 		"""Unsupported method."""
107 | 		raise ValueError('ImmutableRoaringBitmap cannot be modified.')
108 | 
109 | 	def __isub__(self, x):
110 | 		"""Unsupported method."""
111 | 		raise ValueError('ImmutableRoaringBitmap cannot be modified.')
112 | 
113 | 	def __ior__(self, x):
114 | 		"""Unsupported method."""
115 | 		raise ValueError('ImmutableRoaringBitmap cannot be modified.')
116 | 
117 | 	def __ixor__(self, x):
118 | 		"""Unsupported method."""
119 | 		raise ValueError('ImmutableRoaringBitmap cannot be modified.')
120 | 
121 | 	def add(self, uint32_t elem):
122 | 		"""Unsupported method."""
123 | 		raise ValueError('ImmutableRoaringBitmap cannot be modified.')
124 | 
125 | 	def discard(self, uint32_t elem):
126 | 		"""Unsupported method."""
127 | 		raise ValueError('ImmutableRoaringBitmap cannot be modified.')
128 | 
129 | 	def remove(self, uint32_t elem):
130 | 		"""Unsupported method."""
131 | 		raise ValueError('ImmutableRoaringBitmap cannot be modified.')
132 | 
133 | 	def pop(self):
134 | 		"""Unsupported method."""
135 | 		raise ValueError('ImmutableRoaringBitmap cannot be modified.')
136 | 
137 | 	def update(self, *bitmaps):
138 | 		"""Unsupported method."""
139 | 		raise ValueError('ImmutableRoaringBitmap cannot be modified.')
140 | 
141 | 	def intersection_update(self, *bitmaps):
142 | 		"""Unsupported method."""
143 | 		raise ValueError('ImmutableRoaringBitmap cannot be modified.')
144 | 
145 | 	def difference_update(self, *other):
146 | 		"""Unsupported method."""
147 | 		raise ValueError('ImmutableRoaringBitmap cannot be modified.')
148 | 
149 | 	def symmetric_difference_update(self, other):
150 | 		"""Unsupported method."""
151 | 		raise ValueError('ImmutableRoaringBitmap cannot be modified.')
152 | 
153 | 	def flip_range(self, start, stop):
154 | 		"""Unsupported method."""
155 | 		raise ValueError('ImmutableRoaringBitmap cannot be modified.')
156 | 
157 | 	def clear(self):
158 | 		"""Unsupported method."""
159 | 		raise ValueError('ImmutableRoaringBitmap cannot be modified.')
160 | 


--------------------------------------------------------------------------------
/src/macros.h:
--------------------------------------------------------------------------------
 1 | /* http://c-faq.com/misc/bitsets.html */
 2 | /* Original, any word size:
 3 | #define BITSIZE				(8 * sizeof(uint64_t))
 4 | #define BITSLOT(b)			((b) / BITSIZE)
 5 | #define BITMASK(b)			(1ULL << ((b) % BITSIZE))
 6 | #define TESTBIT(a, b)		((a)[BITSLOT(b)] & BITMASK(b))
 7 | NB: TESTBIT returns 0 or a value with bit b set
 8 | Fix word size at 64 bits:
 9 |  */
10 | #define BITSIZE				(64)
11 | #define BITSIZE1			(BITSIZE - 1)
12 | #define BITSLOT(b)			((b) >> 6)
13 | #define BITMASK(b)			(1ULL << ((b) & BITSIZE1))
14 | #define SETBIT(a, b)		((a)[BITSLOT(b)] |= BITMASK(b))
15 | #define TOGGLEBIT(a, b)		((a)[BITSLOT(b)] ^= BITMASK(b))
16 | #define CLEARBIT(a, b)		((a)[BITSLOT(b)] &= ~BITMASK(b))
17 | #define BITNSLOTS(nb)		(((nb) + BITSIZE1) / BITSIZE)
18 | #define TESTBIT(a, b)		(((a)[BITSLOT(b)] >> (b & BITSIZE1)) & 1)
19 | /* NB: TESTBIT returns 0 or 1*/
20 | 
21 | #ifdef _MSC_VER
22 | #define ALIGNED_INLINE __inline
23 | #else
24 | #define ALIGNED_INLINE inline
25 | #endif
26 | 
27 | /* https://stackoverflow.com/q/16376942 */
28 | ALIGNED_INLINE void* aligned_malloc(size_t size, size_t align) {
29 | 	void *result;
30 | 	#ifdef _MSC_VER
31 | 	result = _aligned_malloc(size, align);
32 | 	#else
33 | 	if (posix_memalign(&result, align, size))
34 | 		result = 0;
35 | 	#endif
36 | 	return result;
37 | }
38 | 
/* Release memory obtained from aligned_malloc().
   With MSVC the matching _aligned_free() is used; elsewhere the memory
   comes from posix_memalign() and is released with plain free(). */
ALIGNED_INLINE void aligned_free(void *ptr) {
	#ifdef _MSC_VER
	_aligned_free(ptr);
	#else
	free(ptr);
	#endif
}
46 | 


--------------------------------------------------------------------------------
/src/multirb.pxi:
--------------------------------------------------------------------------------
  1 | @cython.no_gc_clear
  2 | cdef class MultiRoaringBitmap(object):
  3 | 	"""A sequence of immutable roaring bitmaps.
  4 | 
  5 | 	Bitmaps are addressed with 32-bit indices.
  6 | 	Everything is stored in a single contiguous block of memory.
  7 | 
  8 | 	>>> mrb = MultiRoaringBitmap([
  9 | 	...    RoaringBitmap({0, 1, 2}),
 10 | 	...    RoaringBitmap({1, 6, 8}),
 11 | 	...    RoaringBitmap({1, 7, 2})])
 12 | 	>>> mrb.intersection(list(range(len(mrb))))
 13 | 	RoaringBitmap({1})
 14 | 	>>> mrb[0] | mrb[1]
 15 | 	RoaringBitmap({0, 1, 2, 6, 8})
 16 | 	"""
 17 | 	cdef uint32_t size  # the number of roaring bitmaps
 18 | 	cdef uint32_t *offsets  # byte offset in ptr for each roaring bitmap
 19 | 	cdef uint32_t *sizes  # the size in bytes of each roaring bitmap
 20 | 	cdef uint32_t *ptr  # the data
 21 | 	cdef object _ob  # array or mmap which should be kept alive for ptr
 22 | 	cdef object _file  # optionally, file with mmap to be kept open
 23 | 
 24 | 	def __init__(self, list init, filename=None):
 25 | 		"""Pack ``init`` into one mmap: count, offsets, sizes, padding, data.
 26 | 		:param init: a list of set-like objects (e.g., RoaringBitmaps);
 27 | 			``None`` elements are stored as empty bitmaps (size 0).
 28 | 		:param filename: if given, result is stored in an mmap'd file
 29 | 			(created if missing, resized to fit); else in anonymous memory.
 30 | 		:raises ValueError: if no buffer can be obtained from the mmap."""
 31 | 		cdef ImmutableRoaringBitmap irb
 32 | 		cdef uint32_t alloc, offset  # NOTE(review): 32-bit; no overflow check
 33 | 		cdef int alignment = 32  # bitmap data starts at a multiple of this
 34 | 		cdef Py_buffer buffer
 35 | 		cdef Py_ssize_t size = 0
 36 | 		cdef char *ptr = NULL
 37 | 		cdef int result
 38 | 
 39 | 		if filename is not None:
 40 | 			flags = os.O_CREAT | os.O_RDWR  # create if missing, read/write
 41 | 			if sys.platform == 'win32':
 42 | 				flags |= os.O_BINARY
 43 | 			self._file = os.open(filename, flags)
 44 | 
 45 | 		tmp = [None if a is None else ImmutableRoaringBitmap(a) for a in init]
 46 | 		self.size = len(tmp)  # header: count, then all offsets, then all sizes
 47 | 		alloc = sizeof(uint32_t) + 2 * self.size * sizeof(uint32_t)
 48 | 		extra = alignment - alloc % alignment  # 1..32 bytes, never 0
 49 | 		alloc += extra
 50 | 		offset = alloc  # the first bitmap starts right after the padding
 51 | 		for irb in tmp:
 52 | 			if irb is not None:
 53 | 				alloc += irb.bufsize  # also counted for empty bitmaps
 54 | 
 55 | 		if filename is not None:
 56 | 			os.ftruncate(self._file, alloc)  # give the file its final size
 57 | 		self._ob = mmap.mmap(
 58 | 				-1 if filename is None else self._file,
 59 | 				alloc, access=mmap.ACCESS_WRITE)
 60 | 		result = getbufptr(self._ob, &ptr, &size, &buffer)
 61 | 		self.ptr = <uint32_t *>ptr
 62 | 		if result != 0:
 63 | 			raise ValueError('could not get buffer from mmap.')
 64 | 
 65 | 		self.ptr[0] = self.size  # word 0: number of bitmaps
 66 | 		self.offsets = &(self.ptr[1])
 67 | 		self.sizes = &(self.ptr[1 + self.size])
 68 | 		for n in range(1 + 2 * self.size,
 69 | 				1 + 2 * self.size + extra // sizeof(uint32_t)):
 70 | 			self.ptr[n] = 0  # zero the alignment padding
 71 | 		for n, irb in enumerate(tmp):
 72 | 			# byte offset of bitmap n, counted from the start of the buffer
 73 | 			self.ptr[1 + n] = offset
 74 | 			# byte size of bitmap n; 0 marks an empty (or None) entry
 75 | 			if irb is None or irb.size == 0:
 76 | 				self.ptr[1 + n + self.size] = 0
 77 | 				continue
 78 | 			self.ptr[1 + n + self.size] = irb.bufsize
 79 | 			# copy the serialized bitmap behind the previous one
 80 | 			memcpy(&((<char *>self.ptr)[offset]), irb.ptr, irb.bufsize)
 81 | 			offset += irb.bufsize
 82 | 		if filename is not None:
 83 | 			self._ob.flush()
 84 | 		releasebuf(&buffer)  # self.ptr stays in use; self._ob holds the mmap
 85 | 
 86 | 	def __richcmp__(x, y, int op):
 87 | 		"""Element-wise (in)equality with a MultiRoaringBitmap or list.
 88 | 
 89 | 		Only ``==`` (op 2) and ``!=`` (op 3) are implemented; ``None``
 90 | 		compares unequal, any other operand type raises TypeError."""
 91 | 		if x is None or y is None:
 92 | 			if op in (2, 3):
 93 | 				return op == 3
 94 | 			raise TypeError
 95 | 		accepted = (MultiRoaringBitmap, list)
 96 | 		if not (isinstance(x, accepted) and isinstance(y, accepted)):
 97 | 			raise TypeError
 98 | 		if op != 2 and op != 3:
 99 | 			return NotImplemented
100 | 		# equal iff same length and all corresponding elements are equal
101 | 		same = len(x) == len(y) and all(a == b for a, b in zip(x, y))
102 | 		return same if op == 2 else not same
103 | 
104 | 	def close(self):
105 | 		"""Close the mmap and the file descriptor opened for it, if any."""
106 | 		if hasattr(self._ob, 'close'):  # e.g. mmap; bytes or None are skipped
107 | 			self._ob.close()
108 | 			self._ob = None  # NOTE(review): ptr/offsets/sizes are not reset
109 | 			if self._file is not None:
110 | 				os.close(self._file)
111 | 				self._file = None
112 | 
113 | 	def __enter__(self):  # context manager entry: no setup required
114 | 		return self
115 | 
116 | 	def __exit__(self, _type, _value, _traceback):
117 | 		self.close()  # release mmap/file; exceptions are not suppressed
118 | 
119 | 	def __getstate__(self):
120 | 		"""Return a serialized representation (bytes copy) for pickling."""
121 | 		return bytes(self._ob)
122 | 
123 | 	def __setstate__(self, state):
124 | 		"""Initialize this object with a serialized representation (bytes)."""
125 | 		self._ob = state  # keep the object alive: ptr points into it
126 | 		self.ptr = <uint32_t *><char *>state
127 | 		self.size = self.ptr[0]  # word 0: number of bitmaps
128 | 		self.offsets = &(self.ptr[1])
129 | 		self.sizes = &(self.ptr[1 + self.size])
130 | 
131 | 	@classmethod
132 | 	def fromfile(cls, filename):
133 | 		"""Load a MultiRoaringBitmap from a file using a read-only mmap."""
134 | 		cdef MultiRoaringBitmap ob
135 | 		cdef Py_buffer buffer
136 | 		cdef char *ptr = NULL
137 | 		cdef Py_ssize_t size = 0
138 | 		ob = MultiRoaringBitmap.__new__(MultiRoaringBitmap)  # skips __init__
139 | 		flags = os.O_RDONLY
140 | 		if sys.platform == 'win32':
141 | 			flags |= os.O_BINARY
142 | 		ob._file = os.open(filename, flags)  # closed again by close()
143 | 		ob._ob = mmap.mmap(ob._file, 0, access=mmap.ACCESS_READ)
144 | 		result = getbufptr(ob._ob, &ptr, &size, &buffer)
145 | 		ob.ptr = <uint32_t *>ptr
146 | 		if result != 0:
147 | 			raise ValueError('could not get buffer from mmap.')
148 | 		ob.size = ob.ptr[0]  # word 0: number of bitmaps
149 | 		ob.offsets = &(ob.ptr[1])
150 | 		ob.sizes = &(ob.ptr[1 + ob.size])
151 | 		# rest of the mapping (length 0 maps the whole file) is bitmap data
152 | 		releasebuf(&buffer)  # ob.ptr stays in use; ob._ob holds the mmap
153 | 		return ob
154 | 
155 | 	@classmethod
156 | 	def frombuffer(cls, data, int offset):
157 | 		"""Load a MultiRoaringBitmap from a Python object using the buffer
158 | 		interface (e.g. bytes or mmap object), starting at byte ``offset``."""
159 | 		cdef MultiRoaringBitmap ob = MultiRoaringBitmap.__new__(
160 | 				MultiRoaringBitmap)
161 | 		cdef char *ptr = NULL
162 | 		cdef Py_buffer buffer
163 | 		cdef Py_ssize_t size = 0
164 | 		result = getbufptr(data, &ptr, &size, &buffer)
165 | 		ob.ptr = <uint32_t *>&ptr[offset]  # offset is not checked against size
166 | 		if result != 0:
167 | 			raise ValueError('could not get buffer from mmap.')
168 | 		ob.size = ob.ptr[0]  # word 0: number of bitmaps
169 | 		ob.offsets = &(ob.ptr[1])
170 | 		ob.sizes = &(ob.ptr[1 + ob.size])
171 | 		# NOTE(review): ob._ob is not set; caller must keep ``data`` alive?
172 | 		releasebuf(&buffer)
173 | 		return ob
174 | 
175 | 	def bufsize(self):  # NOTE(review): reads entry size - 1; empty unchecked
176 | 		"""Return size in number of bytes (end of the last bitmap)."""
177 | 		return self.offsets[self.size - 1] + self.sizes[self.size - 1]
178 | 
179 | 	def __len__(self):  # number of bitmaps in this collection
180 | 		return self.size
181 | 
182 | 	def __getitem__(self, i):
183 | 		"""Return bitmap ``i``, or a list of bitmaps for a slice; negative
184 | 		indices count from the end. Raises TypeError for other index types
185 | 		and IndexError for an index that is out of range."""
186 | 		if isinstance(i, slice):
187 | 			return [self[n] for n in range(*i.indices(self.size))]
188 | 		if not isinstance(i, (int, long)):
189 | 			raise TypeError('Expected integer index or slice object.')
190 | 		# get() reports an out-of-range index by returning None
191 | 		bitmap = self.get(i + self.size if i < 0 else i)
192 | 		if bitmap is None:
193 | 			raise IndexError
194 | 		return bitmap
195 | 
196 | 	cpdef get(self, long i):
197 | 		"""Look up bitmap `i` without raising: returns ``None`` unless
198 | 		``0 <= i < len(self)``, else an ``ImmutableRoaringBitmap``."""
199 | 		cdef ImmutableRoaringBitmap view
200 | 		if not (i >= 0 and i < self.size):
201 | 			return None
202 | 		if not self.sizes[i]:
203 | 			return EMPTYIRB  # zero-size entries have no data to point at
204 | 		view = ImmutableRoaringBitmap.__new__(ImmutableRoaringBitmap)
205 | 		view._setptr(&(<char *>self.ptr)[self.offsets[i]], self.sizes[i])
206 | 		return view
207 | 
208 | 	def getsize(self, long i):  # byte size of bitmap i; no bounds check
209 | 		return self.sizes[i]
210 | 
211 | 	def intersection(self, list indices,
212 | 			uint32_t start=0, uint32_t stop=0xffffffffUL):
213 | 		"""Compute the intersection of the roaring bitmaps in this collection
214 | 		selected by a list of indices (the list itself is not modified).
215 | 
216 | 		:param start: optional start index.
217 | 		:param stop: optional end index;
218 | 			if given, only return elements ``n`` s.t. ``start <= n < stop``.
219 | 		:returns: a mutable RoaringBitmap (an ImmutableRoaringBitmap for one
220 | 			index without ``start``/``stop``); ``None`` when an index is
221 | 			invalid, refers to an empty bitmap, or the result is empty.
222 | 		"""
223 | 		cdef ImmutableRoaringBitmap ob1, ob2
224 | 		cdef RoaringBitmap result
225 | 		cdef char *ptr = <char *>self.ptr
226 | 		cdef long i, j, numindices = len(indices)
227 | 		if numindices == 0:
228 | 			return None
229 | 		for i in range(numindices):  # validate everything up front
230 | 			j = indices[i]
231 | 			if j < 0 or j >= self.size or self.sizes[j] == 0:
232 | 				return None
233 | 		ob1 = ImmutableRoaringBitmap.__new__(ImmutableRoaringBitmap)
234 | 		if numindices == 1:
235 | 			i = indices[0]
236 | 			ob1._setptr(&(ptr[self.offsets[i]]), self.sizes[i])
237 | 			if start or stop < 0xffffffffUL:
238 | 				result = rb_clamp(ob1, start, stop)
239 | 				return result if result.size else None
240 | 			return ob1
241 | 		# smallest bitmap first; sort a copy so the caller's list is untouched
242 | 		indices = sorted(indices, key=self.getsize)
243 | 		ob2 = ImmutableRoaringBitmap.__new__(ImmutableRoaringBitmap)
244 | 		i, j = indices[0], indices[1]
245 | 		ob1._setptr(&(ptr[self.offsets[i]]), self.sizes[i])
246 | 		ob2._setptr(&(ptr[self.offsets[j]]), self.sizes[j])
247 | 		if start or stop < 0xffffffffUL:
248 | 			result = rb_clamp(ob1, start, stop)
249 | 			rb_iand(result, ob2)
250 | 		else:
251 | 			result = rb_and(ob1, ob2)
252 | 		for i in range(2, numindices):
253 | 			if result.size == 0:
254 | 				break  # already empty: further intersections cannot change it
255 | 			j = indices[i]
256 | 			ob1._setptr(&(ptr[self.offsets[j]]), self.sizes[j])  # reuse ob1
257 | 			rb_iand(result, ob1)
258 | 		return result if result.size else None
259 | 
260 | 	def andor_len_pairwise(self, array.array indices1, array.array indices2,
261 | 			array.array resultand, array.array resultor):
262 | 		"""Pairwise intersection/union cardinality for pairs of roaring bitmaps
263 | 		in this collection given by ``zip(indices1, indices2)``.
264 | 
265 | 		:param indices1: indices of the first bitmap of each pair
266 | 		:param indices2: indices of the second bitmap of each pair
267 | 		:param resultand: receives the intersection cardinality of each pair
268 | 		:param resultor: receives the union cardinality of each pair
269 | 
270 | 		All parameters should be Python arrays of type 'L', preallocated with
271 | 		the same length (not validated here); result arrays are overwritten.
272 | 
273 | 		>>> result1 = array.array('L', [0] * 3)
274 | 		>>> result2 = array.array('L', [0] * 3)
275 | 		>>> mrb.andor_len_pairwise(array.array('L', [0, 6, 8]),
276 | 		...			array.array('L', [1, 7, 6]), result1, result2)
277 | 		>>> result1
278 | 		array.array('L', [3, 2, 56])
279 | 		>>> result2
280 | 		array.array('L', [6, 4, 123])
281 | 		"""
282 | 		cdef char *ptr = <char *>self.ptr
283 | 		cdef int i, j, n, lenindices1 = len(indices1)  # sets the loop count
284 | 		cdef ImmutableRoaringBitmap ob1, ob2
285 | 		ob1 = ImmutableRoaringBitmap.__new__(ImmutableRoaringBitmap)
286 | 		ob2 = ImmutableRoaringBitmap.__new__(ImmutableRoaringBitmap)
287 | 		with nogil:  # the two objects are re-pointed at each pair in turn
288 | 			for n in range(lenindices1):
289 | 				i, j = indices1.data.as_ulongs[n], indices2.data.as_ulongs[n]
290 | 				ob1._setptr(&(ptr[self.offsets[i]]), self.sizes[i])
291 | 				ob2._setptr(&(ptr[self.offsets[j]]), self.sizes[j])
292 | 				if self.sizes[i] and self.sizes[j]:  # both non-empty
293 | 					rb_andor_len(ob1, ob2, &(resultand.data.as_ulongs[n]),
294 | 							&(resultor.data.as_ulongs[n]))
295 | 				else:  # NOTE(review): union also becomes 0 if just one is empty
296 | 					resultand.data.as_ulongs[n] = 0
297 | 					resultor.data.as_ulongs[n] = 0
298 | 
299 | 	def jaccard_dist(self, array.array indices1, array.array indices2):
300 | 		"""Compute the Jaccard distances for pairs of roaring bitmaps
301 | 		in this collection given by ``zip(indices1, indices2)``.
302 | 
303 | 		>>> mrb.jaccard_dist(array.array('L', [0, 6, 8]),
304 | 		...			array.array('L', [1, 7, 6]))
305 | 		array.array('d', [0.3, 0.2, 0.56])
306 | 
307 | 		:param indices1: indices of the first bitmap of each pair
308 | 		:param indices2: indices of the second bitmap of each pair
309 | 		:returns: a Python array of doubles; 1 for pairs with an empty bitmap.
310 | 
311 | 		``indices1`` and ``indices2`` should be arrays of unsigned long
312 | 		integers, created with ``array.array('L')``. Ensure that all indices
313 | 		`i` are in the range ``0 <= i < len(self)``.
314 | 		"""
315 | 		cdef ImmutableRoaringBitmap ob1, ob2
316 | 		cdef array.array result = array.clone(dblarray, len(indices1), False)
317 | 		cdef char *ptr = <char *>self.ptr
318 | 		cdef int i, j, n, lenindices1 = len(indices1)  # sets the loop count
319 | 		ob1 = ImmutableRoaringBitmap.__new__(ImmutableRoaringBitmap)
320 | 		ob2 = ImmutableRoaringBitmap.__new__(ImmutableRoaringBitmap)
321 | 		with nogil:  # indices and len(indices2) are not validated here
322 | 			for n in range(lenindices1):
323 | 				i, j = indices1.data.as_ulongs[n], indices2.data.as_ulongs[n]
324 | 				ob1._setptr(&(ptr[self.offsets[i]]), self.sizes[i])
325 | 				ob2._setptr(&(ptr[self.offsets[j]]), self.sizes[j])
326 | 				result.data.as_doubles[n] = (rb_jaccard_dist(ob1, ob2)
327 | 						if self.sizes[i] and self.sizes[j] else 1)
328 | 		return result
329 | 
330 | 	def jaccard_dist_single(self, RoaringBitmap rb):
331 | 		"""Compute the Jaccard distances for `rb` with all roaring bitmaps
332 | 		in this collection; the distance to an empty bitmap is 1.
333 | 
334 | 		>>> mrb.jaccard_dist_single(RoaringBitmap([1, 6, 19, 22]))
335 | 		array.array('d', [0.3, 0.2, 0.56])
336 | 
337 | 		:param rb: a roaring bitmap.
338 | 		:returns: a Python array of ``len(self)`` Jaccard distances."""
339 | 		cdef ImmutableRoaringBitmap ob1, ob2
340 | 		cdef array.array result = array.clone(dblarray, len(self), False)
341 | 		cdef char *ptr = <char *>self.ptr
342 | 		cdef uint32_t n
343 | 		ob1 = ImmutableRoaringBitmap.__new__(ImmutableRoaringBitmap)
344 | 		ob2 = ImmutableRoaringBitmap(rb)
345 | 		with nogil:
346 | 			for n in range(self.size):
347 | 				result.data.as_doubles[n] = 1  # value for an empty entry
348 | 				if self.sizes[n]:  # zero-size entries have no data to read
349 | 					ob1._setptr(&(ptr[self.offsets[n]]), self.sizes[n])
350 | 					result.data.as_doubles[n] = rb_jaccard_dist(ob1, ob2)
351 | 		return result
352 | 


--------------------------------------------------------------------------------
/src/rbbinaryops.pxi:
--------------------------------------------------------------------------------
  1 | cdef inline richcmp(x, y, int op):
  2 | 	"""Set comparison for op codes 0-5 between RoaringBitmaps and sets;
  3 | 	other types raise a TypeError; ``None`` only supports ``==``/``!=``."""
  4 | 	cdef RoaringBitmap ob1, ob2
  5 | 	cdef size_t n
  6 | 	if x is None or y is None:
  7 | 		if op == 2 or op == 3:
  8 | 			return op == 3  # None is unequal to any bitmap
  9 | 		raise TypeError
 10 | 	if (not isinstance(x, (RoaringBitmap, set))
 11 | 			or not isinstance(y, (RoaringBitmap, set))):
 12 | 		raise TypeError
 13 | 	if op == 2:  # ==
 14 | 		ob1, ob2 = ensurerb(x), ensurerb(y)
 15 | 		if ob1.size != ob2.size:  # different number of blocks
 16 | 			return False
 17 | 		if memcmp(ob1.keys, ob2.keys, ob1.size * sizeof(uint16_t)) != 0:
 18 | 			return False  # the block keys differ
 19 | 		for n in range(ob1.size):  # cheap check before comparing contents
 20 | 			if ob1.data[n].cardinality != ob2.data[n].cardinality:
 21 | 				return False
 22 | 		for n in range(ob1.size):  # NOTE(review): assumes equal storage layout
 23 | 			if memcmp(
 24 | 					<void *>(ob1.offset + ob1.data[n].buf.offset),
 25 | 					<void *>(ob2.offset + ob2.data[n].buf.offset),
 26 | 					getsize(&(ob1.data[n])) * sizeof(uint16_t)) != 0:
 27 | 				return False  # raw block contents differ
 28 | 		return True
 29 | 	elif op == 3:  # !=
 30 | 		return not richcmp(x, y, 2)
 31 | 	elif op == 1:  # <=
 32 | 		return ensurerb(x).issubset(y)
 33 | 	elif op == 5:  # >=
 34 | 		return ensurerb(x).issuperset(y)
 35 | 	elif op == 0:  # <
 36 | 		return len(x) < len(y) and ensurerb(x).issubset(y)  # proper subset
 37 | 	elif op == 4:  # >
 38 | 		return len(x) > len(y) and ensurerb(x).issuperset(y)
 39 | 	return NotImplemented
 40 | 
 41 | 
 42 | cdef inline RoaringBitmap rb_iand(RoaringBitmap ob1, RoaringBitmap ob2):
 43 | 	"""In-place intersection ``ob1 &= ob2``; returns ``ob1``. The buffer of
 44 | 	every block dropped from ``ob1`` is freed here."""
 45 | 	cdef uint32_t pos1 = 0, pos2 = 0, res = 0
 46 | 	cdef uint16_t *keys = NULL
 47 | 	cdef Block *data = NULL
 48 | 	cdef Block b2
 49 | 	if ob2.size == 0:  # the result is empty: drop everything
 50 | 		for pos1 in range(ob1.size):
 51 | 			aligned_free(ob1.data[pos1].buf.ptr)
 52 | 		ob1._resize(0)
 53 | 	elif ob1.size > 0:
 54 | 		ob1.capacity = min(ob1.size, ob2.size)  # upper bound on kept blocks
 55 | 		ob1._tmpalloc(ob1.capacity, &keys, &data)
 56 | 		while pos1 < ob1.size and pos2 < ob2.size:
 57 | 			if ob1.keys[pos1] < ob2.keys[pos2]:  # key only in ob1: drop
 58 | 				aligned_free(ob1.data[pos1].buf.ptr)
 59 | 				pos1 += 1
 60 | 			elif ob1.keys[pos1] > ob2.keys[pos2]:  # key only in ob2: skip
 61 | 				pos2 += 1
 62 | 			else:  # ob1.keys[pos1] == ob2.keys[pos2]:
 63 | 				block_iand(&(ob1.data[pos1]), ob2._getblk(pos2, &b2))
 64 | 				if ob1.data[pos1].cardinality > 0:
 65 | 					keys[res] = ob1.keys[pos1]
 66 | 					data[res] = ob1.data[pos1]  # struct copy; buffer is moved
 67 | 					res += 1
 68 | 				else:
 69 | 					aligned_free(ob1.data[pos1].buf.ptr)
 70 | 				pos1 += 1
 71 | 				pos2 += 1
 72 | 		# ob2 exhausted: the remaining blocks of ob1 are dropped as well
 73 | 		while pos1 < ob1.size:
 74 | 			aligned_free(ob1.data[pos1].buf.ptr)
 75 | 			pos1 += 1
 76 | 		ob1._replacearrays(keys, data, res)
 77 | 	return ob1
 78 | 
 79 | 
 80 | cdef inline RoaringBitmap rb_isub(RoaringBitmap ob1, RoaringBitmap ob2):
 81 | 	cdef uint32_t pos1 = 0, pos2 = 0, res = 0  # in-place ob1 -= ob2
 82 | 	cdef uint16_t *keys = NULL
 83 | 	cdef Block *data = NULL
 84 | 	cdef Block b2
 85 | 	if pos1 < ob1.size and pos2 < ob2.size:  # else ob1 stays as it is
 86 | 		ob1.capacity = ob1.size  # at most every block of ob1 is kept
 87 | 		ob1._tmpalloc(ob1.capacity, &keys, &data)
 88 | 		while True:  # merge over the two sorted key arrays
 89 | 			if ob1.keys[pos1] < ob2.keys[pos2]:  # key only in ob1: keep
 90 | 				keys[res] = ob1.keys[pos1]
 91 | 				data[res] = ob1.data[pos1]  # struct copy; buffer is moved
 92 | 				res += 1
 93 | 				pos1 += 1
 94 | 				if pos1 == ob1.size:
 95 | 					break
 96 | 			elif ob1.keys[pos1] > ob2.keys[pos2]:  # key only in ob2: skip
 97 | 				pos2 += 1
 98 | 				if pos2 == ob2.size:
 99 | 					break
100 | 			else:  # ob1.keys[pos1] == ob2.keys[pos2]:
101 | 				block_isub(&(ob1.data[pos1]), ob2._getblk(pos2, &b2))
102 | 				if ob1.data[pos1].cardinality > 0:
103 | 					keys[res] = ob1.keys[pos1]
104 | 					data[res] = ob1.data[pos1]
105 | 					res += 1
106 | 				else:  # the block became empty: drop it and free its buffer
107 | 					aligned_free(ob1.data[pos1].buf.ptr)
108 | 				pos1 += 1
109 | 				pos2 += 1
110 | 				if pos1 == ob1.size or pos2 == ob2.size:
111 | 					break
112 | 		if pos2 == ob2.size:  # ob2 exhausted: keep the rest of ob1
113 | 			for pos1 in range(pos1, ob1.size):
114 | 				keys[res] = ob1.keys[pos1]
115 | 				data[res] = ob1.data[pos1]
116 | 				res += 1
117 | 		ob1._replacearrays(keys, data, res)  # install the res kept blocks
118 | 	return ob1
119 | 
120 | 
121 | cdef inline RoaringBitmap rb_ior(RoaringBitmap ob1, RoaringBitmap ob2):
122 | 	cdef uint32_t pos1 = 0, pos2 = 0, res = 0  # in-place union: ob1 |= ob2
123 | 	cdef uint16_t *keys = NULL
124 | 	cdef Block *data = NULL
125 | 	cdef Block b2
126 | 	if ob2.size == 0:  # nothing to add
127 | 		return ob1
128 | 	ob1.capacity = ob1.size + ob2.size  # upper bound: no key in common
129 | 	ob1._tmpalloc(ob1.capacity, &keys, &data)
130 | 	if pos1 < ob1.size and pos2 < ob2.size:
131 | 		while True:  # merge over the two sorted key arrays
132 | 			if ob1.keys[pos1] < ob2.keys[pos2]:  # key only in ob1: keep
133 | 				keys[res] = ob1.keys[pos1]
134 | 				data[res] = ob1.data[pos1]  # struct copy; buffer is moved
135 | 				res += 1
136 | 				pos1 += 1
137 | 				if pos1 == ob1.size:
138 | 					break
139 | 			elif ob1.keys[pos1] > ob2.keys[pos2]:  # key only in ob2: copy
140 | 				keys[res] = ob2.keys[pos2]
141 | 				block_copy(&(data[res]), ob2._getblk(pos2, &b2))
142 | 				res += 1
143 | 				pos2 += 1
144 | 				if pos2 == ob2.size:
145 | 					break
146 | 			else:  # ob1.keys[pos1] == ob2.keys[pos2]:
147 | 				block_ior(&(ob1.data[pos1]), ob2._getblk(pos2, &b2))
148 | 				keys[res] = ob1.keys[pos1]
149 | 				data[res] = ob1.data[pos1]
150 | 				res += 1
151 | 				pos1 += 1
152 | 				pos2 += 1
153 | 				if pos1 == ob1.size or pos2 == ob2.size:
154 | 					break
155 | 	if pos1 == ob1.size:  # ob1 exhausted: copy the rest of ob2
156 | 		for pos2 in range(pos2, ob2.size):
157 | 			keys[res] = ob2.keys[pos2]
158 | 			block_copy(&(data[res]), ob2._getblk(pos2, &b2))
159 | 			res += 1
160 | 	elif pos2 == ob2.size:  # ob2 exhausted: keep the rest of ob1
161 | 		for pos1 in range(pos1, ob1.size):
162 | 			keys[res] = ob1.keys[pos1]
163 | 			data[res] = ob1.data[pos1]
164 | 			res += 1
165 | 	ob1._replacearrays(keys, data, res)  # install the res merged blocks
166 | 	return ob1
167 | 
168 | 
169 | cdef inline RoaringBitmap rb_ixor(RoaringBitmap ob1, RoaringBitmap ob2):
170 | 	cdef uint32_t pos1 = 0, pos2 = 0, res = 0  # in-place ob1 ^= ob2
171 | 	cdef uint16_t *keys = NULL
172 | 	cdef Block *data = NULL
173 | 	cdef Block b2
174 | 	ob1.capacity = ob1.size + ob2.size  # upper bound: no key in common
175 | 	ob1._tmpalloc(ob1.capacity, &keys, &data)
176 | 	if pos1 < ob1.size and pos2 < ob2.size:
177 | 		while True:  # merge over the two sorted key arrays
178 | 			if ob1.keys[pos1] < ob2.keys[pos2]:  # key only in ob1: keep
179 | 				keys[res] = ob1.keys[pos1]
180 | 				data[res] = ob1.data[pos1]  # struct copy; buffer is moved
181 | 				res += 1
182 | 				pos1 += 1
183 | 				if pos1 == ob1.size:
184 | 					break
185 | 			elif ob1.keys[pos1] > ob2.keys[pos2]:  # key only in ob2: copy
186 | 				keys[res] = ob2.keys[pos2]
187 | 				block_copy(&(data[res]), ob2._getblk(pos2, &b2))
188 | 				res += 1
189 | 				pos2 += 1
190 | 				if pos2 == ob2.size:
191 | 					break
192 | 			else:  # ob1.keys[pos1] == ob2.keys[pos2]:
193 | 				block_ixor(&(ob1.data[pos1]), ob2._getblk(pos2, &b2))
194 | 				if ob1.data[pos1].cardinality > 0:
195 | 					keys[res] = ob1.keys[pos1]
196 | 					data[res] = ob1.data[pos1]
197 | 					res += 1
198 | 				else:  # the block became empty: drop it and free its buffer
199 | 					aligned_free(ob1.data[pos1].buf.ptr)
200 | 				pos1 += 1
201 | 				pos2 += 1
202 | 				if pos1 == ob1.size or pos2 == ob2.size:
203 | 					break
204 | 	if pos1 == ob1.size:  # ob1 exhausted: copy the rest of ob2
205 | 		for pos2 in range(pos2, ob2.size):
206 | 			keys[res] = ob2.keys[pos2]
207 | 			block_copy(&(data[res]), ob2._getblk(pos2, &b2))
208 | 			res += 1
209 | 	elif pos2 == ob2.size:  # ob2 exhausted: keep the rest of ob1
210 | 		for pos1 in range(pos1, ob1.size):
211 | 			keys[res] = ob1.keys[pos1]
212 | 			data[res] = ob1.data[pos1]
213 | 			res += 1
214 | 	ob1._replacearrays(keys, data, res)  # install the res merged blocks
215 | 	return ob1
216 | 
217 | 
218 | cdef inline RoaringBitmap rb_and(RoaringBitmap ob1, RoaringBitmap ob2):
219 | 	cdef RoaringBitmap result = RoaringBitmap()  # new bitmap: ob1 & ob2
220 | 	cdef uint32_t pos1 = 0, pos2 = 0
221 | 	cdef Block b1, b2
222 | 	if pos1 < ob1.size and pos2 < ob2.size:  # else the result is empty
223 | 		# initialize to zero so that unallocated blocks can be detected
224 | 		result._initarray(min(ob1.size, ob2.size))
225 | 		while True:  # merge over the two sorted key arrays
226 | 			if ob1.keys[pos1] < ob2.keys[pos2]:
227 | 				pos1 += 1
228 | 				if pos1 == ob1.size:
229 | 					break
230 | 			elif ob1.keys[pos1] > ob2.keys[pos2]:
231 | 				pos2 += 1
232 | 				if pos2 == ob2.size:
233 | 					break
234 | 			else:  # ob1.keys[pos1] == ob2.keys[pos2]:
235 | 				block_and(&(result.data[result.size]),
236 | 						ob1._getblk(pos1, &b1), ob2._getblk(pos2, &b2))
237 | 				if result.data[result.size].cardinality:
238 | 					result.keys[result.size] = ob1.keys[pos1]
239 | 					result.size += 1  # keep it; otherwise the slot is reused
240 | 				pos1 += 1
241 | 				pos2 += 1
242 | 				if pos1 == ob1.size or pos2 == ob2.size:
243 | 					break
244 | 		aligned_free(result.data[result.size].buf.ptr)  # slot past the end
245 | 		result._resize(result.size)
246 | 	return result
247 | 
248 | 
249 | cdef inline RoaringBitmap rb_sub(RoaringBitmap ob1, RoaringBitmap ob2):
250 | 	"""Return a new bitmap with the elements of ``ob1`` not in ``ob2``.
251 | 
252 | 	Both remainder loops below advance ``pos1`` explicitly up to
253 | 	``ob1.size``, so that no block of ``ob1`` can be copied twice."""
254 | 	cdef RoaringBitmap result = RoaringBitmap()
255 | 	cdef uint32_t pos1 = 0, pos2 = 0
256 | 	cdef Block b1, b2
257 | 	result._initarray(ob1.size)
258 | 	if pos1 < ob1.size and pos2 < ob2.size:
259 | 		while pos1 < ob1.size and pos2 < ob2.size:
260 | 			if ob1.keys[pos1] < ob2.keys[pos2]:  # key only in ob1: copy
261 | 				result._insertcopy(
262 | 						result.size, ob1.keys[pos1], ob1._getblk(pos1, &b1))
263 | 				pos1 += 1
264 | 			elif ob1.keys[pos1] > ob2.keys[pos2]:
265 | 				pos2 += 1  # key only in ob2: nothing to subtract from
266 | 			else:  # ob1.keys[pos1] == ob2.keys[pos2]:
267 | 				block_sub(&(result.data[result.size]),
268 | 						ob1._getblk(pos1, &b1), ob2._getblk(pos2, &b2))
269 | 				# an empty difference is not kept: its slot is not advanced
270 | 				if result.data[result.size].cardinality > 0:
271 | 					result.keys[result.size] = ob1.keys[pos1]
272 | 					result.size += 1
273 | 				pos1 += 1
274 | 				pos2 += 1
275 | 		# ob2 exhausted first: the rest of ob1 is copied as is
276 | 		while pos2 == ob2.size and pos1 < ob1.size:
277 | 			result._insertcopy(
278 | 					result.size, ob1.keys[pos1], ob1._getblk(pos1, &b1))
279 | 			pos1 += 1
280 | 		aligned_free(result.data[result.size].buf.ptr)
281 | 		result._resize(result.size)
282 | 	# ob2 is empty (the merge above was skipped): copy all of ob1
283 | 	while pos2 == ob2.size and pos1 < ob1.size:
284 | 		result._insertcopy(
285 | 				result.size, ob1.keys[pos1], ob1._getblk(pos1, &b1))
286 | 		pos1 += 1
287 | 	return result
288 | 
289 | 
290 | cdef inline RoaringBitmap rb_or(RoaringBitmap ob1, RoaringBitmap ob2):
291 | 	cdef RoaringBitmap result = RoaringBitmap()  # new bitmap: ob1 | ob2
292 | 	cdef uint32_t pos1 = 0, pos2 = 0
293 | 	cdef Block b1, b2
294 | 	if pos1 < ob1.size and pos2 < ob2.size:
295 | 		result._initarray(ob1.size + ob2.size)  # upper bound on blocks
296 | 		while True:  # merge over the two sorted key arrays
297 | 			if ob1.keys[pos1] < ob2.keys[pos2]:  # key only in ob1: copy
298 | 				result._insertcopy(
299 | 						result.size, ob1.keys[pos1], ob1._getblk(pos1, &b1))
300 | 				pos1 += 1
301 | 				if pos1 == ob1.size:
302 | 					break
303 | 			elif ob1.keys[pos1] > ob2.keys[pos2]:  # key only in ob2: copy
304 | 				result._insertcopy(
305 | 						result.size, ob2.keys[pos2], ob2._getblk(pos2, &b2))
306 | 				pos2 += 1
307 | 				if pos2 == ob2.size:
308 | 					break
309 | 			else:  # ob1.keys[pos1] == ob2.keys[pos2]:
310 | 				block_or(&(result.data[result.size]),
311 | 						ob1._getblk(pos1, &b1), ob2._getblk(pos2, &b2))
312 | 				result.keys[result.size] = ob1.keys[pos1]
313 | 				result.size += 1
314 | 				pos1 += 1
315 | 				pos2 += 1
316 | 				if pos1 == ob1.size or pos2 == ob2.size:
317 | 					break
318 | 	if pos1 == ob1.size:  # ob1 exhausted (or empty): copy the rest of ob2
319 | 		result._extendarray(ob2.size - pos2)
320 | 		for pos2 in range(pos2, ob2.size):
321 | 			result._insertcopy(result.size,
322 | 					ob2.keys[pos2], ob2._getblk(pos2, &b2))
323 | 	elif pos2 == ob2.size:  # ob2 exhausted (or empty): copy the rest of ob1
324 | 		result._extendarray(ob1.size - pos1)
325 | 		for pos1 in range(pos1, ob1.size):
326 | 			result._insertcopy(
327 | 					result.size, ob1.keys[pos1], ob1._getblk(pos1, &b1))
328 | 	result._resize(result.size)
329 | 	return result
330 | 
331 | 
332 | cdef inline RoaringBitmap rb_xor(RoaringBitmap ob1, RoaringBitmap ob2):
333 | 	cdef RoaringBitmap result = RoaringBitmap()  # new bitmap: ob1 ^ ob2
334 | 	cdef uint32_t pos1 = 0, pos2 = 0
335 | 	cdef Block b1, b2
336 | 	if pos1 < ob1.size and pos2 < ob2.size:
337 | 		result._initarray(ob1.size + ob2.size)  # upper bound on blocks
338 | 		while True:  # merge over the two sorted key arrays
339 | 			if ob1.keys[pos1] < ob2.keys[pos2]:  # key only in ob1: copy
340 | 				result._insertcopy(
341 | 						result.size, ob1.keys[pos1], ob1._getblk(pos1, &b1))
342 | 				pos1 += 1
343 | 				if pos1 == ob1.size:
344 | 					break
345 | 			elif ob1.keys[pos1] > ob2.keys[pos2]:  # key only in ob2: copy
346 | 				result._insertcopy(
347 | 						result.size, ob2.keys[pos2], ob2._getblk(pos2, &b2))
348 | 				pos2 += 1
349 | 				if pos2 == ob2.size:
350 | 					break
351 | 			else:  # ob1.keys[pos1] == ob2.keys[pos2]:
352 | 				block_xor(&(result.data[result.size]),
353 | 						ob1._getblk(pos1, &b1), ob2._getblk(pos2, &b2))
354 | 				if result.data[result.size].cardinality > 0:
355 | 					result.keys[result.size] = ob1.keys[pos1]
356 | 					result.size += 1  # keep it; otherwise the slot is reused
357 | 				pos1 += 1
358 | 				pos2 += 1
359 | 				if pos1 == ob1.size or pos2 == ob2.size:
360 | 					break
361 | 		aligned_free(result.data[result.size].buf.ptr)  # slot past the end
362 | 	if pos1 == ob1.size:  # ob1 exhausted (or empty): copy the rest of ob2
363 | 		result._extendarray(ob2.size - pos2)
364 | 		for pos2 in range(pos2, ob2.size):
365 | 			result._insertcopy(
366 | 					result.size, ob2.keys[pos2], ob2._getblk(pos2, &b2))
367 | 	elif pos2 == ob2.size:  # ob2 exhausted (or empty): copy the rest of ob1
368 | 		result._extendarray(ob1.size - pos1)
369 | 		for pos1 in range(pos1, ob1.size):
370 | 			result._insertcopy(
371 | 					result.size, ob1.keys[pos1], ob1._getblk(pos1, &b1))
372 | 	result._resize(result.size)
373 | 	return result
374 | 
375 | 
376 | cdef bint rb_isdisjoint(RoaringBitmap self, RoaringBitmap ob):
377 | 	cdef Block b1, b2  # True iff self and ob have no element in common
378 | 	cdef size_t n
379 | 	cdef int i = 0  # where the search in ob's keys resumes
380 | 	if self.size == 0 or ob.size == 0:  # an empty set is disjoint from all
381 | 		return True
382 | 	for n in range(self.size):
383 | 		i = ob._binarysearch(i, ob.size, self.keys[n])
384 | 		if i < 0:  # key absent from ob; -i - 1 is used as the next position
385 | 			if -i - 1 >= <int>ob.size:  # beyond ob's last key: done
386 | 				return True
387 | 			i = -i - 1
388 | 		elif not block_isdisjoint(self._getblk(n, &b1), ob._getblk(i, &b2)):
389 | 			return False  # a shared key whose blocks overlap
390 | 	return True
391 | 
392 | 
393 | cdef inline bint rb_issubset(RoaringBitmap self, RoaringBitmap ob):
394 | 	"""Return whether every element of ``self`` is also in ``ob``."""
395 | 	cdef Block blk1, blk2
396 | 	cdef size_t k
397 | 	cdef int pos = 0
398 | 	if self.size == 0 or ob.size == 0:
399 | 		return self.size == 0
400 | 	for k in range(self.size):
401 | 		pos = ob._binarysearch(pos, ob.size, self.keys[k])
402 | 		if pos < 0:
403 | 			return False
404 | 	pos = 0
405 | 	# all keys of self occur in ob; now compare the matching blocks
406 | 	for k in range(self.size):
407 | 		pos = ob._binarysearch(pos, ob.size, self.keys[k])
408 | 		if not block_issubset(self._getblk(k, &blk1), ob._getblk(pos, &blk2)):
409 | 			return False
410 | 	return True
411 | 
412 | 
413 | cdef inline RoaringBitmap rb_clamp(RoaringBitmap self,
414 | 		uint32_t start, uint32_t stop):
415 | 	"""Return a new bitmap with the elements ``n`` of ``self`` such that
416 | 	``start <= n < stop``. ``i``/``j`` are the first/last block that can
417 | 	hold such elements; a missing start block rounds up, a missing stop
418 | 	block rounds down, and ``j < i`` means no block lies in the range."""
419 | 	cdef Block b1
420 | 	cdef RoaringBitmap result = RoaringBitmap()
421 | 	cdef int ii = self._getindex(highbits(start))
422 | 	cdef int jj = self._getindex(highbits(stop))
423 | 	cdef int i = -ii - 1 if ii < 0 else ii
424 | 	cdef int j = -jj - 2 if jj < 0 else jj
425 | 	if i >= <int32_t>self.size or j < i:
426 | 		return result
427 | 	result._initarray(j - i + 1)
428 | 	block_clamp(
429 | 			&(result.data[0]), self._getblk(i, &b1),
430 | 			lowbits(start) if i == ii else 0,
431 | 			lowbits(stop) if i == jj else BLOCKSIZE)
432 | 	if result.data[result.size].cardinality:
433 | 		result.keys[result.size] = self.keys[i]
434 | 		result.size += 1
435 | 	else:
436 | 		aligned_free(result.data[0].buf.ptr)
437 | 	for n in range(i + 1, j):
438 | 		block_copy(&(result.data[result.size]), self._getblk(n, &b1))
439 | 		result.keys[result.size] = self.keys[n]
440 | 		result.size += 1
441 | 	if i != j:
442 | 		block_clamp(
443 | 				&(result.data[result.size]), self._getblk(j, &b1),
444 | 				0, lowbits(stop) if jj >= 0 else BLOCKSIZE)
445 | 		if result.data[result.size].cardinality:
446 | 			result.keys[result.size] = self.keys[j]
447 | 			result.size += 1
448 | 		else:
449 | 			aligned_free(result.data[result.size].buf.ptr)
450 | 	result._resize(result.size)
451 | 	return result
452 | 
453 | 
454 | cdef inline void rb_andor_len(RoaringBitmap ob1, RoaringBitmap ob2,
455 | 		unsigned long *intersection_result,
456 | 		unsigned long *union_result) noexcept nogil:
457 | 	cdef Block b1, b2  # counts |ob1 & ob2| and |ob1 | ob2| in one pass
458 | 	cdef uint32_t pos1 = 0, pos2 = 0, tmp1, tmp2
459 | 	union_result[0] = intersection_result[0] = 0  # outputs are overwritten
460 | 	if pos1 < ob1.size and pos2 < ob2.size:
461 | 		while True:  # merge over the two sorted key arrays
462 | 			if ob1.keys[pos1] < ob2.keys[pos2]:  # key only in ob1
463 | 				union_result[0] += ob1.data[pos1].cardinality
464 | 				pos1 += 1
465 | 				if pos1 == ob1.size:
466 | 					break
467 | 			elif ob1.keys[pos1] > ob2.keys[pos2]:  # key only in ob2
468 | 				union_result[0] += ob2.data[pos2].cardinality
469 | 				pos2 += 1
470 | 				if pos2 == ob2.size:
471 | 					break
472 | 			else:  # shared key: tmp1/tmp2 get this block pair's counts
473 | 				tmp1 = tmp2 = 0
474 | 				block_andorlen(
475 | 						ob1._getblk(pos1, &b1),
476 | 						ob2._getblk(pos2, &b2),
477 | 						&tmp1, &tmp2)
478 | 				intersection_result[0] += tmp1
479 | 				union_result[0] += tmp2
480 | 				pos1 += 1
481 | 				pos2 += 1
482 | 				if pos1 == ob1.size or pos2 == ob2.size:
483 | 					break
484 | 	if pos1 == ob1.size and pos2 < ob2.size:  # rest of ob2: union only
485 | 		for pos2 in range(pos2, ob2.size):
486 | 			union_result[0] += ob2.data[pos2].cardinality
487 | 	elif pos2 == ob2.size and pos1 < ob1.size:  # rest of ob1: union only
488 | 		for pos1 in range(pos1, ob1.size):
489 | 			union_result[0] += ob1.data[pos1].cardinality
490 | 
491 | 
492 | cdef inline double rb_jaccard_dist(RoaringBitmap ob1,
493 | 		RoaringBitmap ob2) noexcept nogil:
494 | 	"""Jaccard distance ``1 - |ob1 & ob2| / |ob1 | ob2|``; 1 when the
495 | 	union is empty."""
496 | 	cdef unsigned long n_union = 0, n_common = 0
497 | 	rb_andor_len(ob1, ob2, &n_common, &n_union)
498 | 	return 1 - (n_common / <double>n_union) if n_union != 0 else 1
499 | 


--------------------------------------------------------------------------------
/src/roaringbitmap.pyx:
--------------------------------------------------------------------------------
   1 | """Roaring bitmap in Cython.
   2 | 
   3 | A Roaring bitmap stores a set of 32 bit integers compactly while allowing for
   4 | efficient set operations. The space of integers is partitioned into blocks
   5 | of ``2 ** 16`` integers. The representation for a block depends on the number
   6 | of elements it contains:
   7 | 
   8 | <= 4096 elements:
   9 | 	an array of up to ``1 << 12`` 16-bit integers that are part of the set.
  10 | 
  11 | >= 61440 elements:
  12 | 	an array of up to ``1 << 12`` 16-bit integers that are not part of the set.
  13 | 
  14 | otherwise:
  15 | 	a fixed bitmap of ``1 << 16`` (65536) bits with a 1-bit for each element.
  16 | 
  17 | A ``RoaringBitmap`` can be used as a replacement for a mutable
  18 | Python ``set`` containing unsigned 32-bit integers:
  19 | 
  20 | >>> from roaringbitmap import RoaringBitmap
  21 | >>> RoaringBitmap(range(10)) & RoaringBitmap(range(5, 15))
  22 | RoaringBitmap({5, 6, 7, 8, 9})
  23 | 
  24 | ``ImmutableRoaringBitmap`` is an immutable variant (analogous to ``frozenset``)
  25 | which is stored compactly as a contiguous block of memory.
  26 | 
  27 | ``MultiRoaringBitmap`` stores a sequence of immutable roaring bitmaps
  28 | in an efficiently serializable, contiguous block of memory.
  29 | """
  30 | # TODOs
  31 | # [ ] SSE/AVX2 intrinsics:
  32 | #     array intersection [x] SSE; [ ] AVX
  33 | #     bitmap=>array [ ] SSE; [ ] AVX
  34 | # [ ] separate cardinality & binary ops for bitmaps
  35 | #     [ ] and; [-] or; [ ] xor; [ ] sub
  36 | #     slower in benchmarks
  37 | # [ ] check growth strategy of arrays
  38 | # [ ] more operations:
  39 | #     [ ] efficient shifts
  40 | #     [ ] operate on slices without instantiating range as temp object
  41 | # [ ] subclass Set ABC?
  42 | # [ ] error checking, robustness
  43 | 
  44 | import io
  45 | import os
  46 | import sys
  47 | import mmap
  48 | import heapq
  49 | import array
  50 | 
  51 | from libc.stdint cimport uint8_t, uint16_t, uint32_t, uint64_t, int32_t
  52 | from libc.stdio cimport printf
  53 | from libc.stdlib cimport free, malloc, calloc, realloc, abort
  54 | from libc.string cimport memset, memcpy, memcmp, memmove
  55 | from cpython.buffer cimport PyBUF_SIMPLE, Py_buffer, PyObject_CheckBuffer, \
  56 | 		PyObject_GetBuffer, PyBuffer_Release
  57 | from cpython cimport array
  58 | cimport cython
  59 | 
  60 | cdef extern from *:
  61 | 	cdef bint PY2
  62 | 
  63 | 
  64 | cdef extern from "Python.h":
  65 | 	int PyObject_CheckReadBuffer(object)
  66 | 	int PyObject_AsReadBuffer(object, const void **, Py_ssize_t *)
  67 | 
  68 | 
# Bit-vector helpers and aligned allocation routines declared in macros.h.
# The bit helpers operate on arrays of uint64_t words; every callable here is
# declared ``nogil``.
cdef extern from "macros.h":
	int BITSIZE
	int BITSLOT(int b) nogil
	int BITNSLOTS(int nb) nogil
	void SETBIT(uint64_t a[], int b) nogil
	void CLEARBIT(uint64_t a[], int b) nogil
	uint64_t TESTBIT(uint64_t a[], int b) nogil
	uint64_t BITMASK(int b) nogil
	void *aligned_malloc(size_t size, size_t align) nogil
	void aligned_free(void *ptr) nogil
  79 | 
# Bit counting primitives on 64-bit words, declared in bitcount.h.
# NOTE(review): by their names these are count-leading-zeros,
# count-trailing-zeros and population count -- confirm in the header.
cdef extern from "bitcount.h":
	unsigned int bit_clz(uint64_t) nogil
	unsigned int bit_ctz(uint64_t) nogil
	unsigned int bit_popcount(uint64_t) nogil
	size_t BITCOUNT_BITS
	size_t UINT64_MAX
  86 | 
  87 | 
# Intersection routines for arrays of uint16_t, declared in _arrayops.h.
# Both take two input arrays with their lengths plus an output array and
# return an int32_t (NOTE(review): presumably the number of elements written
# to ``out`` -- confirm in the header).
cdef extern from "_arrayops.h":
	int32_t intersect_uint16(uint16_t *A, size_t lenA,
			uint16_t *B, size_t lenB, uint16_t *out) nogil
	int32_t intersect_general16(uint16_t *A, size_t lenA,
			uint16_t *B, size_t lenB, uint16_t *out) nogil
  93 | 
  94 | 
cdef union Buffer:
	# Storage handle of a block. All members alias the same memory; which
	# one is meaningful depends on the state of the owning Block and on
	# whether the block lives in memory or in a serialized buffer.
	void *ptr  # untyped view, used for freeing and copying
	uint16_t *sparse  # view as an array of 16-bit elements
	uint64_t *dense  # view as a bitvector of 64-bit words
	size_t offset  # byte offset into a serialized buffer instead of a pointer
	uint64_t _padding  # ensure that this union takes at least 64 bits.
 101 | 
 102 | 
cdef struct Block:
	# A set of 2**16 integers, stored as bitmap or array.
	#
	# This block may contain a bitvector (DENSE) or a sparse array;
	# The array can contain elements corresponding to 0-bits (INVERTED)
	# or 1-bits (POSITIVE).
	#
	# cardinality is 32 bits wide because a completely filled block holds
	# 1 << 16 elements, which does not fit in a uint16_t.
	Buffer buf  # data: sparse array or fixed-size bitvector
	uint32_t cardinality  # the number of elements
	uint16_t capacity  # number of allocated uint16_t elements
	uint16_t state  # either DENSE, INVERTED, or POSITIVE
	# NB: make state uint16_t so that the struct is 16 bytes without padding.
 114 | 
 115 | # The maximum number of elements in a block
 116 | DEF BLOCKSIZE = 1 << 16
 117 | 
 118 | # The number of bytes to store a bitmap of 2**16 bits:
 119 | DEF BITMAPSIZE = BLOCKSIZE // 8
 120 | 
 121 | # Maximum length of positive/inverted sparse arrays:
 122 | DEF MAXARRAYLENGTH = 1 << 12
 123 | 
 124 | # Capacity (elements) to allocate for an empty array
 125 | DEF INITCAPACITY = 4
 126 | 
# Extra elements in result to accommodate SSE/AVX vector operations
 128 | DEF OVERALLOC = 8
 129 | 
 130 | # The different ways a block may store its elements:
 131 | DEF DENSE = 0
 132 | DEF POSITIVE = 1
 133 | DEF INVERTED = 2
 134 | 
 135 | include "bitops.pxi"
 136 | include "arrayops.pxi"
 137 | include "block.pxi"
 138 | include "rbbinaryops.pxi"
 139 | include "immutablerb.pxi"
 140 | include "multirb.pxi"
 141 | 
# Empty template arrays with type codes 'B', 'd' and 'L'. chararray is passed
# to array.clone() to create byte arrays of a requested length.
chararray = array.array(b'B' if PY2 else 'B')
dblarray = array.array(b'd' if PY2 else 'd')
longarray = array.array(b'L' if PY2 else 'L')
# The range type of the running Python version (xrange on Python 2).
RANGE = xrange if PY2 else range
# A shared ImmutableRoaringBitmap instance created without arguments.
EMPTYIRB = ImmutableRoaringBitmap()
 147 | 
 148 | 
 149 | cdef class RoaringBitmap(object):
 150 | 	"""A compact, mutable set of 32-bit integers."""
 151 | 	cdef Block *data  # pointer and size of array/bitmap with elements
 152 | 	cdef uint16_t *keys  # the high bits of elements in each block
 153 | 	cdef uint32_t size  # the number of blocks
 154 | 	cdef uint32_t capacity  # the allocated capacity for blocks
 155 | 	cdef size_t offset  # used for immutable bitmaps with relative pointers
 156 | 
	def __cinit__(self, *args, **kwargs):
		# Start out as an empty set that owns no memory yet.
		self.keys = NULL
		self.data = NULL
		self.size = 0
		self.capacity = 0
		self.offset = 0
 160 | 
	def __init__(self, iterable=None):
		"""Create a RoaringBitmap holding the elements of ``iterable``.

		Every element ``x`` must satisfy ``0 <= x < 2 ** 32``. Without an
		argument the new RoaringBitmap is empty. Construction is
		significantly faster when the iterable is sorted.

		Depending on the type of ``iterable`` a different strategy is used:

		- a ``range`` (Python 3) / ``xrange`` (Python 2) object with a
		  non-negative start and positive step is constructed efficiently
		  without visiting the individual elements;
		- lists, tuples, sets, dicts and other ranges are traversed twice;
		- another RoaringBitmap is copied block by block;
		- anything else (e.g. a generator) is consumed incrementally."""
		cdef size_t n
		cdef Block b1
		cdef RoaringBitmap ob
		if iterable is None:
			return
		if isinstance(iterable, RANGE):
			_, (start, stop, step) = iterable.__reduce__()
			if step >= 1 and 0 <= start < stop:
				self._initrange(start, stop, step)
				return
			# non-trivial ranges are handled as generic sequences below
		if isinstance(iterable, RoaringBitmap):
			ob = iterable
			self._extendarray(ob.size)
			for n in range(ob.size):
				self._insertcopy(self.size, ob.keys[n], ob._getblk(n, &b1))
		elif isinstance(iterable, (list, tuple, set, dict, RANGE)):
			self._init2pass(iterable)
		else:
			self._inititerator(iterable)
 190 | 
	def __dealloc__(self):
		# Memory is only released when this object has a block array and
		# its offset is zero; otherwise nothing is freed here.
		if self.data is NULL or self.offset != 0:
			return
		for n in range(self.size):
			aligned_free(self.data[n].buf.ptr)
		free(<void *>self.keys)
		free(<void *>self.data)
		self.keys = self.data = NULL
		self.size = 0
 199 | 
	def copy(self):
		"""Return a new RoaringBitmap with copies of all blocks of this one."""
		cdef RoaringBitmap clone = RoaringBitmap()
		cdef size_t idx
		# reserve room for all blocks up front, then append them in key order
		clone._extendarray(self.size)
		for idx in range(self.size):
			clone._insertcopy(clone.size, self.keys[idx], &(self.data[idx]))
		return clone
 208 | 
	def freeze(self):
		"""Return an ImmutableRoaringBitmap with the contents of this set.

		The immutable object is initialized from the serialized state of
		this RoaringBitmap."""
		cdef ImmutableRoaringBitmap frozen
		frozen = ImmutableRoaringBitmap.__new__(ImmutableRoaringBitmap)
		frozen.__setstate__(self.__getstate__())
		return frozen
 215 | 
	def __contains__(self, uint32_t elem):
		# Look up the block for the high bits; a negative index means that
		# no such block exists, so the element cannot be present.
		cdef Block b1
		cdef int i = self._getindex(highbits(elem))
		if i < 0:
			return False
		return block_contains(self._getblk(i, &b1), lowbits(elem))
 223 | 
	def __richcmp__(x, y, int op):
		# All rich comparisons are delegated to the ``richcmp`` helper;
		# ``op`` is the integer comparison code passed in by the caller.
		return richcmp(x, y, op)
 226 | 
	def isdisjoint(self, other):
		"""Return True if this set and ``other`` have no element in common.

		``other`` is first passed through ``ensurerb``."""
		rhs = ensurerb(other)
		return rb_isdisjoint(self, rhs)
 230 | 
	def issubset(self, other):
		"""Report whether every element of this set is also in ``other``.

		``other`` is first passed through ``ensurerb``."""
		rhs = ensurerb(other)
		return rb_issubset(self, rhs)
 234 | 
	def issuperset(self, other):
		"""Report whether this RoaringBitmap contains another set.

		``other`` is passed through ``ensurerb``, exactly as in ``issubset``
		and ``isdisjoint``, so it does not need to offer an ``issubset``
		method of its own."""
		return rb_issubset(ensurerb(other), self)
 238 | 
	def min(self):
		"""Return the smallest element, i.e. the element with rank 0."""
		smallest = self.select(0)
		return smallest
 242 | 
	def max(self):
		"""Return the largest element: the first one in reversed order."""
		descending = reversed(self)
		return next(descending)
 246 | 
	def __and__(x, y):
		# Both operands go through ensurerb before they are intersected.
		cdef RoaringBitmap lhs = ensurerb(x)
		cdef RoaringBitmap rhs = ensurerb(y)
		return rb_and(lhs, rhs)
 250 | 
	def __sub__(x, y):
		# Both operands go through ensurerb before taking the difference.
		cdef RoaringBitmap lhs = ensurerb(x)
		cdef RoaringBitmap rhs = ensurerb(y)
		return rb_sub(lhs, rhs)
 254 | 
	def __or__(x, y):
		# Both operands go through ensurerb before they are united.
		cdef RoaringBitmap lhs = ensurerb(x)
		cdef RoaringBitmap rhs = ensurerb(y)
		return rb_or(lhs, rhs)
 258 | 
	def __xor__(x, y):
		# Both operands go through ensurerb before the symmetric difference.
		cdef RoaringBitmap lhs = ensurerb(x)
		cdef RoaringBitmap rhs = ensurerb(y)
		return rb_xor(lhs, rhs)
 262 | 
	def __iand__(self, x):
		# In-place intersection; the right operand goes through ensurerb.
		cdef RoaringBitmap rhs = ensurerb(x)
		return rb_iand(self, rhs)
 266 | 
	def __isub__(self, x):
		# In-place difference; the right operand goes through ensurerb.
		cdef RoaringBitmap rhs = ensurerb(x)
		return rb_isub(self, rhs)
 270 | 
	def __ior__(self, x):
		# In-place union; the right operand goes through ensurerb.
		cdef RoaringBitmap rhs = ensurerb(x)
		return rb_ior(self, rhs)
 274 | 
	def __ixor__(self, x):
		# In-place symmetric difference; the right operand goes through
		# ensurerb.
		cdef RoaringBitmap rhs = ensurerb(x)
		return rb_ixor(self, rhs)
 278 | 
	def add(self, uint32_t elem):
		"""Insert ``elem`` into the set.

		Nothing changes when the element is already a member."""
		cdef Block *block
		cdef uint16_t key = highbits(elem)
		cdef int i = self._getindex(key)
		if i < 0:
			# No block for this key yet; -i - 1 is the position at which
			# a new, empty positive array block is inserted.
			block = self._insertempty(-i - 1, key)
			block.cardinality = 0
			block.state = POSITIVE
			block.buf.sparse = allocsparse(INITCAPACITY)
			block.capacity = INITCAPACITY
		else:
			block = &(self.data[i])
		block_add(block, lowbits(elem))
		block_convert(block)
 296 | 
	def clamp(self, uint32_t start, uint32_t stop):
		"""Return new set with range of values restricted to ``(start, stop)``.

		The work is done by ``rb_clamp``, which also decides how the two
		bounds themselves are treated."""
		clamped = rb_clamp(self, start, stop)
		return clamped
 301 | 
	def discard(self, uint32_t elem):
		"""Remove ``elem`` from the set when it is present.

		Absent elements are silently ignored. A block that becomes empty
		is removed from the set."""
		cdef int i = self._getindex(highbits(elem))
		if i < 0:
			return
		block_discard(&(self.data[i]), lowbits(elem))
		if self.data[i].cardinality == 0:
			self._removeatidx(i)
 311 | 
	def remove(self, uint32_t elem):
		"""Remove ``elem`` from the set, which must contain it.

		:raises KeyError: if the element is not a member."""
		cdef int i = self._getindex(highbits(elem))
		cdef uint32_t before
		if i < 0:
			raise KeyError(elem)
		before = self.data[i].cardinality
		block_discard(&(self.data[i]), lowbits(elem))
		# an unchanged cardinality means that nothing was removed
		if self.data[i].cardinality == before:
			raise KeyError(elem)
		if self.data[i].cardinality == 0:
			self._removeatidx(i)
 327 | 
	def pop(self):
		"""Remove the largest element from the set and return it.

		:raises ValueError: if the set is empty."""
		cdef uint32_t high, low
		cdef uint32_t last
		if self.size == 0:
			raise ValueError('pop from empty roaringbitmap')
		# the largest element lives in the block with the highest key
		last = self.size - 1
		high = self.keys[last]
		low = block_pop(&(self.data[last]))
		if self.data[last].cardinality == 0:
			self._removeatidx(last)
		return (high << 16) | low
 338 | 
	def clear(self):
		"""Remove all elements from this RoaringBitmap.

		All block buffers and the key/block arrays are released, after which
		fresh arrays with room for ``INITCAPACITY`` blocks are allocated.

		:raises MemoryError: if the new arrays cannot be allocated; the
			object is then left empty with no arrays and zero capacity."""
		cdef size_t n
		for n in range(self.size):
			aligned_free(self.data[n].buf.ptr)
		free(self.keys)
		free(self.data)
		# size and capacity must never describe memory that was released
		self.size = self.capacity = 0
		self.keys = <uint16_t *>malloc(INITCAPACITY * sizeof(uint16_t))
		self.data = <Block *>malloc(INITCAPACITY * sizeof(Block))
		if self.keys is NULL or self.data is NULL:
			# release whichever allocation did succeed, so that neither
			# pointer is leaked or left half-initialized
			free(self.keys)
			free(self.data)
			self.keys = NULL
			self.data = NULL
			raise MemoryError(INITCAPACITY)
		self.capacity = INITCAPACITY
 352 | 
	def __lshift__(self, other):
		# A left shift is a right shift by the negated amount.
		amount = -other
		return self.__rshift__(amount)
 355 | 
	def __rshift__(self, int other):
		# FIXME: replace with optimized implementation
		# Add ``other`` to every element; results that fall outside of
		# 0 <= x < 2 ** 32 are dropped.
		shifted = []
		for elem in self:
			if 0 <= elem + other < 1 << 32:
				shifted.append(elem + other)
		return RoaringBitmap(shifted)
 360 | 
 361 | 	# def __ilshift__(self, other):
 362 | 	# 	raise NotImplementedError
 363 | 
 364 | 	# def __irshift__(self, other):
 365 | 	# 	raise NotImplementedError
 366 | 
	def __invert__(self):
		"""Return a copy in which membership is flipped for every value
		from the smallest up to and including the largest element."""
		lowest = self.min()
		highest = self.max()
		return self.symmetric_difference(RANGE(lowest, highest + 1))
 371 | 
	def __iter__(self):
		# Yield all elements in increasing order, block by block. Each
		# element is the block key (high 16 bits) combined with a value
		# stored in the block (low 16 bits).
		cdef Block *block
		cdef Block b1
		cdef uint32_t high, i
		cdef uint64_t cur
		cdef int n, idx, low
		for i in range(self.size):
			block = self._getblk(i, &b1)
			high = (<uint32_t>(self.keys[i])) << 16
			if block.cardinality == BLOCKSIZE:
				# completely filled block: every low value is present
				for low in range(BLOCKSIZE):
					yield high | low
			elif block.state == DENSE:
				idx = 0
				cur = block.buf.dense[idx]
				n = iteratesetbits(block.buf.dense, &cur, &idx)
				while n != -1:
					yield high | n
					n = iteratesetbits(block.buf.dense, &cur, &idx)
			elif block.state == POSITIVE:
				for n in range(<int>block.cardinality):
					low = block.buf.sparse[n]
					yield high | low
			elif block.state == INVERTED:
				# The array holds the BLOCKSIZE - cardinality absent values;
				# yield everything before, between and after them.
				for low in range(block.buf.sparse[0]):
					yield high | low
				if block.cardinality < BLOCKSIZE - 1:
					for n in range(<int>BLOCKSIZE - block.cardinality - 1):
						for low in range(
								block.buf.sparse[n] + 1,
								block.buf.sparse[n + 1]):
							yield high | low
				# The values after the last absent one must be yielded
				# unconditionally (as in __reversed__); otherwise a block
				# with exactly one absent value loses everything above it.
				for low in range(block.buf.sparse[
						BLOCKSIZE - block.cardinality - 1] + 1, BLOCKSIZE):
					yield high | low
 407 | 
	def __reversed__(self):
		# Yield all elements in decreasing order: blocks are visited from
		# the last key to the first, and the values of each block from
		# high to low.
		cdef Block *block
		cdef Block b1
		cdef uint32_t high, i
		cdef uint64_t cur
		cdef int n, idx, low
		for i in range(self.size - 1, -1, -1):
			block = self._getblk(i, &b1)
			high = (<uint32_t>(self.keys[i])) << 16
			if block.cardinality == BLOCKSIZE:
				# completely filled block: every low value is present
				for low in reversed(range(BLOCKSIZE)):
					yield high | low
			elif block.state == POSITIVE:
				for n in reversed(range(block.cardinality)):
					low = block.buf.sparse[n]
					yield high | low
			elif block.state == DENSE:
				# start scanning at the last word of the bitvector
				idx = BITNSLOTS(BLOCKSIZE) - 1
				cur = block.buf.dense[idx]
				n = reviteratesetbits(block.buf.dense, &cur, &idx)
				while n != -1:
					low = n
					yield high | low
					n = reviteratesetbits(block.buf.dense, &cur, &idx)
			elif block.state == INVERTED:
				# The array holds the BLOCKSIZE - cardinality absent values.
				# First the values after the last absent one ...
				for low in reversed(range(block.buf.sparse[
							BLOCKSIZE - block.cardinality - 1] + 1, BLOCKSIZE)):
					yield high | low
				# ... then the gaps between consecutive absent values ...
				if block.cardinality < BLOCKSIZE - 1:
					for n in reversed(range(BLOCKSIZE - block.cardinality - 1)):
						for low in reversed(range(
								block.buf.sparse[n] + 1,
								block.buf.sparse[n + 1])):
							yield high | low
				# ... and finally the values before the first absent one.
				for low in reversed(range(block.buf.sparse[0])):
					yield high | low
 444 | 
	def __len__(self):
		# The number of elements is the sum of all block cardinalities.
		cdef size_t total = 0
		cdef size_t idx
		for idx in range(self.size):
			total += self.data[idx].cardinality
		return total
 450 | 
	def __sizeof__(self):
		"""Return memory usage in bytes (incl. overallocation).

		For each block this counts its key, its Block struct and its
		allocated capacity in 16-bit elements."""
		cdef uint32_t total = 0
		for idx in range(self.size):
			total += sizeof(uint16_t) + sizeof(Block)
			total += self.data[idx].capacity * sizeof(uint16_t)
		return total
 458 | 
	def numelem(self):
		"""Return total number of uint16_t elements stored.

		Each block contributes the value of ``getsize`` plus one."""
		cdef uint32_t total = 0
		for idx in range(self.size):
			total += getsize(&(self.data[idx])) + 1
		return total
 465 | 
	def __bool__(self):
		# True exactly when there is at least one block.
		return self.size != 0
 468 | 
	def __str__(self):
		# The elements in iteration order, comma separated, in braces.
		return '{%s}' % ', '.join(map(str, self))
 471 | 
	def __repr__(self):
		# The string form of the set wrapped in the class name.
		contents = str(self)
		return 'RoaringBitmap(%s)' % contents
 474 | 
	def debuginfo(self, verbose=False):
		"""Return a string describing the internal representation of this set.

		The string lists the number of keys, the block capacity and the
		``block_repr`` description of every block; ``verbose`` is handed on
		to ``block_repr``."""
		cdef Block b1
		blocks = []
		for n in range(self.size):
			blocks.append(
					block_repr(self.keys[n], self._getblk(n, &b1), verbose))
		return 'keys=%d, cap=%d, data={%s}' % (
				self.size, self.capacity, ', '.join(blocks))
 483 | 
	def _keys(self):
		"""Return the keys of all blocks as a Python list."""
		keys = []
		for n in range(self.size):
			keys.append(self.keys[n])
		return keys
 486 | 
	def __getstate__(self):
		"""Return a serialized representation (Python array) for pickling.

		Layout of the returned byte array:

		1. the number of blocks as ``uint32_t``;
		2. the keys (one ``uint16_t`` per block);
		3. the ``Block`` structs, in which ``capacity`` is replaced by the
		   value of ``getsize`` and the buffer pointer by the byte offset
		   of the block's buffer within this array;
		4. zero padding;
		5. the buffer of each block, each followed by zero padding.

		Padding always consists of 1 to 32 bytes, chosen such that the next
		offset is a multiple of 32."""
		cdef array.array state
		cdef Block *ob
		cdef uint32_t extra, alignment = 32
		cdef size_t n, size
		cdef size_t alloc  # total allocated bytes for pickle
		cdef size_t offset1 = sizeof(uint32_t)  # keys, data
		cdef size_t offset2  # buffers
		# compute total size to allocate
		# add padding to ensure bitmaps are 32-byte aligned
		alloc = offset1 + self.size * (sizeof(uint16_t) + sizeof(Block))
		alloc += alignment - alloc % alignment
		for n in range(self.size):
			alloc += getsize(&(self.data[n])) * sizeof(uint16_t)
			alloc += alignment - alloc % alignment
		# the array is not zero-initialized; every byte is written below
		state = array.clone(chararray, alloc, False)
		(<uint32_t *>state.data.as_chars)[0] = self.size
		size = self.size * sizeof(uint16_t)
		memcpy(&(state.data.as_chars[offset1]), self.keys, size)
		# from here on offset1 walks over the Block structs, while offset2
		# walks over the buffers stored after them
		offset1 += size
		offset2 = offset1 + self.size * sizeof(Block)
		# add zero padding bytes
		extra = alignment - offset2 % alignment
		memset(&(state.data.as_chars[offset2]), 0, extra)
		offset2 += extra
		for n in range(self.size):
			# copy block
			ob = (<Block *>&(state.data.as_chars[offset1]))
			ob[0] = self.data[n]
			ob.capacity = getsize(&(self.data[n]))
			# store the offset of the buffer in place of the pointer
			ob.buf.ptr = <void *>offset2
			offset1 += sizeof(Block)
			# copy buffer of block
			size = ob.capacity * sizeof(uint16_t)
			memcpy(&(state.data.as_chars[offset2]), self.data[n].buf.ptr, size)
			offset2 += size
			# add zero padding bytes
			extra = alignment - offset2 % alignment
			memset(&(state.data.as_chars[offset2]), 0, extra)
			offset2 += extra
		return state
 529 | 
	def __setstate__(self, array.array state):
		"""Initialize this object with a serialized representation.

		:param state: an array with the layout written by ``__getstate__``:
			the number of blocks, the keys, the Block structs (holding
			buffer offsets instead of pointers) and the block buffers.
		:raises ValueError: if ``state`` cannot even hold the block count.
		:raises MemoryError: if the key or block array cannot be allocated.

		Any previous contents of this object are discarded. Should an error
		occur halfway, the object only reports the blocks that were restored
		completely."""
		cdef char *buf = state.data.as_chars
		cdef void *tmp
		cdef Block *data
		cdef uint32_t nblocks, newcapacity
		cdef size_t n, size, offset = sizeof(uint32_t)
		if len(state) < <Py_ssize_t>sizeof(uint32_t):
			raise ValueError('serialized state is too short')
		self.clear()
		nblocks = (<uint32_t *>buf)[0]
		newcapacity = max(nblocks, INITCAPACITY)
		# Store each reallocated pointer right away: realloc may have moved
		# the memory, which would leave the old pointer dangling if the
		# second allocation fails.
		tmp = realloc(self.keys, newcapacity * sizeof(uint16_t))
		if tmp is NULL:
			raise MemoryError(nblocks)
		self.keys = <uint16_t *>tmp
		tmp = realloc(self.data, newcapacity * sizeof(Block))
		if tmp is NULL:
			raise MemoryError(nblocks)
		self.data = <Block *>tmp
		self.capacity = newcapacity
		memcpy(self.keys, &(buf[offset]), nblocks * sizeof(uint16_t))
		offset += nblocks * sizeof(uint16_t)
		data = <Block *>&(buf[offset])
		for n in range(nblocks):
			self.data[n] = data[n]
			# the serialized block stores the offset of its buffer
			offset = data[n].buf.offset
			if data[n].state == DENSE:
				self.data[n].buf.dense = allocdense()
				size = BITMAPSIZE
			else:
				self.data[n].buf.sparse = allocsparse(data[n].capacity)
				size = data[n].capacity * sizeof(uint16_t)
			memcpy(self.data[n].buf.ptr, &(buf[offset]), size)
			# Count the block only now that it owns a valid buffer, so that
			# __dealloc__ never frees a pointer that was not allocated.
			self.size = <uint32_t>(n + 1)
 559 | 
	def intersection(self, *other):
		"""Return the elements that this set and all given sets share.

		Without arguments ``self`` itself is returned; with one argument
		the result is ``self & other[0]``. With more arguments all operands
		are sorted by ``numelem`` and intersected from the smallest up,
		stopping early once the result is empty."""
		cdef RoaringBitmap result
		if not other:
			return self
		if len(other) == 1:
			return self & other[0]
		operands = [self] + [ensurerb(a) for a in other]
		operands.sort(key=RoaringBitmap.numelem)
		result = operands[0] & operands[1]
		for ob in operands[2:]:
			result &= ob
			if result.size == 0:
				break
		return result
 577 | 
	def union(self, *other):
		"""Return all elements that are in at least one of the sets.

		Without arguments ``self`` itself is returned; with one argument
		the result is ``self | other[0]``. With more arguments the two
		operands with the lowest ``numelem`` are merged repeatedly, using
		a heap, until a single set remains."""
		if not other:
			return self
		if len(other) == 1:
			return self | other[0]
		heap = [(ob.numelem(), ob) for ob in map(ensurerb, other)]
		heap.append((self.numelem(), self))
		heapq.heapify(heap)
		while len(heap) > 1:
			_, first = heapq.heappop(heap)
			_, second = heapq.heappop(heap)
			merged = first | second
			heapq.heappush(heap, (merged.numelem(), merged))
		return heap[0][1]
 596 | 
	def difference(self, *other):
		"""Return ``self - other[0] - other[1] - ...`` as a RoaringBitmap.

		Without arguments ``self`` itself is returned. The other sets are
		subtracted in order of decreasing ``numelem``, stopping early once
		the result is empty."""
		cdef RoaringBitmap result
		if not other:
			return self
		subtrahends = sorted(map(ensurerb, other),
				key=RoaringBitmap.numelem, reverse=True)
		result = self - subtrahends[0]
		for ob in subtrahends[1:]:
			result -= ob
			if result.size == 0:
				break
		return result
 612 | 
	def symmetric_difference(self, other):
		"""Return the elements that are in exactly one of the two sets.

		Equivalent to ``self ^ other``."""
		result = self ^ other
		return result
 618 | 
	def update(self, *other):
		"""In-place union update of this RoaringBitmap.

		A single argument is united with this set directly. Given several
		arguments, their union is computed first (always merging the two
		operands with the lowest ``numelem``) and then added to this set.

		NB: since range objects are recognized by the constructor, this
		provides an efficient way to set a range of bits:

		>>> rb = RoaringBitmap(range(5))
		>>> rb.update(range(3, 7))
		>>> rb
		RoaringBitmap({0, 1, 2, 3, 4, 5, 6})
		"""
		cdef RoaringBitmap ob1, ob2
		if not other:
			return
		if len(other) == 1:
			self |= other[0]
			return
		heap = [(ob1.numelem(), ob1) for ob1 in map(ensurerb, other)]
		heapq.heapify(heap)
		while len(heap) > 1:
			_, ob1 = heapq.heappop(heap)
			_, ob2 = heapq.heappop(heap)
			merged = ob1 | ob2
			heapq.heappush(heap, (merged.numelem(), merged))
		self |= heap[0][1]
 648 | 
	def intersection_update(self, *other):
		"""Intersect this set in-place with one or more other sets.

		Several arguments are applied in order of increasing ``numelem``,
		stopping early once this set is empty.

		NB: since range objects are recognized by the constructor, this
		provides an efficient way to restrict the set to a range of elements:

		>>> rb = RoaringBitmap(range(5))
		>>> rb.intersection_update(range(3, 7))
		>>> rb
		RoaringBitmap({3, 4})
		"""
		if not other:
			return
		if len(other) == 1:
			self &= other[0]
			return
		for ob in sorted(map(ensurerb, other), key=RoaringBitmap.numelem):
			self &= ob
			if self.size == 0:
				break
 671 | 
	def difference_update(self, *other):
		"""Remove the elements of all given sets from this one, in-place.

		The sets are subtracted in the given order; processing stops as
		soon as this set has become empty."""
		for subtrahend in other:
			self -= subtrahend
			if self.size == 0:
				return
 678 | 
	def symmetric_difference_update(self, other):
		"""Update set to symmetric difference of itself and another.

		Implemented with the in-place operator ``^=``."""
		self ^= other
 682 | 
	def flip_range(self, uint32_t start, uint32_t stop):
		"""Negate membership in-place for all values in range(start, stop)."""
		flipped = RANGE(start, stop)
		self.symmetric_difference_update(flipped)
 686 | 
	def intersection_len(self, other):
		"""Return the cardinality of the intersection.

		Optimized version of ``len(self & other)``: the keys of both sets
		are traversed in parallel and only blocks with equal keys are
		compared."""
		cdef RoaringBitmap ob1 = ensurerb(self)
		cdef RoaringBitmap ob2 = ensurerb(other)
		cdef Block b1, b2
		cdef uint32_t pos1 = 0, pos2 = 0
		cdef size_t result = 0
		while pos1 < ob1.size and pos2 < ob2.size:
			if ob1.keys[pos1] < ob2.keys[pos2]:
				pos1 += 1
			elif ob1.keys[pos1] > ob2.keys[pos2]:
				pos2 += 1
			else:  # same key in both sets
				result += block_andlen(
						ob1._getblk(pos1, &b1),
						ob2._getblk(pos2, &b2))
				pos1 += 1
				pos2 += 1
		return result
 715 | 
	def union_len(self, other):
		"""Return the cardinality of the union.

		Optimized version of ``len(self | other)``: blocks whose key occurs
		in only one of the sets contribute their cardinality; blocks with
		equal keys are combined with ``block_orlen``."""
		cdef RoaringBitmap ob1 = ensurerb(self)
		cdef RoaringBitmap ob2 = ensurerb(other)
		cdef Block b1, b2
		cdef uint32_t pos1 = 0, pos2 = 0
		cdef size_t result = 0
		while pos1 < ob1.size and pos2 < ob2.size:
			if ob1.keys[pos1] < ob2.keys[pos2]:
				result += ob1.data[pos1].cardinality
				pos1 += 1
			elif ob1.keys[pos1] > ob2.keys[pos2]:
				result += ob2.data[pos2].cardinality
				pos2 += 1
			else:  # same key in both sets
				result += block_orlen(
						ob1._getblk(pos1, &b1),
						ob2._getblk(pos2, &b2))
				pos1 += 1
				pos2 += 1
		# at most one of the sets has blocks left; count them as they are
		while pos1 < ob1.size:
			result += ob1.data[pos1].cardinality
			pos1 += 1
		while pos2 < ob2.size:
			result += ob2.data[pos2].cardinality
			pos2 += 1
		return result
 752 | 
	def jaccard_dist(self, other):
		"""Return the Jaccard distance between this set and ``other``.

		Optimized version of ``1 - len(self & other) / len(self | other)``.
		Counts of union and intersection are performed simultaneously."""
		cdef RoaringBitmap lhs = ensurerb(self)
		cdef RoaringBitmap rhs = ensurerb(other)
		return rb_jaccard_dist(lhs, rhs)
 761 | 
	def rank(self, uint32_t x):
		"""Return the number of elements ``<= x`` that are in this set."""
		cdef Block b1
		cdef size_t size = 0, n
		cdef uint16_t xhigh = highbits(x)
		for n in range(self.size):
			if self.keys[n] > xhigh:
				# this block and all later ones only hold larger elements
				break
			if self.keys[n] == xhigh:
				return size + block_rank(
						self._getblk(n, &b1),
						lowbits(x))
			# all elements of blocks with a lower key are smaller than x
			size += self.data[n].cardinality
		return size
 777 | 
	def select(self, int i):
		"""Return the ith element that is in this set.

		:param i: a 0-based index.
		:raises IndexError: if ``i`` is negative or not smaller than the
			number of elements."""
		cdef Block b1
		cdef uint32_t leftover = <uint32_t>i
		cdef uint32_t n, keycontrib, lowcontrib
		if i < 0:
			raise IndexError('select: index %d out of range 0..%d.' % (
					i, len(self)))
		for n in range(self.size):
			if self.data[n].cardinality > leftover:
				# Widen the key before shifting: a uint16_t is promoted to
				# a signed int, and shifting keys >= 0x8000 by 16 bits
				# would overflow it.
				keycontrib = (<uint32_t>self.keys[n]) << 16
				lowcontrib = block_select(
						self._getblk(n, &b1),
						leftover)
				return keycontrib | lowcontrib
			# skip this block; the index is now relative to the next block
			leftover -= self.data[n].cardinality
		raise IndexError('select: index %d out of range 0..%d.' % (
				i, len(self)))
 798 | 
	def index(self, uint32_t x):
		"""Return the 0-based index of `x` in this set.

		Equivalent to ``sorted(self).index(x)``.

		:raises IndexError: if ``x`` is not a member."""
		if x not in self:
			raise IndexError
		return self.rank(x) - 1
 806 | 
	def _ridx(self, i):
		"""Translate a negative index into one counted from the start.

		Non-negative indices are returned unchanged."""
		return len(self) + i if i < 0 else i
 811 | 
	def _slice(self, i):
		"""Return the range of values for a given range of indices ``i``.

		:param i: a slice object with a step of ``None`` or 1.
		:returns: a range running from the element with rank ``start`` up to
			and including the element with rank ``stop - 1``.

		The bounds are normalized with ``slice.indices``, i.e. clamped to
		the number of elements, exactly as is done for slices with a larger
		step. An empty slice therefore gives an empty range instead of an
		IndexError."""
		start, stop, _ = i.indices(len(self))
		if start >= stop:
			return RANGE(0)
		return RANGE(
				self.select(start), self.select(stop - 1) + 1)
 818 | 
	def __getitem__(self, i):
		"""Return the element with rank `i`, or the elements of a slice.

		A slice results in a new roaringbitmap. Steps that are zero or
		negative raise a ValueError; an index that is neither an integer
		nor a slice raises a TypeError."""
		if isinstance(i, slice):
			if i.step is None or i.step == 1:
				return self.intersection(self._slice(i))
			if i.step <= 0:
				raise ValueError
			# i.step > 1  FIXME we could do better
			start, stop, step = i.indices(len(self))
			picked = [self[x] for x in RANGE(start, stop, step)]
			return RoaringBitmap(picked)
		if isinstance(i, (int, long)):
			return self.select(self._ridx(i))
		raise TypeError('Expected integer index or slice object.')
 836 | 
	def __delitem__(self, i):
		"""Discard the element with rank `i`, or the elements of a slice.

		Steps that are zero or negative raise a ValueError; an index that
		is neither an integer nor a slice raises a TypeError."""
		if isinstance(i, slice):
			if i.step is None or i.step == 1:
				self.difference_update(self._slice(i))
				return
			if i.step <= 0:
				raise ValueError
			# i.step > 1  FIXME we could do better
			start, stop, step = i.indices(len(self))
			doomed = [self[x] for x in RANGE(start, stop, step)]
			self.difference_update(RoaringBitmap(doomed))
			return
		if isinstance(i, (int, long)):
			self.discard(self.select(self._ridx(i)))
			return
		raise TypeError('Expected integer index or slice object.')
 852 | 
	def _initrange(self, uint32_t start, uint32_t stop, uint32_t step):
		"""Fill this (empty) set with the elements of range(start, stop, step).

		The constructor only calls this with ``start < stop`` and
		``step >= 1``. Steps of at least ``1 << 16`` yield at most one
		element per block, so the elements are simply added one by one;
		smaller steps are handled a whole block at a time."""
		cdef Block *block = NULL
		cdef uint32_t key, blockstart, blockstop, gap
		# The running position needs 64 bits: advancing past the last
		# element may reach 2 ** 32, which would wrap around to 0 in a
		# uint32_t and prevent the loop from terminating.
		cdef uint64_t tmp = start
		cdef uint64_t n
		if step >= (1 << 16):
			n = start
			while n < stop:
				self.add(n)
				n += step
			return
		while True:
			# tmp < stop holds here, so it fits in 32 bits
			key = highbits(<uint32_t>tmp)
			blockstart = lowbits(<uint32_t>tmp)
			# end of the range within this block
			blockstop = min(stop - (key << 16), 1 << 16)
			block = self._insertempty(self.size, key)
			block_initrange(block, blockstart, blockstop, step)
			# advance to the first multiple of step beyond this block
			gap = blockstop - blockstart + step - 1
			tmp += gap - (gap % step)
			if tmp >= stop:
				break
 874 | 
	def _init2pass(self, iterable):
		"""Initialize from an iterable that can be traversed twice.

		The first pass creates the blocks and counts the elements destined
		for each of them, so that every block can be allocated with a
		fitting size before the second pass adds the elements."""
		cdef Block *block = NULL
		cdef uint32_t elem
		cdef uint16_t key
		cdef int i, prev = -1
		# gather keys and count elements for each block
		for elem in iterable:
			key = highbits(elem)
			# only look up the block when the key differs from the previous
			# element's key
			if key != prev:
				i = self._getindex(key)
				if i < 0:
					block = self._insertempty(-i - 1, key)
					block.cardinality = block.capacity = 0
				else:
					block = &(self.data[i])
				prev = key
			# capacity temporarily serves as the element counter
			block.capacity += 1  # NB: wraps to 0 for block with all elements set
		# allocate blocks
		for i in range(<int>self.size):
			block = &(self.data[i])
			if 0 < block.capacity < MAXARRAYLENGTH:
				block.buf.sparse = allocsparse(block.capacity)
				block.state = POSITIVE
			else:  # if necessary, will convert to inverted later
				# also reached when the counter wrapped around to 0
				block.capacity = BITMAPSIZE // sizeof(uint16_t)
				block.buf.dense = allocdense()
				memset(block.buf.dense, 0, BITMAPSIZE)
				block.state = DENSE
		# second pass, add elements for each block
		prev = -1
		for elem in iterable:
			key = highbits(elem)
			if key != prev:
				i = self._getindex(key)
				# the previous block gets no more elements for now
				if prev != -1:
					block_convert(block)
				block = &(self.data[i])
				prev = key
			block_add(block, lowbits(elem))
		# convert the last block that was touched
		if prev != -1:
			block_convert(block)
 916 | 
	def _inititerator(self, iterable):
		"""Initialize from an iterable that is consumed only once.

		All elements are first grouped by their high bits in a dictionary
		of Python sets. After that one block is created per key, in
		increasing key order, from the sorted low bits."""
		cdef Block *block = NULL
		cdef uint32_t elem
		cdef uint16_t key
		cdef int n
		cdef dict tmp = {}
		cdef list values
		for elem in iterable:
			key = highbits(elem)
			if key in tmp:
				tmp[key].add(lowbits(elem))
			else:
				tmp[key] = {lowbits(elem)}
		for key in sorted(tmp):
			values = sorted(tmp[key])
			block = self._insertempty(self.size, key)
			block.cardinality = len(values)
			if block.cardinality == BLOCKSIZE:
				# completely filled block
				block_initrange(block, 0, BLOCKSIZE, 1)
			elif block.cardinality < MAXARRAYLENGTH:
				# few elements: store them as a sorted array
				block.capacity = block.cardinality
				block.buf.sparse = allocsparse(block.capacity)
				block.state = POSITIVE
				for n in range(<int>block.cardinality):
					block.buf.sparse[n] = values[n]
			else:
				# many elements: set their bits in a zeroed bitvector
				block.capacity = BITMAPSIZE // sizeof(uint16_t)
				block.buf.dense = allocdense()
				memset(block.buf.dense, 0, BITMAPSIZE)
				block.state = DENSE
				for elem in values:
					SETBIT(block.buf.dense, elem)
			block_convert(block)
 949 | 
 950 | 	# def _inititerator(self, iterable):
 951 | 	# 	cdef Block *block = NULL
 952 | 	# 	cdef uint32_t elem
 953 | 	# 	cdef uint16_t key
 954 | 	# 	cdef int i, prev = -1
 955 | 	# 	for elem in iterable:
 956 | 	# 		key = highbits(elem)
 957 | 	# 		if key != prev:
 958 | 	# 			i = self._getindex(key)
 959 | 	# 			if i >= 0:
 960 | 	# 				block = &(self.data[i])
 961 | 	# 			else:
 962 | 	# 				block = self._insertempty(-i - 1, key)
 963 | 	# 				block.state = POSITIVE
 964 | 	# 				block.cardinality = 0
 965 | 	# 				block.buf.sparse = allocsparse(INITCAPACITY)
 966 | 	# 				block.capacity = INITCAPACITY
 967 | 	# 			prev = key
 968 | 	# 		block_add(block, lowbits(elem))
 969 | 	# 		block_convert(block)
 970 | 
	cdef _initarray(self, int k):
		"""Allocate k elements and initialize pointers to zero.

		Room is obtained through ``_extendarray``; afterwards the whole
		``data`` array (``self.capacity`` entries) is zero-filled, including
		any entries that were already in use."""
		self._extendarray(k)
		memset(self.data, 0, self.capacity * sizeof(Block))
 975 | 
	cdef _extendarray(self, int k):
		"""Extend allocation with k extra elements + amortization.

		Does nothing while ``self.size + k`` is below the current capacity.
		Otherwise both arrays are reallocated to twice the desired size
		(when fewer than 1024 blocks are in use) or 5/4 of it.

		:raises MemoryError: if either reallocation fails; the arrays that
			are already held stay valid and ``self.capacity`` is unchanged."""
		cdef size_t desired = self.size + k
		cdef size_t newcapacity
		cdef void *tmp
		if desired < self.capacity:
			return
		newcapacity = 2 * desired if self.size < 1024 else 5 * desired // 4
		# Store each pointer as soon as its realloc succeeds: realloc may move
		# the allocation, so keeping the old pointer after a successful call
		# (because the *other* call failed) would leave it dangling.
		tmp = realloc(self.keys, newcapacity * sizeof(uint16_t))
		if tmp is NULL:
			raise MemoryError(newcapacity)
		self.keys = <uint16_t *>tmp
		tmp = realloc(self.data, newcapacity * sizeof(Block))
		if tmp is NULL:
			# keys is now larger than recorded by self.capacity; harmless
			raise MemoryError(newcapacity)
		self.data = <Block *>tmp
		self.capacity = newcapacity
 992 | 
	cdef _resize(self, int k):
		"""Set size and if necessary reduce array allocation to k elements.

		The arrays are only shrunk when ``k`` exceeds ``INITCAPACITY`` and is
		less than half of the current capacity.

		:raises MemoryError: if a reallocation fails; the arrays that are
			already held stay valid and ``self.size`` is not modified."""
		cdef void *tmp
		if k > INITCAPACITY and k * 2 < <int>self.capacity:
			# Store each pointer right after its realloc succeeds, so that a
			# failure of the second call cannot leave a stale pointer behind.
			tmp = realloc(self.keys, k * sizeof(uint16_t))
			if tmp is NULL:
				raise MemoryError((k, self.size, self.capacity))
			self.keys = <uint16_t *>tmp
			tmp = realloc(self.data, k * sizeof(Block))
			if tmp is NULL:
				raise MemoryError((k, self.size, self.capacity))
			self.data = <Block *>tmp
			self.capacity = k
		self.size = k
1006 | 
1007 | 	cdef _tmpalloc(self, int size, uint16_t **keys, Block **data):
1008 | 		keys[0] = <uint16_t *>malloc(size * sizeof(uint16_t))
1009 | 		data[0] = <Block *>calloc(size, sizeof(Block))
1010 | 		if keys[0] is NULL or data[0] is NULL:
1011 | 			raise MemoryError(size)
1012 | 
	cdef _replacearrays(self, uint16_t *keys, Block *data, int size):
		"""Free the current key and block arrays and adopt the given ones.

		Only the two top-level arrays are freed here, not the buffers of the
		individual blocks. ``size`` becomes the new number of blocks.

		NOTE(review): ``self.capacity`` is only updated if ``_resize``
		decides to reallocate; otherwise it keeps the value recorded for the
		arrays that were just freed -- confirm callers allocate ``keys`` and
		``data`` with at least that many entries."""
		free(self.keys)
		free(self.data)
		self.keys = keys
		self.data = data
		self.size = size
		self._resize(self.size)  # truncate
1020 | 
	cdef _removeatidx(self, int i):
		"""Remove the i'th element.

		Frees the buffer of block ``i``, moves all later keys and blocks one
		position down, and decrements ``self.size``. The array allocation
		itself is not reduced here."""
		aligned_free(self.data[i].buf.ptr)
		# close the gap: shift everything after position i one slot down
		memmove(&(self.keys[i]), &(self.keys[i + 1]),
				(self.size - i - 1) * sizeof(uint16_t))
		memmove(&(self.data[i]), &(self.data[i + 1]),
				(self.size - i - 1) * sizeof(Block))
		self.size -= 1
1029 | 
	cdef Block *_insertempty(self, int i, uint16_t key):
		"""Insert a new, uninitialized block.

		Makes room at position ``i`` (growing the arrays if needed), stores
		``key`` there and increments ``self.size``. Of the new block only
		the buffer pointer is set (to NULL); all other fields must be
		initialized by the caller.

		:returns: pointer to the new block in ``self.data``."""
		self._extendarray(1)
		if i < <int>self.size:
			# open a gap: shift entries from position i onward one slot up
			memmove(&(self.keys[i + 1]), &(self.keys[i]),
					(self.size - i) * sizeof(uint16_t))
			memmove(&(self.data[i + 1]), &(self.data[i]),
					(self.size - i) * sizeof(Block))
		self.size += 1
		self.keys[i] = key
		self.data[i].buf.ptr = NULL
		return &(self.data[i])
1042 | 
	cdef _insertcopy(self, int i, uint16_t key, Block *block):
		"""Insert a copy of given block.

		Makes room at position ``i``, copies the fields of ``block`` into the
		new slot, gives it a buffer of its own and copies ``getsize(block)``
		16-bit words of content into that buffer. ``self.size`` is
		incremented last."""
		cdef size_t size
		self._extendarray(1)
		if i < <int>self.size:
			# open a gap: shift entries from position i onward one slot up
			memmove(&(self.keys[i + 1]), &(self.keys[i]),
					(self.size - i) * sizeof(uint16_t))
			memmove(&(self.data[i + 1]), &(self.data[i]),
					(self.size - i) * sizeof(Block))
		size = getsize(block)
		self.keys[i] = key
		# struct copy; the buffer pointer still refers to the source buffer
		# until it is replaced below
		self.data[i] = block[0]
		if self.data[i].state == DENSE:
			self.data[i].buf.dense = allocdense()
		elif self.data[i].state in (POSITIVE, INVERTED):
			# the copy gets a buffer sized to its content, so its capacity
			# may differ from that of the source block
			self.data[i].buf.sparse = allocsparse(size)
			self.data[i].capacity = size
		memcpy(self.data[i].buf.ptr, block.buf.ptr, size * sizeof(uint16_t))
		self.size += 1
1062 | 
1063 | 	cdef int _getindex(self, uint16_t key):
1064 | 		if self.size == 0:
1065 | 			return -1
1066 | 		# Common case of appending in last block:
1067 | 		if self.keys[self.size - 1] == key:
1068 | 			return self.size - 1
1069 | 		return self._binarysearch(0, self.size, key)
1070 | 
1071 | 	cdef int _binarysearch(self, int begin, int end, uint16_t key):
1072 | 		"""Binary search for key.
1073 | 
1074 | 		:returns: positive index ``i`` if ``key`` is found;
1075 | 			negative value ``i`` if ``elem`` is not found,
1076 | 			but would fit at ``-i - 1``."""
1077 | 		cdef int low = begin, high = end - 1
1078 | 		cdef int middleidx, middleval
1079 | 		while low <= high:
1080 | 			middleidx = (low + high) >> 1
1081 | 			middleval = self.keys[middleidx]
1082 | 			if middleval < key:
1083 | 				low = middleidx + 1
1084 | 			elif middleval > key:
1085 | 				high = middleidx - 1
1086 | 			else:
1087 | 				return middleidx
1088 | 		return -(low + 1)
1089 | 
1090 | 	def _checkconsistency(self):
1091 | 		"""Verify that arrays are sorted and free of duplicates."""
1092 | 		cdef Block b1
1093 | 		cdef Block *b2
1094 | 		cdef size_t n, m
1095 | 		for n in range(self.size):
1096 | 			assert self.data[n].state in (DENSE, POSITIVE, INVERTED)
1097 | 			assert 1 <= self.data[n].cardinality < 1 << 16
1098 | 			assert getsize(&(self.data[n])) <= self.data[n].capacity
1099 | 			if self.data[n].state == POSITIVE:
1100 | 				assert 1 <= self.data[n].cardinality < MAXARRAYLENGTH
1101 | 			elif self.data[n].state == DENSE:
1102 | 				assert (MAXARRAYLENGTH <= self.data[n].cardinality
1103 | 						<= BLOCKSIZE - MAXARRAYLENGTH)
1104 | 			elif self.data[n].state == INVERTED:
1105 | 				assert (BLOCKSIZE - MAXARRAYLENGTH < self.data[n].cardinality
1106 | 						< BLOCKSIZE)
1107 | 			if n + 1 < self.size:
1108 | 				assert self.keys[n] < self.keys[n + 1], (
1109 | 						n, self.keys[n], self.keys[n + 1])
1110 | 			if self.data[n].state != DENSE:
1111 | 				for m in range(getsize(&(self.data[n])) - 1):
1112 | 					b2 = self._getblk(n, &b1)
1113 | 					assert b2.buf.sparse[m] < b2.buf.sparse[m + 1], (
1114 | 							m, b2.buf.sparse[m], b2.buf.sparse[m + 1])
1115 | 
	cdef inline Block *_getblk(self, int i, Block *tmp) noexcept nogil:
		"""Get pointer to block `i`. If there is an offset, copy this block
		to ``tmp`` and add offset to its pointer, otherwise return block itself.

		An index outside ``0 <= i < self.size`` prints a message and aborts
		the process.
		"""
		# a bit unelegant, but this makes it possible to use the same code
		# for mutable & immutable variants.
		if not 0 <= i < <int>self.size:
			# NOTE(review): '%d' assumes self.size is passed as an int -- confirm
			printf('illegal index %d; size=%d\n', i, self.size)
			abort()
		if self.offset:
			# the stored block holds a relative offset instead of a pointer;
			# resolve it in the caller-provided copy, not in the stored block
			tmp[0] = self.data[i]
			tmp.buf.ptr = <void *>(tmp.buf.offset + self.offset)
			return tmp
		return &(self.data[i])
1130 | 
1131 | 
cdef inline RoaringBitmap ensurerb(obj):
	"""Return ``obj`` itself when it already is a RoaringBitmap; otherwise
	build a new RoaringBitmap from it."""
	if not isinstance(obj, RoaringBitmap):
		return RoaringBitmap(obj)
	return obj
1137 | 
1138 | 
cdef inline uint16_t highbits(uint32_t x) noexcept nogil:
	"""Return the upper 16 bits of ``x`` (used as the key of its block)."""
	return x >> 16
1141 | 
1142 | 
cdef inline uint16_t lowbits(uint32_t x) noexcept nogil:
	"""Return the lower 16 bits of ``x`` (its value within a block)."""
	return x & 0xFFFF
1145 | 
1146 | 
cdef inline uint32_t min(uint32_t a, uint32_t b) noexcept nogil:
	"""Return the smaller of two unsigned 32-bit integers.

	NB: within this module this name takes the place of the builtin ``min``."""
	return a if a <= b else b
1149 | 
1150 | 
cdef inline uint32_t max(uint32_t a, uint32_t b) noexcept nogil:
	"""Return the larger of two unsigned 32-bit integers.

	NB: within this module this name takes the place of the builtin ``max``."""
	return a if a >= b else b
1153 | 
1154 | 
cdef inline int getbufptr(
		object obj, char ** ptr, Py_ssize_t * size, Py_buffer * buf):
	"""Get a pointer from bytes/buffer object ``obj``.

	On success, return 0, and set ``ptr``, ``size``, and possibly ``buf``.
	Otherwise a nonzero value is returned (-1 when ``obj`` does not support
	the buffer interface that is tried) and ``ptr``/``size`` are left at
	NULL/0. ``buf`` is only filled in on the new-style path (i.e., not on
	Python 2), in which case the caller should release it again; see
	``releasebuf``."""
	cdef int result = -1
	# defaults reported to the caller when no buffer could be obtained
	ptr[0] = NULL
	size[0] = 0
	if PY2:
		# Although the new-style buffer interface was backported to Python 2.6,
		# some modules, notably mmap, only support the old buffer interface.
		# Cf. http://bugs.python.org/issue9229
		if PyObject_CheckReadBuffer(obj) == 1:
			result = PyObject_AsReadBuffer(
					obj, <const void **>ptr, size)
	elif PyObject_CheckBuffer(obj) == 1:  # new-style Buffer interface
		result = PyObject_GetBuffer(obj, buf, PyBUF_SIMPLE)
		if result == 0:
			ptr[0] = <char *>buf.buf
			size[0] = buf.len
	return result
1176 | 
1177 | 
cdef inline void releasebuf(Py_buffer *buf):
	"""Release buffer if necessary.

	On Python 2 this is a no-op, because ``getbufptr`` does not fill in a
	``Py_buffer`` there."""
	if not PY2:
		PyBuffer_Release(buf)
1182 | 
1183 | 
def bitcounttests():
	"""Self-test for the bit counting helpers; called from the unit tests.

	:returns: True if all checks pass.
	:raises AssertionError: if a helper returns an unexpected value."""
	# count trailing zeros
	assert bit_ctz(2) == 1
	assert bit_ctz(3) == 0
	assert bit_ctz(0x80000000) == 31
	assert bit_ctz(0x1000) == 12
	assert bit_ctz(UINT64_MAX) == 0
	# count leading zeros; expected values are relative to the word width
	assert bit_clz(1) == BITCOUNT_BITS - 1
	assert bit_clz(4) == BITCOUNT_BITS - 3
	assert bit_clz(0x80000000) == BITCOUNT_BITS - 32
	assert bit_clz(0x1000) == BITCOUNT_BITS - 13
	assert bit_clz(UINT64_MAX) == 0
	# population count (number of set bits)
	assert bit_popcount(0x1) == 1
	assert bit_popcount(0x10) == 1
	assert bit_popcount(0x101001) == 3
	assert bit_popcount(3) == 2
	assert bit_popcount(UINT64_MAX) == BITCOUNT_BITS
	assert bit_popcount(0) == 0
	return True
1202 | 
1203 | 
def aligned_malloc_tests():
	"""Self-test: allocate 1024 bytes with ``aligned_malloc``, write to the
	start of the allocation, and free it again.

	:returns: True if the allocation succeeded.
	:raises AssertionError: if ``aligned_malloc`` returned NULL."""
	cdef void *ptr = NULL
	ptr = aligned_malloc(1024, sizeof(void *))
	assert ptr is not NULL
	# make sure the memory can actually be written to
	(<uint64_t *>ptr)[0] = 1234
	aligned_free(ptr)
	return True
1211 | 
1212 | 
def mmaptests():
	"""Self-test: obtain a raw pointer to an anonymous writable mmap through
	``getbufptr`` and write to it.

	The buffer is released and the mmap is closed before returning, also
	when an error occurs.

	:returns: True on success.
	:raises ValueError: if no buffer could be obtained from the mmap."""
	cdef Py_buffer buffer
	cdef Py_ssize_t size = 0
	cdef char *ptr = NULL
	cdef uint32_t *uptr
	cdef int result

	# size of a small header, padded to the alignment, plus 1 KiB
	alignment = 32
	alloc = sizeof(uint32_t) + 8 * sizeof(uint32_t)
	extra = alignment - alloc % alignment
	alloc += extra + 1024

	ob = mmap.mmap(-1, alloc, access=mmap.ACCESS_WRITE)
	try:
		result = getbufptr(ob, &ptr, &size, &buffer)
		if result != 0:
			raise ValueError('could not get buffer from mmap.')
		try:
			uptr = <uint32_t *>ptr
			uptr[0] = 1234
		finally:
			# the buffer must be released before the mmap can be closed
			releasebuf(&buffer)
	finally:
		ob.close()
	return True
1232 | 
1233 | 
# Names exported by ``from roaringbitmap import *``; the self-test helpers
# (bitcounttests etc.) are not listed and must be imported by name.
__all__ = ['RoaringBitmap', 'ImmutableRoaringBitmap', 'MultiRoaringBitmap']
1235 | 


--------------------------------------------------------------------------------
/tests/benchmarks.py:
--------------------------------------------------------------------------------
  1 | """Benchmarks for roaringbitmap"""
  2 | from __future__ import division, print_function, absolute_import, \
  3 | 		unicode_literals
  4 | import random
  5 | import timeit
  6 | 
# NB: N and MAX are only defaults; main() overrides them for each scenario.
N = 1 << 17  # number of random elements
M = 100  # number of test runs
MAX = 1 << 20  # range of elements
# Benchmark inputs; assigned by main() and imported from __main__ by the
# timeit setup strings below.
DATA1, DATA2 = None, None
 11 | 
 12 | 
 13 | def pair():
 14 | 	random.seed(42)
 15 | 	data1 = [random.randint(0, MAX) for _ in range(N)]
 16 | 	data2 = data1[:len(data1) // 2]
 17 | 	data2.extend(random.randint(0, MAX) for _ in range(N // 2))
 18 | 	return data1, data2
 19 | 
 20 | 
 21 | def bench_init():
 22 | 	a = timeit.Timer('set(DATA1)',
 23 | 			setup='from __main__ import DATA1').timeit(number=M)
 24 | 	b = timeit.Timer('rb = RoaringBitmap(DATA1)',
 25 | 			setup='from __main__ import DATA1; '
 26 | 				'from roaringbitmap import RoaringBitmap; '
 27 | 				).timeit(number=M)
 28 | 	return a, b
 29 | 
 30 | 
 31 | def bench_initsort():
 32 | 	a = timeit.Timer('set(data)',
 33 | 			setup='from __main__ import DATA1; '
 34 | 				'data = sorted(DATA1)').timeit(number=M)
 35 | 	b = timeit.Timer('rb = RoaringBitmap(data)',
 36 | 			setup='from __main__ import DATA1; '
 37 | 				'from roaringbitmap import RoaringBitmap; '
 38 | 				'data = sorted(DATA1)'
 39 | 				).timeit(number=M)
 40 | 	return a, b
 41 | 
 42 | 
 43 | def bench_eq():
 44 | 	# benchmark equality with equal operands
 45 | 	a = timeit.Timer('ref == ref2',
 46 | 			setup='from __main__ import DATA1; '
 47 | 				'ref = set(DATA1); ref2 = set(DATA1)').timeit(number=M)
 48 | 	b = timeit.Timer('rb == rb2',
 49 | 			setup='from __main__ import DATA1; '
 50 | 				'from roaringbitmap import RoaringBitmap; '
 51 | 				'rb = RoaringBitmap(DATA1); '
 52 | 				'rb2 = RoaringBitmap(DATA1)').timeit(number=M)
 53 | 	return a, b
 54 | 
 55 | 
 56 | def bench_neq():
 57 | 	# benchmark non-equality with non-equal operands
 58 | 	a = timeit.Timer('ref != ref2',
 59 | 			setup='from __main__ import DATA1, DATA2; '
 60 | 				'ref = set(DATA1); ref2 = set(DATA2)').timeit(number=M)
 61 | 	b = timeit.Timer('rb != rb2',
 62 | 			setup='from __main__ import DATA1, DATA2; '
 63 | 				'from roaringbitmap import RoaringBitmap; '
 64 | 				'rb = RoaringBitmap(DATA1); '
 65 | 				'rb2 = RoaringBitmap(DATA2)').timeit(number=M)
 66 | 	return a, b
 67 | 
 68 | 
 69 | def bench_and():
 70 | 	a = timeit.Timer('ref & ref2',
 71 | 			setup='from __main__ import DATA1, DATA2; '
 72 | 				'ref = set(DATA1); ref2 = set(DATA2)').timeit(number=M)
 73 | 	b = timeit.Timer('rb & rb2',
 74 | 			setup='from __main__ import DATA1, DATA2; '
 75 | 				'from roaringbitmap import RoaringBitmap; '
 76 | 				'rb = RoaringBitmap(DATA1); '
 77 | 				'rb2 = RoaringBitmap(DATA2)').timeit(number=M)
 78 | 	return a, b
 79 | 
 80 | 
 81 | def bench_or():
 82 | 	a = timeit.Timer('ref | ref2',
 83 | 			setup='from __main__ import DATA1, DATA2; '
 84 | 				'ref = set(DATA1); ref2 = set(DATA2)').timeit(number=M)
 85 | 	b = timeit.Timer('rb | rb2',
 86 | 			setup='from __main__ import DATA1, DATA2; '
 87 | 				'from roaringbitmap import RoaringBitmap; '
 88 | 				'rb = RoaringBitmap(DATA1); '
 89 | 				'rb2 = RoaringBitmap(DATA2)').timeit(number=M)
 90 | 	return a, b
 91 | 
 92 | 
 93 | def bench_xor():
 94 | 	a = timeit.Timer('ref ^ ref2',
 95 | 			setup='from __main__ import DATA1, DATA2; '
 96 | 				'ref = set(DATA1); ref2 = set(DATA2)').timeit(number=M)
 97 | 	b = timeit.Timer('rb ^ rb2',
 98 | 			setup='from __main__ import DATA1, DATA2; '
 99 | 				'from roaringbitmap import RoaringBitmap; '
100 | 				'rb = RoaringBitmap(DATA1); '
101 | 				'rb2 = RoaringBitmap(DATA2)').timeit(number=M)
102 | 	return a, b
103 | 
104 | 
def bench_sub():
	"""Time difference with the ``-`` operator.

	:returns: tuple (seconds for set, seconds for RoaringBitmap), each the
		total over M runs."""
	set_setup = ('from __main__ import DATA1, DATA2; '
			'ref = set(DATA1); ref2 = set(DATA2)')
	rb_setup = ('from __main__ import DATA1, DATA2; '
			'from roaringbitmap import RoaringBitmap; '
			'rb = RoaringBitmap(DATA1); '
			'rb2 = RoaringBitmap(DATA2)')
	set_time = timeit.Timer('ref - ref2', setup=set_setup).timeit(number=M)
	rb_time = timeit.Timer('rb - rb2', setup=rb_setup).timeit(number=M)
	return set_time, rb_time
115 | 
116 | 
def bench_iand():
	"""Time in-place intersection (``&=``).

	The statement modifies its left operand, so every measurement is a
	single execution preceded by a fresh setup.

	:returns: tuple (mean seconds for set, mean seconds for RoaringBitmap)
		over M measurements."""
	set_timer = timeit.Timer('ref &= ref2',
			setup='from __main__ import DATA1, DATA2; '
				'ref = set(DATA1); ref2 = set(DATA2)')
	set_times = [set_timer.timeit(number=1) for _ in range(M)]
	rb_timer = timeit.Timer('rb &= rb2',
			setup='from __main__ import DATA1, DATA2; '
				'from roaringbitmap import RoaringBitmap; '
				'rb = RoaringBitmap(DATA1); '
				'rb2 = RoaringBitmap(DATA2)')
	rb_times = [rb_timer.timeit(number=1) for _ in range(M)]
	return sum(set_times) / M, sum(rb_times) / M
129 | 
130 | 
def bench_ior():
	"""Time in-place union (``|=``).

	The statement modifies its left operand, so every measurement is a
	single execution preceded by a fresh setup.

	:returns: tuple (mean seconds for set, mean seconds for RoaringBitmap)
		over M measurements."""
	set_timer = timeit.Timer('ref |= ref2',
			setup='from __main__ import DATA1, DATA2; '
				'ref = set(DATA1); ref2 = set(DATA2)')
	set_times = [set_timer.timeit(number=1) for _ in range(M)]
	rb_timer = timeit.Timer('rb |= rb2',
			setup='from __main__ import DATA1, DATA2; '
				'from roaringbitmap import RoaringBitmap; '
				'rb = RoaringBitmap(DATA1); '
				'rb2 = RoaringBitmap(DATA2)')
	rb_times = [rb_timer.timeit(number=1) for _ in range(M)]
	return sum(set_times) / M, sum(rb_times) / M
143 | 
144 | 
def bench_ixor():
	"""Time in-place symmetric difference (``^=``).

	The statement modifies its left operand, so every measurement is a
	single execution preceded by a fresh setup.

	:returns: tuple (mean seconds for set, mean seconds for RoaringBitmap)
		over M measurements."""
	set_timer = timeit.Timer('ref ^= ref2',
			setup='from __main__ import DATA1, DATA2; '
				'ref = set(DATA1); ref2 = set(DATA2)')
	set_times = [set_timer.timeit(number=1) for _ in range(M)]
	rb_timer = timeit.Timer('rb ^= rb2',
			setup='from __main__ import DATA1, DATA2; '
				'from roaringbitmap import RoaringBitmap; '
				'rb = RoaringBitmap(DATA1); '
				'rb2 = RoaringBitmap(DATA2)')
	rb_times = [rb_timer.timeit(number=1) for _ in range(M)]
	return sum(set_times) / M, sum(rb_times) / M
157 | 
158 | 
def bench_isub():
	"""Time in-place difference (``-=``).

	The statement modifies its left operand, so every measurement is a
	single execution preceded by a fresh setup.

	:returns: tuple (mean seconds for set, mean seconds for RoaringBitmap)
		over M measurements."""
	set_timer = timeit.Timer('ref -= ref2',
			setup='from __main__ import DATA1, DATA2; '
				'ref = set(DATA1); ref2 = set(DATA2)')
	set_times = [set_timer.timeit(number=1) for _ in range(M)]
	rb_timer = timeit.Timer('rb -= rb2',
			setup='from __main__ import DATA1, DATA2; '
				'from roaringbitmap import RoaringBitmap; '
				'rb = RoaringBitmap(DATA1); '
				'rb2 = RoaringBitmap(DATA2)')
	rb_times = [rb_timer.timeit(number=1) for _ in range(M)]
	return sum(set_times) / M, sum(rb_times) / M
171 | 
172 | 
def bench_andlen():
	"""Time computing the size of the intersection.

	:returns: tuple (seconds for ``len(ref & ref2)`` on sets, seconds for
		``intersection_len`` on RoaringBitmaps), each the total over M runs."""
	set_setup = ('from __main__ import DATA1, DATA2; '
			'ref = set(DATA1); ref2 = set(DATA2)')
	rb_setup = ('from __main__ import DATA1, DATA2; '
			'from roaringbitmap import RoaringBitmap; '
			'rb = RoaringBitmap(DATA1); '
			'rb2 = RoaringBitmap(DATA2)')
	set_time = timeit.Timer(
			'len(ref & ref2)', setup=set_setup).timeit(number=M)
	rb_time = timeit.Timer(
			'rb.intersection_len(rb2)', setup=rb_setup).timeit(number=M)
	return set_time, rb_time
183 | 
184 | 
def bench_orlen():
	"""Time computing the size of the union.

	:returns: tuple (seconds for ``len(ref | ref2)`` on sets, seconds for
		``union_len`` on RoaringBitmaps), each the total over M runs."""
	set_setup = ('from __main__ import DATA1, DATA2; '
			'ref = set(DATA1); ref2 = set(DATA2)')
	rb_setup = ('from __main__ import DATA1, DATA2; '
			'from roaringbitmap import RoaringBitmap; '
			'rb = RoaringBitmap(DATA1); '
			'rb2 = RoaringBitmap(DATA2)')
	set_time = timeit.Timer(
			'len(ref | ref2)', setup=set_setup).timeit(number=M)
	rb_time = timeit.Timer(
			'rb.union_len(rb2)', setup=rb_setup).timeit(number=M)
	return set_time, rb_time
195 | 
196 | 
def bench_jaccard():
	"""Time computing the Jaccard distance.

	:returns: tuple (seconds for the formula spelled out on sets, seconds
		for ``jaccard_dist`` on RoaringBitmaps), each the total over M runs."""
	set_setup = ('from __main__ import DATA1, DATA2; '
			'ref = set(DATA1); ref2 = set(DATA2)')
	rb_setup = ('from __main__ import DATA1, DATA2; '
			'from roaringbitmap import RoaringBitmap; '
			'rb = RoaringBitmap(DATA1); '
			'rb2 = RoaringBitmap(DATA2)')
	set_time = timeit.Timer('1 - (len(ref & ref2) / len(ref | ref2))',
			setup=set_setup).timeit(number=M)
	rb_time = timeit.Timer(
			'rb.jaccard_dist(rb2)', setup=rb_setup).timeit(number=M)
	return set_time, rb_time
207 | 
208 | 
def main():
	"""Run all benchmarks for several data distributions and print a table.

	For each scenario the module globals N and MAX are set, the inputs
	DATA1/DATA2 are regenerated with ``pair()``, and every benchmark
	function is run; each row shows the time for ``set``, the time for
	``RoaringBitmap``, and the ratio between them."""
	global N, MAX, DATA1, DATA2
	for x in range(3):
		if x == 0:  # benchmark positive blocks
			print('small sparse set')
			N = 200  # number of random elements
			MAX = 40000  # range of elements
		elif x == 1:  # benchmark bitmap blocks
			print('medium load factor')
			N = 59392
			MAX = 118784
		elif x == 2:  # benchmark inverted blocks
			print('dense set / high load factor')
			N = 40000 - 200
			MAX = 40000
		# NB: never reached with range(3); this scenario appears to be
		# disabled on purpose. Use range(4) above to include it.
		elif x == 3:  # benchmark large number of small blocks
			print('large sparse set')  # don't use RoaringBitmap for this case
			N = 1 << 12
			MAX = 1 << 31
		DATA1, DATA2 = pair()

		fmt = '%12s %8s %16s %8s'
		numfmt = '%8.3g'
		# pair() draws with random.randint(0, MAX), which includes MAX,
		# so the upper bound is inclusive.
		print('%d runs with sets of %d random elements n s.t. 0 <= n <= %d' % (
				M, N, MAX))
		print(fmt % ('', 'set()', 'RoaringBitmap()', 'ratio'))
		for func in (bench_init, bench_initsort,
				bench_and, bench_or, bench_xor, bench_sub,
				bench_iand, bench_ior, bench_ixor, bench_isub,
				bench_eq, bench_neq,
				# bench_andlen, bench_orlen,
				bench_jaccard):
			a, b = func()
			# ratio > 1 means RoaringBitmap was faster than set
			ratio = a / b
			print(fmt % (func.__name__.split('_', 1)[1].ljust(12),
					numfmt % a, numfmt % b,
					(numfmt % ratio) if ratio < 100 else int(ratio)))
		print()
247 | 
248 | 
249 | if __name__ == '__main__':
250 | 	main()
251 | 


--------------------------------------------------------------------------------
/tests/unittests.py:
--------------------------------------------------------------------------------
  1 | """Unit tests for roaringbitmap"""
  2 | from __future__ import division, absolute_import, unicode_literals
  3 | import sys
  4 | import array
  5 | import pytest
  6 | import pickle
  7 | import tempfile
  8 | from random import seed, choice, sample, randint
  9 | try:
 10 | 	import faulthandler
 11 | 	faulthandler.enable()
 12 | except ImportError:
 13 | 	pass
 14 | from roaringbitmap import (RoaringBitmap, ImmutableRoaringBitmap,
 15 | 		MultiRoaringBitmap, bitcounttests, aligned_malloc_tests, mmaptests)
 16 | PY2 = sys.version_info[0] == 2
 17 | if PY2:
 18 | 	range = xrange
 19 | 	from itertools import izip_longest as zip_longest
 20 | else:
 21 | 	from itertools import zip_longest
 22 | 
# (name, numitems, maxnum); the name selects the kind of data that
# _single() generates.
PARAMS = [
		('empty', 0, (1 << 16) - 1),
		('positive', 200, (1 << 16) - 1),
		('dense', 5000, (1 << 16) - 1),
		('inverted', 4000, (1 << 16) - 1),
		('many keys', 4000, (1 << 25) - 1)
		]
 31 | 
 32 | 
 33 | def _single():
 34 | 	seed(42)
 35 | 	result = []
 36 | 	for name, elements, maxnum in PARAMS:
 37 | 		if name == 'inverted':
 38 | 			result.append((name, list(set(range(1 << 16))
 39 | 				- {randint(0, maxnum) for _ in range(elements)})))
 40 | 		else:
 41 | 			result.append((name, sorted(
 42 | 				randint(0, maxnum) for _ in range(elements))))
 43 | 	return result
 44 | 
 45 | 
 46 | @pytest.fixture(scope='module')
 47 | def single():
 48 | 	return _single()
 49 | 
 50 | 
 51 | @pytest.fixture(scope='module')
 52 | def pair():
 53 | 	result = []
 54 | 	for name1, a in _single():
 55 | 		for name2, b in _single():
 56 | 			if name2 != 'empty':
 57 | 				b = sorted(b[:len(b) // 2] + a[len(a) // 2:])
 58 | 			result.append((name1 + ':' + name2, a, b))
 59 | 	return result
 60 | 
 61 | 
 62 | @pytest.fixture(scope='module')
 63 | def multi():
 64 | 	a = sorted(randint(0, 2000)
 65 | 			for _ in range(randint(100, 2000)))
 66 | 	result = [sorted([randint(0, 2000)
 67 | 			for _ in range(randint(100, 2000))] + a)
 68 | 			for _ in range(100)]
 69 | 	return result
 70 | 
 71 | 
 72 | def abbr(a):
 73 | 	return a[:500] + '...' + a[-500:]
 74 | 
 75 | 
 76 | def test_fixtures(single):
 77 | 	for name, data in single:
 78 | 		rb = RoaringBitmap(data)
 79 | 		if name == 'many keys':
 80 | 			assert len(rb._keys()) > 100
 81 | 		elif name == 'empty':
 82 | 			assert len(rb) == 0
 83 | 		else:
 84 | 			assert name[0].upper() in rb.debuginfo()
 85 | 
 86 | 
 87 | def test_bitcount():
 88 | 	assert bitcounttests()
 89 | 
 90 | 
 91 | def test_aligned_malloc():
 92 | 	assert aligned_malloc_tests()
 93 | 
 94 | 
 95 | def test_mmap():
 96 | 	assert mmaptests()
 97 | 
 98 | 
 99 | class Test_roaringbitmap(object):
100 | 	def test_inittrivial(self):
101 | 		data = list(range(5))
102 | 		ref = set(data)
103 | 		rb = RoaringBitmap(data)
104 | 		rb._checkconsistency()
105 | 		assert ref == rb
106 | 
107 | 	def test_initsorted(self, single):
108 | 		for name, data in single:
109 | 			ref = set(sorted(data))
110 | 			rb = RoaringBitmap(sorted(data))
111 | 			rb._checkconsistency()
112 | 			assert ref == rb, name
113 | 
114 | 	def test_initunsorted(self, single):
115 | 		for name, data in single:
116 | 			ref = set(data)
117 | 			rb = RoaringBitmap(data)
118 | 			rb._checkconsistency()
119 | 			assert ref == rb, name
120 | 
121 | 	def test_inititerator(self, single):
122 | 		for name, data in single:
123 | 			ref = set(a for a in data)
124 | 			rb = RoaringBitmap(a for a in data)
125 | 			rb._checkconsistency()
126 | 			assert ref == rb, name
127 | 
128 | 	def test_initrange(self):
129 | 		# creates a positive, dense, and inverted block, respectively
130 | 		for n in [400, 6000, 61241]:
131 | 			ref = set(range(23, n))
132 | 			rb = RoaringBitmap(range(23, n))
133 | 			rb._checkconsistency()
134 | 			assert ref == rb, ('range(23, %d)' % n)
135 | 
136 | 	def test_initrangestep(self):
137 | 		# creates a positive, dense, and inverted block, respectively
138 | 		for n in [400, 6000, 61241]:
139 | 			for step in (2, 7, 113):
140 | 				ref = set(range(23, n * step, step))
141 | 				rb = RoaringBitmap(range(23, n * step, step))
142 | 				rb._checkconsistency()
143 | 				assert ref == rb, ('range(23, %d, %d)' % (n, step))
144 | 		n = 100 * (1 << 16)
145 | 		step = (1 << 16) + 7
146 | 		ref = set(range(23, n, step))
147 | 		rb = RoaringBitmap(range(23, n, step))
148 | 		rb._checkconsistency()
149 | 		assert ref == rb, ('range(23, %d, %d)' % (n, step))
150 | 
151 | 	def test_inititerableallset(self):
152 | 		rb = RoaringBitmap(list(range(0, 0xffff + 1)))
153 | 		assert len(rb) == 0xffff + 1
154 | 
155 | 	def test_add(self, single):
156 | 		for name, data in single:
157 | 			ref = set()
158 | 			rb = RoaringBitmap()
159 | 			for n in sorted(data):
160 | 				ref.add(n)
161 | 				rb.add(n)
162 | 			assert rb == ref, name
163 | 			with pytest.raises(OverflowError):
164 | 				rb.add(-1)
165 | 				rb.add(1 << 32)
166 | 			rb.add(0)
167 | 			rb.add((1 << 32) - 1)
168 | 			rb._checkconsistency()
169 | 
170 | 	def test_discard(self, single):
171 | 		for name, data in single:
172 | 			ref = set()
173 | 			rb = RoaringBitmap()
174 | 			for n in sorted(data):
175 | 				ref.add(n)
176 | 				rb.add(n)
177 | 			for n in sorted(data):
178 | 				ref.discard(n)
179 | 				rb.discard(n)
180 | 			rb._checkconsistency()
181 | 			assert len(ref) == 0, name
182 | 			assert len(rb) == 0, name
183 | 			assert rb == ref, name
184 | 
185 | 	def test_pop(self):
186 | 		rb = RoaringBitmap([60748, 28806, 54664, 28597, 58922, 75684, 56364,
187 | 			67421, 52608, 55686, 10427, 48506, 64363, 14506, 73077, 59035,
188 | 			70246, 19875, 73145, 40225, 58664, 6597, 65554, 73102, 26636,
189 | 			74227, 59566, 19023])
190 | 		while rb:
191 | 			rb.pop()
192 | 		rb._checkconsistency()
193 | 		assert len(rb) == 0
194 | 
195 | 	def test_contains(self, single):
196 | 		for name, data in single:
197 | 			ref = set(data)
198 | 			rb = RoaringBitmap(data)
199 | 			for a in data:
200 | 				assert a in ref, name
201 | 				assert a in rb, name
202 | 			for a in set(range(20000)) - set(data):
203 | 				assert a not in ref, name
204 | 				assert a not in rb, name
205 | 			rb._checkconsistency()
206 | 
207 | 	def test_eq(self, single):
208 | 		for name, data in single:
209 | 			ref, ref2 = set(data), set(data)
210 | 			rb, rb2 = RoaringBitmap(data), RoaringBitmap(data)
211 | 			assert (ref == ref2) == (rb == rb2), name
212 | 
213 | 	def test_neq(self, pair):
214 | 		for name, data1, data2 in pair:
215 | 			ref, ref2 = set(data1), set(data2)
216 | 			rb, rb2 = RoaringBitmap(data1), RoaringBitmap(data2)
217 | 			assert (ref != ref2) == (rb != rb2), name
218 | 
219 | 	def test_iter(self, single):
220 | 		for name, data in single:
221 | 			rb = RoaringBitmap(data)
222 | 			assert list(iter(rb)) == sorted(set(data)), name
223 | 
224 | 	def test_reversed(self, single):
225 | 		for name, data in single:
226 | 			rb = RoaringBitmap(data)
227 | 			for a, b in zip_longest(reversed(rb), reversed(sorted(set(data)))):
228 | 				assert a == b, name
229 | 
230 | 	def test_iand(self, pair):
231 | 		for name, data1, data2 in pair:
232 | 			ref, ref2 = set(data1), set(data2)
233 | 			rb, rb2 = RoaringBitmap(data1), RoaringBitmap(data2)
234 | 			ref &= ref2
235 | 			rb &= rb2
236 | 			rb._checkconsistency()
237 | 			assert rb == ref, name
238 | 
239 | 	def test_ior(self, pair):
240 | 		for name, data1, data2 in pair:
241 | 			ref, ref2 = set(data1), set(data2)
242 | 			rb, rb2 = RoaringBitmap(data1), RoaringBitmap(data2)
243 | 			ref |= ref2
244 | 			rb |= rb2
245 | 			rb._checkconsistency()
246 | 			assert rb == ref, name
247 | 
248 | 	def test_ixor(self, pair):
249 | 		for name, data1, data2 in pair:
250 | 			ref, ref2 = set(data1), set(data2)
251 | 			rb, rb2 = RoaringBitmap(data1), RoaringBitmap(data2)
252 | 			ref ^= ref2
253 | 			rb ^= rb2
254 | 			rb._checkconsistency()
255 | 			assert len(ref) == len(rb), name
256 | 			assert ref == rb, name
257 | 
258 | 	def test_isub(self, pair):
259 | 		for name, data1, data2 in pair:
260 | 			ref, ref2 = set(data1), set(data2)
261 | 			rb, rb2 = RoaringBitmap(data1), RoaringBitmap(data2)
262 | 			ref -= ref2
263 | 			rb -= rb2
264 | 			rb._checkconsistency()
265 | 			assert len(ref) <= len(set(data1))
266 | 			assert len(rb) <= len(set(data1)), name
267 | 			assert len(ref) == len(rb), name
268 | 			assert ref == rb, name
269 | 
270 | 	def test_and(self, pair):
271 | 		for name, data1, data2 in pair:
272 | 			ref, ref2 = set(data1), set(data2)
273 | 			rb, rb2 = RoaringBitmap(data1), RoaringBitmap(data2)
274 | 			assert ref & ref2 == set(rb & rb2), name
275 | 
276 | 	def test_or(self, pair):
277 | 		for name, data1, data2 in pair:
278 | 			ref, ref2 = set(data1), set(data2)
279 | 			rb, rb2 = RoaringBitmap(data1), RoaringBitmap(data2)
280 | 			assert ref | ref2 == set(rb | rb2), name
281 | 
282 | 	def test_xor(self, pair):
283 | 		for name, data1, data2 in pair:
284 | 			ref, ref2 = set(data1), set(data2)
285 | 			rb, rb2 = RoaringBitmap(data1), RoaringBitmap(data2)
286 | 			assert ref ^ ref2 == set(rb ^ rb2), name
287 | 
288 | 	def test_sub(self, pair):
289 | 		for name, data1, data2 in pair:
290 | 			ref, ref2 = set(data1), set(data2)
291 | 			rb, rb2 = RoaringBitmap(data1), RoaringBitmap(data2)
292 | 			assert ref - ref2 == set(rb - rb2), name
293 | 
294 | 	def test_subset(self, pair):
295 | 		for name, data1, data2 in pair:
296 | 			ref, ref2 = set(data1), set(data2)
297 | 			rb, rb2 = RoaringBitmap(data1), RoaringBitmap(data2)
298 | 			refans = ref <= ref2
299 | 			assert (set(rb) <= ref2) == refans, name
300 | 			assert (rb <= rb2) == refans, name
301 | 			k = len(data2) // 2
302 | 			ref, rb = set(data2[:k]), RoaringBitmap(data2[:k])
303 | 			refans = ref <= ref2
304 | 			assert (set(rb) <= ref2) == refans, name
305 | 			assert (ref <= set(rb2)) == refans, name
306 | 			assert (rb <= rb2) == refans, (name, rb.debuginfo())
307 | 
308 | 	def test_disjoint(self, pair):
309 | 		for name, data1, data2 in pair:
310 | 			ref, ref2 = set(data1), set(data2)
311 | 			rb, rb2 = RoaringBitmap(data1), RoaringBitmap(data2)
312 | 			refans = ref.isdisjoint(ref2)
313 | 			assert rb.isdisjoint(rb2) == refans, name
314 | 			data3 = [a for a in data2 if a not in ref]
315 | 			ref3, rb3 = set(data3), RoaringBitmap(data3)
316 | 			refans2 = ref.isdisjoint(ref3)
317 | 			assert rb.isdisjoint(rb3) == refans2, name
318 | 
319 | 	def test_clamp(self, single):
320 | 		for name, data in single:
321 | 			if len(data) == 0:
322 | 				continue
323 | 			a, b = sorted(sample(data, 2))
324 | 			ref = set(data).intersection(range(a, b))
325 | 			rb = RoaringBitmap(data).intersection(range(a, b))
326 | 			rb2 = RoaringBitmap(data).clamp(a, b)
327 | 			assert a <= rb2.min() and rb2.max() < b, name
328 | 			assert ref == rb2, (name, a, b)
329 | 			assert rb == rb2, (name, a, b)
330 | 
331 | 	def test_clamp_issue12(self):  # regression test named after issue 12
332 | 		b = RoaringBitmap([1, 2, 3])
333 | 		assert b.clamp(0, 65536) == b  # stop values at and just above 65536 (0x10000) must leave b unchanged
334 | 		assert b.clamp(0, 65537) == b
335 | 		assert b.clamp(0, 65538) == b
336 | 		assert b.clamp(0, 65539) == b
337 | 
338 | 	def test_clamp2(self):  # clamp with bounds falling between and exactly on elements
339 | 		a = RoaringBitmap([0x00010001])
340 | 		b = RoaringBitmap([0x00030003, 0x00050005])
341 | 		c = RoaringBitmap([0x00070007])
342 | 		x = a | b | c  # four elements whose upper 16 bits are 1, 3, 5 and 7
343 | 		assert x.clamp(0, 0x000FFFFF) == x  # range covers everything
344 | 		assert x.clamp(0x000200FF, 0x000FFFFF) == b | c  # start lies between a's element and b's first element
345 | 		assert x.clamp(0x00030003, 0x000FFFFF) == b | c  # start equal to an element keeps it (inclusive)
346 | 		assert x.clamp(0, 0x00060006) == a | b  # stop lies between b's last element and c's element
347 | 		assert x.clamp(0, 0x00050006) == a | b  # stop one past an element keeps it
348 | 		assert x.clamp(0, 0x00050005) == a | RoaringBitmap([0x00030003])  # stop equal to an element drops it (exclusive)
349 | 
350 | 	def test_aggregateand(self, multi):  # in-place intersection with several bitmaps at once
351 | 		ref = set(multi[0])  # multi is indexed as a sequence of datasets; the first seeds the result
352 | 		ref.intersection_update(*[set(a) for a in multi[1:]])
353 | 		rb = RoaringBitmap(multi[0])
354 | 		rb.intersection_update(*[RoaringBitmap(a) for a in multi[1:]])
355 | 		rb._checkconsistency()  # run the bitmap's own consistency check before comparing
356 | 		assert rb == ref
357 | 
358 | 	def test_aggregateor(self, multi):  # in-place union with several bitmaps at once
359 | 		ref = set(multi[0])
360 | 		ref.update(*[set(a) for a in multi[1:]])  # reference result computed with builtin sets
361 | 		rb = RoaringBitmap(multi[0])
362 | 		rb.update(*[RoaringBitmap(a) for a in multi[1:]])
363 | 		rb._checkconsistency()  # run the bitmap's own consistency check before comparing
364 | 		assert rb == ref
365 | 
366 | 	def test_andlen(self, pair):  # intersection_len() must equal the size of the actual intersection
367 | 		for name, data1, data2 in pair:
368 | 			ref, ref2 = set(data1), set(data2)
369 | 			rb, rb2 = RoaringBitmap(data1), RoaringBitmap(data2)
370 | 			assert len(rb & rb2) == rb.intersection_len(rb2), name  # against the materialised bitmap result
371 | 			assert len(ref & ref2) == rb.intersection_len(rb2), name  # against the builtin set result
372 | 
373 | 	def test_orlen(self, pair):  # union_len() must equal the size of the actual union
374 | 		for name, data1, data2 in pair:
375 | 			ref, ref2 = set(data1), set(data2)
376 | 			rb, rb2 = RoaringBitmap(data1), RoaringBitmap(data2)
377 | 			assert len(ref | ref2) == rb.union_len(rb2), name  # against the builtin set result
378 | 			assert len(rb | rb2) == rb.union_len(rb2), name  # against the materialised bitmap result
379 | 
380 | 	def test_jaccard_dist(self, pair):  # jaccard_dist() must equal 1 - |A & B| / |A | B|
381 | 		for name, data1, data2 in pair:
382 | 			if len(data1) == 0 and len(data2) == 0:  # both empty: the union is empty and the ratio would divide by zero
383 | 				continue
384 | 			ref, ref2 = set(data1), set(data2)
385 | 			rb, rb2 = RoaringBitmap(data1), RoaringBitmap(data2)
386 | 			assert len(ref & ref2) / float(len(ref | ref2)) == pytest.approx(
387 | 					rb.intersection_len(rb2) / float(rb.union_len(rb2))), name  # float() forces true division
388 | 			assert (1 - (len(ref & ref2) / float(len(ref | ref2)))
389 | 					== pytest.approx(rb.jaccard_dist(rb2))), name  # approx: compare floats with tolerance
390 | 
391 | 	def test_rank(self, single):  # rank(x) of a member is its 1-based position in sorted order
392 | 		for name, data in single:
393 | 			if len(data) == 0:  # choice() cannot pick from an empty list
394 | 				continue
395 | 			ref = sorted(set(data))
396 | 			rb = RoaringBitmap(data)
397 | 			for _ in range(10):  # spot-check ten randomly chosen members
398 | 				x = choice(ref)
399 | 				assert x in rb, name
400 | 				assert rb.rank(x) == ref.index(x) + 1, name  # index() is 0-based, rank is 1-based
401 | 
402 | 	def test_select(self, single):  # select(i) / rb[i] return the i-th smallest element (0-based)
403 | 		for name, data in single:
404 | 			if len(data) == 0:  # randint(0, -1) would raise on empty data
405 | 				continue
406 | 			ref = sorted(set(data))
407 | 			rb = RoaringBitmap(data)
408 | 			lrb = list(rb)  # iteration order is compared with sorted order below
409 | 			idx = [randint(0, len(ref) - 1) for _ in range(10)]  # ten random valid positions
410 | 			for i in idx:
411 | 				assert lrb[i] == ref[i], (name, i, len(ref))
412 | 				assert rb.select(i) in rb, name
413 | 				assert rb.select(i) == ref[i], name
414 | 				assert rb[i] == ref[i], name  # indexing must agree with select()
415 | 				assert rb.rank(rb.select(i)) - 1 == i, name  # rank is the 1-based inverse of select
416 | 				if rb.select(i) + 1 in rb:  # successor value present: its rank is one higher
417 | 					assert rb.rank(rb.select(i) + 1) - 1 == i + 1, name
418 | 				else:  # successor value absent: rank does not increase
419 | 					assert rb.rank(rb.select(i) + 1) - 1 == i, name
420 | 
421 | 	def test_rank2(self):  # exhaustive rank() check for every k below 200000
422 | 		rb = RoaringBitmap(range(0, 100000, 7))  # multiples of 7 below 100000
423 | 		rb.update(range(100000, 200000, 1000))  # plus multiples of 1000 in [100000, 200000)
424 | 		for k in range(100000):
425 | 			assert rb.rank(k) == 1 + k // 7  # number of multiples of 7 that are <= k
426 | 		for k in range(100000, 200000):
427 | 			assert rb.rank(k) == 1 + 100000 // 7 + 1 + (k - 100000) // 1000  # all multiples of 7, plus multiples of 1000 <= k
428 | 
429 | 	def test_select2(self):  # exhaustive select() check on evenly spaced bitmaps
430 | 		gap = 1
431 | 		while gap <= 1024:  # spacings 1, 2, 4, ..., 1024
432 | 			rb = RoaringBitmap(range(0, 100000, gap))
433 | 			for k in range(0, 100000 // gap):
434 | 				assert rb.select(k) == k * gap  # the k-th element of an evenly spaced range
435 | 			gap *= 2
436 | 
437 | 	def test_select_issue15(self):  # regression test named after issue 15: indexing into contiguous ranges
438 | 		rb = RoaringBitmap(range(0x10000, 0x1ffff + 1))  # every value from 0x10000 through 0x1ffff
439 | 		assert rb[0] == 0x10000
440 | 		rb.discard(0x10010)  # removing one element must not change the first one
441 | 		assert rb[0] == 0x10000
442 | 		rb = RoaringBitmap(range(0x10, 0xffff + 1))  # range that starts at a nonzero offset
443 | 		assert rb[0] == 0x10, rb.debuginfo(True)
444 | 		rb = RoaringBitmap(range(0x10010, 0x1ffff + 1))
445 | 		assert rb[0] == 0x10010, rb.debuginfo(True)
446 | 		lst = list(range(1, 0xccbb))
447 | 		lst.extend(range(0xcccc, 0xfffc))  # leaves a gap [0xccbb, 0xcccc) in the data
448 | 		rb = RoaringBitmap(lst)
449 | 		for n in (0, 0xcccc, -1):  # first, an index past the gap, and a negative index
450 | 			assert lst[n] == rb[n], (n, lst[n], rb[n])
451 | 
452 | 	def test_pickle(self, single):
453 | 		for name, data in single:
454 | 			rb = RoaringBitmap(data)
455 | 			rb_pickled = pickle.dumps(rb, protocol=-1)
456 | 			rb_unpickled = pickle.loads(rb_pickled)
457 | 			rb._checkconsistency()
458 | 			assert rb_unpickled == rb, name
459 | 
460 | 	def test_invalid(self):
461 | 		with pytest.raises(TypeError):
462 | 			rb = RoaringBitmap([1, 2, 'a'])
463 | 		with pytest.raises(TypeError):
464 | 			RoaringBitmap([1, 2]) < [1, 2, 3]
465 | 
466 | 	def test_slices(self):  # issue 20
467 | 		ref = list(range(10))
468 | 		rb = RoaringBitmap(range(10))
469 | 		assert list(rb[::2]) == ref[::2]  # positive step behaves like list slicing
470 | 		with pytest.raises(ValueError):
471 | 			_ = rb[::-2]  # a negative step is expected to be rejected
472 | 		with pytest.raises(ValueError):
473 | 			_ = rb[::0]  # a zero step is expected to be rejected
474 | 		del rb[::2]  # slice deletion must mirror list slice deletion
475 | 		del ref[::2]
476 | 		assert list(rb) == ref
477 | 
478 | 	def test_minmax(self):  # min()/max()/indexing on contiguous ranges of 61440 and 61441 elements
479 | 		rb = RoaringBitmap(range(0, 61440))
480 | 		assert rb.min() == 0
481 | 		assert rb.max() == 61439
482 | 		rb1 = RoaringBitmap(range(0, 61441))  # one element more than rb
483 | 		assert rb1.min() == 0
484 | 		assert rb1.max() == 61440
485 | 		assert rb1[61440] == 61440  # last element via indexing
486 | 		assert list(rb1)[61440] == 61440  # last element via iteration
487 | 
488 | 	def test_issue19(self):  # regression test named after issue 19: union followed by difference
489 | 		a = RoaringBitmap()
490 | 		b = RoaringBitmap(range(4095))
491 | 		c = RoaringBitmap(range(2))  # c is a subset of b
492 | 		a |= b
493 | 		a |= c  # in-place union path
494 | 		assert len(a - b - c) == 0  # removing both operands must leave nothing
495 | 		assert len((b | c) - b - c) == 0  # same check through the non-in-place union
496 | 
497 | 	def test_issue22(self):  # regression test named after issue 22: self-xor, self-difference, inversion
498 | 		rb = RoaringBitmap(range(0, 61440))
499 | 		rb1 = RoaringBitmap(range(0, 61441))  # one element more than rb
500 | 		assert len(rb ^ rb) == 0
501 | 		assert len(rb - rb) == 0
502 | 		assert len(rb1 ^ rb1) == 0
503 | 		assert len(rb1 - rb1) == 0
504 | 		assert len(~rb) == 0  # inversion of these bitmaps is asserted to be empty
505 | 		assert len(~rb1) == 0
506 | 
507 | 		rb1 = RoaringBitmap(range(0, 61441))  # fresh copy for the in-place xor
508 | 		assert len(rb ^ rb) == 0
509 | 		rb1 ^= rb1
510 | 		assert len(rb1) == 0
511 | 
512 | 		rb1 = RoaringBitmap(range(0, 61441))  # fresh copy for the in-place difference
513 | 		rb1 -= rb1
514 | 		assert len(rb1) == 0
515 | 
516 | 	def test_issue24(self):  # regression test named after issue 24: pop() after removals
517 | 		r = RoaringBitmap(range(131071))  # elements 0 .. 131070
518 | 		assert r.pop() == 131070  # pop() is asserted to return the largest element
519 | 		assert r.pop() == 131069
520 | 
521 | 		rr = r - RoaringBitmap([130752])  # non-in-place difference
522 | 		assert 130752 not in rr
523 | 		assert rr.pop() == 131068
524 | 
525 | 		r.difference_update(RoaringBitmap([130752]))  # same removal done in place
526 | 		assert 130752 not in r
527 | 		assert r.pop() == 131068
528 | 
529 | 	def test_issue25(self):  # regression test named after issue 25
530 | 		r = RoaringBitmap({1})
531 | 		r.intersection_update(RoaringBitmap([]))  # in-place intersection with an empty bitmap
532 | 		assert len(r) == 0  # the result must be empty
533 | 
534 | 	def test_issue28(self):  # regression test named after issue 28
535 | 		rbm = RoaringBitmap()
536 | 		rbm.add(3995084765)  # single element far above the clamp range used below
537 | 		r = rbm.clamp(0, 8388607)
538 | 		assert len(r) == 0  # nothing falls inside [0, 8388607)
539 | 
540 | 	def test_issue34(self):  # regression test named after issue 34: difference of two large random sets
541 | 		seed(232992)  # fixed seed so the same samples are drawn on every run
542 | 		set_a = sample(range(235342), k=169308)
543 | 		set_b = sample(range(255999), k=255713)
544 | 		rba = RoaringBitmap(set_a)
545 | 		rbb = RoaringBitmap(set_b)
546 | 		assert rba - rbb == set(set_a) - set(set_b)  # non-in-place difference
547 | 		rba -= rbb  # in-place difference must give the same result
548 | 		assert rba == set(set_a) - set(set_b)
549 | 
550 | 
551 | class Test_immutablerb(object):
552 | 	def test_inittrivial(self):
553 | 		data = list(range(5))
554 | 		ref = set(data)
555 | 		rb = ImmutableRoaringBitmap(data)
556 | 		rb._checkconsistency()
557 | 		assert ref == rb
558 | 		assert type(rb) == ImmutableRoaringBitmap
559 | 
560 | 	def test_initsorted(self, single):
561 | 		for name, data in single:
562 | 			ref = set(sorted(data))
563 | 			rb = ImmutableRoaringBitmap(sorted(data))
564 | 			rb._checkconsistency()
565 | 			assert ref == rb, name
566 | 
567 | 	def test_initunsorted(self, single):
568 | 		for name, data in single:
569 | 			ref = set(data)
570 | 			rb = ImmutableRoaringBitmap(data)
571 | 			rb._checkconsistency()
572 | 			assert ref == rb, name
573 | 
574 | 	def test_inititerator(self, single):
575 | 		for name, data in single:
576 | 			ref = set(a for a in data)
577 | 			rb = ImmutableRoaringBitmap(a for a in data)
578 | 			rb._checkconsistency()
579 | 			assert ref == rb, name
580 | 
581 | 	def test_initrange(self):
582 | 		# creates a positive, dense, and inverted block, respectively
583 | 		for n in [400, 6000, 61241]:
584 | 			ref = set(range(23, n))
585 | 			rb = ImmutableRoaringBitmap(range(23, n))
586 | 			rb._checkconsistency()
587 | 			assert ref == rb, n
588 | 
589 | 	def test_initrb(self):
590 | 		r = RoaringBitmap(range(5))
591 | 		i = ImmutableRoaringBitmap(r)
592 | 		r = RoaringBitmap(i)
593 | 		assert r == i
594 | 
595 | 		i = ImmutableRoaringBitmap(range(5))
596 | 		r = RoaringBitmap(i)
597 | 		assert r == i
598 | 
599 | 	def test_pickle(self, single):
600 | 		for name, data in single:
601 | 			rb = ImmutableRoaringBitmap(data)
602 | 			rb_pickled = pickle.dumps(rb, protocol=-1)
603 | 			rb_unpickled = pickle.loads(rb_pickled)
604 | 			rb._checkconsistency()
605 | 			assert rb_unpickled == rb, name
606 | 			assert type(rb) == ImmutableRoaringBitmap, name
607 | 
608 | 	def test_and(self, pair):
609 | 		for name, data1, data2 in pair:
610 | 			ref, ref2 = set(data1), set(data2)
611 | 			rb = ImmutableRoaringBitmap(data1)
612 | 			rb2 = ImmutableRoaringBitmap(data2)
613 | 			assert ref & ref2 == set(rb & rb2), name
614 | 			assert type(rb & rb2) == RoaringBitmap, name
615 | 
616 | 	def test_or(self, pair):
617 | 		for name, data1, data2 in pair:
618 | 			ref, ref2 = set(data1), set(data2)
619 | 			rb, rb2 = ImmutableRoaringBitmap(data1), ImmutableRoaringBitmap(data2)
620 | 			assert ref | ref2 == set(rb | rb2), name
621 | 
622 | 	def test_xor(self, pair):
623 | 		for name, data1, data2 in pair:
624 | 			ref, ref2 = set(data1), set(data2)
625 | 			rb, rb2 = ImmutableRoaringBitmap(data1), ImmutableRoaringBitmap(data2)
626 | 			assert ref ^ ref2 == set(rb ^ rb2), name
627 | 
628 | 	def test_sub(self, pair):
629 | 		for name, data1, data2 in pair:
630 | 			ref, ref2 = set(data1), set(data2)
631 | 			rb, rb2 = ImmutableRoaringBitmap(data1), ImmutableRoaringBitmap(data2)
632 | 			assert ref - ref2 == set(rb - rb2), name
633 | 
634 | 	def test_aggregateand(self, multi):
635 | 		ref = set(multi[0])
636 | 		res1 = ref.intersection(*[set(a) for a in multi[1:]])
637 | 		rb = ImmutableRoaringBitmap(multi[0])
638 | 		res2 = rb.intersection(*[ImmutableRoaringBitmap(a) for a in multi[1:]])
639 | 		res2._checkconsistency()
640 | 		assert res1 == res2
641 | 
642 | 	def test_aggregateor(self, multi):
643 | 		ref = set(multi[0])
644 | 		res1 = ref.union(*[set(a) for a in multi[1:]])
645 | 		rb = ImmutableRoaringBitmap(multi[0])
646 | 		res2 = rb.union(*[ImmutableRoaringBitmap(a) for a in multi[1:]])
647 | 		res2._checkconsistency()
648 | 		assert res1 == res2
649 | 
650 | 	def test_andlen(self, pair):
651 | 		for name, data1, data2 in pair:
652 | 			ref, ref2 = set(data1), set(data2)
653 | 			rb = ImmutableRoaringBitmap(data1)
654 | 			rb2 = ImmutableRoaringBitmap(data2)
655 | 			assert len(rb & rb2) == rb.intersection_len(rb2), name
656 | 			assert len(ref & ref2) == rb.intersection_len(rb2), name
657 | 
658 | 	def test_orlen(self, pair):
659 | 		for name, data1, data2 in pair:
660 | 			ref, ref2 = set(data1), set(data2)
661 | 			rb = ImmutableRoaringBitmap(data1)
662 | 			rb2 = ImmutableRoaringBitmap(data2)
663 | 			assert len(ref | ref2) == rb.union_len(rb2), name
664 | 			assert len(rb | rb2) == rb.union_len(rb2), name
665 | 
666 | 	def test_jaccard_dist(self, pair):
667 | 		for name, data1, data2 in pair:
668 | 			if len(data1) == 0 and len(data2) == 0:
669 | 				continue
670 | 			ref, ref2 = set(data1), set(data2)
671 | 			rb = ImmutableRoaringBitmap(data1)
672 | 			rb2 = ImmutableRoaringBitmap(data2)
673 | 			assert len(ref & ref2) / float(len(ref | ref2)) == pytest.approx(
674 | 					rb.intersection_len(rb2) / float(rb.union_len(rb2))), name
675 | 			assert (1 - (len(ref & ref2) / float(len(ref | ref2)))
676 | 					== pytest.approx(rb.jaccard_dist(rb2))), name
677 | 
678 | 	def test_rank(self, single):
679 | 		for name, data in single:
680 | 			if len(data) == 0:
681 | 				continue
682 | 			ref = sorted(set(data))
683 | 			rb = ImmutableRoaringBitmap(data)
684 | 			for _ in range(10):
685 | 				x = choice(ref)
686 | 				assert x in rb, name
687 | 				assert rb.rank(x) == ref.index(x) + 1, name
688 | 
689 | 	def test_select(self, single):
690 | 		for name, data in single:
691 | 			if len(data) == 0:
692 | 				continue
693 | 			ref = sorted(set(data))
694 | 			rb = ImmutableRoaringBitmap(data)
695 | 			lrb = list(rb)
696 | 			idx = [0, 1, 2] + [
697 | 					randint(0, len(ref) - 1) for _ in range(10)] + [
698 | 					len(ref) - 1, len(ref) - 2]
699 | 			for i in idx:
700 | 				assert lrb[i] == ref[i], name
701 | 				assert rb.select(i) in rb, name
702 | 				assert rb.select(i) == ref[i], name
703 | 				assert rb.rank(rb.select(i)) - 1 == i, name
704 | 				if rb.select(i) + 1 in rb:
705 | 					assert rb.rank(rb.select(i) + 1) - 1 == i + 1, name
706 | 				else:
707 | 					assert rb.rank(rb.select(i) + 1) - 1 == i, name
708 | 
709 | 	def test_rank2(self):
710 | 		rb = ImmutableRoaringBitmap(range(0, 100000, 7))
711 | 		rb = rb.union(range(100000, 200000, 1000))
712 | 		for k in range(100000):
713 | 			assert rb.rank(k) == 1 + k // 7
714 | 		for k in range(100000, 200000):
715 | 			assert rb.rank(k) == 1 + 100000 // 7 + 1 + (k - 100000) // 1000
716 | 
717 | 	def test_select2(self):
718 | 		gap = 1
719 | 		while gap <= 1024:
720 | 			rb = ImmutableRoaringBitmap(range(0, 100000, gap))
721 | 			for k in range(0, 100000 // gap):
722 | 				assert rb.select(k) == k * gap
723 | 			gap *= 2
724 | 
725 | 
726 | class Test_multirb(object):  # checks MultiRoaringBitmap, a sequence-like collection of bitmaps
727 | 	def test_init(self, multi):  # construction preserves length and element-wise equality
728 | 		orig = [RoaringBitmap(a) for a in multi]
729 | 		mrb = MultiRoaringBitmap(orig)
730 | 		assert len(orig) == len(mrb)
731 | 		for rb1, rb2 in zip(orig, mrb):
732 | 			assert rb1 == rb2
733 | 
734 | 	def test_none(self, multi):  # behaviour when one member is an empty bitmap
735 | 		orig = [RoaringBitmap(a) for a in multi]
736 | 		orig.insert(4, RoaringBitmap())  # put an empty bitmap at index 4
737 | 		mrb = MultiRoaringBitmap(orig)
738 | 		assert len(orig) == len(mrb)
739 | 		for rb1, rb2 in zip(orig, mrb):
740 | 			assert rb1 == rb2
741 | 		assert mrb.intersection([4, 5]) is None  # intersecting with the empty member is expected to return None
742 | 
743 | 	def test_aggregateand(self, multi):  # intersection over all members, selected by index
744 | 		ref = set(multi[0])
745 | 		res1 = ref.intersection(*[set(a) for a in multi[1:]])
746 | 		mrb = MultiRoaringBitmap([ImmutableRoaringBitmap(a) for a in multi])
747 | 		res2 = mrb.intersection(list(range(len(mrb))))  # pass every index
748 | 		assert res1 == res2
749 | 
750 | 	def test_jaccard(self, multi):  # batched jaccard_dist() must match the pairwise computation
751 | 		mrb = MultiRoaringBitmap([ImmutableRoaringBitmap(a) for a in multi])
752 | 		indices1 = array.array(b'L' if PY2 else 'L', [0, 6, 8])  # typecode passed as bytes on Python 2; uses indices up to 8
753 | 		indices2 = array.array(b'L' if PY2 else 'L', [1, 7, 6])
754 | 		res = mrb.jaccard_dist(indices1, indices2)
755 | 		ref = array.array(b'd' if PY2 else 'd', [mrb[i].jaccard_dist(mrb[j])
756 | 				for i, j in zip(indices1, indices2)])  # one pair at a time as the reference
757 | 		assert res == ref
758 | 
759 | 	def test_andor_len_pairwise(self, multi):  # batched intersection/union sizes written into output arrays
760 | 		mrb = MultiRoaringBitmap([ImmutableRoaringBitmap(a) for a in multi])
761 | 		indices1 = array.array(b'L' if PY2 else 'L', [0, 6, 8])
762 | 		indices2 = array.array(b'L' if PY2 else 'L', [1, 7, 6])
763 | 		res1 = array.array(b'L' if PY2 else 'L', [0] * len(indices1))  # zero-filled output arrays, one slot per pair
764 | 		res2 = array.array(b'L' if PY2 else 'L', [0] * len(indices1))
765 | 		mrb.andor_len_pairwise(indices1, indices2, res1, res2)  # results are read back from res1/res2
766 | 		ref1 = array.array(b'L' if PY2 else 'L')
767 | 		ref2 = array.array(b'L' if PY2 else 'L')
768 | 		for i, j in zip(indices1, indices2):
769 | 			ref1.append(len(mrb[i] & mrb[j]))  # expected intersection size
770 | 			ref2.append(len(mrb[i] | mrb[j]))  # expected union size
771 | 		assert res1 == ref1
772 | 		assert res2 == ref2
773 | 
774 | 	def test_clamp(self, multi):  # intersection restricted to [start, stop)
775 | 		a, b = sorted(sample(multi[0], 2))  # random bounds drawn from the first dataset
776 | 		ref = set.intersection(
777 | 				*[set(x) for x in multi]) & set(range(a, b))
778 | 		mrb = MultiRoaringBitmap([RoaringBitmap(x) for x in multi])
779 | 		rb = mrb.intersection(list(range(len(mrb))), start=a, stop=b)
780 | 		assert a <= rb.min() and rb.max() < b  # NOTE(review): assumes a non-empty result; test_none shows intersection() can return None -- confirm
781 | 		assert ref == rb
782 | 
783 | 	def test_serialize(self, multi):  # write to a file, read back with fromfile(), compare
784 | 		orig = [RoaringBitmap(a) for a in multi]
785 | 		mrb = MultiRoaringBitmap(orig)
786 | 		with tempfile.NamedTemporaryFile(delete=False) as tmp:  # NOTE(review): delete=False and no later removal, so the file appears to be left behind
787 | 			mrb2 = MultiRoaringBitmap(orig, filename=tmp.name)
788 | 			del mrb2  # drop the writer before reading the file back
789 | 			mrb_deserialized = MultiRoaringBitmap.fromfile(tmp.name)
790 | 			assert len(orig) == len(mrb)
791 | 			assert len(orig) == len(mrb_deserialized)
792 | 			for rb1, rb2, rb3 in zip(orig, mrb, mrb_deserialized):
793 | 				assert rb1 == rb2
794 | 				assert rb1 == rb3
795 | 				rb3._checkconsistency()
796 | 				assert type(rb3) == ImmutableRoaringBitmap  # deserialized members are expected to be immutable
797 | 
798 | 	def test_multi1(self):  # indexing, negative indexing, bounds and iteration
799 | 		for_multi = []
800 | 		for i in range(5):
801 | 			for_multi += [RoaringBitmap(sample(range(99999), 200))]  # five random bitmaps of 200 elements each
802 | 		mrb = MultiRoaringBitmap(for_multi)
803 | 		assert len(mrb) == 5
804 | 		assert mrb[4] == for_multi[4]
805 | 		with pytest.raises(IndexError):
806 | 			mrb[5]  # one past the end
807 | 		assert mrb[-1] == for_multi[-1]
808 | 		list(mrb)  # iteration must complete without raising
809 | 		for n, rb in enumerate(mrb):
810 | 			assert rb == for_multi[n], n
811 | 
812 | 	def test_multi2(self):  # members taken from a MultiRoaringBitmap stay usable after it is rebound
813 | 		for_multi_pre = []
814 | 		for x in range(3):
815 | 			for_multi = []
816 | 			for i in range(5):
817 | 				for_multi += [RoaringBitmap(sample(range(99999), 200))]
818 | 			mrb = MultiRoaringBitmap(for_multi)  # rebinding drops the previous iteration's container
819 | 			for_multi_pre += [mrb[0], mrb[1]]  # keep two members beyond the container's lifetime
820 | 
821 | 		assert type(for_multi_pre) is list
822 | 		for_multi_pre[-1]  # accessing the kept members must not raise
823 | 		list(for_multi_pre)
824 | 
825 | 	def test_eq(self, multi):  # equality against lists and other MultiRoaringBitmap objects
826 | 		orig = [RoaringBitmap(a) for a in multi]
827 | 		mrb = MultiRoaringBitmap(orig)
828 | 		mrb2 = MultiRoaringBitmap(orig)
829 | 		mrb3 = MultiRoaringBitmap(orig[1:])  # one member fewer
830 | 		assert mrb == orig  # compares equal to a plain list with the same members
831 | 		assert mrb == mrb2
832 | 		assert mrb != orig[1:]
833 | 		assert mrb != mrb3
834 | 


--------------------------------------------------------------------------------